Run workflow using API¶
from pathlib import Path
from rich import print as pprint
import logging
logging.basicConfig(level=logging.WARNING)
# Choose the level passed above depending on how much detail you want:
# - WARNING: see only warnings
# - INFO: see the SPARQL queries
# - DEBUG: see the raw results
# Silence dask logging by setting `logging.distributed` to `warning`.
# NOTE: the setting is saved to the user's dask config file (see the output
# below), so it stays in effect after this notebook session ends.
!dask config set logging.distributed warning
Updated [logging.distributed] to [warning], config saved to /home/verhoes/.config/dask/dask.yaml
Setup session directory¶
Files such as structure files are stored in this directory, together with a DuckDB database for metadata.
# Relative path: it is resolved against the directory the notebook is run from.
session_dir = Path("session1")
session_dir
PosixPath('session1')
Search Uniprot for structures¶
from protein_detective.workflow import UniprotQuery, search_structures_in_uniprot
# Search criteria passed to Uniprot.
query = UniprotQuery(
taxon_id="9606", # NCBI taxonomy id of Homo sapiens
reviewed=True,
subcellular_location_uniprot="nucleus",
subcellular_location_go=["GO:0005634"], # Cellular component - Nucleus
molecular_function_go=["GO:0003677"], # Molecular function - DNA binding
)
# `limit=100` keeps this example small; the warnings in the output below show
# that the limit was reached, so increase it to retrieve more results.
search_result = search_structures_in_uniprot(query, session_dir, limit=100)
search_result
WARNING:protein_quest.uniprot:Search for uniprot accessions returned 100 results. There may be more results available, but they are not returned due to the limit of 100. Consider increasing the limit to get more results. WARNING:protein_quest.uniprot:Search for pdbs on uniprot returned 100 results. There may be more results available, but they are not returned due to the limit of 100. Consider increasing the limit to get more results. WARNING:protein_quest.uniprot:Search for alphafold entries on uniprot returned 100 results. There may be more results available, but they are not returned due to the limit of 100. Consider increasing the limit to get more results.
UniprotSearchResult(nr_uniprot_accessions=100, nr_pdbs=100, nr_prot2pdb=100, nr_afs=100, nr_interaction_partners=0)
Use database queries to see what was found.
Fetch structures from PDBe and Alphafold found in Uniprot¶
from protein_detective.workflow import async_retrieve_structures
# Top-level `await` works here because Jupyter/IPython runs the cell inside its
# own event loop. In a plain Python script, use
# `asyncio.run(async_retrieve_structures(session_dir))` instead.
download_path, nr_pdbs, nr_alphafolds = await async_retrieve_structures(session_dir)
download_path, nr_pdbs, nr_alphafolds
Downloading PDBe mmCIF files: 100%|██████████| 100/100 [00:00<00:00, 7187.20it/s] Fetching Alphafold summaries: 100%|██████████| 100/100 [00:00<00:00, 2810.05it/s] Downloading AlphaFold files: 100%|██████████| 100/100 [00:00<00:00, 514.11it/s]
(PosixPath('session1/downloads'), 100, 100)
Filter structures¶
Prepare structures for powerfitting by filtering them based on confidence and number of residues.
from protein_detective.filter import ConfidenceFilterQuery, FilterOptions, SecondaryStructureFilterQuery
from protein_detective.workflow import filter_structures
# Filter settings: a confidence threshold plus lower and upper bounds on the
# number of residues.
# NOTE(review): `confidence=70` is presumably a pLDDT cutoff applied to the
# AlphaFold models - confirm against the ConfidenceFilterQuery documentation.
options = FilterOptions(
confidence=ConfidenceFilterQuery(confidence=70, min_residues=100, max_residues=1000),
secondary_structure=SecondaryStructureFilterQuery(),
)
filtered_dir, filtered_results = filter_structures(session_dir, options)
# Show the first two and the last two results.
pprint(filtered_results[:2])
pprint(filtered_results[-2:])
filtered_dir
0%| | 0/100 [00:00<?, ?file/s]
[ FilteredStructure( uniprot_accession='A0A087WUV0', pdb_id=None, confidence=ConfidenceFilterResult( input_file='AF-A0A087WUV0-F1-model_v6.cif.gz', count=282, filtered_file=PosixPath('filtered/AF-A0A087WUV0-F1-model_v6.cif.gz') ), chain=None, residue=None, secondary_structure=None ), FilteredStructure( uniprot_accession='A0A0C5B5G6', pdb_id=None, confidence=ConfidenceFilterResult( input_file='AF-A0A0C5B5G6-F1-model_v6.cif.gz', count=10, filtered_file=None ), chain=None, residue=None, secondary_structure=None ) ]
[ FilteredStructure( uniprot_accession='O00571', pdb_id='4O2F', confidence=None, chain=ChainFilterStatistics( input_file=PosixPath('session1/downloads/pdbe/4o2f.cif.gz'), chain_id='C', passed=True, output_file=PosixPath('pdb_chain_filtered/4o2f_C2A.cif.gz'), discard_reason=None ), residue=ResidueFilterStatistics( input_file=PosixPath('session1/pdb_chain_filtered/4o2f_C2A.cif.gz'), residue_count=8, passed=False, output_file=None ), secondary_structure=None ), FilteredStructure( uniprot_accession='O00482', pdb_id='4IS8', confidence=None, chain=ChainFilterStatistics( input_file=PosixPath('session1/downloads/pdbe/4is8.cif.gz'), chain_id='A', passed=True, output_file=PosixPath('pdb_chain_filtered/4is8_A2A.cif.gz'), discard_reason=None ), residue=ResidueFilterStatistics( input_file=PosixPath('session1/pdb_chain_filtered/4is8_A2A.cif.gz'), residue_count=230, passed=True, output_file=PosixPath('filtered/4is8_A2A.cif') ), secondary_structure=None ) ]
PosixPath('session1/filtered')
# Tally how many filter results passed and how many were discarded.
total_nr = len(filtered_results)
total_nr_passed = len([result for result in filtered_results if result.passed])
total_nr_discarded = total_nr - total_nr_passed
pprint(f"Total entries: {total_nr}, passed: {total_nr_passed}, discarded: {total_nr_discarded}")
Total entries: 200, passed: 132, discarded: 68
Powerfit¶
from protein_detective.powerfit.options import PowerfitOptions
from protein_detective.powerfit.workflow import powerfit_commands
To run this, you must have cloned the https://github.com/haddocking/powerfit-tutorial repository into '../../powerfit-tutorial'.
# NOTE: this rebinds `options`, which held the FilterOptions used for filtering
# earlier in the notebook; from here on it is the powerfit configuration.
options = PowerfitOptions(
target=Path("../../powerfit-tutorial/ribosome-KsgA.map"), # density map from the powerfit-tutorial repository
resolution=13, # reported by powerfit as "Target resolution"
angle=20, # reported by powerfit as "Requested rotational sampling density"
nproc=6, # reported by powerfit as "Requested number of processors"
)
Run¶
Here we use the printed commands to run powerfit; see powerfit.ipynb for running powerfit using the API.
# Build the powerfit shell commands for this session and get the id of this
# powerfit run (it also appears in the `powerfit/<id>/` paths of the commands).
commands, powerfit_run_id = powerfit_commands(session_dir, options)
powerfit_run_id
2
# Shorten the commands for display by replacing the absolute working
# directory with ".".
cwd = str(Path.cwd())
rel_commands = [command.replace(cwd, ".") for command in commands]
# Show only the first ten commands.
pprint(rel_commands[:10])
[ 'powerfit ./session1/powerfit/2/ribosome-KsgA.map 13 ./session1/filtered/AF-A0A087WUV0-F1-model_v6.cif.gz --resampling-rate 2 --num 0 --nproc 6 --directory ./session1/powerfit/2/AF-A0A087WUV0-F1-model_v6.cif --delimiter , --angle 20', 'powerfit ./session1/powerfit/2/ribosome-KsgA.map 13 ./session1/filtered/AF-A0A0U1RQI7-F1-model_v6.cif.gz --resampling-rate 2 --num 0 --nproc 6 --directory ./session1/powerfit/2/AF-A0A0U1RQI7-F1-model_v6.cif --delimiter , --angle 20', 'powerfit ./session1/powerfit/2/ribosome-KsgA.map 13 ./session1/filtered/AF-A0A1B0GTS1-F1-model_v6.cif.gz --resampling-rate 2 --num 0 --nproc 6 --directory ./session1/powerfit/2/AF-A0A1B0GTS1-F1-model_v6.cif --delimiter , --angle 20', 'powerfit ./session1/powerfit/2/ribosome-KsgA.map 13 ./session1/filtered/AF-A0A1B0GWH4-F1-model_v6.cif.gz --resampling-rate 2 --num 0 --nproc 6 --directory ./session1/powerfit/2/AF-A0A1B0GWH4-F1-model_v6.cif --delimiter , --angle 20', 'powerfit ./session1/powerfit/2/ribosome-KsgA.map 13 ./session1/filtered/AF-A0A1W2PPF3-F1-model_v6.cif.gz --resampling-rate 2 --num 0 --nproc 6 --directory ./session1/powerfit/2/AF-A0A1W2PPF3-F1-model_v6.cif --delimiter , --angle 20', 'powerfit ./session1/powerfit/2/ribosome-KsgA.map 13 ./session1/filtered/AF-A0A1W2PQL4-F1-model_v6.cif.gz --resampling-rate 2 --num 0 --nproc 6 --directory ./session1/powerfit/2/AF-A0A1W2PQL4-F1-model_v6.cif --delimiter , --angle 20', 'powerfit ./session1/powerfit/2/ribosome-KsgA.map 13 ./session1/filtered/AF-A1YPR0-F1-model_v6.cif.gz --resampling-rate 2 --num 0 --nproc 6 --directory ./session1/powerfit/2/AF-A1YPR0-F1-model_v6.cif --delimiter , --angle 20', 'powerfit ./session1/powerfit/2/ribosome-KsgA.map 13 ./session1/filtered/AF-A2RRD8-F1-model_v6.cif.gz --resampling-rate 2 --num 0 --nproc 6 --directory ./session1/powerfit/2/AF-A2RRD8-F1-model_v6.cif --delimiter , --angle 20', 'powerfit ./session1/powerfit/2/ribosome-KsgA.map 13 ./session1/filtered/AF-A3KN83-F1-model_v6.cif.gz --resampling-rate 2 --num 0 
--nproc 6 --directory ./session1/powerfit/2/AF-A3KN83-F1-model_v6.cif --delimiter , --angle 20', 'powerfit ./session1/powerfit/2/ribosome-KsgA.map 13 ./session1/filtered/AF-A6NDX5-F1-model_v6.cif.gz --resampling-rate 2 --num 0 --nproc 6 --directory ./session1/powerfit/2/AF-A6NDX5-F1-model_v6.cif --delimiter , --angle 20' ]
These commands should be run on a cluster. Here we will just run a couple of them to show how it works.
# `!` hands the command string to the system shell; index 6 is the command for
# the AF-A1YPR0 model.
!{rel_commands[6]}
Target file read from: /home/verhoes/git/protein-detective/protein-detective/docs/session1/powerfit/2/r ibosome-KsgA.map Target resolution: 13.00 Initial shape of density: 128 128 128 Shape after trimming: 60 73 67 Shape after extending: 60 75 70 Template file read from: /home/verhoes/git/protein-detective/protein-detective/docs/session1/filtered/AF- A1YPR0-F1-model_v6.cif.gz Calculating core-weighted mask. Reading in rotations. Requested rotational sampling density: 20.00 Real rotational sampling density: 20.83 Requested number of processors: 6 Starting search Processing rotations 100% ━━━━━━━━━━╸ 647/648 [ 0:00:02 < 0:00:01 , 248 rot/s ]250 rot/s ] Time for search: 2.924 s Analyzing results Writing solutions to file. Writing PDBs to file. Total time: 0m 3s
# Run the first command (AF-A0A087WUV0 model) in the system shell.
!{rel_commands[0]}
Target file read from: /home/verhoes/git/protein-detective/protein-detective/docs/session1/powerfit/2/r ibosome-KsgA.map Target resolution: 13.00 Initial shape of density: 128 128 128 Shape after trimming: 60 73 67 Shape after extending: 60 75 70 Template file read from: /home/verhoes/git/protein-detective/protein-detective/docs/session1/filtered/AF- A0A087WUV0-F1-model_v6.cif.gz Calculating core-weighted mask. Reading in rotations. Requested rotational sampling density: 20.00 Real rotational sampling density: 20.83 Requested number of processors: 6 Starting search Processing rotations 100% ━━━━━━━━━━╸ 647/648 [ 0:00:02 < 0:00:01 , 230 rot/s ]237 rot/s ] Time for search: 3.120 s Analyzing results Writing solutions to file. Writing PDBs to file. Total time: 0m 3s
# Run the last command in the system shell; per the output below it fits the
# 4is8_A2A structure.
!{rel_commands[-1]}
Target file read from: /home/verhoes/git/protein-detective/protein-detective/docs/session1/powerfit/2/r ibosome-KsgA.map Target resolution: 13.00 Initial shape of density: 128 128 128 Shape after trimming: 60 73 67 Shape after extending: 60 75 70 Template file read from: /home/verhoes/git/protein-detective/protein-detective/docs/session1/filtered/4is 8_A2A.cif Calculating core-weighted mask. Reading in rotations. Requested rotational sampling density: 20.00 Real rotational sampling density: 20.83 Requested number of processors: 6 Starting search Processing rotations 100% ━━━━━━━━━━━ 648/648 [ 0:00:03 < 0:00:00 , 225 rot/s ], 230 rot/s ]/s ]s ] Time for search: 3.201 s Analyzing results Writing solutions to file. Writing PDBs to file. Total time: 0m 3s
See powerfit.ipynb for running all powerfits using the API.
Report¶
Once all powerfit jobs are done, the results can be parsed and reported.
from protein_detective.powerfit.workflow import powerfit_report
# Collect the powerfit solutions stored in the session directory.
# NOTE: the resulting table contains rows for powerfit_run_id 1 and 2, so
# solutions from earlier runs in the same session are included as well.
solutions = powerfit_report(session_dir)
len(solutions)
5160
# Display the solutions; the notebook shows only the first and last rows of the table.
solutions
| | powerfit_run_id | structure | rank | cc | fishz | relz | translation | rotation | pdb_file | uniprot_acc | pdb_id |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | AF-A1YPR0-F1-model_v6.cif | 1 | 0.405 | 0.429 | 16.625000 | [211.83, 208.76, 208.76] | [0.0, 0.0, -1.0, -1.0, 0.0, 0.0, 0.0, 1.0, 0.0] | session1/filtered/AF-A1YPR0-F1-model_v6.cif.gz | A1YPR0 | None |
| 1 | 2 | AF-A1YPR0-F1-model_v6.cif | 1 | 0.405 | 0.429 | 16.625000 | [211.83, 208.76, 208.76] | [0.0, 0.0, -1.0, -1.0, 0.0, 0.0, 0.0, 1.0, 0.0] | session1/filtered/AF-A1YPR0-F1-model_v6.cif.gz | A1YPR0 | None |
| 2 | 2 | AF-A0A087WUV0-F1-model_v6.cif | 1 | 0.391 | 0.413 | 17.674999 | [227.18, 260.95, 184.2] | [-0.604, 0.797, 0.0, 0.0, 0.0, 1.0, 0.797, 0.6... | session1/filtered/AF-A0A087WUV0-F1-model_v6.ci... | A0A087WUV0 | None |
| 3 | 2 | AF-A0A087WUV0-F1-model_v6.cif | 2 | 0.390 | 0.412 | 17.653000 | [156.57, 144.29, 214.9] | [0.604, 0.0, 0.797, 0.797, 0.0, -0.604, 0.0, 1... | session1/filtered/AF-A0A087WUV0-F1-model_v6.ci... | A0A087WUV0 | None |
| 4 | 2 | AF-A0A087WUV0-F1-model_v6.cif | 3 | 0.389 | 0.410 | 17.566999 | [267.09, 245.6, 174.99] | [-0.548, 0.184, -0.816, 0.632, -0.548, -0.548,... | session1/filtered/AF-A0A087WUV0-F1-model_v6.ci... | A0A087WUV0 | None |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5155 | 2 | 4is8_A2A | 1327 | 0.180 | 0.182 | 7.144000 | [156.57, 168.85, 132.01] | [-0.184, -0.816, 0.548, 0.548, -0.548, -0.632,... | session1/filtered/4is8_A2A.cif | O00482 | 4IS8 |
| 5156 | 2 | 4is8_A2A | 1326 | 0.180 | 0.182 | 7.148000 | [251.74, 171.92, 227.18] | [0.797, 0.604, 0.0, 0.0, -0.0, 1.0, 0.604, -0.... | session1/filtered/4is8_A2A.cif | O00482 | 4IS8 |
| 5157 | 2 | 4is8_A2A | 1325 | 0.180 | 0.182 | 7.150000 | [260.95, 264.02, 113.59] | [-0.184, -0.548, -0.816, -0.816, 0.548, -0.184... | session1/filtered/4is8_A2A.cif | O00482 | 4IS8 |
| 5158 | 2 | 4is8_A2A | 1324 | 0.180 | 0.182 | 7.153000 | [125.87, 168.85, 132.01] | [0.548, -0.632, 0.548, 0.184, -0.548, -0.816, ... | session1/filtered/4is8_A2A.cif | O00482 | 4IS8 |
| 5159 | 2 | 4is8_A2A | 1323 | 0.180 | 0.182 | 7.155000 | [205.69, 174.99, 214.9] | [0.548, 0.548, -0.632, -0.816, 0.184, -0.548, ... | session1/filtered/4is8_A2A.cif | O00482 | 4IS8 |
5160 rows × 11 columns
Fit model to solution¶
Rotate/translate the input model PDB files to the top 5 powerfit solutions.
from protein_detective.powerfit.workflow import powerfit_fit_models
# Write fitted model files for the 5 best solutions (`top=5`).
# NOTE: per the table below, the top 5 is taken across the runs in the session
# (powerfit_run_id 1 and 2 both appear), not per run.
fitted = powerfit_fit_models(session_dir, top=5)
fitted
Writing fitted model PDB files: 100%|██████████| 5/5 [00:00<00:00, 84.64it/s]
| index | powerfit_run_id | structure | rank | fitted_model_file | unfitted_model_file |
|---|---|---|---|---|---|
| 0 | 2 | AF-A1YPR0-F1-model_v6.cif | 1 | session1/powerfit/2/AF-A1YPR0-F1-model_v6.cif/... | session1/filtered/AF-A1YPR0-F1-model_v6.cif.gz |
| 1 | 1 | AF-A1YPR0-F1-model_v6.cif | 1 | session1/powerfit/1/AF-A1YPR0-F1-model_v6.cif/... | session1/filtered/AF-A1YPR0-F1-model_v6.cif.gz |
| 2 | 2 | AF-A0A087WUV0-F1-model_v6.cif | 1 | session1/powerfit/2/AF-A0A087WUV0-F1-model_v6.... | session1/filtered/AF-A0A087WUV0-F1-model_v6.ci... |
| 3 | 2 | AF-A0A087WUV0-F1-model_v6.cif | 2 | session1/powerfit/2/AF-A0A087WUV0-F1-model_v6.... | session1/filtered/AF-A0A087WUV0-F1-model_v6.ci... |
| 4 | 2 | AF-A0A087WUV0-F1-model_v6.cif | 3 | session1/powerfit/2/AF-A0A087WUV0-F1-model_v6.... | session1/filtered/AF-A0A087WUV0-F1-model_v6.ci... |