Run workflow using API¶

In [1]:

Copied!

from pathlib import Path

from rich import print as pprint
from pathlib import Path

from rich import print as pprint

In [2]:

Copied!





import logging

# Root logger threshold for this notebook. Pick the level by how much detail you want:
#   WARNING - only warnings (and more severe messages)
#   INFO    - additionally the SPARQL queries
#   DEBUG   - additionally the raw results
logging.basicConfig(level=logging.WARNING)

In [3]:

Copied!

# Silence dask logging
!dask config set logging.distributed warning
# Silence dask logging
!dask config set logging.distributed warning

Updated [logging.distributed] to [warning], config saved to /home/verhoes/.config/dask/dask.yaml

Setup session directory¶

This directory stores files, such as structure files, along with a DuckDB database that holds the metadata.

In [4]:

Copied!

# Directory for this session; it is passed to every workflow call in this notebook.
session_dir = Path("session1")
session_dir

Out[4]:

PosixPath('session1')

Search Uniprot for structures¶

In [5]:

Copied!

from protein_detective.workflow import UniprotQuery, search_structures_in_uniprot
from protein_detective.workflow import UniprotQuery, search_structures_in_uniprot

In [6]:

Copied!





# Search criteria for Uniprot; the GO terms are annotated inline.
query = UniprotQuery(
    taxon_id="9606",  # presumably NCBI taxon 9606 (Homo sapiens) - confirm
    reviewed=True,
    subcellular_location_uniprot="nucleus",
    subcellular_location_go=["GO:0005634"],  # Cellular component - Nucleus
    molecular_function_go=["GO:0003677"],  # Molecular function - DNA binding
)

In [7]:

Copied!

# limit=100 caps each search; a warning is logged whenever a search hits that cap.
search_result = search_structures_in_uniprot(query, session_dir, limit=100)
search_result

WARNING:protein_quest.uniprot:Search for uniprot accessions returned 100 results. There may be more results available, but they are not returned due to the limit of 100. Consider increasing the limit to get more results.
WARNING:protein_quest.uniprot:Search for pdbs on uniprot returned 100 results. There may be more results available, but they are not returned due to the limit of 100. Consider increasing the limit to get more results.
WARNING:protein_quest.uniprot:Search for alphafold entries on uniprot returned 100 results. There may be more results available, but they are not returned due to the limit of 100. Consider increasing the limit to get more results.

Out[7]:

UniprotSearchResult(nr_uniprot_accessions=100, nr_pdbs=100, nr_prot2pdb=100, nr_afs=100, nr_interaction_partners=0)

Use database queries to see what was found.

Fetch structures from PDBe and Alphafold found in Uniprot¶

For AlphaFold, only the mmCIF files are downloaded by default; here we also download the summary JSON files to get more metadata.

In [8]:

Copied!

from protein_detective.workflow import async_retrieve_structures
from protein_detective.workflow import async_retrieve_structures

In [ ]:

Copied!

download_path, nr_pdbs, nr_alphafolds = await async_retrieve_structures(session_dir, what_af_formats={"summary", "cif"})
download_path, nr_pdbs, nr_alphafolds
download_path, nr_pdbs, nr_alphafolds = await async_retrieve_structures(session_dir, what_af_formats={"summary", "cif"})
download_path, nr_pdbs, nr_alphafolds

Downloading PDBe mmCIF files: 100%|██████████| 100/100 [00:00<00:00, 7187.20it/s]
Fetching Alphafold summaries: 100%|██████████| 100/100 [00:00<00:00, 2810.05it/s]
Downloading AlphaFold files: 100%|██████████| 100/100 [00:00<00:00, 514.11it/s]

Out[ ]:

(PosixPath('session1/downloads'), 100, 100)

Filter structures¶

Prepare structures for powerfitting by filtering them based on confidence and number of residues.

In [10]:

Copied!

from protein_detective.filter import ConfidenceFilterQuery, FilterOptions, SecondaryStructureFilterQuery
from protein_detective.workflow import filter_structures
from protein_detective.filter import ConfidenceFilterQuery, FilterOptions, SecondaryStructureFilterQuery
from protein_detective.workflow import filter_structures

In [11]:

Copied!





# Filter settings used by filter_structures().
# NOTE(review): presumably `confidence` is the per-residue confidence threshold and
# min_residues/max_residues bound the number of residues that must remain - confirm
# against ConfidenceFilterQuery.
options = FilterOptions(
    confidence=ConfidenceFilterQuery(confidence=70, min_residues=100, max_residues=1000),
    # Secondary structure filter left at its defaults.
    secondary_structure=SecondaryStructureFilterQuery(),
)

In [12]:

Copied!





# Run the filters once, then show the first two and last two results as a sample.
filtered_dir, filtered_results = filter_structures(session_dir, options)
pprint(filtered_results[:2])
pprint(filtered_results[-2:])
filtered_dir

  0%|          | 0/100 [00:00<?, ?file/s]

[
    FilteredStructure(
        uniprot_accession='A0A087WUV0',
        pdb_id=None,
        confidence=ConfidenceFilterResult(
            input_file='AF-A0A087WUV0-F1-model_v6.cif.gz',
            count=282,
            filtered_file=PosixPath('filtered/AF-A0A087WUV0-F1-model_v6.cif.gz')
        ),
        chain=None,
        residue=None,
        secondary_structure=None
    ),
    FilteredStructure(
        uniprot_accession='A0A0C5B5G6',
        pdb_id=None,
        confidence=ConfidenceFilterResult(
            input_file='AF-A0A0C5B5G6-F1-model_v6.cif.gz',
            count=10,
            filtered_file=None
        ),
        chain=None,
        residue=None,
        secondary_structure=None
    )
]

[
    FilteredStructure(
        uniprot_accession='O00571',
        pdb_id='4O2F',
        confidence=None,
        chain=ChainFilterStatistics(
            input_file=PosixPath('session1/downloads/pdbe/4o2f.cif.gz'),
            chain_id='C',
            passed=True,
            output_file=PosixPath('pdb_chain_filtered/4o2f_C2A.cif.gz'),
            discard_reason=None
        ),
        residue=ResidueFilterStatistics(
            input_file=PosixPath('session1/pdb_chain_filtered/4o2f_C2A.cif.gz'),
            residue_count=8,
            passed=False,
            output_file=None
        ),
        secondary_structure=None
    ),
    FilteredStructure(
        uniprot_accession='O00482',
        pdb_id='4IS8',
        confidence=None,
        chain=ChainFilterStatistics(
            input_file=PosixPath('session1/downloads/pdbe/4is8.cif.gz'),
            chain_id='A',
            passed=True,
            output_file=PosixPath('pdb_chain_filtered/4is8_A2A.cif.gz'),
            discard_reason=None
        ),
        residue=ResidueFilterStatistics(
            input_file=PosixPath('session1/pdb_chain_filtered/4is8_A2A.cif.gz'),
            residue_count=230,
            passed=True,
            output_file=PosixPath('filtered/4is8_A2A.cif')
        ),
        secondary_structure=None
    )
]

Out[12]:

PosixPath('session1/filtered')

In [13]:

Copied!





# Summarise the filter outcome: every result counts as either passed or discarded.
total_nr = len(filtered_results)
total_nr_passed = sum(1 for r in filtered_results if r.passed)
total_nr_discarded = total_nr - total_nr_passed
pprint(f"Total entries: {total_nr}, passed: {total_nr_passed}, discarded: {total_nr_discarded}")

Total entries: 200, passed: 132, discarded: 68

Powerfit¶

In [14]:

Copied!

from protein_detective.powerfit.options import PowerfitOptions
from protein_detective.powerfit.workflow import powerfit_commands
from protein_detective.powerfit.options import PowerfitOptions
from protein_detective.powerfit.workflow import powerfit_commands

To run this section, you must first clone the https://github.com/haddocking/powerfit-tutorial repository into '../../powerfit-tutorial'.

In [20]:

Copied!





# Powerfit settings: the target density map plus resolution, angle and nproc.
# NOTE: this rebinds `options`, which earlier held the FilterOptions instance.
# Re-run the FilterOptions cell before re-running filter_structures() after this cell.
options = PowerfitOptions(
    target=Path("../../powerfit-tutorial/ribosome-KsgA.map"),
    resolution=13,
    angle=20,
    nproc=6,
)

Run¶

Here we run powerfit using printed commands; see powerfit.ipynb for running powerfit through the API.

In [21]:

Copied!

# Build the powerfit command lines for this session; the call also returns the id
# of the powerfit run. Call it once per intended run.
commands, powerfit_run_id = powerfit_commands(session_dir, options)

In [22]:

Copied!

# Display the id of the powerfit run that was just prepared.
powerfit_run_id

Out[22]:

In [23]:

Copied!

# Replace the absolute working directory with "." so the commands are shorter
# and can be run relative to the current directory; show the first ten.
cwd = str(Path.cwd())
rel_commands = [c.replace(cwd, ".") for c in commands]
pprint(rel_commands[:10])

[
    'powerfit ./session1/powerfit/2/ribosome-KsgA.map 13 ./session1/filtered/AF-A0A087WUV0-F1-model_v6.cif.gz 
--resampling-rate 2 --num 0 --nproc 6 --directory ./session1/powerfit/2/AF-A0A087WUV0-F1-model_v6.cif --delimiter ,
--angle 20',
    'powerfit ./session1/powerfit/2/ribosome-KsgA.map 13 ./session1/filtered/AF-A0A0U1RQI7-F1-model_v6.cif.gz 
--resampling-rate 2 --num 0 --nproc 6 --directory ./session1/powerfit/2/AF-A0A0U1RQI7-F1-model_v6.cif --delimiter ,
--angle 20',
    'powerfit ./session1/powerfit/2/ribosome-KsgA.map 13 ./session1/filtered/AF-A0A1B0GTS1-F1-model_v6.cif.gz 
--resampling-rate 2 --num 0 --nproc 6 --directory ./session1/powerfit/2/AF-A0A1B0GTS1-F1-model_v6.cif --delimiter ,
--angle 20',
    'powerfit ./session1/powerfit/2/ribosome-KsgA.map 13 ./session1/filtered/AF-A0A1B0GWH4-F1-model_v6.cif.gz 
--resampling-rate 2 --num 0 --nproc 6 --directory ./session1/powerfit/2/AF-A0A1B0GWH4-F1-model_v6.cif --delimiter ,
--angle 20',
    'powerfit ./session1/powerfit/2/ribosome-KsgA.map 13 ./session1/filtered/AF-A0A1W2PPF3-F1-model_v6.cif.gz 
--resampling-rate 2 --num 0 --nproc 6 --directory ./session1/powerfit/2/AF-A0A1W2PPF3-F1-model_v6.cif --delimiter ,
--angle 20',
    'powerfit ./session1/powerfit/2/ribosome-KsgA.map 13 ./session1/filtered/AF-A0A1W2PQL4-F1-model_v6.cif.gz 
--resampling-rate 2 --num 0 --nproc 6 --directory ./session1/powerfit/2/AF-A0A1W2PQL4-F1-model_v6.cif --delimiter ,
--angle 20',
    'powerfit ./session1/powerfit/2/ribosome-KsgA.map 13 ./session1/filtered/AF-A1YPR0-F1-model_v6.cif.gz 
--resampling-rate 2 --num 0 --nproc 6 --directory ./session1/powerfit/2/AF-A1YPR0-F1-model_v6.cif --delimiter , 
--angle 20',
    'powerfit ./session1/powerfit/2/ribosome-KsgA.map 13 ./session1/filtered/AF-A2RRD8-F1-model_v6.cif.gz 
--resampling-rate 2 --num 0 --nproc 6 --directory ./session1/powerfit/2/AF-A2RRD8-F1-model_v6.cif --delimiter , 
--angle 20',
    'powerfit ./session1/powerfit/2/ribosome-KsgA.map 13 ./session1/filtered/AF-A3KN83-F1-model_v6.cif.gz 
--resampling-rate 2 --num 0 --nproc 6 --directory ./session1/powerfit/2/AF-A3KN83-F1-model_v6.cif --delimiter , 
--angle 20',
    'powerfit ./session1/powerfit/2/ribosome-KsgA.map 13 ./session1/filtered/AF-A6NDX5-F1-model_v6.cif.gz 
--resampling-rate 2 --num 0 --nproc 6 --directory ./session1/powerfit/2/AF-A6NDX5-F1-model_v6.cif --delimiter , 
--angle 20'
]

These commands should be run on a cluster. Here we will just run a couple of them to show how it works.

In [24]:

Copied!

!{rel_commands[6]}
!{rel_commands[6]}

Target file read from:                                                          
/home/verhoes/git/protein-detective/protein-detective/docs/session1/powerfit/2/r
ibosome-KsgA.map                                                                
Target resolution: 13.00                                                        
Initial shape of density: 128 128 128                                           
Shape after trimming: 60 73 67                                                  
Shape after extending: 60 75 70                                                 
Template file read from:                                                        
/home/verhoes/git/protein-detective/protein-detective/docs/session1/filtered/AF-
A1YPR0-F1-model_v6.cif.gz                                                       
Calculating core-weighted mask.                                                 
Reading in rotations.                                                           
Requested rotational sampling density: 20.00                                    
Real rotational sampling density: 20.83                                         
Requested number of processors: 6                                               
Starting search                                                                 
Processing rotations 100% ━━━━━━━━━━╸ 647/648  [ 0:00:02 < 0:00:01 , 248 rot/s ]250 rot/s ]
Time for search: 2.924 s                                                        
Analyzing results                                                               
Writing solutions to file.                                                      
Writing PDBs to file.                                                           
Total time: 0m 3s

In [25]:

Copied!

!{rel_commands[0]}
!{rel_commands[0]}

Target file read from:                                                          
/home/verhoes/git/protein-detective/protein-detective/docs/session1/powerfit/2/r
ibosome-KsgA.map                                                                
Target resolution: 13.00                                                        
Initial shape of density: 128 128 128                                           
Shape after trimming: 60 73 67                                                  
Shape after extending: 60 75 70                                                 
Template file read from:                                                        
/home/verhoes/git/protein-detective/protein-detective/docs/session1/filtered/AF-
A0A087WUV0-F1-model_v6.cif.gz                                                   
Calculating core-weighted mask.                                                 
Reading in rotations.                                                           
Requested rotational sampling density: 20.00                                    
Real rotational sampling density: 20.83                                         
Requested number of processors: 6                                               
Starting search                                                                 
Processing rotations 100% ━━━━━━━━━━╸ 647/648  [ 0:00:02 < 0:00:01 , 230 rot/s ]237 rot/s ]
Time for search: 3.120 s                                                        
Analyzing results                                                               
Writing solutions to file.                                                      
Writing PDBs to file.                                                           
Total time: 0m 3s

In [26]:

Copied!

!{rel_commands[-1]}
!{rel_commands[-1]}

Target file read from:                                                          
/home/verhoes/git/protein-detective/protein-detective/docs/session1/powerfit/2/r
ibosome-KsgA.map                                                                
Target resolution: 13.00                                                        
Initial shape of density: 128 128 128                                           
Shape after trimming: 60 73 67                                                  
Shape after extending: 60 75 70                                                 
Template file read from:                                                        
/home/verhoes/git/protein-detective/protein-detective/docs/session1/filtered/4is
8_A2A.cif                                                                       
Calculating core-weighted mask.                                                 
Reading in rotations.                                                           
Requested rotational sampling density: 20.00                                    
Real rotational sampling density: 20.83                                         
Requested number of processors: 6                                               
Starting search                                                                 
Processing rotations 100% ━━━━━━━━━━━ 648/648  [ 0:00:03 < 0:00:00 , 225 rot/s ], 230 rot/s ]/s ]s ]
Time for search: 3.201 s                                                        
Analyzing results                                                               
Writing solutions to file.                                                      
Writing PDBs to file.                                                           
Total time: 0m 3s

See powerfit.ipynb for running all powerfits using the API.

Report¶

Once all powerfit jobs are done, the results can be parsed and reported.

In [27]:

Copied!

from protein_detective.powerfit.workflow import powerfit_report
from protein_detective.powerfit.workflow import powerfit_report

In [28]:

Copied!

# Parse the powerfit results found in the session directory into a table of solutions.
solutions = powerfit_report(session_dir)

In [29]:

Copied!

# Number of solutions (rows) in the report.
len(solutions)

Out[29]:

In [30]:

Copied!

# Display the solutions table (the rendered output is truncated to its head and tail).
solutions

Out[30]:

	powerfit_run_id	structure	rank	cc	fishz	relz	translation	rotation	pdb_file	uniprot_acc	pdb_id
0	1	AF-A1YPR0-F1-model_v6.cif	1	0.405	0.429	16.625000	[211.83, 208.76, 208.76]	[0.0, 0.0, -1.0, -1.0, 0.0, 0.0, 0.0, 1.0, 0.0]	session1/filtered/AF-A1YPR0-F1-model_v6.cif.gz	A1YPR0	None
1	2	AF-A1YPR0-F1-model_v6.cif	1	0.405	0.429	16.625000	[211.83, 208.76, 208.76]	[0.0, 0.0, -1.0, -1.0, 0.0, 0.0, 0.0, 1.0, 0.0]	session1/filtered/AF-A1YPR0-F1-model_v6.cif.gz	A1YPR0	None
2	2	AF-A0A087WUV0-F1-model_v6.cif	1	0.391	0.413	17.674999	[227.18, 260.95, 184.2]	[-0.604, 0.797, 0.0, 0.0, 0.0, 1.0, 0.797, 0.6...	session1/filtered/AF-A0A087WUV0-F1-model_v6.ci...	A0A087WUV0	None
3	2	AF-A0A087WUV0-F1-model_v6.cif	2	0.390	0.412	17.653000	[156.57, 144.29, 214.9]	[0.604, 0.0, 0.797, 0.797, 0.0, -0.604, 0.0, 1...	session1/filtered/AF-A0A087WUV0-F1-model_v6.ci...	A0A087WUV0	None
4	2	AF-A0A087WUV0-F1-model_v6.cif	3	0.389	0.410	17.566999	[267.09, 245.6, 174.99]	[-0.548, 0.184, -0.816, 0.632, -0.548, -0.548,...	session1/filtered/AF-A0A087WUV0-F1-model_v6.ci...	A0A087WUV0	None
...	...	...	...	...	...	...	...	...	...	...	...
5155	2	4is8_A2A	1327	0.180	0.182	7.144000	[156.57, 168.85, 132.01]	[-0.184, -0.816, 0.548, 0.548, -0.548, -0.632,...	session1/filtered/4is8_A2A.cif	O00482	4IS8
5156	2	4is8_A2A	1326	0.180	0.182	7.148000	[251.74, 171.92, 227.18]	[0.797, 0.604, 0.0, 0.0, -0.0, 1.0, 0.604, -0....	session1/filtered/4is8_A2A.cif	O00482	4IS8
5157	2	4is8_A2A	1325	0.180	0.182	7.150000	[260.95, 264.02, 113.59]	[-0.184, -0.548, -0.816, -0.816, 0.548, -0.184...	session1/filtered/4is8_A2A.cif	O00482	4IS8
5158	2	4is8_A2A	1324	0.180	0.182	7.153000	[125.87, 168.85, 132.01]	[0.548, -0.632, 0.548, 0.184, -0.548, -0.816, ...	session1/filtered/4is8_A2A.cif	O00482	4IS8
5159	2	4is8_A2A	1323	0.180	0.182	7.155000	[205.69, 174.99, 214.9]	[0.548, 0.548, -0.632, -0.816, 0.184, -0.548, ...	session1/filtered/4is8_A2A.cif	O00482	4IS8

5160 rows × 11 columns

Fit model to solution¶

Rotate and translate the input model PDB files so that they match the top 5 powerfit solutions.

In [31]:

Copied!

from protein_detective.powerfit.workflow import powerfit_fit_models
from protein_detective.powerfit.workflow import powerfit_fit_models

In [32]:

Copied!

# Write fitted model files for the top 5 solutions and show which files were produced.
fitted = powerfit_fit_models(session_dir, top=5)
fitted

Writing fitted model PDB files: 100%|██████████| 5/5 [00:00<00:00, 84.64it/s]

Out[32]:

	powerfit_run_id	structure	rank	fitted_model_file	unfitted_model_file
index
0	2	AF-A1YPR0-F1-model_v6.cif	1	session1/powerfit/2/AF-A1YPR0-F1-model_v6.cif/...	session1/filtered/AF-A1YPR0-F1-model_v6.cif.gz
1	1	AF-A1YPR0-F1-model_v6.cif	1	session1/powerfit/1/AF-A1YPR0-F1-model_v6.cif/...	session1/filtered/AF-A1YPR0-F1-model_v6.cif.gz
2	2	AF-A0A087WUV0-F1-model_v6.cif	1	session1/powerfit/2/AF-A0A087WUV0-F1-model_v6....	session1/filtered/AF-A0A087WUV0-F1-model_v6.ci...
3	2	AF-A0A087WUV0-F1-model_v6.cif	2	session1/powerfit/2/AF-A0A087WUV0-F1-model_v6....	session1/filtered/AF-A0A087WUV0-F1-model_v6.ci...
4	2	AF-A0A087WUV0-F1-model_v6.cif	3	session1/powerfit/2/AF-A0A087WUV0-F1-model_v6....	session1/filtered/AF-A0A087WUV0-F1-model_v6.ci...

In [ ]: