AlphaFold¶
You can download AlphaFold files and filter them on confidence.
In [1]:
Copied!
# Generic imports
import logging
from pathlib import Path
from pprint import pprint
logging.basicConfig(level=logging.WARNING)
# The log level passed above controls how much detail is shown:
# Set to WARNING to see only warnings
# Set to INFO to see sparql queries
# Set to DEBUG to see raw results
# NOTE(review): no sparql query is visible in this notebook's cells; confirm the INFO remark applies here.
# Generic imports
import logging
from pathlib import Path
from pprint import pprint
logging.basicConfig(level=logging.WARNING)
# The log level passed above controls how much detail is shown:
# Set to WARNING to see only warnings
# Set to INFO to see sparql queries
# Set to DEBUG to see raw results
# NOTE(review): no sparql query is visible in this notebook's cells; confirm the INFO remark applies here.
Download AlphaFold files¶
In [2]:
Copied!
from protein_quest.alphafold.fetch import fetch_many_async
from protein_quest.alphafold.fetch import fetch_many_async
In [3]:
Copied!
# Directory that the downloaded AlphaFold files are written to
save_dir = Path("alphafold_files")
# Directory that the downloaded AlphaFold files are written to
save_dir = Path("alphafold_files")
Download the summary, the cif file, the predicted aligned error document (paeDoc) and the pdb file for 3 AlphaFold entries, given their UniProt accessions.
In [8]:
Copied!
summaries = [s async for s in fetch_many_async(["A1YPR0", "O60481", "P50613"], save_dir, what={"pdb", "cif", "paeDoc"})]
pprint(summaries)
summaries = [s async for s in fetch_many_async(["A1YPR0", "O60481", "P50613"], save_dir, what={"pdb", "cif", "paeDoc"})]
pprint(summaries)
Fetching Alphafold summaries: 100%|██████████| 3/3 [00:00<00:00, 8.07it/s] Downloading AlphaFold files: 100%|██████████| 9/9 [00:00<00:00, 55.82it/s]
[AlphaFoldEntry(uniprot_acc='A1YPR0', summary=EntrySummary(entryId='AF-A1YPR0-F1', uniprotAccession='A1YPR0', uniprotId='ZBT7C_HUMAN', uniprotDescription='Zinc finger and BTB ' 'domain-containing ' 'protein 7C', taxId=9606, organismScientificName='Homo sapiens', uniprotStart=1, uniprotEnd=619, uniprotSequence='MANDIDELIGIPFPNHSSEVLCSLNEQRHDGLLCDVLLVVQEQEYRTHRSVLAACSKYFKKLFTAGTLASQPYVYEIDFVQPEALAAILEFAYTSTLTITAGNVKHILNAARMLEIQCIVNVCLEIMEPGGDGGEEDDKEDDDDDEDDDDEEDEEEEEEEEEDDDDDTEDFADQENLPDPQDISCHQSPSKTDHLTEKAYSDTPRDFPDSFQAGSPGHLGVIRDFSIESLLRENLYPKANIPDRRPSLSPFAPDFFPHLWPGDFGAFAQLPEQPMDSGPLDLVIKNRKIKEEEKEELPPPPPPPFPNDFFKDMFPDLPGGPLGPIKAENDYGAYLNFLSATHLGGLFPPWPLVEERKLKPKASQQCPICHKVIMGAGKLPRHMRTHTGEKPYMCTICEVRFTRQDKLKIHMRKHTGERPYLCIHCNAKFVHNYDLKNHMRIHTGVRPYQCEFCYKSFTRSDHLHRHIKRQSCRMARPRRGRKPAAWRAASLLFGPGGPAPDKAAFVMPPALGEVGGHLGGAAVCLPGPSPAKHFLAAPKGALSLQELERQFEETQMKLFGRAQLEAERNAGGLLAFALAENVAAARPYFPLPDPWAAGLAGLPGLAGLNHVASMSEANN', modelCreatedDate='2022-06-01T00:00:00Z', latestVersion=4, allVersions=[1, 2, 3, 4], bcifUrl='https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-model_v4.bcif', cifUrl='https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-model_v4.cif', pdbUrl='https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-model_v4.pdb', paeImageUrl='https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-predicted_aligned_error_v4.png', paeDocUrl='https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-predicted_aligned_error_v4.json', gene='ZBTB7C', sequenceChecksum='73D82A34502B55BF', sequenceVersionDate='2007-02-06T00:00:00Z', amAnnotationsUrl='https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-aa-substitutions.csv', amAnnotationsHg19Url='https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-hg19.csv', amAnnotationsHg38Url='https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-hg38.csv', isReviewed=True, isReferenceProteome=True), bcif_file=None, cif_file=PosixPath('alphafold_files/AF-A1YPR0-F1-model_v4.cif'), pdb_file=PosixPath('alphafold_files/AF-A1YPR0-F1-model_v4.pdb'), pae_image_file=None, 
pae_doc_file=PosixPath('alphafold_files/AF-A1YPR0-F1-predicted_aligned_error_v4.json'), am_annotations_file=None, am_annotations_hg19_file=None, am_annotations_hg38_file=None), AlphaFoldEntry(uniprot_acc='O60481', summary=EntrySummary(entryId='AF-O60481-F1', uniprotAccession='O60481', uniprotId='ZIC3_HUMAN', uniprotDescription='Zinc finger protein ' 'ZIC 3', taxId=9606, organismScientificName='Homo sapiens', uniprotStart=1, uniprotEnd=467, uniprotSequence='MTMLLDGGPQFPGLGVGSFGAPRHHEMPNREPAGMGLNPFGDSTHAAAAAAAAAAFKLSPAAAHDLSSGQSSAFTPQGSGYANALGHHHHHHHHHHHTSQVPSYGGAASAAFNSTREFLFRQRSSGLSEAASGGGQHGLFAGSASSLHAPAGIPEPPSYLLFPGLHEQGAGHPSPTGHVDNNQVHLGLRGELFGRADPYRPVASPRTDPYAAGAQFPNYSPMNMNMGVNVAAHHGPGAFFRYMRQPIKQELSCKWIDEAQLSRPKKSCDRTFSTMHELVTHVTMEHVGGPEQNNHVCYWEECPREGKSFKAKYKLVNHIRVHTGEKPFPCPFPGCGKIFARSENLKIHKRTHTGEKPFKCEFEGCDRRFANSSDRKKHMHVHTSDKPYICKVCDKSYTHPSSLRKHMKVHESQGSDSSPAASSGYESSTPPAIASANSKDTTKTPSAVQTSTSHNPGLPPNFNEWYV', modelCreatedDate='2022-06-01T00:00:00Z', latestVersion=4, allVersions=[1, 2, 3, 4], bcifUrl='https://alphafold.ebi.ac.uk/files/AF-O60481-F1-model_v4.bcif', cifUrl='https://alphafold.ebi.ac.uk/files/AF-O60481-F1-model_v4.cif', pdbUrl='https://alphafold.ebi.ac.uk/files/AF-O60481-F1-model_v4.pdb', paeImageUrl='https://alphafold.ebi.ac.uk/files/AF-O60481-F1-predicted_aligned_error_v4.png', paeDocUrl='https://alphafold.ebi.ac.uk/files/AF-O60481-F1-predicted_aligned_error_v4.json', gene='ZIC3', sequenceChecksum='3150CF13C0679568', sequenceVersionDate='1998-08-01T00:00:00Z', amAnnotationsUrl='https://alphafold.ebi.ac.uk/files/AF-O60481-F1-aa-substitutions.csv', amAnnotationsHg19Url='https://alphafold.ebi.ac.uk/files/AF-O60481-F1-hg19.csv', amAnnotationsHg38Url='https://alphafold.ebi.ac.uk/files/AF-O60481-F1-hg38.csv', isReviewed=True, isReferenceProteome=True), bcif_file=None, cif_file=PosixPath('alphafold_files/AF-O60481-F1-model_v4.cif'), pdb_file=PosixPath('alphafold_files/AF-O60481-F1-model_v4.pdb'), pae_image_file=None, 
pae_doc_file=PosixPath('alphafold_files/AF-O60481-F1-predicted_aligned_error_v4.json'), am_annotations_file=None, am_annotations_hg19_file=None, am_annotations_hg38_file=None), AlphaFoldEntry(uniprot_acc='P50613', summary=EntrySummary(entryId='AF-P50613-F1', uniprotAccession='P50613', uniprotId='CDK7_HUMAN', uniprotDescription='Cyclin-dependent ' 'kinase 7', taxId=9606, organismScientificName='Homo sapiens', uniprotStart=1, uniprotEnd=346, uniprotSequence='MALDVKSRAKRYEKLDFLGEGQFATVYKARDKNTNQIVAIKKIKLGHRSEAKDGINRTALREIKLLQELSHPNIIGLLDAFGHKSNISLVFDFMETDLEVIIKDNSLVLTPSHIKAYMLMTLQGLEYLHQHWILHRDLKPNNLLLDENGVLKLADFGLAKSFGSPNRAYTHQVVTRWYRAPELLFGARMYGVGVDMWAVGCILAELLLRVPFLPGDSDLDQLTRIFETLGTPTEEQWPDMCSLPDYVTFKSFPGIPLHHIFSAAGDDLLDLIQGLFLFNPCARITATQALKMKYFSNRPGPTPGCQLPRPNCPVETLKEQSNPALAIKRKRTEALEQGGLPKKLIF', modelCreatedDate='2022-06-01T00:00:00Z', latestVersion=4, allVersions=[1, 2, 3, 4], bcifUrl='https://alphafold.ebi.ac.uk/files/AF-P50613-F1-model_v4.bcif', cifUrl='https://alphafold.ebi.ac.uk/files/AF-P50613-F1-model_v4.cif', pdbUrl='https://alphafold.ebi.ac.uk/files/AF-P50613-F1-model_v4.pdb', paeImageUrl='https://alphafold.ebi.ac.uk/files/AF-P50613-F1-predicted_aligned_error_v4.png', paeDocUrl='https://alphafold.ebi.ac.uk/files/AF-P50613-F1-predicted_aligned_error_v4.json', gene='CDK7', sequenceChecksum='0A94BFA7DD416CEB', sequenceVersionDate='1996-10-01T00:00:00Z', amAnnotationsUrl='https://alphafold.ebi.ac.uk/files/AF-P50613-F1-aa-substitutions.csv', amAnnotationsHg19Url='https://alphafold.ebi.ac.uk/files/AF-P50613-F1-hg19.csv', amAnnotationsHg38Url='https://alphafold.ebi.ac.uk/files/AF-P50613-F1-hg38.csv', isReviewed=True, isReferenceProteome=True), bcif_file=None, cif_file=PosixPath('alphafold_files/AF-P50613-F1-model_v4.cif'), pdb_file=PosixPath('alphafold_files/AF-P50613-F1-model_v4.pdb'), pae_image_file=None, pae_doc_file=PosixPath('alphafold_files/AF-P50613-F1-predicted_aligned_error_v4.json'), am_annotations_file=None, am_annotations_hg19_file=None, 
am_annotations_hg38_file=None)]
In [9]:
Copied!
# List the contents of the download directory together with file sizes
!ls -sh {save_dir}
# List the contents of the download directory together with file sizes
!ls -sh {save_dir}
total 4.2M 4.0K A1YPR0.json 548K AF-A1YPR0-F1-model_v4.cif 392K AF-A1YPR0-F1-model_v4.pdb 1.1M AF-A1YPR0-F1-predicted_aligned_error_v4.json 408K AF-O60481-F1-model_v4.cif 292K AF-O60481-F1-model_v4.pdb 632K AF-O60481-F1-predicted_aligned_error_v4.json 320K AF-P50613-F1-model_v4.cif 224K AF-P50613-F1-model_v4.pdb 280K AF-P50613-F1-predicted_aligned_error_v4.json 4.0K O60481.json 4.0K P50613.json
Filter AlphaFold structure files on confidence¶
Filter AlphaFold mmCIF/PDB files by confidence (pLDDT). Files that pass are written with the residues below the threshold removed.
In [10]:
Copied!
from protein_quest.alphafold.confidence import ConfidenceFilterQuery, filter_files_on_confidence
from protein_quest.alphafold.confidence import ConfidenceFilterQuery, filter_files_on_confidence
Take the cif files that were downloaded
In [12]:
Copied!
# Keep the cif path of every downloaded entry that actually has one
input_files = [
    cif_path
    for cif_path in (summary.cif_file for summary in summaries)
    if cif_path is not None
]
input_files
# Keep the cif path of every downloaded entry that actually has one
input_files = [
    cif_path
    for cif_path in (summary.cif_file for summary in summaries)
    if cif_path is not None
]
input_files
Out[12]:
[PosixPath('alphafold_files/AF-A1YPR0-F1-model_v4.cif'), PosixPath('alphafold_files/AF-O60481-F1-model_v4.cif'), PosixPath('alphafold_files/AF-P50613-F1-model_v4.cif')]
We only write a filtered cif file when the input file has between 100 and 1000 residues with a pLDDT score above 80.
In [ ]:
Copied!
# confidence is the pLDDT threshold; min_residues/max_residues bound how many
# residues must meet it for a filtered file to be written.
# NOTE(review): whether the bounds are inclusive is not visible here; confirm.
query = ConfidenceFilterQuery(confidence=80, min_residues=100, max_residues=1000)
# confidence is the pLDDT threshold; min_residues/max_residues bound how many
# residues must meet it for a filtered file to be written.
# NOTE(review): whether the bounds are inclusive is not visible here; confirm.
query = ConfidenceFilterQuery(confidence=80, min_residues=100, max_residues=1000)
In [14]:
Copied!
# Filtered files are written to ./filtered, which is created if it is missing.
output_dir = Path("./filtered")
output_dir.mkdir(exist_ok=True)
# NOTE(review): `result` is not displayed or consumed in this cell, and a later
# cell wraps the same call in list(); presumably the call returns a lazy
# iterator, in which case no filtering happens until it is consumed. Confirm.
result = filter_files_on_confidence(input_files, query, output_dir)
# Filtered files are written to ./filtered, which is created if it is missing.
output_dir = Path("./filtered")
output_dir.mkdir(exist_ok=True)
# NOTE(review): `result` is not displayed or consumed in this cell, and a later
# cell wraps the same call in list(); presumably the call returns a lazy
# iterator, in which case no filtering happens until it is consumed. Confirm.
result = filter_files_on_confidence(input_files, query, output_dir)
In [ ]:
Copied!
# Run the confidence filter and show one result per input file.
# Reuses `query` from the earlier cell instead of constructing an identical
# ConfidenceFilterQuery inline, so the thresholds are defined in one place.
list(filter_files_on_confidence(input_files, query, output_dir))
# Run the confidence filter and show one result per input file.
# Reuses `query` from the earlier cell instead of constructing an identical
# ConfidenceFilterQuery inline, so the thresholds are defined in one place.
list(filter_files_on_confidence(input_files, query, output_dir))
Out[ ]:
[ConfidenceFilterResult(input_file='AF-A1YPR0-F1-model_v4.cif', count=175, filtered_file=PosixPath('filtered/AF-A1YPR0-F1-model_v4.cif')), ConfidenceFilterResult(input_file='AF-O60481-F1-model_v4.cif', count=76, filtered_file=None), ConfidenceFilterResult(input_file='AF-P50613-F1-model_v4.cif', count=244, filtered_file=PosixPath('filtered/AF-P50613-F1-model_v4.cif'))]
2 files have passed, but 1 file has only 76 high-confidence residues, which is below the minimum of 100, so it is discarded.
In [ ]:
Copied!