AlphaFold¶
You can download and filter AlphaFold files on confidence.
In [1]:
Copied!
# Generic imports
import logging
from pathlib import Path
from pprint import pprint
# Log verbosity for the whole notebook (root logger level); pick one of the levels below.
logging.basicConfig(level=logging.WARNING)
# Set to WARNING to see only warnings
# Set to INFO to see SPARQL queries
# Set to DEBUG to see raw results
# Generic imports
import logging
from pathlib import Path
from pprint import pprint
# Log verbosity for the whole notebook (root logger level); pick one of the levels below.
logging.basicConfig(level=logging.WARNING)
# Set to WARNING to see only warnings
# Set to INFO to see SPARQL queries
# Set to DEBUG to see raw results
Download AlphaFold files¶
In [2]:
Copied!
from protein_quest.alphafold.fetch import fetch_many_async
from protein_quest.alphafold.fetch import fetch_many_async
In [3]:
Copied!
# Directory (relative to the working directory) that is passed to the download call below.
save_dir = Path("alphafold_files")
# Directory (relative to the working directory) that is passed to the download call below.
save_dir = Path("alphafold_files")
Download the summary, the cif file and the predicted aligned error document (paeDoc) file for 3 AlphaFold entries, given their UniProt accessions.
In [4]:
Copied!
summaries = [
s async for s in fetch_many_async(["A1YPR0", "O60481", "P50613"], save_dir, what={"summary", "cif", "paeDoc"})
]
pprint(summaries)
summaries = [
s async for s in fetch_many_async(["A1YPR0", "O60481", "P50613"], save_dir, what={"summary", "cif", "paeDoc"})
]
pprint(summaries)
Fetching Alphafold summaries: 100%|██████████| 3/3 [00:00<00:00, 553.10it/s] Downloading AlphaFold files: 100%|██████████| 6/6 [00:00<00:00, 38245.93it/s]
[AlphaFoldEntry(uniprot_accession='A1YPR0',
summary=EntrySummary(allVersions=[1, 2, 3, 4, 5, 6],
bcifUrl=URL('https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-model_v6.bcif'),
cifUrl=URL('https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-model_v6.cif'),
entityType='protein',
fractionPlddtConfident=0.26,
fractionPlddtLow=0.099,
fractionPlddtVeryHigh=0.089,
fractionPlddtVeryLow=0.553,
globalMetricValue=56.03,
isUniProt=True,
latestVersion=6,
modelCreatedDate='2025-08-01T00:00:00Z',
modelEntityId='AF-A1YPR0-F1',
paeDocUrl=URL('https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-predicted_aligned_error_v6.json'),
pdbUrl=URL('https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-model_v6.pdb'),
providerId='GDM',
sequence='MANDIDELIGIPFPNHSSEVLCSLNEQRHDGLLCDVLLVVQEQEYRTHRSVLAACSKYFKKLFTAGTLASQPYVYEIDFVQPEALAAILEFAYTSTLTITAGNVKHILNAARMLEIQCIVNVCLEIMEPGGDGGEEDDKEDDDDDEDDDDEEDEEEEEEEEEDDDDDTEDFADQENLPDPQDISCHQSPSKTDHLTEKAYSDTPRDFPDSFQAGSPGHLGVIRDFSIESLLRENLYPKANIPDRRPSLSPFAPDFFPHLWPGDFGAFAQLPEQPMDSGPLDLVIKNRKIKEEEKEELPPPPPPPFPNDFFKDMFPDLPGGPLGPIKAENDYGAYLNFLSATHLGGLFPPWPLVEERKLKPKASQQCPICHKVIMGAGKLPRHMRTHTGEKPYMCTICEVRFTRQDKLKIHMRKHTGERPYLCIHCNAKFVHNYDLKNHMRIHTGVRPYQCEFCYKSFTRSDHLHRHIKRQSCRMARPRRGRKPAAWRAASLLFGPGGPAPDKAAFVMPPALGEVGGHLGGAAVCLPGPSPAKHFLAAPKGALSLQELERQFEETQMKLFGRAQLEAERNAGGLLAFALAENVAAARPYFPLPDPWAAGLAGLPGLAGLNHVASMSEANN',
sequenceChecksum='73D82A34502B55BF',
sequenceEnd=619,
sequenceStart=1,
sequenceVersionDate='2007-02-06T00:00:00Z',
toolUsed='AlphaFold Monomer v2.0 pipeline',
alternativeNames=None,
amAnnotationsHg19Url=URL('https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-hg19.csv'),
amAnnotationsHg38Url=URL('https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-hg38.csv'),
amAnnotationsUrl=URL('https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-aa-substitutions.csv'),
catalyticActivities=None,
complexName=None,
functions=None,
gene='ZBTB7C',
geneSynonyms=None,
ipSAE=None,
ipTM=None,
isUniProtReferenceProteome=True,
isUniProtReviewed=True,
keywords=None,
msaUrl=URL('https://alphafold.ebi.ac.uk/files/msa/AF-A1YPR0-F1-msa_v6.a3m'),
organismCommonNames=None,
organismScientificName='Homo sapiens',
organismSynonyms=None,
plddtDocUrl=URL('https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-confidence_v6.json'),
proteinFullNames=None,
proteinShortNames=None,
stoichiometry=None,
taxId=9606,
taxonomyLineage=None,
uniprotAccession='A1YPR0',
uniprotDescription='Zinc finger and BTB '
'domain-containing '
'protein 7C',
uniprotId='ZBT7C_HUMAN'),
summary_file=PosixPath('alphafold_files/A1YPR0.json'),
bcif_file=None,
cif_file=PosixPath('alphafold_files/AF-A1YPR0-F1-model_v6.cif'),
pdb_file=None,
pae_doc_file=PosixPath('alphafold_files/AF-A1YPR0-F1-predicted_aligned_error_v6.json'),
am_annotations_file=None,
am_annotations_hg19_file=None,
am_annotations_hg38_file=None,
msa_file=None,
plddt_doc_file=None),
AlphaFoldEntry(uniprot_accession='O60481',
summary=EntrySummary(allVersions=[1, 2, 3, 4, 5, 6],
bcifUrl=URL('https://alphafold.ebi.ac.uk/files/AF-O60481-F1-model_v6.bcif'),
cifUrl=URL('https://alphafold.ebi.ac.uk/files/AF-O60481-F1-model_v6.cif'),
entityType='protein',
fractionPlddtConfident=0.289,
fractionPlddtLow=0.107,
fractionPlddtVeryHigh=0.0,
fractionPlddtVeryLow=0.604,
globalMetricValue=53.88,
isUniProt=True,
latestVersion=6,
modelCreatedDate='2025-08-01T00:00:00Z',
modelEntityId='AF-O60481-F1',
paeDocUrl=URL('https://alphafold.ebi.ac.uk/files/AF-O60481-F1-predicted_aligned_error_v6.json'),
pdbUrl=URL('https://alphafold.ebi.ac.uk/files/AF-O60481-F1-model_v6.pdb'),
providerId='GDM',
sequence='MTMLLDGGPQFPGLGVGSFGAPRHHEMPNREPAGMGLNPFGDSTHAAAAAAAAAAFKLSPAAAHDLSSGQSSAFTPQGSGYANALGHHHHHHHHHHHTSQVPSYGGAASAAFNSTREFLFRQRSSGLSEAASGGGQHGLFAGSASSLHAPAGIPEPPSYLLFPGLHEQGAGHPSPTGHVDNNQVHLGLRGELFGRADPYRPVASPRTDPYAAGAQFPNYSPMNMNMGVNVAAHHGPGAFFRYMRQPIKQELSCKWIDEAQLSRPKKSCDRTFSTMHELVTHVTMEHVGGPEQNNHVCYWEECPREGKSFKAKYKLVNHIRVHTGEKPFPCPFPGCGKIFARSENLKIHKRTHTGEKPFKCEFEGCDRRFANSSDRKKHMHVHTSDKPYICKVCDKSYTHPSSLRKHMKVHESQGSDSSPAASSGYESSTPPAIASANSKDTTKTPSAVQTSTSHNPGLPPNFNEWYV',
sequenceChecksum='3150CF13C0679568',
sequenceEnd=467,
sequenceStart=1,
sequenceVersionDate='1998-08-01T00:00:00Z',
toolUsed='AlphaFold Monomer v2.0 pipeline',
alternativeNames=None,
amAnnotationsHg19Url=URL('https://alphafold.ebi.ac.uk/files/AF-O60481-F1-hg19.csv'),
amAnnotationsHg38Url=URL('https://alphafold.ebi.ac.uk/files/AF-O60481-F1-hg38.csv'),
amAnnotationsUrl=URL('https://alphafold.ebi.ac.uk/files/AF-O60481-F1-aa-substitutions.csv'),
catalyticActivities=None,
complexName=None,
functions=None,
gene='ZIC3',
geneSynonyms=None,
ipSAE=None,
ipTM=None,
isUniProtReferenceProteome=True,
isUniProtReviewed=True,
keywords=None,
msaUrl=URL('https://alphafold.ebi.ac.uk/files/msa/AF-O60481-F1-msa_v6.a3m'),
organismCommonNames=None,
organismScientificName='Homo sapiens',
organismSynonyms=None,
plddtDocUrl=URL('https://alphafold.ebi.ac.uk/files/AF-O60481-F1-confidence_v6.json'),
proteinFullNames=None,
proteinShortNames=None,
stoichiometry=None,
taxId=9606,
taxonomyLineage=None,
uniprotAccession='O60481',
uniprotDescription='Zinc finger protein '
'ZIC 3',
uniprotId='ZIC3_HUMAN'),
summary_file=PosixPath('alphafold_files/O60481.json'),
bcif_file=None,
cif_file=PosixPath('alphafold_files/AF-O60481-F1-model_v6.cif'),
pdb_file=None,
pae_doc_file=PosixPath('alphafold_files/AF-O60481-F1-predicted_aligned_error_v6.json'),
am_annotations_file=None,
am_annotations_hg19_file=None,
am_annotations_hg38_file=None,
msa_file=None,
plddt_doc_file=None),
AlphaFoldEntry(uniprot_accession='P50613',
summary=EntrySummary(allVersions=[1, 2, 3, 4, 5, 6],
bcifUrl=URL('https://alphafold.ebi.ac.uk/files/AF-P50613-F1-model_v6.bcif'),
cifUrl=URL('https://alphafold.ebi.ac.uk/files/AF-P50613-F1-model_v6.cif'),
entityType='protein',
fractionPlddtConfident=0.127,
fractionPlddtLow=0.092,
fractionPlddtVeryHigh=0.618,
fractionPlddtVeryLow=0.162,
globalMetricValue=82.0,
isUniProt=True,
latestVersion=6,
modelCreatedDate='2025-08-01T00:00:00Z',
modelEntityId='AF-P50613-F1',
paeDocUrl=URL('https://alphafold.ebi.ac.uk/files/AF-P50613-F1-predicted_aligned_error_v6.json'),
pdbUrl=URL('https://alphafold.ebi.ac.uk/files/AF-P50613-F1-model_v6.pdb'),
providerId='GDM',
sequence='MALDVKSRAKRYEKLDFLGEGQFATVYKARDKNTNQIVAIKKIKLGHRSEAKDGINRTALREIKLLQELSHPNIIGLLDAFGHKSNISLVFDFMETDLEVIIKDNSLVLTPSHIKAYMLMTLQGLEYLHQHWILHRDLKPNNLLLDENGVLKLADFGLAKSFGSPNRAYTHQVVTRWYRAPELLFGARMYGVGVDMWAVGCILAELLLRVPFLPGDSDLDQLTRIFETLGTPTEEQWPDMCSLPDYVTFKSFPGIPLHHIFSAAGDDLLDLIQGLFLFNPCARITATQALKMKYFSNRPGPTPGCQLPRPNCPVETLKEQSNPALAIKRKRTEALEQGGLPKKLIF',
sequenceChecksum='0A94BFA7DD416CEB',
sequenceEnd=346,
sequenceStart=1,
sequenceVersionDate='1996-10-01T00:00:00Z',
toolUsed='AlphaFold Monomer v2.0 pipeline',
alternativeNames=None,
amAnnotationsHg19Url=URL('https://alphafold.ebi.ac.uk/files/AF-P50613-F1-hg19.csv'),
amAnnotationsHg38Url=URL('https://alphafold.ebi.ac.uk/files/AF-P50613-F1-hg38.csv'),
amAnnotationsUrl=URL('https://alphafold.ebi.ac.uk/files/AF-P50613-F1-aa-substitutions.csv'),
catalyticActivities=None,
complexName=None,
functions=None,
gene='CDK7',
geneSynonyms=None,
ipSAE=None,
ipTM=None,
isUniProtReferenceProteome=True,
isUniProtReviewed=True,
keywords=None,
msaUrl=URL('https://alphafold.ebi.ac.uk/files/msa/AF-P50613-F1-msa_v6.a3m'),
organismCommonNames=None,
organismScientificName='Homo sapiens',
organismSynonyms=None,
plddtDocUrl=URL('https://alphafold.ebi.ac.uk/files/AF-P50613-F1-confidence_v6.json'),
proteinFullNames=None,
proteinShortNames=None,
stoichiometry=None,
taxId=9606,
taxonomyLineage=None,
uniprotAccession='P50613',
uniprotDescription='Cyclin-dependent '
'kinase 7',
uniprotId='CDK7_HUMAN'),
summary_file=PosixPath('alphafold_files/P50613.json'),
bcif_file=None,
cif_file=PosixPath('alphafold_files/AF-P50613-F1-model_v6.cif'),
pdb_file=None,
pae_doc_file=PosixPath('alphafold_files/AF-P50613-F1-predicted_aligned_error_v6.json'),
am_annotations_file=None,
am_annotations_hg19_file=None,
am_annotations_hg38_file=None,
msa_file=None,
plddt_doc_file=None)]
In [7]:
Copied!
# Shell escape: list the contents of save_dir together with their sizes.
!ls -sh {save_dir}
# Shell escape: list the contents of save_dir together with their sizes.
!ls -sh {save_dir}
total 4.3M 4.0K A1YPR0.json 556K AF-A1YPR0-F1-model_v6.cif 1.1M AF-A1YPR0-F1-predicted_aligned_error_v6.json 412K AF-O60481-2-F1-model_v6.cif 600K AF-O60481-2-F1-predicted_aligned_error_v6.json 412K AF-O60481-F1-model_v6.cif 628K AF-O60481-F1-predicted_aligned_error_v6.json 324K AF-P50613-F1-model_v6.cif 276K AF-P50613-F1-predicted_aligned_error_v6.json 8.0K O60481.json 4.0K P50613.json
Filter AlphaFold structure files on confidence¶
Filter AlphaFold mmCIF/PDB files by confidence (pLDDT). Files that pass are written with the residues below the confidence threshold removed.
In [10]:
Copied!
from protein_quest.alphafold.confidence import ConfidenceFilterQuery, filter_files_on_confidence
from protein_quest.alphafold.confidence import ConfidenceFilterQuery, filter_files_on_confidence
Take the cif files that were downloaded
In [12]:
Copied!
# Collect the cif path of every downloaded entry, skipping entries that have none.
cif_files = (entry.cif_file for entry in summaries)
input_files = [cif for cif in cif_files if cif is not None]
input_files
# Collect the cif path of every downloaded entry, skipping entries that have none.
cif_files = (entry.cif_file for entry in summaries)
input_files = [cif for cif in cif_files if cif is not None]
input_files
Out[12]:
[PosixPath('alphafold_files/AF-A1YPR0-F1-model_v4.cif'),
PosixPath('alphafold_files/AF-O60481-F1-model_v4.cif'),
PosixPath('alphafold_files/AF-P50613-F1-model_v4.cif')]
A filtered cif file is only written when the input file has between 100 and 1000 residues with a pLDDT score above 80.
In [ ]:
Copied!
# Filter settings: confidence threshold plus the allowed range for the number of
# residues that meet it.
query = ConfidenceFilterQuery(
    confidence=80,
    min_residues=100,
    max_residues=1000,
)
# Filter settings: confidence threshold plus the allowed range for the number of
# residues that meet it.
query = ConfidenceFilterQuery(
    confidence=80,
    min_residues=100,
    max_residues=1000,
)
In [14]:
Copied!
# Directory that receives the filtered structure files; created if it does not exist yet.
output_dir = Path("./filtered")
output_dir.mkdir(exist_ok=True)
# NOTE(review): `result` is not read again in the cells shown here. If
# filter_files_on_confidence returns a lazy iterator (the next cell wraps the same
# call in list()), nothing is processed until it is consumed — TODO confirm.
result = filter_files_on_confidence(input_files, query, output_dir)
# Directory that receives the filtered structure files; created if it does not exist yet.
output_dir = Path("./filtered")
output_dir.mkdir(exist_ok=True)
# NOTE(review): `result` is not read again in the cells shown here. If
# filter_files_on_confidence returns a lazy iterator (the next cell wraps the same
# call in list()), nothing is processed until it is consumed — TODO confirm.
result = filter_files_on_confidence(input_files, query, output_dir)
In [ ]:
Copied!
# Run the filter and materialise its results so the notebook displays them.
# Reuses the `query` defined above instead of building a second, identical
# ConfidenceFilterQuery, so the threshold and residue limits live in one place.
list(filter_files_on_confidence(input_files, query, output_dir))
# Run the filter and materialise its results so the notebook displays them.
# Reuses the `query` defined above instead of building a second, identical
# ConfidenceFilterQuery, so the threshold and residue limits live in one place.
list(filter_files_on_confidence(input_files, query, output_dir))
Out[ ]:
[ConfidenceFilterResult(input_file='AF-A1YPR0-F1-model_v4.cif', count=175, filtered_file=PosixPath('filtered/AF-A1YPR0-F1-model_v4.cif')),
ConfidenceFilterResult(input_file='AF-O60481-F1-model_v4.cif', count=76, filtered_file=None),
ConfidenceFilterResult(input_file='AF-P50613-F1-model_v4.cif', count=244, filtered_file=PosixPath('filtered/AF-P50613-F1-model_v4.cif'))]
2 files have passed, but 1 file has only 76 high-confidence residues, which is below the minimum of 100, so it is discarded.
In [ ]:
Copied!