Download AlphaFold files¶
In [1]:
Copied!
from pathlib import Path
from pprint import pprint
from protein_detective.alphafold import fetch_many, fetch_many_async
from pathlib import Path
from pprint import pprint
from protein_detective.alphafold import fetch_many, fetch_many_async
In [2]:
Copied!
# Directory, relative to the current working directory, that the download
# calls below are given as their target for AlphaFold files.
save_dir = Path("alphafold_files")
In [3]:
Copied!
# Fetch the AlphaFold entries for two UniProt accessions and download their
# files into save_dir. In the recorded run only pdb_file is populated on the
# returned entries; the other *_file fields are None.
summaries = fetch_many(["A1YPR0", "O60481"], save_dir)
Fetching Alphafold summaries: 100%|██████████| 2/2 [00:00<00:00, 8.97it/s] Downloading AlphaFold files: 100%|██████████| 2/2 [00:00<00:00, 14847.09it/s]
In [4]:
Copied!
# Pretty-print the returned entries: each carries the summary metadata
# (URLs, sequence, versions) plus the local paths of whatever was downloaded.
pprint(summaries)
[AlphaFoldEntry(uniprot_acc='A1YPR0', summary=EntrySummary(entryId='AF-A1YPR0-F1', gene='ZBTB7C', sequenceChecksum='73D82A34502B55BF', sequenceVersionDate='2007-02-06', uniprotAccession='A1YPR0', uniprotId='ZBT7C_HUMAN', uniprotDescription='Zinc finger and BTB ' 'domain-containing ' 'protein 7C', taxId=9606, organismScientificName='Homo sapiens', uniprotStart=1, uniprotEnd=619, uniprotSequence='MANDIDELIGIPFPNHSSEVLCSLNEQRHDGLLCDVLLVVQEQEYRTHRSVLAACSKYFKKLFTAGTLASQPYVYEIDFVQPEALAAILEFAYTSTLTITAGNVKHILNAARMLEIQCIVNVCLEIMEPGGDGGEEDDKEDDDDDEDDDDEEDEEEEEEEEEDDDDDTEDFADQENLPDPQDISCHQSPSKTDHLTEKAYSDTPRDFPDSFQAGSPGHLGVIRDFSIESLLRENLYPKANIPDRRPSLSPFAPDFFPHLWPGDFGAFAQLPEQPMDSGPLDLVIKNRKIKEEEKEELPPPPPPPFPNDFFKDMFPDLPGGPLGPIKAENDYGAYLNFLSATHLGGLFPPWPLVEERKLKPKASQQCPICHKVIMGAGKLPRHMRTHTGEKPYMCTICEVRFTRQDKLKIHMRKHTGERPYLCIHCNAKFVHNYDLKNHMRIHTGVRPYQCEFCYKSFTRSDHLHRHIKRQSCRMARPRRGRKPAAWRAASLLFGPGGPAPDKAAFVMPPALGEVGGHLGGAAVCLPGPSPAKHFLAAPKGALSLQELERQFEETQMKLFGRAQLEAERNAGGLLAFALAENVAAARPYFPLPDPWAAGLAGLPGLAGLNHVASMSEANN', modelCreatedDate='2022-06-01', latestVersion=4, allVersions=[1, 2, 3, 4], bcifUrl='https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-model_v4.bcif', cifUrl='https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-model_v4.cif', pdbUrl='https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-model_v4.pdb', paeImageUrl='https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-predicted_aligned_error_v4.png', paeDocUrl='https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-predicted_aligned_error_v4.json', amAnnotationsUrl='https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-aa-substitutions.csv', amAnnotationsHg19Url='https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-hg19.csv', amAnnotationsHg38Url='https://alphafold.ebi.ac.uk/files/AF-A1YPR0-F1-hg38.csv', isReviewed=True, isReferenceProteome=True), pdb_file=PosixPath('alphafold_files/AF-A1YPR0-F1-model_v4.pdb'), pae_file=None, bcif_file=None, cif_file=None, pae_image_file=None, pae_doc_file=None, am_annotations_file=None, am_annotations_hg19_file=None, 
am_annotations_hg38_file=None), AlphaFoldEntry(uniprot_acc='O60481', summary=EntrySummary(entryId='AF-O60481-F1', gene='ZIC3', sequenceChecksum='3150CF13C0679568', sequenceVersionDate='1998-08-01', uniprotAccession='O60481', uniprotId='ZIC3_HUMAN', uniprotDescription='Zinc finger protein ' 'ZIC 3', taxId=9606, organismScientificName='Homo sapiens', uniprotStart=1, uniprotEnd=467, uniprotSequence='MTMLLDGGPQFPGLGVGSFGAPRHHEMPNREPAGMGLNPFGDSTHAAAAAAAAAAFKLSPAAAHDLSSGQSSAFTPQGSGYANALGHHHHHHHHHHHTSQVPSYGGAASAAFNSTREFLFRQRSSGLSEAASGGGQHGLFAGSASSLHAPAGIPEPPSYLLFPGLHEQGAGHPSPTGHVDNNQVHLGLRGELFGRADPYRPVASPRTDPYAAGAQFPNYSPMNMNMGVNVAAHHGPGAFFRYMRQPIKQELSCKWIDEAQLSRPKKSCDRTFSTMHELVTHVTMEHVGGPEQNNHVCYWEECPREGKSFKAKYKLVNHIRVHTGEKPFPCPFPGCGKIFARSENLKIHKRTHTGEKPFKCEFEGCDRRFANSSDRKKHMHVHTSDKPYICKVCDKSYTHPSSLRKHMKVHESQGSDSSPAASSGYESSTPPAIASANSKDTTKTPSAVQTSTSHNPGLPPNFNEWYV', modelCreatedDate='2022-06-01', latestVersion=4, allVersions=[1, 2, 3, 4], bcifUrl='https://alphafold.ebi.ac.uk/files/AF-O60481-F1-model_v4.bcif', cifUrl='https://alphafold.ebi.ac.uk/files/AF-O60481-F1-model_v4.cif', pdbUrl='https://alphafold.ebi.ac.uk/files/AF-O60481-F1-model_v4.pdb', paeImageUrl='https://alphafold.ebi.ac.uk/files/AF-O60481-F1-predicted_aligned_error_v4.png', paeDocUrl='https://alphafold.ebi.ac.uk/files/AF-O60481-F1-predicted_aligned_error_v4.json', amAnnotationsUrl='https://alphafold.ebi.ac.uk/files/AF-O60481-F1-aa-substitutions.csv', amAnnotationsHg19Url='https://alphafold.ebi.ac.uk/files/AF-O60481-F1-hg19.csv', amAnnotationsHg38Url='https://alphafold.ebi.ac.uk/files/AF-O60481-F1-hg38.csv', isReviewed=True, isReferenceProteome=True), pdb_file=PosixPath('alphafold_files/AF-O60481-F1-model_v4.pdb'), pae_file=None, bcif_file=None, cif_file=None, pae_image_file=None, pae_doc_file=None, am_annotations_file=None, am_annotations_hg19_file=None, am_annotations_hg38_file=None)]
Download the cif and paeDoc files asynchronously
In [ ]:
Copied!
summaries2 = [s async for s in fetch_many_async(["P50613"], save_dir, what={"cif", "paeDoc"})]
pprint(summaries2)
summaries2 = [s async for s in fetch_many_async(["P50613"], save_dir, what={"cif", "paeDoc"})]
pprint(summaries2)
Fetching Alphafold summaries: 100%|██████████| 1/1 [00:00<00:00, 4.87it/s] Downloading AlphaFold files: 100%|██████████| 2/2 [00:00<00:00, 12520.31it/s]
[AlphaFoldEntry(uniprot_acc='P50613', summary=EntrySummary(entryId='AF-P50613-F1', gene='CDK7', sequenceChecksum='0A94BFA7DD416CEB', sequenceVersionDate='1996-10-01', uniprotAccession='P50613', uniprotId='CDK7_HUMAN', uniprotDescription='Cyclin-dependent ' 'kinase 7', taxId=9606, organismScientificName='Homo sapiens', uniprotStart=1, uniprotEnd=346, uniprotSequence='MALDVKSRAKRYEKLDFLGEGQFATVYKARDKNTNQIVAIKKIKLGHRSEAKDGINRTALREIKLLQELSHPNIIGLLDAFGHKSNISLVFDFMETDLEVIIKDNSLVLTPSHIKAYMLMTLQGLEYLHQHWILHRDLKPNNLLLDENGVLKLADFGLAKSFGSPNRAYTHQVVTRWYRAPELLFGARMYGVGVDMWAVGCILAELLLRVPFLPGDSDLDQLTRIFETLGTPTEEQWPDMCSLPDYVTFKSFPGIPLHHIFSAAGDDLLDLIQGLFLFNPCARITATQALKMKYFSNRPGPTPGCQLPRPNCPVETLKEQSNPALAIKRKRTEALEQGGLPKKLIF', modelCreatedDate='2022-06-01', latestVersion=4, allVersions=[1, 2, 3, 4], bcifUrl='https://alphafold.ebi.ac.uk/files/AF-P50613-F1-model_v4.bcif', cifUrl='https://alphafold.ebi.ac.uk/files/AF-P50613-F1-model_v4.cif', pdbUrl='https://alphafold.ebi.ac.uk/files/AF-P50613-F1-model_v4.pdb', paeImageUrl='https://alphafold.ebi.ac.uk/files/AF-P50613-F1-predicted_aligned_error_v4.png', paeDocUrl='https://alphafold.ebi.ac.uk/files/AF-P50613-F1-predicted_aligned_error_v4.json', amAnnotationsUrl='https://alphafold.ebi.ac.uk/files/AF-P50613-F1-aa-substitutions.csv', amAnnotationsHg19Url='https://alphafold.ebi.ac.uk/files/AF-P50613-F1-hg19.csv', amAnnotationsHg38Url='https://alphafold.ebi.ac.uk/files/AF-P50613-F1-hg38.csv', isReviewed=True, isReferenceProteome=True), pdb_file=None, pae_file=None, bcif_file=None, cif_file=PosixPath('alphafold_files/AF-P50613-F1-model_v4.cif'), pae_image_file=None, pae_doc_file=PosixPath('alphafold_files/AF-P50613-F1-predicted_aligned_error_v4.json'), am_annotations_file=None, am_annotations_hg19_file=None, am_annotations_hg38_file=None)]
In [ ]:
Copied!
!ls -sh {save_dir}
!ls -sh {save_dir}
total 1.3M 392K AF-A1YPR0-F1-model_v4.pdb 292K AF-O60481-F1-model_v4.pdb 320K AF-P50613-F1-model_v4.cif 280K AF-P50613-F1-predicted_aligned_error_v4.json
Apply density filter¶
In [6]:
Copied!
from protein_detective.alphafold.density import filter_out_low_confidence_residues, find_high_confidence_residues
from protein_detective.alphafold.density import filter_out_low_confidence_residues, find_high_confidence_residues
In [7]:
Copied!
# Take the PDB file of the first entry downloaded above.
pdb_file = summaries[0].pdb_file
if pdb_file is None:
    # pdb_file is None when the PDB format was not among the downloaded files.
    raise ValueError("PDB file has not been downloaded yet.")  # noqa: EM101, TRY003
# Collect the residues that pass a confidence threshold of 70.
# NOTE(review): presumably the threshold is a pLDDT score and the result holds
# residue numbers -- confirm against protein_detective.alphafold.density.
residues = set(find_high_confidence_residues(pdb_file, 70))
len(residues)
Out[7]:
214
In [8]:
Copied!
# Output directory for the filtered structures; created if missing, and
# left alone when it already exists so the cell can be re-run.
out_dir = Path("density_filtered")
out_dir.mkdir(exist_ok=True, parents=True)
# Keep the original file name so the filtered file can be matched to its source.
out_file = out_dir / pdb_file.name
out_file
Out[8]:
PosixPath('density_filtered/AF-A1YPR0-F1-model_v4.pdb')
In [9]:
Copied!
# Write a filtered copy of pdb_file to out_file using the residues selected at
# threshold 70. The recorded listing shows the result is smaller than the input.
# NOTE(review): presumably only residues in the given set are kept -- confirm.
filter_out_low_confidence_residues(pdb_file, residues, out_file)
In [10]:
Copied!
!ls -sh {pdb_file} {out_file}
!ls -sh {pdb_file} {out_file}
392K alphafold_files/AF-A1YPR0-F1-model_v4.pdb 148K density_filtered/AF-A1YPR0-F1-model_v4.pdb
In [11]:
Copied!
# Same selection with a lower threshold of 50; the recorded run yields more
# residues than at 70 (275 vs 214).
residues50 = set(find_high_confidence_residues(pdb_file, 50))
len(residues50)
Out[11]:
275
In [12]:
Copied!
# Same selection with a stricter threshold of 90; the recorded run yields far
# fewer residues than at 70 (22 vs 214).
residues90 = set(find_high_confidence_residues(pdb_file, 90))
len(residues90)
Out[12]:
22
In [13]:
Copied!
# Write the threshold-90 selection next to the threshold-70 file.
# with_suffix replaces the trailing ".pdb" with ".90.pdb", so the two outputs
# do not overwrite each other.
filter_out_low_confidence_residues(pdb_file, residues90, out_file.with_suffix(".90.pdb"))
In [14]:
Copied!
!ls -sh {pdb_file} {out_file} {out_file.with_suffix(".90.pdb")}
!ls -sh {pdb_file} {out_file} {out_file.with_suffix(".90.pdb")}
392K alphafold_files/AF-A1YPR0-F1-model_v4.pdb 24K density_filtered/AF-A1YPR0-F1-model_v4.90.pdb 148K density_filtered/AF-A1YPR0-F1-model_v4.pdb
In [ ]:
Copied!