Source code for haddock.libs.libaa2cg

#  CODE TAKEN FROM MARTINIZE 1.1 ##
# Please refer to it for more information
"""
Reduces complexity of protein residue to the MARTINI coarse grained model:
CA, O, Bead(s) in specific atom location.

Reference:
Monticelli et al. The MARTINI coarse-grained force field: extension to proteins.
J. Chem. Theory Comput. (2008) vol. 4 (5) pp. 819-834

Martinize Script from Tserk Wassenaar

Uses Biopython to parse the structure and DSSP output.
Uses pieces of the martinize-1.1.py script to convert the SS types

Outputs a coarse grained pdb file (``*_ss.pdb``) with assigned bfactors.
Outputs a tbl file to map the beads to the atoms they represent.

Updates
 - Updated python version to 2.6 to support isdisjoint() set method in DSSP.py (JR Apr 2012)
 - Residues that DSSP can't handle (incomplete backbone f ex) treated as coil  (JR Apr 2012)
 - Update to_one_letter_code library to protein_letters_3to1 (Jorge Roel 2017)
 - Inclusion of fake beads for corresponding amino-acids <SCd> (Jorge Roel 2017)
 - Changed the mapping routine to include DNA bead types (Rodrigo Honorato 2018)
 - Implemented feature to check if nucleic acid is a candidate for hbond (Rodrigo Honorato 2018)
"""

import collections
import itertools
import math
import os
import random
import subprocess
import tempfile
import warnings

from pathlib import Path
from io import StringIO

from Bio.PDB import Entity, PDBIO, PDBParser
from Bio.PDB.Structure import Structure
from Bio.PDB.StructureBuilder import StructureBuilder

from haddock import log
from haddock.core.exceptions import ModuleError
from haddock.core.typing import Optional
from haddock.libs.libontology import Format

warnings.filterwarnings("ignore")

CRYST_LINE = "CRYST1 " + os.linesep


[docs] def norm(a): """ Args: a: Returns: """ return math.sqrt(norm2(a))
[docs] def norm2(a): """ Args: a: Returns: """ return sum([i * i for i in a])
[docs] def pat(x, c="."): """ Reformats pattern strings. Args: x: c: Returns: """ return x.replace(c, "\x00").split()
[docs] def hash(x, y): """ Makes a dictionary from two lists. Args: x: y: Returns: """ return dict(zip(x, y))
[docs] def spl(x): """ Splits a string. Args: x: Returns: """ return x.split()
[docs] def tt(program): """ Args: program: Returns: """ return "".join([ssd[program].get(chr(i), "C") for i in range(256)])
[docs] def typesub(seq, patterns, types): """ Pattern substitutions. Args: seq: patterns: types: Returns: """ for i, j in zip(patterns, types): seq = seq.replace(i, j) return seq
[docs] def ss_classification(ss, program="dssp"): """ Translates a string encoding the secondary structure to a string of corresponding Martini types, taking the origin of the secondary structure into account, and replacing termini if requested. Args: ss: program: Returns: """ # Translate dssp/pymol/gmx ss to Martini ss ss = ss.translate(sstt[program]) # Separate the different secondary structure types sep = dict([(i, ss.translate(sstd[i])) for i in sstd.keys()]) # Do type substitutions based on patterns # If the ss type is not in the patterns lists, do not substitute # (use empty lists for substitutions) typ = [ typesub(sep[i], patterns.get(i, []), pattypes.get(i, [])) for i in sstd.keys() ] # Translate all types to numerical values typ = [[ord(j) for j in list(i)] for i in typ] # Sum characters back to get a full typed sequence typ = "".join([chr(sum(i)) for i in zip(*typ)]) # Return both the actual as well as the fully typed sequence return ss, typ
# ----+--------------------------------------+ # A | SECONDARY STRUCTURE TYPE DEFINITIONS | # ----+--------------------------------------+ ssdefs = { "dssp": list(".HGIBETSC~"), # DSSP one letter secondary structure code #@# "pymol": list(".H...S...L"), # Pymol one letter secondary structure code #@# "gmx": list(".H...ETS.C"), # Gromacs secondary structure dump code #@# "self": list("FHHHEETSCC"), # Internal CG secondary structure codes #@# } cgss = list("FHHHEETSCC") # Corresponding CG secondary structure types #@# patterns = { "H": pat(".H. .HH. .HHH. .HHHH. .HHHHH. .HHHHHH. .HHHHHHH. .HHHH HHHH.") # @# } pattypes = { "H": pat(".3. .33. .333. .3333. .13332. .113322. .1113222. .1111 2222.") # @# } ss_to_code = { "C": 1, # Free, " ": 1, "S": 2, "H": 3, "1": 4, "2": 5, "3": 6, "E": 7, # Extended "T": 8, # Turn "F": 9, # Fibril } ss_eq = list("CBHHHHBTF") # List of programs for which secondary structure definitions can be processed programs = ssdefs.keys() # Dictionaries mapping ss types to the CG ss types ssd = dict([(i, hash(ssdefs[i], cgss)) for i in programs]) # The translation table depends on the program used to obtain the # secondary structure definitions sstt = dict([(i, tt(i)) for i in programs]) # The following translation tables are used to identify stretches of # a certain type of secondary structure. null = "\x00" sstd = dict([(i, ord(i) * null + i + (255 - ord(i)) * null) for i in cgss]) # ==========================================================================================# # ==========================================================================================# # ==========================================================================================# # CG MAPPING INFORMATION bb = "CA C N O " prot_atoms = { "ALA": [bb + "CB"], "CYS": [bb, "CB SG"], "ASP": [bb, "CB CG OD1 OD2"], "GLU": [bb, "CB CG CD OE1 OE2"], "PHE": [bb, "CB CG CD1", "CD2 CE2", "CE1 CZ"], "GLY": [bb], "HIS": [bb, "CB CG", "CD2 NE2", "ND1 CE1"], "ILE": [bb, "CB CG1 CG2 CD1"], "LYS": [bb, "CB CG CD", "CE NZ"], "LEU": [bb, "CB CG CD1 CD2"], "MET": [bb, "CB CG SD CE"], "ASN": [bb, "CB CG ND1 ND2 OD1 OD2"], # ND1? "PRO": [bb, "CB CG CD"], "GLN": [bb, "CB CG CD OE1 OE2 NE1 NE2"], "ARG": [bb, "CB CG CD", "NE CZ NH1 NH2"], "SER": [bb, "CB OG"], "THR": [bb, "CB OG1 CG2"], "VAL": [bb, "CB CG1 CG2"], "TRP": [bb, "CB CG CD2", "CD1 NE1 CE2", "CE3 CZ3", "CZ2 CH2"], "TYR": [bb, "CB CG CD1", "CD2 CE2", "CE1 CZ OH"], } bead_names = ["BB", "SC1", "SC2", "SC3", "SC4"] # insert beads into the data structure cg_mapping = {} for res in prot_atoms: cg_mapping[res] = collections.OrderedDict() for i, atom_l in enumerate(prot_atoms[res]): bead = bead_names[i] cg_mapping[res][atom_l] = bead ###################################### # Nucleotide mapping, # This is a custom naming convention # but the atom mapping is defined in # 10.1021/acs.jctc.5b00286 - S1 ###################################### DA_beads = collections.OrderedDict() DA_beads["O3'* P O1P O2P O5' OP1 OP2"] = "BB1" DA_beads["C5' O4' C4'"] = "BB2" DA_beads["C3' C2' C1'"] = "BB3" DA_beads["N9 C4"] = "SC1" DA_beads["C2 N3"] = "SC2" DA_beads["C6 N6 N1"] = "SC3" DA_beads["C8 N7 C5"] = "SC4" DC_beads = collections.OrderedDict() DC_beads["O3'* P O1P O2P O5' OP1 OP2"] = "BB1" DC_beads["C5' O4' C4'"] = "BB2" DC_beads["C3' C2' C1'"] = "BB3" DC_beads["N1 C6"] = "SC1" DC_beads["N3 C2 O2"] = "SC2" DC_beads["C5 C4 N4"] = "SC3" DG_beads = collections.OrderedDict() DG_beads["O3'* P O1P O2P O5' OP1 OP2"] = "BB1" DG_beads["C5' O4' C4'"] = "BB2" DG_beads["C3' C2' C1'"] = "BB3" DG_beads["N9 C4"] = "SC1" DG_beads["C2 N2 N3"] = "SC2" DG_beads["C6 O6 N1"] = "SC3" DG_beads["C8 N7 C5"] = "SC4" DT_beads = collections.OrderedDict() DT_beads["O3'* P O1P O2P O5' OP1 OP2"] = "BB1" DT_beads["C5' O4' C4'"] = "BB2" DT_beads["C3' C2' C1'"] = "BB3" DT_beads["N1 C6"] = "SC1" DT_beads["N3 C2 O2"] = "SC2" DT_beads["C5 C4 O4 C7"] = "SC3" A_beads = collections.OrderedDict() A_beads["O3'* P O1P O2P O5' OP1 OP2"] = "BB1" A_beads["C5' O4' C4'"] = "BB2" A_beads["C3' C2' O2' C1'"] = "BB3" A_beads["N9 C4"] = "SC1" A_beads["C2 N3"] = "SC2" A_beads["C6 N6 N1"] = "SC3" A_beads["C8 N7 C5"] = "SC4" C_beads = collections.OrderedDict() C_beads["O3'* P O1P O2P O5' OP1 OP2"] = "BB1" C_beads["C5' O4' C4'"] = "BB2" C_beads["C3' C2' O2' C1'"] = "BB3" C_beads["N1 C6"] = "SC1" C_beads["N3 C2 O2"] = "SC2" C_beads["C5 C4 N4"] = "SC3" G_beads = collections.OrderedDict() G_beads["O3'* P O1P O2P O5' OP1 OP2"] = "BB1" G_beads["C5' O4' C4'"] = "BB2" G_beads["C3' C2' O2' C1'"] = "BB3" G_beads["N9 C4"] = "SC1" G_beads["C2 N2 N3"] = "SC2" G_beads["C6 O6 N1"] = "SC3" G_beads["C8 N7 C5"] = "SC4" U_beads = collections.OrderedDict() U_beads["O3'* P O1P O2P O5' OP1 OP2"] = "BB1" U_beads["C5' O4' C4'"] = "BB2" U_beads["C3' C2' O2' C1'"] = "BB3" U_beads["N1 C6"] = "SC1" U_beads["N3 C2 O2"] = "SC2" U_beads["C5 C4 O4"] = "SC3" cg_mapping["DA"] = DA_beads cg_mapping["DC"] = DC_beads cg_mapping["DT"] = DT_beads cg_mapping["DG"] = DG_beads cg_mapping["A"] = A_beads cg_mapping["C"] = C_beads cg_mapping["U"] = U_beads cg_mapping["G"] = G_beads pairing = { ("DG", "DC"): [("N2", "O2"), ("N1", "N3"), ("O6", "N4")], ("DC", "DG"): [("O2", "N2"), ("N3", "N1"), ("N4", "O6")], ("DA", "DT"): [("N6", "O4"), ("N1", "N3")], ("DT", "DA"): [("O4", "N6"), ("N3", "N1")], # ("G", "C"): [("N2", "O2"), ("N1", "N3"), ("O6", "N4")], ("C", "G"): [("O2", "N2"), ("N3", "N1"), ("N4", "O6")], ("A", "U"): [("N6", "O4"), ("N1", "N3")], ("U", "A"): [("O4", "N6"), ("N3", "N1")], } polar = ["GLN", "ASN", "SER", "THR"] charged = ["ARG", "LYS", "ASP", "GLU"]
[docs] def add_dummy(bead_list, dist=0.11, n=2): """ Args: bead_list: dist: n: Returns: """ new_bead_dic = {} # Generate a random vector in a sphere of -1 to +1, to add to the bead position v = [ random.random() * 2.0 - 1, random.random() * 2.0 - 1, random.random() * 2.0 - 1, ] # Calculated the length of the vector and divide by the final distance of the dummy bead norm_v = norm(v) / dist # Resize the vector vn = [i / norm_v for i in v] # m sets the direction of the added vector, currently only works when adding one or two beads. m = 1 for j in range(n): # create two new beads bead_s = str(j + 1) new_name = f"SCD{bead_s}" # set the name of the new bead new_bead_dic[new_name] = [i + (m * j) for i, j in zip(bead_list[-1][1], vn)] m *= -2 return new_bead_dic
[docs] def map_cg(chain): """ Args: chain: Returns: """ m_dic = collections.OrderedDict() for aares in chain: m_dic[aares] = collections.OrderedDict() resn = aares.resname.split()[0] # resname segid = aares.segid.strip() resi = aares.id[1] if resn not in cg_mapping: log.warning(f"Residue {resn} not in cg_mapping — skipping.") continue # skip to next residue # for each atom segment, calculate its center of mass and map the correct bead for atom_segment in cg_mapping[resn]: atoms = [aares[a] for a in atom_segment.split() if a in aares.child_dict] if atoms: if ( "*" in atom_segment ): # this is important to correctly place CG DNA beads # this * means it belongs to the previous residue... find it! target_previous_atom_list = [ a for a in atom_segment.split() if "*" in a ] for target_atom in target_previous_atom_list: # does it exist? target_atom_name = target_atom.split("*")[0] try: previous_atom = chain[resi - 1][target_atom_name] # how far away the previous atom is from this atom segment? # if it is too far away this could be the next chain...! minimum_dist = min([(a - previous_atom) for a in atoms]) if minimum_dist < 2.0: # 2.0 A is very permissive atoms.append(previous_atom) except KeyError: # previous atom not found, move on pass if not atoms: log.warning( "Residue {} {:d} of chain {} cannot be processed: missing atoms {} ".format( resn, resi, aares.parent.id, atom_segment ) ) continue bead_name = cg_mapping[resn][atom_segment] # get center of mass code = list(set([a.bfactor for a in aares if a.bfactor != 0])) if len(code) > 1: emsg = "Something is wrong with HADDOCK codes" raise ModuleError(emsg) if not code: code = 0.0 else: code = code[0] bead_coord = center_of_mass(atoms) atom_segment = " ".join(list(atom_segment.split())).replace("*", "") # restrain for backmapping restrain = ( "assign (segid {}CG and resid {:d} and name {})" " (segid {} and resid {:d} and (name {})) 0 0 0".format( segid, resi, bead_name, segid, resi, " or name ".join(atom_segment.split()), ) ) m_dic[aares][bead_name] = bead_coord, code, restrain # add dummy beads whenever its needed for r in m_dic: if r.resname in polar: d = 0.14 # distance n = 2 # number of dummy beads to be placed elif r.resname in charged: d = 0.11 # distance n = 1 # number of dummy beads to be placed else: continue # add to data structure # this special beads have no HADDOCK code bead_list = [(b, m_dic[r][b][0]) for b in m_dic[r]] dummy_bead_dic = add_dummy(bead_list, dist=d, n=n) for db in dummy_bead_dic: db_coords = dummy_bead_dic[db] # code should be the same as the residue code = m_dic[r][list(m_dic[r])[0]][1] m_dic[r][db] = (db_coords, code, None) return m_dic
[docs] def center_of_mass(entity, geometric=False): """ Returns gravitic [default] or geometric center of mass of an Entity. Geometric assumes all masses are equal (geometric=True) Args: entity: geometric: Returns: """ # Structure, Model, Chain, Residue if isinstance(entity, Entity.Entity): atom_list = entity.get_atoms() # List of Atoms elif hasattr(entity, "__iter__") and [x for x in entity if x.level == "A"]: atom_list = entity else: # Some other weirdo object raise ValueError( "Center of Mass can only be calculated from the following objects:\n" "Structure, Model, Chain, Residue, list of Atoms." ) masses = [] positions = [[], [], []] # [ [X1, X2, ..] , [Y1, Y2, ...] , [Z1, Z2, ...] ] for atom in atom_list: masses.append(atom.mass) for i, coord in enumerate(atom.coord.tolist()): positions[i].append(coord) # If there is a single atom with undefined mass complain loudly. if "ukn" in set(masses) and not geometric: raise ValueError( f"Some Atoms don't have an element assigned.{os.linesep}" "Try adding them manually or calculate the geometrical " "center of mass instead." ) if geometric: return [sum(coord_list) / len(masses) for coord_list in positions] w_pos = [[], [], []] for atom_index, atom_mass in enumerate(masses): w_pos[0].append(positions[0][atom_index] * atom_mass) w_pos[1].append(positions[1][atom_index] * atom_mass) w_pos[2].append(positions[2][atom_index] * atom_mass) return [sum(coord_list) / sum(masses) for coord_list in w_pos]
[docs] def determine_hbonds(structure: Structure): """ Args: structure: Returns: """ nuc = ["DA", "DC", "DG", "DT", "A", "C", "G", "U"] aa = [ "ALA", "CYS", "ASP", "GLU", "PHE", "GLY", "HIS", "ILE", "LYS", "LEU", "MET", "ASN", "PRO", "GLN", "ARG", "SER", "THR", "VAL", "TRP", "TYR", ] pair_list = [] for model in structure: dna_chain_l = [] for chain in model: prot_comp = len( [r for r in chain.get_residues() if r.resname.split()[0] in aa] ) dna_comp = len( [r for r in chain.get_residues() if r.resname.split()[0] in nuc] ) if prot_comp: # protein pass if dna_comp: # nucleic dna_chain_l.append(chain) if len(dna_chain_l) == 1: log.warning("Only one DNA/RNA chain detected, is this correct?") chain_a = dna_chain_l[0] reslist_a = [r for r in chain_a.get_residues()] for ra, rb in itertools.combinations(reslist_a, 2): pair = identify_pairing(ra, rb) if pair: pair_list.append(pair) if ( len(dna_chain_l) > 1 ): # list sizes could be different, this might be improbable for chain_a, chain_b in itertools.combinations(dna_chain_l, 2): reslist_a = [r for r in chain_a.get_residues()] reslist_b = [r for r in chain_b.get_residues()] for ra in reslist_a: for rb in reslist_b: pair = identify_pairing(ra, rb) pair_list.append(pair) return pair_list
[docs] def identify_pairing(ra, rb): """ Args: ra: rb: Returns: """ pair = [] # check if the pairing is correct ra_name = ra.resname.split()[0] rb_name = rb.resname.split()[0] try: atom_pair_list = pairing[ra_name, rb_name] except KeyError: # pairing not possible return # check if distances are ok distance_l = [] for atom_list in atom_pair_list: try: a = ra[atom_list[0]] b = rb[atom_list[1]] distance_l.append(a - b) except KeyError: # residue does not have the necessary sidechain atoms # assume its not a pair return # check P-P distances to make sure its the opposite base # distances for perfect DNA: # opposite = 18.8A # sequential = 6.6A # # 10.0A should be sufficient p_cutoff = 10.0 # Basedist_cutoff = 3.5 basedist_cutoff = 3.5 try: pa = ra.child_dict["P"] pb = rb.child_dict["P"] p_distance = pa - pb except KeyError: # some base is missing its P, use the geometric center instead cen_a = center_of_mass(ra.child_dict.values()) cen_b = center_of_mass(rb.child_dict.values()) p_distance = math.sqrt(sum([(a - b) ** 2 for a, b in zip(cen_a, cen_b)])) if p_distance > p_cutoff: resnum_a = ra.id[1] resnum_b = rb.id[1] if all( e < basedist_cutoff for e in distance_l ): # if ALL bonds are within range # KEEP IN MIND THAT this will not account for badly paired DNA # Implement a way to search for the closest possible pair? ## segid_a = ra.get_segid().split()[0] segid_b = rb.get_segid().split()[0] pair = (resnum_a, segid_a), (resnum_b, segid_b) # special atoms, mark them! for atom_pair in atom_pair_list: atom_a, atom_b = atom_pair ra[atom_a].bfactor = 1 rb[atom_b].bfactor = 1 return pair
[docs] def output_cg_restraints(pair_list): """ Args: pair_list: Returns: """ out = open("dna_restraints.def", "w") for i, e in enumerate(pair_list): idx = i + 1 res_a = e[0][0] segid_a = e[0][1] res_b = e[1][0] segid_b = e[1][1] out.write( f"{{===>}} base_a_{idx}=(resid {res_a} and segid {segid_a});\n" f"{{===>}} base_b_{idx}=(resid {res_b} and segid {segid_b});\n\n" ) out.close()
[docs] def extract_groups(pair_list): """ Args: pair_list: Returns: """ # this will be used to define AA restraints out = open("dna-aa_groups.dat", "w") # extract groups group_a = [a[0][0] for a in pair_list] segid_a = list(set([a[0][1] for a in pair_list])) group_b = [a[1][0] for a in pair_list] segid_b = list(set([a[0][1] for a in pair_list])) if len(segid_a) != 1: emsg = "Something is wrong with SEGID A" raise ModuleError(emsg) if len(segid_b) != 1: emsg = "Something is wrong with SEGID B" raise ModuleError(emsg) segid_a = segid_a[0] segid_b = segid_b[0] group_a.sort() group_b.sort() out.write( f"{group_a[0]}:{group_a[-1]}\n{segid_a}\n{group_b[0]}:{group_b[-1]}\n{segid_b}" ) out.close()
[docs] def create_file_with_cryst(pdb_file: str) -> None: """ This function creates a new pdb because the CRYST line is missing from the pdf file. This line is necessary for DSSP. Args: output: str pdb_file: str Returns: pdb_file_copy: str """ with ( open(pdb_file, "r") as file_in, tempfile.NamedTemporaryFile(mode="w", delete=False) as file_out, ): content = file_in.read() file_out.write(CRYST_LINE + content) return file_out.name
[docs] def determine_ss(structure: Structure, skipss: bool, pdbf_path: str) -> Structure: """Determine secondary structures from input structure Args: structure: skipss: pdbf_path: Returns: structure: """ # calculate SS for model in structure: if skipss: continue try: tmp_file_name = create_file_with_cryst(pdbf_path) p = subprocess.Popen( ["dssp", tmp_file_name, "--output-format", "dssp"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, ) dssp_raw, _ = p.communicate() dssp_raw = dssp_raw.split("#")[1].split("\n")[1:-1] except: # TODO: think about making this exception more specific # no secondary structure detected for this model log.warning("SS could not be assigned, assigning code 1 to all residues") continue finally: if Path(tmp_file_name).exists(): Path(tmp_file_name).unlink() dssp = {} for line in dssp_raw: var_a, var_b = line[11], line[16] dssp.setdefault(var_a, []).append(var_b) # Get SS information and translate it: # DSSP > MARTINI > HADDOCK # this could still be improved for chain in model: dssp_ss = "".join(dssp[chain.id]) _, martini_types = ss_classification( dssp_ss ) # ancestral function, keep it there # transform MARTINI > HADDOCK # and add it to the bfactor col for residue, ss in zip( chain, martini_types ): # chain and martini_types order must match code = ss_to_code[ss] # for atom in residue.get_atoms(): for atom in residue: atom.bfactor = code return structure
[docs] def rename_nucbases(structure: Structure) -> None: """Inplace residue renaming according to HADDOCK ones. Parameters ---------- structure : Bio.PDB.Structure.Structure Input structure """ chainresdic = { c.get_id(): [r.get_resname() for r in c.get_residues()] for m in structure for c in m } nucleotide_list = [ "CYT", "C", "DC", "THY", "T", "DT", "ADE", "A", "DA", "G", "GUA", "DG", "U", "URI", ] rna_resname_mapper = {"CYT": "C", "URI": "U", "ADE": "A", "GUA": "G"} dna_rename_mapper = {"CYT": "DC", "THY": "DT", "ADE": "DA", "GUA": "DG"} if [True for c in chainresdic for e in chainresdic[c] if e in nucleotide_list]: # Check if this is an RNA is_rna = [ True for c in chainresdic for e in chainresdic[c] if e in ["U", "URI"] ] ref_dic = rna_resname_mapper if is_rna else dna_rename_mapper # Loop over models for model in structure: for chain in model: for r in chain.get_residues(): if r.resname in ref_dic.keys(): # Rename residue name r.resname = ref_dic[r.resname]
[docs] def martinize( input_pdb: str, output_path: str, skipss: bool, ) -> tuple[str, bool]: """ Converts an all-atom (AA) PDB structure into a coarse-grained (CG) model using a MARTINI2.2 mapping and generating CG-to-AA restraints for backmapping. Optionally uses secondary structure for mapping. Args: input_pdb (str): Path to the input AA PDB file. output_path (str): Directory where the output files will be written. - A CG PDB file (``*_cg.pdb``) - A restraint table (``*_cg_to_aa.tbl``) skipss (bool): If True, skips secondary structure assignment (DSSP step). If False, assigns secondary structure and encodes it into HADDOCK-compatible B-factors. Returns: tuple[str, bool]: cg_pdb_name: Path to the generated CG PDB file. shape: True if at least one residue with name "SHA" (shape bead) is detected in the structure, False otherwise. """ if not input_pdb: emsg = "No input file detected" raise ModuleError(emsg) p = PDBParser() io = PDBIO() # Parse PDB and run DSSP pdbf_path = os.path.realpath(input_pdb) aa_model = p.get_structure("aa_model", pdbf_path) # set ALL bfactors to 1 for model in aa_model: for chain in model: if chain.id == " ": emsg = "Empty chain id detected" raise ModuleError(emsg) for residue in chain: for atom in residue: atom.bfactor = 1.0 # Assign HADDOCK code according to SS (1-9) determine_ss(structure=aa_model, skipss=skipss, pdbf_path=pdbf_path) # Strandardize naming # WARNING, THIS ASSUMES THAT INPUT DNA/RNA IS 3-LETTER CODE rename_nucbases(aa_model) # Assign HADDOCK code for hydrogen bonding capable nucleotides (0-1) pair_list = determine_hbonds(aa_model) if pair_list: output_cg_restraints(pair_list) # Map CG beads to AA structure structure_builder = StructureBuilder() structure_builder.init_structure("cg_model") structure_builder.init_seg(" ") # Empty SEGID tbl_cg_to_aa = [] restrain_counter = 0 for model in aa_model: structure_builder.init_model(model.id) for chain in model: structure_builder.init_chain(chain.id) structure_builder.init_seg(chain.id) mapping_dic = map_cg(chain) for residue in mapping_dic: if residue.id[0] != " ": # filter HETATMS continue structure_builder.init_residue( residue.resname, residue.id[0], residue.id[1], residue.id[2] ) for i, bead in enumerate(mapping_dic[residue]): bead_name = bead bead_coord = mapping_dic[residue][bead_name][0] haddock_code = mapping_dic[residue][bead_name][1] restrain = mapping_dic[residue][bead_name][2] structure_builder.init_atom( bead_name, bead_coord, haddock_code, 1.00, " ", bead_name, i ) tbl_cg_to_aa.append(restrain) restrain_counter += 1 # Write CG to AA backmapping restraint file tbl_file_name = gen_cg_tbl_backmapping_fname(f"{output_path}", pdbf_path) with open(tbl_file_name, "w") as tbl_file: tbl_file.write("\n" + "\n".join([tbl for tbl in tbl_cg_to_aa if tbl])) # Write pre-CG structure cg_model = structure_builder.get_structure() io.set_structure(cg_model) # Setup in-memory text buffer io_file = StringIO() # Write file in it io.save(io_file, write_end=1) # Go back to the start of the file to read it io_file.seek(0) # Write the actual valid CG structure # make sure atom names are in the correct place # .BB. .BB1. .BB2. and not BB.. BB1.. BB2.. cg_pdb_name = gen_cg_filename(f"{output_path}", pdbf_path) out = open(cg_pdb_name, "w") for line in io_file.readlines(): n_l = line if line.startswith("ATOM"): atom_name = line[12:16].split()[0] # mind the spacing if 1 <= len(atom_name) <= 3: n_l = f"{line[:12]} {atom_name:<3s}{line[16:]}" out.write(n_l) out.close() del io_file return cg_pdb_name
[docs] def gen_cg_filename( output_dir: str, input_fname: str, force_field: Optional[str] = None, ext: Optional[str] = None, ) -> str: """Helper function to standarize CG filename from input file. Parameters ---------- output_dir : str Where to write the file. input_fname : str Name of the original input PDB file. force_field : Optional[str], optional Name of the force-field, by default None ext : Optional[str], optional File extension, by default None Returns ------- cg_fname : str Name of the CG file. """ # Suffix for force-field if defined ff_suffix = f"_{force_field}" if force_field else "" # Set file extension file_ext = ext if ext else Format.PDB # Generate filepath cg_fpath = Path(output_dir, f"{Path(input_fname).stem}_cg{ff_suffix}.{file_ext}") cg_fname = str(cg_fpath) return cg_fname
[docs] def gen_cg_tbl_backmapping_fname(output_dir: str, input_fname: str) -> Path: """Helper function to generate CG backmapping retraints filename. Parameters ---------- output_dir : str Where to write the file. input_fname : str Name of the original input PDB file. Returns ------- tbl_file_name: Path Name of backmapping restraint filename. """ tbl_file_name = Path(output_dir, f"{Path(input_fname).stem}_cg_to_aa.tbl") return tbl_file_name