# CODE TAKEN FROM MARTINIZE 1.1 ##
# Please refer to it for more information
"""
Reduces complexity of protein residue to the MARTINI coarse grained model:
CA, O, Bead(s) in specific atom location.
Reference:
Monticelli et al. The MARTINI coarse-grained force field: extension to proteins.
J. Chem. Theory Comput. (2008) vol. 4 (5) pp. 819-834
Martinize Script from Tserk Wassenaar
Uses Biopython to parse the structure and DSSP output.
Uses pieces of the martinize-1.1.py script to convert the SS types
Outputs a coarse grained pdb file (``*_ss.pdb``) with assigned bfactors.
Outputs a tbl file to map the beads to the atoms they represent.
Updates
- Updated python version to 2.6 to support isdisjoint() set method in DSSP.py (JR Apr 2012)
- Residues that DSSP can't handle (incomplete backbone f ex) treated as coil (JR Apr 2012)
- Update to_one_letter_code library to protein_letters_3to1 (Jorge Roel 2017)
- Inclusion of fake beads for corresponding amino-acids <SCd> (Jorge Roel 2017)
- Changed the mapping routine to include DNA bead types (Rodrigo Honorato 2018)
- Implemented feature to check if nucleic acid is a candidate for hbond (Rodrigo Honorato 2018)
"""
import collections
import itertools
import math
import os
import random
import subprocess
import tempfile
import warnings
from pathlib import Path
from io import StringIO
from Bio.PDB import Entity, PDBIO, PDBParser
from Bio.PDB.Structure import Structure
from Bio.PDB.StructureBuilder import StructureBuilder
from haddock import log
from haddock.core.exceptions import ModuleError
from haddock.core.typing import Optional
from haddock.libs.libontology import Format
warnings.filterwarnings("ignore")
CRYST_LINE = "CRYST1 " + os.linesep
[docs]
def norm(a):
"""
Args:
a:
Returns:
"""
return math.sqrt(norm2(a))
[docs]
def norm2(a):
"""
Args:
a:
Returns:
"""
return sum([i * i for i in a])
[docs]
def pat(x, c="."):
"""
Reformats pattern strings.
Args:
x:
c:
Returns:
"""
return x.replace(c, "\x00").split()
[docs]
def hash(x, y):
"""
Makes a dictionary from two lists.
Args:
x:
y:
Returns:
"""
return dict(zip(x, y))
[docs]
def spl(x):
"""
Splits a string.
Args:
x:
Returns:
"""
return x.split()
[docs]
def tt(program):
"""
Args:
program:
Returns:
"""
return "".join([ssd[program].get(chr(i), "C") for i in range(256)])
[docs]
def typesub(seq, patterns, types):
"""
Pattern substitutions.
Args:
seq:
patterns:
types:
Returns:
"""
for i, j in zip(patterns, types):
seq = seq.replace(i, j)
return seq
[docs]
def ss_classification(ss, program="dssp"):
"""
Translates a string encoding the secondary structure to a string of corresponding Martini types, taking the
origin of the secondary structure into account, and replacing termini if requested.
Args:
ss:
program:
Returns:
"""
# Translate dssp/pymol/gmx ss to Martini ss
ss = ss.translate(sstt[program])
# Separate the different secondary structure types
sep = dict([(i, ss.translate(sstd[i])) for i in sstd.keys()])
# Do type substitutions based on patterns
# If the ss type is not in the patterns lists, do not substitute
# (use empty lists for substitutions)
typ = [
typesub(sep[i], patterns.get(i, []), pattypes.get(i, [])) for i in sstd.keys()
]
# Translate all types to numerical values
typ = [[ord(j) for j in list(i)] for i in typ]
# Sum characters back to get a full typed sequence
typ = "".join([chr(sum(i)) for i in zip(*typ)])
# Return both the actual as well as the fully typed sequence
return ss, typ
# ----+--------------------------------------+
# A | SECONDARY STRUCTURE TYPE DEFINITIONS |
# ----+--------------------------------------+
ssdefs = {
"dssp": list(".HGIBETSC~"), # DSSP one letter secondary structure code #@#
"pymol": list(".H...S...L"), # Pymol one letter secondary structure code #@#
"gmx": list(".H...ETS.C"), # Gromacs secondary structure dump code #@#
"self": list("FHHHEETSCC"), # Internal CG secondary structure codes #@#
}
cgss = list("FHHHEETSCC") # Corresponding CG secondary structure types #@#
patterns = {
"H": pat(".H. .HH. .HHH. .HHHH. .HHHHH. .HHHHHH. .HHHHHHH. .HHHH HHHH.") # @#
}
pattypes = {
"H": pat(".3. .33. .333. .3333. .13332. .113322. .1113222. .1111 2222.") # @#
}
ss_to_code = {
"C": 1, # Free,
" ": 1,
"S": 2,
"H": 3,
"1": 4,
"2": 5,
"3": 6,
"E": 7, # Extended
"T": 8, # Turn
"F": 9, # Fibril
}
ss_eq = list("CBHHHHBTF")
# List of programs for which secondary structure definitions can be processed
programs = ssdefs.keys()
# Dictionaries mapping ss types to the CG ss types
ssd = dict([(i, hash(ssdefs[i], cgss)) for i in programs])
# The translation table depends on the program used to obtain the
# secondary structure definitions
sstt = dict([(i, tt(i)) for i in programs])
# The following translation tables are used to identify stretches of
# a certain type of secondary structure.
null = "\x00"
sstd = dict([(i, ord(i) * null + i + (255 - ord(i)) * null) for i in cgss])
# ==========================================================================================#
# ==========================================================================================#
# ==========================================================================================#
# CG MAPPING INFORMATION
bb = "CA C N O "
prot_atoms = {
"ALA": [bb + "CB"],
"CYS": [bb, "CB SG"],
"ASP": [bb, "CB CG OD1 OD2"],
"GLU": [bb, "CB CG CD OE1 OE2"],
"PHE": [bb, "CB CG CD1", "CD2 CE2", "CE1 CZ"],
"GLY": [bb],
"HIS": [bb, "CB CG", "CD2 NE2", "ND1 CE1"],
"ILE": [bb, "CB CG1 CG2 CD1"],
"LYS": [bb, "CB CG CD", "CE NZ"],
"LEU": [bb, "CB CG CD1 CD2"],
"MET": [bb, "CB CG SD CE"],
"ASN": [bb, "CB CG ND1 ND2 OD1 OD2"], # ND1?
"PRO": [bb, "CB CG CD"],
"GLN": [bb, "CB CG CD OE1 OE2 NE1 NE2"],
"ARG": [bb, "CB CG CD", "NE CZ NH1 NH2"],
"SER": [bb, "CB OG"],
"THR": [bb, "CB OG1 CG2"],
"VAL": [bb, "CB CG1 CG2"],
"TRP": [bb, "CB CG CD2", "CD1 NE1 CE2", "CE3 CZ3", "CZ2 CH2"],
"TYR": [bb, "CB CG CD1", "CD2 CE2", "CE1 CZ OH"],
}
bead_names = ["BB", "SC1", "SC2", "SC3", "SC4"]
# insert beads into the data structure
cg_mapping = {}
for res in prot_atoms:
cg_mapping[res] = collections.OrderedDict()
for i, atom_l in enumerate(prot_atoms[res]):
bead = bead_names[i]
cg_mapping[res][atom_l] = bead
######################################
# Nucleotide mapping,
# This is a custom naming convention
# but the atom mapping is defined in
# 10.1021/acs.jctc.5b00286 - S1
######################################
DA_beads = collections.OrderedDict()
DA_beads["O3'* P O1P O2P O5' OP1 OP2"] = "BB1"
DA_beads["C5' O4' C4'"] = "BB2"
DA_beads["C3' C2' C1'"] = "BB3"
DA_beads["N9 C4"] = "SC1"
DA_beads["C2 N3"] = "SC2"
DA_beads["C6 N6 N1"] = "SC3"
DA_beads["C8 N7 C5"] = "SC4"
DC_beads = collections.OrderedDict()
DC_beads["O3'* P O1P O2P O5' OP1 OP2"] = "BB1"
DC_beads["C5' O4' C4'"] = "BB2"
DC_beads["C3' C2' C1'"] = "BB3"
DC_beads["N1 C6"] = "SC1"
DC_beads["N3 C2 O2"] = "SC2"
DC_beads["C5 C4 N4"] = "SC3"
DG_beads = collections.OrderedDict()
DG_beads["O3'* P O1P O2P O5' OP1 OP2"] = "BB1"
DG_beads["C5' O4' C4'"] = "BB2"
DG_beads["C3' C2' C1'"] = "BB3"
DG_beads["N9 C4"] = "SC1"
DG_beads["C2 N2 N3"] = "SC2"
DG_beads["C6 O6 N1"] = "SC3"
DG_beads["C8 N7 C5"] = "SC4"
DT_beads = collections.OrderedDict()
DT_beads["O3'* P O1P O2P O5' OP1 OP2"] = "BB1"
DT_beads["C5' O4' C4'"] = "BB2"
DT_beads["C3' C2' C1'"] = "BB3"
DT_beads["N1 C6"] = "SC1"
DT_beads["N3 C2 O2"] = "SC2"
DT_beads["C5 C4 O4 C7"] = "SC3"
A_beads = collections.OrderedDict()
A_beads["O3'* P O1P O2P O5' OP1 OP2"] = "BB1"
A_beads["C5' O4' C4'"] = "BB2"
A_beads["C3' C2' O2' C1'"] = "BB3"
A_beads["N9 C4"] = "SC1"
A_beads["C2 N3"] = "SC2"
A_beads["C6 N6 N1"] = "SC3"
A_beads["C8 N7 C5"] = "SC4"
C_beads = collections.OrderedDict()
C_beads["O3'* P O1P O2P O5' OP1 OP2"] = "BB1"
C_beads["C5' O4' C4'"] = "BB2"
C_beads["C3' C2' O2' C1'"] = "BB3"
C_beads["N1 C6"] = "SC1"
C_beads["N3 C2 O2"] = "SC2"
C_beads["C5 C4 N4"] = "SC3"
G_beads = collections.OrderedDict()
G_beads["O3'* P O1P O2P O5' OP1 OP2"] = "BB1"
G_beads["C5' O4' C4'"] = "BB2"
G_beads["C3' C2' O2' C1'"] = "BB3"
G_beads["N9 C4"] = "SC1"
G_beads["C2 N2 N3"] = "SC2"
G_beads["C6 O6 N1"] = "SC3"
G_beads["C8 N7 C5"] = "SC4"
U_beads = collections.OrderedDict()
U_beads["O3'* P O1P O2P O5' OP1 OP2"] = "BB1"
U_beads["C5' O4' C4'"] = "BB2"
U_beads["C3' C2' O2' C1'"] = "BB3"
U_beads["N1 C6"] = "SC1"
U_beads["N3 C2 O2"] = "SC2"
U_beads["C5 C4 O4"] = "SC3"
cg_mapping["DA"] = DA_beads
cg_mapping["DC"] = DC_beads
cg_mapping["DT"] = DT_beads
cg_mapping["DG"] = DG_beads
cg_mapping["A"] = A_beads
cg_mapping["C"] = C_beads
cg_mapping["U"] = U_beads
cg_mapping["G"] = G_beads
pairing = {
("DG", "DC"): [("N2", "O2"), ("N1", "N3"), ("O6", "N4")],
("DC", "DG"): [("O2", "N2"), ("N3", "N1"), ("N4", "O6")],
("DA", "DT"): [("N6", "O4"), ("N1", "N3")],
("DT", "DA"): [("O4", "N6"), ("N3", "N1")],
#
("G", "C"): [("N2", "O2"), ("N1", "N3"), ("O6", "N4")],
("C", "G"): [("O2", "N2"), ("N3", "N1"), ("N4", "O6")],
("A", "U"): [("N6", "O4"), ("N1", "N3")],
("U", "A"): [("O4", "N6"), ("N3", "N1")],
}
polar = ["GLN", "ASN", "SER", "THR"]
charged = ["ARG", "LYS", "ASP", "GLU"]
[docs]
def add_dummy(bead_list, dist=0.11, n=2):
"""
Args:
bead_list:
dist:
n:
Returns:
"""
new_bead_dic = {}
# Generate a random vector in a sphere of -1 to +1, to add to the bead position
v = [
random.random() * 2.0 - 1,
random.random() * 2.0 - 1,
random.random() * 2.0 - 1,
]
# Calculated the length of the vector and divide by the final distance of the dummy bead
norm_v = norm(v) / dist
# Resize the vector
vn = [i / norm_v for i in v]
# m sets the direction of the added vector, currently only works when adding one or two beads.
m = 1
for j in range(n): # create two new beads
bead_s = str(j + 1)
new_name = f"SCD{bead_s}" # set the name of the new bead
new_bead_dic[new_name] = [i + (m * j) for i, j in zip(bead_list[-1][1], vn)]
m *= -2
return new_bead_dic
[docs]
def map_cg(chain):
"""
Args:
chain:
Returns:
"""
m_dic = collections.OrderedDict()
for aares in chain:
m_dic[aares] = collections.OrderedDict()
resn = aares.resname.split()[0] # resname
segid = aares.segid.strip()
resi = aares.id[1]
if resn not in cg_mapping:
log.warning(f"Residue {resn} not in cg_mapping — skipping.")
continue # skip to next residue
# for each atom segment, calculate its center of mass and map the correct bead
for atom_segment in cg_mapping[resn]:
atoms = [aares[a] for a in atom_segment.split() if a in aares.child_dict]
if atoms:
if (
"*" in atom_segment
): # this is important to correctly place CG DNA beads
# this * means it belongs to the previous residue... find it!
target_previous_atom_list = [
a for a in atom_segment.split() if "*" in a
]
for target_atom in target_previous_atom_list:
# does it exist?
target_atom_name = target_atom.split("*")[0]
try:
previous_atom = chain[resi - 1][target_atom_name]
# how far away the previous atom is from this atom segment?
# if it is too far away this could be the next chain...!
minimum_dist = min([(a - previous_atom) for a in atoms])
if minimum_dist < 2.0: # 2.0 A is very permissive
atoms.append(previous_atom)
except KeyError:
# previous atom not found, move on
pass
if not atoms:
log.warning(
"Residue {} {:d} of chain {} cannot be processed: missing atoms {} ".format(
resn, resi, aares.parent.id, atom_segment
)
)
continue
bead_name = cg_mapping[resn][atom_segment]
# get center of mass
code = list(set([a.bfactor for a in aares if a.bfactor != 0]))
if len(code) > 1:
emsg = "Something is wrong with HADDOCK codes"
raise ModuleError(emsg)
if not code:
code = 0.0
else:
code = code[0]
bead_coord = center_of_mass(atoms)
atom_segment = " ".join(list(atom_segment.split())).replace("*", "")
# restrain for backmapping
restrain = (
"assign (segid {}CG and resid {:d} and name {})"
" (segid {} and resid {:d} and (name {})) 0 0 0".format(
segid,
resi,
bead_name,
segid,
resi,
" or name ".join(atom_segment.split()),
)
)
m_dic[aares][bead_name] = bead_coord, code, restrain
# add dummy beads whenever its needed
for r in m_dic:
if r.resname in polar:
d = 0.14 # distance
n = 2 # number of dummy beads to be placed
elif r.resname in charged:
d = 0.11 # distance
n = 1 # number of dummy beads to be placed
else:
continue
# add to data structure
# this special beads have no HADDOCK code
bead_list = [(b, m_dic[r][b][0]) for b in m_dic[r]]
dummy_bead_dic = add_dummy(bead_list, dist=d, n=n)
for db in dummy_bead_dic:
db_coords = dummy_bead_dic[db]
# code should be the same as the residue
code = m_dic[r][list(m_dic[r])[0]][1]
m_dic[r][db] = (db_coords, code, None)
return m_dic
[docs]
def center_of_mass(entity, geometric=False):
"""
Returns gravitic [default] or geometric center of mass of an Entity.
Geometric assumes all masses are equal (geometric=True)
Args:
entity:
geometric:
Returns:
"""
# Structure, Model, Chain, Residue
if isinstance(entity, Entity.Entity):
atom_list = entity.get_atoms()
# List of Atoms
elif hasattr(entity, "__iter__") and [x for x in entity if x.level == "A"]:
atom_list = entity
else: # Some other weirdo object
raise ValueError(
"Center of Mass can only be calculated from the following objects:\n"
"Structure, Model, Chain, Residue, list of Atoms."
)
masses = []
positions = [[], [], []] # [ [X1, X2, ..] , [Y1, Y2, ...] , [Z1, Z2, ...] ]
for atom in atom_list:
masses.append(atom.mass)
for i, coord in enumerate(atom.coord.tolist()):
positions[i].append(coord)
# If there is a single atom with undefined mass complain loudly.
if "ukn" in set(masses) and not geometric:
raise ValueError(
f"Some Atoms don't have an element assigned.{os.linesep}"
"Try adding them manually or calculate the geometrical "
"center of mass instead."
)
if geometric:
return [sum(coord_list) / len(masses) for coord_list in positions]
w_pos = [[], [], []]
for atom_index, atom_mass in enumerate(masses):
w_pos[0].append(positions[0][atom_index] * atom_mass)
w_pos[1].append(positions[1][atom_index] * atom_mass)
w_pos[2].append(positions[2][atom_index] * atom_mass)
return [sum(coord_list) / sum(masses) for coord_list in w_pos]
[docs]
def determine_hbonds(structure: Structure):
"""
Args:
structure:
Returns:
"""
nuc = ["DA", "DC", "DG", "DT", "A", "C", "G", "U"]
aa = [
"ALA",
"CYS",
"ASP",
"GLU",
"PHE",
"GLY",
"HIS",
"ILE",
"LYS",
"LEU",
"MET",
"ASN",
"PRO",
"GLN",
"ARG",
"SER",
"THR",
"VAL",
"TRP",
"TYR",
]
pair_list = []
for model in structure:
dna_chain_l = []
for chain in model:
prot_comp = len(
[r for r in chain.get_residues() if r.resname.split()[0] in aa]
)
dna_comp = len(
[r for r in chain.get_residues() if r.resname.split()[0] in nuc]
)
if prot_comp:
# protein
pass
if dna_comp:
# nucleic
dna_chain_l.append(chain)
if len(dna_chain_l) == 1:
log.warning("Only one DNA/RNA chain detected, is this correct?")
chain_a = dna_chain_l[0]
reslist_a = [r for r in chain_a.get_residues()]
for ra, rb in itertools.combinations(reslist_a, 2):
pair = identify_pairing(ra, rb)
if pair:
pair_list.append(pair)
if (
len(dna_chain_l) > 1
): # list sizes could be different, this might be improbable
for chain_a, chain_b in itertools.combinations(dna_chain_l, 2):
reslist_a = [r for r in chain_a.get_residues()]
reslist_b = [r for r in chain_b.get_residues()]
for ra in reslist_a:
for rb in reslist_b:
pair = identify_pairing(ra, rb)
pair_list.append(pair)
return pair_list
[docs]
def identify_pairing(ra, rb):
"""
Args:
ra:
rb:
Returns:
"""
pair = []
# check if the pairing is correct
ra_name = ra.resname.split()[0]
rb_name = rb.resname.split()[0]
try:
atom_pair_list = pairing[ra_name, rb_name]
except KeyError:
# pairing not possible
return
# check if distances are ok
distance_l = []
for atom_list in atom_pair_list:
try:
a = ra[atom_list[0]]
b = rb[atom_list[1]]
distance_l.append(a - b)
except KeyError:
# residue does not have the necessary sidechain atoms
# assume its not a pair
return
# check P-P distances to make sure its the opposite base
# distances for perfect DNA:
# opposite = 18.8A
# sequential = 6.6A
#
# 10.0A should be sufficient
p_cutoff = 10.0
# Basedist_cutoff = 3.5
basedist_cutoff = 3.5
try:
pa = ra.child_dict["P"]
pb = rb.child_dict["P"]
p_distance = pa - pb
except KeyError:
# some base is missing its P, use the geometric center instead
cen_a = center_of_mass(ra.child_dict.values())
cen_b = center_of_mass(rb.child_dict.values())
p_distance = math.sqrt(sum([(a - b) ** 2 for a, b in zip(cen_a, cen_b)]))
if p_distance > p_cutoff:
resnum_a = ra.id[1]
resnum_b = rb.id[1]
if all(
e < basedist_cutoff for e in distance_l
): # if ALL bonds are within range
# KEEP IN MIND THAT this will not account for badly paired DNA
# Implement a way to search for the closest possible pair? ##
segid_a = ra.get_segid().split()[0]
segid_b = rb.get_segid().split()[0]
pair = (resnum_a, segid_a), (resnum_b, segid_b)
# special atoms, mark them!
for atom_pair in atom_pair_list:
atom_a, atom_b = atom_pair
ra[atom_a].bfactor = 1
rb[atom_b].bfactor = 1
return pair
[docs]
def output_cg_restraints(pair_list):
"""
Args:
pair_list:
Returns:
"""
out = open("dna_restraints.def", "w")
for i, e in enumerate(pair_list):
idx = i + 1
res_a = e[0][0]
segid_a = e[0][1]
res_b = e[1][0]
segid_b = e[1][1]
out.write(
f"{{===>}} base_a_{idx}=(resid {res_a} and segid {segid_a});\n"
f"{{===>}} base_b_{idx}=(resid {res_b} and segid {segid_b});\n\n"
)
out.close()
[docs]
def create_file_with_cryst(pdb_file: str) -> None:
"""
This function creates a new pdb because the CRYST line is missing from the pdf file.
This line is necessary for DSSP.
Args:
output: str
pdb_file: str
Returns:
pdb_file_copy: str
"""
with (
open(pdb_file, "r") as file_in,
tempfile.NamedTemporaryFile(mode="w", delete=False) as file_out,
):
content = file_in.read()
file_out.write(CRYST_LINE + content)
return file_out.name
[docs]
def determine_ss(structure: Structure, skipss: bool, pdbf_path: str) -> Structure:
"""Determine secondary structures from input structure
Args:
structure:
skipss:
pdbf_path:
Returns:
structure:
"""
# calculate SS
for model in structure:
if skipss:
continue
try:
tmp_file_name = create_file_with_cryst(pdbf_path)
p = subprocess.Popen(
["dssp", tmp_file_name, "--output-format", "dssp"],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
)
dssp_raw, _ = p.communicate()
dssp_raw = dssp_raw.split("#")[1].split("\n")[1:-1]
except: # TODO: think about making this exception more specific
# no secondary structure detected for this model
log.warning("SS could not be assigned, assigning code 1 to all residues")
continue
finally:
if Path(tmp_file_name).exists():
Path(tmp_file_name).unlink()
dssp = {}
for line in dssp_raw:
var_a, var_b = line[11], line[16]
dssp.setdefault(var_a, []).append(var_b)
# Get SS information and translate it:
# DSSP > MARTINI > HADDOCK
# this could still be improved
for chain in model:
dssp_ss = "".join(dssp[chain.id])
_, martini_types = ss_classification(
dssp_ss
) # ancestral function, keep it there
# transform MARTINI > HADDOCK
# and add it to the bfactor col
for residue, ss in zip(
chain, martini_types
): # chain and martini_types order must match
code = ss_to_code[ss]
# for atom in residue.get_atoms():
for atom in residue:
atom.bfactor = code
return structure
[docs]
def rename_nucbases(structure: Structure) -> None:
"""Inplace residue renaming according to HADDOCK ones.
Parameters
----------
structure : Bio.PDB.Structure.Structure
Input structure
"""
chainresdic = {
c.get_id(): [r.get_resname() for r in c.get_residues()]
for m in structure
for c in m
}
nucleotide_list = [
"CYT",
"C",
"DC",
"THY",
"T",
"DT",
"ADE",
"A",
"DA",
"G",
"GUA",
"DG",
"U",
"URI",
]
rna_resname_mapper = {"CYT": "C", "URI": "U", "ADE": "A", "GUA": "G"}
dna_rename_mapper = {"CYT": "DC", "THY": "DT", "ADE": "DA", "GUA": "DG"}
if [True for c in chainresdic for e in chainresdic[c] if e in nucleotide_list]:
# Check if this is an RNA
is_rna = [
True for c in chainresdic for e in chainresdic[c] if e in ["U", "URI"]
]
ref_dic = rna_resname_mapper if is_rna else dna_rename_mapper
# Loop over models
for model in structure:
for chain in model:
for r in chain.get_residues():
if r.resname in ref_dic.keys():
# Rename residue name
r.resname = ref_dic[r.resname]
[docs]
def martinize(
input_pdb: str,
output_path: str,
skipss: bool,
) -> tuple[str, bool]:
"""
Converts an all-atom (AA) PDB structure into a coarse-grained (CG) model
using a MARTINI2.2 mapping and generating CG-to-AA restraints for backmapping.
Optionally uses secondary structure for mapping.
Args:
input_pdb (str):
Path to the input AA PDB file.
output_path (str):
Directory where the output files will be written.
- A CG PDB file (``*_cg.pdb``)
- A restraint table (``*_cg_to_aa.tbl``)
skipss (bool):
If True, skips secondary structure assignment (DSSP step).
If False, assigns secondary structure and encodes it
into HADDOCK-compatible B-factors.
Returns:
tuple[str, bool]:
cg_pdb_name: Path to the generated CG PDB file.
shape: True if at least one residue with name "SHA" (shape bead)
is detected in the structure, False otherwise.
"""
if not input_pdb:
emsg = "No input file detected"
raise ModuleError(emsg)
p = PDBParser()
io = PDBIO()
# Parse PDB and run DSSP
pdbf_path = os.path.realpath(input_pdb)
aa_model = p.get_structure("aa_model", pdbf_path)
# set ALL bfactors to 1
for model in aa_model:
for chain in model:
if chain.id == " ":
emsg = "Empty chain id detected"
raise ModuleError(emsg)
for residue in chain:
for atom in residue:
atom.bfactor = 1.0
# Assign HADDOCK code according to SS (1-9)
determine_ss(structure=aa_model, skipss=skipss, pdbf_path=pdbf_path)
# Strandardize naming
# WARNING, THIS ASSUMES THAT INPUT DNA/RNA IS 3-LETTER CODE
rename_nucbases(aa_model)
# Assign HADDOCK code for hydrogen bonding capable nucleotides (0-1)
pair_list = determine_hbonds(aa_model)
if pair_list:
output_cg_restraints(pair_list)
# Map CG beads to AA structure
structure_builder = StructureBuilder()
structure_builder.init_structure("cg_model")
structure_builder.init_seg(" ") # Empty SEGID
tbl_cg_to_aa = []
restrain_counter = 0
for model in aa_model:
structure_builder.init_model(model.id)
for chain in model:
structure_builder.init_chain(chain.id)
structure_builder.init_seg(chain.id)
mapping_dic = map_cg(chain)
for residue in mapping_dic:
if residue.id[0] != " ": # filter HETATMS
continue
structure_builder.init_residue(
residue.resname, residue.id[0], residue.id[1], residue.id[2]
)
for i, bead in enumerate(mapping_dic[residue]):
bead_name = bead
bead_coord = mapping_dic[residue][bead_name][0]
haddock_code = mapping_dic[residue][bead_name][1]
restrain = mapping_dic[residue][bead_name][2]
structure_builder.init_atom(
bead_name, bead_coord, haddock_code, 1.00, " ", bead_name, i
)
tbl_cg_to_aa.append(restrain)
restrain_counter += 1
# Write CG to AA backmapping restraint file
tbl_file_name = gen_cg_tbl_backmapping_fname(f"{output_path}", pdbf_path)
with open(tbl_file_name, "w") as tbl_file:
tbl_file.write("\n" + "\n".join([tbl for tbl in tbl_cg_to_aa if tbl]))
# Write pre-CG structure
cg_model = structure_builder.get_structure()
io.set_structure(cg_model)
# Setup in-memory text buffer
io_file = StringIO()
# Write file in it
io.save(io_file, write_end=1)
# Go back to the start of the file to read it
io_file.seek(0)
# Write the actual valid CG structure
# make sure atom names are in the correct place
# .BB. .BB1. .BB2. and not BB.. BB1.. BB2..
cg_pdb_name = gen_cg_filename(f"{output_path}", pdbf_path)
out = open(cg_pdb_name, "w")
for line in io_file.readlines():
n_l = line
if line.startswith("ATOM"):
atom_name = line[12:16].split()[0]
# mind the spacing
if 1 <= len(atom_name) <= 3:
n_l = f"{line[:12]} {atom_name:<3s}{line[16:]}"
out.write(n_l)
out.close()
del io_file
return cg_pdb_name
[docs]
def gen_cg_filename(
output_dir: str,
input_fname: str,
force_field: Optional[str] = None,
ext: Optional[str] = None,
) -> str:
"""Helper function to standarize CG filename from input file.
Parameters
----------
output_dir : str
Where to write the file.
input_fname : str
Name of the original input PDB file.
force_field : Optional[str], optional
Name of the force-field, by default None
ext : Optional[str], optional
File extension, by default None
Returns
-------
cg_fname : str
Name of the CG file.
"""
# Suffix for force-field if defined
ff_suffix = f"_{force_field}" if force_field else ""
# Set file extension
file_ext = ext if ext else Format.PDB
# Generate filepath
cg_fpath = Path(output_dir, f"{Path(input_fname).stem}_cg{ff_suffix}.{file_ext}")
cg_fname = str(cg_fpath)
return cg_fname
[docs]
def gen_cg_tbl_backmapping_fname(output_dir: str, input_fname: str) -> Path:
"""Helper function to generate CG backmapping retraints filename.
Parameters
----------
output_dir : str
Where to write the file.
input_fname : str
Name of the original input PDB file.
Returns
-------
tbl_file_name: Path
Name of backmapping restraint filename.
"""
tbl_file_name = Path(output_dir, f"{Path(input_fname).stem}_cg_to_aa.tbl")
return tbl_file_name