Source code for haddock.libs.libpdb

"""Parse molecular structures in PDB format."""

from contextlib import redirect_stdout
import os
from copy import deepcopy
from functools import partial
from io import StringIO
from pathlib import Path

from pdbtools.pdb_segxchain import run as place_seg_on_chain
from pdbtools.pdb_splitchain import run as split_chain
from pdbtools.pdb_splitmodel import run as split_model
from pdbtools.pdb_tidy import run as tidy_pdbfile
from pdbtools.pdb_wc import run as pdb_wc

from haddock.core.exceptions import SetupError
from haddock.core.supported_molecules import supported_residues
from haddock.core.typing import (
    Callable,
    FilePath,
    FilePathT,
    Iterable,
    Optional,
    TypeVar,
    Union,
)
from haddock import log as haddock_log
from haddock.libs.libio import PDBFile, working_directory
from haddock.libs.libutil import get_result_or_same_in_list, sort_numbered_paths

# Column slices for the fixed-width fields of a PDB coordinate line.
# Indices are 0-based and end-exclusive, so they can be applied directly to
# the line string, e.g. ``line[slc_chainid]``. The trailing comments give the
# equivalent 1-based, inclusive column numbers.
slc_record = slice(0, 6)  # columns 1-6
slc_serial = slice(6, 11)  # columns 7-11
slc_name = slice(12, 16)  # columns 13-16
slc_altloc = slice(16, 17)  # column 17
slc_resname = slice(17, 20)  # columns 18-20
slc_chainid = slice(21, 22)  # column 22
slc_resseq = slice(22, 26)  # columns 23-26
slc_icode = slice(26, 27)  # column 27
slc_x = slice(30, 38)  # columns 31-38
slc_y = slice(38, 46)  # columns 39-46
slc_z = slice(46, 54)  # columns 47-54
slc_occ = slice(54, 60)  # columns 55-60
slc_temp = slice(60, 66)  # columns 61-66
slc_segid = slice(72, 76)  # columns 73-76
slc_element = slice(76, 78)  # columns 77-78
slc_charge = slice(78, 80)  # columns 79-80



[docs]
def format_atom_name(atom: str, element: str) -> str:
    """
    Pad an atom name so that it fills the four-character PDB name field.

    Names belonging to a one-letter element are written starting at the
    second character of the field (a leading blank), unless the name is
    already four characters long. Names belonging to a two-letter element
    always start at the first character. In every case the result is
    left-justified and four characters wide.

    Further Reading:

    * https://www.cgl.ucsf.edu/chimera/docs/UsersGuide/tutorials/pdbintro.html

    Parameters
    ----------
    atom : str
        The atom name. Surrounding whitespace is ignored.

    element : str
        The atom element code. Surrounding whitespace is ignored.

    Returns
    -------
    str
        The atom name formatted as a four-character string.

    Raises
    ------
    KeyError
        If the stripped element is not 1 or 2 characters long, or the
        stripped atom name is not 1 to 4 characters long.
    """
    with_leading_blank = " {:<3s}"
    full_width = "{:<4s}"
    # Outer key: length of the element code; inner key: length of the name.
    # Any combination missing from this table is considered an error.
    templates = {
        1: {
            1: with_leading_blank,
            2: with_leading_blank,
            3: with_leading_blank,
            4: full_width,
        },
        2: dict.fromkeys((1, 2, 3, 4), full_width),
    }

    name = atom.strip()
    try:
        template = templates[len(element.strip())][len(name)]
    except KeyError as err:
        # A builtin KeyError is re-raised on purpose (rather than a
        # project-specific exception) so that unsupported combinations
        # surface as bugs instead of being handled silently upstream.
        message = f"Could not format this atom:type -> {atom}:{element}"
        raise KeyError(message) from err
    return template.format(name)




[docs]
def get_supported_residues(haddock_topology: FilePath) -> list[str]:
    """
    List the residue names declared in a topology file.

    A line declares a residue when its first four characters, compared
    case-insensitively, contain ``resi``. The residue name is the second
    whitespace-separated token of that line.

    Parameters
    ----------
    haddock_topology : str or pathlib.Path
        Path to the topology file.

    Returns
    -------
    list of str
        Residue names in the order they appear in the file.
    """
    with open(haddock_topology) as topology:
        return [
            entry.split()[1]
            for entry in topology
            if "resi" in entry[:4].casefold()
        ]



# Substrings that disqualify a line during `sanitize`: any line containing
# one of these anywhere (plain substring match, so "REMAR" also matches
# "REMARK") is dropped.
_to_remove = ["REMAR", "CTERB", "CTERA", "NTERA", "NTERB", "CONECT"]

# Text substitutions applied by `sanitize` to every line that is not
# dropped, using `str.replace` in the order listed here. Keys are replaced
# wherever they occur in the line, not only in a specific column.
_to_rename = {
    "HSD": "HIS",
    "HSE": "HIS",
    "HID": "HIS",
    "HIE": "HIS",
    "WAT ": "TIP3",
    " 0.00969": " 0.00   ",
}

# Get list of supported residues to keep when pre-processing
# (`sanitize` only keeps lines whose residue-name columns match one of them).
_to_keep = list(supported_residues)



[docs]
def split_ensemble(pdb_file_path: Path, dest: Optional[FilePath] = None) -> list[Path]:
    """
    Split a multimodel PDB file into different structures.

    The input is tidied with ``pdb_tidy`` and then handed to
    ``pdb_splitmodel`` while the working directory is switched to `dest`.

    Parameters
    ----------
    pdb_file_path : pathlib.Path
        Path to the (possibly multimodel) PDB file. Must be an existing file.

    dest : str or pathlib.Path, optional
        Destination folder. Defaults to the current working directory.

    Returns
    -------
    list of pathlib.Path
        The result of `get_new_models` for `pdb_file_path`, ordered by
        `sort_numbered_paths`.
    """
    target_dir = Path.cwd() if dest is None else dest
    assert pdb_file_path.is_file(), pdb_file_path
    with open(pdb_file_path) as pdb_handle, working_directory(target_dir):
        # tidy pdb before splitting
        split_model(tidy_pdbfile(pdb_handle), outname=str(pdb_file_path.stem))

    # NOTE(review): the lookup below runs after the working-directory context
    # has exited and globs the current directory - presumably callers run
    # this with `dest` equal to the current directory; verify.
    new_models = get_new_models(pdb_file_path)
    return sort_numbered_paths(*new_models)




[docs]
def split_by_chain(pdb_file_path: FilePath) -> list[Path]:
    """
    Split a PDB file into multiple structures, one for each chain.

    ``pdb_splitchain`` is run while the working directory is switched to
    the folder containing the input file.

    Parameters
    ----------
    pdb_file_path : str or pathlib.Path
        Path to the PDB file to split.

    Returns
    -------
    list of pathlib.Path
        The result of `get_new_models` for `pdb_file_path`.
    """
    source_dir = Path(pdb_file_path).resolve().parent.absolute()
    with open(pdb_file_path) as pdb_handle, working_directory(source_dir):
        split_chain(pdb_handle)

    # NOTE(review): `get_new_models` globs the current directory, which at
    # this point is no longer `source_dir` - confirm callers expect that.
    return get_new_models(pdb_file_path)




[docs]
def tidy(pdb_file_path: FilePath, new_pdb_file_path: FilePath) -> None:
    """
    Tidy a PDB structure and write the result to a new file.

    Parameters
    ----------
    pdb_file_path : str or pathlib.Path
        Path to the PDB file to tidy.

    new_pdb_file_path : str or pathlib.Path
        Path of the file receiving the lines produced by ``pdb_tidy``.
        It is opened after the working directory has been switched to the
        folder of `pdb_file_path`, so a relative path is presumably
        resolved against that folder - verify against `working_directory`.
    """
    source_dir = Path(pdb_file_path).resolve().parent.absolute()
    with open(pdb_file_path) as pdb_handle, working_directory(source_dir), open(
        new_pdb_file_path, "w"
    ) as tidied_handle:
        tidied_handle.writelines(tidy_pdbfile(pdb_handle))




[docs]
def swap_segid_chain(pdb_file_path: FilePath, new_pdb_file_path: FilePath) -> None:
    """
    Add to the Chain ID column the found Segid.

    The input is processed by ``pdb_segxchain`` and the resulting lines are
    written to `new_pdb_file_path`.

    Parameters
    ----------
    pdb_file_path : str or pathlib.Path
        Path to the PDB file to read.

    new_pdb_file_path : str or pathlib.Path
        Path of the file to write. It is opened after the working directory
        has been switched to the folder of `pdb_file_path`, so a relative
        path is presumably resolved against that folder - verify against
        `working_directory`.
    """
    source_dir = Path(pdb_file_path).resolve().parent.absolute()
    with open(pdb_file_path) as pdb_handle, working_directory(source_dir), open(
        new_pdb_file_path, "w"
    ) as swapped_handle:
        swapped_handle.writelines(place_seg_on_chain(pdb_handle))




[docs]
def sanitize(
    pdb_file_path: FilePathT,
    overwrite: bool = True,
    custom_topology: Optional[FilePath] = None,
) -> Union[FilePathT, Path]:
    """
    Sanitize a PDB file.

    Every line of the input is processed as follows:

    1. lines containing any tag listed in ``_to_remove`` are dropped;
    2. the substitutions of ``_to_rename`` are applied;
    3. the line is kept only if the residue name found in columns 18-20
       is a supported residue (the default supported residues plus those
       declared in `custom_topology`, when given).

    A final ``END`` line is appended to the kept lines.

    Parameters
    ----------
    pdb_file_path : str or pathlib.Path
        Path to the PDB file to sanitize.

    overwrite : bool
        If True (default), the input file is rewritten in place. If False,
        the result is written to ``<stem>_cleaned<suffix>`` in the current
        working directory and the input file is left untouched.

    custom_topology : str or pathlib.Path, optional
        Topology file declaring additional residues to keep.

    Returns
    -------
    str or pathlib.Path
        `pdb_file_path` as given when `overwrite` is True, otherwise the
        path of the newly written file.

    Raises
    ------
    SetupError
        If no line of the input survives the sanitization.
    """
    # A set gives constant-time membership tests in the per-line loop below.
    known_residues = set(_to_keep)
    if custom_topology:
        known_residues.update(get_supported_residues(custom_topology))

    good_lines: list[str] = []
    with open(pdb_file_path) as input_handler:
        for line in input_handler:
            # Text mode already normalises line endings to "\n" on read.
            line = line.rstrip("\n")
            # Ignoring lines containing any tag from _to_remove
            if any(tag in line for tag in _to_remove):
                continue
            for tag, new_tag in _to_rename.items():
                line = line.replace(tag, new_tag)
            # check if this residue is known
            res = line[17:20].strip()
            if res and res in known_residues:
                good_lines.append(line)

    # Check if anything has been kept from this file
    if not good_lines:
        # `pdb_file_path` may be a plain string: go through Path to get the
        # file name, otherwise the intended error is masked by an
        # AttributeError.
        raise SetupError(
            f"No coordinates kept after sanitzing {Path(pdb_file_path).name}."
        )

    # Terminate file with an END statement. Kept lines always carry a residue
    # name in columns 18-20, so none of them can already be a bare "END".
    good_lines.append("END")

    # "\n" is used rather than os.linesep: the files are written in text
    # mode, which translates "\n" to the platform line ending by itself
    # (writing os.linesep would double the carriage return on Windows).
    content = "\n".join(good_lines) + "\n"

    if overwrite:
        with open(pdb_file_path, "w") as output_handler:
            output_handler.write(content)
        return pdb_file_path

    basename = Path(pdb_file_path)
    new_pdb_file = Path(f"{basename.stem}_cleaned{basename.suffix}")
    new_pdb_file.write_text(content)
    return new_pdb_file




[docs]
def identify_chainseg(
    pdb_file_path: FilePath, sort: bool = True
) -> tuple[list[str], list[str]]:
    """
    Return the segment IDs and the chain IDs found in a PDB file.

    Only lines starting with ``ATOM  `` or ``HETATM`` are inspected. The
    segment ID is the first non-blank character of columns 73-76 and the
    chain ID is the character in column 22.

    Parameters
    ----------
    pdb_file_path : str or pathlib.Path
        Path to the PDB file.

    sort : bool
        If True (default), both lists are sorted. If False, identifiers
        are returned in order of first appearance in the file.

    Returns
    -------
    tuple of (list of str, list of str)
        The unique segment IDs and the unique chain IDs.

    Raises
    ------
    ValueError
        If a coordinate line has neither a segment ID nor a chain ID.
    """
    # Dictionaries are used as insertion-ordered sets, which keeps the
    # unsorted output deterministic (a plain set of strings is iterated in
    # an order that changes from one interpreter run to the next).
    segids: dict[str, None] = {}
    chains: dict[str, None] = {}
    with open(pdb_file_path) as input_handler:
        for line in input_handler:
            if not line.startswith(("ATOM  ", "HETATM")):
                continue

            # Slicing never raises on short lines: it yields "" instead.
            segid = line[72:76].strip()[:1]
            chainid = line[21:22].strip()

            if not segid and not chainid:
                raise ValueError(
                    f"Could not identify chainID or segID in pdb {pdb_file_path}, line {line}"
                )

            if segid:
                segids[segid] = None
            if chainid:
                chains[chainid] = None

    if sort:
        return sorted(segids), sorted(chains)
    return list(segids), list(chains)




[docs]
def get_new_models(pdb_file_path: FilePath) -> list[Path]:
    """
    Get new PDB models if they exist.

    The lookup is delegated to `get_pdb_file_suffix_variations` through
    `get_result_or_same_in_list`: if no new models are found, the original
    path is returned within a list.

    Parameters
    ----------
    pdb_file_path : str or pathlib.Path
        Path of the PDB file the new models derive from.

    Returns
    -------
    list of pathlib.Path
        The new models, or the original path if there are none.
    """
    return get_result_or_same_in_list(
        get_pdb_file_suffix_variations,
        pdb_file_path,
    )




[docs]
def get_pdb_file_suffix_variations(file_name: FilePath, sep: str = "_") -> list[Path]:
    """
    List suffix variations of a PDB file in the current path.

    If `file.pdb` is given, and files `file_1.pdb`, `file_2.pdb`, exist
    in the current working directory, those will be listed. Only the stem
    and the extension of `file_name` are used; its parent folder is
    ignored.

    Parameters
    ----------
    file_name : str or Path
        The name of the file with extension.

    sep : str
        The separation between the file base name and the suffix.
        Defaults to "_".

    Returns
    -------
    list
        List of Paths (relative to the current directory) with the
        identified PDB files, in the order yielded by the glob.
        If no files are found return an empty list.
    """
    reference = Path(file_name)
    pattern = f"{reference.stem}{sep}*{reference.suffix}"
    return [*Path(".").glob(pattern)]




[docs]
def read_RECORD_section(
    lines: Iterable[str],
    section_slice: slice,
    func: Callable[[Iterable[str]], Iterable[str]] = set,
) -> Iterable[str]:
    """
    Collect the values of one column section over the coordinate lines.

    Only lines starting with ``ATOM`` or ``HETATM`` are considered. The
    requested section is extracted from each of them, stripped of
    surrounding whitespace, and discarded when empty.

    Parameters
    ----------
    lines : iterable of str
        The lines to inspect.

    section_slice : slice
        The slice selecting the section of interest in each line.

    func : callable
        Receives the lazily generated values and builds the returned
        object. Defaults to `set`.

    Returns
    -------
    iterable of str
        Whatever `func` returns; a set of the observations by default.
    """

    def _observations():
        for record in lines:
            if not record.startswith(("ATOM", "HETATM")):
                continue
            value = record[section_slice].strip()
            if value:
                yield value

    return func(_observations())



# Specialisations of `read_RECORD_section` that read the chain ID and the
# segment ID sections respectively. Both use `list`, so values come back in
# file order, one per coordinate line with a non-blank value (duplicates
# included).
read_chainids = partial(read_RECORD_section, section_slice=slc_chainid, func=list)
read_segids = partial(read_RECORD_section, section_slice=slc_segid, func=list)



[docs]
def add_TER_on_chain_breaks(
    input_pdb: FilePath,
    output_pdb: FilePath,
) -> None:
    """Detect chain breaks and add TER statements between them.

    The input is read residue by residue. A residue is identified by its
    chain ID, residue number and insertion code. After each residue, a
    ``TER`` line is written when the next residue belongs to another chain
    or when no backbone bond is detected between the two. The output ends
    with a ``TER`` line (if any atom was written) followed by ``END``.

    Only ``ATOM`` and ``HETATM`` lines are copied to the output; every
    other line of the input is discarded.

    Parameters
    ----------
    input_pdb : FilePath
        Input PDB filepath with potential chain breaks.
    output_pdb : FilePath
        Output PDB filepath with added TER statements between chain breaks.
    """
    Residue = dict[str, Union[list[str], list[float], str]]
    residueT = TypeVar("residueT", bound=Residue)

    def euclidean_dist(atm1: list[float], atm2: list[float]) -> float:
        """Compute Euclidean distances between two points.

        Parameters
        ----------
        atm1 : list[float]
            Atom 1 coordinates.
        atm2 : list[float]
            Atom 2 coordinates.

        Returns
        -------
        float
            Distance between the two atoms.
        """
        return sum((c1 - c2) ** 2 for c1, c2 in zip(atm1, atm2)) ** 0.5

    def detected_chain_break(residue1: residueT, residue2: residueT) -> bool:
        """Detect chain break between two consecutive residues.

        Currently limited to DNA and protein chains.

        Parameters
        ----------
        residue1 : residueT
            Previous residue (N-1)
        residue2 : residueT
            Current residue (N)

        Returns
        -------
        bool
            True if chain break detected, else False
        """
        # (atom of previous residue, atom of current residue, max distance),
        # tried in this order: nucleic acid backbone first, then peptide bond.
        backbone_bonds = (
            ("O3'", "O5'", 4.5),  # DNA
            ("C", "N", 3.5),  # protein, very loose !
        )
        for name1, name2, upper_dist in backbone_bonds:
            if name1 in residue1 and name2 in residue2:
                # Check if distance is within acceptable bond limit
                distance = euclidean_dist(residue1[name1], residue2[name2])
                return distance > upper_dist
        # None of the expected atom pairs is present: must be a chain break.
        # FIXME : This assumes that it is NOT a DNA nor a protein.
        # If lipids, glycans or other types of entities are added in the
        # future, this will trigger a TER statement between them.
        return True

    def write_residue(fhandler, residue_lines: list[str]) -> None:
        """Write the lines of a residue to file.

        Parameters
        ----------
        fhandler : file object
            File object on which to write the residue lines.
        residue_lines : list[str]
            Residue to write.
        """
        fhandler.writelines(residue_lines)

    def write_previous_residue(
        fhandler,
        previous: residueT,
        current: residueT,
    ) -> None:
        """Write residue lines to file, possibly ending by TER.

        Parameters
        ----------
        fhandler : file object
            File object on which to write the residue lines.
        previous : residueT
            Previous residue (N-1), the one being written.
        current : residueT
            Current residue (N), only used to decide whether a TER follows.
        """
        write_residue(fhandler, previous["lines"])
        # If chain break detected or new chain in file
        if (
            detected_chain_break(previous, current)
            or previous["chain"] != current["chain"]
        ):
            # "\n" rather than os.linesep: the text-mode handle already
            # translates it to the platform line ending.
            fhandler.write("TER\n")

    # Initiate parsing variables
    BB_atomnames = (
        "C",
        "N",  # for peptide bonds
        "O3'",
        "O5'",  # for DNA/RNA
    )
    # Identity (chain, resseq, icode) of the residue being read;
    # None until the first coordinate line is met.
    current_resid = None
    previous_residue = {"lines": []}
    current_residue = {"lines": []}
    # Read input file
    with open(input_pdb, "r") as fin, open(output_pdb, "w") as fout:
        for line in fin:
            if not line.startswith(("ATOM", "HETATM")):
                continue
            # Extract specific data from coordinates record
            chainid = line[slc_chainid].strip()
            resid = line[slc_resseq].strip()
            # The insertion code is part of the residue identity: without
            # it, residues such as 52 and 52A would be merged and their
            # backbone atoms would overwrite each other.
            icode = line[slc_icode].strip()
            atname = line[slc_name].strip()

            # Case when new residue
            if current_resid != (chainid, resid, icode):
                # Residues are written with a delay of one, because the
                # following residue is needed to decide on the TER.
                if previous_residue["lines"]:
                    write_previous_residue(fout, previous_residue, current_residue)
                previous_residue = current_residue
                current_resid = (chainid, resid, icode)
                current_residue = {"lines": [], "chain": chainid}

            # Hold residue line
            current_residue["lines"].append(line)

            # Hold backbone atom coordinates in current residue
            if atname in BB_atomnames:
                current_residue[atname] = [
                    float(line[slc_x]),
                    float(line[slc_y]),
                    float(line[slc_z]),
                ]

        # Flush the (up to) two residues still held in memory. The guards
        # prevent the empty placeholder residues from being processed, which
        # would emit a TER before the first atom when the input holds a
        # single residue (or no atom at all).
        if previous_residue["lines"]:
            write_previous_residue(fout, previous_residue, current_residue)
        if current_residue["lines"]:
            write_residue(fout, current_residue["lines"])
            # Write final TER statement
            fout.write("TER\n")
        # Write END statement
        fout.write("END\n")




[docs]
def check_combination_chains(combination: list[PDBFile]) -> list[str]:
    """
    Check if chain/segment IDs are unique for each pdb in combination.

    Parameters
    ----------
    combination : list of PDBFile
        The PDB files to check; each is read from its ``rel_path``.

    Returns
    -------
    list of str
        The chain and segment IDs of all files, sorted within each file
        and concatenated in the order of `combination`.

    Raises
    ------
    ValueError
        If an identifier of one file was already found in a previous file.
    """
    collected: list[str] = []
    for pdb in combination:
        segids, chains = identify_chainseg(pdb.rel_path, sort=False)
        identifiers = sorted({*segids, *chains})
        # check if any identifier of this file was already seen
        if not set(collected).isdisjoint(identifiers):
            raise ValueError(f"Chain/seg IDs are not unique for pdbs {combination}.")
        collected += identifiers
    return collected




[docs]
def check_mol_shape(input_mol: Path) -> bool:
    """
    Check if the molecule provided is a shape or not.

    A molecule is considered a shape when at least one line of the file
    contains the text ``SHA SHA `` (including the trailing blank).

    Parameters
    ----------
    input_mol : pathlib.Path
        Path to the input molecule file to check.

    Returns
    -------
    bool
        True if shape, False if not.
    """
    with open(input_mol, "rt") as mol_handle:
        return any("SHA SHA " in record for record in mol_handle)




[docs]
def handle_input_reference(reference: Path) -> list[Path]:
    """Validate the reference file and return its model(s) as separate files.

    Parameters
    ----------
    reference : Path
        Path to the input reference structure, possibly containing
        an ensemble.

    Returns
    -------
    list of Path
        ``[reference]`` when the file holds a single model. Otherwise the
        files ``reference_model_<N>.pdb`` found in the current directory
        after splitting the ensemble, sorted by model number.

    Raises
    ------
    ValueError
        If the file contains no line starting with ATOM or HETATM.
    AssertionError
        If the number of split files differs from the number of models
        reported by ``pdb_wc``.
    """
    # Check that the file contains at least one coordinate record before
    # doing anything else - catches empty files, REMARK-only files,
    # non-PDB text.
    with open(reference, "r") as handle:
        for record in handle:
            if record.startswith(("ATOM", "HETATM")):
                break
        else:
            raise ValueError(
                "No atoms found in reference file! "
                "Please check that it is a valid PDB file containing ATOM/HETATM records."
            )

    # pdb_wc writes to stdout; capture it without touching sys.stdout globally.
    captured = StringIO()
    with open(reference, "r") as handle, redirect_stdout(captured):
        pdb_wc(handle, "m")

    # Parse output: the first line mentioning "No. models" carries the count
    # as its last token.
    # using here `\n` (and not os.linesep) as it is the output for pdb_wc
    model_counts = (
        int(report_line.strip().split()[-1])
        for report_line in captured.getvalue().split("\n")
        if "No. models" in report_line
    )
    # pdb_wc treats a file without MODEL records as a single model
    nb_models = next(model_counts, 1)

    # Return reference as only one structure present
    if nb_models == 1:
        return [reference]

    # If more than one model in reference
    haddock_log.info(
        f"Multiple structures ({nb_models}) found in reference file. "
        "Using all conformations as reference."
    )
    # Split models
    with open(reference, "r") as ref_in:
        split_model(ref_in, "reference_model")
    # Gather individual references and sort them by model number
    references = list(Path(".").glob("reference_model_*.pdb"))
    references.sort(key=lambda k: int(k.stem.split("_")[-1]))
    assert len(references) == nb_models, (
        "Issue while splitting references conformation: "
        f"{nb_models} detected, {len(references)} generated"
    )
    return references