Source code for haddock.gear.preprocessing

# TODO:
# 1. single_ions may contain PO4 - verify
# 2. the single_ions_elements map seems not needed
"""
Process input PDB files to ensure compatibility with HADDOCK3.

This module checks and modifies PDB files for compatibility with
HADDOCK3. There are three types of checks/modifications:

1. Performed to each PDB line-by-line, in a equal fashion of ``pdb-tools``.
   In fact, this step mostly uses the ``pdb-tools`` package.
2. Performed on each PDB as a whole.
3. Performed on all PDBs together.

Main functions
--------------

* :py:func:`process_pdbs`
* :py:func:`read_additional_residues`

Corrections performed on 1)
---------------------------

The following actions are perfomed sequentially over all PDBs:

#. from ``pdb-tools``: ``pdb_keepcoord``
#. from ``pdb-tools``: ``pdb_tidy`` with ``strict=True``
#. from ``pdb-toos``: ``pdb_element``
#. from ``pdb-tools``: ``pdb_selaltloc``
#. from ``pdb-tools``: ``pdb_pdb_occ`` with ``occupancy=1.00``
#. replace ``MSE`` to ``MET``
#. replace ``HSD`` to ``HIS``
#. replace ``HSE`` to ``HIS``
#. replace ``HID`` to ``HIS``
#. replace ``HIE`` to ``HIS``
#. add_charges_to_ions, see :py:func:`add_charges_to_ions`
#. convert ``ATOM`` to ``HETATM`` for those atoms that should be ``HETATM``.
   Considers the additional residues provided by the user.
   See :py:func:`convert_ATOM_to_HETATM`.
#. convert ``HETATM`` to ``ATOM`` for those atoms that should be ``ATOM``,
#. from ``pdb-toos``: ``pdb_fixinsert``, with ``option_list=[]``.
#. remove unsupported ``HETATM``. Considers residues provided by the user.
#. remove unsupported ``ATOM``. Considers residues provided by the user.
#. from ``pdb-tools``: ``pdb_reatom``, start from ``1``.
#. from ``pdb-tools``: ``pdb_tidy`` with ``strict=True``

Corrections performed on 2)
---------------------------

The following actions are performed sequentially for each PDB:

* :py:func:`models_should_have_the_same_labels`
* :py:func:`solve_no_chainID_no_segID`
* :py:func:`homogenize_chains`

Read the documentation of the above functions for details what they do.

Corrections performed on 3)
---------------------------

The following actions are performed to all PDBs together:

* :py:func:`correct_equal_chain_segids`

Read the documentation of the above functions for details what they do.

When it happens
---------------

The PDB processing step is performed by default when reading the input
molecules and copying them to the `data/` folder inside the run
directory. When PDBs are processed, a copy of the original input PDBs is
also stored in the `data/` folder.

To deactivate this initial PDB processing, set ``skip_preprocess = False``
in the general parameters of the configuration file.

Additional information
----------------------

If you are a developer and want to read more about the history of this
preprocessing module, visit:

https://github.com/haddocking/haddock3/projects/16
"""
import io
import itertools as it
import re
import string
from functools import partial, wraps
from os import linesep
from pathlib import Path

from pdbtools import (
    pdb_chain,
    pdb_chainxseg,
    pdb_element,
    pdb_fixinsert,
    pdb_keepcoord,
    pdb_occ,
    pdb_reatom,
    pdb_rplresname,
    pdb_segxchain,
    pdb_selaltloc,
    pdb_shiftres,
    pdb_tidy,
)

from haddock import log
from haddock.core.exceptions import HaddockError
from haddock.core.supported_molecules import (
    supported_ATOM,
    supported_HETATM,
    supported_non_ions,
    supported_single_ions_atoms_map,
    supported_single_ions_resnames_map,
)
from haddock.core.typing import (
    Any,
    Callable,
    Container,
    Generator,
    Iterable,
    LineIterSource,
    Optional,
    Union,
)
from haddock.libs.libfunc import chainf
from haddock.libs.libio import read_lines
from haddock.libs.libpdb import (
    format_atom_name,
    read_chainids,
    read_segids,
    slc_charge,
    slc_element,
    slc_name,
    slc_resname,
)


# defines chain letters for chain and seg IDs
_ascii_letters = list(string.ascii_uppercase + string.ascii_lowercase)
_CHAINS = it.cycle(_ascii_letters)



[docs]
class ModelsDifferError(HaddockError):
    """MODELS of the PDB differ in atom labels."""

    pass



def _report(log_msg: str) -> Callable[..., Any]:
    """
    Add report functionality to the function (decorator).

    Functions decorated with `_report` log the difference between the
    input and the output. Decorated functions gain an additional boolean
    parameter `report` to activate or deactivate the report
    functionality; defaults to ``False``.

    Note that a generator decorated with ``_report`` no longer behaves
    as a generator if ``report=True`` is given. Instead, it returns a
    list from the exhausted generator.

    **Important:** Do NOT use ``_report`` with infinite generators,
    such as ``itertools.cycle``.
    """

    def decorator(function: Callable[..., Any]) -> Callable[..., Any]:
        @wraps(function)
        def wrapper(
            lines: Iterable[Any], *args: Any, report: bool = False, **kwargs: Any
        ) -> Any:
            if report:
                in_lines = list(lines)
                result = list(function(in_lines, *args, **kwargs))

                # Here we could use sets to increase speed, but for the size
                # of the systems, we can actually use lists and get a sorted
                # result by default. I tried using difflib from STD but it is
                # just too slow.

                # _ is line
                additions = [_ for _ in result if _ not in in_lines]
                deletions = [_ for _ in in_lines if _ not in result]

                la = len(additions)
                ld = len(deletions)

                add_lines = linesep.join(f"+ {_}" for _ in additions)
                del_lines = linesep.join(f"- {_}" for _ in deletions)

                log_msg_ = log_msg.format(*args, *kwargs.values())
                extended_log = (
                    f"[{log_msg_}] + {la} - {ld} lines",
                    add_lines,
                    del_lines,
                )

                log.info(linesep.join(extended_log))
                return result

            # If report=False, maintain the original behaviour
            else:
                return function(lines, *args, **kwargs)

        return wrapper

    return decorator


def _open_or_give(inputdata: Iterable[LineIterSource]) -> list[list[str]]:
    """
    Adapt input to the functions.

    Used in py:func:`process_pdbs`.

    Homogenizes input by:

    * removing new line characters at the end of the line
    * removing empty lines

    Parameters
    ----------
    inputdata : list
        A **flat** list where in each index it can contain:

        * file objects
        * paths to files
        * strings representing paths
        * lists or tuples of lines

        The above types can be mixed in the input list.

        Files are read to lines in a list. Line separators are stripped.

        Do not provide nested lists with lists containing paths inside
        lists.

    Returns
    -------
    list of list of strings
        Each sublist has the contents of the input in the same order.

    Raises
    ------
    TypeError
        In any other circumstances.
    """

    def get_line(lines: Iterable[str]) -> list[str]:
        """Ignore empty lines."""
        return [line.rstrip(linesep) for line in lines if line]

    lines: list[list[str]] = []
    for idata in inputdata:
        if isinstance(idata, (Path, str)):
            lines.append(get_line(Path(idata).read_text().split(linesep)))

        elif isinstance(idata, io.TextIOBase):
            lines.append(get_line(idata.readlines()))

        elif isinstance(idata, (list, tuple)):
            lines.append(get_line(idata))

        else:
            emsg = f"Unexpected type in `inputdata`: {type(idata)}"
            raise TypeError(emsg)

    return lines


@read_lines
def read_additional_residues(
    lines: Iterable[str], *ignore: Any, **everything: Any
) -> tuple[str, ...]:
    """
    Read additional residues listed in a ``*.top`` filename.

    Expects new residues to be defined as::

        RESIdue XXX
        RESI XXX
        residue XXX

    where, XXX is the new residue name. Does not read ATOM or charge
    information. Reads only the residue name.

    Examples
    --------
    Read directly the file:

    >>> read_additional_residues(fpath)

    Read the lines instead:

    >>> lines = Path(fpath).read_text().split(os.linesep)
    >>> read_additional_residues.original(lines)

    Parameters
    ----------
    fpath : str or pathlib.Path
        The path to the file.

    lines : list of lines
        You can also use this function in the form of
        ``read_additional_residues.original(...)`` and directly give it
        a list containing the lines of the file.

    Returns
    -------
    tuple
        A tuple with the new identified residues names.
    """
    # https://regex101.com/r/1H44kO/1
    res_regex = re.compile(r"^(RESIdue|residue|RESI) ([A-Z0-9]{1,3}).*$")
    residues: list[str] = []
    for line in map(str.strip, lines):
        name = res_regex.findall(line)
        if name:
            residues.append(name[0][1])

    return tuple(residues)



[docs]
def process_pdbs(
    *inputdata: LineIterSource,
    dry: bool = False,
    user_supported_residues: Optional[Iterable[str]] = None,
) -> list[list[str]]:
    """
    Process PDB file contents for compatibility with HADDOCK3.

    Parameters
    ----------
    inputdata : list of (str, path, list of str [lines], file handler)

        A **flat** list where in each index it can contain:

        * file objects
        * paths to files
        * strings representing paths
        * lists or tuples of lines

        The above types can be mixed in the input list.

        Files are read to lines in a list. Line separators are stripped.

        Do not provide nested lists with lists containing paths inside
        lists.

    dry : bool
        Perform a dry run. That is, does not change anything, and just
        report.

    user_supported_residues : list, tuple, or set
        The new residues that are allowed.

    Returns
    -------
    list of (list of str)
        The corrected (processed) PDB content in the same order as
        ``inputdata``.
    """
    structures = _open_or_give(inputdata)

    # these are the processing or checking functions that should (if needed)
    # modify the input PDB and return the corrected lines.
    # Follows the same style as for pdb-tools.
    # these functions yield line-by-line.
    line_by_line_processing_steps = [
        wrep_pdb_keepcoord,  # also discards ANISOU
        # tidy is important before some other corrections
        wrep_pdb_tidy_strict,
        wrep_pdb_element,
        wrep_pdb_selaltloc,
        partial(wrep_pdb_occ, occupancy=1.00),
        replace_MSE_to_MET,
        replace_HSD_to_HIS,
        replace_HSE_to_HIS,
        replace_HID_to_HIS,
        replace_HIE_to_HIS,
        add_charges_to_ions,
        partial(
            convert_ATOM_to_HETATM,
            residues=set.union(
                supported_HETATM,
                user_supported_residues or set(),
            ),
        ),
        convert_HETATM_to_ATOM,
        partial(wrep_pdb_fixinsert, option_list=[]),
        #####
        partial(
            remove_unsupported_hetatm, user_defined=user_supported_residues
        ),  # noqa: E501
        partial(remove_unsupported_atom),
        ####
        # partial(wrep_pdb_shiftres, shifting_factor=0),
        partial(wrep_pdb_reatom, starting_value=1),
        wrep_pdb_tidy,
        ###
        wrep_rstrip,
    ]

    # these functions take the whole PDB content, evaluate it, and
    # modify it if needed.
    whole_pdb_processing_steps = [
        models_should_have_the_same_labels,
        solve_no_chainID_no_segID,
        homogenize_chains,
    ]

    # these functions take all structures combined, evulate them
    # togehter, and modify them if needed.
    processed_combined_steps = [
        correct_equal_chain_segids,
    ]

    # START THE ACTUAL PROCESSING

    # individual processing (line-by-line)
    result_1 = [
        list(chainf(structure, *line_by_line_processing_steps, report=dry))
        for structure in structures
    ]

    # whole structure processing
    result_2 = [
        list(chainf(structure, *whole_pdb_processing_steps, report=dry))
        for structure in result_1
    ]

    # combined processing
    final_result = chainf(result_2, *processed_combined_steps)

    return final_result



# Functions operating line-by-line

# make pdb-tools reportable
wrep_pdb_chain = _report("pdb_chain")(pdb_chain.run)
wrep_pdb_chainxseg = _report("pbd_segxchain")(pdb_chainxseg.run)
wrep_pdb_element = _report("pdb_element")(pdb_element.run)
wrep_pdb_fixinsert = _report("pdb_fixinsert")(pdb_fixinsert.run)
wrep_pdb_keepcoord = _report("pdb_keepcoord")(pdb_keepcoord.run)
wrep_pdb_occ = _report("pdb_occ")(pdb_occ.run)
wrep_pdb_reatom = _report("pdb_reatom")(pdb_reatom.run)
wrep_pdb_shiftres = _report("pdb_shiftres")(pdb_shiftres.run)
wrep_pdb_rplresname = _report("pdb_rplresname")(pdb_rplresname.run)
wrep_pdb_segxchain = _report("pdb_segxchain")(pdb_segxchain.run)
wrep_pdb_selaltloc = _report("pdb_selaltloc")(pdb_selaltloc.run)
wrep_pdb_tidy = _report("pdb_tidy")(pdb_tidy.run)
wrep_pdb_tidy_strict = _report("pdb_tidy")(partial(pdb_tidy.run, strict=True))
wrep_rstrip = _report("str.rstrip")(
    partial(map, lambda x: x.rstrip(linesep))
)  # noqa: E501



[docs]
@_report("Replacing HETATM to ATOM for residue {!r}")
def replace_HETATM_to_ATOM(
    fhandler: Iterable[str], res: str
) -> Generator[str, None, None]:
    """
    Replace record `HETATM` to `ATOM` for `res`.

    Do not alter other lines.

    Parameters
    ----------
    fhanlder : file handler or list of lines
        List-like of file lines. Consumes over a ``for`` loop.

    res : str
        Residue name to match for the substitution.

    Yields
    ------
    str
        Yield line-by-line.
    """
    for line in fhandler:
        if line.startswith("HETATM") and line[slc_resname].strip() == res:
            yield "ATOM  " + line[6:]
        else:
            yield line




[docs]
@_report("Replace residue ATOM/HETATM {!r} to ATOM {!r}")
def replace_residue(
    fhandler: Iterable[str], resin: str, resout: str
) -> Generator[str, None, None]:
    """
    Replace residue by another and changes ``HETATM`` to ``ATOM`` if needed.

    Do not alter other lines.

    Parameters
    ----------
    fhanlder : file handler or list of lines
        List-like of file lines. Consumes over a ``for`` loop.

    resin : str
        Residue name to match for the substitution.

    resout : str
        Name of the new residue. Renames ``resin`` to ``resout``.

    Yields
    ------
    str
        Yield line-by-line.

    See Also
    --------

    * :py:func:`replace_HETATM_to_ATOM`
    * ``pdb_rplresname`` from ``pdb-tools``
    """
    _ = replace_HETATM_to_ATOM(fhandler, res=resin)
    yield from pdb_rplresname.run(_, name_from=resin, name_to=resout)



replace_MSE_to_MET = partial(replace_residue, resin="MSE", resout="MET")
"""
Replace ``MSE`` to ``MET``.

See Also
--------
* :py:func:`replace_residue`
"""

replace_HSD_to_HIS = partial(replace_residue, resin="HSD", resout="HIS")
"""
Replace ``HSD`` to ``HIS``.

See Also
--------
* :py:func:`replace_residue`
"""

replace_HSE_to_HIS = partial(replace_residue, resin="HSE", resout="HIS")
"""
Replace ``HSE`` to ``HIS``.

See Also
--------
* :py:func:`replace_residue`
"""

replace_HID_to_HIS = partial(replace_residue, resin="HID", resout="HIS")
"""
Replace ``HID`` to ``HIS``.

See Also
--------
* :py:func:`replace_residue`
"""

replace_HIE_to_HIS = partial(replace_residue, resin="HIE", resout="HIS")
"""
Replace ``HIE`` to ``HIS``.

See Also
--------
* :py:func:`replace_residue`
"""



[docs]
@_report("Remove unsupported molecules")
def remove_unsupported_molecules(
    lines: Iterable[str],
    haddock3_defined: Optional[set[str]] = None,
    user_defined: Optional[set[str]] = None,
    line_startswith: Union[str, tuple[str, ...]] = ("ATOM", "HETATM"),
) -> Generator[str, None, None]:
    """
    Remove HADDOCK3 unsupported molecules.

    This function is abstract and you need to provide the set of
    residues supported by HADDOCK3. See parameters.

    Residues not provided in ``haddock3_defined`` and ``user_defined``
    are removed from the PDB lines.

    Other lines are yieled unmodified.

    Parameters
    ----------
    lines : list or list-like
        Lines of the PDB file. This function will consumes lines over a
        ``for`` loop; mind it if you use a generator.

    haddock3_defined : set
        Set of residues supported by HADDOCK3.
        Defaults to ``None``.

    user_defined : set
        An additional set of allowed residues given by the user.
        Defaults to ``None``.

    line_startswith : tuple
        The lines to consider. Defaults to ``("ATOM", "HETATM")``.

    Yields
    ------
    line : str
        Line-by-line.
        Lines for residues not supported are *not* yielded.

    See Also
    --------
    Other functions use this function to create context.

    * :py:func:`remove_unsupported_atom`
    * :py:func:`remove_unsupported_hetatm`
    """
    user_defined = user_defined or set()
    haddock3_defined = haddock3_defined or set()
    allowed = set.union(haddock3_defined, user_defined)

    # find a way to report this
    not_allowed_found: set[str] = set()

    for line in lines:
        if line.startswith(line_startswith):
            residue = line[slc_resname].strip()
            if residue in allowed:
                yield line
            else:
                not_allowed_found.add(residue)
                continue
        else:
            yield line

    return



remove_unsupported_hetatm = partial(
    remove_unsupported_molecules,
    haddock3_defined=supported_HETATM,
    line_startswith="HETATM",
)
"""
Remove unsupported molecules in ``HETATM`` lines.

Uses :py:func:`remove_unsupported_molecules` by populating its
``haddock3_define`` and ``line_startswith`` parameters.

See Also
--------
* :py:func:`remove_unsupported_atom`
"""

remove_unsupported_atom = partial(
    remove_unsupported_molecules,
    haddock3_defined=supported_ATOM,
    line_startswith="ATOM",
)
"""
Remove unsupported molecules in ``ATOM`` lines.

Uses :py:func:`remove_unsupported_molecules` by populating its
``haddock3_define`` and ``line_startswith`` parameters.

See Also
--------
* :py:func:`remove_unsupported_hetatm`
"""



[docs]
@_report("Add charges to ions.")
def add_charges_to_ions(fhandler: Iterable[str]) -> Generator[str, None, None]:
    """
    Add charges to ions according to HADDOCK3 specifications.

    1. Check if charge is correctly defined in residue name.
       If so, yield the line with correct residue name and charge at the
       end.
    2. Check if charge is correctly defined in atom name.
    3. Create charge from element. This might need manual edit in case
       the atom as an unconventional charge.

    Parameters
    ----------
    fhandler : file-hanlder, list, or list-like
        Lines of the PDB file. This function will consumes lines over a
        ``for`` loop; mind it if you use a generator.

    Yields
    ------
    line : str
        Line-by-line: modified ion lines and any other line.
    """
    # list of functions that correct ion entries
    # by order of preference in case a preference is not found.
    # see further
    ion_correction_cases = [
        _process_ion_case_atom,
        _process_ion_case_resname,
        _process_ion_case_element_charge,
    ]

    for line in fhandler:
        if line.startswith(("ATOM", "ANISOU", "HETATM")):
            # get values
            atom = line[slc_name].strip()  # max 4 chars
            resname = line[slc_resname].strip()  # max 3 chars
            element = line[slc_element].strip()  # max 2 chars
            charge = line[slc_charge].strip()  # max 2 chars

            if resname in supported_non_ions:
                yield line
                continue

            # Which of the above fields has information on the charge?
            # If more than one have information on the charge, we give
            # the preference to the atom, resname, and then charge
            func_to_apply = False
            if atom[-1].isdigit():
                func_to_apply = _process_ion_case_atom  # type: ignore
            elif resname[-1].isdigit():
                func_to_apply = _process_ion_case_resname  # type: ignore
            elif element and charge and charge[-1].isdigit():
                func_to_apply = _process_ion_case_element_charge  # type: ignore

            if func_to_apply:
                yield func_to_apply(line)  # type: ignore

            # in case none of the fields has information on the charge,
            # applies the process functions by giving preference to the
            # residue names.
            else:
                for func in ion_correction_cases:
                    try:
                        yield func(line)
                    except Exception:
                        # test the next function
                        continue
                    else:
                        break  # done
                else:
                    yield line  # lines that do not concern to ions

        else:
            yield line  # lines that are not atoms



def _process_ion_case_resname(line: str) -> str:
    """
    Process ion information based on resnames.

    case 1: charge is correctly defined in resname, for example, ZN2.
    In this case, ignore other fields and write ion information from
    scratch even if it's already correct.
    """
    resname = line[slc_resname].strip()  # max 3 chars
    new_atom = supported_single_ions_resnames_map[resname].atoms[0]
    new_element = supported_single_ions_resnames_map[resname].elements[0]
    charge = new_atom[-2:] if len(new_atom) > 2 else "  "

    new_line = (
        line[:12]
        + format_atom_name(new_atom, new_element)
        + line[16]
        + resname.rjust(3, " ")
        + line[20:76]
        + new_element.rjust(2, " ")
        + charge
    )

    return new_line


def _process_ion_case_atom(line: str) -> str:
    """
    Process ion information based on atom names.

    case 2: charge is correctly defined in atom name ignore other fields
    and write them from scratch even if they are already correct.
    """
    element = line[slc_element].strip()  # max 2 chars
    if element == "C":
        # element C can have atom name CA which conflicts carbon alpha
        # and calcium
        raise ValueError("Element is 'C', does not apply to this case.")

    atom = line[slc_name].strip()  # max 4 chars
    new_resname = supported_single_ions_atoms_map[atom].resname
    new_element = supported_single_ions_atoms_map[atom].elements[0]
    charge = atom[-2:] if len(atom) > 2 else "  "

    new_line = (
        line[:12]
        + format_atom_name(atom, new_element)
        + line[16]
        + new_resname.rjust(3, " ")
        + line[20:76]
        + new_element.rjust(2, " ")
        + charge
    )

    return new_line


def _process_ion_case_element_charge(line: str) -> str:
    """
    Process ion information based on element and charge.

    case 3: charge is correctly defined in atom name ignore other fields
    and write them from scratch even if they are already correct.
    """
    element = line[slc_element].strip()  # max 2 chars
    charge = line[slc_charge].strip()  # max 2 chars
    atom = element + charge
    new_resname = supported_single_ions_atoms_map[atom].resname

    new_line = (
        line[:12]
        + format_atom_name(atom, element)
        + line[16]
        + new_resname.rjust(3, " ")
        + line[20:76]
        + element.rjust(2, " ")
        + charge
    )

    return new_line



[docs]
@_report("Convert record: {!r} to {!r}.")
def convert_record(
    fhandler: Iterable[str], record: str, other_record: str, residues: Container[str]
) -> Generator[str, None, None]:
    """
    Convert on record to another for specified residues.

    For example, replace ``ATOM`` by ``HETATM`` for specific residues.

    Parameters
    ----------
    fhandler : list-like
        Contains lines of file.

    record : str
        The PDB RECORD to match; for example, ``ATOM`` or ``HETATM``.

    other_record : str
        The PDB RECORD to replace with; for example, ``ATOM`` or ``HETATM``.

    residues : list, tuple, or set
        List of residues to replace the record.
    """
    for line in fhandler:
        if line.startswith(record):
            resname = line[slc_resname].strip()
            if resname in residues:
                yield other_record + line[6:]
                continue
        yield line



convert_ATOM_to_HETATM = partial(
    convert_record,
    record="ATOM",
    other_record="HETATM",
    residues=supported_HETATM,
)
"""
Convert ``ATOM`` to ``HETATM`` for HADDOCK3 supported ``HETATM``.

See Also
--------
* :py:data:`haddock.core.supported_molecules.supported_HETATM`
"""

convert_HETATM_to_ATOM = partial(
    convert_record,
    record="HETATM",
    other_record="ATOM  ",
    residues=supported_ATOM,
)
"""
Convert ``HETATM`` to ``ATOM`` for HADDOCK3 supported ``ATOM``.

See Also
--------
* :py:data:`haddock.core.supported_molecules.supported_ATOM`
"""


# Functions operating in the whole PDB



[docs]
@_report("Solving chain/seg ID issues.")
def solve_no_chainID_no_segID(lines: Iterable[str]) -> Iterable[str]:
    """
    Solve inconsistencies with chainID and segID.

    If segID is non-existant, copy chainID over segID, and vice-versa.
    If none are present, adds an upper case char starting from A. This
    char is not repeated until the alphabet exhausts.
    If chainIDs and segIDs differ, copy chainIDs over segIDs.

    Parameters
    ----------
    lines : list of str
        The lines of a PDB file.

    Returns
    -------
    list
        With new lines. Or the input ones if no modification was made.
    """
    chainids = read_chainids(lines)
    segids = read_segids(lines)

    if not chainids and segids:
        new_lines = pdb_segxchain.run(lines)

    elif chainids and not segids:
        new_lines = pdb_chainxseg.run(lines)

    elif not chainids and not segids:
        _chains = pdb_chain.run(lines, next(_CHAINS))
        new_lines = pdb_chainxseg.run(_chains)

    # gives priority to chains
    elif chainids != segids:
        new_lines = pdb_chainxseg.run(lines)

    else:
        # nothing to do
        return lines

    return list(new_lines)



# TODO: needs to become an option.
# change chain ID and shift the residue of the other chains.
# also needs to be sync with the restraints.
# maybe not an automatic.. maybe used as a CLI
# maybe is a place to have a CHECK instead of a autoprocess

[docs]
@_report("Homogenizes chains")
def homogenize_chains(lines: list[str]) -> list[str]:
    """
    Homogenize chainIDs within the same PDB.

    If there are multiple chain identifiers in the PDB file, make all
    them equal to the first one.

    ChainIDs are copied to segIDs afterwards.

    Returns
    -------
    list
        The modified lines.
    """
    chainids = read_chainids(lines)
    if len(set(chainids)) > 1:
        return list(
            chainf(
                lines,
                partial(pdb_chain.run, chain_id=chainids[0]),
                pdb_chainxseg.run,
            )
        )
    else:
        return lines



# Functions operating in all PDBs at once



[docs]
def correct_equal_chain_segids(structures: list[list[str]]) -> list[list[str]]:
    """
    Correct for repeated chainID in the input PDB files.

    Repeated chain IDs are replaced by an upper case character (``[A-Z]``)
    in order.

    Parameters
    ----------
    structures : list of lists of str
        The input data.

    Returns
    -------
    list of lists of str
        The new structures.
    """
    _all_chains = (read_chainids(s) for s in structures)
    # set of all chain IDs present in the input PDBs
    all_chain_ids = set(it.chain.from_iterable(_all_chains))

    # the remaining available chain characters are the A-Z minus the
    # `all_chain_ids`
    remaining_chars = it.cycle(sorted(set(_ascii_letters).difference(all_chain_ids)))

    chain_ids: list[Iterable[str]] = []
    new_structures: list[list[str]] = []
    for lines in structures:
        new_lines: Optional[list[str]] = None

        # read chain IDs from the structure
        chain_id = read_chainids(lines)

        # if chain_id is repeated
        if chain_id in chain_ids:
            new_lines = list(
                chainf(
                    lines,
                    # change the chain ID by a new one
                    partial(pdb_chain.run, chain_id=next(remaining_chars)),
                    # applies chain ID to seg ID as well
                    pdb_chainxseg.run,
                )
            )
        else:
            chain_ids.append(chain_id)

        new_structures.append(new_lines or lines)

    if len(new_structures) != len(structures):
        raise AssertionError("Number of lines differ. This is a bug!")
    return new_structures



# this id bad

[docs]
@_report("Check models are the same")
def models_should_have_the_same_labels(lines: Iterable[str]) -> Iterable[str]:
    """
    Confirm models have the same labels.

    In an ensemble of structures, where the PDB file has multiple MODELS,
    all models should have the same labels; hence the same number and
    typ of atoms.

    Parameters
    ----------
    lines : list of strings.
        List containing the lines of the PDB file. Must NOT be a generator.

    Returns
    -------
    list
        The original ``lines`` in case no errors are found.

    Raises
    ------
    ModelsDifferError
        In case MODELS differ. Reports on which models differ.
    """
    # searchers for the first MODEL line. If found, break the loop
    # and continue to the rest of the function.
    #
    # if not found, return the same input lines.
    for line in lines:
        if line.startswith("MODEL"):
            break
    else:
        return lines

    # captures all the models
    models: dict[Optional[int], set[str]] = {}
    new_model: list[str] = []
    new_model_id = None
    for line in lines:
        if line.startswith("MODEL"):
            if new_model_id is not None:
                models[new_model_id] = set(new_model)
                new_model.clear()
            new_model_id = int(line[10:14])

        elif line.startswith(("ATOM", "HETATM")):
            new_model.append(line[12:27])
    else:
        models[new_model_id] = set(new_model)
        new_model.clear()

    # check if all MODELS are equal, performing all vs all comparison
    keys = list(models.keys())
    first_key = keys[0]
    for model_num in keys[1:]:
        if models[model_num] != models[first_key]:
            emsg = f"Labels in MODEL {model_num} differ from MODEL {first_key}."
            raise ModelsDifferError(emsg)

    return lines