# Source code for haddock.modules.scoring

"""HADDOCK3 modules to score models."""
import pandas as pd

from haddock.core.typing import FilePath, Optional, Path, Any
from haddock.gear.haddockmodel import HaddockModel
from haddock.modules import BaseHaddockModule
from haddock.modules.base_cns_module import BaseCNSModule



# [docs]
class ScoringModule(BaseHaddockModule):
    """Parent class for Scoring modules."""


[docs]
    def output(
            self,
            output_fname: FilePath,
            sep: str = "\t",
            ascending_sort: bool = True,
            ) -> None:
        r"""Save the output in comprehensive tables.

        Parameters
        ----------
        output_fname : FilePath
            Path to the file where to write scoring data.
        sep : str, optional
            Character used as separator in file, by default "\t"
        ascending_sort : bool, optional
            Should the data be sorted in ascending order, by default True
        """
        # saves scoring data
        sc_data = [
            [pdb.file_name, pdb.ori_name, pdb.md5, pdb.score]
            for pdb in self.output_models
            ]
        
        # converts to pandas dataframe and sorts by score
        df_columns = ["structure", "original_name", "md5", "score"]
        df_sc = pd.DataFrame(sc_data, columns=df_columns)
        df_sc_sorted = df_sc.sort_values(by="score", ascending=ascending_sort)
        # writes to disk
        df_sc_sorted.to_csv(
            output_fname,
            sep=sep,
            index=False,
            na_rep="None",
            float_format="%.3f",
            )
        return





# [docs]
class CNSScoringModule(BaseCNSModule, ScoringModule):
    """Parent class for CNS Scoring modules."""


[docs]
    def per_interface_output(
            self,
            output_fname: FilePath,
            models: list[HaddockModel],
            sep: str = "\t",
            ascending_sort: bool = True,
            ) -> None:
        r"""Generate per interface scoring tsv output files.

        Parameters
        ----------
        output_fname : FilePath
            Path to the file where to write scoring data.
        models : list[HaddockModel]
            List of HaddockModel object obtained by loading the PDB files.
        sep : str, optional
            Character used as separator in file, by default "\t"
        ascending_sort : bool, optional
            Should the data be sorted in ascending order, by default True
        """
        # Skip the analysis if not desired by the user
        if not self.per_interface_scoring:
            return

        # Retrieve all interfaces data for all pdb
        set_interfaces: list[str] = []
        pdb_interfaces_scores: dict[tuple[Any, Any, Any], dict[str, dict[str, float]]] = {}  # noqa : E501

        # Loop over models to recover interfaces
        for pdb, haddock_model in zip(self.output_models, models):
            # if the pdb does not exist, skip
            if not Path(pdb.file_name).exists():
                continue
            # Make reverse interface checks
            # Score A->B == Score B->A
            reversed_interfaces_scores = {}
            # Hold list of interfaces
            for interface, scores in haddock_model.interface_energies.items():
                # Check if reverse chain order present
                split_inter = interface.split("_")
                reverse_interface = f"{split_inter[1]}_{split_inter[0]}"
                if reverse_interface in set_interfaces:
                    reversed_interfaces_scores[reverse_interface] = scores
                # Check if interface present
                if interface not in set_interfaces:
                    set_interfaces.append(interface)

            # Combine with reversed interface scores
            haddock_model.interface_energies.update(reversed_interfaces_scores)
            # Hold data
            pdbkey = (pdb.file_name, pdb.ori_name, pdb.md5)
            pdb_interfaces_scores[pdbkey] = haddock_model.interface_energies

        # Preset output file basename and extension
        output_file = Path(output_fname)
        output_bn = output_file.stem
        ouput_ext = "".join(output_file.suffixes)
        # Write separated files for all interfaces
        for interface in set_interfaces:
            # Point data
            sc_data = []
            for pdbkey, interfaces_scores in pdb_interfaces_scores.items():
                if interface not in interfaces_scores.keys():
                    continue
                interface_scores = interfaces_scores[interface]
                score = interface_scores["HADDOCKscore"]
                sc_data.append([pdbkey[0], pdbkey[1], pdbkey[2], score])
            # Check that the list is not empty
            if len(sc_data) == 0:
                continue
            # converts to pandas dataframe and sorts by score
            df_columns = ["structure", "original_name", "md5", "score"]
            df_sc = pd.DataFrame(sc_data, columns=df_columns)
            df_sc_sorted = df_sc.sort_values(
                by="score",
                ascending=ascending_sort,
                )
            # Generate output filename
            interface_output_fname = f"{output_bn}_{interface}{ouput_ext}"
            # writes to disk
            df_sc_sorted.to_csv(
                interface_output_fname,
                sep=sep,
                index=False,
                na_rep="None",
                float_format="%.3f",
                )
        return

    

[docs]
    def extract_interface_combinations(self) -> list[str]:
        """Read interface specific parameters.

        Removes the `interface_combinations` from the parameters as not
        supported by CNS.
        Sets the `per_interface_scoring` to True if `interface_combinations`
        is not empty, as it is required for the interface-scores to be
        present in the PDB file.

        Returns
        -------
        interface_combinations : list[str]
            List of user-defined combinations.
        """
        # Here we pop the parameter as not supported by CNS and only used
        # at the python level for downstream analysis
        interface_combinations = self.params.pop("interface_combinations")
        # Set the per_interface_scoring parameter value as set by the user
        self.per_interface_scoring = self.params["per_interface_scoring"]
        # Check if the parameter is used
        if interface_combinations != []:
            # NOTE: per_interface_scoring must be set to true for the interface
            # scores to be present as REMARK in the header of the PDB file.
            self.params["per_interface_scoring"] = True
        return interface_combinations

    

[docs]
    def update_pdb_scores(
            self,
            interface_combinations: list[str],
            ) -> tuple[list[HaddockModel], dict[str, list[str]]]:
        """Update the score attributes in the output pdb files.

        Parameters
        ----------
        interface_combinations : list[str]
            Input list of chains to be considered.
            Each list entry must be composed of two chains separated by coma.
            e.g.: 
            []              -> Consider all non-redundant chain pairs
            ["A,H", "A,L"]  -> Consider only the interface scores between A,H and A,L

        Returns
        -------
        output_haddock_models : list[HaddockModel]
            List of HaddockModel for each input pdb, that contain the actual
            scores loaded from the file.
        """
        # Obtain list of user defined interfaces
        desired_interfaces = self.build_interface_sets_combinations(
            interface_combinations
        )
        # Get the weights from the parameters
        _weight_keys = ("w_vdw", "w_elec", "w_desolv", "w_air", "w_bsa")
        weights = {e: self.params[e] for e in _weight_keys}

        interface_errors: dict[str, list[str]] = {}
        # Check for generated output, fail it not all expected files are found
        output_haddock_models: list[HaddockModel] = []
        for pdb in self.output_models:
            if pdb.is_present():
                # Convert pdb file into a HaddockModel to read the scores
                haddock_model = HaddockModel(pdb.file_name)
                # Set the unweighted energy terms
                pdb.unw_energies = haddock_model.energies
                # Compute set of interfaces if some are defined
                if len(desired_interfaces) >= 1:
                    try:
                        score = self.compute_interfaces_score(
                            haddock_model.interface_energies,
                            desired_interfaces,
                        )
                    # In case the output is None, fall back to standard
                    # haddock score that will always be ok to compute
                    except ValueError as interface_error:
                        # Hold that specific error
                        error_msg = str(interface_error)
                        if error_msg not in interface_errors.keys():
                            interface_errors[error_msg] = []
                        interface_errors[error_msg].append(pdb.file_name)
                        # Compute the haddock score instead
                        score = haddock_model.calc_haddock_score(**weights)
                    finally:
                        haddock_score = score
                # Otherwise simply compute the standard haddock score
                else:
                    # Compute the haddock score
                    haddock_score = haddock_model.calc_haddock_score(**weights)
                # Set the score attribute
                pdb.score = haddock_score
                output_haddock_models.append(haddock_model)
        # Log errors
        for error_msg, models in interface_errors.items():
            self.log(
                f"Interface error: '{error_msg}' occured {len(models)} times."
                " Falling back on classic HADDOCKscore."
                )
        return output_haddock_models



[docs]
    @staticmethod
    def build_interface_sets_combinations(
            interface_combinations: list[str],
            ) -> list[str]:
        """Build desired combinatation of interfaces.

        Parameters
        ----------
        interface_combinations : list[str] | None
            Input list of chains to be considered.
            Each list entry must be composed of two chains separated by coma.
            e.g.: 
            []              -> Consider all non-redundant chain pairs
            ["A,H", "A,L"]  -> Consider only the interface scores between A,H and A,L

        Returns
        -------
        combinations : list[str]
            Unpacked list of interface combinations to consider.
            ["A,H", "A,L"]  -> ["A_H", "A_L"]
        """
        combinations: list[str] = [
            "_".join([c.strip() for c in chains.split(",")])
            for chains in interface_combinations
            if chains.count(",") == 1  # ensures input only contains pairs
        ]
        return combinations



[docs]
    @staticmethod
    def compute_interfaces_score(
            interface_energies: dict[str, dict[str, float]],
            interface_sets_combinations: list[str],
            ) -> Optional[float]:
        """Compute the sum of selected interface haddock score.

        Parameters
        ----------
        interfaces_scores : dict[str, dict[str, float]]
            Scores of the various interfaces present in the pdb.
        interface_sets_combinations : list[str]
            List of interface combinations to consider
        """
        # Minimum set of interfaces must be >= 1
        if len(interface_sets_combinations) == 0:
            raise ValueError("No input interfaces")

        # Get all desired interfaces scores
        selected_interfaces_scores: list[float] = []
        for interface in interface_sets_combinations:
            if interface in interface_energies.keys():
                interface_score = interface_energies[interface]["HADDOCKscore"]
                selected_interfaces_scores.append(interface_score)
            else:
                # Check if reverse chain order present
                split_inter = interface.split("_")
                reverse_interface = f"{split_inter[1]}_{split_inter[0]}"
                if reverse_interface in interface_energies.keys():
                    interface_score = interface_energies[reverse_interface]["HADDOCKscore"]
                    selected_interfaces_scores.append(interface_score)
        # Sum all desired interfaces scores
        if len(selected_interfaces_scores) >= 1:
            new_score = sum(selected_interfaces_scores)
        else:
            raise ValueError("Selected interface not found")
        return new_score