Source code for haddock.modules.analysis.seletopclusts.seletopclusts

"""Set of functions related to the selection of top clusters."""

import os
import math
from pathlib import Path
from haddock.core.typing import Union
from haddock.libs.libontology import PDBFile


def select_top_clusts_models(
        sortby: str,
        models_to_select: list[PDBFile],
        top_clusters: int,
        top_models: Union[int, float],
        ) -> tuple[list[PDBFile], list[str]]:
    """Pick the best models out of the best clusters.

    Models are first grouped by their `clt_rank` attribute, the resulting
    clusters are ordered (by size or by rank), the first `top_clusters`
    of them are kept, and within each kept cluster the models are
    re-numbered and the first `top_models` of them are selected.

    Parameters
    ----------
    sortby : str
        How to order clusters: `size` orders them by decreasing number of
        models; any other value orders them by cluster rank.
    models_to_select : list[PDBFile]
        List of input models on which selection must be performed.
    top_clusters : int
        Number of best clusters to take into account. If it is greater
        than or equal to the number of clusters, all of them are kept.
    top_models : Union[int, float]
        Number of best models in each cluster to take into account.
        A NaN value means "keep every model of the cluster".

    Returns
    -------
    models_to_export : list[PDBFile]
        List of PDBfiles to export. Their `clt_rank` and `clt_model_rank`
        attributes are (re)assigned in place.
    notes : list[str]
        List of notes to be printed.
    """
    clusters = map_clusters_models(models_to_select)

    # Choose the ordering strategy: only "size" triggers size ordering
    order_clusters = size_clust_order if sortby == "size" else rank_clust_order
    ordered_ranks = order_clusters(clusters)

    # Restrict to the requested number of clusters and report the choice
    if top_clusters >= len(ordered_ranks):
        headline = "Selecting all clusters: "
    else:
        ordered_ranks = ordered_ranks[:top_clusters]
        headline = f"Selecting top {top_clusters} clusters: "
    notes: list[str] = [headline + ",".join(map(str, ordered_ranks))]

    selected: list[PDBFile] = []
    for rank in ordered_ranks:
        members, warning = sort_models(clusters[rank])
        if warning:
            notes.append(warning)

        # Re-number models so that ranks within the cluster start at 1
        for position, model in enumerate(members, start=1):
            model.clt_rank = rank
            model.clt_model_rank = position

        # NaN means no limit was set: keep the whole cluster, no per-model note
        if math.isnan(top_models):
            selected.extend(members)
            continue

        # Otherwise keep the first `top_models` models and log their new name
        for model in members[:top_models]:
            notes.append(
                f" {model.file_name} "
                f"> cluster_{model.clt_rank}_"
                f"model_{model.clt_model_rank}.pdb"
                )
            selected.append(model)

    return selected, notes
def sort_models(
        models: list[PDBFile]
        ) -> tuple[list[PDBFile], Union[None, str]]:
    """Order models of a cluster by their `clt_model_rank` attribute.

    Parameters
    ----------
    models : list[PDBFile]
        List of input models on which ordering must be performed.

    Returns
    -------
    sorted_mdls : list[PDBFile]
        New list of models in ascending `clt_model_rank` order, or the
        input `models` object itself when the ranks cannot be compared.
    note : Union[None, str]
        Message explaining that the input order was kept, or None when
        the sort succeeded.
    """
    # Sort a copy so the caller's list is never reordered
    ordered = list(models)
    try:
        ordered.sort(key=lambda mdl: mdl.clt_model_rank)
    except TypeError:
        # Ranks that cannot be compared with each other (e.g. None)
        return models, 'model rank unavailable, falling back to input order'
    return ordered, None
def rank_clust_order(
        by_clusters: dict[int, list[PDBFile]],
        ) -> list[int]:
    """Order clusters by their rank.

    Parameters
    ----------
    by_clusters : dict[int, list[PDBFile]]
        Models grouped by cluster; keys are the cluster ranks.

    Returns
    -------
    cluster_rankings : list[int]
        Keys of `by_clusters` sorted in ascending order.
    """
    # Iterating a dict yields its keys, i.e. the available cluster ranks
    ordered_ranks = list(by_clusters)
    ordered_ranks.sort()
    return ordered_ranks
def size_clust_order(
        by_clusters: dict[int, list[PDBFile]],
        ) -> list[int]:
    """Order clusters by decreasing number of models.

    Parameters
    ----------
    by_clusters : dict[int, list[PDBFile]]
        Models grouped by cluster; keys are the cluster ranks.

    Returns
    -------
    cluster_rankings : list[int]
        Keys of `by_clusters`, largest cluster first. Clusters holding
        the same number of models keep their relative order in
        `by_clusters`.
    """
    def _negated_size(cluster_id: int) -> int:
        # Negating the size gives a descending order with an ascending
        # (stable) sort, so ties stay in dictionary order.
        return -len(by_clusters[cluster_id])

    return sorted(by_clusters, key=_negated_size)
def map_clusters_models(models: list[PDBFile]) -> dict[int, list[PDBFile]]:
    """Group models by clusters.

    Parameters
    ----------
    models : list[PDBFile]
        List of PDBfiles models to be grouped.

    Returns
    -------
    by_clusters : dict[int, list[PDBFile]]
        Mapping from each distinct `clt_rank` value found in `models` to
        the list of models carrying that value, in input order.
    """
    # Collect the distinct cluster ranks first: they define the dict keys
    distinct_ranks = {mdl.clt_rank for mdl in models}
    grouped: dict[int, list[PDBFile]] = {
        rank: [] for rank in distinct_ranks
        }
    # Dispatch every model to the bucket of its cluster
    for mdl in models:
        grouped[mdl.clt_rank].append(mdl)
    return grouped
def write_selected_models(
        output_path: Union[str, Path],
        models: list[PDBFile],
        module_path: Union[str, Path],
        ) -> list[PDBFile]:
    """Dump selected models and new names in a file.

    For every model, a row is added to the tab-separated file at
    `output_path`, the model content is copied into a new file named
    `cluster_<clt_rank>_model_<clt_model_rank>.pdb` in the current working
    directory, and the model attributes are updated to point to that copy.

    Parameters
    ----------
    output_path : Union[str, Path]
        Name of the file to create.
    models : list[PDBFile]
        List of PDBfiles of selected models.
    module_path : Union[str, Path]
        Path of the module. Only its last component is used, to build the
        new relative path of each model.

    Returns
    -------
    models : list[PDBFile]
        Updated list of selected models (same objects, modified in place).
    """
    # The file is opened in text mode, so "\n" is the right terminator:
    # writing os.linesep here would produce "\r\r\n" on Windows.
    with open(output_path, 'w') as fh:
        fh.write("rel_path\tori_name\tcluster_name\tmd5\n")
        for model in models:
            name = (
                f"cluster_{model.clt_rank}_model"
                f"_{model.clt_model_rank}.pdb"
                )
            # The row records the attributes as they are *before* renaming
            fh.write(
                f"{model.rel_path}\t"
                f"{model.ori_name}\t"
                f"{name}\t"
                f"{model.md5}\n"
                )
            # Copy the model content under its new name (current directory)
            name_path = Path(name)
            name_path.write_text(model.rel_path.read_text())
            # Point the model to the freshly written copy
            model.ori_name = model.file_name
            model.file_name = name
            model.full_name = name
            model.rel_path = Path('..', Path(module_path).name, name)
            model.path = str(Path(".").resolve())
    return models