Source code for haddock.modules.analysis.clustfcc.clustfcc

"""FCC clustering."""

import os
from pathlib import Path

import numpy as np

from haddock import log
from haddock.libs.libfcc import cluster_elements, output_clusters



[docs]
def iterate_clustering(pool, min_population_param):
    """
    Iterate over the clustering process until a cluster is found.

    Parameters
    ----------
    pool : fcc.Pool
        The pool object containing the fcc matrix.

    min_population_param : int
        The min_population parameter to start the clustering process.

    Returns
    -------
    clusters : list
        A list of clusters.

    min_population : int
        The min_population used to obtain the clusters.
    """
    cluster_check = False
    while not cluster_check:
        for min_population in range(min_population_param, 0, -1):
            log.info(f"Clustering with min_population={min_population}")
            _, clusters = cluster_elements(
                pool,
                threshold=min_population,
            )
            if not clusters:
                log.info("[WARNING] No cluster was found, decreasing min_population!")
            else:
                cluster_check = True
                # pass the actual min_population back to the param dict
                #  because it will be used in the detailed output
                break
        if not cluster_check:
            # No cluster was obtained in any min_population
            cluster_check = True
    return clusters, min_population




[docs]
def write_clusters(clusters, out_filename="cluster.out"):
    """
    Write the clusters to the cluster.out file.

    Parameters
    ----------
    clusters : list
        A list of clusters.

    out_filename : str, optional
        The name of the output file. The default is "cluster.out".

    Returns
    -------
    None
    """
    # write the classic output file for compatibility reasons
    log.info(f"Saving output to {out_filename}")
    cluster_out = Path(out_filename)
    with open(cluster_out, "w") as fh:
        output_clusters(fh, clusters)
    fh.close()




[docs]
def get_cluster_centers(clusters, models_to_cluster):
    """
    Get the cluster centers and the cluster dictionary.

    Parameters
    ----------
    clusters : list
        A list of clusters.

    models_to_cluster : list
        A list of models to cluster.

    Returns
    -------
    clt_dic : dict
        A dictionary containing the clusters.

    clt_centers : dict
        A dictionary containing the cluster centers.
    """
    clt_dic = {}
    clt_centers = {}
    # iterate over the clusters
    for clt in clusters:
        cluster_id = clt.name
        cluster_center_id = clt.center.name - 1
        cluster_center_pdb = models_to_cluster[cluster_center_id]

        clt_dic[cluster_id] = []
        clt_centers[cluster_id] = cluster_center_pdb
        clt_dic[cluster_id].append(cluster_center_pdb)
        # iterate over the models in the cluster
        for model in clt.members:
            model_id = model.name
            model_pdb = models_to_cluster[model_id - 1]
            clt_dic[cluster_id].append(model_pdb)
    return clt_dic, clt_centers




[docs]
def write_clustfcc_file(
    clusters,
    clt_centers,
    clt_dic,
    params,
    sorted_score_dic,
    output_fname="clustfcc.txt",
):  # noqa: E501
    """
    Write the clustfcc.txt file.

    Parameters
    ----------
    clusters : list
        A list of clusters.

    clt_centers : dict
        A dictionary containing the cluster centers.

    clt_dic : dict
        A dictionary containing the clusters.

    params : dict
        A dictionary containing the clustering parameters.

    sorted_score_dic : list
        A list of sorted scores.

    Returns
    -------
    None
    """
    # Prepare clustfcc.txt
    output_str = f"### clustfcc output ###{os.linesep}"
    output_str += os.linesep
    output_str += f"Clustering parameters {os.linesep}"
    output_str += (
        "> contact_distance_cutoff="
        f"{params['contact_distance_cutoff']}A"
        f"{os.linesep}"
    )
    output_str += f"> clust_cutoff={params['clust_cutoff']}" f"{os.linesep}"
    output_str += f"> min_population={params['min_population']}{os.linesep}"
    output_str += f"> strictness={params['strictness']}{os.linesep}"
    output_str += os.linesep
    output_str += (
        "Note: Models marked with * represent the center of the cluster" f"{os.linesep}"
    )
    output_str += f"-----------------------------------------------{os.linesep}"
    output_str += os.linesep
    output_str += f"Total # of clusters: {len(clusters)}{os.linesep}"

    for cluster_rank, _e in enumerate(sorted_score_dic, start=1):
        cluster_id, _ = _e
        center_pdb = clt_centers[cluster_id]
        model_score_l = [(e.score, e) for e in clt_dic[cluster_id]]
        model_score_l.sort()
        # subset_score_l = [e[0] for e in model_score_l][:min_population]
        subset_score_l = [e[0] for e in model_score_l]
        subset_score_l = subset_score_l[: params["min_population"]]
        top_mean_score = np.mean(subset_score_l)
        top_std = np.std(subset_score_l)
        output_str += (
            f"{os.linesep}"
            "-----------------------------------------------"
            f"{os.linesep}"
            f"Cluster {cluster_rank} (#{cluster_id}, "
            f"n={len(model_score_l)}, "
            f"top{params['min_population']}_avg_score = {top_mean_score:.2f} "
            f"+-{top_std:.2f})"
            f"{os.linesep}"
        )
        output_str += os.linesep
        output_str += f"clt_rank\tmodel_name\tscore{os.linesep}"
        for model_ranking, element in enumerate(model_score_l, start=1):
            score, pdb = element
            if pdb.file_name == center_pdb.file_name:
                output_str += (
                    f"{model_ranking}\t{pdb.file_name}\t{score:.2f}\t*" f"{os.linesep}"
                )
            else:
                output_str += (
                    f"{model_ranking}\t{pdb.file_name}\t{score:.2f}" f"{os.linesep}"
                )
    output_str += "-----------------------------------------------" f"{os.linesep}"

    log.info("Saving detailed output to clustfcc.txt")
    with open(output_fname, "w") as out_fh:
        out_fh.write(output_str)

    return