Source code for haddock.clis.cli_analyse

"""
Analyse a set of steps of a run.

Considering the example run::

    run1/
        0_topoaa/
        1_rigidbody/
        2_seletop/
        3_flexref/
        (etc...)


USAGE::

    haddock3-analyse -r <run_dir> -m <num_modules>
    haddock3-analyse -r run1 -m 1 3


Where, ``-m 1 3`` means that the analysis will be performed on ``1_rigidbody``
 and ``3_flexref``.
"""
import argparse
import os
import shutil
import sys
from pathlib import Path

from haddock import log
from haddock.clis.cli_unpack import main as haddock3_unpack
from haddock.clis.cli_clean import main as haddock3_clean
from haddock.core.defaults import INTERACTIVE_RE_SUFFIX
from haddock.core.typing import (
    Any,
    ArgumentParser,
    Callable,
    FilePath,
    ImgFormat,
    Namespace,
    Optional,
    ParamDict,
    ParamMap,
)
from haddock.gear.yaml2cfg import read_from_yaml_config
from haddock.gear.clean_steps import _unpack_gz
from haddock.libs.libcli import _ParamsToDict
from haddock.libs.libio import archive_files_ext
from haddock.libs.libontology import ModuleIO
from haddock.libs.libplots import (
    ClRank,
    box_plot_handler,
    clt_table_handler,
    read_capri_table,
    report_generator,
    scatter_plot_handler,
)
from haddock.modules import get_module_steps_folders
from haddock.modules.analysis.caprieval import DEFAULT_CONFIG as caprieval_params
from haddock import modules_defaults_path
from haddock.modules.analysis.caprieval import HaddockModule


ANA_FOLDER = "analysis"  # name of the analysis folder
INTER_STR = INTERACTIVE_RE_SUFFIX  # suffix of interactive analysis folders


[docs]def get_cluster_ranking(capri_clt_filename: FilePath, top_cluster: int) -> ClRank:
    """
    Get capri cluster ranking.

    Parameters
    ----------
    capri_clt_filename : str or Path
        capri cluster filename
    top_cluster : int
        Number of clusters to be considered

    Returns
    -------
    cl_ranking : dict
        {cluster_id : cluster_rank} dictionary
    """
    cl_ranking: ClRank = {}
    dfcl = read_capri_table(capri_clt_filename)
    for n in range(min(top_cluster, dfcl.shape[0])):
        cl_ranking[dfcl["cluster_id"].iloc[n]] = dfcl["caprieval_rank"].iloc[n]
    return cl_ranking


[docs]def update_paths(
    capri_ss_filename: FilePath, toch: str = "../", toadd: str = "../../"
) -> None:
    """
    Update paths in capri_ss_filename.

    Parameters
    ----------
    capri_ss_filename : str or Path
        capri ss filename
    toch : str
        string to be replaced
    toadd : str
        string to be added
    """
    new_lines: list[str] = []
    with open(capri_ss_filename, "r") as rfile:
        for ln in rfile:
            new_ln = ln.replace(toch, toadd)
            new_lines.append(new_ln)

    with open(capri_ss_filename, "w") as wfile:
        for ln in new_lines:
            wfile.write(ln)
    return


# Command line interface parser
ap = argparse.ArgumentParser(
    prog="haddock3-analyse",
    description=__doc__,
    formatter_class=argparse.RawDescriptionHelpFormatter,
)

ap.add_argument(
    "-r",
    "--run-dir",
    help="The input run directory.",
    required=True,
)

ap.add_argument(
    "-m",
    "--modules",
    nargs="+",
    help="The number of the steps to copy.",
    required=True,
    type=int,
)

ap.add_argument(
    "-t",
    "--top_cluster",
    help="The number of clusters to show.",
    required=False,
    type=int,
    default=10,
)

ap.add_argument(
    "--format",
    help="produce images in the desired format",
    required=False,
    type=str,
    default=None,
    choices=["png", "pdf", "svg", "jpeg", "webp"],
)

ap.add_argument(
    "--scale", help="scale for images", required=False, type=float, default=1.0
)

ap.add_argument(
    "--inter",
    help="interactive analysis",
    required=False,
    type=bool,
    default=False
    )

ap.add_argument(
    "--is_cleaned",
    help="is the directory going to be cleaned?",
    required=False,
    type=bool,
    default=False
)

ap.add_argument(
    "--offline",
    help="Should plots js functions be self-contained?",
    required=False,
    type=bool,
    default=False
)

ap.add_argument(
    "-p",
    "--other-params",
    dest="other_params",
    help=(
        "Any other parameter of the `caprieval` module."
        "For example: -p reference_fname target.pdb."
        "You can give any number of parameters."
    ),
    action=_ParamsToDict,
    default={},
    nargs="*",
)


def _ap() -> ArgumentParser:
    return ap


[docs]def load_args(ap: ArgumentParser) -> Namespace:
    """Load argument parser args."""
    return ap.parse_args()


[docs]def cli(ap: ArgumentParser, main: Callable[..., None]) -> None:
    """Command-line interface entry point."""
    cmd = vars(load_args(ap))
    kwargs = cmd.pop("other_params")
    main(**cmd, **kwargs)


[docs]def maincli() -> None:
    """Execute main client."""
    cli(ap, main)


[docs]def run_capri_analysis(step: str, run_dir: FilePath, capri_dict: ParamMap, is_cleaned: bool) -> None:
    """
    Run the caprieval analysis.

    Parameters
    ----------
    step : str
        step name
    run_dir : str or Path
        path to run directory
    capri_dict : dict
        capri dictionary of parameters
    """
    # retrieve json file with all information
    io = ModuleIO()
    filename = Path("..", f"{step}/io.json")
    io.load(filename)
    # unpack the files if they are compressed
    if is_cleaned:
        default_general_params = read_from_yaml_config(modules_defaults_path)
        path_to_unpack = io.output[0].path
        haddock3_unpack(path_to_unpack, ncores=default_general_params["ncores"])
    # create capri
    caprieval_module = HaddockModule(
        order=1,
        path=Path(run_dir),
        initial_params=caprieval_params,
    )
    caprieval_module.update_params(**capri_dict)
    # update model info
    caprieval_module.previous_io = io
    # run capri module
    caprieval_module._run()
    # compress files if they should be compressed
    if is_cleaned:
        haddock3_clean(path_to_unpack, ncores=default_general_params["ncores"])


[docs]def update_capri_dict(default_capri: ParamDict, kwargs: ParamMap) -> ParamDict:
    """
    Update capri dictionary.

    Parameters
    ----------
    default_capri : dict
        default capri dictionary of parameters
    kwargs : dict
        dictionary of input elements

    Returns
    -------
    capri_dict : dict
        updated capri dictionary of parameters
    """
    capri_dict = default_capri.copy()
    for param in kwargs:
        if param not in default_capri:
            sys.exit(
                f"* ERROR * Parameter {param!r} is not a valid `caprieval` parameter"
            )  # noqa:E501
        else:
            if param.endswith("fname"):  # using full path for files
                rel_path = Path(kwargs[param])
                _param = rel_path.resolve()
                kwargs[param] = _param
            capri_dict[param] = kwargs[param]
            log.info(f"setting {param} to {kwargs[param]}")

    return capri_dict


[docs]def update_paths_in_capri_dict(
    capri_dict: ParamDict, target_path: FilePath
) -> ParamDict:
    """
    Make capri_dict specific to target_path.

    Parameters
    ----------
    capri_dict : dict
        capri dictionary of parameters
    target_path : Path
        path to the output folder

    Returns
    -------
    new_capri_dict : dict
        target_path-specific capri dictionary of parameters
    """
    new_capri_dict = capri_dict.copy()
    for key in new_capri_dict:
        if key.endswith("fname") and new_capri_dict[key] not in ["", None]:
            try:
                ref_path = Path(target_path, "reference.pdb")
                shutil.copy(new_capri_dict[key], ref_path)
                new_capri_dict[key] = Path("reference.pdb")
            except FileNotFoundError:
                sys.exit(f"file not found {new_capri_dict[key]}")
    return new_capri_dict


[docs]def zip_top_ranked(capri_filename: FilePath, cluster_ranking: ClRank, summary_name: FilePath) -> None:
    """
    Zip the top ranked structures.

    Parameters
    ----------
    cluster_ranking : dict
        {cluster_id : cluster_rank} dictionary
    ss_file : str or Path
        capri ss filename

    Returns
    -------
    output_zipfile : str or Path
        path to the zipped file
    """
    capri_df = read_capri_table(capri_filename, comment="#")
    gb_cluster = capri_df.groupby("cluster_id")
    for cl_id, cl_df in gb_cluster:
        if cl_id in cluster_ranking.keys():
            if cl_id != "-":
                structs = cl_df.loc[cl_df["model-cluster_ranking"] <= 4][["model", "model-cluster_ranking"]]
            else:
                structs = cl_df.loc[cl_df["caprieval_rank"] <= 10][["model", "caprieval_rank"]]
            structs.columns = ["model", "rank"]
            # iterate over the structures
            for _, row in structs.iterrows():
                struct = Path(row["model"])
                struct_gz = Path(f"{struct}.gz")
                rank = row["rank"]
                # set target name
                if cl_id != "-":
                    target_name = f"cluster_{cluster_ranking[cl_id]}_model_{rank}.pdb"
                else:
                    target_name = f"model_{rank}.pdb"
                # copy the structure
                if Path(struct).exists():
                    shutil.copy(struct, Path(target_name))
                elif struct_gz.exists():
                    shutil.copy(struct_gz, ".")
                    # unpack the file
                    _unpack_gz(Path(".", struct_gz.name))
                    shutil.move(struct.name, Path(target_name))
                else:
                    log.warning(f"structure {struct} not found")

    # now make the archive and delete the pdb files
    archive_files_ext(".", "pdb")
    for file in Path(".").glob("*.pdb"):
        file.unlink()
    # move archive to summary
    expected_archive = Path(".", "pdb.tgz")
    if expected_archive.exists():
        shutil.move("pdb.tgz", summary_name)
        log.info(f"Summary archive {summary_name} created!")
    else:
        log.warning(f"Summary archive {summary_name} not created!")


[docs]def analyse_step(
    step: str,
    run_dir: FilePath,
    capri_dict: ParamDict,
    target_path: Path,
    top_cluster: int,
    format: Optional[ImgFormat],
    scale: Optional[float],
    is_cleaned: Optional[bool],
    offline: bool = False,
) -> None:
    """
    Analyse a step.

    If the step is a caprieval step, use the available capri files.
    Otherwise, launch a capri analysis.

    Parameters
    ----------
    step : str
        step name
    run_dir : str or Path
        path to run directory
    capri_dict : dict
        capri dictionary of parameters
    target_path : Path
        path to the output folder
    top_cluster : int
        Number of clusters to be considered
    format : str
        Produce images in the selected format.
    scale : int
        scale for images.
    """
    log.info(f"Analysing step {step}")

    target_path.mkdir(parents=True, exist_ok=False)
    step_name = step.split("_")[1]
    ss_fname = Path(run_dir, f"{step}/capri_ss.tsv")
    clt_fname = Path(run_dir, f"{step}/capri_clt.tsv")
    if step_name != "caprieval":
        if ss_fname.exists() and clt_fname.exists():
            log.info(f"step {step} has caprieval data, files are available")
            run_capri = False
        else:
            capri_dict = update_paths_in_capri_dict(capri_dict, target_path)
            run_capri = True
    else:
        log.info(f"step {step} is caprieval, files should be already available")
        run_capri = False

    if run_capri == False:
        shutil.copy(ss_fname, target_path)
        shutil.copy(clt_fname, target_path)

    os.chdir(target_path)
    # if the step is not caprieval, caprieval must be run
    if run_capri == True:
        run_capri_analysis(step, run_dir, capri_dict, is_cleaned)

    log.info("CAPRI files identified")
    # plotting
    ss_file = Path("capri_ss.tsv")
    clt_file = Path("capri_clt.tsv")
    if clt_file.exists():
        cluster_ranking = get_cluster_ranking(clt_file, top_cluster)
    else:
        raise Exception(f"clustering file {clt_file} does not exist")
    if ss_file.exists():
        log.info("Plotting results..")
        scatters = scatter_plot_handler(
            ss_file,
            cluster_ranking,
            format,
            scale,
            offline=offline,
            )
        boxes = box_plot_handler(
            ss_file,
            cluster_ranking,
            format,
            scale,
            offline=offline,
            )
        tables = clt_table_handler(clt_file, ss_file, is_cleaned)
        report_generator(boxes, scatters, tables, step)
        # provide a zipped archive of the top ranked structures
        zip_top_ranked(ss_file, cluster_ranking, Path("summary.tgz"))


[docs]def main(
    run_dir: FilePath,
    modules: list[int],
    top_cluster: int,
    format: Optional[ImgFormat],
    scale: Optional[float],
    inter: Optional[bool],
    is_cleaned: Optional[bool],
    offline: bool = False,
    **kwargs: Any,
) -> None:
    """
    Analyse CLI.

    Parameters
    ----------
    run_dir : str or Path
        Path to the original run directory.

    modules : list of ints
        List of the integer prefix of the modules to copy.

    top_cluster : int
        Number of clusters to be considered.

    format : str
        Produce images in the selected format.

    scale : int
        scale for images.

    inter: bool
        analyse only steps labelled as 'interactive'
    """
    log.level = 20
    log.info(
        f"Running haddock3-analyse on {run_dir}, modules {modules}, "
        f"with top_cluster = {top_cluster}"
    )
    ori_cwd = os.getcwd()
    # modifying the parameters
    default_capri = read_from_yaml_config(caprieval_params)
    capri_dict = update_capri_dict(default_capri, kwargs)

    os.chdir(run_dir)
    # Create analysis folder
    rundir_cwd = os.getcwd()
    outdir = Path(ANA_FOLDER)
    try:
        outdir.mkdir(exist_ok=False)
        log.info(f"Created directory: {str(outdir.resolve())}")
    except FileExistsError:
        log.warning(f"Directory {str(outdir.resolve())} already exists.")

    # Reading steps
    log.info("Reading input run directory")
    # get the module folders from the run_dir input
    sel_steps = get_module_steps_folders(Path("./"), modules)
    if inter:
        sel_steps = [st for st in sel_steps if st.endswith(INTER_STR)]
    else:
        sel_steps = [st for st in sel_steps if not st.endswith(INTER_STR)]
    log.info(f"selected steps: {', '.join(sel_steps)}")

    # analysis
    good_folder_paths: list[Path] = []
    bad_folder_paths: list[Path] = []
    for step in sel_steps:
        subfolder_name = f"{step}_analysis"
        target_path = Path(Path("./"), subfolder_name)

        # check if subfolder is already present
        dest_path = Path(ANA_FOLDER, subfolder_name)
        if dest_path.exists():
            if len(os.listdir(dest_path)) != 0 and not inter:
                log.warning(
                    f"{dest_path} exists and is not empty. "
                    "Skipping analysis..."
                    )
                continue
            else:  # subfolder is empty or is interactive, remove it.
                log.info(f"Removing folder {dest_path}.")
                shutil.rmtree(dest_path)

        # run the analysis
        error = False
        try:
            analyse_step(
                step,
                Path("./"),
                capri_dict,
                target_path,
                top_cluster,
                format,
                scale,
                is_cleaned,
                offline=offline,
            )
        except Exception as e:
            error = True
            log.warning(
                f"""Could not execute the analysis for step {step}.
                The following error occurred {e}"""
            )
        if error:
            bad_folder_paths.append(target_path)
        else:
            good_folder_paths.append(target_path)

        # going back
        os.chdir(rundir_cwd)

    # moving files into analysis folder
    if good_folder_paths != []:
        log.info("moving files to analysis folder")
        for directory in good_folder_paths:
            shutil.move(directory, outdir)

    if bad_folder_paths != []:
        log.info("cancelling unsuccesful analysis folders")
        for directory in bad_folder_paths:
            if directory.exists():
                shutil.rmtree(directory)

    # substituting the correct paths in the capri_ss files
    # after moving files into analysis folder
    for directory in good_folder_paths:
        ss_file = Path(outdir, directory, "capri_ss.tsv")
        if ss_file.exists():
            log.info(f"updating paths in {ss_file}")
            update_paths(ss_file, "../", "../../")
        report_file = Path(outdir, directory, "report.html")
        log.info(f"View the results in {report_file}")
        info_msg = (
            "To view structures or download the structure files, "
            f"in a terminal run the command "
            f"`python -m http.server --directory {rundir_cwd}`. "
            "By default, http server runs on `http://0.0.0.0:8000/`. "
            f"Open the link http://0.0.0.0:8000/{report_file} "
            "in a web browser."
        )
        log.info(info_msg)
    os.chdir(ori_cwd)
    return


if __name__ == "__main__":
    sys.exit(maincli())  # type: ignore