"""
Prepare HADDOCK3 benchmark configuration files and job scripts.
Creates HADDOCK3 configuration files and job files. Details on each
parameter are explained in the `-h` menu.
There's also a test flag that generates jobs only with `topology`
creation. This feature helps testing the `haddock3-dmn` client.
The state of the jobs is identified by a file tag:
- AVAILABLE
- RUNNING
- DONE
- FAIL
At start, all jobs have the AVAILABLE tag, and this tag is upgraded as
the job completes. To know which jobs are in each state, navigate to the
<output dir> and search for the tags, for example::
find . -name AVAILABLE
find . -name RUNNING
find . -name DONE
find . -name FAIL
Jobs are identified as FAIL if there are messages in the stderr file.
Finally, a daemon job file is created to facilitate the usage of the
daemon without directly using the `haddock3-dmn` client.
Usage::
haddock3-bm -h
haddock3-bm <BM dir> <output dir> [OPTIONS]
haddock3-bm <BM dir> <output dir> --workload-manager <option>
haddock3-bm <BM dir> <output dir> --workload-manager <option> -n <num cores>
haddock3-bm <BM dir> <output dir> --workload-manager <option> -n <num cores> -td
A `BM folder` is a folder with the characteristics of:
https://github.com/haddocking/BM5-clean
For more information read our benchmark tutorial at `docs/benchmark.tut`
in HADDOCK3 repository site: https://github.com/haddocking/haddock3
""" # noqa: E501
import argparse
import shutil
import string
import sys
from functools import partial
from pathlib import Path
from haddock import log
from haddock.core.typing import (
Any,
ArgumentParser,
Callable,
FilePath,
Namespace,
Union,
)
from haddock.libs.libhpc import create_job_header_funcs
# First characters allowed for benchmark test-case folder names.
# We use digits and upper-case letters because BM test-case folders are
# expected to be named after their PDBID code (e.g. "1A2K").
# Used as the `tuple` argument to `str.startswith` in `_is_valid`.
capital_and_digits = tuple(string.digits + string.ascii_uppercase)
# client helper functions
def _dir_path(path: FilePath) -> Path:
path = Path(path)
if not path.is_dir():
_p = str(path.resolve())
raise argparse.ArgumentTypeError(f"{_p!r} is not a directory.")
return path
def _is_valid(
    f: Path, cap_and_dig: Union[str, tuple[str, ...]] = capital_and_digits
) -> bool:
    """Return ``True`` when *f* is a directory named like a PDBID test case."""
    # a valid model directory must both exist as a directory and have a
    # name starting with a digit or capital letter (PDBID convention)
    return f.name.startswith(cap_and_dig) and f.is_dir()
def get_conda_path() -> Path:
    """
    Return the path to the ``conda.sh`` activation script.

    Derived from the location of the running Python executable, assuming
    a standard conda layout: ``<conda_root>/etc/profile.d/conda.sh``.
    """
    conda_root = Path(sys.executable).parents[3]
    return conda_root / "etc" / "profile.d" / "conda.sh"
def create_cfg_test_daemon(
    run_dir: FilePath,
    receptor_f: FilePath,
    ligand_f: FilePath,
    *ignore: Any,
    **everythinelse: Any,
) -> str:
    """
    Create a HADDOCK3 configuration file that only generates the topology.

    This function is useful to test the benchmark daemon, because the
    resulting jobs are very short.

    Parameters
    ----------
    run_dir : path or str
        Path to the run directory; where run results will be saved.
    receptor_f : Path or str
        Absolute path pointing to the receptor PDB file.
    ligand_f : Path or str
        Absolute path pointing to the ligand PDB file.
    *ignore, **everythinelse
        Accepted and ignored so this function is call-compatible with
        the other scenario-creation functions (see `benchmark_scenarios`).

    Returns
    -------
    str
        The HADDOCK3 configuration file content.
    """
    cfg_str = f"""
run_dir = {str(run_dir)!r}
ncores = 2

molecules = [
    {str(receptor_f)!r},
    {str(ligand_f)!r}
    ]

[topoaa]
"""
    return cfg_str
# FIXME: the sampling/selection parameters should not be hardcoded here
def create_cfg_ti(
    run_dir: FilePath,
    receptor_f: FilePath,
    ligand_f: FilePath,
    ambig_f: FilePath,
    target_f: FilePath,
) -> str:
    """
    Create the HADDOCK3 configuration file for the `true-interface` scenario.

    The generated pipeline is: topology, rigid-body docking (1000
    models), top-200 selection, flexible refinement, and MD refinement,
    with a CAPRI evaluation against the target after each docking stage.

    Parameters
    ----------
    run_dir : path or str
        Path to the run directory; where run results will be saved.
    receptor_f : Path or str
        Absolute path pointing to the receptor PDB file.
    ligand_f : Path or str
        Absolute path pointing to the ligand PDB file.
    ambig_f : Path or str
        Absolute path pointing to the `ambig.tbl` file.
    target_f : Path or str
        Absolute path pointing to the reference (target) PDB file used
        by the `caprieval` modules.

    Returns
    -------
    str
        The HADDOCK3 configuration file for benchmarking.
    """
    cfg_str = f"""
run_dir = {str(run_dir)!r}
ncores = 48

molecules = [
    {str(receptor_f)!r},
    {str(ligand_f)!r}
    ]

[topoaa]

[rigidbody]
ambig_fname = {str(ambig_f)!r}
sampling = 1000
noecv = false

[caprieval]
reference = {str(target_f)!r}

[seletop]
select = 200

[flexref]
ambig_fname = {str(ambig_f)!r}
noecv = true

[caprieval]
reference = {str(target_f)!r}

[mdref]
ambig_fname = {str(ambig_f)!r}
noecv = true

[caprieval]
reference = {str(target_f)!r}
"""
    return cfg_str
def create_job(
    create_job_header: Callable[..., str],
    create_job_body: Callable[[FilePath, FilePath, FilePath], str],
    create_job_tail: Callable[[FilePath, FilePath, FilePath], str],
    job_name_prefix: str,
    scenario_name: str,
    job_name_suffix: str,
    queue_name: str,
    ncores: int,
    work_dir: Path,
    run_dir: Path,
    config_file: FilePath,
) -> str:
    """
    Assemble a complete job-script string.

    The job is built from three parts, each produced by the
    corresponding factory function: a scheduler header, a body (the
    actual HADDOCK3 execution), and a tail (post-execution status
    handling).

    Parameters
    ----------
    create_job_header : callable
        Creates the scheduler header. Receives the job name positionally
        and `work_dir`, `stdout_path`, `stderr_path`, `queue`, and
        `ncores` as keyword arguments.
    create_job_body : callable
        Creates the job body. Receives the AVAILABLE-tag path, the
        RUNNING-tag path, and the configuration-file path.
    create_job_tail : callable
        Creates the job tail. Receives the stderr log path and the
        DONE/FAIL tag paths.
    job_name_prefix : str
        Prefix for the job name, normally the test-case PDB ID.
    scenario_name : str
        Name (acronym) of the benchmark scenario.
    job_name_suffix : str
        Additional suffix for the job name, normally `BM5`.
    queue_name : str
        Name of the scheduler queue.
    ncores : int
        Number of CPU cores requested for the job.
    work_dir : pathlib.Path
        Working directory of the test case (contains `input`, `jobs`,
        and `logs`).
    run_dir : pathlib.Path
        Running directory of the scenario; the state-tag files live here.
    config_file : pathlib.Path
        Path to the scenario configuration file.

    Returns
    -------
    str
        The job file content.
    """
    job_title = f"{job_name_prefix}-{scenario_name}-{job_name_suffix}"
    out_log = str(Path("logs", "haddock.out"))
    err_log = str(Path("logs", "haddock.err"))

    header = create_job_header(
        job_title,
        work_dir=work_dir,
        stdout_path=out_log,
        stderr_path=err_log,
        queue=queue_name,
        ncores=ncores,
    )

    # the four job-state tag files, all relative to `run_dir`
    tags = {
        name: str(Path(run_dir, name))
        for name in ("AVAILABLE", "RUNNING", "DONE", "FAIL")
    }

    body = create_job_body(tags["AVAILABLE"], tags["RUNNING"], config_file)
    tail = create_job_tail(err_log, tags["DONE"], tags["FAIL"])

    return header + body + tail
def setup_haddock3_job(
    available_flag: FilePath, running_flag: FilePath, conf_f: FilePath
) -> str:
    """
    Write the body of the job script.

    The body activates the `haddock3` conda environment, flips the state
    tag from AVAILABLE to RUNNING, runs HADDOCK3 on the configuration
    file, and removes the RUNNING tag when execution finishes.

    Parameters
    ----------
    available_flag : pathlib.Path
        Path of the `AVAILABLE` file tag, relative to the job header's
        `workdir`.
    running_flag : pathlib.Path
        Path of the `RUNNING` file tag, relative to the job header's
        `workdir`.
    conf_f : Path or str
        Path to the configuration file, relative to the job header's
        `workdir`.

    Returns
    -------
    str
        The body of the job file.
    """
    activation_script = get_conda_path()
    return f"""
source {str(activation_script)}
conda activate haddock3
rm {str(available_flag)}
touch {str(running_flag)}
haddock3 {str(conf_f)}
rm {str(running_flag)}
"""
def process_job_execution_status(
    stderr: FilePath, done_tag: FilePath, fail_tag: FilePath
) -> str:
    """
    Add the execution-status tail to the job script.

    If the job completes properly (empty stderr file) the tail creates
    the `DONE` tag, otherwise it creates the `FAIL` tag.

    Parameters
    ----------
    stderr : Path or str
        Path to the job's stderr log file, relative to the job header
        `workdir`. A non-empty file is interpreted as failure.
    done_tag : pathlib.Path
        Path where to generate the `DONE` file tag. Relative to the
        job header `workdir`.
    fail_tag : pathlib.Path
        Path where to generate the `FAIL` file tag. Relative to the
        job header `workdir`.

    Returns
    -------
    str
        The shell snippet to append to the job script.
    """
    # `[ -s FILE ]` is true when FILE exists and has a size > 0
    tail = f"""
if [ -s {stderr} ]; then
    # the file is not empty
    touch {str(fail_tag)}
else
    # the file is empty
    touch {str(done_tag)}
fi
"""
    return tail
def process_target(
    source_path: Path,
    result_path: FilePath,
    create_job_func: Callable[..., str],
    scenarios: dict[str, Callable[..., str]],
) -> None:
    """
    Process each model example for benchmarking.

    Creates the output folder structure for one benchmark target, copies
    the required input files into it, and writes one configuration file
    and one job script per scenario.

    Parameters
    ----------
    source_path : Path
        The folder path to the target. Its name is taken as the PDB ID,
        and it must contain `<pdbid>_l_u.pdb`, `<pdbid>_r_u.pdb`,
        `ambig.tbl`, and `ana_scripts/target.pdb`.
    result_path : Path
        The path where the results for the different scenarios will be
        saved.
    create_job_func : callable
        A function to create the job script. This is an argument because
        there are several queue systems available. See
        `create_job_header_funcs`.
    scenarios : dict of str to callable
        Maps each scenario name to the function creating its HADDOCK3
        configuration file. Keys must also exist in the module-level
        `scenarios_acronym` dict.
    """
    # test-case folders are named after their PDB ID
    pdb_id = source_path.name

    # Define the root directory
    root_p = Path(result_path, pdb_id).resolve()
    root_p.mkdir(exist_ok=True, parents=True)

    # Define the input directory
    # all files required for the different scenarios
    # should be inside this folder
    input_p = Path(root_p, "input")
    input_p.mkdir(exist_ok=True, parents=True)

    # copy the required files to the `input` path
    # (shutil.copy returns the destination path as a string)
    ligand = shutil.copy(Path(source_path, f"{pdb_id}_l_u.pdb"), input_p)
    receptor = shutil.copy(Path(source_path, f"{pdb_id}_r_u.pdb"), input_p)
    ambig_tbl = shutil.copy(Path(source_path, "ambig.tbl"), input_p)
    target = shutil.copy(
        Path(source_path, "ana_scripts", "target.pdb"), input_p
    )  # noqa: E501

    # make all paths relative so the generated files are portable
    ligand = Path(ligand).relative_to(root_p)
    receptor = Path(receptor).relative_to(root_p)
    ambig_tbl = Path(ambig_tbl).relative_to(root_p)
    target = Path(target).relative_to(root_p)

    # Define the job folder
    # all job files should be here
    job_p = Path(root_p, "jobs")
    job_p.mkdir(exist_ok=True, parents=True)

    # Define the logs folder
    log_p = Path(root_p, "logs")
    log_p.mkdir(exist_ok=True, parents=True)

    # for each scenario...
    for scn_name, scn_func in scenarios.items():
        # creates a scenario run folder
        run_folder = Path(result_path, pdb_id, f"run-{scn_name}").resolve()
        run_folder.mkdir(parents=True, exist_ok=True)

        # creates the AVAILABLE tag (the job's initial state)
        available_file = Path(run_folder, "AVAILABLE")
        available_file.touch()

        run_folder_rel = run_folder.relative_to(root_p)

        # the actual scenario results will be inside the `run` folder
        # which is inside the `run_folder` itself.
        run_job_folder = Path(run_folder_rel, "run")

        cfg_file = Path(input_p, f"{scn_name}.cfg")
        job_file = Path(job_p, f"{scn_name}.job")

        # creates the HADDOCK3 configuration file for the scenario
        cfg_str = scn_func(
            run_job_folder,
            receptor,
            ligand,
            ambig_tbl,
            target,
        )

        # creates the job for the scenario
        # NOTE: uses the module-level `scenarios_acronym` mapping
        job_str = create_job_func(
            job_name_prefix=pdb_id,
            scenario_name=scenarios_acronym[scn_name],
            work_dir=root_p,
            run_dir=run_folder_rel,
            config_file=cfg_file.relative_to(root_p),
        )

        # saves files to disk
        cfg_file.write_text(cfg_str)
        job_file.write_text(job_str)

    return
def make_daemon_job(
    create_job_func: Callable[..., str],
    workdir: FilePath,
    target_dir: FilePath,
    job_name: str = "HD3-dmn",
    stdout_path: FilePath = "daemon.out",
    stderr_path: FilePath = "daemon.err",
    queue: str = "short",
) -> str:
    """
    Make a daemon-ready job.

    Builds a single-core job script that activates the `haddock3` conda
    environment and launches the `haddock3-dmn` client on *target_dir*.

    Parameters
    ----------
    create_job_func : callable
        The workload-manager header factory (see
        `create_job_header_funcs`).
    workdir : Path or str
        The job's working directory.
    target_dir : Path or str
        The benchmark output directory the daemon should watch.
    job_name, stdout_path, stderr_path, queue
        Passed through to the job header.

    Returns
    -------
    str
        The daemon job file content.
    """
    header = create_job_func(
        job_name=job_name,
        work_dir=workdir,
        stdout_path=stdout_path,
        stderr_path=stderr_path,
        queue=queue,
        ncores=1,
    )
    activation_script = get_conda_path()
    return f"""{header}
source {str(activation_script)}
conda activate haddock3
haddock3-dmn {str(target_dir)}
"""
# helper dictionaries
# the different scenarios covered by the full benchmark;
# maps scenario name -> configuration-file creation function
benchmark_scenarios = {
    "true-interface": create_cfg_ti,
    # 'center-of-mass': create_cfg_scn_2,
}
# topology-only scenarios used when --test-daemon is given
test_scenarios = {
    "test-daemon": create_cfg_test_daemon,
}
# short acronyms embedded in job names (job names have limited length)
scenarios_acronym = {
    "true-interface": "ti",
    "center-of-mass": "com",
    "test-daemon": "tdmn",
}
# prepares the command-line client arguments
ap = argparse.ArgumentParser(
    prog="HADDOCK3 benchmark setup.",
    description=__doc__,
    formatter_class=argparse.RawDescriptionHelpFormatter,
)

ap.add_argument(
    "benchmark_path",
    help=(
        "Location of BM5 folder. "
        "This folder should have a subfolder for each model. "
        "We expected each subfolder to have the name of a PDBID. "
        "That is, folder starting with capital letters or numbers "
        "will be considered."
    ),
    type=_dir_path,
)

ap.add_argument(
    "output_path",
    help="Where the prepared jobs and the executed benchmark will be stored.",
    type=_dir_path,
)

ap.add_argument(
    "-wm",
    "--workload-manager",
    dest="workload_manager",
    help="The system where the jobs will be run. Default `slurm`.",
    choices=list(create_job_header_funcs.keys()),
    default="slurm",
)

ap.add_argument(
    "-td",
    "--test-daemon",
    dest="test_daemon",
    help=(
        "Creates configuration files just with the `topology` module. "
        "Useful to test the benchmark daemon. Default `False`."
    ),
    action="store_true",
)

ap.add_argument(
    "-qu",
    "--queue-name",
    dest="queue_name",
    help=("The name of the queue where to send the jobs. " "Default 'medium'."),
    default="medium",
)

ap.add_argument(
    "-n",
    "--ncores",
    help="Maximum number of processors to use in the jobs. Default: 48",
    default=48,
    # fix: without `type=int` the CLI delivered ncores as a string,
    # while the default (and `main`'s signature) is an int
    type=int,
)

ap.add_argument(
    "-s",
    "--suffix",
    help=(
        "A common suffix for all jobs. Defaults to 'BM5'. Avoid using "
        "long names because the job-name has a limited amount of characters."
    ),
    default="BM5",
    type=str,
)


def _ap() -> ArgumentParser:
    """Return the module-level argument parser (used by doc tooling)."""
    return ap
# client helper functions
[docs]def load_args(ap: ArgumentParser) -> Namespace:
"""Load argument parser args."""
return ap.parse_args()
def cli(ap: ArgumentParser, main: Callable[..., None]) -> None:
    """Command-line interface entry point: parse args and dispatch to *main*."""
    parsed = load_args(ap)
    # forward every parsed argument as a keyword argument
    main(**vars(parsed))
def maincli() -> None:
    """Execute the main client with the module-level parser and `main`."""
    cli(ap, main)
def main(
    benchmark_path: Path,
    output_path: FilePath,
    workload_manager: str = "slurm",
    ncores: int = 48,
    queue_name: str = "medium",
    test_daemon: bool = False,
    suffix: str = "BM5",
) -> None:
    """
    Create configuration and job scripts for HADDOCK3 benchmarking.

    Developed for https://github.com/haddocking/BM5-clean

    The parameters defined here are the same as defined in the client
    arguments.

    This is the main function of the client. If you want to run the
    benchmark creation routine withOUT using the command line and
    instead importing its functionalities and setting it up from
    another python script, you should import this function.

    >>> from haddock.clis.cli_bm import main

    Parameters
    ----------
    benchmark_path : Path
        The path to the benchmark models folder. In BM5-clean would be
        the 'HADDOCK-ready' folder.
    output_path : Path
        Where the results will be saved. A subfolder for each model in
        `benchmark_path` will be created.
    workload_manager : str
        A key for `create_job_header_funcs` dictionary. These relate to
        the queue managing software installed in your system. Examples
        are 'slurm' and 'torque'.
    ncores : int
        The number of CPUs to use in the created jobs.
    queue_name : str
        The name of the queue where the jobs will be sent. This depends
        on your system configurations.
    test_daemon : bool
        If `True`, generates short jobs where only the `topology` will
        be created. This facilitates testing the `haddock3` benchmark
        daemon.
    suffix : str
        A common suffix for all jobs. Avoid using more than three chars.
    """
    # fix: "Benchnark" typo corrected in the log message
    log.info("*** Preparing Benchmark scripts")

    # only folders named like PDBID test cases are considered
    _ = (f for f in benchmark_path.glob("*") if _is_valid(f))
    source_folders = sorted(_)
    log.info(f"* Creating benchmark jobs for {len(source_folders)} targets")

    # which scenarios to use?
    _scenarios = test_scenarios if test_daemon else benchmark_scenarios

    # prepares a `create_job_func` with predefined parameters.
    _create_job_func = partial(
        create_job,
        create_job_header=create_job_header_funcs[workload_manager],
        create_job_body=setup_haddock3_job,
        create_job_tail=process_job_execution_status,
        queue_name=queue_name,
        ncores=ncores,
        job_name_suffix=suffix,
    )

    # prepares a `process_target` function with predefined parameters
    # before entering the for loop
    pe = partial(
        process_target,
        result_path=output_path,
        create_job_func=_create_job_func,
        scenarios=_scenarios,
    )

    # for each benchmark test case...
    for source_path in source_folders:
        pe(source_path)

    # makes a job to run the daemon as a job.
    dmn_job = make_daemon_job(
        create_job_header_funcs[workload_manager],
        Path.cwd(),
        output_path,
        queue=queue_name,
    )
    Path(output_path, "hd3-daemon.job").write_text(dmn_job)

    log.info("* done")
    return
if __name__ == "__main__":
    # `maincli` returns None, so the process exits with status 0 on success
    sys.exit(maincli())  # type: ignore