import logging
import os
import shutil
import subprocess
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
[docs]
class ResourceMonitor:
    """
    Monitor system resources while a subprocess command is running.

    ResourceMonitor is responsible for monitoring the system resources
    (CPU, Memory) on single/multi-node system, while the subprocess command
    is running. It uses `ska-sdp-benchmark-monitor` for resource monitoring.

    This class should be instantiated and used within a "with" scope:
    entering the scope runs ``benchmon-start`` and leaving it runs
    ``benchmon-stop``. When the ``SLURM_JOB_ID`` environment variable is set
    to a non-empty value at construction time, the ``benchmon-multinode-start``
    and ``benchmon-multinode-stop`` commands are used instead.
    """

    def __init__(
        self,
        generate_reports_on_failure: bool = True,
        working_directory: str = "./benchmon-work-dir",
        start: dict = None,
        stop: dict = None,
    ):
        """
        Parameters
        ----------
        generate_reports_on_failure
            Whether to keep the generated reports even if some exception
            occurred inside the "with" scope. When False, the working
            directory is deleted on exit if an exception occurred.
        working_directory
            Directory in which the subprocesses are run. It is converted
            to an absolute path and created if it does not exist.
        start
            CLI arguments required by benchmon-start, given as a mapping
            of option to value. Each ``(option, value)`` pair is passed
            to the command as two consecutive arguments.
        stop
            CLI arguments required by benchmon-stop, in the same format
            as ``start``.
        """
        # Stored as tuples (not generators) so that the arguments survive
        # being read more than once, e.g. when the same instance is used
        # in several "with" scopes.
        self.__start_cli_args = self._flatten_cli_args(start)
        self.__stop_cli_args = self._flatten_cli_args(stop)

        self.__work_dir = os.path.abspath(working_directory)

        # A non-empty SLURM_JOB_ID selects the multinode commands.
        self.__multinode = (
            "-multinode" if bool(os.getenv("SLURM_JOB_ID")) else ""
        )

        self.__generate_reports_on_failure = generate_reports_on_failure

        os.makedirs(self.__work_dir, exist_ok=True)

    @staticmethod
    def _flatten_cli_args(options):
        """Flatten an option->value mapping into a tuple of CLI arguments.

        ``None`` or an empty mapping yields an empty tuple.
        """
        options = options or {}
        return tuple(arg for item in options.items() for arg in item)

    def __enter__(self):
        """
        Start the resource monitor and return this instance.

        Raises
        ------
        subprocess.CalledProcessError
            If the start command exits with a non-zero status.
        """
        _command = [
            f"benchmon{self.__multinode}-start",
            *self.__start_cli_args,
        ]

        logger.info(
            "Starting resource monitor with working_dir='%s' and command='%s'",
            self.__work_dir,
            _command,
        )

        subprocess.run(
            _command,
            check=True,
            cwd=self.__work_dir,
        )

        return self

    def __exit__(self, exception_type, *args):
        """
        Stop the resource monitor and optionally discard its reports.

        Nothing is returned, so an exception raised inside the "with"
        scope is not suppressed.

        Parameters
        ----------
        exception_type
            Type of the exception raised inside the "with" scope,
            or None if the scope completed without an exception.
        *args
            Remaining context-manager arguments (exception value and
            traceback); unused.

        Raises
        ------
        subprocess.CalledProcessError
            If the stop command exits with a non-zero status. In that
            case the working directory is left in place.
        """
        _command = [
            f"benchmon{self.__multinode}-stop",
            *self.__stop_cli_args,
        ]

        logger.info("Closing resource monitor by executing: %s", _command)

        subprocess.run(
            _command,
            check=True,
            cwd=self.__work_dir,
        )

        # Reports are discarded only when the monitored scope failed and
        # the caller opted out of keeping reports for failed runs.
        if (exception_type is not None) and (
            not self.__generate_reports_on_failure
        ):
            logger.error("Deleting resource monitor reports")
            shutil.rmtree(self.__work_dir, ignore_errors=True)