Source code for ska_sdp_batchlet.utils.monitor.resource

import logging
import os
import shutil
import subprocess

# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)


class ResourceMonitor:
    """
    Monitor system resources (CPU, memory) while a subprocess command runs.

    Monitoring is delegated to the ``benchmon-start`` / ``benchmon-stop``
    executables of `ska-sdp-benchmark-monitor`. When the ``SLURM_JOB_ID``
    environment variable is set to a non-empty value at construction time,
    the ``benchmon-multinode-start`` / ``benchmon-multinode-stop`` variants
    are used instead.

    This class should be instantiated and used within a "with" scope.
    """

    def __init__(
        self,
        generate_reports_on_failure: bool = True,
        working_directory: str = "./benchmon-work-dir",
        start: dict = None,
        stop: dict = None,
    ):
        """
        Parameters
        ----------
        generate_reports_on_failure
            Whether to keep the reports when the "with" body raised an
            exception. If False, the working directory is deleted on exit
            in that case.
        working_directory
            Directory where the subprocesses are run. It is created if it
            does not exist.
        start
            CLI arguments required by benchmon-start. Each key is followed
            by its value on the command line, in dict order.
        stop
            CLI arguments required by benchmon-stop. Each key is followed
            by its value on the command line, in dict order.
        """
        # Materialise the flattened arguments as tuples. A generator would
        # be exhausted after the first unpacking, so re-entering the same
        # monitor would silently run the commands without any arguments.
        self.__start_cli_args = self._flatten_cli_args(start)
        self.__stop_cli_args = self._flatten_cli_args(stop)

        self.__work_dir = os.path.abspath(working_directory)
        self.__multinode = "-multinode" if os.getenv("SLURM_JOB_ID") else ""
        self.__generate_reports_on_failure = generate_reports_on_failure

        os.makedirs(self.__work_dir, exist_ok=True)

    @staticmethod
    def _flatten_cli_args(options):
        """Return ``(key1, value1, key2, value2, ...)`` for a dict or None."""
        options = options or {}
        return tuple(arg for item in options.items() for arg in item)

    def __enter__(self):
        """
        Start the resource monitor inside the working directory.

        Raises
        ------
        subprocess.CalledProcessError
            If the start command exits with a non-zero status.
        """
        # A previous failed run may have deleted the working directory
        # (see __exit__); make sure it exists before using it as cwd.
        os.makedirs(self.__work_dir, exist_ok=True)

        _command = [
            f"benchmon{self.__multinode}-start",
            *self.__start_cli_args,
        ]

        logger.info(
            "Starting resource monitor with working_dir='%s' and command='%s'",
            self.__work_dir,
            _command,
        )

        subprocess.run(
            _command,
            check=True,
            cwd=self.__work_dir,
        )

        return self

    def __exit__(self, exception_type, *args):
        """
        Stop the resource monitor and optionally discard its reports.

        The exception raised in the "with" body, if any, is never
        suppressed.

        Raises
        ------
        subprocess.CalledProcessError
            If the stop command exits with a non-zero status.
        """
        _command = [
            f"benchmon{self.__multinode}-stop",
            *self.__stop_cli_args,
        ]

        logger.info("Closing resource monitor by executing: %s", _command)

        try:
            subprocess.run(
                _command,
                check=True,
                cwd=self.__work_dir,
            )
        finally:
            # Run the cleanup even if the stop command itself failed, so
            # that unwanted reports are not left behind.
            if (exception_type is not None) and (
                not self.__generate_reports_on_failure
            ):
                logger.error("Deleting resource monitor reports")
                shutil.rmtree(self.__work_dir, ignore_errors=True)