# Source code for ska_sdp_batchlet.utils.monitor.resource

import logging
import os
import shutil
import subprocess

logger = logging.getLogger(__name__)


class ResourceMonitor:
    """
    Monitor system resources (CPU, memory) while a subprocess command runs.

    Works on single-node and multi-node systems and uses
    `ska-sdp-benchmark-monitor` for the actual monitoring. The multi-node
    variant of the benchmon commands is used when the ``SLURM_JOB_ID``
    environment variable is set to a non-empty value at construction time.

    This class should be instantiated and used within a "with" scope:
    entering the scope runs the benchmon "start" command, leaving it runs
    the "stop" command.
    """

    def __init__(
        self,
        generate_reports_on_failure: bool = True,
        save_dir: str = "./benchmon-work-dir",
        level: int = 0,
    ):
        """
        Store the configuration and create the output directory.

        Parameters
        ----------
        generate_reports_on_failure
            Whether to keep the reports even if an exception occurs inside
            the "with" scope. When False, ``save_dir`` is deleted on exit
            if the scope raised.
        save_dir
            Directory to save the report. Created if it does not exist.
        level
            Level of the monitor, forwarded to benchmon as ``--level``.
        """
        self.__level = level
        self.__save_dir = save_dir
        # An empty or missing SLURM_JOB_ID selects the single-node commands.
        self.__multinode = "-multinode" if os.getenv("SLURM_JOB_ID") else ""
        self.__generate_reports_on_failure = generate_reports_on_failure

        os.makedirs(self.__save_dir, exist_ok=True)

    def _run_benchmon(self, action: str) -> None:
        """Run ``benchmon[-multinode]-<action>`` from inside the save dir.

        Raises
        ------
        subprocess.CalledProcessError
            If the command exits with a non-zero status.
        OSError
            If the command cannot be launched (e.g. it is not installed).
        """
        subprocess.run(
            [
                f"benchmon{self.__multinode}-{action}",
                "--level",
                f"{self.__level}",
                "--save-dir",
                self.__save_dir,
            ],
            check=True,
            cwd=self.__save_dir,
        )

    def __enter__(self):
        """Start the monitor and return this instance."""
        logger.info("Starting monitor with level: %s", self.__level)
        logger.info("Monitor output will be stored in: %s", self.__save_dir)
        self._run_benchmon("start")
        return self

    def __exit__(self, exception_type, *args):
        """
        Stop the monitor and, if requested, discard reports after a failure.

        Parameters
        ----------
        exception_type
            Type of the exception raised inside the "with" scope, or None
            if the scope completed normally.
        *args
            Remaining context-manager arguments (exception value and
            traceback); unused.

        Notes
        -----
        Nothing is returned, so an exception raised inside the scope is
        never suppressed. A failure of the stop command itself still
        propagates to the caller.
        """
        logger.info("Closing resource monitor.")
        discard_reports = (
            exception_type is not None
            and not self.__generate_reports_on_failure
        )
        try:
            self._run_benchmon("stop")
        finally:
            # Clean up in a ``finally`` so that a failing stop command
            # cannot leave behind reports the caller asked to be discarded.
            if discard_reports:
                logger.error("Deleting resource monitor reports")
                shutil.rmtree(self.__save_dir, ignore_errors=True)