import logging
import os
import shutil
import subprocess
# Module-level logger named after this module; ResourceMonitor logs through it.
logger = logging.getLogger(__name__)
class ResourceMonitor:
    """
    Context manager that monitors system resources (CPU, memory) on a
    single- or multi-node system while a workload runs.

    Monitoring is delegated to the ``ska-sdp-benchmark-monitor`` command
    line tools: ``benchmon-start`` is invoked on entering the ``with``
    block and ``benchmon-stop`` on leaving it. When the ``SLURM_JOB_ID``
    environment variable is set to a non-empty value, the
    ``benchmon-multinode-start`` / ``benchmon-multinode-stop`` variants
    are used instead.

    This class should be instantiated and used within a "with" scope.
    """

    def __init__(
        self,
        generate_reports_on_failure: bool = True,
        save_dir: str = "./benchmon-work-dir",
        level: int = 0,
    ) -> None:
        """
        Create the output directory and record the monitor settings.

        Parameters
        ----------
        generate_reports_on_failure
            Whether to keep the reports even if the code inside the
            "with" block raised an exception. When False, ``save_dir``
            is deleted on exit after a failure.
        save_dir
            Directory to save the report. It is created if missing.
        level
            Level of the monitor, forwarded as ``--level``.
        """
        self.__level = level
        # Pin the directory to an absolute path. The monitor commands run
        # with ``cwd=save_dir`` AND receive ``--save-dir save_dir``; a
        # relative value would be resolved by the child relative to its
        # own cwd (i.e. ``save_dir/save_dir``), and would also drift if
        # the process changes directory after construction.
        self.__save_dir = os.path.abspath(save_dir)
        # Inside a SLURM job, use the multi-node flavour of the commands.
        self.__multinode = "-multinode" if os.getenv("SLURM_JOB_ID") else ""
        self.__generate_reports_on_failure = generate_reports_on_failure

        os.makedirs(self.__save_dir, exist_ok=True)

    def _run_benchmon(self, action: str) -> None:
        """Run ``benchmon[-multinode]-<action>`` inside the save directory.

        Raises
        ------
        subprocess.CalledProcessError
            If the command exits with a non-zero status.
        OSError
            If the command cannot be launched (e.g. not installed).
        """
        subprocess.run(
            [
                f"benchmon{self.__multinode}-{action}",
                "--level",
                str(self.__level),
                "--save-dir",
                self.__save_dir,
            ],
            check=True,
            cwd=self.__save_dir,
        )

    def __enter__(self) -> "ResourceMonitor":
        """Start the monitor and return this instance."""
        logger.info("Starting monitor with level: %s", self.__level)
        logger.info("Monitor output will be stored in: %s", self.__save_dir)
        self._run_benchmon("start")
        return self

    def __exit__(self, exception_type, *args) -> None:
        """
        Stop the monitor and, if requested, discard reports after a failure.

        Returns None, so an exception raised inside the "with" block is
        never suppressed.
        """
        logger.info("Closing resource monitor.")
        try:
            self._run_benchmon("stop")
        finally:
            # Clean up even when the stop command itself fails; otherwise
            # reports the caller asked to discard would be left behind.
            if (
                exception_type is not None
                and not self.__generate_reports_on_failure
            ):
                logger.error("Deleting resource monitor reports")
                shutil.rmtree(self.__save_dir, ignore_errors=True)