# Source code for perfmon.core.metrics.gpumetrics.nvidia.errors

"""Functions to monitor ECC error counts for NVIDIA GPUs"""

import logging

from py3nvml.py3nvml import *

from perfmon.core.metrics.gpumetrics.nvidia import device_query

_log = logging.getLogger(__name__)

# pylint: disable=E0401,W0201,C0301


def ecc_error_counts(data):
    """Append the latest NVIDIA GPU ECC error counts (SP and DP) to ``data``.

    ``device_query('nvmlDeviceGetTotalEccErrors', ...)`` is called twice, and
    each value it yields is appended to the matching list inside ``data``.

    Args:
        data: Indexable container in which ``data[i]['ecc_errors']['dp']``
            and ``data[i]['ecc_errors']['sp']`` are objects supporting
            ``append`` for every index ``i`` produced by the query.
            NOTE(review): presumably index ``i`` identifies a GPU device --
            confirm against ``device_query``.

    Returns:
        The same ``data`` object, updated in place.
    """
    # NOTE(review): the second and third arguments are presumably forwarded
    # to the NVML call as its error-type and counter-type arguments, with
    # 1/0 selecting the 'dp'/'sp' kind -- confirm against device_query.
    # 'dp' is queried before 'sp'.
    for key, error_type in (('dp', 1), ('sp', 0)):
        counts = device_query('nvmlDeviceGetTotalEccErrors', error_type, 1)
        for idx, count in enumerate(counts):
            data[idx]['ecc_errors'][key].append(count)
    return data