"""Functions to monitor ECC error counts for NVIDIA GPUs"""
import logging
from py3nvml.py3nvml import *
from perfmon.core.metrics.gpumetrics.nvidia import device_query
# Module-level logger, named after this module's import path.
_log = logging.getLogger(__name__)
# pylint: disable=E0401,W0201,C0301
def ecc_error_counts(data):
    """Append the current NVIDIA GPU ECC error counts to ``data``.

    ``nvmlDeviceGetTotalEccErrors`` is queried through ``device_query`` twice,
    first for the ``'dp'`` series and then for the ``'sp'`` series.  The value
    at position ``i`` of each query result is appended to
    ``data[i]['ecc_errors'][<series>]``.

    Args:
        data: Indexable container whose ``i``-th item holds an
            ``'ecc_errors'`` mapping with appendable ``'dp'`` and ``'sp'``
            entries.  It is mutated in place.
            NOTE(review): presumably one item per GPU, in the order
            ``device_query`` reports devices -- confirm against the caller.

    Returns:
        The same ``data`` object that was passed in, after appending.
    """
    # (series key, first extra argument forwarded to the NVML call).
    # NOTE(review): in NVML's nvmlMemoryErrorType_t, 1 looks like UNCORRECTED
    # (double-bit) and 0 like CORRECTED (single-bit); the trailing 1 looks
    # like the AGGREGATE counter type -- confirm how device_query forwards
    # these arguments before relying on that reading.
    queries = (('dp', 1), ('sp', 0))
    for series, error_type in queries:
        counts = device_query('nvmlDeviceGetTotalEccErrors', error_type, 1)
        for i, err in enumerate(counts):
            data[i]['ecc_errors'][series].append(err)
    return data