diff --git a/dcdbpusher/sensors/nvml/nvml.conf b/dcdbpusher/sensors/nvml/nvml.conf index e674e41c2651881de488aea038a65f2e2ae024f7..8b5b63cecd7a1629c01663a4c4fc17300ede5f02 100644 --- a/dcdbpusher/sensors/nvml/nvml.conf +++ b/dcdbpusher/sensors/nvml/nvml.conf @@ -92,9 +92,16 @@ group nvml_utilisation { } } +group nvml_g2 { + default def1 + sensor gpu_ecc_errors { + mqttsuffix /ecc_errors + feature GPU_ECC_ERR + } +} diff --git a/dcdbpusher/sensors/nvml/nvmlConfigurator.cpp b/dcdbpusher/sensors/nvml/nvmlConfigurator.cpp index 1b5d23c62bb1e57b4de3b4597298e6469ea8c088..034aa4263cb7768463df435fc3dbabfa395d8959 100644 --- a/dcdbpusher/sensors/nvml/nvmlConfigurator.cpp +++ b/dcdbpusher/sensors/nvml/nvmlConfigurator.cpp @@ -45,6 +45,7 @@ nvmlConfigurator::nvmlConfigurator() { _gpuFeatureMAP["GPU_CLK_MEM"] = GPU_CLK_MEM; _gpuFeatureMAP["GPU_UTL_MEM"] = GPU_UTL_MEM; _gpuFeatureMAP["GPU_UTL_GPU"] = GPU_UTL_GPU; + _gpuFeatureMAP["GPU_ECC_ERR"] = GPU_ECC_ERR; _groupName = "group"; _baseName = "sensor"; diff --git a/dcdbpusher/sensors/nvml/nvmlSensorBase.h b/dcdbpusher/sensors/nvml/nvmlSensorBase.h index 2f3ab12bde8137f5a80c8c1716cd81ef80feb88b..4a82ccf29bdfa1414407fbf8e7858150be8c1ad3 100644 --- a/dcdbpusher/sensors/nvml/nvmlSensorBase.h +++ b/dcdbpusher/sensors/nvml/nvmlSensorBase.h @@ -50,6 +50,7 @@ enum GPU_FEATURE { GPU_CLK_MEM = 9, GPU_UTL_MEM = 10, GPU_UTL_GPU = 11, + GPU_ECC_ERR = 13, }; /** @@ -141,6 +142,9 @@ class nvmlSensorBase : public SensorBase { case GPU_UTL_GPU: feature = "GPU_UTL_GPU"; break; + case GPU_ECC_ERR: + feature = "GPU_ECC_ERR"; + break; } LOG_VAR(ll) << leading << " Feature type: " << feature; } diff --git a/dcdbpusher/sensors/nvml/nvmlSensorGroup.cpp b/dcdbpusher/sensors/nvml/nvmlSensorGroup.cpp index 29188f8fc1488f5897b04eda4f75cf3b09d35dba..102ee4d9666686449797f19f5a8254d133035cb5 100644 --- a/dcdbpusher/sensors/nvml/nvmlSensorGroup.cpp +++ b/dcdbpusher/sensors/nvml/nvmlSensorGroup.cpp @@ -51,7 +51,7 @@ struct counters_t { unsigned int clockspeed_mem; nvmlMemory_t memory; unsigned int power; - // unsigned long long ecc_counts; + unsigned long long ecc_counts; nvmlUtilization_t utilization; } counters; @@ -186,6 +186,10 @@ void nvmlSensorGroup::read() { err = nvmlDeviceGetUtilizationRates (env.device, &(counters.utilization)); reading.value = counters.utilization.gpu; break; + case(GPU_ECC_ERR): + err = nvmlDeviceGetTotalEccErrors (env.device, NVML_MEMORY_ERROR_TYPE_CORRECTED,NVML_VOLATILE_ECC,&(counters.ecc_counts)); + reading.value = counters.ecc_counts; + break; } s->storeReading(reading);