Commit 22f19301 authored by Weronika

Added the ECC error sensor to the NVML plugin

parent d6370083
......@@ -92,9 +92,16 @@ group nvml_utilisation {
}
}
group nvml_g2 {
default def1
sensor gpu_ecc_errors {
mqttsuffix /ecc_errors
feature GPU_ECC_ERR
}
}
......@@ -45,6 +45,7 @@ nvmlConfigurator::nvmlConfigurator() {
_gpuFeatureMAP["GPU_CLK_MEM"] = GPU_CLK_MEM;
_gpuFeatureMAP["GPU_UTL_MEM"] = GPU_UTL_MEM;
_gpuFeatureMAP["GPU_UTL_GPU"] = GPU_UTL_GPU;
_gpuFeatureMAP["GPU_ECC_ERR"] = GPU_ECC_ERR;
_groupName = "group";
_baseName = "sensor";
......
......@@ -50,6 +50,7 @@ enum GPU_FEATURE {
GPU_CLK_MEM = 9,
GPU_UTL_MEM = 10,
GPU_UTL_GPU = 11,
GPU_ECC_ERR = 13,
};
/**
......@@ -141,6 +142,9 @@ class nvmlSensorBase : public SensorBase {
case GPU_UTL_GPU:
feature = "GPU_UTL_GPU";
break;
case GPU_ECC_ERR:
feature = "GPU_ECC_ERR";
break;
}
LOG_VAR(ll) << leading << " Feature type: " << feature;
}
......
......@@ -51,7 +51,7 @@ struct counters_t {
unsigned int clockspeed_mem;
nvmlMemory_t memory;
unsigned int power;
// unsigned long long ecc_counts;
unsigned long long ecc_counts;
nvmlUtilization_t utilization;
} counters;
......@@ -186,6 +186,10 @@ void nvmlSensorGroup::read() {
err = nvmlDeviceGetUtilizationRates (env.device, &(counters.utilization));
reading.value = counters.utilization.gpu;
break;
case(GPU_ECC_ERR):
err = nvmlDeviceGetTotalEccErrors (env.device, NVML_MEMORY_ERROR_TYPE_CORRECTED,NVML_VOLATILE_ECC,&(counters.ecc_counts));
reading.value = counters.ecc_counts;
break;
}
s->storeReading(reading);
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment