Commit 22f19301 authored by Weronika

Added the ECC error sensor to the NVML plugin.

parent d6370083
...@@ -92,9 +92,16 @@ group nvml_utilisation { ...@@ -92,9 +92,16 @@ group nvml_utilisation {
} }
} }
; Sensor group holding the GPU ECC error sensor.
group nvml_g2 {
; NOTE(review): def1 is presumably a template group defined elsewhere in this
; config file; it is not visible in this hunk - confirm it exists.
default def1
; ECC error counter sensor. The feature key GPU_ECC_ERR must match the name
; registered in the nvml configurator's feature map; the sensor group reads it
; via nvmlDeviceGetTotalEccErrors (corrected errors, volatile counter).
sensor gpu_ecc_errors {
mqttsuffix /ecc_errors
feature GPU_ECC_ERR
}
}
...@@ -45,6 +45,7 @@ nvmlConfigurator::nvmlConfigurator() { ...@@ -45,6 +45,7 @@ nvmlConfigurator::nvmlConfigurator() {
_gpuFeatureMAP["GPU_CLK_MEM"] = GPU_CLK_MEM; _gpuFeatureMAP["GPU_CLK_MEM"] = GPU_CLK_MEM;
_gpuFeatureMAP["GPU_UTL_MEM"] = GPU_UTL_MEM; _gpuFeatureMAP["GPU_UTL_MEM"] = GPU_UTL_MEM;
_gpuFeatureMAP["GPU_UTL_GPU"] = GPU_UTL_GPU; _gpuFeatureMAP["GPU_UTL_GPU"] = GPU_UTL_GPU;
_gpuFeatureMAP["GPU_ECC_ERR"] = GPU_ECC_ERR;
_groupName = "group"; _groupName = "group";
_baseName = "sensor"; _baseName = "sensor";
......
...@@ -50,6 +50,7 @@ enum GPU_FEATURE { ...@@ -50,6 +50,7 @@ enum GPU_FEATURE {
GPU_CLK_MEM = 9, GPU_CLK_MEM = 9,
GPU_UTL_MEM = 10, GPU_UTL_MEM = 10,
GPU_UTL_GPU = 11, GPU_UTL_GPU = 11,
GPU_ECC_ERR = 13,
}; };
/** /**
...@@ -141,6 +142,9 @@ class nvmlSensorBase : public SensorBase { ...@@ -141,6 +142,9 @@ class nvmlSensorBase : public SensorBase {
case GPU_UTL_GPU: case GPU_UTL_GPU:
feature = "GPU_UTL_GPU"; feature = "GPU_UTL_GPU";
break; break;
case GPU_ECC_ERR:
feature = "GPU_ECC_ERR";
break;
} }
LOG_VAR(ll) << leading << " Feature type: " << feature; LOG_VAR(ll) << leading << " Feature type: " << feature;
} }
......
...@@ -51,7 +51,7 @@ struct counters_t { ...@@ -51,7 +51,7 @@ struct counters_t {
unsigned int clockspeed_mem; unsigned int clockspeed_mem;
nvmlMemory_t memory; nvmlMemory_t memory;
unsigned int power; unsigned int power;
// unsigned long long ecc_counts; unsigned long long ecc_counts;
nvmlUtilization_t utilization; nvmlUtilization_t utilization;
} counters; } counters;
...@@ -186,6 +186,10 @@ void nvmlSensorGroup::read() { ...@@ -186,6 +186,10 @@ void nvmlSensorGroup::read() {
err = nvmlDeviceGetUtilizationRates (env.device, &(counters.utilization)); err = nvmlDeviceGetUtilizationRates (env.device, &(counters.utilization));
reading.value = counters.utilization.gpu; reading.value = counters.utilization.gpu;
break; break;
case(GPU_ECC_ERR):
err = nvmlDeviceGetTotalEccErrors (env.device, NVML_MEMORY_ERROR_TYPE_CORRECTED,NVML_VOLATILE_ECC,&(counters.ecc_counts));
reading.value = counters.ecc_counts;
break;
} }
s->storeReading(reading); s->storeReading(reading);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment