11.08., 9:00 - 11:00: Due to updates GitLab will be unavailable for some minutes between 09:00 and 11:00.

Commit c95b8c6d authored by Michael Ott's avatar Michael Ott

Merge branch 'development'

parents 495c126f c90a5d90
......@@ -177,10 +177,12 @@ void ClusteringOperator::compute(U_Ptr unit) {
void ClusteringOperator::computeFeatureVector(U_Ptr unit, uint64_t offset) {
_currentfVector = cv::Mat(1, unit->getInputs().size(), CV_32F);
std::vector<ClusteringSBPtr>& inputs = unit->getInputs();
uint64_t endTs = getTimestamp() - offset;
uint64_t startTs = endTs - _aggregationWindow;
for(size_t idx=0; idx<inputs.size(); idx++) {
_mean=0;
_buffer.clear();
if(!_queryEngine.querySensor(inputs[idx]->getName(), _aggregationWindow - offset, offset, _buffer) || _buffer.empty())
if(!_queryEngine.querySensor(inputs[idx]->getName(), startTs, endTs, _buffer, false) || _buffer.empty())
throw std::runtime_error("Operator " + _name + ": cannot read from sensor " + inputs[idx]->getName() + "!");
// Computing MEAN
......
......@@ -51,7 +51,7 @@ PerSystSqlOperator::PerSystSqlOperator(const std::string& name) :
OperatorTemplate(name), JobOperatorTemplate(name), _number_of_even_quantiles(
0), _severity_formula(NOFORMULA), _severity_threshold(0), _severity_exponent(
0), _severity_max_memory(0), _go_back_ns(0), _backend(DEFAULT), _scaling_factor(
1), _property_id(0), _searchedOnceForMetaData(false) {
1), _searchedOnceForMetaData(false), _property_id(0) {
_persystdb = MariaDB::getInstance();
}
......
......@@ -187,11 +187,13 @@ void RegressorOperator::computeFeatureVector(U_Ptr unit) {
_currentfVector = new cv::Mat(1, unit->getInputs().size()*REG_NUMFEATURES, CV_32F);
int64_t val;
size_t qId, qMod, idx, fIdx;
uint64_t endTs = getTimestamp();
uint64_t startTs = endTs - _aggregationWindow;
std::vector<RegressorSBPtr>& inputs = unit->getInputs();
for(idx=0; idx<inputs.size(); idx++) {
_mean=0; _std=0; _diffsum=0; _qtl25=0; _qtl75=0;
_buffer.clear();
if(!_queryEngine.querySensor(inputs[idx]->getName(), _aggregationWindow, 0, _buffer) || _buffer.empty())
if(!_queryEngine.querySensor(inputs[idx]->getName(), startTs, endTs, _buffer, false) || _buffer.empty())
throw std::runtime_error("Operator " + _name + ": cannot read from sensor " + inputs[idx]->getName() + "!");
if (inputs[idx]->getTrainingTarget())
_currentTarget = (float)_buffer.back().value;
......
......@@ -47,10 +47,11 @@ void SmoothingOperator::compute(U_Ptr unit) {
// Clearing the buffer
_buffer.clear();
SmoothingSBPtr sIn=unit->getInputs()[0], sOut=unit->getOutputs()[0];
uint64_t startTs = sOut->getTimestamp() ? getTimestamp()-sOut->getTimestamp() : 0;
uint64_t endTs = getTimestamp();
uint64_t startTs = sOut->getTimestamp() ? sOut->getTimestamp() : endTs;
// Throwing an error does not make sense here - the query will often fail depending on insert batching
if(!_queryEngine.querySensor(sIn->getName(), startTs, 0, _buffer, true))
if(!_queryEngine.querySensor(sIn->getName(), startTs, endTs, _buffer, false))
return;
for(const auto& v : _buffer) {
......
......@@ -117,3 +117,7 @@ libdcdbplugin_msr.$(LIBEXT): sensors/msr/MSRSensorGroup.o sensors/msr/MSRConfigu
libdcdbplugin_caliper.$(LIBEXT): sensors/caliper/CaliperSensorGroup.o sensors/caliper/CaliperConfigurator.o
$(CXX) $(LIBFLAGS)$@ -o $@ $^ -L$(DCDBDEPLOYPATH)/lib/ -lboost_log -lboost_system
# nvcc compiler and extra flags to enable the linking and creating of .so file to work:
libdcdbplugin_nvml.$(LIBEXT): sensors/nvml/nvmlSensorGroup.o sensors/nvml/nvmlConfigurator.o
$(NVCC) -shared --compiler-options '-fPIC' -o $@ $^ -L$(DCDBDEPLOYPATH)/lib/ -lboost_log -lboost_system -lnvidia-ml
global {
mqttPrefix /test
}
template_group def1 {
;define template groups by appending "template_"
interval 1000
minValues 3
mqttpart /nvml
}
group nvml_g1 {
interval 1000
; mqttpart /nvml
default def1
sensor gpu_Energy {
mqttsuffix /energy
feature GPU_ENERGY
}
sensor gpu_Power {
mqttsuffix /power
feature GPU_POWER
}
sensor gpu_Temp {
mqttsuffix /temp
feature GPU_TEMP
}
sensor gpu_Fan {
mqttsuffix /fan
feature GPU_FAN
}
}
group nvml_mem {
default def1
sensor gpu_Mem_tot {
mqttsuffix /memory_tot
feature GPU_MEM_TOT
}
sensor gpu_Mem_free {
mqttsuffix /memory_free
feature GPU_MEM_FREE
}
sensor gpu_Mem_used {
mqttsuffix /memory_used
feature GPU_MEM_USED
}
}
group nvml_clock {
default def1
sensor gpu_clk_gp {
mqttsuffix /clock_graphics
feature GPU_CLK_GP
}
sensor gpu_clk_sm {
mqttsuffix /clock_sm
feature GPU_CLK_SM
}
sensor gpu_clk_mem {
mqttsuffix /clock_mem
feature GPU_CLK_MEM
}
}
group nvml_utilisation {
default def1
sensor gpu_utl_memory {
mqttsuffix /util_mem
feature GPU_UTL_MEM
}
sensor gpu_utl_gpu {
mqttsuffix /util_gpu
feature GPU_UTL_GPU
}
}
group nvml_g2 {
default def1
sensor gpu_ecc_errors {
mqttsuffix /ecc_errors
feature GPU_ECC_ERR
}
sensor gpu_pcie_thru {
mqttsuffix /pcie_thru
feature GPU_PCIE_THRU
}
sensor gpu_run_prcs {
mqttsuffix /run_prcs
feature GPU_RUN_PRCS
}
}
......@@ -49,7 +49,6 @@ class ConfiguratorTemplateEntity : public ConfiguratorTemplate<SBase, SGroup> {
//mention all required parent attributes and functions here to avoid compiler errors
using ConfiguratorInterface::_cfgPath;
using ConfiguratorInterface::_mqttPrefix;
using ConfiguratorInterface::lg;
using ConfiguratorInterface::readGlobal;
using ConfiguratorTemplate<SBase, SGroup>::_baseName;
using ConfiguratorTemplate<SBase, SGroup>::_groupName;
......@@ -491,6 +490,7 @@ class ConfiguratorTemplateEntity : public ConfiguratorTemplate<SBase, SGroup> {
std::string _entityName;
LOGGER lg;
std::vector<SEntity *> _sensorEntitys;
sEntityMap_t _templateSensorEntitys;
};
......
......@@ -56,7 +56,6 @@ class SensorGroupTemplateEntity : public SensorGroupTemplate<S> {
using SensorGroupInterface::_keepRunning;
using SensorGroupInterface::_pendingTasks;
using SensorGroupInterface::_timer;
using SensorGroupInterface::lg;
using SensorGroupInterface::nextReadingTime;
using SensorGroupTemplate<S>::_sensors;
......@@ -159,6 +158,7 @@ class SensorGroupTemplateEntity : public SensorGroupTemplate<S> {
_pendingTasks--;
}
LOGGER lg;
E *_entity; ///< Entity this group is associated to
};
......
......@@ -173,7 +173,7 @@ class IPMISensorBase : public SensorBase {
}
}
void printConfig(LOG_LEVEL ll, LOGGER &lg, unsigned leadingSpaces = 16) {
void printConfig(LOG_LEVEL ll, LOGGER &lg, unsigned leadingSpaces = 16) override {
std::string leading(leadingSpaces, ' ');
LOG_VAR(ll) << leading << " Type: " << getTypeString();
switch (_type) {
......
......@@ -124,7 +124,7 @@ void IPMISensorGroup::read() {
std::vector<reading_t> readings;
LenovoXCC xcc(_entity);
if (_entity->getXCC()->getDatastorePower(readings) == 0) {
for (int i=0; i<readings.size(); i++) {
for (unsigned int i=0; i<readings.size(); i++) {
s->storeReading(readings[i], s->getFactor());
}
reading = readings.back();
......
This DCDB plugin uses the NVML library to capture the following GPU metrics:
* Power - sensor /test/nvml/power
Uses the nvmlDeviceGetPowerUsage function to retrieve the power usage, in
milliwatts, of this GPU and its associated circuitry (e.g. memory).
* Temperature - sensor /test/nvml/temp
Uses the nvmlDeviceGetTemperature function to retrieve the current
temperature readings for the device, in degrees C.
* Energy - sensor /test/nvml/energy
Uses the nvmlDeviceGetTotalEnergyConsumption function to retrieve total
energy consumption for this GPU in millijoules (mJ) since the driver was
last reloaded.
* Running Compute Processes - sensor /test/nvml/run_prcs
Set up to use the nvmlDeviceGetComputeRunningProcesses function to get
the number of running processes with a compute context (e.g. CUDA
applications which have an active context) on the device.
* ECC errors - sensor /test/nvml/ecc_errors
Set up to use the nvmlDeviceGetTotalEccErrors function to retrieve the
NVML_MEMORY_ERROR_TYPE_CORRECTED type errors (a memory error that was
corrected: for ECC errors, these are single bit errors; for texture memory,
these are errors fixed by resend) for the NVML_VOLATILE_ECC counter
(volatile counts are reset each time the driver loads).
Requires ECC Mode to be enabled.
* Graphics Clock speed - sensor /test/nvml/clock_graphics
Set up to use the nvmlDeviceGetClock function to retrieve the clock speed
(current actual clock value) for the graphics clock domain in MHz.
* SM Clock speed - sensor /test/nvml/clock_sm
Set up to use the nvmlDeviceGetClock function to retrieve the clock speed
(current actual clock value) for the SM clock domain in MHz.
* Memory Clock speed - sensor /test/nvml/clock_mem
Set up to use the nvmlDeviceGetClock function to retrieve the clock speed
(current actual clock value) for the memory clock domain in MHz.
* Total memory - sensor /test/nvml/memory_tot
Set up to use the nvmlDeviceGetMemoryInfo function to retrieve the amount
of total memory available on the device, in bytes.
* Free memory - sensor /test/nvml/memory_free
Set up to use the nvmlDeviceGetMemoryInfo function to retrieve the amount
of free memory available on the device, in bytes.
* Used memory - sensor /test/nvml/memory_used
Set up to use the nvmlDeviceGetMemoryInfo function to retrieve the amount
of used memory on the device, in bytes.
* Memory utilisation rate - sensor /test/nvml/util_mem
Set up to use the nvmlDeviceGetUtilizationRates function to retrieve the
current utilization rates for the memory subsystem. It's reported as a
percent of time over the past sample period during which global (device)
memory was being read or written.
* GPU utilisation - sensor /test/nvml/util_gpu
Set up to use the nvmlDeviceGetUtilizationRates function to retrieve the
current utilization rates for the GPU. It's reported as a percent of time
over the past sample period during which one or more kernels were executing
on the GPU.
* PCIe throughput - sensor /test/nvml/pcie_thru
Set up to use the nvmlDeviceGetPcieThroughput function to retrieve the PCIe
utilization information. This function queries a byte counter over a
20ms interval, so the result is the PCIe throughput (NVML_PCIE_UTIL_COUNT) over
that interval. The throughput is returned in KB/s.
Other possible counters are: NVML_PCIE_UTIL_TX_BYTES (transmitted bytes)
and NVML_PCIE_UTIL_RX_BYTES (received bytes).
* Fan - sensor /test/nvml/fan
Set up to use the nvmlDeviceGetFanSpeed function to retrieve the intended
operating speed of the device's fan. The fan speed is expressed as a percent
of the maximum, i.e. full speed is 100%.
In config.mk (inside the dcdb directory), append nvml to the PLUGINS variable:
#Other plugins to be built
PLUGINS = sysfs perfevent ipmi pdu bacnet snmp procfs tester gpfsmon opa msr nvml
Then append the following at the end of the Makefile (inside the dcdbpusher directory):
# nvcc compiler and extra flags to enable the linking and creating of .so file to work:
libdcdbplugin_nvml.$(LIBEXT): sensors/nvml/nvmlSensorGroup.o sensors/nvml/nvmlConfigurator.o
$(NVCC) -shared --compiler-options '-fPIC' -o $@ $^ -L$(DCDBDEPLOYPATH)/lib/ -lboost_log -lboost_system -lnvidia-ml
global {
; mqttPrefix /nvml
mqttPrefix /test
}
template_group def1 {
;define template groups by appending "template_"
interval 1000
minValues 3
mqttpart /nvml
}
group nvml_g1 {
interval 1000
; mqttpart /nvml
default def1
sensor gpu_Energy {
mqttsuffix /energy
feature GPU_ENERGY
}
sensor gpu_Power {
mqttsuffix /power
feature GPU_POWER
}
sensor gpu_Temp {
mqttsuffix /temp
feature GPU_TEMP
}
sensor gpu_Fan {
mqttsuffix /fan
feature GPU_FAN
}
}
group nvml_mem {
default def1
sensor gpu_Mem_tot {
mqttsuffix /memory_tot
feature GPU_MEM_TOT
}
sensor gpu_Mem_free {
mqttsuffix /memory_free
feature GPU_MEM_FREE
}
sensor gpu_Mem_used {
mqttsuffix /memory_used
feature GPU_MEM_USED
}
}
group nvml_clock {
default def1
sensor gpu_clk_gp {
mqttsuffix /clock_graphics
feature GPU_CLK_GP
}
sensor gpu_clk_sm {
mqttsuffix /clock_sm
feature GPU_CLK_SM
}
sensor gpu_clk_mem {
mqttsuffix /clock_mem
feature GPU_CLK_MEM
}
}
group nvml_utilisation {
default def1
sensor gpu_utl_memory {
mqttsuffix /util_mem
feature GPU_UTL_MEM
}
sensor gpu_utl_gpu {
mqttsuffix /util_gpu
feature GPU_UTL_GPU
}
}
group nvml_g2 {
default def1
sensor gpu_ecc_errors {
mqttsuffix /ecc_errors
feature GPU_ECC_ERR
}
sensor gpu_pcie_thru {
mqttsuffix /pcie_thru
feature GPU_PCIE_THRU
}
sensor gpu_run_prcs {
mqttsuffix /run_prcs
feature GPU_RUN_PRCS
}
}
//================================================================================
// Name : nvmlConfigurator.cpp
// Author : Weronika Filinger, EPCC @ The University of Edinburgh
// Contact :
// Copyright :
// Description : Source file for nvml plugin configurator class.
//================================================================================
//================================================================================
// This file is part of DCDB (DataCenter DataBase)
// Copyright (C) 2019 Leibniz Supercomputing Centre
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
//================================================================================
#include "nvmlConfigurator.h"
/**
 * Registers every keyword accepted by the "feature" entry of a sensor block
 * and sets the block names ("group" / "sensor") used by this plugin's config.
 */
nvmlConfigurator::nvmlConfigurator() {
    // Keyword -> feature id table; each keyword is spelled like its enumerator.
    static const struct {
        const char*  keyword;
        unsigned int feature;
    } knownFeatures[] = {
        {"GPU_ENERGY",    GPU_ENERGY},
        {"GPU_POWER",     GPU_POWER},
        {"GPU_TEMP",      GPU_TEMP},
        {"GPU_FAN",       GPU_FAN},
        {"GPU_MEM_USED",  GPU_MEM_USED},
        {"GPU_MEM_FREE",  GPU_MEM_FREE},
        {"GPU_MEM_TOT",   GPU_MEM_TOT},
        {"GPU_CLK_GP",    GPU_CLK_GP},
        {"GPU_CLK_SM",    GPU_CLK_SM},
        {"GPU_CLK_MEM",   GPU_CLK_MEM},
        {"GPU_UTL_MEM",   GPU_UTL_MEM},
        {"GPU_UTL_GPU",   GPU_UTL_GPU},
        {"GPU_ECC_ERR",   GPU_ECC_ERR},
        {"GPU_PCIE_THRU", GPU_PCIE_THRU},
        {"GPU_RUN_PRCS",  GPU_RUN_PRCS},
    };

    for (const auto& entry : knownFeatures) {
        _gpuFeatureMAP[entry.keyword] = entry.feature;
    }

    _groupName = "group";
    _baseName = "sensor";
}
/// Nothing to release explicitly; the members clean up after themselves.
nvmlConfigurator::~nvmlConfigurator() = default;
/**
 * Reads the plugin-specific attributes of a sensor block.
 *
 * Only the "feature" key is handled (the key itself is matched
 * case-insensitively). Its value is looked up as-is in _gpuFeatureMAP; a value
 * that is not found is reported with a warning and the sensor's feature type
 * is left unchanged.
 *
 * @param s       Sensor being configured.
 * @param config  Config entries of that sensor's block.
 */
void nvmlConfigurator::sensorBase(nvmlSensorBase& s, CFG_VAL config) {
    for (boost::property_tree::iptree::value_type& entry : config) {
        if (!boost::iequals(entry.first, "feature")) {
            continue;  // not an attribute this plugin handles
        }

        const auto match = _gpuFeatureMAP.find(entry.second.data());
        if (match == _gpuFeatureMAP.end()) {
            LOG(warning) << " feature \"" << entry.second.data() << "\" not known.";
            continue;
        }

        s.setFeatureType(match->second);
    }
}
/**
 * Group-level configuration hook. This plugin reads nothing from the group
 * block here, so both arguments are unused and the body is empty.
 *
 * @param s       Sensor group being configured (unused).
 * @param config  Config entries of the group block (unused).
 */
void nvmlConfigurator::sensorGroup(nvmlSensorGroup& s, CFG_VAL config) {}
/**
 * Logs the configurator-level settings.
 *
 * The only line written is a fixed "NumSpacesAsIndention" entry with value 2.
 *
 * @param ll  Log level the line is written with.
 */
void nvmlConfigurator::printConfiguratorConfig(LOG_LEVEL ll) {
    const int indentionSpaces = 2;
    LOG_VAR(ll) << " NumSpacesAsIndention: " << indentionSpaces;
}
//================================================================================
// Name : nvmlConfigurator.h
// Author : Weronika Filinger, EPCC @ The University of Edinburgh
// Contact :
// Copyright :
// Description : Header file for nvml plugin configurator class.
//================================================================================
//================================================================================
// This file is part of DCDB (DataCenter DataBase)
// Copyright (C) 2019 Leibniz Supercomputing Centre
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
//================================================================================
#ifndef NVML_NVMLCONFIGURATOR_H_
#define NVML_NVMLCONFIGURATOR_H_
#include "../../includes/ConfiguratorTemplate.h"
#include "nvmlSensorGroup.h"
/**
 * @brief ConfiguratorTemplate specialization for the nvml plugin.
 *
 * Holds the table that translates the value of a sensor's "feature" config
 * entry into a numeric feature id.
 *
 * @ingroup nvml
 */
class nvmlConfigurator : public ConfiguratorTemplate<nvmlSensorBase, nvmlSensorGroup> {
private:
    /// Feature keyword (as written in the config file) -> numeric feature id.
    using gpuFeatureMap_t = std::map<std::string, unsigned int>;

    gpuFeatureMap_t _gpuFeatureMAP;  ///< Filled once in the constructor

public:
    nvmlConfigurator();
    virtual ~nvmlConfigurator();

protected:
    /* Overridden from ConfiguratorTemplate */
    void sensorBase(nvmlSensorBase& s, CFG_VAL config) override;
    void sensorGroup(nvmlSensorGroup& s, CFG_VAL config) override;
    void printConfiguratorConfig(LOG_LEVEL ll) final override;
};
/**
 * Factory function with C linkage.
 *
 * @return A newly allocated nvmlConfigurator, returned through its
 *         ConfiguratorInterface base pointer.
 */
extern "C" ConfiguratorInterface* create() {
    ConfiguratorInterface* configurator = new nvmlConfigurator();
    return configurator;
}
/**
 * Counterpart of create() with C linkage: deletes the given configurator.
 *
 * @param c  Configurator to delete.
 */
extern "C" void destroy(ConfiguratorInterface* c) {
    delete c;
}
#endif /* NVML_NVMLCONFIGURATOR_H_ */
//================================================================================
// Name : nvmlSensorBase.h
// Author : Weronika Filinger, EPCC @ The University of Edinburgh
// Contact :
// Copyright :
// Description : Sensor base class for nvml plugin.
//================================================================================
//================================================================================
// This file is part of DCDB (DataCenter DataBase)
// Copyright (C) 2019 Leibniz Supercomputing Centre
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
//================================================================================
/**
* @defgroup nvml nvml plugin
* @ingroup pusherplugins
*
* Collect data from the nvml interface
*/
#ifndef NVML_NVMLSENSORBASE_H_
#define NVML_NVMLSENSORBASE_H_
#include "sensorbase.h"
/**
 * GPU metrics a sensor of this plugin can be bound to.
 *
 * The numeric values are assigned explicitly; note that 12 is not used.
 */
enum GPU_FEATURE {
    GPU_ENERGY    = 0,   ///< Energy
    GPU_POWER     = 1,   ///< Power
    GPU_TEMP      = 2,   ///< Temperature
    GPU_FAN       = 3,   ///< Fan
    GPU_MEM_USED  = 4,   ///< Used memory
    GPU_MEM_TOT   = 5,   ///< Total memory
    GPU_MEM_FREE  = 6,   ///< Free memory
    GPU_CLK_GP    = 7,   ///< Graphics clock
    GPU_CLK_SM    = 8,   ///< SM clock
    GPU_CLK_MEM   = 9,   ///< Memory clock
    GPU_UTL_MEM   = 10,  ///< Memory utilisation
    GPU_UTL_GPU   = 11,  ///< GPU utilisation
    // 12 is intentionally left as in the original numbering (unassigned)
    GPU_ECC_ERR   = 13,  ///< ECC errors
    GPU_PCIE_THRU = 14,  ///< PCIe throughput
    GPU_RUN_PRCS  = 15   ///< Running compute processes
};
/**
* @brief Sensor base class for the nvml plugin.
*
*
* @ingroup nvml
*/
class nvmlSensorBase : public SensorBase {
public:
nvmlSensorBase(const std::string& name) :
SensorBase(name), _featureType(static_cast<GPU_FEATURE>(999)) {
}
nvmlSensorBase(const nvmlSensorBase &other)
: SensorBase(other),
_featureType(other._featureType) {}
virtual ~nvmlSensorBase() {}
int getFeatureType() const {
return _featureType;
}
void setFeatureType(int featureType){
_featureType = static_cast<GPU_FEATURE>(featureType);
}
nvmlSensorBase& operator=(const nvmlSensorBase& other) {
SensorBase::operator=(other);
_featureType = other._featureType;
return *this;
}
void printConfig(LOG_LEVEL ll, LOGGER& lg, unsigned leadingSpaces=16) {
std::string leading(leadingSpaces, ' ');
std::string feature("unknown");
switch (_featureType) {
case GPU_ENERGY:
feature = "GPU_ENERGY";
break;
case GPU_POWER:
feature = "GPU_POWER";
break;
case GPU_TEMP:
feature = "GPU_TEMP";
break;