Commit c8671269 authored by Alessio Netti's avatar Alessio Netti

Analytics: Health Checker plugin

- Allows setting a variety of conditions on sensor data and raising
alarms when they are triggered
- Arbitrary scripts can be executed (e.g., to send emails) when
alarms happen
parent 7b36af9b
......@@ -77,3 +77,6 @@ libdcdboperator_persystsql.$(LIBEXT): operators/persystsql/PerSystSqlOperator.o
libdcdboperator_coolingcontrol.$(LIBEXT): operators/coolingcontrol/CoolingControlOperator.o operators/coolingcontrol/CoolingControlConfigurator.o operators/coolingcontrol/SNMPController.o ../dcdbpusher/sensors/snmp/SNMPConnection.o ../common/src/sensornavigator.o
$(CXX) $(LIBFLAGS)$@ -o $@ $^ -L$(DCDBDEPLOYPATH)/lib/ -lboost_log -lboost_system -lboost_regex -lnetsnmp -lnetsnmpagent
libdcdboperator_healthchecker.$(LIBEXT): operators/healthchecker/HealthCheckerOperator.o operators/healthchecker/HealthCheckerConfigurator.o ../common/src/sensornavigator.o
$(CXX) $(LIBFLAGS)$@ -o $@ $^ -L$(DCDBDEPLOYPATH)/lib/ -lboost_log -lboost_system -lboost_regex
\ No newline at end of file
template_healthchecker def1 {
shell /bin/sh
log true
cooldown 3600000
}
healthchecker h1 {
default def1
command "/usr/dir/myscript.sh %s"
window 60000
input {
sensor "<bottomup, filter socket>temp" {
condition above
threshold 95000
}
sensor "<bottomup 1>power" {
condition exists
}
}
}
//================================================================================
// Name : HealthCheckerConfigurator.cpp
// Author : Alessio Netti
// Contact : info@dcdb.it
// Copyright : Leibniz Supercomputing Centre
// Description : Configurator for the health checker operator plugin.
//================================================================================
//================================================================================
// This file is part of DCDB (DataCenter DataBase)
// Copyright (C) 2019-2019 Leibniz Supercomputing Centre
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
//================================================================================
#include "HealthCheckerConfigurator.h"

#include <cstdint>
#include <exception>
#include <string>
/**
 * @brief Builds the configurator and sets the names used for this plugin:
 *        "healthchecker" as operator name and "sensor" as base name.
 */
HealthCheckerConfigurator::HealthCheckerConfigurator() : OperatorConfiguratorTemplate() {
    _baseName = "sensor";
    _operatorName = "healthchecker";
}
// Nothing to release explicitly: the destructor has no work of its own.
HealthCheckerConfigurator::~HealthCheckerConfigurator() = default;
/**
 * @brief Reads the health-checker specific attributes of a sensor block.
 *
 * Recognized keys (case-insensitive):
 *  - "threshold": signed integer value the sensor readings are compared against.
 *  - "condition": alarm condition name, converted through stringToCond().
 *
 * Invalid values are reported in the log and do not abort configuration:
 * a malformed threshold leaves the previously set threshold untouched, while
 * an unknown condition is stored as HC_INVALID.
 *
 * @param s       Sensor object to be configured.
 * @param config  Property tree node holding the sensor's attributes.
 */
void HealthCheckerConfigurator::sensorBase(HealthCheckerSensorBase& s, CFG_VAL config) {
    BOOST_FOREACH(boost::property_tree::iptree::value_type &val, config)
    {
        if (boost::iequals(val.first, "threshold")) {
            // The threshold is stored as a signed 64-bit value, so it must be parsed
            // as signed; std::stoll throws on malformed or out-of-range input.
            try {
                s.setThreshold(std::stoll(val.second.data()));
            } catch (const std::exception&) {
                LOG(error) << " " << _operatorName << ": Invalid threshold \"" << val.second.data() << "\" specified!";
            }
        } else if (boost::iequals(val.first, "condition")) {
            HealthCheckerSensorBase::HCCond c = s.stringToCond(val.second.data());
            s.setCondition(c);
            if (c == HealthCheckerSensorBase::HC_INVALID) {
                LOG(error) << " " << _operatorName << ": Invalid alarm condition specified!";
            }
        }
    }
}
/**
 * @brief Reads the operator-level attributes of a health checker block.
 *
 * Recognized keys (case-insensitive):
 *  - "cooldown": interval that must elapse before a sensor may raise a new alarm.
 *  - "window":   length of the time window queried for each sensor.
 *  - "log":      whether alarms are written to the log.
 *  - "command":  command to run when an alarm is raised.
 *
 * Cooldown and window values are multiplied by 1000000 before being stored
 * (presumably a millisecond-to-nanosecond conversion - confirm against the
 * plugin documentation). Malformed, negative or overflowing values are
 * reported in the log and leave the current setting untouched.
 *
 * @param op      Operator object to be configured.
 * @param config  Property tree node holding the operator's attributes.
 */
void HealthCheckerConfigurator::operatorAttributes(HealthCheckerOperator& op, CFG_VAL config) {
    // Parses a non-negative integer and scales it by 1000000, guarding against
    // malformed input, negative numbers (which std::stoull would silently wrap)
    // and overflow of the scaled result. Returns false if the value is unusable.
    auto parseScaled = [](const std::string& text, uint64_t& scaled) -> bool {
        const uint64_t factor = 1000000;
        if (text.find('-') != std::string::npos) {
            return false;
        }
        try {
            const unsigned long long raw = std::stoull(text);
            if (raw > UINT64_MAX / factor) {
                return false;
            }
            scaled = static_cast<uint64_t>(raw) * factor;
            return true;
        } catch (const std::exception&) {
            return false;
        }
    };

    BOOST_FOREACH(boost::property_tree::iptree::value_type &val, config)
    {
        if (boost::iequals(val.first, "cooldown")) {
            uint64_t scaled = 0;
            if (parseScaled(val.second.data(), scaled)) {
                op.setCooldown(scaled);
            } else {
                LOG(error) << " " << _operatorName << ": Invalid cooldown value specified!";
            }
        } else if (boost::iequals(val.first, "window")) {
            uint64_t scaled = 0;
            if (parseScaled(val.second.data(), scaled)) {
                op.setWindow(scaled);
            } else {
                LOG(error) << " " << _operatorName << ": Invalid window value specified!";
            }
        } else if (boost::iequals(val.first, "log")) {
            op.setLog(to_bool(val.second.data()));
        } else if (boost::iequals(val.first, "command")) {
            // setCommand() stores an empty command if validation fails; the
            // explicit check below only serves to report the problem.
            op.setCommand(val.second.data());
            if(!op.isCommandValid(val.second.data())) {
                LOG(error) << " " << _operatorName << ": Invalid command specified!";
            }
        } /*else if (boost::iequals(val.first, "shell")) {
            op.setShell(val.second.data());
        } */
    }
}
/**
 * @brief Validates a unit generated for a health checker operator.
 *
 * A unit is rejected (with an error in the log) if it is a top unit or if it
 * defines any output sensor.
 *
 * @param u  Unit to be checked.
 * @return   True if the unit is acceptable, false otherwise.
 */
bool HealthCheckerConfigurator::unit(UnitTemplate<HealthCheckerSensorBase>& u) {
    bool accepted = true;
    if (u.isTopUnit()) {
        LOG(error) << " " << _operatorName << ": This operator type only supports flat units!";
        accepted = false;
    } else if (!u.getOutputs().empty()) {
        LOG(error) << " " << _operatorName << ": This is an health checker, no output sensors can be defined!";
        accepted = false;
    }
    return accepted;
}
//================================================================================
// Name : HealthCheckerConfigurator.h
// Author : Alessio Netti
// Contact : info@dcdb.it
// Copyright : Leibniz Supercomputing Centre
// Description : Configurator class declaration for the health checker operator plugin.
//================================================================================
//================================================================================
// This file is part of DCDB (DataCenter DataBase)
// Copyright (C) 2019-2019 Leibniz Supercomputing Centre
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
//================================================================================
#ifndef PROJECT_HEALTHCHECKERCONFIGURATOR_H
#define PROJECT_HEALTHCHECKERCONFIGURATOR_H
#include "../../includes/OperatorConfiguratorTemplate.h"
#include "HealthCheckerOperator.h"
/**
* @brief Configurator for the health checker plugin.
*
* @ingroup healthchecker
*/
class HealthCheckerConfigurator : virtual public OperatorConfiguratorTemplate<HealthCheckerOperator, HealthCheckerSensorBase> {

public:
    // Sets the operator and base names used by this plugin
    HealthCheckerConfigurator();
    virtual ~HealthCheckerConfigurator();

private:
    // Checks that a unit is not a top unit and defines no output sensors
    bool unit(UnitTemplate<HealthCheckerSensorBase>& u) override;
    // Reads the operator-level attributes (cooldown, window, log, command)
    void operatorAttributes(HealthCheckerOperator& op, CFG_VAL config) override;
    // Reads the per-sensor attributes (threshold, condition)
    void sensorBase(HealthCheckerSensorBase& s, CFG_VAL config) override;
};
// C-linkage factory: allocates a new health checker configurator and hands it
// back through the generic configurator interface. The returned object is
// expected to be released with destroy().
extern "C" OperatorConfiguratorInterface* create() {
    OperatorConfiguratorInterface* configurator = new HealthCheckerConfigurator();
    return configurator;
}
// C-linkage counterpart of create(): deletes a configurator through its
// interface pointer.
extern "C" void destroy(OperatorConfiguratorInterface* c) {
delete c;
}
#endif //PROJECT_HEALTHCHECKERCONFIGURATOR_H
//================================================================================
// Name : HealthCheckerOperator.cpp
// Author : Alessio Netti
// Contact : info@dcdb.it
// Copyright : Leibniz Supercomputing Centre
// Description : Health checker operator; raises alarms when sensor conditions are met.
//================================================================================
//================================================================================
// This file is part of DCDB (DataCenter DataBase)
// Copyright (C) 2019-2019 Leibniz Supercomputing Centre
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
//================================================================================
#include "HealthCheckerOperator.h"
#include <sys/wait.h>
#include <sys/types.h>
#include <time.h>
#include <signal.h>
/**
 * @brief Creates an operator with default settings: "/bin/sh" as shell, no
 *        command, zero cooldown and window, and logging enabled.
 *
 * @param name  Name of the operator.
 */
HealthCheckerOperator::HealthCheckerOperator(const std::string& name)
    : OperatorTemplate(name),
      _shell("/bin/sh"),
      _command(""),
      _cooldown(0),
      _window(0),
      _log(true) {
}
/**
 * @brief Copy constructor: duplicates all health checker settings of another operator.
 *
 * @param other  Operator to copy from.
 */
HealthCheckerOperator::HealthCheckerOperator(const HealthCheckerOperator &other)
    : OperatorTemplate(other),
      _shell(other._shell),
      _command(other._command),
      _cooldown(other._cooldown),
      _window(other._window),
      _log(other._log) {
}
// The operator holds no resource that needs explicit cleanup.
HealthCheckerOperator::~HealthCheckerOperator() = default;
/**
 * @brief Prints the health checker settings, followed by the configuration
 *        printed by the operator template.
 *
 * @param ll  Log level at which the configuration is printed.
 */
void HealthCheckerOperator::printConfig(LOG_LEVEL ll) {
    const char* const logState = _log ? "enabled" : "disabled";
    //LOG_VAR(ll) << " Shell: " << _shell;
    LOG_VAR(ll) << " Command: " << _command;
    LOG_VAR(ll) << " Cooldown: " << _cooldown;
    LOG_VAR(ll) << " Window: " << _window;
    LOG_VAR(ll) << " Log: " << logState;
    OperatorTemplate<HealthCheckerSensorBase>::printConfig(ll);
}
/**
 * @brief Checks every input sensor of a unit against its alarm condition.
 *
 * For each input sensor the readings in the interval [now - window, now] are
 * queried. An alarm line is produced if:
 *  - the condition is HC_EXISTS and no reading was returned, or
 *  - the condition is HC_ABOVE / HC_BELOW / HC_EQUAL and at least one reading
 *    is respectively greater than / smaller than / equal to the threshold.
 * A sensor only contributes to the alarm if more than the cooldown interval
 * has elapsed since the last time it did.
 *
 * If at least one alarm was raised, the configured command (if any) is run
 * through "/bin/sh -c", with the message marker replaced by the double-quoted
 * alarm message, and the message is optionally written to the log. The
 * command is given 60 seconds to complete, after which it is killed.
 *
 * @param unit  Unit whose input sensors are to be checked.
 */
void HealthCheckerOperator::compute(U_Ptr unit) {
    std::string msg = "The following alarm conditions were detected by the DCDB Health Checker plugin:\n\n";
    bool alrm = false;
    const uint64_t endTs = getTimestamp();
    const uint64_t startTs = endTs - _window;
    vector<reading_t> buffer;

    for (const auto& in : unit->getInputs()) {
        buffer.clear();
        const HealthCheckerSensorBase::HCCond cond = in->getCondition();
        const int64_t threshold = in->getThreshold();
        _queryEngine.querySensor(in->getName(), startTs, endTs, buffer, false);
        std::string tempMsg;

        if (cond == HealthCheckerSensorBase::HC_EXISTS) {
            // Checking the existence condition
            if (buffer.empty()) {
                tempMsg = " - Sensor " + in->getName() + " is not providing any data.\n";
            }
        } else {
            // Checking the remaining value conditions; the first offending reading is reported
            for (const auto& v : buffer) {
                if (v.value > threshold && cond == HealthCheckerSensorBase::HC_ABOVE) {
                    tempMsg = " - Sensor " + in->getName() + " has a reading " + std::to_string(v.value) + " greater than threshold " + std::to_string(threshold) + ".\n";
                    break;
                } else if (v.value < threshold && cond == HealthCheckerSensorBase::HC_BELOW) {
                    tempMsg = " - Sensor " + in->getName() + " has a reading " + std::to_string(v.value) + " smaller than threshold " + std::to_string(threshold) + ".\n";
                    break;
                } else if (v.value == threshold && cond == HealthCheckerSensorBase::HC_EQUAL) {
                    tempMsg = " - Sensor " + in->getName() + " has a reading equal to threshold " + std::to_string(threshold) + ".\n";
                    break;
                }
            }
        }

        // The alarm is suppressed while the sensor is still within its cooldown period
        if (!tempMsg.empty() && endTs - in->getLast() > _cooldown) {
            alrm = true;
            in->setLast(endTs);
            msg += tempMsg;
        }
    }

    // Nothing else to do if no alarm was raised
    if (!alrm) {
        return;
    }

    if (!_command.empty()) {
        // The message is embedded in a double-quoted shell string: characters that
        // remain special inside double quotes are escaped so that sensor names
        // cannot alter the command being executed.
        auto shellEscape = [](const std::string& raw) {
            std::string out;
            out.reserve(raw.size());
            for (const char c : raw) {
                if (c == '"' || c == '\\' || c == '$' || c == '`') {
                    out += '\\';
                }
                out += c;
            }
            return out;
        };

        // The command line is built before forking and on a copy, so that the
        // configured command is never modified and the child only has to exec.
        std::string cmd = _command;
        const std::string::size_type markerPos = cmd.find(HC_MSG_MARKER);
        if (markerPos != std::string::npos) {
            cmd.replace(markerPos, HC_LEN_MARKER, "\"" + shellEscape(msg) + "\"");
        }

        const pid_t pid = fork();
        if (pid < 0) {
            // fork() failed: there is no child to wait for. Falling through to the
            // wait loop with pid == -1 would target arbitrary processes.
            LOG(error) << "Operator " << _name << ": could not spawn child process!";
        } else if (pid == 0) {
            // Child: replace the process image with the shell running the command
            execlp("/bin/sh", "sh", "-c", cmd.c_str(), static_cast<char *>(nullptr));
            // Only reached if exec failed. _exit() is used instead of exit() so that
            // the forked copy does not run the parent's exit handlers.
            _exit(127);
        } else {
            // Fixed 100ms sleep cycle for the wait call, with 60s timeout
            struct timespec req = {0, 100000000};
            const uint64_t timeout = S_TO_NS(60);
            const uint64_t waitStart = getTimestamp();
            int status = 0;
            pid_t reaped = 0;
            // Waiting until timeout for the process to complete
            while ((reaped = waitpid(pid, &status, WNOHANG)) == 0) {
                if (getTimestamp() - waitStart > timeout) {
                    LOG(error) << "Operator " << _name << ": child process with PID " << pid << " does not respond. Killing...";
                    kill(pid, SIGKILL);
                    // Reap the killed child so that it does not remain as a zombie
                    waitpid(pid, &status, 0);
                    break;
                }
                nanosleep(&req, nullptr);
            }
            // Exit status 127 is what the child returns when exec fails (and what
            // the shell returns when the command cannot be found)
            if (reaped == pid && WIFEXITED(status) && WEXITSTATUS(status) == 127) {
                LOG(error) << "Operator " << _name << ": command could not be executed (exit status 127)!";
            }
        }
    }

    // Logging to the standard DCDB log
    if (_log) {
        LOG(warning) << msg;
    }
}
//================================================================================
// Name : HealthCheckerOperator.h
// Author : Alessio Netti
// Contact : info@dcdb.it
// Copyright : Leibniz Supercomputing Centre
// Description : Declaration of the health checker operator class.
//================================================================================
//================================================================================
// This file is part of DCDB (DataCenter DataBase)
// Copyright (C) 2019-2019 Leibniz Supercomputing Centre
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
//================================================================================
#ifndef PROJECT_HEALTHCHECKEROPERATOR_H
#define PROJECT_HEALTHCHECKEROPERATOR_H
#include "../../includes/OperatorTemplate.h"
#include "HealthCheckerSensorBase.h"
// Marker in the configured command that gets replaced with the alarm message
#define HC_MSG_MARKER "%s"
// Number of characters of the command replaced at the marker position
#define HC_LEN_MARKER 2
/**
* @brief Health checker operator plugin.
*
* @ingroup healthchecker
*/
class HealthCheckerOperator : virtual public OperatorTemplate<HealthCheckerSensorBase> {

public:
    HealthCheckerOperator(const std::string& name);
    HealthCheckerOperator(const HealthCheckerOperator& other);
    virtual ~HealthCheckerOperator();

    // Setters. setCommand() only accepts commands passing isCommandValid();
    // any other command is replaced by the empty string.
    void setCooldown(uint64_t c)            { _cooldown = c; }
    void setWindow(uint64_t w)              { _window = w; }
    void setLog(bool l)                     { _log = l; }
    void setCommand(const std::string& c)   { _command = isCommandValid(c) ? c : std::string(); }
    void setShell(const std::string& s)     { _shell = s; }

    // Getters; const so that they can be used on const operators
    uint64_t    getCooldown() const { return _cooldown; }
    uint64_t    getWindow() const   { return _window; }
    std::string getCommand() const  { return _command; }
    std::string getShell() const    { return _shell; }
    bool        getLog() const      { return _log; }

    void printConfig(LOG_LEVEL ll) override;

    /**
     * @brief Tells whether a command string is acceptable.
     *
     * @param c  Command string to check.
     * @return   True if the command contains both the message marker and a space.
     */
    bool isCommandValid(const std::string& c) const {
        // Command must contain the marker to be replaced with the message, and a space (i.e., more than 1 argument)
        return c.find(HC_MSG_MARKER) != std::string::npos && c.find(' ') != std::string::npos;
    }

protected:
    virtual void compute(U_Ptr unit) override;

    // Shell set through setShell(); defaults to "/bin/sh"
    std::string _shell;
    // Command executed when an alarm is raised; empty if none or invalid
    std::string _command;
    // Interval that must elapse before a sensor can raise a new alarm
    uint64_t _cooldown;
    // Length of the time window queried for each sensor
    uint64_t _window;
    // Whether alarm messages are written to the log
    bool _log;
};
#endif //PROJECT_HEALTHCHECKEROPERATOR_H
//================================================================================
// Name : HealthCheckerSensorBase.h
// Author : Alessio Netti
// Contact : info@dcdb.it
// Copyright : Leibniz Supercomputing Centre
// Description : Sensor base class for the health checker operator plugin.
//================================================================================
//================================================================================
// This file is part of DCDB (DataCenter DataBase)
// Copyright (C) 2019-2019 Leibniz Supercomputing Centre
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
//================================================================================
#ifndef PROJECT_HEALTHCHECKERSENSORBASE_H
#define PROJECT_HEALTHCHECKERSENSORBASE_H
#include "sensorbase.h"
/**
* @brief Sensor base for health checker plugin
*
* @ingroup healthchecker
*/
class HealthCheckerSensorBase : public SensorBase {

public:
    // Alarm conditions that can be associated to a sensor
    typedef enum {
        HC_ABOVE,
        HC_BELOW,
        HC_EQUAL,
        HC_EXISTS,
        HC_INVALID
    } HCCond;

    // Constructor and destructor
    HealthCheckerSensorBase(const std::string &name) :
        SensorBase(name),
        _last(0),
        _threshold(0),
        _condition(HC_INVALID) {}

    // Copy constructor. Takes a const reference so that const sensors can be
    // copied as well. Threshold and condition are copied, while the time of the
    // last alarm is reset to 0.
    HealthCheckerSensorBase(const HealthCheckerSensorBase &other) :
        SensorBase(other),
        _last(0),
        _threshold(other._threshold),
        _condition(other._condition) {}

    virtual ~HealthCheckerSensorBase() {}

    void setLast(uint64_t l)        { _last = l; }
    void setThreshold(int64_t t)    { _threshold = t; }
    void setCondition(HCCond c)     { _condition = c; }

    uint64_t getLast() const        { return _last; }
    int64_t  getThreshold() const   { return _threshold; }
    HCCond   getCondition() const   { return _condition; }

    /**
     * @brief Prints the sensor configuration, including condition and threshold.
     *
     * @param ll             Log level at which the configuration is printed.
     * @param lg             Logger to print to.
     * @param leadingSpaces  Number of spaces prepended to each printed line.
     */
    void printConfig(LOG_LEVEL ll, LOGGER &lg, unsigned leadingSpaces = 16) {
        SensorBase::printConfig(ll, lg, leadingSpaces);
        std::string leading(leadingSpaces, ' ');
        LOG_VAR(ll) << leading << " Condition: " << condToString(_condition);
        LOG_VAR(ll) << leading << " Threshold: " << _threshold;
    }

    /**
     * @brief Converts an alarm condition to its textual name.
     *
     * @param c  Condition to convert.
     * @return   "above", "below", "equals", "exists", or "invalid" for any other value.
     */
    std::string condToString(HCCond c) const {
        switch (c) {
            case HC_ABOVE:
                return "above";
            case HC_BELOW:
                return "below";
            case HC_EQUAL:
                return "equals";
            case HC_EXISTS:
                return "exists";
            default:
                return "invalid";
        }
    }

    /**
     * @brief Converts a textual condition name (case-insensitive) to an alarm condition.
     *
     * @param s  Name of the condition.
     * @return   The matching condition, or HC_INVALID if the name is not recognized.
     */
    HCCond stringToCond(const std::string &s) const {
        if (boost::iequals(s, "above")) {
            return HC_ABOVE;
        } else if (boost::iequals(s, "below")) {
            return HC_BELOW;
        } else if (boost::iequals(s, "equals")) {
            return HC_EQUAL;
        } else if (boost::iequals(s, "exists")) {
            return HC_EXISTS;
        } else {
            return HC_INVALID;
        }
    }

protected:
    // Time at which this sensor last raised an alarm (0 if never)
    uint64_t _last;
    // Value the sensor readings are compared against
    int64_t _threshold;
    // Alarm condition checked for this sensor
    HCCond _condition;
};
// Shared pointer to a health checker sensor
using HealthCheckerSBPtr = std::shared_ptr<HealthCheckerSensorBase>;
#endif //PROJECT_HEALTHCHECKERSENSORBASE_H
......@@ -6,7 +6,7 @@ DCDBDEPLOYPATH ?= $(DCDBBASEPATH)/install
PLUGINS = sysfs ipmi pdu bacnet snmp procfs tester gpfsmon msr
# data analytics plugins to be built
OPERATORS = aggregator smoothing regressor classifier clustering cssignatures job_aggregator testeroperator filesink smucngperf persystsql coolingcontrol
OPERATORS = aggregator smoothing regressor classifier clustering cssignatures job_aggregator testeroperator filesink smucngperf persystsql coolingcontrol healthchecker
DEFAULT_VERSION = 0.4
GIT_VERSION = $(shell git describe --tags 2>/dev/null|sed 's/-\([0-9]*\)/.\1/')
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment