Commit 409c9140 authored by Alessio Netti

Analytics: changes to Regressor and Classifier plugins

- Added a "raw" mode in which only the average is used as feature for
each sensor. Useful when users wish to build signatures with a different
plugin and then pipeline them into the model;
- Restored the "targetDistance" parameter for the Classifier plugin;
- Fixed a minor bug affecting the number of samples used for training.
parent bf0395f0
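
For context on the first change: in normal mode the Regressor plugin condenses each sensor's aggregation window into a fixed set of statistical features, while raw mode keeps only the average. A minimal standalone sketch of the idea (hypothetical code, not the plugin's implementation; per the diff below, the real feature set is mean, standard deviation, sum of differences, 25th/75th percentiles and latest value):

```cpp
#include <cmath>
#include <cstdint>
#include <vector>

// Hypothetical sketch: in "raw" mode each sensor contributes one feature
// (the window average) instead of the full statistical set, which is what
// makes it possible to feed pre-built signatures from another plugin
// straight into the model.
std::vector<float> buildFeatures(const std::vector<int64_t>& window, bool rawMode) {
    double mean = 0.0;
    for (int64_t v : window) mean += v;
    mean /= window.size();
    if (rawMode)
        return { (float)mean };               // 1 feature per sensor
    double sd = 0.0;
    for (int64_t v : window) sd += (v - mean) * (v - mean);
    sd = std::sqrt(sd / window.size());
    // The real plugin also appends diffsum, the 25th/75th percentiles and
    // the latest reading, for REG_NUMFEATURES features per sensor.
    return { (float)mean, (float)sd /*, ... */ };
}
```
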
@@ -855,6 +855,7 @@ The following are the configuration parameters available for the _Regressor_ plu
| inputPath | Path of a file from which a pre-trained random forest model must be loaded.
| outputPath | Path of a file to which the random forest model trained at runtime must be saved.
| getImportances | If true, the random forest will also compute feature importance values when trained, which are printed.
| rawMode | If true, only the average is used as a feature for each of the sensor inputs.
> NOTE       When the _duplicate_ option is enabled, the _outputPath_ field is ignored to avoid file collisions from multiple regressors.
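
Note that _inputPath_ and _rawMode_ interact: a pre-trained forest stores its variable count, and the operator rejects models whose count does not match inputs × features per sensor (see the execOnInit hunk below). A hedged sketch of that check, wrapped in a hypothetical `loadCompatibleModel` helper:

```cpp
#include <opencv2/ml.hpp>
#include <cstdint>
#include <stdexcept>
#include <string>

// Hypothetical wrapper: a forest trained in raw mode (1 feature per sensor)
// cannot be reused with rawMode=false (REG_NUMFEATURES per sensor), and
// vice versa, because the variable counts differ.
cv::Ptr<cv::ml::RTrees> loadCompatibleModel(const std::string& path,
                                            uint64_t numSensors,
                                            uint64_t numFeatures) {
    cv::Ptr<cv::ml::RTrees> forest = cv::ml::RTrees::load(path);
    if (!forest->isTrained() ||
        numSensors * numFeatures != (uint64_t)forest->getVarCount())
        throw std::runtime_error("incompatible model for this configuration");
    return forest;
}
```
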
@@ -879,7 +880,7 @@ The _Classifier_ plugin, as the name implies, performs machine learning classifi
* The _target_ parameter here indicates a sensor which stores the labels (as numerical integer identifiers) to be used for training and on which classification will be based. The mapping from the integer labels to their text equivalent is left to the users. Moreover, unlike in the
Regressor plugin, the target sensor is always excluded from the feature vectors.
* The _targetDistance_ parameter is not used here, as it is only meaningful for regression.
* The _targetDistance_ parameter has a default value of 0. It can be set to higher values to perform predictive classification.
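
To make the restored parameter concrete: with _targetDistance_ d > 0, features computed at window t are paired with the label observed d windows later, so the trained classifier predicts upcoming states rather than the current one. A standalone sketch of the shift (hypothetical helper; the plugin does the equivalent with cv::Mat::rowRange, as shown in the trainRandomForest hunk below):

```cpp
#include <cstddef>
#include <vector>

// Hypothetical sketch: drop the first d labels and the last d feature rows,
// so row t of `features` lines up with the label from window t + d.
// Precondition: d < labels.size(), which the plugin checks before training.
template <typename F, typename L>
void shiftForPrediction(std::vector<F>& features, std::vector<L>& labels, std::size_t d) {
    labels.erase(labels.begin(), labels.begin() + d);
    features.resize(features.size() - d);
}
```
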
## Clustering Plugin <a name="clusteringPlugin"></a>
......
@@ -57,6 +57,10 @@ void ClassifierConfigurator::operatorAttributes(ClassifierOperator& op, CFG_VAL
op.setOutputPath(val.second.data());
else if(boost::iequals(val.first, "getImportances"))
op.setComputeImportances(to_bool(val.second.data()));
else if(boost::iequals(val.first, "targetDistance"))
op.setTargetDistance(stoull(val.second.data()));
else if(boost::iequals(val.first, "rawMode"))
op.setRawMode(to_bool(val.second.data()));
}
}
......
@@ -33,7 +33,6 @@ ClassifierOperator::ClassifierOperator(const std::string& name) : OperatorTempla
}
ClassifierOperator::ClassifierOperator(const ClassifierOperator& other) : OperatorTemplate(other), RegressorOperator(other) {
_targetDistance = 0;
_includeTarget = false;
}
@@ -41,10 +40,12 @@ ClassifierOperator::~ClassifierOperator() {}
void ClassifierOperator::printConfig(LOG_LEVEL ll) {
LOG_VAR(ll) << " Window: " << _aggregationWindow;
LOG_VAR(ll) << " Target Distance: " << _targetDistance;
LOG_VAR(ll) << " Training Sample: " << _trainingSamples;
LOG_VAR(ll) << " Input Path: " << (_modelIn!="" ? _modelIn : std::string("none"));
LOG_VAR(ll) << " Output Path: " << (_modelOut!="" ? _modelOut : std::string("none"));
LOG_VAR(ll) << " Importances: " << (_importances ? "enabled" : "disabled");
LOG_VAR(ll) << " Raw Mode: " << (getRawMode() ? "enabled" : "disabled");
OperatorTemplate<RegressorSensorBase>::printConfig(ll);
}
......
@@ -59,6 +59,8 @@ void RegressorConfigurator::operatorAttributes(RegressorOperator& op, CFG_VAL co
op.setOutputPath(val.second.data());
else if(boost::iequals(val.first, "getImportances"))
op.setComputeImportances(to_bool(val.second.data()));
else if(boost::iequals(val.first, "rawMode"))
op.setRawMode(to_bool(val.second.data()));
}
}
......
@@ -32,6 +32,7 @@ RegressorOperator::RegressorOperator(const std::string& name) : OperatorTemplate
_modelOut = "";
_aggregationWindow = 0;
_targetDistance = 1;
_numFeatures = REG_NUMFEATURES;
_trainingSamples = 256;
_trainingPending = true;
_importances = false;
@@ -46,6 +47,7 @@ RegressorOperator::RegressorOperator(const RegressorOperator& other) : OperatorT
_modelOut = "";
_aggregationWindow = other._aggregationWindow;
_targetDistance = other._targetDistance;
_numFeatures = other._numFeatures;
_trainingSamples = other._trainingSamples;
_importances = other._importances;
_includeTarget = true;
@@ -85,7 +87,7 @@ void RegressorOperator::execOnInit() {
if(_modelIn!="") {
try {
_rForest = cv::ml::RTrees::load(_modelIn);
if(!_rForest->isTrained() || _units.empty() || _units[0]->getInputs().size()*REG_NUMFEATURES!=(uint64_t)_rForest->getVarCount())
if(!_rForest->isTrained() || _units.empty() || _units[0]->getInputs().size()*_numFeatures!=(uint64_t)_rForest->getVarCount())
LOG(error) << "Operator " + _name + ": incompatible model, falling back to default!";
else {
_trainingPending = false;
@@ -107,6 +109,7 @@ void RegressorOperator::printConfig(LOG_LEVEL ll) {
LOG_VAR(ll) << " Input Path: " << (_modelIn!="" ? _modelIn : std::string("none"));
LOG_VAR(ll) << " Output Path: " << (_modelOut!="" ? _modelOut : std::string("none"));
LOG_VAR(ll) << " Importances: " << (_importances ? "enabled" : "disabled");
LOG_VAR(ll) << " Raw Mode: " << (getRawMode() ? "enabled" : "disabled");
OperatorTemplate<RegressorSensorBase>::printConfig(ll);
}
@@ -140,8 +143,8 @@ void RegressorOperator::trainRandomForest(bool categorical) {
if((uint64_t)_responseSet->size().height <= _targetDistance)
throw std::runtime_error("Operator " + _name + ": cannot perform training, insufficient data!");
// Shifting the training and response sets so as to obtain the desired prediction distance
*_responseSet = _responseSet->rowRange(_targetDistance, _responseSet->size().height-1);
*_trainingSet = _trainingSet->rowRange(0, _trainingSet->size().height-1-_targetDistance);
*_responseSet = _responseSet->rowRange(_targetDistance, _responseSet->size().height);
*_trainingSet = _trainingSet->rowRange(0, _trainingSet->size().height-_targetDistance);
shuffleTrainingSet();
cv::Mat varType = cv::Mat(_trainingSet->size().width + 1, 1, CV_8U);
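
The rowRange() change above is the "minor bug" from the commit message: cv::Mat::rowRange(start, end) is end-exclusive, so subtracting 1 from the end index silently dropped one valid sample from both the response and training sets. A standalone check of the two variants:

```cpp
#include <opencv2/core.hpp>
#include <iostream>

int main() {
    cv::Mat m(100, 4, CV_32F);   // e.g. 100 response rows, targetDistance = 1
    int d = 1;
    // Old (buggy) shift: height-1 as the end index discards the final row.
    cv::Mat before = m.rowRange(d, m.size().height - 1);
    // Fixed shift: rowRange() is end-exclusive, so height is the right bound.
    cv::Mat after = m.rowRange(d, m.size().height);
    std::cout << before.rows << " vs " << after.rows << std::endl;  // 98 vs 99
    return 0;
}
```
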
@@ -153,13 +156,13 @@ throw std::runtime_error("Operator " + _name + ": model training failed!");
throw std::runtime_error("Operator " + _name + ": model training failed!");
td.release();
LOG(info) << "Operator " << _name << ": model training performed using " << _trainingSet->size().height << " samples and " << _trainingSet->size().width << " features.";
LOG(info) << getImportances();
delete _trainingSet;
_trainingSet = nullptr;
delete _responseSet;
_responseSet = nullptr;
_trainingPending = false;
LOG(info) << "Operator " + _name + ": model training performed.";
LOG(info) << getImportances();
if(_modelOut!="") {
try {
_rForest->save(_modelOut);
@@ -194,7 +197,7 @@ void RegressorOperator::shuffleTrainingSet() {
bool RegressorOperator::computeFeatureVector(U_Ptr unit) {
if(!_currentfVector)
_currentfVector = new cv::Mat(1, unit->getInputs().size()*REG_NUMFEATURES, CV_32F);
_currentfVector = new cv::Mat(1, unit->getInputs().size()*_numFeatures, CV_32F);
int64_t val;
size_t qId, qMod, idx, fIdx;
uint64_t endTs = getTimestamp();
@@ -221,42 +224,48 @@ bool RegressorOperator::computeFeatureVector(U_Ptr unit) {
}
_mean /= _buffer.size();
// Computing STD
for (const auto &v : _buffer) {
val = v.value - _mean;
_std += val * val;
}
_std = sqrt(_std / _buffer.size());
// Computing additional features only if we are not in "raw" mode
if(_numFeatures == REG_NUMFEATURES) {
// Computing STD
for (const auto &v : _buffer) {
val = v.value - _mean;
_std += val * val;
}
_std = sqrt(_std / _buffer.size());
// I know, sorting is costly; here, we assume that the aggregation window of sensor data is going to be relatively
// small, in which case the O(N*log(N)) complexity of the std::sort implementation converges to O(N)
std::sort(_buffer.begin(), _buffer.end(),
[](const reading_t &lhs, const reading_t &rhs) { return lhs.value < rhs.value; });
// Computing 25th PERCENTILE
qId = ((_buffer.size()-1) * 25) / 100;
qMod = ((_buffer.size()-1) * 25) % 100;
_qtl25 = (qMod == 0 || qId == _buffer.size() - 1) ? _buffer[qId].value : (_buffer[qId].value + _buffer[qId + 1].value) / 2;
// Computing 75th PERCENTILE
qId = ((_buffer.size()-1) * 75) / 100;
qMod = ((_buffer.size()-1) * 75) % 100;
_qtl75 = (qMod == 0 || qId == _buffer.size() - 1) ? _buffer[qId].value : (_buffer[qId].value + _buffer[qId + 1].value) / 2;
// I know, sorting is costly; here, we assume that the aggregation window of sensor data is going to be relatively
// small, in which case the O(N*log(N)) complexity of the std::sort implementation converges to O(N)
std::sort(_buffer.begin(), _buffer.end(), [](const reading_t &lhs, const reading_t &rhs) { return lhs.value < rhs.value; });
// Computing 25th PERCENTILE
qId = ((_buffer.size() - 1) * 25) / 100;
qMod = ((_buffer.size() - 1) * 25) % 100;
_qtl25 = (qMod == 0 || qId == _buffer.size() - 1) ? _buffer[qId].value : (_buffer[qId].value + _buffer[qId + 1].value) / 2;
// Computing 75th PERCENTILE
qId = ((_buffer.size() - 1) * 75) / 100;
qMod = ((_buffer.size() - 1) * 75) % 100;
_qtl75 = (qMod == 0 || qId == _buffer.size() - 1) ? _buffer[qId].value : (_buffer[qId].value + _buffer[qId + 1].value) / 2;
}
fIdx = idx * REG_NUMFEATURES;
fIdx = idx * _numFeatures;
// Casting and storing the statistical features
_currentfVector->at<float>(fIdx) = (float) _mean;
_currentfVector->at<float>(fIdx + 1) = (float) _std;
_currentfVector->at<float>(fIdx + 2) = (float) _diffsum;
_currentfVector->at<float>(fIdx + 3) = (float) _qtl25;
_currentfVector->at<float>(fIdx + 4) = (float) _qtl75;
_currentfVector->at<float>(fIdx + 5) = (float) _latest;
if(_numFeatures == REG_NUMFEATURES) {
_currentfVector->at<float>(fIdx + 1) = (float) _std;
_currentfVector->at<float>(fIdx + 2) = (float) _diffsum;
_currentfVector->at<float>(fIdx + 3) = (float) _qtl25;
_currentfVector->at<float>(fIdx + 4) = (float) _qtl75;
_currentfVector->at<float>(fIdx + 5) = (float) _latest;
}
} else {
fIdx = idx * REG_NUMFEATURES;
fIdx = idx * _numFeatures;
_currentfVector->at<float>(fIdx) = 0.0f;
_currentfVector->at<float>(fIdx + 1) = 0.0f;
_currentfVector->at<float>(fIdx + 2) = 0.0f;
_currentfVector->at<float>(fIdx + 3) = 0.0f;
_currentfVector->at<float>(fIdx + 4) = 0.0f;
_currentfVector->at<float>(fIdx + 5) = 0.0f;
if(_numFeatures == REG_NUMFEATURES) {
_currentfVector->at<float>(fIdx + 1) = 0.0f;
_currentfVector->at<float>(fIdx + 2) = 0.0f;
_currentfVector->at<float>(fIdx + 3) = 0.0f;
_currentfVector->at<float>(fIdx + 4) = 0.0f;
_currentfVector->at<float>(fIdx + 5) = 0.0f;
}
}
}
//LOG(error) << "Target: " << _currentTarget;
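
For reference, the quartile logic above is a nearest-rank scheme in integer arithmetic: the base rank is (n-1)·p/100, and when the remainder is non-zero (and the rank is not the last element) the two neighbouring order statistics are averaged. A standalone sketch of the same computation:

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

// Hypothetical helper mirroring the plugin's integer percentile logic:
// qId is the base rank, qMod the fractional remainder in percent.
int64_t percentile(std::vector<int64_t> v, unsigned p) {
    std::sort(v.begin(), v.end());
    std::size_t qId  = ((v.size() - 1) * p) / 100;
    std::size_t qMod = ((v.size() - 1) * p) % 100;
    return (qMod == 0 || qId == v.size() - 1)
        ? v[qId]
        : (v[qId] + v[qId + 1]) / 2;  // midpoint of the two neighbours
}
```
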
@@ -272,14 +281,14 @@ std::string RegressorOperator::getImportances() {
std::vector<ImportancePair> impLabels;
cv::Mat_<float> impValues;
_rForest->getVarImportance().convertTo(impValues, CV_32F);
if(impValues.empty() || _units.empty() || impValues.total()!=REG_NUMFEATURES*_units[0]->getInputs().size())
if(impValues.empty() || _units.empty() || impValues.total()!=_numFeatures*_units[0]->getInputs().size())
return "Operator " + _name + ": error when computing feature importances.";
for(size_t idx=0; idx<impValues.total(); idx++) {
ImportancePair pair;
pair.name = _units[0]->getInputs().at(idx/REG_NUMFEATURES)->getName();
pair.name = _units[0]->getInputs().at(idx/_numFeatures)->getName();
pair.value = impValues.at<float>(idx);
switch(idx%REG_NUMFEATURES) {
switch(idx%_numFeatures) {
case 0:
pair.name += " - mean";
break;
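
The switch above labels each importance value by decoding its flat index: division by the per-sensor feature count selects the sensor, the remainder selects the statistic (case 0 is the mean, which is why raw-mode models still get correct labels). A sketch of the mapping; the label strings other than " - mean" are assumptions based on the feature order, since the remaining cases are cut from the diff:

```cpp
#include <cstddef>
#include <string>
#include <vector>

// Hypothetical decoder for a flat importance vector laid out as
// [sensor0-feat0, ..., sensor0-featN, sensor1-feat0, ...].
// Only " - mean" appears verbatim in the diff; the rest follow the
// assumed feature order (std, diffsum, qtl25, qtl75, latest).
std::string importanceLabel(std::size_t idx, std::size_t numFeatures,
                            const std::vector<std::string>& sensorNames) {
    static const char* feats[] = {" - mean", " - std", " - diffsum",
                                  " - qtl25", " - qtl75", " - latest"};
    return sensorNames[idx / numFeatures] + feats[idx % numFeatures];
}
```
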
......
@@ -63,6 +63,7 @@ public:
void setTrainingSamples(unsigned long long t) { _trainingSamples = t; }
void setTargetDistance(unsigned long long d) { _targetDistance = d; }
void setComputeImportances(bool i) { _importances = i; }
void setRawMode(bool r) { _numFeatures = r ? 1 : REG_NUMFEATURES; }
void triggerTraining() { _trainingPending = true;}
std::string getInputPath() { return _modelIn;}
@@ -70,6 +71,7 @@ public:
unsigned long long getAggregationWindow() { return _aggregationWindow; }
unsigned long long getTrainingSamples() { return _trainingSamples; }
bool getComputeImportances() { return _importances; }
bool getRawMode() { return _numFeatures != REG_NUMFEATURES; }
virtual void printConfig(LOG_LEVEL ll) override;
@@ -86,6 +88,7 @@ protected:
unsigned long long _aggregationWindow;
unsigned long long _trainingSamples;
unsigned long long _targetDistance;
unsigned long long _numFeatures;
bool _trainingPending;
bool _importances;
bool _includeTarget;
......