//================================================================================
// Name        : CSOperator.cpp
// Author      : Alessio Netti
// Contact     : info@dcdb.it
// Copyright   : Leibniz Supercomputing Centre
// Description : Operator that trains a CS model and computes CS signatures
//               from the input sensors of its units.
//================================================================================

//================================================================================
// This file is part of DCDB (DataCenter DataBase)
// Copyright (C) 2019-2019 Leibniz Supercomputing Centre
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
//================================================================================

#include "CSOperator.h"

// Builds an operator called `name` with the default CS configuration.
// No model files are configured, and the operator starts in the
// "training pending" state with no sensor flagged as ready for training.
CSOperator::CSOperator(const std::string& name) : OperatorTemplate(name) {
    // Model input/output paths: empty means "none"
    _modelIn.clear();
    _modelOut.clear();

    // Default tuning parameters
    _aggregationWindow = 0;
    _trainingSamples = 3600;
    _numBlocks = 20;
    _scalingFactor = 1000000;
    _reuseModel = true;

    // Training state: a training is due, but no series has enough samples yet
    _trainingPending = true;
    _trainingReady = -1;
}

// Copy constructor: duplicates the configuration of `other`, but not its
// output path nor its training state. The copy always starts with an empty
// output path and with a training pending.
CSOperator::CSOperator(const CSOperator& other) : OperatorTemplate(other) {
    // Configuration taken over from the source operator
    _modelIn = other._modelIn;
    _aggregationWindow = other._aggregationWindow;
    _trainingSamples = other._trainingSamples;
    _numBlocks = other._numBlocks;
    _scalingFactor = other._scalingFactor;
    _reuseModel = other._reuseModel;

    // Deliberately not copied: the output path is reset to "none"
    _modelOut.clear();

    // Training state is reset rather than inherited
    _trainingPending = true;
    _trainingReady = -1;
}

// Nothing to release explicitly: the destructor performs no work of its own.
CSOperator::~CSOperator() = default;

// Handles a REST request addressed to this operator.
//
// @param action   Requested action; only "train" is recognised.
// @param queries  Query parameters of the request (not used by this operator).
// @return         Response whose text confirms that re-training was triggered.
// @throws invalid_argument if `action` is anything other than "train".
restResponse_t CSOperator::REST(const string& action, const unordered_map<string, string>& queries) {
    restResponse_t resp;

    // Reject everything that is not a training request
    if(action!="train")
        throw invalid_argument("Unknown plugin action " + action + " requested!");

    // Re-arm the training: data collection restarts on the next computation
    resp.response = "Re-training triggered for CS Signatures operator " + this->_name + "!\n";
    this->_trainingPending = true;
    this->_trainingReady = -1;
    return resp;
}

// Logs the CS-specific configuration at log level `ll`, then delegates to the
// base operator template to print the remaining configuration.
void CSOperator::printConfig(LOG_LEVEL ll) {
    // Empty paths are reported as "none"
    const std::string inPath = _modelIn.empty() ? std::string("none") : _modelIn;
    const std::string outPath = _modelOut.empty() ? std::string("none") : _modelOut;
    const char* reuseState = _reuseModel ? "enabled" : "disabled";

    LOG_VAR(ll) << "            Window:          " << _aggregationWindow;
    LOG_VAR(ll) << "            Input Path:      " << inPath;
    LOG_VAR(ll) << "            Output Path:     " << outPath;
    LOG_VAR(ll) << "            Blocks:          " << _numBlocks;
    LOG_VAR(ll) << "            Scaling factor:  " << _scalingFactor;
    LOG_VAR(ll) << "            Training Sample: " << _trainingSamples;
    LOG_VAR(ll) << "            Reuse Model:     " << reuseState;
    OperatorTemplate<CSSensorBase>::printConfig(ll);
}

void CSOperator::execOnInit() {
    bool useDefault=true;
80
81
82
83
    // Establishing the training unit and the appropriate number of signature blocks
    if(_streaming && !_units.empty()) {
        _trainingUnit = _units[0]->getName();
        _actualBlocks = _units[0]->getInputs().size() < _numBlocks ? _units[0]->getInputs().size() : _numBlocks;
84
        if(_actualBlocks!=_numBlocks)
85
            LOG(warning) << "Operator " << _name << ": cannot enforce " << _numBlocks << " blocks, using " << _actualBlocks << " instead.";
86
87
88
89
    } else {
        _actualBlocks = _numBlocks;
    }
    
90
91
92
93
94
95
    if(_modelIn!="") {
        try {
            if(!readFromFile(_modelIn))
                LOG(error) << "Operator " + _name + ": incompatible CS data, falling back to default!";
            else {
                _trainingPending = false;
96
                _trainingReady = -1;
97
98
99
100
101
102
103
                useDefault = false;
            }
        } catch(const std::exception& e) {
            LOG(error) << "Operator " + _name + ": cannot load CS data from file, falling back to default!"; }
    }
    if(useDefault) {
        _trainingPending = true;
104
        _trainingReady = -1;
105
106
107
108
109
110
111
112
113
114
115
        _max.clear();
        _min.clear();
        _permVector.clear();
    }
    _trainingData.clear();
}

// Performs one computation round for `unit`.
// While a training is pending (streaming mode only), readings of the training
// unit are accumulated and the CS model is trained as soon as one sensor has
// collected enough samples. Afterwards, if CS data is available, the
// signature of the unit is computed.
//
// @throws std::runtime_error if no CS data is available and none is being
//         trained, or if the unit's input count differs from the CS data.
void CSOperator::compute(U_Ptr unit) {
    const uint64_t nowTs = getTimestamp();

    // Training-related tasks, carried out only for the designated training unit
    const bool collecting = _trainingPending && _streaming && _trainingUnit==unit->getName();
    if(collecting) {
        // Fetching sensor data, one time series per input sensor
        if(_trainingData.empty())
            _trainingData.resize(unit->getInputs().size());
        for(size_t inIdx=0; inIdx<unit->getInputs().size(); inIdx++)
            accumulateData(_trainingData, unit->getInputs()[inIdx], inIdx, nowTs);

        // Performing training once enough samples are obtained
        if(!_trainingData.empty() && _trainingReady!=-1) {
            if(checkTrainingSet(_trainingData)) {
                computeMinMax(_trainingData);
                computePermutation(_trainingData);
                _trainingData.clear();
                _trainingPending = false;
                _trainingReady = -1;
                LOG(info) << "Operator " + _name + ": CS training performed.";
                if (_modelOut != "" && !dumpToFile(_modelOut))
                    LOG(error) << "Operator " + _name + ": cannot save CS data to a file!";
            } else {
                // Invalid set: throw it away and start collecting again
                LOG(error) << "Operator " + _name + ": collected training set does not appear to be valid!";
                _trainingData.clear();
                _trainingPending = true;
                _trainingReady = -1;
            }
        }
    }

    if(_permVector.empty()) {
        // If the operator is in an invalid state
        if(!_trainingPending || !_streaming)
            throw std::runtime_error("Operator " + _name + ": cannot compute signatures, no CS data available!");
        // Otherwise the training is still in progress: nothing to output yet
        return;
    }

    // If an unit has an unexpected number of input sensors
    if(_permVector.size()!=unit->getInputs().size())
        throw std::runtime_error("Operator " + _name + ": unit " + unit->getName() + " has an anomalous number of inputs!");

    computeSignature(unit, nowTs);
}

// -------------------------------------- INPUT / OUTPUT --------------------------------------

bool CSOperator::dumpToFile(std::string &path) {
    boost::property_tree::ptree root, blocks;
    std::ostringstream data;
    
    if(_trainingPending || _permVector.empty())
        return false;
    
164
    // Saving CS data in terms of permutation index, minimum and maximum for each input sensor
165
    for(size_t idx=0; idx<_permVector.size(); idx++) {
166
        boost::property_tree::ptree group;
167
        group.push_back(boost::property_tree::ptree::value_type("idx", boost::property_tree::ptree(std::to_string(_permVector[idx]))));
168
169
        group.push_back(boost::property_tree::ptree::value_type("min", boost::property_tree::ptree(std::to_string(_min[_permVector[idx]]))));
        group.push_back(boost::property_tree::ptree::value_type("max", boost::property_tree::ptree(std::to_string(_max[_permVector[idx]]))));
170
171
        blocks.add_child(std::to_string(idx), group);
    }
172
    root.add_child(std::to_string(_permVector.size()), blocks);
173
174
175
    
    try {
        std::ofstream outFile(path);
176
177
        boost::property_tree::write_json(outFile, root, true);
        outFile.close();
178
179
180
181
182
183
184
185
186
187
    } catch(const std::exception &e) { return false; }
    return true;
}

bool CSOperator::readFromFile(std::string &path) {
    boost::property_tree::iptree config;
    try {
        boost::property_tree::read_json(path, config);
    } catch(const std::exception &e) { return false; }
    
188
189
    // The root JSON node encodes the number of sensors with which the model was trained
    if(config.begin() == config.end() || stoull(config.begin()->first) < _actualBlocks)
190
191
        return false;
    
192
193
194
195
196
197
    uint64_t numSensors = stoull(config.begin()->first);
    std::vector<size_t>  newPermVector(numSensors);
    std::vector<int64_t> newMin(numSensors);
    std::vector<int64_t> newMax(numSensors);
    
    BOOST_FOREACH(boost::property_tree::iptree::value_type &val, config.begin()->second) {
198
199
200
201
        size_t blockID = std::stoull(val.first);
        boost::property_tree::iptree &blk = val.second;
        if(blk.find("idx")==blk.not_found() || blk.find("min")==blk.not_found() || blk.find("max")==blk.not_found())
            return false;
202
        if(blockID>=numSensors)
203
            return false;
204

205
206
        size_t tempIdx = 0;
        int64_t tempMin = 0, tempMax = 0;
207
208
        BOOST_FOREACH(boost::property_tree::iptree::value_type &val2, blk) {
            if (boost::iequals(val2.first, "idx")) {
209
                tempIdx = std::stoull(val2.second.data());
210
            } else if (boost::iequals(val2.first, "min")) {
211
                tempMin = std::stoll(val2.second.data());
212
            } else if (boost::iequals(val2.first, "max")) {
213
                tempMax = std::stoll(val2.second.data());
214
215
            }
        }
216
217
218
219
220
        
        newPermVector[blockID] = tempIdx;
        newMin[tempIdx] = tempMin;
        newMax[tempIdx] = tempMax;
        
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
    }
    
    // Replacing the operator's CS data
    _permVector = newPermVector;
    _min = newMin;
    _max = newMax;
    return true;
}

// -------------------------------------- MODEL TRAINING --------------------------------------

// Accumulates sensor data in-memory for later training
void CSOperator::accumulateData(std::vector<std::vector<reading_t>>& v, CSSBPtr s, size_t idx, uint64_t nowTs) {
    // We query all new data for the sensor since the last one - we want a clean time series 
    uint64_t endTs = nowTs;
236
    uint64_t startTs = v[idx].empty() ? endTs - _aggregationWindow : v[idx].back().timestamp+100000;
237
238
239
240
241
    _buffer.clear();
    // This query might possibly fail very often, depending on the batching of sensors
    if(!_queryEngine.querySensor(s->getName(), startTs, endTs, _buffer, false))
        return;
    // We add the queried values only if they are actually "new"
242
    if(!_buffer.empty() && (v[idx].empty() || _buffer[0].timestamp>v[idx].back().timestamp)) {
243
        v[idx].insert(v[idx].end(), _buffer.begin(), _buffer.end());
244
245
        // Triggering training if right amount of sensor readings is reached
        if(v[idx].size() >= _trainingSamples)
246
            _trainingReady = idx;
247
    }
248
249
250
251
252
253
254
}

// Applies the sorting stage of the CS method and finds a permutation vector
void CSOperator::computePermutation(std::vector<std::vector<reading_t>>& v) {
    // Each column of the matrix will be an interpolated sensor
    cv::Mat sensorMatrix = cv::Mat(_trainingSamples, v.size(), CV_64F);
    // Evaluation parameters post-interpolation
255
256
    // Beware of the accuracy loss: casting timestamps to doubles should result in a loss only
    // at the level of microseconds, but if there are issues, then check this
257
258
    double startEval=(double)v[_trainingReady].front().timestamp;
    double stepEval=(double)(v[_trainingReady].back().timestamp - v[_trainingReady].front().timestamp) / (double)_trainingSamples;
259
260
261
    double startInterp, stepInterp;
    for(size_t idx=0; idx<v.size(); idx++) {
        std::vector<reading_t>& vals = v[idx];
262
263
        startInterp = startEval - (double)vals.front().timestamp;
        stepInterp = (double)(vals.back().timestamp - vals.front().timestamp) / (double)vals.size();
264
265
266
267
268
269
270
271
        // Copying element by element into a temporary vector - ugly and slow
        std::vector<double> sValues(vals.size());
        for(size_t idx2=0; idx2<vals.size(); idx2++)
            sValues[idx2] = (double)vals[idx2].value;
        // Spline interpolation
        boost::math::cubic_b_spline<double> spline(sValues.begin(), sValues.end(), startInterp, stepInterp);
        // Evaluating in the interpolated points and storing in the matrix
        for(size_t idx2=0; idx2<_trainingSamples; idx2++)
272
            sensorMatrix.at<double>(idx2, idx) = spline(stepEval*idx2);
273
        sValues.clear();
274
275
276
277
    }
    
    // Calculating covariance matrix
    cv::Mat covMatrix, meanMatrix;
278
    cv::calcCovarMatrix(sensorMatrix, covMatrix, meanMatrix, cv::COVAR_ROWS + cv::COVAR_SCALE + cv::COVAR_NORMAL, CV_64F);
279
    sensorMatrix.release();
280
    meanMatrix.release();
281
282
283
284
285
286
287
288
    // Transforming the matrix
    convertToCorrelation(covMatrix);
    
    // Initial set of available sensors
    std::set<size_t> availSet;
    for(size_t idx=0; idx<v.size(); idx++)
        availSet.insert(idx);

289
    // Correlation-based sorting
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
    _permVector.clear();
    double corrMax = -1000.0;
    double corrCoef = 0.0;
    size_t corrIdx = 0;

    for(size_t idx=0; idx<v.size(); idx++) {
        if (covMatrix.at<double>(idx, idx) > corrMax) {
            corrMax = covMatrix.at<double>(idx, idx);
            corrIdx = idx;
        }
    }
    
    _permVector.push_back(corrIdx);
    availSet.erase(corrIdx);
    
    while(!availSet.empty()) {
        corrMax = -1000;
        corrIdx = 0;
        for(const auto& avId : availSet) {
            corrCoef = covMatrix.at<double>(avId, avId) * covMatrix.at<double>(_permVector.back(), avId);
            if(corrCoef > corrMax) {
                corrMax = corrCoef;
                corrIdx = avId;
            }
        }
        _permVector.push_back(corrIdx);
        availSet.erase(corrIdx);
    }
    covMatrix.release();
}

// Computes minimum and maximum for each separate sensor
void CSOperator::computeMinMax(std::vector<std::vector<reading_t>>& v) {
    _min.resize(v.size());
    _max.resize(v.size());
    
    int64_t max, min;
    for(size_t idx=0; idx<v.size(); idx++) {
        max = LLONG_MIN;
        min = LLONG_MAX;
        if (v[idx].size() > 0) {
            for (const auto &s : v[idx]) {
                if (s.value > max)
                    max = s.value;
334
                if (s.value < min)
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
                    min = s.value;
            }
        } else {
            max = 0;
            min = 0;
        }
        _min[idx] = min;
        _max[idx] = max;
    }
}

// Converts a covariance matrix to a correlation one and additionally stores the "total" correlation
// of each variable in the diagonal of the matrix.
//
// @param m  Covariance matrix of doubles, modified in place. Off-diagonal
//           entries are replaced by Pearson correlations; each diagonal entry
//           is replaced by the mean of the off-diagonal entries on its row.
void CSOperator::convertToCorrelation(cv::Mat &m) {
    const size_t rows = (size_t)m.size().height;
    const size_t cols = (size_t)m.size().width;

    // Computing Pearson correlations; the diagonal still holds the variances
    // here, and the small constant avoids dividing by zero for flat sensors
    for(size_t i=0; i<rows; i++) {
        for(size_t j=0; j<cols; j++) {
            if(i!=j)
                m.at<double>(i,j) = m.at<double>(i,j) / (sqrt(m.at<double>(i,i))*sqrt(m.at<double>(j,j)) + 0.00001);
        }
    }
    // Getting global correlations
    for(size_t i=0; i<rows; i++) {
        m.at<double>(i,i) = 0;
        for(size_t j=0; j<cols; j++) {
            if(i!=j) {
                m.at<double>(i,i) += m.at<double>(i,j);
            }
        }
        // With a single variable there are no off-diagonal terms to average:
        // the total correlation stays 0 instead of becoming 0/0 = NaN
        if(cols > 1)
            m.at<double>(i,i) /= m.size().width - 1;
    }
}

// Checks that the training set is actually valid: every sensor needs at least
// 100 readings, and at least one sensor must have reached the full number of
// training samples
bool CSOperator::checkTrainingSet(std::vector<std::vector<reading_t>>& v) {
    if(v.empty())
        return false;
    bool foundValid=false;
    for(const auto& s : v) {
        if(s.size() < 100) {
            return false;
        } else if(s.size() >= _trainingSamples) {
            foundValid = true;
        }
    }
    return foundValid;
}

// -------------------------------------- SIGNATURE COMPUTATION --------------------------------------

// Actual signature computation: averages and first-order derivatives of the
// normalized input sensors are aggregated into blocks and stored in the outputs
void CSOperator::computeSignature(U_Ptr unit, uint64_t nowTs) {
    // Purpose: computes the signature of `unit` over the last aggregation
    // window ending at `nowTs`, storing one reading per output sensor whose
    // block ID is below _actualBlocks. Nothing is stored if any input sensor
    // cannot be queried.
    // The caller is expected to have verified that the unit has as many
    // inputs as there are entries in _permVector (compute() does so).
    const uint64_t endTs = nowTs;
    const uint64_t startTs = endTs - _aggregationWindow;
    // Buffers need to have the same number of elements as the input sensors, and uniform across units
    if(_avgBuffer.size()!=_permVector.size() || _derBuffer.size()!=_permVector.size()) {
        _avgBuffer.resize(_permVector.size());
        _derBuffer.resize(_permVector.size());
    }

    // Querying sensors, calculating averages and first-order derivatives
    for(size_t idx=0; idx<unit->getInputs().size(); idx++) {
        _buffer.clear();
        if(!_queryEngine.querySensor(unit->getInputs()[idx]->getName(), startTs, endTs, _buffer, false)) {
            LOG(debug) << "Operator " + _name + ": cannot read from sensor " << unit->getInputs()[idx]->getName() << "!";
            return;
        }
        normalize(_buffer, idx);
        _avgBuffer[idx] = getAvg(_buffer);
        _derBuffer[idx] = getDer(_buffer);
    }

    // Computing blocks and storing result into output sensors
    reading_t val;
    val.timestamp = nowTs;
    // Possibly fractional number of sensors per block
    _blockLen = (float)unit->getInputs().size() / (float)_actualBlocks;
    for(auto &s : unit->getOutputs()) {
        if(s->getBlockID()<_actualBlocks) {
            _bBegin = (size_t)floor(_blockLen*s->getBlockID());
            _bEnd = (size_t)ceil(_blockLen*(s->getBlockID()+1));
            // Floating-point rounding may push the end of the last block past
            // the available sensors: clamping avoids out-of-bounds accesses
            if(_bEnd > _permVector.size())
                _bEnd = _permVector.size();
            // An empty block cannot be averaged (it would divide by zero)
            if(_bBegin >= _bEnd)
                continue;
            val.value = 0;

            // "Imaginary" outputs aggregate derivatives, the others averages
            if(!s->getImag()) {
                for (size_t idx = _bBegin; idx < _bEnd; idx++)
                    val.value += _avgBuffer[_permVector[idx]];
            } else {
                for (size_t idx = _bBegin; idx < _bEnd; idx++)
                    val.value += _derBuffer[_permVector[idx]];
            }
            val.value /= ((int64_t)_bEnd - (int64_t)_bBegin);
            s->storeReading(val);
        }
    }
}

// Normalizes sensor data
void CSOperator::normalize(std::vector<reading_t> &v, size_t idx) {
432
    double denom = _max[idx]!=_min[idx] ? (double)_scalingFactor / (double)(_max[idx] - _min[idx]) : (double)_scalingFactor;
433
434
435
436
437
    for(size_t idx2=0; idx2<v.size(); idx2++) {
        if(v[idx2].value > _max[idx])
            v[idx2].value = _max[idx];
        else if(v[idx2].value < _min[idx])
            v[idx2].value = _min[idx];
438
        v[idx2].value = (int64_t)((double)(v[idx2].value - _min[idx]) * denom);
439
    }
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
}

// Computes average sensor values
int64_t CSOperator::getAvg(std::vector<reading_t> &v) {
    int64_t avg = 0;
    for(size_t idx=0; idx<v.size(); idx++) {
        avg += v[idx].value;
    }
    avg = v.size()>0 ? avg/v.size() : 0;
    return avg;
}

// Computes average first-order derivatives
int64_t CSOperator::getDer(std::vector<reading_t> &v) {
    int64_t der = 0;
    for(size_t idx=1; idx<v.size(); idx++) {
456
        der += (int64_t)v[idx].value - (int64_t)v[idx-1].value;
457
    }
458
    der = v.size()>1 ? der/((int64_t)v.size()-1) : 0;
459
460
    return der;
}