MSRSensorGroup.cpp 8.4 KB
Newer Older
1
2
3
//================================================================================
// Name        : MSRSensorGroup.cpp
// Author      : Carla Guillen
Micha Müller's avatar
Micha Müller committed
4
// Contact     : info@dcdb.it
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
// Copyright   : Leibniz Supercomputing Centre
// Description : Source file for MSR sensor group class.
//================================================================================

//================================================================================
// This file is part of DCDB (DataCenter DataBase)
// Copyright (C) 2019-2019 Leibniz Supercomputing Centre
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
//================================================================================
27
28
29

#include "MSRSensorGroup.h"

30
#include <atomic>
31
32
33
34
35
#include <boost/log/core/record.hpp>
#include <boost/log/sources/record_ostream.hpp>
#include <boost/log/trivial.hpp>
#include <boost/log/utility/formatting_ostream.hpp>
#include <boost/parameter/keyword.hpp>
36
#include <exception>
37
38
39
40
41
42
#include <fcntl.h>
#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>
#include <utility>

43
#include "Types.h"
44
#include "logging.h"
45
46
#include "timestamp.h"
#include <iomanip>
47
#include <sstream>
48
#include <thread>
49

50
// Maximum value passed to storeReading() for MSR readings: 2^48.
const uint64_t MSR_MAXIMUM_SIZE = 1ULL << 48;
51
52
/**
 * Creates an MSR sensor group.
 *
 * Hyper-threading aggregation starts out as 0 (the aggregation branches in
 * this file are skipped for 0) and the total CPU count is taken from
 * std::thread::hardware_concurrency().
 *
 * @param name  Group name, forwarded to SensorGroupTemplate.
 */
MSRSensorGroup::MSRSensorGroup(const std::string &name)
    : SensorGroupTemplate(name),
      _htAggregation(0) {
	const unsigned int detectedCpus = std::thread::hardware_concurrency();
	_total_number_cpus = detectedCpus;
}

// No explicit cleanup happens on destruction; the MSR file descriptors are
// closed in execOnStop().
MSRSensorGroup::~MSRSensorGroup() = default;

59
60
void MSRSensorGroup::groupInBins() {
	for (auto s : _sensors) {
61
62
63
64
		_sensorBins[s->getCpu()].addMsrSensorBin(s);
	}
	//sanity check: all bins should have the same number of sensors
	if (_sensorBins.size() == 0) {
65
66
		LOG(error) << "Sensorgroup " << _groupName << " failed to sort sensors!";
		return;
67
68
	}
	size_t binSensorSize = _sensorBins.front().sensors.size();
69
70
71
72
73
	for (auto &b : _sensorBins) {
		if (b.sensors.size() != binSensorSize) {
			LOG(error) << "Sensorgroup " << _groupName << " sensor number missmatch!";
			return;
		}
74
75
76
	}

	//sort bins, so that the sensor ordering is equal in every bin (useful in case of hyper-threading aggregation
77
78
79
80
	for (auto &b : _sensorBins) {
		std::sort(b.sensors.begin(), b.sensors.end(), [](const S_Ptr &lhs, const S_Ptr &rhs) {
			return lhs->getMetric() < rhs->getMetric();
		});
81
82
83
	}

	_sensorBins.shrink_to_fit();
84
85
86
87
88
89
90
91

	if (_htAggregation) {
		for (unsigned int cpu = _htAggregation; cpu < _total_number_cpus; ++cpu) {
			for (unsigned int m = 0; m < _sensorBins[cpu].sensors.size(); ++m) {
				_sensorBins[cpu].sensors[m]->setPublish(false);
			}
		}
	}
92
93
}

94
bool MSRSensorGroup::execOnStart() {
95
	for (unsigned int cpu = 0; cpu < _sensorBins.size(); ++cpu) {
96
		if (!_sensorBins[cpu].isActive()) {
97
98
			continue;
		}
99
100
		const std::size_t BUF_LEN = 200;
		char              path[BUF_LEN];
101
		snprintf(path, BUF_LEN, "/dev/cpu/%d/msr", cpu);
102
		int handle = open(path, O_RDWR);
103
		if (handle < 0) { // try msr_safe
104
			snprintf(path, BUF_LEN, "/dev/cpu/%d/msr_safe", cpu);
105
106
			handle = open(path, O_RDWR);
		}
107
		if (handle < 0) {
108
109
110
			LOG(error) << "Can't open msr device " << path;
			continue;
		}
111
		_sensorBins[cpu].setFd(handle);
112
	}
lu43jih's avatar
lu43jih committed
113
114
	program_fixed();

115
	return true;
116
117
}

118
/**
 * Closes the MSR file descriptor of every active bin and resets the stored
 * descriptor to -1. The counters themselves are not touched and keep running.
 */
void MSRSensorGroup::execOnStop() {
	for (auto &bin : _sensorBins) {
		if (!bin.isActive()) {
			continue; // inactive bins are left untouched
		}
		close(bin.getFd());
		bin.setFd(-1);
	}
}

void MSRSensorGroup::read() {
	ureading_t reading;
	reading.timestamp = getTimestamp();

	try {
133
		for (auto s : _sensors) {
lu43jih's avatar
lu43jih committed
134
			auto ret_val = msr_read(s->getMetric(), &reading.value, s->getCpu());
135
			if (ret_val != -1) {
136
				s->storeReading(reading, 1, MSR_MAXIMUM_SIZE, !_htAggregation); //1 is no correction...
137
#ifdef DEBUG
lu43jih's avatar
lu43jih committed
138
				LOG(debug) << _groupName << "::" << s->getName() << " raw reading: \"" << reading.value << "\"";
139
#endif
lu43jih's avatar
lu43jih committed
140
			}
141
		}
142

143
		if (_htAggregation) {
144
145
			for (unsigned int cpu = 0; cpu < _htAggregation; ++cpu) { // loop through all cpus until the aggregation
				for (unsigned int m = 0; m < _sensorBins[cpu].sensors.size(); ++m) { //loop through the group's sensors
146
147
148
					reading_t aggregation;
					aggregation.value = 0;
					aggregation.timestamp = reading.timestamp;
149
					// starting at the cpu we find all the cpus which will be aggregated here
150
151
					for (unsigned int agg = cpu; agg + _htAggregation < _total_number_cpus; agg += _htAggregation) {
						if (_sensorBins[agg].isActive()) {
152
153
154
							aggregation.value += _sensorBins[agg].sensors[m]->getLatestValue().value;
						}
					}
155
					_sensorBins[cpu].sensors[m]->storeReadingGlobal(aggregation);
156
157
158
159
				}
			}
		}

160
	} catch (const std::exception &e) {
161
		LOG(error) << "Sensorgroup " << _groupName << " could not read value: " << e.what();
162
163
164
	}
}

165
166
/**
 * Reads one 64-bit value from the MSR device of a CPU.
 *
 * @param msr_number  Offset passed to pread().
 * @param value       Destination for the 8 bytes read.
 * @param cpu         Index of the bin whose file descriptor is used.
 * @return            Return value of pread(), converted to int32_t.
 */
int32_t MSRSensorGroup::msr_read(uint64_t msr_number, uint64_t *value, unsigned int cpu) {
	const auto  fd = _sensorBins[cpu].getFd();
	void *const destination = value;
	const auto  bytesRead = pread(fd, destination, sizeof(uint64_t), msr_number);
	return bytesRead;
}

169
170
/**
 * Writes one 64-bit value to the MSR device of a CPU.
 *
 * @param msr_number  Offset passed to pwrite().
 * @param value       The 8 bytes to write.
 * @param cpu         Index of the bin whose file descriptor is used.
 * @return            Return value of pwrite(), converted to int32_t.
 */
int32_t MSRSensorGroup::msr_write(uint64_t msr_number, uint64_t value, unsigned int cpu) {
	const auto        fd = _sensorBins[cpu].getFd();
	const void *const source = &value;
	const auto        bytesWritten = pwrite(fd, source, sizeof(uint64_t), msr_number);
	return bytesWritten;
}

Micha Mueller's avatar
Micha Mueller committed
173
174
175
176
177
178
/**
 * Program the fixed MSR as required for this plugin.
 *
 * @return  True if counters programmed successfully, false otherwise, e.g.
 *          because the counters are already in use.
 */
179
void MSRSensorGroup::program_fixed() {
Micha Mueller's avatar
Micha Mueller committed
180

181
182
	for (unsigned int cpu = 0; cpu < _sensorBins.size(); ++cpu) {
		if (!_sensorBins[cpu].isActive()) { // CPU is not active, so it won't be programmed
183
184
			continue;
		}
185
186
		// program core counters

187
188
		//we do not want to interrupt other services already doing measurements with MSRs
		//therefore check if any fixed counter is currently enabled
189
		struct FixedEventControlRegister ctrl_reg;
Micha Mueller's avatar
Micha Mueller committed
190

191
		msr_read(IA32_CR_FIXED_CTR_CTRL, &ctrl_reg.value, cpu);
192
		//are they all enabled?
193
		if (ctrl_reg.fields.os0 && ctrl_reg.fields.usr0 && ctrl_reg.fields.os1 && ctrl_reg.fields.usr1 && ctrl_reg.fields.os2 && ctrl_reg.fields.usr2) {
194
			//yes! Free running counters were set by someone else => we don't need to program them, just read them.
195
			LOG(debug) << "CPU" << cpu << " has free running counter, so there will be no fixed counter programming";
196
			continue;
Micha Mueller's avatar
Micha Mueller committed
197
		}
198
		//not all of them (or none) are enabled => we program them again
Micha Mueller's avatar
Micha Mueller committed
199
200

		// disable counters while programming
201
		msr_write(IA32_CR_PERF_GLOBAL_CTRL, 0, cpu);
Micha Mueller's avatar
Micha Mueller committed
202

203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
		ctrl_reg.fields.os0 = 1;
		ctrl_reg.fields.usr0 = 1;
		ctrl_reg.fields.any_thread0 = 0;
		ctrl_reg.fields.enable_pmi0 = 0;

		ctrl_reg.fields.os1 = 1;
		ctrl_reg.fields.usr1 = 1;
		ctrl_reg.fields.any_thread1 = 0;
		ctrl_reg.fields.enable_pmi1 = 0;

		ctrl_reg.fields.os2 = 1;
		ctrl_reg.fields.usr2 = 1;
		ctrl_reg.fields.any_thread2 = 0;
		ctrl_reg.fields.enable_pmi2 = 0;

		ctrl_reg.fields.reserved1 = 0;
219

220
		// program them
221
		msr_write(IA32_CR_FIXED_CTR_CTRL, ctrl_reg.value, cpu);
222

223
		// start counting, enable 3 fixed counters (enable also the programmables counters)
lu43jih's avatar
lu43jih committed
224
		uint64_t value = (1ULL << 0) + (1ULL << 1) + (1ULL << 2) + (1ULL << 3) + (1ULL << 32) + (1ULL << 33) + (1ULL << 34);
225
		//uint64_t value = (1ULL << 32) + (1ULL << 33) + (1ULL << 34);
226
		msr_write(IA32_CR_PERF_GLOBAL_CTRL, value, cpu);
227
228
229
	}
}

230
231
/**
 * Marks the bin of the given CPU as active, growing the bin container first
 * if it does not yet have an entry for that CPU.
 *
 * @param cpu  Index of the CPU to add.
 */
void MSRSensorGroup::addCpu(unsigned int cpu) {
	const auto requiredBins = cpu + 1;
	if (_sensorBins.size() < requiredBins) {
		_sensorBins.resize(requiredBins);
	}
	_sensorBins[cpu].setActive();
}
236

237
/**
 * Collects the indices of all active bins.
 *
 * @return  Indices of the active CPUs in ascending order.
 */
std::vector<unsigned> MSRSensorGroup::getCpus() {
	std::vector<unsigned> activeCpus;
	unsigned int          index = 0;
	for (auto &bin : _sensorBins) {
		if (bin.isActive()) {
			activeCpus.push_back(index);
		}
		++index;
	}
	return activeCpus;
}

247
/**
 * Logs the list of active CPUs of this group as a comma-separated line.
 *
 * @param ll             Log level the line is written with.
 * @param leadingSpaces  Number of spaces the line is indented by.
 */
void MSRSensorGroup::printGroupConfig(LOG_LEVEL ll, unsigned int leadingSpaces) {
	std::stringstream cpuList;
	bool              firstEntry = true;
	unsigned int      cpuIndex = 0;
	for (auto &bin : _sensorBins) {
		if (bin.isActive()) {
			// every entry but the first is preceded by a separator
			if (!firstEntry) {
				cpuList << ", ";
			}
			cpuList << cpuIndex;
			firstEntry = false;
		}
		++cpuIndex;
	}
	const std::string indent(leadingSpaces, ' ');
	LOG_VAR(ll) << indent << "CPUs:  " << cpuList.str();
}