2.12.2021, 9:00 - 11:00: Due to updates GitLab may be unavailable for some minutes between 09:00 and 11:00.

nvmlSensorGroup.cpp 6.6 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
//================================================================================
// Name        : nvmlSensorGroup.cpp
// Author      : Fiona Reid, Weronika Filinger, EPCC @ The University of Edinburgh
// Contact     :
// Copyright   : 
// Description : Source file for nvml sensor group class.
//================================================================================

//================================================================================
// This file is part of DCDB (DataCenter DataBase)
// Copyright (C) 2019 Leibniz Supercomputing Centre
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
//================================================================================

#include "nvmlSensorGroup.h"

#include "timestamp.h"

// Used to ensure we get a sensible value of energy by computing the difference 
// between calls to the read function
Weronika's avatar
Weronika committed
34

35
// First-sample flag for the GPU_ENERGY sensor. Note the inverted sense of the
// name: the value is 0 until read() has taken its first energy sample and is
// set to 1 from then on.
static int isfirsttime = 0;
36
37
38
// Holds the NVML device handle used by this file.
struct env_t {
	nvmlDevice_t device; // set by nvmlDeviceGetHandleByIndex() in execOnInit()
};

// Single file-scope instance; written in execOnInit() and queried in read().
env_t env;
39

40
41
42
43
44
45
46
47
struct counters_t {
	unsigned long long energy_initial;
	unsigned long long energy_current;
	unsigned long long energy_previous;
	//	unsigned int freq1;
	//	unsigned int freq2;
	//	unsigned int freq3;
	unsigned int temperature;
48
	unsigned int fanspeed;
49
50
51
	unsigned int clockspeed_graphics;
	unsigned int clockspeed_sm;
	unsigned int clockspeed_mem;
52
	nvmlMemory_t memory;
53
	unsigned int power;
54
	//	unsigned long long ecc_counts;
55
	nvmlUtilization_t utilization;
56
57
} counters;

58
// Constructs a sensor group called `name`. All work is delegated to the
// SensorGroupTemplate base-class constructor.
nvmlSensorGroup::nvmlSensorGroup(const std::string& name)
	: SensorGroupTemplate(name) {}
61
62

// Copy-constructs a sensor group from `other`. All work is delegated to the
// SensorGroupTemplate base-class copy constructor.
nvmlSensorGroup::nvmlSensorGroup(const nvmlSensorGroup& other)
	: SensorGroupTemplate(other) {}
65

Weronika's avatar
Weronika committed
66
// Nothing to release here: the destructor performs no work of its own.
nvmlSensorGroup::~nvmlSensorGroup() = default;
67
68

// Copy-assigns `other` to this group and returns *this.
// Only the SensorGroupTemplate part is assigned; no group-specific state is
// copied here.
nvmlSensorGroup& nvmlSensorGroup::operator=(const nvmlSensorGroup& other) {
	// TODO: implement assignment of nvmlSensorGroup-specific members.
	SensorGroupTemplate::operator=(other);
	return *this;
}

void nvmlSensorGroup::execOnInit() {
79
80
81
82
83
84
85
86
	/* 
	 * TODO
	 * Implement one time initialization logic for this group here
	 * (e.g. allocate memory for buffer) or remove this method if not
	 * required.
	 */
	// FR Add the contents of init_environment in here
	nvmlReturn_t err;
87

88
89
90
	err = nvmlInit();
	err = nvmlDeviceGetHandleByIndex(0,&(env.device));
	err = nvmlDeviceGetTotalEnergyConsumption(env.device,&(counters.energy_initial));
91

92
	// FR 
93
94
95
96

}

bool nvmlSensorGroup::execOnStart() {
97
98
99
100
101
102
	//FR	 
	cudaError_t cerr;
	cerr = cudaProfilerStart();
	// FR

	return true;
103
104
105
}

void nvmlSensorGroup::execOnStop() {
106
107
108
109
110
111
112
113
114
	/* 
	 * TODO
	 * Implement logic when the group stops polling here
	 * (e.g. close a file descriptor) or remove this method if not required.
	 */
	// FR 
	cudaError_t cerr;
	cerr = cudaProfilerStop();
	// FR 
115
116
117
}

void nvmlSensorGroup::read() {
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
	reading_t reading;
	reading.timestamp = getTimestamp();
	// FR
	nvmlReturn_t err;     
	unsigned long long temp;
	// FR
	try {
		for(auto s : _sensors) {
			switch(s->getFeatureType()){
				case(GPU_ENERGY):
					// Need to measure the difference in energy used between calls to the read function
					if (isfirsttime==0){
						// First time through we use the initial value to set previous and get the new energy into current
						counters.energy_previous = counters.energy_initial;
						err = nvmlDeviceGetTotalEnergyConsumption(env.device,&(counters.energy_current));
						isfirsttime=1;
					}
					else {
						// Otherwise, set previous energy to whatever it was before and get the new value
						counters.energy_previous=counters.energy_current;
						err = nvmlDeviceGetTotalEnergyConsumption(env.device,&(counters.energy_current));
					}
					temp=counters.energy_current - counters.energy_previous; // Take difference and compute energy in millijoules 
					// You might want to consider putting this in the else block so we always measure something?
					reading.value = temp;
					// FR 
					break;
				case(GPU_POWER):
					err = nvmlDeviceGetPowerUsage(env.device,&(counters.power));
					reading.value = counters.power;
					break;
Weronika's avatar
Weronika committed
149
				case(GPU_TEMP):
150
151
152
153
154
155
156
					err = nvmlDeviceGetTemperature(env.device,NVML_TEMPERATURE_GPU,&(counters.temperature));
					reading.value = counters.temperature;
					break;
				case(GPU_FAN):
					err = nvmlDeviceGetFanSpeed(env.device,&(counters.fanspeed));
					reading.value = counters.fanspeed;
					break;
157
158
159
160
				case(GPU_MEM_USED):
					err = nvmlDeviceGetMemoryInfo (env.device, &(counters.memory));
					reading.value = counters.memory.used;
					break;
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
				case(GPU_MEM_TOT):
					err = nvmlDeviceGetMemoryInfo (env.device, &(counters.memory));
					reading.value = counters.memory.total;
					break;
				case(GPU_MEM_FREE):
					err = nvmlDeviceGetMemoryInfo (env.device, &(counters.memory));
					reading.value = counters.memory.free;
					break;
				case(GPU_CLK_GP):
					err = nvmlDeviceGetClock (env.device, NVML_CLOCK_GRAPHICS,NVML_CLOCK_ID_CURRENT,&(counters.clockspeed_graphics));
					reading.value = counters.clockspeed_graphics;
					break;
				case(GPU_CLK_SM):
					err = nvmlDeviceGetClock (env.device, NVML_CLOCK_SM,NVML_CLOCK_ID_CURRENT,&(counters.clockspeed_sm));
					reading.value = counters.clockspeed_sm;
					break;
				case(GPU_CLK_MEM):
					err = nvmlDeviceGetClock (env.device, NVML_CLOCK_MEM,NVML_CLOCK_ID_CURRENT,&(counters.clockspeed_mem));
					reading.value = counters.clockspeed_mem;
					break;
				case(GPU_UTL_MEM):
					err = nvmlDeviceGetUtilizationRates (env.device, &(counters.utilization));
					reading.value = counters.utilization.memory;
					break;
				case(GPU_UTL_GPU):
					err = nvmlDeviceGetUtilizationRates (env.device, &(counters.utilization));
					reading.value = counters.utilization.gpu;
					break;

190
191
			}
			s->storeReading(reading);
192
#ifdef DEBUG
193
			LOG(debug) << _groupName << "::" << s->getName() << " raw reading: \"" << reading.value << "\"";
194
#endif
195
196
197
198
		}
	} catch (const std::exception& e) {
		LOG(error) << "Sensorgroup" << _groupName << " could not read value: " << e.what();
	}
199
200
201
}

// Logs this group's configuration at log level `ll`.
// Currently only a fixed placeholder line is emitted; `leadingSpaces` is not
// used by this implementation.
void nvmlSensorGroup::printGroupConfig(LOG_LEVEL ll, unsigned int leadingSpaces) {
	// TODO: log the group's attributes here for debugging purposes.
	const int numSpaces = 12;
	LOG_VAR(ll) << "            NumSpacesAsIndention: " << numSpaces;
}