24.09., 9:00 - 11:00: Due to updates GitLab will be unavailable for some minutes between 09:00 and 11:00.

nvmlSensorGroup.cpp 7.07 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33
//================================================================================
// Name        : nvmlSensorGroup.cpp
// Author      : Fiona Reid, Weronika Filinger, EPCC @ The University of Edinburgh
// Contact     :
// Copyright   : 
// Description : Source file for nvml sensor group class.
//================================================================================

//================================================================================
// This file is part of DCDB (DataCenter DataBase)
// Copyright (C) 2019 Leibniz Supercomputing Centre
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
//================================================================================

#include "nvmlSensorGroup.h"

#include "timestamp.h"

// Used to ensure we get a sensible value of energy by computing the difference 
// between calls to the read function
Weronika's avatar
Weronika committed
34

35
// Flag consumed by nvmlSensorGroup::read() for GPU_ENERGY sensors: while it is 0
// the energy delta is taken against counters.energy_initial; read() sets it to 1
// after the first energy sample so later deltas use the previous sample instead.
static int isfirsttime{0};
36 37 38
// NVML environment shared by the functions in this file.
struct env_t {
	// Device handle; written by nvmlSensorGroup::execOnInit() through
	// nvmlDeviceGetHandleByIndex(0, ...) and passed to every NVML query in read().
	nvmlDevice_t device;
};

// The single file-level instance of the environment.
env_t env;
39

40 41 42 43 44
struct counters_t {
	unsigned long long energy_initial;
	unsigned long long energy_current;
	unsigned long long energy_previous;
	unsigned int temperature;
45
	unsigned int fanspeed;
46 47 48
	unsigned int clockspeed_graphics;
	unsigned int clockspeed_sm;
	unsigned int clockspeed_mem;
49
	nvmlMemory_t memory;
50
	unsigned int power;
51
	unsigned long long ecc_counts;
52
	nvmlUtilization_t utilization;
53
	unsigned int pcie_throughput;
54 55
} counters;

56
// Constructs a sensor group with the given name; the name is simply forwarded
// to the SensorGroupTemplate base and nothing else is initialised here.
nvmlSensorGroup::nvmlSensorGroup(const std::string& name)
	: SensorGroupTemplate(name)
{
}
59 60

// Copy constructor: delegates to the SensorGroupTemplate copy constructor and
// copies nothing beyond what the base copies.
nvmlSensorGroup::nvmlSensorGroup(const nvmlSensorGroup& other)
	: SensorGroupTemplate(other)
{
}
63

Weronika's avatar
Weronika committed
64
// Destructor: performs no work of its own.
nvmlSensorGroup::~nvmlSensorGroup() = default;
65 66

// Copy assignment: assigns the SensorGroupTemplate part of `other` to this
// object and returns *this. No group-specific state is copied here.
nvmlSensorGroup& nvmlSensorGroup::operator=(const nvmlSensorGroup& other) {
	// Let the base class copy everything it owns.
	SensorGroupTemplate::operator=(other);

	// TODO: copy group-specific members here, should any be added to the class.

	return *this;
}

void nvmlSensorGroup::execOnInit() {
77 78 79 80 81 82 83 84
	/* 
	 * TODO
	 * Implement one time initialization logic for this group here
	 * (e.g. allocate memory for buffer) or remove this method if not
	 * required.
	 */
	// FR Add the contents of init_environment in here
	nvmlReturn_t err;
85

86 87 88
	err = nvmlInit();
	err = nvmlDeviceGetHandleByIndex(0,&(env.device));
	err = nvmlDeviceGetTotalEnergyConsumption(env.device,&(counters.energy_initial));
89

90
	// FR 
91 92 93 94

}

bool nvmlSensorGroup::execOnStart() {
95 96 97 98 99 100
	//FR	 
	cudaError_t cerr;
	cerr = cudaProfilerStart();
	// FR

	return true;
101 102 103
}

void nvmlSensorGroup::execOnStop() {
104 105 106 107 108 109 110 111 112
	/* 
	 * TODO
	 * Implement logic when the group stops polling here
	 * (e.g. close a file descriptor) or remove this method if not required.
	 */
	// FR 
	cudaError_t cerr;
	cerr = cudaProfilerStop();
	// FR 
113 114 115
}

void nvmlSensorGroup::read() {
116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146
	reading_t reading;
	reading.timestamp = getTimestamp();
	// FR
	nvmlReturn_t err;     
	unsigned long long temp;
	// FR
	try {
		for(auto s : _sensors) {
			switch(s->getFeatureType()){
				case(GPU_ENERGY):
					// Need to measure the difference in energy used between calls to the read function
					if (isfirsttime==0){
						// First time through we use the initial value to set previous and get the new energy into current
						counters.energy_previous = counters.energy_initial;
						err = nvmlDeviceGetTotalEnergyConsumption(env.device,&(counters.energy_current));
						isfirsttime=1;
					}
					else {
						// Otherwise, set previous energy to whatever it was before and get the new value
						counters.energy_previous=counters.energy_current;
						err = nvmlDeviceGetTotalEnergyConsumption(env.device,&(counters.energy_current));
					}
					temp=counters.energy_current - counters.energy_previous; // Take difference and compute energy in millijoules 
					// You might want to consider putting this in the else block so we always measure something?
					reading.value = temp;
					// FR 
					break;
				case(GPU_POWER):
					err = nvmlDeviceGetPowerUsage(env.device,&(counters.power));
					reading.value = counters.power;
					break;
Weronika's avatar
Weronika committed
147
				case(GPU_TEMP):
148 149 150 151 152 153 154
					err = nvmlDeviceGetTemperature(env.device,NVML_TEMPERATURE_GPU,&(counters.temperature));
					reading.value = counters.temperature;
					break;
				case(GPU_FAN):
					err = nvmlDeviceGetFanSpeed(env.device,&(counters.fanspeed));
					reading.value = counters.fanspeed;
					break;
155 156 157 158
				case(GPU_MEM_USED):
					err = nvmlDeviceGetMemoryInfo (env.device, &(counters.memory));
					reading.value = counters.memory.used;
					break;
159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186
				case(GPU_MEM_TOT):
					err = nvmlDeviceGetMemoryInfo (env.device, &(counters.memory));
					reading.value = counters.memory.total;
					break;
				case(GPU_MEM_FREE):
					err = nvmlDeviceGetMemoryInfo (env.device, &(counters.memory));
					reading.value = counters.memory.free;
					break;
				case(GPU_CLK_GP):
					err = nvmlDeviceGetClock (env.device, NVML_CLOCK_GRAPHICS,NVML_CLOCK_ID_CURRENT,&(counters.clockspeed_graphics));
					reading.value = counters.clockspeed_graphics;
					break;
				case(GPU_CLK_SM):
					err = nvmlDeviceGetClock (env.device, NVML_CLOCK_SM,NVML_CLOCK_ID_CURRENT,&(counters.clockspeed_sm));
					reading.value = counters.clockspeed_sm;
					break;
				case(GPU_CLK_MEM):
					err = nvmlDeviceGetClock (env.device, NVML_CLOCK_MEM,NVML_CLOCK_ID_CURRENT,&(counters.clockspeed_mem));
					reading.value = counters.clockspeed_mem;
					break;
				case(GPU_UTL_MEM):
					err = nvmlDeviceGetUtilizationRates (env.device, &(counters.utilization));
					reading.value = counters.utilization.memory;
					break;
				case(GPU_UTL_GPU):
					err = nvmlDeviceGetUtilizationRates (env.device, &(counters.utilization));
					reading.value = counters.utilization.gpu;
					break;
187 188 189 190
				case(GPU_ECC_ERR):
					err = nvmlDeviceGetTotalEccErrors (env.device, NVML_MEMORY_ERROR_TYPE_CORRECTED,NVML_VOLATILE_ECC,&(counters.ecc_counts));
					reading.value = counters.ecc_counts;
					break;
191 192 193 194
                                case(GPU_PCIE_THRU):
                                        err = nvmlDeviceGetPcieThroughput (env.device, NVML_PCIE_UTIL_COUNT,&(counters.pcie_throughput));
                                        reading.value = counters.pcie_throughput;
                                        break;
195

196 197
			}
			s->storeReading(reading);
198
#ifdef DEBUG
199
			LOG(debug) << _groupName << "::" << s->getName() << " raw reading: \"" << reading.value << "\"";
200
#endif
201 202 203 204
		}
	} catch (const std::exception& e) {
		LOG(error) << "Sensorgroup" << _groupName << " could not read value: " << e.what();
	}
205 206 207
}

/**
 * Logs the configuration of this group for debugging.
 *
 * The group currently has no attributes of its own to print, so only the
 * placeholder line is emitted.
 *
 * @param ll            Log level the output is written with.
 * @param leadingSpaces Number of spaces each output line is indented by.
 */
void nvmlSensorGroup::printGroupConfig(LOG_LEVEL ll, unsigned int leadingSpaces) {
	// Honour the indentation requested by the caller instead of a fixed width.
	const std::string leading(leadingSpaces, ' ');

	/*
	 * TODO
	 * Log attributes here for debug reasons
	 */
	LOG_VAR(ll) << leading << "NumSpacesAsIndention: " << leadingSpaces;
}