//================================================================================
// Name        : dcdbslurmjob.cpp
// Author      : Michael Ott, Micha Mueller
// Copyright   : Leibniz Supercomputing Centre
// Description : Main file of the dcdbslurmjob command line utility
//================================================================================

//================================================================================
// This file is part of DCDB (DataCenter DataBase)
// Copyright (C) 2011-2019 Leibniz Supercomputing Centre
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
//================================================================================

#include "../../common/include/globalconfiguration.h"
#include "timestamp.h"

#include <boost/algorithm/string.hpp>
#include <boost/regex.hpp>
#include <boost/property_tree/json_parser.hpp>
#include <boost/property_tree/ptree.hpp>

#include <dcdb/connection.h>
#include <dcdb/jobdatastore.h>
#include <mosquitto.h>

#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <exception>
#include <iomanip>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

#include <unistd.h>

#include "dcdb/version.h"
#include "version.h"

// Separator inserted between the Slurm job id and the step id when both are known.
#define SLURM_JOBSTEP_SEP "."

// Message id assigned by mosquitto_publish() to the job data message; -1 as
// long as nothing has been published.
int msgId = -1;
// Becomes true once publishCallback() has been invoked for the message msgId.
bool done = false;

/*
 * Mosquitto publish callback: flags completion as soon as the callback is
 * invoked for the message id stored in msgId.
 *
 * mosq and obj are required by the callback signature but not used here.
 */
void publishCallback(struct mosquitto *mosq, void *obj, int mid) {
    (void)mosq;
    (void)obj;

    // Ignore callbacks fired before we published anything or for other messages
    if (msgId == -1 || mid != msgId) {
        return;
    }
    done = true;
}

/*
 * Print usage information to stdout.
 *
 * Lists the two invocation forms (MQTT broker vs. Cassandra), all supported
 * options with their defaults, and the rule for combining -b with -c/-u/-p.
 */
void usage() {
    std::cout << "Usage:" << std::endl;
    std::cout << "  dcdbslurmjob [-b<hosts>] [-q<qos>] [-o<timeout>] [-t<timestamp>] [-n<nodelist>] [-j<jobid>] [-i<userid>] [-s<pattern>] [-m<hours>] start|stop" << std::endl;
    std::cout << "  dcdbslurmjob [-c<hosts>] [-u<username>] [-p<password>] [-t<timestamp>] [-n<nodelist>] [-j<jobid>] [-i<userid>] [-s<pattern>] [-m<hours>] start|stop" << std::endl;
    std::cout << "  dcdbslurmjob -h" << std::endl;
    std::cout << std::endl;

    std::cout << "Options:" << std::endl;
    std::cout << "  -b<hosts>     List of MQTT brokers           [default: localhost:1883]" << std::endl;
    std::cout << "  -q<qos>       MQTT QoS to use                [default: 1]" << std::endl;
    std::cout << "  -o<timeout>   MQTT timeout in seconds        [default: 10]" << std::endl;
    std::cout << "  -c<hosts>     List of Cassandra hosts        [default: none]" << std::endl;
    std::cout << "  -u<username>  Cassandra username             [default: none]" << std::endl;
    std::cout << "  -p<password>  Cassandra password             [default: none]" << std::endl;
    std::cout << "  -t<timestamp> Timestamp value                [default: now]" << std::endl;
    std::cout << "  -n<nodelist>  Comma-separated nodelist       [default: SLURM_JOB_NODELIST]" << std::endl;
    std::cout << "  -j<jobid>     Numerical job id               [default: SLURM_JOB_ID var]" << std::endl;
    std::cout << "  -i<userid>    Numerical user id              [default: SLURM_JOB_USER var]" << std::endl;
    std::cout << "  -s<pattern>   Nodelist substitution pattern  [default: none]" << std::endl;
    // The value of -m is a number of hours, not a pattern
    std::cout << "  -m<hours>     Maximum job length in h        [default: none]" << std::endl;
    std::cout << std::endl;
    std::cout << "  -h            This help page" << std::endl;
    std::cout << std::endl;
    std::cout << "Options -b and -c|u|p are mutually exclusive! If both are specified, the latter takes precedence. By default MQTT broker is specified." << std::endl;
}

/*
 * Look up an environment variable.
 *
 * Returns the value of the variable named var, or an empty string if the
 * variable is not set.
 */
std::string getEnv(const char* var) {
    const char* value = std::getenv(var);
    return (value != nullptr) ? std::string(value) : std::string();
}

void splitNodeList(const std::string& str, DCDB::NodeList& nl)
Alessio Netti's avatar
Alessio Netti committed
91
92
{
    nl.clear();
93
94
95
96
    std::string s1 = str;
    boost::regex r1("([^,[]+)(\\[[0-9,-]+\\])?(,|$)", boost::regex::extended);
    boost::smatch m1;
    while (boost::regex_search(s1, m1, r1)) {
97
98
99
100
		std::string hostBase = m1[1].str();
		
		if (m1[2].str().size() == 0) {
			nl.push_back(hostBase);
101
		} else {
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
			std::string s2 = m1[2].str();
			boost::regex r2("([0-9]+)-?([0-9]+)?(,|\\])", boost::regex::extended);
			boost::smatch m2;
			while (boost::regex_search(s2, m2, r2)) {
				if (m2[2] == "") {
					nl.push_back(hostBase + m2[1].str());
				} else {
					int start = atoi(m2[1].str().c_str());
					int stop = atoi(m2[2].str().c_str());
					for (int i=start; i<=stop; i++) {
						std::stringstream ss;
						ss << std::setw(m2[2].str().length()) << std::setfill('0') << i;
						nl.push_back(hostBase + ss.str());
					}
				}
				s2 = m2.suffix().str();
			}
119
		}
120
		s1 = m1.suffix().str();
121
122
123
    }
}

124
125
126
127
128
129
/*
 * Rewrite every node name in nl according to a sed-like substitution.
 *
 * substitution has to look like "s<d>regex<d>format<d>", where <d> is a single
 * delimiter character other than a backslash. The regex is compiled with the
 * POSIX extended grammar and applied to each node name via
 * boost::regex_replace. If substitution does not have this form (e.g. it is
 * empty), nl is left untouched.
 */
void convertNodeList(DCDB::NodeList& nl, std::string substitution) {
    //check if input has sed format of "s/.../.../" for substitution
    const boost::regex checkSubstitute("s([^\\\\]{1})([\\S|\\s]*)\\1([\\S|\\s]*)\\1");
    boost::smatch matchResults;

    if (!boost::regex_match(substitution, matchResults, checkSubstitute)) {
        //no substitute format, nothing to convert
        return;
    }

    //input has substitute format: group 2 is the search regex, group 3 the replacement
    const boost::regex re(matchResults[2].str(), boost::regex_constants::extended);
    const std::string fmt = matchResults[3].str();
    for (auto &n : nl) {
        n = boost::regex_replace(n, re, fmt);
    }
}

/*
 * Split a delimiter-separated host list into its entries.
 *
 * str    list of hosts, e.g. "host1:1883,host2"
 * hl     receives the entries in order; any previous content is discarded
 * delim  separator character, ',' by default
 *
 * Empty entries between two delimiters are kept as empty strings; a single
 * trailing delimiter does not produce an additional empty entry.
 */
void splitHostList(const std::string& str, std::vector<std::string>& hl, char delim = ',')
{
    hl.clear();
    std::istringstream input(str);
    for (std::string entry; std::getline(input, entry, delim); ) {
        hl.push_back(entry);
    }
}

/*
 * Pick a random entry from a host list and split it into host and port.
 *
 * hl     list of candidate hosts
 * host   receives the result of parseNetworkHost() for the chosen entry
 * port   receives the numeric value of parseNetworkPort() for the chosen
 *        entry (atoi semantics, i.e. 0 if it is not a number)
 * erase  if true, the chosen entry is removed from hl
 *
 * For an empty hl, host is cleared and port is set to 0.
 */
void pickRandomHost(std::vector<std::string>& hl, std::string& host, int& port, bool erase = false) {
    // Nothing to choose from; also avoids a modulo by zero below
    if (hl.empty()) {
        host.clear();
        port = 0;
        return;
    }

    srand(time(nullptr));
    const int n = static_cast<int>(rand() % hl.size());
    host = parseNetworkHost(hl[n]);
    port = atoi(parseNetworkPort(hl[n]).c_str());
    if (erase) {
        hl.erase(hl.begin() + n);
    }
}

/**
161
162
163
 * Retrieves Slurm job data from environment variables and sends it to either a
 * CollectAgent or a Cassandra database. Job data can also be passed as command
 * line options.
Micha Müller's avatar
Micha Müller committed
164
 */
165
166
int main(int argc, char** argv) {
    std::cout << "dcdbslurmjob " << VERSION << std::endl << std::endl;
167

168
    bool cassandra = false;
169
170
171
172
    DCDB::Connection *  dcdbConn = nullptr;
    DCDB::JobDataStore *myJobDataStore = nullptr;
    struct mosquitto *  _mosq = nullptr;

173
174
    std::vector<std::string> hostList;
    std::string host = "", cassandraUser = "", cassandraPassword = "";
175
176
    int port;
    std::string nodelist="", jobId="", userId="", stepId="";
177
    std::string substitution="";
178
    int maxJobLength = -1;
Michael Ott's avatar
Michael Ott committed
179
    int qos = 1;
180
    int timeout = 10;
Alessio Netti's avatar
Alessio Netti committed
181
    uint64_t ts=0;
182
    
Alessio Netti's avatar
Alessio Netti committed
183
    // Defining options
184
    const char *opts = "b:q:o:c:u:p:n:t:j:i:s:m:h";
Alessio Netti's avatar
Alessio Netti committed
185
186
187
188
189
190
191
192
193
194
195

    char ret;
    while ((ret = getopt(argc, argv, opts))!=-1) {
        switch (ret)
        {
            case 'h':
                usage();
                return 0;
            default:
                break;
        }
196
197
    }
    
Alessio Netti's avatar
Alessio Netti committed
198
199
200
201
202
203
    if (argc < 2) {
        std::cerr << "At least one argument is required: start or stop" << std::endl;
        return 1;
    } else if(!boost::iequals(argv[argc-1], "start") && !boost::iequals(argv[argc-1], "stop")) {
        std::cerr << "Unsupported action: must either be start or stop" << std::endl;
        return 1;
204
205
    }
    
Alessio Netti's avatar
Alessio Netti committed
206
207
208
    optind = 1;
    while ((ret=getopt(argc, argv, opts))!=-1) {
        switch(ret) {
Micha Müller's avatar
Micha Müller committed
209
		case 'b': {
210
			cassandra = false;
211
			splitHostList(optarg, hostList);
Micha Müller's avatar
Micha Müller committed
212
213
			break;
		}
214
215
216
		case 'q':
			qos = atoi(optarg);
			break;
217
218
219
		case 'o':
			timeout = atoi(optarg);
			break;
220
221
		case 'c':
			cassandra = true;
222
			splitHostList(optarg, hostList);
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
			break;
		case 'u':
			cassandra = true;
			cassandraUser = optarg;
			break;
		case 'p': {
			cassandra = true;
			cassandraPassword = optarg;
			// What does this do? Mask the password?
			size_t pwdLen = strlen(optarg);
			memset(optarg, 'x', (pwdLen >= 3) ? 3 : pwdLen);
			if (pwdLen > 3) {
				memset(optarg + 3, 0, pwdLen - 3);
			}
			break;
		}
Micha Müller's avatar
Micha Müller committed
239
	    case 'n':
240
241
242
243
244
245
246
247
248
249
250
			nodelist = optarg;
			break;
		case 't':
			ts = std::stoull(optarg);
			break;
		case 'j':
			jobId = optarg;
			break;
		case 'i':
			userId = optarg;
			break;
251
	    case 's':
252
253
254
255
256
257
258
259
260
			substitution = optarg;
			if (substitution == "SNG") {
				substitution = "s%([fi][0-9]{2})(r[0-9]{2})(c[0-9]{2})(s[0-9]{2})%/sng/\\1/\\2/\\3/\\4%";
				maxJobLength = 48;
			} else if (substitution == "DEEPEST") {
				substitution = "s%dp-(cn|dam|esb)([0-9]{2})%/deepest/\\1/s\\2%";
				maxJobLength = 20;
			}
			break;
261
	    case 'm':
262
263
264
265
266
267
			maxJobLength = std::stoull(optarg);
			break;
		case 'h':
		default:
			usage();
			return 1;
Alessio Netti's avatar
Alessio Netti committed
268
269
        }
    }
270
271
    
    if (hostList.size() == 0) {
272
		hostList.push_back("localhost");
273
    }
Alessio Netti's avatar
Alessio Netti committed
274

275
276
    if (cassandra) {
	    //Allocate and initialize connection to Cassandra.
277
278
	    pickRandomHost(hostList, host, port);
	    if (port == 0) {
279
			port = 9042;
280
281
282
	    }
	
	    dcdbConn = new DCDB::Connection(host, port, cassandraUser, cassandraPassword);
283
	    if (!dcdbConn->connect()) {
284
		    std::cerr << "Cannot connect to Cassandra server " << host << ":" << port << std::endl;
285
286
		    return 1;
	    }
287
	    std::cout << "Connected to Cassandra server " << host << ":" << port << std::endl;
288
289
290
291
	    myJobDataStore = new DCDB::JobDataStore(dcdbConn);
    } else {
	    //Initialize Mosquitto library and connect to broker
	    char hostname[256];
Micha Müller's avatar
Micha Müller committed
292

293
294
295
296
297
298
299
300
301
302
303
304
	    if (gethostname(hostname, 255) != 0) {
		    std::cerr << "Cannot get hostname!" << std::endl;
		    return 1;
	    }
	    hostname[255] = '\0';
	    mosquitto_lib_init();
	    _mosq = mosquitto_new(hostname, false, NULL);
	    if (!_mosq) {
		    perror(NULL);
		    return 1;
	    }

305
306
307
308
309
310
311
312
313
314
315
316
317
318
	    int ret = MOSQ_ERR_UNKNOWN;
	    do {
		    pickRandomHost(hostList, host, port, true);
		    if (port == 0) {
			    port = 1883;
		    }
		    
		    if ((ret = mosquitto_connect(_mosq, host.c_str(), port, 1000)) != MOSQ_ERR_SUCCESS) {
			    std::cerr << "Could not connect to MQTT broker " << host << ":" << port << " (" << mosquitto_strerror(ret) << ")" <<std::endl;
		    } else {
			    std::cout << "Connected to MQTT broker " << host << ":" << port << ", using QoS " << qos << std::endl;
			    break;
		    }
	    } while (hostList.size() > 0);
319
	    
320
321
	    if (ret != MOSQ_ERR_SUCCESS) {
		    std::cout << "No more MQTT brokers left, aborting" << std::endl;
322
323
		    return 1;
	    }
Alessio Netti's avatar
Alessio Netti committed
324
    }
Micha Müller's avatar
Micha Müller committed
325
326

    //collect job data
Alessio Netti's avatar
Alessio Netti committed
327
    DCDB::JobData jd;
328
    int retCode = 0;
Micha Müller's avatar
Micha Müller committed
329

Alessio Netti's avatar
Alessio Netti committed
330
331
    if(ts==0)
        ts = getTimestamp();
332
    
333
    if(jobId=="") {
334
335
336
337
338
339
340
341
342
343
344
		jobId = getEnv("SLURM_JOB_ID");
		if (jobId == "") {
			jobId = getEnv("SLURM_JOBID");
		}

		stepId = getEnv("SLURM_STEP_ID");
		if (stepId=="") {
			stepId = getEnv("SLURM_STEPID");
		}
		if (stepId!="" && jobId!="")
			jobId = jobId + SLURM_JOBSTEP_SEP + stepId;
345
	}
346
	
Alessio Netti's avatar
Alessio Netti committed
347
    if (boost::iequals(argv[argc-1], "start")) {
348
349
350
351
352
353
        if(userId=="") {
			userId = getEnv("SLURM_JOB_USER");
			if(userId=="") {
				userId = getEnv("USER");
			}
		}
Alessio Netti's avatar
Alessio Netti committed
354
        
355
		if(nodelist=="") {
Alessio Netti's avatar
Alessio Netti committed
356
            nodelist = getEnv("SLURM_JOB_NODELIST");
357
358
359
360
	    	if(nodelist=="") {
				nodelist = getEnv("SLURM_NODELIST");
	    	}
		}
361

362
363
364
		DCDB::NodeList nl;
		splitNodeList(nodelist, nl);
		convertNodeList(nl, substitution);
365

366
		std::cout << "JOBID    = " << jobId << std::endl;
367
368
369
        std::cout << "USER     = " << userId << std::endl;
        std::cout << "START    = " << ts << std::endl;
        std::cout << "NODELIST = " << nodelist << std::endl;
370
371
372
373
374
375
376
377
378
379
		std::cout << "SUBST    = " << substitution << std::endl;
		if (maxJobLength >= 0) {
			std::cout << "JOBLEN   = " << maxJobLength << std::endl;
		}
		std::cout << "NODES    =";
		for (auto &n: nl) {
			std::cout << " " << n;
		}
		std::cout << std::endl;
		
Alessio Netti's avatar
Alessio Netti committed
380
        try {
381
382
            jd.jobId     = jobId;
            jd.userId    = userId;
Alessio Netti's avatar
Alessio Netti committed
383
            jd.startTime = DCDB::TimeStamp(ts);
Michael Ott's avatar
Michael Ott committed
384
            jd.endTime   = (maxJobLength >= 0) ? DCDB::TimeStamp((uint64_t) (ts + S_TO_NS((uint64_t)maxJobLength * 3600ull) + 1)) : DCDB::TimeStamp((uint64_t)0);
Alessio Netti's avatar
Alessio Netti committed
385
386
            jd.nodes     = nl;
        } catch(const std::invalid_argument& e) {
387
388
389
390
			std::cerr << "Invalid input format!" << std::endl;
			retCode = 1;
			goto exit;
		}
391

392
393
394
395
396
		if (cassandra && (myJobDataStore->insertJob(jd) != DCDB::JD_OK)) {
			std::cerr << "Job data insert failed!" << std::endl;
			retCode = 1;
			goto exit;
		}
Alessio Netti's avatar
Alessio Netti committed
397
398
399
400
401
    } else if (boost::iequals(argv[argc-1], "stop")) {
        std::cout << "JOBID = " << jobId << std::endl;
        std::cout << "STOP  = " << ts << std::endl;
        
        try {
402
403
404
405
			jd.jobId = jobId;
			jd.endTime = DCDB::TimeStamp(ts);
		} catch (const std::invalid_argument &e) {
			std::cerr << "Invalid input format!" << std::endl;
406
407
408
			retCode = 1;
			goto exit;
		}
409
410
411
412
413
414
415
416
417
418
419
420
421

		if (cassandra) {
			DCDB::JobData jobStart;
			if (myJobDataStore->getJobById(jobStart, jd.jobId) != DCDB::JD_OK) {
				std::cerr << "Could not retrieve job to be updated!" << std::endl;
				retCode = 1;
				goto exit;
			}
			if (myJobDataStore->updateEndtime(jobStart.jobId, jobStart.startTime, jd.endTime) != DCDB::JD_OK) {
				std::cerr << "Could not update end time of job!" << std::endl;
				retCode = 1;
				goto exit;
			}
422
		}
Micha Müller's avatar
Micha Müller committed
423
424
    }

425
426
427
428
    //Message sent to CollectAgent is independent of start/stop. We send the
    //same JSON in either case. CA does job insert or update depending
    //on job endtime value.
    if (!cassandra) {
Micha Müller's avatar
Micha Müller committed
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
	    //create job data string in JSON format
	    std::string                 payload = "";
	    std::string                 topic = "/DCDB_JOBDATA/"; //do not change or keep in sync with simplemqttservermessage.h
	    boost::property_tree::ptree config;
	    std::ostringstream          output;
	    config.clear();
	    config.push_back(boost::property_tree::ptree::value_type("jobid", boost::property_tree::ptree(jd.jobId)));
	    config.push_back(boost::property_tree::ptree::value_type("userid", boost::property_tree::ptree(jd.userId)));
	    config.push_back(boost::property_tree::ptree::value_type("starttime", boost::property_tree::ptree(std::to_string(jd.startTime.getRaw()))));
	    config.push_back(boost::property_tree::ptree::value_type("endtime", boost::property_tree::ptree(std::to_string(jd.endTime.getRaw()))));
	    boost::property_tree::ptree nodes;
	    for (const auto &n : jd.nodes) {
		    nodes.push_back(boost::property_tree::ptree::value_type("", boost::property_tree::ptree(n)));
	    }
	    config.push_back(boost::property_tree::ptree::value_type("nodes", nodes));
	    boost::property_tree::write_json(output, config, true);
	    payload = output.str();

	    //std::cout << "Payload:\n" << payload << std::endl;
448
449
450
	    
	    mosquitto_publish_callback_set(_mosq, publishCallback);
	    uint64_t startTs = getTimestamp();
451
	    int ret = MOSQ_ERR_UNKNOWN;
Micha Müller's avatar
Micha Müller committed
452
	    //send it to broker
453
454
	    if ((ret = mosquitto_publish(_mosq, &msgId, topic.c_str(), payload.length(), payload.c_str(), qos, false)) != MOSQ_ERR_SUCCESS) {
		    std::cerr << "Could not publish job data via MQTT: " << mosquitto_strerror(ret) << std::endl;
Micha Müller's avatar
Micha Müller committed
455
456
457
		    retCode = 1;
		    goto exit;
	    }
458
459
	    
	    do {
460
461
462
463
464
		    if ((ret = mosquitto_loop(_mosq, -1, 1)) != MOSQ_ERR_SUCCESS) {
			    std::cerr << "Error in mosquitto_loop: " << mosquitto_strerror(ret) << std::endl;
			    retCode = 1;
			    goto exit;
		    }
465
	    } while(!done && getTimestamp() - startTs < (uint64_t)S_TO_NS(timeout));
Alessio Netti's avatar
Alessio Netti committed
466
467
    }

Micha Müller's avatar
Micha Müller committed
468
469
//hasta la vista
exit:
470
471
472
473
474
475
476
477
478
	if (cassandra) {
		delete myJobDataStore;
		dcdbConn->disconnect();
		delete dcdbConn;
	} else {
		mosquitto_disconnect(_mosq);
		mosquitto_destroy(_mosq);
		mosquitto_lib_cleanup();
	}
Micha Müller's avatar
Micha Müller committed
479
	return retCode;
480
}