Job artifacts in CI/CD pipelines on LRZ GitLab currently never expire. Starting Wednesday, 26.1.2022, the default expiration time will be 30 days (the GitLab default). Existing artifacts of already completed jobs will not be affected by this change. The latest artifacts of all jobs in the latest successful pipelines will be kept. More information: https://gitlab.lrz.de/help/user/admin_area/settings/continuous_integration.html#default-artifacts-expiration

Commit 406c7fd9 authored by Alessio Netti
Browse files

Tools: integrating domain ID for jobs

- Jobs can now be inserted and queried with arbitrary domain IDs
- Collect agent uses domain ID accordingly during inserts
parent f25fe159
......@@ -423,7 +423,9 @@ int mqttCallback(SimpleMQTTMessage *msg)
BOOST_FOREACH (boost::property_tree::iptree::value_type &val, config) {
if (boost::iequals(val.first, "jobid")) {
jd.jobId = val.second.data();
} else if (boost::iequals(val.first, "userid")) {
} else if (boost::iequals(val.first, "domainid")) {
jd.domainId = val.second.data();
} else if (boost::iequals(val.first, "userid")) {
jd.userId = val.second.data();
} else if (boost::iequals(val.first, "starttime")) {
jd.startTime = DCDB::TimeStamp((uint64_t)stoull(val.second.data()));
......@@ -462,12 +464,12 @@ int mqttCallback(SimpleMQTTMessage *msg)
} else {
//ending job data
DCDB::JobData tmp;
if (myJobDataStore->getJobById(tmp, jd.jobId) != DCDB::JD_OK) {
if (myJobDataStore->getJobById(tmp, jd.jobId, jd.domainId) != DCDB::JD_OK) {
LOG(error) << "Could not retrieve job to be updated!";
return 1;
}
if (myJobDataStore->updateEndtime(tmp.jobId, tmp.startTime, jd.endTime) != DCDB::JD_OK) {
if (myJobDataStore->updateEndtime(tmp.jobId, tmp.startTime, jd.endTime, jd.domainId) != DCDB::JD_OK) {
LOG(error) << "Could not update end time of job!";
return 1;
}
......
......@@ -100,17 +100,17 @@ Sensor Properties:
Listing all jobs stored in the database that are currently running:
```bash
dcdbconfig -h 127.0.0.1 job running
dcdbconfig -h 127.0.0.1 job running SYSTEM1
```
Sample output:
```
dcdbconfig 0.4.135-gb57cedf (libdcdb 0.4.135-gb57cedf)
dcdbconfig 0.4.160-g48ce9d0 (libdcdb 0.4.161-gf25fe15)
Job ID, User ID
800475,di726bwe
800499,ga526ppd
Domain ID, Job ID, User ID, Start Time, End Time, #Nodes
SYSTEM1,800475,di726bwe,1597992340671657000,0,128
SYSTEM1,800499,ga526ppd,1597992340522786000,0,16
```
---
......@@ -118,18 +118,19 @@ Job ID, User ID
Showing the information associated with a specific job ID:
```bash
dcdbconfig -h 127.0.0.1 job show 800475
dcdbconfig -h 127.0.0.1 job show 800475 SYSTEM1
```
Sample output:
```
dcdbconfig 0.4.135-gb57cedf (libdcdb 0.4.135-gb57cedf)
dcdbconfig 0.4.160-g48ce9d0 (libdcdb 0.4.161-gf25fe15)
Domain ID: SYSTEM1
Job ID: 800475
User ID: di726bwe
Start Time: 1595314022854049000
End Time: 0
Start Time: 2020-07-21T06:47:30.854049000 (1595314022854049000)
End Time: 1970-01-01T01:00:00 (3600000000000)
Node List: /mpp2/r03/c04/s02/, /mpp2/r04/c04/s04/, /mpp2/r05/c05/s02/
```
......
......@@ -40,11 +40,13 @@ void JobAction::printHelp(int argc, char* argv[])
/* 01234567890123456789012345678901234567890123456789012345678901234567890123456789 */
std::cout << "JOB command help" << std::endl << std::endl;
std::cout << "The JOB command has the following options:" << std::endl;
std::cout << " SHOW <jobid> - Shows information for a certain <jobid>" << std::endl;
std::cout << " LIST - Lists all job IDs stored in the database" << std::endl;
std::cout << " RUNNING - Lists all currently running jobs" << std::endl;
std::cout << " PENDING - Lists all jobs that have not yet started" << std::endl;
std::cout << " FINISHED - Lists all jobs that have already terminated" << std::endl;
std::cout << " SHOW <jobid> <domainid> - Shows information for a certain <jobid>" << std::endl;
std::cout << " LIST <domainid> - Lists all job IDs stored in the database" << std::endl;
std::cout << " RUNNING <domainid> - Lists all currently running jobs" << std::endl;
std::cout << " PENDING <domainid> - Lists all jobs that have not yet started" << std::endl;
std::cout << " FINISHED <domainid> - Lists all jobs that have already terminated" << std::endl;
std::cout << std::endl;
std::cout << "The <domainid> argument is optional and defines the job domain to query." << std::endl;
}
/*
......@@ -74,19 +76,24 @@ int JobAction::executeCommand(int argc, char* argv[], int argvidx, const char* h
std::cout << "SHOW needs one more parameter!" << std::endl;
goto executeCommandError;
}
doShow(argv[argvidx+1]);
std::string domainId = argvidx+2 >= argc ? JOB_DEFAULT_DOMAIN : argv[argvidx+2];
doShow(argv[argvidx+1], domainId);
}
else if (strcasecmp(argv[argvidx], "LIST") == 0) {
doList();
std::string domainId = argvidx+1 >= argc ? JOB_DEFAULT_DOMAIN : argv[argvidx+1];
doList(domainId);
}
else if (strcasecmp(argv[argvidx], "RUNNING") == 0) {
doRunning();
std::string domainId = argvidx+1 >= argc ? JOB_DEFAULT_DOMAIN : argv[argvidx+1];
doRunning(domainId);
}
else if (strcasecmp(argv[argvidx], "PENDING") == 0) {
doPending();
std::string domainId = argvidx+1 >= argc ? JOB_DEFAULT_DOMAIN : argv[argvidx+1];
doPending(domainId);
}
else if (strcasecmp(argv[argvidx], "FINISHED") == 0) {
doFinished();
std::string domainId = argvidx+1 >= argc ? JOB_DEFAULT_DOMAIN : argv[argvidx+1];
doFinished(domainId);
}
else {
std::cout << "Invalid JOB command: " << argv[argvidx] << std::endl;
......@@ -104,10 +111,10 @@ int JobAction::executeCommand(int argc, char* argv[], int argvidx, const char* h
return EXIT_FAILURE;
}
void JobAction::doShow(std::string jobId) {
void JobAction::doShow(std::string jobId, std::string domainId) {
DCDB::JobDataStore jobDataStore(connection);
DCDB::JobData jobData;
DCDB::JDError err = jobDataStore.getJobById(jobData, jobId);
DCDB::JDError err = jobDataStore.getJobById(jobData, jobId, domainId);
std::list<std::string>::iterator nIt;
switch (err) {
......@@ -118,6 +125,7 @@ void JobAction::doShow(std::string jobId) {
}
jobData.startTime.convertToLocal();
jobData.endTime.convertToLocal();
std::cout << "Domain ID: " << jobData.domainId << std::endl;
std::cout << "Job ID: " << jobId << std::endl;
std::cout << "User ID: " << jobData.userId << std::endl;
std::cout << "Start Time: " << jobData.startTime.getString() << " (" << jobData.startTime.getRaw() << ")" << std::endl;
......@@ -135,10 +143,10 @@ void JobAction::doShow(std::string jobId) {
std::cout << std::endl;
break;
case DCDB::JD_JOBKEYNOTFOUND:
std::cout << "Job key not found: " << jobId << std::endl;
std::cout << "Job key " << jobId << " with domain ID " << domainId << "not found." << std::endl;
break;
case DCDB::JD_JOBIDNOTFOUND:
std::cout << "Job ID not found: " << jobId << std::endl;
std::cout << "Job ID " << jobId << " with domain ID " << domainId << "not found." << std::endl;
break;
default:
std::cout << "Internal error." << std::endl;
......@@ -146,19 +154,19 @@ void JobAction::doShow(std::string jobId) {
}
void JobAction::printList(std::list<DCDB::JobData>& jobList) {
std::cout << "Job ID, User ID, Start Time, End Time, #Nodes" << std::endl;
std::cout << "Domain ID, Job ID, User ID, Start Time, End Time, #Nodes" << std::endl;
for(const auto &j : jobList) {
std::cout << j.jobId << "," << j.userId << "," << j.startTime.getRaw() << "," << j.endTime.getRaw() << "," << j.nodes.size() << std::endl;
std::cout << j.domainId << "," << j.jobId << "," << j.userId << "," << j.startTime.getRaw() << "," << j.endTime.getRaw() << "," << j.nodes.size() << std::endl;
}
std::cout << std::endl;
}
void JobAction::doList() {
void JobAction::doList(std::string domainId) {
DCDB::JobDataStore jobDataStore(connection);
DCDB::TimeStamp tsEnd((uint64_t)LLONG_MAX);
DCDB::TimeStamp tsStart((uint64_t)0);
std::list<DCDB::JobData> jobList;
DCDB::JDError err = jobDataStore.getJobsInIntervalIncl(jobList, tsStart, tsEnd);
DCDB::JDError err = jobDataStore.getJobsInIntervalIncl(jobList, tsStart, tsEnd, domainId);
switch (err) {
case DCDB::JD_OK:
case DCDB::JD_PARSINGERROR:
......@@ -172,12 +180,12 @@ void JobAction::doList() {
}
}
void JobAction::doPending() {
void JobAction::doPending(std::string domainId) {
DCDB::JobDataStore jobDataStore(connection);
DCDB::TimeStamp tsEnd;
DCDB::TimeStamp tsStart(tsEnd.getRaw() - JOB_ACTION_OFFSET);
std::list<DCDB::JobData> jobList;
DCDB::JDError err = jobDataStore.getJobsInIntervalPending(jobList, tsStart, tsEnd);
DCDB::JDError err = jobDataStore.getJobsInIntervalPending(jobList, tsStart, tsEnd, domainId);
switch (err) {
case DCDB::JD_OK:
case DCDB::JD_PARSINGERROR:
......@@ -191,12 +199,12 @@ void JobAction::doPending() {
}
}
void JobAction::doRunning() {
void JobAction::doRunning(std::string domainId) {
DCDB::JobDataStore jobDataStore(connection);
DCDB::TimeStamp tsEnd;
DCDB::TimeStamp tsStart(tsEnd.getRaw() - JOB_ACTION_OFFSET);
std::list<DCDB::JobData> jobList;
DCDB::JDError err = jobDataStore.getJobsInIntervalRunning(jobList, tsStart, tsEnd);
DCDB::JDError err = jobDataStore.getJobsInIntervalRunning(jobList, tsStart, tsEnd, domainId);
switch (err) {
case DCDB::JD_OK:
case DCDB::JD_PARSINGERROR:
......@@ -210,12 +218,12 @@ void JobAction::doRunning() {
}
}
void JobAction::doFinished() {
void JobAction::doFinished(std::string domainId) {
DCDB::JobDataStore jobDataStore(connection);
DCDB::TimeStamp tsEnd;
DCDB::TimeStamp tsStart((uint64_t)0);
std::list<DCDB::JobData> jobList;
DCDB::JDError err = jobDataStore.getJobsInIntervalFinished(jobList, tsStart, tsEnd);
DCDB::JDError err = jobDataStore.getJobsInIntervalFinished(jobList, tsStart, tsEnd, domainId);
switch (err) {
case DCDB::JD_OK:
case DCDB::JD_PARSINGERROR:
......
......@@ -53,11 +53,11 @@ public:
protected:
DCDB::Connection* connection;
void doShow(std::string jobId);
void doList();
void doRunning();
void doFinished();
void doPending();
void doShow(std::string jobId, std::string domainId);
void doList(std::string domainId);
void doRunning(std::string domainId);
void doFinished(std::string domainId);
void doPending(std::string domainId);
private:
void printList(std::list<DCDB::JobData>& jobList);
......
......@@ -54,7 +54,7 @@ void publishCallback(struct mosquitto *mosq, void *obj, int mid) {
*/
void usage() {
std::cout << "Usage:" << std::endl;
std::cout << " dcdbslurmjob [-b<host>] [-t<timestamp>] [-n<nodelist>] [-j<jobid>] [-i<userid>] start|stop" << std::endl;
std::cout << " dcdbslurmjob [-b<host>] [-t<timestamp>] [-n<nodelist>] [-d<domainid>] [-j<jobid>] [-i<userid>] start|stop" << std::endl;
std::cout << " dcdbslurmjob [-c<host>] [-u<username>] [-p<password>] [-t<timestamp>] [-n<nodelist>] [-j<jobid>] [-i<userid>] [-s<pattern>] start|stop" << std::endl;
std::cout << " dcdbslurmjob -h" << std::endl;
std::cout << std::endl;
......@@ -68,7 +68,8 @@ void usage() {
std::cout << " -p<password> Cassandra password [default: none]" << std::endl;
std::cout << " -t<timestamp> Timestamp value [default: now]" << std::endl;
std::cout << " -n<nodelist> Comma-separated nodelist [default: SLURM_JOB_NODELIST]" << std::endl;
std::cout << " -j<jobid> Numerical job id [default: SLURM_JOB_ID var]" << std::endl;
std::cout << " -d<domainid> Job domain id [default: default]" << std::endl;
std::cout << " -j<jobid> String job id [default: SLURM_JOB_ID var]" << std::endl;
std::cout << " -i<userid> Numerical user id [default: SLURM_JOB_USER var]" << std::endl;
std::cout << " -s<pattern> Nodelist substitution pattern [default: none]" << std::endl;
std::cout << " -m<pattern> Maximum job length in h [default: none]" << std::endl;
......@@ -174,6 +175,7 @@ int main(int argc, char** argv) {
std::string host = "", cassandraUser = "", cassandraPassword = "";
int port;
std::string nodelist="", jobId="", userId="", stepId="";
std::string domainId = JOB_DEFAULT_DOMAIN;
std::string substitution="";
int maxJobLength = -1;
int qos = 1;
......@@ -181,7 +183,7 @@ int main(int argc, char** argv) {
uint64_t ts=0;
// Defining options
const char *opts = "b:q:o:c:u:p:n:t:j:i:s:m:h";
const char *opts = "b:q:o:c:u:p:n:t:d:j:i:s:m:h";
char ret;
while ((ret = getopt(argc, argv, opts))!=-1) {
......@@ -242,6 +244,9 @@ int main(int argc, char** argv) {
case 't':
ts = std::stoull(optarg);
break;
case 'd':
domainId = optarg;
break;
case 'j':
jobId = optarg;
break;
......@@ -363,6 +368,7 @@ int main(int argc, char** argv) {
splitNodeList(nodelist, nl);
convertNodeList(nl, substitution);
std::cout << "DOMAINID = " << domainId << std::endl;
std::cout << "JOBID = " << jobId << std::endl;
std::cout << "USER = " << userId << std::endl;
std::cout << "START = " << ts << std::endl;
......@@ -378,6 +384,7 @@ int main(int argc, char** argv) {
std::cout << std::endl;
try {
jd.domainId = domainId;
jd.jobId = jobId;
jd.userId = userId;
jd.startTime = DCDB::TimeStamp(ts);
......@@ -395,10 +402,12 @@ int main(int argc, char** argv) {
goto exit;
}
} else if (boost::iequals(argv[argc-1], "stop")) {
std::cout << "JOBID = " << jobId << std::endl;
std::cout << "STOP = " << ts << std::endl;
std::cout << "DOMAINID = " << domainId << std::endl;
std::cout << "JOBID = " << jobId << std::endl;
std::cout << "STOP = " << ts << std::endl;
try {
jd.domainId = domainId;
jd.jobId = jobId;
jd.endTime = DCDB::TimeStamp(ts);
} catch (const std::invalid_argument &e) {
......@@ -409,12 +418,12 @@ int main(int argc, char** argv) {
if (cassandra) {
DCDB::JobData jobStart;
if (myJobDataStore->getJobById(jobStart, jd.jobId) != DCDB::JD_OK) {
if (myJobDataStore->getJobById(jobStart, jd.jobId, jd.domainId) != DCDB::JD_OK) {
std::cerr << "Could not retrieve job to be updated!" << std::endl;
retCode = 1;
goto exit;
}
if (myJobDataStore->updateEndtime(jobStart.jobId, jobStart.startTime, jd.endTime) != DCDB::JD_OK) {
if (myJobDataStore->updateEndtime(jobStart.jobId, jobStart.startTime, jd.endTime, jd.domainId) != DCDB::JD_OK) {
std::cerr << "Could not update end time of job!" << std::endl;
retCode = 1;
goto exit;
......@@ -432,6 +441,7 @@ int main(int argc, char** argv) {
boost::property_tree::ptree config;
std::ostringstream output;
config.clear();
config.push_back(boost::property_tree::ptree::value_type("domainid", boost::property_tree::ptree(jd.domainId)));
config.push_back(boost::property_tree::ptree::value_type("jobid", boost::property_tree::ptree(jd.jobId)));
config.push_back(boost::property_tree::ptree::value_type("userid", boost::property_tree::ptree(jd.userId)));
config.push_back(boost::property_tree::ptree::value_type("starttime", boost::property_tree::ptree(std::to_string(jd.startTime.getRaw()))));
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment