04.06., 9:00 - 12:00: GitLab will be migrated to a new server environment and upgraded to Enterprise Edition Ultimate. The estimated downtime will be 2-3 hours. Please see https://doku.lrz.de/display/PUBLIC/GitLab+Ultimate+Migration for more details about changes related to the migration.

Commit ef19b612 authored by Phillip Samfass

Fixed an issue that could delay the master thread's processing of time-critical skeleton jobs

parent 6e76f34b
Pipeline #150848 failed with stage
......@@ -186,6 +186,7 @@ def parseResultFile(filePath):
try:
fileHandle=codecs.open(filePath,'r','UTF_8')
for line in fileHandle:
if line.startswith("sweep/environment"):
value = line.replace("sweep/environment=","")
environmentDict=json.loads(value)
......@@ -287,7 +288,7 @@ def parseAdapterTimes(resultsFolderPath,projectName,compressTable):
# derived
totalCores = str(int(ranks)*int(cores)) # total used CPU cores
environmentDict,parameterDict,adapters,stats = parseResultFile(resultsFolderPath + "/" + fileName)
if len(environmentDict):
# write header
......
......@@ -215,7 +215,7 @@ void exahype::offloading::OffloadingAnalyser::printWaitingTimes() {
for(int i=0; i<nnodes; i++) {
for(int j=0; j<nnodes; j++) {
if(waitingTimesSnapshot[k+j]>0)
logDebug("printWaitingTimes()","rank "<<i<<" waiting for "<<waitingTimesSnapshot[k+j]<<" for rank "<<j);
logInfo("printWaitingTimes()","rank "<<i<<" waiting for "<<waitingTimesSnapshot[k+j]<<" for rank "<<j);
}
k+= nnodes;
}
......
This diff is collapsed.
......@@ -3043,27 +3043,21 @@ public:
const CellDescription& cellDescription = *((const CellDescription*) cellDescripPtr);
//bool hasProcessed = false;
bool hasTriggeredEmergency = false;
bool offloadingTreatment = true;
#if !defined(OffloadingUseProgressThread)
if( offloadingTreatment )
{
//exahype::solvers::ADERDGSolver::setMaxNumberOfIprobesInProgressOffloading(1);
setMaxNumberOfIprobesInProgressOffloading(1);
}
#endif
int myRank = tarch::parallel::Node::getInstance().getRank();
int responsibleRank = myRank;
if( offloadingTreatment)
responsibleRank = getResponsibleRankForCellDescription((const void*) &cellDescription);
responsibleRank = getResponsibleRankForCellDescription((const void*) &cellDescription);
bool progress = false;
double startTime = MPI_Wtime();
if ( !cellDescription.getHasCompletedLastStep() ) {
peano::datatraversal::TaskSet::startToProcessBackgroundJobs();
#if !defined(OffloadingUseProgressThread)
if ( responsibleRank!=myRank
&& offloadingTreatment) {
if ( responsibleRank!=myRank) {
pauseOffloadingManager();
logInfo("waitUntil", "cell missing from responsible rank: "<<responsibleRank);
tryToReceiveTaskBack(this) ;
......@@ -3072,8 +3066,7 @@ public:
}
while ( !cellDescription.getHasCompletedLastStep() ) {
#if !defined(OffloadingUseProgressThread)
if ( responsibleRank!=myRank
&& offloadingTreatment) {
if ( responsibleRank!=myRank) {
tryToReceiveTaskBack(this);
//solver->spawnReceiveBackJob();
}
......@@ -3090,20 +3083,21 @@ public:
}
#endif
switch ( JobSystemWaitBehaviour ) {
case JobSystemWaitBehaviourType::ProcessJobsWithSamePriority:
//switch ( JobSystemWaitBehaviour ) {
// case JobSystemWaitBehaviourType::ProcessJobsWithSamePriority:
tarch::multicore::jobs::processBackgroundJobs( 1, getTaskPriority(waitForHighPriorityJob), true );
break;
case JobSystemWaitBehaviourType::ProcessAnyJobs:
tarch::multicore::jobs::processBackgroundJobs( 1, -1, true );
break;
default:
break;
}
if((MPI_Wtime()-startTime)>10.0) { // && responsibleRank!=myRank) {
// case JobSystemWaitBehaviourType::ProcessAnyJobs:
// tarch::multicore::jobs::processBackgroundJobs( 1, -1, true );
// break;
// default:
// break;
// }
if((MPI_Wtime()-startTime)>0.10) { // && responsibleRank!=myRank) {
startTime = MPI_Wtime();
logInfo("waitUntilCompletedTimeStep()","warning: rank waiting too long for missing task from rank "<<responsibleRank<< " outstanding jobs:"<<NumberOfRemoteJobs);
logInfo("waitUntilCompletedTimeStep()","warning: rank waiting too long for missing task from rank "<<responsibleRank<< " outstanding remote jobs:"<<NumberOfRemoteJobs
<<" outstanding skeleton jobs "<< exahype::solvers::ADERDGSolver::NumberOfSkeletonJobs<< " outstanding enclave jobs:"<<NumberOfEnclaveJobs );
}
......@@ -3113,7 +3107,6 @@ public:
&& !hasTriggeredEmergency
&& !progress
&& myRank!=responsibleRank
&& offloadingTreatment
&& ( exahype::solvers::ADERDGSolver::NumberOfEnclaveJobs
-exahype::solvers::ADERDGSolver::NumberOfRemoteJobs)==0
)
......@@ -3123,7 +3116,6 @@ public:
if( !cellDescription.getHasCompletedLastStep()
&& !hasTriggeredEmergency
&& myRank!=responsibleRank
&& offloadingTreatment
&& !hasProcessed)
#endif
{
......@@ -3140,8 +3132,7 @@ public:
}
#if !defined(OffloadingUseProgressThread)
if ( responsibleRank!=myRank
&& offloadingTreatment) {
if ( responsibleRank!=myRank ) {
resumeOffloadingManager();
}
exahype::solvers::ADERDGSolver::setMaxNumberOfIprobesInProgressOffloading( std::numeric_limits<int>::max() );
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment