Commit ef19b612 authored by Phillip Samfass

Fixed an issue that could delay the master thread's processing of time-critical skeleton jobs.

parent 6e76f34b
Pipeline #150848 failed with stage
......@@ -186,6 +186,7 @@ def parseResultFile(filePath):
try:
fileHandle=codecs.open(filePath,'r','UTF_8')
for line in fileHandle:
if line.startswith("sweep/environment"):
value = line.replace("sweep/environment=","")
environmentDict=json.loads(value)
......@@ -287,7 +288,7 @@ def parseAdapterTimes(resultsFolderPath,projectName,compressTable):
# derived
totalCores = str(int(ranks)*int(cores)) # total used CPU cores
environmentDict,parameterDict,adapters,stats = parseResultFile(resultsFolderPath + "/" + fileName)
if len(environmentDict):
# write header
......
......@@ -215,7 +215,7 @@ void exahype::offloading::OffloadingAnalyser::printWaitingTimes() {
for(int i=0; i<nnodes; i++) {
for(int j=0; j<nnodes; j++) {
if(waitingTimesSnapshot[k+j]>0)
logDebug("printWaitingTimes()","rank "<<i<<" waiting for "<<waitingTimesSnapshot[k+j]<<" for rank "<<j);
logInfo("printWaitingTimes()","rank "<<i<<" waiting for "<<waitingTimesSnapshot[k+j]<<" for rank "<<j);
}
k+= nnodes;
}
......
This diff is collapsed.
......@@ -3043,27 +3043,21 @@ public:
const CellDescription& cellDescription = *((const CellDescription*) cellDescripPtr);
//bool hasProcessed = false;
bool hasTriggeredEmergency = false;
bool offloadingTreatment = true;
#if !defined(OffloadingUseProgressThread)
if( offloadingTreatment )
{
//exahype::solvers::ADERDGSolver::setMaxNumberOfIprobesInProgressOffloading(1);
setMaxNumberOfIprobesInProgressOffloading(1);
}
#endif
int myRank = tarch::parallel::Node::getInstance().getRank();
int responsibleRank = myRank;
if( offloadingTreatment)
responsibleRank = getResponsibleRankForCellDescription((const void*) &cellDescription);
responsibleRank = getResponsibleRankForCellDescription((const void*) &cellDescription);
bool progress = false;
double startTime = MPI_Wtime();
if ( !cellDescription.getHasCompletedLastStep() ) {
peano::datatraversal::TaskSet::startToProcessBackgroundJobs();
#if !defined(OffloadingUseProgressThread)
if ( responsibleRank!=myRank
&& offloadingTreatment) {
if ( responsibleRank!=myRank) {
pauseOffloadingManager();
logInfo("waitUntil", "cell missing from responsible rank: "<<responsibleRank);
tryToReceiveTaskBack(this) ;
......@@ -3072,8 +3066,7 @@ public:
}
while ( !cellDescription.getHasCompletedLastStep() ) {
#if !defined(OffloadingUseProgressThread)
if ( responsibleRank!=myRank
&& offloadingTreatment) {
if ( responsibleRank!=myRank) {
tryToReceiveTaskBack(this);
//solver->spawnReceiveBackJob();
}
......@@ -3090,20 +3083,21 @@ public:
}
#endif
switch ( JobSystemWaitBehaviour ) {
case JobSystemWaitBehaviourType::ProcessJobsWithSamePriority:
//switch ( JobSystemWaitBehaviour ) {
// case JobSystemWaitBehaviourType::ProcessJobsWithSamePriority:
tarch::multicore::jobs::processBackgroundJobs( 1, getTaskPriority(waitForHighPriorityJob), true );
break;
case JobSystemWaitBehaviourType::ProcessAnyJobs:
tarch::multicore::jobs::processBackgroundJobs( 1, -1, true );
break;
default:
break;
}
if((MPI_Wtime()-startTime)>10.0) { // && responsibleRank!=myRank) {
// case JobSystemWaitBehaviourType::ProcessAnyJobs:
// tarch::multicore::jobs::processBackgroundJobs( 1, -1, true );
// break;
// default:
// break;
// }
if((MPI_Wtime()-startTime)>0.10) { // && responsibleRank!=myRank) {
startTime = MPI_Wtime();
logInfo("waitUntilCompletedTimeStep()","warning: rank waiting too long for missing task from rank "<<responsibleRank<< " outstanding jobs:"<<NumberOfRemoteJobs);
logInfo("waitUntilCompletedTimeStep()","warning: rank waiting too long for missing task from rank "<<responsibleRank<< " outstanding remote jobs:"<<NumberOfRemoteJobs
<<" outstanding skeleton jobs "<< exahype::solvers::ADERDGSolver::NumberOfSkeletonJobs<< " outstanding enclave jobs:"<<NumberOfEnclaveJobs );
}
......@@ -3113,7 +3107,6 @@ public:
&& !hasTriggeredEmergency
&& !progress
&& myRank!=responsibleRank
&& offloadingTreatment
&& ( exahype::solvers::ADERDGSolver::NumberOfEnclaveJobs
-exahype::solvers::ADERDGSolver::NumberOfRemoteJobs)==0
)
......@@ -3123,7 +3116,6 @@ public:
if( !cellDescription.getHasCompletedLastStep()
&& !hasTriggeredEmergency
&& myRank!=responsibleRank
&& offloadingTreatment
&& !hasProcessed)
#endif
{
......@@ -3140,8 +3132,7 @@ public:
}
#if !defined(OffloadingUseProgressThread)
if ( responsibleRank!=myRank
&& offloadingTreatment) {
if ( responsibleRank!=myRank ) {
resumeOffloadingManager();
}
exahype::solvers::ADERDGSolver::setMaxNumberOfIprobesInProgressOffloading( std::numeric_limits<int>::max() );
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment