Commit 461f4712 authored by Ben Hazelwood
Browse files

General cleanup

parent d1d73470
......@@ -34,6 +34,7 @@
std::cout.flush(); \
}
// Disables output (cout, cerr) for this rank
inline void disableLogging() {
std::cout.setstate(std::ios_base::failbit);
std::cerr.setstate(std::ios_base::failbit);
......
......@@ -15,8 +15,6 @@
#include "Logging.h"
#include "Timing.h"
static int worldRank;
static int worldSize;
static int teamRank;
......@@ -30,7 +28,6 @@ int initialiseTMPI() {
/**
* The application should have no knowledge of the world_size or world_rank
*/
registerSignalHandler();
setEnvironment();
PMPI_Comm_size(MPI_COMM_WORLD, &worldSize);
......@@ -49,6 +46,7 @@ int initialiseTMPI() {
assert(teamSize == (worldSize / numTeams));
registerSignalHandler();
outputEnvironment();
#ifndef REPLICAS_OUTPUT
......
......@@ -27,50 +27,66 @@
#elif SIZE_MAX == ULLONG_MAX
#define TMPI_SIZE_T MPI_UNSIGNED_LONG_LONG
#else
#error "what is happening here?"
#error "Cannot decipher SIZE_MAX"
#endif
/* Split ranks into teams (call before using any of the getters below) */
int initialiseTMPI();
/* NOTE(review): presumably this rank's index in MPI_COMM_WORLD, which the application is not meant to see - confirm */
int getWorldRank();
/* NOTE(review): presumably the total number of ranks in MPI_COMM_WORLD - confirm */
int getWorldSize();
/* Get the rank as seen by the application */
int getTeamRank();
/* Get the number of ranks as seen by the application */
int getTeamSize();
/* Get the number of teams (this is also the number of replicas) */
int getNumberOfTeams();
/* Return which team this rank belongs to */
int getTeam();
/* The communicator used by this team */
MPI_Comm getTeamComm();
/* NOTE(review): presumably releases the team communicator; the int looks like an MPI error code - confirm */
int freeTeamComm();
/* The duplicate of MPI_COMM_WORLD used by the library */
MPI_Comm getLibComm();
/* NOTE(review): presumably releases the library communicator; the int looks like an MPI error code - confirm */
int freeLibComm();
/* Get the value of an environment variable (empty string if undefined) */
std::string getEnvString(std::string const& key);
/* Get the number of teams from environment */
void setEnvironment();
/* Output team sizes and any timing inaccuracies between ranks */
void outputEnvironment();
/* Output the timing differences between replicas */
void outputTiming();
/* Decide whether data should be manually corrupted upon next heartbeat */
bool getShouldCorruptData();
/* Set or clear the flag reported by getShouldCorruptData() */
void setShouldCorruptData(bool toggle);
/* NOTE(review): presumably maps a world rank to the number of the team it belongs to - confirm */
int mapRankToTeamNumber(int rank);
/* NOTE(review): presumably converts a world rank into the rank seen by the application - confirm */
int mapWorldToTeamRank(int rank);
/* NOTE(review): presumably the inverse mapping; the meaning of the second argument r (team number?) is not documented - confirm */
int mapTeamToWorldRank(int rank, int r);
/* Alters the MPI_SOURCE member of MPI_Status to 0 <= r < team size */
void remapStatus(MPI_Status *status);
/* Barrier on team communicator */
int synchroniseRanksInTeam();
/* Barrier on all ranks (not called by application) */
int synchroniseRanksGlobally();
......
......@@ -21,13 +21,13 @@ void registerSignalHandler() {
}
// Handler for USR1: stalls the calling rank for one second.
// signum is required by the signal-handler signature and is not inspected.
void pauseThisRankSignalHandler( int signum ) {
  // POSIX only guarantees usleep() for arguments below 1,000,000 microseconds
  // (implementations may reject larger values with EINVAL), so a whole-second
  // pause goes through sleep(), which lives in the same <unistd.h> header.
  const unsigned int sleepLengthSeconds = 1;
  logDebug( "Signal received: sleep for 1s");
  sleep(sleepLengthSeconds);
}
// Handler for USR2: requests that this rank's data be corrupted on its next heartbeat.
// It only raises the shouldCorruptData flag and logs once; the flag is read
// elsewhere through getShouldCorruptData().
// signum is required by the signal-handler signature and is not inspected.
void corruptThisRankSignalHandler( int signum ) {
  logInfo("Signal received: corrupt this rank on next heartbeat");
  shouldCorruptData = true;
}
......
......@@ -8,10 +8,16 @@
#ifndef RANKCONTROL_H_
#define RANKCONTROL_H_
/*
Signals used to control a rank from outside the application:
USR1 is used to pause a rank for 1s
USR2 is used to corrupt the data on next heartbeat
*/
// NOTE(review): presumably installs the two handlers declared below for USR1/USR2 - body not visible here, confirm
void registerSignalHandler();
// USR1: pause this rank (signum is the delivered signal number)
void pauseThisRankSignalHandler(int signum);
// USR2: mark this rank's data for corruption on the next heartbeat (signum is the delivered signal number)
void corruptThisRankSignalHandler(int signum);
// NOTE(review): presumably reports the flag raised by corruptThisRankSignalHandler - confirm
bool getShouldCorruptData();
......
......@@ -6,6 +6,8 @@
*/
#include "Timing.h"
#include "Logging.h"
#include "Rank.h"
#include <fstream>
#include <map>
......@@ -16,17 +18,21 @@
#include <stddef.h>
#include <bitset>
#include "Logging.h"
#include "Rank.h"
// Timing and heartbeat bookkeeping for this rank, held in the single
// instance `timer` declared at the closing brace.
// The int key of every map below identifies a replica (see the "per replica" notes).
struct Timer {
// PMPI_Wtime at start of execution
double startTime;
// PMPI_Wtime at the end of this ranks execution
double endTime;
// TODO change to heartbeat terminology
// Times for each heartbeat (per replica)
std::map< int, std::vector<double> > syncPoints;
// Store the MPI_Requests for each heartbeat (per replica) for calling MPI_Test
std::map< int, std::vector<MPI_Request> > syncRequests;
// Hash for each heartbeat buffer (per replica)
std::map<int, std::vector<std::size_t> > hashes;
// Store the MPI_Requests for each heartbeat hash (per replica) for calling MPI_Test
std::map<int, std::vector<MPI_Request> > hashRequests;
} timer;
......@@ -86,8 +92,8 @@ void Timing::compareProgressWithReplicas() {
void Timing::compareBufferWithReplicas(const void *sendbuf, int sendcount, MPI_Datatype sendtype) {
if (getShouldCorruptData()) {
//TODO can remove const here (assuming data was originally non-const) and corrupt properly, no need for now
sendcount++; // This isn't really that safe either...
//TODO can remove const here via cast (assuming data was originally non-const) and corrupt properly, no need for now
sendcount++; // This isn't really that safe either...likely causes memory corruption occasionally
setShouldCorruptData(false);
}
......
......@@ -13,19 +13,20 @@
// Heartbeat timing interface: record heartbeats on this rank and compare them with the other replica(s).
namespace Timing {
// Mark time only for this heartbeat
void markTimeline();
// Also mark the hash for the heartbeat buffer
// (sendbuf / sendcount / sendtype describe that buffer in the usual MPI buffer-count-datatype form)
void markTimeline(const void *sendbuf, int sendcount, MPI_Datatype sendtype);
// NOTE(review): presumably records the start time and prepares the timing state - confirm
void initialiseTiming();
// NOTE(review): presumably records the end time and completes outstanding timing work - confirm
void finaliseTiming();
// Compare the time of heartbeat(s) with other replica(s)
void compareProgressWithReplicas();
// Also compare a hash of a heartbeat buffer
void compareBufferWithReplicas(const void *sendbuf, int sendcount, MPI_Datatype sendtype);
// NOTE(review): presumably the heartbeat times recorded for this rank - confirm which replica's entries are returned
const std::vector<double>& getSyncPoints();
// NOTE(review): presumably writes the collected timing data out - destination not visible here, confirm
void outputTiming();
}
#endif /* TIMING_H_ */
......@@ -104,6 +104,7 @@ int MPI_Alltoallv(const void *sendbuf, const int *sendcounts,
double MPI_Wtime();
/* This is the plugin for the heartbeat called by the application (MPI_COMM_SELF must be used) */
int MPI_Sendrecv(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
int dest, int sendtag,
void *recvbuf, int recvcount, MPI_Datatype recvtype,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment