Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
Tobias Weinzierl
teaMPI
Commits
461f4712
Commit
461f4712
authored
Jul 08, 2018
by
Ben Hazelwood
Browse files
General cleanup
parent
d1d73470
Changes
8
Hide whitespace changes
Inline
Side-by-side
lib/Logging.h
View file @
461f4712
...
...
@@ -34,6 +34,7 @@
std::cout.flush(); \
}
// Disables output (cout, cerr) for this rank
inline
void
disableLogging
()
{
std
::
cout
.
setstate
(
std
::
ios_base
::
failbit
);
std
::
cerr
.
setstate
(
std
::
ios_base
::
failbit
);
...
...
lib/Rank.cpp
View file @
461f4712
...
...
@@ -15,8 +15,6 @@
#include "Logging.h"
#include "Timing.h"
static
int
worldRank
;
static
int
worldSize
;
static
int
teamRank
;
...
...
@@ -30,7 +28,6 @@ int initialiseTMPI() {
/**
* The application should have no knowledge of the world_size or world_rank
*/
registerSignalHandler
();
setEnvironment
();
PMPI_Comm_size
(
MPI_COMM_WORLD
,
&
worldSize
);
...
...
@@ -49,6 +46,7 @@ int initialiseTMPI() {
assert
(
teamSize
==
(
worldSize
/
numTeams
));
registerSignalHandler
();
outputEnvironment
();
#ifndef REPLICAS_OUTPUT
...
...
lib/Rank.h
View file @
461f4712
...
...
@@ -27,50 +27,66 @@
#elif SIZE_MAX == ULLONG_MAX
#define TMPI_SIZE_T MPI_UNSIGNED_LONG_LONG
#else
#error "
what is happening here?
"
#error "
Cannot decipher SIZE_MAX
"
#endif
/** Split ranks into teams. */
int initialiseTMPI();

/* NOTE(review): presumably the rank/size in MPI_COMM_WORLD — confirm in Rank.cpp. */
int getWorldRank();
int getWorldSize();

/** Rank as seen by the application. */
int getTeamRank();

/** Number of ranks as seen by the application. */
int getTeamSize();

/** Number of teams; this is also the number of replicas. */
int getNumberOfTeams();

/** Team that this rank belongs to. */
int getTeam();

/** Communicator used by this team. */
MPI_Comm getTeamComm();

/* NOTE(review): presumably releases the team communicator; return value undocumented — confirm. */
int freeTeamComm();

/** Duplicate of MPI_COMM_WORLD used by the library. */
MPI_Comm getLibComm();

/* NOTE(review): presumably releases the library communicator; return value undocumented — confirm. */
int freeLibComm();

/**
 * Value of an environment variable.
 *
 * @param key name of the variable to look up.
 * @return the variable's value, or an empty string if it is undefined.
 */
std::string getEnvString(const std::string& key);

/** Read the number of teams from the environment. */
void setEnvironment();

/** Output team sizes and any timing inaccuracies between ranks. */
void outputEnvironment();

/** Output the timing differences between replicas. */
void outputTiming();

/** Whether data should be manually corrupted upon the next heartbeat. */
bool getShouldCorruptData();

/**
 * Set whether data should be manually corrupted upon the next heartbeat.
 *
 * @param toggle new value of the flag.
 */
void setShouldCorruptData(bool toggle);

/* Rank mapping helpers.
 * NOTE(review): the exact mapping (and the meaning of `r` below) is not
 * documented in this header — confirm against Rank.cpp. */
int mapRankToTeamNumber(int rank);
int mapWorldToTeamRank(int rank);
int mapTeamToWorldRank(int rank, int r);

/**
 * Alter the MPI_SOURCE member of an MPI_Status to 0 <= r < team size.
 *
 * @param status status object to rewrite.
 */
void remapStatus(MPI_Status* status);

/** Barrier on the team communicator. */
int synchroniseRanksInTeam();

/** Barrier on all ranks (not called by the application). */
int synchroniseRanksGlobally();
...
...
lib/RankControl.cpp
View file @
461f4712
...
...
@@ -21,13 +21,13 @@ void registerSignalHandler() {
}
/**
 * Signal handler that pauses this rank for one second.
 *
 * @param signum number of the delivered signal (not used).
 *
 * NOTE(review): POSIX allows usleep() to fail with EINVAL for intervals of
 * one million microseconds or more — confirm the target platforms accept it.
 * NOTE(review): logDebug is defined elsewhere; if it writes through iostreams
 * it is not async-signal-safe — confirm this is acceptable for a debug aid.
 */
void pauseThisRankSignalHandler(int signum) {
  // Sleep duration in microseconds (1s); usleep() takes microseconds.
  const double sleepLength = 1.0 * 1e6;
  logDebug("Signal received: sleep for 1s");
  usleep(sleepLength);
}
/**
 * Signal handler that requests data corruption on this rank.
 *
 * It only logs the request and raises the shouldCorruptData flag; the
 * corruption itself is carried out elsewhere, on the next heartbeat.
 *
 * @param signum number of the delivered signal (not used).
 */
void corruptThisRankSignalHandler(int signum) {
  logInfo("Signal received: corrupt this rank on next heartbeat");
  shouldCorruptData = true;
}
...
...
lib/RankControl.h
View file @
461f4712
...
...
@@ -8,10 +8,16 @@
#ifndef RANKCONTROL_H_
#define RANKCONTROL_H_
/*
 * Signal-driven rank control:
 *   USR1 pauses a rank for 1s
 *   USR2 corrupts the data on the next heartbeat
 */

/* NOTE(review): presumably installs the USR1/USR2 handlers below — confirm in RankControl.cpp. */
void registerSignalHandler();

/* Handler for USR1. */
void pauseThisRankSignalHandler(int signum);

/* Handler for USR2. */
void corruptThisRankSignalHandler(int signum);

/* NOTE(review): presumably reports the flag raised by the USR2 handler — confirm. */
bool getShouldCorruptData();
...
...
lib/Timing.cpp
View file @
461f4712
...
...
@@ -6,6 +6,8 @@
*/
#include "Timing.h"
#include "Logging.h"
#include "Rank.h"
#include <fstream>
#include <map>
...
...
@@ -16,17 +18,21 @@
#include <stddef.h>
#include <bitset>
#include "Logging.h"
#include "Rank.h"
// Timing and heartbeat bookkeeping; a single instance is kept in `timer`.
struct Timer {
  // Shape shared by all per-replica records: an int key mapped to one entry
  // per heartbeat. NOTE(review): the key is presumably the replica/team
  // number — confirm against the code that fills these maps.
  template <typename Entry>
  using PerReplica = std::map<int, std::vector<Entry>>;

  // PMPI_Wtime at start of execution
  double startTime;

  // PMPI_Wtime at the end of this rank's execution
  double endTime;

  // TODO change to heartbeat terminology
  // Times for each heartbeat (per replica)
  PerReplica<double> syncPoints;

  // MPI_Requests for each heartbeat (per replica), kept for calling MPI_Test
  PerReplica<MPI_Request> syncRequests;

  // Hash of each heartbeat buffer (per replica)
  PerReplica<std::size_t> hashes;

  // MPI_Requests stored alongside the hashes (per replica).
  // NOTE(review): the original comment was a copy of the syncRequests one;
  // presumably these are likewise kept for MPI_Test — confirm.
  PerReplica<MPI_Request> hashRequests;
};

Timer timer;
...
...
@@ -86,8 +92,8 @@ void Timing::compareProgressWithReplicas() {
void
Timing
::
compareBufferWithReplicas
(
const
void
*
sendbuf
,
int
sendcount
,
MPI_Datatype
sendtype
)
{
if
(
getShouldCorruptData
())
{
//TODO can remove const here (assuming data was originally non-const) and corrupt properly, no need for now
sendcount
++
;
// This isn't really that safe either...
//TODO can remove const here
via cast
(assuming data was originally non-const) and corrupt properly, no need for now
sendcount
++
;
// This isn't really that safe either...
likely causes memory corruption occasionally
setShouldCorruptData
(
false
);
}
...
...
lib/Timing.h
View file @
461f4712
...
...
@@ -13,19 +13,20 @@
namespace Timing {

/// Mark the time only for this heartbeat.
void markTimeline();

/// Mark the time for this heartbeat and also the hash of the heartbeat buffer.
///
/// @param sendbuf   heartbeat buffer to hash.
/// @param sendcount count passed with the buffer.
/// @param sendtype  MPI datatype passed with the buffer.
void markTimeline(const void* sendbuf, int sendcount, MPI_Datatype sendtype);

// NOTE(review): presumably set up / tear down the timing state — confirm in Timing.cpp.
void initialiseTiming();
void finaliseTiming();

/// Compare the time of heartbeat(s) with other replica(s).
void compareProgressWithReplicas();

/// Compare heartbeats with other replica(s), including a hash of a heartbeat buffer.
///
/// @param sendbuf   heartbeat buffer to hash.
/// @param sendcount count passed with the buffer.
/// @param sendtype  MPI datatype passed with the buffer.
void compareBufferWithReplicas(const void* sendbuf, int sendcount, MPI_Datatype sendtype);

// NOTE(review): presumably the heartbeat times recorded for this rank — confirm.
const std::vector<double>& getSyncPoints();

// NOTE(review): output format and destination are not visible here — confirm in Timing.cpp.
void outputTiming();

}  // namespace Timing
#endif
/* TIMING_H_ */
lib/Wrapper.h
View file @
461f4712
...
...
@@ -104,6 +104,7 @@ int MPI_Alltoallv(const void *sendbuf, const int *sendcounts,
double
MPI_Wtime
();
/* This is the plugin for the heartbeat called by the application (MPI_COMM_SELF must be used) */
int
MPI_Sendrecv
(
const
void
*
sendbuf
,
int
sendcount
,
MPI_Datatype
sendtype
,
int
dest
,
int
sendtag
,
void
*
recvbuf
,
int
recvcount
,
MPI_Datatype
recvtype
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment