Commit 5e73318f authored by Philipp Samfaß's avatar Philipp Samfaß
Browse files

added more doxygen docu

parent cba6d1e9
......@@ -21,8 +21,14 @@ namespace CommunicationStatistics {
* Struct for storing the communication statistics
*/
struct CommunicationStats {
std::atomic<size_t> sentBytes; /// number of sent bytes
std::atomic<size_t> receivedBytes; // number of received bytes
/**
* number of sent bytes
*/
std::atomic<size_t> sentBytes;
/**
* number of received bytes
*/
std::atomic<size_t> receivedBytes;
};
/**
* Computes the communication volume in bytes for a message of a given datatype and count.
......
/*
* Logging.h
*
* Created on: 2 Mar 2018
* Author: Ben Hazelwood
/**
* @file Logging.h
* @brief Contains some logging functionality for teaMPI.
* @author Ben Hazelwood, Philipp Samfass
*/
#ifndef LOGGING_H_
......
......@@ -106,12 +106,16 @@ MPI_Comm getTeamComm(MPI_Comm comm) {
return (comm==MPI_COMM_WORLD) ? TMPI_COMM_TEAM : comm;
}
/**
* Frees the team communicator TMPI_COMM_TEAM.
* @return The return code of MPI_Comm_free.
*/
int freeTeamComm() {
return MPI_Comm_free(&TMPI_COMM_TEAM);
}
/**
* Returns the inter-team communicator TMPI_COMM_INTER_TEAM.
* @return The communicator handle stored in TMPI_COMM_INTER_TEAM.
*/
MPI_Comm getTeamInterComm() {
return TMPI_COMM_INTER_TEAM;
}
int freeTeamComm() {
return MPI_Comm_free(&TMPI_COMM_TEAM);
/**
* Frees the inter-team communicator TMPI_COMM_INTER_TEAM.
* @return The return code of MPI_Comm_free.
*/
int freeTeamInterComm() {
return MPI_Comm_free(&TMPI_COMM_INTER_TEAM);
}
MPI_Comm getLibComm() {
......@@ -183,12 +187,12 @@ int mapWorldToTeamRank(int rank) {
}
}
int mapTeamToWorldRank(int rank, int r) {
int mapTeamToWorldRank(int rank, int team) {
if (rank == MPI_ANY_SOURCE) {
return MPI_ANY_SOURCE;
}
return rank + r * getTeamSize();
return rank + team * getTeamSize();
}
int translateRank(MPI_Comm srcComm, int srcRank, MPI_Comm destComm) {
......
/*
* RankOperations.h
*
* Created on: 2 Mar 2018
* Author: Ben Hazelwood, Philipp Samfass
/**
* @file Rank.h
* @brief Several routines to manage ranks and communicators in teaMPI.
* @author Ben Hazelwood, Philipp Samfass
*/
#ifndef RANK_H_
......@@ -32,71 +31,137 @@
extern MPI_Comm TMPI_COMM_DUP;
/* Split ranks into teams */
/**
* Split ranks into teams.
*/
int initialiseTMPI(MPI_Comm comm);
/**
* Returns rank of calling process in MPI_COMM_WORLD.
*/
int getWorldRank();
/**
* Returns the size of MPI_COMM_WORLD (number of processes).
*/
int getWorldSize();
/* Get the rank as seen by the application */
/**
* Get the rank as seen by the application
*/
int getTeamRank();
/* Get the number of ranks as seen by the application */
/**
* Get the number of ranks as seen by the application
*/
int getTeamSize();
/* Also the number of replicas */
/**
* Get the number of replicas (equal to the number of teams).
*/
int getNumberOfTeams();
/* Return which team this rank belongs to */
/**
* Return which team this rank belongs to.
*/
int getTeam();
/* The communicator used by this team */
/**
* Return the communicator used by this team
*/
MPI_Comm getTeamComm(MPI_Comm comm);
/**
* Frees the communicator used by the calling team
*/
int freeTeamComm();
/**
* Returns the communicator for horizontal communication between replica ranks.
*/
MPI_Comm getTeamInterComm();
/* The duplicate MPI_COMM_WORLD used by the library*/
/**
* Frees inter-team communicator.
*/
int freeTeamInterComm();
/**
* The duplicate MPI_COMM_WORLD used by the library
*/
MPI_Comm getLibComm();
/**
* Frees duplicate MPI_COMM_WORLD used by the library
*/
int freeLibComm();
/* Get the value of an environment variable (empty string if undefined) */
/**
* Get the value of an environment variable (empty string if undefined)
*/
std::string getEnvString(std::string const& key);
/* Get the number of teams from environment */
/**
* Get the number of teams from environment
*/
void setEnvironment();
/* Output team sizes and any timing inaccuracies between ranks */
/**
* Output team sizes and any timing inaccuracies between ranks
*/
void outputEnvironment();
/* Output the timing differences between replicas */
/**
* Output the timing differences between replicas
*/
void outputTiming();
/* Decide whether data should be manually corrupted upon next heartbeat */
bool getShouldCorruptData();
void setShouldCorruptData(bool toggle);
/**
* Maps a world rank to its team.
* @param rank The input rank.
*/
int mapRankToTeamNumber(int rank);
/**
* Maps a world rank to its team rank.
* @param rank The input rank.
*/
int mapWorldToTeamRank(int rank);
int mapTeamToWorldRank(int rank, int r);
/**
* Maps a team rank in some given team to the world rank.
* @param rank The input rank within a team
* @param team The team the input rank belongs to
*/
int mapTeamToWorldRank(int rank, int team);
/**
* Translates a rank in a src communicator to the matching rank in a destination communicator.
* @param srcComm The source communicator
* @param srcRank The rank in the source communicator
* @param destComm The destination communicator
*/
int translateRank(MPI_Comm srcComm, int srcRank, MPI_Comm destComm);
/**
* Map a rank in a communicator comm to the world rank in MPI_COMM_WORLD.
* @param rank The input rank
* @param comm The communicator this rank number belongs to
*/
int mapToWorldRank(int rank, MPI_Comm comm);
/* Alters the MPI_SOURCE member of MPI_Status to 0 <= r < team size */
/**
* Alters the MPI_SOURCE member of MPI_Status to 0 <= r < team size
*/
void remapStatus(MPI_Status *status);
/* Barrier on team communicator */
/**
* Barrier on team communicator
*/
int synchroniseRanksInTeam();
/* Barrier on all ranks (not called by application) */
/**
* Barrier on all ranks (not called by application)
*/
int synchroniseRanksGlobally();
#endif /* RANK_H_ */
/*
* RankOperations.cpp
* RankControl.cpp
*
* Created on: 2 Jul 2018
* Author: Ben Hazelwood
*/
#include "RankControl.h"
#include <csignal>
#include <unistd.h>
#include "RankControl.h"
#include "Logging.h"
#include "Timing.h"
......
/*
* RankControl.h
*
/**
* @file RankControl.h
* @brief Ranks can be paused or some of their data may be corrupted which is implemented by the functions declared in this file.
* This works by sending signals to the application processes which are caught in respective handlers.
* SIGUSR1 is the signal used to pause a rank.
* SIGUSR2 is the signal used to corrupt data on the next heartbeat.
* Created on: 2 Jul 2018
* Author: Ben Hazelwood
* @author Ben Hazelwood, Philipp Samfass
*/
#ifndef RANKCONTROL_H_
#define RANKCONTROL_H_
/*
USR1 is used to pause a rank for 1s
USR2 is used to corrupt the data on next heartbeat
/**
* USR1 is used to pause a rank for 1s
* USR2 is used to corrupt the data on next heartbeat
*/
void registerSignalHandler();
// USR1
/**
* Signal handler for USR1.
*/
void pauseThisRankSignalHandler(int signum);
// USR2
/**
* Signal handler for USR2.
*/
void corruptThisRankSignalHandler(int signum);
/**
* Returns true if data should be corrupted.
*/
bool getShouldCorruptData();
/**
* Disables/enables corruption in next heartbeat.
* @param toggle If true, corruption is triggered.
*/
void setShouldCorruptData(bool toggle);
#endif
/*
* Timing.h
*
* Created on: 2 Mar 2018
* Author: Ben Hazelwood
/**
* @file Timing.h
* @brief Manages heartbeats (sending and receiving) and contains functionality for dumping heartbeats post-mortem to an output file.
* @author Ben Hazelwood, Philipp Samfass
*/
#ifndef TIMING_H_
......@@ -10,29 +9,66 @@
#include <mpi.h>
/**
* Contains routines for managing heartbeats.
*/
namespace Timing {
// Mark time only for this heartbeat
/**
* Tracks start and end of a heartbeat and stores time between heartbeats.
* @param tag Tag for this heartbeat. A positive tag x starts the heartbeat, the corresponding negative tag -x ends the heartbeat.
*/
void markTimeline(int tag);
// Also mark the hash for the heartbeat buffer
/**
* Tracks start and end of a heartbeat, stores time between heartbeats and also keeps track of hashes over send buffers (for corruption detection).
* @see markTimeline
* @param tag Tag for this heartbeat.
* @param sendbuf Send buffer that is hashed.
* @param sendcount Number of elements of type sendtype in the send buffer
* @param sendtype MPI datatype contained in sendbuffer
*/
void markTimeline(int tag, const void *sendbuf, int sendcount, MPI_Datatype sendtype);
/**
* Initialises data structures for heartbeats.
*/
void initialiseTiming();
/**
* Destroys data structures for heartbeats.
*/
void finaliseTiming();
// Compare the time of heartbeat(s) with other replica(s)
/**
* Compare the time of heartbeat(s) with other replica(s)
*/
void compareProgressWithReplicas();
// Also compare a hash of a heartbeat buffer
/**
* Also compare a hash of a heartbeat buffer.
*/
void compareBufferWithReplicas(const void *sendbuf, int sendcount, MPI_Datatype sendtype);
/**
* Probes for a heartbeat from the replica of a given team.
* @param targetTeam Team of the replica from which the heartbeat should be received.
*/
void pollForAndReceiveHeartbeat(int targetTeam);
/**
* Makes progress on outstanding communication requests for the given team.
* @param targetTeam Team for which progress on outstanding requests should be made.
*/
void progressOutstandingRequests(int targetTeam);
/**
* Tracks points in time when sleep is invoked.
*/
void sleepRankRaised();
/**
* Dumps heartbeat statistics
*/
void outputTiming();
}
}
#endif /* TIMING_H_ */
......@@ -362,6 +362,7 @@ int MPI_Finalize() {
#ifdef DirtyCleanUp
return MPI_SUCCESS;
#endif
freeTeamInterComm();
return PMPI_Finalize();
}
......
/**
* @file Wrapper.h
* @author Benjamin Hazelwood, Philipp Samfass
* @brief This file contains the wrapped MPI routines using the PMPI interface.
*/
#ifndef WRAPPER_H
#define WRAPPER_H
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment