/*
 * Timing.cpp
 *
 *  Created on: 2 Mar 2018
 *      Author: Ben Hazelwood
 */

#include "Timing.h"

#include "Logging.h"
#include "Rank.h"
#include "RankControl.h"

#include <stddef.h>
#include <unistd.h>

#include <bitset>
#include <fstream>
#include <functional>
#include <iostream>
#include <list>
#include <map>
#include <set>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

struct Timer {
Ben Hazelwood's avatar
Ben Hazelwood committed
26
  // PMPI_Wtime at start of execution
Ben Hazelwood's avatar
Ben Hazelwood committed
27
  double startTime;
Ben Hazelwood's avatar
Ben Hazelwood committed
28
  // PMPI_Wtime at the end of this ranks execution
Ben Hazelwood's avatar
Ben Hazelwood committed
29
  double endTime;
30

Ben Hazelwood's avatar
Ben Hazelwood committed
31
32
33
  // Mark when an application sleeps
  std::vector<double> sleepPoints;

34

35
  // TODO: add support for multiple tags (or do we need this?)
36
37
38
39
  // Delta times for each heartbeat (per replica)
  std::map< int, std::list<double> > heartbeatTimes;
  // Store the MPI_Requests for each heartbeat delta (per replica)
  std::map< int, std::list<MPI_Request> > heartbeatTimeRequests;
40

41
42
  std::map< int, bool > isHeartbeatTriggeredForTag;

Ben Hazelwood's avatar
Ben Hazelwood committed
43
  // Hash for each heartbeat buffer (per replica)
44
45
46
  std::map<int, std::list<std::size_t> > heartbeatHashes;
  // Store the MPI_Requests for each heartbeat (per replica) 
  std::map<int, std::list<MPI_Request> > heartbeatHashRequests;
Ben Hazelwood's avatar
Ben Hazelwood committed
47
48
} timer;

49
void Timing::initialiseTiming() {
Ben Hazelwood's avatar
Ben Hazelwood committed
50
  synchroniseRanksInTeam();
51
  timer.startTime = PMPI_Wtime();
Ben Hazelwood's avatar
Ben Hazelwood committed
52
  for (int i=0; i < getNumberOfTeams(); i++) {
Ben Hazelwood's avatar
Ben Hazelwood committed
53
54
    timer.heartbeatTimes.insert(std::make_pair(i, std::list<double>());
    timer.heartbeatTimeRequests.insert(std::make_pair(i, std::list<MPI_Request>()));
55

Ben Hazelwood's avatar
Ben Hazelwood committed
56
57
    timer.heartbeatHashes.insert(std::make_pair(i, std::list<std::size_t>()));
    timer.heartbeatHashRequests.insert(std::make_pair(i, std::list<MPI_Request>()));
58
59
60
61
  }
}

void Timing::finaliseTiming() {
Ben Hazelwood's avatar
Ben Hazelwood committed
62
  synchroniseRanksInTeam();
63
64
65
  timer.endTime = PMPI_Wtime();
}

66
void Timing::markTimeline(int tag) {
67
68
69
70
71
72
  if (timer.isHeartbeatTriggeredForTag.find(tag) == timer.isHeartbeatTriggeredForTag.end()) {
    // New heartbeat tag found
    timer.isHeartbeatTriggeredForTag.insert( std::make_pair(tag, false));
    timer.heartbeatTimes.at(getTeam()).push_back(PMPI_Wtime());
  } else if (timer.isHeartbeatTriggeredForTag.at(tag) == false) {
    // Trigger heartbeat
Ben Hazelwood's avatar
Ben Hazelwood committed
73
    timer.heartbeatTimes.at(getTeam()).push_back(PMPI_Wtime());
74
  } else {
75
    // End heartbeat
76
77
78
    timer.heartbeatTimes.at(getTeam()).back() = PMPI_Wtime() - timer.heartbeatTimes.at(getTeam()).back();
    compareProgressWithReplicas();
  }
79
  timer.isHeartbeatTriggeredForTag.at(tag) = !timer.isHeartbeatTriggeredForTag.at(tag);
80
81
}

82
83
// Heartbeat marker that additionally carries a data buffer.
//
// Performs the same begin/end toggling as markTimeline(tag) and afterwards, on
// every call, passes the buffer description (sendbuf, sendcount, sendtype) to
// compareBufferWithReplicas().
void Timing::markTimeline(int tag, const void *sendbuf, int sendcount, MPI_Datatype sendtype) {
  Timing::markTimeline(tag);
  Timing::compareBufferWithReplicas(sendbuf, sendcount, sendtype);
}

void Timing::compareProgressWithReplicas() {
Ben Hazelwood's avatar
Ben Hazelwood committed
88
89
  for (int r=0; r < getNumberOfTeams(); r++) {
    if (r != getTeam()) {
90
91
92
      // Send out this replica's delta
      timer.heartbeatTimeRequests.at(r).push_back(MPI_Request());
      PMPI_Isend(&timer.heartbeatTimes.at(getTeam()).back(), 1, MPI_DOUBLE,
Ben Hazelwood's avatar
Ben Hazelwood committed
93
                mapTeamToWorldRank(getTeamRank(), r), getTeam(),
94
                getLibComm(), &timer.heartbeatTimeRequests.at(r).back());
95

96
97

      // Receive deltas from other replicas
98
      timer.heartbeatTimes.at(r).push_back(0.0);
Ben Hazelwood's avatar
Ben Hazelwood committed
99
      timer.heartbeatTimeRequests.at(r).push_back(MPI_Request());
100
      PMPI_Irecv(&timer.heartbeatTimes.at(getTeam()).back(), 1, MPI_DOUBLE,
Ben Hazelwood's avatar
Ben Hazelwood committed
101
                 mapTeamToWorldRank(getTeamRank(), r), r, getLibComm(), &timer.heartbeatTimeRequests.at(r).back());
102

103
104
105
106
107
108
109
110
111
112
113
      auto it = timer.heartbeatTimeRequests.at(r).begin();
      while (it != timer.heartbeatTimeRequests.at(r).end()) {
        int flag;
        PMPI_Test(&(*it), &flag, MPI_STATUS_IGNORE);
        if (flag) {
          if (!((*it) == MPI_REQUEST_NULL)){
            MPI_Request_free(&(*it));
          }
          it = timer.heartbeatTimeRequests.at(r).erase(it);
        }
        ++it;
114
115
116
117
118
119
120
      }
    }
  }
}

void Timing::compareBufferWithReplicas(const void *sendbuf, int sendcount, MPI_Datatype sendtype) {
  if (getShouldCorruptData()) {
Ben Hazelwood's avatar
Ben Hazelwood committed
121
122
    //TODO can remove const here via cast (assuming data was originally non-const) and corrupt properly, no need for now
    sendcount++; // This isn't really that safe either...likely causes memory corruption occasionally
123
124
125
126
127
128
129
130
131
    setShouldCorruptData(false);
  }

  int typeSize;
  MPI_Type_size(sendtype, &typeSize);

  std::string bits((const char*)sendbuf, sendcount*typeSize);
  std::hash<std::string> hash_fn;
  std::size_t hash = hash_fn(bits);
Ben Hazelwood's avatar
Ben Hazelwood committed
132
  timer.heartbeatHashes.at(getTeam()).push_back((std::size_t)hash);
133
134
135
136
137

  for (int r=0; r < getNumberOfTeams(); r++) {
    if (r != getTeam()) {
      // Send out this replica's times
      MPI_Request request;
Ben Hazelwood's avatar
Ben Hazelwood committed
138
      PMPI_Isend(&timer.heartbeatHashes.at(getTeam()).back(), 1, TMPI_SIZE_T,
139
140
141
142
143
                mapTeamToWorldRank(getTeamRank(), r), getTeam(),
                getLibComm(), &request);
      MPI_Request_free(&request);

      // Receive times from other replicas
Ben Hazelwood's avatar
Ben Hazelwood committed
144
145
146
147
      timer.heartbeatHashes.at(r).push_back(0);
      timer.heartbeatHashRequests.at(r).push_back(MPI_Request());
      PMPI_Irecv(&timer.heartbeatHashes.at(r).back(), 1, TMPI_SIZE_T,
                 mapTeamToWorldRank(getTeamRank(), r), r, getLibComm(), &timer.heartbeatHashRequests.at(r).back());
148

149
150
151
152
153
154
155
156
      // // Test for completion of Irecv's
      // int numPending = 0;
      // for (int i=0; i < timer.heartbeatHashRequests.at(r).size(); i++) {
      //   int flag = 0;
      //   PMPI_Test(&timer.heartbeatHashRequests.at(r).at(i), &flag, MPI_STATUS_IGNORE);
      //   numPending += 1 - flag;
      // }
      // std::cout << "Num pending: " << numPending << "\n";
157
158
    }
  }
Ben Hazelwood's avatar
Ben Hazelwood committed
159
160
}

Ben Hazelwood's avatar
Ben Hazelwood committed
161
162
163
164
165
// Appends the current PMPI_Wtime() to timer.sleepPoints.
void Timing::sleepRankRaised() {
  const double now = PMPI_Wtime();
  timer.sleepPoints.push_back(now);
}

void Timing::outputTiming() {
167
168
  std::cout.flush();
  PMPI_Barrier(MPI_COMM_WORLD);
Ben Hazelwood's avatar
Ben Hazelwood committed
169

170
171
  std::string filenamePrefix = getEnvString("TMPI_FILE");

172
  // Output simple replica timings
173
  if ((getTeamRank() == MASTER) && (getWorldRank() != MASTER)) {
Ben Hazelwood's avatar
Ben Hazelwood committed
174
    PMPI_Send(&timer.endTime, 1, MPI_DOUBLE, MASTER, 0, getLibComm());
Ben Hazelwood's avatar
Ben Hazelwood committed
175
176
  }

177

178
179
180
181
  if (getWorldRank() == MASTER) {
    std::cout << std::endl;
    std::cout << "----------TMPI_TIMING----------\n";
    std::cout << "timing_file=";
182
    std::cout << (filenamePrefix.empty() ? "timing_not_enabled" : filenamePrefix) << "\n";
Ben Hazelwood's avatar
Ben Hazelwood committed
183
184
    std::cout << "num_replicas=" << getNumberOfTeams() << "\n";
    for (int i=0; i < getNumberOfTeams(); i++) {
185
186
187
188
      double rEndTime = 0.0;
      if (i == MASTER) {
        rEndTime = timer.endTime;
      } else {
Ben Hazelwood's avatar
Ben Hazelwood committed
189
        PMPI_Recv(&rEndTime, 1, MPI_DOUBLE, mapTeamToWorldRank(MASTER, i), 0, getLibComm(), MPI_STATUS_IGNORE);
190
191
      }

Ben Hazelwood's avatar
Ben Hazelwood committed
192
      std::cout << "replica " << i << "=" << rEndTime - timer.startTime << "s\n";
193
194
195
196
197
    }
    std::cout << "-------------------------------\n";
  }
  std::cout.flush();
  PMPI_Barrier(MPI_COMM_WORLD);
198

199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
  if (!filenamePrefix.empty()) {
    // Write Generic Sync points to files
    char sep = ',';
    std::ostringstream filename;
    std::string outputFolder("tmpi-timings");
    filename << outputFolder << "/"
        << filenamePrefix << "-"
        << getWorldRank() << "-"
        << getTeamRank() << "-"
        << getTeam()
        << ".csv";
    std::ofstream f;
    f.open(filename.str().c_str());

    logInfo("Writing timings to " << filename);

    f << "endTime" << sep << timer.endTime - timer.startTime << "\n";

Ben Hazelwood's avatar
Ben Hazelwood committed
217
218
    f << "heartbeatTimes";
    for (const double& t : timer.heartbeatTimes.at(getTeam())) {
Ben Hazelwood's avatar
Ben Hazelwood committed
219
      f << sep << t;
220
221
    }
    f << "\n";
222

Ben Hazelwood's avatar
Ben Hazelwood committed
223
    f << "sleepPoints";
Ben Hazelwood's avatar
Ben Hazelwood committed
224
225
226
    for (const double& t : timer.sleepPoints) {
      f << sep << t - timer.startTime;
    }
227
228
    f.close();
  }
229
230

  PMPI_Barrier(MPI_COMM_WORLD);
231
232
}