Timing.cpp 7.02 KB
Newer Older
1
2
3
4
/*
 * Timing.cpp
 *
 *  Created on: 2 Mar 2018
 *      Author: Ben Hazelwood
 */

#include "Timing.h"
Ben Hazelwood's avatar
Ben Hazelwood committed
9
10
#include "Logging.h"
#include "Rank.h"
11
#include "RankControl.h"
12

Ben Hazelwood's avatar
Ben Hazelwood committed
13
14
#include <fstream>
#include <map>
15
#include <set>
Ben Hazelwood's avatar
Ben Hazelwood committed
16
17
18
#include <sstream>
#include <string>
#include <utility>
19
#include <stddef.h>
20
#include <bitset>
21
#include <unistd.h>
22
23
#include <list>
#include <vector>
Ben Hazelwood's avatar
Ben Hazelwood committed
24

25
struct Timer {
Ben Hazelwood's avatar
Ben Hazelwood committed
26
  // PMPI_Wtime at start of execution
Ben Hazelwood's avatar
Ben Hazelwood committed
27
  double startTime;
Ben Hazelwood's avatar
Ben Hazelwood committed
28
  // PMPI_Wtime at the end of this ranks execution
Ben Hazelwood's avatar
Ben Hazelwood committed
29
  double endTime;
30

Ben Hazelwood's avatar
Ben Hazelwood committed
31
32
33
  // Mark when an application sleeps
  std::vector<double> sleepPoints;

34

35
  // TODO: add support for multiple tags (or do we need this?)
36
37
38
39
  // Delta times for each heartbeat (per replica)
  std::map< int, std::list<double> > heartbeatTimes;
  // Store the MPI_Requests for each heartbeat delta (per replica)
  std::map< int, std::list<MPI_Request> > heartbeatTimeRequests;
40

Ben Hazelwood's avatar
Ben Hazelwood committed
41
  // Hash for each heartbeat buffer (per replica)
42
43
44
  std::map<int, std::list<std::size_t> > heartbeatHashes;
  // Store the MPI_Requests for each heartbeat (per replica) 
  std::map<int, std::list<MPI_Request> > heartbeatHashRequests;
Ben Hazelwood's avatar
Ben Hazelwood committed
45
46
} timer;

47
void Timing::initialiseTiming() {
Ben Hazelwood's avatar
Ben Hazelwood committed
48
  synchroniseRanksInTeam();
49
  timer.startTime = PMPI_Wtime();
Ben Hazelwood's avatar
Ben Hazelwood committed
50
  for (int i=0; i < getNumberOfTeams(); i++) {
51
52
    timer.heartbeatTimes.insert({i, std::list<double>()});
    timer.heartbeatTimeRequests.insert({i, std::list<MPI_Request>()});
53

54
55
    timer.heartbeatHashes.insert({i, std::list<std::size_t>()});
    timer.heartbeatHashRequests.insert({i, std::list<MPI_Request>()});
56
57
58
59
  }
}

void Timing::finaliseTiming() {
Ben Hazelwood's avatar
Ben Hazelwood committed
60
  synchroniseRanksInTeam();
61
62
63
  timer.endTime = PMPI_Wtime();
}

64
/*
 * Record a heartbeat on this team's timeline.
 *
 *   tag > 0 : push the current PMPI_Wtime() as the start of a heartbeat.
 *   tag < 0 : if a heartbeat has been started, turn the last stored stamp
 *             into an elapsed time and exchange it with the other replicas.
 *   tag == 0: currently a no-op.
 */
void Timing::markTimeline(int tag) {
  if (tag == 0) {
    // TODO: if tag == 0 then single heartbeat mode not deltas
    return;
  }

  std::list<double>& ownTimes = timer.heartbeatTimes.at(getTeam());

  if (tag > 0) {
    ownTimes.push_back(PMPI_Wtime());
    return;
  }

  // tag < 0: nothing to close if no heartbeat was ever opened
  if (ownTimes.empty()) {
    return;
  }

  ownTimes.back() = PMPI_Wtime() - ownTimes.back();
  compareProgressWithReplicas();
}

77
78
void Timing::markTimeline(int tag, const void *sendbuf, int sendcount, MPI_Datatype sendtype) {
  markTimeline(tag);
79
80
81
  compareBufferWithReplicas(sendbuf, sendcount, sendtype);
}

82
void Timing::compareProgressWithReplicas() {
Ben Hazelwood's avatar
Ben Hazelwood committed
83
84
  for (int r=0; r < getNumberOfTeams(); r++) {
    if (r != getTeam()) {
85
86
87
      // Send out this replica's delta
      timer.heartbeatTimeRequests.at(r).push_back(MPI_Request());
      PMPI_Isend(&timer.heartbeatTimes.at(getTeam()).back(), 1, MPI_DOUBLE,
Ben Hazelwood's avatar
Ben Hazelwood committed
88
                mapTeamToWorldRank(getTeamRank(), r), getTeam(),
89
                getLibComm(), &timer.heartbeatTimeRequests.at(r).back());
90

91
92

      // Receive deltas from other replicas
93
      timer.heartbeatTimes.at(r).push_back(0.0);
Ben Hazelwood's avatar
Ben Hazelwood committed
94
      timer.heartbeatTimeRequests.at(r).push_back(MPI_Request());
95
      PMPI_Irecv(&timer.heartbeatTimes.at(getTeam()).back(), 1, MPI_DOUBLE,
Ben Hazelwood's avatar
Ben Hazelwood committed
96
                 mapTeamToWorldRank(getTeamRank(), r), r, getLibComm(), &timer.heartbeatTimeRequests.at(r).back());
97

98
99
100
101
102
103
104
105
106
107
108
      auto it = timer.heartbeatTimeRequests.at(r).begin();
      while (it != timer.heartbeatTimeRequests.at(r).end()) {
        int flag;
        PMPI_Test(&(*it), &flag, MPI_STATUS_IGNORE);
        if (flag) {
          if (!((*it) == MPI_REQUEST_NULL)){
            MPI_Request_free(&(*it));
          }
          it = timer.heartbeatTimeRequests.at(r).erase(it);
        }
        ++it;
109
110
111
112
113
114
115
      }
    }
  }
}

void Timing::compareBufferWithReplicas(const void *sendbuf, int sendcount, MPI_Datatype sendtype) {
  if (getShouldCorruptData()) {
Ben Hazelwood's avatar
Ben Hazelwood committed
116
117
    //TODO can remove const here via cast (assuming data was originally non-const) and corrupt properly, no need for now
    sendcount++; // This isn't really that safe either...likely causes memory corruption occasionally
118
119
120
121
122
123
124
125
126
    setShouldCorruptData(false);
  }

  int typeSize;
  MPI_Type_size(sendtype, &typeSize);

  std::string bits((const char*)sendbuf, sendcount*typeSize);
  std::hash<std::string> hash_fn;
  std::size_t hash = hash_fn(bits);
Ben Hazelwood's avatar
Ben Hazelwood committed
127
  timer.heartbeatHashes.at(getTeam()).push_back((std::size_t)hash);
128
129
130
131
132

  for (int r=0; r < getNumberOfTeams(); r++) {
    if (r != getTeam()) {
      // Send out this replica's times
      MPI_Request request;
Ben Hazelwood's avatar
Ben Hazelwood committed
133
      PMPI_Isend(&timer.heartbeatHashes.at(getTeam()).back(), 1, TMPI_SIZE_T,
134
135
136
137
138
                mapTeamToWorldRank(getTeamRank(), r), getTeam(),
                getLibComm(), &request);
      MPI_Request_free(&request);

      // Receive times from other replicas
Ben Hazelwood's avatar
Ben Hazelwood committed
139
140
141
142
      timer.heartbeatHashes.at(r).push_back(0);
      timer.heartbeatHashRequests.at(r).push_back(MPI_Request());
      PMPI_Irecv(&timer.heartbeatHashes.at(r).back(), 1, TMPI_SIZE_T,
                 mapTeamToWorldRank(getTeamRank(), r), r, getLibComm(), &timer.heartbeatHashRequests.at(r).back());
143

144
145
146
147
148
149
150
151
      // // Test for completion of Irecv's
      // int numPending = 0;
      // for (int i=0; i < timer.heartbeatHashRequests.at(r).size(); i++) {
      //   int flag = 0;
      //   PMPI_Test(&timer.heartbeatHashRequests.at(r).at(i), &flag, MPI_STATUS_IGNORE);
      //   numPending += 1 - flag;
      // }
      // std::cout << "Num pending: " << numPending << "\n";
152
153
    }
  }
Ben Hazelwood's avatar
Ben Hazelwood committed
154
155
}

Ben Hazelwood's avatar
Ben Hazelwood committed
156
157
158
159
160
/*
 * Append the current PMPI_Wtime() to the list of sleep points.
 */
void Timing::sleepRankRaised() {
  const double now = PMPI_Wtime();
  timer.sleepPoints.push_back(now);
}


Ben Hazelwood's avatar
Ben Hazelwood committed
161
void Timing::outputTiming() {
162
163
  std::cout.flush();
  PMPI_Barrier(MPI_COMM_WORLD);
Ben Hazelwood's avatar
Ben Hazelwood committed
164

165
166
  std::string filenamePrefix = getEnvString("TMPI_FILE");

167
  // Output simple replica timings
168
  if ((getTeamRank() == MASTER) && (getWorldRank() != MASTER)) {
Ben Hazelwood's avatar
Ben Hazelwood committed
169
    PMPI_Send(&timer.endTime, 1, MPI_DOUBLE, MASTER, 0, getLibComm());
Ben Hazelwood's avatar
Ben Hazelwood committed
170
171
  }

172

173
174
175
176
  if (getWorldRank() == MASTER) {
    std::cout << std::endl;
    std::cout << "----------TMPI_TIMING----------\n";
    std::cout << "timing_file=";
177
    std::cout << (filenamePrefix.empty() ? "timing_not_enabled" : filenamePrefix) << "\n";
Ben Hazelwood's avatar
Ben Hazelwood committed
178
179
    std::cout << "num_replicas=" << getNumberOfTeams() << "\n";
    for (int i=0; i < getNumberOfTeams(); i++) {
180
181
182
183
      double rEndTime = 0.0;
      if (i == MASTER) {
        rEndTime = timer.endTime;
      } else {
Ben Hazelwood's avatar
Ben Hazelwood committed
184
        PMPI_Recv(&rEndTime, 1, MPI_DOUBLE, mapTeamToWorldRank(MASTER, i), 0, getLibComm(), MPI_STATUS_IGNORE);
185
186
      }

Ben Hazelwood's avatar
Ben Hazelwood committed
187
      std::cout << "replica " << i << "=" << rEndTime - timer.startTime << "s\n";
188
189
190
191
192
    }
    std::cout << "-------------------------------\n";
  }
  std::cout.flush();
  PMPI_Barrier(MPI_COMM_WORLD);
193

194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
  if (!filenamePrefix.empty()) {
    // Write Generic Sync points to files
    char sep = ',';
    std::ostringstream filename;
    std::string outputFolder("tmpi-timings");
    filename << outputFolder << "/"
        << filenamePrefix << "-"
        << getWorldRank() << "-"
        << getTeamRank() << "-"
        << getTeam()
        << ".csv";
    std::ofstream f;
    f.open(filename.str().c_str());

    logInfo("Writing timings to " << filename);

    f << "endTime" << sep << timer.endTime - timer.startTime << "\n";

Ben Hazelwood's avatar
Ben Hazelwood committed
212
213
    f << "heartbeatTimes";
    for (const double& t : timer.heartbeatTimes.at(getTeam())) {
Ben Hazelwood's avatar
Ben Hazelwood committed
214
      f << sep << t;
215
216
    }
    f << "\n";
217

Ben Hazelwood's avatar
Ben Hazelwood committed
218
    f << "sleepPoints";
Ben Hazelwood's avatar
Ben Hazelwood committed
219
220
221
    for (const double& t : timer.sleepPoints) {
      f << sep << t - timer.startTime;
    }
222
223
    f.close();
  }
224
225

  PMPI_Barrier(MPI_COMM_WORLD);
226
227
}