DcdbPusher.cpp 8.82 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
// This program is to be used with Caliper.
// Caliper belongs to Lawrence Livermore National Security, LLC.
// LLNL-CODE-678900
// All rights reserved.
//
// For details, see https://github.com/scalability-llnl/Caliper.

//================================================================================
// Name        : DcdbPusher.cpp
// Author      : Micha Mueller
// Copyright   : Leibniz Supercomputing Centre
// Description : Caliper service to forward snapshot data to dcdb pusher.
//================================================================================

//================================================================================
// This file is part of DCDB (DataCenter DataBase)
// Copyright (C) 2019-2019 Leibniz Supercomputing Centre
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
//================================================================================

/**
 * @file DcdbPusher.cpp
 *
 * @brief Caliper service to forward snapshot data to dcdb pusher.
 *
 * @details Service for the runtime processing level. Relies on the sampler,
 *          symbollookup and timestamp service. To avoid a detour to the flush
 *          level the symbollookup service is invoked manually when processing a
 *          snapshot. This way all required information is available without
 *          having to explicitly trigger a flush and the relevant data can be
 *          instantly forwarded. Unneeded attributes are stripped off and
 *          discarded.
 *
 */

#include "caliper/CaliperService.h"

#include "caliper/Caliper.h"
#include "caliper/SnapshotRecord.h"

#include "caliper/common/Log.h"
#include "caliper/common/RuntimeConfig.h"

#include <errno.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <unistd.h>

#include <cstdio>
#include <string>
#include <vector>

63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
using namespace cali;

namespace {

class DcdbPusher {

private:

    static const ConfigSet::Entry s_configdata[];

    //TODO reduce snapshot frequency during runtime
    unsigned snapshot_threshold = 10;

    unsigned threshold_exceeded  = 0;
    unsigned snapshots_processed = 0;
    unsigned snapshots_failed    = 0;

    bool initialized = false;

    Attribute sampler    { Attribute::invalid };
    Attribute symbol_fun { Attribute::invalid };
    Attribute timestamp  { Attribute::invalid };

86
87
    int sock;

88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
    void post_init_cb(Caliper* c, Channel* chn) {
        // manually issue a pre-flush event to set up the SymbolLookup service
        chn->events().pre_flush_evt(c, chn, nullptr);

        // Check if required services sampler, symbollookup and timestamp are
        // active by searching for identifying attributes.

        //TODO support data collection if event triggered snapshots are used
        sampler    = c->get_attribute("cali.sampler.pc");
        symbol_fun = c->get_attribute("source.function#cali.sampler.pc");
        timestamp  = c->get_attribute("time.timestamp");

        if (sampler    == Attribute::invalid ||
            symbol_fun == Attribute::invalid ||
            timestamp  == Attribute::invalid) {

            Log(1).stream() << chn->name() << ": DcdbPusher: not all required services "
                    "sampler, symbollookup and timestamp are running." << std::endl;
            return;
        }

109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
        sock = socket(AF_UNIX, SOCK_SEQPACKET, 0);

        if(sock == -1) {
            Log(1).stream() << chn->name() << ": DcdbPusher: Failed to open socket: "
                    << strerror(errno) << std::endl;
            return;
        }

        sockaddr_un addr;
        memset(&addr, 0, sizeof(struct sockaddr_un));

        addr.sun_family = AF_UNIX;
        snprintf(&addr.sun_path[1], 91, "DCDBPusherCaliSocket");

        if(connect(sock, (struct sockaddr*) &addr, sizeof(addr))) {
            Log(1).stream() << chn->name() << ": DcdbPusher: Failed to connect socket: "
                    << strerror(errno) << std::endl;
            close(sock);
            sock = -1;
            return;
        }

131
132
133
134
135
136
        initialized = true;
    }

    void process_snapshot_cb(Caliper* c, Channel* chn, const SnapshotRecord*, const SnapshotRecord* sbuf) {
        ++snapshots_processed;
        //TODO check if signal safety is required
137
138
        //TODO sampler invoked snapshots are always signals, so better do
        //     nothing signal "un-safe"
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
        //if (c->is_signal()) {
        //    ++snapshots_failed;
        //    return;
        //}

        if (!initialized) {
            ++snapshots_failed;
            return;
        }

        SnapshotRecord::Sizes sizes = sbuf->size();

        if ((sizes.n_nodes + sizes.n_immediate) == 0) {
            ++snapshots_failed;
            return;
        }

        std::vector<Entry> rec = sbuf->to_entrylist();

        // manually issue a postprocess-snapshot event to enrich snapshot with
        // symbollookup data
        chn->events().postprocess_snapshot(c, chn, rec);

162
163
164
165
        const size_t bufSize = 2048;
        char buf[bufSize];
        std::string time;
        std::string value;
166
167
168
169
170

        for (const Entry& e : rec) {

            //print attribute values for timestamp and looked up function symbol name
            cali_id_t entryId = c->get_attribute(e.attribute()).id();
171
172
173
174
            if (entryId == timestamp.id()) {
                time = e.value().to_string();
            } else if (entryId == symbol_fun.id()) {
                value = e.value().to_string();
175
176
            }
        }
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195

        if ((value.length() + time.length() + 2) > 2048) {
            Log(1).stream() << chn->name() << ": DcdbPusher: value exceeding buffer size" << std::endl;
            ++snapshots_failed;
            return;
        }

        Log(1).stream() << chn->name() << ": DcdbPusher: sending value "
                << value << " (" << time << ")" << std::endl << std::endl;

        strncpy(buf, time.c_str(), time.length()+1);
        strncpy(&buf[time.length()+1], value.c_str(), value.length()+1);

        if (send(sock, buf, value.length() + time.length() + 2, 0) == -1) {
            Log(1).stream() << chn->name() << ": DcdbPusher: Failed to send message: "
                    << strerror(errno) << std::endl;
            ++snapshots_failed;
        }

196
197
198
    }

    void finish_cb(Caliper* c, Channel* chn) {
199
200
201
202
203
        if(initialized) {
            shutdown(sock, SHUT_WR);
            close(sock);
        }

204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
        if (threshold_exceeded > 0) {
            Log(1).stream() << chn->name() << ": DcdbPusher: threshold was exceeded "
                                           << threshold_exceeded << " times." << std::endl;
        }

        Log(1).stream() << chn->name() << ": DcdbPusher: "
                        << snapshots_processed << " snapshots processed of which "
                        << snapshots_failed    << " failed." << std::endl;
    }

    DcdbPusher(Caliper* c, Channel* chn) {
        ConfigSet config = chn->config().init("dcdbpusher", s_configdata);

        snapshot_threshold = config.get("snapshot_threshold").to_uint();
    }

public:

    ~DcdbPusher() {

    }

    static void dcdbpusher_register(Caliper* c, Channel* chn) {
        DcdbPusher* instance = new DcdbPusher(c, chn);

        chn->events().post_init_evt.connect(
            [instance](Caliper* c, Channel* chn){
                instance->post_init_cb(c, chn);
            });
        chn->events().process_snapshot.connect(
            [instance](Caliper* c, Channel* chn, const SnapshotRecord* trigger, const SnapshotRecord* snapshot){
                instance->process_snapshot_cb(c, chn, trigger, snapshot);
            });
        chn->events().finish_evt.connect(
            [instance](Caliper* c, Channel* chn){
                instance->finish_cb(c, chn);
                delete instance;
            });

        Log(1).stream() << chn->name() << ": Registered dcdbpusher service" << std::endl;
    }
}; // class DcdbPusher

// Configuration entries of the "dcdbpusher" service; the list is closed by
// ConfigSet::Terminator. The "snapshot_threshold" value is read in the
// DcdbPusher constructor (default string "10", type CALI_TYPE_UINT).
// NOTE: no code in this file enforces the threshold yet (see the TODO at the
// snapshot_threshold member).
const ConfigSet::Entry DcdbPusher::s_configdata[] = {
    {
        "snapshot_threshold",
        CALI_TYPE_UINT,
        "10",
        "Maximum number of snapshots per second we are allowed to process",
        "Maximum number of snapshots per second we are allowed to process.\n"
        "If the threshold is exceeded snapshots get discarded until the\n"
        "limit is met again."
    },

    ConfigSet::Terminator
};

} // namespace

namespace cali {
    // Service descriptor for this file: pairs the service name "dcdbpusher"
    // with DcdbPusher::dcdbpusher_register, the function that creates an
    // instance for a channel and connects its callbacks.
    // NOTE(review): presumably referenced by Caliper's service list -- confirm.
    CaliperService dcdbpusher_service { "dcdbpusher", ::DcdbPusher::dcdbpusher_register };
}