|
30 | 30 | #include "Framework/RuntimeError.h" |
31 | 31 | #include "Framework/RateLimiter.h" |
32 | 32 | #include "Framework/PluginManager.h" |
| 33 | +#include "Framework/Signpost.h" |
33 | 34 | #include <Monitoring/Monitoring.h> |
34 | 35 |
|
35 | 36 | #include <fairmq/Device.h> |
| 37 | +#include <uv.h> |
36 | 38 | #include <fstream> |
37 | 39 | #include <functional> |
38 | 40 | #include <memory> |
39 | 41 | #include <string> |
40 | 42 |
|
41 | 43 | using namespace o2::framework::data_matcher; |
42 | 44 |
|
| 45 | +// Special log to track callbacks we know about |
| 46 | +O2_DECLARE_DYNAMIC_LOG(callbacks); |
| 47 | + |
43 | 48 | namespace o2::framework |
44 | 49 | { |
45 | 50 |
|
@@ -145,27 +150,46 @@ DataProcessorSpec CommonDataProcessors::getGlobalFairMQSink(std::vector<InputSpe |
145 | 150 |
|
146 | 151 | void retryMetricCallback(uv_async_t* async) |
147 | 152 | { |
| 153 | + O2_SIGNPOST_ID_FROM_POINTER(cid, callbacks, async); |
| 154 | + O2_SIGNPOST_EVENT_EMIT(callbacks, cid, "rate-limiting", "Attempting again propagating rate-limiting information."); |
| 155 | + |
| 156 | + // Check if this is a source device |
148 | 157 | static size_t lastTimeslice = -1; |
149 | 158 | auto* services = (ServiceRegistryRef*)async->data; |
150 | 159 | auto& timesliceIndex = services->get<TimesliceIndex>(); |
151 | 160 | auto* device = services->get<RawDeviceService>().device(); |
152 | 161 | auto channel = device->GetChannels().find("metric-feedback"); |
153 | 162 | auto oldestPossingTimeslice = timesliceIndex.getOldestPossibleOutput().timeslice.value; |
154 | 163 | if (channel == device->GetChannels().end()) { |
| 164 | + O2_SIGNPOST_EVENT_EMIT(callbacks, cid, "rate-limiting", "Could not find metric-feedback channel."); |
155 | 165 | return; |
156 | 166 | } |
157 | 167 | fair::mq::MessagePtr payload(device->NewMessage()); |
158 | 168 | payload->Rebuild(&oldestPossingTimeslice, sizeof(int64_t), nullptr, nullptr); |
159 | 169 | auto consumed = oldestPossingTimeslice; |
160 | 170 |
|
| 171 | + size_t start = uv_hrtime(); |
161 | 172 | int64_t result = channel->second[0].Send(payload, 100); |
| 173 | + size_t stop = uv_hrtime(); |
162 | 174 | // If the sending worked, we do not retry. |
163 | | - if (result != 0) { |
| 175 | + if (result <= 0) { |
| 176 | + // Forcefully slow down in case FairMQ returns earlier than expected... |
| 177 | + int64_t ellapsed = (stop - start) / 1000000; |
| 178 | + if (ellapsed < 100) { |
| 179 | + O2_SIGNPOST_EVENT_EMIT(callbacks, cid, "rate-limiting", |
| 180 | + "FairMQ returned %llu earlier than expected. Sleeping %llu ms more before, retrying.", |
| 181 | + result, ellapsed); |
| 182 | + uv_sleep(100 - ellapsed); |
| 183 | + } else { |
| 184 | + O2_SIGNPOST_EVENT_EMIT(callbacks, cid, "rate-limiting", |
| 185 | + "FairMQ returned %llu, unable to send last consumed timeslice to source for %llu ms, retrying.", result, ellapsed); |
| 186 | + } |
164 | 187 | // If the sending did not work, we keep trying until it actually works. |
165 | 188 | // This will schedule other tasks in the queue, so the processing of the |
166 | 189 | // data will still happen. |
167 | 190 | uv_async_send(async); |
168 | 191 | } else { |
| 192 | + O2_SIGNPOST_EVENT_EMIT(callbacks, cid, "rate-limiting", "Send %llu bytes, Last timeslice now set to %zu.", result, consumed); |
169 | 193 | lastTimeslice = consumed; |
170 | 194 | } |
171 | 195 | } |
|
0 commit comments