Skip to content

Commit 2fef6d5

Browse files
wiechula authored and davidrohr committed
TPC: Throttle error messages in case of buggy raw data
After 10 reported errors, suppress further error messages for 10 min
1 parent 0dfe1e4 commit 2fef6d5

File tree

1 file changed

+30
-2
lines changed

1 file changed

+30
-2
lines changed

Detectors/TPC/workflow/src/CalibProcessingHelper.cxx

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,13 @@
99
// granted to it by virtue of its status as an Intergovernmental Organization
1010
// or submit itself to any jurisdiction.
1111

12+
#include <fmt/core.h>
13+
#include <unordered_map>
1214
#include <vector>
1315
#include <algorithm>
16+
#include <chrono>
17+
#include <fmt/format.h>
18+
#include <fmt/chrono.h>
1419

1520
#include "Framework/ConcreteDataMatcher.h"
1621
#include "Framework/InputRecordWalker.h"
@@ -148,8 +153,31 @@ uint64_t calib_processing_helper::processRawData(o2::framework::InputRecord& inp
148153
}
149154

150155
} catch (const std::exception& e) {
151-
LOGP(alarm, "EXCEPTIION in processRawData: {} -> skipping part:{}/{} of spec:{}/{}/{}, size:{}", e.what(), dh->splitPayloadIndex, dh->splitPayloadParts,
152-
dh->dataOrigin, dh->dataDescription, subSpecification, payloadSize);
156+
// error message throttling
157+
using namespace std::literals::chrono_literals;
158+
static std::unordered_map<uint32_t, size_t> nErrorPerSubspec;
159+
static std::chrono::time_point<std::chrono::steady_clock> lastReport = std::chrono::steady_clock::now();
160+
const auto now = std::chrono::steady_clock::now();
161+
static size_t reportedErrors = 0;
162+
const size_t MAXERRORS = 10;
163+
const auto sleepTime = 10min;
164+
++nErrorPerSubspec[subSpecification];
165+
166+
if ((now - lastReport) < sleepTime) {
167+
if (reportedErrors < MAXERRORS) {
168+
++reportedErrors;
169+
std::string sleepInfo;
170+
if (reportedErrors == MAXERRORS) {
171+
sleepInfo = fmt::format(", maximum error count ({}) reached, not reporting for the next {}", MAXERRORS, sleepTime);
172+
}
173+
LOGP(alarm, "EXCEPTIION in processRawData: {} -> skipping part:{}/{} of spec:{}/{}/{}, size:{}, error count for subspec: {}{}", e.what(), dh->splitPayloadIndex, dh->splitPayloadParts,
174+
dh->dataOrigin, dh->dataDescription, subSpecification, payloadSize, nErrorPerSubspec.at(subSpecification), sleepInfo);
175+
lastReport = now;
176+
}
177+
} else {
178+
lastReport = now;
179+
reportedErrors = 0;
180+
}
153181
errorCount++;
154182
continue;
155183
}

0 commit comments

Comments
 (0)