Skip to content

Commit 981cd40

Browse files
committed
GPU: Add possibility to dump raw data in case of error
1 parent 546f793 commit 981cd40

13 files changed

+313
-50
lines changed

GPU/GPUTracking/Base/GPUReconstruction.cxx

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,7 @@ int32_t GPUReconstruction::Init()
193193
}
194194
mSlaves[i]->ClearAllocatedMemory();
195195
}
196+
debugInit();
196197
return 0;
197198
}
198199

@@ -469,6 +470,7 @@ int32_t GPUReconstruction::Exit()
469470
if (mInitialized) {
470471
ExitDevice();
471472
}
473+
debugExit();
472474
mInitialized = false;
473475
return 0;
474476
}

GPU/GPUTracking/Base/GPUReconstruction.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
#include <memory>
2323
#include <iosfwd>
2424
#include <vector>
25+
#include <functional>
2526
#include <unordered_map>
2627
#include <unordered_set>
2728

@@ -239,6 +240,9 @@ class GPUReconstruction
239240
virtual void PrintKernelOccupancies() {}
240241
double GetStatKernelTime() { return mStatKernelTime; }
241242
double GetStatWallTime() { return mStatWallTime; }
243+
void setDebugDumpCallback(std::function<void()>&& callback = std::function<void()>(nullptr));
244+
bool triggerDebugDump();
245+
std::string getDebugFolder(const std::string& prefix = ""); // empty string = no debug
242246

243247
// Threading
244248
std::shared_ptr<GPUReconstructionThreading> mThreading;
@@ -407,6 +411,13 @@ class GPUReconstruction
407411
};
408412
static std::shared_ptr<LibraryLoader> sLibCUDA, sLibHIP, sLibOCL;
409413

414+
// Debugging
415+
struct debugInternal;
416+
static std::unique_ptr<debugInternal> mDebugData;
417+
bool mDebugEnabled = false;
418+
void debugInit();
419+
void debugExit();
420+
410421
static GPUReconstruction* GPUReconstruction_Create_CPU(const GPUSettingsDeviceBackend& cfg);
411422
};
412423

Lines changed: 188 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,188 @@
1+
// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
2+
// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
3+
// All rights not expressly granted are reserved.
4+
//
5+
// This software is distributed under the terms of the GNU General Public
6+
// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
7+
//
8+
// In applying this license CERN does not waive the privileges and immunities
9+
// granted to it by virtue of its status as an Intergovernmental Organization
10+
// or submit itself to any jurisdiction.
11+
12+
/// \file GPUReconstructionDebug.cxx
13+
/// \author David Rohr
14+
15+
#include "GPUReconstruction.h"
#include "GPULogging.h"
#include "GPUSettings.h"

#include <chrono>
#include <csignal>
#include <cstring>
#include <ctime>
#include <filesystem>
#include <format>
#include <functional>
#include <iomanip>
#include <mutex>
#include <sstream>
#include <string>
#include <system_error>
#include <unordered_map>
26+
27+
using namespace o2::gpu;
28+
29+
/// Shared state of the debug-dump-on-failure machinery.
/// One instance is held in the static GPUReconstruction::mDebugData.
struct GPUReconstruction::debugInternal {
  /// Handler body executed by globalCallback(); assigned in debugInit().
  std::function<void(int32_t, siginfo_t*, void*)> signalCallback;
  /// Dump routine registered via setDebugDumpCallback(); reset to empty after each run.
  std::function<void()> debugCallback = nullptr;
  /// Set by the signal handler after it forwarded a signal; re-registers our handler for that signal.
  std::function<void()> reinstallCallback = nullptr;
  /// Signal actions that were active before our handler was installed, keyed by signal number.
  std::unordered_map<int32_t, struct sigaction> oldActions;
  /// Number of times a debug callback has been executed.
  size_t debugCount = 0;

  /// Plain-function trampoline registered with sigaction(); forwards to signalCallback.
  static void globalCallback(int32_t signal, siginfo_t* info, void* ucontext)
  {
    // A signal may arrive while the handler is registered but signalCallback is still empty;
    // invoking an empty std::function would throw std::bad_function_call out of the handler.
    debugInternal* const data = GPUReconstruction::mDebugData.get();
    if (data != nullptr && data->signalCallback) {
      data->signalCallback(signal, info, ucontext);
    }
  }
};
40+
41+
std::unique_ptr<GPUReconstruction::debugInternal> GPUReconstruction::mDebugData;
42+
43+
void GPUReconstruction::debugInit()
44+
{
45+
if (GetProcessingSettings().debugOnFailure) {
46+
static std::mutex initMutex;
47+
{
48+
std::lock_guard<std::mutex> guard(initMutex);
49+
if (mDebugData) {
50+
GPUFatal("Error handlers for debug dumps already set, cannot set them again");
51+
}
52+
mDebugData = std::make_unique<debugInternal>();
53+
}
54+
mDebugEnabled = true;
55+
if ((GetProcessingSettings().debugOnFailure & 1) || (GetProcessingSettings().debugOnFailure & 2)) {
56+
struct sigaction sa, oldsa;
57+
memset(&sa, 0, sizeof(sa));
58+
sa.sa_sigaction = GPUReconstruction::debugInternal::globalCallback;
59+
sa.sa_flags = SA_SIGINFO;
60+
uint32_t mask = GetProcessingSettings().debugOnFailureSignalMask == (uint32_t)-1 ? ((1 << SIGINT) | (1 << SIGABRT) | (1 << SIGBUS) | (1 << SIGTERM) | (1 << SIGSEGV)) : GetProcessingSettings().debugOnFailureSignalMask;
61+
if (mask) {
62+
for (uint32_t i = 0; i < sizeof(mask) * 8; i++) {
63+
if (mask & (1 << i)) {
64+
if (sigaction(i, &sa, &oldsa)) {
65+
GPUFatal("Error installing signal handler for error dump on signal %d", i);
66+
}
67+
mDebugData->oldActions.emplace(i, oldsa);
68+
}
69+
}
70+
}
71+
72+
mDebugData->signalCallback = [this, &oldActions = mDebugData->oldActions, myAction = std::move(sa)](int32_t signal, siginfo_t* info, void* ucontext) {
73+
static std::mutex callbackMutex;
74+
std::lock_guard<std::mutex> guard(callbackMutex);
75+
if (mDebugData->debugCallback) {
76+
GPUInfo("Running debug callback for signal %d", signal);
77+
mDebugData->debugCallback();
78+
mDebugData->debugCount++;
79+
}
80+
mDebugData->debugCallback = nullptr;
81+
if (!GetProcessingSettings().debugOnFailureNoForwardSignal) {
82+
sigaction(signal, &oldActions[signal], nullptr);
83+
raise(signal);
84+
mDebugData->reinstallCallback = [signal, myAction]() { sigaction(signal, &myAction, nullptr); };
85+
}
86+
};
87+
}
88+
}
89+
}
90+
91+
void GPUReconstruction::debugExit()
92+
{
93+
if (!mDebugEnabled) {
94+
return;
95+
}
96+
if (mDebugData) {
97+
for (auto& it : mDebugData->oldActions) {
98+
if (sigaction(it.first, &it.second, nullptr)) {
99+
GPUFatal("Error restoring signal handler for signal %d", it.first);
100+
}
101+
}
102+
}
103+
mDebugEnabled = false;
104+
}
105+
106+
void GPUReconstruction::setDebugDumpCallback(std::function<void()>&& callback)
107+
{
108+
if (mMaster) {
109+
if (mDebugData->reinstallCallback) {
110+
mDebugData->reinstallCallback();
111+
mDebugData->reinstallCallback = nullptr;
112+
}
113+
mMaster->setDebugDumpCallback(std::move(callback));
114+
} else if (mDebugEnabled && mDebugData) {
115+
mDebugData->debugCallback = callback;
116+
}
117+
}
118+
119+
std::string GPUReconstruction::getDebugFolder(const std::string& prefix)
120+
{
121+
const std::filesystem::path target_dir = GetProcessingSettings().debugOnFailureDirectory;
122+
123+
std::size_t total_size = 0;
124+
std::size_t subfolder_count = 0;
125+
126+
if (!std::filesystem::exists(target_dir) || !std::filesystem::is_directory(target_dir)) {
127+
GPUError("Invalid debugOnFailureDirectory %s", GetProcessingSettings().debugOnFailureDirectory.c_str());
128+
return "";
129+
}
130+
131+
for (const auto& entry : std::filesystem::directory_iterator(target_dir)) {
132+
if (entry.is_directory()) {
133+
subfolder_count++;
134+
135+
for (const auto& subentry : std::filesystem::directory_iterator(entry.path())) {
136+
if (subentry.is_regular_file()) {
137+
std::error_code ec;
138+
auto size = std::filesystem::file_size(subentry.path(), ec);
139+
if (!ec) {
140+
total_size += size;
141+
}
142+
}
143+
}
144+
}
145+
}
146+
147+
if ((GetProcessingSettings().debugOnFailureMaxFiles && subfolder_count >= GetProcessingSettings().debugOnFailureMaxFiles) || (GetProcessingSettings().debugOnFailureMaxSize && (total_size >> 30) >= GetProcessingSettings().debugOnFailureMaxSize)) {
148+
GPUError("Cannot store debug dump files, target storage exceeded: %zu dumps, %zu bytes", subfolder_count, total_size);
149+
return "";
150+
}
151+
152+
auto currentTime = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now());
153+
std::ostringstream dateTime;
154+
dateTime << std::put_time(std::localtime(&currentTime), "%Y-%m-%d_%H-%M-%S");
155+
156+
int32_t attempt = 0;
157+
std::string outname;
158+
while (true) {
159+
if (attempt++ >= 512) {
160+
GPUError("Error creating debug dump folder");
161+
return "";
162+
}
163+
164+
outname = GetProcessingSettings().debugOnFailureDirectory + "/debug_" + prefix + (prefix == "" ? "" : "_") + dateTime.str() + "_" + std::to_string(attempt);
165+
std::error_code ec;
166+
bool created = std::filesystem::create_directory(outname, ec);
167+
if (!ec && created) {
168+
break;
169+
}
170+
}
171+
172+
GPUInfo("Debug dump to %s", outname.c_str());
173+
return outname;
174+
}
175+
176+
bool GPUReconstruction::triggerDebugDump()
177+
{
178+
if (mMaster) {
179+
return mMaster->triggerDebugDump();
180+
} else if (mDebugEnabled && mDebugData && mDebugData->debugCallback) {
181+
GPUInfo("Running triggered debug callback");
182+
mDebugData->debugCallback();
183+
mDebugData->debugCount++;
184+
mDebugData->debugCallback = nullptr;
185+
return true;
186+
}
187+
return false;
188+
}

GPU/GPUTracking/Base/GPUReconstructionLibrary.cxx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
// granted to it by virtue of its status as an Intergovernmental Organization
1010
// or submit itself to any jurisdiction.
1111

12-
/// \file GPUReconstruction.cxx
12+
/// \file GPUReconstructionLibrary.cxx
1313
/// \author David Rohr
1414

1515
#ifdef _WIN32

GPU/GPUTracking/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,7 @@ set(SRCS_NO_CINT
9595
set(SRCS_NO_H SectorTracker/GPUTPCTrackerDump.cxx
9696
Merger/GPUTPCGMMergerDump.cxx
9797
Base/GPUReconstructionLibrary.cxx
98+
Base/GPUReconstructionDebug.cxx
9899
Global/GPUChainTrackingClusterizer.cxx
99100
Global/GPUChainTrackingTransformation.cxx
100101
Global/GPUChainTrackingTRD.cxx

GPU/GPUTracking/Definitions/GPUSettingsList.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -360,6 +360,14 @@ AddOption(oclCompileFromSources, bool, false, "", 0, "Compile OpenCL binary from
360360
AddOption(oclOverrideSourceBuildFlags, std::string, "", "", 0, "Override OCL build flags for compilation from source, put a space for empty options")
361361
AddOption(printSettings, bool, false, "", 0, "Print all settings when initializing")
362362
AddOption(tpcFreeAllocatedMemoryAfterProcessing, bool, false, "", 0, "Clean all memory allocated by TPC when TPC processing done, only data written to external output resources will remain")
363+
AddOption(debugOnFailure, int32_t, 0, "", 0, "Dump raw data in case an error occured, bit 1 enables all dumps, otherwise bitmask for: 2 = signal, 3 = GPUErrorCode", def(1))
364+
AddOption(debugOnFailureSignalMask, uint32_t, (uint32_t)-1, "", 0, "Mask of signals that trigger debug / dump")
365+
AddOption(debugOnFailureErrorMask, uint64_t, (uint64_t)-1, "", 0, "Mask of GPUCA_ERRORS that trigger debug / dump")
366+
AddOption(debugOnFailureNoForwardSignal, bool, false, "", 0, "Do not forward signal to original signal handler")
367+
AddOption(debugOnFailureMaxN, uint32_t, 1, "", 0, "Max number of times to run the debug / dump")
368+
AddOption(debugOnFailureMaxFiles, uint32_t, 0, "", 0, "Max number of files to have in the target folder")
369+
AddOption(debugOnFailureMaxSize, uint32_t, 0, "", 0, "Max size of existing dumps in the target folder in GB")
370+
AddOption(debugOnFailureDirectory, std::string, ".", "", 0, "Target folder for debug / dump")
363371
AddVariable(eventDisplay, o2::gpu::GPUDisplayFrontendInterface*, nullptr)
364372
AddSubConfig(GPUSettingsProcessingRTC, rtc)
365373
AddSubConfig(GPUSettingsProcessingRTCtechnical, rtctech)

GPU/GPUTracking/Global/GPUChainTracking.cxx

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -705,10 +705,14 @@ int32_t GPUChainTracking::RunChain()
705705
}
706706
mRec->getGeneralStepTimer(GeneralStep::Prepare).Stop();
707707

708-
PrepareDebugOutput();
708+
PrepareKernelDebugOutput();
709709

710710
SynchronizeStream(0); // Synchronize all init copies that might be ongoing
711711

712+
if (GetProcessingSettings().debugOnFailure) {
713+
mRec->setDebugDumpCallback([this]() { DoDebugRawDump(); });
714+
}
715+
712716
if (mIOPtrs.tpcCompressedClusters) {
713717
if (runRecoStep(RecoStep::TPCDecompression, &GPUChainTracking::RunTPCDecompression)) {
714718
return 1;
@@ -775,7 +779,7 @@ int32_t GPUChainTracking::RunChain()
775779
}
776780

777781
int32_t retVal = 0;
778-
if (CheckErrorCodes(false, false, mRec->getErrorCodeOutput())) {
782+
if (CheckErrorCodes(false, false, mRec->getErrorCodeOutput())) { // TODO: Eventually, we should use GPUReconstruction::CheckErrorCodes
779783
retVal = 3;
780784
if (!GetProcessingSettings().ignoreNonFatalGPUErrors) {
781785
return retVal;
@@ -815,7 +819,7 @@ int32_t GPUChainTracking::RunChainFinalize()
815819
PrintOutputStat();
816820
}
817821

818-
PrintDebugOutput();
822+
PrintKernelDebugOutput();
819823

820824
// PrintMemoryRelations();
821825

@@ -884,6 +888,7 @@ int32_t GPUChainTracking::FinalizePipelinedProcessing()
884888
int32_t GPUChainTracking::CheckErrorCodes(bool cpuOnly, bool forceShowErrors, std::vector<std::array<uint32_t, 4>>* fillErrors)
885889
{
886890
int32_t retVal = 0;
891+
bool hasDebugError = false;
887892
for (int32_t i = 0; i < 1 + (!cpuOnly && mRec->IsGPU()); i++) {
888893
if (i) {
889894
const auto& threadContext = GetThreadContext();
@@ -925,9 +930,26 @@ int32_t GPUChainTracking::CheckErrorCodes(bool cpuOnly, bool forceShowErrors, st
925930
fillErrors->emplace_back(std::array<uint32_t, 4>{pErrors[4 * j], pErrors[4 * j + 1], pErrors[4 * j + 2], pErrors[4 * j + 3]});
926931
}
927932
}
933+
// Decide whether a debug dump shall be triggered: requires bit value 1 (all dumps) or
// 4 (GPUErrorCode) in debugOnFailure.
if (GetProcessingSettings().debugOnFailure & (1 | 4)) {
  const uint64_t debugErrorMask = GetProcessingSettings().debugOnFailureErrorMask;
  if (debugErrorMask == (uint64_t)-1) {
    hasDebugError = true; // mask with all bits set: no per-error filtering
  } else {
    uint32_t nErrors = processors()->errorCodes.getNErrors();
    const uint32_t* pErrors = processors()->errorCodes.getErrorPtr();
    for (uint32_t j = 0; j < nErrors; j++) {
      // First of the 4 words stored per error, used as bit index into the mask.
      const uint32_t code = pErrors[4 * j];
      // The mask is 64 bit wide, so the shift must be 64 bit as well: `1 << code` is an int shift
      // and overflows from code 31 on. Values >= 64 have no bit in the mask and never match.
      if (code < 64 && (debugErrorMask & (uint64_t{1} << code))) {
        hasDebugError = true;
        break;
      }
    }
  }
}
928947
}
929948
}
930949
ClearErrorCodes(cpuOnly);
950+
if (hasDebugError) {
951+
mRec->triggerDebugDump();
952+
}
931953
return retVal;
932954
}
933955

GPU/GPUTracking/Global/GPUChainTracking.h

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,7 @@ class GPUChainTracking : public GPUChain
134134
void ClearIOPointers();
135135
void AllocateIOMemory();
136136
using GPUChain::DumpData;
137-
void DumpData(const char* filename);
137+
void DumpData(const char* filename, const GPUTrackingInOutPointers* ioPtrs = nullptr);
138138
using GPUChain::ReadData;
139139
int32_t ReadData(const char* filename);
140140
void DumpSettings(const char* dir = "") override;
@@ -231,11 +231,12 @@ class GPUChainTracking : public GPUChain
231231
int32_t DoProfile();
232232
void PrintMemoryRelations();
233233
void PrintMemoryStatistics() override;
234-
void PrepareDebugOutput();
235-
void PrintDebugOutput();
234+
void PrepareKernelDebugOutput();
235+
void PrintKernelDebugOutput();
236236
void PrintOutputStat();
237237
static void DumpClusters(std::ostream& out, const o2::tpc::ClusterNativeAccess* clusters);
238238
static void DebugSortCompressedClusters(o2::tpc::CompressedClustersFlat* cls);
239+
void DoDebugRawDump();
239240

240241
bool ValidateSteps();
241242
bool ValidateSettings();

0 commit comments

Comments
 (0)