Skip to content

Commit b06d799

Browse files
committed
GPU: Add possibility to dump raw data in case of error
1 parent b47c7c6 commit b06d799

File tree

10 files changed

+128
-7
lines changed

10 files changed

+128
-7
lines changed

GPU/GPUTracking/Base/GPUReconstruction.cxx

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,7 @@ int32_t GPUReconstruction::Init()
193193
}
194194
mSlaves[i]->ClearAllocatedMemory();
195195
}
196+
debugInit();
196197
return 0;
197198
}
198199

@@ -469,6 +470,7 @@ int32_t GPUReconstruction::Exit()
469470
if (mInitialized) {
470471
ExitDevice();
471472
}
473+
debugExit();
472474
mInitialized = false;
473475
return 0;
474476
}

GPU/GPUTracking/Base/GPUReconstruction.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
#include <memory>
2323
#include <iosfwd>
2424
#include <vector>
25+
#include <functional>
2526
#include <unordered_map>
2627
#include <unordered_set>
2728

@@ -239,6 +240,7 @@ class GPUReconstruction
239240
virtual void PrintKernelOccupancies() {}
240241
double GetStatKernelTime() { return mStatKernelTime; }
241242
double GetStatWallTime() { return mStatWallTime; }
243+
void setDebugDumpCallback(std::function<void()>&& callback = std::function<void()>(nullptr));
242244

243245
// Threading
244246
std::shared_ptr<GPUReconstructionThreading> mThreading;
@@ -407,6 +409,13 @@ class GPUReconstruction
407409
};
408410
static std::shared_ptr<LibraryLoader> sLibCUDA, sLibHIP, sLibOCL;
409411

412+
// Debugging
413+
struct debugInternal;
414+
static std::unique_ptr<debugInternal> mDebugData;
415+
bool mDebugEnabled = false;
416+
void debugInit();
417+
void debugExit();
418+
410419
static GPUReconstruction* GPUReconstruction_Create_CPU(const GPUSettingsDeviceBackend& cfg);
411420
};
412421

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
2+
// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
3+
// All rights not expressly granted are reserved.
4+
//
5+
// This software is distributed under the terms of the GNU General Public
6+
// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
7+
//
8+
// In applying this license CERN does not waive the privileges and immunities
9+
// granted to it by virtue of its status as an Intergovernmental Organization
10+
// or submit itself to any jurisdiction.
11+
12+
/// \file GPUReconstructionDebug.cxx
13+
/// \author David Rohr
14+
15+
#include "GPUReconstruction.h"
16+
#include "GPULogging.h"
17+
#include "GPUSettings.h"
18+
19+
#include <signal.h>
20+
#include <functional>
21+
#include <unordered_map>
22+
#include <mutex>
23+
24+
using namespace o2::gpu;
25+
26+
/// \brief Process-wide state backing the "dump on failure" signal handlers.
///
/// A single instance is held by the static GPUReconstruction::mDebugData.
struct GPUReconstruction::debugInternal {
  /// Dispatcher invoked by globalCallback() with the arguments of the raw signal handler.
  std::function<void(int32_t, siginfo_t*, void*)> signalCallback;
  /// User supplied dump routine, registered through setDebugDumpCallback(); empty when none is set.
  std::function<void()> debugCallback = nullptr;
  /// Signal actions that were active before the handlers were installed, keyed by signal number.
  std::unordered_map<int32_t, struct sigaction> oldActions;

  /// \brief Plain function with the SA_SIGINFO handler signature; forwards to signalCallback.
  ///
  /// A std::function cannot be passed to sigaction(), hence this static trampoline.
  /// \param signal   Number of the signal being delivered.
  /// \param info     Signal information as provided by the system.
  /// \param ucontext Opaque context pointer as provided by the system.
  static void globalCallback(int32_t signal, siginfo_t* info, void* ucontext)
  {
    debugInternal* const data = GPUReconstruction::mDebugData.get();
    if (data != nullptr && data->signalCallback) {
      data->signalCallback(signal, info, ucontext);
      return;
    }
    // No dispatcher available (handler fired before it was assigned, or after the state was released).
    // Calling an empty std::function would throw out of a signal handler, so instead fall back to the
    // default disposition and re-raise: the signal is blocked while its handler runs (no SA_NODEFER),
    // hence the re-raised signal is delivered with the default action once this function returns.
    struct sigaction fallback = {};
    fallback.sa_handler = SIG_DFL;
    sigaction(signal, &fallback, nullptr);
    raise(signal);
  }
};
35+
36+
// Definition of the static member declared in GPUReconstruction.h. Being static, it is shared by all
// GPUReconstruction instances; it stays null until debugInit() creates the debugInternal state.
std::unique_ptr<GPUReconstruction::debugInternal> GPUReconstruction::mDebugData;
37+
38+
void GPUReconstruction::debugInit()
39+
{
40+
if (GetProcessingSettings().dumpOnFailure) {
41+
static std::mutex initMutex;
42+
{
43+
std::lock_guard<std::mutex> guard(initMutex);
44+
if (mDebugData) {
45+
GPUFatal("Error handlers for debug dumps already set, cannot set them again");
46+
}
47+
mDebugData = std::make_unique<debugInternal>();
48+
}
49+
mDebugEnabled = true;
50+
struct sigaction sa, oldsa;
51+
memset(&sa, 0, sizeof(sa));
52+
sa.sa_sigaction = GPUReconstruction::debugInternal::globalCallback;
53+
sa.sa_flags = SA_SIGINFO;
54+
int32_t sigs[] = {SIGINT, SIGABRT, SIGBUS, SIGTERM, SIGSEGV};
55+
for (uint32_t i = 0; i < sizeof(sigs) / sizeof(sigs[0]); i++) {
56+
if (sigaction(sigs[i], &sa, &oldsa)) {
57+
GPUFatal("Error installing signal handler for error dump on signal %d", sigs[i]);
58+
}
59+
mDebugData->oldActions.emplace(sigs[i], oldsa);
60+
}
61+
62+
mDebugData->signalCallback = [&oldActions = mDebugData->oldActions, myAction = std::move(sa)](int32_t signal, siginfo_t* info, void* ucontext) {
63+
static std::mutex callbackMutex;
64+
{
65+
std::lock_guard<std::mutex> guard(callbackMutex);
66+
GPUInfo("Received signal %d", signal);
67+
if (mDebugData->debugCallback) {
68+
GPUInfo("Running debug callback");
69+
mDebugData->debugCallback();
70+
}
71+
mDebugData->debugCallback = nullptr;
72+
sigaction(signal, &oldActions[signal], nullptr);
73+
printf("DEF SIGNAL\n");
74+
raise(signal);
75+
}
76+
sigaction(signal, &myAction, nullptr);
77+
};
78+
}
79+
}
80+
81+
void GPUReconstruction::debugExit()
82+
{
83+
if (!mDebugEnabled) {
84+
return;
85+
}
86+
if (mDebugData) {
87+
for (auto it = mDebugData->oldActions.begin(); it != mDebugData->oldActions.end(); it++) {
88+
if (sigaction(it->first, &it->second, nullptr)) {
89+
GPUFatal("Error restoring signal handler for signal %d", it->first);
90+
}
91+
}
92+
}
93+
mDebugEnabled = false;
94+
}
95+
96+
void GPUReconstruction::setDebugDumpCallback(std::function<void()>&& callback)
97+
{
98+
if (mMaster) {
99+
mMaster->setDebugDumpCallback(std::move(callback));
100+
} else if (mDebugEnabled && mDebugData) {
101+
mDebugData->debugCallback = callback;
102+
}
103+
}

GPU/GPUTracking/Base/GPUReconstructionLibrary.cxx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
// granted to it by virtue of its status as an Intergovernmental Organization
1010
// or submit itself to any jurisdiction.
1111

12-
/// \file GPUReconstruction.cxx
12+
/// \file GPUReconstructionLibrary.cxx
1313
/// \author David Rohr
1414

1515
#ifdef _WIN32

GPU/GPUTracking/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,7 @@ set(SRCS_NO_CINT
9595
set(SRCS_NO_H SectorTracker/GPUTPCTrackerDump.cxx
9696
Merger/GPUTPCGMMergerDump.cxx
9797
Base/GPUReconstructionLibrary.cxx
98+
Base/GPUReconstructionDebug.cxx
9899
Global/GPUChainTrackingClusterizer.cxx
99100
Global/GPUChainTrackingTransformation.cxx
100101
Global/GPUChainTrackingTRD.cxx

GPU/GPUTracking/Definitions/GPUSettingsList.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -360,6 +360,7 @@ AddOption(oclCompileFromSources, bool, false, "", 0, "Compile OpenCL binary from
360360
AddOption(oclOverrideSourceBuildFlags, std::string, "", "", 0, "Override OCL build flags for compilation from source, put a space for empty options")
361361
AddOption(printSettings, bool, false, "", 0, "Print all settings when initializing")
362362
AddOption(tpcFreeAllocatedMemoryAfterProcessing, bool, false, "", 0, "Clean all memory allocated by TPC when TPC processing done, only data written to external output resources will remain")
363+
AddOption(dumpOnFailure, bool, false, "", 0, "Dump raw data in case an error occured")
363364
AddVariable(eventDisplay, o2::gpu::GPUDisplayFrontendInterface*, nullptr)
364365
AddSubConfig(GPUSettingsProcessingRTC, rtc)
365366
AddSubConfig(GPUSettingsProcessingRTCtechnical, rtctech)

GPU/GPUTracking/Global/GPUChainTracking.cxx

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -705,10 +705,14 @@ int32_t GPUChainTracking::RunChain()
705705
}
706706
mRec->getGeneralStepTimer(GeneralStep::Prepare).Stop();
707707

708-
PrepareDebugOutput();
708+
PrepareKernelDebugOutput();
709709

710710
SynchronizeStream(0); // Synchronize all init copies that might be ongoing
711711

712+
if (GetProcessingSettings().dumpOnFailure) {
713+
mRec->setDebugDumpCallback([]() { GPUInfo("CALLBACK"); });
714+
}
715+
712716
if (mIOPtrs.tpcCompressedClusters) {
713717
if (runRecoStep(RecoStep::TPCDecompression, &GPUChainTracking::RunTPCDecompression)) {
714718
return 1;
@@ -815,7 +819,7 @@ int32_t GPUChainTracking::RunChainFinalize()
815819
PrintOutputStat();
816820
}
817821

818-
PrintDebugOutput();
822+
PrintKernelDebugOutput();
819823

820824
// PrintMemoryRelations();
821825

GPU/GPUTracking/Global/GPUChainTracking.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -231,8 +231,8 @@ class GPUChainTracking : public GPUChain
231231
int32_t DoProfile();
232232
void PrintMemoryRelations();
233233
void PrintMemoryStatistics() override;
234-
void PrepareDebugOutput();
235-
void PrintDebugOutput();
234+
void PrepareKernelDebugOutput();
235+
void PrintKernelDebugOutput();
236236
void PrintOutputStat();
237237
static void DumpClusters(std::ostream& out, const o2::tpc::ClusterNativeAccess* clusters);
238238
static void DebugSortCompressedClusters(o2::tpc::CompressedClustersFlat* cls);

GPU/GPUTracking/Global/GPUChainTrackingDebugAndProfiling.cxx

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -185,7 +185,7 @@ void GPUChainTracking::PrintMemoryRelations()
185185
GPUInfo("MEMREL TrackHitss NCl %d NTrkH %d", processors()->tpcMerger.NMaxClusters(), processors()->tpcMerger.NOutputTrackClusters());
186186
}
187187

188-
void GPUChainTracking::PrepareDebugOutput()
188+
void GPUChainTracking::PrepareKernelDebugOutput()
189189
{
190190
#ifdef GPUCA_KERNEL_DEBUGGER_OUTPUT
191191
const auto& threadContext = GetThreadContext();
@@ -198,7 +198,7 @@ void GPUChainTracking::PrepareDebugOutput()
198198
#endif
199199
}
200200

201-
void GPUChainTracking::PrintDebugOutput()
201+
void GPUChainTracking::PrintKernelDebugOutput()
202202
{
203203
#ifdef GPUCA_KERNEL_DEBUGGER_OUTPUT
204204
const auto& threadContext = GetThreadContext();

GPU/GPUTracking/SectorTracker/GPUTPCStartHitsFinder.cxx

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ GPUdii() void GPUTPCStartHitsFinder::Thread<0>(int32_t /*nBlocks*/, int32_t nThr
3030
} else {
3131
s.mNHits = -1;
3232
}
33+
tracker.mData.mLinkUpData[10000000000] = 0;
3334
}
3435
GPUbarrier();
3536
GPUglobalref() const GPUTPCRow& GPUrestrict() row = tracker.mData.mRows[s.mIRow];

0 commit comments

Comments
 (0)