Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions GPU/GPUTracking/Base/GPUReconstruction.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,7 @@ int32_t GPUReconstruction::Init()
}
mSlaves[i]->ClearAllocatedMemory();
}
debugInit();
return 0;
}

Expand Down Expand Up @@ -469,6 +470,7 @@ int32_t GPUReconstruction::Exit()
if (mInitialized) {
ExitDevice();
}
debugExit();
mInitialized = false;
return 0;
}
Expand Down
11 changes: 11 additions & 0 deletions GPU/GPUTracking/Base/GPUReconstruction.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include <memory>
#include <iosfwd>
#include <vector>
#include <functional>
#include <unordered_map>
#include <unordered_set>

Expand Down Expand Up @@ -239,6 +240,9 @@ class GPUReconstruction
virtual void PrintKernelOccupancies() {}
double GetStatKernelTime() { return mStatKernelTime; }
double GetStatWallTime() { return mStatWallTime; }
void setDebugDumpCallback(std::function<void()>&& callback = std::function<void()>(nullptr));
bool triggerDebugDump();
std::string getDebugFolder(const std::string& prefix = ""); // empty string = no debug

// Threading
std::shared_ptr<GPUReconstructionThreading> mThreading;
Expand Down Expand Up @@ -407,6 +411,13 @@ class GPUReconstruction
};
static std::shared_ptr<LibraryLoader> sLibCUDA, sLibHIP, sLibOCL;

// Debugging
struct debugInternal;
static std::unique_ptr<debugInternal> mDebugData;
bool mDebugEnabled = false;
void debugInit();
void debugExit();

static GPUReconstruction* GPUReconstruction_Create_CPU(const GPUSettingsDeviceBackend& cfg);
};

Expand Down
188 changes: 188 additions & 0 deletions GPU/GPUTracking/Base/GPUReconstructionDebug.cxx
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
// All rights not expressly granted are reserved.
//
// This software is distributed under the terms of the GNU General Public
// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
//
// In applying this license CERN does not waive the privileges and immunities
// granted to it by virtue of its status as an Intergovernmental Organization
// or submit itself to any jurisdiction.

/// \file GPUReconstructionDebug.cxx
/// \author David Rohr

#include "GPUReconstruction.h"
#include "GPULogging.h"
#include "GPUSettings.h"

#include <chrono>
#include <csignal>
#include <cstring>
#include <ctime>
#include <filesystem>
#include <format>
#include <functional>
#include <iomanip>
#include <mutex>
#include <sstream>
#include <string>
#include <system_error>
#include <unordered_map>

using namespace o2::gpu;

/// Shared state of the debug-dump-on-failure machinery (a single instance, held in GPUReconstruction::mDebugData).
struct GPUReconstruction::debugInternal {
  /// Target that globalCallback dispatches every handled signal to.
  std::function<void(int32_t, siginfo_t*, void*)> signalCallback;
  /// Dump routine to run on the next failure; empty while nothing is armed.
  std::function<void()> debugCallback{};
  /// Pending action that puts the debug signal handler back after a signal was forwarded to the previous handler.
  std::function<void()> reinstallCallback{};
  /// Signal dispositions that were active before the debug handler replaced them, keyed by signal number.
  std::unordered_map<int32_t, struct sigaction> oldActions;
  /// Number of debug callbacks that have been executed.
  size_t debugCount{0};

  /// Plain function with the SA_SIGINFO handler signature; forwards to signalCallback.
  static void globalCallback(int32_t signum, siginfo_t* siginfo, void* context)
  {
    auto& dispatch = GPUReconstruction::mDebugData->signalCallback;
    dispatch(signum, siginfo, context);
  }
};

/// Definition of the static member declared in GPUReconstruction.
std::unique_ptr<GPUReconstruction::debugInternal> GPUReconstruction::mDebugData;

void GPUReconstruction::debugInit()
{
if (GetProcessingSettings().debugOnFailure) {
static std::mutex initMutex;
{
std::lock_guard<std::mutex> guard(initMutex);
if (mDebugData) {
GPUFatal("Error handlers for debug dumps already set, cannot set them again");
}
mDebugData = std::make_unique<debugInternal>();
}
mDebugEnabled = true;
if ((GetProcessingSettings().debugOnFailure & 1) || (GetProcessingSettings().debugOnFailure & 2)) {
struct sigaction sa, oldsa;
memset(&sa, 0, sizeof(sa));
sa.sa_sigaction = GPUReconstruction::debugInternal::globalCallback;
sa.sa_flags = SA_SIGINFO;
uint32_t mask = GetProcessingSettings().debugOnFailureSignalMask == (uint32_t)-1 ? ((1 << SIGINT) | (1 << SIGABRT) | (1 << SIGBUS) | (1 << SIGTERM) | (1 << SIGSEGV)) : GetProcessingSettings().debugOnFailureSignalMask;
if (mask) {
for (uint32_t i = 0; i < sizeof(mask) * 8; i++) {
if (mask & (1 << i)) {
if (sigaction(i, &sa, &oldsa)) {
GPUFatal("Error installing signal handler for error dump on signal %d", i);
}
mDebugData->oldActions.emplace(i, oldsa);
}
}
}

mDebugData->signalCallback = [this, &oldActions = mDebugData->oldActions, myAction = std::move(sa)](int32_t signal, siginfo_t* info, void* ucontext) {
static std::mutex callbackMutex;
std::lock_guard<std::mutex> guard(callbackMutex);
if (mDebugData->debugCallback) {
GPUInfo("Running debug callback for signal %d", signal);
mDebugData->debugCallback();
mDebugData->debugCount++;
}
mDebugData->debugCallback = nullptr;
if (!GetProcessingSettings().debugOnFailureNoForwardSignal) {
sigaction(signal, &oldActions[signal], nullptr);
raise(signal);
mDebugData->reinstallCallback = [signal, myAction]() { sigaction(signal, &myAction, nullptr); };
}
};
}
}
}

void GPUReconstruction::debugExit()
{
if (!mDebugEnabled) {
return;
}
if (mDebugData) {
for (auto& it : mDebugData->oldActions) {
if (sigaction(it.first, &it.second, nullptr)) {
GPUFatal("Error restoring signal handler for signal %d", it.first);
}
}
}
mDebugEnabled = false;
}

/// Arms (or, with an empty function, disarms) the dump routine that is run on the next failure.
///
/// Slave instances forward the request to their master. On the instance that owns the debug state the call is
/// ignored unless debugInit() enabled debugging. If a signal was forwarded to the previous handler in the
/// meantime, the debug signal handler is put back first, so that the newly armed callback can fire again.
///
/// \param callback dump routine to store; it is moved from.
void GPUReconstruction::setDebugDumpCallback(std::function<void()>&& callback)
{
  if (mMaster) {
    // The debug state is handled by the master, which also takes care of re-installing the handler. Doing it
    // here would dereference mDebugData without knowing whether it was ever created.
    mMaster->setDebugDumpCallback(std::move(callback));
    return;
  }
  if (!mDebugEnabled || !mDebugData) {
    return;
  }
  if (mDebugData->reinstallCallback) {
    mDebugData->reinstallCallback();
    mDebugData->reinstallCallback = nullptr;
  }
  mDebugData->debugCallback = std::move(callback); // named rvalue reference is an lvalue: move explicitly
}

/// Creates a fresh sub-folder for a debug dump inside debugOnFailureDirectory.
///
/// The existing sub-folders of the target directory are counted and the sizes of the regular files directly
/// inside them are summed up. If debugOnFailureMaxFiles or debugOnFailureMaxSize (in GB) is non-zero and the
/// respective limit is reached, no folder is created. Otherwise a folder named
/// "debug_[<prefix>_]<YYYY-MM-DD_HH-MM-SS>_<n>" is created, with n being the first free index in 1..512.
///
/// Only non-throwing filesystem calls are used, since this runs while handling a failure.
///
/// \param prefix optional tag inserted into the folder name.
/// \return path of the created folder, or an empty string if no dump should / can be written.
std::string GPUReconstruction::getDebugFolder(const std::string& prefix)
{
  namespace fs = std::filesystem;
  const auto& settings = GetProcessingSettings();
  const fs::path target_dir = settings.debugOnFailureDirectory;

  std::error_code ec;
  if (!fs::is_directory(target_dir, ec)) { // also false if the path does not exist or cannot be queried
    GPUError("Invalid debugOnFailureDirectory %s", settings.debugOnFailureDirectory.c_str());
    return "";
  }

  std::size_t total_size = 0;
  std::size_t subfolder_count = 0;
  const fs::directory_iterator end;

  fs::directory_iterator dumpIt(target_dir, ec);
  if (ec) {
    GPUError("Cannot scan debugOnFailureDirectory %s: %s", settings.debugOnFailureDirectory.c_str(), ec.message().c_str());
    return "";
  }
  for (; !ec && dumpIt != end; dumpIt.increment(ec)) {
    std::error_code entryEc;
    if (!dumpIt->is_directory(entryEc)) {
      continue;
    }
    subfolder_count++;

    // Best effort: a sub-folder that cannot be read still counts as a dump, its files are just not summed up.
    fs::directory_iterator fileIt(dumpIt->path(), entryEc);
    for (; !entryEc && fileIt != end; fileIt.increment(entryEc)) {
      std::error_code fileEc;
      if (!fileIt->is_regular_file(fileEc)) {
        continue;
      }
      const auto size = fs::file_size(fileIt->path(), fileEc);
      if (!fileEc) {
        total_size += size;
      }
    }
  }

  const bool tooManyDumps = settings.debugOnFailureMaxFiles && subfolder_count >= settings.debugOnFailureMaxFiles;
  const bool tooMuchData = settings.debugOnFailureMaxSize && (total_size >> 30) >= settings.debugOnFailureMaxSize; // >> 30: bytes -> GB
  if (tooManyDumps || tooMuchData) {
    GPUError("Cannot store debug dump files, target storage exceeded: %zu dumps, %zu bytes", subfolder_count, total_size);
    return "";
  }

  const std::time_t currentTime = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now());
  std::tm localTm{};
  localtime_r(&currentTime, &localTm); // re-entrant, unlike std::localtime which returns a shared static buffer
  std::ostringstream dateTime;
  dateTime << std::put_time(&localTm, "%Y-%m-%d_%H-%M-%S");

  const std::string baseName = settings.debugOnFailureDirectory + "/debug_" + prefix + (prefix.empty() ? "" : "_") + dateTime.str() + "_";
  for (int32_t attempt = 1; attempt <= 512; attempt++) {
    const std::string outname = baseName + std::to_string(attempt);
    std::error_code createEc;
    if (fs::create_directory(outname, createEc)) {
      GPUInfo("Debug dump to %s", outname.c_str());
      return outname;
    }
    if (createEc && createEc != std::errc::file_exists) {
      break; // hard error (e.g. permissions): trying further indices cannot succeed
    }
  }

  GPUError("Error creating debug dump folder");
  return "";
}

/// Runs the armed debug dump callback, if there is one.
///
/// Slave instances delegate to their master. The callback is one-shot: it is cleared after it has run and the
/// dump counter is incremented.
///
/// \return true if a callback was executed, false if debugging is not enabled or nothing is armed.
bool GPUReconstruction::triggerDebugDump()
{
  if (mMaster) {
    return mMaster->triggerDebugDump();
  }

  const bool armed = mDebugEnabled && mDebugData && mDebugData->debugCallback;
  if (!armed) {
    return false;
  }

  GPUInfo("Running triggered debug callback");
  mDebugData->debugCallback();
  mDebugData->debugCount++;
  mDebugData->debugCallback = nullptr;
  return true;
}
2 changes: 1 addition & 1 deletion GPU/GPUTracking/Base/GPUReconstructionLibrary.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
// granted to it by virtue of its status as an Intergovernmental Organization
// or submit itself to any jurisdiction.

/// \file GPUReconstruction.cxx
/// \file GPUReconstructionLibrary.cxx
/// \author David Rohr

#ifdef _WIN32
Expand Down
1 change: 1 addition & 0 deletions GPU/GPUTracking/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ set(SRCS_NO_CINT
set(SRCS_NO_H SectorTracker/GPUTPCTrackerDump.cxx
Merger/GPUTPCGMMergerDump.cxx
Base/GPUReconstructionLibrary.cxx
Base/GPUReconstructionDebug.cxx
Global/GPUChainTrackingClusterizer.cxx
Global/GPUChainTrackingTransformation.cxx
Global/GPUChainTrackingTRD.cxx
Expand Down
8 changes: 8 additions & 0 deletions GPU/GPUTracking/Definitions/GPUSettingsList.h
Original file line number Diff line number Diff line change
Expand Up @@ -360,6 +360,14 @@ AddOption(oclCompileFromSources, bool, false, "", 0, "Compile OpenCL binary from
AddOption(oclOverrideSourceBuildFlags, std::string, "", "", 0, "Override OCL build flags for compilation from source, put a space for empty options")
AddOption(printSettings, bool, false, "", 0, "Print all settings when initializing")
AddOption(tpcFreeAllocatedMemoryAfterProcessing, bool, false, "", 0, "Clean all memory allocated by TPC when TPC processing done, only data written to external output resources will remain")
AddOption(debugOnFailure, int32_t, 0, "", 0, "Dump raw data in case an error occured, bit 1 enables all dumps, otherwise bitmask for: 2 = signal, 3 = GPUErrorCode", def(1))
AddOption(debugOnFailureSignalMask, uint32_t, (uint32_t)-1, "", 0, "Mask of signals that trigger debug / dump")
AddOption(debugOnFailureErrorMask, uint64_t, (uint64_t)-1, "", 0, "Mask of GPUCA_ERRORS that trigger debug / dump")
AddOption(debugOnFailureNoForwardSignal, bool, false, "", 0, "Do not forward signal to original signal handler")
AddOption(debugOnFailureMaxN, uint32_t, 1, "", 0, "Max number of times to run the debug / dump")
AddOption(debugOnFailureMaxFiles, uint32_t, 0, "", 0, "Max number of files to have in the target folder")
AddOption(debugOnFailureMaxSize, uint32_t, 0, "", 0, "Max size of existing dumps in the target folder in GB")
AddOption(debugOnFailureDirectory, std::string, ".", "", 0, "Target folder for debug / dump")
AddVariable(eventDisplay, o2::gpu::GPUDisplayFrontendInterface*, nullptr)
AddSubConfig(GPUSettingsProcessingRTC, rtc)
AddSubConfig(GPUSettingsProcessingRTCtechnical, rtctech)
Expand Down
28 changes: 25 additions & 3 deletions GPU/GPUTracking/Global/GPUChainTracking.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -705,10 +705,14 @@ int32_t GPUChainTracking::RunChain()
}
mRec->getGeneralStepTimer(GeneralStep::Prepare).Stop();

PrepareDebugOutput();
PrepareKernelDebugOutput();

SynchronizeStream(0); // Synchronize all init copies that might be ongoing

if (GetProcessingSettings().debugOnFailure) {
mRec->setDebugDumpCallback([this]() { DoDebugRawDump(); });
}

if (mIOPtrs.tpcCompressedClusters) {
if (runRecoStep(RecoStep::TPCDecompression, &GPUChainTracking::RunTPCDecompression)) {
return 1;
Expand Down Expand Up @@ -775,7 +779,7 @@ int32_t GPUChainTracking::RunChain()
}

int32_t retVal = 0;
if (CheckErrorCodes(false, false, mRec->getErrorCodeOutput())) {
if (CheckErrorCodes(false, false, mRec->getErrorCodeOutput())) { // TODO: Eventually, we should use GPUReconstruction::CheckErrorCodes
retVal = 3;
if (!GetProcessingSettings().ignoreNonFatalGPUErrors) {
return retVal;
Expand Down Expand Up @@ -815,7 +819,7 @@ int32_t GPUChainTracking::RunChainFinalize()
PrintOutputStat();
}

PrintDebugOutput();
PrintKernelDebugOutput();

// PrintMemoryRelations();

Expand Down Expand Up @@ -884,6 +888,7 @@ int32_t GPUChainTracking::FinalizePipelinedProcessing()
int32_t GPUChainTracking::CheckErrorCodes(bool cpuOnly, bool forceShowErrors, std::vector<std::array<uint32_t, 4>>* fillErrors)
{
int32_t retVal = 0;
bool hasDebugError = false;
for (int32_t i = 0; i < 1 + (!cpuOnly && mRec->IsGPU()); i++) {
if (i) {
const auto& threadContext = GetThreadContext();
Expand Down Expand Up @@ -925,9 +930,26 @@ int32_t GPUChainTracking::CheckErrorCodes(bool cpuOnly, bool forceShowErrors, st
fillErrors->emplace_back(std::array<uint32_t, 4>{pErrors[4 * j], pErrors[4 * j + 1], pErrors[4 * j + 2], pErrors[4 * j + 3]});
}
}
// Decide whether the reported error codes should trigger a debug dump:
// debugOnFailure value 1 = all dumps, value 4 = dumps on GPU error codes.
if (GetProcessingSettings().debugOnFailure & (1 | 4)) {
  const uint64_t debugErrorMask = GetProcessingSettings().debugOnFailureErrorMask;
  if (debugErrorMask == (uint64_t)-1) {
    hasDebugError = true; // all bits set: every error triggers a dump
  } else {
    uint32_t nErrors = processors()->errorCodes.getNErrors();
    const uint32_t* pErrors = processors()->errorCodes.getErrorPtr();
    for (uint32_t j = 0; j < nErrors; j++) {
      const uint32_t errorCode = pErrors[4 * j]; // first of the 4 words stored per error
      // The mask has 64 bits, so the shift must be done in 64 bit as well (a plain `1 << n` is an int shift and
      // breaks for n >= 31); codes that do not fit into the mask can never match.
      if (errorCode < 64 && (debugErrorMask & (uint64_t{1} << errorCode))) {
        hasDebugError = true;
        break;
      }
    }
  }
}
}
}
ClearErrorCodes(cpuOnly);
if (hasDebugError) {
mRec->triggerDebugDump();
}
return retVal;
}

Expand Down
7 changes: 4 additions & 3 deletions GPU/GPUTracking/Global/GPUChainTracking.h
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ class GPUChainTracking : public GPUChain
void ClearIOPointers();
void AllocateIOMemory();
using GPUChain::DumpData;
void DumpData(const char* filename);
void DumpData(const char* filename, const GPUTrackingInOutPointers* ioPtrs = nullptr);
using GPUChain::ReadData;
int32_t ReadData(const char* filename);
void DumpSettings(const char* dir = "") override;
Expand Down Expand Up @@ -231,11 +231,12 @@ class GPUChainTracking : public GPUChain
int32_t DoProfile();
void PrintMemoryRelations();
void PrintMemoryStatistics() override;
void PrepareDebugOutput();
void PrintDebugOutput();
void PrepareKernelDebugOutput();
void PrintKernelDebugOutput();
void PrintOutputStat();
static void DumpClusters(std::ostream& out, const o2::tpc::ClusterNativeAccess* clusters);
static void DebugSortCompressedClusters(o2::tpc::CompressedClustersFlat* cls);
void DoDebugRawDump();

bool ValidateSteps();
bool ValidateSettings();
Expand Down
Loading