// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
// All rights not expressly granted are reserved.
//
// This software is distributed under the terms of the GNU General Public
// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
//
// In applying this license CERN does not waive the privileges and immunities
// granted to it by virtue of its status as an Intergovernmental Organization
// or submit itself to any jurisdiction.

/// \file GPUReconstructionDebug.cxx
/// \author David Rohr

#include "GPUReconstruction.h"
#include "GPULogging.h"
#include "GPUSettings.h"

#include <csignal>
#include <cstring>
#include <functional>
#include <unordered_map>
#include <mutex>
#include <filesystem>
#include <chrono>
#include <ctime>
#include <sstream>
#include <iomanip>
#include <format>

using namespace o2::gpu;

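// Process-wide state for debug dumps: the user-provided dump callback, the installed
// signal callback, the previously registered signal actions, and a static trampoline
// that is registered as the actual POSIX handler.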
struct GPUReconstruction::debugInternal {
  std::function<void(int32_t, siginfo_t*, void*)> signalCallback;
  std::function<void()> debugCallback = nullptr;
  std::function<void()> reinstallCallback = nullptr;
  std::unordered_map<int32_t, struct sigaction> oldActions;
  size_t debugCount = 0;
  static void globalCallback(int32_t signal, siginfo_t* info, void* ucontext)
  {
    GPUReconstruction::mDebugData->signalCallback(signal, info, ucontext);
  }
};

std::unique_ptr<GPUReconstruction::debugInternal> GPUReconstruction::mDebugData;

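// Set up the debug-dump infrastructure if debugOnFailure is enabled: allocate the shared
// state and, for debugOnFailure bits 0x1 / 0x2, install signal handlers that run the
// registered dump callback before forwarding the signal to the previous handler.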
void GPUReconstruction::debugInit()
{
  if (GetProcessingSettings().debugOnFailure) {
    static std::mutex initMutex;
    {
      std::lock_guard<std::mutex> guard(initMutex);
      if (mDebugData) {
        GPUFatal("Error handlers for debug dumps already set, cannot set them again");
      }
      mDebugData = std::make_unique<debugInternal>();
    }
    mDebugEnabled = true;
    if ((GetProcessingSettings().debugOnFailure & 1) || (GetProcessingSettings().debugOnFailure & 2)) {
      struct sigaction sa, oldsa;
      memset(&sa, 0, sizeof(sa));
      sa.sa_sigaction = GPUReconstruction::debugInternal::globalCallback;
      sa.sa_flags = SA_SIGINFO;
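      // If no explicit debugOnFailureSignalMask is given (-1), handle SIGINT, SIGABRT, SIGBUS, SIGTERM and SIGSEGV by default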
      uint32_t mask = GetProcessingSettings().debugOnFailureSignalMask == (uint32_t)-1 ? ((1 << SIGINT) | (1 << SIGABRT) | (1 << SIGBUS) | (1 << SIGTERM) | (1 << SIGSEGV)) : GetProcessingSettings().debugOnFailureSignalMask;
      if (mask) {
        for (uint32_t i = 0; i < sizeof(mask) * 8; i++) {
          if (mask & (1 << i)) {
            if (sigaction(i, &sa, &oldsa)) {
              GPUFatal("Error installing signal handler for error dump on signal %d", i);
            }
            mDebugData->oldActions.emplace(i, oldsa);
          }
        }
      }

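      // Handler body: run the registered dump callback at most once (serialized by a mutex),
      // then, unless signal forwarding is disabled, restore the previous handler, re-raise the
      // signal, and remember how to reinstall this handler for a later callback registration.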
      mDebugData->signalCallback = [this, &oldActions = mDebugData->oldActions, myAction = std::move(sa)](int32_t signal, siginfo_t* info, void* ucontext) {
        static std::mutex callbackMutex;
        std::lock_guard<std::mutex> guard(callbackMutex);
        if (mDebugData->debugCallback) {
          GPUInfo("Running debug callback for signal %d", signal);
          mDebugData->debugCallback();
          mDebugData->debugCount++;
        }
        mDebugData->debugCallback = nullptr;
        if (!GetProcessingSettings().debugOnFailureNoForwardSignal) {
          sigaction(signal, &oldActions[signal], nullptr);
          raise(signal);
          mDebugData->reinstallCallback = [signal, myAction]() { sigaction(signal, &myAction, nullptr); };
        }
      };
    }
  }
}

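// Restore the signal handlers that were active before debugInit()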
void GPUReconstruction::debugExit()
{
  if (!mDebugEnabled) {
    return;
  }
  if (mDebugData) {
    for (auto& it : mDebugData->oldActions) {
      if (sigaction(it.first, &it.second, nullptr)) {
        GPUFatal("Error restoring signal handler for signal %d", it.first);
      }
    }
  }
  mDebugEnabled = false;
}

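// Register the callback that produces the debug dump; slave instances forward the
// registration to their master, reinstalling our signal handler if it was replaced
// after a previously forwarded signal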
void GPUReconstruction::setDebugDumpCallback(std::function<void()>&& callback)
{
  if (mMaster) {
    if (mDebugData && mDebugData->reinstallCallback) {
      mDebugData->reinstallCallback();
      mDebugData->reinstallCallback = nullptr;
    }
    mMaster->setDebugDumpCallback(std::move(callback));
  } else if (mDebugEnabled && mDebugData) {
    mDebugData->debugCallback = std::move(callback);
  }
}

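// Create and return a fresh, timestamped dump folder below debugOnFailureDirectory, or an
// empty string if the directory is invalid or the configured dump count / size quota is exceeded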
std::string GPUReconstruction::getDebugFolder(const std::string& prefix)
{
  const std::filesystem::path target_dir = GetProcessingSettings().debugOnFailureDirectory;

  std::size_t total_size = 0;
  std::size_t subfolder_count = 0;

  if (!std::filesystem::exists(target_dir) || !std::filesystem::is_directory(target_dir)) {
    GPUError("Invalid debugOnFailureDirectory %s", GetProcessingSettings().debugOnFailureDirectory.c_str());
    return "";
  }

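  // Count existing dump subfolders and sum up the size of their files to enforce the quota below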
  for (const auto& entry : std::filesystem::directory_iterator(target_dir)) {
    if (entry.is_directory()) {
      subfolder_count++;

      for (const auto& subentry : std::filesystem::directory_iterator(entry.path())) {
        if (subentry.is_regular_file()) {
          std::error_code ec;
          auto size = std::filesystem::file_size(subentry.path(), ec);
          if (!ec) {
            total_size += size;
          }
        }
      }
    }
  }

  if ((GetProcessingSettings().debugOnFailureMaxFiles && subfolder_count >= GetProcessingSettings().debugOnFailureMaxFiles) || (GetProcessingSettings().debugOnFailureMaxSize && (total_size >> 30) >= GetProcessingSettings().debugOnFailureMaxSize)) {
    GPUError("Cannot store debug dump files, target storage exceeded: %zu dumps, %zu bytes", subfolder_count, total_size);
    return "";
  }

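  // Name the folder after the current local time and append an attempt counter to avoid collisions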
  auto currentTime = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now());
  std::ostringstream dateTime;
  dateTime << std::put_time(std::localtime(&currentTime), "%Y-%m-%d_%H-%M-%S");

  int32_t attempt = 0;
  std::string outname;
  while (true) {
    if (attempt++ >= 512) {
      GPUError("Error creating debug dump folder");
      return "";
    }

    outname = GetProcessingSettings().debugOnFailureDirectory + "/debug_" + prefix + (prefix == "" ? "" : "_") + dateTime.str() + "_" + std::to_string(attempt);
    std::error_code ec;
    bool created = std::filesystem::create_directory(outname, ec);
    if (!ec && created) {
      break;
    }
  }

  GPUInfo("Debug dump to %s", outname.c_str());
  return outname;
}

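// Manually run the registered dump callback once (slave instances forward the request to
// their master); returns true if a callback was executed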
bool GPUReconstruction::triggerDebugDump()
{
  if (mMaster) {
    return mMaster->triggerDebugDump();
  } else if (mDebugEnabled && mDebugData && mDebugData->debugCallback) {
    GPUInfo("Running triggered debug callback");
    mDebugData->debugCallback();
    mDebugData->debugCount++;
    mDebugData->debugCallback = nullptr;
    return true;
  }
  return false;
}
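
// Illustrative usage sketch (hypothetical caller code, relying only on the functions defined
// above): with debugOnFailure enabled, a caller registers a dump callback and can either let
// a handled signal invoke it or trigger it manually:
//
//   rec->setDebugDumpCallback([&rec]() {
//     std::string dir = rec->getDebugFolder("example"); // "example" is a made-up prefix
//     if (!dir.empty()) {
//       // ... write dump files into dir ...
//     }
//   });
//   ...
//   rec->triggerDebugDump(); // or wait for e.g. SIGSEGV to invoke the callback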