GPU: Provide general GPUFailedMsg functionality also externally

davidrohr · davidrohr · commit 8e8965d77662 · 2025-03-14T14:33:40.000+01:00
diff --git a/GPU/Common/CMakeLists.txt b/GPU/Common/CMakeLists.txt
@@ -15,6 +15,7 @@ set(HDRS_INSTALL
     GPUCommonAlgorithm.h
     GPUCommonDef.h
     GPUCommonDefAPI.h
+    GPUCommonChkErr.h
     GPUCommonDefSettings.h
     GPUCommonConstants.h
     GPUCommonLogger.h
diff --git a/GPU/Common/GPUCommonChkErr.h b/GPU/Common/GPUCommonChkErr.h
@@ -0,0 +1,21 @@
+// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
+// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
+// All rights not expressly granted are reserved.
+//
+// This software is distributed under the terms of the GNU General Public
+// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
+//
+// In applying this license CERN does not waive the privileges and immunities
+// granted to it by virtue of its status as an Intergovernmental Organization
+// or submit itself to any jurisdiction.
+
+/// \file GPUCommonChkErr.h
+/// \author David Rohr
+
+#ifndef GPUCOMMONCHKERR_H
+#define GPUCOMMONCHKERR_H
+
+#define GPUFailedMsg(x) GPUFailedMsgA(x, __FILE__, __LINE__, true) // Checks error code x with failOnError = true; with GPUReconstruction::GPUFailedMsgA a failure throws std::runtime_error
+#define GPUFailedMsgI(x) GPUFailedMsgA(x, __FILE__, __LINE__, false) // Checks error code x with failOnError = false; with GPUReconstruction::GPUFailedMsgA a failure is signalled by a non-zero return value
+
+#endif
diff --git a/GPU/Common/GPUCommonDefAPI.h b/GPU/Common/GPUCommonDefAPI.h
@@ -36,7 +36,7 @@
   #define GPUdni()                                  // Device function, not-to-be-inlined
   #define GPUdnii() inline                          // Device function, not-to-be-inlined on device, inlined on host
   #define GPUh()                                    // Host-only function
-  // NOTE: All GPUd*() functions are also compiled on the host during GCC compilation.
+  // NOTE: All GPUd*() functions are also compiled for the host during host compilation.
   // The GPUh*() macros are for the rare cases of functions that you want to compile for the host during GPU compilation.
   // Usually, you do not need the GPUh*() versions. If in doubt, use GPUd*()!
   #define GPUhi() inline                            // to-be-inlined host-only function
diff --git a/GPU/GPUTracking/Base/GPUReconstruction.cxx b/GPU/GPUTracking/Base/GPUReconstruction.cxx
@@ -1078,6 +1078,21 @@ int32_t GPUReconstruction::CheckErrorCodes(bool cpuOnly, bool forceShowErrors, s
   return retVal;
 }
 
+int32_t GPUReconstruction::GPUFailedMsgA(const int64_t error, const char* file, int32_t line, bool failOnError) // Checks a backend error code: returns 0 on success; on failure throws if failOnError is set, otherwise returns 1
+{
+  if (error == 0 || !GPUFailedMsgInternal(error, file, line)) { // error == 0 is success; otherwise defer to the virtual GPUFailedMsgInternal, whose zero result is treated as "no failure"
+    return 0;
+  }
+  if (failOnError) {
+    if (mInitialized && mInErrorHandling == false) { // the flag is never cleared in this function, so CheckErrorCodes is called at most once from here unless it is reset elsewhere
+      mInErrorHandling = true;
+      CheckErrorCodes(false, true); // cpuOnly = false, forceShowErrors = true
+    }
+    throw std::runtime_error("GPU Backend Failure");
+  }
+  return 1; // non-throwing mode: the failure is left to the caller to handle
+}
+
 void GPUReconstruction::DumpSettings(const char* dir)
 {
   std::string f;
diff --git a/GPU/GPUTracking/Base/GPUReconstruction.h b/GPU/GPUTracking/Base/GPUReconstruction.h
@@ -143,6 +143,7 @@ class GPUReconstruction
   virtual void* getGPUPointer(void* ptr) { return ptr; }
   virtual void startGPUProfiling() {}
   virtual void endGPUProfiling() {}
+  int32_t GPUFailedMsgA(const int64_t error, const char* file, int32_t line, bool failOnError);
   int32_t CheckErrorCodes(bool cpuOnly = false, bool forceShowErrors = false, std::vector<std::array<uint32_t, 4>>* fillErrors = nullptr);
   void RunPipelineWorker();
   void TerminatePipelineWorker();
@@ -246,6 +247,7 @@ class GPUReconstruction
   void UpdateMaxMemoryUsed();
   int32_t EnqueuePipeline(bool terminate = false);
   GPUChain* GetNextChainInQueue();
+  virtual int32_t GPUFailedMsgInternal(const int64_t error, const char* file, int32_t line) const { return 0; }
 
   virtual int32_t registerMemoryForGPU_internal(const void* ptr, size_t size) = 0;
   virtual int32_t unregisterMemoryForGPU_internal(const void* ptr) = 0;
@@ -327,6 +329,7 @@ class GPUReconstruction
 
   // Others
   bool mInitialized = false;
+  bool mInErrorHandling = false;
   uint32_t mStatNEvents = 0;
   uint32_t mNEventsProcessed = 0;
   double mStatKernelTime = 0.;
diff --git a/GPU/GPUTracking/Base/GPUReconstructionDeviceBase.cxx b/GPU/GPUTracking/Base/GPUReconstructionDeviceBase.cxx
@@ -175,7 +175,11 @@ void GPUReconstructionDeviceBase::runConstantRegistrators()
 {
   auto& list = getDeviceConstantMemRegistratorsVector();
   for (uint32_t i = 0; i < list.size(); i++) {
-    mDeviceConstantMemList.emplace_back(list[i]());
+    auto* ptr = list[i]();
+    if (ptr == nullptr) {
+      GPUFatal("Error registering constant memory");
+    }
+    mDeviceConstantMemList.emplace_back(ptr);
   }
 }
 
diff --git a/GPU/GPUTracking/Base/GPUReconstructionDeviceBase.h b/GPU/GPUTracking/Base/GPUReconstructionDeviceBase.h
@@ -46,6 +46,7 @@ class GPUReconstructionDeviceBase : public GPUReconstructionCPU
   virtual int32_t InitDevice_Runtime() = 0;
   int32_t ExitDevice() override;
   virtual int32_t ExitDevice_Runtime() = 0;
+  virtual int32_t GPUFailedMsgInternal(const int64_t error, const char* file, int32_t line) const override = 0;
   int32_t registerMemoryForGPU_internal(const void* ptr, size_t size) override;
   int32_t unregisterMemoryForGPU_internal(const void* ptr) override;
   void unregisterRemainingRegisteredMemory();
diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu
@@ -61,28 +61,16 @@ GPUReconstructionCUDABackend::~GPUReconstructionCUDABackend()
   }
 }
 
-int32_t GPUReconstructionCUDABackend::GPUFailedMsgAI(const int64_t error, const char* file, int32_t line)
+static_assert(sizeof(cudaError_t) <= sizeof(int64_t) && cudaSuccess == 0);
+int32_t GPUReconstructionCUDABackend::GPUFailedMsgStatic(const int64_t error, const char* file, int32_t line)
 {
-  // Check for CUDA Error and in the case of an error display the corresponding error string
   if (error == cudaSuccess) {
     return (0);
   }
   GPUError("CUDA Error: %ld / %s (%s:%d)", error, cudaGetErrorString((cudaError_t)error), file, line);
   return 1;
 }
 
-void GPUReconstructionCUDABackend::GPUFailedMsgA(const int64_t error, const char* file, int32_t line)
-{
-  if (GPUFailedMsgAI(error, file, line)) {
-    static bool runningCallbacks = false;
-    if (IsInitialized() && runningCallbacks == false) {
-      runningCallbacks = true;
-      CheckErrorCodes(false, true);
-    }
-    throw std::runtime_error("CUDA Failure");
-  }
-}
-
 GPUReconstructionCUDA::GPUReconstructionCUDA(const GPUSettingsDeviceBackend& cfg) : GPUReconstructionKernels(cfg)
 {
   mDeviceBackendSettings.deviceType = DeviceType::CUDA;
diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h
@@ -33,13 +33,13 @@ class GPUReconstructionCUDABackend : public GPUReconstructionDeviceBase
 {
  public:
   ~GPUReconstructionCUDABackend() override;
-  static int32_t GPUFailedMsgAI(const int64_t error, const char* file, int32_t line);
-  void GPUFailedMsgA(const int64_t error, const char* file, int32_t line);
+  static int32_t GPUFailedMsgStatic(const int64_t error, const char* file, int32_t line);
 
  protected:
   GPUReconstructionCUDABackend(const GPUSettingsDeviceBackend& cfg);
 
   void PrintKernelOccupancies() override;
+  virtual int32_t GPUFailedMsgInternal(const int64_t error, const char* file, int32_t line) const override { return GPUFailedMsgStatic(error, file, line); }
 
   template <class T, int32_t I = 0, typename... Args>
   void runKernelBackend(const krnlSetupArgs<T, I, Args...>& args);
diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAExternalProvider.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAExternalProvider.cu
@@ -36,7 +36,7 @@ using namespace o2::gpu;
 #ifndef GPUCA_NO_CONSTANT_MEMORY
 static GPUReconstructionDeviceBase::deviceConstantMemRegistration registerConstSymbol([]() {
   void* retVal = nullptr;
-  if (cudaGetSymbolAddress(&retVal, gGPUConstantMemBuffer) != cudaSuccess) {
+  if (GPUReconstructionCUDA::GPUFailedMsgStatic(cudaGetSymbolAddress(&retVal, gGPUConstantMemBuffer), __FILE__, __LINE__)) {
     throw std::runtime_error("Could not obtain GPU constant memory symbol");
   }
   return retVal;
diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAInternals.h b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAInternals.h
@@ -22,13 +22,11 @@
 #include <vector>
 #include <memory>
 #include <string>
+#include "GPUCommonChkErr.h"
 
 namespace o2::gpu
 {
 
-#define GPUFailedMsg(x) GPUFailedMsgA(x, __FILE__, __LINE__)
-#define GPUFailedMsgI(x) GPUFailedMsgAI(x, __FILE__, __LINE__)
-
 struct GPUReconstructionCUDAInternals {
   std::vector<std::unique_ptr<CUmodule>> kernelModules;     // module for RTC compilation
   std::vector<std::unique_ptr<CUfunction>> kernelFunctions; // vector of ptrs to RTC kernels
diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu
@@ -138,7 +138,9 @@ void GPUReconstructionCUDABackend::getRTCKernelCalls(std::vector<std::string>& k
 #ifndef GPUCA_NO_CONSTANT_MEMORY
 static GPUReconstructionDeviceBase::deviceConstantMemRegistration registerConstSymbol([]() {
   void* retVal = nullptr;
-  GPUReconstructionCUDA::GPUFailedMsgI(cudaGetSymbolAddress(&retVal, gGPUConstantMemBuffer));
+  if (GPUReconstructionCUDA::GPUFailedMsgStatic(cudaGetSymbolAddress(&retVal, gGPUConstantMemBuffer), __FILE__, __LINE__)) {
+    throw std::runtime_error("Could not obtain GPU constant memory symbol");
+  }
   return retVal;
 });
 #endif
diff --git a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.cxx b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.cxx
@@ -48,28 +48,17 @@ GPUReconstructionOCLBackend::~GPUReconstructionOCLBackend()
   }
 }
 
-int32_t GPUReconstructionOCLBackend::GPUFailedMsgAI(const int64_t error, const char* file, int32_t line)
+static_assert(sizeof(cl_int) <= sizeof(int64_t) && CL_SUCCESS == 0);
+int32_t GPUReconstructionOCLBackend::GPUFailedMsgInternal(const int64_t error, const char* file, int32_t line) const
 {
   // Check for OPENCL Error and in the case of an error display the corresponding error string
   if (error == CL_SUCCESS) {
     return (0);
   }
-  GPUError("OCL Error: %ld / %s (%s:%d)", error, convertErrorToString(error), file, line);
+  GPUError("OpenCL Error: %ld / %s (%s:%d)", error, convertErrorToString(error), file, line);
   return 1;
 }
 
-void GPUReconstructionOCLBackend::GPUFailedMsgA(const int64_t error, const char* file, int32_t line)
-{
-  if (GPUFailedMsgAI(error, file, line)) {
-    static bool runningCallbacks = false;
-    if (IsInitialized() && runningCallbacks == false) {
-      runningCallbacks = true;
-      CheckErrorCodes(false, true);
-    }
-    throw std::runtime_error("OpenCL Failure");
-  }
-}
-
 void GPUReconstructionOCLBackend::UpdateAutomaticProcessingSettings()
 {
   GPUCA_GPUReconstructionUpdateDefaults();
diff --git a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.h b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.h
@@ -39,8 +39,7 @@ class GPUReconstructionOCLBackend : public GPUReconstructionDeviceBase
   int32_t ExitDevice_Runtime() override;
   void UpdateAutomaticProcessingSettings() override;
 
-  int32_t GPUFailedMsgAI(const int64_t error, const char* file, int32_t line);
-  void GPUFailedMsgA(const int64_t error, const char* file, int32_t line);
+  virtual int32_t GPUFailedMsgInternal(const int64_t error, const char* file, int32_t line) const override;
 
   void SynchronizeGPU() override;
   int32_t DoStuckProtection(int32_t stream, deviceEvent event) override;
diff --git a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLIncludesHost.h b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLIncludesHost.h
@@ -28,6 +28,7 @@
 
 #include "GPUReconstructionOCL.h"
 #include "GPUReconstructionIncludes.h"
+#include "GPUCommonChkErr.h"
 
 using namespace o2::gpu;
 
@@ -36,9 +37,6 @@ using namespace o2::gpu;
 #include <typeinfo>
 #include <cstdlib>
 
-#define GPUFailedMsg(x) GPUFailedMsgA(x, __FILE__, __LINE__)
-#define GPUFailedMsgI(x) GPUFailedMsgAI(x, __FILE__, __LINE__)
-
 namespace o2::gpu
 {
 struct GPUReconstructionOCLInternals {

Original file line number	Diff line number	Diff line change
`@@ -175,7 +175,11 @@ void GPUReconstructionDeviceBase::runConstantRegistrators()`
`175`	`175`	`{`
`176`	`176`	`auto& list = getDeviceConstantMemRegistratorsVector();`
`177`	`177`	`for (uint32_t i = 0; i < list.size(); i++) {`
`178`		`- mDeviceConstantMemList.emplace_back(list[i]());`
	`178`	`+ auto* ptr = list[i]();`
	`179`	`+ if (ptr == nullptr) {`
	`180`	`+ GPUFatal("Error registering constant memory");`
	`181`	`+ }`
	`182`	`+ mDeviceConstantMemList.emplace_back(ptr);`
`179`	`183`	`}`
`180`	`184`	`}`
`181`	`185`
Original file line number	Diff line number	Diff line change
`@@ -61,28 +61,16 @@ GPUReconstructionCUDABackend::~GPUReconstructionCUDABackend()`
`61`	`61`	`}`
`62`	`62`	`}`
`63`	`63`
`64`		`-int32_t GPUReconstructionCUDABackend::GPUFailedMsgAI(const int64_t error, const char* file, int32_t line)`
	`64`	`+static_assert(sizeof(cudaError_t) <= sizeof(int64_t) && cudaSuccess == 0);`
	`65`	`+int32_t GPUReconstructionCUDABackend::GPUFailedMsgStatic(const int64_t error, const char* file, int32_t line)`
`65`	`66`	`{`
`66`		`- // Check for CUDA Error and in the case of an error display the corresponding error string`
`67`	`67`	`if (error == cudaSuccess) {`
`68`	`68`	`return (0);`
`69`	`69`	`}`
`70`	`70`	`GPUError("CUDA Error: %ld / %s (%s:%d)", error, cudaGetErrorString((cudaError_t)error), file, line);`
`71`	`71`	`return 1;`
`72`	`72`	`}`
`73`	`73`
`74`		`-void GPUReconstructionCUDABackend::GPUFailedMsgA(const int64_t error, const char* file, int32_t line)`
`75`		`-{`
`76`		`- if (GPUFailedMsgAI(error, file, line)) {`
`77`		`- static bool runningCallbacks = false;`
`78`		`- if (IsInitialized() && runningCallbacks == false) {`
`79`		`- runningCallbacks = true;`
`80`		`- CheckErrorCodes(false, true);`
`81`		`- }`
`82`		`- throw std::runtime_error("CUDA Failure");`
`83`		`- }`
`84`		`-}`
`85`		`-`
`86`	`74`	`GPUReconstructionCUDA::GPUReconstructionCUDA(const GPUSettingsDeviceBackend& cfg) : GPUReconstructionKernels(cfg)`
`87`	`75`	`{`
`88`	`76`	`mDeviceBackendSettings.deviceType = DeviceType::CUDA;`
Original file line number	Diff line number	Diff line change
`@@ -36,7 +36,7 @@ using namespace o2::gpu;`
`36`	`36`	`#ifndef GPUCA_NO_CONSTANT_MEMORY`
`37`	`37`	`static GPUReconstructionDeviceBase::deviceConstantMemRegistration registerConstSymbol([]() {`
`38`	`38`	`void* retVal = nullptr;`
`39`		`- if (cudaGetSymbolAddress(&retVal, gGPUConstantMemBuffer) != cudaSuccess) {`
	`39`	`+ if (GPUReconstructionCUDA::GPUFailedMsgStatic(cudaGetSymbolAddress(&retVal, gGPUConstantMemBuffer), __FILE__, __LINE__)) {`
`40`	`40`	`throw std::runtime_error("Could not obtain GPU constant memory symbol");`
`41`	`41`	`}`
`42`	`42`	`return retVal;`