GPU: Rename GPUFailedMsg to GPUChkErr

davidrohr · davidrohr · commit e5e4e1c76624 · 2025-03-14T15:08:39.000+01:00
diff --git a/GPU/Common/GPUCommonChkErr.h b/GPU/Common/GPUCommonChkErr.h
@@ -12,10 +12,19 @@
 /// \file GPUCommonChkErr.h
 /// \author David Rohr
 
+// GPUChkErr and GPUChkErrI will both check x for an error, using the loaded backend of GPUReconstruction (requiring GPUReconstruction.h to be included by the user).
+// In case of an error, it will print out the corresponding CUDA / HIP / OpenCL error code
+// GPUChkErr will download GPUReconstruction error values from GPU, print them, and terminate the application with an exception if an error occurred.
+// GPUChkErrI will return 0 or 1, depending on whether an error has occurred.
+// The macros must be called on a GPUReconstruction instance, e.g.:
+// if (mRec->GPUChkErrI(cudaMalloc(...))) { exit(1); }
+// gpuRecObj.GPUChkErr(cudaMalloc(...));
+
 #ifndef GPUCOMMONCHKERR_H
 #define GPUCOMMONCHKERR_H
 
-#define GPUFailedMsg(x) GPUFailedMsgA(x, __FILE__, __LINE__, true)
-#define GPUFailedMsgI(x) GPUFailedMsgA(x, __FILE__, __LINE__, false)
+// Please #include "GPUReconstruction.h" in your code if you use these two macros!
+#define GPUChkErr(x) GPUChkErrA(x, __FILE__, __LINE__, true)
+#define GPUChkErrI(x) GPUChkErrA(x, __FILE__, __LINE__, false)
 
 #endif
diff --git a/GPU/GPUTracking/Base/GPUReconstruction.cxx b/GPU/GPUTracking/Base/GPUReconstruction.cxx
@@ -1078,9 +1078,9 @@ int32_t GPUReconstruction::CheckErrorCodes(bool cpuOnly, bool forceShowErrors, s
   return retVal;
 }
 
-int32_t GPUReconstruction::GPUFailedMsgA(const int64_t error, const char* file, int32_t line, bool failOnError)
+int32_t GPUReconstruction::GPUChkErrA(const int64_t error, const char* file, int32_t line, bool failOnError)
 {
-  if (error == 0 || !GPUFailedMsgInternal(error, file, line)) {
+  if (error == 0 || !GPUChkErrInternal(error, file, line)) {
     return 0;
   }
   if (failOnError) {
diff --git a/GPU/GPUTracking/Base/GPUReconstruction.h b/GPU/GPUTracking/Base/GPUReconstruction.h
@@ -143,7 +143,7 @@ class GPUReconstruction
   virtual void* getGPUPointer(void* ptr) { return ptr; }
   virtual void startGPUProfiling() {}
   virtual void endGPUProfiling() {}
-  int32_t GPUFailedMsgA(const int64_t error, const char* file, int32_t line, bool failOnError);
+  int32_t GPUChkErrA(const int64_t error, const char* file, int32_t line, bool failOnError);
   int32_t CheckErrorCodes(bool cpuOnly = false, bool forceShowErrors = false, std::vector<std::array<uint32_t, 4>>* fillErrors = nullptr);
   void RunPipelineWorker();
   void TerminatePipelineWorker();
@@ -247,7 +247,7 @@ class GPUReconstruction
   void UpdateMaxMemoryUsed();
   int32_t EnqueuePipeline(bool terminate = false);
   GPUChain* GetNextChainInQueue();
-  virtual int32_t GPUFailedMsgInternal(const int64_t error, const char* file, int32_t line) const { return 0; }
+  virtual int32_t GPUChkErrInternal(const int64_t error, const char* file, int32_t line) const { return 0; }
 
   virtual int32_t registerMemoryForGPU_internal(const void* ptr, size_t size) = 0;
   virtual int32_t unregisterMemoryForGPU_internal(const void* ptr) = 0;
diff --git a/GPU/GPUTracking/Base/GPUReconstructionDeviceBase.h b/GPU/GPUTracking/Base/GPUReconstructionDeviceBase.h
@@ -46,7 +46,7 @@ class GPUReconstructionDeviceBase : public GPUReconstructionCPU
   virtual int32_t InitDevice_Runtime() = 0;
   int32_t ExitDevice() override;
   virtual int32_t ExitDevice_Runtime() = 0;
-  virtual int32_t GPUFailedMsgInternal(const int64_t error, const char* file, int32_t line) const override = 0;
+  virtual int32_t GPUChkErrInternal(const int64_t error, const char* file, int32_t line) const override = 0;
   int32_t registerMemoryForGPU_internal(const void* ptr, size_t size) override;
   int32_t unregisterMemoryForGPU_internal(const void* ptr) override;
   void unregisterRemainingRegisteredMemory();
diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu
diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h
@@ -33,13 +33,13 @@ class GPUReconstructionCUDABackend : public GPUReconstructionDeviceBase
 {
  public:
   ~GPUReconstructionCUDABackend() override;
-  static int32_t GPUFailedMsgStatic(const int64_t error, const char* file, int32_t line);
+  static int32_t GPUChkErrStatic(const int64_t error, const char* file, int32_t line);
 
  protected:
   GPUReconstructionCUDABackend(const GPUSettingsDeviceBackend& cfg);
 
   void PrintKernelOccupancies() override;
-  virtual int32_t GPUFailedMsgInternal(const int64_t error, const char* file, int32_t line) const override { return GPUFailedMsgStatic(error, file, line); }
+  virtual int32_t GPUChkErrInternal(const int64_t error, const char* file, int32_t line) const override { return GPUChkErrStatic(error, file, line); }
 
   template <class T, int32_t I = 0, typename... Args>
   void runKernelBackend(const krnlSetupArgs<T, I, Args...>& args);
diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAExternalProvider.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAExternalProvider.cu
@@ -36,7 +36,7 @@ using namespace o2::gpu;
 #ifndef GPUCA_NO_CONSTANT_MEMORY
 static GPUReconstructionDeviceBase::deviceConstantMemRegistration registerConstSymbol([]() {
   void* retVal = nullptr;
-  if (GPUReconstructionCUDA::GPUFailedMsgStatic(cudaGetSymbolAddress(&retVal, gGPUConstantMemBuffer), __FILE__, __LINE__)) {
+  if (GPUReconstructionCUDA::GPUChkErrStatic(cudaGetSymbolAddress(&retVal, gGPUConstantMemBuffer), __FILE__, __LINE__)) {
     throw std::runtime_error("Could not obtain GPU constant memory symbol");
   }
   return retVal;
diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAInternals.h b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAInternals.h
@@ -49,7 +49,7 @@ class GPUDebugTiming
   {
     if (mDo) {
       if (mDeviceTimers) {
-        mRec->GPUFailedMsg(cudaEventRecord(mDeviceTimers[0].get<cudaEvent_t>(), mStreams[mXYZ.x.stream]));
+        mRec->GPUChkErr(cudaEventRecord(mDeviceTimers[0].get<cudaEvent_t>(), mStreams[mXYZ.x.stream]));
       } else {
         mTimer.ResetStart();
       }
@@ -59,13 +59,13 @@ class GPUDebugTiming
   {
     if (mDo && mXYZ.t == 0.) {
       if (mDeviceTimers) {
-        mRec->GPUFailedMsg(cudaEventRecord(mDeviceTimers[1].get<cudaEvent_t>(), mStreams[mXYZ.x.stream]));
-        mRec->GPUFailedMsg(cudaEventSynchronize(mDeviceTimers[1].get<cudaEvent_t>()));
+        mRec->GPUChkErr(cudaEventRecord(mDeviceTimers[1].get<cudaEvent_t>(), mStreams[mXYZ.x.stream]));
+        mRec->GPUChkErr(cudaEventSynchronize(mDeviceTimers[1].get<cudaEvent_t>()));
         float v;
-        mRec->GPUFailedMsg(cudaEventElapsedTime(&v, mDeviceTimers[0].get<cudaEvent_t>(), mDeviceTimers[1].get<cudaEvent_t>()));
+        mRec->GPUChkErr(cudaEventElapsedTime(&v, mDeviceTimers[0].get<cudaEvent_t>(), mDeviceTimers[1].get<cudaEvent_t>()));
         mXYZ.t = v * 1.e-3f;
       } else {
-        mRec->GPUFailedMsg(cudaStreamSynchronize(mStreams[mXYZ.x.stream]));
+        mRec->GPUChkErr(cudaStreamSynchronize(mStreams[mXYZ.x.stream]));
         mXYZ.t = mTimer.GetCurrentElapsedTime();
       }
     }
diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu
@@ -34,7 +34,7 @@ __global__ void gGPUConstantMemBuffer_dummy(int32_t* p) { *p = *(int32_t*)&gGPUC
 template <>
 inline void GPUReconstructionCUDABackend::runKernelBackendInternal<GPUMemClean16, 0>(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size)
 {
-  GPUFailedMsg(cudaMemsetAsync(ptr, 0, size, mInternals->Streams[_xyz.x.stream]));
+  GPUChkErr(cudaMemsetAsync(ptr, 0, size, mInternals->Streams[_xyz.x.stream]));
 }
 
 template <class T, int32_t I, typename... Args>
@@ -56,7 +56,7 @@ inline void GPUReconstructionCUDABackend::runKernelBackendInternal(const krnlSet
 #endif
     pArgs[arg_offset] = &y.index;
     GPUReconstructionCUDAInternals::getArgPtrs(&pArgs[arg_offset + 1], args...);
-    GPUFailedMsg(cuLaunchKernel(*mInternals->kernelFunctions[getRTCkernelNum<false, T, I>()], x.nBlocks, 1, 1, x.nThreads, 1, 1, 0, mInternals->Streams[x.stream], (void**)pArgs, nullptr));
+    GPUChkErr(cuLaunchKernel(*mInternals->kernelFunctions[getRTCkernelNum<false, T, I>()], x.nBlocks, 1, 1, x.nThreads, 1, 1, 0, mInternals->Streams[x.stream], (void**)pArgs, nullptr));
   }
 }
 
@@ -67,16 +67,16 @@ void GPUReconstructionCUDABackend::runKernelBackend(const krnlSetupArgs<T, I, Ar
   auto& z = args.s.z;
   if (z.evList) {
     for (int32_t k = 0; k < z.nEvents; k++) {
-      GPUFailedMsg(cudaStreamWaitEvent(mInternals->Streams[x.stream], ((cudaEvent_t*)z.evList)[k], 0));
+      GPUChkErr(cudaStreamWaitEvent(mInternals->Streams[x.stream], ((cudaEvent_t*)z.evList)[k], 0));
     }
   }
   {
     GPUDebugTiming timer(mProcessingSettings.deviceTimers && mProcessingSettings.debugLevel > 0, (deviceEvent*)mDebugEvents, mInternals->Streams, args.s, this);
     std::apply([this, &args](auto&... vals) { this->runKernelBackendInternal<T, I, Args...>(args.s, vals...); }, args.v);
   }
-  GPUFailedMsg(cudaGetLastError());
+  GPUChkErr(cudaGetLastError());
   if (z.ev) {
-    GPUFailedMsg(cudaEventRecord(*(cudaEvent_t*)z.ev, mInternals->Streams[x.stream]));
+    GPUChkErr(cudaEventRecord(*(cudaEvent_t*)z.ev, mInternals->Streams[x.stream]));
   }
 }
 
@@ -138,7 +138,7 @@ void GPUReconstructionCUDABackend::getRTCKernelCalls(std::vector<std::string>& k
 #ifndef GPUCA_NO_CONSTANT_MEMORY
 static GPUReconstructionDeviceBase::deviceConstantMemRegistration registerConstSymbol([]() {
   void* retVal = nullptr;
-  if (GPUReconstructionCUDA::GPUFailedMsgStatic(cudaGetSymbolAddress(&retVal, gGPUConstantMemBuffer), __FILE__, __LINE__)) {
+  if (GPUReconstructionCUDA::GPUChkErrStatic(cudaGetSymbolAddress(&retVal, gGPUConstantMemBuffer), __FILE__, __LINE__)) {
     throw std::runtime_error("Could not obtain GPU constant memory symbol");
   }
   return retVal;
diff --git a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.cxx b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.cxx
diff --git a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.h b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.h
diff --git a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLKernels.cxx b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLKernels.cxx

Original file line number	Diff line number	Diff line change
`@@ -1078,9 +1078,9 @@ int32_t GPUReconstruction::CheckErrorCodes(bool cpuOnly, bool forceShowErrors, s`
`1078`	`1078`	`return retVal;`
`1079`	`1079`	`}`
`1080`	`1080`
`1081`		`-int32_t GPUReconstruction::GPUFailedMsgA(const int64_t error, const char* file, int32_t line, bool failOnError)`
	`1081`	`+int32_t GPUReconstruction::GPUChkErrA(const int64_t error, const char* file, int32_t line, bool failOnError)`
`1082`	`1082`	`{`
`1083`		`- if (error == 0 \|\| !GPUFailedMsgInternal(error, file, line)) {`
	`1083`	`+ if (error == 0 \|\| !GPUChkErrInternal(error, file, line)) {`
`1084`	`1084`	`return 0;`
`1085`	`1085`	`}`
`1086`	`1086`	`if (failOnError) {`
Original file line number	Diff line number	Diff line change
`@@ -36,7 +36,7 @@ using namespace o2::gpu;`
`36`	`36`	`#ifndef GPUCA_NO_CONSTANT_MEMORY`
`37`	`37`	`static GPUReconstructionDeviceBase::deviceConstantMemRegistration registerConstSymbol([]() {`
`38`	`38`	`void* retVal = nullptr;`
`39`		`- if (GPUReconstructionCUDA::GPUFailedMsgStatic(cudaGetSymbolAddress(&retVal, gGPUConstantMemBuffer), __FILE__, __LINE__)) {`
	`39`	`+ if (GPUReconstructionCUDA::GPUChkErrStatic(cudaGetSymbolAddress(&retVal, gGPUConstantMemBuffer), __FILE__, __LINE__)) {`
`40`	`40`	`throw std::runtime_error("Could not obtain GPU constant memory symbol");`
`41`	`41`	`}`
`42`	`42`	`return retVal;`
Original file line number	Diff line number	Diff line change
`@@ -49,7 +49,7 @@ class GPUDebugTiming`
`49`	`49`	`{`
`50`	`50`	`if (mDo) {`
`51`	`51`	`if (mDeviceTimers) {`
`52`		`- mRec->GPUFailedMsg(cudaEventRecord(mDeviceTimers[0].get<cudaEvent_t>(), mStreams[mXYZ.x.stream]));`
	`52`	`+ mRec->GPUChkErr(cudaEventRecord(mDeviceTimers[0].get<cudaEvent_t>(), mStreams[mXYZ.x.stream]));`
`53`	`53`	`} else {`
`54`	`54`	`mTimer.ResetStart();`
`55`	`55`	`}`
`@@ -59,13 +59,13 @@ class GPUDebugTiming`
`59`	`59`	`{`
`60`	`60`	`if (mDo && mXYZ.t == 0.) {`
`61`	`61`	`if (mDeviceTimers) {`
`62`		`- mRec->GPUFailedMsg(cudaEventRecord(mDeviceTimers[1].get<cudaEvent_t>(), mStreams[mXYZ.x.stream]));`
`63`		`- mRec->GPUFailedMsg(cudaEventSynchronize(mDeviceTimers[1].get<cudaEvent_t>()));`
	`62`	`+ mRec->GPUChkErr(cudaEventRecord(mDeviceTimers[1].get<cudaEvent_t>(), mStreams[mXYZ.x.stream]));`
	`63`	`+ mRec->GPUChkErr(cudaEventSynchronize(mDeviceTimers[1].get<cudaEvent_t>()));`
`64`	`64`	`float v;`
`65`		`- mRec->GPUFailedMsg(cudaEventElapsedTime(&v, mDeviceTimers[0].get<cudaEvent_t>(), mDeviceTimers[1].get<cudaEvent_t>()));`
	`65`	`+ mRec->GPUChkErr(cudaEventElapsedTime(&v, mDeviceTimers[0].get<cudaEvent_t>(), mDeviceTimers[1].get<cudaEvent_t>()));`
`66`	`66`	`mXYZ.t = v * 1.e-3f;`
`67`	`67`	`} else {`
`68`		`- mRec->GPUFailedMsg(cudaStreamSynchronize(mStreams[mXYZ.x.stream]));`
	`68`	`+ mRec->GPUChkErr(cudaStreamSynchronize(mStreams[mXYZ.x.stream]));`
`69`	`69`	`mXYZ.t = mTimer.GetCurrentElapsedTime();`
`70`	`70`	`}`
`71`	`71`	`}`
Original file line number	Diff line number	Diff line change
`@@ -34,7 +34,7 @@ __global__ void gGPUConstantMemBuffer_dummy(int32_t* p) { *p = *(int32_t*)&gGPUC`
`34`	`34`	`template <>`
`35`	`35`	`inline void GPUReconstructionCUDABackend::runKernelBackendInternal<GPUMemClean16, 0>(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size)`
`36`	`36`	`{`
`37`		`- GPUFailedMsg(cudaMemsetAsync(ptr, 0, size, mInternals->Streams[_xyz.x.stream]));`
	`37`	`+ GPUChkErr(cudaMemsetAsync(ptr, 0, size, mInternals->Streams[_xyz.x.stream]));`
`38`	`38`	`}`
`39`	`39`
`40`	`40`	`template <class T, int32_t I, typename... Args>`
`@@ -56,7 +56,7 @@ inline void GPUReconstructionCUDABackend::runKernelBackendInternal(const krnlSet`
`56`	`56`	`#endif`
`57`	`57`	`pArgs[arg_offset] = &y.index;`
`58`	`58`	`GPUReconstructionCUDAInternals::getArgPtrs(&pArgs[arg_offset + 1], args...);`
`59`		`- GPUFailedMsg(cuLaunchKernel(*mInternals->kernelFunctions[getRTCkernelNum<false, T, I>()], x.nBlocks, 1, 1, x.nThreads, 1, 1, 0, mInternals->Streams[x.stream], (void**)pArgs, nullptr));`
	`59`	`+ GPUChkErr(cuLaunchKernel(*mInternals->kernelFunctions[getRTCkernelNum<false, T, I>()], x.nBlocks, 1, 1, x.nThreads, 1, 1, 0, mInternals->Streams[x.stream], (void**)pArgs, nullptr));`
`60`	`60`	`}`
`61`	`61`	`}`
`62`	`62`
`@@ -67,16 +67,16 @@ void GPUReconstructionCUDABackend::runKernelBackend(const krnlSetupArgs<T, I, Ar`
`67`	`67`	`auto& z = args.s.z;`
`68`	`68`	`if (z.evList) {`
`69`	`69`	`for (int32_t k = 0; k < z.nEvents; k++) {`
`70`		`- GPUFailedMsg(cudaStreamWaitEvent(mInternals->Streams[x.stream], ((cudaEvent_t*)z.evList)[k], 0));`
	`70`	`+ GPUChkErr(cudaStreamWaitEvent(mInternals->Streams[x.stream], ((cudaEvent_t*)z.evList)[k], 0));`
`71`	`71`	`}`
`72`	`72`	`}`
`73`	`73`	`{`
`74`	`74`	`GPUDebugTiming timer(mProcessingSettings.deviceTimers && mProcessingSettings.debugLevel > 0, (deviceEvent*)mDebugEvents, mInternals->Streams, args.s, this);`
`75`	`75`	`std::apply([this, &args](auto&... vals) { this->runKernelBackendInternal<T, I, Args...>(args.s, vals...); }, args.v);`
`76`	`76`	`}`
`77`		`- GPUFailedMsg(cudaGetLastError());`
	`77`	`+ GPUChkErr(cudaGetLastError());`
`78`	`78`	`if (z.ev) {`
`79`		`- GPUFailedMsg(cudaEventRecord(*(cudaEvent_t*)z.ev, mInternals->Streams[x.stream]));`
	`79`	`+ GPUChkErr(cudaEventRecord(*(cudaEvent_t*)z.ev, mInternals->Streams[x.stream]));`
`80`	`80`	`}`
`81`	`81`	`}`
`82`	`82`
`@@ -138,7 +138,7 @@ void GPUReconstructionCUDABackend::getRTCKernelCalls(std::vector<std::string>& k`
`138`	`138`	`#ifndef GPUCA_NO_CONSTANT_MEMORY`
`139`	`139`	`static GPUReconstructionDeviceBase::deviceConstantMemRegistration registerConstSymbol([]() {`
`140`	`140`	`void* retVal = nullptr;`
`141`		`- if (GPUReconstructionCUDA::GPUFailedMsgStatic(cudaGetSymbolAddress(&retVal, gGPUConstantMemBuffer), __FILE__, __LINE__)) {`
	`141`	`+ if (GPUReconstructionCUDA::GPUChkErrStatic(cudaGetSymbolAddress(&retVal, gGPUConstantMemBuffer), __FILE__, __LINE__)) {`
`142`	`142`	`throw std::runtime_error("Could not obtain GPU constant memory symbol");`
`143`	`143`	`}`
`144`	`144`	`return retVal;`