Skip to content

Commit 8e8965d

Browse files
committed
GPU: Provide general GPUFailedMsg functionality also externally
1 parent b5ab60d commit 8e8965d

15 files changed

+61
-42
lines changed

GPU/Common/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ set(HDRS_INSTALL
1515
GPUCommonAlgorithm.h
1616
GPUCommonDef.h
1717
GPUCommonDefAPI.h
18+
GPUCommonChkErr.h
1819
GPUCommonDefSettings.h
1920
GPUCommonConstants.h
2021
GPUCommonLogger.h

GPU/Common/GPUCommonChkErr.h

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
2+
// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
3+
// All rights not expressly granted are reserved.
4+
//
5+
// This software is distributed under the terms of the GNU General Public
6+
// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
7+
//
8+
// In applying this license CERN does not waive the privileges and immunities
9+
// granted to it by virtue of its status as an Intergovernmental Organization
10+
// or submit itself to any jurisdiction.
11+
12+
/// \file GPUCommonChkErr.h
13+
/// \author David Rohr
14+
15+
#ifndef GPUCOMMONCHKERR_H
16+
#define GPUCOMMONCHKERR_H
17+
18+
// Check a backend status code x at the call site: forwards x together with the caller's __FILE__ and
// __LINE__ to whichever GPUFailedMsgA is visible where the macro is used, passing true as the last argument.
// NOTE(review): for GPUReconstruction::GPUFailedMsgA that argument is failOnError (failure -> exception);
// confirm for other providers of GPUFailedMsgA.
#define GPUFailedMsg(x) GPUFailedMsgA(x, __FILE__, __LINE__, true)
19+
// Same forwarding as GPUFailedMsg, but passes false as the last argument, so the result of
// GPUFailedMsgA is left for the caller to inspect.
#define GPUFailedMsgI(x) GPUFailedMsgA(x, __FILE__, __LINE__, false)
20+
21+
#endif

GPU/Common/GPUCommonDefAPI.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@
3636
#define GPUdni() // Device function, not-to-be-inlined
3737
#define GPUdnii() inline // Device function, not-to-be-inlined on device, inlined on host
3838
#define GPUh() // Host-only function
39-
// NOTE: All GPUd*() functions are also compiled on the host during GCC compilation.
39+
// NOTE: All GPUd*() functions are also compiled on the host during host compilation.
4040
// The GPUh*() macros are for the rare cases of functions that you want to compile for the host during GPU compilation.
4141
// Usually, you do not need the GPUh*() versions. If in doubt, use GPUd*()!
4242
#define GPUhi() inline // to-be-inlined host-only function

GPU/GPUTracking/Base/GPUReconstruction.cxx

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1078,6 +1078,21 @@ int32_t GPUReconstruction::CheckErrorCodes(bool cpuOnly, bool forceShowErrors, s
10781078
return retVal;
10791079
}
10801080

1081+
int32_t GPUReconstruction::GPUFailedMsgA(const int64_t error, const char* file, int32_t line, bool failOnError)
1082+
{
1083+
if (error == 0 || !GPUFailedMsgInternal(error, file, line)) {
1084+
return 0;
1085+
}
1086+
if (failOnError) {
1087+
if (mInitialized && mInErrorHandling == false) {
1088+
mInErrorHandling = true;
1089+
CheckErrorCodes(false, true);
1090+
}
1091+
throw std::runtime_error("GPU Backend Failure");
1092+
}
1093+
return 1;
1094+
}
1095+
10811096
void GPUReconstruction::DumpSettings(const char* dir)
10821097
{
10831098
std::string f;

GPU/GPUTracking/Base/GPUReconstruction.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,7 @@ class GPUReconstruction
143143
// Default implementation: returns ptr unchanged. Virtual, so derived classes may override it.
virtual void* getGPUPointer(void* ptr) { return ptr; }
144144
// Default implementation: does nothing. Virtual, so derived classes may override it.
virtual void startGPUProfiling() {}
145145
// Default implementation: does nothing. Virtual, so derived classes may override it.
virtual void endGPUProfiling() {}
146+
int32_t GPUFailedMsgA(const int64_t error, const char* file, int32_t line, bool failOnError);
146147
int32_t CheckErrorCodes(bool cpuOnly = false, bool forceShowErrors = false, std::vector<std::array<uint32_t, 4>>* fillErrors = nullptr);
147148
void RunPipelineWorker();
148149
void TerminatePipelineWorker();
@@ -246,6 +247,7 @@ class GPUReconstruction
246247
void UpdateMaxMemoryUsed();
247248
int32_t EnqueuePipeline(bool terminate = false);
248249
GPUChain* GetNextChainInQueue();
250+
// Backend hook consulted by GPUFailedMsgA for non-zero status codes. This default ignores its
// arguments and returns 0, which GPUFailedMsgA treats as "no failure".
virtual int32_t GPUFailedMsgInternal(const int64_t error, const char* file, int32_t line) const { return 0; }
249251

250252
virtual int32_t registerMemoryForGPU_internal(const void* ptr, size_t size) = 0;
251253
virtual int32_t unregisterMemoryForGPU_internal(const void* ptr) = 0;
@@ -327,6 +329,7 @@ class GPUReconstruction
327329

328330
// Others
329331
bool mInitialized = false;
332+
bool mInErrorHandling = false;
330333
uint32_t mStatNEvents = 0;
331334
uint32_t mNEventsProcessed = 0;
332335
double mStatKernelTime = 0.;

GPU/GPUTracking/Base/GPUReconstructionDeviceBase.cxx

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -175,7 +175,11 @@ void GPUReconstructionDeviceBase::runConstantRegistrators()
175175
{
176176
auto& list = getDeviceConstantMemRegistratorsVector();
177177
for (uint32_t i = 0; i < list.size(); i++) {
178-
mDeviceConstantMemList.emplace_back(list[i]());
178+
auto* ptr = list[i]();
179+
if (ptr == nullptr) {
180+
GPUFatal("Error registering constant memory");
181+
}
182+
mDeviceConstantMemList.emplace_back(ptr);
179183
}
180184
}
181185

GPU/GPUTracking/Base/GPUReconstructionDeviceBase.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ class GPUReconstructionDeviceBase : public GPUReconstructionCPU
4646
virtual int32_t InitDevice_Runtime() = 0;
4747
int32_t ExitDevice() override;
4848
virtual int32_t ExitDevice_Runtime() = 0;
49+
virtual int32_t GPUFailedMsgInternal(const int64_t error, const char* file, int32_t line) const override = 0;
4950
int32_t registerMemoryForGPU_internal(const void* ptr, size_t size) override;
5051
int32_t unregisterMemoryForGPU_internal(const void* ptr) override;
5152
void unregisterRemainingRegisteredMemory();

GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu

Lines changed: 2 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -61,28 +61,16 @@ GPUReconstructionCUDABackend::~GPUReconstructionCUDABackend()
6161
}
6262
}
6363

64-
int32_t GPUReconstructionCUDABackend::GPUFailedMsgAI(const int64_t error, const char* file, int32_t line)
64+
static_assert(sizeof(cudaError_t) <= sizeof(int64_t) && cudaSuccess == 0);
65+
int32_t GPUReconstructionCUDABackend::GPUFailedMsgStatic(const int64_t error, const char* file, int32_t line)
6566
{
66-
// Check for CUDA Error and in the case of an error display the corresponding error string
6767
if (error == cudaSuccess) {
6868
return (0);
6969
}
7070
GPUError("CUDA Error: %ld / %s (%s:%d)", error, cudaGetErrorString((cudaError_t)error), file, line);
7171
return 1;
7272
}
7373

74-
void GPUReconstructionCUDABackend::GPUFailedMsgA(const int64_t error, const char* file, int32_t line)
75-
{
76-
if (GPUFailedMsgAI(error, file, line)) {
77-
static bool runningCallbacks = false;
78-
if (IsInitialized() && runningCallbacks == false) {
79-
runningCallbacks = true;
80-
CheckErrorCodes(false, true);
81-
}
82-
throw std::runtime_error("CUDA Failure");
83-
}
84-
}
85-
8674
GPUReconstructionCUDA::GPUReconstructionCUDA(const GPUSettingsDeviceBackend& cfg) : GPUReconstructionKernels(cfg)
8775
{
8876
mDeviceBackendSettings.deviceType = DeviceType::CUDA;

GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,13 +33,13 @@ class GPUReconstructionCUDABackend : public GPUReconstructionDeviceBase
3333
{
3434
public:
3535
~GPUReconstructionCUDABackend() override;
36-
static int32_t GPUFailedMsgAI(const int64_t error, const char* file, int32_t line);
37-
void GPUFailedMsgA(const int64_t error, const char* file, int32_t line);
36+
static int32_t GPUFailedMsgStatic(const int64_t error, const char* file, int32_t line);
3837

3938
protected:
4039
GPUReconstructionCUDABackend(const GPUSettingsDeviceBackend& cfg);
4140

4241
void PrintKernelOccupancies() override;
42+
virtual int32_t GPUFailedMsgInternal(const int64_t error, const char* file, int32_t line) const override { return GPUFailedMsgStatic(error, file, line); }
4343

4444
template <class T, int32_t I = 0, typename... Args>
4545
void runKernelBackend(const krnlSetupArgs<T, I, Args...>& args);

GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAExternalProvider.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ using namespace o2::gpu;
3636
#ifndef GPUCA_NO_CONSTANT_MEMORY
3737
static GPUReconstructionDeviceBase::deviceConstantMemRegistration registerConstSymbol([]() {
3838
void* retVal = nullptr;
39-
if (cudaGetSymbolAddress(&retVal, gGPUConstantMemBuffer) != cudaSuccess) {
39+
if (GPUReconstructionCUDA::GPUFailedMsgStatic(cudaGetSymbolAddress(&retVal, gGPUConstantMemBuffer), __FILE__, __LINE__)) {
4040
throw std::runtime_error("Could not obtain GPU constant memory symbol");
4141
}
4242
return retVal;

0 commit comments

Comments
 (0)