Skip to content

Commit e29eadd

Browse files
committed
GPU RTC: Do not store preprocessed launch-bounds for RTC
1 parent ebf8670 commit e29eadd

17 files changed

+91
-78
lines changed

GPU/Common/GPUCommonAlgorithm.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -214,7 +214,7 @@ typedef GPUCommonAlgorithm CAAlgo;
214214

215215
} // namespace o2::gpu
216216

217-
#if (((defined(__CUDACC__) && !defined(__clang__)) || defined(__HIPCC__))) && !defined(GPUCA_GPUCODE_GENRTC) && !defined(GPUCA_GPUCODE_HOSTONLY)
217+
#if (((defined(__CUDACC__) && !defined(__clang__)) || defined(__HIPCC__))) && !defined(GPUCA_GPUCODE_HOSTONLY)
218218

219219
#include "GPUCommonAlgorithmThrust.h"
220220

GPU/Common/GPUCommonAlgorithmThrust.h

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#ifndef GPUCOMMONALGORITHMTHRUST_H
1616
#define GPUCOMMONALGORITHMTHRUST_H
1717

18+
#ifndef GPUCA_GPUCODE_COMPILEKERNELS
1819
#pragma GCC diagnostic push
1920
#pragma GCC diagnostic ignored "-Wshadow"
2021
#include <thrust/sort.h>
@@ -25,14 +26,19 @@
2526
#include "GPUCommonDef.h"
2627
#include "GPUCommonHelpers.h"
2728

29+
#ifndef __HIPCC__ // CUDA
30+
#include <cub/cub.cuh>
31+
#else // HIP
32+
#include <hipcub/hipcub.hpp>
33+
#endif
34+
#endif // GPUCA_GPUCODE_COMPILEKERNELS
35+
2836
#ifndef __HIPCC__ // CUDA
2937
#define GPUCA_THRUST_NAMESPACE thrust::cuda
3038
#define GPUCA_CUB_NAMESPACE cub
31-
#include <cub/cub.cuh>
3239
#else // HIP
3340
#define GPUCA_THRUST_NAMESPACE thrust::hip
3441
#define GPUCA_CUB_NAMESPACE hipcub
35-
#include <hipcub/hipcub.hpp>
3642
#endif
3743

3844
namespace o2::gpu
@@ -90,6 +96,7 @@ GPUdi() void GPUCommonAlgorithm::sortDeviceDynamic(T* begin, T* end, const S& co
9096
thrust::sort(GPUCA_THRUST_NAMESPACE::par, thrustBegin, thrustEnd, comp);
9197
}
9298

99+
#ifndef GPUCA_GPUCODE_COMPILEKERNELS
93100
template <class T, class S>
94101
GPUhi() void GPUCommonAlgorithm::sortOnDevice(auto* rec, int32_t stream, T* begin, size_t N, const S& comp)
95102
{
@@ -105,6 +112,8 @@ GPUhi() void GPUCommonAlgorithm::sortOnDevice(auto* rec, int32_t stream, T* begi
105112
GPUChkErrS(GPUCA_CUB_NAMESPACE::DeviceMergeSort::SortKeys(tempMem, tempSize, begin, N, comp, rec->mInternals->Streams[stream]));
106113
#endif
107114
}
115+
#endif // #ifndef GPUCA_GPUCODE_COMPILEKERNELS
116+
108117
} // namespace o2::gpu
109118

110119
#undef GPUCA_THRUST_NAMESPACE

GPU/GPUTracking/Base/GPUParam.cxx

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -193,12 +193,10 @@ void GPUParamRTC::setFrom(const GPUParam& param)
193193

194194
std::string GPUParamRTC::generateRTCCode(const GPUParam& param, bool useConstexpr)
195195
{
196-
return "#ifndef GPUCA_GPUCODE_DEVICE\n"
197-
"#include <string>\n"
196+
return "#include <string>\n"
198197
"#include <vector>\n"
199198
"#include <cstdint>\n"
200199
"#include <cstddef>\n"
201-
"#endif\n"
202200
"namespace o2::gpu { class GPUDisplayFrontendInterface; }\n" +
203201
qConfigPrintRtc(std::make_tuple(&param.rec.tpc, &param.rec.trd, &param.rec, &param.par), useConstexpr);
204202
}

GPU/GPUTracking/Base/cuda/CMakeLists.txt

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ endif()
1818
message(STATUS "Building GPUTracking with CUDA support ${TMP_TARGET}")
1919

2020
set(SRCS GPUReconstructionCUDA.cu GPUReconstructionCUDAGenRTC.cxx GPUReconstructionCUDAKernels.cu)
21-
set(HDRS GPUReconstructionCUDA.h GPUReconstructionCUDAInternals.h GPUReconstructionCUDAHelpers.inc GPUReconstructionCUDADef.h GPUReconstructionCUDAIncludesHost.h)
21+
set(HDRS GPUReconstructionCUDA.h GPUReconstructionCUDAInternals.h GPUReconstructionCUDAHelpers.inc GPUReconstructionCUDADef.h GPUReconstructionCUDAIncludesSystem.h)
2222
# -------------------------------- Prepare RTC -------------------------------------------------------
2323
enable_language(ASM)
2424
if(ALIGPU_BUILD_TYPE STREQUAL "O2")
@@ -67,8 +67,8 @@ set(GPU_RTC_BIN ${CMAKE_CURRENT_BINARY_DIR}/GPUReconstructionCUDArtc)
6767
# cmake-format: off
6868
add_custom_command(
6969
OUTPUT ${GPU_RTC_BIN}.src
70-
COMMAND cp ${GPUDIR}/Base/cuda/GPUReconstructionCUDAIncludesHost.h ${GPU_RTC_BIN}.src
71-
COMMAND ${CMAKE_CXX_COMPILER} ${GPU_RTC_DEFINES} ${GPU_RTC_INCLUDES} -std=c++${CMAKE_CUDA_STANDARD} -D__CUDA_ARCH__=${RTC_CUDA_ARCH} -D__CUDACC__ -x c++ -nostdinc -E ${GPU_RTC_SRC} >> ${GPU_RTC_BIN}.src
70+
COMMAND cp ${GPUDIR}/Base/cuda/GPUReconstructionCUDAIncludesSystem.h ${GPU_RTC_BIN}.src
71+
COMMAND ${CMAKE_CXX_COMPILER} ${GPU_RTC_DEFINES} ${GPU_RTC_INCLUDES} -std=c++${CMAKE_CUDA_STANDARD} -D__CUDA_ARCH__=${RTC_CUDA_ARCH} -D__CUDACC__ -x c++ -nostdinc -E -P ${GPU_RTC_SRC} >> ${GPU_RTC_BIN}.src
7272
MAIN_DEPENDENCY ${GPU_RTC_SRC}
7373
IMPLICIT_DEPENDS CXX ${GPU_RTC_SRC}
7474
COMMAND_EXPAND_LISTS

GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,8 @@
1919
#include "GPUDefParametersDefault.h"
2020
#include "GPUDefParametersLoad.inc"
2121

22-
#include "GPUReconstructionCUDAIncludesHost.h"
22+
#include "GPUReconstructionCUDAIncludesSystem.h"
23+
#include "GPUReconstructionCUDADef.h"
2324
#include <cuda_profiler_api.h>
2425

2526
#include "GPUReconstructionCUDA.h"

GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAExternalProvider.cu

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,8 @@
1212
/// \file GPUReconstructionCUDAExternalProvider.cu
1313
/// \author David Rohr
1414

15-
#include "GPUReconstructionCUDAIncludesHost.h"
15+
#include "GPUReconstructionCUDAIncludesSystem.h"
16+
#include "GPUReconstructionCUDADef.h"
1617

1718
#include "GPUReconstructionCUDA.h"
1819
#include "GPUReconstructionCUDAInternals.h"

GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx

Lines changed: 33 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,10 @@
1313
/// \author David Rohr
1414

1515
#define GPUCA_GPUCODE_HOSTONLY
16+
#define GPUCA_DEF_PARAMETERS_LOAD_DEFAULTS
17+
#include "GPUDefParametersDefault.h"
18+
#include "GPUDefParametersLoad.inc"
19+
1620
#include "GPUReconstructionCUDA.h"
1721
#include "GPUParamRTC.h"
1822
#include "GPUDefMacros.h"
@@ -55,13 +59,15 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile)
5559
std::string baseCommand = (mProcessingSettings.RTCprependCommand != "" ? (mProcessingSettings.RTCprependCommand + " ") : "");
5660
baseCommand += (getenv("O2_GPU_RTC_OVERRIDE_CMD") ? std::string(getenv("O2_GPU_RTC_OVERRIDE_CMD")) : std::string(_binary_GPUReconstructionCUDArtc_command_start, _binary_GPUReconstructionCUDArtc_command_len));
5761
baseCommand += std::string(" ") + (mProcessingSettings.RTCoverrideArchitecture != "" ? mProcessingSettings.RTCoverrideArchitecture : std::string(_binary_GPUReconstructionCUDArtc_command_arch_start, _binary_GPUReconstructionCUDArtc_command_arch_len));
62+
const std::string launchBounds = o2::gpu::internal::GPUDefParametersExport(*mParDevice, true);
5863

59-
char shasource[21], shaparam[21], shacmd[21], shakernels[21];
64+
char shasource[21], shaparam[21], shacmd[21], shakernels[21], shabounds[21];
6065
if (mProcessingSettings.rtc.cacheOutput) {
6166
o2::framework::internal::SHA1(shasource, _binary_GPUReconstructionCUDArtc_src_start, _binary_GPUReconstructionCUDArtc_src_len);
6267
o2::framework::internal::SHA1(shaparam, rtcparam.c_str(), rtcparam.size());
6368
o2::framework::internal::SHA1(shacmd, baseCommand.c_str(), baseCommand.size());
6469
o2::framework::internal::SHA1(shakernels, kernelsall.c_str(), kernelsall.size());
70+
o2::framework::internal::SHA1(shabounds, launchBounds.c_str(), launchBounds.size());
6571
}
6672

6773
nCompile = mProcessingSettings.rtc.compilePerKernel ? kernels.size() : 1;
@@ -88,32 +94,29 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile)
8894
if (fp) {
8995
size_t len;
9096
while (true) {
91-
if (fread(sharead, 1, 20, fp) != 20) {
92-
throw std::runtime_error("Cache file corrupt");
93-
}
94-
if (!mProcessingSettings.rtc.ignoreCacheValid && memcmp(sharead, shasource, 20)) {
95-
GPUInfo("Cache file content outdated (source)");
96-
break;
97-
}
98-
if (fread(sharead, 1, 20, fp) != 20) {
99-
throw std::runtime_error("Cache file corrupt");
100-
}
101-
if (!mProcessingSettings.rtc.ignoreCacheValid && memcmp(sharead, shaparam, 20)) {
102-
GPUInfo("Cache file content outdated (param)");
103-
break;
104-
}
105-
if (fread(sharead, 1, 20, fp) != 20) {
106-
throw std::runtime_error("Cache file corrupt");
107-
}
108-
if (!mProcessingSettings.rtc.ignoreCacheValid && memcmp(sharead, shacmd, 20)) {
109-
GPUInfo("Cache file content outdated (commandline)");
110-
break;
111-
}
112-
if (fread(sharead, 1, 20, fp) != 20) {
113-
throw std::runtime_error("Cache file corrupt");
114-
}
115-
if (!mProcessingSettings.rtc.ignoreCacheValid && memcmp(sharead, shakernels, 20)) {
116-
GPUInfo("Cache file content outdated (kernel definitions)");
97+
auto checkSHA = [&](const char* shacmp, const char* name) {
98+
if (fread(sharead, 1, 20, fp) != 20) {
99+
throw std::runtime_error("Cache file corrupt");
100+
}
101+
if (mProcessingSettings.debugLevel >= 3) {
102+
char shaprint1[41], shaprint2[41];
103+
for (uint32_t i = 0; i < 20; i++) {
104+
sprintf(shaprint1 + 2 * i, "%02X ", shacmp[i]);
105+
sprintf(shaprint2 + 2 * i, "%02X ", sharead[i]);
106+
}
107+
GPUInfo("SHA for %s: expected %s, read %s", name, shaprint1, shaprint2);
108+
}
109+
if (!mProcessingSettings.rtc.ignoreCacheValid && memcmp(sharead, shacmp, 20)) {
110+
GPUInfo("Cache file content outdated (%s)", name);
111+
return 1;
112+
}
113+
return 0;
114+
};
115+
if (checkSHA(shasource, "source") ||
116+
checkSHA(shaparam, "param") ||
117+
checkSHA(shacmd, "command line") ||
118+
checkSHA(shakernels, "kernel definitions") ||
119+
checkSHA(shabounds, "launch bounds")) {
117120
break;
118121
}
119122
GPUSettingsProcessingRTC cachedSettings;
@@ -169,11 +172,12 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile)
169172
kernel += mProcessingSettings.rtc.compilePerKernel ? kernels[i] : kernelsall;
170173
kernel += "}";
171174

172-
bool deterministic = mProcessingSettings.rtc.deterministic || o2::gpu::internal::noFastMathKernels.find(GetKernelName(i)) != o2::gpu::internal::noFastMathKernels.end();
175+
bool deterministic = mProcessingSettings.rtc.deterministic || (mProcessingSettings.rtc.compilePerKernel && o2::gpu::internal::noFastMathKernels.find(GetKernelName(i)) != o2::gpu::internal::noFastMathKernels.end());
173176
const std::string deterministicStr = std::string(deterministic ? "#define GPUCA_DETERMINISTIC_CODE(det, indet) det\n" : "#define GPUCA_DETERMINISTIC_CODE(det, indet) indet\n");
174177

175178
if (fwrite(deterministicStr.c_str(), 1, deterministicStr.size(), fp) != deterministicStr.size() ||
176179
fwrite(rtcparam.c_str(), 1, rtcparam.size(), fp) != rtcparam.size() ||
180+
fwrite(launchBounds.c_str(), 1, launchBounds.size(), fp) != launchBounds.size() ||
177181
fwrite(_binary_GPUReconstructionCUDArtc_src_start, 1, _binary_GPUReconstructionCUDArtc_src_len, fp) != _binary_GPUReconstructionCUDArtc_src_len ||
178182
fwrite(kernel.c_str(), 1, kernel.size(), fp) != kernel.size()) {
179183
throw std::runtime_error("Error writing file");
@@ -213,6 +217,7 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile)
213217
fwrite(shaparam, 1, 20, fp) != 20 ||
214218
fwrite(shacmd, 1, 20, fp) != 20 ||
215219
fwrite(shakernels, 1, 20, fp) != 20 ||
220+
fwrite(shabounds, 1, 20, fp) != 20 ||
216221
fwrite(&mProcessingSettings.rtc, sizeof(mProcessingSettings.rtc), 1, fp) != 1) {
217222
throw std::runtime_error("Error writing cache file");
218223
}

GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAIncludesHost.h renamed to GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAIncludesSystem.h

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
// granted to it by virtue of its status as an Intergovernmental Organization
1010
// or submit itself to any jurisdiction.
1111

12-
/// \file GPUReconstructionCUDAIncludes.h
12+
/// \file GPUReconstructionCUDAIncludesSystem.h
1313
/// \author David Rohr
1414

1515
#ifndef O2_GPU_GPURECONSTRUCTIONCUDAINCLUDES_H
@@ -32,8 +32,4 @@
3232
#include <sm_20_atomic_functions.h>
3333
#include <cuda_fp16.h>
3434

35-
#ifndef GPUCA_RTC_CODE
36-
#include "GPUReconstructionCUDADef.h"
37-
#endif
38-
3935
#endif

GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,8 @@
1212
/// \file GPUReconstructionCUDAKernels.cu
1313
/// \author David Rohr
1414

15-
#include "GPUReconstructionCUDAIncludesHost.h"
15+
#include "GPUReconstructionCUDAIncludesSystem.h"
16+
#include "GPUReconstructionCUDADef.h"
1617

1718
#include "GPUReconstructionCUDA.h"
1819
#include "GPUReconstructionCUDAInternals.h"
@@ -108,13 +109,6 @@ void GPUReconstructionCUDABackend::runKernelBackend(const krnlSetupArgs<T, I, Ar
108109
#include "GPUReconstructionKernelList.h"
109110
#undef GPUCA_KRNL
110111

111-
void GPUReconstructionCUDABackend::getRTCKernelCalls(std::vector<std::string>& kernels)
112-
{
113-
#define GPUCA_KRNL(...) kernels.emplace_back(GPUCA_M_STR(GPUCA_KRNLGPU(__VA_ARGS__)));
114-
#include "GPUReconstructionKernelList.h"
115-
#undef GPUCA_KRNL
116-
}
117-
118112
#ifndef GPUCA_NO_CONSTANT_MEMORY
119113
static GPUReconstructionDeviceBase::deviceConstantMemRegistration registerConstSymbol([]() {
120114
void* retVal = nullptr;
@@ -124,3 +118,14 @@ static GPUReconstructionDeviceBase::deviceConstantMemRegistration registerConstS
124118
return retVal;
125119
});
126120
#endif
121+
122+
void GPUReconstructionCUDABackend::getRTCKernelCalls(std::vector<std::string>& kernels)
123+
{
124+
#undef GPUCA_KRNL_LB
125+
#undef __launch_bounds__
126+
#define GPUCA_KRNL(...) kernels.emplace_back(GPUCA_M_STR(GPUCA_KRNLGPU(__VA_ARGS__)));
127+
#define GPUCA_KRNL_LB(x_class, x_attributes, ...) GPUCA_KRNL(x_class, (REG, (GPUCA_M_CAT(GPUCA_RTC_LB_, GPUCA_M_KRNL_NAME(x_class))), GPUCA_M_STRIP(x_attributes)), __VA_ARGS__)
128+
#include "GPUReconstructionKernelList.h"
129+
#undef GPUCA_KRNL
130+
#undef GPUCA_KRNL_LB
131+
}

GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAkernel.template.cu

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,10 @@
1313
/// \author David Rohr
1414

1515
#define GPUCA_GPUCODE_COMPILEKERNELS
16-
#include "GPUReconstructionCUDAIncludesHost.h"
16+
#include "GPUReconstructionCUDAIncludesSystem.h"
1717
#define GPUCA_KRNL_REG(args) __launch_bounds__(GPUCA_M_MAX2_3(GPUCA_M_STRIP(args)))
1818
#define GPUCA_KRNL(...) GPUCA_KRNLGPU(__VA_ARGS__);
19+
#include "GPUReconstructionCUDADef.h"
1920
#include "GPUReconstructionKernelMacros.h"
2021

2122
// clang-format off

0 commit comments

Comments
 (0)