Skip to content

Commit 857cc42

Browse files
committed
GPU RTC: Split options into technical and code-creation ones, add option to print launch-bounds used for RTC
1 parent ce1e5d6 commit 857cc42

File tree

5 files changed

+41
-22
lines changed

5 files changed

+41
-22
lines changed

GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ int32_t GPUReconstructionCUDA::InitDevice_Runtime()
114114
constexpr int32_t reqVerMaj = 2;
115115
constexpr int32_t reqVerMin = 0;
116116
#endif
117-
if (mProcessingSettings.rtc.enable && mProcessingSettings.rtc.runTest == 2) {
117+
if (mProcessingSettings.rtc.enable && mProcessingSettings.rtctech.runTest == 2) {
118118
genAndLoadRTC();
119119
exit(0);
120120
}
@@ -433,14 +433,14 @@ void GPUReconstructionCUDA::genAndLoadRTC()
433433
throw std::runtime_error("Runtime compilation failed");
434434
}
435435
for (uint32_t i = 0; i < nCompile; i++) {
436-
if (mProcessingSettings.rtc.runTest != 2) {
436+
if (mProcessingSettings.rtctech.runTest != 2) {
437437
mInternals->kernelModules.emplace_back(std::make_unique<CUmodule>());
438438
GPUChkErr(cuModuleLoad(mInternals->kernelModules.back().get(), (filename + "_" + std::to_string(i) + mRtcBinExtension).c_str()));
439439
}
440440
remove((filename + "_" + std::to_string(i) + mRtcSrcExtension).c_str());
441441
remove((filename + "_" + std::to_string(i) + mRtcBinExtension).c_str());
442442
}
443-
if (mProcessingSettings.rtc.runTest == 2) {
443+
if (mProcessingSettings.rtctech.runTest == 2) {
444444
return;
445445
}
446446
loadKernelModules(mProcessingSettings.rtc.compilePerKernel);

GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -56,10 +56,13 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile)
5656
kernelsall += kernels[i] + "\n";
5757
}
5858

59-
std::string baseCommand = (mProcessingSettings.RTCprependCommand != "" ? (mProcessingSettings.RTCprependCommand + " ") : "");
59+
std::string baseCommand = (mProcessingSettings.rtctech.prependCommand != "" ? (mProcessingSettings.rtctech.prependCommand + " ") : "");
6060
baseCommand += (getenv("O2_GPU_RTC_OVERRIDE_CMD") ? std::string(getenv("O2_GPU_RTC_OVERRIDE_CMD")) : std::string(_binary_GPUReconstructionCUDArtc_command_start, _binary_GPUReconstructionCUDArtc_command_len));
61-
baseCommand += std::string(" ") + (mProcessingSettings.RTCoverrideArchitecture != "" ? mProcessingSettings.RTCoverrideArchitecture : std::string(_binary_GPUReconstructionCUDArtc_command_arch_start, _binary_GPUReconstructionCUDArtc_command_arch_len));
61+
baseCommand += std::string(" ") + (mProcessingSettings.rtctech.overrideArchitecture != "" ? mProcessingSettings.rtctech.overrideArchitecture : std::string(_binary_GPUReconstructionCUDArtc_command_arch_start, _binary_GPUReconstructionCUDArtc_command_arch_len));
6262
const std::string launchBounds = o2::gpu::internal::GPUDefParametersExport(*mParDevice, true);
63+
if (mProcessingSettings.rtctech.printLaunchBounds || mProcessingSettings.debugLevel >= 3) {
64+
GPUInfo("RTC Launch Bounds:\n%s", launchBounds.c_str());
65+
}
6366

6467
char shasource[21], shaparam[21], shacmd[21], shakernels[21], shabounds[21];
6568
if (mProcessingSettings.rtc.cacheOutput) {
@@ -74,12 +77,12 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile)
7477
bool cacheLoaded = false;
7578
int32_t fd = 0;
7679
if (mProcessingSettings.rtc.cacheOutput) {
77-
if (mProcessingSettings.RTCcacheFolder != ".") {
78-
std::filesystem::create_directories(mProcessingSettings.RTCcacheFolder);
80+
if (mProcessingSettings.rtctech.cacheFolder != ".") {
81+
std::filesystem::create_directories(mProcessingSettings.rtctech.cacheFolder);
7982
}
80-
if (mProcessingSettings.rtc.cacheMutex) {
83+
if (mProcessingSettings.rtctech.cacheMutex) {
8184
mode_t mask = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH;
82-
fd = open((mProcessingSettings.RTCcacheFolder + "/cache.lock").c_str(), O_RDWR | O_CREAT | O_CLOEXEC, mask);
85+
fd = open((mProcessingSettings.rtctech.cacheFolder + "/cache.lock").c_str(), O_RDWR | O_CREAT | O_CLOEXEC, mask);
8386
if (fd == -1) {
8487
throw std::runtime_error("Error opening rtc cache mutex lock file");
8588
}
@@ -89,7 +92,7 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile)
8992
}
9093
}
9194

92-
FILE* fp = fopen((mProcessingSettings.RTCcacheFolder + "/rtc.cuda.cache").c_str(), "rb");
95+
FILE* fp = fopen((mProcessingSettings.rtctech.cacheFolder + "/rtc.cuda.cache").c_str(), "rb");
9396
char sharead[20];
9497
if (fp) {
9598
size_t len;
@@ -106,7 +109,7 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile)
106109
}
107110
GPUInfo("SHA for %s: expected %s, read %s", name, shaprint1, shaprint2);
108111
}
109-
if (!mProcessingSettings.rtc.ignoreCacheValid && memcmp(sharead, shacmp, 20)) {
112+
if (!mProcessingSettings.rtctech.ignoreCacheValid && memcmp(sharead, shacmp, 20)) {
110113
GPUInfo("Cache file content outdated (%s)", name);
111114
return 1;
112115
}
@@ -124,7 +127,7 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile)
124127
if (fread(&cachedSettings, sizeof(cachedSettings), 1, fp) != 1) {
125128
throw std::runtime_error("Cache file corrupt");
126129
}
127-
if (!mProcessingSettings.rtc.ignoreCacheValid && !(cachedSettings == mProcessingSettings.rtc)) {
130+
if (!mProcessingSettings.rtctech.ignoreCacheValid && !(cachedSettings == mProcessingSettings.rtc)) {
128131
GPUInfo("Cache file content outdated (rtc parameters)");
129132
break;
130133
}
@@ -207,7 +210,7 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile)
207210
GPUInfo("RTC Compilation finished (%f seconds)", rtcTimer.GetCurrentElapsedTime());
208211
}
209212
if (mProcessingSettings.rtc.cacheOutput) {
210-
FILE* fp = fopen((mProcessingSettings.RTCcacheFolder + "/rtc.cuda.cache").c_str(), "w+b");
213+
FILE* fp = fopen((mProcessingSettings.rtctech.cacheFolder + "/rtc.cuda.cache").c_str(), "w+b");
211214
if (fp == nullptr) {
212215
throw std::runtime_error("Cannot open cache file for writing");
213216
}
@@ -245,7 +248,7 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile)
245248
fclose(fp);
246249
}
247250
}
248-
if (mProcessingSettings.rtc.cacheOutput && mProcessingSettings.rtc.cacheMutex) {
251+
if (mProcessingSettings.rtc.cacheOutput && mProcessingSettings.rtctech.cacheMutex) {
249252
if (lockf(fd, F_ULOCK, 0)) {
250253
throw std::runtime_error("Error unlocking RTC cache mutex file");
251254
}

GPU/GPUTracking/Definitions/GPUSettingsList.h

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,9 @@ BeginNamespace(gpu)
3535

3636
// Settings concerning the reconstruction, stored as parameters in GPU constant memory
3737
// There must be no bool in here, use int8_t, as sizeof(bool) is compiler dependent and fails on GPUs!!!!!!
38+
// Split in different blocks for global and per Detector
39+
40+
// Reconstruction parameters for TPC, no bool in here !!!
3841
BeginSubConfig(GPUSettingsRecTPC, tpc, configStandalone.rec, "RECTPC", 0, "Reconstruction settings", rec_tpc)
3942
AddOptionRTC(rejectQPtB5, float, 1.f / GPUCA_MIN_TRACK_PTB5_REJECT_DEFAULT, "", 0, "QPt threshold to reject clusters of TPC tracks (Inverse Pt, scaled to B=0.5T!!!)")
4043
AddOptionRTC(hitPickUpFactor, float, 1.f, "", 0, "multiplier for the combined cluster+track error during track following")
@@ -161,6 +164,7 @@ AddOptionArray(PID_remap, int8_t, 9, (0, 1, 2, 3, 4, 5, 6, 7, 8), "", 0, "Remap
161164
AddHelp("help", 'h')
162165
EndConfig()
163166

167+
// Reconstruction parameters for TRD, no bool in here !!!
164168
BeginSubConfig(GPUSettingsRecTRD, trd, configStandalone.rec, "RECTRD", 0, "Reconstruction settings", rec_trd)
165169
AddOptionRTC(minTrackPt, float, .5f, "", 0, "Min Pt for tracks to be propagated through the TRD")
166170
AddOptionRTC(maxChi2, float, 20.f, "", 0, "Max chi2 for TRD tracklets to be matched to a track")
@@ -182,11 +186,12 @@ AddOptionRTC(pileupBwdNBC, uint8_t, 80, "", 0, "Pre-trigger Pile-up integration
182186
AddHelp("help", 'h')
183187
EndConfig()
184188

185-
// Dynamic settings, must NOT use AddOptionRTC(...) !!!
189+
// Dynamic reconstruction parameters, no bool in here!!!, must NOT use AddOptionRTC(...) !!!
186190
BeginSubConfig(GPUSettingsRecDynamic, dyn, configStandalone.rec, "RECDYN", 0, "Reconstruction settings", rec_dyn)
187191
AddHelp("help", 'h')
188192
EndConfig()
189193

194+
// Global reconstruction parameters, no bool in here !!!
190195
BeginSubConfig(GPUSettingsRec, rec, configStandalone, "REC", 0, "Reconstruction settings", rec)
191196
AddOptionRTC(maxTrackQPtB5, float, 1.f / GPUCA_MIN_TRACK_PTB5_DEFAULT, "", 0, "required max Q/Pt (==min Pt) of tracks")
192197
AddOptionRTC(fwdTPCDigitsAsClusters, uint8_t, 0, "", 0, "Forward TPC digits as clusters (if they pass the ZS threshold)")
@@ -203,26 +208,38 @@ AddHelp("help", 'h')
203208
EndConfig()
204209

205210
#ifndef __OPENCL__
211+
// Parameters that might affect the RTC code (if these change, the cache cannot be used)
206212
BeginSubConfig(GPUSettingsProcessingRTC, rtc, configStandalone.proc, "RTC", 0, "Processing settings", proc_rtc)
207213
AddOption(cacheOutput, bool, false, "", 0, "Cache RTC compilation results")
208214
AddOption(optConstexpr, bool, true, "", 0, "Replace constant variables by static constexpr expressions")
209215
AddOption(optSpecialCode, int8_t, -1, "", 0, "Insert GPUCA_RTC_SPECIAL_CODE special code during RTC")
210216
AddOption(deterministic, bool, false, "", 0, "Compile RTC in deterministic mode, with NO_FAST_MATH flags and GPUCA_DETERMINISTIC_MODE define")
211217
AddOption(compilePerKernel, bool, true, "", 0, "Run one RTC compilation per kernel")
212218
AddOption(enable, bool, false, "", 0, "Use RTC to optimize GPU code")
219+
AddHelp("help", 'h')
220+
EndConfig()
221+
222+
// Technical parameters for RunTimeCompilation, which do not change the RTC code
223+
BeginSubConfig(GPUSettingsProcessingRTCtechnical, rtctech, configStandalone.proc, "RTCTECH", 0, "Processing settings", proc_rtctech)
213224
AddOption(runTest, int32_t, 0, "", 0, "Do not run the actual benchmark, but just test RTC compilation (1 full test, 2 test only compilation)")
214225
AddOption(cacheMutex, bool, true, "", 0, "Use a file lock to serialize access to the cache folder")
215226
AddOption(ignoreCacheValid, bool, false, "", 0, "If set, allows to use RTC cached code files even if they are not valid for the current source code / parameters")
227+
AddOption(printLaunchBounds, bool, false, "", 0, "Print launch bounds used for RTC code as debugging option")
228+
AddOption(cacheFolder, std::string, "./rtccache/", "", 0, "Folder in which the cache file is stored")
229+
AddOption(prependCommand, std::string, "", "", 0, "Prepend RTC compilation commands by this string")
230+
AddOption(overrideArchitecture, std::string, "", "", 0, "Override architecture part of RTC compilation command line") // Part of cmdLine, so checked against the cache
216231
AddHelp("help", 'h')
217232
EndConfig()
218233

234+
// Parameters that steer reconstruction that do not go to the device, or only in derived form.
219235
BeginSubConfig(GPUSettingsProcessingParam, param, configStandalone.proc, "PARAM", 0, "Processing settings", proc_param)
220236
AddOptionArray(tpcErrorParamY, float, 4, (0.06f, 0.24f, 0.12f, 0.1f), "", 0, "TPC Cluster Y Error Parameterization")
221237
AddOptionArray(tpcErrorParamZ, float, 4, (0.06f, 0.24f, 0.15f, 0.1f), "", 0, "TPC Cluster Z Error Parameterization")
222238
AddOption(tpcTriggerHandling, bool, true, "", 0, "Enable TPC trigger handling")
223239
AddHelp("help", 'h')
224240
EndConfig()
225241

242+
// Settings steering the processing of NN Clusterization
226243
BeginSubConfig(GPUSettingsProcessingNNclusterizer, nn, configStandalone.proc, "NN", 0, "Processing settings for neural network clusterizer", proc_nn)
227244
AddOption(applyNNclusterizer, int, 0, "", 0, "(bool, default = 0), if the neural network clusterizer should be used.")
228245
AddOption(nnInferenceDevice, std::string, "CPU", "", 0, "(std::string) Specify inference device (cpu (default), rocm, cuda)")
@@ -320,16 +337,14 @@ AddOption(tpcMaxAttachedClustersPerSectorRow, uint32_t, 51000, "", 0, "Maximum n
320337
AddOption(tpcUseOldCPUDecoding, bool, false, "", 0, "Enable old CPU-based TPC decoding")
321338
AddOption(tpcApplyCFCutsAtDecoding, bool, false, "", 0, "Apply cluster cuts from clusterization during decoding of compressed clusters")
322339
AddOption(tpcApplyClusterFilterOnCPU, uint8_t, 0, "", 0, "Apply custom cluster filter of GPUTPCClusterFilter class, 0: off, 1: debug, 2: PbPb23")
323-
AddOption(RTCcacheFolder, std::string, "./rtccache/", "", 0, "Folder in which the cache file is stored")
324-
AddOption(RTCprependCommand, std::string, "", "", 0, "Prepend RTC compilation commands by this string")
325-
AddOption(RTCoverrideArchitecture, std::string, "", "", 0, "Override arhcitecture part of RTC compilation command line")
326340
AddOption(oclPlatformNum, int32_t, -1, "", 0, "Platform to use, in case the backend provides multiple platforms (OpenCL only, -1 = auto-select, -2 query all platforms (also incompatible))")
327341
AddOption(oclCompileFromSources, bool, false, "", 0, "Compile OpenCL binary from included source code instead of using included spirv code")
328342
AddOption(oclOverrideSourceBuildFlags, std::string, "", "", 0, "Override OCL build flags for compilation from source, put a space for empty options")
329343
AddOption(printSettings, bool, false, "", 0, "Print all settings when initializing")
330344
AddOption(tpcFreeAllocatedMemoryAfterProcessing, bool, false, "", 0, "Clean all memory allocated by TPC when TPC processing done, only data written to external output resources will remain")
331345
AddVariable(eventDisplay, o2::gpu::GPUDisplayFrontendInterface*, nullptr)
332346
AddSubConfig(GPUSettingsProcessingRTC, rtc)
347+
AddSubConfig(GPUSettingsProcessingRTCtechnical, rtctech)
333348
AddSubConfig(GPUSettingsProcessingParam, param)
334349
AddSubConfig(GPUSettingsProcessingNNclusterizer, nn)
335350
AddHelp("help", 'h')

GPU/GPUTracking/GPUTrackingLinkDef_O2_DataTypes.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
#pragma link C++ class o2::gpu::GPUConfigurableParamGPUSettingsProcessing + ;
3131
#pragma link C++ class o2::gpu::GPUConfigurableParamGPUSettingsProcessingParam + ;
3232
#pragma link C++ class o2::gpu::GPUConfigurableParamGPUSettingsProcessingRTC + ;
33+
#pragma link C++ class o2::gpu::GPUConfigurableParamGPUSettingsProcessingRTCtechnical + ;
3334
#pragma link C++ class o2::gpu::GPUConfigurableParamGPUSettingsProcessingNNclusterizer + ;
3435
#pragma link C++ class o2::gpu::GPUConfigurableParamGPUSettingsDisplay + ;
3536
#pragma link C++ class o2::gpu::GPUConfigurableParamGPUSettingsDisplayLight + ;

prodtests/full-system-test/dpl-workflow.sh

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -327,11 +327,11 @@ if has_detector_calib PHS && workflow_has_parameter CALIB; then
327327
fi
328328

329329
[[ ${O2_GPU_DOUBLE_PIPELINE:-$EPNSYNCMODE} == 1 && $GPUTYPE != "CPU" ]] && GPU_CONFIG+=" --enableDoublePipeline"
330-
[[ ${O2_GPU_RTC:-$EPNSYNCMODE} == 1 ]] && GPU_CONFIG_KEY+="GPU_proc_rtc.enable=1;GPU_proc_rtc.cacheOutput=1;GPU_proc.RTCprependCommand=/usr/bin/env TMPDIR=/tmp /usr/bin/taskset -c 0-191;"
331-
[[ ${O2_GPU_RTC:-$EPNSYNCMODE} == 1 && $EPNSYNCMODE == 1 ]] && GPU_CONFIG_KEY+="GPU_proc.RTCcacheFolder=/var/tmp/o2_gpu_rtc_cache;"
330+
# Enable RTC with cached output. prependCommand belongs to the technical RTC sub-config (descriptor proc_rtctech), so it is addressed as GPU_proc_rtctech.<option>, analogous to GPU_proc_rtc.<option>.
[[ ${O2_GPU_RTC:-$EPNSYNCMODE} == 1 ]] && GPU_CONFIG_KEY+="GPU_proc_rtc.enable=1;GPU_proc_rtc.cacheOutput=1;GPU_proc_rtctech.prependCommand=/usr/bin/env TMPDIR=/tmp /usr/bin/taskset -c 0-191;"
331+
# In EPN sync mode, keep the RTC cache in a persistent node-local folder. cacheFolder is an option of the technical RTC sub-config (descriptor proc_rtctech), hence the GPU_proc_rtctech prefix.
[[ ${O2_GPU_RTC:-$EPNSYNCMODE} == 1 && $EPNSYNCMODE == 1 ]] && GPU_CONFIG_KEY+="GPU_proc_rtctech.cacheFolder=/var/tmp/o2_gpu_rtc_cache;"
332332
if [[ ${O2_GPU_RTC:-$EPNSYNCMODE} == 1 ]] && [[ ( ${ALICE_O2_FST:-0} == 1 && ${FST_TMUX_NO_EPN:-0} == 0 ) || $EPNSYNCMODE == 1 ]]; then
333-
[[ ${EPN_NODE_MI100:-0} == 0 ]] && GPU_CONFIG_KEY+="GPU_proc.RTCoverrideArchitecture=--offload-arch=gfx906;"
334-
[[ ${EPN_NODE_MI100:-0} == 1 ]] && GPU_CONFIG_KEY+="GPU_proc.RTCoverrideArchitecture=--offload-arch=gfx908;"
333+
# Non-MI100 nodes: force the gfx906 offload architecture for RTC. overrideArchitecture is an option of the technical RTC sub-config (descriptor proc_rtctech), hence the GPU_proc_rtctech prefix.
[[ ${EPN_NODE_MI100:-0} == 0 ]] && GPU_CONFIG_KEY+="GPU_proc_rtctech.overrideArchitecture=--offload-arch=gfx906;"
334+
# MI100 nodes: force the gfx908 offload architecture for RTC. overrideArchitecture is an option of the technical RTC sub-config (descriptor proc_rtctech), hence the GPU_proc_rtctech prefix.
[[ ${EPN_NODE_MI100:-0} == 1 ]] && GPU_CONFIG_KEY+="GPU_proc_rtctech.overrideArchitecture=--offload-arch=gfx908;"
335335
fi
336336

337337
( workflow_has_parameter AOD || [[ -z "$DISABLE_ROOT_OUTPUT" ]] || needs_root_output o2-emcal-cell-writer-workflow ) && has_detector EMC && RAW_EMC_SUBSPEC=" --subspecification 1 "

0 commit comments

Comments
 (0)