Skip to content

Commit 04ba3bd

Browse files
committed
GPU: Fix performance regression: DETERMINISTIC CODE was used unintentionally in 2 places
1 parent f926be7 commit 04ba3bd

File tree

3 files changed

+33
-33
lines changed

3 files changed

+33
-33
lines changed

GPU/Common/GPUCommonAlgorithm.h

Lines changed: 17 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -282,30 +282,27 @@ GPUdi() void GPUCommonAlgorithm::sortInBlock(T* begin, T* end, const S& comp)
282282
{
283283
#ifndef GPUCA_GPUCODE
284284
GPUCommonAlgorithm::sort(begin, end, comp);
285+
#elif defined(GPUCA_DETERMINISTIC_MODE) // Not using GPUCA_DETERMINISTIC_CODE, which is enforced in TPC compression
286+
if (get_local_id(0) == 0) {
287+
GPUCommonAlgorithm::sort(begin, end, comp);
288+
}
289+
GPUbarrier();
285290
#else
286-
GPUCA_DETERMINISTIC_CODE( // clang-format off
287-
GPUbarrier();
288-
if (get_local_id(0) == 0) {
289-
GPUCommonAlgorithm::sort(begin, end, comp);
290-
}
291-
GPUbarrier();
292-
, // !GPUCA_DETERMINISTIC_CODE
293-
int32_t n = end - begin;
294-
for (int32_t i = 0; i < n; i++) {
295-
for (int32_t tIdx = get_local_id(0); tIdx < n; tIdx += get_local_size(0)) {
296-
int32_t offset = i % 2;
297-
int32_t curPos = 2 * tIdx + offset;
298-
int32_t nextPos = curPos + 1;
299-
300-
if (nextPos < n) {
301-
if (!comp(begin[curPos], begin[nextPos])) {
302-
IterSwap(&begin[curPos], &begin[nextPos]);
303-
}
291+
int32_t n = end - begin;
292+
for (int32_t i = 0; i < n; i++) {
293+
for (int32_t tIdx = get_local_id(0); tIdx < n; tIdx += get_local_size(0)) {
294+
int32_t offset = i % 2;
295+
int32_t curPos = 2 * tIdx + offset;
296+
int32_t nextPos = curPos + 1;
297+
298+
if (nextPos < n) {
299+
if (!comp(begin[curPos], begin[nextPos])) {
300+
IterSwap(&begin[curPos], &begin[nextPos]);
304301
}
305302
}
306-
GPUbarrier();
307303
}
308-
) // clang-format on
304+
GPUbarrier();
305+
}
309306
#endif
310307
}
311308

GPU/GPUTracking/Base/cuda/GPUReconstructionCUDArtc.cu

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,12 @@
1414

1515
#define GPUCA_GPUCODE_GENRTC
1616
#define GPUCA_GPUCODE_COMPILEKERNELS
17+
18+
// Keep some preprocessor calls unprocessed
1719
#define GPUCA_RTC_SPECIAL_CODE(...) GPUCA_RTC_SPECIAL_CODE(__VA_ARGS__)
1820
#define GPUCA_DETERMINISTIC_CODE(...) GPUCA_DETERMINISTIC_CODE(__VA_ARGS__)
19-
// GPUReconstructionCUDAIncludesSystem.h prependended without preprocessor running
21+
22+
// GPUReconstructionCUDAIncludesSystem.h prependended by CMakewithout preprocessor running
2023
#include "GPUReconstructionCUDADef.h"
2124
#include "GPUReconstructionIncludesDeviceAll.h"
2225

GPU/GPUTracking/DataCompression/GPUTPCCompressionKernels.cxx

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -273,19 +273,19 @@ GPUdii() void GPUTPCCompressionKernels::Thread<GPUTPCCompressionKernels::step1un
273273
#ifdef GPUCA_GPUCODE
274274
static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCompressionKernels_step1unattached) * 2 <= GPUCA_TPC_COMP_CHUNK_SIZE);
275275
#endif
276-
GPUCA_DETERMINISTIC_CODE( // clang-format off
276+
#ifdef GPUCA_DETERMINISTIC_MODE // Not using GPUCA_DETERMINISTIC_CODE, which is enforced in TPC compression
277+
CAAlgo::sortInBlock(sortBuffer, sortBuffer + count, GPUTPCCompressionKernels_Compare<GPUSettings::SortZPadTime>(clusters->clusters[iSector][iRow]));
278+
#else // GPUCA_DETERMINISTIC_MODE
279+
if (param.rec.tpc.compressionSortOrder == GPUSettings::SortZPadTime) {
277280
CAAlgo::sortInBlock(sortBuffer, sortBuffer + count, GPUTPCCompressionKernels_Compare<GPUSettings::SortZPadTime>(clusters->clusters[iSector][iRow]));
278-
, // !GPUCA_DETERMINISTIC_CODE
279-
if (param.rec.tpc.compressionSortOrder == GPUSettings::SortZPadTime) {
280-
CAAlgo::sortInBlock(sortBuffer, sortBuffer + count, GPUTPCCompressionKernels_Compare<GPUSettings::SortZPadTime>(clusters->clusters[iSector][iRow]));
281-
} else if (param.rec.tpc.compressionSortOrder == GPUSettings::SortZTimePad) {
282-
CAAlgo::sortInBlock(sortBuffer, sortBuffer + count, GPUTPCCompressionKernels_Compare<GPUSettings::SortZTimePad>(clusters->clusters[iSector][iRow]));
283-
} else if (param.rec.tpc.compressionSortOrder == GPUSettings::SortPad) {
284-
CAAlgo::sortInBlock(sortBuffer, sortBuffer + count, GPUTPCCompressionKernels_Compare<GPUSettings::SortPad>(clusters->clusters[iSector][iRow]));
285-
} else if (param.rec.tpc.compressionSortOrder == GPUSettings::SortTime) {
286-
CAAlgo::sortInBlock(sortBuffer, sortBuffer + count, GPUTPCCompressionKernels_Compare<GPUSettings::SortTime>(clusters->clusters[iSector][iRow]));
287-
}
288-
) // clang-format on
281+
} else if (param.rec.tpc.compressionSortOrder == GPUSettings::SortZTimePad) {
282+
CAAlgo::sortInBlock(sortBuffer, sortBuffer + count, GPUTPCCompressionKernels_Compare<GPUSettings::SortZTimePad>(clusters->clusters[iSector][iRow]));
283+
} else if (param.rec.tpc.compressionSortOrder == GPUSettings::SortPad) {
284+
CAAlgo::sortInBlock(sortBuffer, sortBuffer + count, GPUTPCCompressionKernels_Compare<GPUSettings::SortPad>(clusters->clusters[iSector][iRow]));
285+
} else if (param.rec.tpc.compressionSortOrder == GPUSettings::SortTime) {
286+
CAAlgo::sortInBlock(sortBuffer, sortBuffer + count, GPUTPCCompressionKernels_Compare<GPUSettings::SortTime>(clusters->clusters[iSector][iRow]));
287+
}
288+
#endif // GPUCA_DETERMINISTIC_MODE
289289
GPUbarrier();
290290
}
291291

0 commit comments

Comments
 (0)