
Commit a80dfaf

GPU: Add option for the GPU backend to use nested OpenMP loops for better parallelism
1 parent 02cf084 commit a80dfaf
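
As a rough illustration of what "nested OpenMP loops" means here, the sketch below shows the generic pattern in plain OpenMP: an outer parallel for over TPC sectors and an inner parallel for over the blocks a kernel would process, with the thread budget split between the two levels. All names and numbers (processBlock, nSectors, nBlocks, the min/ceiling split) are illustrative stand-ins, not code from this commit, which wires the same idea into the existing GPUCA_OPENMP machinery as the diffs below show.

```cpp
#include <omp.h>
#include <cstdio>

static void processBlock(int /*sector*/, int /*block*/)
{
  // stand-in for the per-block work a GPU-style kernel does on the CPU backend
}

int main()
{
  const int nSectors = 36;  // TPC sectors, outer loop
  const int nBlocks = 120;  // blocks per kernel launch, inner loop
  const int totalThreads = omp_get_max_threads();
  const int outerThreads = totalThreads < nSectors ? totalThreads : nSectors;
  const int innerThreads = (totalThreads + outerThreads - 1) / outerThreads; // ceiling split

  omp_set_max_active_levels(2); // allow the inner parallel region to actually spawn threads

#pragma omp parallel for num_threads(outerThreads)
  for (int sector = 0; sector < nSectors; sector++) {
#pragma omp parallel for num_threads(innerThreads)
    for (int block = 0; block < nBlocks; block++) {
      processBlock(sector, block);
    }
  }
  std::printf("outer threads: %d, inner threads per sector: %d\n", outerThreads, innerThreads);
  return 0;
}
```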

5 files changed, +22 -5 lines changed

GPU/GPUTracking/Base/GPUReconstructionCPU.cxx

Lines changed: 12 additions & 1 deletion
@@ -72,7 +72,8 @@ int GPUReconstructionCPUBackend::runKernelBackend(krnlSetup& _xyz, const Args&..
   unsigned int num = y.num == 0 || y.num == -1 ? 1 : y.num;
   for (unsigned int k = 0; k < num; k++) {
     if (mProcessingSettings.ompKernels) {
-      GPUCA_OPENMP(parallel for num_threads(mProcessingSettings.ompThreads))
+      int ompThreads = mProcessingSettings.ompKernels == 2 ? ((mProcessingSettings.ompThreads + mNestedLoopOmpFactor - 1) / mNestedLoopOmpFactor) : mProcessingSettings.ompThreads;
+      GPUCA_OPENMP(parallel for num_threads(ompThreads))
       for (unsigned int iB = 0; iB < x.nBlocks; iB++) {
         typename T::GPUSharedMemory smem;
         T::template Thread<I>(x.nBlocks, 1, iB, 0, smem, T::Processor(*mHostConstantMem)[y.start + k], args...);
@@ -344,3 +345,13 @@ unsigned int GPUReconstructionCPU::getNextTimerId()
   static std::atomic<unsigned int> id{0};
   return id.fetch_add(1);
 }
+
+unsigned int GPUReconstructionCPU::SetAndGetNestedLoopOmpFactor(bool condition, unsigned int max)
+{
+  if (condition && mProcessingSettings.ompKernels != 1) {
+    mNestedLoopOmpFactor = mProcessingSettings.ompKernels == 2 ? std::max<unsigned int>(max, mProcessingSettings.ompThreads) : mProcessingSettings.ompThreads;
+  } else {
+    mNestedLoopOmpFactor = 1;
+  }
+  return mNestedLoopOmpFactor;
+}
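
For reference, a tiny standalone example of the rounding-up division used in the new ompThreads computation in runKernelBackend above; the numbers are hypothetical and not taken from the commit.

```cpp
#include <cstdio>

int main()
{
  unsigned int ompThreadsTotal = 20; // would be mProcessingSettings.ompThreads
  unsigned int nestedFactor = 8;     // would be mNestedLoopOmpFactor set by the outer loop
  unsigned int innerThreads = (ompThreadsTotal + nestedFactor - 1) / nestedFactor;
  std::printf("each nested kernel loop gets %u threads\n", innerThreads); // prints 3
  return 0;
}
```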

GPU/GPUTracking/Base/GPUReconstructionCPU.h

Lines changed: 4 additions & 0 deletions
@@ -57,6 +57,7 @@ class GPUReconstructionCPUBackend : public GPUReconstruction
   int runKernelBackend(krnlSetup& _xyz, const Args&... args);
   template <class T, int I>
   krnlProperties getKernelPropertiesBackend();
+  unsigned int mNestedLoopOmpFactor = 1;
 };
 
 template <class T>
@@ -148,6 +149,9 @@ class GPUReconstructionCPU : public GPUReconstructionKernels<GPUReconstructionCP
   HighResTimer& getRecoStepTimer(RecoStep step) { return mTimersRecoSteps[getRecoStepNum(step)].timerTotal; }
   HighResTimer& getGeneralStepTimer(GeneralStep step) { return mTimersGeneralSteps[getGeneralStepNum(step)]; }
 
+  void SetNestedLoopOmpFactor(unsigned int f) { mNestedLoopOmpFactor = f; }
+  unsigned int SetAndGetNestedLoopOmpFactor(bool condition, unsigned int max);
+
  protected:
   struct GPUProcessorProcessors : public GPUProcessor {
     GPUConstantMem* mProcessorsProc = nullptr;

GPU/GPUTracking/Base/GPUSettingsList.h

Lines changed: 1 addition & 1 deletion
@@ -121,7 +121,7 @@ AddOption(mergerSortTracks, int, -1, "", 0, "Sort track indizes for GPU track fi
 AddOption(tpcCompressionGatherMode, int, -1, "", 0, "TPC Compressed Clusters Gather Mode (0: DMA transfer gather gpu to host, 1: serial DMA to host and gather by copy on CPU, 2. gather via GPU kernal DMA access, 3. gather on GPU via kernel, dma afterwards")
 AddOption(tpcCompressionGatherModeKernel, int, -1, "", 0, "TPC Compressed Clusters Gather Mode Kernel (0: unbufferd, 1-3: buffered, 4: multi-block)")
 AddOption(runMC, bool, false, "", 0, "Process MC labels")
-AddOption(ompKernels, bool, true, "", 0, "Parallelize with OMP inside kernels instead of over slices")
+AddOption(ompKernels, unsigned char, 2, "", 0, "Parallelize with OMP inside kernels instead of over slices, 2 for nested parallelization over TPC sectors and inside kernels")
 AddOption(doublePipeline, bool, false, "", 0, "Double pipeline mode")
 AddOption(prefetchTPCpageScan, int, 0, "", 0, "Prefetch Data for TPC page scan in CPU cache")
 AddOption(debugLevel, int, -1, "debug", 'd', "Set debug level (-1 = silend)")
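
Judging from how the other hunks in this commit test the value (`if (mProcessingSettings.ompKernels)`, `ompKernels != 1`, `ompKernels == 2`), the three settings appear to behave as summarized in this illustrative enum; the enum itself is not part of the repository.

```cpp
// Illustrative summary only, not repository code.
enum OmpKernelsMode : unsigned char {
  ParallelOverSlicesOnly = 0,    // outer loop over TPC slices parallel, kernels run single-threaded
  ParallelInsideKernelsOnly = 1, // slices handled one after another, OMP loop over blocks inside each kernel
  NestedParallel = 2             // new default: both levels parallel, thread budget split between them
};
```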

GPU/GPUTracking/Global/GPUChainTracking.cxx

Lines changed: 4 additions & 2 deletions
@@ -1536,7 +1536,7 @@ int GPUChainTracking::RunTPCTrackingSlices_internal()
   int streamMap[NSLICES];
 
   bool error = false;
-  GPUCA_OPENMP(parallel for if(!(doGPU || GetProcessingSettings().ompKernels)) num_threads(GetProcessingSettings().ompThreads))
+  GPUCA_OPENMP(parallel for if(!doGPU && GetProcessingSettings().ompKernels != 1) num_threads(mRec->SetAndGetNestedLoopOmpFactor(!doGPU, NSLICES)))
   for (unsigned int iSlice = 0; iSlice < NSLICES; iSlice++) {
     if (mRec->GetDeviceType() == GPUReconstruction::DeviceType::HIP) {
       SynchronizeGPU(); // BUG: Workaround for probable bug in AMD runtime, crashes randomly if not synchronized here
@@ -1649,6 +1649,7 @@ int GPUChainTracking::RunTPCTrackingSlices_internal()
       DoDebugAndDump(RecoStep::TPCSliceTracking, 512, trk, &GPUTPCTracker::DumpTrackHits, *mDebugFile);
     }
   }
+  mRec->SetNestedLoopOmpFactor(1);
   if (error) {
     return (3);
   }
@@ -1813,7 +1814,7 @@ int GPUChainTracking::RunTPCTrackingSlices_internal()
     }
   } else {
     mSliceSelectorReady = NSLICES;
-    GPUCA_OPENMP(parallel for if(!(doGPU || GetProcessingSettings().ompKernels)) num_threads(GetProcessingSettings().ompThreads))
+    GPUCA_OPENMP(parallel for if(!doGPU && GetProcessingSettings().ompKernels != 1) num_threads(mRec->SetAndGetNestedLoopOmpFactor(!doGPU, NSLICES)))
     for (unsigned int iSlice = 0; iSlice < NSLICES; iSlice++) {
       if (param().rec.GlobalTracking) {
         GlobalTracking(iSlice, 0);
@@ -1822,6 +1823,7 @@ int GPUChainTracking::RunTPCTrackingSlices_internal()
         WriteOutput(iSlice, 0);
       }
     }
+    mRec->SetNestedLoopOmpFactor(1);
   }
 
   if (param().rec.GlobalTracking && GetProcessingSettings().debugLevel >= 3) {
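
A minimal standalone sketch of the set/reset mechanism these two hunks add around the slice loops, using stand-in names (gNestedFactor, runSliceTracking, kernelThreads) rather than the real chain code:

```cpp
#include <omp.h>

// Stand-ins for mNestedLoopOmpFactor and the kernel-side thread computation:
static unsigned int gNestedFactor = 1;

static unsigned int kernelThreads(unsigned int totalThreads)
{
  // same rounding-up split as runKernelBackend uses
  return (totalThreads + gNestedFactor - 1) / gNestedFactor;
}

void runSliceTracking(unsigned int nSlices, unsigned int totalThreads)
{
  gNestedFactor = nSlices < totalThreads ? nSlices : totalThreads; // size the outer slice loop
  omp_set_max_active_levels(2);
#pragma omp parallel for num_threads(gNestedFactor)
  for (unsigned int iSlice = 0; iSlice < nSlices; iSlice++) {
#pragma omp parallel for num_threads(kernelThreads(totalThreads))
    for (unsigned int iB = 0; iB < 100; iB++) {
      // per-block work of the kernels launched for slice iSlice
    }
  }
  gNestedFactor = 1; // reset so kernels run after the slice loop use the full thread budget again
}

int main()
{
  runSliceTracking(36, static_cast<unsigned int>(omp_get_max_threads()));
  return 0;
}
```

Resetting the factor to 1 after the loop matters because later kernels run outside the nested region and should again see the full ompThreads budget in runKernelBackend.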

GPU/GPUTracking/SliceTracker/GPUTPCTracker.cxx

Lines changed: 1 addition & 1 deletion
@@ -106,7 +106,7 @@ void* GPUTPCTracker::SetPointersCommon(void* mem)
 void GPUTPCTracker::RegisterMemoryAllocation()
 {
   AllocateAndInitializeLate();
-  bool reuseCondition = !mRec->GetProcessingSettings().keepDisplayMemory && mRec->GetProcessingSettings().trackletSelectorInPipeline && ((mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCSliceTracking) || mRec->GetProcessingSettings().ompKernels || mRec->GetProcessingSettings().ompThreads == 1);
+  bool reuseCondition = !mRec->GetProcessingSettings().keepDisplayMemory && mRec->GetProcessingSettings().trackletSelectorInPipeline && ((mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCSliceTracking) || mRec->GetProcessingSettings().ompKernels == 1 || mRec->GetProcessingSettings().ompThreads == 1);
   GPUMemoryReuse reLinks{reuseCondition, GPUMemoryReuse::REUSE_1TO1, GPUMemoryReuse::TrackerDataLinks, (unsigned short)(mISlice % mRec->GetProcessingSettings().nStreams)};
   mMemoryResLinks = mRec->RegisterMemoryAllocation(this, &GPUTPCTracker::SetPointersDataLinks, GPUMemoryResource::MEMORY_SCRATCH | GPUMemoryResource::MEMORY_STACK, "TPCSliceLinks", reLinks);
   mMemoryResSliceScratch = mRec->RegisterMemoryAllocation(this, &GPUTPCTracker::SetPointersDataScratch, GPUMemoryResource::MEMORY_SCRATCH | GPUMemoryResource::MEMORY_STACK | GPUMemoryResource::MEMORY_CUSTOM, "TPCSliceScratch");
