
Commit a80dfaf

GPU: Add option for the GPU backend to use nested OpenMP loops for better parallelism
1 parent 02cf084 commit a80dfaf
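
As a rough illustration of what "nested OpenMP loops" means here, the sketch below shows the generic pattern in plain OpenMP: an outer parallel for over TPC sectors and an inner parallel for over the blocks a kernel would process, with the thread budget split between the two levels. All names and numbers (processBlock, nSectors, nBlocks, the min/ceiling split) are illustrative stand-ins, not code from this commit, which wires the same idea into the existing GPUCA_OPENMP machinery as the diffs below show.

```cpp
#include <omp.h>
#include <cstdio>

static void processBlock(int /*sector*/, int /*block*/)
{
  // stand-in for the per-block work a GPU-style kernel does on the CPU backend
}

int main()
{
  const int nSectors = 36;  // TPC sectors, outer loop
  const int nBlocks = 120;  // blocks per kernel launch, inner loop
  const int totalThreads = omp_get_max_threads();
  const int outerThreads = totalThreads < nSectors ? totalThreads : nSectors;
  const int innerThreads = (totalThreads + outerThreads - 1) / outerThreads; // ceiling split

  omp_set_max_active_levels(2); // allow the inner parallel region to actually spawn threads

#pragma omp parallel for num_threads(outerThreads)
  for (int sector = 0; sector < nSectors; sector++) {
#pragma omp parallel for num_threads(innerThreads)
    for (int block = 0; block < nBlocks; block++) {
      processBlock(sector, block);
    }
  }
  std::printf("outer threads: %d, inner threads per sector: %d\n", outerThreads, innerThreads);
  return 0;
}
```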

5 files changed, +22 -5 lines changed

GPU/GPUTracking/Base/GPUReconstructionCPU.cxx

Lines changed: 12 additions & 1 deletion
@@ -72,7 +72,8 @@ int GPUReconstructionCPUBackend::runKernelBackend(krnlSetup& _xyz, const Args&..
   unsigned int num = y.num == 0 || y.num == -1 ? 1 : y.num;
   for (unsigned int k = 0; k < num; k++) {
     if (mProcessingSettings.ompKernels) {
-      GPUCA_OPENMP(parallel for num_threads(mProcessingSettings.ompThreads))
+      int ompThreads = mProcessingSettings.ompKernels == 2 ? ((mProcessingSettings.ompThreads + mNestedLoopOmpFactor - 1) / mNestedLoopOmpFactor) : mProcessingSettings.ompThreads;
+      GPUCA_OPENMP(parallel for num_threads(ompThreads))
       for (unsigned int iB = 0; iB < x.nBlocks; iB++) {
         typename T::GPUSharedMemory smem;
         T::template Thread<I>(x.nBlocks, 1, iB, 0, smem, T::Processor(*mHostConstantMem)[y.start + k], args...);
@@ -344,3 +345,13 @@ unsigned int GPUReconstructionCPU::getNextTimerId()
   static std::atomic<unsigned int> id{0};
   return id.fetch_add(1);
 }
+
+unsigned int GPUReconstructionCPU::SetAndGetNestedLoopOmpFactor(bool condition, unsigned int max)
+{
+  if (condition && mProcessingSettings.ompKernels != 1) {
+    mNestedLoopOmpFactor = mProcessingSettings.ompKernels == 2 ? std::max<unsigned int>(max, mProcessingSettings.ompThreads) : mProcessingSettings.ompThreads;
+  } else {
+    mNestedLoopOmpFactor = 1;
+  }
+  return mNestedLoopOmpFactor;
+}
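
For reference, a tiny standalone example of the rounding-up division used in the new ompThreads computation in runKernelBackend above; the numbers are hypothetical and not taken from the commit.

```cpp
#include <cstdio>

int main()
{
  unsigned int ompThreadsTotal = 20; // would be mProcessingSettings.ompThreads
  unsigned int nestedFactor = 8;     // would be mNestedLoopOmpFactor set by the outer loop
  unsigned int innerThreads = (ompThreadsTotal + nestedFactor - 1) / nestedFactor;
  std::printf("each nested kernel loop gets %u threads\n", innerThreads); // prints 3
  return 0;
}
```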

GPU/GPUTracking/Base/GPUReconstructionCPU.h

Lines changed: 4 additions & 0 deletions
@@ -57,6 +57,7 @@ class GPUReconstructionCPUBackend : public GPUReconstruction
   int runKernelBackend(krnlSetup& _xyz, const Args&... args);
   template <class T, int I>
   krnlProperties getKernelPropertiesBackend();
+  unsigned int mNestedLoopOmpFactor = 1;
 };
 
 template <class T>
@@ -148,6 +149,9 @@ class GPUReconstructionCPU : public GPUReconstructionKernels<GPUReconstructionCP
   HighResTimer& getRecoStepTimer(RecoStep step) { return mTimersRecoSteps[getRecoStepNum(step)].timerTotal; }
   HighResTimer& getGeneralStepTimer(GeneralStep step) { return mTimersGeneralSteps[getGeneralStepNum(step)]; }
 
+  void SetNestedLoopOmpFactor(unsigned int f) { mNestedLoopOmpFactor = f; }
+  unsigned int SetAndGetNestedLoopOmpFactor(bool condition, unsigned int max);
+
  protected:
   struct GPUProcessorProcessors : public GPUProcessor {
     GPUConstantMem* mProcessorsProc = nullptr;

GPU/GPUTracking/Base/GPUSettingsList.h

Lines changed: 1 addition & 1 deletion
@@ -121,7 +121,7 @@ AddOption(mergerSortTracks, int, -1, "", 0, "Sort track indizes for GPU track fi
 AddOption(tpcCompressionGatherMode, int, -1, "", 0, "TPC Compressed Clusters Gather Mode (0: DMA transfer gather gpu to host, 1: serial DMA to host and gather by copy on CPU, 2. gather via GPU kernal DMA access, 3. gather on GPU via kernel, dma afterwards")
 AddOption(tpcCompressionGatherModeKernel, int, -1, "", 0, "TPC Compressed Clusters Gather Mode Kernel (0: unbufferd, 1-3: buffered, 4: multi-block)")
 AddOption(runMC, bool, false, "", 0, "Process MC labels")
-AddOption(ompKernels, bool, true, "", 0, "Parallelize with OMP inside kernels instead of over slices")
+AddOption(ompKernels, unsigned char, 2, "", 0, "Parallelize with OMP inside kernels instead of over slices, 2 for nested parallelization over TPC sectors and inside kernels")
 AddOption(doublePipeline, bool, false, "", 0, "Double pipeline mode")
 AddOption(prefetchTPCpageScan, int, 0, "", 0, "Prefetch Data for TPC page scan in CPU cache")
 AddOption(debugLevel, int, -1, "debug", 'd', "Set debug level (-1 = silend)")
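
Judging from how the other hunks in this commit test the value (`if (mProcessingSettings.ompKernels)`, `ompKernels != 1`, `ompKernels == 2`), the three settings appear to behave as summarized in this illustrative enum; the enum itself is not part of the repository.

```cpp
// Illustrative summary only, not repository code.
enum OmpKernelsMode : unsigned char {
  ParallelOverSlicesOnly = 0,    // outer loop over TPC slices parallel, kernels run single-threaded
  ParallelInsideKernelsOnly = 1, // slices handled one after another, OMP loop over blocks inside each kernel
  NestedParallel = 2             // new default: both levels parallel, thread budget split between them
};
```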

GPU/GPUTracking/Global/GPUChainTracking.cxx

Lines changed: 4 additions & 2 deletions
@@ -1536,7 +1536,7 @@ int GPUChainTracking::RunTPCTrackingSlices_internal()
   int streamMap[NSLICES];
 
   bool error = false;
-  GPUCA_OPENMP(parallel for if(!(doGPU || GetProcessingSettings().ompKernels)) num_threads(GetProcessingSettings().ompThreads))
+  GPUCA_OPENMP(parallel for if(!doGPU && GetProcessingSettings().ompKernels != 1) num_threads(mRec->SetAndGetNestedLoopOmpFactor(!doGPU, NSLICES)))
   for (unsigned int iSlice = 0; iSlice < NSLICES; iSlice++) {
     if (mRec->GetDeviceType() == GPUReconstruction::DeviceType::HIP) {
       SynchronizeGPU(); // BUG: Workaround for probable bug in AMD runtime, crashes randomly if not synchronized here
@@ -1649,6 +1649,7 @@ int GPUChainTracking::RunTPCTrackingSlices_internal()
       DoDebugAndDump(RecoStep::TPCSliceTracking, 512, trk, &GPUTPCTracker::DumpTrackHits, *mDebugFile);
     }
   }
+  mRec->SetNestedLoopOmpFactor(1);
   if (error) {
     return (3);
   }
@@ -1813,7 +1814,7 @@ int GPUChainTracking::RunTPCTrackingSlices_internal()
     }
   } else {
     mSliceSelectorReady = NSLICES;
-    GPUCA_OPENMP(parallel for if(!(doGPU || GetProcessingSettings().ompKernels)) num_threads(GetProcessingSettings().ompThreads))
+    GPUCA_OPENMP(parallel for if(!doGPU && GetProcessingSettings().ompKernels != 1) num_threads(mRec->SetAndGetNestedLoopOmpFactor(!doGPU, NSLICES)))
     for (unsigned int iSlice = 0; iSlice < NSLICES; iSlice++) {
       if (param().rec.GlobalTracking) {
         GlobalTracking(iSlice, 0);
@@ -1822,6 +1823,7 @@ int GPUChainTracking::RunTPCTrackingSlices_internal()
         WriteOutput(iSlice, 0);
       }
     }
+    mRec->SetNestedLoopOmpFactor(1);
   }
 
   if (param().rec.GlobalTracking && GetProcessingSettings().debugLevel >= 3) {
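
A minimal standalone sketch of the set/reset mechanism these two hunks add around the slice loops, using stand-in names (gNestedFactor, runSliceTracking, kernelThreads) rather than the real chain code:

```cpp
#include <omp.h>

// Stand-ins for mNestedLoopOmpFactor and the kernel-side thread computation:
static unsigned int gNestedFactor = 1;

static unsigned int kernelThreads(unsigned int totalThreads)
{
  // same rounding-up split as runKernelBackend uses
  return (totalThreads + gNestedFactor - 1) / gNestedFactor;
}

void runSliceTracking(unsigned int nSlices, unsigned int totalThreads)
{
  gNestedFactor = nSlices < totalThreads ? nSlices : totalThreads; // size the outer slice loop
  omp_set_max_active_levels(2);
#pragma omp parallel for num_threads(gNestedFactor)
  for (unsigned int iSlice = 0; iSlice < nSlices; iSlice++) {
#pragma omp parallel for num_threads(kernelThreads(totalThreads))
    for (unsigned int iB = 0; iB < 100; iB++) {
      // per-block work of the kernels launched for slice iSlice
    }
  }
  gNestedFactor = 1; // reset so kernels run after the slice loop use the full thread budget again
}

int main()
{
  runSliceTracking(36, static_cast<unsigned int>(omp_get_max_threads()));
  return 0;
}
```

Resetting the factor to 1 after the loop matters because later kernels run outside the nested region and should again see the full ompThreads budget in runKernelBackend.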

GPU/GPUTracking/SliceTracker/GPUTPCTracker.cxx

Lines changed: 1 addition & 1 deletion
@@ -106,7 +106,7 @@ void* GPUTPCTracker::SetPointersCommon(void* mem)
 void GPUTPCTracker::RegisterMemoryAllocation()
 {
   AllocateAndInitializeLate();
-  bool reuseCondition = !mRec->GetProcessingSettings().keepDisplayMemory && mRec->GetProcessingSettings().trackletSelectorInPipeline && ((mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCSliceTracking) || mRec->GetProcessingSettings().ompKernels || mRec->GetProcessingSettings().ompThreads == 1);
+  bool reuseCondition = !mRec->GetProcessingSettings().keepDisplayMemory && mRec->GetProcessingSettings().trackletSelectorInPipeline && ((mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCSliceTracking) || mRec->GetProcessingSettings().ompKernels == 1 || mRec->GetProcessingSettings().ompThreads == 1);
   GPUMemoryReuse reLinks{reuseCondition, GPUMemoryReuse::REUSE_1TO1, GPUMemoryReuse::TrackerDataLinks, (unsigned short)(mISlice % mRec->GetProcessingSettings().nStreams)};
   mMemoryResLinks = mRec->RegisterMemoryAllocation(this, &GPUTPCTracker::SetPointersDataLinks, GPUMemoryResource::MEMORY_SCRATCH | GPUMemoryResource::MEMORY_STACK, "TPCSliceLinks", reLinks);
   mMemoryResSliceScratch = mRec->RegisterMemoryAllocation(this, &GPUTPCTracker::SetPointersDataScratch, GPUMemoryResource::MEMORY_SCRATCH | GPUMemoryResource::MEMORY_STACK | GPUMemoryResource::MEMORY_CUSTOM, "TPCSliceScratch");
