GPU: Some cleanup, and fixes when running only part of processing on GPU

davidrohr · davidrohr · commit 356b066ecf6b · 2024-11-26T13:25:57.000+01:00
diff --git a/GPU/GPUTracking/Global/GPUChainTracking.cxx b/GPU/GPUTracking/Global/GPUChainTracking.cxx
@@ -921,33 +921,6 @@ int32_t GPUChainTracking::FinalizePipelinedProcessing()
   return RunChainFinalize();
 }
 
-int32_t GPUChainTracking::HelperReadEvent(int32_t iSlice, int32_t threadId, GPUReconstructionHelpers::helperParam* par) { return ReadEvent(iSlice, threadId); }
-
-int32_t GPUChainTracking::HelperOutput(int32_t iSlice, int32_t threadId, GPUReconstructionHelpers::helperParam* par)
-{
-  if (param().rec.tpc.globalTracking) {
-    uint32_t tmpSlice = GPUTPCGlobalTracking::GlobalTrackingSliceOrder(iSlice);
-    uint32_t sliceLeft, sliceRight;
-    GPUTPCGlobalTracking::GlobalTrackingSliceLeftRight(tmpSlice, sliceLeft, sliceRight);
-
-    while (mSliceSelectorReady < (int32_t)tmpSlice || mSliceSelectorReady < (int32_t)sliceLeft || mSliceSelectorReady < (int32_t)sliceRight) {
-      if (par->reset) {
-        return 1;
-      }
-    }
-    GlobalTracking(tmpSlice, 0);
-    WriteOutput(tmpSlice, 0);
-  } else {
-    while (mSliceSelectorReady < iSlice) {
-      if (par->reset) {
-        return 1;
-      }
-    }
-    WriteOutput(iSlice, threadId);
-  }
-  return 0;
-}
-
 int32_t GPUChainTracking::CheckErrorCodes(bool cpuOnly, bool forceShowErrors, std::vector<std::array<uint32_t, 4>>* fillErrors)
 {
   int32_t retVal = 0;
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
@@ -457,6 +457,7 @@ std::pair<uint32_t, uint32_t> GPUChainTracking::RunTPCClusterizer_transferZS(int
 
 int32_t GPUChainTracking::RunTPCClusterizer_prepare(bool restorePointers)
 {
+  bool doGPU = mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCClusterFinding;
   if (restorePointers) {
     for (uint32_t iSlice = 0; iSlice < NSLICES; iSlice++) {
       processors()->tpcClusterer[iSlice].mPzsOffsets = mCFContext->ptrSave[iSlice].zsOffsetHost;
@@ -512,7 +513,7 @@ int32_t GPUChainTracking::RunTPCClusterizer_prepare(bool restorePointers)
       uint32_t threshold = 40000000;
       uint32_t nDigitsScaled = nDigitsBase > threshold ? nDigitsBase : std::min((threshold + nDigitsBase) / 2, 2 * nDigitsBase);
       processors()->tpcClusterer[iSlice].SetNMaxDigits(processors()->tpcClusterer[iSlice].mPmemory->counters.nDigits, mCFContext->nPagesFragmentMax, nDigitsScaled, mCFContext->nDigitsEndpointMax[iSlice]);
-      if (mRec->IsGPU()) {
+      if (doGPU) {
         processorsShadow()->tpcClusterer[iSlice].SetNMaxDigits(processors()->tpcClusterer[iSlice].mPmemory->counters.nDigits, mCFContext->nPagesFragmentMax, nDigitsScaled, mCFContext->nDigitsEndpointMax[iSlice]);
       }
       if (mPipelineNotifyCtx && GetProcessingSettings().doublePipelineClusterizer) {
@@ -578,7 +579,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
   if (RunTPCClusterizer_prepare(mPipelineNotifyCtx && GetProcessingSettings().doublePipelineClusterizer)) {
     return 1;
   }
-  if (GetProcessingSettings().ompAutoNThreads && !mRec->IsGPU()) {
+  if (GetProcessingSettings().ompAutoNThreads && !doGPU) {
     mRec->SetNOMPThreads(mRec->MemoryScalers()->nTPCdigits / 20000);
   }
 
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingSliceTracker.cxx b/GPU/GPUTracking/Global/GPUChainTrackingSliceTracker.cxx
@@ -532,3 +532,30 @@ void GPUChainTracking::WriteOutput(int32_t iSlice, int32_t threadId)
     GPUInfo("Finished WriteOutput for slice %d on thread %d\n", iSlice, threadId);
   }
 }
+
+int32_t GPUChainTracking::HelperReadEvent(int32_t iSlice, int32_t threadId, GPUReconstructionHelpers::helperParam* par) { return ReadEvent(iSlice, threadId); } // Thin wrapper: forwards iSlice/threadId to ReadEvent() and returns its result; par is not used
+
+int32_t GPUChainTracking::HelperOutput(int32_t iSlice, int32_t threadId, GPUReconstructionHelpers::helperParam* par) // Waits until mSliceSelectorReady has reached the needed slice(s), then writes the slice output; returns 1 if par->reset is seen while waiting, 0 otherwise
+{
+  if (param().rec.tpc.globalTracking) { // Global-tracking path: runs GlobalTracking() before WriteOutput()
+    uint32_t tmpSlice = GPUTPCGlobalTracking::GlobalTrackingSliceOrder(iSlice); // Slice actually processed in this call is the remapped index, not iSlice itself
+    uint32_t sliceLeft, sliceRight; // Presumably out-parameters filled by GlobalTrackingSliceLeftRight() on the next line - confirm against its declaration
+    GPUTPCGlobalTracking::GlobalTrackingSliceLeftRight(tmpSlice, sliceLeft, sliceRight);
+
+    while (mSliceSelectorReady < (int32_t)tmpSlice || mSliceSelectorReady < (int32_t)sliceLeft || mSliceSelectorReady < (int32_t)sliceRight) { // Busy-wait (no sleep/yield) until mSliceSelectorReady is >= tmpSlice, sliceLeft and sliceRight
+      if (par->reset) { // Abandon the wait and report 1 to the caller when reset is set
+        return 1;
+      }
+    }
+    GlobalTracking(tmpSlice, 0); // NOTE(review): passes literal 0 rather than threadId, unlike the else-branch - confirm this is intended
+    WriteOutput(tmpSlice, 0);
+  } else {
+    while (mSliceSelectorReady < iSlice) { // Busy-wait until mSliceSelectorReady is >= iSlice; NOTE(review): assumes another thread advances it - declaration not visible here
+      if (par->reset) {
+        return 1;
+      }
+    }
+    WriteOutput(iSlice, threadId);
+  }
+  return 0;
+}
diff --git a/GPU/GPUTracking/SliceTracker/GPUTPCSliceData.cxx b/GPU/GPUTracking/SliceTracker/GPUTPCSliceData.cxx
@@ -265,22 +265,20 @@ GPUdii() int32_t GPUTPCSliceData::InitFromClusterData(int32_t nBlocks, int32_t n
       for (uint32_t i = iThread; i < NumberOfClusters; i += nThreads) {
         UpdateMinMaxYZ(yMin, yMax, zMin, zMax, YZData[RowOffset + i].x, YZData[RowOffset + i].y);
       }
+    } else if (mem->param.par.earlyTpcTransform) { // Early transform case with ClusterNative present
+      for (uint32_t i = iThread; i < NumberOfClusters; i += nThreads) {
+        float2 tmp;
+        tmp.x = mClusterData[RowOffset + i].y;
+        tmp.y = mClusterData[RowOffset + i].z;
+        UpdateMinMaxYZ(yMin, yMax, zMin, zMax, tmp.x, tmp.y);
+        YZData[RowOffset + i] = tmp;
+      }
     } else {
-      if (mem->param.par.earlyTpcTransform) { // Early transform case with ClusterNative present
-        for (uint32_t i = iThread; i < NumberOfClusters; i += nThreads) {
-          float2 tmp;
-          tmp.x = mClusterData[RowOffset + i].y;
-          tmp.y = mClusterData[RowOffset + i].z;
-          UpdateMinMaxYZ(yMin, yMax, zMin, zMax, tmp.x, tmp.y);
-          YZData[RowOffset + i] = tmp;
-        }
-      } else {
-        for (uint32_t i = iThread; i < NumberOfClusters; i += nThreads) {
-          float x, y, z;
-          GPUTPCConvertImpl::convert(*mem, iSlice, rowIndex, mem->ioPtrs.clustersNative->clusters[iSlice][rowIndex][i].getPad(), mem->ioPtrs.clustersNative->clusters[iSlice][rowIndex][i].getTime(), x, y, z);
-          UpdateMinMaxYZ(yMin, yMax, zMin, zMax, y, z);
-          YZData[RowOffset + i] = CAMath::MakeFloat2(y, z);
-        }
+      for (uint32_t i = iThread; i < NumberOfClusters; i += nThreads) {
+        float x, y, z;
+        GPUTPCConvertImpl::convert(*mem, iSlice, rowIndex, mem->ioPtrs.clustersNative->clusters[iSlice][rowIndex][i].getPad(), mem->ioPtrs.clustersNative->clusters[iSlice][rowIndex][i].getTime(), x, y, z);
+        UpdateMinMaxYZ(yMin, yMax, zMin, zMax, y, z);
+        YZData[RowOffset + i] = CAMath::MakeFloat2(y, z);
       }
     }
 
diff --git a/GPU/GPUTracking/SliceTracker/GPUTPCTracker.cxx b/GPU/GPUTracking/SliceTracker/GPUTPCTracker.cxx
@@ -81,7 +81,7 @@ void* GPUTPCTracker::SetPointersScratch(void* mem)
   if (mRec->GetProcessingSettings().memoryAllocationStrategy != GPUMemoryResource::ALLOCATION_INDIVIDUAL) {
     mem = SetPointersTracklets(mem);
   }
-  if (mRec->IsGPU()) {
+  if (mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCSliceTracking) {
     computePointerWithAlignment(mem, mTrackletTmpStartHits, GPUCA_ROW_COUNT * mNMaxRowStartHits);
     computePointerWithAlignment(mem, mRowStartHitCountOffset, GPUCA_ROW_COUNT);
   }
@@ -164,7 +164,7 @@ void GPUTPCTracker::SetMaxData(const GPUTrackingInOutPointers& io)
   mNMaxTracks = mRec->MemoryScalers()->NTPCSectorTracks(mData.NumberOfHits());
   mNMaxTrackHits = mRec->MemoryScalers()->NTPCSectorTrackHits(mData.NumberOfHits(), mRec->GetProcessingSettings().tpcInputWithClusterRejection);
 #ifdef GPUCA_SORT_STARTHITS_GPU
-  if (mRec->IsGPU()) {
+  if (mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCSliceTracking) {
     if (mNMaxStartHits > mNMaxRowStartHits * GPUCA_ROW_COUNT) {
       mNMaxStartHits = mNMaxRowStartHits * GPUCA_ROW_COUNT;
     }

Original file line number	Diff line number	Diff line change
`@@ -457,6 +457,7 @@ std::pair<uint32_t, uint32_t> GPUChainTracking::RunTPCClusterizer_transferZS(int`
`457`	`457`
`458`	`458`	`int32_t GPUChainTracking::RunTPCClusterizer_prepare(bool restorePointers)`
`459`	`459`	`{`
	`460`	`+ bool doGPU = mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCClusterFinding;`
`460`	`461`	`if (restorePointers) {`
`461`	`462`	`for (uint32_t iSlice = 0; iSlice < NSLICES; iSlice++) {`
`462`	`463`	`processors()->tpcClusterer[iSlice].mPzsOffsets = mCFContext->ptrSave[iSlice].zsOffsetHost;`
`@@ -512,7 +513,7 @@ int32_t GPUChainTracking::RunTPCClusterizer_prepare(bool restorePointers)`
`512`	`513`	`uint32_t threshold = 40000000;`
`513`	`514`	`uint32_t nDigitsScaled = nDigitsBase > threshold ? nDigitsBase : std::min((threshold + nDigitsBase) / 2, 2 * nDigitsBase);`
`514`	`515`	`processors()->tpcClusterer[iSlice].SetNMaxDigits(processors()->tpcClusterer[iSlice].mPmemory->counters.nDigits, mCFContext->nPagesFragmentMax, nDigitsScaled, mCFContext->nDigitsEndpointMax[iSlice]);`
`515`		`- if (mRec->IsGPU()) {`
	`516`	`+ if (doGPU) {`
`516`	`517`	`processorsShadow()->tpcClusterer[iSlice].SetNMaxDigits(processors()->tpcClusterer[iSlice].mPmemory->counters.nDigits, mCFContext->nPagesFragmentMax, nDigitsScaled, mCFContext->nDigitsEndpointMax[iSlice]);`
`517`	`518`	`}`
`518`	`519`	`if (mPipelineNotifyCtx && GetProcessingSettings().doublePipelineClusterizer) {`
`@@ -578,7 +579,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)`
`578`	`579`	`if (RunTPCClusterizer_prepare(mPipelineNotifyCtx && GetProcessingSettings().doublePipelineClusterizer)) {`
`579`	`580`	`return 1;`
`580`	`581`	`}`
`581`		`- if (GetProcessingSettings().ompAutoNThreads && !mRec->IsGPU()) {`
	`582`	`+ if (GetProcessingSettings().ompAutoNThreads && !doGPU) {`
`582`	`583`	`mRec->SetNOMPThreads(mRec->MemoryScalers()->nTPCdigits / 20000);`
`583`	`584`	`}`
`584`	`585`
Original file line number	Diff line number	Diff line change
`@@ -81,7 +81,7 @@ void* GPUTPCTracker::SetPointersScratch(void* mem)`
`81`	`81`	`if (mRec->GetProcessingSettings().memoryAllocationStrategy != GPUMemoryResource::ALLOCATION_INDIVIDUAL) {`
`82`	`82`	`mem = SetPointersTracklets(mem);`
`83`	`83`	`}`
`84`		`- if (mRec->IsGPU()) {`
	`84`	`+ if (mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCSliceTracking) {`
`85`	`85`	`computePointerWithAlignment(mem, mTrackletTmpStartHits, GPUCA_ROW_COUNT * mNMaxRowStartHits);`
`86`	`86`	`computePointerWithAlignment(mem, mRowStartHitCountOffset, GPUCA_ROW_COUNT);`
`87`	`87`	`}`
`@@ -164,7 +164,7 @@ void GPUTPCTracker::SetMaxData(const GPUTrackingInOutPointers& io)`
`164`	`164`	`mNMaxTracks = mRec->MemoryScalers()->NTPCSectorTracks(mData.NumberOfHits());`
`165`	`165`	`mNMaxTrackHits = mRec->MemoryScalers()->NTPCSectorTrackHits(mData.NumberOfHits(), mRec->GetProcessingSettings().tpcInputWithClusterRejection);`
`166`	`166`	`#ifdef GPUCA_SORT_STARTHITS_GPU`
`167`		`- if (mRec->IsGPU()) {`
	`167`	`+ if (mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCSliceTracking) {`
`168`	`168`	`if (mNMaxStartHits > mNMaxRowStartHits * GPUCA_ROW_COUNT) {`
`169`	`169`	`mNMaxStartHits = mNMaxRowStartHits * GPUCA_ROW_COUNT;`
`170`	`170`	`}`