AliceO2Group · davidrohr · Nov 18, 2024 · Nov 16, 2024 · Nov 16, 2024 · Nov 16, 2024
@@ -399,7 +399,7 @@ GPUdi() T GPUCommonMath::MaxWithRef(T x, T y, T z, T w, S refX, S refY, S refZ,
 
 GPUdi() float GPUCommonMath::InvSqrt(float _x)
 {
-#ifdef GPUCA_NO_FAST_MATH
+#if defined(GPUCA_NO_FAST_MATH) || defined(__OPENCL__)
   return 1.f / Sqrt(_x);
 #elif defined(__CUDACC__) || defined(__HIPCC__)
   return __frsqrt_rn(_x);

@@ -114,7 +114,7 @@ class GPUReconstructionCPU : public GPUReconstructionKernels<GPUReconstructionCP
   virtual void SynchronizeEvents(deviceEvent* evList, int32_t nEvents = 1) {}
   virtual void StreamWaitForEvents(int32_t stream, deviceEvent* evList, int32_t nEvents = 1) {}
   virtual bool IsEventDone(deviceEvent* evList, int32_t nEvents = 1) { return true; }
-  virtual void RecordMarker(deviceEvent ev, int32_t stream) {}
+  virtual void RecordMarker(deviceEvent* ev, int32_t stream) {} // Base implementation is a no-op: neither ev nor stream is used here.
   virtual void SynchronizeGPU() {}
   virtual void ReleaseEvent(deviceEvent ev) {}
   virtual int32_t StartHelperThreads() { return 0; }

@@ -548,7 +548,7 @@ size_t GPUReconstructionCUDA::WriteToConstantMemory(size_t offset, const void* s
 }
 
 void GPUReconstructionCUDA::ReleaseEvent(deviceEvent ev) {}
-void GPUReconstructionCUDA::RecordMarker(deviceEvent ev, int32_t stream) { GPUFailedMsg(cudaEventRecord(ev.get<cudaEvent_t>(), mInternals->Streams[stream])); }
+void GPUReconstructionCUDA::RecordMarker(deviceEvent* ev, int32_t stream) { GPUFailedMsg(cudaEventRecord(ev->get<cudaEvent_t>(), mInternals->Streams[stream])); } // Records the CUDA event read from *ev on mInternals->Streams[stream]; the API result is passed to GPUFailedMsg.
 
 std::unique_ptr<GPUReconstruction::GPUThreadContext> GPUReconstructionCUDA::GetThreadContext()
 {

@@ -84,7 +84,7 @@ class GPUReconstructionCUDA : public GPUReconstructionKernels<GPUReconstructionC
   size_t WriteToConstantMemory(size_t offset, const void* src, size_t size, int32_t stream = -1, deviceEvent* ev = nullptr) override;
   size_t GPUMemCpy(void* dst, const void* src, size_t size, int32_t stream, int32_t toGPU, deviceEvent* ev = nullptr, deviceEvent* evList = nullptr, int32_t nEvents = 1) override;
   void ReleaseEvent(deviceEvent ev) override;
-  void RecordMarker(deviceEvent ev, int32_t stream) override;
+  void RecordMarker(deviceEvent* ev, int32_t stream) override;
 
   void GetITSTraits(std::unique_ptr<o2::its::TrackerTraits>* trackerTraits, std::unique_ptr<o2::its::VertexerTraits>* vertexerTraits, std::unique_ptr<o2::its::TimeFrame>* timeFrame) override;
 

@@ -359,6 +359,11 @@ int32_t GPUReconstructionOCL::InitDevice_Runtime()
     mInternals = master->mInternals;
   }
 
+  for (uint32_t i = 0; i < mEvents.size(); i++) {
+    cl_event* events = (cl_event*)mEvents[i].data();
+    new (events) cl_event[mEvents[i].size()];
+  }
+
   return (0);
 }
 
@@ -432,7 +437,7 @@ size_t GPUReconstructionOCL::WriteToConstantMemory(size_t offset, const void* sr
 
 void GPUReconstructionOCL::ReleaseEvent(deviceEvent ev) { GPUFailedMsg(clReleaseEvent(ev.get<cl_event>())); }
 
-void GPUReconstructionOCL::RecordMarker(deviceEvent ev, int32_t stream) { GPUFailedMsg(clEnqueueMarkerWithWaitList(mInternals->command_queue[stream], 0, nullptr, ev.getEventList<cl_event>())); }
+void GPUReconstructionOCL::RecordMarker(deviceEvent* ev, int32_t stream) { GPUFailedMsg(clEnqueueMarkerWithWaitList(mInternals->command_queue[stream], 0, nullptr, ev->getEventList<cl_event>())); } // Enqueues a marker with an empty wait list on command_queue[stream]. The last argument is the API's output-event slot; NOTE(review): presumably getEventList() returns the storage behind *ev so the caller's event is updated - confirm.
 
 int32_t GPUReconstructionOCL::DoStuckProtection(int32_t stream, deviceEvent event)
 {

@@ -52,7 +52,7 @@ class GPUReconstructionOCL : public GPUReconstructionDeviceBase
   size_t WriteToConstantMemory(size_t offset, const void* src, size_t size, int32_t stream = -1, deviceEvent* ev = nullptr) override;
   size_t GPUMemCpy(void* dst, const void* src, size_t size, int32_t stream, int32_t toGPU, deviceEvent* ev = nullptr, deviceEvent* evList = nullptr, int32_t nEvents = 1) override;
   void ReleaseEvent(deviceEvent ev) override;
-  void RecordMarker(deviceEvent ev, int32_t stream) override;
+  void RecordMarker(deviceEvent* ev, int32_t stream) override;
 
   virtual int32_t GetOCLPrograms() = 0;
   virtual bool CheckPlatform(uint32_t i) = 0;

@@ -23,9 +23,11 @@ endif()
 set(CL_SRC ${GPUDIR}/Base/opencl-common/GPUReconstructionOCL.cl)
 set(CL_BIN ${CMAKE_CURRENT_BINARY_DIR}/GPUReconstructionOCL2Code)
 
-set(OCL_FLAGS -ferror-limit=1000 -Dcl_clang_storage_class_specifiers -Wno-invalid-constexpr -Wno-unused-command-line-argument -cl-std=CLC++2021)
+set(OCL_FLAGS -Dcl_clang_storage_class_specifiers -cl-std=CLC++2021)
 if(NOT DEFINED GPUCA_NO_FAST_MATH OR NOT ${GPUCA_NO_FAST_MATH})
-  set(OCL_FLAGS ${OCL_FLAGS} -Xclang -fdenormal-fp-math-f32=ieee -cl-mad-enable -cl-no-signed-zeros)
+  set(OCL_FLAGS ${OCL_FLAGS} -cl-denorms-are-zero -cl-mad-enable -cl-no-signed-zeros -cl-fast-relaxed-math) # fast-math build: append the relaxed OpenCL floating-point options
+else()
+  set(OCL_FLAGS ${OCL_FLAGS} -cl-fp32-correctly-rounded-divide-sqrt) # GPUCA_NO_FAST_MATH build: request correctly rounded fp32 divide and sqrt instead
 endif()
 set(OCL_DEFINECL "-D$<JOIN:$<TARGET_PROPERTY:O2::GPUTracking,COMPILE_DEFINITIONS>,$<SEMICOLON>-D>"
             "-I$<JOIN:$<FILTER:$<TARGET_PROPERTY:O2::GPUTracking,INCLUDE_DIRECTORIES>,EXCLUDE,^/usr/include/?>,$<SEMICOLON>-I>"
@@ -47,6 +49,7 @@ if(OPENCL2_ENABLED_SPIRV) # BUILD OpenCL2 intermediate code for SPIR-V target
               -O0
               --target=spirv64
               -fno-integrated-objemitter
+              -ferror-limit=1000 -Wno-invalid-constexpr -Wno-unused-command-line-argument
               ${OCL_FLAGS}
               ${OCL_DEFINECL}
               -o ${CL_BIN}.spirv -c ${CL_SRC}
@@ -64,6 +67,7 @@ if(OPENCL2_ENABLED) # BUILD OpenCL2 source code for runtime compilation target
   add_custom_command(
       OUTPUT ${CL_BIN}.src
       COMMAND ${LLVM_CLANG}
+              -Wno-unused-command-line-argument
               ${OCL_FLAGS}
               ${OCL_DEFINECL}
               -cl-no-stdinc

@@ -64,14 +64,14 @@ int32_t GPUReconstructionOCL2Backend::GetOCLPrograms()
   const char* ocl_flags = GPUCA_M_STR(OCL_FLAGS);
 
 #ifdef OPENCL2_ENABLED_SPIRV // clang-format off
-  if (ver >= 2.2f) {
-    GPUInfo("Reading OpenCL program from SPIR-V IL (Platform version %f)", ver);
+  if (ver >= 2.2f && !GetProcessingSettings().oclCompileFromSources) {
+    GPUInfo("Reading OpenCL program from SPIR-V IL (Platform version %4.2f)", ver);
     mInternals->program = clCreateProgramWithIL(mInternals->context, _binary_GPUReconstructionOCL2Code_spirv_start, _binary_GPUReconstructionOCL2Code_spirv_len, &ocl_error);
     ocl_flags = "";
   } else
 #endif // clang-format on
   {
-    GPUInfo("Compiling OpenCL program from sources (Platform version %f, %s)", ver);
+    GPUInfo("Compiling OpenCL program from sources (Platform version %4.2f)", ver);
     size_t program_sizes[1] = {_binary_GPUReconstructionOCL2Code_src_len};
     char* programs_sources[1] = {_binary_GPUReconstructionOCL2Code_src_start};
     mInternals->program = clCreateProgramWithSource(mInternals->context, (cl_uint)1, (const char**)&programs_sources, program_sizes, &ocl_error);

@@ -218,7 +218,7 @@ AddHelp("help", 'h')
 EndConfig()
 
 BeginSubConfig(GPUSettingsProcessing, proc, configStandalone, "PROC", 0, "Processing settings", proc)
-AddOption(platformNum, int32_t, -1, "", 0, "Platform to use, in case the backend provides multiple platforms (-1 = auto-select)")
+AddOption(platformNum, int32_t, -1, "", 0, "Platform to use, in case the backend provides multiple platforms (OpenCL only, -1 = auto-select)")
 AddOption(deviceNum, int32_t, -1, "gpuDevice", 0, "Set GPU device to use (-1: automatic, -2: for round-robin usage in timeslice-pipeline)")
 AddOption(gpuDeviceOnly, bool, false, "", 0, "Use only GPU as device (i.e. no CPU for OpenCL)")
 AddOption(globalInitMutex, bool, false, "", 0, "Use global mutex to synchronize initialization of multiple GPU instances")
@@ -291,6 +291,7 @@ AddOption(tpcApplyDebugClusterFilter, bool, false, "", 0, "Apply custom cluster
 AddOption(RTCcacheFolder, std::string, "./rtccache/", "", 0, "Folder in which the cache file is stored")
 AddOption(RTCprependCommand, std::string, "", "", 0, "Prepend RTC compilation commands by this string")
 AddOption(RTCoverrideArchitecture, std::string, "", "", 0, "Override arhcitecture part of RTC compilation command line")
+AddOption(oclCompileFromSources, bool, false, "", 0, "Compile OpenCL binary from included source code instead of using included spirv code")
 AddOption(printSettings, bool, false, "", 0, "Print all settings when initializing")
 AddVariable(eventDisplay, GPUCA_NAMESPACE::gpu::GPUDisplayFrontendInterface*, nullptr)
 AddSubConfig(GPUSettingsProcessingRTC, rtc)

@@ -101,7 +101,7 @@ class GPUChain
     }
   }
   inline bool IsEventDone(deviceEvent* evList, int32_t nEvents = 1) { return mRec->IsEventDone(evList, nEvents); }
-  inline void RecordMarker(deviceEvent ev, int32_t stream) { mRec->RecordMarker(ev, stream); }
+  inline void RecordMarker(deviceEvent* ev, int32_t stream) { mRec->RecordMarker(ev, stream); } // Thin forwarder: passes ev and stream unchanged to mRec->RecordMarker.
   virtual inline std::unique_ptr<GPUReconstruction::GPUThreadContext> GetThreadContext() { return mRec->GetThreadContext(); }
   inline void SynchronizeGPU() { mRec->SynchronizeGPU(); }
   inline void ReleaseEvent(deviceEvent ev, bool doGPU = true)

@@ -865,8 +865,12 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
         }
 
         if (fragment.index == 0) {
-          runKernel<GPUMemClean16>({GetGridAutoStep(lane, RecoStep::TPCClusterFinding), krnlRunRangeNone, {nullptr, transferRunning[lane] == 1 ? &mEvents->stream[lane] : nullptr}}, clustererShadow.mPclusterInRow, GPUCA_ROW_COUNT * sizeof(*clustererShadow.mPclusterInRow));
-          transferRunning[lane] = 2;
+          deviceEvent* waitEvent = nullptr;
+          if (transferRunning[lane] == 1) {
+            waitEvent = &mEvents->stream[lane];
+            transferRunning[lane] = 2;
+          }
+          runKernel<GPUMemClean16>({GetGridAutoStep(lane, RecoStep::TPCClusterFinding), krnlRunRangeNone, {nullptr, waitEvent}}, clustererShadow.mPclusterInRow, GPUCA_ROW_COUNT * sizeof(*clustererShadow.mPclusterInRow));
         }
 
         if (clusterer.mPmemory->counters.nClusters == 0) {
@@ -930,7 +934,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
         if (transferRunning[lane]) {
           ReleaseEvent(mEvents->stream[lane], doGPU);
         }
-        RecordMarker(mEvents->stream[lane], mRec->NStreams() - 1);
+        RecordMarker(&mEvents->stream[lane], mRec->NStreams() - 1);
         transferRunning[lane] = 1;
       }
 

@@ -37,7 +37,7 @@ int32_t GPUChainTracking::RunTPCCompression()
   GPUTPCCompression& CompressorShadow = doGPU ? processorsShadow()->tpcCompressor : Compressor;
   const auto& threadContext = GetThreadContext();
   if (mPipelineFinalizationCtx && GetProcessingSettings().doublePipelineClusterizer) {
-    RecordMarker(mEvents->single, 0);
+    RecordMarker(&mEvents->single, 0);
   }
 
   if (GetProcessingSettings().tpcCompressionGatherMode == 3) {
@@ -124,7 +124,7 @@ int32_t GPUChainTracking::RunTPCCompression()
         return 1;
     }
     if (GetProcessingSettings().tpcCompressionGatherMode == 3) {
-      RecordMarker(mEvents->stream[outputStream], outputStream);
+      RecordMarker(&mEvents->stream[outputStream], outputStream);
       char* deviceFlatPts = (char*)Compressor.mOutput->qTotU;
       if (GetProcessingSettings().doublePipeline) {
         const size_t blockSize = CAMath::nextMultipleOf<1024>(copySize / 30);

@@ -33,7 +33,7 @@ void GPUChainTracking::RunTPCTrackingMerger_MergeBorderTracks(int8_t withinSlice
   uint32_t n = withinSlice == -1 ? NSLICES / 2 : NSLICES;
   if (GetProcessingSettings().alternateBorderSort && (!mRec->IsGPU() || doGPUall)) {
     TransferMemoryResourceLinkToHost(RecoStep::TPCMerging, Merger.MemoryResMemory(), 0, &mEvents->init);
-    RecordMarker(mEvents->single, 0);
+    RecordMarker(&mEvents->single, 0);
     for (uint32_t i = 0; i < n; i++) {
       int32_t stream = i % mRec->NStreams();
       runKernel<GPUTPCGMMergerMergeBorders, 0>({GetGridAuto(stream, deviceType), krnlRunRangeNone, {nullptr, stream && i < (uint32_t)mRec->NStreams() ? &mEvents->single : nullptr}}, i, withinSlice, mergeMode);
@@ -55,7 +55,7 @@ void GPUChainTracking::RunTPCTrackingMerger_MergeBorderTracks(int8_t withinSlice
       if (i == n - 1) { // Synchronize all execution on stream 0 with the last kernel
         ne = std::min<int32_t>(n, mRec->NStreams());
         for (int32_t j = 1; j < ne; j++) {
-          RecordMarker(mEvents->slice[j], j);
+          RecordMarker(&mEvents->slice[j], j);
         }
         e = &mEvents->slice[1];
         ne--;
@@ -251,7 +251,7 @@ int32_t GPUChainTracking::RunTPCTrackingMerger(bool synchronizeOutput)
   DoDebugAndDump(RecoStep::TPCMerging, 2048, doGPUall, Merger, &GPUTPCGMMerger::DumpFinal, *mDebugFile);
 
   if (doGPUall) {
-    RecordMarker(mEvents->single, 0);
+    RecordMarker(&mEvents->single, 0);
     auto* waitEvent = &mEvents->single;
     if (GetProcessingSettings().keepDisplayMemory || GetProcessingSettings().createO2Output <= 1 || mFractionalQAEnabled) {
       if (!(GetProcessingSettings().keepDisplayMemory || GetProcessingSettings().createO2Output <= 1)) {
@@ -317,7 +317,7 @@ int32_t GPUChainTracking::RunTPCTrackingMerger(bool synchronizeOutput)
       TransferMemoryResourcesToHost(RecoStep::TPCMerging, &Merger, -1, true);
       runKernel<GPUTPCGMO2Output, GPUTPCGMO2Output::mc>(GetGridAuto(0, GPUReconstruction::krnlDeviceType::CPU));
     } else if (doGPUall) {
-      RecordMarker(mEvents->single, 0);
+      RecordMarker(&mEvents->single, 0);
       TransferMemoryResourceLinkToHost(RecoStep::TPCMerging, Merger.MemoryResOutputO2(), outputStream, nullptr, &mEvents->single);
       TransferMemoryResourceLinkToHost(RecoStep::TPCMerging, Merger.MemoryResOutputO2Clus(), outputStream);
       ReleaseEvent(mEvents->single);

@@ -164,9 +164,11 @@ int32_t GPUChainTracking::RunTPCTrackingSlices_internal()
       TransferMemoryResourceLinkToGPU(RecoStep::TPCSliceTracking, mInputsHost->mResourceOccupancyMap, streamOccMap, &mEvents->init);
     }
   }
-  uint32_t& occupancyTotal = *mInputsHost->mTPCClusterOccupancyMap;
-  occupancyTotal = CAMath::Float2UIntRn(mRec->MemoryScalers()->nTPCHits / (mIOPtrs.settingsTF && mIOPtrs.settingsTF->hasNHBFPerTF ? mIOPtrs.settingsTF->nHBFPerTF : 128));
-  mRec->UpdateParamOccupancyMap(param().rec.tpc.occupancyMapTimeBins ? mInputsHost->mTPCClusterOccupancyMap + 2 : nullptr, param().rec.tpc.occupancyMapTimeBins ? mInputsShadow->mTPCClusterOccupancyMap + 2 : nullptr, occupancyTotal, streamOccMap);
+  if (param().rec.tpc.occupancyMapTimeBins || param().rec.tpc.sysClusErrorC12Norm) {
+    uint32_t& occupancyTotal = *mInputsHost->mTPCClusterOccupancyMap;
+    occupancyTotal = CAMath::Float2UIntRn(mRec->MemoryScalers()->nTPCHits / (mIOPtrs.settingsTF && mIOPtrs.settingsTF->hasNHBFPerTF ? mIOPtrs.settingsTF->nHBFPerTF : 128));
+    mRec->UpdateParamOccupancyMap(param().rec.tpc.occupancyMapTimeBins ? mInputsHost->mTPCClusterOccupancyMap + 2 : nullptr, param().rec.tpc.occupancyMapTimeBins ? mInputsShadow->mTPCClusterOccupancyMap + 2 : nullptr, occupancyTotal, streamOccMap);
+  }
 
   int32_t streamMap[NSLICES];
 
@@ -305,7 +307,7 @@ int32_t GPUChainTracking::RunTPCTrackingSlices_internal()
         SynchronizeGPU();
       } else {
         for (int32_t i = 0; i < mRec->NStreams(); i++) {
-          RecordMarker(mEvents->stream[i], i);
+          RecordMarker(&mEvents->stream[i], i);
         }
         runKernel<GPUTPCTrackletConstructor, 1>({GetGridAuto(0), krnlRunRangeNone, {&mEvents->single, mEvents->stream, mRec->NStreams()}});
         for (int32_t i = 0; i < mRec->NStreams(); i++) {