AliceO2Group · davidrohr · Feb 17, 2025 · Feb 17, 2025 · Feb 17, 2025
@@ -60,6 +60,21 @@ GPUReconstructionCPU::~GPUReconstructionCPU()
   Exit(); // Needs to be identical to GPU backend bahavior in order to avoid calling abstract methods later in the destructor
 }
 
+int32_t GPUReconstructionCPUBackend::getNOMPThreads() // Returns the OpenMP thread count to use, derived from mProcessingSettings.ompKernels / ompThreads; the result is always >= 1 in the ompKernels == 2 branch.
+{
+  int32_t ompThreads = 0;
+  if (mProcessingSettings.ompKernels == 2) { // Mode 2: the configured thread count is divided by mNestedLoopOmpFactor (presumably the number of concurrently running outer-loop threads - confirm).
+    ompThreads = mProcessingSettings.ompThreads / mNestedLoopOmpFactor; // Integer share of the configured threads.
+    if ((uint32_t)getOMPThreadNum() < mProcessingSettings.ompThreads % mNestedLoopOmpFactor) { // Callers whose getOMPThreadNum() is below the division remainder get one extra thread, so the remainder is not lost.
+      ompThreads++;
+    }
+    ompThreads = std::max(1, ompThreads); // Clamp: the division yields 0 when ompThreads < mNestedLoopOmpFactor.
+  } else {
+    ompThreads = mProcessingSettings.ompKernels ? mProcessingSettings.ompThreads : 1; // Any other non-zero ompKernels value uses the full configured count; zero yields 1.
+  }
+  return ompThreads;
+}
+
 template <class T, int32_t I, typename... Args>
 inline int32_t GPUReconstructionCPUBackend::runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... args)
 {
@@ -73,16 +88,7 @@ inline int32_t GPUReconstructionCPUBackend::runKernelBackendInternal(const krnlS
   }
   uint32_t num = y.num == 0 || y.num == -1 ? 1 : y.num;
   for (uint32_t k = 0; k < num; k++) {
-    int32_t ompThreads = 0;
-    if (mProcessingSettings.ompKernels == 2) {
-      ompThreads = mProcessingSettings.ompThreads / mNestedLoopOmpFactor;
-      if ((uint32_t)getOMPThreadNum() < mProcessingSettings.ompThreads % mNestedLoopOmpFactor) {
-        ompThreads++;
-      }
-      ompThreads = std::max(1, ompThreads);
-    } else {
-      ompThreads = mProcessingSettings.ompKernels ? mProcessingSettings.ompThreads : 1;
-    }
+    int32_t ompThreads = getNOMPThreads();
     if (ompThreads > 1) {
       if (mProcessingSettings.debugLevel >= 5) {
         printf("Running %d ompThreads\n", ompThreads);
@@ -105,7 +111,12 @@ inline int32_t GPUReconstructionCPUBackend::runKernelBackendInternal(const krnlS
 template <>
 inline int32_t GPUReconstructionCPUBackend::runKernelBackendInternal<GPUMemClean16, 0>(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size) // Specialization for GPUMemClean16: zero-fills size bytes starting at ptr and returns 0; _xyz is not used in this body.
 {
-  memset(ptr, 0, size);
+  int32_t ompThreads = std::max<int32_t>(1, std::min<int32_t>(size / (16 * 1024 * 1024), getNOMPThreads())); // One thread per full 16 MiB of size, capped at getNOMPThreads(), never below 1.
+  if (ompThreads > 1) {
+    memset(ptr, 0, size); // NOTE(review): identical to the else branch, so ompThreads currently has no effect - presumably a placeholder for a multi-threaded clear; confirm.
+  } else {
+    memset(ptr, 0, size); // Small buffers (or a single available thread) are cleared with one memset call.
+  }
   return 0;
 }
 

@@ -46,6 +46,7 @@ class GPUReconstructionCPUBackend : public GPUReconstruction
   uint32_t mNestedLoopOmpFactor = 1;
   static int32_t getOMPThreadNum();
   static int32_t getOMPMaxThreads();
+  int32_t getNOMPThreads(); // OpenMP thread count to use, computed from mProcessingSettings.ompKernels / ompThreads and mNestedLoopOmpFactor.
 };
 
 class GPUReconstructionCPU : public GPUReconstructionKernels<GPUReconstructionCPUBackend>

@@ -160,7 +160,7 @@ elseif(GPUCA_CUDA_COMPILE_MODE STREQUAL "perkernel")
   add_custom_command(
     OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/GPUTrackingCUDAKernelModules.o
     COMMAND cp -u $<TARGET_OBJECTS:GPUTrackingCUDAKernels> ${CMAKE_CURRENT_BINARY_DIR}/cuda_kernel_module_fatbin/
-    COMMAND ${CMAKE_LINKER} --relocatable --format binary --output ${CMAKE_CURRENT_BINARY_DIR}/GPUTrackingCUDAKernelModules.o $<PATH:RELATIVE_PATH,$<LIST:TRANSFORM,$<PATH:GET_FILENAME,$<TARGET_OBJECTS:GPUTrackingCUDAKernels>>,PREPEND,${CMAKE_CURRENT_BINARY_DIR}/cuda_kernel_module_fatbin/>,${CMAKE_CURRENT_BINARY_DIR}>
+    COMMAND ${CMAKE_LINKER} -z noexecstack --relocatable --format binary --output ${CMAKE_CURRENT_BINARY_DIR}/GPUTrackingCUDAKernelModules.o $<PATH:RELATIVE_PATH,$<LIST:TRANSFORM,$<PATH:GET_FILENAME,$<TARGET_OBJECTS:GPUTrackingCUDAKernels>>,PREPEND,${CMAKE_CURRENT_BINARY_DIR}/cuda_kernel_module_fatbin/>,${CMAKE_CURRENT_BINARY_DIR}>
     DEPENDS GPUTrackingCUDAKernels $<TARGET_OBJECTS:GPUTrackingCUDAKernels>
     COMMENT "Compiling fatbin kernels ${CMAKE_CURRENT_BINARY_DIR}/GPUTrackingCUDAKernelModules.o"
     VERBATIM

@@ -217,7 +217,7 @@ elseif(GPUCA_HIP_COMPILE_MODE STREQUAL "perkernel")
   add_custom_command(
     OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/GPUTrackingHIPKernelModules.o
     COMMAND cp -u $<TARGET_OBJECTS:GPUTrackingHIPKernels> ${CMAKE_CURRENT_BINARY_DIR}/hip_kernel_module_fatbin/
-    COMMAND ${CMAKE_LINKER} --relocatable --format binary --output ${CMAKE_CURRENT_BINARY_DIR}/GPUTrackingHIPKernelModules.o $<PATH:RELATIVE_PATH,$<LIST:TRANSFORM,$<PATH:GET_FILENAME,$<TARGET_OBJECTS:GPUTrackingHIPKernels>>,PREPEND,${CMAKE_CURRENT_BINARY_DIR}/hip_kernel_module_fatbin/>,${CMAKE_CURRENT_BINARY_DIR}>
+    COMMAND ${CMAKE_LINKER} -z noexecstack --relocatable --format binary --output ${CMAKE_CURRENT_BINARY_DIR}/GPUTrackingHIPKernelModules.o $<PATH:RELATIVE_PATH,$<LIST:TRANSFORM,$<PATH:GET_FILENAME,$<TARGET_OBJECTS:GPUTrackingHIPKernels>>,PREPEND,${CMAKE_CURRENT_BINARY_DIR}/hip_kernel_module_fatbin/>,${CMAKE_CURRENT_BINARY_DIR}>
     DEPENDS GPUTrackingHIPKernels $<TARGET_OBJECTS:GPUTrackingHIPKernels>
     COMMENT "Compiling fatbin kernels ${CMAKE_CURRENT_BINARY_DIR}/GPUTrackingHIPKernelModules.o"
     VERBATIM

@@ -17,7 +17,7 @@ function(create_binary_resource RESOURCE OUTPUTFILE)
   FILE(RELATIVE_PATH input-file-rel ${CMAKE_CURRENT_BINARY_DIR} ${input-file-abs})
   add_custom_command(
     OUTPUT ${OUTPUTFILE}
-    COMMAND ${CMAKE_LINKER} --relocatable --format binary --output ${OUTPUTFILE} ${input-file-rel}
+    COMMAND ${CMAKE_LINKER} -z noexecstack --relocatable --format binary --output ${OUTPUTFILE} ${input-file-rel}
     DEPENDS ${input-file-rel}
     COMMENT "Adding binary resource ${input-file-rel}"
     VERBATIM