DCAFitterGPU: reduce I/O overhead by copying elements using a kernel (#13556)

mconcas · web-flow · commit c0c70ae617dc · 2024-10-03T05:16:40.000+02:00
diff --git a/Common/DCAFitter/GPU/cuda/DCAFitterN.cu b/Common/DCAFitter/GPU/cuda/DCAFitterN.cu
@@ -56,14 +56,22 @@ GPUg() void printKernel(Fitter* fitter)
   }
 }
 
+template <typename Fitter> // initFitters: replicate fitters[off] into fitters[off + 1] ... fitters[off + N - 1]
+GPUg() void initFitters(Fitter* fitters, unsigned int off, unsigned int N) // off: index of the source element; N: size of the range, source included (N <= 1 copies nothing)
+{
+  for (auto iThread{blockIdx.x * blockDim.x + threadIdx.x + 1}; iThread < N; iThread += blockDim.x * gridDim.x) { // index starts at 1, so fitters[off] is only ever read, never written
+    fitters[iThread + off] = fitters[off]; // copy via Fitter's assignment operator
+  }
+}
+
 template <typename Fitter, typename... Tr>
 GPUg() void processKernel(Fitter* fitter, int* res, Tr*... tracks)
 {
   *res = fitter->process(*tracks...);
 }
 
 template <typename Fitter, typename... Tr>
-GPUg() void processBatchKernel(Fitter* fitters, int* results, size_t off, size_t N, Tr*... tracks)
+GPUg() void processBatchKernel(Fitter* fitters, int* results, unsigned int off, unsigned int N, Tr*... tracks)
 {
   for (auto iThread{blockIdx.x * blockDim.x + threadIdx.x}; iThread < N; iThread += blockDim.x * gridDim.x) {
     results[iThread + off] = fitters[iThread + off].process(tracks[iThread + off]...);
@@ -186,7 +194,7 @@ void processBulk(const int nBlocks,
     auto nFits = batchSize + (iBatch < remainder ? 1 : 0);
 
     gpuCheckError(cudaEventRecord(startIOUp[iBatch], stream));
-    gpuCheckError(cudaMemcpyAsync(fitters_device + offset, fitters.data() + offset, sizeof(Fitter) * nFits, cudaMemcpyHostToDevice, stream));
+    gpuCheckError(cudaMemcpyAsync(fitters_device + offset, fitters.data() + offset, sizeof(Fitter), cudaMemcpyHostToDevice, stream)); // upload only the first fitter of the batch; kernel::initFitters replicates it on the device
     iArg = 0;
     ([&] {
       gpuCheckError(cudaMemcpyAsync(tracks_device[iArg] + offset, args.data() + offset, sizeof(Tr) * nFits, cudaMemcpyHostToDevice, stream));
@@ -196,6 +204,7 @@ void processBulk(const int nBlocks,
     gpuCheckError(cudaEventRecord(endIOUp[iBatch], stream));
 
     gpuCheckError(cudaEventRecord(startKer[iBatch], stream));
+    kernel::initFitters<<<nBlocks, nThreads, 0, stream>>>(fitters_device, offset, nFits);
     std::apply([&](auto&&... args) { kernel::processBatchKernel<<<nBlocks, nThreads, 0, stream>>>(fitters_device, results_device, offset, nFits, args...); }, tracks_device);
     gpuCheckError(cudaEventRecord(endKer[iBatch], stream));
 
diff --git a/Common/DCAFitter/GPU/cuda/test/testDCAFitterNGPU.cxx b/Common/DCAFitter/GPU/cuda/test/testDCAFitterNGPU.cxx
@@ -560,8 +560,8 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngsBulk)
   const char* nBlocksEnvVarName = "DCAFITTERGPU_TEST_NBLOCKS";
   const char* nBatchesEnvVarName = "DCAFITTERGPU_TEST_NBATCHES";
   const char* nTestsEnvVarName = "DCAFITTERGPU_TEST_NTESTS";
-  int nBlocks = std::getenv(nThreadsEnvVarName) == nullptr ? 30 : std::stoi(std::getenv(nThreadsEnvVarName));
-  int nThreads = std::getenv(nBlocksEnvVarName) == nullptr ? 256 : std::stoi(std::getenv(nBlocksEnvVarName));
+  int nBlocks = std::getenv(nBlocksEnvVarName) == nullptr ? 30 : std::stoi(std::getenv(nBlocksEnvVarName));
+  int nThreads = std::getenv(nThreadsEnvVarName) == nullptr ? 256 : std::stoi(std::getenv(nThreadsEnvVarName));
   int nBatches = std::getenv(nBatchesEnvVarName) == nullptr ? 8 : std::stoi(std::getenv(nBatchesEnvVarName));
   int NTest = std::getenv(nTestsEnvVarName) == nullptr ? 100001 : std::stoi(std::getenv(nTestsEnvVarName));