Skip to content

Commit c0c70ae

Browse files
authored
DCAFitterGPU: reduce I/O overhead by copying elements using a kernel (#13556)
1 parent 8957bd6 commit c0c70ae

File tree

2 files changed

+13
-4
lines changed

2 files changed

+13
-4
lines changed

Common/DCAFitter/GPU/cuda/DCAFitterN.cu

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,14 +56,22 @@ GPUg() void printKernel(Fitter* fitter)
5656
}
5757
}
5858

59+
template <typename Fitter>
60+
GPUg() void initFitters(Fitter* fitters, unsigned int off, unsigned int N)
61+
{
62+
for (auto iThread{blockIdx.x * blockDim.x + threadIdx.x + 1}; iThread < N; iThread += blockDim.x * gridDim.x) {
63+
fitters[iThread + off] = fitters[off];
64+
}
65+
}
66+
5967
template <typename Fitter, typename... Tr>
6068
GPUg() void processKernel(Fitter* fitter, int* res, Tr*... tracks)
6169
{
6270
*res = fitter->process(*tracks...);
6371
}
6472

6573
template <typename Fitter, typename... Tr>
66-
GPUg() void processBatchKernel(Fitter* fitters, int* results, size_t off, size_t N, Tr*... tracks)
74+
GPUg() void processBatchKernel(Fitter* fitters, int* results, unsigned int off, unsigned int N, Tr*... tracks)
6775
{
6876
for (auto iThread{blockIdx.x * blockDim.x + threadIdx.x}; iThread < N; iThread += blockDim.x * gridDim.x) {
6977
results[iThread + off] = fitters[iThread + off].process(tracks[iThread + off]...);
@@ -186,7 +194,7 @@ void processBulk(const int nBlocks,
186194
auto nFits = batchSize + (iBatch < remainder ? 1 : 0);
187195
188196
gpuCheckError(cudaEventRecord(startIOUp[iBatch], stream));
189-
gpuCheckError(cudaMemcpyAsync(fitters_device + offset, fitters.data() + offset, sizeof(Fitter) * nFits, cudaMemcpyHostToDevice, stream));
197+
gpuCheckError(cudaMemcpyAsync(fitters_device + offset, fitters.data() + offset, sizeof(Fitter) /* * nFits */, cudaMemcpyHostToDevice, stream)); // copying just the first element of the buffer
190198
iArg = 0;
191199
([&] {
192200
gpuCheckError(cudaMemcpyAsync(tracks_device[iArg] + offset, args.data() + offset, sizeof(Tr) * nFits, cudaMemcpyHostToDevice, stream));
@@ -196,6 +204,7 @@ void processBulk(const int nBlocks,
196204
gpuCheckError(cudaEventRecord(endIOUp[iBatch], stream));
197205
198206
gpuCheckError(cudaEventRecord(startKer[iBatch], stream));
207+
kernel::initFitters<<<nBlocks, nThreads, 0, stream>>>(fitters_device, offset, nFits);
199208
std::apply([&](auto&&... args) { kernel::processBatchKernel<<<nBlocks, nThreads, 0, stream>>>(fitters_device, results_device, offset, nFits, args...); }, tracks_device);
200209
gpuCheckError(cudaEventRecord(endKer[iBatch], stream));
201210

Common/DCAFitter/GPU/cuda/test/testDCAFitterNGPU.cxx

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -560,8 +560,8 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngsBulk)
560560
const char* nBlocksEnvVarName = "DCAFITTERGPU_TEST_NBLOCKS";
561561
const char* nBatchesEnvVarName = "DCAFITTERGPU_TEST_NBATCHES";
562562
const char* nTestsEnvVarName = "DCAFITTERGPU_TEST_NTESTS";
563-
int nBlocks = std::getenv(nThreadsEnvVarName) == nullptr ? 30 : std::stoi(std::getenv(nThreadsEnvVarName));
564-
int nThreads = std::getenv(nBlocksEnvVarName) == nullptr ? 256 : std::stoi(std::getenv(nBlocksEnvVarName));
563+
int nBlocks = std::getenv(nBlocksEnvVarName) == nullptr ? 30 : std::stoi(std::getenv(nBlocksEnvVarName));
564+
int nThreads = std::getenv(nThreadsEnvVarName) == nullptr ? 256 : std::stoi(std::getenv(nThreadsEnvVarName));
565565
int nBatches = std::getenv(nBatchesEnvVarName) == nullptr ? 8 : std::stoi(std::getenv(nBatchesEnvVarName));
566566
int NTest = std::getenv(nTestsEnvVarName) == nullptr ? 100001 : std::stoi(std::getenv(nTestsEnvVarName));
567567

0 commit comments

Comments
 (0)