@@ -56,14 +56,22 @@ GPUg() void printKernel(Fitter* fitter)
5656 }
5757}
5858
/// Replicates the fitter stored at index `off` into the following N-1 slots.
///
/// After the kernel completes, fitters[off + 1] ... fitters[off + N - 1] are
/// copy-assigned from fitters[off]; fitters[off] itself is only read.
/// Work is distributed with a grid-stride loop, so any 1D launch configuration
/// covers the whole range.
///
/// \param fitters buffer holding at least off + N elements (dereferenced on the device)
/// \param off     index of the source element, and base offset of the destination range
/// \param N       total number of elements in the range, source element included
template <typename Fitter>
GPUg() void initFitters(Fitter* fitters, unsigned int off, unsigned int N)
{
  // Number of threads in the whole grid: the distance between two elements
  // handled by the same thread.
  const auto stride = blockDim.x * gridDim.x;

  // Start at 1: slot 0 of the range is the source and must not be overwritten.
  auto idx = blockIdx.x * blockDim.x + threadIdx.x + 1;
  while (idx < N) {
    fitters[off + idx] = fitters[off];
    idx += stride;
  }
}
66+
/// Runs a single fit on the device.
///
/// Calls fitter->process() on the objects pointed to by `tracks` and stores
/// the returned value in *res. There is no per-thread indexing: every thread
/// of the launch performs the same call and writes the same location.
///
/// \param fitter pointer to the fitter to run
/// \param res    pointer to the location receiving the value returned by process()
/// \param tracks one pointer per argument forwarded (dereferenced) to process()
template <typename Fitter, typename... Tr>
GPUg() void processKernel(Fitter* fitter, int* res, Tr*... tracks)
{
  // Each pointer in the pack is dereferenced at element 0 and passed by value/reference.
  res[0] = (*fitter).process(tracks[0]...);
}
6472
6573template <typename Fitter, typename ... Tr>
66- GPUg () void processBatchKernel (Fitter* fitters, int * results, size_t off, size_t N, Tr*... tracks)
74+ GPUg () void processBatchKernel (Fitter* fitters, int * results, unsigned int off, unsigned int N, Tr*... tracks)
6775{
6876 for (auto iThread{blockIdx .x * blockDim .x + threadIdx .x }; iThread < N; iThread += blockDim .x * gridDim .x ) {
6977 results[iThread + off] = fitters[iThread + off].process (tracks[iThread + off]...);
@@ -186,7 +194,7 @@ void processBulk(const int nBlocks,
186194 auto nFits = batchSize + (iBatch < remainder ? 1 : 0 );
187195
188196 gpuCheckError (cudaEventRecord (startIOUp[iBatch], stream));
189- gpuCheckError (cudaMemcpyAsync (fitters_device + offset, fitters.data () + offset, sizeof (Fitter) * nFits, cudaMemcpyHostToDevice, stream));
197+ gpuCheckError (cudaMemcpyAsync (fitters_device + offset, fitters.data () + offset, sizeof (Fitter) /* * nFits */ , cudaMemcpyHostToDevice, stream)); // copying just the first element of the buffer
190198 iArg = 0 ;
191199 ([&] {
192200 gpuCheckError (cudaMemcpyAsync (tracks_device[iArg] + offset, args.data () + offset, sizeof (Tr) * nFits, cudaMemcpyHostToDevice, stream));
@@ -196,6 +204,7 @@ void processBulk(const int nBlocks,
196204 gpuCheckError (cudaEventRecord (endIOUp[iBatch], stream));
197205
198206 gpuCheckError (cudaEventRecord (startKer[iBatch], stream));
207+ kernel::initFitters<<<nBlocks, nThreads, 0 , stream>>> (fitters_device, offset, nFits);
199208 std::apply ([&](auto &&... args) { kernel::processBatchKernel<<<nBlocks, nThreads, 0 , stream>>> (fitters_device, results_device, offset, nFits, args...); }, tracks_device);
200209 gpuCheckError (cudaEventRecord (endKer[iBatch], stream));
201210
0 commit comments