@@ -36,34 +36,51 @@ namespace o2::vertexing::device
3636{
3737namespace kernel
3838{
39+ GPUg () void warmUpGpuKernel ()
40+ {
41+ unsigned int tid = blockIdx .x * blockDim .x + threadIdx .x ;
42+ float ia, ib;
43+ ia = ib = 0 .0f ;
44+ ib += ia + tid;
45+ }
46+
3947template <typename Fitter>
40- GPUg () void printKernel (Fitter* ft )
48+ GPUg () void printKernel (Fitter* fitter )
4149{
4250 if (threadIdx .x == 0 ) {
43- printf (" =============== GPU DCA Fitter %d prongs ================\n " , Fitter::getNProngs ());
44- ft ->print ();
51+ printf (" =============== GPU DCA Fitter %d prongs ================= \n " , Fitter::getNProngs ());
52+ fitter ->print ();
4553 printf (" =========================================================\n " );
4654 }
4755}
4856
4957template <typename Fitter, typename ... Tr>
50- GPUg () void processKernel (Fitter* ft , int * res, Tr*... tracks)
58+ GPUg () void processKernel (Fitter* fitter , int * res, Tr*... tracks)
5159{
52- *res = ft ->process (*tracks...);
60+ *res = fitter ->process (*tracks...);
5361}
62+
63+ template <typename Fitter, typename ... Tr>
64+ GPUg () void processBulkKernel (Fitter* fitters, int * results, unsigned int N, Tr*... tracks)
65+ {
66+ for (auto iThread{blockIdx .x * blockDim .x + threadIdx .x }; iThread < N; iThread += blockDim .x * gridDim .x ) {
67+ results[iThread] = fitters[iThread].process (tracks[iThread]...);
68+ }
69+ }
70+
5471} // namespace kernel
5572
5673// / CPU handlers
5774template <typename Fitter>
5875void print (const int nBlocks,
5976 const int nThreads,
60- Fitter& ft )
77+ Fitter& fitter )
6178{
62- Fitter* ft_device ;
63- gpuCheckError (cudaMalloc (reinterpret_cast <void **>(&ft_device ), sizeof (Fitter)));
64- gpuCheckError (cudaMemcpy (ft_device , &ft , sizeof (Fitter), cudaMemcpyHostToDevice));
79+ Fitter* fitter_device ;
80+ gpuCheckError (cudaMalloc (reinterpret_cast <void **>(&fitter_device ), sizeof (Fitter)));
81+ gpuCheckError (cudaMemcpy (fitter_device , &fitter , sizeof (Fitter), cudaMemcpyHostToDevice));
6582
66- kernel::printKernel<<<nBlocks, nThreads>>> (ft_device );
83+ kernel::printKernel<<<nBlocks, nThreads>>> (fitter_device );
6784
6885 gpuCheckError (cudaPeekAtLastError ());
6986 gpuCheckError (cudaDeviceSynchronize ());
@@ -75,11 +92,11 @@ int process(const int nBlocks,
7592 Fitter& fitter,
7693 Tr&... args)
7794{
78- Fitter* ft_device ;
95+ Fitter* fitter_device ;
7996 std::array<o2::track::TrackParCov*, Fitter::getNProngs ()> tracks_device;
8097 int result, *result_device;
8198
82- gpuCheckError (cudaMalloc (reinterpret_cast <void **>(&ft_device ), sizeof (Fitter)));
99+ gpuCheckError (cudaMalloc (reinterpret_cast <void **>(&fitter_device ), sizeof (Fitter)));
83100 gpuCheckError (cudaMalloc (reinterpret_cast <void **>(&result_device), sizeof (int )));
84101
85102 int iArg{0 };
@@ -90,15 +107,15 @@ int process(const int nBlocks,
90107 }(),
91108 ...);
92109
93- gpuCheckError (cudaMemcpy (ft_device , &fitter, sizeof (Fitter), cudaMemcpyHostToDevice));
110+ gpuCheckError (cudaMemcpy (fitter_device , &fitter, sizeof (Fitter), cudaMemcpyHostToDevice));
94111
95- std::apply ([&](auto &&... args) { kernel::processKernel<<<nBlocks, nThreads>>> (ft_device , result_device, args...); }, tracks_device);
112+ std::apply ([&](auto &&... args) { kernel::processKernel<<<nBlocks, nThreads>>> (fitter_device , result_device, args...); }, tracks_device);
96113
97114 gpuCheckError (cudaPeekAtLastError ());
98115 gpuCheckError (cudaDeviceSynchronize ());
99116
100117 gpuCheckError (cudaMemcpy (&result, result_device, sizeof (int ), cudaMemcpyDeviceToHost));
101- gpuCheckError (cudaMemcpy (&fitter, ft_device , sizeof (Fitter), cudaMemcpyDeviceToHost));
118+ gpuCheckError (cudaMemcpy (&fitter, fitter_device , sizeof (Fitter), cudaMemcpyDeviceToHost));
102119 iArg = 0 ;
103120 ([&] {
104121 gpuCheckError (cudaMemcpy (&args, tracks_device[iArg], sizeof (o2::track::TrackParCov), cudaMemcpyDeviceToHost));
@@ -107,11 +124,71 @@ int process(const int nBlocks,
107124 }(),
108125 ...);
109126
127+ gpuCheckError (cudaFree (fitter_device));
110128 gpuCheckError (cudaFree (result_device));
111129
112130 return result;
113131}
114132
133+ template <typename Fitter, class ... Tr>
134+ std::vector<int > processBulk (const int nBlocks,
135+ const int nThreads,
136+ std::vector<Fitter>& fitters,
137+ std::vector<Tr>&... args)
138+ {
139+ kernel::warmUpGpuKernel<<<1 , 1 >>> ();
140+
141+ cudaEvent_t start, stop;
142+ gpuCheckError (cudaEventCreate (&start));
143+ gpuCheckError (cudaEventCreate (&stop));
144+ const auto nFits{fitters.size ()}; // for clarity: size of all the vectors needs to be equal, not enforcing it here yet.
145+ std::vector<int > results (nFits);
146+ int * results_device;
147+ Fitter* fitters_device;
148+ std::array<o2::track::TrackParCov*, Fitter::getNProngs ()> tracks_device;
149+
150+ int iArg{0 };
151+ ([&] {
152+ gpuCheckError (cudaMalloc (reinterpret_cast <void **>(&(tracks_device[iArg])), sizeof (Tr) * args.size ()));
153+ gpuCheckError (cudaMemcpy (tracks_device[iArg], args.data (), sizeof (Tr) * args.size (), cudaMemcpyHostToDevice));
154+ ++iArg;
155+ }(),
156+ ...);
157+ gpuCheckError (cudaMalloc (reinterpret_cast <void **>(&results_device), sizeof (int ) * nFits));
158+ gpuCheckError (cudaMalloc (reinterpret_cast <void **>(&fitters_device), sizeof (Fitter) * nFits));
159+ gpuCheckError (cudaMemcpy (fitters_device, fitters.data (), sizeof (Fitter) * nFits, cudaMemcpyHostToDevice));
160+
161+ gpuCheckError (cudaEventRecord (start));
162+ std::apply ([&](auto &&... args) { kernel::processBulkKernel<<<nBlocks, nThreads>>> (fitters_device, results_device, nFits, args...); }, tracks_device);
163+ gpuCheckError (cudaEventRecord (stop));
164+
165+ gpuCheckError (cudaPeekAtLastError ());
166+ gpuCheckError (cudaDeviceSynchronize ());
167+
168+ gpuCheckError (cudaMemcpy (results.data (), results_device, sizeof (int ) * results.size (), cudaMemcpyDeviceToHost));
169+ gpuCheckError (cudaMemcpy (fitters.data (), fitters_device, sizeof (Fitter) * nFits, cudaMemcpyDeviceToHost));
170+
171+ iArg = 0 ;
172+ ([&] {
173+ gpuCheckError (cudaMemcpy (args.data (), tracks_device[iArg], sizeof (Tr) * args.size (), cudaMemcpyDeviceToHost));
174+ gpuCheckError (cudaFree (tracks_device[iArg]));
175+ ++iArg;
176+ }(),
177+ ...);
178+
179+ gpuCheckError (cudaFree (fitters_device));
180+ gpuCheckError (cudaFree (results_device));
181+ gpuCheckError (cudaEventSynchronize (stop));
182+
183+ float milliseconds = 0 ;
184+ gpuCheckError (cudaEventElapsedTime (&milliseconds, start, stop));
185+
186+ LOGP (info, " Kernel run in: {} ms using {} blocks and {} threads." , milliseconds, nBlocks, nThreads);
187+ return results;
188+ }
189+
190+ template std::vector<int > processBulk (const int , const int , std::vector<o2::vertexing::DCAFitterN<2 >>&, std::vector<o2::track::TrackParCov>&, std::vector<o2::track::TrackParCov>&);
191+ template std::vector<int > processBulk (const int , const int , std::vector<o2::vertexing::DCAFitterN<3 >>&, std::vector<o2::track::TrackParCov>&, std::vector<o2::track::TrackParCov>&, std::vector<o2::track::TrackParCov>&);
115192template int process (const int , const int , o2::vertexing::DCAFitterN<2 >&, o2::track::TrackParCov&, o2::track::TrackParCov&);
116193template int process (const int , const int , o2::vertexing::DCAFitterN<3 >&, o2::track::TrackParCov&, o2::track::TrackParCov&, o2::track::TrackParCov&);
117194template void print (const int , const int , o2::vertexing::DCAFitterN<2 >&);
0 commit comments