
Commit bc67e6d

Add processBulk to perform fits in parallel (#13527)
1 parent 66e8774 commit bc67e6d

4 files changed: +660 -75 lines changed
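
The headline change is the new device::processBulk interface, which runs many DCA fits in a single kernel launch instead of one launch per fit. For orientation, a minimal host-side sketch of how it might be called for two-prong fits; the variables nCandidates, nBlocks and nThreads and the fitter/track preparation are placeholders, not part of this commit, and the declarations are taken from the diff below:

// Sketch only: one fitter per candidate, plus one TrackParCov vector per prong.
std::vector<o2::vertexing::DCAFitterN<2>> fitters(nCandidates);
std::vector<o2::track::TrackParCov> tracks0(nCandidates), tracks1(nCandidates);
// ... configure each fitter and fill the track vectors, one entry per candidate ...
auto status = o2::vertexing::device::processBulk(nBlocks, nThreads, fitters, tracks0, tracks1);
// status[i] holds the return code of fitters[i].process(tracks0[i], tracks1[i]);
// fitters and the track vectors are copied back from the device after the kernel runs.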

Common/DCAFitter/GPU/cuda/DCAFitterN.cu

Lines changed: 92 additions & 15 deletions
@@ -36,34 +36,51 @@ namespace o2::vertexing::device
 {
 namespace kernel
 {
+GPUg() void warmUpGpuKernel()
+{
+  unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  float ia, ib;
+  ia = ib = 0.0f;
+  ib += ia + tid;
+}
+
 template <typename Fitter>
-GPUg() void printKernel(Fitter* ft)
+GPUg() void printKernel(Fitter* fitter)
 {
   if (threadIdx.x == 0) {
-    printf(" =============== GPU DCA Fitter %d prongs ================\n", Fitter::getNProngs());
-    ft->print();
+    printf(" =============== GPU DCA Fitter %d prongs =================\n", Fitter::getNProngs());
+    fitter->print();
     printf(" =========================================================\n");
   }
 }
 
 template <typename Fitter, typename... Tr>
-GPUg() void processKernel(Fitter* ft, int* res, Tr*... tracks)
+GPUg() void processKernel(Fitter* fitter, int* res, Tr*... tracks)
 {
-  *res = ft->process(*tracks...);
+  *res = fitter->process(*tracks...);
 }
+
+template <typename Fitter, typename... Tr>
+GPUg() void processBulkKernel(Fitter* fitters, int* results, unsigned int N, Tr*... tracks)
+{
+  for (auto iThread{blockIdx.x * blockDim.x + threadIdx.x}; iThread < N; iThread += blockDim.x * gridDim.x) {
+    results[iThread] = fitters[iThread].process(tracks[iThread]...);
+  }
+}
+
 } // namespace kernel
 
 /// CPU handlers
 template <typename Fitter>
 void print(const int nBlocks,
            const int nThreads,
-           Fitter& ft)
+           Fitter& fitter)
 {
-  Fitter* ft_device;
-  gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&ft_device), sizeof(Fitter)));
-  gpuCheckError(cudaMemcpy(ft_device, &ft, sizeof(Fitter), cudaMemcpyHostToDevice));
+  Fitter* fitter_device;
+  gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&fitter_device), sizeof(Fitter)));
+  gpuCheckError(cudaMemcpy(fitter_device, &fitter, sizeof(Fitter), cudaMemcpyHostToDevice));
 
-  kernel::printKernel<<<nBlocks, nThreads>>>(ft_device);
+  kernel::printKernel<<<nBlocks, nThreads>>>(fitter_device);
 
   gpuCheckError(cudaPeekAtLastError());
   gpuCheckError(cudaDeviceSynchronize());
@@ -75,11 +92,11 @@ int process(const int nBlocks,
             Fitter& fitter,
             Tr&... args)
 {
-  Fitter* ft_device;
+  Fitter* fitter_device;
   std::array<o2::track::TrackParCov*, Fitter::getNProngs()> tracks_device;
   int result, *result_device;
 
-  gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&ft_device), sizeof(Fitter)));
+  gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&fitter_device), sizeof(Fitter)));
   gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&result_device), sizeof(int)));
 
   int iArg{0};
@@ -90,15 +107,15 @@ int process(const int nBlocks,
   }(),
    ...);
 
-  gpuCheckError(cudaMemcpy(ft_device, &fitter, sizeof(Fitter), cudaMemcpyHostToDevice));
+  gpuCheckError(cudaMemcpy(fitter_device, &fitter, sizeof(Fitter), cudaMemcpyHostToDevice));
 
-  std::apply([&](auto&&... args) { kernel::processKernel<<<nBlocks, nThreads>>>(ft_device, result_device, args...); }, tracks_device);
+  std::apply([&](auto&&... args) { kernel::processKernel<<<nBlocks, nThreads>>>(fitter_device, result_device, args...); }, tracks_device);
 
   gpuCheckError(cudaPeekAtLastError());
   gpuCheckError(cudaDeviceSynchronize());
 
   gpuCheckError(cudaMemcpy(&result, result_device, sizeof(int), cudaMemcpyDeviceToHost));
-  gpuCheckError(cudaMemcpy(&fitter, ft_device, sizeof(Fitter), cudaMemcpyDeviceToHost));
+  gpuCheckError(cudaMemcpy(&fitter, fitter_device, sizeof(Fitter), cudaMemcpyDeviceToHost));
   iArg = 0;
   ([&] {
     gpuCheckError(cudaMemcpy(&args, tracks_device[iArg], sizeof(o2::track::TrackParCov), cudaMemcpyDeviceToHost));
@@ -107,11 +124,71 @@ int process(const int nBlocks,
   }(),
    ...);
 
+  gpuCheckError(cudaFree(fitter_device));
   gpuCheckError(cudaFree(result_device));
 
   return result;
 }
 
+template <typename Fitter, class... Tr>
+std::vector<int> processBulk(const int nBlocks,
+                             const int nThreads,
+                             std::vector<Fitter>& fitters,
+                             std::vector<Tr>&... args)
+{
+  kernel::warmUpGpuKernel<<<1, 1>>>();
+
+  cudaEvent_t start, stop;
+  gpuCheckError(cudaEventCreate(&start));
+  gpuCheckError(cudaEventCreate(&stop));
+  const auto nFits{fitters.size()}; // for clarity: size of all the vectors needs to be equal, not enforcing it here yet.
+  std::vector<int> results(nFits);
+  int* results_device;
+  Fitter* fitters_device;
+  std::array<o2::track::TrackParCov*, Fitter::getNProngs()> tracks_device;
+
+  int iArg{0};
+  ([&] {
+    gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&(tracks_device[iArg])), sizeof(Tr) * args.size()));
+    gpuCheckError(cudaMemcpy(tracks_device[iArg], args.data(), sizeof(Tr) * args.size(), cudaMemcpyHostToDevice));
+    ++iArg;
+  }(),
+   ...);
+  gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&results_device), sizeof(int) * nFits));
+  gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&fitters_device), sizeof(Fitter) * nFits));
+  gpuCheckError(cudaMemcpy(fitters_device, fitters.data(), sizeof(Fitter) * nFits, cudaMemcpyHostToDevice));
+
+  gpuCheckError(cudaEventRecord(start));
+  std::apply([&](auto&&... args) { kernel::processBulkKernel<<<nBlocks, nThreads>>>(fitters_device, results_device, nFits, args...); }, tracks_device);
+  gpuCheckError(cudaEventRecord(stop));
+
+  gpuCheckError(cudaPeekAtLastError());
+  gpuCheckError(cudaDeviceSynchronize());
+
+  gpuCheckError(cudaMemcpy(results.data(), results_device, sizeof(int) * results.size(), cudaMemcpyDeviceToHost));
+  gpuCheckError(cudaMemcpy(fitters.data(), fitters_device, sizeof(Fitter) * nFits, cudaMemcpyDeviceToHost));
+
+  iArg = 0;
+  ([&] {
+    gpuCheckError(cudaMemcpy(args.data(), tracks_device[iArg], sizeof(Tr) * args.size(), cudaMemcpyDeviceToHost));
+    gpuCheckError(cudaFree(tracks_device[iArg]));
+    ++iArg;
+  }(),
+   ...);
+
+  gpuCheckError(cudaFree(fitters_device));
+  gpuCheckError(cudaFree(results_device));
+  gpuCheckError(cudaEventSynchronize(stop));
+
+  float milliseconds = 0;
+  gpuCheckError(cudaEventElapsedTime(&milliseconds, start, stop));
+
+  LOGP(info, "Kernel run in: {} ms using {} blocks and {} threads.", milliseconds, nBlocks, nThreads);
+  return results;
+}
+
+template std::vector<int> processBulk(const int, const int, std::vector<o2::vertexing::DCAFitterN<2>>&, std::vector<o2::track::TrackParCov>&, std::vector<o2::track::TrackParCov>&);
+template std::vector<int> processBulk(const int, const int, std::vector<o2::vertexing::DCAFitterN<3>>&, std::vector<o2::track::TrackParCov>&, std::vector<o2::track::TrackParCov>&, std::vector<o2::track::TrackParCov>&);
 template int process(const int, const int, o2::vertexing::DCAFitterN<2>&, o2::track::TrackParCov&, o2::track::TrackParCov&);
 template int process(const int, const int, o2::vertexing::DCAFitterN<3>&, o2::track::TrackParCov&, o2::track::TrackParCov&, o2::track::TrackParCov&);
 template void print(const int, const int, o2::vertexing::DCAFitterN<2>&);
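
For context on how the new processBulkKernel schedules its work: it uses a grid-stride loop, so a fixed nBlocks x nThreads launch covers any number of fits, each thread handling every (blockDim.x * gridDim.x)-th element. A self-contained sketch of the same pattern on a plain array; the kernel name and data are illustrative only, not part of this commit:

// Grid-stride loop sketch: each thread processes i, i + stride, i + 2*stride, ...
__global__ void scaleKernel(float* data, unsigned int n, float factor)
{
  const unsigned int stride = blockDim.x * gridDim.x;
  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += stride) {
    data[i] *= factor; // any launch configuration covers all n elements
  }
}
// launched e.g. as scaleKernel<<<nBlocks, nThreads>>>(data_device, n, 2.f);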

0 commit comments
