Skip to content

Commit f49a578

Browse files
authored
DCAFitterGPU: add asynch batching + benchmarking (#13553)
1 parent 7380cae commit f49a578

File tree

8 files changed

+374
-98
lines changed

8 files changed

+374
-98
lines changed
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
2+
// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
3+
// All rights not expressly granted are reserved.
4+
//
5+
// This software is distributed under the terms of the GNU General Public
6+
// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
7+
//
8+
// In applying this license CERN does not waive the privileges and immunities
9+
// granted to it by virtue of its status as an Intergovernmental Organization
10+
// or submit itself to any jurisdiction.
11+
12+
/// \brief Helper interface to the GPU device, meant to be compatible with manual allocation/streams and GPUReconstruction ones.
13+
/// \author matteo.concas@cern.ch
14+
15+
#ifndef DCAFITTER_GPU_INTERFACE
16+
#define DCAFITTER_GPU_INTERFACE
17+
18+
#include <thread>
19+
#include <vector>
20+
#include <atomic>
21+
22+
namespace o2
23+
{
24+
namespace vertexing
25+
{
26+
namespace device
27+
{
28+
29+
#if !defined(__HIPCC__) && !defined(__CUDACC__)
30+
typedef struct _dummyStream {
31+
} Stream;
32+
#else
33+
#ifdef __HIPCC__
34+
typedef hipStream_t Stream;
35+
#else
36+
typedef cudaStream_t Stream;
37+
#endif
38+
#endif
39+
40+
class GPUInterface
41+
{
42+
public:
43+
GPUInterface(GPUInterface& other) = delete;
44+
void operator=(const GPUInterface&) = delete;
45+
46+
static GPUInterface* Instance();
47+
48+
// APIs
49+
void registerBuffer(void*, size_t);
50+
void unregisterBuffer(void* addr);
51+
void allocDevice(void**, size_t);
52+
void freeDevice(void*);
53+
Stream& getStream(unsigned short N = 0);
54+
Stream& getNextStream();
55+
56+
protected:
57+
GPUInterface(size_t N = 1);
58+
~GPUInterface();
59+
60+
void resize(size_t);
61+
62+
std::atomic<unsigned short> mLastUsedStream{0};
63+
static GPUInterface* sGPUInterface;
64+
std::vector<std::thread> mPool{};
65+
std::vector<Stream> mStreams{};
66+
};
67+
68+
} // namespace device
69+
} // namespace vertexing
70+
} // namespace o2
71+
#endif

Common/DCAFitter/GPU/cuda/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,15 @@
1212
o2_add_library(DCAFitterCUDA
1313
TARGETVARNAME targetName
1414
SOURCES DCAFitterN.cu
15+
GPUInterface.cu
1516
PUBLIC_INCLUDE_DIRECTORIES ../../include
17+
PUBLIC_INCLUDE_DIRECTORIES ../
1618
PUBLIC_LINK_LIBRARIES O2::MathUtils
1719
O2::ReconstructionDataFormats
1820
O2::DetectorsBase
1921
PRIVATE_LINK_LIBRARIES O2::GPUTrackingCUDAExternalProvider)
2022
set_property(TARGET ${targetName} PROPERTY CUDA_SEPARABLE_COMPILATION ON)
23+
# add_compile_options(-lineinfo)
2124

2225
o2_add_test(DCAFitterNCUDA
2326
SOURCES test/testDCAFitterNGPU.cxx

Common/DCAFitter/GPU/cuda/DCAFitterN.cu

Lines changed: 119 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,11 @@
1515
#include <cuda.h>
1616
#endif
1717

18+
#include <numeric>
19+
1820
#include "GPUCommonDef.h"
1921
#include "DCAFitter/DCAFitterN.h"
20-
// #include "MathUtils/SMatrixGPU.h"
22+
#include "DeviceInterface/GPUInterface.h"
2123

2224
#define gpuCheckError(x) \
2325
{ \
@@ -61,10 +63,10 @@ GPUg() void processKernel(Fitter* fitter, int* res, Tr*... tracks)
6163
}
6264

6365
template <typename Fitter, typename... Tr>
64-
GPUg() void processBulkKernel(Fitter* fitters, int* results, unsigned int N, Tr*... tracks)
66+
GPUg() void processBatchKernel(Fitter* fitters, int* results, size_t off, size_t N, Tr*... tracks)
6567
{
6668
for (auto iThread{blockIdx.x * blockDim.x + threadIdx.x}; iThread < N; iThread += blockDim.x * gridDim.x) {
67-
results[iThread] = fitters[iThread].process(tracks[iThread]...);
69+
results[iThread + off] = fitters[iThread + off].process(tracks[iThread + off]...);
6870
}
6971
}
7072

@@ -131,64 +133,137 @@ int process(const int nBlocks,
131133
}
132134

133135
template <typename Fitter, class... Tr>
134-
std::vector<int> processBulk(const int nBlocks,
135-
const int nThreads,
136-
std::vector<Fitter>& fitters,
137-
std::vector<Tr>&... args)
136+
void processBulk(const int nBlocks,
137+
const int nThreads,
138+
const int nBatches,
139+
std::vector<Fitter>& fitters,
140+
std::vector<int>& results,
141+
std::vector<Tr>&... args)
138142
{
139-
kernel::warmUpGpuKernel<<<1, 1>>>();
143+
auto* gpuInterface = GPUInterface::Instance();
144+
kernel::warmUpGpuKernel<<<1, 1, 0, gpuInterface->getNextStream()>>>();
140145
141-
cudaEvent_t start, stop;
142-
gpuCheckError(cudaEventCreate(&start));
143-
gpuCheckError(cudaEventCreate(&stop));
144-
const auto nFits{fitters.size()}; // for clarity: size of all the vectors needs to be equal, not enforcing it here yet.
145-
std::vector<int> results(nFits);
146-
int* results_device;
147-
Fitter* fitters_device;
148-
std::array<o2::track::TrackParCov*, Fitter::getNProngs()> tracks_device;
146+
// Benchmarking events
147+
std::vector<float> ioUp(nBatches), ioDown(nBatches), kerElapsed(nBatches);
148+
std::vector<cudaEvent_t> startIOUp(nBatches), endIOUp(nBatches), startIODown(nBatches), endIODown(nBatches), startKer(nBatches), endKer(nBatches);
149+
for (int iBatch{0}; iBatch < nBatches; ++iBatch) {
150+
gpuCheckError(cudaEventCreate(&startIOUp[iBatch]));
151+
gpuCheckError(cudaEventCreate(&endIOUp[iBatch]));
152+
gpuCheckError(cudaEventCreate(&startIODown[iBatch]));
153+
gpuCheckError(cudaEventCreate(&endIODown[iBatch]));
154+
gpuCheckError(cudaEventCreate(&startKer[iBatch]));
155+
gpuCheckError(cudaEventCreate(&endKer[iBatch]));
156+
}
149157
158+
// Tracks
159+
std::array<o2::track::TrackParCov*, Fitter::getNProngs()> tracks_device;
150160
int iArg{0};
151161
([&] {
152-
gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&(tracks_device[iArg])), sizeof(Tr) * args.size()));
153-
gpuCheckError(cudaMemcpy(tracks_device[iArg], args.data(), sizeof(Tr) * args.size(), cudaMemcpyHostToDevice));
162+
gpuInterface->registerBuffer(reinterpret_cast<void*>(args.data()), sizeof(Tr) * args.size());
163+
gpuInterface->allocDevice(reinterpret_cast<void**>(&(tracks_device[iArg])), sizeof(Tr) * args.size());
154164
++iArg;
155165
}(),
156166
...);
157-
gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&results_device), sizeof(int) * nFits));
158-
gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&fitters_device), sizeof(Fitter) * nFits));
159-
gpuCheckError(cudaMemcpy(fitters_device, fitters.data(), sizeof(Fitter) * nFits, cudaMemcpyHostToDevice));
160167
161-
gpuCheckError(cudaEventRecord(start));
162-
std::apply([&](auto&&... args) { kernel::processBulkKernel<<<nBlocks, nThreads>>>(fitters_device, results_device, nFits, args...); }, tracks_device);
163-
gpuCheckError(cudaEventRecord(stop));
168+
// Fitters
169+
gpuInterface->registerBuffer(reinterpret_cast<void*>(fitters.data()), sizeof(Fitter) * fitters.size());
170+
Fitter* fitters_device;
171+
gpuInterface->allocDevice(reinterpret_cast<void**>(&fitters_device), sizeof(Fitter) * fitters.size());
164172
165-
gpuCheckError(cudaPeekAtLastError());
166-
gpuCheckError(cudaDeviceSynchronize());
173+
// Results
174+
gpuInterface->registerBuffer(reinterpret_cast<void*>(results.data()), sizeof(int) * fitters.size());
175+
int* results_device;
176+
gpuInterface->allocDevice(reinterpret_cast<void**>(&results_device), sizeof(int) * fitters.size());
167177
168-
gpuCheckError(cudaMemcpy(results.data(), results_device, sizeof(int) * results.size(), cudaMemcpyDeviceToHost));
169-
gpuCheckError(cudaMemcpy(fitters.data(), fitters_device, sizeof(Fitter) * nFits, cudaMemcpyDeviceToHost));
178+
// R.R. Computation
179+
int totalSize = fitters.size();
180+
int batchSize = totalSize / nBatches;
181+
int remainder = totalSize % nBatches;
170182
171-
iArg = 0;
172-
([&] {
173-
gpuCheckError(cudaMemcpy(args.data(), tracks_device[iArg], sizeof(Tr) * args.size(), cudaMemcpyDeviceToHost));
174-
gpuCheckError(cudaFree(tracks_device[iArg]));
175-
++iArg;
176-
}(),
177-
...);
183+
for (int iBatch{0}; iBatch < nBatches; ++iBatch) {
184+
auto& stream = gpuInterface->getNextStream();
185+
auto offset = iBatch * batchSize + std::min(iBatch, remainder);
186+
auto nFits = batchSize + (iBatch < remainder ? 1 : 0);
187+
188+
gpuCheckError(cudaEventRecord(startIOUp[iBatch], stream));
189+
gpuCheckError(cudaMemcpyAsync(fitters_device + offset, fitters.data() + offset, sizeof(Fitter) * nFits, cudaMemcpyHostToDevice, stream));
190+
iArg = 0;
191+
([&] {
192+
gpuCheckError(cudaMemcpyAsync(tracks_device[iArg] + offset, args.data() + offset, sizeof(Tr) * nFits, cudaMemcpyHostToDevice, stream));
193+
++iArg;
194+
}(),
195+
...);
196+
gpuCheckError(cudaEventRecord(endIOUp[iBatch], stream));
197+
198+
gpuCheckError(cudaEventRecord(startKer[iBatch], stream));
199+
std::apply([&](auto&&... args) { kernel::processBatchKernel<<<nBlocks, nThreads, 0, stream>>>(fitters_device, results_device, offset, nFits, args...); }, tracks_device);
200+
gpuCheckError(cudaEventRecord(endKer[iBatch], stream));
201+
202+
gpuCheckError(cudaPeekAtLastError());
203+
iArg = 0;
204+
gpuCheckError(cudaEventRecord(startIODown[iBatch], stream));
205+
([&] {
206+
gpuCheckError(cudaMemcpyAsync(args.data() + offset, tracks_device[iArg] + offset, sizeof(Tr) * nFits, cudaMemcpyDeviceToHost, stream));
207+
++iArg;
208+
}(),
209+
...);
210+
211+
gpuCheckError(cudaMemcpyAsync(fitters.data() + offset, fitters_device + offset, sizeof(Fitter) * nFits, cudaMemcpyDeviceToHost, stream));
212+
gpuCheckError(cudaMemcpyAsync(results.data() + offset, results_device + offset, sizeof(int) * nFits, cudaMemcpyDeviceToHost, stream));
213+
gpuCheckError(cudaEventRecord(endIODown[iBatch], stream));
214+
}
178215
179-
gpuCheckError(cudaFree(fitters_device));
180-
gpuCheckError(cudaFree(results_device));
181-
gpuCheckError(cudaEventSynchronize(stop));
216+
([&] { gpuInterface->unregisterBuffer(args.data()); }(), ...);
182217
183-
float milliseconds = 0;
184-
gpuCheckError(cudaEventElapsedTime(&milliseconds, start, stop));
218+
for (auto* tracksD : tracks_device) {
219+
gpuInterface->freeDevice(tracksD);
220+
}
221+
222+
gpuInterface->freeDevice(fitters_device);
223+
gpuInterface->freeDevice(results_device);
224+
gpuInterface->unregisterBuffer(fitters.data());
225+
gpuInterface->unregisterBuffer(results.data());
185226
186-
LOGP(info, "Kernel run in: {} ms using {} blocks and {} threads.", milliseconds, nBlocks, nThreads);
187-
return results;
227+
// Do benchmarks
228+
gpuCheckError(cudaDeviceSynchronize());
229+
for (int iBatch{0}; iBatch < nBatches; ++iBatch) {
230+
gpuCheckError(cudaEventElapsedTime(&ioUp[iBatch], startIOUp[iBatch], endIOUp[iBatch]));
231+
gpuCheckError(cudaEventElapsedTime(&kerElapsed[iBatch], startKer[iBatch], endKer[iBatch]));
232+
gpuCheckError(cudaEventElapsedTime(&ioDown[iBatch], startIODown[iBatch], endIODown[iBatch]));
233+
}
234+
235+
float totalUp = std::accumulate(ioUp.begin(), ioUp.end(), 0.f);
236+
float totalDown = std::accumulate(ioDown.begin(), ioDown.end(), 0.f);
237+
float totalKernels = std::accumulate(kerElapsed.begin(), kerElapsed.end(), 0.f);
238+
LOGP(info, "Config: {} batches, {} blocks, {} threads", nBatches, nBlocks, nThreads);
239+
LOGP(info, "Total I/O time: Up {} ms Avg {} ms, Down {} ms Avg {} ms", totalUp, totalUp / float(nBatches), totalDown, totalDown / (float)nBatches);
240+
LOGP(info, "Total Kernel time: {} ms Avg {} ms", totalKernels, totalKernels / (float)nBatches);
241+
242+
for (int iBatch{0}; iBatch < nBatches; ++iBatch) {
243+
gpuCheckError(cudaEventDestroy(startIOUp[iBatch]));
244+
gpuCheckError(cudaEventDestroy(endIOUp[iBatch]));
245+
gpuCheckError(cudaEventDestroy(startIODown[iBatch]));
246+
gpuCheckError(cudaEventDestroy(endIODown[iBatch]));
247+
gpuCheckError(cudaEventDestroy(startKer[iBatch]));
248+
gpuCheckError(cudaEventDestroy(endKer[iBatch]));
249+
}
188250
}
189251
190-
template std::vector<int> processBulk(const int, const int, std::vector<o2::vertexing::DCAFitterN<2>>&, std::vector<o2::track::TrackParCov>&, std::vector<o2::track::TrackParCov>&);
191-
template std::vector<int> processBulk(const int, const int, std::vector<o2::vertexing::DCAFitterN<3>>&, std::vector<o2::track::TrackParCov>&, std::vector<o2::track::TrackParCov>&, std::vector<o2::track::TrackParCov>&);
252+
template void processBulk(const int,
253+
const int,
254+
const int,
255+
std::vector<o2::vertexing::DCAFitterN<2>>&,
256+
std::vector<int>&,
257+
std::vector<o2::track::TrackParCov>&,
258+
std::vector<o2::track::TrackParCov>&);
259+
template void processBulk(const int,
260+
const int,
261+
const int,
262+
std::vector<o2::vertexing::DCAFitterN<3>>&,
263+
std::vector<int>&,
264+
std::vector<o2::track::TrackParCov>&,
265+
std::vector<o2::track::TrackParCov>&,
266+
std::vector<o2::track::TrackParCov>&);
192267
template int process(const int, const int, o2::vertexing::DCAFitterN<2>&, o2::track::TrackParCov&, o2::track::TrackParCov&);
193268
template int process(const int, const int, o2::vertexing::DCAFitterN<3>&, o2::track::TrackParCov&, o2::track::TrackParCov&, o2::track::TrackParCov&);
194269
template void print(const int, const int, o2::vertexing::DCAFitterN<2>&);

0 commit comments

Comments
 (0)