Skip to content

Commit 724ab6a

Browse files
committed
ITS: various fixes also for GPU
Signed-off-by: Felix Schlepper <felix.schlepper@cern.ch>
1 parent a5a8469 commit 724ab6a

File tree

16 files changed

+174
-175
lines changed

16 files changed

+174
-175
lines changed

Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -41,8 +41,7 @@ class TimeFrameGPU final : public TimeFrame<NLayers>
4141
void popMemoryStack(const int);
4242
void registerHostMemory(const int);
4343
void unregisterHostMemory(const int);
44-
void initialise(const int, const TrackingParameters&, const int, IndexTableUtilsN* utils = nullptr);
45-
void initDeviceSAFitting();
44+
void initialise(const int, const TrackingParameters&, const int);
4645
void loadIndexTableUtils(const int);
4746
void loadTrackingFrameInfoDevice(const int, const int);
4847
void createTrackingFrameInfoDeviceArray(const int);
@@ -59,8 +58,8 @@ class TimeFrameGPU final : public TimeFrame<NLayers>
5958
void createROFrameClustersDeviceArray(const int);
6059
void loadMultiplicityCutMask(const int);
6160
void loadVertices(const int);
62-
void loadROFOverlapTable();
63-
void loadROFVertexLookupTable();
61+
void loadROFOverlapTable(const int);
62+
void loadROFVertexLookupTable(const int);
6463
void updateROFVertexLookupTable(const int);
6564

6665
///
@@ -174,9 +173,9 @@ class TimeFrameGPU final : public TimeFrame<NLayers>
174173
gsl::span<CellSeedN*> getDeviceCells() { return mCellsDevice; }
175174

176175
// Overridden getters
177-
int getNumberOfTracklets() const final;
178-
int getNumberOfCells() const final;
179-
int getNumberOfNeighbours() const final;
176+
size_t getNumberOfTracklets() const final;
177+
size_t getNumberOfCells() const final;
178+
size_t getNumberOfNeighbours() const final;
180179

181180
private:
182181
void allocMemAsync(void**, size_t, Stream&, bool, int32_t = o2::gpu::GPUMemoryResource::MEMORY_GPU); // Abstract owned and unowned memory allocations on specific stream
@@ -275,19 +274,19 @@ inline std::vector<unsigned int> TimeFrameGPU<NLayers>::getClusterSizes()
275274
}
276275

277276
template <int NLayers>
278-
inline int TimeFrameGPU<NLayers>::getNumberOfTracklets() const
277+
inline size_t TimeFrameGPU<NLayers>::getNumberOfTracklets() const
279278
{
280279
return std::accumulate(mNTracklets.begin(), mNTracklets.end(), 0);
281280
}
282281

283282
template <int NLayers>
284-
inline int TimeFrameGPU<NLayers>::getNumberOfCells() const
283+
inline size_t TimeFrameGPU<NLayers>::getNumberOfCells() const
285284
{
286285
return std::accumulate(mNCells.begin(), mNCells.end(), 0);
287286
}
288287

289288
template <int NLayers>
290-
inline int TimeFrameGPU<NLayers>::getNumberOfNeighbours() const
289+
inline size_t TimeFrameGPU<NLayers>::getNumberOfNeighbours() const
291290
{
292291
return std::accumulate(mNNeighbours.begin(), mNNeighbours.end(), 0);
293292
}

Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackerTraitsGPU.h

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,16 +19,16 @@
1919
namespace o2::its
2020
{
2121

22-
template <int nLayers = 7>
23-
class TrackerTraitsGPU final : public TrackerTraits<nLayers>
22+
template <int NLayers = 7>
23+
class TrackerTraitsGPU final : public TrackerTraits<NLayers>
2424
{
25-
using typename TrackerTraits<nLayers>::IndexTableUtilsN;
25+
using typename TrackerTraits<NLayers>::IndexTableUtilsN;
2626

2727
public:
2828
TrackerTraitsGPU() = default;
2929
~TrackerTraitsGPU() final = default;
3030

31-
void adoptTimeFrame(TimeFrame<nLayers>* tf) final;
31+
void adoptTimeFrame(TimeFrame<NLayers>* tf) final;
3232
void initialiseTimeFrame(const int iteration) final;
3333

3434
void computeLayerTracklets(const int iteration, int) final;
@@ -48,7 +48,7 @@ class TrackerTraitsGPU final : public TrackerTraits<nLayers>
4848

4949
private:
5050
IndexTableUtilsN* mDeviceIndexTableUtils;
51-
gpu::TimeFrameGPU<nLayers>* mTimeFrameGPU;
51+
gpu::TimeFrameGPU<NLayers>* mTimeFrameGPU;
5252
};
5353

5454
} // namespace o2::its

Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,6 @@
1313
if(CUDA_ENABLED)
1414
find_package(CUDAToolkit)
1515
message(STATUS "Building ITS CUDA tracker")
16-
# add_compile_options(-O0 -g -lineinfo -fPIC -DGPU_FORCE_DEVICE_ASSERTS=ON)
17-
# add_compile_definitions(ITS_MEASURE_GPU_TIME)
18-
# add_compile_definitions(ITS_GPU_LOG)
1916
o2_add_library(ITStrackingCUDA
2017
SOURCES TrackerTraitsGPU.cxx
2118
TimeFrameGPU.cu
@@ -29,7 +26,13 @@ if(CUDA_ENABLED)
2926
PRIVATE_LINK_LIBRARIES O2::GPUTrackingCUDAExternalProvider
3027
TARGETVARNAME targetName)
3128

29+
set_target_gpu_arch("CUDA" ${targetName})
30+
# Enable relocatable device code (needed for separable compilation + debugging)
3231
set_property(TARGET ${targetName} PROPERTY CUDA_SEPARABLE_COMPILATION ON)
32+
# target_compile_options(${targetName} PRIVATE
33+
# $<$<COMPILE_LANGUAGE:CUDA>:-G;-O0;-Xptxas=-O0>
34+
# $<$<COMPILE_LANGUAGE:CXX>:-O0;-g>
35+
# )
36+
# target_compile_definitions(${targetName} PRIVATE ITS_MEASURE_GPU_TIME ITS_GPU_LOG)
3337
target_compile_definitions(${targetName} PRIVATE $<TARGET_PROPERTY:O2::ITStracking,COMPILE_DEFINITIONS>)
34-
set_target_gpu_arch("CUDA" ${targetName})
3538
endif()

Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu

Lines changed: 44 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -265,57 +265,58 @@ void TimeFrameGPU<NLayers>::loadVertices(const int iteration)
265265
{
266266
if (!iteration) {
267267
GPUTimer timer("loading seeding vertices");
268-
// GPULog("gpu-transfer: loading {} ROframes vertices, for {:.2f} MB.", this->mROFramesPV.size(), this->mROFramesPV.size() * sizeof(int) / constants::MB);
269-
// allocMem(reinterpret_cast<void**>(&mROFramesPVDevice), this->mROFramesPV.size() * sizeof(int), this->hasFrameworkAllocator());
270-
// GPUChkErrS(cudaMemcpy(mROFramesPVDevice, this->mROFramesPV.data(), this->mROFramesPV.size() * sizeof(int), cudaMemcpyHostToDevice));
271268
GPULog("gpu-transfer: loading {} seeding vertices, for {:.2f} MB.", this->mPrimaryVertices.size(), this->mPrimaryVertices.size() * sizeof(Vertex) / constants::MB);
272269
allocMem(reinterpret_cast<void**>(&mPrimaryVerticesDevice), this->mPrimaryVertices.size() * sizeof(Vertex), this->hasFrameworkAllocator());
273270
GPUChkErrS(cudaMemcpy(mPrimaryVerticesDevice, this->mPrimaryVertices.data(), this->mPrimaryVertices.size() * sizeof(Vertex), cudaMemcpyHostToDevice));
274271
}
275272
}
276273

277274
template <int NLayers>
278-
void TimeFrameGPU<NLayers>::loadROFOverlapTable()
275+
void TimeFrameGPU<NLayers>::loadROFOverlapTable(const int iteration)
279276
{
280-
GPUTimer timer("initialising device view of ROFOverlapTable");
281-
const auto& hostTable = this->getROFOverlapTable();
282-
const auto& hostView = this->getROFOverlapTableView();
283-
using TableEntry = ROFOverlapTable<NLayers>::TableEntry;
284-
using TableIndex = ROFOverlapTable<NLayers>::TableIndex;
285-
using LayerTiming = o2::its::LayerTiming;
286-
TableEntry* d_flatTable{nullptr};
287-
TableIndex* d_indices{nullptr};
288-
LayerTiming* d_layers{nullptr};
289-
size_t flatTableSize = hostTable.getFlatTableSize();
290-
allocMem(reinterpret_cast<void**>(&d_flatTable), flatTableSize * sizeof(TableEntry), this->hasFrameworkAllocator());
291-
GPUChkErrS(cudaMemcpy(d_flatTable, hostView.mFlatTable, flatTableSize * sizeof(TableEntry), cudaMemcpyHostToDevice));
292-
allocMem(reinterpret_cast<void**>(&d_indices), hostTable.getIndicesSize() * sizeof(TableIndex), this->hasFrameworkAllocator());
293-
GPUChkErrS(cudaMemcpy(d_indices, hostView.mIndices, hostTable.getIndicesSize() * sizeof(TableIndex), cudaMemcpyHostToDevice));
294-
allocMem(reinterpret_cast<void**>(&d_layers), NLayers * sizeof(LayerTiming), this->hasFrameworkAllocator());
295-
GPUChkErrS(cudaMemcpy(d_layers, hostView.mLayers, NLayers * sizeof(LayerTiming), cudaMemcpyHostToDevice));
296-
mDeviceROFOverlapTableView = hostTable.getDeviceView(d_flatTable, d_indices, d_layers);
277+
if (!iteration) {
278+
GPUTimer timer("initialising device view of ROFOverlapTable");
279+
const auto& hostTable = this->getROFOverlapTable();
280+
const auto& hostView = this->getROFOverlapTableView();
281+
using TableEntry = ROFOverlapTable<NLayers>::TableEntry;
282+
using TableIndex = ROFOverlapTable<NLayers>::TableIndex;
283+
using LayerTiming = o2::its::LayerTiming;
284+
TableEntry* d_flatTable{nullptr};
285+
TableIndex* d_indices{nullptr};
286+
LayerTiming* d_layers{nullptr};
287+
size_t flatTableSize = hostTable.getFlatTableSize();
288+
allocMem(reinterpret_cast<void**>(&d_flatTable), flatTableSize * sizeof(TableEntry), this->hasFrameworkAllocator());
289+
GPUChkErrS(cudaMemcpy(d_flatTable, hostView.mFlatTable, flatTableSize * sizeof(TableEntry), cudaMemcpyHostToDevice));
290+
allocMem(reinterpret_cast<void**>(&d_indices), hostTable.getIndicesSize() * sizeof(TableIndex), this->hasFrameworkAllocator());
291+
GPUChkErrS(cudaMemcpy(d_indices, hostView.mIndices, hostTable.getIndicesSize() * sizeof(TableIndex), cudaMemcpyHostToDevice));
292+
allocMem(reinterpret_cast<void**>(&d_layers), NLayers * sizeof(LayerTiming), this->hasFrameworkAllocator());
293+
GPUChkErrS(cudaMemcpy(d_layers, hostView.mLayers, NLayers * sizeof(LayerTiming), cudaMemcpyHostToDevice));
294+
mDeviceROFOverlapTableView = hostTable.getDeviceView(d_flatTable, d_indices, d_layers);
295+
}
297296
}
298297

299298
template <int NLayers>
300-
void TimeFrameGPU<NLayers>::loadROFVertexLookupTable()
299+
void TimeFrameGPU<NLayers>::loadROFVertexLookupTable(const int iteration)
301300
{
302-
GPUTimer timer("initialising device view of ROFVertexLookupTable");
303-
const auto& hostTable = this->getROFVertexLookupTable();
304-
const auto& hostView = this->getROFVertexLookupTableView();
305-
using TableEntry = ROFVertexLookupTable<NLayers>::TableEntry;
306-
using TableIndex = ROFVertexLookupTable<NLayers>::TableIndex;
307-
using LayerTiming = o2::its::LayerTiming;
308-
TableEntry* d_flatTable{nullptr};
309-
TableIndex* d_indices{nullptr};
310-
LayerTiming* d_layers{nullptr};
311-
size_t flatTableSize = hostTable.getFlatTableSize();
312-
allocMem(reinterpret_cast<void**>(&d_flatTable), flatTableSize * sizeof(TableEntry), this->hasFrameworkAllocator());
313-
GPUChkErrS(cudaMemcpy(d_flatTable, hostView.mFlatTable, flatTableSize * sizeof(TableEntry), cudaMemcpyHostToDevice));
314-
allocMem(reinterpret_cast<void**>(&d_indices), hostTable.getIndicesSize() * sizeof(TableIndex), this->hasFrameworkAllocator());
315-
GPUChkErrS(cudaMemcpy(d_indices, hostView.mIndices, hostTable.getIndicesSize() * sizeof(TableIndex), cudaMemcpyHostToDevice));
316-
allocMem(reinterpret_cast<void**>(&d_layers), NLayers * sizeof(LayerTiming), this->hasFrameworkAllocator());
317-
GPUChkErrS(cudaMemcpy(d_layers, hostView.mLayers, NLayers * sizeof(LayerTiming), cudaMemcpyHostToDevice));
318-
mDeviceROFVertexLookupTableView = hostTable.getDeviceView(d_flatTable, d_indices, d_layers);
301+
if (!iteration) {
302+
GPUTimer timer("initialising device view of ROFVertexLookupTable");
303+
const auto& hostTable = this->getROFVertexLookupTable();
304+
const auto& hostView = this->getROFVertexLookupTableView();
305+
using TableEntry = ROFVertexLookupTable<NLayers>::TableEntry;
306+
using TableIndex = ROFVertexLookupTable<NLayers>::TableIndex;
307+
using LayerTiming = o2::its::LayerTiming;
308+
TableEntry* d_flatTable{nullptr};
309+
TableIndex* d_indices{nullptr};
310+
LayerTiming* d_layers{nullptr};
311+
size_t flatTableSize = hostTable.getFlatTableSize();
312+
allocMem(reinterpret_cast<void**>(&d_flatTable), flatTableSize * sizeof(TableEntry), this->hasFrameworkAllocator());
313+
GPUChkErrS(cudaMemcpy(d_flatTable, hostView.mFlatTable, flatTableSize * sizeof(TableEntry), cudaMemcpyHostToDevice));
314+
allocMem(reinterpret_cast<void**>(&d_indices), hostTable.getIndicesSize() * sizeof(TableIndex), this->hasFrameworkAllocator());
315+
GPUChkErrS(cudaMemcpy(d_indices, hostView.mIndices, hostTable.getIndicesSize() * sizeof(TableIndex), cudaMemcpyHostToDevice));
316+
allocMem(reinterpret_cast<void**>(&d_layers), NLayers * sizeof(LayerTiming), this->hasFrameworkAllocator());
317+
GPUChkErrS(cudaMemcpy(d_layers, hostView.mLayers, NLayers * sizeof(LayerTiming), cudaMemcpyHostToDevice));
318+
mDeviceROFVertexLookupTableView = hostTable.getDeviceView(d_flatTable, d_indices, d_layers);
319+
}
319320
}
320321

321322
template <int NLayers>
@@ -373,6 +374,7 @@ void TimeFrameGPU<NLayers>::createTrackletsBuffers(const int layer)
373374
mGpuStreams[layer].sync(); // ensure number of tracklets is correct
374375
GPULog("gpu-transfer: creating tracklets buffer for {} elements on layer {}, for {:.2f} MB.", mNTracklets[layer], layer, mNTracklets[layer] * sizeof(Tracklet) / constants::MB);
375376
allocMemAsync(reinterpret_cast<void**>(&mTrackletsDevice[layer]), mNTracklets[layer] * sizeof(Tracklet), mGpuStreams[layer], this->hasFrameworkAllocator(), (o2::gpu::GPUMemoryResource::MEMORY_GPU | o2::gpu::GPUMemoryResource::MEMORY_STACK));
377+
GPUChkErrS(cudaMemsetAsync(mTrackletsDevice[layer], 0, mNTracklets[layer] * sizeof(Tracklet), mGpuStreams[layer].get()));
376378
GPUChkErrS(cudaMemcpyAsync(&mTrackletsDeviceArray[layer], &mTrackletsDevice[layer], sizeof(Tracklet*), cudaMemcpyHostToDevice, mGpuStreams[layer].get()));
377379
}
378380

@@ -468,6 +470,7 @@ void TimeFrameGPU<NLayers>::createCellsBuffers(const int layer)
468470
mGpuStreams[layer].sync(); // ensure number of cells is correct
469471
GPULog("gpu-transfer: creating cell buffer for {} elements on layer {}, for {:.2f} MB.", mNCells[layer], layer, mNCells[layer] * sizeof(CellSeedN) / constants::MB);
470472
allocMemAsync(reinterpret_cast<void**>(&mCellsDevice[layer]), mNCells[layer] * sizeof(CellSeedN), mGpuStreams[layer], this->hasFrameworkAllocator(), (o2::gpu::GPUMemoryResource::MEMORY_GPU | o2::gpu::GPUMemoryResource::MEMORY_STACK));
473+
GPUChkErrS(cudaMemsetAsync(mCellsDevice[layer], 0, mNCells[layer] * sizeof(CellSeedN), mGpuStreams[layer].get()));
471474
GPUChkErrS(cudaMemcpyAsync(&mCellsDeviceArray[layer], &mCellsDevice[layer], sizeof(CellSeedN*), cudaMemcpyHostToDevice, mGpuStreams[layer].get()));
472475
}
473476

@@ -637,11 +640,10 @@ void TimeFrameGPU<NLayers>::popMemoryStack(const int iteration)
637640
template <int NLayers>
638641
void TimeFrameGPU<NLayers>::initialise(const int iteration,
639642
const TrackingParameters& trkParam,
640-
const int maxLayers,
641-
IndexTableUtilsN* utils)
643+
const int maxLayers)
642644
{
643645
mGpuStreams.resize(NLayers);
644-
o2::its::TimeFrame<NLayers>::initialise(iteration, trkParam, maxLayers);
646+
o2::its::TimeFrame<NLayers>::initialise(iteration, trkParam, maxLayers, false);
645647
}
646648

647649
template <int NLayers>

0 commit comments

Comments
 (0)