Skip to content

Commit a2319b2

Browse files
committed
ITS: GPU: added launch bounds for ITS kernels, not fully optimised for MI50
1 parent 17ae0d0 commit a2319b2

File tree

2 files changed

+31
-29
lines changed

2 files changed

+31
-29
lines changed

Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu

Lines changed: 26 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -311,7 +311,7 @@ GPUdii() gpuSpan<const Cluster> getClustersOnLayer(const int rof,
311311
}
312312

313313
template <int nLayers>
314-
GPUg() void fitTrackSeedsKernel(
314+
GPUg() void __launch_bounds__(256, 1) fitTrackSeedsKernel(
315315
CellSeed<nLayers>* trackSeeds,
316316
const TrackingFrameInfo** foundTrackingFrameInfo,
317317
o2::its::TrackITSExt* tracks,
@@ -374,7 +374,7 @@ GPUg() void fitTrackSeedsKernel(
374374
}
375375

376376
template <bool initRun, int nLayers = 7>
377-
GPUg() void computeLayerCellNeighboursKernel(
377+
GPUg() void __launch_bounds__(256, 1) computeLayerCellNeighboursKernel(
378378
CellSeed<nLayers>** cellSeedArray,
379379
int* neighboursLUT,
380380
int* neighboursIndexTable,
@@ -438,7 +438,7 @@ GPUg() void computeLayerCellNeighboursKernel(
438438
}
439439

440440
template <bool initRun, int nLayers>
441-
GPUg() void computeLayerCellsKernel(
441+
GPUg() void __launch_bounds__(256, 1) computeLayerCellsKernel(
442442
const Cluster** sortedClusters,
443443
const Cluster** unsortedClusters,
444444
const TrackingFrameInfo** tfInfo,
@@ -525,7 +525,7 @@ GPUg() void computeLayerCellsKernel(
525525
}
526526

527527
template <bool initRun, int nLayers>
528-
GPUg() void computeLayerTrackletsMultiROFKernel(
528+
GPUg() void __launch_bounds__(256, 1) computeLayerTrackletsMultiROFKernel(
529529
const IndexTableUtils<nLayers>* utils,
530530
const uint8_t* multMask,
531531
const int layerIndex,
@@ -652,33 +652,35 @@ GPUg() void computeLayerTrackletsMultiROFKernel(
652652
}
653653
}
654654

655-
GPUg() void compileTrackletsLookupTableKernel(const Tracklet* tracklets,
656-
int* trackletsLookUpTable,
657-
const int nTracklets)
655+
GPUg() void __launch_bounds__(256, 1) compileTrackletsLookupTableKernel(
656+
const Tracklet* tracklets,
657+
int* trackletsLookUpTable,
658+
const int nTracklets)
658659
{
659660
for (int currentTrackletIndex = blockIdx.x * blockDim.x + threadIdx.x; currentTrackletIndex < nTracklets; currentTrackletIndex += blockDim.x * gridDim.x) {
660661
atomicAdd(&trackletsLookUpTable[tracklets[currentTrackletIndex].firstClusterIndex], 1);
661662
}
662663
}
663664

664665
template <bool dryRun, int nLayers = 7>
665-
GPUg() void processNeighboursKernel(const int layer,
666-
const int level,
667-
CellSeed<nLayers>** allCellSeeds,
668-
CellSeed<nLayers>* currentCellSeeds,
669-
const int* currentCellIds,
670-
const unsigned int nCurrentCells,
671-
CellSeed<nLayers>* updatedCellSeeds,
672-
int* updatedCellsIds,
673-
int* foundSeedsTable, // auxiliary only in GPU code to compute the number of cells per iteration
674-
const unsigned char** usedClusters, // Used clusters
675-
int* neighbours,
676-
int* neighboursLUT,
677-
const TrackingFrameInfo** foundTrackingFrameInfo,
678-
const float bz,
679-
const float maxChi2ClusterAttachment,
680-
const o2::base::Propagator* propagator,
681-
const o2::base::PropagatorF::MatCorrType matCorrType)
666+
GPUg() void __launch_bounds__(256, 1) processNeighboursKernel(
667+
const int layer,
668+
const int level,
669+
CellSeed<nLayers>** allCellSeeds,
670+
CellSeed<nLayers>* currentCellSeeds,
671+
const int* currentCellIds,
672+
const unsigned int nCurrentCells,
673+
CellSeed<nLayers>* updatedCellSeeds,
674+
int* updatedCellsIds,
675+
int* foundSeedsTable, // auxiliary only in GPU code to compute the number of cells per iteration
676+
const unsigned char** usedClusters, // Used clusters
677+
int* neighbours,
678+
int* neighboursLUT,
679+
const TrackingFrameInfo** foundTrackingFrameInfo,
680+
const float bz,
681+
const float maxChi2ClusterAttachment,
682+
const o2::base::Propagator* propagator,
683+
const o2::base::PropagatorF::MatCorrType matCorrType)
682684
{
683685
constexpr float layerxX0[7] = {5.e-3f, 5.e-3f, 5.e-3f, 1.e-2f, 1.e-2f, 1.e-2f, 1.e-2f}; // Hardcoded here for the moment.
684686
for (unsigned int iCurrentCell = blockIdx.x * blockDim.x + threadIdx.x; iCurrentCell < nCurrentCells; iCurrentCell += blockDim.x * gridDim.x) {

Detectors/ITSMFT/ITS/tracking/include/ITStracking/TrackingConfigParam.h

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -121,19 +121,19 @@ struct ITSGpuTrackingParamConfig : public o2::conf::ConfigurableParamHelper<ITSG
121121
void maybeOverride() const;
122122

123123
/// Individual kernel launch parameter for each iteration
124-
int nBlocksLayerTracklets[MaxIter] = {30, 30, 30, 30};
124+
int nBlocksLayerTracklets[MaxIter] = {60, 60, 60, 60};
125125
int nThreadsLayerTracklets[MaxIter] = {256, 256, 256, 256};
126126

127-
int nBlocksLayerCells[MaxIter] = {30, 30, 30, 30};
127+
int nBlocksLayerCells[MaxIter] = {60, 60, 60, 60};
128128
int nThreadsLayerCells[MaxIter] = {256, 256, 256, 256};
129129

130-
int nBlocksFindNeighbours[MaxIter] = {30, 30, 30, 30};
130+
int nBlocksFindNeighbours[MaxIter] = {60, 60, 60, 60};
131131
int nThreadsFindNeighbours[MaxIter] = {256, 256, 256, 256};
132132

133-
int nBlocksProcessNeighbours[MaxIter] = {30, 30, 30, 30};
133+
int nBlocksProcessNeighbours[MaxIter] = {60, 60, 60, 60};
134134
int nThreadsProcessNeighbours[MaxIter] = {256, 256, 256, 256};
135135

136-
int nBlocksTracksSeeds[MaxIter] = {30, 30, 30, 30};
136+
int nBlocksTracksSeeds[MaxIter] = {60, 60, 60, 60};
137137
int nThreadsTracksSeeds[MaxIter] = {256, 256, 256, 256};
138138

139139
O2ParamDef(ITSGpuTrackingParamConfig, "ITSGpuTrackingParam");

0 commit comments

Comments
 (0)