Skip to content

Commit 80c4d14

Browse files
committed
ITS: GPU: resolve added TODOs
Signed-off-by: Felix Schlepper <felix.schlepper@cern.ch>
1 parent 7b7d8ff commit 80c4d14

File tree

2 files changed

+84
-41
lines changed

2 files changed

+84
-41
lines changed

Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -95,13 +95,11 @@ class Stream
9595
#if defined(__HIPCC__)
9696
using Handle = hipStream_t;
9797
static constexpr Handle DefaultStream = 0;
98-
// static constexpr unsigned int DefaultFlag = hipStreamNonBlocking; TODO replace once ready
99-
static constexpr unsigned int DefaultFlag = 0;
98+
static constexpr unsigned int DefaultFlag = hipStreamNonBlocking;
10099
#elif defined(__CUDACC__)
101100
using Handle = cudaStream_t;
102101
static constexpr Handle DefaultStream = 0;
103-
// static constexpr unsigned int DefaultFlag = cudaStreamNonBlocking; TODO replace once ready
104-
static constexpr unsigned int DefaultFlag = 0;
102+
static constexpr unsigned int DefaultFlag = cudaStreamNonBlocking;
105103
#else
106104
using Handle = void*;
107105
static constexpr Handle DefaultStream = nullptr;

Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu

Lines changed: 82 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -145,9 +145,7 @@ GPUd() bool fitTrack(TrackITSExt& track,
145145

146146
if (matCorrType == o2::base::PropagatorF::MatCorrType::USEMatCorrNONE) {
147147
const float xx0 = (iLayer > 2) ? 1.e-2f : 5.e-3f; // Rough layer thickness
148-
constexpr float radiationLength = 9.36f; // Radiation length of Si [cm]
149-
constexpr float density = 2.33f; // Density of Si [g/cm^3]
150-
if (!track.correctForMaterial(xx0, xx0 * radiationLength * density, true)) {
148+
if (!track.correctForMaterial(xx0, xx0 * constants::Radl * constants::Rho, true)) {
151149
return false;
152150
}
153151
}
@@ -728,13 +726,13 @@ GPUg() void processNeighboursKernel(const int layer,
728726
if (!seed.o2::track::TrackParCov::update(trHit.positionTrackingFrame, trHit.covarianceTrackingFrame)) {
729727
continue;
730728
}
731-
seed.getClusters()[layer - 1] = neighbourCell.getFirstClusterIndex();
732-
seed.setLevel(neighbourCell.getLevel());
733-
seed.setFirstTrackletIndex(neighbourCell.getFirstTrackletIndex());
734-
seed.setSecondTrackletIndex(neighbourCell.getSecondTrackletIndex());
735729
if constexpr (dryRun) {
736730
foundSeedsTable[iCurrentCell]++;
737731
} else {
732+
seed.getClusters()[layer - 1] = neighbourCell.getFirstClusterIndex();
733+
seed.setLevel(neighbourCell.getLevel());
734+
seed.setFirstTrackletIndex(neighbourCell.getFirstTrackletIndex());
735+
seed.setSecondTrackletIndex(neighbourCell.getSecondTrackletIndex());
738736
updatedCellsIds[foundSeedsTable[iCurrentCell] + foundSeeds] = neighbourCellId;
739737
updatedCellSeeds[foundSeedsTable[iCurrentCell] + foundSeeds] = seed;
740738
}
@@ -870,25 +868,35 @@ GPUg() void printCellSeeds(CellSeed* seed, int nCells, const unsigned int tId =
870868
}
871869
}
872870

873-
template <typename T>
874-
GPUhi() void cubExclusiveScanInPlace(T* in_out, int num_items, cudaStream_t stream = nullptr, ExternalAllocator* alloc = nullptr)
871+
GPUhi() void allocateMemory(void** p, size_t bytes, cudaStream_t stream = nullptr, ExternalAllocator* alloc = nullptr)
875872
{
876-
void* d_temp_storage = nullptr;
877-
size_t temp_storage_bytes = 0;
878-
GPUChkErrS(cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, in_out, in_out, num_items, stream));
879873
if (alloc) {
880-
d_temp_storage = alloc->allocate(temp_storage_bytes);
874+
*p = alloc->allocate(bytes);
881875
} else {
882-
GPUChkErrS(cudaMallocAsync(&d_temp_storage, temp_storage_bytes, stream));
876+
GPUChkErrS(cudaMallocAsync(p, bytes, stream));
883877
}
884-
GPUChkErrS(cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, in_out, in_out, num_items, stream));
878+
}
879+
880+
GPUhi() void deallocateMemory(void* p, size_t bytes, cudaStream_t stream = nullptr, ExternalAllocator* alloc = nullptr)
881+
{
885882
if (alloc) {
886-
alloc->deallocate(reinterpret_cast<char*>(d_temp_storage), temp_storage_bytes);
883+
alloc->deallocate(reinterpret_cast<char*>(p), bytes);
887884
} else {
888-
GPUChkErrS(cudaFreeAsync(d_temp_storage, stream));
885+
GPUChkErrS(cudaFreeAsync(p, stream));
889886
}
890887
}
891888

889+
template <typename T>
890+
GPUhi() void cubExclusiveScanInPlace(T* in_out, int num_items, cudaStream_t stream = nullptr, ExternalAllocator* alloc = nullptr)
891+
{
892+
void* d_temp_storage = nullptr;
893+
size_t temp_storage_bytes = 0;
894+
GPUChkErrS(cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, in_out, in_out, num_items, stream));
895+
allocateMemory(&d_temp_storage, temp_storage_bytes, stream, alloc);
896+
GPUChkErrS(cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, in_out, in_out, num_items, stream));
897+
deallocateMemory(d_temp_storage, temp_storage_bytes, stream, alloc);
898+
}
899+
892900
template <typename Vector>
893901
GPUhi() void cubExclusiveScanInPlace(Vector& in_out, int num_items, cudaStream_t stream = nullptr, ExternalAllocator* alloc = nullptr)
894902
{
@@ -901,21 +909,13 @@ GPUhi() void cubInclusiveScanInPlace(T* in_out, int num_items, cudaStream_t stre
901909
void* d_temp_storage = nullptr;
902910
size_t temp_storage_bytes = 0;
903911
GPUChkErrS(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, in_out, in_out, num_items, stream));
904-
if (alloc) {
905-
d_temp_storage = alloc->allocate(temp_storage_bytes);
906-
} else {
907-
GPUChkErrS(cudaMallocAsync(&d_temp_storage, temp_storage_bytes, stream));
908-
}
912+
allocateMemory(&d_temp_storage, temp_storage_bytes, stream, alloc);
909913
GPUChkErrS(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, in_out, in_out, num_items, stream));
910-
if (alloc) {
911-
alloc->deallocate(reinterpret_cast<char*>(d_temp_storage), temp_storage_bytes);
912-
} else {
913-
GPUChkErrS(cudaFreeAsync(d_temp_storage, stream));
914-
}
914+
deallocateMemory(d_temp_storage, temp_storage_bytes, stream, alloc);
915915
}
916916

917917
template <typename Vector>
918-
GPUhi() void cubInclusiveScanInPlace(Vector& in_out, int num_items, cudaStream_t stream = nullptr, ExternalAllocator* alloc = nullptr)
918+
GPUhi() void cubInclusiveScanInPlace(Vector& in_out, int num_items, cudaStream_t stream = nullptr, o2::its::ExternalAllocator* alloc = nullptr)
919919
{
920920
cubInclusiveScanInPlace(thrust::raw_pointer_cast(in_out.data()), num_items, stream, alloc);
921921
}
@@ -1048,13 +1048,30 @@ void computeTrackletsInROFsHandler(const IndexTableUtils* utils,
10481048
resolutions[iLayer],
10491049
radii[iLayer + 1] - radii[iLayer],
10501050
mulScatAng[iLayer]);
1051-
/// Internal thrust allocation serialize this part to a degree
1052-
/// TODO switch to cub equivelent and do all work on one stream
1053-
thrust::device_ptr<Tracklet> tracklets_ptr(spanTracklets[iLayer]);
1054-
auto nosync_policy = THRUST_NAMESPACE::par_nosync.on(streams[iLayer].get());
1055-
thrust::sort(nosync_policy, tracklets_ptr, tracklets_ptr + nTracklets[iLayer], gpu::sort_tracklets());
1056-
auto unique_end = thrust::unique(nosync_policy, tracklets_ptr, tracklets_ptr + nTracklets[iLayer], gpu::equal_tracklets());
1057-
nTracklets[iLayer] = unique_end - tracklets_ptr;
1051+
if (nTracklets[iLayer]) {
1052+
Tracklet *tracklets_in = spanTracklets[iLayer], *tracklets_out{nullptr};
1053+
size_t n = nTracklets[iLayer];
1054+
size_t sort_temp_bytes = 0;
1055+
GPUChkErrS(cub::DeviceMergeSort::SortKeys(nullptr, sort_temp_bytes, tracklets_in, n, gpu::sort_tracklets{}, streams[iLayer].get()));
1056+
void* sort_temp_storage = nullptr;
1057+
gpu::allocateMemory(&sort_temp_storage, sort_temp_bytes, streams[iLayer].get(), alloc);
1058+
GPUChkErrS(cub::DeviceMergeSort::SortKeys(sort_temp_storage, sort_temp_bytes, tracklets_in, n, gpu::sort_tracklets{}, streams[iLayer].get()));
1059+
gpu::allocateMemory(reinterpret_cast<void**>(&tracklets_out), n * sizeof(Tracklet), streams[iLayer].get(), alloc);
1060+
size_t unique_temp_bytes = 0;
1061+
int* num_selected = nullptr;
1062+
gpu::allocateMemory(reinterpret_cast<void**>(&num_selected), sizeof(int), streams[iLayer].get(), alloc);
1063+
GPUChkErrS(cub::DeviceSelect::Unique(nullptr, unique_temp_bytes, tracklets_in, tracklets_out, num_selected, n, streams[iLayer].get()));
1064+
void* unique_temp_storage = nullptr;
1065+
gpu::allocateMemory(&unique_temp_storage, unique_temp_bytes, streams[iLayer].get(), alloc);
1066+
GPUChkErrS(cub::DeviceSelect::Unique(unique_temp_storage, unique_temp_bytes, tracklets_in, tracklets_out, num_selected, n, streams[iLayer].get()));
1067+
GPUChkErrS(cudaMemcpyAsync(tracklets_in, tracklets_out, n * sizeof(Tracklet), cudaMemcpyDeviceToDevice, streams[iLayer].get()));
1068+
GPUChkErrS(cudaMemcpyAsync(&nTracklets[iLayer], num_selected, sizeof(int), cudaMemcpyDeviceToHost, streams[iLayer].get()));
1069+
streams[iLayer].sync();
1070+
gpu::deallocateMemory(tracklets_out, n * sizeof(Tracklet), streams[iLayer].get(), alloc);
1071+
gpu::deallocateMemory(sort_temp_storage, sort_temp_bytes, streams[iLayer].get(), alloc);
1072+
gpu::deallocateMemory(unique_temp_storage, unique_temp_bytes, streams[iLayer].get(), alloc);
1073+
gpu::deallocateMemory(num_selected, sizeof(int), streams[iLayer].get(), alloc);
1074+
}
10581075
if (iLayer > 0) {
10591076
GPUChkErrS(cudaMemsetAsync(trackletsLUTsHost[iLayer], 0, nClusters[iLayer] * sizeof(int), streams[iLayer].get()));
10601077
gpu::compileTrackletsLookupTableKernel<<<nBlocks, nThreads, 0, streams[iLayer].get()>>>(
@@ -1215,15 +1232,43 @@ int filterCellNeighboursHandler(gpuPair<int, int>* cellNeighbourPairs,
12151232
gpu::Stream& stream,
12161233
o2::its::ExternalAllocator* allocator)
12171234
{
1218-
/// Internal thrust allocation serialize this part to a degree
1219-
/// TODO switch to cub equivelent and do all work on one stream
1235+
#ifndef __HIPCC__
1236+
int* d_num_selected = nullptr;
1237+
gpu::allocateMemory(reinterpret_cast<void**>(&d_num_selected), sizeof(int), stream.get(), allocator);
1238+
size_t select_bytes = 0;
1239+
GPUChkErrS(cub::DeviceSelect::If(nullptr, select_bytes, cellNeighbourPairs, static_cast<gpuPair<int, int>*>(nullptr), d_num_selected, nNeigh, gpu::is_valid_pair<int, int>(), stream.get()));
1240+
void* select_temp = nullptr;
1241+
gpu::allocateMemory(&select_temp, select_bytes, stream.get(), allocator);
1242+
gpuPair<int, int>* d_temp_valid = nullptr;
1243+
gpu::allocateMemory(reinterpret_cast<void**>(&d_temp_valid), nNeigh * sizeof(gpuPair<int, int>), stream.get(), allocator);
1244+
GPUChkErrS(cub::DeviceSelect::If(select_temp, select_bytes, cellNeighbourPairs, d_temp_valid, d_num_selected, nNeigh, gpu::is_valid_pair<int, int>(), stream.get()));
1245+
int newSize = 0;
1246+
GPUChkErrS(cudaMemcpyAsync(&newSize, d_num_selected, sizeof(int), cudaMemcpyDeviceToHost, stream.get()));
1247+
stream.sync(); // needed to get newSize
1248+
size_t sort_bytes = 0;
1249+
GPUChkErrS(cub::DeviceMergeSort::SortPairs(nullptr, sort_bytes, d_temp_valid, d_temp_valid, newSize, gpu::sort_by_second<int, int>(), stream.get()));
1250+
void* sort_temp = nullptr;
1251+
gpu::allocateMemory(&sort_temp, sort_bytes, stream.get(), allocator);
1252+
GPUChkErrS(cub::DeviceMergeSort::SortPairs(sort_temp, sort_bytes, d_temp_valid, d_temp_valid, newSize, gpu::sort_by_second<int, int>(), stream.get()));
1253+
size_t transform_bytes = 0;
1254+
GPUChkErrS(cub::DeviceTransform::Transform(nullptr, transform_bytes, d_temp_valid, cellNeighbours, newSize, gpu::pair_to_first<int, int>(), stream.get()));
1255+
void* transform_temp = nullptr;
1256+
gpu::allocateMemory(&transform_temp, transform_bytes, stream.get(), allocator);
1257+
GPUChkErrS(cub::DeviceTransform::Transform(transform_temp, transform_bytes, d_temp_valid, cellNeighbours, newSize, gpu::pair_to_first<int, int>(), stream.get()));
1258+
gpu::deallocateMemory(transform_temp, transform_bytes, stream.get(), allocator);
1259+
gpu::deallocateMemory(d_temp_valid, newSize * sizeof(gpuPair<int, int>), stream.get(), allocator);
1260+
gpu::deallocateMemory(sort_temp, sort_bytes, stream.get(), allocator);
1261+
gpu::deallocateMemory(d_num_selected, sizeof(int), stream.get(), allocator);
1262+
gpu::deallocateMemory(select_temp, select_bytes, stream.get(), allocator);
1263+
#else // FIXME using thrust here since hipcub does not yet have DeviceTransform
12201264
auto nosync_policy = THRUST_NAMESPACE::par_nosync.on(stream.get());
12211265
thrust::device_ptr<gpuPair<int, int>> neighVectorPairs(cellNeighbourPairs);
12221266
thrust::device_ptr<int> validNeighs(cellNeighbours);
12231267
auto updatedEnd = thrust::remove_if(nosync_policy, neighVectorPairs, neighVectorPairs + nNeigh, gpu::is_invalid_pair<int, int>());
12241268
size_t newSize = updatedEnd - neighVectorPairs;
12251269
thrust::stable_sort(nosync_policy, neighVectorPairs, neighVectorPairs + newSize, gpu::sort_by_second<int, int>());
12261270
thrust::transform(nosync_policy, neighVectorPairs, neighVectorPairs + newSize, validNeighs, gpu::pair_to_first<int, int>());
1271+
#endif
12271272

12281273
return newSize;
12291274
}

0 commit comments

Comments
 (0)