@@ -145,9 +145,7 @@ GPUd() bool fitTrack(TrackITSExt& track,
 
   if (matCorrType == o2::base::PropagatorF::MatCorrType::USEMatCorrNONE) {
     const float xx0 = (iLayer > 2) ? 1.e-2f : 5.e-3f; // Rough layer thickness
-    constexpr float radiationLength = 9.36f; // Radiation length of Si [cm]
-    constexpr float density = 2.33f;         // Density of Si [g/cm^3]
-    if (!track.correctForMaterial(xx0, xx0 * radiationLength * density, true)) {
+    if (!track.correctForMaterial(xx0, xx0 * constants::Radl * constants::Rho, true)) {
       return false;
     }
   }
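
This removes duplicated magic numbers from `fitTrack()` in favour of shared constants. For reference, a minimal sketch of what those definitions look like, using the silicon values from the removed lines (the actual O2 header and namespace path may differ):

```cpp
// Hypothetical sketch of the shared constants; values taken from the
// lines removed above, location and exact spelling are assumptions.
namespace constants
{
constexpr float Radl = 9.36f; // Radiation length of Si [cm]
constexpr float Rho = 2.33f;  // Density of Si [g/cm^3]
} // namespace constants
```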
@@ -728,13 +726,13 @@ GPUg() void processNeighboursKernel(const int layer,
     if (!seed.o2::track::TrackParCov::update(trHit.positionTrackingFrame, trHit.covarianceTrackingFrame)) {
       continue;
     }
-    seed.getClusters()[layer - 1] = neighbourCell.getFirstClusterIndex();
-    seed.setLevel(neighbourCell.getLevel());
-    seed.setFirstTrackletIndex(neighbourCell.getFirstTrackletIndex());
-    seed.setSecondTrackletIndex(neighbourCell.getSecondTrackletIndex());
     if constexpr (dryRun) {
       foundSeedsTable[iCurrentCell]++;
     } else {
+      seed.getClusters()[layer - 1] = neighbourCell.getFirstClusterIndex();
+      seed.setLevel(neighbourCell.getLevel());
+      seed.setFirstTrackletIndex(neighbourCell.getFirstTrackletIndex());
+      seed.setSecondTrackletIndex(neighbourCell.getSecondTrackletIndex());
       updatedCellsIds[foundSeedsTable[iCurrentCell] + foundSeeds] = neighbourCellId;
       updatedCellSeeds[foundSeedsTable[iCurrentCell] + foundSeeds] = seed;
     }
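
Deferring the cluster and tracklet bookkeeping to the `else` branch matters because this kernel is instantiated twice: a `dryRun` pass that only counts candidates per cell, and a fill pass that actually writes them. The counting pass gains nothing from mutating the seed. A minimal sketch of that count-then-fill idiom with illustrative names (not the O2 kernel itself; reset the counter between the two launches):

```cpp
// Two-pass pattern: launch with dryRun=true to size the output,
// then with dryRun=false to write. Mutation happens only when writing.
template <bool dryRun>
__global__ void selectKernel(const int* in, int nItems, int* count, int* out)
{
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < nItems;
       i += blockDim.x * gridDim.x) {
    if (in[i] > 0) { // stand-in for the real seed-update logic
      if constexpr (dryRun) {
        atomicAdd(count, 1); // first pass: count only
      } else {
        out[atomicAdd(count, 1)] = in[i]; // second pass: actually write
      }
    }
  }
}
```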
@@ -870,25 +868,35 @@ GPUg() void printCellSeeds(CellSeed* seed, int nCells, const unsigned int tId =
   }
 }
 
-template <typename T>
-GPUhi() void cubExclusiveScanInPlace(T* in_out, int num_items, cudaStream_t stream = nullptr, ExternalAllocator* alloc = nullptr)
+GPUhi() void allocateMemory(void** p, size_t bytes, cudaStream_t stream = nullptr, ExternalAllocator* alloc = nullptr)
 {
-  void* d_temp_storage = nullptr;
-  size_t temp_storage_bytes = 0;
-  GPUChkErrS(cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, in_out, in_out, num_items, stream));
   if (alloc) {
-    d_temp_storage = alloc->allocate(temp_storage_bytes);
+    *p = alloc->allocate(bytes);
   } else {
-    GPUChkErrS(cudaMallocAsync(&d_temp_storage, temp_storage_bytes, stream));
+    GPUChkErrS(cudaMallocAsync(p, bytes, stream));
   }
-  GPUChkErrS(cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, in_out, in_out, num_items, stream));
+}
+
+GPUhi() void deallocateMemory(void* p, size_t bytes, cudaStream_t stream = nullptr, ExternalAllocator* alloc = nullptr)
+{
   if (alloc) {
-    alloc->deallocate(reinterpret_cast<char*>(d_temp_storage), temp_storage_bytes);
+    alloc->deallocate(reinterpret_cast<char*>(p), bytes);
   } else {
-    GPUChkErrS(cudaFreeAsync(d_temp_storage, stream));
+    GPUChkErrS(cudaFreeAsync(p, stream));
   }
 }
 
+template <typename T>
+GPUhi() void cubExclusiveScanInPlace(T* in_out, int num_items, cudaStream_t stream = nullptr, ExternalAllocator* alloc = nullptr)
+{
+  void* d_temp_storage = nullptr;
+  size_t temp_storage_bytes = 0;
+  GPUChkErrS(cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, in_out, in_out, num_items, stream));
+  allocateMemory(&d_temp_storage, temp_storage_bytes, stream, alloc);
+  GPUChkErrS(cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, in_out, in_out, num_items, stream));
+  deallocateMemory(d_temp_storage, temp_storage_bytes, stream, alloc);
+}
+
 template <typename Vector>
 GPUhi() void cubExclusiveScanInPlace(Vector& in_out, int num_items, cudaStream_t stream = nullptr, ExternalAllocator* alloc = nullptr)
 {
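
The new `allocateMemory()`/`deallocateMemory()` helpers factor out the choice between the external allocator and stream-ordered CUDA allocation. They exist because every CUB device algorithm follows the same two-phase protocol: a first call with a null temp pointer that only reports the scratch size, then a second identical call that does the work. A self-contained sketch of that protocol without the O2 wrappers (error checking omitted for brevity):

```cpp
#include <cub/cub.cuh>

// Plain CUDA illustration of the CUB temp-storage protocol wrapped above.
void exclusiveScanInPlace(int* d_data, int n, cudaStream_t stream)
{
  void* d_temp = nullptr;
  size_t temp_bytes = 0;
  // First call: d_temp == nullptr, so CUB only fills in temp_bytes.
  cub::DeviceScan::ExclusiveSum(d_temp, temp_bytes, d_data, d_data, n, stream);
  cudaMallocAsync(&d_temp, temp_bytes, stream);
  // Second call: same arguments, now the scan actually runs.
  cub::DeviceScan::ExclusiveSum(d_temp, temp_bytes, d_data, d_data, n, stream);
  cudaFreeAsync(d_temp, stream);
}
```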
@@ -901,21 +909,13 @@ GPUhi() void cubInclusiveScanInPlace(T* in_out, int num_items, cudaStream_t stre
   void* d_temp_storage = nullptr;
   size_t temp_storage_bytes = 0;
   GPUChkErrS(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, in_out, in_out, num_items, stream));
-  if (alloc) {
-    d_temp_storage = alloc->allocate(temp_storage_bytes);
-  } else {
-    GPUChkErrS(cudaMallocAsync(&d_temp_storage, temp_storage_bytes, stream));
-  }
+  allocateMemory(&d_temp_storage, temp_storage_bytes, stream, alloc);
   GPUChkErrS(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, in_out, in_out, num_items, stream));
-  if (alloc) {
-    alloc->deallocate(reinterpret_cast<char*>(d_temp_storage), temp_storage_bytes);
-  } else {
-    GPUChkErrS(cudaFreeAsync(d_temp_storage, stream));
-  }
+  deallocateMemory(d_temp_storage, temp_storage_bytes, stream, alloc);
 }
 
 template <typename Vector>
-GPUhi() void cubInclusiveScanInPlace(Vector& in_out, int num_items, cudaStream_t stream = nullptr, ExternalAllocator* alloc = nullptr)
+GPUhi() void cubInclusiveScanInPlace(Vector& in_out, int num_items, cudaStream_t stream = nullptr, o2::its::ExternalAllocator* alloc = nullptr)
 {
   cubInclusiveScanInPlace(thrust::raw_pointer_cast(in_out.data()), num_items, stream, alloc);
 }
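
For completeness, a hedged usage sketch of the `Vector` overloads. Since they only unwrap the raw pointer, a `thrust::device_vector` can be scanned in place on a chosen stream (the `gpu::` qualification is an assumption based on the calls later in this diff, and the helper header is assumed to be included):

```cpp
#include <thrust/device_vector.h>

void scanExample(cudaStream_t stream)
{
  thrust::device_vector<int> counts(1024, 1);
  // Exclusive prefix sum of all-ones: counts[i] becomes i.
  gpu::cubExclusiveScanInPlace(counts, static_cast<int>(counts.size()), stream);
  cudaStreamSynchronize(stream); // make completion observable before reading counts
}
```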
@@ -1048,13 +1048,30 @@ void computeTrackletsInROFsHandler(const IndexTableUtils* utils,
                                    resolutions[iLayer],
                                    radii[iLayer + 1] - radii[iLayer],
                                    mulScatAng[iLayer]);
-    /// Internal thrust allocation serialize this part to a degree
-    /// TODO switch to cub equivelent and do all work on one stream
-    thrust::device_ptr<Tracklet> tracklets_ptr(spanTracklets[iLayer]);
-    auto nosync_policy = THRUST_NAMESPACE::par_nosync.on(streams[iLayer].get());
-    thrust::sort(nosync_policy, tracklets_ptr, tracklets_ptr + nTracklets[iLayer], gpu::sort_tracklets());
-    auto unique_end = thrust::unique(nosync_policy, tracklets_ptr, tracklets_ptr + nTracklets[iLayer], gpu::equal_tracklets());
-    nTracklets[iLayer] = unique_end - tracklets_ptr;
+    if (nTracklets[iLayer]) {
+      Tracklet *tracklets_in = spanTracklets[iLayer], *tracklets_out{nullptr};
+      size_t n = nTracklets[iLayer];
+      size_t sort_temp_bytes = 0;
+      GPUChkErrS(cub::DeviceMergeSort::SortKeys(nullptr, sort_temp_bytes, tracklets_in, n, gpu::sort_tracklets{}, streams[iLayer].get()));
+      void* sort_temp_storage = nullptr;
+      gpu::allocateMemory(&sort_temp_storage, sort_temp_bytes, streams[iLayer].get(), alloc);
+      GPUChkErrS(cub::DeviceMergeSort::SortKeys(sort_temp_storage, sort_temp_bytes, tracklets_in, n, gpu::sort_tracklets{}, streams[iLayer].get()));
+      gpu::allocateMemory(reinterpret_cast<void**>(&tracklets_out), n * sizeof(Tracklet), streams[iLayer].get(), alloc);
+      size_t unique_temp_bytes = 0;
+      int* num_selected = nullptr;
+      gpu::allocateMemory(reinterpret_cast<void**>(&num_selected), sizeof(int), streams[iLayer].get(), alloc);
+      GPUChkErrS(cub::DeviceSelect::Unique(nullptr, unique_temp_bytes, tracklets_in, tracklets_out, num_selected, n, streams[iLayer].get()));
+      void* unique_temp_storage = nullptr;
+      gpu::allocateMemory(&unique_temp_storage, unique_temp_bytes, streams[iLayer].get(), alloc);
+      GPUChkErrS(cub::DeviceSelect::Unique(unique_temp_storage, unique_temp_bytes, tracklets_in, tracklets_out, num_selected, n, streams[iLayer].get()));
+      GPUChkErrS(cudaMemcpyAsync(tracklets_in, tracklets_out, n * sizeof(Tracklet), cudaMemcpyDeviceToDevice, streams[iLayer].get()));
+      GPUChkErrS(cudaMemcpyAsync(&nTracklets[iLayer], num_selected, sizeof(int), cudaMemcpyDeviceToHost, streams[iLayer].get()));
+      streams[iLayer].sync();
+      gpu::deallocateMemory(tracklets_out, n * sizeof(Tracklet), streams[iLayer].get(), alloc);
+      gpu::deallocateMemory(sort_temp_storage, sort_temp_bytes, streams[iLayer].get(), alloc);
+      gpu::deallocateMemory(unique_temp_storage, unique_temp_bytes, streams[iLayer].get(), alloc);
+      gpu::deallocateMemory(num_selected, sizeof(int), streams[iLayer].get(), alloc);
+    }
     if (iLayer > 0) {
       GPUChkErrS(cudaMemsetAsync(trackletsLUTsHost[iLayer], 0, nClusters[iLayer] * sizeof(int), streams[iLayer].get()));
       gpu::compileTrackletsLookupTableKernel<<<nBlocks, nThreads, 0, streams[iLayer].get()>>>(
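
This replaces the thrust sort/unique, whose internal allocations partly serialized the layers, with the CUB equivalents so all work stays on the per-layer stream. A self-contained analogue on plain ints (illustrative comparator and names; note that `cub::DeviceSelect::Unique` drops only adjacent duplicates via `operator==`, which is why the keys must be sorted first):

```cpp
#include <cub/cub.cuh>

struct IntLess {
  __device__ bool operator()(const int& a, const int& b) const { return a < b; }
};

// Sort-then-unique, mirroring the tracklet deduplication above.
void sortUnique(int* d_in, int* d_out, int* d_num_out, int n, cudaStream_t stream)
{
  size_t sort_bytes = 0, unique_bytes = 0;
  void *sort_temp = nullptr, *unique_temp = nullptr;
  cub::DeviceMergeSort::SortKeys(nullptr, sort_bytes, d_in, n, IntLess{}, stream);
  cudaMallocAsync(&sort_temp, sort_bytes, stream);
  cub::DeviceMergeSort::SortKeys(sort_temp, sort_bytes, d_in, n, IntLess{}, stream);
  // Unique keeps the first element of each run of equal neighbours.
  cub::DeviceSelect::Unique(nullptr, unique_bytes, d_in, d_out, d_num_out, n, stream);
  cudaMallocAsync(&unique_temp, unique_bytes, stream);
  cub::DeviceSelect::Unique(unique_temp, unique_bytes, d_in, d_out, d_num_out, n, stream);
  cudaFreeAsync(sort_temp, stream);
  cudaFreeAsync(unique_temp, stream);
}
```

One host-side cost remains in the handler: the deduplicated count lives on the device, so it still copies it back and syncs the stream before `nTracklets[iLayer]` can be used.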
@@ -1215,15 +1232,43 @@ int filterCellNeighboursHandler(gpuPair<int, int>* cellNeighbourPairs,
                                 gpu::Stream& stream,
                                 o2::its::ExternalAllocator* allocator)
 {
-  /// Internal thrust allocation serialize this part to a degree
-  /// TODO switch to cub equivelent and do all work on one stream
+#ifndef __HIPCC__
+  int* d_num_selected = nullptr;
+  gpu::allocateMemory(reinterpret_cast<void**>(&d_num_selected), sizeof(int), stream.get(), allocator);
+  size_t select_bytes = 0;
+  GPUChkErrS(cub::DeviceSelect::If(nullptr, select_bytes, cellNeighbourPairs, static_cast<gpuPair<int, int>*>(nullptr), d_num_selected, nNeigh, gpu::is_valid_pair<int, int>(), stream.get()));
+  void* select_temp = nullptr;
+  gpu::allocateMemory(&select_temp, select_bytes, stream.get(), allocator);
+  gpuPair<int, int>* d_temp_valid = nullptr;
+  gpu::allocateMemory(reinterpret_cast<void**>(&d_temp_valid), nNeigh * sizeof(gpuPair<int, int>), stream.get(), allocator);
+  GPUChkErrS(cub::DeviceSelect::If(select_temp, select_bytes, cellNeighbourPairs, d_temp_valid, d_num_selected, nNeigh, gpu::is_valid_pair<int, int>(), stream.get()));
+  int newSize = 0;
+  GPUChkErrS(cudaMemcpyAsync(&newSize, d_num_selected, sizeof(int), cudaMemcpyDeviceToHost, stream.get()));
+  stream.sync(); // needed to get newSize
+  size_t sort_bytes = 0;
+  GPUChkErrS(cub::DeviceMergeSort::SortPairs(nullptr, sort_bytes, d_temp_valid, d_temp_valid, newSize, gpu::sort_by_second<int, int>(), stream.get()));
+  void* sort_temp = nullptr;
+  gpu::allocateMemory(&sort_temp, sort_bytes, stream.get(), allocator);
+  GPUChkErrS(cub::DeviceMergeSort::SortPairs(sort_temp, sort_bytes, d_temp_valid, d_temp_valid, newSize, gpu::sort_by_second<int, int>(), stream.get()));
+  size_t transform_bytes = 0;
+  GPUChkErrS(cub::DeviceTransform::Transform(nullptr, transform_bytes, d_temp_valid, cellNeighbours, newSize, gpu::pair_to_first<int, int>(), stream.get()));
+  void* transform_temp = nullptr;
+  gpu::allocateMemory(&transform_temp, transform_bytes, stream.get(), allocator);
+  GPUChkErrS(cub::DeviceTransform::Transform(transform_temp, transform_bytes, d_temp_valid, cellNeighbours, newSize, gpu::pair_to_first<int, int>(), stream.get()));
+  gpu::deallocateMemory(transform_temp, transform_bytes, stream.get(), allocator);
+  gpu::deallocateMemory(d_temp_valid, newSize * sizeof(gpuPair<int, int>), stream.get(), allocator);
+  gpu::deallocateMemory(sort_temp, sort_bytes, stream.get(), allocator);
+  gpu::deallocateMemory(d_num_selected, sizeof(int), stream.get(), allocator);
+  gpu::deallocateMemory(select_temp, select_bytes, stream.get(), allocator);
+#else // FIXME using thrust here since hipcub does not yet have DeviceTransform
   auto nosync_policy = THRUST_NAMESPACE::par_nosync.on(stream.get());
   thrust::device_ptr<gpuPair<int, int>> neighVectorPairs(cellNeighbourPairs);
   thrust::device_ptr<int> validNeighs(cellNeighbours);
   auto updatedEnd = thrust::remove_if(nosync_policy, neighVectorPairs, neighVectorPairs + nNeigh, gpu::is_invalid_pair<int, int>());
   size_t newSize = updatedEnd - neighVectorPairs;
   thrust::stable_sort(nosync_policy, neighVectorPairs, neighVectorPairs + newSize, gpu::sort_by_second<int, int>());
   thrust::transform(nosync_policy, neighVectorPairs, neighVectorPairs + newSize, validNeighs, gpu::pair_to_first<int, int>());
+#endif
 
   return newSize;
 }
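
On CUDA this handler now runs the whole select / sort / project sequence through CUB on the caller's stream; the thrust path survives only for HIP, as the FIXME notes, until hipCUB gains DeviceTransform. A minimal sketch of the stream-ordered compaction step in isolation (illustrative predicate and names, not the O2 code):

```cpp
#include <cub/cub.cuh>

struct IsPositive {
  __device__ bool operator()(const int& x) const { return x > 0; }
};

// Compact d_in into d_out and return the number of survivors. As in the
// handler above, the count must be copied back and the stream synchronized
// before the host can size any follow-up work.
int compactPositives(const int* d_in, int* d_out, int n, cudaStream_t stream)
{
  int* d_count = nullptr;
  cudaMallocAsync(reinterpret_cast<void**>(&d_count), sizeof(int), stream);
  void* temp = nullptr;
  size_t temp_bytes = 0;
  cub::DeviceSelect::If(nullptr, temp_bytes, d_in, d_out, d_count, n, IsPositive{}, stream);
  cudaMallocAsync(&temp, temp_bytes, stream);
  cub::DeviceSelect::If(temp, temp_bytes, d_in, d_out, d_count, n, IsPositive{}, stream);
  int h_count = 0;
  cudaMemcpyAsync(&h_count, d_count, sizeof(int), cudaMemcpyDeviceToHost, stream);
  cudaStreamSynchronize(stream); // h_count is valid only after this point
  cudaFreeAsync(temp, stream);
  cudaFreeAsync(d_count, stream);
  return h_count;
}
```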