@@ -24,14 +24,14 @@ using namespace o2::gpu;
2424void GPUChainTracking::RunTPCTrackingMerger_MergeBorderTracks (int8_t withinSlice, int8_t mergeMode, GPUReconstruction::krnlDeviceType deviceType)
2525{
2626 GPUTPCGMMerger& Merger = processors ()->tpcMerger ;
27- bool doGPUall = GetRecoStepsGPU () & RecoStep::TPCMerging && GetProcessingSettings (). fullMergerOnGPU ;
28- GPUTPCGMMerger& MergerShadow = doGPUall ? processorsShadow ()->tpcMerger : Merger;
27+ bool doGPU = GetRecoStepsGPU () & RecoStep::TPCMerging;
28+ GPUTPCGMMerger& MergerShadow = doGPU ? processorsShadow ()->tpcMerger : Merger;
2929 if (GetProcessingSettings ().deterministicGPUReconstruction ) {
3030 uint32_t nBorderTracks = withinSlice == 1 ? NSLICES : (2 * NSLICES);
3131 runKernel<GPUTPCGlobalDebugSortKernels, GPUTPCGlobalDebugSortKernels::borderTracks>({{nBorderTracks, -WarpSize (), 0 , deviceType}}, 0 );
3232 }
3333 uint32_t n = withinSlice == -1 ? NSLICES / 2 : NSLICES;
34- if (GetProcessingSettings ().alternateBorderSort && (!mRec ->IsGPU () || doGPUall )) {
34+ if (GetProcessingSettings ().alternateBorderSort && (!mRec ->IsGPU () || doGPU )) {
3535 TransferMemoryResourceLinkToHost (RecoStep::TPCMerging, Merger.MemoryResMemory (), 0 , &mEvents ->init );
3636 RecordMarker (&mEvents ->single , 0 );
3737 for (uint32_t i = 0 ; i < n; i++) {
@@ -72,7 +72,7 @@ void GPUChainTracking::RunTPCTrackingMerger_MergeBorderTracks(int8_t withinSlice
7272 runKernel<GPUTPCGMMergerMergeBorders, 2 >(GetGridAuto (0 , deviceType), i, withinSlice, mergeMode);
7373 }
7474 }
75- DoDebugAndDump (RecoStep::TPCMerging, 2048 , doGPUall , Merger, &GPUTPCGMMerger::DumpMergeRanges, *mDebugFile , withinSlice, mergeMode);
75+ DoDebugAndDump (RecoStep::TPCMerging, 2048 , doGPU , Merger, &GPUTPCGMMerger::DumpMergeRanges, *mDebugFile , withinSlice, mergeMode);
7676 mRec ->ReturnVolatileDeviceMemory ();
7777}
7878
@@ -89,12 +89,11 @@ int32_t GPUChainTracking::RunTPCTrackingMerger(bool synchronizeOutput)
8989{
9090 mRec ->PushNonPersistentMemory (qStr2Tag (" TPCMERGE" ));
9191 bool doGPU = GetRecoStepsGPU () & RecoStep::TPCMerging;
92- bool doGPUall = doGPU && GetProcessingSettings ().fullMergerOnGPU ;
93- GPUReconstruction::krnlDeviceType deviceType = doGPUall ? GPUReconstruction::krnlDeviceType::Auto : GPUReconstruction::krnlDeviceType::CPU;
94- uint32_t numBlocks = (!mRec ->IsGPU () || doGPUall) ? BlockCount () : 1 ;
92+ GPUReconstruction::krnlDeviceType deviceType = doGPU ? GPUReconstruction::krnlDeviceType::Auto : GPUReconstruction::krnlDeviceType::CPU;
93+ uint32_t numBlocks = (!mRec ->IsGPU () || doGPU) ? BlockCount () : 1 ;
9594 GPUTPCGMMerger& Merger = processors ()->tpcMerger ;
9695 GPUTPCGMMerger& MergerShadow = doGPU ? processorsShadow ()->tpcMerger : Merger;
97- GPUTPCGMMerger& MergerShadowAll = doGPUall ? processorsShadow ()->tpcMerger : Merger;
96+ GPUTPCGMMerger& MergerShadowAll = doGPU ? processorsShadow ()->tpcMerger : Merger;
9897 const int32_t outputStream = OutputStream ();
9998 if (GetProcessingSettings ().debugLevel >= 2 ) {
10099 GPUInfo (" Running TPC Merger" );
@@ -112,7 +111,7 @@ int32_t GPUChainTracking::RunTPCTrackingMerger(bool synchronizeOutput)
112111
113112 memset (Merger.Memory (), 0 , sizeof (*Merger.Memory ()));
114113 WriteToConstantMemory (RecoStep::TPCMerging, (char *)&processors ()->tpcMerger - (char *)processors (), &MergerShadow, sizeof (MergerShadow), 0 );
115- if (doGPUall ) {
114+ if (doGPU ) {
116115 TransferMemoryResourcesToGPU (RecoStep::TPCMerging, &Merger, 0 );
117116 }
118117
@@ -136,14 +135,14 @@ int32_t GPUChainTracking::RunTPCTrackingMerger(bool synchronizeOutput)
136135 if (GetProcessingSettings ().deterministicGPUReconstruction ) {
137136 runKernel<GPUTPCGlobalDebugSortKernels, GPUTPCGlobalDebugSortKernels::sectorTracks>({{GPUCA_NSLICES, -WarpSize (), 0 , deviceType}}, 1 );
138137 }
139- DoDebugAndDump (RecoStep::TPCMerging, 2048 , doGPUall , Merger, &GPUTPCGMMerger::DumpSliceTracks, *mDebugFile );
138+ DoDebugAndDump (RecoStep::TPCMerging, 2048 , doGPU , Merger, &GPUTPCGMMerger::DumpSliceTracks, *mDebugFile );
140139
141140 runKernel<GPUTPCGMMergerClearLinks>(GetGridAuto (0 , deviceType), false );
142141 runKernel<GPUMemClean16>({{1 , -WarpSize (), 0 , deviceType, RecoStep::TPCMerging}}, MergerShadowAll.TmpCounter (), NSLICES * sizeof (*MergerShadowAll.TmpCounter ()));
143142 runKernel<GPUTPCGMMergerMergeWithinPrepare>(GetGridAuto (0 , deviceType));
144143 RunTPCTrackingMerger_MergeBorderTracks (1 , 0 , deviceType);
145144 RunTPCTrackingMerger_Resolve (0 , 1 , deviceType);
146- DoDebugAndDump (RecoStep::TPCMerging, 2048 , doGPUall , Merger, &GPUTPCGMMerger::DumpMergedWithinSlices, *mDebugFile );
145+ DoDebugAndDump (RecoStep::TPCMerging, 2048 , doGPU , Merger, &GPUTPCGMMerger::DumpMergedWithinSlices, *mDebugFile );
147146
148147 runKernel<GPUTPCGMMergerClearLinks>(GetGridAuto (0 , deviceType), false );
149148 runKernel<GPUMemClean16>({{1 , -WarpSize (), 0 , deviceType, RecoStep::TPCMerging}}, MergerShadowAll.TmpCounter (), 2 * NSLICES * sizeof (*MergerShadowAll.TmpCounter ()));
@@ -158,7 +157,7 @@ int32_t GPUChainTracking::RunTPCTrackingMerger(bool synchronizeOutput)
158157 runKernel<GPUTPCGMMergerMergeSlicesPrepare>(GetGridBlk (std::max (2u , numBlocks), 0 , deviceType), 0 , 1 , 1 );
159158 RunTPCTrackingMerger_MergeBorderTracks (0 , -1 , deviceType);
160159 RunTPCTrackingMerger_Resolve (0 , 1 , deviceType);
161- DoDebugAndDump (RecoStep::TPCMerging, 2048 , doGPUall , Merger, &GPUTPCGMMerger::DumpMergedBetweenSlices, *mDebugFile );
160+ DoDebugAndDump (RecoStep::TPCMerging, 2048 , doGPU , Merger, &GPUTPCGMMerger::DumpMergedBetweenSlices, *mDebugFile );
162161
163162 runKernel<GPUMemClean16>({{1 , -WarpSize (), 0 , deviceType, RecoStep::TPCMerging}}, MergerShadowAll.TmpCounter (), 2 * NSLICES * sizeof (*MergerShadowAll.TmpCounter ()));
164163
@@ -168,17 +167,17 @@ int32_t GPUChainTracking::RunTPCTrackingMerger(bool synchronizeOutput)
168167 runKernel<GPUTPCGlobalDebugSortKernels, GPUTPCGlobalDebugSortKernels::globalTracks1>({{1 , -WarpSize (), 0 , deviceType}}, 1 );
169168 runKernel<GPUTPCGlobalDebugSortKernels, GPUTPCGlobalDebugSortKernels::globalTracks2>({{1 , -WarpSize (), 0 , deviceType}}, 1 );
170169 }
171- DoDebugAndDump (RecoStep::TPCMerging, 2048 , doGPUall , Merger, &GPUTPCGMMerger::DumpCollected, *mDebugFile );
170+ DoDebugAndDump (RecoStep::TPCMerging, 2048 , doGPU , Merger, &GPUTPCGMMerger::DumpCollected, *mDebugFile );
172171
173172 if (param ().rec .tpc .mergeCE ) {
174173 runKernel<GPUTPCGMMergerClearLinks>(GetGridAuto (0 , deviceType), true );
175174 RunTPCTrackingMerger_MergeBorderTracks (-1 , 1 , deviceType);
176175 RunTPCTrackingMerger_MergeBorderTracks (-1 , 2 , deviceType);
177176 runKernel<GPUTPCGMMergerMergeCE>(GetGridAuto (0 , deviceType));
178- DoDebugAndDump (RecoStep::TPCMerging, 2048 , doGPUall , Merger, &GPUTPCGMMerger::DumpMergeCE, *mDebugFile );
177+ DoDebugAndDump (RecoStep::TPCMerging, 2048 , doGPU , Merger, &GPUTPCGMMerger::DumpMergeCE, *mDebugFile );
179178 }
180179 int32_t waitForTransfer = 0 ;
181- if (doGPUall ) {
180+ if (doGPU ) {
182181 TransferMemoryResourceLinkToHost (RecoStep::TPCMerging, Merger.MemoryResMemory (), 0 , &mEvents ->single );
183182 waitForTransfer = 1 ;
184183 }
@@ -189,23 +188,21 @@ int32_t GPUChainTracking::RunTPCTrackingMerger(bool synchronizeOutput)
189188 runKernel<GPUTPCGMMergerSortTracks>(GetGridAuto (0 , deviceType));
190189 }
191190
192- uint32_t maxId = param (). rec . nonConsecutiveIDs ? Merger. Memory ()-> nOutputTrackClusters : Merger.NMaxClusters ();
191+ uint32_t maxId = Merger.NMaxClusters ();
193192 if (maxId > Merger.NMaxClusters ()) {
194193 throw std::runtime_error (" mNMaxClusters too small" );
195194 }
196- if (!param ().rec .nonConsecutiveIDs ) {
197- runKernel<GPUMemClean16>({{numBlocks, -ThreadCount (), 0 , deviceType, RecoStep::TPCMerging}}, MergerShadowAll.SharedCount (), maxId * sizeof (*MergerShadowAll.SharedCount ()));
198- runKernel<GPUMemClean16>({{numBlocks, -ThreadCount (), 0 , deviceType, RecoStep::TPCMerging}}, MergerShadowAll.ClusterAttachment (), maxId * sizeof (*MergerShadowAll.ClusterAttachment ()));
199- runKernel<GPUTPCGMMergerPrepareClusters, 0 >(GetGridAuto (0 , deviceType));
200- CondWaitEvent (waitForTransfer, &mEvents ->single );
201- runKernel<GPUTPCGMMergerSortTracksQPt>(GetGridAuto (0 , deviceType));
202- runKernel<GPUTPCGMMergerPrepareClusters, 1 >(GetGridAuto (0 , deviceType));
203- runKernel<GPUTPCGMMergerPrepareClusters, 2 >(GetGridAuto (0 , deviceType));
204- }
195+ runKernel<GPUMemClean16>({{numBlocks, -ThreadCount (), 0 , deviceType, RecoStep::TPCMerging}}, MergerShadowAll.SharedCount (), maxId * sizeof (*MergerShadowAll.SharedCount ()));
196+ runKernel<GPUMemClean16>({{numBlocks, -ThreadCount (), 0 , deviceType, RecoStep::TPCMerging}}, MergerShadowAll.ClusterAttachment (), maxId * sizeof (*MergerShadowAll.ClusterAttachment ()));
197+ runKernel<GPUTPCGMMergerPrepareClusters, 0 >(GetGridAuto (0 , deviceType));
198+ CondWaitEvent (waitForTransfer, &mEvents ->single );
199+ runKernel<GPUTPCGMMergerSortTracksQPt>(GetGridAuto (0 , deviceType));
200+ runKernel<GPUTPCGMMergerPrepareClusters, 1 >(GetGridAuto (0 , deviceType));
201+ runKernel<GPUTPCGMMergerPrepareClusters, 2 >(GetGridAuto (0 , deviceType));
205202
206- DoDebugAndDump (RecoStep::TPCMerging, 2048 , doGPUall , Merger, &GPUTPCGMMerger::DumpFitPrepare, *mDebugFile );
203+ DoDebugAndDump (RecoStep::TPCMerging, 2048 , doGPU , Merger, &GPUTPCGMMerger::DumpFitPrepare, *mDebugFile );
207204
208- if (doGPUall ) {
205+ if (doGPU ) {
209206 CondWaitEvent (waitForTransfer, &mEvents ->single );
210207 if (waitForTransfer) {
211208 ReleaseEvent (mEvents ->single );
@@ -228,29 +225,23 @@ int32_t GPUChainTracking::RunTPCTrackingMerger(bool synchronizeOutput)
228225 if (param ().rec .tpc .looperInterpolationInExtraPass ) {
229226 runKernel<GPUTPCGMMergerFollowLoopers>(GetGridAuto (0 ));
230227 }
231- if (doGPU && !doGPUall) {
232- TransferMemoryResourcesToHost (RecoStep::TPCMerging, &Merger, 0 );
233- SynchronizeStream (0 );
234- }
235228
236229 DoDebugAndDump (RecoStep::TPCMerging, 2048 , Merger, &GPUTPCGMMerger::DumpRefit, *mDebugFile );
237230 runKernel<GPUTPCGMMergerFinalize, 0 >(GetGridAuto (0 , deviceType));
238- if (!param ().rec .nonConsecutiveIDs ) {
239- runKernel<GPUTPCGMMergerFinalize, 1 >(GetGridAuto (0 , deviceType));
240- runKernel<GPUTPCGMMergerFinalize, 2 >(GetGridAuto (0 , deviceType));
241- }
231+ runKernel<GPUTPCGMMergerFinalize, 1 >(GetGridAuto (0 , deviceType));
232+ runKernel<GPUTPCGMMergerFinalize, 2 >(GetGridAuto (0 , deviceType));
242233 if (param ().rec .tpc .mergeLoopersAfterburner ) {
243- runKernel<GPUTPCGMMergerMergeLoopers, 0 >(doGPUall ? GetGrid (Merger.NOutputTracks (), 0 , deviceType) : GetGridAuto (0 , deviceType));
234+ runKernel<GPUTPCGMMergerMergeLoopers, 0 >(doGPU ? GetGrid (Merger.NOutputTracks (), 0 , deviceType) : GetGridAuto (0 , deviceType));
244235 if (doGPU) {
245236 TransferMemoryResourceLinkToHost (RecoStep::TPCMerging, Merger.MemoryResMemory (), 0 );
246237 SynchronizeStream (0 ); // TODO: could probably synchronize on an event after runKernel<GPUTPCGMMergerMergeLoopers, 1>
247238 }
248239 runKernel<GPUTPCGMMergerMergeLoopers, 1 >(GetGridAuto (0 , deviceType));
249- runKernel<GPUTPCGMMergerMergeLoopers, 2 >(doGPUall ? GetGrid (Merger.Memory ()->nLooperMatchCandidates , 0 , deviceType) : GetGridAuto (0 , deviceType));
240+ runKernel<GPUTPCGMMergerMergeLoopers, 2 >(doGPU ? GetGrid (Merger.Memory ()->nLooperMatchCandidates , 0 , deviceType) : GetGridAuto (0 , deviceType));
250241 }
251- DoDebugAndDump (RecoStep::TPCMerging, 2048 , doGPUall , Merger, &GPUTPCGMMerger::DumpFinal, *mDebugFile );
242+ DoDebugAndDump (RecoStep::TPCMerging, 2048 , doGPU , Merger, &GPUTPCGMMerger::DumpFinal, *mDebugFile );
252243
253- if (doGPUall ) {
244+ if (doGPU ) {
254245 RecordMarker (&mEvents ->single , 0 );
255246 auto * waitEvent = &mEvents ->single ;
256247 if (GetProcessingSettings ().keepDisplayMemory || GetProcessingSettings ().createO2Output <= 1 || mFractionalQAEnabled ) {
@@ -302,7 +293,7 @@ int32_t GPUChainTracking::RunTPCTrackingMerger(bool synchronizeOutput)
302293 TransferMemoryResourceLinkToHost (RecoStep::TPCMerging, Merger.MemoryResMemory (), 0 , &mEvents ->single );
303294 runKernel<GPUTPCGMO2Output, GPUTPCGMO2Output::sort>(GetGridAuto (0 , deviceType));
304295 mRec ->ReturnVolatileDeviceMemory ();
305- SynchronizeEventAndRelease (mEvents ->single , doGPUall );
296+ SynchronizeEventAndRelease (mEvents ->single , doGPU );
306297
307298 if (GetProcessingSettings ().clearO2OutputFromGPU ) {
308299 mRec ->AllocateVolatileDeviceMemory (0 ); // make future device memory allocation volatile
@@ -316,7 +307,7 @@ int32_t GPUChainTracking::RunTPCTrackingMerger(bool synchronizeOutput)
316307 AllocateRegisteredMemory (Merger.MemoryResOutputO2MC (), mSubOutputControls [GPUTrackingOutputs::getIndex (&GPUTrackingOutputs::tpcTracksO2Labels)]);
317308 TransferMemoryResourcesToHost (RecoStep::TPCMerging, &Merger, -1 , true );
318309 runKernel<GPUTPCGMO2Output, GPUTPCGMO2Output::mc>(GetGridAuto (0 , GPUReconstruction::krnlDeviceType::CPU));
319- } else if (doGPUall ) {
310+ } else if (doGPU ) {
320311 RecordMarker (&mEvents ->single , 0 );
321312 TransferMemoryResourceLinkToHost (RecoStep::TPCMerging, Merger.MemoryResOutputO2 (), outputStream, nullptr , &mEvents ->single );
322313 TransferMemoryResourceLinkToHost (RecoStep::TPCMerging, Merger.MemoryResOutputO2Clus (), outputStream);
0 commit comments