Skip to content

Commit dfb9fba

Browse files
committed
GPU TPC: Simplify host code for driving the extrapolation tracking
1 parent c3d005f commit dfb9fba

File tree

4 files changed

+33
-91
lines changed

4 files changed

+33
-91
lines changed

GPU/GPUTracking/Base/GPUReconstruction.h

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -200,7 +200,7 @@ class GPUReconstruction
200200
void SetOutputControl(void* ptr, size_t size);
201201
void SetInputControl(void* ptr, size_t size);
202202
GPUOutputControl& OutputControl() { return mOutputControl; }
203-
int32_t NStreams() const { return mNStreams; }
203+
uint32_t NStreams() const { return mNStreams; }
204204
const void* DeviceMemoryBase() const { return mDeviceMemoryBase; }
205205

206206
RecoStepField GetRecoSteps() const { return mRecoSteps.steps; }

GPU/GPUTracking/Global/GPUChainTracking.cxx

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -260,7 +260,7 @@ bool GPUChainTracking::ValidateSettings()
260260
GPUError("configured max time bin exceeds 256 orbits");
261261
return false;
262262
}
263-
if ((GetRecoStepsGPU() & RecoStep::TPCClusterFinding) && std::max(GetProcessingSettings().nTPCClustererLanes + 1, GetProcessingSettings().nTPCClustererLanes * 2) + (GetProcessingSettings().doublePipeline ? 1 : 0) > mRec->NStreams()) {
263+
if ((GetRecoStepsGPU() & RecoStep::TPCClusterFinding) && std::max(GetProcessingSettings().nTPCClustererLanes + 1, GetProcessingSettings().nTPCClustererLanes * 2) + (GetProcessingSettings().doublePipeline ? 1 : 0) > (int32_t)mRec->NStreams()) {
264264
GPUError("NStreams (%d) must be > nTPCClustererLanes (%d)", mRec->NStreams(), (int32_t)GetProcessingSettings().nTPCClustererLanes);
265265
return false;
266266
}

GPU/GPUTracking/Global/GPUChainTracking.h

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -220,7 +220,7 @@ class GPUChainTracking : public GPUChain
220220

221221
GPUChainTracking(GPUReconstruction* rec, uint32_t maxTPCHits = GPUCA_MAX_CLUSTERS, uint32_t maxTRDTracklets = GPUCA_MAX_TRD_TRACKLETS);
222222

223-
int32_t ExtrapolationTracking(uint32_t iSector, int32_t threadId, bool synchronizeOutput = true);
223+
int32_t ExtrapolationTracking(uint32_t iSector, bool blocking);
224224

225225
int32_t PrepareProfile();
226226
int32_t DoProfile();
@@ -278,7 +278,6 @@ class GPUChainTracking : public GPUChain
278278

279279
// Synchronization and Locks
280280
eventStruct* mEvents = nullptr;
281-
volatile int32_t mSectorSelectorReady = 0;
282281
std::array<int8_t, NSECTORS> mExtrapolationTrackingDone;
283282

284283
std::vector<outputQueueEntry> mOutputQueue;
@@ -299,6 +298,7 @@ class GPUChainTracking : public GPUChain
299298
void RunTPCTrackingMerger_Resolve(int8_t useOrigTrackParam, int8_t mergeAll, GPUReconstruction::krnlDeviceType deviceType);
300299
void RunTPCClusterFilter(o2::tpc::ClusterNativeAccess* clusters, std::function<o2::tpc::ClusterNative*(size_t)> allocator, bool applyClusterCuts);
301300
bool NeedTPCClustersOnGPU();
301+
uint32_t StreamForSector(uint32_t sector) const;
302302

303303
std::mutex mMutexUpdateCalib;
304304
std::unique_ptr<GPUChainTrackingFinalContext> mPipelineFinalizationCtx;

GPU/GPUTracking/Global/GPUChainTrackingSectorTracker.cxx

Lines changed: 29 additions & 87 deletions
Original file line number | Diff line number | Diff line change
@@ -24,12 +24,18 @@
2424

2525
using namespace o2::gpu;
2626

27-
int32_t GPUChainTracking::ExtrapolationTracking(uint32_t iSector, int32_t threadId, bool synchronizeOutput)
27+
uint32_t GPUChainTracking::StreamForSector(uint32_t sector) const
2828
{
29-
runKernel<GPUTPCExtrapolationTracking>({GetGridBlk(256, iSector % mRec->NStreams()), {iSector}});
30-
TransferMemoryResourceLinkToHost(RecoStep::TPCSectorTracking, processors()->tpcTrackers[iSector].MemoryResCommon(), iSector % mRec->NStreams());
31-
if (synchronizeOutput) {
32-
SynchronizeStream(iSector % mRec->NStreams());
29+
return sector % mRec->NStreams();
30+
}
31+
32+
int32_t GPUChainTracking::ExtrapolationTracking(uint32_t iSector, bool blocking)
33+
{
34+
const uint32_t stream = StreamForSector(iSector);
35+
runKernel<GPUTPCExtrapolationTracking>({GetGridBlk(256, stream), {iSector}});
36+
TransferMemoryResourceLinkToHost(RecoStep::TPCSectorTracking, processors()->tpcTrackers[iSector].MemoryResCommon(), stream);
37+
if (blocking) {
38+
SynchronizeStream(stream);
3339
}
3440
return (0);
3541
}
@@ -153,7 +159,7 @@ int32_t GPUChainTracking::RunTPCTrackingSectors_internal()
153159
mRec->runParallelOuterLoop(doGPU, NSECTORS, [&](uint32_t iSector) {
154160
GPUTPCTracker& trk = processors()->tpcTrackers[iSector];
155161
GPUTPCTracker& trkShadow = doGPU ? processorsShadow()->tpcTrackers[iSector] : trk;
156-
int32_t useStream = (iSector % mRec->NStreams());
162+
int32_t useStream = StreamForSector(iSector);
157163

158164
if (GetProcessingSettings().debugLevel >= 3) {
159165
GPUInfo("Creating Sector Data (Sector %d)", iSector);
@@ -234,102 +240,38 @@ int32_t GPUChainTracking::RunTPCTrackingSectors_internal()
234240
}
235241

236242
if (doGPU || GetProcessingSettings().debugLevel >= 1) {
237-
if (doGPU) {
238-
ReleaseEvent(mEvents->init);
239-
}
240-
241-
mSectorSelectorReady = 0;
242-
243-
std::array<bool, NSECTORS> transferRunning;
244-
transferRunning.fill(true);
245-
if (doGPU && !(GetRecoStepsGPU() & RecoStep::TPCMerging)) { // TODO: This seems pretty obsolete code path, can probably be removed.
246-
if (param().rec.tpc.extrapolationTracking) {
247-
mExtrapolationTrackingDone.fill(0);
248-
}
249-
250-
uint32_t tmpSector = 0;
251-
for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) {
252-
if (GetProcessingSettings().debugLevel >= 3) {
253-
GPUInfo("Transfering Tracks from GPU to Host");
254-
}
255-
256-
if (tmpSector == iSector) {
257-
SynchronizeEvents(&mEvents->sector[iSector]);
258-
}
259-
while (tmpSector < NSECTORS && (tmpSector == iSector || IsEventDone(&mEvents->sector[tmpSector]))) {
260-
ReleaseEvent(mEvents->sector[tmpSector]);
261-
if (*processors()->tpcTrackers[tmpSector].NTracks() > 0) {
262-
TransferMemoryResourceLinkToHost(RecoStep::TPCSectorTracking, processors()->tpcTrackers[tmpSector].MemoryResOutput(), streamMap[tmpSector], &mEvents->sector[tmpSector]);
263-
} else {
264-
transferRunning[tmpSector] = false;
265-
}
266-
tmpSector++;
267-
}
268-
269-
if (GetProcessingSettings().keepAllMemory) {
270-
TransferMemoryResourcesToHost(RecoStep::TPCSectorTracking, &processors()->tpcTrackers[iSector], -1, true);
271-
}
272-
273-
if (transferRunning[iSector]) {
274-
SynchronizeEvents(&mEvents->sector[iSector]);
275-
}
276-
if (GetProcessingSettings().debugLevel >= 3) {
277-
GPUInfo("Tracks Transfered: %d / %d", *processors()->tpcTrackers[iSector].NTracks(), *processors()->tpcTrackers[iSector].NTrackHits());
278-
}
279-
280-
if (GetProcessingSettings().debugLevel >= 3) {
281-
GPUInfo("Data ready for sector %d", iSector);
282-
}
283-
mSectorSelectorReady = iSector;
284-
285-
if (param().rec.tpc.extrapolationTracking) {
286-
for (uint32_t tmpSector2a = 0; tmpSector2a <= iSector; tmpSector2a++) {
287-
uint32_t tmpSector2 = GPUTPCExtrapolationTracking::ExtrapolationTrackingSectorOrder(tmpSector2a);
288-
uint32_t sectorLeft, sectorRight;
289-
GPUTPCExtrapolationTracking::ExtrapolationTrackingSectorLeftRight(tmpSector2, sectorLeft, sectorRight);
290-
291-
if (tmpSector2 <= iSector && sectorLeft <= iSector && sectorRight <= iSector && mExtrapolationTrackingDone[tmpSector2] == 0) {
292-
ExtrapolationTracking(tmpSector2, 0);
293-
mExtrapolationTrackingDone[tmpSector2] = 1;
294-
}
295-
}
296-
}
297-
}
298-
}
299243
if (param().rec.tpc.extrapolationTracking) {
300244
std::vector<bool> blocking(NSECTORS * mRec->NStreams());
301-
for (int32_t i = 0; i < NSECTORS; i++) {
302-
for (int32_t j = 0; j < mRec->NStreams(); j++) {
303-
blocking[i * mRec->NStreams() + j] = i % mRec->NStreams() == j;
245+
for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) {
246+
for (uint32_t iStream = 0; iStream < mRec->NStreams(); iStream++) {
247+
blocking[iSector * mRec->NStreams() + iStream] = StreamForSector(iSector) == iStream;
304248
}
305249
}
306250
for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) {
307251
uint32_t tmpSector = GPUTPCExtrapolationTracking::ExtrapolationTrackingSectorOrder(iSector);
308-
if (!(doGPU && !(GetRecoStepsGPU() & RecoStep::TPCMerging))) {
309-
uint32_t sectorLeft, sectorRight;
310-
GPUTPCExtrapolationTracking::ExtrapolationTrackingSectorLeftRight(tmpSector, sectorLeft, sectorRight);
311-
if (doGPU && !blocking[tmpSector * mRec->NStreams() + sectorLeft % mRec->NStreams()]) {
312-
StreamWaitForEvents(tmpSector % mRec->NStreams(), &mEvents->sector[sectorLeft]);
313-
blocking[tmpSector * mRec->NStreams() + sectorLeft % mRec->NStreams()] = true;
314-
}
315-
if (doGPU && !blocking[tmpSector * mRec->NStreams() + sectorRight % mRec->NStreams()]) {
316-
StreamWaitForEvents(tmpSector % mRec->NStreams(), &mEvents->sector[sectorRight]);
317-
blocking[tmpSector * mRec->NStreams() + sectorRight % mRec->NStreams()] = true;
318-
}
252+
uint32_t sectorLeft, sectorRight;
253+
GPUTPCExtrapolationTracking::ExtrapolationTrackingSectorLeftRight(tmpSector, sectorLeft, sectorRight);
254+
if (doGPU && !blocking[tmpSector * mRec->NStreams() + StreamForSector(sectorLeft)]) {
255+
StreamWaitForEvents(StreamForSector(tmpSector), &mEvents->sector[sectorLeft]);
256+
blocking[tmpSector * mRec->NStreams() + StreamForSector(sectorLeft)] = true;
257+
}
258+
if (doGPU && !blocking[tmpSector * mRec->NStreams() + StreamForSector(sectorRight)]) {
259+
StreamWaitForEvents(StreamForSector(tmpSector), &mEvents->sector[sectorRight]);
260+
blocking[tmpSector * mRec->NStreams() + StreamForSector(sectorRight)] = true;
319261
}
320-
ExtrapolationTracking(tmpSector, 0, false);
262+
ExtrapolationTracking(tmpSector, false);
321263
}
322264
}
323-
for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) {
324-
if (doGPU && transferRunning[iSector]) {
265+
if (doGPU) {
266+
ReleaseEvent(mEvents->init);
267+
for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) {
325268
ReleaseEvent(mEvents->sector[iSector]);
326269
}
327270
}
328271
} else {
329-
mSectorSelectorReady = NSECTORS;
330272
mRec->runParallelOuterLoop(doGPU, NSECTORS, [&](uint32_t iSector) {
331273
if (param().rec.tpc.extrapolationTracking) {
332-
ExtrapolationTracking(iSector, 0);
274+
ExtrapolationTracking(iSector, true);
333275
}
334276
});
335277
mRec->SetNActiveThreadsOuterLoop(1);

0 commit comments

Comments
 (0)