Skip to content

Commit eecb8a8

Browse files
committed
Improve kernel speed by ~15%. Next test: for-loop in pad direction for coalesced access
1 parent 9a7fa44 commit eecb8a8

File tree

6 files changed

+173
-120
lines changed

6 files changed

+173
-120
lines changed

GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -706,8 +706,8 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
706706
nnApplications[lane].initClusterizer(nn_settings, clustererNNShadow);
707707
}
708708
AllocateRegisteredMemory(clustererNN.mMemoryId);
709-
nnApplications[lane].createBoundary(clustererNNShadow);
710-
nnApplications[lane].createIndexLookup(clustererNNShadow);
709+
// nnApplications[lane].createBoundary(clustererNNShadow);
710+
// nnApplications[lane].createIndexLookup(clustererNNShadow);
711711
});
712712
if (doGPU) {
713713
WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->tpcNNClusterer - (char*)processors(), &processorsShadow()->tpcNNClusterer, sizeof(GPUTPCNNClusterizer) * NSECTORS, mRec->NStreams() - 1, &mEvents->init);

GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,8 +61,6 @@ void* GPUTPCNNClusterizer::setIOPointers(void* mem)
6161
}
6262
if (mNnClusterizerTotalClusters > 0) {
6363
computePointerWithAlignment(mem, mOutputDataClass, mNnClusterizerTotalClusters);
64-
computePointerWithAlignment(mem, mIsBoundary, mBoundaryMapSize);
65-
computePointerWithAlignment(mem, mIndexLookup, mIndexLookupSize);
6664
}
6765
return mem;
6866
}

GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -59,16 +59,22 @@ class GPUTPCNNClusterizer : public GPUProcessor
5959
int32_t mISector = -1;
6060
int32_t mDeviceId = -1;
6161

62+
// GPU optimizations
63+
uint32_t mNnClusterizerFullRowSize = 0;
64+
uint32_t mNnClusterizerFullPadSize = 0;
65+
uint32_t mNnClusterizerFullTimeSize = 0;
66+
uint32_t mNnClusterizerPadTimeSize = 0;
67+
6268
// Boundary lookup table
63-
int32_t mBoundaryMapSizeRow = 0;
64-
int32_t mBoundaryMapSizePadsPerRow = 0;
65-
int32_t mBoundaryMapSize = 0;
66-
int32_t mBoundaryPadding = 11; // Padding on each side of the boundary map to account for pad_offset
67-
int8_t* mIsBoundary = nullptr;
69+
// int32_t mBoundaryMapSizeRow = 0;
70+
// int32_t mBoundaryMapSizePadsPerRow = 0;
71+
// int32_t mBoundaryMapSize = 0;
72+
// int32_t mBoundaryPadding = 11; // Padding on each side of the boundary map to account for pad_offset
73+
// int8_t* mIsBoundary = nullptr;
6874

6975
// Index lookup table
70-
int32_t mIndexLookupSize = 0;
71-
int32_t* mIndexLookup = nullptr;
76+
// int32_t mIndexLookupSize = 0;
77+
// int32_t* mIndexLookup = nullptr;
7278

7379
// Memory allocation for neural network
7480

@@ -83,6 +89,7 @@ class GPUTPCNNClusterizer : public GPUProcessor
8389

8490
// FP16
8591
OrtDataType::Float16_t* mInputData_16 = nullptr;
92+
OrtDataType::Float16_t* mInputData_16_Test = nullptr;
8693
OrtDataType::Float16_t* mModelProbabilities_16 = nullptr;
8794
OrtDataType::Float16_t* mOutputDataReg1_16 = nullptr;
8895
OrtDataType::Float16_t* mOutputDataReg2_16 = nullptr;

GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx

Lines changed: 42 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -89,13 +89,17 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust
8989
clustererNN.mNnClusterizerSizeInputRow = settings.nnClusterizerSizeInputRow;
9090
clustererNN.mNnClusterizerSizeInputPad = settings.nnClusterizerSizeInputPad;
9191
clustererNN.mNnClusterizerSizeInputTime = settings.nnClusterizerSizeInputTime;
92-
clustererNN.mNnClusterizerChargeArraySize = ((2 * settings.nnClusterizerSizeInputRow + 1) * (2 * settings.nnClusterizerSizeInputPad + 1) * (2 * settings.nnClusterizerSizeInputTime + 1));
92+
clustererNN.mNnClusterizerFullRowSize = 2 * settings.nnClusterizerSizeInputRow + 1;
93+
clustererNN.mNnClusterizerFullPadSize = 2 * settings.nnClusterizerSizeInputPad + 1;
94+
clustererNN.mNnClusterizerFullTimeSize = 2 * settings.nnClusterizerSizeInputTime + 1;
95+
clustererNN.mNnClusterizerChargeArraySize = clustererNN.mNnClusterizerFullRowSize * clustererNN.mNnClusterizerFullPadSize * clustererNN.mNnClusterizerFullTimeSize;
96+
clustererNN.mNnClusterizerPadTimeSize = clustererNN.mNnClusterizerFullPadSize * clustererNN.mNnClusterizerFullTimeSize;
9397
clustererNN.mNnClusterizerElementSize = clustererNN.mNnClusterizerChargeArraySize + (settings.nnClusterizerAddIndexData ? 3 : 0);
94-
clustererNN.mBoundaryMapSizeRow = 3 * clustererNN.mNnClusterizerSizeInputRow + o2::tpc::constants::MAXGLOBALPADROW;
95-
clustererNN.mBoundaryPadding = 11; // padding on each side to account for pad_offset. N=11 since then mIsBoundary = 24320 ~< (1.5 x 2^14 = 24576) && N must be bigger than (NPads[row(end_iroc + 1)] - NPads[row(end_iroc)])/2 (=6) for pad_offset to work
96-
clustererNN.mBoundaryMapSizePadsPerRow = GPUTPCGeometry::NPads(o2::tpc::constants::MAXGLOBALPADROW - 1) + 2 * clustererNN.mBoundaryPadding;
97-
clustererNN.mBoundaryMapSize = clustererNN.mBoundaryMapSizeRow * clustererNN.mBoundaryMapSizePadsPerRow;
98-
clustererNN.mIndexLookupSize = 3 * clustererNN.mNnClusterizerChargeArraySize; // local row, pad, time shift from flat index
98+
// clustererNN.mBoundaryMapSizeRow = 3 * clustererNN.mNnClusterizerSizeInputRow + o2::tpc::constants::MAXGLOBALPADROW;
99+
// clustererNN.mBoundaryPadding = 11; // padding on each side to account for pad_offset. N=11 since then mIsBoundary = 24320 ~< (1.5 x 2^14 = 24576) && N must be bigger than (NPads[row(end_iroc + 1)] - NPads[row(end_iroc)])/2 (=6) for pad_offset to work
100+
// clustererNN.mBoundaryMapSizePadsPerRow = GPUTPCGeometry::NPads(o2::tpc::constants::MAXGLOBALPADROW - 1) + 2 * clustererNN.mBoundaryPadding;
101+
// clustererNN.mBoundaryMapSize = clustererNN.mBoundaryMapSizeRow * clustererNN.mBoundaryMapSizePadsPerRow;
102+
// clustererNN.mIndexLookupSize = 3 * clustererNN.mNnClusterizerChargeArraySize; // local row, pad, time shift from flat index
99103
clustererNN.mNnClusterizerAddIndexData = settings.nnClusterizerAddIndexData;
100104
clustererNN.mNnClusterizerBatchedMode = settings.nnClusterizerBatchedMode;
101105
clustererNN.mNnClusterizerBoundaryFillValue = settings.nnClusterizerBoundaryFillValue;
@@ -124,38 +128,38 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust
124128
}
125129
}
126130

127-
void GPUTPCNNClusterizerHost::createBoundary(GPUTPCNNClusterizer& clustererNN)
128-
{
129-
// Call after init of the clustererNN elements
130-
for (int r = 0; r < clustererNN.mBoundaryMapSizeRow; r++) {
131-
int8_t skipCheckInRow = 0;
132-
for (int p = 0; p < clustererNN.mBoundaryMapSizePadsPerRow; p++) {
133-
int32_t i = r * clustererNN.mBoundaryMapSizePadsPerRow + p;
134-
clustererNN.mIsBoundary[i] = 1;
135-
if (!skipCheckInRow && (p >= clustererNN.mBoundaryPadding || r >= clustererNN.mNnClusterizerSizeInputRow)) {
136-
if (r < (GPUTPCGeometry::EndIROC() + clustererNN.mNnClusterizerSizeInputRow)) {
137-
clustererNN.mIsBoundary[i] = (int32_t)((p - clustererNN.mBoundaryPadding) >= static_cast<int>(GPUTPCGeometry::NPads(r - clustererNN.mNnClusterizerSizeInputRow)));
138-
} else if (r >= (GPUTPCGeometry::EndIROC() + 2 * clustererNN.mNnClusterizerSizeInputRow) && r < (o2::tpc::constants::MAXGLOBALPADROW + 2 * clustererNN.mNnClusterizerSizeInputRow)) {
139-
clustererNN.mIsBoundary[i] = (int32_t)((p - clustererNN.mBoundaryPadding) >= static_cast<int>(GPUTPCGeometry::NPads(r - 2 * clustererNN.mNnClusterizerSizeInputRow)));
140-
}
141-
skipCheckInRow = (clustererNN.mIsBoundary[i] == 1); // No need to check further pads in this row
142-
}
143-
}
144-
}
145-
}
146-
147-
void GPUTPCNNClusterizerHost::createIndexLookup(GPUTPCNNClusterizer& clustererNN)
148-
{
149-
for (int32_t i = 0; i < clustererNN.mNnClusterizerChargeArraySize; i++) {
150-
int32_t r = CAMath::Floor(i / ((2 * clustererNN.mNnClusterizerSizeInputPad + 1) * (2 * clustererNN.mNnClusterizerSizeInputTime + 1))) - clustererNN.mNnClusterizerSizeInputRow;
151-
int32_t rest_1 = i % ((2 * clustererNN.mNnClusterizerSizeInputPad + 1) * (2 * clustererNN.mNnClusterizerSizeInputTime + 1));
152-
int32_t p = CAMath::Floor(rest_1 / (2 * clustererNN.mNnClusterizerSizeInputTime + 1)) - clustererNN.mNnClusterizerSizeInputPad;
153-
int32_t t = (rest_1 % (2 * clustererNN.mNnClusterizerSizeInputTime + 1)) - clustererNN.mNnClusterizerSizeInputTime;
154-
clustererNN.mIndexLookup[3 * i] = r;
155-
clustererNN.mIndexLookup[3 * i + 1] = p;
156-
clustererNN.mIndexLookup[3 * i + 2] = t;
157-
}
158-
}
131+
// void GPUTPCNNClusterizerHost::createBoundary(GPUTPCNNClusterizer& clustererNN)
132+
// {
133+
// // Call after init of the clustererNN elements
134+
// for (int r = 0; r < clustererNN.mBoundaryMapSizeRow; r++) {
135+
// int8_t skipCheckInRow = 0;
136+
// for (int p = 0; p < clustererNN.mBoundaryMapSizePadsPerRow; p++) {
137+
// int32_t i = r * clustererNN.mBoundaryMapSizePadsPerRow + p;
138+
// clustererNN.mIsBoundary[i] = 1;
139+
// if (!skipCheckInRow && (p >= clustererNN.mBoundaryPadding || r >= clustererNN.mNnClusterizerSizeInputRow)) {
140+
// if (r < (GPUTPCGeometry::EndIROC() + clustererNN.mNnClusterizerSizeInputRow)) {
141+
// clustererNN.mIsBoundary[i] = (int32_t)((p - clustererNN.mBoundaryPadding) >= static_cast<int>(GPUTPCGeometry::NPads(r - clustererNN.mNnClusterizerSizeInputRow)));
142+
// } else if (r >= (GPUTPCGeometry::EndIROC() + 2 * clustererNN.mNnClusterizerSizeInputRow) && r < (o2::tpc::constants::MAXGLOBALPADROW + 2 * clustererNN.mNnClusterizerSizeInputRow)) {
143+
// clustererNN.mIsBoundary[i] = (int32_t)((p - clustererNN.mBoundaryPadding) >= static_cast<int>(GPUTPCGeometry::NPads(r - 2 * clustererNN.mNnClusterizerSizeInputRow)));
144+
// }
145+
// skipCheckInRow = (clustererNN.mIsBoundary[i] == 1); // No need to check further pads in this row
146+
// }
147+
// }
148+
// }
149+
// }
150+
151+
// void GPUTPCNNClusterizerHost::createIndexLookup(GPUTPCNNClusterizer& clustererNN)
152+
// {
153+
// for (int32_t i = 0; i < clustererNN.mNnClusterizerChargeArraySize; i++) {
154+
// int32_t r = CAMath::Floor(i / ((2 * clustererNN.mNnClusterizerSizeInputPad + 1) * (2 * clustererNN.mNnClusterizerSizeInputTime + 1))) - clustererNN.mNnClusterizerSizeInputRow;
155+
// int32_t rest_1 = i % ((2 * clustererNN.mNnClusterizerSizeInputPad + 1) * (2 * clustererNN.mNnClusterizerSizeInputTime + 1));
156+
// int32_t p = CAMath::Floor(rest_1 / (2 * clustererNN.mNnClusterizerSizeInputTime + 1)) - clustererNN.mNnClusterizerSizeInputPad;
157+
// int32_t t = (rest_1 % (2 * clustererNN.mNnClusterizerSizeInputTime + 1)) - clustererNN.mNnClusterizerSizeInputTime;
158+
// clustererNN.mIndexLookup[3 * i] = r;
159+
// clustererNN.mIndexLookup[3 * i + 1] = p;
160+
// clustererNN.mIndexLookup[3 * i + 2] = t;
161+
// }
162+
// }
159163

160164
// MockedOrtAllocator implementation to be able to use volatile assignment
161165
struct MockedOrtAllocator : OrtAllocator {

0 commit comments

Comments
 (0)