@@ -75,14 +75,10 @@ GPUdii() void GPUTPCNeighboursFinder::Thread<0>(int32_t /*nBlocks*/, int32_t nTh
7575 return ;
7676 }
7777
78- #define UnrollGlobal 4
79- #define MaxShared GPUCA_PAR_NEIGHBOURS_FINDER_MAX_NNEIGHUP
80- #if MaxShared < GPUCA_MAXN
81- #define MaxGlobal ((GPUCA_MAXN - MaxShared - 1 ) / UnrollGlobal + 1 ) * UnrollGlobal
82- #else
83- #define MaxGlobal 0
84- #endif
85- #define MaxTotal MaxShared + MaxGlobal
78+ static constexpr uint32_t UNROLL_GLOBAL = 4 ;
79+ static constexpr uint32_t MAX_SHARED = GPUCA_PAR_NEIGHBOURS_FINDER_MAX_NNEIGHUP;
80+ static constexpr uint32_t MAX_GLOBAL = (MAX_SHARED < GPUCA_MAXN) ? (((GPUCA_MAXN - MAX_SHARED - 1 ) / UNROLL_GLOBAL + 1 ) * UNROLL_GLOBAL) : 0 ;
81+ static constexpr uint32_t MAX_TOTAL = MAX_SHARED + MAX_GLOBAL;
8682
8783 const float chi2Cut = 3 .f * 3 .f * 4 * (s.mUpDx * s.mUpDx + s.mDnDx * s.mDnDx );
8884 // float chi2Cut = 3.f*3.f*(s.mUpDx*s.mUpDx + s.mDnDx*s.mDnDx ); //SG
@@ -117,18 +113,16 @@ GPUdii() void GPUTPCNeighboursFinder::Thread<0>(int32_t /*nBlocks*/, int32_t nTh
117113 const float kAreaSlopeZUp = kAngularMultiplier != 0 .f ? 1 .f : s.mUpTx ;
118114 const float kAreaSlopeZDn = kAngularMultiplier != 0 .f ? 1 .f : s.mDnTx ;
119115
120- #if MaxGlobal > 0
121- calink neighUp[MaxGlobal];
122- float yzUp[2 * MaxGlobal];
123- #endif
116+ calink neighUp[MAX_GLOBAL];
117+ float yzUp[2 * MAX_GLOBAL];
124118
125119 for (int32_t ih = iThread; ih < s.mNHits ; ih += nThreads) {
126120
127121 const GPUglobalref () cahit2& hitData = pHitData[lHitNumberOffset + ih];
128122 const float y = y0 + hitData.x * stepY;
129123 const float z = z0 + hitData.y * stepZ;
130124
131- int32_t nNeighUp = 0 ;
125+ uint32_t nNeighUp = 0 ;
132126 float minZ, maxZ, minY, maxY;
133127 int32_t binYmin, binYmax, binZmin, binZmax;
134128 int32_t nY;
@@ -145,11 +139,11 @@ GPUdii() void GPUTPCNeighboursFinder::Thread<0>(int32_t /*nBlocks*/, int32_t nTh
145139 nY = rowUp.Grid ().Ny ();
146140 }
147141
148- for (int32_t k1 = binZmin; k1 <= binZmax && (nNeighUp < MaxTotal ); k1++) {
142+ for (int32_t k1 = binZmin; k1 <= binZmax && (nNeighUp < MAX_TOTAL ); k1++) {
149143 int32_t iMin = lFirstHitInBin[lFirstHitInBinOffsetUp + k1 * nY + binYmin];
150144 int32_t iMax = lFirstHitInBin[lFirstHitInBinOffsetUp + k1 * nY + binYmax + 1 ];
151145 GPUCA_UNROLL (U (4 ), U (2 ))
152- for (int32_t i = iMin; i < iMax && (nNeighUp < MaxTotal ); i++) {
146+ for (int32_t i = iMin; i < iMax && (nNeighUp < MAX_TOTAL ); i++) {
153147 const GPUglobalref () cahit2& hitDataUp = pHitData[lHitNumberOffsetUp + i];
154148 GPUTPCHit h;
155149 h.mY = y0Up + (hitDataUp.x ) * stepYUp;
@@ -159,51 +153,48 @@ GPUdii() void GPUTPCNeighboursFinder::Thread<0>(int32_t /*nBlocks*/, int32_t nTh
159153 continue ;
160154 }
161155
162- #if MaxGlobal > 0
163- #if MaxShared == 0
164- if (true ) {
165- #else
166- if (nNeighUp >= MaxShared) {
167- #endif
168- neighUp[nNeighUp - MaxShared] = (calink)i;
169- yzUp[2 * (nNeighUp - MaxShared)] = s.mDnDx * (h.Y () - y);
170- yzUp[2 * (nNeighUp - MaxShared) + 1 ] = s.mDnDx * (h.Z () - z);
171- } else
172- #endif
173- {
174- #if MaxShared > 0
175- s.mB [nNeighUp][iThread] = (calink)i;
176- s.mA1 [nNeighUp][iThread] = s.mDnDx * (h.Y () - y);
177- s.mA2 [nNeighUp][iThread] = s.mDnDx * (h.Z () - z);
178- #endif
156+ const bool inGlobal = nNeighUp >= MAX_SHARED;
157+ if constexpr (MAX_GLOBAL > 0 ) {
158+ if (inGlobal) {
159+ neighUp[nNeighUp - MAX_SHARED] = (calink)i;
160+ yzUp[2 * (nNeighUp - MAX_SHARED)] = s.mDnDx * (h.Y () - y);
161+ yzUp[2 * (nNeighUp - MAX_SHARED) + 1 ] = s.mDnDx * (h.Z () - z);
162+ }
163+ }
164+ if constexpr (MAX_SHARED > 0 ) {
165+ if (!inGlobal) {
166+ s.mB [nNeighUp][iThread] = (calink)i;
167+ s.mA1 [nNeighUp][iThread] = s.mDnDx * (h.Y () - y);
168+ s.mA2 [nNeighUp][iThread] = s.mDnDx * (h.Z () - z);
169+ }
179170 }
180171 nNeighUp++;
181172 }
182173 }
183174
184- #if MaxShared > 0 // init a rest of the shared array
185- for (int32_t iUp = nNeighUp; iUp < MaxShared; iUp++) {
186- s.mA1 [iUp][iThread] = -1 .e10f ;
187- s.mA2 [iUp][iThread] = -1 .e10f ;
188- s.mB [iUp][iThread] = (calink)-1 ;
175+ if constexpr (MAX_SHARED > 0 ) { // init the rest of the shared array
176+ for (uint32_t iUp = nNeighUp; iUp < MAX_SHARED; iUp++) {
177+ s.mA1 [iUp][iThread] = -1 .e10f ;
178+ s.mA2 [iUp][iThread] = -1 .e10f ;
179+ s.mB [iUp][iThread] = (calink)-1 ;
180+ }
189181 }
190- #endif
191182
192- #if MaxGlobal > 0 // init a rest of the UnrollGlobal chunk of the global array
193- int32_t Nrest = nNeighUp - MaxShared;
194- int32_t N4 = (Nrest / UnrollGlobal) * UnrollGlobal;
195- if (N4 < Nrest) {
196- N4 += UnrollGlobal;
197- GPUCA_UNROLL (U (UnrollGlobal - 1 ), U (UnrollGlobal - 1 ))
198- for (int32_t k = 0 ; k < UnrollGlobal - 1 ; k++) {
199- if (Nrest + k < N4) {
200- yzUp[2 * (Nrest + k)] = -1 .e10f ;
201- yzUp[2 * (Nrest + k) + 1 ] = -1 .e10f ;
202- neighUp[Nrest + k] = (calink)-1 ;
183+ const uint32_t Nrest = nNeighUp - MAX_SHARED;
184+ uint32_t N4 = (Nrest / UNROLL_GLOBAL) * UNROLL_GLOBAL;
185+ if constexpr (MAX_GLOBAL > 0 ) { // init the rest of the UNROLL_GLOBAL chunk of the global array
186+ if (nNeighUp > MAX_SHARED && N4 < Nrest) {
187+ N4 += UNROLL_GLOBAL;
188+ GPUCA_UNROLL (U (UNROLL_GLOBAL - 1 ), U (UNROLL_GLOBAL - 1 ))
189+ for (uint32_t k = 0 ; k + 1 < UNROLL_GLOBAL; k++) {
190+ if (Nrest + k < N4) {
191+ yzUp[2 * (Nrest + k)] = -1 .e10f ;
192+ yzUp[2 * (Nrest + k) + 1 ] = -1 .e10f ;
193+ neighUp[Nrest + k] = (calink)-1 ;
194+ }
203195 }
204196 }
205197 }
206- #endif
207198
208199 { // area in the lower row
209200 const float yy = y * s.mDnTx ;
@@ -236,47 +227,49 @@ GPUdii() void GPUTPCNeighboursFinder::Thread<0>(int32_t /*nBlocks*/, int32_t nTh
236227 float yDnProjUp = s.mUpDx * (yDn - y);
237228 float zDnProjUp = s.mUpDx * (zDn - z);
238229
239- #if MaxShared > 0
240- GPUCA_UNROLL (U (MaxShared), U (MaxShared))
241- for (int32_t iUp = 0 ; iUp < MaxShared; iUp++) {
242- const float dy = yDnProjUp - s.mA1 [iUp][iThread];
243- const float dz = zDnProjUp - s.mA2 [iUp][iThread];
244- const float d = dy * dy + dz * dz;
245- if (d < bestD) {
246- bestD = d;
247- linkDn = i;
248- linkUp = iUp;
249- }
250- }
251- #endif
252-
253- #if MaxGlobal > 0
254- for (int32_t iUp = 0 ; iUp < N4; iUp += UnrollGlobal) {
255- GPUCA_UNROLL (U (UnrollGlobal), U (UnrollGlobal))
256- for (int32_t k = 0 ; k < UnrollGlobal; k++) {
257- int32_t jUp = iUp + k;
258- const float dy = yDnProjUp - yzUp[2 * jUp];
259- const float dz = zDnProjUp - yzUp[2 * jUp + 1 ];
230+ if constexpr (MAX_SHARED > 0 ) {
231+ GPUCA_UNROLL (U (MAX_SHARED), U (MAX_SHARED))
232+ for (uint32_t iUp = 0 ; iUp < MAX_SHARED; iUp++) {
233+ const float dy = yDnProjUp - s.mA1 [iUp][iThread];
234+ const float dz = zDnProjUp - s.mA2 [iUp][iThread];
260235 const float d = dy * dy + dz * dz;
261236 if (d < bestD) {
262237 bestD = d;
263238 linkDn = i;
264- linkUp = MaxShared + jUp;
239+ linkUp = iUp;
240+ }
241+ }
242+ }
243+
244+ if constexpr (MAX_GLOBAL > 0 ) {
245+ if (nNeighUp > MAX_SHARED) {
246+ for (uint32_t iUp = 0 ; iUp < N4; iUp += UNROLL_GLOBAL) {
247+ GPUCA_UNROLL (U (UNROLL_GLOBAL), U (UNROLL_GLOBAL))
248+ for (uint32_t k = 0 ; k < UNROLL_GLOBAL; k++) {
249+ const uint32_t jUp = iUp + k;
250+ const float dy = yDnProjUp - yzUp[2 * jUp];
251+ const float dz = zDnProjUp - yzUp[2 * jUp + 1 ];
252+ const float d = dy * dy + dz * dz;
253+ if (d < bestD) {
254+ bestD = d;
255+ linkDn = i;
256+ linkUp = MAX_SHARED + jUp;
257+ }
258+ }
265259 }
266260 }
267261 }
268- #endif
269262 }
270263 }
271264
272265 if (linkUp >= 0 ) {
273- # if MaxShared > 0 && MaxGlobal > 0
274- linkUp = (linkUp >= MaxShared ) ? neighUp[linkUp - MaxShared ] : s.mB [linkUp][iThread];
275- # elif MaxShared > 0
276- linkUp = s.mB [linkUp][iThread];
277- # else
278- linkUp = neighUp[linkUp];
279- # endif
266+ if constexpr (MAX_SHARED > 0 && MAX_GLOBAL > 0 ) {
267+ linkUp = (( uint32_t ) linkUp >= MAX_SHARED ) ? neighUp[linkUp - MAX_SHARED ] : s.mB [linkUp][iThread];
268+ } else if constexpr (MAX_SHARED > 0 ) {
269+ linkUp = s.mB [linkUp][iThread];
270+ } else {
271+ linkUp = neighUp[linkUp];
272+ }
280273 }
281274
282275 tracker.mData .mLinkUpData [lHitNumberOffset + ih] = linkUp;
0 commit comments