Skip to content

Commit 9bb2157

Browse files
committed
GPU: Add a fallback implementation for Vc.
1 parent 62b9999 commit 9bb2157

File tree

3 files changed

+198
-39
lines changed

3 files changed

+198
-39
lines changed

GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -53,13 +53,10 @@
5353
#include "CommonDataFormat/InteractionRecord.h"
5454
#endif
5555

56+
#include "utils/VcShim.h"
5657
#include "utils/strtag.h"
5758
#include <fstream>
5859

59-
#ifndef GPUCA_NO_VC
60-
#include <Vc/Vc>
61-
#endif
62-
6360
using namespace o2::gpu;
6461
using namespace o2::tpc;
6562
using namespace o2::tpc::constants;
@@ -173,7 +170,7 @@ std::pair<uint32_t, uint32_t> GPUChainTracking::TPCClusterizerDecodeZSCount(uint
173170
int32_t firstHBF = (mIOPtrs.settingsTF && mIOPtrs.settingsTF->hasTfStartOrbit) ? mIOPtrs.settingsTF->tfStartOrbit : ((mIOPtrs.tpcZS->sector[iSector].count[0] && mIOPtrs.tpcZS->sector[iSector].nZSPtr[0][0]) ? o2::raw::RDHUtils::getHeartBeatOrbit(*(const o2::header::RAWDataHeader*)mIOPtrs.tpcZS->sector[iSector].zsPtr[0][0]) : 0);
174171

175172
for (uint16_t j = 0; j < GPUTrackingInOutZS::NENDPOINTS; j++) {
176-
#ifndef GPUCA_NO_VC
173+
177174
if (GetProcessingSettings().prefetchTPCpageScan >= 3 && j < GPUTrackingInOutZS::NENDPOINTS - 1) {
178175
for (uint32_t k = 0; k < mIOPtrs.tpcZS->sector[iSector].count[j + 1]; k++) {
179176
for (uint32_t l = 0; l < mIOPtrs.tpcZS->sector[iSector].nZSPtr[j + 1][k]; l++) {
@@ -182,7 +179,6 @@ std::pair<uint32_t, uint32_t> GPUChainTracking::TPCClusterizerDecodeZSCount(uint
182179
}
183180
}
184181
}
185-
#endif
186182

187183
std::vector<std::pair<CfFragment, TPCCFDecodeScanTmp>> fragments;
188184
fragments.reserve(mCFContext->nFragments);
@@ -201,12 +197,12 @@ std::pair<uint32_t, uint32_t> GPUChainTracking::TPCClusterizerDecodeZSCount(uint
201197
}
202198
nPages += mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k];
203199
for (uint32_t l = 0; l < mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k]; l++) {
204-
#ifndef GPUCA_NO_VC
200+
205201
if (GetProcessingSettings().prefetchTPCpageScan >= 2 && l + 1 < mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k]) {
206202
Vc::Common::prefetchForOneRead(((const uint8_t*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k]) + (l + 1) * TPCZSHDR::TPC_ZS_PAGE_SIZE);
207203
Vc::Common::prefetchForOneRead(((const uint8_t*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k]) + (l + 1) * TPCZSHDR::TPC_ZS_PAGE_SIZE + sizeof(o2::header::RAWDataHeader));
208204
}
209-
#endif
205+
210206
const uint8_t* const page = ((const uint8_t*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k]) + l * TPCZSHDR::TPC_ZS_PAGE_SIZE;
211207
const o2::header::RAWDataHeader* rdh = (const o2::header::RAWDataHeader*)page;
212208
if (o2::raw::RDHUtils::getMemorySize(*rdh) == sizeof(o2::header::RAWDataHeader)) {
@@ -510,7 +506,7 @@ int32_t GPUChainTracking::RunTPCClusterizer_prepare(bool restorePointers)
510506
return 1;
511507
}
512508
}
513-
#ifndef GPUCA_NO_VC
509+
514510
if (GetProcessingSettings().prefetchTPCpageScan >= 1 && iSector < NSECTORS - 1) {
515511
for (uint32_t j = 0; j < GPUTrackingInOutZS::NENDPOINTS; j++) {
516512
for (uint32_t k = 0; k < mIOPtrs.tpcZS->sector[iSector].count[j]; k++) {
@@ -521,7 +517,7 @@ int32_t GPUChainTracking::RunTPCClusterizer_prepare(bool restorePointers)
521517
}
522518
}
523519
}
524-
#endif
520+
525521
const auto& x = TPCClusterizerDecodeZSCount(iSector, fragmentMax);
526522
nDigitsFragmentMax[iSector] = x.first;
527523
processors()->tpcClusterer[iSector].mPmemory->counters.nDigits = x.first;

GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.cxx

Lines changed: 1 addition & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,7 @@
1919
#include "clusterFinderDefs.h"
2020

2121
#ifndef GPUCA_GPUCODE
22-
#ifndef GPUCA_NO_VC
23-
#include <Vc/Vc>
24-
#else
25-
#include <array>
26-
#endif
22+
#include "utils/VcShim.h"
2723
#endif
2824

2925
using namespace o2::gpu;
@@ -80,20 +76,13 @@ GPUd() void GPUTPCCFCheckPadBaseline::Thread<0>(int32_t nBlocks, int32_t nThread
8076

8177
constexpr size_t ElemsInTileRow = (size_t)TilingLayout<GridSize<2>>::WidthInTiles * TimebinsPerCacheline * PadsPerCacheline;
8278

83-
#ifndef GPUCA_NO_VC
8479
using UShort8 = Vc::fixed_size_simd<uint16_t, PadsPerCacheline>;
8580
using Charge8 = Vc::fixed_size_simd<float, PadsPerCacheline>;
8681

8782
UShort8 totalCharges{Vc::Zero};
8883
UShort8 consecCharges{Vc::Zero};
8984
UShort8 maxConsecCharges{Vc::Zero};
9085
Charge8 maxCharge{Vc::Zero};
91-
#else
92-
std::array<uint16_t, PadsPerCacheline> totalCharges{0};
93-
std::array<uint16_t, PadsPerCacheline> consecCharges{0};
94-
std::array<uint16_t, PadsPerCacheline> maxConsecCharges{0};
95-
std::array<Charge, PadsPerCacheline> maxCharge{0};
96-
#endif
9786

9887
tpccf::TPCFragmentTime t = fragment.firstNonOverlapTimeBin();
9988

@@ -102,7 +91,6 @@ GPUd() void GPUTPCCFCheckPadBaseline::Thread<0>(int32_t nBlocks, int32_t nThread
10291

10392
for (; t < fragment.lastNonOverlapTimeBin(); t += TimebinsPerCacheline) {
10493
for (tpccf::TPCFragmentTime localtime = 0; localtime < TimebinsPerCacheline; localtime++) {
105-
#ifndef GPUCA_NO_VC
10694
const UShort8 packedCharges{packedChargeStart + PadsPerCacheline * localtime, Vc::Aligned};
10795
const UShort8::mask_type isCharge = packedCharges != 0;
10896

@@ -123,22 +111,6 @@ GPUd() void GPUTPCCFCheckPadBaseline::Thread<0>(int32_t nBlocks, int32_t nThread
123111
} else {
124112
consecCharges = 0;
125113
}
126-
#else // Vc not available
127-
for (tpccf::Pad localpad = 0; localpad < PadsPerCacheline; localpad++) {
128-
const uint16_t packedCharge = packedChargeStart[PadsPerCacheline * localtime + localpad];
129-
const bool isCharge = packedCharge != 0;
130-
if (isCharge) {
131-
totalCharges[localpad]++;
132-
consecCharges[localpad]++;
133-
maxConsecCharges[localpad] = CAMath::Max(maxConsecCharges[localpad], consecCharges[localpad]);
134-
135-
const Charge unpackedCharge = Charge(packedCharge) / Charge(1 << PackedCharge::DecimalBits);
136-
maxCharge[localpad] = CAMath::Max<Charge>(maxCharge[localpad], unpackedCharge);
137-
} else {
138-
consecCharges[localpad] = 0;
139-
}
140-
}
141-
#endif
142114
}
143115

144116
packedChargeStart += ElemsInTileRow;

GPU/GPUTracking/utils/VcShim.h

Lines changed: 191 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,191 @@
1+
// Copyright 2020-2025 CERN and copyright holders of ALICE O2. This software is
2+
// distributed under the terms of the GNU General Public License v3 (GPL
3+
// Version 3), copied verbatim in the file "COPYING".
4+
//
5+
// See http://alice-o2.web.cern.ch/license for full licensing information.
6+
//
7+
// In applying this license CERN does not waive the privileges and immunities
8+
// granted to it by virtue of its status as an Intergovernmental Organization
9+
// or submit itself to any jurisdiction.
10+
11+
/// \file VcShim.h
12+
/// \brief Provides a basic fallback implementation for Vc
13+
///
14+
/// \author Felix Weiglhofer
15+
16+
#ifndef GPU_UTILS_VCSHIM_H
17+
#define GPU_UTILS_VCSHIM_H
18+
19+
#ifndef GPUCA_NO_VC
20+
21+
#include <Vc/Vc>
22+
23+
#else
24+
25+
#include <algorithm>
26+
#include <array>
27+
#include <bitset>
28+
#include <cstddef>
29+
30+
/// Minimal scalar stand-in for the parts of the Vc API used by the GPU code.
/// Only compiled when GPUCA_NO_VC is defined; every "vector" is a plain
/// std::array processed lane by lane.
namespace Vc
{

/// Tag type selecting the zero-initialising constructor, used as `Vc::Zero`.
struct VectorSpecialInitializerZero {
};
inline constexpr VectorSpecialInitializerZero Zero{};

/// Tag type selecting the load-from-pointer constructor, used as `Vc::Aligned`.
/// The fallback performs a plain element-wise copy and ignores alignment.
struct AlignedTag {
};
inline constexpr AlignedTag Aligned{};

/// Grants mutable access to the storage backing a vector object.
/// \param v object exposing a `vector_type` alias and an `mData` member
/// \return reference to `v.mData`
template <typename T>
typename T::vector_type& internal_data(T& v)
{
  return v.mData;
}

/// Grants read-only access to the storage backing a vector object.
/// \param v object exposing a `vector_type` alias and an `mData` member
/// \return const reference to `v.mData`
template <typename T>
const typename T::vector_type& internal_data(const T& v)
{
  return v.mData;
}

namespace Common
{

/// Proxy returned by `vec(mask)`: forwards writes only to the lanes whose
/// mask entry is set.
/// Holds references to both the vector and the mask, so it must not outlive
/// either of them.
template <typename V, typename M>
class WriteMaskVector
{
 private:
  const M& mMask;
  V& mVec;

 public:
  using value_type = typename V::value_type;

  /// \param v vector to be modified through the proxy
  /// \param m mask selecting the lanes that may be written
  WriteMaskVector(V& v, const M& m) : mMask(m), mVec(v) {}

  /// Adds one to every selected lane; unselected lanes are left unchanged.
  /// \return the proxy itself
  WriteMaskVector& operator++(int)
  {
    for (size_t lane = 0; lane < mVec.size(); lane++) {
      // The mask entry converts to 0 or 1, so unselected lanes add zero.
      mVec[lane] += value_type(mMask[lane]);
    }
    return *this;
  }

  /// Assigns \p v to every selected lane; unselected lanes are left unchanged.
  /// \return the proxy itself
  WriteMaskVector& operator=(const value_type& v)
  {
    for (size_t lane = 0; lane < mVec.size(); lane++) {
      if (mMask[lane]) {
        mVec[lane] = v;
      }
    }
    return *this;
  }
};

// Prefetch hints are no-ops in the fallback; they only exist so that callers
// compile unchanged when Vc is not available.
inline void prefetchMid(const void*) {}
inline void prefetchFar(const void*) {}
inline void prefetchForOneRead(const void*) {}

} // namespace Common

/// Per-lane boolean mask with N entries, stored in a std::bitset.
/// The element type T is not used by the mask itself.
template <typename T, size_t N>
class fixed_size_simd_mask
{
 private:
  std::bitset<N> mData;

 public:
  /// \return true if at least one lane is set
  bool isNotEmpty() const { return mData.any(); }

  // `typename` is required for the dependent type name before C++20.
  /// Writable access to lane \p i.
  typename std::bitset<N>::reference operator[](size_t i) { return mData[i]; }
  /// Value of lane \p i.
  bool operator[](size_t i) const { return mData[i]; }

  /// \return a copy of the mask with every lane inverted
  fixed_size_simd_mask operator!() const
  {
    fixed_size_simd_mask inverted{*this};
    inverted.mData.flip();
    return inverted;
  }
};

/// Fixed-width vector of N elements of type T, stored in a std::array.
/// All operations are applied lane by lane.
template <typename T, size_t N>
class fixed_size_simd
{
 private:
  // Value-initialised so that a default-constructed vector is all zeros
  // instead of holding indeterminate values.
  std::array<T, N> mData{};

 public:
  using vector_type = std::array<T, N>;
  using value_type = T;
  using mask_type = fixed_size_simd_mask<T, N>;

  /// \return number of lanes (N)
  static constexpr size_t size() { return N; }

  /// Creates a vector with all lanes set to zero.
  fixed_size_simd() = default;
  /// Creates a vector with all lanes set to zero (`Vc::Zero` tag).
  explicit fixed_size_simd(VectorSpecialInitializerZero) {}

  /// Converts lane by lane from a vector with a different element type.
  template <typename U>
  fixed_size_simd(const fixed_size_simd<U, N>& w)
  {
    std::copy_n(internal_data(w).begin(), N, mData.begin());
  }

  /// Loads N consecutive elements starting at \p d.
  /// \param d pointer to at least N readable elements
  fixed_size_simd(const T* d, AlignedTag) { std::copy_n(d, N, mData.begin()); }

  /// Mutable access to lane \p i (no bounds check).
  T& operator[](size_t i) { return mData[i]; }
  /// Read-only access to lane \p i (no bounds check).
  const T& operator[](size_t i) const { return mData[i]; }

  /// Creates a write-masked proxy; writes through it only touch lanes set in \p m.
  /// The proxy references both `*this` and \p m.
  Common::WriteMaskVector<fixed_size_simd, mask_type> operator()(const mask_type& m) { return {*this, m}; }

  /// Broadcasts \p v to every lane.
  fixed_size_simd& operator=(const T& v)
  {
    mData.fill(v);
    return *this;
  }

  /// Adds \p v to every lane.
  fixed_size_simd& operator+=(const T& v)
  {
    for (auto& lane : mData) {
      lane += v;
    }
    return *this;
  }

  /// Divides every lane by \p v.
  fixed_size_simd& operator/=(const T& v)
  {
    for (auto& lane : mData) {
      lane /= v;
    }
    return *this;
  }

  /// \return a copy of this vector with every lane divided by \p v
  fixed_size_simd operator/(const T& v) const
  {
    fixed_size_simd quotient{*this};
    quotient /= v;
    return quotient;
  }

  /// \return mask whose lane i is set when lane i of this vector equals \p v
  mask_type operator==(const T& v) const
  {
    mask_type m;
    for (size_t i = 0; i < N; i++) {
      m[i] = mData[i] == v;
    }
    return m;
  }

  /// \return mask whose lane i is set when lane i of this vector differs from \p v
  mask_type operator!=(const T& v) const { return !(*this == v); }

  friend vector_type& internal_data<>(fixed_size_simd& x);
  friend const vector_type& internal_data<>(const fixed_size_simd& x);
};

/// Lane-wise maximum of two vectors.
/// \return vector whose lane i holds std::max(a[i], b[i])
template <typename V>
V max(const V& a, const V& b)
{
  V o;
  for (size_t i = 0; i < a.size(); i++) {
    o[i] = std::max(a[i], b[i]);
  }
  return o;
}

} // namespace Vc
188+
189+
#endif // ifndef GPUCA_NO_VC
190+
191+
#endif

0 commit comments

Comments
 (0)