Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 20 additions & 12 deletions GPU/Common/GPUCommonAlgorithm.h
Original file line number Diff line number Diff line change
Expand Up @@ -283,21 +283,29 @@ GPUdi() void GPUCommonAlgorithm::sortInBlock(T* begin, T* end, const S& comp)
#ifndef GPUCA_GPUCODE
GPUCommonAlgorithm::sort(begin, end, comp);
#else
int32_t n = end - begin;
for (int32_t i = 0; i < n; i++) {
for (int32_t tIdx = get_local_id(0); tIdx < n; tIdx += get_local_size(0)) {
int32_t offset = i % 2;
int32_t curPos = 2 * tIdx + offset;
int32_t nextPos = curPos + 1;

if (nextPos < n) {
if (!comp(begin[curPos], begin[nextPos])) {
IterSwap(&begin[curPos], &begin[nextPos]);
GPUCA_DETERMINISTIC_CODE( // clang-format off
GPUbarrier();
if (get_local_id(0) == 0) {
GPUCommonAlgorithm::sort(begin, end, comp);
}
GPUbarrier();
, // !GPUCA_DETERMINISTIC_CODE
int32_t n = end - begin;
for (int32_t i = 0; i < n; i++) {
for (int32_t tIdx = get_local_id(0); tIdx < n; tIdx += get_local_size(0)) {
int32_t offset = i % 2;
int32_t curPos = 2 * tIdx + offset;
int32_t nextPos = curPos + 1;

if (nextPos < n) {
if (!comp(begin[curPos], begin[nextPos])) {
IterSwap(&begin[curPos], &begin[nextPos]);
}
}
}
GPUbarrier();
}
GPUbarrier();
}
) // clang-format on
#endif
}

Expand Down
26 changes: 14 additions & 12 deletions GPU/GPUTracking/DataCompression/GPUTPCCompression.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ void GPUTPCCompression::DumpCompressedClusters(std::ostream& out)
for (uint32_t i = 0; i < NSECTORS; i++) {
out << "Sector " << i << ": ";
for (uint32_t j = 0; j < GPUCA_ROW_COUNT; j++) {
out << O.nSliceRowClusters[i * GPUCA_ROW_COUNT + j] << ", ";
out << (O.nSliceRowClusters ? O.nSliceRowClusters[i * GPUCA_ROW_COUNT + j] : 0) << ", ";
}
out << "\n";
}
Expand All @@ -153,18 +153,20 @@ void GPUTPCCompression::DumpCompressedClusters(std::ostream& out)
}
out << "\n\nUnattached Clusters\n";
uint32_t offset = 0;
for (uint32_t i = 0; i < NSECTORS; i++) {
for (uint32_t j = 0; j < GPUCA_ROW_COUNT; j++) {
out << "Sector " << i << " Row " << j << ": ";
for (uint32_t k = 0; k < O.nSliceRowClusters[i * GPUCA_ROW_COUNT + j]; k++) {
if (k && k % 10 == 0) {
out << "\n ";
if (O.nSliceRowClusters) {
for (uint32_t i = 0; i < NSECTORS; i++) {
for (uint32_t j = 0; j < GPUCA_ROW_COUNT; j++) {
out << "Sector " << i << " Row " << j << ": ";
for (uint32_t k = 0; k < O.nSliceRowClusters[i * GPUCA_ROW_COUNT + j]; k++) {
if (k && k % 10 == 0) {
out << "\n ";
}
const uint32_t l = k + offset;
out << "[" << (uint32_t)O.qTotU[l] << ", " << (uint32_t)O.qMaxU[l] << ", " << (uint32_t)O.flagsU[l] << ", " << (int32_t)O.padDiffU[l] << ", " << (int32_t)O.timeDiffU[l] << ", " << (uint32_t)O.sigmaPadU[l] << ", " << (uint32_t)O.sigmaTimeU[l] << "] ";
}
const uint32_t l = k + offset;
out << "[" << (uint32_t)O.qTotU[l] << ", " << (uint32_t)O.qMaxU[l] << ", " << (uint32_t)O.flagsU[l] << ", " << (int32_t)O.padDiffU[l] << ", " << (int32_t)O.timeDiffU[l] << ", " << (uint32_t)O.sigmaPadU[l] << ", " << (uint32_t)O.sigmaTimeU[l] << "] ";
offset += O.nSliceRowClusters[i * GPUCA_ROW_COUNT + j];
out << "\n";
}
offset += O.nSliceRowClusters[i * GPUCA_ROW_COUNT + j];
out << "\n";
}
}
out << "\n\nAttached Clusters\n";
Expand All @@ -175,7 +177,7 @@ void GPUTPCCompression::DumpCompressedClusters(std::ostream& out)
if (k && k % 10 == 0) {
out << "\n ";
}
const uint32_t l1 = k + offset, l2 = k + offset - i;
const uint32_t l1 = offset + k, l2 = offset - i + k - 1;
out << "[";
if (k) {
out << (int32_t)O.rowDiffA[l2] << ", " << (int32_t)O.sliceLegDiffA[l2] << ", " << (uint32_t)O.padResA[l2] << ", " << (uint32_t)O.timeResA[l2] << ", ";
Expand Down
40 changes: 28 additions & 12 deletions GPU/GPUTracking/DataCompression/GPUTPCCompressionKernels.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -148,19 +148,19 @@ GPUdii() void GPUTPCCompressionKernels::Thread<GPUTPCCompressionKernels::step0at
}

template <>
GPUd() bool GPUTPCCompressionKernels::GPUTPCCompressionKernels_Compare<0>::operator()(uint32_t a, uint32_t b) const
GPUd() bool GPUTPCCompressionKernels::GPUTPCCompressionKernels_Compare<GPUSettings::SortTime>::operator()(uint32_t a, uint32_t b) const
{
return mClsPtr[a].getTimePacked() < mClsPtr[b].getTimePacked();
}

template <>
GPUd() bool GPUTPCCompressionKernels::GPUTPCCompressionKernels_Compare<1>::operator()(uint32_t a, uint32_t b) const
GPUd() bool GPUTPCCompressionKernels::GPUTPCCompressionKernels_Compare<GPUSettings::SortPad>::operator()(uint32_t a, uint32_t b) const
{
return mClsPtr[a].padPacked < mClsPtr[b].padPacked;
}

template <>
GPUd() bool GPUTPCCompressionKernels::GPUTPCCompressionKernels_Compare<2>::operator()(uint32_t a, uint32_t b) const
GPUd() bool GPUTPCCompressionKernels::GPUTPCCompressionKernels_Compare<GPUSettings::SortZTimePad>::operator()(uint32_t a, uint32_t b) const
{
if (mClsPtr[a].getTimePacked() >> 3 == mClsPtr[b].getTimePacked() >> 3) {
return mClsPtr[a].padPacked < mClsPtr[b].padPacked;
Expand All @@ -169,14 +169,26 @@ GPUd() bool GPUTPCCompressionKernels::GPUTPCCompressionKernels_Compare<2>::opera
}

template <>
GPUd() bool GPUTPCCompressionKernels::GPUTPCCompressionKernels_Compare<3>::operator()(uint32_t a, uint32_t b) const
GPUd() bool GPUTPCCompressionKernels::GPUTPCCompressionKernels_Compare<GPUSettings::SortZPadTime>::operator()(uint32_t a, uint32_t b) const
{
if (mClsPtr[a].padPacked >> 3 == mClsPtr[b].padPacked >> 3) {
return mClsPtr[a].getTimePacked() < mClsPtr[b].getTimePacked();
}
return mClsPtr[a].padPacked < mClsPtr[b].padPacked;
}

template <> // Deterministic comparison
GPUd() bool GPUTPCCompressionKernels::GPUTPCCompressionKernels_Compare<4>::operator()(uint32_t a, uint32_t b) const
{
if (mClsPtr[a].getTimePacked() != mClsPtr[b].getTimePacked()) {
return mClsPtr[a].getTimePacked() < mClsPtr[b].getTimePacked();
}
if (mClsPtr[a].padPacked != mClsPtr[b].padPacked) {
return mClsPtr[a].padPacked < mClsPtr[b].padPacked;
}
return mClsPtr[a].qTot < mClsPtr[b].qTot;
}

template <>
GPUdii() void GPUTPCCompressionKernels::Thread<GPUTPCCompressionKernels::step1unattached>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() processors)
{
Expand Down Expand Up @@ -261,15 +273,19 @@ GPUdii() void GPUTPCCompressionKernels::Thread<GPUTPCCompressionKernels::step1un
#ifdef GPUCA_GPUCODE
static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCompressionKernels_step1unattached) * 2 <= GPUCA_TPC_COMP_CHUNK_SIZE);
#endif
if (param.rec.tpc.compressionSortOrder == GPUSettings::SortZPadTime) {
GPUCA_DETERMINISTIC_CODE( // clang-format off
CAAlgo::sortInBlock(sortBuffer, sortBuffer + count, GPUTPCCompressionKernels_Compare<GPUSettings::SortZPadTime>(clusters->clusters[iSector][iRow]));
} else if (param.rec.tpc.compressionSortOrder == GPUSettings::SortZTimePad) {
CAAlgo::sortInBlock(sortBuffer, sortBuffer + count, GPUTPCCompressionKernels_Compare<GPUSettings::SortZTimePad>(clusters->clusters[iSector][iRow]));
} else if (param.rec.tpc.compressionSortOrder == GPUSettings::SortPad) {
CAAlgo::sortInBlock(sortBuffer, sortBuffer + count, GPUTPCCompressionKernels_Compare<GPUSettings::SortPad>(clusters->clusters[iSector][iRow]));
} else if (param.rec.tpc.compressionSortOrder == GPUSettings::SortTime) {
CAAlgo::sortInBlock(sortBuffer, sortBuffer + count, GPUTPCCompressionKernels_Compare<GPUSettings::SortTime>(clusters->clusters[iSector][iRow]));
}
, // !GPUCA_DETERMINISTIC_CODE
if (param.rec.tpc.compressionSortOrder == GPUSettings::SortZPadTime) {
CAAlgo::sortInBlock(sortBuffer, sortBuffer + count, GPUTPCCompressionKernels_Compare<GPUSettings::SortZPadTime>(clusters->clusters[iSector][iRow]));
} else if (param.rec.tpc.compressionSortOrder == GPUSettings::SortZTimePad) {
CAAlgo::sortInBlock(sortBuffer, sortBuffer + count, GPUTPCCompressionKernels_Compare<GPUSettings::SortZTimePad>(clusters->clusters[iSector][iRow]));
} else if (param.rec.tpc.compressionSortOrder == GPUSettings::SortPad) {
CAAlgo::sortInBlock(sortBuffer, sortBuffer + count, GPUTPCCompressionKernels_Compare<GPUSettings::SortPad>(clusters->clusters[iSector][iRow]));
} else if (param.rec.tpc.compressionSortOrder == GPUSettings::SortTime) {
CAAlgo::sortInBlock(sortBuffer, sortBuffer + count, GPUTPCCompressionKernels_Compare<GPUSettings::SortTime>(clusters->clusters[iSector][iRow]));
}
) // clang-format on
GPUbarrier();
}

Expand Down
1 change: 1 addition & 0 deletions GPU/GPUTracking/Definitions/GPUSettingsList.h
Original file line number Diff line number Diff line change
Expand Up @@ -296,6 +296,7 @@ AddOption(trdTrackModelO2, bool, false, "", 0, "Use O2 track model instead of GP
AddOption(debugLevel, int32_t, -1, "debug", 'd', "Set debug level (-2 = silent, -1 = autoselect (-2 for O2, 0 for standalone))")
AddOption(allocDebugLevel, int32_t, 0, "allocDebug", 0, "Some debug output for memory allocations (without messing with normal debug level)")
AddOption(debugMask, uint32_t, 262143, "", 0, "Mask for debug output dumps to file")
AddOption(debugLogSuffix, std::string, "", "debugSuffix", 0, "Suffix for debug log files with --debug 6")
AddOption(serializeGPU, int8_t, 0, "", 0, "Synchronize after each kernel call (bit 1) and DMA transfer (bit 2) and identify failures")
AddOption(recoTaskTiming, bool, 0, "", 0, "Perform summary timing after whole reconstruction tasks")
AddOption(deterministicGPUReconstruction, int32_t, -1, "", 0, "Make CPU and GPU debug output comparable (sort / skip concurrent parts), -1 = automatic if debugLevel >= 6", def(1))
Expand Down
6 changes: 3 additions & 3 deletions GPU/GPUTracking/Global/GPUChainTracking.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -378,7 +378,7 @@ int32_t GPUChainTracking::Init()
}

if (GetProcessingSettings().debugLevel >= 6) {
std::string filename = std::string(mRec->IsGPU() ? "GPU" : "CPU") + (mRec->slaveId() != -1 ? (std::string("_slave") + std::to_string(mRec->slaveId())) : std::string(mRec->slavesExist() ? "_master" : "")) + ".out";
std::string filename = std::string(mRec->IsGPU() ? "GPU" : "CPU") + (mRec->slaveId() != -1 ? (std::string("_slave") + std::to_string(mRec->slaveId())) : std::string(mRec->slavesExist() ? "_master" : "")) + GetProcessingSettings().debugLogSuffix + ".out";
mDebugFile->open(filename.c_str());
}

Expand Down Expand Up @@ -838,7 +838,7 @@ int32_t GPUChainTracking::RunChainFinalize()

int32_t iKey;
do {
Sleep(10);
usleep(10000);
if (GetProcessingSettings().eventDisplay->EnableSendKey()) {
iKey = kbhit() ? getch() : 0;
if (iKey == 27) {
Expand All @@ -847,7 +847,7 @@ int32_t GPUChainTracking::RunChainFinalize()
break;
} else if (iKey) {
while (GetProcessingSettings().eventDisplay->getSendKey() != 0) {
Sleep(1);
usleep(1000);
}
GetProcessingSettings().eventDisplay->setSendKey(iKey);
}
Expand Down
1 change: 1 addition & 0 deletions GPU/GPUTracking/Global/GPUChainTracking.h
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,7 @@ class GPUChainTracking : public GPUChain
void PrintDebugOutput();
void PrintOutputStat();
static void DumpClusters(std::ostream& out, const o2::tpc::ClusterNativeAccess* clusters);
static void DebugSortCompressedClusters(o2::tpc::CompressedClustersFlat* cls);

bool ValidateSteps();
bool ValidateSettings();
Expand Down
4 changes: 4 additions & 0 deletions GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,10 @@ int32_t GPUChainTracking::RunTPCCompression()
((GPUChainTracking*)GetNextChainInQueue())->mRec->BlockStackedMemory(mRec);
}
mRec->PopNonPersistentMemory(RecoStep::TPCCompression, qStr2Tag("TPCCOMPR"));
if (GetProcessingSettings().deterministicGPUReconstruction) {
SynchronizeGPU();
DebugSortCompressedClusters(Compressor.mOutputFlat);
}
DoDebugAndDump(RecoStep::TPCCompression, GPUChainTrackingDebugFlags::TPCCompressedClusters, Compressor, &GPUTPCCompression::DumpCompressedClusters, *mDebugFile);
return 0;
}
Expand Down
42 changes: 42 additions & 0 deletions GPU/GPUTracking/Global/GPUChainTrackingDebugAndProfiling.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include <map>
#include <memory>
#include <string>
#include <numeric>

#ifdef GPUCA_TRACKLET_CONSTRUCTOR_DO_PROFILE
#include "bitmapfile.h"
Expand Down Expand Up @@ -348,3 +349,44 @@ void GPUChainTracking::DumpClusters(std::ostream& out, const o2::tpc::ClusterNat
}
}
}

void GPUChainTracking::DebugSortCompressedClusters(o2::tpc::CompressedClustersFlat* cls)
{
o2::tpc::CompressedClusters c = *cls;
std::vector<uint32_t> sorted(c.nTracks), offsets(c.nTracks);
std::iota(sorted.begin(), sorted.end(), 0);
auto sorter = [&c](const auto a, const auto b) {
return std::tie(c.sliceA[a], c.rowA[a], c.timeA[a], c.padA[a], c.qPtA[a]) <
std::tie(c.sliceA[b], c.rowA[b], c.timeA[b], c.padA[b], c.qPtA[b]);
};
std::sort(sorted.begin(), sorted.end(), sorter);
uint32_t offset = 0;
for (uint32_t i = 0; i < c.nTracks; i++) {
offsets[i] = offset;
offset += c.nTrackClusters[i];
}

auto sortArray = [&c, &sorted, &offsets](auto* src, size_t totalSize, auto getOffset, auto getSize) {
auto buf = std::make_unique<std::remove_reference_t<decltype(src[0])>[]>(totalSize);
memcpy(buf.get(), src, totalSize * sizeof(*src));
uint32_t targetOffset = 0;
for (uint32_t i = 0; i < c.nTracks; i++) {
const uint32_t j = sorted[i];
memcpy(src + targetOffset, buf.get() + getOffset(offsets[j], j), getSize(j) * sizeof(*src));
targetOffset += getSize(j);
}
};
auto sortMultiple = [&sortArray](size_t totalSize, auto getOffset, auto getSize, auto&&... arrays) {
(..., sortArray(std::forward<decltype(arrays)>(arrays), totalSize, getOffset, getSize));
};
auto getFullOffset = [](uint32_t off, uint32_t ind) { return off; };
auto getReducedOffset = [](uint32_t off, uint32_t ind) { return off - ind; };
auto getIndex = [](uint32_t off, uint32_t ind) { return ind; };
auto getN = [&c](uint32_t j) { return c.nTrackClusters[j]; };
auto getN1 = [&c](uint32_t j) { return c.nTrackClusters[j] - 1; };
auto get1 = [](uint32_t j) { return 1; };

sortMultiple(c.nAttachedClusters, getFullOffset, getN, c.qTotA, c.qMaxA, c.flagsA, c.sigmaPadA, c.sigmaTimeA);
sortMultiple(c.nAttachedClustersReduced, getReducedOffset, getN1, c.rowDiffA, c.sliceLegDiffA, c.padResA, c.timeResA);
sortMultiple(c.nTracks, getIndex, get1, c.qPtA, c.rowA, c.sliceA, c.timeA, c.padA, c.nTrackClusters); // NOTE: This must be last, since nTrackClusters is used for handling the arrays above!
}
2 changes: 0 additions & 2 deletions GPU/GPUTracking/Global/GPUChainTrackingIO.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,6 @@
#include "TPCFastTransform.h"
#include "CorrectionMapsHelper.h"

#include "utils/linux_helpers.h"

using namespace o2::gpu;

#include "GPUO2DataTypes.h"
Expand Down