Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion GPU/Common/GPUCommonMath.h
Original file line number Diff line number Diff line change
Expand Up @@ -399,7 +399,7 @@ GPUdi() T GPUCommonMath::MaxWithRef(T x, T y, T z, T w, S refX, S refY, S refZ,

GPUdi() float GPUCommonMath::InvSqrt(float _x)
{
#ifdef GPUCA_NO_FAST_MATH
#if defined(GPUCA_NO_FAST_MATH) || defined(__OPENCL__)
return 1.f / Sqrt(_x);
#elif defined(__CUDACC__) || defined(__HIPCC__)
return __frsqrt_rn(_x);
Expand Down
2 changes: 1 addition & 1 deletion GPU/GPUTracking/Base/GPUReconstructionCPU.h
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ class GPUReconstructionCPU : public GPUReconstructionKernels<GPUReconstructionCP
virtual void SynchronizeEvents(deviceEvent* evList, int32_t nEvents = 1) {}
virtual void StreamWaitForEvents(int32_t stream, deviceEvent* evList, int32_t nEvents = 1) {}
virtual bool IsEventDone(deviceEvent* evList, int32_t nEvents = 1) { return true; }
virtual void RecordMarker(deviceEvent ev, int32_t stream) {}
virtual void RecordMarker(deviceEvent* ev, int32_t stream) {}
virtual void SynchronizeGPU() {}
virtual void ReleaseEvent(deviceEvent ev) {}
virtual int32_t StartHelperThreads() { return 0; }
Expand Down
2 changes: 1 addition & 1 deletion GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu
Original file line number Diff line number Diff line change
Expand Up @@ -548,7 +548,7 @@ size_t GPUReconstructionCUDA::WriteToConstantMemory(size_t offset, const void* s
}

void GPUReconstructionCUDA::ReleaseEvent(deviceEvent ev) {}
void GPUReconstructionCUDA::RecordMarker(deviceEvent ev, int32_t stream) { GPUFailedMsg(cudaEventRecord(ev.get<cudaEvent_t>(), mInternals->Streams[stream])); }
void GPUReconstructionCUDA::RecordMarker(deviceEvent* ev, int32_t stream) { GPUFailedMsg(cudaEventRecord(ev->get<cudaEvent_t>(), mInternals->Streams[stream])); }

std::unique_ptr<GPUReconstruction::GPUThreadContext> GPUReconstructionCUDA::GetThreadContext()
{
Expand Down
2 changes: 1 addition & 1 deletion GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ class GPUReconstructionCUDA : public GPUReconstructionKernels<GPUReconstructionC
size_t WriteToConstantMemory(size_t offset, const void* src, size_t size, int32_t stream = -1, deviceEvent* ev = nullptr) override;
size_t GPUMemCpy(void* dst, const void* src, size_t size, int32_t stream, int32_t toGPU, deviceEvent* ev = nullptr, deviceEvent* evList = nullptr, int32_t nEvents = 1) override;
void ReleaseEvent(deviceEvent ev) override;
void RecordMarker(deviceEvent ev, int32_t stream) override;
void RecordMarker(deviceEvent* ev, int32_t stream) override;

void GetITSTraits(std::unique_ptr<o2::its::TrackerTraits>* trackerTraits, std::unique_ptr<o2::its::VertexerTraits>* vertexerTraits, std::unique_ptr<o2::its::TimeFrame>* timeFrame) override;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -359,6 +359,11 @@ int32_t GPUReconstructionOCL::InitDevice_Runtime()
mInternals = master->mInternals;
}

for (uint32_t i = 0; i < mEvents.size(); i++) {
cl_event* events = (cl_event*)mEvents[i].data();
new (events) cl_event[mEvents[i].size()];
}

return (0);
}

Expand Down Expand Up @@ -432,7 +437,7 @@ size_t GPUReconstructionOCL::WriteToConstantMemory(size_t offset, const void* sr

void GPUReconstructionOCL::ReleaseEvent(deviceEvent ev) { GPUFailedMsg(clReleaseEvent(ev.get<cl_event>())); }

void GPUReconstructionOCL::RecordMarker(deviceEvent ev, int32_t stream) { GPUFailedMsg(clEnqueueMarkerWithWaitList(mInternals->command_queue[stream], 0, nullptr, ev.getEventList<cl_event>())); }
void GPUReconstructionOCL::RecordMarker(deviceEvent* ev, int32_t stream) { GPUFailedMsg(clEnqueueMarkerWithWaitList(mInternals->command_queue[stream], 0, nullptr, ev->getEventList<cl_event>())); }

int32_t GPUReconstructionOCL::DoStuckProtection(int32_t stream, deviceEvent event)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ class GPUReconstructionOCL : public GPUReconstructionDeviceBase
size_t WriteToConstantMemory(size_t offset, const void* src, size_t size, int32_t stream = -1, deviceEvent* ev = nullptr) override;
size_t GPUMemCpy(void* dst, const void* src, size_t size, int32_t stream, int32_t toGPU, deviceEvent* ev = nullptr, deviceEvent* evList = nullptr, int32_t nEvents = 1) override;
void ReleaseEvent(deviceEvent ev) override;
void RecordMarker(deviceEvent ev, int32_t stream) override;
void RecordMarker(deviceEvent* ev, int32_t stream) override;

virtual int32_t GetOCLPrograms() = 0;
virtual bool CheckPlatform(uint32_t i) = 0;
Expand Down
8 changes: 6 additions & 2 deletions GPU/GPUTracking/Base/opencl2/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,11 @@ endif()
set(CL_SRC ${GPUDIR}/Base/opencl-common/GPUReconstructionOCL.cl)
set(CL_BIN ${CMAKE_CURRENT_BINARY_DIR}/GPUReconstructionOCL2Code)

set(OCL_FLAGS -ferror-limit=1000 -Dcl_clang_storage_class_specifiers -Wno-invalid-constexpr -Wno-unused-command-line-argument -cl-std=CLC++2021)
set(OCL_FLAGS -Dcl_clang_storage_class_specifiers -cl-std=CLC++2021)
if(NOT DEFINED GPUCA_NO_FAST_MATH OR NOT ${GPUCA_NO_FAST_MATH})
set(OCL_FLAGS ${OCL_FLAGS} -Xclang -fdenormal-fp-math-f32=ieee -cl-mad-enable -cl-no-signed-zeros)
set(OCL_FLAGS ${OCL_FLAGS} -cl-denorms-are-zero -cl-mad-enable -cl-no-signed-zeros -cl-fast-relaxed-math)
else()
set(OCL_FLAGS ${OCL_FLAGS} -cl-fp32-correctly-rounded-divide-sqrt)
endif()
set(OCL_DEFINECL "-D$<JOIN:$<TARGET_PROPERTY:O2::GPUTracking,COMPILE_DEFINITIONS>,$<SEMICOLON>-D>"
"-I$<JOIN:$<FILTER:$<TARGET_PROPERTY:O2::GPUTracking,INCLUDE_DIRECTORIES>,EXCLUDE,^/usr/include/?>,$<SEMICOLON>-I>"
Expand All @@ -47,6 +49,7 @@ if(OPENCL2_ENABLED_SPIRV) # BUILD OpenCL2 intermediate code for SPIR-V target
-O0
--target=spirv64
-fno-integrated-objemitter
-ferror-limit=1000 -Wno-invalid-constexpr -Wno-unused-command-line-argument
${OCL_FLAGS}
${OCL_DEFINECL}
-o ${CL_BIN}.spirv -c ${CL_SRC}
Expand All @@ -64,6 +67,7 @@ if(OPENCL2_ENABLED) # BUILD OpenCL2 source code for runtime compilation target
add_custom_command(
OUTPUT ${CL_BIN}.src
COMMAND ${LLVM_CLANG}
-Wno-unused-command-line-argument
${OCL_FLAGS}
${OCL_DEFINECL}
-cl-no-stdinc
Expand Down
6 changes: 3 additions & 3 deletions GPU/GPUTracking/Base/opencl2/GPUReconstructionOCL2.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -64,14 +64,14 @@ int32_t GPUReconstructionOCL2Backend::GetOCLPrograms()
const char* ocl_flags = GPUCA_M_STR(OCL_FLAGS);

#ifdef OPENCL2_ENABLED_SPIRV // clang-format off
if (ver >= 2.2f) {
GPUInfo("Reading OpenCL program from SPIR-V IL (Platform version %f)", ver);
if (ver >= 2.2f && !GetProcessingSettings().oclCompileFromSources) {
GPUInfo("Reading OpenCL program from SPIR-V IL (Platform version %4.2f)", ver);
mInternals->program = clCreateProgramWithIL(mInternals->context, _binary_GPUReconstructionOCL2Code_spirv_start, _binary_GPUReconstructionOCL2Code_spirv_len, &ocl_error);
ocl_flags = "";
} else
#endif // clang-format on
{
GPUInfo("Compiling OpenCL program from sources (Platform version %f, %s)", ver);
GPUInfo("Compiling OpenCL program from sources (Platform version %4.2f)", ver);
size_t program_sizes[1] = {_binary_GPUReconstructionOCL2Code_src_len};
char* programs_sources[1] = {_binary_GPUReconstructionOCL2Code_src_start};
mInternals->program = clCreateProgramWithSource(mInternals->context, (cl_uint)1, (const char**)&programs_sources, program_sizes, &ocl_error);
Expand Down
3 changes: 2 additions & 1 deletion GPU/GPUTracking/Definitions/GPUSettingsList.h
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,7 @@ AddHelp("help", 'h')
EndConfig()

BeginSubConfig(GPUSettingsProcessing, proc, configStandalone, "PROC", 0, "Processing settings", proc)
AddOption(platformNum, int32_t, -1, "", 0, "Platform to use, in case the backend provides multiple platforms (-1 = auto-select)")
AddOption(platformNum, int32_t, -1, "", 0, "Platform to use, in case the backend provides multiple platforms (OpenCL only, -1 = auto-select)")
AddOption(deviceNum, int32_t, -1, "gpuDevice", 0, "Set GPU device to use (-1: automatic, -2: for round-robin usage in timeslice-pipeline)")
AddOption(gpuDeviceOnly, bool, false, "", 0, "Use only GPU as device (i.e. no CPU for OpenCL)")
AddOption(globalInitMutex, bool, false, "", 0, "Use global mutex to synchronize initialization of multiple GPU instances")
Expand Down Expand Up @@ -291,6 +291,7 @@ AddOption(tpcApplyDebugClusterFilter, bool, false, "", 0, "Apply custom cluster
AddOption(RTCcacheFolder, std::string, "./rtccache/", "", 0, "Folder in which the cache file is stored")
AddOption(RTCprependCommand, std::string, "", "", 0, "Prepend RTC compilation commands by this string")
AddOption(RTCoverrideArchitecture, std::string, "", "", 0, "Override arhcitecture part of RTC compilation command line")
AddOption(oclCompileFromSources, bool, false, "", 0, "Compile OpenCL binary from included source code instead of using included spirv code")
AddOption(printSettings, bool, false, "", 0, "Print all settings when initializing")
AddVariable(eventDisplay, GPUCA_NAMESPACE::gpu::GPUDisplayFrontendInterface*, nullptr)
AddSubConfig(GPUSettingsProcessingRTC, rtc)
Expand Down
2 changes: 1 addition & 1 deletion GPU/GPUTracking/Global/GPUChain.h
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ class GPUChain
}
}
inline bool IsEventDone(deviceEvent* evList, int32_t nEvents = 1) { return mRec->IsEventDone(evList, nEvents); }
inline void RecordMarker(deviceEvent ev, int32_t stream) { mRec->RecordMarker(ev, stream); }
inline void RecordMarker(deviceEvent* ev, int32_t stream) { mRec->RecordMarker(ev, stream); }
virtual inline std::unique_ptr<GPUReconstruction::GPUThreadContext> GetThreadContext() { return mRec->GetThreadContext(); }
inline void SynchronizeGPU() { mRec->SynchronizeGPU(); }
inline void ReleaseEvent(deviceEvent ev, bool doGPU = true)
Expand Down
10 changes: 7 additions & 3 deletions GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -865,8 +865,12 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
}

if (fragment.index == 0) {
runKernel<GPUMemClean16>({GetGridAutoStep(lane, RecoStep::TPCClusterFinding), krnlRunRangeNone, {nullptr, transferRunning[lane] == 1 ? &mEvents->stream[lane] : nullptr}}, clustererShadow.mPclusterInRow, GPUCA_ROW_COUNT * sizeof(*clustererShadow.mPclusterInRow));
transferRunning[lane] = 2;
deviceEvent* waitEvent = nullptr;
if (transferRunning[lane] == 1) {
waitEvent = &mEvents->stream[lane];
transferRunning[lane] = 2;
}
runKernel<GPUMemClean16>({GetGridAutoStep(lane, RecoStep::TPCClusterFinding), krnlRunRangeNone, {nullptr, waitEvent}}, clustererShadow.mPclusterInRow, GPUCA_ROW_COUNT * sizeof(*clustererShadow.mPclusterInRow));
}

if (clusterer.mPmemory->counters.nClusters == 0) {
Expand Down Expand Up @@ -930,7 +934,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
if (transferRunning[lane]) {
ReleaseEvent(mEvents->stream[lane], doGPU);
}
RecordMarker(mEvents->stream[lane], mRec->NStreams() - 1);
RecordMarker(&mEvents->stream[lane], mRec->NStreams() - 1);
transferRunning[lane] = 1;
}

Expand Down
4 changes: 2 additions & 2 deletions GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ int32_t GPUChainTracking::RunTPCCompression()
GPUTPCCompression& CompressorShadow = doGPU ? processorsShadow()->tpcCompressor : Compressor;
const auto& threadContext = GetThreadContext();
if (mPipelineFinalizationCtx && GetProcessingSettings().doublePipelineClusterizer) {
RecordMarker(mEvents->single, 0);
RecordMarker(&mEvents->single, 0);
}

if (GetProcessingSettings().tpcCompressionGatherMode == 3) {
Expand Down Expand Up @@ -124,7 +124,7 @@ int32_t GPUChainTracking::RunTPCCompression()
return 1;
}
if (GetProcessingSettings().tpcCompressionGatherMode == 3) {
RecordMarker(mEvents->stream[outputStream], outputStream);
RecordMarker(&mEvents->stream[outputStream], outputStream);
char* deviceFlatPts = (char*)Compressor.mOutput->qTotU;
if (GetProcessingSettings().doublePipeline) {
const size_t blockSize = CAMath::nextMultipleOf<1024>(copySize / 30);
Expand Down
8 changes: 4 additions & 4 deletions GPU/GPUTracking/Global/GPUChainTrackingMerger.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ void GPUChainTracking::RunTPCTrackingMerger_MergeBorderTracks(int8_t withinSlice
uint32_t n = withinSlice == -1 ? NSLICES / 2 : NSLICES;
if (GetProcessingSettings().alternateBorderSort && (!mRec->IsGPU() || doGPUall)) {
TransferMemoryResourceLinkToHost(RecoStep::TPCMerging, Merger.MemoryResMemory(), 0, &mEvents->init);
RecordMarker(mEvents->single, 0);
RecordMarker(&mEvents->single, 0);
for (uint32_t i = 0; i < n; i++) {
int32_t stream = i % mRec->NStreams();
runKernel<GPUTPCGMMergerMergeBorders, 0>({GetGridAuto(stream, deviceType), krnlRunRangeNone, {nullptr, stream && i < (uint32_t)mRec->NStreams() ? &mEvents->single : nullptr}}, i, withinSlice, mergeMode);
Expand All @@ -55,7 +55,7 @@ void GPUChainTracking::RunTPCTrackingMerger_MergeBorderTracks(int8_t withinSlice
if (i == n - 1) { // Synchronize all execution on stream 0 with the last kernel
ne = std::min<int32_t>(n, mRec->NStreams());
for (int32_t j = 1; j < ne; j++) {
RecordMarker(mEvents->slice[j], j);
RecordMarker(&mEvents->slice[j], j);
}
e = &mEvents->slice[1];
ne--;
Expand Down Expand Up @@ -251,7 +251,7 @@ int32_t GPUChainTracking::RunTPCTrackingMerger(bool synchronizeOutput)
DoDebugAndDump(RecoStep::TPCMerging, 2048, doGPUall, Merger, &GPUTPCGMMerger::DumpFinal, *mDebugFile);

if (doGPUall) {
RecordMarker(mEvents->single, 0);
RecordMarker(&mEvents->single, 0);
auto* waitEvent = &mEvents->single;
if (GetProcessingSettings().keepDisplayMemory || GetProcessingSettings().createO2Output <= 1 || mFractionalQAEnabled) {
if (!(GetProcessingSettings().keepDisplayMemory || GetProcessingSettings().createO2Output <= 1)) {
Expand Down Expand Up @@ -317,7 +317,7 @@ int32_t GPUChainTracking::RunTPCTrackingMerger(bool synchronizeOutput)
TransferMemoryResourcesToHost(RecoStep::TPCMerging, &Merger, -1, true);
runKernel<GPUTPCGMO2Output, GPUTPCGMO2Output::mc>(GetGridAuto(0, GPUReconstruction::krnlDeviceType::CPU));
} else if (doGPUall) {
RecordMarker(mEvents->single, 0);
RecordMarker(&mEvents->single, 0);
TransferMemoryResourceLinkToHost(RecoStep::TPCMerging, Merger.MemoryResOutputO2(), outputStream, nullptr, &mEvents->single);
TransferMemoryResourceLinkToHost(RecoStep::TPCMerging, Merger.MemoryResOutputO2Clus(), outputStream);
ReleaseEvent(mEvents->single);
Expand Down
10 changes: 6 additions & 4 deletions GPU/GPUTracking/Global/GPUChainTrackingSliceTracker.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -164,9 +164,11 @@ int32_t GPUChainTracking::RunTPCTrackingSlices_internal()
TransferMemoryResourceLinkToGPU(RecoStep::TPCSliceTracking, mInputsHost->mResourceOccupancyMap, streamOccMap, &mEvents->init);
}
}
uint32_t& occupancyTotal = *mInputsHost->mTPCClusterOccupancyMap;
occupancyTotal = CAMath::Float2UIntRn(mRec->MemoryScalers()->nTPCHits / (mIOPtrs.settingsTF && mIOPtrs.settingsTF->hasNHBFPerTF ? mIOPtrs.settingsTF->nHBFPerTF : 128));
mRec->UpdateParamOccupancyMap(param().rec.tpc.occupancyMapTimeBins ? mInputsHost->mTPCClusterOccupancyMap + 2 : nullptr, param().rec.tpc.occupancyMapTimeBins ? mInputsShadow->mTPCClusterOccupancyMap + 2 : nullptr, occupancyTotal, streamOccMap);
if (param().rec.tpc.occupancyMapTimeBins || param().rec.tpc.sysClusErrorC12Norm) {
uint32_t& occupancyTotal = *mInputsHost->mTPCClusterOccupancyMap;
occupancyTotal = CAMath::Float2UIntRn(mRec->MemoryScalers()->nTPCHits / (mIOPtrs.settingsTF && mIOPtrs.settingsTF->hasNHBFPerTF ? mIOPtrs.settingsTF->nHBFPerTF : 128));
mRec->UpdateParamOccupancyMap(param().rec.tpc.occupancyMapTimeBins ? mInputsHost->mTPCClusterOccupancyMap + 2 : nullptr, param().rec.tpc.occupancyMapTimeBins ? mInputsShadow->mTPCClusterOccupancyMap + 2 : nullptr, occupancyTotal, streamOccMap);
}

int32_t streamMap[NSLICES];

Expand Down Expand Up @@ -305,7 +307,7 @@ int32_t GPUChainTracking::RunTPCTrackingSlices_internal()
SynchronizeGPU();
} else {
for (int32_t i = 0; i < mRec->NStreams(); i++) {
RecordMarker(mEvents->stream[i], i);
RecordMarker(&mEvents->stream[i], i);
}
runKernel<GPUTPCTrackletConstructor, 1>({GetGridAuto(0), krnlRunRangeNone, {&mEvents->single, mEvents->stream, mRec->NStreams()}});
for (int32_t i = 0; i < mRec->NStreams(); i++) {
Expand Down
Loading