Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Common/ML/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
o2_add_library(ML
SOURCES src/OrtInterface.cxx
TARGETVARNAME targetName
PRIVATE_LINK_LIBRARIES O2::Framework onnxruntime::onnxruntime)
PRIVATE_LINK_LIBRARIES O2::GPUCommon onnxruntime::onnxruntime)

# Pass ORT variables as a preprocessor definition
target_compile_definitions(${targetName} PRIVATE
Expand Down
10 changes: 5 additions & 5 deletions Common/ML/include/ML/3rdparty/GPUORTFloat16.h
Original file line number Diff line number Diff line change
Expand Up @@ -535,22 +535,22 @@ GPUdi() uint16_t BFloat16Impl<Derived>::ToUint16Impl(float v) noexcept
result = kPositiveQNaNBits;
} else {
auto get_msb_half = [](float fl) {
uint16_t result;
uint16_t res;
#ifdef GPUCA_GPUCODE
o2::gpu::CAMath::memcpy(&result, reinterpret_cast<char*>(&fl) + sizeof(uint16_t), sizeof(uint16_t));
o2::gpu::CAMath::memcpy(&res, reinterpret_cast<char*>(&fl) + sizeof(uint16_t), sizeof(uint16_t));
#else
#ifdef __cpp_if_constexpr
if constexpr (detail::endian::native == detail::endian::little)
#else
if (detail::endian::native == detail::endian::little)
#endif
{
std::memcpy(&result, reinterpret_cast<char*>(&fl) + sizeof(uint16_t), sizeof(uint16_t));
std::memcpy(&res, reinterpret_cast<char*>(&fl) + sizeof(uint16_t), sizeof(uint16_t));
} else {
std::memcpy(&result, &fl, sizeof(uint16_t));
std::memcpy(&res, &fl, sizeof(uint16_t));
}
#endif
return result;
return res;
};

uint16_t upper_bits = get_msb_half(v);
Expand Down
3 changes: 2 additions & 1 deletion Common/ML/include/ML/OrtInterface.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,10 @@
#include <memory>
#include <map>
#include <thread>
#include <unordered_map>

// O2 includes
#include "Framework/Logger.h"
#include "GPUCommonLogger.h"

namespace Ort
{
Expand Down
4 changes: 2 additions & 2 deletions Common/ML/src/OrtInterface.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
// ONNX includes
#include <onnxruntime_cxx_api.h>

#include <sstream>

namespace o2
{

Expand Down Expand Up @@ -139,7 +141,6 @@ void OrtModel::initSession()

void OrtModel::memoryOnDevice(int32_t deviceIndex)
{
#if (defined(ORT_ROCM_BUILD) || defined(ORT_MIGRAPHX_BUILD) || defined(ORT_CUDA_BUILD) || defined(ORT_TENSORRT_BUILD))
if (deviceIndex >= 0) {
(pImplOrt->runOptions).AddConfigEntry("disable_synchronize_execution_providers", "1");
(pImplOrt->sessionOptions).AddConfigEntry("session.use_device_allocator_for_initializers", "1"); // See kOrtSessionOptionsUseDeviceAllocatorForInitializers, https://github.com/microsoft/onnxruntime/blob/main/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
Expand All @@ -161,7 +162,6 @@ void OrtModel::memoryOnDevice(int32_t deviceIndex)
LOG(info) << "(ORT) Memory info set to on-device memory for device type " << deviceType << " with ID " << deviceIndex << " and pImplOrt pointer " << pImplOrt;
}
}
#endif
}

void OrtModel::resetSession()
Expand Down
12 changes: 6 additions & 6 deletions GPU/GPUTracking/Base/cuda/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -121,12 +121,6 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2")
${CMAKE_SOURCE_DIR}/DataFormats/Reconstruction/src
${CMAKE_CURRENT_SOURCE_DIR}
TARGETVARNAME targetName)

target_compile_definitions(${targetName} PRIVATE
GPUCA_HAS_ONNX=1
$<$<BOOL:${ORT_CUDA_BUILD}>:ORT_CUDA_BUILD>
$<$<BOOL:${ORT_TENSORRT_BUILD}>:ORT_TENSORRT_BUILD>)

install(FILES ${HDRS} DESTINATION include/GPU)
endif()

Expand All @@ -141,6 +135,12 @@ endif()

target_compile_definitions(${targetName} PRIVATE $<TARGET_PROPERTY:O2::GPUTracking,COMPILE_DEFINITIONS>)

if (onnxruntime_FOUND)
target_compile_definitions(${targetName} PRIVATE
$<$<BOOL:${ORT_CUDA_BUILD}>:ORT_CUDA_BUILD>
$<$<BOOL:${ORT_TENSORRT_BUILD}>:ORT_TENSORRT_BUILD>)
endif()

# Setting target architecture and adding GPU libraries
target_link_libraries(${targetName} PRIVATE cuda cudart nvrtc)
set_target_cuda_arch(${targetName})
Expand Down
18 changes: 14 additions & 4 deletions GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu
Original file line number Diff line number Diff line change
Expand Up @@ -621,24 +621,34 @@ void GPUReconstructionCUDA::loadKernelModules(bool perKernel)
}
}

// Evaluates an ONNX Runtime C-API call and, if it returns a non-null OrtStatus*,
// fetches the error message and reports it through GPUFatal.
// Requires a pointer named `api` providing GetErrorMessage() to be in scope at the call site.
// The body is wrapped in do { } while (0) so that `ORTCHK(x);` forms exactly one statement
// and can be used safely inside unbraced if/else. The status variable uses a macro-private
// name so it cannot shadow an identifier referenced inside `command`.
#define ORTCHK(command)                                      \
  do {                                                       \
    OrtStatus* ortchkStatus_ = (command);                    \
    if (ortchkStatus_ != nullptr) {                          \
      const char* msg = api->GetErrorMessage(ortchkStatus_); \
      GPUFatal("ONNXRuntime Error: %s", msg);                \
    }                                                        \
  } while (0)

void GPUReconstructionCUDA::SetONNXGPUStream(Ort::SessionOptions& session_options, int32_t stream, int32_t* deviceId)
{
GPUChkErr(cudaGetDevice(deviceId));
#if !defined(__HIPCC__) && defined(ORT_CUDA_BUILD)
const OrtApi* api = OrtGetApiBase()->GetApi(ORT_API_VERSION);
OrtCUDAProviderOptionsV2* cuda_options = nullptr;
CreateCUDAProviderOptions(&cuda_options);
ORTCHK(api->CreateCUDAProviderOptions(&cuda_options));

// std::vector<const char*> keys{"device_id", "gpu_mem_limit", "arena_extend_strategy", "cudnn_conv_algo_search", "do_copy_in_default_stream", "cudnn_conv_use_max_workspace", "cudnn_conv1d_pad_to_nc1d"};
// std::vector<const char*> values{"0", "2147483648", "kSameAsRequested", "DEFAULT", "1", "1", "1"};
// UpdateCUDAProviderOptions(cuda_options, keys.data(), values.data(), keys.size());

// this implicitly sets "has_user_compute_stream"
cuda_options.has_user_compute_stream = 1;
UpdateCUDAProviderOptionsWithValue(cuda_options, "user_compute_stream", mInternals->Streams[stream]);
cuda_options->has_user_compute_stream = 1;
ORTCHK(api->UpdateCUDAProviderOptionsWithValue(cuda_options, "user_compute_stream", mInternals->Streams[stream]));
session_options.AppendExecutionProvider_CUDA_V2(cuda_options);

// Finally, don't forget to release the provider options
ReleaseCUDAProviderOptions(cuda_options);
api->ReleaseCUDAProviderOptions(cuda_options);
#elif defined(ORT_ROCM_BUILD)
// const auto& api = Ort::GetApi();
// api.GetCurrentGpuDeviceId(deviceId);
Expand Down
11 changes: 6 additions & 5 deletions GPU/GPUTracking/Base/hip/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -170,11 +170,6 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2")
${GPUCA_HIP_SOURCE_DIR}
TARGETVARNAME targetName)

target_compile_definitions(${targetName} PRIVATE
GPUCA_HAS_ONNX=1
$<$<BOOL:${ORT_ROCM_BUILD}>:ORT_ROCM_BUILD>
$<$<BOOL:${ORT_MIGRAPHX_BUILD}>:ORT_MIGRAPHX_BUILD>)

install(FILES ${HDRS} DESTINATION include/GPU)

# o2_add_test(GPUsortHIP NAME test_GPUsortHIP
Expand All @@ -195,6 +190,12 @@ endif()

target_compile_definitions(${targetName} PRIVATE $<TARGET_PROPERTY:O2::GPUTracking,COMPILE_DEFINITIONS>)

if (onnxruntime_FOUND)
target_compile_definitions(${targetName} PRIVATE
$<$<BOOL:${ORT_ROCM_BUILD}>:ORT_ROCM_BUILD>
$<$<BOOL:${ORT_MIGRAPHX_BUILD}>:ORT_MIGRAPHX_BUILD>)
endif()

add_library(${MODULE}_CXX OBJECT ${SRCS_CXX}) # Adding a C++ library for the .cxx code of the HIP library, such that it does not link to HIP libraries, and CMake HIP Language doesn't add HIP compile flags.
target_compile_definitions(${MODULE}_CXX PRIVATE $<TARGET_PROPERTY:O2::GPUTracking,COMPILE_DEFINITIONS>)
target_include_directories(${MODULE}_CXX PRIVATE $<TARGET_PROPERTY:O2::GPUTracking,INCLUDE_DIRECTORIES>)
Expand Down
10 changes: 7 additions & 3 deletions GPU/GPUTracking/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,7 @@ set(SRCS_NO_CINT ${SRCS_NO_CINT}
Refit/GPUTrackingRefitKernel.cxx
Merger/GPUTPCGMO2Output.cxx)

if(NOT ALIGPU_BUILD_TYPE STREQUAL "Standalone")
if(onnxruntime_FOUND)
list(APPEND SRCS_NO_CINT TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx TPCClusterFinder/GPUTPCNNClusterizer.cxx TPCClusterFinder/GPUTPCNNClusterizerHost.cxx)
endif()

Expand Down Expand Up @@ -343,15 +343,14 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2")
O2::DetectorsRaw
O2::Steer
O2::ML
PRIVATE_LINK_LIBRARIES onnxruntime::onnxruntime
PUBLIC_INCLUDE_DIRECTORIES ${INCDIRS}
SOURCES ${SRCS} ${SRCS_NO_CINT} ${SRCS_NO_H})

target_include_directories(
${targetName}
PRIVATE $<TARGET_PROPERTY:O2::Framework,INTERFACE_INCLUDE_DIRECTORIES>)

target_compile_definitions(${targetName} PRIVATE GPUCA_O2_LIB GPUCA_TPC_GEOMETRY_O2 GPUCA_HAS_ONNX=1)
target_compile_definitions(${targetName} PRIVATE GPUCA_O2_LIB GPUCA_TPC_GEOMETRY_O2)

o2_target_root_dictionary(${MODULE}
HEADERS ${HDRS_CINT_O2} ${HDRS_CINT_O2_ADDITIONAL}
Expand Down Expand Up @@ -421,6 +420,11 @@ target_link_libraries(${targetName} PRIVATE TBB::tbb)

target_compile_options(${targetName} PRIVATE -Wno-instantiation-after-specialization)

if (onnxruntime_FOUND)
target_compile_definitions(${targetName} PRIVATE GPUCA_HAS_ONNX=1)
target_link_libraries(${targetName} PRIVATE onnxruntime::onnxruntime)
endif()

# Add CMake recipes for GPU Tracking libraries
if(CUDA_ENABLED OR OPENCL_ENABLED OR HIP_ENABLED)
if(CMAKE_SYSTEM_NAME MATCHES Darwin)
Expand Down
4 changes: 2 additions & 2 deletions GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -980,12 +980,12 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges");
}

float time_clusterizer = 0, time_fill = 0, time_networks = 0;
// float time_clusterizer = 0, time_fill = 0, time_networks = 0;
for (int batch = 0; batch < std::ceil((float)clusterer.mPmemory->counters.nClusters / clustererNNShadow.nnClusterizerBatchedMode); batch++) {
uint batchStart = batch * clustererNNShadow.nnClusterizerBatchedMode;
size_t iSize = CAMath::Min((uint)clustererNNShadow.nnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart));

auto start0 = std::chrono::high_resolution_clock::now();
// auto start0 = std::chrono::high_resolution_clock::now();
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::fillInputNNSingleElement>({GetGrid(iSize * clustererNNShadow.nnClusterizerElementSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, batchStart); // Filling the data

// auto stop0 = std::chrono::high_resolution_clock::now();
Expand Down
23 changes: 21 additions & 2 deletions GPU/GPUTracking/Standalone/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -121,11 +121,25 @@ else()
endif()

# Detect GPU Backends
find_package(O2GPU)
find_package(O2GPU REQUIRED)

# Optional ONNX Runtime support, switched by GPUCA_CONFIG_ONNX.
# When enabled, onnxruntime becomes a hard requirement (REQUIRED) and an ORT_*_BUILD
# variable is defaulted to ON for an enabled GPU backend, unless that variable is already defined.
# NOTE(review): because of the elseif, ORT_ROCM_BUILD is never defaulted when the CUDA
# branch is taken (CUDA_ENABLED and ORT_CUDA_BUILD undefined) — confirm this is intended
# for configurations where both CUDA and HIP are enabled.
if(GPUCA_CONFIG_ONNX)
find_package(onnxruntime REQUIRED)
if(CUDA_ENABLED AND NOT DEFINED ORT_CUDA_BUILD)
set(ORT_CUDA_BUILD ON)
elseif(HIP_ENABLED AND NOT DEFINED ORT_ROCM_BUILD)
set(ORT_ROCM_BUILD ON)
endif()
else()
# ONNX support disabled: explicitly set onnxruntime_FOUND to OFF.
set(onnxruntime_FOUND OFF)
endif()

# Create main targets
add_subdirectory(../../ GPU)
add_library(standalone_support SHARED ${O2_DIR}/Common/Field/src/MagFieldFast.cxx
add_library(standalone_support SHARED
${O2_DIR}/Common/Field/src/MagFieldFast.cxx
${O2_DIR}/Common/ML/src/OrtInterface.cxx
${O2_DIR}/Common/Utils/src/StringUtils.cxx
${O2_DIR}/DataFormats/Detectors/TPC/src/CompressedClusters.cxx
${O2_DIR}/DataFormats/Reconstruction/src/TrackParametrization.cxx
${O2_DIR}/DataFormats/Reconstruction/src/TrackParametrizationWithError.cxx
Expand All @@ -150,6 +164,7 @@ target_include_directories(standalone_support PUBLIC
${O2_DIR}/Common/Constants/include
${O2_DIR}/Common/MathUtils/include
${O2_DIR}/Common/Utils/include
${O2_DIR}/Common/ML/include
${O2_DIR}/DataFormats/common/include
${O2_DIR}/DataFormats/Detectors/Common/include
${O2_DIR}/DataFormats/Detectors/ITSMFT/common/include
Expand Down Expand Up @@ -210,6 +225,10 @@ if(GPUCA_CONFIG_ROOT)
ROOT::Tree)
endif()

if(GPUCA_CONFIG_ONNX)
target_link_libraries(standalone_support PRIVATE onnxruntime::onnxruntime)
endif()

if (GPUCA_BUILD_DEBUG_SANITIZE AND CMAKE_CXX_COMPILER MATCHES "clang\\+\\+")
execute_process(COMMAND ${CMAKE_CXX_COMPILER} -print-file-name=libclang_rt.asan-x86_64.so OUTPUT_VARIABLE CLANG_ASAN_SO_PATH OUTPUT_STRIP_TRAILING_WHITESPACE)
get_filename_component(CLANG_ASAN_SO_PATH "${CLANG_ASAN_SO_PATH}" DIRECTORY)
Expand Down
1 change: 1 addition & 0 deletions GPU/GPUTracking/Standalone/cmake/config.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ set(ENABLE_OPENCL AUTO)
set(GPUCA_CONFIG_VC 1)
set(GPUCA_CONFIG_FMT 1)
set(GPUCA_CONFIG_ROOT 1)
set(GPUCA_CONFIG_ONNX 0)
set(GPUCA_BUILD_EVENT_DISPLAY 1)
set(GPUCA_BUILD_EVENT_DISPLAY_FREETYPE 1)
set(GPUCA_BUILD_EVENT_DISPLAY_VULKAN 1)
Expand Down
2 changes: 1 addition & 1 deletion GPU/GPUTracking/Standalone/cmake/prepare.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,6 @@ else
fi
eval "`alienv shell-helper`"
# alienv load O2/latest
for i in Vc boost fmt CMake ms_gsl Clang ninja TBB ROOT; do
for i in Vc boost fmt CMake ms_gsl Clang ninja TBB ROOT ONNXRuntime; do
source sw/$ALIARCH/$i/latest/etc/profile.d/init.sh
done
16 changes: 9 additions & 7 deletions GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ using namespace o2::gpu;
void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& settings)
{
std::string class_model_path = settings.nnClassificationPath, reg_model_path = settings.nnRegressionPath;
std::vector<std::string> reg_model_paths;
std::vector<std::string> reg_model_paths_local;
std::vector<std::string> evalMode = o2::utils::Str::tokenize(settings.nnEvalMode, ':');

if (settings.nnLoadFromCCDB) {
Expand Down Expand Up @@ -60,20 +60,20 @@ void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& set
model_class.initOptions(OrtOptions);
modelsUsed[0] = true;

reg_model_paths = o2::utils::Str::tokenize(reg_model_path, ':');
reg_model_paths_local = o2::utils::Str::tokenize(reg_model_path, ':');

if (!settings.nnClusterizerUseCfRegression) {
if (reg_model_paths.size() == 1) {
OrtOptions["model-path"] = reg_model_paths[0];
if (reg_model_paths_local.size() == 1) {
OrtOptions["model-path"] = reg_model_paths_local[0];
OrtOptions["onnx-environment-name"] = "r1";
model_reg_1.initOptions(OrtOptions);
modelsUsed[1] = true;
} else {
OrtOptions["model-path"] = reg_model_paths[0];
OrtOptions["model-path"] = reg_model_paths_local[0];
OrtOptions["onnx-environment-name"] = "r1";
model_reg_1.initOptions(OrtOptions);
modelsUsed[1] = true;
OrtOptions["model-path"] = reg_model_paths[1];
OrtOptions["model-path"] = reg_model_paths_local[1];
OrtOptions["onnx-environment-name"] = "r2";
model_reg_2.initOptions(OrtOptions);
modelsUsed[2] = true;
Expand Down Expand Up @@ -154,6 +154,7 @@ MockedOrtAllocator::MockedOrtAllocator(GPUReconstruction* r, OrtMemoryInfo* info
// Destructor: intentionally performs no cleanup.
// The ReleaseMemoryInfo call below is commented out, so memory_info is not released here.
// NOTE(review): presumably memory_info is owned and released elsewhere — confirm against the constructor's caller.
MockedOrtAllocator::~MockedOrtAllocator()
{
// Ort::GetApi().ReleaseMemoryInfo(memory_info);
(void)0; // Suppress warning for empty destructor
}

void* MockedOrtAllocator::Alloc(size_t size)
Expand Down Expand Up @@ -191,8 +192,9 @@ size_t MockedOrtAllocator::NumReserveAllocations() const

void MockedOrtAllocator::LeakCheck()
{
if (memory_inuse.load())
if (memory_inuse.load()) {
LOG(warning) << "memory leak!!!";
}
}

void GPUTPCNNClusterizerHost::volatileOrtAllocator(Ort::Env* env, Ort::MemoryInfo* memInfo, GPUReconstruction* rec, bool recreate)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil
CfChargePos peak = clusterer.mPfilteredPeakPositions[base_idx + batchStart];
int row = static_cast<int>(peak.row()), pad = static_cast<int>(peak.pad());

if (clustererNN.nnClusterizerAddIndexData && transient_index == (clustererNN.nnClusterizerElementSize - 1)) {
if (clustererNN.nnClusterizerAddIndexData && (int32_t)transient_index == (clustererNN.nnClusterizerElementSize - 1)) {
uint top_idx = (base_idx + 1) * clustererNN.nnClusterizerElementSize;
for (uint16_t i = 0; i < 8; i++) {
Delta2 d = cfconsts::InnerNeighbors[i];
Expand All @@ -141,7 +141,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil
clustererNN.inputData_32[top_idx - 2] = row / 152.f;
clustererNN.inputData_32[top_idx - 1] = static_cast<float>(pad) / GPUTPCGeometry::NPads(row);
}
} else if (transient_index < (clustererNN.nnClusterizerElementSize - 3)) {
} else if ((int32_t)transient_index < (clustererNN.nnClusterizerElementSize - 3)) {
int time = static_cast<int>(peak.time());
int r = CAMath::Floor(transient_index / ((2 * clustererNN.nnClusterizerSizeInputPad + 1) * (2 * clustererNN.nnClusterizerSizeInputTime + 1))) - clustererNN.nnClusterizerSizeInputRow;
bool is_row_boundary = ((row + r) > (o2::tpc::constants::MAXGLOBALPADROW - 1)) || ((row + r) < 0);
Expand Down Expand Up @@ -197,7 +197,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::det
uint elem_iterator = glo_idx * clustererNN.nnClusterizerModelClassNumOutputNodes;
float current_max_prob = 0.f; // If the neural network doesn't contain the softmax as a last layer, the outputs can range in [-infty, infty]
uint class_label = 0;
for (int pIdx = elem_iterator; pIdx < elem_iterator + clustererNN.nnClusterizerModelClassNumOutputNodes; pIdx++) {
for (uint pIdx = elem_iterator; pIdx < elem_iterator + clustererNN.nnClusterizerModelClassNumOutputNodes; pIdx++) {
if (pIdx == elem_iterator) {
if (dtype == 0) {
current_max_prob = static_cast<float>(clustererNN.modelProbabilities_16[pIdx]);
Expand Down
2 changes: 0 additions & 2 deletions GPU/GPUTracking/cmake/kernel_helpers.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -174,15 +174,13 @@ function(o2_gpu_kernel_add_parameter)
list(LENGTH ARGV n)
math(EXPR n "${n} - 1")
foreach(i RANGE 0 ${n})
message(STATUS "Adding ${ARGV${i}}")
set_property(TARGET O2_GPU_KERNELS APPEND PROPERTY O2_GPU_KERNEL_PARAMS "${ARGV${i}}")
endforeach()
endfunction()
function(o2_gpu_kernel_add_string_parameter)
list(LENGTH ARGV n)
math(EXPR n "${n} - 1")
foreach(i RANGE 0 ${n})
message(STATUS "Adding ${ARGV${i}}")
set_property(TARGET O2_GPU_KERNELS APPEND PROPERTY O2_GPU_KERNEL_STRING_PARAMS "${ARGV${i}}")
endforeach()
endfunction()
Loading