Skip to content

Commit 9037ea6

Browse files
committed
First version of GPU stream implementation. Still needs testing.
1 parent 81dad27 commit 9037ea6

File tree

7 files changed

+57
-8
lines changed

7 files changed

+57
-8
lines changed

Common/ML/include/ML/OrtInterface.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -84,8 +84,8 @@ class OrtModel
8484

8585
// Environment settings
8686
bool mInitialized = false;
87-
std::string modelPath, device = "cpu", dtype = "float", thread_affinity = ""; // device options should be cpu, rocm, migraphx, cuda
88-
int intraOpNumThreads = 1, interOpNumThreads = 1, deviceId = 0, enableProfiling = 0, loggingLevel = 0, allocateDeviceMemory = 0, enableOptimizations = 0;
87+
std::string modelPath, device = "cpu", thread_affinity = ""; // device options should be cpu, rocm, migraphx, cuda
88+
int intraOpNumThreads = 1, interOpNumThreads = 1, streamId = 0, enableProfiling = 0, loggingLevel = 0, allocateDeviceMemory = 0, enableOptimizations = 0;
8989

9090
std::string printShape(const std::vector<int64_t>&);
9191
};

Common/ML/src/OrtInterface.cxx

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -48,8 +48,7 @@ void OrtModel::reset(std::unordered_map<std::string, std::string> optionsMap)
4848
if (!optionsMap["model-path"].empty()) {
4949
modelPath = optionsMap["model-path"];
5050
device = (optionsMap.contains("device") ? optionsMap["device"] : "CPU");
51-
dtype = (optionsMap.contains("dtype") ? optionsMap["dtype"] : "float");
52-
deviceId = (optionsMap.contains("device-id") ? std::stoi(optionsMap["device-id"]) : 0);
51+
streamId = (optionsMap.contains("stream-id") ? std::stoi(optionsMap["stream-id"]) : 0);
5352
allocateDeviceMemory = (optionsMap.contains("allocate-device-memory") ? std::stoi(optionsMap["allocate-device-memory"]) : 0);
5453
intraOpNumThreads = (optionsMap.contains("intra-op-num-threads") ? std::stoi(optionsMap["intra-op-num-threads"]) : 0);
5554
interOpNumThreads = (optionsMap.contains("inter-op-num-threads") ? std::stoi(optionsMap["inter-op-num-threads"]) : 0);
@@ -61,31 +60,33 @@ void OrtModel::reset(std::unordered_map<std::string, std::string> optionsMap)
6160
#if defined(ORT_ROCM_BUILD)
6261
#if ORT_ROCM_BUILD == 1
6362
if (device == "ROCM") {
64-
Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_ROCM(pImplOrt->sessionOptions, deviceId));
63+
// Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_ROCM(pImplOrt->sessionOptions, streamId));
64+
o2::gpu::SetONNXGPUStream(pImplOrt->sessionOptions, streamId);
6565
LOG(info) << "(ORT) ROCM execution provider set";
6666
}
6767
#endif
6868
#endif
6969
#if defined(ORT_MIGRAPHX_BUILD)
7070
#if ORT_MIGRAPHX_BUILD == 1
7171
if (device == "MIGRAPHX") {
72-
Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(pImplOrt->sessionOptions, deviceId));
72+
Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(pImplOrt->sessionOptions, streamId));
7373
LOG(info) << "(ORT) MIGraphX execution provider set";
7474
}
7575
#endif
7676
#endif
7777
#if defined(ORT_CUDA_BUILD)
7878
#if ORT_CUDA_BUILD == 1
7979
if (device == "CUDA") {
80-
Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(pImplOrt->sessionOptions, deviceId));
80+
// Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(pImplOrt->sessionOptions, streamId));
81+
o2::gpu::SetONNXGPUStream(pImplOrt->sessionOptions, streamId);
8182
LOG(info) << "(ORT) CUDA execution provider set";
8283
dev_mem_str = "Cuda";
8384
}
8485
#endif
8586
#endif
8687

8788
if (allocateDeviceMemory) {
88-
pImplOrt->memoryInfo = Ort::MemoryInfo(dev_mem_str.c_str(), OrtAllocatorType::OrtDeviceAllocator, deviceId, OrtMemType::OrtMemTypeDefault);
89+
pImplOrt->memoryInfo = Ort::MemoryInfo(dev_mem_str.c_str(), OrtAllocatorType::OrtDeviceAllocator, streamId, OrtMemType::OrtMemTypeDefault);
8990
LOG(info) << "(ORT) Memory info set to on-device memory";
9091
}
9192

GPU/GPUTracking/Base/GPUReconstructionProcessing.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@
2222
#include <functional>
2323
#include <atomic>
2424

25+
struct OrtSessionOptions;
26+
2527
namespace o2::gpu
2628
{
2729

@@ -88,6 +90,7 @@ class GPUReconstructionProcessing : public GPUReconstruction
8890
void AddGPUEvents(T*& events);
8991

9092
virtual std::unique_ptr<gpu_reconstruction_kernels::threadContext> GetThreadContext() override;
93+
// Overridable hook taking ONNX Runtime session options and a stream index (GPUReconstructionCUDA overrides it).
// This default implementation ignores both arguments and returns 0.
virtual int32_t SetONNXGPUStream(OrtSessionOptions* session_options, int32_t stream) { return 0; }
9194

9295
struct RecoStepTimerMeta {
9396
HighResTimer timerToGPU;

GPU/GPUTracking/Base/cuda/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,7 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2")
115115
${MODULE}
116116
SOURCES ${SRCS}
117117
PUBLIC_LINK_LIBRARIES ${TMP_BASELIB} O2::ITStrackingCUDA
118+
PRIVATE_LINK_LIBRARIES ONNXRuntime::ONNXRuntime
118119
PRIVATE_INCLUDE_DIRECTORIES
119120
${CMAKE_SOURCE_DIR}/Detectors/Base/src
120121
${CMAKE_SOURCE_DIR}/Detectors/TRD/base/src

GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
#include "GPUReconstructionCUDAIncludesHost.h"
1717

1818
#include <cuda_profiler_api.h>
19+
#include "ML/OrtInterface.h"
1920

2021
#include "GPUReconstructionCUDA.h"
2122
#include "GPUReconstructionCUDAInternals.h"
@@ -35,6 +36,10 @@
3536
#undef GPUCA_KRNL
3637
#endif
3738

39+
#ifdef GPUCA_HAS_ONNX
40+
#include <onnxruntime_cxx_api.h>
41+
#endif
42+
3843
static constexpr size_t REQUIRE_MIN_MEMORY = 1024L * 1024 * 1024;
3944
static constexpr size_t REQUIRE_MEMORY_RESERVED = 512L * 1024 * 1024;
4045
static constexpr size_t REQUIRE_FREE_MEMORY_RESERVED_PER_SM = 40L * 1024 * 1024;
@@ -656,13 +661,50 @@ void GPUReconstructionCUDA::endGPUProfiling()
656661
{
657662
GPUChkErr(cudaProfilerStop());
658663
}
664+
665+
#ifdef GPUCA_HAS_ONNX
666+
/// Append the CUDA execution provider to an ONNX Runtime session and make it
/// compute on one of this backend's streams.
///
/// @param session_options ORT session options that receive the CUDA execution provider.
/// @param stream          Index into mInternals->Streams selecting the compute stream.
/// @return 0 on success. ORT failures are raised as exceptions by Ort::ThrowOnError.
int32_t GPUReconstructionCUDA::SetONNXGPUStream(OrtSessionOptions* session_options, int32_t stream)
{
  // The provider-option functions are members of the OrtApi table, not free functions.
  const OrtApi& api = Ort::GetApi();

  OrtCUDAProviderOptionsV2* rawOptions = nullptr;
  Ort::ThrowOnError(api.CreateCUDAProviderOptions(&rawOptions));

  // Own the options so they are released on every exit path, including when
  // one of the ThrowOnError calls below throws.
  std::unique_ptr<OrtCUDAProviderOptionsV2, decltype(api.ReleaseCUDAProviderOptions)> cudaOptions(rawOptions, api.ReleaseCUDAProviderOptions);

  // Setting "user_compute_stream" implicitly sets "has_user_compute_stream".
  // ORT expects the stream handle itself as the value, not the address of the
  // variable holding it.
  // NOTE(review): assumes mInternals->Streams[] holds cudaStream_t handles and
  // that `stream` is a valid index - confirm against GPUReconstructionCUDAInternals.
  Ort::ThrowOnError(api.UpdateCUDAProviderOptionsWithValue(cudaOptions.get(), "user_compute_stream", mInternals->Streams[stream]));
  Ort::ThrowOnError(api.SessionOptionsAppendExecutionProvider_CUDA_V2(session_options, cudaOptions.get()));

  return 0;
}
684+
#endif // GPUCA_HAS_ONNX
685+
659686
#else // HIP
660687
void* GPUReconstructionHIP::getGPUPointer(void* ptr)
661688
{
662689
void* retVal = nullptr;
663690
GPUChkErr(hipHostGetDevicePointer(&retVal, ptr, 0));
664691
return retVal;
665692
}
693+
694+
#ifdef GPUCA_HAS_ONNX
695+
/// Append the ROCm execution provider to an ONNX Runtime session and make it
/// compute on one of this backend's streams.
///
/// @param session_options ORT session options that receive the ROCm execution provider.
/// @param stream          Index into mInternals->Streams selecting the compute stream.
/// @return 0 on success. ORT failures are raised as exceptions by Ort::ThrowOnError.
///
/// NOTE(review): this definition sits in the HIP branch next to
/// GPUReconstructionHIP::getGPUPointer but is qualified with GPUReconstructionCUDA;
/// confirm the build maps the class name as intended.
int32_t GPUReconstructionCUDA::SetONNXGPUStream(OrtSessionOptions* session_options, int32_t stream)
{
  const OrtApi& api = Ort::GetApi();

  // Value-initialised ROCm provider options; only the stream fields are overridden.
  OrtROCMProviderOptions rocmOptions{};
  rocmOptions.has_user_compute_stream = 1; // Tell ORT that a user-provided stream follows
  // ORT expects the stream handle itself, not the address of the variable holding it.
  // NOTE(review): assumes mInternals->Streams[] holds hipStream_t handles and
  // that `stream` is a valid index - confirm against the internals struct.
  rocmOptions.user_compute_stream = mInternals->Streams[stream];

  // Append the ROCm execution provider with the custom HIP stream
  Ort::ThrowOnError(api.SessionOptionsAppendExecutionProvider_ROCM(session_options, &rocmOptions));
  return 0;
}
707+
#endif // GPUCA_HAS_ONNX
666708
#endif // __HIPCC__
667709

668710
namespace o2::gpu

GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ class GPUReconstructionCUDA : public GPUReconstructionKernels<GPUReconstructionC
7979
size_t GPUMemCpy(void* dst, const void* src, size_t size, int32_t stream, int32_t toGPU, deviceEvent* ev = nullptr, deviceEvent* evList = nullptr, int32_t nEvents = 1) override;
8080
void ReleaseEvent(deviceEvent ev) override;
8181
void RecordMarker(deviceEvent* ev, int32_t stream) override;
82+
int32_t SetONNXGPUStream(OrtSessionOptions* session_options, int32_t stream) override;
8283

8384
void GetITSTraits(std::unique_ptr<o2::its::TrackerTraits>* trackerTraits, std::unique_ptr<o2::its::VertexerTraits>* vertexerTraits, std::unique_ptr<o2::its::TimeFrame>* timeFrame) override;
8485

GPU/GPUTracking/Base/hip/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,7 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2")
153153
${MODULE}
154154
SOURCES ${SRCS}
155155
PUBLIC_LINK_LIBRARIES ${TMP_BASELIB} O2::ITStrackingHIP
156+
PRIVATE_LINK_LIBRARIES ONNXRuntime::ONNXRuntime
156157
PRIVATE_INCLUDE_DIRECTORIES
157158
${CMAKE_SOURCE_DIR}/Detectors/Base/src
158159
${CMAKE_SOURCE_DIR}/Detectors/TRD/base/src

0 commit comments

Comments
 (0)