Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions core/runtime/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ cc_library(
"RTDevice.cpp",
"TRTEngine.cpp",
"TRTEngineProfiler.cpp",
"TRTRuntimeConfig.cpp",
"execute_engine.cpp",
"register_jit_hooks.cpp",
"runtime.cpp",
Expand All @@ -77,14 +78,23 @@ cc_library(
"RTDevice.h",
"TRTEngine.h",
"TRTEngineProfiler.h",
"TRTRuntimeConfig.h",
"runtime.h",
],
copts = if_torch_nccl(["-DUSE_C10D_NCCL"]),
linkopts = [
"-lstdc++fs",
],
local_defines = select({
# TensorRT-RTX builds: opt into feature-gated APIs that the runtime layer
# depends on (e.g. IExecutionContext::isStreamCapturable).
":rtx_win": ["ENABLE_FEATURE_DISABLE_RUNTIME_ALLOCATION"],
":rtx_x86_64": ["ENABLE_FEATURE_DISABLE_RUNTIME_ALLOCATION"],
"//conditions:default": [],
}),
deps = [
"//core/plugins:torch_tensorrt_plugins",
"//core/util:file_lock",
"//core/util:prelude",
] + if_torch_nccl(["@torch_nccl//:nccl_headers"]) + select({
":jetpack": ["@tensorrt_l4t//:nvinfer"],
Expand All @@ -110,6 +120,7 @@ filegroup(
"RTDevice.h",
"TRTEngine.h",
"TRTEngineProfiler.h",
"TRTRuntimeConfig.h",
"runtime.h",
],
visibility = ["//visibility:public"],
Expand Down
120 changes: 77 additions & 43 deletions core/runtime/TRTEngine.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include <algorithm>
#include <filesystem>

#include <cuda_runtime.h>
#include "NvInfer.h"
Expand Down Expand Up @@ -61,26 +62,28 @@ void DynamicOutputAllocator::notifyShape(char const* tensorName, nvinfer1::Dims
}

TRTEngine::TRTEngine(
const std::string& serialized_engine,
std::string serialized_engine,
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why does this need to be a deep copy?

const RTDevice& cuda_device,
const std::vector<std::string>& _in_binding_names,
const std::vector<std::string>& _out_binding_names,
const Platform& target_platform,
bool hardware_compatible,
bool requires_output_allocator,
const std::string& serialized_metadata,
const ResourceAllocationStrategy resource_allocation_strategy)
std::string serialized_metadata,
const ResourceAllocationStrategy resource_allocation_strategy,
TRTRuntimeConfig runtime_cfg)
: TRTEngine(
"deserialized_trt",
serialized_engine,
std::move(serialized_engine),
cuda_device,
_in_binding_names,
_out_binding_names,
target_platform,
hardware_compatible,
requires_output_allocator,
serialized_metadata,
resource_allocation_strategy) {}
std::move(serialized_metadata),
resource_allocation_strategy,
std::move(runtime_cfg)) {}

TRTEngine::TRTEngine(std::vector<std::string> serialized_info)
: TRTEngine(
Expand All @@ -95,24 +98,27 @@ TRTEngine::TRTEngine(std::vector<std::string> serialized_info)
serialized_info[SERIALIZED_METADATA_IDX],
(static_cast<bool>(std::stoi(serialized_info[RESOURCE_ALLOCATION_STRATEGY_IDX]))
? ResourceAllocationStrategy::kDynamic
: ResourceAllocationStrategy::kStatic)) {
: ResourceAllocationStrategy::kStatic),
make_runtime_config_from_serialized(serialized_info)) {
this->requires_native_multidevice = std::stoi(serialized_info[REQUIRES_NATIVE_MULTIDEVICE_IDX]);
if (this->requires_native_multidevice) {
LOG_INFO("Loaded distributed TRT engine (contains NCCL collectives); NCCL comm will be bound on first execution");
}
}

TRTEngine::TRTEngine(
const std::string& mod_name,
const std::string& serialized_engine,
std::string mod_name,
std::string serialized_engine,
const RTDevice& cuda_device,
const std::vector<std::string>& _in_binding_names,
const std::vector<std::string>& _out_binding_names,
const Platform& target_platform,
bool hardware_compatible,
bool requires_output_allocator,
const std::string& serialized_metadata,
const ResourceAllocationStrategy resource_allocation_strategy) {
std::string serialized_metadata,
const ResourceAllocationStrategy resource_allocation_strategy,
TRTRuntimeConfig runtime_cfg) {
this->runtime_cfg = std::move(runtime_cfg);
TORCHTRT_CHECK(
is_supported_on_current_platform(target_platform),
"This engine was not built to run on this platform (built for: " << target_platform << ", current platform: "
Expand All @@ -123,15 +129,15 @@ TRTEngine::TRTEngine(
auto most_compatible_device = get_most_compatible_device(cuda_device, RTDevice(), hardware_compatible);
TORCHTRT_CHECK(most_compatible_device, "No compatible device was found for instantiating TensorRT engine");

this->serialized_metadata = serialized_metadata;
this->serialized_metadata = std::move(serialized_metadata);
this->requires_output_allocator = requires_output_allocator;
device_info = most_compatible_device.value();
multi_gpu_device_check();
set_rt_device(device_info);

rt = make_trt(nvinfer1::createInferRuntime(util::logging::get_logger()));

name = slugify(mod_name);
name = slugify(std::move(mod_name));

cuda_engine = make_trt(rt->deserializeCudaEngine(serialized_engine.c_str(), serialized_engine.size()));
TORCHTRT_CHECK((cuda_engine.get() != nullptr), "Unable to deserialize the TensorRT engine");
Expand All @@ -146,13 +152,7 @@ TRTEngine::TRTEngine(
LOG_DEBUG(
"Resource allocation strategy: "
<< (this->resource_allocation_strategy == ResourceAllocationStrategy::kDynamic ? "Dynamic" : "Static"));
if (this->resource_allocation_strategy == ResourceAllocationStrategy::kDynamic) {
this->exec_ctx =
make_trt(cuda_engine->createExecutionContext(nvinfer1::ExecutionContextAllocationStrategy::kUSER_MANAGED));
} else {
this->exec_ctx = make_trt(cuda_engine->createExecutionContext());
}
TORCHTRT_CHECK((exec_ctx.get() != nullptr), "Unable to create TensorRT execution context");
recreate_execution_context();

// Pre-allocate placeholder for empty tensors (TensorRT requires non-null addresses)
cudaMalloc(&empty_tensor_placeholder, 1);
Expand Down Expand Up @@ -288,6 +288,9 @@ TRTEngine::TRTEngine(
}

TRTEngine::~TRTEngine() {
// Marked noexcept so safe to invoke from a destructor without
// explicit try/catch; any I/O error is logged internally.
runtime_cfg.save_runtime_cache();
trt_engine_profiler.reset();
exec_ctx.reset();
cuda_engine.reset();
Expand All @@ -301,8 +304,7 @@ void TRTEngine::disable_profiling() {
torch::cuda::synchronize(device_info.id);
profile_execution = false;
trt_engine_profiler.reset();
exec_ctx = make_trt(cuda_engine->createExecutionContext());
TORCHTRT_CHECK((exec_ctx.get() != nullptr), "Unable to recreate TensorRT execution context");
recreate_execution_context();
}

void TRTEngine::dump_engine_layer_info_to_file(const std::string& path) {
Expand Down Expand Up @@ -399,10 +401,7 @@ bool TRTEngine::set_device_memory_budget(int64_t budget) {
trt_engine_profiler.reset();
}
bool result = cuda_engine->setWeightStreamingBudgetV2(budget);
exec_ctx = make_trt(cuda_engine->createExecutionContext());
TORCHTRT_CHECK(
(exec_ctx.get() != nullptr),
"Unable to recreate TensorRT execution context after setting new device memory budget");
recreate_execution_context();
if (profile_execution) {
enable_profiling();
}
Expand Down Expand Up @@ -459,6 +458,7 @@ std::string TRTEngine::to_str() const {
ss << " Target Platform: " << target_platform << std::endl;
ss << " Resource Allocation Strategy: " << (resource_allocation_strategy == ResourceAllocationStrategy::kDynamic ? "Dynamic" : "Static") << std::endl;
ss << " Multi-Device Engine: " << (requires_native_multidevice) << std::endl;
ss << runtime_cfg.to_str();
// clang-format on
return ss.str();
}
Expand Down Expand Up @@ -495,7 +495,14 @@ FlattenedState TRTEngine::__obj_flatten__() {
std::tuple("requires_output_allocator", serialized_info[REQUIRES_OUTPUT_ALLOCATOR_IDX]),
std::tuple("target_platform", serialized_info[TARGET_PLATFORM_IDX]),
std::tuple("resource_allocation_strategy", serialized_info[RESOURCE_ALLOCATION_STRATEGY_IDX]),
std::tuple("requires_native_multidevice", serialized_info[REQUIRES_NATIVE_MULTIDEVICE_IDX]));
std::tuple("requires_native_multidevice", serialized_info[REQUIRES_NATIVE_MULTIDEVICE_IDX])
#ifdef TRT_MAJOR_RTX
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See above comment

,
std::tuple("runtime_cache_path", serialized_info[RUNTIME_CACHE_PATH_IDX]),
std::tuple("dynamic_shapes_kernel_strategy", serialized_info[DYNAMIC_SHAPES_KERNEL_STRATEGY_IDX]),
std::tuple("cuda_graph_strategy", serialized_info[CUDA_GRAPH_STRATEGY_IDX])
#endif
);
}

std::vector<std::string> TRTEngine::serialize() {
Expand All @@ -522,6 +529,13 @@ std::vector<std::string> TRTEngine::serialize() {
this->resource_allocation_strategy == ResourceAllocationStrategy::kDynamic ? "1" : "0";
serialized_info[REQUIRES_NATIVE_MULTIDEVICE_IDX] = this->requires_native_multidevice ? "1" : "0";
// rank/world_size are runtime facts (may differ at load time); not serialized.
#ifdef TRT_MAJOR_RTX
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same here

serialized_info[RUNTIME_CACHE_PATH_IDX] = runtime_cfg.runtime_cache_path;
serialized_info[DYNAMIC_SHAPES_KERNEL_STRATEGY_IDX] = std::to_string(
static_cast<std::underlying_type_t<DynamicShapesKernelStrategy>>(runtime_cfg.dynamic_shapes_kernel_strategy));
serialized_info[CUDA_GRAPH_STRATEGY_IDX] =
std::to_string(static_cast<std::underlying_type_t<CudaGraphStrategyOption>>(runtime_cfg.cuda_graph_strategy));
#endif

return serialized_info;
}
Expand All @@ -533,14 +547,11 @@ void TRTEngine::reset_captured_graph() {
void TRTEngine::set_resource_allocation_strategy(TRTEngine::ResourceAllocationStrategy new_strategy) {
if (new_strategy != this->resource_allocation_strategy) {
this->resource_allocation_strategy = new_strategy;
if (this->resource_allocation_strategy == TRTEngine::ResourceAllocationStrategy::kDynamic) {
LOG_DEBUG("Setting resource allocation strategy to dynamic");
this->exec_ctx =
make_trt(cuda_engine->createExecutionContext(nvinfer1::ExecutionContextAllocationStrategy::kUSER_MANAGED));
} else {
LOG_DEBUG("Setting resource allocation strategy to static");
this->exec_ctx = make_trt(cuda_engine->createExecutionContext());
}
LOG_DEBUG(
"Setting resource allocation strategy to "
<< (this->resource_allocation_strategy == TRTEngine::ResourceAllocationStrategy::kDynamic ? "dynamic"
: "static"));
recreate_execution_context();
}
}

Expand Down Expand Up @@ -637,19 +648,42 @@ void TRTEngine::release_nccl_comm() {
LOG_INFO("Releasing NCCL communicator from engine '" << this->name << "'");
torch::cuda::synchronize(device_info.id);
this->exec_ctx.reset();
if (this->resource_allocation_strategy == ResourceAllocationStrategy::kDynamic) {
this->exec_ctx =
make_trt(cuda_engine->createExecutionContext(nvinfer1::ExecutionContextAllocationStrategy::kUSER_MANAGED));
} else {
this->exec_ctx = make_trt(cuda_engine->createExecutionContext());
}
TORCHTRT_CHECK(
(exec_ctx.get() != nullptr), "Unable to recreate TensorRT execution context after releasing NCCL comm");
recreate_execution_context();
this->nccl_initialized = false;
LOG_INFO("NCCL communicator released from engine '" << this->name << "'");
}
#endif // ENABLE_TRT_NCCL_COLLECTIVES

// Answers whether the engine's current execution context can be captured as a
// single monolithic CUDA graph on `stream`. All RTX-specific capturability
// logic lives in the runtime config; this is a thin delegation.
bool TRTEngine::is_monolithic_capturable(cudaStream_t stream) const {
  auto* ctx = exec_ctx.get();
  return runtime_cfg.is_monolithic_capturable(ctx, stream);
}

// One-shot disable of TensorRT-RTX native CUDA graph capture on this engine.
// If the call actually flips the strategy on the IRuntimeConfig, exec_ctx must
// be rebuilt so subsequent enqueueV3 calls pick up the new strategy.
void TRTEngine::disable_rtx_native_cudagraphs() {
  const bool already_disabled = runtime_cfg.rtx_native_cudagraphs_disabled;
  runtime_cfg.disable_rtx_native_cudagraphs(name);
  if (already_disabled || !runtime_cfg.rtx_native_cudagraphs_disabled) {
    // No transition occurred: either it was disabled before, or the call was a
    // no-op (e.g. non-RTX build) — keep the existing execution context.
    return;
  }
  recreate_execution_context();
}

// Single entry point that (re)creates exec_ctx with all runtime-config
// settings applied.
void TRTEngine::recreate_execution_context() {
  // Persist any kernels the outgoing execution context compiled into the
  // runtime cache before it is replaced. The destructor also saves, but saving
  // here guards against losing compiled kernels across profiling toggles or
  // allocator-strategy changes, and against the process dying before teardown.
  // No-op on standard TensorRT or when no cache path is configured.
  runtime_cfg.save_runtime_cache();
  runtime_cfg.ensure_initialized(cuda_engine.get());

  const bool dynamic_alloc = (resource_allocation_strategy == ResourceAllocationStrategy::kDynamic);
  const auto ctx_alloc_strategy = dynamic_alloc ? nvinfer1::ExecutionContextAllocationStrategy::kUSER_MANAGED
                                                : nvinfer1::ExecutionContextAllocationStrategy::kSTATIC;
  runtime_cfg.set_execution_context_allocation_strategy(ctx_alloc_strategy);

  exec_ctx = make_trt(cuda_engine->createExecutionContext(runtime_cfg.config.get()));
  TORCHTRT_CHECK(exec_ctx.get() != nullptr, "Unable to (re)create TensorRT execution context");
}

} // namespace runtime
} // namespace core
} // namespace torch_tensorrt
44 changes: 36 additions & 8 deletions core/runtime/TRTEngine.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include "torch/custom_class.h"

#include "core/runtime/TRTEngineProfiler.h"
#include "core/runtime/TRTRuntimeConfig.h"
#include "core/util/prelude.h"

// TensorRT 10.16+ has native NCCL collective support via IExecutionContext::setCommunicator()
Expand Down Expand Up @@ -45,7 +46,14 @@ using FlattenedState = std::tuple<
std::tuple<std::string, std::string>, // serialized metadata
std::tuple<std::string, std::string>, // Platform
std::tuple<std::string, std::string>, // Resource Allocation Strategy
std::tuple<std::string, std::string>>; // requires_native_multidevice
std::tuple<std::string, std::string> // requires_native_multidevice
#ifdef TRT_MAJOR_RTX
,
std::tuple<std::string, std::string>, // Runtime Cache Path (TRT-RTX)
std::tuple<std::string, std::string>, // Dynamic Shapes Kernel Strategy (TRT-RTX)
std::tuple<std::string, std::string> // CUDA Graph Strategy (TRT-RTX)
#endif
>;

struct TorchTRTRuntimeStates {
// Indicates whether CUDAGraphs were enabled in the previous execute_engine
Expand Down Expand Up @@ -140,31 +148,33 @@ struct TRTEngine : torch::CustomClassHolder {

~TRTEngine();
TRTEngine(
const std::string& serialized_engine,
std::string serialized_engine,
const RTDevice& cuda_device,
const std::vector<std::string>& in_binding_names,
const std::vector<std::string>& out_binding_names,
const Platform& target_platform = get_current_platform(),
bool hardware_compatible = false,
bool requires_output_allocator = false,
const std::string& serialized_metadata = "",
std::string serialized_metadata = "",
const TRTEngine::ResourceAllocationStrategy resource_allocation_strategy =
TRTEngine::ResourceAllocationStrategy::kStatic);
TRTEngine::ResourceAllocationStrategy::kStatic,
TRTRuntimeConfig runtime_cfg = TRTRuntimeConfig{});

TRTEngine(std::vector<std::string> serialized_info);

TRTEngine(
const std::string& mod_name,
const std::string& serialized_engine,
std::string mod_name,
std::string serialized_engine,
const RTDevice& cuda_device,
const std::vector<std::string>& in_binding_names,
const std::vector<std::string>& out_binding_names,
const Platform& target_platform = get_current_platform(),
bool hardware_compatible = false,
bool requires_output_allocator = false,
const std::string& serialized_metadata = "",
std::string serialized_metadata = "",
const TRTEngine::ResourceAllocationStrategy resource_allocation_strategy =
TRTEngine::ResourceAllocationStrategy::kStatic);
TRTEngine::ResourceAllocationStrategy::kStatic,
TRTRuntimeConfig runtime_cfg = TRTRuntimeConfig{});

std::string to_str() const;
static void verify_serialization_fmt(const std::vector<std::string>& serialized_info);
Expand Down Expand Up @@ -257,6 +267,24 @@ struct TRTEngine : torch::CustomClassHolder {
ResourceAllocationStrategy resource_allocation_strategy = kStatic;
void set_resource_allocation_strategy(ResourceAllocationStrategy new_strategy);
ResourceAllocationStrategy get_resource_allocation_strategy();

// All TensorRT-RTX-specific IRuntimeConfig state lives here. On non-RTX builds this
// still owns a shared IRuntimeConfig (so the execution-context allocation strategy is
// applied via the uniform code path) but the RTX-only setters become no-ops.
TRTRuntimeConfig runtime_cfg;

// Monolithic-capturability check used when this engine is wrapped by an outer whole-graph
// capture (e.g. CudaGraphsTorchTensorRTModule). Non-RTX builds always return true.
bool is_monolithic_capturable(cudaStream_t stream) const;

// Disable TensorRT-RTX native CUDA graph capture on this engine (one-shot, invoked when
// an outer stream capture is detected around execute_engine). No-op on non-RTX.
void disable_rtx_native_cudagraphs();

private:
// Single entry point that (re)creates exec_ctx. Also creates (once) the IRuntimeConfig
// owned by runtime_cfg and applies all runtime config settings.
void recreate_execution_context();
};

} // namespace runtime
Expand Down
Loading
Loading