Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions core/runtime/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ cc_library(
"RTDevice.cpp",
"TRTEngine.cpp",
"TRTEngineProfiler.cpp",
"TRTRuntimeConfig.cpp",
"execute_engine.cpp",
"register_jit_hooks.cpp",
"runtime.cpp",
Expand All @@ -77,12 +78,34 @@ cc_library(
"RTDevice.h",
"TRTEngine.h",
"TRTEngineProfiler.h",
"TRTRuntimeConfig.h",
"runtime.h",
],
copts = if_torch_nccl(["-DUSE_C10D_NCCL"]),
defines = select({
# nvinfer1::IRuntimeConfig (and the matching ICudaEngine::createRuntimeConfig
# / createExecutionContext(IRuntimeConfig*) overloads) was introduced in
# TensorRT 10.11. The TensorRT shipped with the Jetpack l4t-r36.4 toolchain
# (@tensorrt_l4t) predates 10.11 and does not export this type. Every other
# configuration here (RTX, SBSA, Windows, default x86_64 Linux) is on a
# TensorRT >= 10.11 bundle, so it gets the macro.
#
# Gate every IRuntimeConfig-using site in core/runtime with
# `#ifdef TRT_HAS_IRUNTIME_CONFIG`; the Jetpack path falls back to the
# legacy createExecutionContext() no-arg overload.
":jetpack": [],
"//conditions:default": ["TRT_HAS_IRUNTIME_CONFIG"],
}),
linkopts = [
"-lstdc++fs",
],
local_defines = select({
Comment thread
tp5uiuc marked this conversation as resolved.
# TensorRT-RTX builds: opt into feature-gated APIs that the runtime layer
# depends on (e.g. IExecutionContext::isStreamCapturable).
":rtx_win": ["ENABLE_FEATURE_DISABLE_RUNTIME_ALLOCATION"],
":rtx_x86_64": ["ENABLE_FEATURE_DISABLE_RUNTIME_ALLOCATION"],
"//conditions:default": [],
}),
deps = [
"//core/plugins:torch_tensorrt_plugins",
"//core/util:prelude",
Expand Down Expand Up @@ -110,6 +133,7 @@ filegroup(
"RTDevice.h",
"TRTEngine.h",
"TRTEngineProfiler.h",
"TRTRuntimeConfig.h",
"runtime.h",
],
visibility = ["//visibility:public"],
Expand Down
127 changes: 84 additions & 43 deletions core/runtime/TRTEngine.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include <algorithm>
#include <filesystem>

#include <cuda_runtime.h>
#include "NvInfer.h"
Expand Down Expand Up @@ -61,26 +62,28 @@ void DynamicOutputAllocator::notifyShape(char const* tensorName, nvinfer1::Dims
}

TRTEngine::TRTEngine(
const std::string& serialized_engine,
std::string serialized_engine,
Comment thread
tp5uiuc marked this conversation as resolved.
const RTDevice& cuda_device,
const std::vector<std::string>& _in_binding_names,
const std::vector<std::string>& _out_binding_names,
const Platform& target_platform,
bool hardware_compatible,
bool requires_output_allocator,
const std::string& serialized_metadata,
const ResourceAllocationStrategy resource_allocation_strategy)
std::string serialized_metadata,
const ResourceAllocationStrategy resource_allocation_strategy,
TRTRuntimeConfig runtime_cfg)
: TRTEngine(
"deserialized_trt",
serialized_engine,
std::move(serialized_engine),
cuda_device,
_in_binding_names,
_out_binding_names,
target_platform,
hardware_compatible,
requires_output_allocator,
serialized_metadata,
resource_allocation_strategy) {}
std::move(serialized_metadata),
resource_allocation_strategy,
std::move(runtime_cfg)) {}

TRTEngine::TRTEngine(std::vector<std::string> serialized_info)
: TRTEngine(
Expand All @@ -95,24 +98,27 @@ TRTEngine::TRTEngine(std::vector<std::string> serialized_info)
serialized_info[SERIALIZED_METADATA_IDX],
(static_cast<bool>(std::stoi(serialized_info[RESOURCE_ALLOCATION_STRATEGY_IDX]))
? ResourceAllocationStrategy::kDynamic
: ResourceAllocationStrategy::kStatic)) {
: ResourceAllocationStrategy::kStatic),
make_runtime_config_from_serialized(serialized_info)) {
this->requires_native_multidevice = std::stoi(serialized_info[REQUIRES_NATIVE_MULTIDEVICE_IDX]);
if (this->requires_native_multidevice) {
LOG_INFO("Loaded distributed TRT engine (contains NCCL collectives); NCCL comm will be bound on first execution");
}
}

TRTEngine::TRTEngine(
const std::string& mod_name,
const std::string& serialized_engine,
std::string mod_name,
std::string serialized_engine,
const RTDevice& cuda_device,
const std::vector<std::string>& _in_binding_names,
const std::vector<std::string>& _out_binding_names,
const Platform& target_platform,
bool hardware_compatible,
bool requires_output_allocator,
const std::string& serialized_metadata,
const ResourceAllocationStrategy resource_allocation_strategy) {
std::string serialized_metadata,
const ResourceAllocationStrategy resource_allocation_strategy,
TRTRuntimeConfig runtime_cfg) {
this->runtime_cfg = std::move(runtime_cfg);
TORCHTRT_CHECK(
is_supported_on_current_platform(target_platform),
"This engine was not built to run on this platform (built for: " << target_platform << ", current platform: "
Expand All @@ -123,15 +129,15 @@ TRTEngine::TRTEngine(
auto most_compatible_device = get_most_compatible_device(cuda_device, RTDevice(), hardware_compatible);
TORCHTRT_CHECK(most_compatible_device, "No compatible device was found for instantiating TensorRT engine");

this->serialized_metadata = serialized_metadata;
this->serialized_metadata = std::move(serialized_metadata);
this->requires_output_allocator = requires_output_allocator;
device_info = most_compatible_device.value();
multi_gpu_device_check();
set_rt_device(device_info);

rt = make_trt(nvinfer1::createInferRuntime(util::logging::get_logger()));

name = slugify(mod_name);
name = slugify(std::move(mod_name));

cuda_engine = make_trt(rt->deserializeCudaEngine(serialized_engine.c_str(), serialized_engine.size()));
TORCHTRT_CHECK((cuda_engine.get() != nullptr), "Unable to deserialize the TensorRT engine");
Expand All @@ -146,13 +152,7 @@ TRTEngine::TRTEngine(
LOG_DEBUG(
"Resource allocation strategy: "
<< (this->resource_allocation_strategy == ResourceAllocationStrategy::kDynamic ? "Dynamic" : "Static"));
if (this->resource_allocation_strategy == ResourceAllocationStrategy::kDynamic) {
this->exec_ctx =
make_trt(cuda_engine->createExecutionContext(nvinfer1::ExecutionContextAllocationStrategy::kUSER_MANAGED));
} else {
this->exec_ctx = make_trt(cuda_engine->createExecutionContext());
}
TORCHTRT_CHECK((exec_ctx.get() != nullptr), "Unable to create TensorRT execution context");
recreate_execution_context();

// Pre-allocate placeholder for empty tensors (TensorRT requires non-null addresses)
cudaMalloc(&empty_tensor_placeholder, 1);
Expand Down Expand Up @@ -288,6 +288,9 @@ TRTEngine::TRTEngine(
}

TRTEngine::~TRTEngine() {
// Marked noexcept so safe to invoke from a destructor without
// explicit try/catch; any I/O error is logged internally.
runtime_cfg.save_runtime_cache();
trt_engine_profiler.reset();
exec_ctx.reset();
cuda_engine.reset();
Expand All @@ -301,8 +304,7 @@ void TRTEngine::disable_profiling() {
torch::cuda::synchronize(device_info.id);
profile_execution = false;
trt_engine_profiler.reset();
exec_ctx = make_trt(cuda_engine->createExecutionContext());
TORCHTRT_CHECK((exec_ctx.get() != nullptr), "Unable to recreate TensorRT execution context");
recreate_execution_context();
Comment thread
tp5uiuc marked this conversation as resolved.
}

void TRTEngine::dump_engine_layer_info_to_file(const std::string& path) {
Expand Down Expand Up @@ -399,10 +401,7 @@ bool TRTEngine::set_device_memory_budget(int64_t budget) {
trt_engine_profiler.reset();
}
bool result = cuda_engine->setWeightStreamingBudgetV2(budget);
exec_ctx = make_trt(cuda_engine->createExecutionContext());
TORCHTRT_CHECK(
(exec_ctx.get() != nullptr),
"Unable to recreate TensorRT execution context after setting new device memory budget");
recreate_execution_context();
Comment thread
tp5uiuc marked this conversation as resolved.
if (profile_execution) {
enable_profiling();
}
Expand Down Expand Up @@ -459,6 +458,7 @@ std::string TRTEngine::to_str() const {
ss << " Target Platform: " << target_platform << std::endl;
ss << " Resource Allocation Strategy: " << (resource_allocation_strategy == ResourceAllocationStrategy::kDynamic ? "Dynamic" : "Static") << std::endl;
ss << " Multi-Device Engine: " << (requires_native_multidevice) << std::endl;
ss << runtime_cfg.to_str();
// clang-format on
return ss.str();
}
Expand Down Expand Up @@ -495,7 +495,14 @@ FlattenedState TRTEngine::__obj_flatten__() {
std::tuple("requires_output_allocator", serialized_info[REQUIRES_OUTPUT_ALLOCATOR_IDX]),
std::tuple("target_platform", serialized_info[TARGET_PLATFORM_IDX]),
std::tuple("resource_allocation_strategy", serialized_info[RESOURCE_ALLOCATION_STRATEGY_IDX]),
std::tuple("requires_native_multidevice", serialized_info[REQUIRES_NATIVE_MULTIDEVICE_IDX]));
std::tuple("requires_native_multidevice", serialized_info[REQUIRES_NATIVE_MULTIDEVICE_IDX])
#ifdef TRT_MAJOR_RTX
,
std::tuple("runtime_cache_path", serialized_info[RUNTIME_CACHE_PATH_IDX]),
std::tuple("dynamic_shapes_kernel_strategy", serialized_info[DYNAMIC_SHAPES_KERNEL_STRATEGY_IDX]),
std::tuple("cuda_graph_strategy", serialized_info[CUDA_GRAPH_STRATEGY_IDX])
#endif
);
}

std::vector<std::string> TRTEngine::serialize() {
Expand All @@ -522,6 +529,13 @@ std::vector<std::string> TRTEngine::serialize() {
this->resource_allocation_strategy == ResourceAllocationStrategy::kDynamic ? "1" : "0";
serialized_info[REQUIRES_NATIVE_MULTIDEVICE_IDX] = this->requires_native_multidevice ? "1" : "0";
// rank/world_size are runtime facts (may differ at load time); not serialized.
#ifdef TRT_MAJOR_RTX
serialized_info[RUNTIME_CACHE_PATH_IDX] = runtime_cfg.runtime_cache_path;
serialized_info[DYNAMIC_SHAPES_KERNEL_STRATEGY_IDX] = std::to_string(
static_cast<std::underlying_type_t<DynamicShapesKernelStrategy>>(runtime_cfg.dynamic_shapes_kernel_strategy));
serialized_info[CUDA_GRAPH_STRATEGY_IDX] =
std::to_string(static_cast<std::underlying_type_t<CudaGraphStrategyOption>>(runtime_cfg.cuda_graph_strategy));
#endif

return serialized_info;
}
Expand All @@ -533,14 +547,11 @@ void TRTEngine::reset_captured_graph() {
void TRTEngine::set_resource_allocation_strategy(TRTEngine::ResourceAllocationStrategy new_strategy) {
if (new_strategy != this->resource_allocation_strategy) {
this->resource_allocation_strategy = new_strategy;
if (this->resource_allocation_strategy == TRTEngine::ResourceAllocationStrategy::kDynamic) {
LOG_DEBUG("Setting resource allocation strategy to dynamic");
this->exec_ctx =
make_trt(cuda_engine->createExecutionContext(nvinfer1::ExecutionContextAllocationStrategy::kUSER_MANAGED));
} else {
LOG_DEBUG("Setting resource allocation strategy to static");
this->exec_ctx = make_trt(cuda_engine->createExecutionContext());
}
LOG_DEBUG(
"Setting resource allocation strategy to "
<< (this->resource_allocation_strategy == TRTEngine::ResourceAllocationStrategy::kDynamic ? "dynamic"
: "static"));
recreate_execution_context();
}
}

Expand Down Expand Up @@ -637,19 +648,49 @@ void TRTEngine::release_nccl_comm() {
LOG_INFO("Releasing NCCL communicator from engine '" << this->name << "'");
torch::cuda::synchronize(device_info.id);
this->exec_ctx.reset();
if (this->resource_allocation_strategy == ResourceAllocationStrategy::kDynamic) {
this->exec_ctx =
make_trt(cuda_engine->createExecutionContext(nvinfer1::ExecutionContextAllocationStrategy::kUSER_MANAGED));
} else {
this->exec_ctx = make_trt(cuda_engine->createExecutionContext());
}
TORCHTRT_CHECK(
(exec_ctx.get() != nullptr), "Unable to recreate TensorRT execution context after releasing NCCL comm");
recreate_execution_context();
this->nccl_initialized = false;
LOG_INFO("NCCL communicator released from engine '" << this->name << "'");
}
#endif // ENABLE_TRT_NCCL_COLLECTIVES

// Whether a single enqueue of this engine can be embedded in an outer whole-graph
// CUDA capture (e.g. CudaGraphsTorchTensorRTModule). The decision is delegated to
// the runtime config; non-RTX builds always report true.
bool TRTEngine::is_monolithic_capturable(cudaStream_t stream) const {
  auto* const ctx = exec_ctx.get();
  return runtime_cfg.is_monolithic_capturable(ctx, stream);
}

// One-shot disable of TensorRT-RTX native CUDA graph capture on this engine,
// invoked when an outer stream capture is detected. No-op on non-RTX builds.
void TRTEngine::disable_rtx_native_cudagraphs() {
  const bool previously_disabled = runtime_cfg.rtx_native_cudagraphs_disabled;
  runtime_cfg.disable_rtx_native_cudagraphs(name);
  const bool newly_disabled = !previously_disabled && runtime_cfg.rtx_native_cudagraphs_disabled;
  if (newly_disabled) {
    // The CUDA graph strategy on the IRuntimeConfig has been flipped; rebuild exec_ctx
    // so the new strategy takes effect for subsequent enqueueV3 calls.
    recreate_execution_context();
  }
}

void TRTEngine::recreate_execution_context() {
// Flush any kernels the previous execution context may have compiled into the
// runtime cache before creating the replacement. The destructor also saves, but
// doing it here guards against losing compiled kernels across profiling toggles,
// allocator changes, or process kills that happen between allocator changes and
// teardown. No-op on standard TensorRT or when no cache path is configured.
runtime_cfg.save_runtime_cache();
runtime_cfg.ensure_initialized(cuda_engine.get());
Comment thread
tp5uiuc marked this conversation as resolved.
const auto allocation_strategy = resource_allocation_strategy == ResourceAllocationStrategy::kDynamic
? nvinfer1::ExecutionContextAllocationStrategy::kUSER_MANAGED
: nvinfer1::ExecutionContextAllocationStrategy::kSTATIC;
runtime_cfg.set_execution_context_allocation_strategy(allocation_strategy);
#ifdef TRT_HAS_IRUNTIME_CONFIG
exec_ctx = make_trt(cuda_engine->createExecutionContext(runtime_cfg.config.get()));
#else
// Older standard TensorRT (e.g. Jetpack) doesn't expose IRuntimeConfig; fall back
// to the legacy createExecutionContext(strategy) overload. The
// set_execution_context_allocation_strategy call above is a no-op on this path.
exec_ctx = make_trt(cuda_engine->createExecutionContext(allocation_strategy));
#endif
TORCHTRT_CHECK(exec_ctx.get() != nullptr, "Unable to (re)create TensorRT execution context");
}

} // namespace runtime
} // namespace core
} // namespace torch_tensorrt
44 changes: 36 additions & 8 deletions core/runtime/TRTEngine.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include "torch/custom_class.h"

#include "core/runtime/TRTEngineProfiler.h"
#include "core/runtime/TRTRuntimeConfig.h"
#include "core/util/prelude.h"

// TensorRT 10.16+ has native NCCL collective support via IExecutionContext::setCommunicator()
Expand Down Expand Up @@ -45,7 +46,14 @@ using FlattenedState = std::tuple<
std::tuple<std::string, std::string>, // serialized metadata
std::tuple<std::string, std::string>, // Platform
std::tuple<std::string, std::string>, // Resource Allocation Strategy
std::tuple<std::string, std::string>>; // requires_native_multidevice
std::tuple<std::string, std::string> // requires_native_multidevice
#ifdef TRT_MAJOR_RTX
,
std::tuple<std::string, std::string>, // Runtime Cache Path (TRT-RTX)
std::tuple<std::string, std::string>, // Dynamic Shapes Kernel Strategy (TRT-RTX)
std::tuple<std::string, std::string> // CUDA Graph Strategy (TRT-RTX)
#endif
>;

struct TorchTRTRuntimeStates {
// Indicates whether CUDAGraphs were enabled in the previous execute_engine
Expand Down Expand Up @@ -140,31 +148,33 @@ struct TRTEngine : torch::CustomClassHolder {

~TRTEngine();
TRTEngine(
const std::string& serialized_engine,
std::string serialized_engine,
const RTDevice& cuda_device,
const std::vector<std::string>& in_binding_names,
const std::vector<std::string>& out_binding_names,
const Platform& target_platform = get_current_platform(),
bool hardware_compatible = false,
bool requires_output_allocator = false,
const std::string& serialized_metadata = "",
std::string serialized_metadata = "",
const TRTEngine::ResourceAllocationStrategy resource_allocation_strategy =
TRTEngine::ResourceAllocationStrategy::kStatic);
TRTEngine::ResourceAllocationStrategy::kStatic,
TRTRuntimeConfig runtime_cfg = TRTRuntimeConfig{});

TRTEngine(std::vector<std::string> serialized_info);

TRTEngine(
const std::string& mod_name,
const std::string& serialized_engine,
std::string mod_name,
std::string serialized_engine,
const RTDevice& cuda_device,
const std::vector<std::string>& in_binding_names,
const std::vector<std::string>& out_binding_names,
const Platform& target_platform = get_current_platform(),
bool hardware_compatible = false,
bool requires_output_allocator = false,
const std::string& serialized_metadata = "",
std::string serialized_metadata = "",
const TRTEngine::ResourceAllocationStrategy resource_allocation_strategy =
TRTEngine::ResourceAllocationStrategy::kStatic);
TRTEngine::ResourceAllocationStrategy::kStatic,
TRTRuntimeConfig runtime_cfg = TRTRuntimeConfig{});

std::string to_str() const;
static void verify_serialization_fmt(const std::vector<std::string>& serialized_info);
Expand Down Expand Up @@ -257,6 +267,24 @@ struct TRTEngine : torch::CustomClassHolder {
ResourceAllocationStrategy resource_allocation_strategy = kStatic;
void set_resource_allocation_strategy(ResourceAllocationStrategy new_strategy);
ResourceAllocationStrategy get_resource_allocation_strategy();

// All TensorRT-RTX-specific IRuntimeConfig state lives here. On non-RTX builds this
// still owns a shared IRuntimeConfig (so the execution-context allocation strategy is
// applied via the uniform code path) but the RTX-only setters become no-ops.
TRTRuntimeConfig runtime_cfg;

// Monolithic-capturability check used when this engine is wrapped by an outer whole-graph
// capture (e.g. CudaGraphsTorchTensorRTModule). Non-RTX builds always return true.
bool is_monolithic_capturable(cudaStream_t stream) const;

// Disable TensorRT-RTX native CUDA graph capture on this engine (one-shot, invoked when
// an outer stream capture is detected around execute_engine). No-op on non-RTX.
void disable_rtx_native_cudagraphs();

private:
// Single entry point that (re)creates exec_ctx. Also creates (once) the IRuntimeConfig
// owned by runtime_cfg and applies all runtime config settings.
void recreate_execution_context();
};

} // namespace runtime
Expand Down
Loading
Loading