microsoft · chilo-ms · Jan 27, 2026 · Nov 14, 2025 · Nov 14, 2025 · Nov 14, 2025
diff --git a/plugin_execution_providers/tensorrt/CMakeLists.txt b/plugin_execution_providers/tensorrt/CMakeLists.txt
@@ -5,6 +5,8 @@
 cmake_minimum_required(VERSION 3.26)
 project(TensorRTEp VERSION 1.0)
 set(CMAKE_CXX_STANDARD 17)
+set(plugin_ep_common_dir "${CMAKE_CURRENT_SOURCE_DIR}/../common")  # sibling "common" dir, resolved relative to this CMakeLists.txt so it still works when built via add_subdirectory()
+include("${plugin_ep_common_dir}/cmake/onnxruntime_library_utils.cmake")  # presumably provides set_onnxruntime_paths() used below -- confirm
 
 enable_language(CUDA) # via nvcc to get the CUDA tool kit
 file(TO_CMAKE_PATH "/usr/local/cuda" CUDAToolkit_ROOT)
@@ -28,12 +30,17 @@ endif()
 add_definitions(-DONNX_NAMESPACE=onnx)
 add_definitions(-DONNX_ML)
 add_definitions(-DNOMINMAX)
-file(GLOB tensorrt_src "./*.cc" "./utils/*.cc" "./cuda/unary_elementwise_ops_impl.cu" "./*.h")
+file(GLOB tensorrt_src CONFIGURE_DEPENDS "./src/*.cc" "./src/utils/*.cc" "./src/cuda/unary_elementwise_ops_impl.cu" "./src/*.h")  # CONFIGURE_DEPENDS: re-check the glob at build time so newly added sources are picked up
 add_library(TensorRTEp SHARED ${tensorrt_src})
 
-if (NOT ORT_HOME)
-  message(FATAL_ERROR "Please specify ORT_HOME, e.g. -DORT_HOME=/path/to/ort/")
-endif()
+set_onnxruntime_paths(  # helper not defined in this file; presumably comes from onnxruntime_library_utils.cmake -- confirm
+  ORT_HOME ${ORT_HOME}  # forwarded as-is; may be empty now that the FATAL_ERROR check on ORT_HOME is gone
+  DEFAULT_ORT_VERSION "1.23.2"  # presumably the ORT version used when ORT_HOME is not given -- TODO confirm against the helper
+  ORT_INCLUDE_DIR_VAR ORT_INCLUDE_DIR  # name of the variable that receives the include directory
+  ORT_LIBRARY_DIR_VAR ORT_LIBRARY_DIR)  # name of the variable that receives the library directory
+# NOTE(review): later code still reads ${ORT_HOME}/lib and ${ORT_HOME}/include directly; confirm whether it should use ORT_LIBRARY_DIR / ORT_INCLUDE_DIR.
+message(STATUS "ORT_LIBRARY_DIR: ${ORT_LIBRARY_DIR}")  # log the resolved library directory at configure time
+message(STATUS "ORT_INCLUDE_DIR: ${ORT_INCLUDE_DIR}")  # log the resolved include directory at configure time
 
 if (NOT TENSORRT_HOME)
   message(FATAL_ERROR "Please specify TENSORRT_HOME, e.g. -DTENSORRT_HOME=/path/to/trt/")
@@ -111,7 +118,7 @@ if (WIN32) # Windows
                 "${DEPS_PATH}/onnx-build/${CMAKE_BUILD_TYPE}/onnx_proto.lib")
 
   set(TRT_EP_LIB_LINK_FLAG
-        "-DEF:${CMAKE_SOURCE_DIR}/tensorrt_execution_provider.def")
+        "-DEF:${CMAKE_CURRENT_SOURCE_DIR}/src/tensorrt_execution_provider.def")  # module-definition file passed to the linker; path is relative to this CMakeLists.txt so subproject builds resolve it too
 
 else() # Linux
   set(ORT_LIB "${ORT_HOME}/lib/libonnxruntime.so")
@@ -142,7 +149,7 @@ set_property(TARGET TensorRTEp APPEND_STRING PROPERTY LINK_FLAGS
                ${TRT_EP_LIB_LINK_FLAG})
 
 target_include_directories(TensorRTEp PUBLIC "${ORT_HOME}/include"
-                                             "./utils"
+                                             "./src/utils"
                                              "/usr/local/cuda/include"
                                              "${TENSORRT_HOME}/include"
                                              "${DEPS_PATH}/flatbuffers-src/include"

diff --git a/...rt/cuda/cu_inc/unary_elementwise_impl.cuh → ...rc/cuda/cu_inc/unary_elementwise_impl.cuh b/...rt/cuda/cu_inc/unary_elementwise_impl.cuh → ...rc/cuda/cu_inc/unary_elementwise_impl.cuh
diff --git a/...nsorrt/cuda/unary_elementwise_ops_impl.cu → ...rt/src/cuda/unary_elementwise_ops_impl.cu b/...nsorrt/cuda/unary_elementwise_ops_impl.cu → ...rt/src/cuda/unary_elementwise_ops_impl.cu
diff --git a/...ensorrt/cuda/unary_elementwise_ops_impl.h → ...rrt/src/cuda/unary_elementwise_ops_impl.h b/...ensorrt/cuda/unary_elementwise_ops_impl.h → ...rrt/src/cuda/unary_elementwise_ops_impl.h
diff --git a/...tion_providers/tensorrt/cuda_allocator.cc → ..._providers/tensorrt/src/cuda_allocator.cc b/...tion_providers/tensorrt/cuda_allocator.cc → ..._providers/tensorrt/src/cuda_allocator.cc
diff --git a/...ution_providers/tensorrt/cuda_allocator.h → ...n_providers/tensorrt/src/cuda_allocator.h b/...ution_providers/tensorrt/cuda_allocator.h → ...n_providers/tensorrt/src/cuda_allocator.h
diff --git a/...xecution_providers/tensorrt/nv_includes.h → ...tion_providers/tensorrt/src/nv_includes.h b/...xecution_providers/tensorrt/nv_includes.h → ...tion_providers/tensorrt/src/nv_includes.h
diff --git a/...oviders/tensorrt/onnx_ctx_model_helper.cc → ...ers/tensorrt/src/onnx_ctx_model_helper.cc b/...oviders/tensorrt/onnx_ctx_model_helper.cc → ...ers/tensorrt/src/onnx_ctx_model_helper.cc
diff --git a/...roviders/tensorrt/onnx_ctx_model_helper.h → ...ders/tensorrt/src/onnx_ctx_model_helper.h b/...roviders/tensorrt/onnx_ctx_model_helper.h → ...ders/tensorrt/src/onnx_ctx_model_helper.h
diff --git a/...ers/tensorrt/ort_trt_int8_cal_table.fbs.h → ...tensorrt/src/ort_trt_int8_cal_table.fbs.h b/...ers/tensorrt/ort_trt_int8_cal_table.fbs.h → ...tensorrt/src/ort_trt_int8_cal_table.fbs.h
diff --git a/...s/tensorrt/tensorrt_execution_provider.cc → ...nsorrt/src/tensorrt_execution_provider.cc b/...s/tensorrt/tensorrt_execution_provider.cc → ...nsorrt/src/tensorrt_execution_provider.cc
diff --git a/.../tensorrt/tensorrt_execution_provider.def → ...sorrt/src/tensorrt_execution_provider.def b/.../tensorrt/tensorrt_execution_provider.def → ...sorrt/src/tensorrt_execution_provider.def
diff --git a/...rs/tensorrt/tensorrt_execution_provider.h → ...ensorrt/src/tensorrt_execution_provider.h b/...rs/tensorrt/tensorrt_execution_provider.h → ...ensorrt/src/tensorrt_execution_provider.h
@@ -124,6 +124,7 @@ struct TensorrtComputeState {
   std::string compute_capability;
   size_t max_workspace_size = 1 << 30;  // 1GB;
   bool fp16_enable = false;
+  bool bf16_enable = false;
   bool int8_enable = false;
   bool int8_calibration_cache_available = false;
   bool dla_enable = false;
@@ -276,6 +277,7 @@ struct TensorrtExecutionProvider : public OrtEp, public ApiPtrs {
   size_t max_workspace_size_ = 1 << 30;  // 1GB
   bool fp16_enable_ = false;
   bool int8_enable_ = false;
+  bool bf16_enable_ = false;
   bool dla_enable_ = false;
   int dla_core_ = 0;
   bool force_sequential_engine_build_ = false;

diff --git a/.../tensorrt/tensorrt_execution_provider.lds → ...sorrt/src/tensorrt_execution_provider.lds b/.../tensorrt/tensorrt_execution_provider.lds → ...sorrt/src/tensorrt_execution_provider.lds
diff --git a/...sorrt_execution_provider_data_transfer.cc → ...sorrt_execution_provider_data_transfer.cc b/...sorrt_execution_provider_data_transfer.cc → ...sorrt_execution_provider_data_transfer.cc
diff --git a/...nsorrt_execution_provider_data_transfer.h → ...nsorrt_execution_provider_data_transfer.h b/...nsorrt_execution_provider_data_transfer.h → ...nsorrt_execution_provider_data_transfer.h
diff --git a/...sorrt/tensorrt_execution_provider_info.cc → ...t/src/tensorrt_execution_provider_info.cc b/...sorrt/tensorrt_execution_provider_info.cc → ...t/src/tensorrt_execution_provider_info.cc
@@ -18,6 +18,7 @@ constexpr const char* kMinSubgraphSize = "trt_min_subgraph_size";
 constexpr const char* kMaxWorkspaceSize = "trt_max_workspace_size";
 constexpr const char* kFp16Enable = "trt_fp16_enable";
 constexpr const char* kInt8Enable = "trt_int8_enable";
+constexpr const char* kBf16Enable = "trt_bf16_enable";
 constexpr const char* kInt8CalibTable = "trt_int8_calibration_table_name";
 constexpr const char* kInt8UseNativeCalibTable = "trt_int8_use_native_calibration_table";
 constexpr const char* kDLAEnable = "trt_dla_enable";
@@ -95,6 +96,7 @@ TensorrtExecutionProviderInfo TensorrtExecutionProviderInfo::FromProviderOptions
           .AddAssignmentToReference(tensorrt::provider_option_names::kMaxWorkspaceSize, info.max_workspace_size)
           .AddAssignmentToReference(tensorrt::provider_option_names::kFp16Enable, info.fp16_enable)
           .AddAssignmentToReference(tensorrt::provider_option_names::kInt8Enable, info.int8_enable)
+          .AddAssignmentToReference(tensorrt::provider_option_names::kBf16Enable, info.bf16_enable)
           .AddAssignmentToReference(tensorrt::provider_option_names::kInt8CalibTable, info.int8_calibration_table_name)
           .AddAssignmentToReference(tensorrt::provider_option_names::kInt8UseNativeCalibTable, info.int8_use_native_calibration_table)
           .AddAssignmentToReference(tensorrt::provider_option_names::kDLAEnable, info.dla_enable)

diff --git a/...nsorrt/tensorrt_execution_provider_info.h → ...rt/src/tensorrt_execution_provider_info.h b/...nsorrt/tensorrt_execution_provider_info.h → ...rt/src/tensorrt_execution_provider_info.h
@@ -18,6 +18,7 @@ struct TensorrtExecutionProviderInfo {
   size_t max_workspace_size{1 << 30};
   bool fp16_enable{false};
   bool int8_enable{false};
+  bool bf16_enable{false};
   std::string int8_calibration_table_name{""};
   bool int8_use_native_calibration_table{false};
   bool dla_enable{false};

diff --git a/...orrt_execution_provider_stream_support.cc → ...orrt_execution_provider_stream_support.cc b/...orrt_execution_provider_stream_support.cc → ...orrt_execution_provider_stream_support.cc
diff --git a/...sorrt_execution_provider_stream_support.h → ...sorrt_execution_provider_stream_support.h b/...sorrt_execution_provider_stream_support.h → ...sorrt_execution_provider_stream_support.h
diff --git a/...sorrt/tensorrt_execution_provider_utils.h → ...t/src/tensorrt_execution_provider_utils.h b/...sorrt/tensorrt_execution_provider_utils.h → ...t/src/tensorrt_execution_provider_utils.h
@@ -55,9 +55,6 @@ AllocatorUniquePtr<T> MakeUniquePtrFromOrtAllocator(OrtAllocator* ort_allocator,
   return AllocatorUniquePtr<T>{p, [ort_allocator](T* p) { ort_allocator->Free(ort_allocator, p); }};
 }
 
-// Following helper functions/struct, GetNodeInputEdgeCount, GetOutputNodes, KahnsTopologicalSort, VisitorPriorityQueue, PriorityNodeCompare are added but are not used for now.
-// TODO: They will be used for graph partition in the following PR.
-
 template <typename T>
 struct VisitorPriorityQueue {
   using ComparatorType = std::function<bool(T, T)>;

diff --git a/...ers/tensorrt/tensorrt_provider_factory.cc → ...tensorrt/src/tensorrt_provider_factory.cc b/...ers/tensorrt/tensorrt_provider_factory.cc → ...tensorrt/src/tensorrt_provider_factory.cc
@@ -14,7 +14,7 @@
 namespace trt_ep {
 
 TensorrtExecutionProviderFactory::TensorrtExecutionProviderFactory(const char* ep_name, const OrtLogger& default_logger, ApiPtrs apis)
-    : ApiPtrs(apis), default_logger_{default_logger}, ep_name_{ep_name} {
+    : OrtEpFactory {}, ApiPtrs(apis), default_logger_{default_logger}, ep_name_{ep_name} {
   ort_version_supported = ORT_API_VERSION;  // set to the ORT version we were compiled with.
   GetName = GetNameImpl;
   GetVendor = GetVendorImpl;

diff --git a/...ders/tensorrt/tensorrt_provider_factory.h → .../tensorrt/src/tensorrt_provider_factory.h b/...ders/tensorrt/tensorrt_provider_factory.h → .../tensorrt/src/tensorrt_provider_factory.h
diff --git a/...providers/tensorrt/utils/cuda/cuda_call.h → ...iders/tensorrt/src/utils/cuda/cuda_call.h b/...providers/tensorrt/utils/cuda/cuda_call.h → ...iders/tensorrt/src/utils/cuda/cuda_call.h
diff --git a/...oviders/tensorrt/utils/cuda/cuda_common.h → ...ers/tensorrt/src/utils/cuda/cuda_common.h b/...oviders/tensorrt/utils/cuda/cuda_common.h → ...ers/tensorrt/src/utils/cuda/cuda_common.h
diff --git a/...ution_providers/tensorrt/utils/ep_utils.h → ...n_providers/tensorrt/src/utils/ep_utils.h b/...ution_providers/tensorrt/utils/ep_utils.h → ...n_providers/tensorrt/src/utils/ep_utils.h
diff --git a/...cution_providers/tensorrt/utils/helper.cc → ...on_providers/tensorrt/src/utils/helper.cc b/...cution_providers/tensorrt/utils/helper.cc → ...on_providers/tensorrt/src/utils/helper.cc
diff --git a/...on_providers/tensorrt/utils/make_string.h → ...roviders/tensorrt/src/utils/make_string.h b/...on_providers/tensorrt/utils/make_string.h → ...roviders/tensorrt/src/utils/make_string.h