Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2,024 changes: 2,024 additions & 0 deletions ATTRIBUTIONS-Python.md

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion cpp/include/tensorrt_llm/executor/transferAgent.h
Original file line number Diff line number Diff line change
Expand Up @@ -296,7 +296,8 @@ struct BaseAgentConfig
bool useProgThread;
bool multiThread;
bool useListenThread;
unsigned int numWorkers;
bool enableTelemetry;
std::unordered_map<std::string, std::string> backendParams;
};

class BaseTransferAgent
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,7 @@ AgentConnectionManager::AgentConnectionManager(

mAgentName = genUniqueAgentName();
// Create Agent
BaseAgentConfig config{mAgentName, true, false, true, 1};
BaseAgentConfig config{mAgentName, true, false, true};
m_Agent = makeTransferAgent(backendType, &config);
TLLM_CHECK(!mCacheTransBufferManagers.empty());
std::vector<MemoryDesc> memDescs;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,9 @@
#endif

#include <nanobind/nanobind.h>
#include <nanobind/stl/function.h>
#include <nanobind/stl/optional.h>
#include <nanobind/stl/pair.h>
#include <nanobind/stl/string.h>
#include <nanobind/stl/tuple.h>
#include <nanobind/stl/unordered_map.h>
#include <nanobind/stl/vector.h>

Expand Down Expand Up @@ -69,6 +68,21 @@ NB_MODULE(tensorrt_llm_transfer_agent_binding, m)
// MemoryDescs class
nb::class_<kvc::MemoryDescs>(m, "MemoryDescs")
.def(nb::init<kvc::MemoryType, std::vector<kvc::MemoryDesc>>(), nb::arg("type"), nb::arg("descs"))
// Batch constructor from list of tuples: [(ptr, size, device_id), ...]
.def(
"__init__",
[](kvc::MemoryDescs* self, kvc::MemoryType type,
std::vector<std::tuple<uintptr_t, size_t, uint32_t>> const& tuples)
{
std::vector<kvc::MemoryDesc> descs;
descs.reserve(tuples.size());
for (auto const& [addr, len, deviceId] : tuples)
{
descs.emplace_back(addr, len, deviceId);
}
new (self) kvc::MemoryDescs(type, std::move(descs));
},
nb::arg("type"), nb::arg("tuples"))
.def_prop_ro("type", &kvc::MemoryDescs::getType)
.def_prop_ro("descs", &kvc::MemoryDescs::getDescs);

Expand Down Expand Up @@ -113,17 +127,21 @@ NB_MODULE(tensorrt_llm_transfer_agent_binding, m)
.def(
"__init__",
[](kvc::BaseAgentConfig* self, std::string name, bool use_prog_thread, bool multi_thread,
bool use_listen_thread, unsigned int num_workers) {
new (self) kvc::BaseAgentConfig{
std::move(name), use_prog_thread, multi_thread, use_listen_thread, num_workers};
bool use_listen_thread, bool enable_telemetry,
std::unordered_map<std::string, std::string> backend_params)
{
new (self) kvc::BaseAgentConfig{std::move(name), use_prog_thread, multi_thread, use_listen_thread,
enable_telemetry, std::move(backend_params)};
},
nb::arg("name"), nb::arg("use_prog_thread") = true, nb::arg("multi_thread") = false,
nb::arg("use_listen_thread") = false, nb::arg("num_workers") = 1)
nb::arg("use_listen_thread") = false, nb::arg("enable_telemetry") = false,
nb::arg("backend_params") = std::unordered_map<std::string, std::string>{})
.def_rw("name", &kvc::BaseAgentConfig::mName)
.def_rw("use_prog_thread", &kvc::BaseAgentConfig::useProgThread)
.def_rw("multi_thread", &kvc::BaseAgentConfig::multiThread)
.def_rw("use_listen_thread", &kvc::BaseAgentConfig::useListenThread)
.def_rw("num_workers", &kvc::BaseAgentConfig::numWorkers);
.def_rw("enable_telemetry", &kvc::BaseAgentConfig::enableTelemetry)
.def_rw("backend_params", &kvc::BaseAgentConfig::backendParams);

// BaseTransferAgent class (abstract base)
nb::class_<kvc::BaseTransferAgent>(m, "BaseTransferAgent")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,19 @@ PYBIND11_MODULE(tensorrt_llm_transfer_agent_binding, m)
// MemoryDescs class
py::class_<kvc::MemoryDescs>(m, "MemoryDescs")
.def(py::init<kvc::MemoryType, std::vector<kvc::MemoryDesc>>(), py::arg("type"), py::arg("descs"))
// Batch constructor from list of tuples: [(ptr, size, device_id), ...]
.def(py::init(
[](kvc::MemoryType type, std::vector<std::tuple<uintptr_t, size_t, uint32_t>> const& tuples)
{
std::vector<kvc::MemoryDesc> descs;
descs.reserve(tuples.size());
for (auto const& [addr, len, deviceId] : tuples)
{
descs.emplace_back(addr, len, deviceId);
}
return kvc::MemoryDescs(type, std::move(descs));
}),
py::arg("type"), py::arg("tuples"))
.def_property_readonly("type", &kvc::MemoryDescs::getType)
.def_property_readonly("descs", &kvc::MemoryDescs::getDescs);

Expand Down Expand Up @@ -108,17 +121,20 @@ PYBIND11_MODULE(tensorrt_llm_transfer_agent_binding, m)
.def(py::init<>())
.def(py::init(
[](std::string name, bool use_prog_thread, bool multi_thread, bool use_listen_thread,
unsigned int num_workers) {
return kvc::BaseAgentConfig{
std::move(name), use_prog_thread, multi_thread, use_listen_thread, num_workers};
bool enable_telemetry, std::unordered_map<std::string, std::string> backend_params)
{
return kvc::BaseAgentConfig{std::move(name), use_prog_thread, multi_thread, use_listen_thread,
enable_telemetry, std::move(backend_params)};
}),
py::arg("name"), py::arg("use_prog_thread") = true, py::arg("multi_thread") = false,
py::arg("use_listen_thread") = false, py::arg("num_workers") = 1)
py::arg("use_listen_thread") = false, py::arg("enable_telemetry") = false,
py::arg("backend_params") = std::unordered_map<std::string, std::string>{})
.def_readwrite("name", &kvc::BaseAgentConfig::mName)
.def_readwrite("use_prog_thread", &kvc::BaseAgentConfig::useProgThread)
.def_readwrite("multi_thread", &kvc::BaseAgentConfig::multiThread)
.def_readwrite("use_listen_thread", &kvc::BaseAgentConfig::useListenThread)
.def_readwrite("num_workers", &kvc::BaseAgentConfig::numWorkers);
.def_readwrite("enable_telemetry", &kvc::BaseAgentConfig::enableTelemetry)
.def_readwrite("backend_params", &kvc::BaseAgentConfig::backendParams);

// BaseTransferAgent class (abstract base)
py::class_<kvc::BaseTransferAgent>(m, "BaseTransferAgent")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -374,22 +374,28 @@ NixlTransferAgent::NixlTransferAgent(BaseAgentConfig const& config)
}
auto envPort = common::getEnvNixlPort();
uint16_t port = envPort > 0 ? getIncrmentPort(envPort) : getAvailablePort();
nixlAgentConfig nixlConfig{
config.useProgThread, true, port, nixl_thread_sync_t::NIXL_THREAD_SYNC_DEFAULT, config.numWorkers};
uint32_t numWorker = config.backendParams.find("num_workers") != config.backendParams.end()
? std::stoi(config.backendParams.at("num_workers"))
: 1;
nixlAgentConfig nixlConfig{config.useProgThread, true, port, nixl_thread_sync_t::NIXL_THREAD_SYNC_DEFAULT,
numWorker, 0, 10000, config.enableTelemetry};
mAddress = getAvailableIP() + ":" + std::to_string(port);
mRawAgent = std::make_unique<nixlAgent>(config.mName, std::move(nixlConfig));
}
else
{
uint32_t numWorker = config.backendParams.find("num_workers") != config.backendParams.end()
? std::stoi(config.backendParams.at("num_workers"))
: 1;
mAddress.clear();
nixlAgentConfig nixlConfig{
config.useProgThread, false, 0, nixl_thread_sync_t::NIXL_THREAD_SYNC_DEFAULT, config.numWorkers};
nixlAgentConfig nixlConfig{config.useProgThread, false, 0, nixl_thread_sync_t::NIXL_THREAD_SYNC_DEFAULT,
numWorker, 0, 10000, config.enableTelemetry};
mRawAgent = std::make_unique<nixlAgent>(config.mName, std::move(nixlConfig));
}

std::string nixlBackend = common::getEnvNixlBackend();
// List of supported backends - extend this list as new backends are added
static const std::set<std::string> kSUPPORTED_BACKENDS = {"UCX", "LIBFABRIC"};
static std::set<std::string> const kSUPPORTED_BACKENDS = {"UCX", "LIBFABRIC"};

if (kSUPPORTED_BACKENDS.find(nixlBackend) == kSUPPORTED_BACKENDS.end())
{
Expand All @@ -400,6 +406,11 @@ NixlTransferAgent::NixlTransferAgent(BaseAgentConfig const& config)
TLLM_LOG_INFO("NixlTransferAgent::NixlTransferAgent using NIXL backend: %s", nixlBackend.c_str());

nixl_b_params_t init1;
for (auto const& [key, value] : config.backendParams)
{
init1[key] = value;
TLLM_LOG_INFO("NixlTransferAgent::NixlTransferAgent backendParams: %s: %s", key.c_str(), value.c_str());
}
nixl_mem_list_t mems1;
status = mRawAgent->getPluginParams(nixlBackend.c_str(), mems1, init1);
TLLM_CHECK(status == NIXL_SUCCESS);
Expand Down
1 change: 1 addition & 0 deletions cpp/tensorrt_llm/nanobind/batch_manager/bindings.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,7 @@ void initBindings(nb::module_& m)
.def_prop_ro("context_phase_params", &GenLlmReq::getContextPhaseParams)
.def_prop_ro("is_context_only_request", &GenLlmReq::isContextOnlyRequest)
.def_prop_ro("is_generation_only_request", &GenLlmReq::isGenerationOnlyRequest)
.def_prop_ro("is_generation_to_complete_state", &GenLlmReq::isGenerationToCompleteState)
.def_prop_ro("is_generation_complete_state", &GenLlmReq::isGenerationCompleteState)
.def_prop_ro("is_context_finished", &GenLlmReq::isContextFinished)
.def_prop_ro("is_disagg_generation_init_state", &GenLlmReq::isDisaggGenerationInitState)
Expand Down
1 change: 1 addition & 0 deletions cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,7 @@ void initBindings(pybind11::module_& m)
.def_property_readonly("context_phase_params", &GenLlmReq::getContextPhaseParams)
.def_property_readonly("is_context_only_request", &GenLlmReq::isContextOnlyRequest)
.def_property_readonly("is_generation_only_request", &GenLlmReq::isGenerationOnlyRequest)
.def_property_readonly("is_generation_to_complete_state", &GenLlmReq::isGenerationToCompleteState)
.def_property_readonly("is_generation_complete_state", &GenLlmReq::isGenerationCompleteState)
.def_property_readonly("is_context_finished", &GenLlmReq::isContextFinished)
.def_property_readonly("is_disagg_generation_init_state", &GenLlmReq::isDisaggGenerationInitState)
Expand Down
12 changes: 6 additions & 6 deletions cpp/tests/unit_tests/executor/transferAgentTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ TEST_P(TransferAgentTest, Basic)
{

std::string const agent0{"agent0"}, agent1{"agent1"};
BaseAgentConfig config0{agent0, true, false, true, 1}, config1{agent1, true, false, true, 1};
BaseAgentConfig config0{agent0, true, false, true}, config1{agent1, true, false, true};
auto xferAgent0 = makeTransferAgent(config0);
auto xferAgent1 = makeTransferAgent(config1);

Expand Down Expand Up @@ -127,7 +127,7 @@ TEST_P(TransferAgentTest, Basic2)
{

std::string const agent0{"agent0"}, agent1{"agent1"};
BaseAgentConfig config0{agent0, true, false, true, 1}, config1{agent1, true, false, true, 1};
BaseAgentConfig config0{agent0, true, false, true}, config1{agent1, true, false, true};
auto xferAgent0 = makeTransferAgent(config0);
auto xferAgent1 = makeTransferAgent(config1);

Expand Down Expand Up @@ -162,7 +162,7 @@ TEST_P(TransferAgentTest, DeviceMemory)
{

std::string const agent0{"agent0"}, agent1{"agent1"};
BaseAgentConfig config0{agent0, true, false, true, 1}, config1{agent1, true, false, true, 1};
BaseAgentConfig config0{agent0, true, false, true}, config1{agent1, true, false, true};
auto xferAgent0 = makeTransferAgent(config0);
auto xferAgent1 = makeTransferAgent(config1);

Expand Down Expand Up @@ -209,8 +209,8 @@ TEST_P(TransferAgentTest, Connect)
{

std::string const agent0{"agent0"}, agent1{"agent1"}, agent2{"agent2"};
BaseAgentConfig config0{agent0, true, false, true, 1}, config1{agent1, true, false, true, 1},
config2{agent2, true, false, true, 1};
BaseAgentConfig config0{agent0, true, false, true}, config1{agent1, true, false, true},
config2{agent2, true, false, true};
auto xferAgent0 = makeTransferAgent(config0);
auto xferAgent1 = makeTransferAgent(config1);
auto xferAgent2 = makeTransferAgent(config2);
Expand Down Expand Up @@ -261,7 +261,7 @@ TEST_P(TransferAgentTest, SyncMessage)
{
constexpr std::size_t MAX_QUERY_TIMES = std::numeric_limits<size_t>::max();
std::string const agent0{"agent0"}, agent1{"agent1"};
BaseAgentConfig config0{agent0, true, false, true, 1}, config1{agent1, true, false, true, 1};
BaseAgentConfig config0{agent0, true, false, true}, config1{agent1, true, false, true};
auto xferAgent0 = makeTransferAgent(config0);
auto xferAgent1 = makeTransferAgent(config1);

Expand Down
11 changes: 5 additions & 6 deletions examples/auto_deploy/build_and_run_ad.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
SettingsConfigDict,
)

from tensorrt_llm._torch.auto_deploy import LLM, AutoDeployConfig, DemoLLM
from tensorrt_llm._torch.auto_deploy import LLM, DemoLLM
from tensorrt_llm._torch.auto_deploy.llm_args import LlmArgs
from tensorrt_llm._torch.auto_deploy.utils._config import (
DynamicYamlMixInForSettings,
Expand Down Expand Up @@ -142,7 +142,6 @@ class ExperimentConfig(DynamicYamlMixInForSettings, BaseSettings):
# The main AutoDeploy arguments - contains model, tokenizer, backend configs, etc.
args: LlmArgs = Field(
description="The main AutoDeploy arguments containing model, tokenizer, backend configs, etc. "
"Contains all the fields from `AutoDeployConfig` and `BaseLlmArgs`. "
"Please check `tensorrt_llm._torch.auto_deploy.llm_args.LlmArgs` for more details."
)

Expand Down Expand Up @@ -213,15 +212,15 @@ def process_extra_cli_args(cls, data: Dict) -> Dict:
def sync_model_with_args(cls, model_value, info):
if "args" not in info.data:
return model_value
args: AutoDeployConfig = info.data["args"]
args: LlmArgs = info.data["args"]
return args.model

@field_validator("prompt", mode="after")
@classmethod
def sync_prompt_batch_size_with_args_max_batch_size(cls, prompt: PromptConfig, info):
if "args" not in info.data:
return prompt
args: AutoDeployConfig = info.data["args"]
args: LlmArgs = info.data["args"]
if args.max_batch_size < prompt.batch_size:
args.max_batch_size = prompt.batch_size
return prompt
Expand All @@ -231,7 +230,7 @@ def sync_prompt_batch_size_with_args_max_batch_size(cls, prompt: PromptConfig, i
def adjust_args_for_benchmark(cls, benchmark: BenchmarkConfig, info):
if "args" not in info.data:
return benchmark
args: AutoDeployConfig = info.data["args"]
args: LlmArgs = info.data["args"]
if benchmark.enabled:
# propagate benchmark settings to args
args.max_batch_size = max(benchmark.bs, args.max_batch_size)
Expand All @@ -246,7 +245,7 @@ def build_llm_from_config(config: ExperimentConfig) -> LLM:
"demollm": DemoLLM,
"trtllm": LLM,
}
llm = llm_lookup[config.args.runtime](**config.args.to_llm_kwargs())
llm = llm_lookup[config.args.runtime](**config.args.model_dump(exclude_unset=True))
return llm


Expand Down
1 change: 1 addition & 0 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -38,3 +38,4 @@ opentelemetry-semantic-conventions-ai>=0.4.1
fuzzywuzzy==0.18.0
aiperf==0.3.0
nanobind>=2.9.0
nixl==0.8.0
Loading
Loading