Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 3 additions & 18 deletions conf/experimental/ai_dynamo/test/sglang.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
name = "sglang"
description = "sglang backend"
test_template_name = "AIDynamo"
extra_container_mounts = ["/run/udev:/run/udev", "/tmp:/tmp"]
extra_container_mounts = ["/run/udev:/run/udev"]

[cmd_args]
docker_image_url = "nvcr.io/nvidia/ai-dynamo/sglang-runtime:1.1.1"
Expand Down Expand Up @@ -59,23 +59,8 @@ workloads = "aiperf.sh"
host = "0.0.0.0"
disaggregation-transfer-backend = "nixl"

[cmd_args.lmcache]
controller_cmd = "lmcache_controller --host localhost --port 9000 --monitor-port 9001"

[cmd_args.lmcache.args]
chunk_size = 256
local_cpu = false
nixl_buffer_size = 10737418240
nixl_buffer_device = "cuda"
extra_config_enable_nixl_storage = true
extra_config_nixl_backend = "GDS_MT"
extra_config_nixl_file_pool_size = 64

enable_controller = true
lmcache_instance_id = "lmcache_default_instance"
controller_url = "localhost:9001"
lmcache_worker_port = 8788
distributed_url = "localhost:8789"
[cmd_args.lmcache_controller]
cmd = "lmcache_controller --host 0.0.0.0 --port 9000 --monitor-port 9001"

[cmd_args.genai_perf]
cmd = "genai-perf profile"
Expand Down
21 changes: 3 additions & 18 deletions conf/experimental/ai_dynamo/test/vllm.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
name = "vLLM"
description = "vLLM backend"
test_template_name = "AIDynamo"
extra_container_mounts = ["/run/udev:/run/udev", "/tmp:/tmp"]
extra_container_mounts = ["/run/udev:/run/udev"]

[cmd_args]
docker_image_url = "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.1.1"
Expand Down Expand Up @@ -53,23 +53,8 @@ workloads = "aiperf.sh"
data-parallel-size = 1
kv-transfer-config = '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'

[cmd_args.lmcache]
controller_cmd = "lmcache_controller --host localhost --port 9000 --monitor-port 9001"

[cmd_args.lmcache.args]
chunk_size = 256
local_cpu = false
nixl_buffer_size = 10737418240
nixl_buffer_device = "cuda"
extra_config_enable_nixl_storage = true
extra_config_nixl_backend = "GDS_MT"
extra_config_nixl_file_pool_size = 64

enable_controller = true
lmcache_instance_id = "lmcache_default_instance"
controller_url = "localhost:9001"
lmcache_worker_port = 8788
distributed_url = "localhost:8789"
[cmd_args.lmcache_controller]
cmd = "lmcache_controller --host 0.0.0.0 --port 9000 --monitor-port 9001"

[cmd_args.genai_perf]
cmd = "genai-perf profile"
Expand Down
117 changes: 117 additions & 0 deletions conf/experimental/ai_dynamo/test_scenario/vllm_lmcache.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Scenario-level settings.
job_status_check = false
name = "dynamo-vllm-lmcache"

# The test is declared inline in this scenario, so no separate test TOML is referenced.
[[Tests]]
id = "test.disagg.lmcache-controller"
test_template_name = "AIDynamo"
name = "vllm-disagg-lmcache-controller"
description = """\
Self-contained AIDynamo scenario wiring vLLM disaggregated inference, \
LMCache config propagation, and LMCache controller launch.\
"""
time_limit = "00:10:00"
# lmcache_worker_ports is a list of ports (configuration data), not a sweep
# dimension, so it is kept out of the DSE parameter space.
dse_excluded_args = ["cmd_args.lmcache.lmcache_worker_ports"]
extra_container_mounts = ["/run/udev:/run/udev"]

[Tests.cmd_args]
workloads = "aiperf.sh"
docker_image_url = "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.1.1"
# Optional and disabled here. NOTE(review): presumably this is the source of the
# {storage_cache_dir} placeholder used by lmcache.extra_config.nixl_path - confirm
# before enabling NIXL storage.
# storage_cache_dir = "/lustre/.../install/tmp"

[Tests.cmd_args.lmcache]
# Controller wiring. The 8300/8400 ports mirror the "pull"/"reply" monitor ports
# passed to lmcache_controller in [Tests.cmd_args.lmcache_controller].
lmcache_instance_id = "lmcache_default_instance"
enable_controller = true
controller_pull_url = "{frontend_node}:8300"
controller_reply_url = "{frontend_node}:8400"
# A list of ports rather than a sweep; listed in dse_excluded_args on the test.
lmcache_worker_ports = [8788, 8789, 8790, 8791]

# Local cache settings.
chunk_size = 256
local_cpu = true
max_local_cpu_size = 6.0
nixl_buffer_device = "cpu"
nixl_buffer_size = 2_079_377_920

# NIXL storage is switched off in this scenario; the remaining keys only describe
# the backend that would be used if it were enabled.
extra_config.enable_nixl_storage = false
extra_config.nixl_backend = "POSIX"
extra_config.nixl_path = "{storage_cache_dir}"
extra_config.nixl_pool_size = 2048

[Tests.cmd_args.lmcache_controller]
# Command used to start the LMCache controller. The "pull"/"reply" monitor ports
# must agree with controller_pull_url / controller_reply_url in [Tests.cmd_args.lmcache].
# A literal string keeps the embedded JSON readable (no backslash escapes); the
# parsed value is unchanged.
# NOTE(review): the JSON argument carries no shell quoting; if this string is
# evaluated by a shell, brace expansion and quote removal would alter it - confirm
# how the command is launched.
cmd = 'lmcache_controller --host 0.0.0.0 --port 9000 --monitor-ports {"pull":8300,"reply":8400}'

[Tests.cmd_args.dynamo]
model = "Qwen/Qwen3-0.6B"
backend = "vllm"

# Prefill worker: started with --is-prefill-worker and configured with a
# MultiConnector that chains LMCacheConnectorV1 and NixlConnector.
[Tests.cmd_args.dynamo.prefill_worker]
cmd = "python3 -m dynamo.vllm --is-prefill-worker"
extra-args = "--no-enable-expert-parallel"
num-nodes = 1
# NOTE(review): presumably matched against worker output to detect readiness - confirm.
worker-initialized-regex = 'VllmWorker.*has.been.initialized'

[Tests.cmd_args.dynamo.prefill_worker.args]
tensor-parallel-size = 4
pipeline-parallel-size = 1
data-parallel-size = 1
gpu-memory-utilization = 0.8
kv-transfer-config = '{"kv_connector":"MultiConnector","kv_role":"kv_both","kv_connector_extra_config":{"connectors":[{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"},{"kv_connector":"NixlConnector","kv_role":"kv_both"}]}}'

# Decode worker: same launch module without the prefill flag, and only the
# LMCacheConnectorV1 connector.
[Tests.cmd_args.dynamo.decode_worker]
cmd = "python3 -m dynamo.vllm"
extra-args = "--no-enable-expert-parallel"
num-nodes = 1
worker-initialized-regex = 'VllmWorker.*has.been.initialized'

[Tests.cmd_args.dynamo.decode_worker.args]
tensor-parallel-size = 4
pipeline-parallel-size = 1
data-parallel-size = 1
gpu-memory-utilization = 0.8
kv-transfer-config = '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"}'

# Arguments for the aiperf workload. No aiperf-level keys are set here, so only
# the args sub-table is declared; TOML creates the parent table implicitly.
[Tests.cmd_args.aiperf.args]
concurrency = 2
extra-inputs = '{"min_tokens":10}'
output-tokens-mean = 500
request-count = 50
synthetic-input-tokens-mean = 300

# Accuracy run: 5-shot MMLU on the abstract_algebra task, with temperature 0 and
# thinking disabled in the chat template.
[Tests.cmd_args.aiperf_accuracy]
# Pins the aiperf version used by the entrypoint below.
setup-cmd = "python -m pip install --break-system-packages --upgrade aiperf==0.8.0"
entrypoint = "aiperf profile"
# NOTE(review): {model}, {url} and {artifact_dir} are presumably filled in at
# run time - confirm against the AIDynamo workload implementation.
cli = '''
--model {model}
--url {url}
--endpoint-type chat
--streaming
--artifact-dir {artifact_dir}
--no-server-metrics
--accuracy-benchmark mmlu
--accuracy-n-shots 5
--accuracy-tasks abstract_algebra
--concurrency 10
--extra-inputs '{"temperature":0,"chat_template_kwargs":{"enable_thinking":false}}'
--num-requests 100
'''

[Tests.extra_env_vars]
# Environment variables are passed as strings, hence the quoted "0" values.

# Host names of the Slurm allocation, with newlines turned into commas by tr.
# NOTE(review): the final newline is also converted, so the value likely ends
# with a trailing comma - confirm the consumer tolerates it.
DYNAMO_NODELIST = "$(scontrol show hostname $SLURM_JOB_NODELIST | tr -s '\\n' ',')"

# Offline switches for the Hugging Face libraries, all set to "0".
HF_HUB_OFFLINE = "0"
HF_DATASETS_OFFLINE = "0"
TRANSFORMERS_OFFLINE = "0"

# UCX settings.
UCX_LOG_LEVEL = "warn"
UCX_TLS = "all"
25 changes: 25 additions & 0 deletions doc/USER_GUIDE.rst
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,31 @@ action, typically seeded by ``random_seed``.

Custom agents may extend the ``BaseAgentConfig`` and offer more parameters to configure.

DSE parameter exclusions
~~~~~~~~~~~~~~~~~~~~~~~~

CloudAI builds the DSE parameter space implicitly from list-valued fields under ``cmd_args``, list-valued
``extra_env_vars``, and list-valued ``num_nodes``. If a list-valued ``cmd_args`` field is configuration data rather than
a sweep dimension, exclude it with ``dse_excluded_args`` in the test or scenario definition.

Entries in ``dse_excluded_args`` must be dot-separated paths that start with ``cmd_args.``. Each entry excludes that
field and any nested fields below it from DSE parameter discovery:

.. code-block:: toml

[[Tests]]
id = "Tests.1"
test_name = "my_test"
dse_excluded_args = ["cmd_args.lmcache.lmcache_worker_ports"]

[Tests.cmd_args.lmcache]
chunk_size = [256, 512]
lmcache_worker_ports = [8788, 8789, 8790, 8791]

In this example, ``cmd_args.lmcache.chunk_size`` is still swept, while
``cmd_args.lmcache.lmcache_worker_ports`` is treated as a single configuration value. The exclusion mechanism currently
applies only to ``cmd_args`` paths; it does not exclude ``extra_env_vars`` or ``num_nodes`` from DSE.

Metric errors and report strategies
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Expand Down
72 changes: 72 additions & 0 deletions doc/workloads/ai_dynamo.rst
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,77 @@ To use genai-perf, set:
output-tokens-mean = 500
request-count = 50

Propagating LMCache Configuration
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

AIDynamo can pass an LMCache YAML config to the worker processes by setting ``LMCACHE_CONFIG_FILE`` inside the
container. This only propagates the LMCache configuration; the vLLM/SGLang runtime still needs to be launched with the
appropriate LMCache or KV-transfer connector for that image/version.

The preferred form is structured TOML under ``[cmd_args.lmcache]``. CloudAI converts that object to YAML in the
run output directory, mounts that directory as ``/cloudai_run_results``, and exports the generated file path as
``LMCACHE_CONFIG_FILE``:

.. code-block:: toml

[cmd_args]
[cmd_args.lmcache]
chunk_size = 256
local_cpu = true
controller_pull_url = "{frontend_node}:8300"
controller_reply_url = "{frontend_node}:8400"
lmcache_worker_ports = [8788, 8789, 8790, 8791]
max_local_cpu_size = 6.0
nixl_buffer_size = 2079377920
nixl_buffer_device = "cpu"

[cmd_args.lmcache.extra_config]
enable_nixl_storage = false
nixl_backend = "POSIX"
nixl_path = "{storage_cache_dir}"
nixl_pool_size = 2048

For an example that uses test-in-scenario mode, see
``conf/experimental/ai_dynamo/test_scenario/vllm_lmcache.toml``. Because the test is fully defined inside the scenario,
``--tests-dir`` is not required when running that example:

.. code-block:: bash

uv run cloudai run --system-config <slurm system toml> \
--test-scenario conf/experimental/ai_dynamo/test_scenario/vllm_lmcache.toml

The example sets ``dse_excluded_args = ["cmd_args.lmcache.lmcache_worker_ports"]`` because
``lmcache_worker_ports`` is a list-valued LMCache setting, not a DSE sweep dimension. Other list-valued LMCache fields
can still be swept unless their ``cmd_args.`` path is also excluded.

Alternatively, mount your own LMCache YAML file with ``extra_container_mounts`` and set ``LMCACHE_CONFIG_FILE`` through
``extra_env_vars``:

.. code-block:: toml

extra_container_mounts = ["/host/lmcache:/lmcache"]
extra_env_vars = { LMCACHE_CONFIG_FILE = "/lmcache/config.yaml" }

For multi-node LMCache storage tests, any path referenced by the LMCache YAML, such as ``nixl_path`` for POSIX-backed
storage, must be visible and writable from every node that is expected to share cached data. A node-local path such as
``/tmp`` is suitable only for single-node smoke tests or configuration propagation checks.

LMCache YAML values can use runtime placeholders, which CloudAI renders inside the Slurm job before launching workers.
The supported placeholders are ``{frontend_node}``, ``{frontend_ip}``, ``{results_dir}``, and ``{storage_cache_dir}``.
Unknown placeholders fail the run before worker processes start.

If the selected LMCache mode needs a controller, CloudAI can start one on the frontend node:

.. code-block:: toml

[cmd_args.lmcache_controller]
cmd = "lmcache_controller --host 0.0.0.0 --port 9000 --monitor-ports {\"pull\":8300,\"reply\":8400}"

This only launches the controller process. For disaggregated or multi-node runs, the LMCache YAML still needs controller
addresses that resolve to the frontend node from every worker. With the monitor ports shown above (``pull`` 8300 and
``reply`` 8400), use ``controller_pull_url = "{frontend_node}:8300"`` and
``controller_reply_url = "{frontend_node}:8400"``. The ``lmcache_worker_ports`` list must match the number of worker ranks.

Semantic Degradation With AIPerf Accuracy
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Expand Down Expand Up @@ -215,6 +286,7 @@ Supported Backends
The following backends are available via the ``conf/experimental/ai_dynamo/test/`` directory:

- **vLLM** (``vllm.toml``) — use with ``test_scenario/vllm_slurm.toml``
- **vLLM with LMCache config propagation** — use self-contained scenario ``test_scenario/vllm_lmcache.toml``
- **sglang** (``sglang.toml``) — use with ``test_scenario/sglang_slurm.toml``

Both backends use ``aiperf`` as the default benchmark tool and support disaggregated prefill/decode.
Expand Down
13 changes: 10 additions & 3 deletions src/cloudai/_core/test_scenario.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,11 @@ def param_space(self) -> dict[str, Any]:
extra_env_vars_dict = self.test.extra_env_vars

action_space: dict[str, Any] = {
**{key: value for key, value in cmd_args_dict.items() if isinstance(value, list)},
**{
key: value
for key, value in cmd_args_dict.items()
if isinstance(value, list) and not self.test.is_dse_excluded_arg(key)
},
**{f"extra_env_vars.{key}": value for key, value in extra_env_vars_dict.items() if isinstance(value, list)},
}
if isinstance(self.num_nodes, list):
Expand Down Expand Up @@ -183,8 +187,11 @@ def apply_params_set(self, action: dict[str, Any]) -> "TestRun":
attrs = key.split(".")
obj = tdef.cmd_args
for attr in attrs[:-1]:
obj = getattr(obj, attr)
setattr(obj, attrs[-1], value)
obj = obj[attr] if isinstance(obj, dict) else getattr(obj, attr)
if isinstance(obj, dict):
obj[attrs[-1]] = value
else:
setattr(obj, attrs[-1], value)

type(tdef)(**tdef.model_dump()) # trigger validation

Expand Down
2 changes: 2 additions & 0 deletions src/cloudai/models/scenario.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ class TestRunModel(BaseModel):
description: Optional[str] = None
test_template_name: Optional[str] = None
cmd_args: Optional[CmdArgs] = None
dse_excluded_args: Optional[list[str]] = None
extra_env_vars: dict[str, str | list[str]] | None = None
extra_container_mounts: Optional[list[str]] = None
git_repos: Optional[list[GitRepo]] = None
Expand All @@ -114,6 +115,7 @@ def tdef_model_dump(self, by_alias: bool) -> dict:
"agent_metrics": self.agent_metrics if "agent_metrics" in self.model_fields_set else None,
"agent_reward_function": self.agent_reward_function,
"agent_config": self.agent_config,
"dse_excluded_args": self.dse_excluded_args,
"extra_container_mounts": self.extra_container_mounts,
"extra_env_vars": self.extra_env_vars if self.extra_env_vars else None,
"cmd_args": self.cmd_args.model_dump(by_alias=by_alias) if self.cmd_args else None,
Expand Down
Loading
Loading