Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion conf/experimental/ai_dynamo/test/sglang.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ extra_container_mounts = ["/run/udev:/run/udev", "/tmp:/tmp"]

[cmd_args]
docker_image_url = "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.9.0"
workloads = "genai_perf.sh"
workloads = "aiperf.sh"

[cmd_args.dynamo]
backend = "sglang"
Expand Down Expand Up @@ -93,6 +93,14 @@ workloads = "genai_perf.sh"
warmup-request-count = 5
concurrency = 2

# Settings for the aiperf workload (selected by `workloads = "aiperf.sh"` in [cmd_args]).
[cmd_args.aiperf]

# NOTE(review): presumably forwarded to `aiperf profile` as `--<key> <value>` flags; confirm in aiperf.sh.
[cmd_args.aiperf.args]
concurrency = 2
request-count = 50
synthetic-input-tokens-mean = 300
output-tokens-mean = 500

[extra_env_vars]
UCX_LOG_LEVEL = "warn"
HF_HUB_OFFLINE = "1"
Expand Down
10 changes: 9 additions & 1 deletion conf/experimental/ai_dynamo/test/vllm.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ extra_container_mounts = ["/run/udev:/run/udev", "/tmp:/tmp"]

[cmd_args]
docker_image_url = "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.8.1"
workloads = "genai_perf.sh"
workloads = "aiperf.sh"

[cmd_args.dynamo]
backend = "vllm"
Expand Down Expand Up @@ -85,6 +85,14 @@ workloads = "genai_perf.sh"
warmup-request-count = 5
concurrency = 2

# Settings for the aiperf workload (selected by `workloads = "aiperf.sh"` in [cmd_args]).
[cmd_args.aiperf]

# NOTE(review): presumably forwarded to `aiperf profile` as `--<key> <value>` flags; confirm in aiperf.sh.
[cmd_args.aiperf.args]
concurrency = 2
request-count = 50
synthetic-input-tokens-mean = 300
output-tokens-mean = 500

[extra_env_vars]
UCX_LOG_LEVEL = "warn"
HF_HUB_OFFLINE = "1"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ name = "dynamo_sglang"

[[Tests]]
id = "sglang-Qwen3-0.6B"
test_name = "sglang-Qwen3-0.6B"
test_name = "sglang"
time_limit = "00:20:00"

[Tests.cmd_args]
Expand Down
4 changes: 2 additions & 2 deletions conf/experimental/ai_dynamo/test_scenario/vllm_slurm.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ job_status_check = false

[[Tests]]
id = "test.disagg.single-node"
test_name = "vLLM-Qwen3-0.6B"
test_name = "vLLM"
time_limit = "00:10:00"

[Tests.cmd_args]
Expand All @@ -38,7 +38,7 @@ time_limit = "00:10:00"

[[Tests]]
id = "test.disagg.multinode"
test_name = "vLLM-Qwen3-0.6B"
test_name = "vLLM"
time_limit = "00:10:00"

[Tests.cmd_args]
Expand Down
75 changes: 57 additions & 18 deletions doc/workloads/ai_dynamo.rst
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ Node Configuration for AI Dynamo

AI Dynamo jobs use three distinct types of nodes:

- **Frontend node**: Hosts the coordination services (`etcd`, `nats`), the **frontend server**, the **request generator** (`genai-perf`), and the first decode worker
- **Frontend node**: Hosts the coordination services (``etcd``, ``nats``), the **frontend server**, the **request generator** (``aiperf`` by default, configurable via ``workloads`` in the test TOML), and the first decode worker
- **Prefill node(s)**: Handle the prefill stage of inference
- **Decode node(s)**: Handle the decode stage of inference (optional, depending on model and setup)

Expand Down Expand Up @@ -82,32 +82,71 @@ The job progress monitoring can be done using either of the following options:

watch tail -n 4 ./results/<scenario name>/*.txt

The frontend node will initially wait to allow weight loading on all nodes. Once ready, it will launch ``genai-perf``, which begins generating requests to the frontend server. All servers cooperate to complete inference, and the output will appear in ``stdout.txt``.
The frontend node will initially wait to allow weight loading on all nodes. Once ready, it will launch the configured benchmark tool (``aiperf`` by default), which begins generating requests to the frontend server. All servers cooperate to complete inference, and the output will appear in ``stdout.txt``.

Review genai-perf Benchmark Results
-----------------------------------
Choosing a Benchmark Tool
~~~~~~~~~~~~~~~~~~~~~~~~~

After job completion, CloudAI will place the output logs and result files in the designated results directory. To analyze performance metrics and validate inference outcomes:
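The benchmark tool is controlled by the ``workloads`` field in the test TOML. The bundled test configurations (``vllm.toml``, ``sglang.toml``) set it to ``aiperf.sh``; set the field explicitly, because ``AIDynamoCmdArgs`` falls back to ``genai_perf.sh`` when it is omitted: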

- Navigate to the results directory (e.g., ``./results/...``)
- Most importantly, open the ``profile_genai_perf.csv`` file to examine the final benchmarking results
.. code-block:: toml

This CSV file includes detailed metrics collected by genai-perf, such as request latency, throughput, and system utilization statistics. Use this data to evaluate the model's performance and identify potential bottlenecks or optimization opportunities.
[cmd_args]
workloads = "aiperf.sh" # default — uses aiperf, writes aiperf_report.csv

To use genai-perf instead, set:

.. code-block:: toml

[cmd_args]
workloads = "genai_perf.sh" # uses genai-perf, writes genai_perf_report.csv

[cmd_args.genai_perf]
cmd = "genai-perf profile"
extra-args = "--streaming --verbose -- -v --async"

[cmd_args.genai_perf.args]
endpoint-type = "chat"
output-tokens-mean = 500
request-count = 50

Review Benchmark Results
------------------------

After job completion, CloudAI places output logs and result files in the designated results directory. The result file name depends on the configured ``workloads`` field:

- ``aiperf.sh`` (default) → ``aiperf_report.csv``
- ``genai_perf.sh`` → ``genai_perf_report.csv``

Navigate to ``./results/<scenario>/<test-id>/0/`` and open the CSV to examine performance metrics.

Example ``aiperf_report.csv`` (default):

::

Metric,avg,min,max,p99,p95,p90,p75,p50,p25
Time To First Token (ms),"1,146.31",249.48,"3,485.23","3,457.97","3,349.56","3,215.06","1,330.93",640.07,286.52
Time To Second Token (ms),26.05,0.00,133.51,96.12,36.56,34.88,34.35,33.55,1.78
Request Latency (ms),"6,406.20","5,371.47","9,608.72","9,436.13","9,046.58","9,028.16","6,549.60","5,690.23","5,493.63"
Inter Token Latency (ms),30.35,27.59,35.60,35.23,33.88,32.53,31.05,30.13,29.04
Output Sequence Length (tokens),174.45,164.00,187.00,186.22,183.10,180.10,177.00,174.00,171.75
Input Sequence Length (tokens),"3,000.05","2,999.00","3,001.00","3,001.00","3,001.00","3,000.00","3,000.00","3,000.00","3,000.00"
Metric,avg,min,max,p25,p50,p75,p99,std
Inter Token Latency (ms),2.81,2.66,2.88,2.79,2.83,2.84,2.87,0.04
Time to First Token (ms),49.87,17.15,99.91,49.35,49.87,50.52,92.31,9.20
Time to Second Token (ms),0.50,0.03,4.05,0.03,0.04,0.04,3.47,1.08
Request Latency (ms),1652.30,1203.61,6433.87,1453.19,1462.99,1466.72,6431.16,976.18
Output Sequence Length (tokens),498.06,410.00,501.00,500.00,500.00,500.00,501.00,12.62
Input Sequence Length (tokens),300.00,300.00,300.00,300.00,300.00,300.00,300.00,0.00

Metric,Value
Output Token Throughput (per sec),261.25
Request Throughput (per sec),1.50
Request Count (count),40.00
Output Token Throughput (tokens/sec),598.78
Total Token Throughput (tokens/sec),962.32
Request Throughput (requests/sec),1.20
Request Count,50.00

Supported Backends
------------------

The following backends are available via the ``conf/experimental/ai_dynamo/test/`` directory:

- **vLLM** (``vllm.toml``) — use with ``test_scenario/vllm_slurm.toml``
- **sglang** (``sglang.toml``) — use with ``test_scenario/sglang_slurm.toml``

Both backends use ``aiperf`` as the default benchmark tool and support disaggregated prefill/decode.


API Documentation
Expand Down
2 changes: 2 additions & 0 deletions src/cloudai/workloads/ai_dynamo/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
AIDynamoArgs,
AIDynamoCmdArgs,
AIDynamoTestDefinition,
AIPerf,
GenAIPerf,
LMCache,
LMCacheArgs,
Expand All @@ -35,6 +36,7 @@
"AIDynamoReportGenerationStrategy",
"AIDynamoSlurmCommandGenStrategy",
"AIDynamoTestDefinition",
"AIPerf",
"GenAIPerf",
"LMCache",
"LMCacheArgs",
Expand Down
24 changes: 23 additions & 1 deletion src/cloudai/workloads/ai_dynamo/ai_dynamo.py
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,25 @@ def installables(self) -> list[Installable]:
return [self.script]


class AIPerf(Workload):
    """Workload configuration for aiperf benchmarking.

    Declares the defaults for the aiperf workload: its name, the base command,
    the launcher script and the name of the report file.
    """

    # Keys that are not declared as fields below are accepted instead of rejected.
    model_config = ConfigDict(extra="allow")

    name: str = "aiperf"
    # Base command. NOTE(review): presumably extended with per-test arguments by aiperf.sh; confirm.
    cmd: str = "aiperf profile"
    # Resolves to <parent of this module's directory>/ai_dynamo/aiperf.sh.
    script: File = File(Path(__file__).parent.parent / "ai_dynamo/aiperf.sh")
    # Accepted on input as "report-name" or "report_name"; the serialization alias is "report-name".
    report_name: str = Field(
        default="aiperf_report.csv",
        serialization_alias="report-name",
        validation_alias=AliasChoices("report-name", "report_name"),
    )

    @property
    def installables(self) -> list[Installable]:
        """Return the installables of this workload: only the aiperf.sh script."""
        return [self.script]


class Constraints(BaseModel):
"""Constraints for validation of AI Dynamo configurations when using DSE."""

Expand All @@ -301,12 +320,13 @@ class AIDynamoCmdArgs(CmdArgs):
dynamo: AIDynamoArgs
lmcache: LMCache = Field(default_factory=LMCache)
genai_perf: GenAIPerf = Field(default_factory=GenAIPerf)
aiperf: AIPerf = Field(default_factory=AIPerf)
workloads: str = "genai_perf.sh"

@field_validator("workloads", mode="before")
@classmethod
def validate_workloads(cls, v: str) -> str:
allowed_workloads = ["genai_perf.sh"]
allowed_workloads = ["genai_perf.sh", "aiperf.sh"]
values = [w.strip() for w in v.split(",")]
for workload in values:
if workload not in allowed_workloads:
Expand All @@ -322,6 +342,7 @@ def installables(self) -> list[Installable]:
return [
*self.lmcache.installables,
*self.genai_perf.installables,
*self.aiperf.installables,
]


Expand Down Expand Up @@ -356,6 +377,7 @@ def get_workload_map(self) -> dict[str, Workload]:
"""Get a map of workload scripts to workload objects."""
return {
self.cmd_args.genai_perf.script.src.name: self.cmd_args.genai_perf,
self.cmd_args.aiperf.script.src.name: self.cmd_args.aiperf,
}

@property
Expand Down
16 changes: 16 additions & 0 deletions src/cloudai/workloads/ai_dynamo/ai_dynamo.sh
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ declare -A lmcache_args
declare -A lmcache_config
declare -A genai_perf_args
declare -A genai_perf_config
declare -A aiperf_args
declare -A aiperf_config

declare -A dynamo_args
dynamo_args["backend"]="vllm"
Expand Down Expand Up @@ -163,6 +165,10 @@ _parse_cli_pairs() {
genai_perf_args["--${key#--genai_perf-args-}"]="$2" ;;
--genai_perf-*)
genai_perf_config["--${key#--genai_perf-}"]="$2" ;;
--aiperf-args-*)
aiperf_args["--${key#--aiperf-args-}"]="$2" ;;
--aiperf-*)
aiperf_config["--${key#--aiperf-}"]="$2" ;;
--hf-home)
HUGGINGFACE_HOME="$2" ;;
--storage-cache-dir)
Expand Down Expand Up @@ -353,6 +359,8 @@ _dump_args() {
log "LMCache args:\n$(arg_array_to_string lmcache_args)"
log "GenAI config params:\n$(arg_array_to_string genai_perf_config)"
log "GenAI-Perf args:\n$(arg_array_to_string genai_perf_args)"
log "AIPerf config params:\n$(arg_array_to_string aiperf_config)"
log "AIPerf args:\n$(arg_array_to_string aiperf_args)"
log "--------------------------------"
}

Expand Down Expand Up @@ -505,6 +513,10 @@ _is_genai_perf_workload() {
[[ "${dynamo_args["workloads"]}" == *"genai_perf.sh"* ]]
}

_is_aiperf_workload() {
  # Succeeds (status 0) when the configured workloads string mentions aiperf.sh,
  # fails (status 1) otherwise.
  case "${dynamo_args["workloads"]}" in
    *"aiperf.sh"*) return 0 ;;
    *) return 1 ;;
  esac
}

_init_runtime_env() {
if _is_vllm || _is_sglang; then
export HF_HOME="${HUGGINGFACE_HOME}"
Expand Down Expand Up @@ -1026,6 +1038,10 @@ function launch_workloads()
launch_workload genai_perf_config genai_perf_args
fi

if _is_aiperf_workload; then
launch_workload aiperf_config aiperf_args
fi

mark_done
}

Expand Down
Loading
Loading