NVIDIA · podkidyshev · May 21, 2026 · May 13, 2026 · May 20, 2026 · May 20, 2026
@@ -43,10 +43,12 @@ These schemas enable CloudAI to be flexible and compatible with different system
 |NIXL benchmark|✅|❌|❌|❌|
 |NIXL kvbench|✅|❌|❌|❌|
 |NIXL CTPerf|✅|❌|❌|❌|
+|SGLang|✅|❌|❌|❌|
 |Sleep|✅|✅|❌|✅|
 |SlurmContainer|✅|❌|❌|❌|
 |Triton Inference|✅|❌|❌|❌|
 |UCC|✅|❌|❌|❌|
+|vLLM|✅|❌|❌|❌|
 
 Note: Deprecated means that support for the workload still exists, but it is no longer actively maintained and newer configurations might not work.
 

@@ -0,0 +1,30 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name = "sglang"
+description = "SGLang test"
+test_template_name = "sglang"
+
+[cmd_args]
+docker_image_url = "lmsysorg/sglang:dev-cu13"  # NOTE(review): looks like a moving dev tag; confirm it is stable enough for repeatable runs
+
+[semantic_eval_cmd_args]  # GSM8K semantic validation, run after the serving benchmark
+module = "sglang.test.run_eval"
+args = "--eval-name gsm8k --num-examples 200 --num-threads 128 --model {model}"  # {model}, {host}, {port}, {output_path} placeholders are supported
+
+[extra_env_vars]
+UCX_NET_DEVICES = "all"
+UCX_TLS = "^gdr_copy,cuda_ipc"  # presumably the leading ^ excludes the listed UCX transports; confirm
@@ -0,0 +1,81 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name = "sglang"
+
+[[Tests]]
+id = "sglang.agg.2nodes"
+test_name = "sglang"
+num_nodes = 2
+time_limit = "00:10:00"
+
+  [Tests.cmd_args.decode]  # only the decode section is configured for this test
+  mem_fraction_static = 0.75
+
+[[Tests]]
+id = "sglang.agg.1node"
+test_name = "sglang"
+num_nodes = 1
+time_limit = "00:10:00"
+
+  [Tests.cmd_args.decode]  # only the decode section is configured for this test
+  mem_fraction_static = 0.75
+
+[[Tests]]
+id = "sglang.disagg.sync"
+test_name = "sglang"
+num_nodes = 1
+time_limit = "00:10:00"
+
+  [Tests.cmd_args.prefill]
+  gpu_ids = "0,1"  # presumably GPU indices on the node; prefill and decode get disjoint pairs
+  tensor_parallel_size = 2
+  mem_fraction_static = 0.75
+
+  [Tests.cmd_args.decode]
+  gpu_ids = "2,3"
+  tensor_parallel_size = 2
+  mem_fraction_static = 0.75
+
+[[Tests]]
+id = "sglang.disagg.async"  # NOTE(review): settings are identical to sglang.disagg.sync; what makes this run async is not visible in this file
+test_name = "sglang"
+num_nodes = 1
+time_limit = "00:10:00"
+
+  [Tests.cmd_args.prefill]
+  gpu_ids = "0,1"
+  tensor_parallel_size = 2
+  mem_fraction_static = 0.75
+
+  [Tests.cmd_args.decode]
+  gpu_ids = "2,3"
+  tensor_parallel_size = 2
+  mem_fraction_static = 0.75
+
+[[Tests]]
+id = "sglang.disagg.2nodes"
+test_name = "sglang"
+num_nodes = 2
+time_limit = "00:10:00"
+
+  [Tests.cmd_args.prefill]  # no gpu_ids here, unlike the single-node disagg tests
+  tensor_parallel_size = 4
+  mem_fraction_static = 0.75
+
+  [Tests.cmd_args.decode]
+  tensor_parallel_size = 4
+  mem_fraction_static = 0.75
@@ -0,0 +1,35 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name = "vllm"
+description = "vLLM test"
+test_template_name = "vllm"
+
+[[git_repos]]
+url = "https://github.com/vllm-project/vllm.git"
+commit = "main"  # NOTE(review): moving branch; consider pinning a commit or tag for reproducible evals
+mount_as = "/vllm_repo"  # mount point referenced by `script` in [semantic_eval_cmd_args]
+
+[cmd_args]
+docker_image_url = "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.1.1"
+
+[semantic_eval_cmd_args]  # GSM8K semantic validation, run after the serving benchmark
+script = "/vllm_repo/tests/evals/gsm8k/gsm8k_eval.py"
+args = "--num-questions 200 --save-results {output_path}/vllm-gsm8k.json"  # {model}, {host}, {port}, {output_path} placeholders are supported
+
+[extra_env_vars]
+UCX_NET_DEVICES = "all"
+UCX_TLS = "^gdr_copy,cuda_ipc"  # presumably the leading ^ excludes the listed UCX transports; confirm
@@ -0,0 +1,51 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name = "vllm"
+
+[[Tests]]
+id = "vllm.disagg.sync"
+test_name = "vllm"
+num_nodes = 2
+time_limit = "00:30:00"
+
+  [Tests.cmd_args.prefill]
+  enforce_eager = ""  # empty value; presumably rendered as a bare flag on the command line (confirm)
+  tensor_parallel_size = 2
+  max_num_batched_tokens = 1024
+
+  [Tests.cmd_args.decode]
+  enforce_eager = ""
+  tensor_parallel_size = 2
+  max_num_batched_tokens = 1024
+
+[[Tests]]
+id = "vllm.disagg.async"
+test_name = "vllm"
+num_nodes = 1
+time_limit = "00:10:00"
+
+  [Tests.cmd_args.prefill]
+  gpu_ids = "0,1"
+  enforce_eager = ""
+  tensor_parallel_size = 1  # NOTE(review): gpu_ids lists two GPUs but TP is 1 (decode uses 2); confirm this asymmetry is intended
+  max_num_batched_tokens = 1024
+
+  [Tests.cmd_args.decode]
+  gpu_ids = "2,3"
+  enforce_eager = ""
+  tensor_parallel_size = 2
+  max_num_batched_tokens = 1024
@@ -28,6 +28,10 @@ Test + Scenario example
    max_concurrency = 16
    num_prompts = 30
 
+   [semantic_eval_cmd_args]
+   module = "sglang.test.run_eval"
+   args = "--eval-name gsm8k --num-examples 200 --num-threads 128 --model {model}"
+
 
 .. code-block:: toml
    :caption: scenario.toml (scenario with one test)
@@ -68,6 +72,29 @@ Test-in-Scenario example
    num_prompts = 30
 
 
+Semantic Validation
+-------------------
+To run GSM8K semantic validation after the serving benchmark, add ``semantic_eval_cmd_args``. CloudAI reports
+``accuracy`` from the eval output, but does not enforce an accuracy threshold.
+
+.. code-block:: toml
+   :caption: test.toml (semantic validation)
+
+   [semantic_eval_cmd_args]
+   module = "sglang.test.run_eval"
+   args = "--eval-name gsm8k --num-examples 200 --num-threads 128 --model {model}"
+
+For images that still use the legacy SGLang GSM8K runner, override ``module`` and ``args``:
+
+.. code-block:: toml
+
+   [semantic_eval_cmd_args]
+   module = "sglang.test.few_shot_gsm8k"
+   args = "--num-questions 200"
+
+The ``args`` string supports ``{model}``, ``{host}``, ``{port}``, and ``{output_path}`` placeholders.
+
+
 Control number of GPUs
 ----------------------
 The number of GPUs can be controlled using the options below, listed from lowest to highest priority:
@@ -130,6 +157,13 @@ Benchmark Command Arguments
    :members:
    :show-inheritance:
 
+Semantic Eval Command Arguments
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autopydantic_model:: cloudai.workloads.sglang.sglang.SglangSemanticEvalCmdArgs
+   :members:
+   :show-inheritance:
+
 Test Definition
 ~~~~~~~~~~~~~~~
 

@@ -28,6 +28,10 @@ Test and Scenario Examples
    max_concurrency = 16
    num_prompts = 30
 
+   [semantic_eval_cmd_args]
+   script = "/opt/vllm/tests/evals/gsm8k/gsm8k_eval.py"
+   args = "--num-questions 200 --save-results {output_path}/vllm-gsm8k.json"
+
 
 .. code-block:: toml
    :caption: scenario.toml (scenario with one test)
@@ -68,6 +72,24 @@ Test-in-Scenario example
    num_prompts = 30
 
 
+Semantic Validation
+-------------------
+To run GSM8K semantic validation after the serving benchmark, add ``semantic_eval_cmd_args``. CloudAI reports
+``accuracy`` from the eval output, but does not enforce an accuracy threshold.
+
+.. code-block:: toml
+   :caption: test.toml (semantic validation)
+
+   [semantic_eval_cmd_args]
+   script = "/opt/vllm/tests/evals/gsm8k/gsm8k_eval.py"
+   args = "--num-questions 200 --save-results {output_path}/vllm-gsm8k.json"
+
+If the runtime image does not contain the eval script, mount a vLLM repository using the existing ``git_repos`` support
+and point ``script`` at the mounted path.
+
+The ``args`` string supports ``{model}``, ``{host}``, ``{port}``, and ``{output_path}`` placeholders.
+
+
 Controlling the Number of GPUs
 -------------------------------
 The number of GPUs can be controlled using the options below, listed from lowest to highest priority:
@@ -154,6 +176,13 @@ Benchmark Command Arguments
    :members:
    :show-inheritance:
 
+Semantic Eval Command Arguments
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: cloudai.workloads.vllm.vllm.VllmSemanticEvalCmdArgs
+   :members:
+   :show-inheritance:
+
 Test Definition
 ~~~~~~~~~~~~~~~