NVIDIA · podkidyshev · Jun 2, 2026 · Jun 1, 2026 · Jun 1, 2026 · Jun 1, 2026
@@ -28,6 +28,7 @@ workloads = "aiperf.sh"
   backend = "sglang"
   model = "Qwen/Qwen3-0.6B"
   endpoint = "v1/chat/completions"
+  ingress-cmd = "python -m dynamo.frontend --router-mode kv --router-reset-states"
 
     [cmd_args.dynamo.prefill_worker]
     num-nodes = 1
@@ -117,7 +118,7 @@ workloads = "aiperf.sh"
 --accuracy-n-shots 5
 --accuracy-tasks abstract_algebra
 --concurrency 10
---extra-inputs '{"temperature":0,"chat_template_kwargs":{"enable_thinking":false}}'
+--extra-inputs '{"temperature":0,"stop":["\n"],"chat_template_kwargs":{"enable_thinking":false}}'
 --num-requests 100
 '''
 

@@ -27,6 +27,7 @@ workloads = "aiperf.sh"
   [cmd_args.dynamo]
   backend = "vllm"
   model = "Qwen/Qwen3-0.6B"
+  ingress-cmd = "python -m dynamo.frontend --router-mode kv --router-reset-states"
 
     [cmd_args.dynamo.prefill_worker]
     num-nodes = 1
@@ -115,7 +116,7 @@ workloads = "aiperf.sh"
 --accuracy-n-shots 5
 --accuracy-tasks abstract_algebra
 --concurrency 10
---extra-inputs '{"temperature":0,"chat_template_kwargs":{"enable_thinking":false}}'
+--extra-inputs '{"temperature":0,"stop":["\n"],"chat_template_kwargs":{"enable_thinking":false}}'
 --num-requests 100
 '''
 

@@ -121,7 +121,7 @@ dse_excluded_args = [
 --accuracy-n-shots 5
 --accuracy-tasks abstract_algebra
 --concurrency 10
---extra-inputs '{"temperature":0,"chat_template_kwargs":{"enable_thinking":false}}'
+--extra-inputs '{"temperature":0,"stop":["\n"],"chat_template_kwargs":{"enable_thinking":false}}'
 --num-requests 100
 '''
 

@@ -210,11 +210,13 @@ DSE parameter exclusions
 ~~~~~~~~~~~~~~~~~~~~~~~~
 
 CloudAI builds the DSE parameter space implicitly from list-valued fields under ``cmd_args``, list-valued
-``extra_env_vars``, and list-valued ``num_nodes``. If a list-valued ``cmd_args`` field is configuration data rather than
-a sweep dimension, exclude it with ``dse_excluded_args`` in the test or scenario definition.
+``extra_env_vars``, and list-valued ``num_nodes``. Most lists mean "try each value", but some workload settings are
+real list-valued configuration, such as worker port lists or ordered benchmark phases.
 
-Entries in ``dse_excluded_args`` must be dot-separated paths that start with ``cmd_args.``. Each entry excludes that
-field and any nested fields below it from DSE parameter discovery:
+Use ``dse_excluded_args`` when a list under ``cmd_args`` should stay intact instead of becoming a sweep dimension.
+Entries must be dot-separated paths that start with ``cmd_args.`` and may point to either a single field or a parent
+object. Matching is prefix-based, so excluding ``cmd_args.foo`` also excludes nested list-valued fields such as
+``cmd_args.foo.bar`` from DSE parameter discovery.
 
 .. code-block:: toml
 
@@ -228,8 +230,13 @@ field and any nested fields below it from DSE parameter discovery:
      lmcache_worker_ports = [8788, 8789, 8790, 8791]
 
 In this example, ``cmd_args.lmcache.chunk_size`` is still swept, while
-``cmd_args.lmcache.lmcache_worker_ports`` is treated as a single configuration value. The exclusion mechanism currently
-applies only to ``cmd_args`` paths; it does not exclude ``extra_env_vars`` or ``num_nodes`` from DSE.
+``cmd_args.lmcache.lmcache_worker_ports`` is passed through as one list value. The exclusion does not remove or mutate
+the field; it only prevents CloudAI from adding that path to the DSE parameter space.
+
+``dse_excluded_args`` currently applies only to ``cmd_args`` paths. It does not exclude list-valued ``extra_env_vars``
+or ``num_nodes``; those lists are still interpreted as sweep dimensions. To exclude many nested list fields at once,
+exclude their common parent path. Commonly excluded paths are ``cmd_args.aiperf_phases`` (a parent path) and
+``cmd_args.lmcache.lmcache_worker_ports`` (a single field).
 
 Metric errors and report strategies
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -110,6 +110,47 @@ To use genai-perf, set:
      output-tokens-mean = 500
      request-count = 50
 
+AIPerf Multi-Phase Runs
+~~~~~~~~~~~~~~~~~~~~~~~
+
+``cmd_args.aiperf`` is the base AIPerf config. ``cmd_args.aiperf_phases`` lists several AIPerf rounds to run against
+the same live Dynamo stack. By default, CloudAI does not restart prefill, decode, or router processes between phases:
+
+.. code-block:: toml
+
+   dse_excluded_args = ["cmd_args.aiperf_phases"]
+
+   [cmd_args.aiperf]
+   health-check-between-phases = true
+   between-phase-cmd = "true"  # default no-op
+
+     [cmd_args.aiperf.args]
+     request-count = 50
+     server-metrics = "auto"
+
+   [[cmd_args.aiperf_phases]]
+   name = "round_1"
+     [cmd_args.aiperf_phases.args]
+     concurrency = 2
+
+   [[cmd_args.aiperf_phases]]
+   name = "round_2"
+     [cmd_args.aiperf_phases.args]
+     concurrency = 4
+
+Single-phase runs keep the old artifact layout: ``aiperf_artifacts/``, ``aiperf.log``, and ``aiperf_report.csv``.
+Multi-phase runs write per-phase artifacts/logs/reports and copy the last phase report to ``aiperf_report.csv`` for
+existing report generation.
+
+``between-phase-cmd`` is a bash command run after each non-final phase. The default is a no-op. Set it explicitly for
+backend-specific cache cleanup, for example ``/cloudai_run_results/routerctl.sh restart`` if a test needs to restart the
+Dynamo router between phases. ``health-check-between-phases`` probes the frontend after the command.
+
+AIPerf args are rendered as normal CLI flags. Multi-value AIPerf options should be passed with AIPerf CLI syntax, such
+as ``server-metrics-formats = "csv,json,jsonl"`` or ``gpu-telemetry = "node1:9401,node2:9401"``.
+``server-metrics = "auto"`` expands to the frontend metrics endpoint, Dynamo worker metrics endpoints, and any
+CloudAI-started DCGM exporters.
+
 Propagating LMCache Configuration
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 

@@ -294,6 +294,11 @@ class AIPerf(Workload):
         serialization_alias="continue-on-phase-failure",
         validation_alias=AliasChoices("continue-on-phase-failure", "continue_on_phase_failure"),
     )
+    between_phase_cmd: str | None = Field(
+        default="true",
+        serialization_alias="between-phase-cmd",
+        validation_alias=AliasChoices("between-phase-cmd", "between_phase_cmd"),
+    )
 
     @property
     def installables(self) -> list[Installable]:
@@ -334,6 +339,11 @@ class AIPerfPhase(BaseModel):
         serialization_alias="extra-args",
         validation_alias=AliasChoices("extra-args", "extra_args"),
     )
+    between_phase_cmd: str | None = Field(
+        default=None,
+        serialization_alias="between-phase-cmd",
+        validation_alias=AliasChoices("between-phase-cmd", "between_phase_cmd"),
+    )
 
 
 class AIPerfAccuracy(BaseModel):

@@ -427,6 +427,9 @@ function perform_exit()
     log "Sleeping for ${sleep_before_exit} seconds before exit"
     sleep "${sleep_before_exit}"
   fi
+  if _is_frontend_node && [[ -x "${RESULTS_DIR}/routerctl.sh" ]]; then
+    "${RESULTS_DIR}/routerctl.sh" stop || true
+  fi
   exit "${exit_code}"
 }
 
@@ -733,8 +736,137 @@ function launch_nats()
 
 function launch_ingress()
 {
-  log "Launching ingress with cmd: ${dynamo_args["ingress-cmd"]} --http-port ${dynamo_args["port"]}"
-  ${dynamo_args["ingress-cmd"]} --http-port ${dynamo_args["port"]} > ${RESULTS_DIR}/dynamo_ingress.log 2>&1
+  # Generate the routerctl.sh helper first, then start the router through it.
+  # `routerctl.sh start` backgrounds the router and waits for its health probe
+  # before returning, so this function no longer blocks for the router's lifetime.
+  write_routerctl
+  start_router
+}
+
+# Write ${RESULTS_DIR}/routerctl.sh, an executable start|stop|restart helper
+# for the router (ingress) process, and export the ROUTER_* settings it reads.
+function write_routerctl()
+{
+  # The generated script takes all of its configuration from these exported
+  # variables and aborts if a required one is unset. The two timeouts keep any
+  # value already present in the environment and otherwise default to 120s/30s.
+  export ROUTER_CMD="${dynamo_args["ingress-cmd"]} --http-port ${dynamo_args["port"]}"
+  export ROUTER_URL="${dynamo_args["url"]}"
+  export ROUTER_HEALTH_ENDPOINT="${dynamo_args["endpoint"]}"
+  export ROUTER_HEALTH_MODEL="${dynamo_args["model"]}"
+  export ROUTER_PID_FILE="${RESULTS_DIR}/router.pid"
+  export ROUTER_LOG_FILE="${RESULTS_DIR}/dynamo_ingress.log"
+  export ROUTER_START_TIMEOUT="${ROUTER_START_TIMEOUT:-120}"
+  export ROUTER_STOP_TIMEOUT="${ROUTER_STOP_TIMEOUT:-30}"
+
+  # The heredoc delimiter is quoted, so nothing below is expanded here: the
+  # script is written verbatim and resolves ROUTER_* from its own environment
+  # each time it runs. Readiness is probed with a one-token POST to
+  # ${ROUTER_URL}/${ROUTER_HEALTH_ENDPOINT}.
+  # NOTE(review): the PID stored in ROUTER_PID_FILE is $! of the
+  # `nohup bash -lc` wrapper - confirm that signalling it also stops the
+  # router process itself.
+  # NOTE(review): stop_router only sends SIGTERM and returns 1 when the
+  # process outlives ROUTER_STOP_TIMEOUT; under `set -e` a `restart` then
+  # aborts before starting a new router - confirm this is intended.
+  cat > "${RESULTS_DIR}/routerctl.sh" <<'EOF'
+#!/usr/bin/env bash
+set -Eeuo pipefail
+
+log() { echo "[$(date +%F\ %T) $(hostname)]: $*"; }
+
+: "${ROUTER_CMD:?ROUTER_CMD is not set}"
+: "${ROUTER_URL:?ROUTER_URL is not set}"
+: "${ROUTER_HEALTH_ENDPOINT:?ROUTER_HEALTH_ENDPOINT is not set}"
+: "${ROUTER_HEALTH_MODEL:?ROUTER_HEALTH_MODEL is not set}"
+: "${ROUTER_PID_FILE:?ROUTER_PID_FILE is not set}"
+: "${ROUTER_LOG_FILE:?ROUTER_LOG_FILE is not set}"
+: "${ROUTER_START_TIMEOUT:=120}"
+: "${ROUTER_STOP_TIMEOUT:=30}"
+
+router_pid() {
+  if [[ -s "${ROUTER_PID_FILE}" ]]; then
+    cat "${ROUTER_PID_FILE}"
+  fi
+}
+
+router_is_running() {
+  local pid
+  pid="$(router_pid)"
+  [[ -n "${pid}" ]] && kill -0 "${pid}" 2>/dev/null
+}
+
+wait_for_router() {
+  local deadline=$((SECONDS + ROUTER_START_TIMEOUT))
+  until curl -fsS -X POST "${ROUTER_URL}/${ROUTER_HEALTH_ENDPOINT}" \
+    -H 'Content-Type: application/json' \
+    -d "{\"model\":\"${ROUTER_HEALTH_MODEL}\",\"messages\":[{\"role\":\"user\",\"content\":\"ping\"}],\"stream\":false,\"max_tokens\":1}" \
+    >/dev/null; do
+    if ! router_is_running; then
+      log "ERROR: Router process exited before ${ROUTER_URL}/${ROUTER_HEALTH_ENDPOINT} became ready"
+      return 1
+    fi
+    if (( SECONDS >= deadline )); then
+      log "ERROR: Router did not become ready at ${ROUTER_URL}/${ROUTER_HEALTH_ENDPOINT} within ${ROUTER_START_TIMEOUT}s"
+      return 1
+    fi
+    sleep 1
+  done
+  log "Router is ready at ${ROUTER_URL}/${ROUTER_HEALTH_ENDPOINT}"
+}
+
+start_router() {
+  local cmd="${ROUTER_CMD}"
+  if [[ "${1:-}" == "--reset-states" && "${cmd}" != *"--router-reset-states"* ]]; then
+    cmd="${cmd} --router-reset-states"
+  fi
+
+  if router_is_running; then
+    log "Router is already running with PID $(router_pid)"
+    return 0
+  fi
+
+  mkdir -p "$(dirname "${ROUTER_LOG_FILE}")"
+  log "Starting router with cmd: ${cmd}"
+  nohup bash -lc "${cmd}" >> "${ROUTER_LOG_FILE}" 2>&1 &
+  local pid=$!
+  echo "${pid}" > "${ROUTER_PID_FILE}"
+  log "Router PID: ${pid}"
+  wait_for_router
+}
+
+stop_router() {
+  if ! router_is_running; then
+    rm -f "${ROUTER_PID_FILE}"
+    log "Router is not running"
+    return 0
+  fi
+
+  local pid
+  pid="$(router_pid)"
+  log "Stopping router PID ${pid}"
+  kill -TERM "${pid}" 2>/dev/null || true
+
+  local deadline=$((SECONDS + ROUTER_STOP_TIMEOUT))
+  while kill -0 "${pid}" 2>/dev/null; do
+    if (( SECONDS >= deadline )); then
+      log "ERROR: Router PID ${pid} did not stop within ${ROUTER_STOP_TIMEOUT}s"
+      return 1
+    fi
+    sleep 1
+  done
+
+  rm -f "${ROUTER_PID_FILE}"
+  log "Router stopped"
+}
+
+case "${1:-}" in
+  start)
+    shift
+    start_router "$@"
+    ;;
+  stop)
+    stop_router
+    ;;
+  restart)
+    shift
+    stop_router
+    start_router "$@"
+    ;;
+  *)
+    echo "Usage: $0 {start|stop|restart} [--reset-states]" >&2
+    exit 2
+    ;;
+esac
+EOF
+  chmod +x "${RESULTS_DIR}/routerctl.sh"
+}
+
+# Start the router via the generated helper script. `routerctl.sh start`
+# returns after its readiness probe succeeds, or non-zero if the router exits
+# or the start timeout elapses first.
+function start_router()
+{
+  "${RESULTS_DIR}/routerctl.sh" start
 }
 
 launch_sgl_http_server() {
@@ -1184,7 +1316,7 @@ function main()
     launch_etcd &
     launch_nats &
     wait_for_etcd
-    launch_ingress &
+    launch_ingress
     if _is_sglang_dsr1; then
       launch_sgl_http_server
     fi

@@ -215,6 +215,26 @@ def _render_aiperf_setup_blocks(self, log_message: str, setup_cmd: str | None) -
             ).rstrip()
         ]
 
+    def _render_between_aiperf_phases_block(
+        self,
+        phase_name: str,
+        cmd: str | None,
+    ) -> list[str]:
+        """Render the shell statements that run the between-phase command.
+
+        Args:
+            phase_name: Name of the phase that has just finished; used only in the log message.
+            cmd: Command to run through ``bash -lc``. ``None`` or an empty string disables the block.
+
+        Returns:
+            An empty list when ``cmd`` is falsy, otherwise exactly two entries: a ``log`` statement
+            announcing the command, and the shell-quoted ``bash -lc`` invocation itself.
+        """
+        if not cmd:
+            return []
+
+        rendered_cmd = shlex.join(["bash", "-lc", cmd])
+        log_message = f"Running AIPerf between-phase command after {phase_name}: {rendered_cmd}"
+        # Build the two statements directly instead of dedenting and splitting a template. A
+        # multi-line command would otherwise be split into physical lines, and the caller's
+        # per-entry indentation would then insert whitespace inside the quoted command text.
+        return [f"log {shlex.quote(log_message)}", rendered_cmd]
+
+
     def _render_aiperf_script(self) -> str:
         phases = self.td.cmd_args.aiperf_phases or [AIPerfPhase.model_validate({"name": "aiperf"})]
         single_phase = len(phases) == 1
@@ -298,6 +318,15 @@ def _render_aiperf_script(self) -> str:
                     phase_lines.append(f"  cp {shlex.quote(report_file)} {shlex.quote(final_report_file)}")
                 phase_lines.append(f"  log {shlex.quote(f'Final AIPerf report saved to {final_report_file}')}")
 
+            if not single_phase and idx < len(phases) - 1:
+                phase_lines.extend(
+                    "  " + line
+                    for line in self._render_between_aiperf_phases_block(
+                        phase_name=phase.name,
+                        cmd=resolved_phase.between_phase_cmd,
+                    )
+                )
+
             if not single_phase and idx < len(phases) - 1 and resolved_phase.health_check_between_phases:
                 health_probe_cmd = (
                     '  if ! curl -fsS -X POST "${FRONTEND_URL}/${AIPERF_ENDPOINT}" '

@@ -24,6 +24,8 @@ if [[ "$phase_status" -eq 0 ]]; then
   mkdir -p /cloudai_run_results
   cp /cloudai_run_results/aiperf_artifacts/round_1/profile_export_aiperf.csv /cloudai_run_results/aiperf_round_1_report.csv
   log 'AIPerf report saved to /cloudai_run_results/aiperf_round_1_report.csv'
+  log 'Running AIPerf between-phase command after round_1: bash -lc true'
+  bash -lc true
   if [[ -f "$AIPERF_FAILURE_MARKER" ]]; then
     log 'FATAL: failure marker found between AIPerf phases'
     exit 1

@@ -226,6 +226,7 @@ def test_gen_script_args_writes_resolved_aiperf_script(strategy: AIDynamoSlurmCo
     td.cmd_args.aiperf = AIPerf.model_validate(
         {
             "setup-cmd": "python -m pip install --upgrade aiperf",
+            "between-phase-cmd": "curl -fsS -X POST ${FRONTEND_URL}/reset_prefix_cache || true",
             "args": {
                 "concurrency": 2,
                 "request-count": 50,
@@ -254,6 +255,9 @@ def test_gen_script_args_writes_resolved_aiperf_script(strategy: AIDynamoSlurmCo
     assert "Running AIPerf phase setup for round_1" not in script
     assert "Running AIPerf phase setup for round_2" in script
     assert "bash -lc 'python -m pip install --upgrade another-aiperf-plugin'" in script
+    assert script.count("Running AIPerf between-phase command after") == 1
+    assert "Running AIPerf between-phase command after round_1" in script
+    assert "bash -lc 'curl -fsS -X POST ${FRONTEND_URL}/reset_prefix_cache || true'" in script
     assert ': "${FRONTEND_URL:?FRONTEND_URL is not set}"' in script
     assert '--url "$FRONTEND_URL"' in script
     assert f"--artifact-dir {strategy.CONTAINER_MOUNT_OUTPUT}/aiperf_artifacts/round_1" in script