aws · Yadan-Wei · May 19, 2026 · May 20, 2026 · May 20, 2026 · May 20, 2026
diff --git a/.github/scripts/efa/ec2_helpers.py b/.github/scripts/efa/ec2_helpers.py
@@ -275,16 +275,34 @@ def launch_efa_instances(aws_session, ami_id, instance_type, key_name, sg_id, co
 
 
 def setup_container(conn, image_uri, container_name):
-    """Pull image and start container with EFA devices and host networking."""
+    """Pull image and start container with EFA devices and host networking.
+
+    Entrypoint handling, keyed on a case-insensitive "vllm" match in image_uri:
+    - vLLM (entrypoint = dockerd_entrypoint.sh which execs `vllm serve "$@"`):
+      the default `bash` arg gets parsed as a model_tag and the server crashes.
+      Override entrypoint to keep the container alive for docker exec.
+    - Any other image, e.g. PyTorch (entrypoint.sh sets LD_LIBRARY_PATH for
+      CUDA forward-compat then `exec "$@"`): keep entrypoint, pass `bash` so
+      the compat env var lands in the environment that NCCL+EFA later inherit.
+      Overriding it breaks NCCL on hosts with older CUDA drivers.
+    """
     devices = get_efa_devices(conn)
     device_args = " ".join(f"--device {d}" for d in devices)
 
+    if "vllm" in image_uri.lower():
+        entrypoint_arg = "--entrypoint /bin/bash"
+        cmd = "-c 'sleep infinity'"
+    else:
+        entrypoint_arg = ""
+        cmd = "bash"
+
     conn.run(f"docker rm -f {container_name}", warn=True)
     conn.run(
         f"docker run --runtime=nvidia --gpus all -id "
         f"--name {container_name} --network host --ulimit memlock=-1:-1 "
+        f"{entrypoint_arg} "
         f"{device_args} -v $HOME/test:/test -v /dev/shm:/dev/shm "
-        f"{image_uri} bash"
+        f"{image_uri} {cmd}"
     )
     LOGGER.info(f"Started container {container_name}")
 

diff --git a/.github/workflows/pr-pytorch-ec2-cuda.yml b/.github/workflows/pr-pytorch-ec2-cuda.yml
@@ -105,6 +105,7 @@ jobs:
       build-change: ${{ steps.changes.outputs.build-change }}
       sanity-test-change: ${{ steps.changes.outputs.sanity-test-change }}
       telemetry-test-change: ${{ steps.changes.outputs.telemetry-test-change }}
+      efa-test-change: ${{ steps.changes.outputs.efa-test-change }}
     steps:
       - name: Checkout code
         uses: actions/checkout@v5
@@ -135,6 +136,10 @@ jobs:
               - "test/sanity/**"
             telemetry-test-change:
               - "test/telemetry/**"
+            efa-test-change:
+              - "test/efa/**"
+              - ".github/scripts/efa/**"
+              - ".github/workflows/reusable-efa-tests.yml"
 
   # ============================================================
   # Build runtime image
@@ -355,12 +360,23 @@ jobs:
   # EFA integration test (2x p4d.24xlarge, NCCL over EFA)
   # ============================================================
   efa-test:
-    needs: [build-images, sanity-test, security-test, unit-test]
-    if: success()
+    needs: [check-changes, build-images, sanity-test, security-test, unit-test, load-config]
+    # Run when the docker context, EFA test code, or this workflow changed.
+    # Falls back to the prod image when build-images is skipped, mirroring
+    # sanity-test/single-gpu-test.
+    if: |
+      always() && !failure() && !cancelled() &&
+      (needs.check-changes.outputs.build-change == 'true' || needs.check-changes.outputs.efa-test-change == 'true') &&
+      (needs.build-images.result == 'success' || needs.build-images.result == 'skipped')
+    # Per-caller queue: new pushes to this PR cancel its own pending efa-test;
+    # different PRs / workflows don't displace each other on the global lock.
+    concurrency:
+      group: ${{ github.workflow }}-efa-${{ github.event.pull_request.number }}
+      cancel-in-progress: true
     uses: ./.github/workflows/reusable-efa-tests.yml
     with:
-      image-uri: ${{ needs.build-images.outputs.runtime-image-uri }}
-      aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }}
+      image-uri: ${{ needs.build-images.result == 'success' && needs.build-images.outputs.runtime-image-uri || format('{0}.dkr.ecr.{1}.amazonaws.com/{2}', vars.PROD_AWS_ACCOUNT_ID, vars.AWS_REGION, needs.load-config.outputs.prod-image) }}
+      aws-account-id: ${{ needs.build-images.result == 'success' && vars.CI_AWS_ACCOUNT_ID || vars.PROD_AWS_ACCOUNT_ID }}
       aws-region: ${{ vars.AWS_REGION }}
 
   # ============================================================

diff --git a/.github/workflows/pr-vllm-ec2.yml b/.github/workflows/pr-vllm-ec2.yml
@@ -8,11 +8,14 @@ on:
     paths:
       - ".github/config/image/vllm-ec2.yml"
+      - ".github/scripts/efa/**"
       - ".github/workflows/pr-vllm-ec2.yml"
+      - ".github/workflows/reusable-efa-tests.yml"
       - ".github/workflows/reusable-vllm-upstream-tests.yml"
       - "docker/vllm/Dockerfile"
       - "scripts/common/**"
       - "scripts/telemetry/**"
       - "scripts/vllm/**"
+      - "test/efa/**"
       - "test/sanity/**"
       - "test/telemetry/**"
       - "test/vllm/**"
@@ -102,6 +105,7 @@ jobs:
       build-change: ${{ steps.changes.outputs.build-change }}
       sanity-test-change: ${{ steps.changes.outputs.sanity-test-change }}
       telemetry-test-change: ${{ steps.changes.outputs.telemetry-test-change }}
+      efa-test-change: ${{ steps.changes.outputs.efa-test-change }}
     steps:
       - name: Checkout DLC source
         uses: actions/checkout@v5
@@ -133,6 +137,10 @@ jobs:
               - "test/sanity/**"
             telemetry-test-change:
               - "test/telemetry/**"
+            efa-test-change:
+              - ".github/scripts/efa/**"
+              - ".github/workflows/reusable-efa-tests.yml"
+              - "test/efa/**"
 
   build-image:
     needs: [check-changes, load-config]
@@ -241,3 +249,28 @@ jobs:
       setup-script: test/vllm/scripts/vllm_test_setup.sh
       example-test-script: test/vllm/scripts/vllm_ec2_examples_test.sh
     secrets: inherit
+
+  # ============================================================
+  # EFA integration test (2x p4d.24xlarge, NCCL over EFA).
+  # Reuses the shared EFA test (test/efa/test_efa.py); the test's
+  # setup_nccl_tests.sh builds nccl-tests in-container against the
+  # nvidia-nccl-cu12 wheel that ships in vllm/vllm-openai.
+  # Runs when the image was rebuilt OR when EFA tests / CI scripts changed
+  # (in which case it tests against the prod image).
+  # ============================================================
+  efa-test:
+    needs: [check-changes, build-image, sanity-test, security-test, load-config]
+    if: |
+      always() && !failure() && !cancelled() &&
+      (needs.build-image.result == 'success' || needs.check-changes.outputs.efa-test-change == 'true')
+    # Per-caller queue: new pushes to this PR cancel its own pending efa-test;
+    # different PRs / workflows don't displace each other on the global lock.
+    concurrency:
+      group: ${{ github.workflow }}-efa-${{ github.event.pull_request.number }}
+      cancel-in-progress: true
+    uses: ./.github/workflows/reusable-efa-tests.yml
+    with:
+      image-uri: ${{ needs.build-image.result == 'success' && needs.build-image.outputs.ci-image || format('{0}.dkr.ecr.{1}.amazonaws.com/{2}', vars.PROD_AWS_ACCOUNT_ID, vars.AWS_REGION, needs.load-config.outputs.prod-image) }}
+      aws-account-id: ${{ needs.build-image.result == 'success' && vars.CI_AWS_ACCOUNT_ID || vars.PROD_AWS_ACCOUNT_ID }}
+      aws-region: ${{ vars.AWS_REGION }}
+      run-nixl-tests: true
diff --git a/.github/workflows/reusable-efa-tests.yml b/.github/workflows/reusable-efa-tests.yml
@@ -23,6 +23,11 @@ on:
         required: false
         type: string
         default: 'p4d.24xlarge'
+      run-nixl-tests:
+        description: 'Run NIXL libfabric smoke + disaggregated PD tests (vLLM only)'
+        required: false
+        type: boolean
+        default: false
 
 # Serialize EFA tests across all PRs to avoid p4d capacity contention.
 # Only one EFA test runs at a time globally; subsequent runs queue (no cancel).
@@ -53,4 +58,5 @@ jobs:
           PYTHONPATH=$(pwd)/test:$(pwd)/.github/scripts:$PYTHONPATH \
           TEST_IMAGE_URI=${{ inputs.image-uri }} \
           EFA_INSTANCE_TYPE=${{ inputs.efa-instance-type }} \
+          RUN_NIXL_TESTS=${{ inputs.run-nixl-tests && '1' || '0' }} \
           python -m pytest test/efa/test_efa.py -vs -rA --tb=long
diff --git a/.gitignore b/.gitignore
@@ -23,3 +23,4 @@ site/
 tutorials/
 .sisyphus/
 docker/xgboost/prebuilt.whl
+.claude/
diff --git a/test/efa/scripts/nccl_allreduce.sh b/test/efa/scripts/nccl_allreduce.sh
@@ -6,10 +6,10 @@ set -ex
 NUM_HOSTS_FILE=$1
 NUM_HOSTS=$2
 
-if [[ -z "${CUDA_HOME}" ]]; then
-    echo "CUDA_HOME variable is empty, please define it in dockerfile"
-    exit 1
-fi
+# Default CUDA_HOME for images that don't export it (vLLM Ubuntu).
+# PyTorch DLCs already set this in the Dockerfile so this is a no-op there.
+: "${CUDA_HOME:=/usr/local/cuda}"
+export CUDA_HOME
 
 TOKEN=$(curl -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600")
 INSTANCE_TYPE=$(curl -H "X-aws-ec2-metadata-token: $TOKEN" -v http://169.254.169.254/latest/meta-data/instance-type)
@@ -52,6 +52,28 @@ check_efa_nccl_all_reduce_performance(){
     fi
 }
 
+# Capture diagnostics to a file we cat at the very end. invoke/Fabric truncate
+# the .stdout of a failing remote command to the last few KB, so anything
+# printed before mpirun gets dropped. Stage it through a file and dump it
+# later. Best-effort: `|| true` stops a failed probe/redirect tripping `set -e`.
+DIAG_LOG="/test/efa/logs/diagnostics.log"
+{
+    echo "==================== EFA / NCCL diagnostics ===================="
+    echo "--- nvidia-smi ---"
+    nvidia-smi -L || true
+    echo "--- libnccl resolution ---"
+    ldconfig -p | grep libnccl || echo "(no libnccl in ldconfig)"
+    echo "--- ldd all_reduce_perf ---"
+    ldd /usr/local/bin/all_reduce_perf 2>&1 | grep -E "nccl|cuda|fabric|not found" || true
+    echo "--- libfabric provider list ---"
+    fi_info -p efa 2>&1 | head -20 || true
+    echo "--- aws-ofi-nccl plugin ---"
+    ls -la /opt/amazon/ofi-nccl/lib*/libnccl-net*.so 2>&1 | head -5 || true
+    echo "--- /etc/ld.so.conf.d ---"
+    ls /etc/ld.so.conf.d/ 2>&1
+    echo "==================== end diagnostics ===================="
+} > "${DIAG_LOG}" 2>&1 || true
+
 echo "Running all_reduce_perf test"
 mpirun -x FI_PROVIDER="efa" -x FI_EFA_FORK_SAFE=1 -n $NODES -N $GPU_COUNT --hostfile $NUM_HOSTS_FILE \
     -x NCCL_DEBUG=INFO ${USE_DEVICE_RDMA_ARG} -x NCCL_PROTO=simple -x NCCL_ALGO=ring -x RDMAV_FORK_SAFE=1 \
@@ -66,5 +88,27 @@ else
     echo "check_efa_nccl_all_reduce failed"
 fi
 
+# Dump training log first (it can be huge on success and we don't need it on
+# failure — mpirun's stdout was already captured), then the most actionable
+# diagnostics LAST so they survive Fabric's stdout truncation.
+echo "==================== BEGIN ${TRAINING_LOG} ===================="
+cat "${TRAINING_LOG}" 2>/dev/null || echo "(log file missing)"
+echo "==================== END ${TRAINING_LOG} ===================="
+
+echo "==================== BEGIN ${DIAG_LOG} ===================="
+cat "${DIAG_LOG}" 2>/dev/null || echo "(diagnostics file missing)"
+echo "==================== END ${DIAG_LOG} ===================="
+
+# Smallest, highest-signal probes, printed after the log dumps. Best-effort:
+# `|| true` so a missing plugin path cannot abort before the validators run.
+echo "==================== final probes ===================="
+echo "--- ldd all_reduce_perf (libnccl resolution) ---"
+ldd /usr/local/bin/all_reduce_perf 2>&1 | grep -E "nccl|cuda|fabric|not found" || true
+echo "--- libnccl SONAMES on system ---"
+find / -name 'libnccl.so*' 2>/dev/null | head -10 || true
+echo "--- aws-ofi-nccl plugin paths ---"
+ls -la /opt/amazon/ofi-nccl/lib*/libnccl-net*.so 2>&1 || true
+echo "==================== end ===================="
+
 validate_all_reduce_performance_logs
 check_efa_nccl_all_reduce_performance
diff --git a/test/efa/scripts/nixl_disagg_pd.sh b/test/efa/scripts/nixl_disagg_pd.sh
@@ -0,0 +1,153 @@
+#!/bin/bash
+# Two-node disaggregated prefill/decode test using NIXL with the LIBFABRIC
+# backend over EFA. Boots a prefill vLLM server on this (master) host, expects
+# the worker host to already have a decode server running (the orchestrator
+# starts both via SSH), launches a small proxy locally, sends a completion
+# request, and verifies the response is non-empty + libfabric/EFA appeared in
+# the prefill log.
+#
+# Args:
+#   $1 = WORKER_IP — private IP of the decode node
+#   $2 = MODEL     — HF model id (default: facebook/opt-125m)
+#
+# Env vars used by the prefill side:
+#   FI_PROVIDER=efa           — NIXL libfabric plugin selects EFA provider
+#   VLLM_KV_CACHE_LAYOUT=HND  — required by NixlConnector, matches upstream
+set -ex
+
+WORKER_IP=$1
+MODEL=${2:-facebook/opt-125m}
+
+if [[ -z "${WORKER_IP}" ]]; then
+    echo "usage: $0 <worker_ip> [model]" >&2
+    exit 2
+fi
+
+LOG_DIR=/test/efa/logs
+mkdir -p "${LOG_DIR}"
+PREFILL_LOG="${LOG_DIR}/prefill.log"
+PROXY_LOG="${LOG_DIR}/proxy.log"
+
+PREFILL_PORT=8100
+DECODE_PORT=8200
+PROXY_PORT=8192
+SIDE_CHANNEL_PORT=5559
+
+# NixlConnector ignores kv_role at the engine level: the per-request
+# kv_transfer_params dict from the proxy is what determines remote-fetch
+# behavior. kv_both matches upstream's run_accuracy_test.sh.
+#
+# kv_load_failure_policy=fail makes a missing/incomplete handoff a hard error
+# instead of a silent local re-prefill — so a transport regression surfaces
+# immediately rather than passing as a coherent completion.
+KV_CONFIG='{"kv_connector":"NixlConnector","kv_role":"kv_both","kv_connector_extra_config":{"backends":["LIBFABRIC"],"kv_load_failure_policy":"fail"}}'
+
+# Side-channel host: needs to be the IP that the worker can reach this box on.
+SIDE_CHANNEL_HOST=$(ip -4 -o addr show scope global | awk '{print $4}' | cut -d/ -f1 | head -1)
+
+# Block size must match across P and D for remote_block_ids to map correctly;
+# upstream's NIXL accuracy test pins this to 128 (OPT's default is 16, which
+# breaks the lookup at scheduler.py if D defaults differently).
+BLOCK_SIZE=128
+
+cleanup() {
+    set +e
+    [ -n "${PREFILL_PID}" ] && kill -TERM "${PREFILL_PID}" 2>/dev/null
+    [ -n "${PROXY_PID}" ] && kill -TERM "${PROXY_PID}" 2>/dev/null
+    wait 2>/dev/null
+}
+trap cleanup EXIT
+
+# --- Launch prefill server on master ---
+# VLLM_KV_CACHE_LAYOUT=HND is required by NixlConnector and matches upstream.
+CUDA_VISIBLE_DEVICES=0 \
+FI_PROVIDER=efa \
+VLLM_KV_CACHE_LAYOUT=HND \
+VLLM_NIXL_SIDE_CHANNEL_PORT=${SIDE_CHANNEL_PORT} \
+VLLM_NIXL_SIDE_CHANNEL_HOST=${SIDE_CHANNEL_HOST} \
+vllm serve "${MODEL}" \
+    --port ${PREFILL_PORT} \
+    --enforce-eager \
+    --block-size ${BLOCK_SIZE} \
+    --gpu-memory-utilization 0.5 \
+    --kv-transfer-config "${KV_CONFIG}" \
+    >"${PREFILL_LOG}" 2>&1 &
+PREFILL_PID=$!
+
+# --- Wait for prefill to be ready (max ~3 min) ---
+for i in $(seq 1 90); do
+    if curl -sf "http://127.0.0.1:${PREFILL_PORT}/health" >/dev/null; then
+        echo "prefill ready"
+        break
+    fi
+    if ! kill -0 "${PREFILL_PID}" 2>/dev/null; then
+        echo "prefill exited unexpectedly"
+        tail -50 "${PREFILL_LOG}"
+        exit 1
+    fi
+    sleep 2
+done
+curl -sf "http://127.0.0.1:${PREFILL_PORT}/health" >/dev/null || \
+    { echo "prefill never came up"; tail -100 "${PREFILL_LOG}"; exit 1; }
+
+# Wait for decode on the worker (started via SSH); curl timeouts bound each probe.
+for i in $(seq 1 90); do
+    if curl -sf --connect-timeout 2 --max-time 5 "http://${WORKER_IP}:${DECODE_PORT}/health" >/dev/null; then
+        echo "decode ready"
+        break
+    fi
+    sleep 2
+done
+curl -sf --connect-timeout 2 --max-time 5 "http://${WORKER_IP}:${DECODE_PORT}/health" >/dev/null || \
+    { echo "decode never came up"; exit 1; }
+
+# --- Launch proxy ---
+python3 /test/efa/scripts/toy_proxy_server.py \
+    --port ${PROXY_PORT} \
+    --prefill-url "http://127.0.0.1:${PREFILL_PORT}" \
+    --decode-url "http://${WORKER_IP}:${DECODE_PORT}" \
+    >"${PROXY_LOG}" 2>&1 &
+PROXY_PID=$!
+sleep 3
+
+# --- Send a completion request (time-bounded; dump proxy log on failure) ---
+RESPONSE=$(curl -sf --max-time 300 -X POST "http://127.0.0.1:${PROXY_PORT}/v1/completions" \
+    -H 'Content-Type: application/json' \
+    -d "{\"model\":\"${MODEL}\",\"prompt\":\"The capital of France is\",\"max_tokens\":8,\"temperature\":0}") || \
+    { echo "completion request failed"; tail -n 50 "${PROXY_LOG}"; exit 1; }
+echo "response: ${RESPONSE}"
+echo "${RESPONSE}" | grep -q '"text"' || { echo "no completion text in response"; exit 1; }
+
+# --- Validate NIXL+EFA actually engaged ---
+grep -E "LIBFABRIC|libfabric" "${PREFILL_LOG}" || \
+    { echo "no libfabric mention in prefill log"; tail -200 "${PREFILL_LOG}"; exit 1; }
+grep -E "FI_EP_RDM|provider.*efa|Selected provider is efa" "${PREFILL_LOG}" || \
+    echo "WARNING: couldn't confirm EFA provider was selected (may be in worker log)"
+
+# --- Strict KV-transfer assertion via NixlConnector Prometheus metrics ---
+# vllm:nixl_xfer_time_seconds_count is a histogram counter that increments per
+# successful NIXL transfer. > 0 means at least one KV cache block crossed the
+# wire from prefill to decode over libfabric/EFA. vllm:nixl_num_failed_transfers
+# is a Counter that must stay at 0.
+#
+# vllm:prompt_tokens_total is NOT a useful proof: it always counts the prompt
+# size regardless of whether the KV came from cache or local recompute.
+_metric_value() {
+    # Sum the value of a Prometheus metric, summing across labels if any.
+    local url="$1" name="$2"
+    curl -sf "${url}" | awk -v n="${name}" '
+        $0 ~ "^"n"[ {]" { gsub(/.*[ ]/, "", $0); s += $0 + 0 }
+        END { printf "%d", s }
+    '
+}
+
+DECODE_XFERS=$(_metric_value "http://${WORKER_IP}:${DECODE_PORT}/metrics" "vllm:nixl_xfer_time_seconds_count")
+DECODE_FAILED=$(_metric_value "http://${WORKER_IP}:${DECODE_PORT}/metrics" "vllm:nixl_num_failed_transfers")
+DECODE_GEN=$(_metric_value "http://${WORKER_IP}:${DECODE_PORT}/metrics" "vllm:generation_tokens_total")
+echo "metrics: decode.nixl_xfers=${DECODE_XFERS} decode.nixl_failed=${DECODE_FAILED} decode.gen_tokens=${DECODE_GEN}"
+
+[ "${DECODE_XFERS}" -ge 1 ] || { echo "no NIXL transfers reached decode (xfer_count=${DECODE_XFERS}) — KV did not flow over libfabric"; exit 1; }
+[ "${DECODE_FAILED}" -eq 0 ] || { echo "NIXL transfers failed (failed=${DECODE_FAILED})"; exit 1; }
+[ "${DECODE_GEN}" -ge 1 ] || { echo "decode produced no tokens (gen_tokens=${DECODE_GEN})"; exit 1; }
+
+echo "nixl_disagg_pd test passed (${DECODE_XFERS} NIXL transfer(s) verified via /metrics)"