Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
123 changes: 110 additions & 13 deletions tests/integration/defs/verl/test_verl_cases.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@

All setup (dependency installation, repo cloning, env vars) is handled by
a session-scoped pytest fixture. Configuration is read from verl_config.yml.

Each wrapper function maps 1-to-1 to a single verl pytest case, enabling
fine-grained waiving without blanket-skipping a whole file.
"""

import os
Expand All @@ -29,6 +32,13 @@
_CONFIG_PATH = os.path.join(_HERE, "verl_config.yml")
VERL_ROOT = os.path.join(_HERE, "verl_repo")

# Directory that holds verl's TRT-LLM rollout tests.
# NOTE(review): presumably resolved relative to the verl checkout (VERL_ROOT) —
# confirm against _run_verl_test.
_ROLLOUT = "tests/workers/rollout/rollout_trtllm"
# Individual verl test files under _ROLLOUT; each wrapper function in this module
# passes one of these to _run_single together with a case name.
_ASYNC_SERVER = f"{_ROLLOUT}/test_async_server.py"
_ADAPTER = f"{_ROLLOUT}/test_adapter.py"
_ROLLOUT_UTILS = f"{_ROLLOUT}/test_trtllm_rollout_utils.py"
_INTER_NODE = f"{_ROLLOUT}/test_inter_node_rollout.py"
_ABORT = f"{_ROLLOUT}/test_trtllm_abort.py"


def _load_config():
with open(_CONFIG_PATH) as f:
Expand Down Expand Up @@ -126,20 +136,107 @@ def _run_verl_test(test_path, extra_args=None, timeout=600):
assert result.returncode == 0, f"Verl test failed with return code {result.returncode}"


def test_async_server():
_run_verl_test("tests/workers/rollout/rollout_trtllm/test_async_server.py")
def _run_single(verl_file, case_name, timeout=600):
    """Run the verl case(s) in ``verl_file`` selected by ``-k case_name``.

    Args:
        verl_file: Path of the verl test file, forwarded to ``_run_verl_test``.
        case_name: Keyword expression handed to pytest's ``-k`` option.
        timeout: Forwarded unchanged to ``_run_verl_test``.

    NOTE(review): ``-k`` is a substring/keyword match rather than an exact
    node-id selector, so a name that is a substring of another case (or of the
    module name) can select more than one case — confirm each wrapper's name
    is unambiguous within its file.
    """
    keyword_filter = ["-k", case_name]
    _run_verl_test(test_path=verl_file, extra_args=keyword_filter, timeout=timeout)

Comment thread
Superjomn marked this conversation as resolved.

def test_adapter():
_run_verl_test("tests/workers/rollout/rollout_trtllm/test_adapter.py")
# ---------------------------------------------------------------------------
# test_async_server.py wrappers
# ---------------------------------------------------------------------------


def test_rollout_utils():
_run_verl_test(
"tests/workers/rollout/rollout_trtllm/test_trtllm_rollout_utils.py",
extra_args=[
"-k",
"not (test_unimodal_generate or test_unimodal_batch_generate)",
],
timeout=900,
)
def test_placement_group_with_sub_ray_resource_pool():
    """Wrap the same-named case from verl's ``test_async_server.py``."""
    _run_single(
        verl_file=_ASYNC_SERVER,
        case_name="test_placement_group_with_sub_ray_resource_pool",
    )


def test_placement_group_with_ray_resource_pool():
    """Wrap the same-named case from verl's ``test_async_server.py``."""
    _run_single(
        verl_file=_ASYNC_SERVER,
        case_name="test_placement_group_with_ray_resource_pool",
    )


def test_placement_group_multi_node_ray_resource_pool():
    """Wrap the same-named case from verl's ``test_async_server.py``."""
    _run_single(
        verl_file=_ASYNC_SERVER,
        case_name="test_placement_group_multi_node_ray_resource_pool",
    )


def test_placement_group_multi_node_multi_replica():
    """Wrap the same-named case from verl's ``test_async_server.py``."""
    _run_single(
        verl_file=_ASYNC_SERVER,
        case_name="test_placement_group_multi_node_multi_replica",
    )


def test_async_generate():
    """Wrap the same-named case from verl's ``test_async_server.py``."""
    _run_single(verl_file=_ASYNC_SERVER, case_name="test_async_generate")


def test_async_memory_management():
    """Wrap the same-named case from verl's ``test_async_server.py``."""
    _run_single(verl_file=_ASYNC_SERVER, case_name="test_async_memory_management")


# ---------------------------------------------------------------------------
# test_adapter.py wrappers
# ---------------------------------------------------------------------------


def test_make_async_request_get_method():
    """Wrap the same-named case from verl's ``test_adapter.py``."""
    _run_single(verl_file=_ADAPTER, case_name="test_make_async_request_get_method")


def test_make_async_request_post_method():
    """Wrap the same-named case from verl's ``test_adapter.py``."""
    _run_single(verl_file=_ADAPTER, case_name="test_make_async_request_post_method")


def test_make_async_request_http_error():
    """Wrap the same-named case from verl's ``test_adapter.py``."""
    _run_single(verl_file=_ADAPTER, case_name="test_make_async_request_http_error")


def test_make_async_request_max_attempts_exceeded():
    """Wrap the same-named case from verl's ``test_adapter.py``."""
    _run_single(
        verl_file=_ADAPTER,
        case_name="test_make_async_request_max_attempts_exceeded",
    )


def test_init_without_device_mesh():
    """Wrap the same-named case from verl's ``test_adapter.py``."""
    _run_single(verl_file=_ADAPTER, case_name="test_init_without_device_mesh")


# ---------------------------------------------------------------------------
# test_trtllm_rollout_utils.py wrappers (900 s — multimodal cases are slow)
# ---------------------------------------------------------------------------


def test_unimodal_generate():
    """Wrap the same-named case from ``test_trtllm_rollout_utils.py`` (900 s timeout)."""
    _run_single(
        verl_file=_ROLLOUT_UTILS,
        case_name="test_unimodal_generate",
        timeout=900,
    )


def test_unimodal_batch_generate():
    """Wrap the same-named case from ``test_trtllm_rollout_utils.py`` (900 s timeout)."""
    _run_single(
        verl_file=_ROLLOUT_UTILS,
        case_name="test_unimodal_batch_generate",
        timeout=900,
    )


def test_multimodal_generate_with_image():
    """Wrap the same-named case from ``test_trtllm_rollout_utils.py`` (900 s timeout)."""
    _run_single(
        verl_file=_ROLLOUT_UTILS,
        case_name="test_multimodal_generate_with_image",
        timeout=900,
    )


def test_multimodal_different_image_sizes():
    """Wrap the same-named case from ``test_trtllm_rollout_utils.py`` (900 s timeout)."""
    _run_single(
        verl_file=_ROLLOUT_UTILS,
        case_name="test_multimodal_different_image_sizes",
        timeout=900,
    )


def test_multimodal_text_only_fallback():
    """Wrap the same-named case from ``test_trtllm_rollout_utils.py`` (900 s timeout)."""
    _run_single(
        verl_file=_ROLLOUT_UTILS,
        case_name="test_multimodal_text_only_fallback",
        timeout=900,
    )


def test_wake_sleep_cycle():
    """Wrap the same-named case from ``test_trtllm_rollout_utils.py`` (900 s timeout)."""
    _run_single(
        verl_file=_ROLLOUT_UTILS,
        case_name="test_wake_sleep_cycle",
        timeout=900,
    )


# ---------------------------------------------------------------------------
# test_inter_node_rollout.py wrappers (900 s — multi-node)
# ---------------------------------------------------------------------------


def test_inter_node_trtllm_rollout():
    """Wrap the same-named case from ``test_inter_node_rollout.py`` (900 s timeout)."""
    _run_single(
        verl_file=_INTER_NODE,
        case_name="test_inter_node_trtllm_rollout",
        timeout=900,
    )


# ---------------------------------------------------------------------------
# test_trtllm_abort.py wrappers
# ---------------------------------------------------------------------------


def test_trtllm_abort():
    """Wrap the ``test_trtllm_abort`` case from verl's ``test_trtllm_abort.py``.

    NOTE(review): the keyword passed to ``-k`` equals the module's basename,
    and pytest's ``-k`` also matches parent node names, so this may select
    every case in that file rather than a single one — confirm.
    """
    _run_single(verl_file=_ABORT, case_name="test_trtllm_abort")
15 changes: 8 additions & 7 deletions tests/integration/defs/verl/verl_config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

verl_config:
repo_url: "https://github.com/volcengine/verl.git"
repo_tag: "4cda6af"
repo_tag: "d324b01"
test_dir: "tests"

install_commands:
Expand All @@ -19,17 +19,17 @@ verl_config:
ln -s libnvshmem_host.so.3 libnvshmem_host.so)
# Install DeepEP
- >-
git clone -b v1.2.1 https://github.com/deepseek-ai/DeepEP.git &&
git clone -b hybrid-ep https://github.com/deepseek-ai/DeepEP.git &&
(cd DeepEP &&
wget https://raw.githubusercontent.com/NVIDIA/Megatron-LM/refs/tags/core_v0.15.0/docker/patches/deepep.patch &&
patch -p1 < deepep.patch &&
export CPATH=/usr/local/cuda/targets/$(uname -m | sed 's/aarch64/sbsa-linux/;s/x86_64/x86_64-linux/')/include/cccl:$CPATH &&
TORCH_CUDA_ARCH_LIST="9.0 10.0 12.0" python setup.py install) &&
rm -rf DeepEP
# Install Python dependencies
- "pip3 install --no-cache-dir --no-deps trl"
- "pip3 install --no-cache-dir --no-deps trl==0.27.0"
- "pip3 install --no-cache-dir nvtx matplotlib liger_kernel cachetools"
- "pip install --no-cache-dir -U git+https://github.com/ISEEKYAN/mbridge.git"
- "pip install --no-deps --no-cache-dir git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.15.0"
- "pip3 install --no-cache-dir cupy-cuda12x==14.0.1"
- "pip install --no-cache-dir -U git+https://github.com/ISEEKYAN/mbridge.git@641a5a0"
- "pip install --no-deps --no-cache-dir git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.16.0"
- "pip3 install pytest-asyncio"
- "pip3 install --no-cache-dir 'ray[default]==2.54.1'"

Expand All @@ -40,6 +40,7 @@ verl_config:
- "LD_LIBRARY_PATH=\"${NVSHMEM_DIR}/lib:$LD_LIBRARY_PATH\""
- "PATH=\"${NVSHMEM_DIR}/bin:$PATH\""
- "TRTLLM_TEST_MODEL_PATH_ROOT=/tmp/verl-models"
- "TORCH_CUDA_ARCH_LIST="

# Read-only CI model cache (flat layout: /scratch.../ModelName)
ci_model_cache: "/scratch.trt_llm_data/llm-models"
Expand Down
22 changes: 19 additions & 3 deletions tests/integration/test_lists/test-db/l0_verl.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,22 @@ l0_verl:
backend: verl
orchestrator: mpi
tests:
- verl/test_verl_cases.py::test_async_server
- verl/test_verl_cases.py::test_adapter
- verl/test_verl_cases.py::test_rollout_utils
- verl/test_verl_cases.py::test_placement_group_with_sub_ray_resource_pool
- verl/test_verl_cases.py::test_placement_group_with_ray_resource_pool
- verl/test_verl_cases.py::test_placement_group_multi_node_ray_resource_pool
- verl/test_verl_cases.py::test_placement_group_multi_node_multi_replica
- verl/test_verl_cases.py::test_async_generate
- verl/test_verl_cases.py::test_async_memory_management
- verl/test_verl_cases.py::test_make_async_request_get_method
- verl/test_verl_cases.py::test_make_async_request_post_method
- verl/test_verl_cases.py::test_make_async_request_http_error
- verl/test_verl_cases.py::test_make_async_request_max_attempts_exceeded
- verl/test_verl_cases.py::test_init_without_device_mesh
- verl/test_verl_cases.py::test_unimodal_generate
- verl/test_verl_cases.py::test_unimodal_batch_generate
- verl/test_verl_cases.py::test_multimodal_generate_with_image
- verl/test_verl_cases.py::test_multimodal_different_image_sizes
- verl/test_verl_cases.py::test_multimodal_text_only_fallback
- verl/test_verl_cases.py::test_wake_sleep_cycle
- verl/test_verl_cases.py::test_inter_node_trtllm_rollout
- verl/test_verl_cases.py::test_trtllm_abort
5 changes: 2 additions & 3 deletions tests/integration/test_lists/waives.txt
Original file line number Diff line number Diff line change
Expand Up @@ -486,6 +486,5 @@ unittest/llmapi/test_llm_pytorch.py::test_nemotron_nas_lora[cuda_graph_config0]
unittest/llmapi/test_memory_profiling.py::test_profile_kvcache SKIP (https://nvbugs/5580781)
unittest/tools/test_layer_wise_benchmarks.py::test_performance_alignment[1] SKIP (https://nvbugs/6127669)
unittest/tools/test_layer_wise_benchmarks.py::test_qwen3_next_gen_tep[1] SKIP (https://nvbugs/6153575)
verl/test_verl_cases.py::test_adapter SKIP (https://nvbugs/5981833)
verl/test_verl_cases.py::test_async_server SKIP (https://nvbugs/5981833)
verl/test_verl_cases.py::test_rollout_utils SKIP (https://nvbugs/5981833)
visual_gen/test_visual_gen_benchmark.py::test_offline_benchmark SKIP (https://nvbugs/6050483)
visual_gen/test_visual_gen_benchmark.py::test_online_benchmark[openai-videos] SKIP (https://nvbugs/6050483)
Loading