Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ ehthumbs.db
Thumbs.db

*.log
slurm-*
install/
results/
.*
Expand Down
43 changes: 0 additions & 43 deletions conf/experimental/test/deepep_low_latency.toml

This file was deleted.

43 changes: 0 additions & 43 deletions conf/experimental/test/deepep_standard.toml

This file was deleted.

27 changes: 27 additions & 0 deletions conf/experimental/test/deepep_test_ep_v2.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Test definition bound to the "DeepEP" test template; selects the "test_ep" subtest.
name = "deepep_test_ep_v2"
description = "Official DeepEP V2 elastic test_ep"
test_template_name = "DeepEP"

[cmd_args]
# Placeholder value: point this at the real container image before use.
docker_image_url = "/your/path/to/the/container"
subtest_name = "test_ep"
# Placeholder value: in-container path of the tests folder.
elastic_tests_root = "/path/in/the/container/to/the/tests/folder"
num_processes = 8
# NOTE(review): 0 presumably defers to the test's own defaults for these three - confirm.
num_sms = 0
num_qps = 0
num_allocated_qps = 0
num_tokens = 4096
hidden = 7168
num_topk = 8
num_experts = 256
# NOTE(review): the next four toggles are integers (0/1) while skip_check and
# skip_perf_test below are booleans - confirm the consumer expects this mix.
do_cpu_sync = 1
allow_hybrid_mode = 1
allow_multiple_reduction = 1
prefer_overlap_with_compute = 0
seed = 0
skip_check = false
skip_perf_test = false
19 changes: 19 additions & 0 deletions conf/experimental/test/deepep_test_internode.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Test definition bound to the "DeepEP" test template; selects the "test_internode" subtest.
name = "deepep_test_internode"
description = "Official DeepEP V1 legacy test_internode"
test_template_name = "DeepEP"

[cmd_args]
# Placeholder value: point this at the real container image before use.
docker_image_url = "/your/path/to/the/container"
subtest_name = "test_internode"
# Placeholder value: in-container path of the tests folder.
legacy_tests_root = "/path/in/the/container/to/the/tests/folder"
num_processes = 8
num_tokens = 4096
hidden = 7168
num_topk = 8
num_experts = 256
# NOTE(review): integer-valued, unlike the boolean flag below; presumably a
# mode selector rather than an on/off switch - confirm.
pressure_test_mode = 0
test_ll_compatibility = false
18 changes: 18 additions & 0 deletions conf/experimental/test/deepep_test_intranode.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Test definition bound to the "DeepEP" test template; selects the "test_intranode" subtest.
name = "deepep_test_intranode"
description = "Official DeepEP V1 legacy test_intranode"
test_template_name = "DeepEP"

[cmd_args]
# Placeholder value: point this at the real container image before use.
docker_image_url = "/your/path/to/the/container"
subtest_name = "test_intranode"
# Placeholder value: in-container path of the tests folder.
legacy_tests_root = "/path/in/the/container/to/the/tests/folder"
num_processes = 8
num_tokens = 4096
hidden = 7168
num_topk = 8
num_experts = 256
allow_mnnvl = false
22 changes: 22 additions & 0 deletions conf/experimental/test/deepep_test_low_latency.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Test definition bound to the "DeepEP" test template; selects the "test_low_latency" subtest.
name = "deepep_test_low_latency"
description = "Official DeepEP V1 legacy test_low_latency"
test_template_name = "DeepEP"

[cmd_args]
# Placeholder value: point this at the real container image before use.
docker_image_url = "/your/path/to/the/container"
subtest_name = "test_low_latency"
# Placeholder value: in-container path of the tests folder.
legacy_tests_root = "/path/in/the/container/to/the/tests/folder"
num_processes = 8
num_tokens = 128
hidden = 7168
num_topk = 8
num_experts = 288
# All optional behaviours below are switched off in this definition.
allow_mnnvl = false
disable_nvlink = false
use_logfmt = false
pressure_test = false
shrink_test = false
32 changes: 32 additions & 0 deletions conf/experimental/test/moe_benchmark_low_latency.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Test definition bound to the "MoEBenchmark" test template, configured with mode "low_latency".
name = "moe_benchmark_low_latency"
description = "MoE Benchmark - DeepEP low-latency mode plus matrix export"
test_template_name = "MoEBenchmark"

[cmd_args]
# Placeholder value: point this at the real container image before use.
docker_image_url = "/your/path/to/the/container"
# Placeholder value. NOTE(review): results_dir below is a concrete
# /workspace/dp-benchmark path while this root is still a placeholder - confirm
# the two are meant to be set independently.
benchmark_root = "/path/in/the/container/to/the/tests/folder"
mode = "low_latency"
tokens = 128
num_experts = 288
num_topk = 8
hidden_size = 7168
data_type = "bfloat16"
allow_nvlink_for_low_latency = false
allow_mnnvl = false
round_scale = false
use_ue8m0 = false
num_warmups = 20
num_iterations = 50
shuffle_columns = false
use_kineto_profiler = false
# NOTE(review): config_file_path is set even though tuning is disabled;
# presumably only read when enable_tuning is true - confirm.
enable_tuning = false
config_file_path = "/tmp/config.yaml"
results_dir = "/workspace/dp-benchmark/results"

[extra_env_vars]
# Values are given as strings, not integers.
NUM_QPS_PER_RANK = "12"
NUM_SMS = "24"
32 changes: 32 additions & 0 deletions conf/experimental/test/moe_benchmark_standard.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Test definition bound to the "MoEBenchmark" test template, configured with mode "standard".
name = "moe_benchmark_standard"
description = "MoE Benchmark - DeepEP standard mode plus matrix export"
test_template_name = "MoEBenchmark"

[cmd_args]
# Placeholder value: point this at the real container image before use.
docker_image_url = "/your/path/to/the/container"
# Concrete path (not a placeholder); shares the /workspace/dp-benchmark prefix
# with results_dir below.
benchmark_root = "/workspace/dp-benchmark/benchmark"
mode = "standard"
tokens = 4096
num_experts = 256
num_topk = 8
hidden_size = 7168
data_type = "bfloat16"
# NOTE(review): key name refers to low-latency mode although mode is
# "standard" here; presumably ignored in this mode - confirm.
allow_nvlink_for_low_latency = false
allow_mnnvl = false
round_scale = false
use_ue8m0 = false
num_warmups = 20
num_iterations = 50
shuffle_columns = false
use_kineto_profiler = false
# NOTE(review): config_file_path is set even though tuning is disabled;
# presumably only read when enable_tuning is true - confirm.
enable_tuning = false
config_file_path = "/tmp/config.yaml"
results_dir = "/workspace/dp-benchmark/results"

[extra_env_vars]
# Values are given as strings, not integers.
NUM_QPS_PER_RANK = "12"
NUM_SMS = "24"
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand All @@ -14,16 +14,24 @@
# See the License for the specific language governing permissions and
# limitations under the License.

name = "deepep-benchmark"
name = "nccl_test_alltoallv"
description = "NCCL AlltoAllv"
test_template_name = "NcclTest"

[[Tests]]
id = "Tests.1"
test_name = "deepep_standard"
num_nodes = 2
time_limit = "00:30:00"
[cmd_args]
docker_image_url = "/your/path/to/the/container"
subtest_name = "alltoallv_perf_mpi"
nthreads = 1
ngpus = 1
minbytes = "512M"
maxbytes = "512M"
stepfactor = 2
iters = 10
warmup_iters = 1
check = 1
blocking = 0
use_deepep_matrix = true

[[Tests]]
id = "Tests.2"
test_name = "deepep_low_latency"
num_nodes = 2
time_limit = "00:30:00"
[extra_env_vars]
NCCL_P2P_DISABLE = "1"
NCCL_SHM_DISABLE = "1"
40 changes: 40 additions & 0 deletions conf/experimental/test/ucc_alltoallv_deepep.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Test definition bound to the "UCCTest" test template, running the "alltoallv" collective.
name = "ucc_alltoallv_deepep"
description = "UCC AlltoAllv"
test_template_name = "UCCTest"

[cmd_args]
# Placeholder value: point this at the real container image before use.
docker_image_url = "/your/path/to/the/container"
collective = "alltoallv"
# NOTE(review): b / e are presumably the begin and end message sizes of the
# sweep - confirm against the UCCTest argument schema.
b = 1
e = "8M"
use_deepep_matrix = true

[extra_env_vars]
UCX_IB_GID_INDEX = "auto"
UCX_TLS = "cuda_copy,rc"
UCX_RNDV_THRESH = "0"
UCX_RNDV_SCHEME = "get_zcopy"
# Hard-coded device lists, eight entries each.
# NOTE(review): presumably specific to one node layout - confirm before
# running on different hardware.
MELLANOX_VISIBLE_DEVICES = "0,3,4,5,6,9,10,11"
CUDA_VISIBLE_DEVICES = "0,1,2,3,4,5,6,7"
UCC_CL_HIER_FULL_SBGP_TLS = "ucp"
UCC_CL_HIER_NODE_SBGP_TLS = "cuda"
UCC_TLS = "ucp,cuda"
UCC_CL_HIER_TUNE = "alltoallv:0-inf:@node_split"
UCC_TL_UCP_ALLTOALLV_PAIRWISE_NUM_POSTS = "8"
UCC_CLS = "basic,hier"
11 changes: 11 additions & 0 deletions conf/experimental/test_scenario/deepep_official.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Test scenario containing a single entry that references the test named
# "deepep_test_internode".
# NOTE(review): the scenario name is plural but only one test is listed -
# confirm whether further DeepEP tests were meant to be included.
name = "deepep-official-tests"

[[Tests]]
id = "Tests.deepep_test_internode"
test_name = "deepep_test_internode"
num_nodes = 2
# NOTE(review): presumably HH:MM:SS, i.e. 30 minutes - confirm.
time_limit = "00:30:00"
29 changes: 29 additions & 0 deletions conf/experimental/test_scenario/moe_benchmark.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Test scenario with three entries linked by dependencies into a chain:
# Tests.moe_benchmark <- Tests.ucc_alltoallv <- Tests.nccl_alltoallv.
name = "moe-benchmark"

# First entry: declares no dependencies.
[[Tests]]
id = "Tests.moe_benchmark"
test_name = "moe_benchmark_standard"
num_nodes = 2
time_limit = "00:30:00"

# Second entry: depends on Tests.moe_benchmark.
[[Tests]]
id = "Tests.ucc_alltoallv"
test_name = "ucc_alltoallv_deepep"
num_nodes = 2
time_limit = "00:30:00"
# NOTE(review): "start_post_comp" presumably means "start after the referenced
# test completes" - confirm against the scenario schema.
[[Tests.dependencies]]
type = "start_post_comp"
id = "Tests.moe_benchmark"

# Third entry: depends on Tests.ucc_alltoallv, not directly on Tests.moe_benchmark.
[[Tests]]
id = "Tests.nccl_alltoallv"
test_name = "nccl_test_alltoallv"
num_nodes = 2
time_limit = "00:30:00"
[[Tests.dependencies]]
type = "start_post_comp"
id = "Tests.ucc_alltoallv"
Comment on lines +22 to +29
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🧹 Nitpick | 🔵 Trivial | ⚖️ Poor tradeoff

Verify the dependency chain design: serial vs. parallel execution.

The current configuration chains tests serially: moe_benchmark → ucc_alltoallv → nccl_alltoallv. This means the NCCL test waits for the UCC test to complete, even though both tests independently consume matrices from the MoE benchmark output (as indicated by layers 10 and 12 in the stack context).

If both UCC and NCCL tests only need the MoE benchmark output and don't depend on each other, consider having both depend directly on Tests.moe_benchmark:

[[Tests]]
id = "Tests.ucc_alltoallv"
test_name = "ucc_alltoallv_deepep"
num_nodes = 2
time_limit = "00:30:00"
  [[Tests.dependencies]]
  type = "start_post_comp"
  id = "Tests.moe_benchmark"

[[Tests]]
id = "Tests.nccl_alltoallv"
test_name = "nccl_test_alltoallv"
num_nodes = 2
time_limit = "00:30:00"
  [[Tests.dependencies]]
  type = "start_post_comp"
  id = "Tests.moe_benchmark"  # Changed from Tests.ucc_alltoallv

This would allow UCC and NCCL tests to run in parallel after MoE benchmark completes, reducing total scenario runtime. The current serial chain may be intentional for resource management or debugging—please confirm this design choice.

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@conf/experimental/test_scenario/moe_benchmark.toml` around lines 22 - 29, The
Tests chain currently makes Tests.nccl_alltoallv depend on Tests.ucc_alltoallv
(serial), causing NCCL to wait for UCC; if both only require the MoE output,
change the dependency of Tests.ucc_alltoallv and Tests.nccl_alltoallv to depend
directly on Tests.moe_benchmark (replace the dependency id in the
[[Tests.dependencies]] block for both Tests.ucc_alltoallv and
Tests.nccl_alltoallv to "Tests.moe_benchmark") so they start in parallel after
the MoE benchmark; if the serial ordering was intentional for resource or
debugging reasons, confirm and leave as-is.

Loading
Loading