PaddlePaddle · Jiang-Jia-Jun · Mar 25, 2026 · Mar 24, 2026 · Copilot · Mar 24, 2026
diff --git a/fastdeploy/envs.py b/fastdeploy/envs.py
@@ -225,10 +225,10 @@ def _validate_split_kv_size(value: int) -> int:
     "FD_WORKER_ALIVE_TIMEOUT": lambda: int(os.getenv("FD_WORKER_ALIVE_TIMEOUT", "30")),
     # File path for file storage backend
     "FILE_BACKEND_STORAGE_DIR": lambda: str(os.getenv("FILE_BACKEND_STORAGE_DIR", "/tmp/fastdeploy")),
-    # Custom all-reduce max buffer size in MB (default 64MB).
+    # Custom all-reduce max buffer size in MB (default 8MB).
     # Increase this to avoid NCCL fallback for large tensors in deterministic mode.
-    # Custom all-reduce max buffer size in MB (default 8MB).
-    # Increase this to avoid NCCL fallback for large tensors in deterministic mode.
+    # Custom deterministic all-reduce max buffer size in MB (default 8MB).
+    # When FD_DETERMINISTIC_MODE=1, tensors larger than this limit will raise an error
+    # instead of falling back to NCCL. Increase this value to avoid max_size errors.
-    # Custom all-reduce max buffer size in MB (default 8MB).
-    # Increase this to avoid NCCL fallback for large tensors in deterministic mode.
+    # Custom deterministic all-reduce max buffer size in MB (default 8MB).
+    # When FD_DETERMINISTIC_MODE=1, tensors larger than this limit will raise an error
+    # instead of falling back to NCCL. Increase this value to avoid max_size errors.
     # E.g. FD_CUSTOM_AR_MAX_SIZE_MB=128 for 128MB.
-    "FD_CUSTOM_AR_MAX_SIZE_MB": lambda: int(os.getenv("FD_CUSTOM_AR_MAX_SIZE_MB", "64")),
+    "FD_CUSTOM_AR_MAX_SIZE_MB": lambda: int(os.getenv("FD_CUSTOM_AR_MAX_SIZE_MB", "8")),
     # Enable deterministic inference mode for chunked prefill alignment
     "FD_DETERMINISTIC_MODE": lambda: bool(int(os.getenv("FD_DETERMINISTIC_MODE", "0"))),
     # Split KV block size for deterministic alignment (must be power of 2 and > 0, default 16)

diff --git a/tests/e2e/4cards_cases/test_determinism_long.py b/tests/e2e/4cards_cases/test_determinism_long.py
@@ -143,7 +143,7 @@ def _module_env():
         {
             "CUDA_VISIBLE_DEVICES": os.environ.get("CUDA_VISIBLE_DEVICES", "0,1,2,3"),
             "FD_DETERMINISTIC_MODE": "1",
-            "FD_CUSTOM_AR_MAX_SIZE_MB": os.environ.get("FD_CUSTOM_AR_MAX_SIZE_MB", "57"),
+            "FD_CUSTOM_AR_MAX_SIZE_MB": os.environ.get("FD_CUSTOM_AR_MAX_SIZE_MB", "64"),
-            "FD_CUSTOM_AR_MAX_SIZE_MB": os.environ.get("FD_CUSTOM_AR_MAX_SIZE_MB", "64"),
+            "FD_CUSTOM_AR_MAX_SIZE_MB": "64",
-            "FD_CUSTOM_AR_MAX_SIZE_MB": os.environ.get("FD_CUSTOM_AR_MAX_SIZE_MB", "64"),
+            "FD_CUSTOM_AR_MAX_SIZE_MB": "64",
             "FLAGS_max_partition_size": _CHUNK_SIZE_FOR_TEST,
         }
     ):