-
Notifications
You must be signed in to change notification settings - Fork 729
[FDConfig] Reduce FD_CUSTOM_AR_MAX_SIZE_MB default from 64 to 8 #6997
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -225,10 +225,10 @@ def _validate_split_kv_size(value: int) -> int: | |
| "FD_WORKER_ALIVE_TIMEOUT": lambda: int(os.getenv("FD_WORKER_ALIVE_TIMEOUT", "30")), | ||
| # File path for file storage backend | ||
| "FILE_BACKEND_STORAGE_DIR": lambda: str(os.getenv("FILE_BACKEND_STORAGE_DIR", "/tmp/fastdeploy")), | ||
| - # Custom all-reduce max buffer size in MB (default 64MB). | ||
| + # Custom all-reduce max buffer size in MB (default 8MB). | ||
| # Increase this to avoid NCCL fallback for large tensors in deterministic mode. | ||
| # E.g. FD_CUSTOM_AR_MAX_SIZE_MB=128 for 128MB. | ||
| - "FD_CUSTOM_AR_MAX_SIZE_MB": lambda: int(os.getenv("FD_CUSTOM_AR_MAX_SIZE_MB", "64")), | ||
| + "FD_CUSTOM_AR_MAX_SIZE_MB": lambda: int(os.getenv("FD_CUSTOM_AR_MAX_SIZE_MB", "8")), | ||
|
||
| # Enable deterministic inference mode for chunked prefill alignment | ||
| "FD_DETERMINISTIC_MODE": lambda: bool(int(os.getenv("FD_DETERMINISTIC_MODE", "0"))), | ||
| # Split KV block size for deterministic alignment (must be power of 2 and > 0, default 16) | ||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -143,7 +143,7 @@ def _module_env(): | |||||
| { | ||||||
| "CUDA_VISIBLE_DEVICES": os.environ.get("CUDA_VISIBLE_DEVICES", "0,1,2,3"), | ||||||
| "FD_DETERMINISTIC_MODE": "1", | ||||||
| - "FD_CUSTOM_AR_MAX_SIZE_MB": os.environ.get("FD_CUSTOM_AR_MAX_SIZE_MB", "57"), | ||||||
| + "FD_CUSTOM_AR_MAX_SIZE_MB": os.environ.get("FD_CUSTOM_AR_MAX_SIZE_MB", "64"), | ||||||
|
||||||
| "FD_CUSTOM_AR_MAX_SIZE_MB": os.environ.get("FD_CUSTOM_AR_MAX_SIZE_MB", "64"), | |
| "FD_CUSTOM_AR_MAX_SIZE_MB": "64", |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
这里的注释与实际行为不一致:在 FD_DETERMINISTIC_MODE=1 时,如果输入 tensor 超过 max_size,会直接抛 RuntimeError(communication._ensure_deterministic_ready),不会“fallback 到 NCCL”。建议把注释改成“超大 tensor 会报错/需要调大该值以满足 deterministic all-reduce 的 max_size 限制”,避免误导。