Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions fastdeploy/envs.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,8 +174,8 @@ def _validate_split_kv_size(value: int) -> int:
"PREFILL_CONTINUOUS_REQUEST_DECODE_RESOURCES": lambda: int(
os.getenv("PREFILL_CONTINUOUS_REQUEST_DECODE_RESOURCES", "1")
),
"FD_ENABLE_E2W_TENSOR_CONVERT": lambda: int(os.getenv("FD_ENABLE_E2W_TENSOR_CONVERT", "0")),
"FD_ENGINE_TASK_QUEUE_WITH_SHM": lambda: int(os.getenv("FD_ENGINE_TASK_QUEUE_WITH_SHM", "0")),
"FD_ENABLE_E2W_TENSOR_CONVERT": lambda: int(os.getenv("FD_ENABLE_E2W_TENSOR_CONVERT", "1")),
"FD_ENGINE_TASK_QUEUE_WITH_SHM": lambda: int(os.getenv("FD_ENGINE_TASK_QUEUE_WITH_SHM", "1")),
"FD_FILL_BITMASK_BATCH": lambda: int(os.getenv("FD_FILL_BITMASK_BATCH", "4")),
"FD_ENABLE_PDL": lambda: int(os.getenv("FD_ENABLE_PDL", "1")),
"FD_ENABLE_ASYNC_LLM": lambda: int(os.getenv("FD_ENABLE_ASYNC_LLM", "0")),
Expand Down
7 changes: 7 additions & 0 deletions tests/ci_use/EB_Lite_with_adapter/test_eblite_serving.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,13 @@ def setup_and_run_server():
- Waits for server port to open (up to 30 seconds)
- Tears down server after all tests finish
"""
# 清理/dev/shm中的临时文件
try:
subprocess.run("rm -rf /dev/shm/*", shell=True)
print("Successfully cleaned up /dev/shm.")
except Exception as e:
print(f"Failed to cleanup /dev/shm: {e}")

print("Pre-test port cleanup...")
clean_ports()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -95,13 +95,14 @@ def llm(model_path):
)

# Wait for the port to be open
wait_start = time.time()
while not is_port_open("127.0.0.1", FD_ENGINE_QUEUE_PORT):
if time.time() - wait_start > MAX_WAIT_SECONDS:
pytest.fail(
f"Model engine did not start within {MAX_WAIT_SECONDS} seconds on port {FD_ENGINE_QUEUE_PORT}"
)
time.sleep(1)
time.sleep(2)

This comment was marked as outdated.

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 建议:time.sleep(2) 替代了端口健康检查,引擎就绪检测不可靠。

FD_ENGINE_TASK_QUEUE_WITH_SHM=1 后,引擎不再监听 TCP 端口,原端口检查失效——但固定睡眠 2 秒无法保证引擎已完全就绪,在负载较重的 CI 环境中容易引发偶发性测试失败(flaky test)。

建议补充适合 SHM 模式的就绪检测机制(如轮询 SHM 文件/信号量是否创建),或至少延长等待时间并加 MAX_WAIT_SECONDS 超时保护,并在注释中说明为何旧的端口检查不再适用:

# SHM 模式下引擎不监听 TCP 端口,改为等待 SHM 就绪
wait_start = time.time()
while not shm_is_ready():  # 补充 SHM 就绪检测
    if time.time() - wait_start > MAX_WAIT_SECONDS:
        pytest.fail(f"Engine did not start within {MAX_WAIT_SECONDS}s")
    time.sleep(1)

如果暂时没有可用的 SHM 就绪探针,建议将 time.sleep(2) 替换为更保险的固定值并留下 TODO 注释。

# wait_start = time.time()
# while not is_port_open("127.0.0.1", FD_ENGINE_QUEUE_PORT):
# if time.time() - wait_start > MAX_WAIT_SECONDS:
# pytest.fail(
# f"Model engine did not start within {MAX_WAIT_SECONDS} seconds on port {FD_ENGINE_QUEUE_PORT}"
# )
# time.sleep(1)

print(f"Model loaded successfully from {model_path} in {time.time() - start:.2f}s.")
yield llm
Expand Down
7 changes: 7 additions & 0 deletions tests/ci_validation/deploy/deploy.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,13 @@ def stop_server(signum=None, frame=None):
if os.path.exists("gemm_profiles.json"):
os.remove("gemm_profiles.json")

# 清理/dev/shm中的临时文件
try:
subprocess.run("rm -rf /dev/shm/*", shell=True)
print("Successfully cleaned up /dev/shm.")
except Exception as e:
print(f"Failed to cleanup /dev/shm: {e}")

if signum:
sys.exit(0)

Expand Down
7 changes: 7 additions & 0 deletions tests/e2e/utils/serving_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,13 @@ def clean_ports(ports=None):
kill_process_on_port(port)
time.sleep(1)

# 清理/dev/shm中的临时文件
try:
subprocess.run("rm -rf /dev/shm/*", shell=True)

This comment was marked as outdated.

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

❓ 疑问 subprocess.run 未检查返回码,rm 失败会静默通过。

subprocess.run("rm -rf /dev/shm/*", shell=True) 即使 rm 命令本身报错(如权限不足),也不会抛出 Python 异常,except Exception 分支永远不会执行,导致清理失败被无声忽略。相同模式在 deploy.py、test_eblite_serving.py、model_loader/utils.py 中也存在。

建议添加显式检查 returncode:

try:
    result = subprocess.run("rm -rf /dev/shm/*", shell=True)
    if result.returncode != 0:
        print(f"Warning: /dev/shm cleanup returned code {result.returncode}")
    else:
        print("Successfully cleaned up /dev/shm.")
except Exception as e:
    print(f"Failed to cleanup /dev/shm: {e}")

print("Successfully cleaned up /dev/shm.")
except Exception as e:
print(f"Failed to cleanup /dev/shm: {e}")


def clean(ports=None):
"""
Expand Down
6 changes: 6 additions & 0 deletions tests/model_loader/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,12 @@ def clean_ports(ports_to_clean: list[int]):
print(f"Port {port} still in use, retrying cleanup...")
kill_process_on_port(port)
time.sleep(1)
# 清理/dev/shm中的临时文件
try:
subprocess.run("rm -rf /dev/shm/*", shell=True)
print("Successfully cleaned up /dev/shm.")
except Exception as e:
print(f"Failed to cleanup /dev/shm: {e}")


def is_port_open(host: str, port: int, timeout=1.0):
Expand Down
Loading