Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ RUN GITHUB_ARTIFACTORY=github.com \
ARG UV_VERSION=0.11.6
ARG PYTHON_VERSION=3.13.13
ENV PATH="/root/.local/bin:$PATH"
ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
RUN curl -LsSf https://astral.sh/uv/${UV_VERSION}/install.sh | sh && \
uv python install ${PYTHON_VERSION}

Expand Down Expand Up @@ -109,8 +110,8 @@ ARG SKIP_SGLANG_BUILD
ENV UV_PROJECT_ENVIRONMENT=/opt/nemo_rl_venv
ENV UV_LINK_MODE=copy

# Ensure DeepEP is built for H100 and B200 (also mcore inference unified memory API now invokes a torch API that requires these to be set)
ENV TORCH_CUDA_ARCH_LIST="9.0 10.0"
# Ensure DeepEP is built for Hopper and (Grace) Blackwell (the mcore inference unified memory API also invokes a torch API that requires this variable to be set)
ENV TORCH_CUDA_ARCH_LIST="9.0 10.0 10.3"
Comment thread
terrykong marked this conversation as resolved.

# First copy only the dependency files
COPY --from=nemo-rl pyproject.toml uv.lock ./
Expand Down Expand Up @@ -206,6 +207,10 @@ else
UV_LINK_MODE=symlink uv run nemo_rl/utils/prefetch_venvs.py
fi
EOF
# For each of /opt/ray_venvs and /opt/nemo_rl_venv that exists, replace every
# file named "ptxas-blackwell" beneath it with a symlink to the CUDA toolkit's
# ptxas at /usr/local/cuda/bin/ptxas. Missing directories are skipped.
#
# NOTE: -exec is terminated with ';' (one `ln` invocation per match) rather
# than '+'. With '+', find appends all matches to a single command line, and
# `ln -sf TARGET a b` interprets the last path as a destination directory,
# failing with "target ... is not a directory" whenever more than one file
# matches.
RUN for d in /opt/ray_venvs /opt/nemo_rl_venv; do \
        [ -d "$d" ] || continue; \
        find "$d" -name "ptxas-blackwell" -exec ln -sf /usr/local/cuda/bin/ptxas {} ';'; \
    done

# Generate container fingerprint for frozen environment support
# Store outside /opt/nemo-rl to avoid being overwritten by user mounts
Expand Down
4 changes: 2 additions & 2 deletions nemo_rl/models/policy/lm_policy.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ def __init__(
if "TORCH_CUDA_ARCH_LIST" not in os.environ:
raise RuntimeError(
"TORCH_CUDA_ARCH_LIST is not set. This is required in Megatron backend. This variable is set in our container, but "
"if you are running a custom container or baremetal, you may need to set this variable manually. Example: export TORCH_CUDA_ARCH_LIST='9.0 10.0'"
"if you are running a custom container or baremetal, you may need to set this variable manually. Example: export TORCH_CUDA_ARCH_LIST='9.0 10.0 10.3'"
)

else:
Expand All @@ -132,7 +132,7 @@ def __init__(
if "TORCH_CUDA_ARCH_LIST" not in os.environ:
warnings.warn(
"TORCH_CUDA_ARCH_LIST is not set. This is needed if using DeepEP in DTensorPolicyWorker V2. This variable is set in our container, but "
"if you are running a custom container or baremetal, you may need to set this variable manually. Example: export TORCH_CUDA_ARCH_LIST='9.0 10.0'"
"if you are running a custom container or baremetal, you may need to set this variable manually. Example: export TORCH_CUDA_ARCH_LIST='9.0 10.0 10.3'"
)
else:
assert (
Expand Down
Loading