Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
4953384
[build] update deps: transformers -> 5.2.0
CyCle1024 Feb 27, 2026
9ad6114
[ci] update ut and e2e image to trigger pt2.9+tf5.2 ci test
CyCle1024 Mar 18, 2026
014d1f5
chore(ci): fix some case on pt2.9.1 and tf5.2
CyCle1024 Mar 24, 2026
bb9a269
chore(build): update cudnn to 9.15.1.9 of torch 2.9.1 in dockerfile
CyCle1024 Mar 24, 2026
92fa286
chore(ci): update docker image for ut
CyCle1024 Mar 25, 2026
435bb76
chore(ci): fix ut cudnn version and use wider tolerance in test_qwen3…
CyCle1024 Mar 25, 2026
d54527e
fix test_rope
CyCle1024 Mar 27, 2026
e4e45c1
[refactor] add RopeParametersConfig due to transformers 5.2.0 bc
CyCle1024 Mar 30, 2026
321fbda
build: conditional path for lmdeploy and sglang in Dockerfile
CyCle1024 Mar 31, 2026
3a816e3
[build] update dockerfile for deepep, deep_gemm and ci proxy speed fix
CyCle1024 Apr 2, 2026
c788926
fixup! fix test_rope
CyCle1024 Apr 8, 2026
82a6e55
fix(ci): ep>1 clip_grad_norm fails due to pt2.9 check
CyCle1024 Apr 10, 2026
431bf76
fix qwen3_5 test
CyCle1024 Apr 13, 2026
1805b75
ci: update ut and e2e image
CyCle1024 Apr 13, 2026
ab9d3a9
ci: restore CI_ENV.sh PYTHONPATH and xtuner install
CyCle1024 Apr 14, 2026
f581fd4
build: update sglang deps in dockerfile
CyCle1024 Apr 14, 2026
734e038
ci(fix): ut and e2e image
CyCle1024 Apr 14, 2026
2aeaba0
ci(e2e): fix torch 2.9.1 cudnn memory issue in e2e test
CyCle1024 Apr 15, 2026
0eaa40e
test: clean hf dynamic modules before test setup
CyCle1024 Apr 15, 2026
07187d4
build(docker): update lmdeploy deps
CyCle1024 Apr 21, 2026
cef5930
[Refactor] Move compile config from FSDPConfig to model_cfg
CyCle1024 Apr 23, 2026
0745196
fixup e2e autotest qwen3_vl_8B_dense.py due to OOM
CyCle1024 Apr 24, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .dev_scripts/xtuner_rl_path.pth
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
import xtuner_rl_path
18 changes: 18 additions & 0 deletions .dev_scripts/xtuner_rl_path/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
"""Conditionally prepend vendored lmdeploy/sglang install dirs to ``sys.path``.

This module is executed at interpreter startup via the companion
``xtuner_rl_path.pth`` file (a ``.pth`` line starting with ``import`` is run
by the ``site`` module). When ``XTUNER_USE_LMDEPLOY`` (or, failing that,
``XTUNER_USE_SGLANG``) is set to a truthy value, the matching
``pip install --target`` directory is inserted just *before* the system
``dist-packages`` entry so the vendored wheels shadow the system ones.
"""

import os
import sys

# Accepted truthy spellings for the XTUNER_USE_* switches (case-insensitive).
TRUTHY_VALUES = ("1", "on", "true")


def env_flag_enabled(name):
    """Return True if the environment variable *name* holds a truthy value.

    An unset variable is treated as falsy.
    """
    return os.getenv(name, "").lower() in TRUTHY_VALUES


def insert_before_dist_packages(path):
    """Insert *path* into ``sys.path`` ahead of the first dist-packages entry.

    Falls back to index 0 (highest priority) when no entry ends with
    ``dist-packages``; no-op when *path* is already present.
    """
    index = 0
    for i, entry in enumerate(sys.path):
        if entry.endswith("dist-packages"):
            index = i
            break
    if path not in sys.path:
        sys.path.insert(index, path)


# lmdeploy takes precedence over sglang when both flags are enabled,
# matching the original if/elif ordering.
if env_flag_enabled("XTUNER_USE_LMDEPLOY"):
    insert_before_dist_packages(os.getenv("XTUNER_LMDEPLOY_ENVS_DIR", "/envs/lmdeploy"))
elif env_flag_enabled("XTUNER_USE_SGLANG"):
    insert_before_dist_packages(os.getenv("XTUNER_SGLANG_ENVS_DIR", "/envs/sglang"))
2 changes: 1 addition & 1 deletion .github/workflows/unit_test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ on:
env:
WORKSPACE_PREFIX: $(echo $GITHUB_WORKSPACE |cut -d '/' -f 1-5)
WORKSPACE_PREFIX_SHORT: $(echo $GITHUB_WORKSPACE |cut -d '/' -f 1-3)
IMAGE: ailab-llmrazor/xtuner:pt28_20251216_d769950
IMAGE: ailab-llmrazor/xtuner_tmp:pt29_20260414_c8f6fa1

concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
Expand Down
170 changes: 117 additions & 53 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -2,39 +2,38 @@
# builder
ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:25.03-py3

## build args
## build base env
FROM ${BASE_IMAGE} AS setup_env

ARG TORCH_VERSION
ARG PPA_SOURCE

RUN --mount=type=secret,id=HTTPS_PROXY,env=https_proxy \
sed -i "s@http://.*.ubuntu.com@${PPA_SOURCE}@g" /etc/apt/sources.list.d/ubuntu.sources && \
# RUN --mount=type=secret,id=HTTPS_PROXY,env=https_proxy \
RUN sed -i "s@http://.*.ubuntu.com@${PPA_SOURCE}@g" /etc/apt/sources.list.d/ubuntu.sources && \
apt update && \
apt install --no-install-recommends ca-certificates -y && \
apt install --no-install-recommends bc wget -y && \
apt install --no-install-recommends build-essential sudo -y && \
apt install --no-install-recommends git curl pkg-config tree unzip tmux \
openssh-server openssh-client dnsutils iproute2 lsof net-tools zsh rclone \
iputils-ping telnet netcat-openbsd -y && \
iputils-ping telnet netcat-openbsd htop bubblewrap socat -y && \
apt clean && rm -rf /var/lib/apt/lists/*

RUN if [ -d /etc/pip ] && [ -f /etc/pip/constraint.txt ]; then echo > /etc/pip/constraint.txt; fi
RUN pip install pystack py-spy --no-cache-dir
RUN pip uninstall flash_attn opencv -y && rm -rf /usr/local/lib/python3.12/dist-packages/cv2
RUN git config --system --add safe.directory "*"

# torch
ARG TORCH_VERSION
ARG PYTORCH_WHEELS_URL
RUN --mount=type=secret,id=HTTPS_PROXY,env=https_proxy \
--mount=type=secret,id=NO_PROXY,env=no_proxy \
if [ -n "${TORCH_VERSION}" ]; then \
pip install torchvision torch==${TORCH_VERSION} \
--index-url https://download.pytorch.org/whl/cu128 \
--extra-index-url https://download.pytorch.org/whl/cu126 \
-i ${PYTORCH_WHEELS_URL}/cu128 \
--extra-index-url ${PYTORCH_WHEELS_URL}/cu126 \
--no-cache-dir; \
fi

# set reasonable default for CUDA architectures when building ngc image
ENV TORCH_CUDA_ARCH_LIST="7.5 8.0 8.6 9.0 10.0"

RUN pip uninstall flash_attn opencv -y && rm -rf /usr/local/lib/python3.12/dist-packages/cv2
ENV TORCH_CUDA_ARCH_LIST="9.0 10.0"

ARG FLASH_ATTN_DIR=/tmp/flash-attn
ARG CODESPACE=/root/codespace
Expand All @@ -56,6 +55,9 @@ ARG CODESPACE
ARG FLASH_ATTN_DIR
ARG FLASH_ATTN3_DIR
ARG FLASH_ATTN_URL
# force hopper for now, you change it throught build args
ARG FLASH_ATTN_CUDA_ARCHS="90"
ARG FLASH_ATTENTION_DISABLE_SM80="TRUE"

RUN --mount=type=secret,id=HTTPS_PROXY,env=https_proxy \
git clone $(echo ${FLASH_ATTN_URL} | cut -d '@' -f 1) && \
Expand Down Expand Up @@ -119,42 +121,41 @@ WORKDIR ${CODESPACE}/causal-conv1d

RUN CAUSAL_CONV1D_FORCE_BUILD=TRUE pip wheel -w ${CAUSAL_CONV1D_DIR} -v --no-deps --no-build-isolation .

# pypi install nvshmem and compile deepep
# compile nvshmem and deepep
FROM setup_env AS deep_ep

ARG CODESPACE
ARG DEEP_EP_DIR
ARG DEEP_EP_URL
# build sm90 and sm100 for deep_ep for now
ARG TORCH_CUDA_ARCH_LIST="9.0 10.0"

# RUN --mount=type=secret,id=HTTPS_PROXY,env=https_proxy \
# curl -LO https://github.com/NVIDIA/nvshmem/releases/download/v3.4.5-0/nvshmem_src_cuda-all-all-3.4.5.tar.gz && \
# tar -zxvf nvshmem_src_cuda-all-all-3.4.5.tar.gz && \
# cd ${CODESPACE}/nvshmem_src && \
# NVSHMEM_SHMEM_SUPPORT=0 \
# NVSHMEM_UCX_SUPPORT=0 \
# NVSHMEM_USE_NCCL=0 \
# NVSHMEM_MPI_SUPPORT=0 \
# NVSHMEM_IBGDA_SUPPORT=1 \
# NVSHMEM_USE_GDRCOPY=0 \
# NVSHMEM_PMIX_SUPPORT=0 \
# NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
# NVSHMEM_BUILD_TESTS=0 \
# NVSHMEM_BUILD_EXAMPLES=0 \
# NVSHMEM_BUILD_HYDRA_LAUNCHER=0 \
# NVSHMEM_BUILD_TXZ_PACKAGE=0 \
# NVSHMEM_BUILD_PYTHON_LIB=OFF \
# cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=${NVSHMEM_PREFIX} -DMLX5_lib=/lib/x86_64-linux-gnu/libmlx5.so.1 && \
# cmake --build build --target install --parallel 32 && \
RUN --mount=type=secret,id=HTTPS_PROXY,env=https_proxy \
curl -LO https://github.com/NVIDIA/nvshmem/releases/download/v3.4.5-0/nvshmem_src_cuda-all-all-3.4.5.tar.gz && \
tar -zxvf nvshmem_src_cuda-all-all-3.4.5.tar.gz && \
cd ${CODESPACE}/nvshmem_src && \
NVSHMEM_SHMEM_SUPPORT=0 \
NVSHMEM_UCX_SUPPORT=0 \
NVSHMEM_USE_NCCL=0 \
NVSHMEM_MPI_SUPPORT=0 \
NVSHMEM_IBGDA_SUPPORT=1 \
NVSHMEM_USE_GDRCOPY=0 \
NVSHMEM_PMIX_SUPPORT=0 \
NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
NVSHMEM_BUILD_TESTS=0 \
NVSHMEM_BUILD_EXAMPLES=0 \
NVSHMEM_BUILD_HYDRA_LAUNCHER=0 \
NVSHMEM_BUILD_TXZ_PACKAGE=0 \
NVSHMEM_BUILD_PYTHON_LIB=OFF \
cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=${NVSHMEM_PREFIX} -DMLX5_lib=/lib/x86_64-linux-gnu/libmlx5.so.1 && \
cmake --build build --target install --parallel 32 && \
cd ${CODESPACE} && git clone $(echo ${DEEP_EP_URL} | cut -d '@' -f 1) && \
cd ${CODESPACE}/DeepEP && \
git checkout $(echo ${DEEP_EP_URL} | cut -d '@' -f 2) && \
git submodule update --init --recursive --force

WORKDIR ${CODESPACE}/DeepEP

RUN NVSHMEM_DIR=${NVSHMEM_PREFIX} pip wheel -w ${DEEP_EP_DIR} -v --no-deps .
RUN pip wheel -w ${DEEP_EP_DIR} -v --no-deps .

# compile deep_gemm
FROM setup_env AS deep_gemm
Expand Down Expand Up @@ -192,7 +193,7 @@ COPY --from=flash_attn ${FLASH_ATTN_DIR} ${FLASH_ATTN_DIR}
COPY --from=adaptive_gemm ${ADAPTIVE_GEMM_DIR} ${ADAPTIVE_GEMM_DIR}
COPY --from=grouped_gemm ${GROUPED_GEMM_DIR} ${GROUPED_GEMM_DIR}
COPY --from=deep_ep ${DEEP_EP_DIR} ${DEEP_EP_DIR}
COPY --from=deep_ep ${NVSHMEM_PREFIX} ${NVSHMEM_PREFIX}
# COPY --from=deep_ep ${NVSHMEM_PREFIX} ${NVSHMEM_PREFIX}
COPY --from=deep_gemm ${DEEP_GEMM_DIR} ${DEEP_GEMM_DIR}
COPY --from=causal_conv1d ${CAUSAL_CONV1D_DIR} ${CAUSAL_CONV1D_DIR}

Expand All @@ -204,51 +205,114 @@ RUN unzip ${DEEP_EP_DIR}/*.whl -d ${PYTHON_SITE_PACKAGE_PATH}
RUN unzip ${DEEP_GEMM_DIR}/*.whl -d ${PYTHON_SITE_PACKAGE_PATH}
RUN unzip ${CAUSAL_CONV1D_DIR}/*.whl -d ${PYTHON_SITE_PACKAGE_PATH}

# install sglang and its runtime requirements
ARG SGLANG_VERSION
ARG DEFAULT_PYPI_URL

RUN --mount=type=secret,id=HTTPS_PROXY,env=https_proxy \
pip install sglang==${SGLANG_VERSION} sgl-kernel==0.3.14.post1 pybase64 orjson uvloop setproctitle msgspec \
compressed_tensors python-multipart torch_memory_saver \
grpcio-tools==1.75.1 hf_transfer interegular llguidance==0.7.11 \
xgrammar==0.1.24 blobfile==3.0.0 flashinfer_python==0.4.0 --no-cache-dir --no-deps
# RUN --mount=type=secret,id=HTTPS_PROXY,env=https_proxy \
RUN pip install pystack py-spy --no-cache-dir -i ${DEFAULT_PYPI_URL}

# install sglang and its runtime requirements
ENV XTUNER_SGLANG_ENVS_DIR=/envs/sglang

# RUN --mount=type=secret,id=HTTPS_PROXY,env=https_proxy \
RUN \
pip install --target ${XTUNER_SGLANG_ENVS_DIR} \
sglang==0.5.9 sgl-kernel==0.3.21 \
apache-tvm-ffi==0.1.9 \
anthropic==0.86.0 \
build==1.4.0 \
cuda-python==12.9.0 \
decord2==3.2.0 \
flashinfer_python==0.6.3 \
flashinfer_cubin==0.6.3 \
gguf==0.18.0 \
modelscope==1.35.3 \
nvidia-cutlass-dsl==4.4.2 \
openai-harmony==0.0.4 \
openai==2.6.1 \
outlines==0.1.11 \
quack-kernels==0.2.4 \
timm==1.0.16 \
torchao==0.9.0 \
torchaudio==2.9.1 \
torchcodec==0.8.0 \
xgrammar==0.1.32 \
smg-grpc-proto==0.4.5 \
grpcio==1.78.1 \
grpcio-reflection==1.78.1 \
grpcio-health-checking==1.80.0 \
pycryptodomex==3.23.0 \
lxml==6.0.2 \
cuda-bindings==12.9.6 \
cuda-pathfinder==1.5.0 \
nvidia-cudnn-frontend==1.21.0 \
lark==1.3.1 \
pycountry==26.2.16 \
airportsdata==20260315 \
outlines_core==0.1.26 \
torch-c-dlpack-ext==0.1.5 \
pyproject_hooks==1.2.0 \
huggingface_hub==0.36.2 \
torch_memory_saver==0.0.9 \
diskcache==5.6.3 distro==1.9.0 jiter==0.13.0 \
llguidance==0.7.11 blobfile==3.0.0 \
pybase64 orjson uvloop setproctitle msgspec partial_json_parser \
compressed_tensors python-multipart \
hf_transfer interegular --no-cache-dir --no-deps -i ${DEFAULT_PYPI_URL}

# install lmdeploy and its missing runtime requirements
ARG LMDEPLOY_VERSION
ARG LMDEPLOY_URL
ENV XTUNER_LMDEPLOY_ENVS_DIR=/envs/lmdeploy

# RUN --mount=type=secret,id=HTTPS_PROXY,env=https_proxy \
ARG LMDEPLOY_WHEELS=https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu128-cp312-cp312-manylinux2014_x86_64.whl
RUN --mount=type=secret,id=HTTPS_PROXY,env=https_proxy \
--mount=type=secret,id=NO_PROXY,env=no_proxy \
pip install fastapi fire openai outlines \
partial_json_parser ray[default] shortuuid uvicorn \
'pydantic>2' openai_harmony dlblas --no-cache-dir && \
pyzmq aiohttp cloudpickle prometheus_client protobuf numpy pillow einops tiktoken sentencepiece \
partial_json_parser 'ray[default]<3' shortuuid uvicorn pybase64 \
'pydantic>2' openai_harmony dlblas --target ${XTUNER_LMDEPLOY_ENVS_DIR} --no-cache-dir -i ${DEFAULT_PYPI_URL} && \
pip install xgrammar==0.1.32 timm!=1.0.23 --no-cache-dir -i ${DEFAULT_PYPI_URL} --no-deps && \
if [ -n "${LMDEPLOY_VERSION}" ]; then \
pip install lmdeploy==${LMDEPLOY_VERSION} --no-deps --no-cache-dir; \
# pip install lmdeploy==${LMDEPLOY_VERSION} --target ${XTUNER_LMDEPLOY_ENVS_DIR} --no-deps --no-cache-dir -i ${DEFAULT_PYPI_URL}; \
echo pip install ${LMDEPLOY_WHEELS} --target ${XTUNER_LMDEPLOY_ENVS_DIR} --no-deps --no-cache-dir -i ${DEFAULT_PYPI_URL}; \
pip install ${LMDEPLOY_WHEELS} --target ${XTUNER_LMDEPLOY_ENVS_DIR} --no-deps --no-cache-dir -i ${DEFAULT_PYPI_URL}; \
else \
git clone $(echo ${LMDEPLOY_URL} | cut -d '@' -f 1) && \
cd ${CODESPACE}/lmdeploy && \
git checkout $(echo ${LMDEPLOY_URL} | cut -d '@' -f 2) && \
pip install . -v --no-deps --no-cache-dir; \
pip install . -v --target ${XTUNER_LMDEPLOY_ENVS_DIR} --no-deps --no-cache-dir -i ${DEFAULT_PYPI_URL}; \
fi

## install xtuner
ARG XTUNER_URL
ARG XTUNER_COMMIT
#RUN --mount=type=secret,id=HTTPS_PROXY,env=https_proxy \
# RUN --mount=type=secret,id=HTTPS_PROXY,env=https_proxy \
# git clone $(echo ${XTUNER_URL} | cut -d '@' -f 1) && \
# cd ${CODESPACE}/xtuner && \
# git checkout $(echo ${XTUNER_URL} | cut -d '@' -f 2)
COPY . ${CODESPACE}/xtuner

WORKDIR ${CODESPACE}/xtuner
RUN --mount=type=secret,id=HTTPS_PROXY,env=https_proxy \
pip install .[all] -v --no-cache-dir

# Install custom .pth file for conditional lmdeploy and sglang path injection
RUN cp -r .dev_scripts/xtuner_rl_path* ${PYTHON_SITE_PACKAGE_PATH}/

# RUN --mount=type=secret,id=HTTPS_PROXY,env=https_proxy \
RUN pip install .[all] -v --no-cache-dir -i ${DEFAULT_PYPI_URL}

WORKDIR ${CODESPACE}

# nccl update for torch 2.6.0
RUN --mount=type=secret,id=HTTPS_PROXY,env=https_proxy \
if [ "x${TORCH_VERSION}" = "x2.6.0" ]; then \
pip install nvidia-nccl-cu12==2.25.1 --no-cache-dir; \
# RUN --mount=type=secret,id=HTTPS_PROXY,env=https_proxy \
RUN if [ "x${TORCH_VERSION}" = "x2.6.0" ]; then \
pip install nvidia-nccl-cu12==2.25.1 --no-cache-dir -i ${DEFAULT_PYPI_URL}; \
fi

# cudnn update for torch 2.9.1
# RUN --mount=type=secret,id=HTTPS_PROXY,env=https_proxy \
RUN if [ "x${TORCH_VERSION}" = "x2.9.1" ]; then \
pip install nvidia-cudnn-cu12==9.15.1.9 --no-cache-dir -i ${DEFAULT_PYPI_URL}; \
fi

# setup sysctl
Expand Down
2 changes: 1 addition & 1 deletion autotest/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ default_config:
gpus_per_task: 8
cpus_per_task: 120
memory_per_task: 512
image: ailab-llmrazor/xtuner:pt28_latest
image: ailab-llmrazor/xtuner_tmp:pt29_20260414_c8f6fa1
envs:
- HF_HUB_CACHE=/mnt/shared-storage-user/auto-eval-pipeline/opencompass/models/hf_hub
eval:
Expand Down
6 changes: 3 additions & 3 deletions autotest/config/gptoss.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,27 +18,27 @@


gptoss_cfg = GptOss21BA3P6Config(
compile_cfg=False,
rope_scaling_cfg=RopeScalingConfig(
type="yarn",
beta_fast=16.0,
beta_slow=1.05,
factor=16.0,
original_max_position_embeddings=4096,
truncate=True,
)
),
)
optim_cfg = AdamWConfig(lr=6e-05)
lr_cfg = LRConfig(lr_type="cosine", lr_min=1e-6)
fsdp_cfg = FSDPConfig(
torch_compile=False,
cpu_offload=False,
ep_size=gptoss_cfg.ep_size,
)

dataset_config = [
{
"dataset": DatasetConfig(name="alpaca", anno_path=ALPACA_PATH, sample_ratio=1.0),
"tokenize_fn": OpenaiTokenizeFunctionConfig(chat_template='gpt-oss', max_length=16384),
"tokenize_fn": OpenaiTokenizeFunctionConfig(chat_template="gpt-oss", max_length=16384),
},
]

Expand Down
5 changes: 2 additions & 3 deletions autotest/config/npu_qwen3.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,10 @@
ALPACA_PATH = os.environ["ALPACA_PATH"]


moe_cfg = Qwen3MoE30BA3Config()
moe_cfg = Qwen3MoE30BA3Config(compile_cfg=False)
optim_cfg = AdamWConfig(lr=6e-05)
lr_cfg = LRConfig(lr_type="cosine", lr_min=1e-6)
fsdp_cfg = FSDPConfig(
torch_compile=False,
cpu_offload=False,
ep_size=moe_cfg.ep_size,
)
Expand All @@ -34,7 +33,7 @@

dataloader_config = DataloaderConfig(pack_max_length=16384)

loss_cfg = CELossConfig(mode="chunk", chunk_size=1024) # CELossConfig()
loss_cfg = CELossConfig(mode="chunk", chunk_size=1024) # CELossConfig()


trainer = TrainerConfig(
Expand Down
5 changes: 2 additions & 3 deletions autotest/config/npu_qwen3_16nums.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,10 @@
ALPACA_PATH = os.environ["ALPACA_PATH"]


moe_cfg = Qwen3MoE30BA3Config()
moe_cfg = Qwen3MoE30BA3Config(compile_cfg=False)
optim_cfg = AdamWConfig(lr=6e-05)
lr_cfg = LRConfig(lr_type="cosine", lr_min=1e-6)
fsdp_cfg = FSDPConfig(
torch_compile=False,
cpu_offload=False,
ep_size=moe_cfg.ep_size,
)
Expand All @@ -34,7 +33,7 @@

dataloader_config = DataloaderConfig(pack_max_length=16384)

loss_cfg = CELossConfig(mode="chunk", chunk_size=1024) # CELossConfig()
loss_cfg = CELossConfig(mode="chunk", chunk_size=1024) # CELossConfig()


trainer = TrainerConfig(
Expand Down
3 changes: 1 addition & 2 deletions autotest/config/npu_qwen3_moe_30BA3_ep8.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,10 @@
ALPACA_PATH = os.environ["ALPACA_PATH"]


moe_cfg = Qwen3MoE30BA3Config(ep_size=8)
moe_cfg = Qwen3MoE30BA3Config(ep_size=8, compile_cfg=False)
optim_cfg = AdamWConfig(lr=6e-05)
lr_cfg = LRConfig(lr_type="cosine", lr_min=1e-6)
fsdp_cfg = FSDPConfig(
torch_compile=True,
cpu_offload=False,
ep_size=moe_cfg.ep_size,
)
Expand Down
Loading
Loading