Commits (114)
dfdd382
Changed VERSION to 2.13.0.dev0
ptrendx Jan 20, 2026
27fc168
[Common] Enable determinism for cuDNN >= 9.18.1 on Blackwell (#2584)
cyanguwa Jan 20, 2026
fbb16f4
[Common] Tuned NVFP4 cast kernel (#2412)
Oleg-Goncharov Jan 21, 2026
36f4e45
Fixed the year to 2026 (#2611)
Oleg-Goncharov Jan 21, 2026
605786f
[pyTorch] CPU performance optimizations (#2439)
ptrendx Jan 21, 2026
8bf37f0
[JAX] Fix cb.CUDAOptions usage for Triton 3.6.0 (#2610)
jberchtold-nvidia Jan 22, 2026
3d46bf6
Permutation to always return group_size/tokens_per_expert (#2613)
tdophung Jan 22, 2026
0f0e229
[PyT] Update THD sink attention logic for cudnn >=9.18.0 (#2568)
cuichenx Jan 22, 2026
c6a92a4
Add support for SWA (left, right) with FusedAttention (#2477)
sudhakarsingh27 Jan 22, 2026
52ee5ea
Fix bugs in permutation custom partitioning (#2617)
tdophung Jan 23, 2026
a0a89a8
[Common] Disabled the tuned NVFP4 kernels (#2615)
Oleg-Goncharov Jan 23, 2026
7259276
[PyTorch] Support user-defined op fusions (#2597)
timmoon10 Jan 25, 2026
2dbfbc7
fix(examples): te_llama compatibility with transformers >= 4.57 (#2572)
sbhavani Jan 26, 2026
2104e4c
[JAX] Use "nyu-mll/glue" instead of "glue" for encoder datasets to fi…
jberchtold-nvidia Jan 27, 2026
f04b094
[PyTorch] ONNX test fix + export for FP8 attention (#2598)
pggPL Jan 28, 2026
b9f4013
[common] Add support for cuBLASLt GEMM for GroupedTensor (#2502)
pggPL Jan 28, 2026
f8cca8b
[Pytorch] Fix wheel test (#2635)
pggPL Jan 29, 2026
c3769cb
Fix minimum version of cublas for grouped gemm (#2631)
pggPL Jan 30, 2026
3ceb248
More detailed documentation for recipes (#2343)
pggPL Feb 2, 2026
94ba75d
Support building with headers from nvidia wheels (#2623)
vmarkovtsev Feb 3, 2026
29b84c1
[Common] Fix NVFP4 tuned-kernel numerics (#2639)
Oleg-Goncharov Feb 3, 2026
74faf7e
[PyTorch Debug] NVFP4 debug stats support (#2296)
pggPL Feb 3, 2026
59f6f38
[JAX] Update JAX container in readme (#2648)
jberchtold-nvidia Feb 4, 2026
71971e3
Fix exp2f_rcp to properly handle nan and 0xFE cases (#2647)
kainzhong Feb 6, 2026
7393947
[Common] MXFP8 kernel for grouped tensors (#2586)
Oleg-Goncharov Feb 6, 2026
dccf67e
[Common] Bucket batch size with higher granularity for THD (#2653)
cyanguwa Feb 7, 2026
c1a0c97
[PyTorch][Core][JAX] Expand troubleshooting docs (#2602)
jberchtold-nvidia Feb 9, 2026
b841243
[PyTorch Debug] Skip logging stats if unsupported (#2652)
pggPL Feb 9, 2026
2894e49
[Pytorch] Add get_backward_dw_params api for TE module (#2614)
Wohox Feb 9, 2026
b09ff7e
[pyTorch] Fix the compilation warnings (#2663)
ptrendx Feb 10, 2026
01ac7f8
[Pytorch] Make test script generate checkpoints if they don't exist (…
kainzhong Feb 10, 2026
8d15258
Fix Broken Quickstart Links (#2641)
faradawn Feb 11, 2026
8ebb47e
Fix on TE to support Mcore Vision Encoder CUDA Graph (#2657)
tomlifu Feb 11, 2026
ac81c85
[PyTorch] Python `GroupedTensor` (#2654)
ksivaman Feb 11, 2026
402ea54
[C] NVFP4 quantization for `GroupedTensor` (#2655)
ksivaman Feb 11, 2026
c4175fc
fix(build): Handle namespace packages for PyPI CUDA detection (#2580)
sbhavani Feb 12, 2026
93d51c8
[Common] Fuse pre-swizzling into grouped MXFP8 quantization kernel (#…
Oleg-Goncharov Feb 12, 2026
3774aa3
[PyTorch] Add ops for MoE grouped MLP (#2664)
timmoon10 Feb 12, 2026
33ca615
Add sigmoid GLU (#2656)
singleheart Feb 12, 2026
cd098e4
fix: correct FusedAdam copy-paste in FusedSGD error messages (#2675)
Mr-Neutr0n Feb 12, 2026
496620a
Get rid of nvshmem dependency for cuBLASMp integration (#2661)
vcherepanov-nv Feb 12, 2026
f844905
[PyTorch] Make grouped weights opt-in (#2678)
ksivaman Feb 13, 2026
5d112e3
[JAX] TE Permutation integration to Maxtext (#2672)
tdophung Feb 13, 2026
fa68781
Fix `build_tools` missing from sdist causing `uv` cached installs to …
hemildesai Feb 17, 2026
7e48fa1
[JAX] Debugging inspect utility (#2651)
jberchtold-nvidia Feb 17, 2026
f122b07
Changed VERSION to 2.14.0.dev0
ptrendx Feb 18, 2026
2d0d276
[PyT] Plumbing correct bias dims from TE to cudnn, while adding suppo…
KshitijLakhani Feb 18, 2026
63defea
Update cudnn-frontend to v1.18 (#2689)
cyanguwa Feb 20, 2026
e583222
[PyTorch] Documentation for op fuser API (#2447)
timmoon10 Feb 20, 2026
57b5b60
Fix race condition in RHT amax kernels (#2695)
ksivaman Feb 21, 2026
e8f7c5a
Add and verify support for `deterministic` fp8 dpa/mha on SM100 (#2621)
sudhakarsingh27 Feb 24, 2026
39b6dd9
[PyTorch Debug] Custom feature tutorial. (#2216)
pggPL Feb 24, 2026
7d1de30
Fix vermin pre-commit hook (#2699)
pstjohn Feb 24, 2026
459e7cf
[Common][PyTorch] Fuse scaling and unscaling of bf16 momentums into k…
yaox12 Feb 24, 2026
9eb982e
Fix incorrect MNNVL fabric check (#2626)
nvcastet Feb 24, 2026
f8b271f
[JAX] Fix FSDP when FSDP+EP is active (#2649)
jberchtold-nvidia Feb 24, 2026
7222d87
[PyTorch Debug] Support precision debug tools for fp8 model parameter…
pggPL Feb 25, 2026
df0ef6e
remove deprecated qkv/kv_packed apis (#2696)
sudhakarsingh27 Feb 25, 2026
842b770
[Common] Remove volatile keyword in fused router kernel utils (#2683)
denera Feb 26, 2026
ad56283
[CI] Cancel on concurrency (#2708)
yaox12 Feb 27, 2026
b345941
[PyTorch] `GroupedTensor` integration (#2600)
ksivaman Feb 27, 2026
a9a9b3a
[Common][PyTorch] Enhance the fused router and unify the precision (#…
yaox12 Feb 27, 2026
3ecb5bf
[PyTorch] Fix L3 FA tests (#2709)
cyanguwa Feb 28, 2026
f508e66
[PyTorch] Remove `is_first_microbatch` setting after cudagraph warmup…
buptzyb Mar 2, 2026
537f134
[Common][PyTorch] Fix normalization for `fused_score_for_moe_aux_loss…
Autumn1998 Mar 2, 2026
bba7bf6
[PyTorch] Support cuda graph capturing offloading module (#2435)
lhb8125 Mar 2, 2026
3275e1a
[JAX] CGEMM with Shardy (#2714)
phu0ngng Mar 2, 2026
9dac78e
CPU Overhead Optimizations (#2559)
vthumbe1503 Mar 3, 2026
c68ec31
Add fast_set_attr to modules not inheriting from base.py (#2724)
vthumbe1503 Mar 3, 2026
39d249b
[JAX] Remove GSPMD tests + adding guards and warning msg for GSPMD ru…
phu0ngng Mar 3, 2026
a3bc040
NVFP4 primary weight support (#2691)
WanZzzzzz Mar 3, 2026
bf3201a
[PyTorch] Support single parameter for `GroupedLinear` (#2731)
ksivaman Mar 4, 2026
00ba0b4
pass params_dtype to qk_norm creation (#2718)
pstjohn Mar 4, 2026
505b896
[JAX] GSPMD Deprecation Warning - Only trigger when the primitive is …
phu0ngng Mar 4, 2026
139c863
Add fused_adam, quantized_model_init, and fsdp2 example (#2698)
pstjohn Mar 4, 2026
56c2fa6
[JAX] Support calling MOE router kernels from JAX side (#2711)
tdophung Mar 4, 2026
d2e4755
[PyTorch] Skip `test_nvfp4_partial_cast_matches_full` test when NVFP4…
ksivaman Mar 5, 2026
145e88c
Add multi-precision training support to FSDP script (#2662)
aagallo Mar 5, 2026
d9152b0
[PyTorch] Support `GroupedTensor` torch ops for DDP and distributed o…
ksivaman Mar 5, 2026
d226ce2
[JAX] Integrate BF16 Grouped GEMM with on-device group sizes (#2680)
jberchtold-nvidia Mar 5, 2026
d40b9de
WAR sort_chunks_by_index intermittent failures in L0 JAX unittest pa…
tdophung Mar 5, 2026
5fd5c35
Fix FP8 block scaling with sequence parallel (#2637)
cuichenx Mar 8, 2026
ab9d60e
[PyTorch] Zero-initialize learnable softmax_offset in DotProductAtten…
fjosw Mar 8, 2026
e9ea352
docs: update cuDNN sliding window attention support (#2624)
sbhavani Mar 8, 2026
6638fef
[JAX] GEMM tex and FFI cleanup (#2739)
phu0ngng Mar 8, 2026
34a6c0a
Fix Flash Attention 3 API compatibility for window size parameters (#…
jhvmhg Mar 9, 2026
6e0085a
[Common] Remove redundant grad_logits zero-initialization in fused ro…
roycho96 Mar 9, 2026
f64941a
Enable dequantization from MXFP8 tensor with only columnwise data (#2…
ptrendx Mar 10, 2026
e6d97ff
[PyTorch] Fix cross_entropy_forward stride guard for non-contiguous i…
Bias92 Mar 10, 2026
7c2aa2c
[Common] MOE Split dBias (#2674)
Oleg-Goncharov Mar 10, 2026
3846bf7
Fix deploy nightly docs issue (#2636)
pggPL Mar 10, 2026
d32f9e4
[JAX] Fix get_seqlens_and_offsets() to accept vmapped seg ids and non…
KshitijLakhani Mar 11, 2026
61d5865
[NVFP4][MOE] Add unfused quantization fallback when input shape is no…
zhongbozhu Mar 11, 2026
7545d8c
[PyTorch debug] Fix issue with tp_group=None (#2733)
pggPL Mar 11, 2026
107f558
Documentation for cpu offloading (#2520)
pggPL Mar 11, 2026
d5ce416
Add guard at lowest JAX version that still supports triton kernel cal…
tdophung Mar 11, 2026
f6001c4
Support configurable number of philox rounds for stochastic rounding …
ksivaman Mar 11, 2026
61f9594
[All] Added better error messages (#2705)
ptrendx Mar 11, 2026
c021e7e
[PyTorch] Fix fuser so it releases tensors properly (#2750)
kainzhong Mar 11, 2026
7fb10d3
[PyTorch] Add dtype information to QuantizedTensorStorage class (#2676)
ptrendx Mar 11, 2026
4c5b1a2
[JAX] Change dtype of intermediate result aval of fused_topk_and_scor…
tdophung Mar 11, 2026
06a23e3
Initial commit to pass scale as Tensor for multi_tensor_scale op (#2594)
vasunvidia Mar 12, 2026
ef703e5
[Core] MXFP8 grouped GEMM + tensor-scaled FP8 fixes (#2748)
jberchtold-nvidia Mar 12, 2026
67898a7
Cherry pick "Adds dst.dtype information in copy_ method of quantized …
ptrendx Mar 12, 2026
134304e
Fused kernel for calculating offsets from first dim splits (#2755)
ksivaman Mar 12, 2026
a5d7464
Added new users to CI (#2756)
ptrendx Mar 12, 2026
6a68c73
[PyTorch] Error out if constructing `LayerNormLinear` with row tensor…
timmoon10 Mar 12, 2026
14c29da
[JAX] Collective GEMM with FP8 and MXFP8 support (#2740)
phu0ngng Mar 13, 2026
fcceeb9
[Pytorch] Add QuantizedTensor support in FusedAdam.step for MXFP8Bloc…
jomitchellnv Mar 13, 2026
306e853
add .claude to gitignore (#2762)
pstjohn Mar 13, 2026
b7214fd
Fix for async dcp checkpointing with Float8Tensors (#2721)
pstjohn Mar 15, 2026
708d7c1
Pytorch binding for cublas grouped gemm + Grouped Bias Support + Grou…
vthumbe1503 Mar 16, 2026
2156e61
Merge commit '708d7c160ad6b2bf44c9c597083d4cbb4860f068' from upstream
ipanfilo Apr 14, 2026
11ab82a
Resolve merging errors, fixed build and load, restore codepaths missi…
ipanfilo Apr 21, 2026
4 changes: 4 additions & 0 deletions .github/workflows/build.yml
@@ -7,6 +7,10 @@ name: 'Build'
on:
pull_request:
workflow_dispatch:
concurrency:
# Group by workflow name + PR number (for PRs) or ref (for branch/tag pushes)
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true
jobs:
core:
name: 'Core'
7 changes: 4 additions & 3 deletions .github/workflows/deploy_nightly_docs.yml
@@ -7,6 +7,7 @@ name: Deploy nightly docs
on:
push:
branches: [ "main" ]
workflow_dispatch:
jobs:
build:
uses: ./.github/workflows/docs.yml
@@ -21,9 +22,8 @@ jobs:
name: "te_docs"
path: "html"
- name: Prepare for pages
uses: actions/upload-pages-artifact@v1.0.7
uses: actions/upload-pages-artifact@v3
with:
name: github-pages
path: "html"
deploy:
needs: prepare
@@ -36,4 +36,5 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Deploy
uses: actions/deploy-pages@v2.0.0
id: deployment
uses: actions/deploy-pages@v4
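With ``workflow_dispatch`` added to the triggers above, the nightly docs deployment can also be started manually, e.g. with the GitHub CLI (repo slug assumed):

    gh workflow run deploy_nightly_docs.yml --repo NVIDIA/TransformerEngine --ref main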
4 changes: 4 additions & 0 deletions .github/workflows/docs.yml
@@ -8,6 +8,10 @@ on:
pull_request:
workflow_dispatch:
workflow_call:
concurrency:
# Group by workflow name + PR number (for PRs) or ref (for branch/tag pushes)
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true
jobs:
build_docs:
name: 'Build'
4 changes: 4 additions & 0 deletions .github/workflows/lint.yml
@@ -7,6 +7,10 @@ name: 'Lint'
on:
pull_request:
workflow_dispatch:
concurrency:
# Group by workflow name + PR number (for PRs) or ref (for branch/tag pushes)
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true
jobs:
pytorch_cpplint:
name: 'PyTorch C++'
2 changes: 2 additions & 0 deletions .github/workflows/trigger-ci.yml
@@ -58,6 +58,8 @@ jobs:
|| github.actor == 'vthumbe1503'
|| github.actor == 'shengfangd'
|| github.actor == 'kainzhong'
|| github.actor == 'cspades'
|| github.actor == 'jomitchellnv'
)
steps:
- name: Check if comment is issued by authorized person
6 changes: 5 additions & 1 deletion .gitignore
@@ -56,4 +56,8 @@ artifacts/
**/times.csv
transformer_engine/build_info.txt
transformer_engine/common/util/hip_nvml.*
*.DS_Store
.DS_Store
.rsync-filter
.codex/
.cline_storage/
.claude/
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -43,4 +43,4 @@ repos:
rev: c75aca72f4e85c6e47252139e8695f1c8b5f9ae3
hooks:
- id: vermin
args: ['-t=3.10', '--violations']
args: ['-t=3.10-', '--violations']
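The trailing ``-`` tells vermin to treat 3.10 as a minimum rather than an exact target, so code whose detected minimum is newer than 3.10 no longer fails the hook. A local spot check (file path illustrative):

    pip install vermin
    vermin -t=3.10- --violations transformer_engine/pytorch/__init__.py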
2 changes: 1 addition & 1 deletion 3rdparty/cudnn-frontend
Submodule cudnn-frontend updated 221 files
1 change: 1 addition & 0 deletions MANIFEST.in
@@ -1 +1,2 @@
recursive-include transformer_engine/common/include *.*
recursive-include build_tools *.py *.txt
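Since ``build_tools`` is now shipped in the sdist (addressing the cached ``uv`` install failure fixed in commit fa68781), one way to verify locally (commands illustrative):

    python -m build --sdist
    tar tzf dist/transformer_engine-*.tar.gz | grep build_tools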
46 changes: 42 additions & 4 deletions README.rst
@@ -458,7 +458,7 @@ Flax
for _ in range(10):
loss, (param_grads, other_grads) = fwd_bwd_fn(params, other_variables, inp)

For a more comprehensive tutorial, check out our `Quickstart Notebook <https://github.com/NVIDIA/TransformerEngine/blob/main/docs/examples/quickstart.ipynb>`_.
For a more comprehensive tutorial, check out our `Getting Started Guide <https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/getting_started.html>`_.

.. overview-end-marker-do-not-remove

@@ -496,15 +496,22 @@ For example to use the NGC PyTorch container interactively,

.. code-block:: bash

docker run --gpus all -it --rm nvcr.io/nvidia/pytorch:25.08-py3
docker run --gpus all -it --rm nvcr.io/nvidia/pytorch:26.01-py3

For example to use the NGC JAX container interactively,

.. code-block:: bash

docker run --gpus all -it --rm nvcr.io/nvidia/jax:25.08-py3
docker run --gpus all -it --rm nvcr.io/nvidia/jax:26.01-py3

Where 25.08 (corresponding to August 2025 release) is the container version.
Where 26.01 (corresponding to January 2026 release) is the container version.

We recommend updating to the latest NGC containers, available here:

* https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch
* https://catalog.ngc.nvidia.com/orgs/nvidia/containers/jax

If you run any examples, please ensure you are using a matching version of Transformer Engine. Transformer Engine is pre-built and packaged inside the containers, with examples available at ``/opt/transformerengine`` or ``/opt/transformer-engine``. If you would like to use examples from the TE main branch and are running into import errors, please try the latest pip package or build from source, although NGC containers are recommended for ease of use for most users.

**Benefits of using NGC containers:**

@@ -628,6 +635,37 @@ Troubleshooting
cd transformer_engine
pip install -v -v -v --no-build-isolation .

**Problems using UV or Virtual Environments:**

1. **Import Error:**

* **Symptoms:** Cannot import ``transformer_engine``
* **Solution:** Ensure your UV environment is active and that you have used ``uv pip install --no-build-isolation <te_pypi_package_or_wheel_or_source_dir>`` instead of a regular pip install to your system environment.

2. **cuDNN Sublibrary Loading Failed:**

* **Symptoms:** Errors at runtime with ``CUDNN_STATUS_SUBLIBRARY_LOADING_FAILED``
* **Solution:** This can occur when TE is built against the container's system installation of cuDNN while the virtual environment pulls in the ``nvidia-cudnn-cu12/cu13`` pip packages. To resolve this, when building TE from source, set the following environment variables to point to the cuDNN inside your virtual environment.


.. code-block:: bash

export CUDNN_PATH=$(pwd)/.venv/lib/python3.12/site-packages/nvidia/cudnn
export CUDNN_HOME=$CUDNN_PATH
export LD_LIBRARY_PATH=$CUDNN_PATH/lib:$LD_LIBRARY_PATH

3. **Building Wheels:**

* **Symptoms:** Regular TE installs work correctly but UV wheel builds fail at runtime.
* **Solution:** Ensure that ``uv build --wheel --no-build-isolation -v`` is used for the wheel build and that ``--no-build-isolation`` is also passed when installing the wheel; see the sketch below. Use ``-v`` for verbose output to verify that TE is not pulling in a version of PyTorch or JAX that differs from the UV environment's.
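For example, a minimal build-and-install flow under those flags (wheel filename illustrative):

.. code-block:: bash

    # Build the wheel without build isolation so it compiles against the
    # environment's PyTorch/JAX instead of an isolated build environment.
    uv build --wheel --no-build-isolation -v

    # Install the built wheel, again without build isolation.
    uv pip install --no-build-isolation dist/transformer_engine-*.whl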

**JAX-specific Common Issues and Solutions:**

1. **FFI Issues:**

* **Symptoms:** ``No registered implementation for custom call to <some_te_ffi> for platform CUDA``
* **Solution:** Ensure ``--no-build-isolation`` is used during installation. If pre-building wheels, ensure that the wheel is both built and installed with ``--no-build-isolation``. See "Problems using UV or Virtual Environments" above if using UV.
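If the error persists, a clean reinstall without build isolation sometimes helps; a sketch (the ``[jax]`` extra is an assumption here, matching the framework in use):

.. code-block:: bash

    pip uninstall -y transformer_engine
    pip install --no-build-isolation transformer_engine[jax]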

.. troubleshooting-end-marker-do-not-remove

Breaking Changes
2 changes: 1 addition & 1 deletion build_tools/VERSION.txt
@@ -1 +1 @@
2.12.0.dev0
2.14.0.dev0
1 change: 1 addition & 0 deletions build_tools/hipify/custom_map.json
@@ -12,6 +12,7 @@
"__nv_fp8_e4m3" : "te_hip_fp8_e4m3",
"cuda::getCurrentCUDAStream" : "hip::getCurrentHIPStreamMasqueradingAsCUDA",
"at::cuda::CUDAGuard" : "at::hip::HIPGuardMasqueradingAsCUDA",
"c10::cuda::" : "c10::hip::",
"__nv_fp4_e2m1" : "__hip_fp4_e2m1",
"__nv_fp4x2_e2m1" : "__hip_fp4x2_e2m1",
"__nv_fp4x4_e2m1" : "__hip_fp4x4_e2m1",
7 changes: 1 addition & 6 deletions build_tools/jax.py
@@ -59,12 +59,7 @@ def xla_path() -> str:
Throws FileNotFoundError if XLA source is not found."""

try:
import jax
from packaging import version
if version.parse(jax.__version__) >= version.parse("0.5.0"):
from jax import ffi
else:
from jax.extend import ffi
from jax import ffi
except ImportError:
if os.getenv("XLA_HOME"):
xla_home = Path(os.getenv("XLA_HOME"))
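If JAX cannot be imported at build time, ``xla_path()`` falls back to the ``XLA_HOME`` environment variable; a sketch of that path (checkout location hypothetical):

    export XLA_HOME=/opt/xla    # local checkout of openxla/xla sources
    pip install --no-build-isolation .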
8 changes: 6 additions & 2 deletions build_tools/utils.py
@@ -306,9 +306,10 @@ def nvcc_path() -> Tuple[str, str]:
def get_cuda_include_dirs() -> Tuple[str, str]:
"""Returns the CUDA header directory."""

force_wheels = bool(int(os.getenv("NVTE_BUILD_USE_NVIDIA_WHEELS", "0")))
# If cuda is installed via toolkit, all necessary headers
# are bundled inside the top level cuda directory.
if cuda_toolkit_include_path() is not None:
if not force_wheels and cuda_toolkit_include_path() is not None:
return [cuda_toolkit_include_path()]

# Use pip wheels to include all headers.
@@ -317,7 +318,10 @@ def get_cuda_include_dirs() -> Tuple[str, str]:
except ModuleNotFoundError as e:
raise RuntimeError("CUDA not found.")

cuda_root = Path(nvidia.__file__).parent
if nvidia.__file__ is not None:
cuda_root = Path(nvidia.__file__).parent
else:
cuda_root = Path(nvidia.__path__[0]) # namespace
return [
subdir / "include"
for subdir in cuda_root.iterdir()
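Per the diff above, setting ``NVTE_BUILD_USE_NVIDIA_WHEELS=1`` forces header discovery from the ``nvidia-*`` pip wheels even when a CUDA toolkit is present; for example (invocation illustrative):

    NVTE_BUILD_USE_NVIDIA_WHEELS=1 pip install --no-build-isolation .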
9 changes: 6 additions & 3 deletions ci/pytorch.sh
@@ -12,7 +12,7 @@ TEST_DIR=${TE_PATH}tests/pytorch
#: ${TEST_WORKERS:=4}

install_prerequisites() {
pip install 'numpy>=1.22.4' pandas
pip install 'numpy>=1.22.4' pandas safetensors
rc=$?
if [ $rc -ne 0 ]; then
script_error "Failed to install test prerequisites"
@@ -100,8 +100,11 @@ run_test_config_mgpu(){
run_default_fa 2 distributed/test_numerics.py
run_default_fa 1 distributed/test_torch_fsdp2.py
run_default_fa 2 distributed/test_torch_fsdp2_fp8.py
run_default_fa_lbl "flash" 3 attention/test_attention_with_cp.py -k "with_flash"
run_default_fa_lbl "fused" 2 attention/test_attention_with_cp.py -k "with_fused"
if [ $_fus_attn = ck ]; then
run 2 attention/test_attention_with_cp.py -k "with_fused"
elif [ $_fus_attn = flash ]; then
run 3 attention/test_attention_with_cp.py -k "with_flash"
fi
}

run_benchmark() {
134 changes: 134 additions & 0 deletions docs/_static/css/diagram-colors.css
@@ -0,0 +1,134 @@
/* Diagram color definitions for Transformer Engine documentation */

/* High precision (BF16/FP16) elements */
.hp {
fill: #ede7f6;
stroke: #673ab7;
stroke-width: 2;
}

/* FP8 precision elements */
.fp8 {
fill: #fff8e1;
stroke: #ffa726;
stroke-width: 2;
}

/* GEMM/computation operations */
.gemm {
fill: #ffe0b2;
stroke: #fb8c00;
stroke-width: 2.5;
}

/* Quantization operations */
.quantize {
fill: #e8f5e9;
stroke: #66bb6a;
stroke-width: 2;
}

/* Amax computation operations */
.amax {
fill: #e1f5fe;
stroke: #039be5;
stroke-width: 2;
}

/* Text styles */
.text {
font-family: 'Segoe UI', Arial, sans-serif;
font-size: 14px;
text-anchor: middle;
fill: #212121;
}

.small-text {
font-family: 'Segoe UI', Arial, sans-serif;
font-size: 14px;
text-anchor: middle;
fill: #757575;
}

.label {
font-family: 'Segoe UI', Arial, sans-serif;
font-size: 14px;
text-anchor: middle;
fill: #424242;
}

.title {
font-family: 'Segoe UI', Arial, sans-serif;
font-size: 18px;
font-weight: 600;
text-anchor: middle;
fill: #212121;
}

.section-title {
font-family: 'Segoe UI', Arial, sans-serif;
font-size: 15px;
font-weight: 600;
text-anchor: middle;
}

/* Arrows */
/* Note: marker-end references #arrowhead marker which must be defined in each SVG's <defs> section */
.arrow {
stroke: #616161;
stroke-width: 2;
fill: none;
marker-end: url(#arrowhead);
}

/* Additional box and element styles */
.box-blue {
fill: #e3f2fd;
stroke: #1976d2;
stroke-width: 2;
}

.box-orange {
fill: #fff3e0;
stroke: #f57c00;
stroke-width: 2;
}

.box-green {
fill: #c8e6c9;
stroke: #388e3c;
stroke-width: 2;
}

.box-dashed {
stroke-dasharray: 5,5;
}

/* LayerNorm specific */
.layernorm {
fill: #b3e5fc;
stroke: #0277bd;
stroke-width: 2.5;
}

/* Fused layers */
.fused {
fill: #b2dfdb;
stroke: #00695c;
stroke-width: 3;
}

/* Generic computation blocks */
.computation {
fill: #f5f5f5;
stroke: #757575;
stroke-width: 2;
}

/* FP32 precision (alternative red) */
.fp32 {
fill: #ffcdd2;
stroke: #d32f2f;
stroke-width: 2.5;
}
