intel · ai-fw-intg · May 28, 2026 · May 28, 2026 · May 28, 2026 · May 28, 2026
diff --git a/.github/workflows/linux_cuda_ci.yml b/.github/workflows/linux_cuda_ci.yml
@@ -27,9 +27,9 @@ jobs:
       build_config: Release
       architecture: x64
       dockerfile_path: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda
-      docker_build_args: '--build-arg BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc14:20251017.1'
-      docker_image_repo: onnxruntimecuda12manylinuxbuild
-      extra_build_flags: '--use_binskim_compliant_compile_flags --build_wheel --parallel --nvcc_threads 4 --flash_nvcc_threads 4 --cuda_version=12.8 --cuda_home=/usr/local/cuda-12.8 --cudnn_home=/usr/local/cuda-12.8 --enable_cuda_profiling --build_java --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON'
+      docker_build_args: '--build-arg BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda13_x64_almalinux8_gcc14:20251107.1'
+      docker_image_repo: onnxruntimecuda13manylinuxbuild
+      extra_build_flags: '--use_binskim_compliant_compile_flags --build_wheel --parallel --nvcc_threads 4 --flash_nvcc_threads 4 --cuda_version=13.0 --cuda_home=/usr/local/cuda-13.0 --cudnn_home=/usr/local/cuda-13.0 --enable_cuda_profiling --build_java --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON'
       python_path_prefix: 'PATH=/opt/python/cp310-cp310/bin:$PATH'
       run_tests: false            # <<< Do not run tests in this job
       upload_build_output: true   # <<< Upload the build/Release directory
@@ -57,8 +57,8 @@ jobs:
         id: build_docker_image_step
         with:
           dockerfile: ${{ github.workspace }}/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda
-          image-name: ghcr.io/microsoft/onnxruntime/onnxruntimecuda12manylinuxbuild
-          build-args: '--build-arg BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc14:20251017.1'
+          image-name: ghcr.io/microsoft/onnxruntime/onnxruntimecuda13manylinuxbuild
+          build-args: '--build-arg BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda13_x64_almalinux8_gcc14:20251107.1'
           push: true
           azure-container-registry-name: onnxruntimebuildcache
         env:
@@ -91,6 +91,15 @@ jobs:
             echo "Warning: perms.txt not found in artifact."
           fi
 
+      # Verify the GPU is accessible inside Docker before running the full test suite.
+      # If the NVIDIA Container Toolkit fails to expose /dev/nvidia* devices,
+      # tests will fail with "CUDA failure 100" and waste 10+ minutes.
+      - name: Verify GPU access in Docker
+        run: |
+          docker run --rm --gpus all \
+            "${{ steps.build_docker_image_step.outputs.full-image-name }}" \
+            nvidia-smi
+
       # --- Run Tests using the downloaded build ---
       # The run-build-script-in-docker action mounts ${{ runner.temp }} to /onnxruntime_src/build
       # So build.py --build_dir build/Release inside the container correctly finds the artifacts.
@@ -102,5 +111,5 @@ jobs:
           build_config: Release
           mode: 'test' # Set mode to test
           execution_providers: 'cuda'
-          extra_build_flags: '--use_binskim_compliant_compile_flags --cuda_version=12.8 --cuda_home=/usr/local/cuda-12.8 --cudnn_home=/usr/local/cuda-12.8 --enable_cuda_profiling --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON'
+          extra_build_flags: '--use_binskim_compliant_compile_flags --cuda_version=13.0 --cuda_home=/usr/local/cuda-13.0 --cudnn_home=/usr/local/cuda-13.0 --enable_cuda_profiling --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON'
           python_path_prefix: 'PATH=/opt/python/cp310-cp310/bin:$PATH'
diff --git a/.github/workflows/linux_cuda_plugin_ci.yml b/.github/workflows/linux_cuda_plugin_ci.yml
@@ -26,17 +26,17 @@ jobs:
       build_config: Release
       architecture: x64
       dockerfile_path: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda
-      docker_build_args: '--build-arg BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc14:20251017.1'
-      docker_image_repo: onnxruntimecuda12manylinuxbuild
+      docker_build_args: '--build-arg BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda13_x64_almalinux8_gcc14:20251107.1'
+      docker_image_repo: onnxruntimecuda13manylinuxbuild
       extra_build_flags: >-
         --use_binskim_compliant_compile_flags
         --build_wheel
         --parallel
         --nvcc_threads 4
         --flash_nvcc_threads 4
-        --cuda_version=12.8
-        --cuda_home=/usr/local/cuda-12.8
-        --cudnn_home=/usr/local/cuda-12.8
+        --cuda_version=13.0
+        --cuda_home=/usr/local/cuda-13.0
+        --cudnn_home=/usr/local/cuda-13.0
         --enable_cuda_profiling
         --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
         --cmake_extra_defines onnxruntime_QUICK_BUILD=ON
@@ -67,8 +67,8 @@ jobs:
         id: build_docker_image_step
         with:
           dockerfile: ${{ github.workspace }}/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda
-          image-name: ghcr.io/microsoft/onnxruntime/onnxruntimecuda12manylinuxbuild
-          build-args: '--build-arg BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc14:20251017.1'
+          image-name: ghcr.io/microsoft/onnxruntime/onnxruntimecuda13manylinuxbuild
+          build-args: '--build-arg BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda13_x64_almalinux8_gcc14:20251107.1'
           push: true
           azure-container-registry-name: onnxruntimebuildcache
         env:
@@ -100,6 +100,15 @@ jobs:
             echo "Warning: perms.txt not found in artifact."
           fi
 
+      # Verify the GPU is accessible inside Docker before running the full test suite.
+      # If the NVIDIA Container Toolkit fails to expose /dev/nvidia* devices,
+      # tests will fail with "CUDA failure 100" and waste 10+ minutes.
+      - name: Verify GPU access in Docker
+        run: |
+          docker run --rm --gpus all \
+            "${{ steps.build_docker_image_step.outputs.full-image-name }}" \
+            nvidia-smi
+
       # --- Install the ORT wheel and run CUDA plugin EP tests ---
       - name: Run CUDA Plugin EP Python Tests
         run: |
@@ -111,6 +120,11 @@ jobs:
             bash -c "
               set -ex
               export PATH=/opt/python/cp312-cp312/bin:\$PATH
+              # Ensure libcudart.so.13 is findable regardless of host-runner NVIDIA Container Toolkit configuration.
+              # The CUDA runtime library lives in the container image at /usr/local/cuda-13.0/lib64, but the
+              # LD_LIBRARY_PATH may not include this path when the runner's NVIDIA toolkit only mounts driver
+              # libraries at /usr/local/nvidia/lib64. The :+ expansion avoids a trailing empty entry (current dir).
+              export LD_LIBRARY_PATH=/usr/local/cuda-13.0/lib64\${LD_LIBRARY_PATH:+:\$LD_LIBRARY_PATH}
 
               # Install the ORT wheel
               python -m pip install /build/Release/Release/dist/onnxruntime*.whl

diff --git a/.github/workflows/nightly_webgpu.yml b/.github/workflows/nightly_webgpu.yml
@@ -0,0 +1,77 @@
+name: Nightly ONNX Runtime WebGPU Builds
+
+on:
+  schedule:
+  - cron: '0 9 * * *'  # Daily at 09:00 UTC
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  webgpu_shader_key_validation:
+    runs-on: [
+      "self-hosted",
+      "1ES.Pool=onnxruntime-github-Win2022-GPU-A10",
+      "JobId=webgpu_shader_validation-${{ github.run_id }}-${{ github.run_number }}-${{ github.run_attempt }}"
+      ]
+    timeout-minutes: 90
+    env:
+      ALLOW_RELEASED_ONNX_OPSET_ONLY: "0"
+      ONNXRUNTIME_TEST_GPU_DEVICE_ID: "0"
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v6
+      with:
+        fetch-depth: 0
+        submodules: none
+
+    - name: Setup Python
+      uses: actions/setup-python@v6
+      with:
+        python-version: "3.12"
+        architecture: x64
+
+    - name: Locate vcvarsall and Setup Env
+      uses: ./.github/actions/locate-vcvarsall-and-setup-env
+      with:
+        architecture: x64
+
+    - name: Install python modules
+      run: python -m pip install -r tools\ci_build\github\windows\python\requirements.txt
+      shell: cmd
+      working-directory: ${{ github.workspace }}
+
+    - name: Setup Node.js
+      uses: actions/setup-node@v6
+      with:
+        node-version: "24"
+
+    - name: Build and Test
+      shell: pwsh
+      run: |
+        $env:ORT_WEBGPU_EP_SHADER_DUMP_FILE = "${{ github.workspace }}\RelWithDebInfo\RelWithDebInfo\shader_dump.log"
+
+        python.exe ${{ github.workspace }}\tools\ci_build\build.py `
+          --config RelWithDebInfo `
+          --build_dir ${{ github.workspace }} `
+          --use_binskim_compliant_compile_flags `
+          --cmake_generator "Visual Studio 17 2022" `
+          --build_shared_lib `
+          --use_webgpu `
+          --wgsl_template static `
+          --cmake_extra_defines onnxruntime_BUILD_DAWN_SHARED_LIBRARY=ON `
+          --update `
+          --build --parallel `
+          --test
+
+    - name: Check log file
+      shell: cmd
+      run: |
+        dir ${{ github.workspace }}\RelWithDebInfo\RelWithDebInfo\shader_dump.log
+
+    - name: Validate shader keys
+      uses: ./.github/actions/webgpu-validate-shader-key
+      with:
+        log_file_path: ${{ github.workspace }}\RelWithDebInfo\RelWithDebInfo\shader_dump.log
diff --git a/.github/workflows/windows_cuda.yml b/.github/workflows/windows_cuda.yml
@@ -157,6 +157,7 @@ jobs:
     runs-on: [
       "self-hosted",
       "1ES.Pool=onnxruntime-github-Win2022-GPU-A10",
+      "1ES.ImageOverride=onnxruntime-Win-CPU-VS2022-Latest-NVMe-x64-test",
       "JobId=windows-cuda-test-${{ github.run_id }}-${{ github.run_number }}-${{ github.run_attempt }}"
     ]
     steps:
@@ -222,6 +223,13 @@ jobs:
         with:
           whl-directory: ${{ runner.temp }}\build\RelWithDebInfo\RelWithDebInfo\dist
 
+      # Verify the GPU is accessible before running the full test suite.
+      # If the NVIDIA driver is not available, tests will fail with
+      # "CUDA failure 100" and waste significant time.
+      - name: Verify GPU access
+        shell: pwsh
+        run: nvidia-smi
+
       - name: Run Tests
         working-directory: ${{ runner.temp }}
         run: |

diff --git a/.github/workflows/windows_cuda_plugin.yml b/.github/workflows/windows_cuda_plugin.yml
@@ -127,6 +127,7 @@ jobs:
     runs-on: [
       "self-hosted",
       "1ES.Pool=onnxruntime-github-Win2022-GPU-A10",
+      "1ES.ImageOverride=onnxruntime-Win-CPU-VS2022-Latest-NVMe-x64-test",
       "JobId=windows-cuda-plugin-test-${{ github.run_id }}-${{ github.run_number }}-${{ github.run_attempt }}"
     ]
     steps:
@@ -187,6 +188,13 @@ jobs:
         with:
           whl-directory: ${{ runner.temp }}\build\Release\Release\dist
 
+      # Verify the GPU is accessible before running the full test suite.
+      # If the NVIDIA driver is not available, tests will fail with
+      # "CUDA failure 100" and waste significant time.
+      - name: Verify GPU access
+        shell: pwsh
+        run: nvidia-smi
+
       - name: Run CUDA Plugin EP Python Tests
         working-directory: ${{ github.workspace }}\onnxruntime\test\python\transformers
         shell: pwsh

diff --git a/cmake/deps.txt b/cmake/deps.txt
@@ -30,7 +30,7 @@ googletest;https://github.com/google/googletest/archive/refs/tags/v1.17.0.zip;f6
 #xnnpack 2025.06.22
 googlexnnpack;https://github.com/google/XNNPACK/archive/3cf85e705098622d59056dcb8f5f963ea7bb0a00.zip;6f6bbba627241f89463ca845febaf063982b34fe
 json;https://github.com/nlohmann/json/archive/refs/tags/v3.11.3.zip;5e88795165cc8590138d1f47ce94ee567b85b4d6
-microsoft_gsl;https://github.com/microsoft/GSL/archive/refs/tags/v4.0.0.zip;cf368104cd22a87b4dd0c80228919bb2df3e2a14
+microsoft_gsl;https://github.com/microsoft/GSL/archive/refs/tags/v4.2.1.zip;1094e3bb7a8af763dcb136ccd676e6e75e614eec
 microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.250325.1.zip;826c8bd47c2258ec61b8b218e031e5b33d27f761
 mimalloc;https://github.com/microsoft/mimalloc/archive/refs/tags/v2.1.1.zip;d5ee7d34223d0567892db5179849939c8769dc41
 mp11;https://github.com/boostorg/mp11/archive/refs/tags/boost-1.82.0.zip;9bc9e01dffb64d9e0773b2e44d2f22c51aace063

diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake
@@ -405,24 +405,22 @@ if (CPUINFO_SUPPORTED)
   endif()
 endif()
 
-if(onnxruntime_USE_CUDA)
-  onnxruntime_fetchcontent_declare(
-    GSL
-    URL ${DEP_URL_microsoft_gsl}
-    URL_HASH SHA1=${DEP_SHA1_microsoft_gsl}
-    PATCH_COMMAND ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/gsl/1064.patch
-    EXCLUDE_FROM_ALL
-    FIND_PACKAGE_ARGS 4.0 NAMES Microsoft.GSL
-  )
-else()
-  onnxruntime_fetchcontent_declare(
-    GSL
-    URL ${DEP_URL_microsoft_gsl}
-    URL_HASH SHA1=${DEP_SHA1_microsoft_gsl}
-    EXCLUDE_FROM_ALL
-    FIND_PACKAGE_ARGS 4.0 NAMES Microsoft.GSL
-  )
-endif()
+# Stringify fix for GSL_SUPPRESS on MSVC (C4875). Remove when GSL ships a release
+# containing microsoft/GSL#1213 (commit 543d0dd).
+# Only patch when a patch executable was found, so hosts without one can still build.
+if(Patch_FOUND)
+  set(ONNXRUNTIME_GSL_PATCH_COMMAND ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/gsl/1213.patch)
+else()
+  set(ONNXRUNTIME_GSL_PATCH_COMMAND "")
+endif()
+
+onnxruntime_fetchcontent_declare(
+  GSL
+  URL ${DEP_URL_microsoft_gsl}
+  URL_HASH SHA1=${DEP_SHA1_microsoft_gsl}
+  PATCH_COMMAND ${ONNXRUNTIME_GSL_PATCH_COMMAND}
+  EXCLUDE_FROM_ALL
+  FIND_PACKAGE_ARGS 4.0 NAMES Microsoft.GSL
+)
 set(GSL_TARGET "Microsoft.GSL::GSL")
 set(GSL_INCLUDE_DIR "$<TARGET_PROPERTY:${GSL_TARGET},INTERFACE_INCLUDE_DIRECTORIES>")
 onnxruntime_fetchcontent_makeavailable(GSL)
@@ -624,10 +622,17 @@ endif()
 if(onnxruntime_ENABLE_TRAINING OR (onnxruntime_ENABLE_TRAINING_APIS AND onnxruntime_BUILD_UNIT_TESTS))
   # Once code under orttraining/orttraining/models dir is removed "onnxruntime_ENABLE_TRAINING" should be removed from
   # this conditional
+  if(Patch_FOUND)
+    set(ONNXRUNTIME_CXXOPTS_PATCH_COMMAND ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/cxxopts/gcc-15-compat.patch)
+  else()
+    set(ONNXRUNTIME_CXXOPTS_PATCH_COMMAND "")
+  endif()
+
   onnxruntime_fetchcontent_declare(
     cxxopts
     URL ${DEP_URL_cxxopts}
     URL_HASH SHA1=${DEP_SHA1_cxxopts}
+    PATCH_COMMAND ${ONNXRUNTIME_CXXOPTS_PATCH_COMMAND}
     EXCLUDE_FROM_ALL
     FIND_PACKAGE_ARGS NAMES cxxopts
   )

diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake
@@ -55,6 +55,7 @@ onnxruntime_add_static_library(onnxruntime_mlas
   ${MLAS_SRC_DIR}/qlutgemm.cpp
   ${MLAS_SRC_DIR}/sqnbitgemm_q8_block.h
   ${MLAS_SRC_DIR}/flashattn.cpp
+  ${MLAS_SRC_DIR}/flashattn_qkv.cpp
   ${MLAS_SRC_DIR}/qkv_quant.cpp
   ${MLAS_SRC_DIR}/cast.cpp
   ${MLAS_SRC_DIR}/layernorm.cpp

diff --git a/cmake/onnxruntime_python.cmake b/cmake/onnxruntime_python.cmake
@@ -242,6 +242,7 @@ if (onnxruntime_USE_CUDA AND NOT WIN32)
   )
   include(cutlass)
   target_include_directories(onnxruntime_pybind11_state PRIVATE ${cutlass_SOURCE_DIR}/include)
+  target_link_libraries(onnxruntime_pybind11_state PRIVATE CUDA::cudart)
 endif()
 if (onnxruntime_USE_CUDA AND WIN32)
   target_compile_definitions(onnxruntime_pybind11_state PRIVATE ORT_NO_CUDA_IN_PYBIND)

diff --git a/cmake/patches/cxxopts/gcc-15-compat.patch b/cmake/patches/cxxopts/gcc-15-compat.patch
@@ -0,0 +1,13 @@
+diff --git a/include/cxxopts.hpp b/include/cxxopts.hpp
+index 991ba3fc..a2e71faf 100644
+--- a/include/cxxopts.hpp
++++ b/include/cxxopts.hpp
+@@ -25,6 +25,7 @@
+ #ifndef CXXOPTS_HPP_INCLUDED
+ #define CXXOPTS_HPP_INCLUDED
+
++#include <cstdint>
+ #include <cstring>
+ #include <cctype>
+ #include <exception>
+
diff --git a/cmake/patches/gsl/1064.patch b/cmake/patches/gsl/1064.patch
diff --git a/cmake/patches/gsl/1213.patch b/cmake/patches/gsl/1213.patch
@@ -0,0 +1,13 @@
+diff --git a/include/gsl/assert b/include/gsl/assert
+index 58e0426..b3f7c8a 100644
+--- a/include/gsl/assert
++++ b/include/gsl/assert
+@@ -50,7 +50,7 @@
+ #define GSL_SUPPRESS(x) [[gsl::suppress(#x)]]
+ #else
+ #if defined(_MSC_VER) && !defined(__INTEL_COMPILER) && !defined(__NVCC__)
+-#define GSL_SUPPRESS(x) [[gsl::suppress(x)]]
++#define GSL_SUPPRESS(x) [[gsl::suppress(#x)]]
+ #else
+ #define GSL_SUPPRESS(x)
+ #endif // _MSC_VER
diff --git a/cmake/vcpkg.json b/cmake/vcpkg.json
@@ -40,7 +40,10 @@
       "name": "mimalloc",
       "platform": "windows"
     },
-    "ms-gsl",
+    {
+      "name": "ms-gsl",
+      "version>=": "4.2.1"
+    },
     "nlohmann-json",
     "onnx",
     {