halide · shoaibkamil · Dec 15, 2025 · Dec 15, 2025 · Dec 16, 2025 · Dec 16, 2025
diff --git a/.clang-tidy b/.clang-tidy
@@ -161,7 +161,7 @@ Checks: >
     -readability-use-std-min-max,
 WarningsAsErrors: '*'
 HeaderFilterRegex: '.*'
-ExcludeHeaderFilterRegex: '\.fbs\.h|common/cmdline\.h|/mini_\w*\.h'
+ExcludeHeaderFilterRegex: '\.fbs\.h|common/cmdline\.h|mini_[^/]*\.h'
 FormatStyle: 'file'
 CheckOptions:
     -   key: modernize-use-default-member-init.UseAssignment

diff --git a/Makefile b/Makefile
@@ -896,7 +896,8 @@ RUNTIME_CPP_COMPONENTS = \
   trace_helper \
   tracing \
   wasm_cpu_features \
-  webgpu_dawn \
+  webgpu_dawn_arm \
+  webgpu_dawn_x86 \
   webgpu_emscripten \
   windows_aarch64_cpu_features_arm \
   windows_clock \
@@ -1094,6 +1095,8 @@ RUNTIME_TRIPLE_WIN_GENERIC_64 = "x86_64-unknown-windows-unknown"
 
 RUNTIME_TRIPLE_WEBGPU_32 = "wasm32-unknown-unknown-unknown"
 RUNTIME_TRIPLE_WEBGPU_64 = "wasm64-unknown-unknown-unknown"
+RUNTIME_TRIPLE_WEBGPU_ARM_64 = "aarch64-unknown-unknown-unknown"
+RUNTIME_TRIPLE_WEBGPU_X86_64 = "x86_64-unknown-unknown-unknown"
 
 # `-fno-threadsafe-statics` is very important here (note that it allows us to use a 'modern' C++
 # standard but still skip threadsafe guards for static initialization in our runtime code)
@@ -1147,6 +1150,22 @@ $(BUILD_DIR)/initmod.windows_%_64.ll: $(SRC_DIR)/runtime/windows_%.cpp
 	@mkdir -p $(@D)
 	$(CLANG) $(CXX_WARNING_FLAGS) $(RUNTIME_CXX_FLAGS) -m64 -target $(RUNTIME_TRIPLE_WIN_GENERIC_64) -fshort-wchar -DCOMPILING_HALIDE_RUNTIME -DBITS_64 -emit-llvm -S $(SRC_DIR)/runtime/windows_$*.cpp -o $@ -MMD -MP -MF $(BUILD_DIR)/initmod.windows_$*_64.d
 
+$(BUILD_DIR)/initmod.webgpu_dawn_arm_32.ll: $(SRC_DIR)/runtime/webgpu_dawn_arm.cpp
+	@mkdir -p $(@D)
+	$(CLANG) $(CXX_WARNING_FLAGS) $(RUNTIME_CXX_FLAGS) -m32 -target $(RUNTIME_TRIPLE_32) -DCOMPILING_HALIDE_RUNTIME -DBITS_32 -emit-llvm -S $(SRC_DIR)/runtime/webgpu_dawn_arm.cpp -o $@ -MMD -MP -MF $(BUILD_DIR)/initmod.webgpu_dawn_arm_32.d
+
+$(BUILD_DIR)/initmod.webgpu_dawn_arm_64.ll: $(SRC_DIR)/runtime/webgpu_dawn_arm.cpp
+	@mkdir -p $(@D)
+	$(CLANG) $(CXX_WARNING_FLAGS) $(RUNTIME_CXX_FLAGS) -m64 -target $(RUNTIME_TRIPLE_WEBGPU_ARM_64) -DCOMPILING_HALIDE_RUNTIME -DBITS_64 -emit-llvm -S $(SRC_DIR)/runtime/webgpu_dawn_arm.cpp -o $@ -MMD -MP -MF $(BUILD_DIR)/initmod.webgpu_dawn_arm_64.d
+
+$(BUILD_DIR)/initmod.webgpu_dawn_x86_32.ll: $(SRC_DIR)/runtime/webgpu_dawn_x86.cpp
+	@mkdir -p $(@D)
+	$(CLANG) $(CXX_WARNING_FLAGS) $(RUNTIME_CXX_FLAGS) -m32 -target $(RUNTIME_TRIPLE_32) -DCOMPILING_HALIDE_RUNTIME -DBITS_32 -emit-llvm -S $(SRC_DIR)/runtime/webgpu_dawn_x86.cpp -o $@ -MMD -MP -MF $(BUILD_DIR)/initmod.webgpu_dawn_x86_32.d
+
+$(BUILD_DIR)/initmod.webgpu_dawn_x86_64.ll: $(SRC_DIR)/runtime/webgpu_dawn_x86.cpp
+	@mkdir -p $(@D)
+	$(CLANG) $(CXX_WARNING_FLAGS) $(RUNTIME_CXX_FLAGS) -m64 -target $(RUNTIME_TRIPLE_WEBGPU_X86_64) -DCOMPILING_HALIDE_RUNTIME -DBITS_64 -emit-llvm -S $(SRC_DIR)/runtime/webgpu_dawn_x86.cpp -o $@ -MMD -MP -MF $(BUILD_DIR)/initmod.webgpu_dawn_x86_64.d
+
 $(BUILD_DIR)/initmod.webgpu_%_32.ll: $(SRC_DIR)/runtime/webgpu_%.cpp
 	@mkdir -p $(@D)
 	$(CLANG) $(CXX_WARNING_FLAGS) $(RUNTIME_CXX_FLAGS) -m32 -target $(RUNTIME_TRIPLE_WEBGPU_32) -DCOMPILING_HALIDE_RUNTIME -DBITS_32 -emit-llvm -S $(SRC_DIR)/runtime/webgpu_$*.cpp -o $@ -MMD -MP -MF $(BUILD_DIR)/initmod.webgpu_$*_32.d
@@ -1155,6 +1174,22 @@ $(BUILD_DIR)/initmod.webgpu_%_64.ll: $(SRC_DIR)/runtime/webgpu_%.cpp
 	@mkdir -p $(@D)
 	$(CLANG) $(CXX_WARNING_FLAGS) $(RUNTIME_CXX_FLAGS) -m64 -target $(RUNTIME_TRIPLE_WEBGPU_64) -DCOMPILING_HALIDE_RUNTIME -DBITS_64 -emit-llvm -S $(SRC_DIR)/runtime/webgpu_$*.cpp -o $@ -MMD -MP -MF $(BUILD_DIR)/initmod.webgpu_$*_64.d
 
+$(BUILD_DIR)/initmod.webgpu_dawn_arm_32_debug.ll: $(SRC_DIR)/runtime/webgpu_dawn_arm.cpp
+	@mkdir -p $(@D)
+	$(CLANG) $(CXX_WARNING_FLAGS) -g -DDEBUG_RUNTIME $(RUNTIME_CXX_FLAGS) -m32 -target $(RUNTIME_TRIPLE_32) -DCOMPILING_HALIDE_RUNTIME -DBITS_32 -emit-llvm -S $(SRC_DIR)/runtime/webgpu_dawn_arm.cpp -o $@ -MMD -MP -MF $(BUILD_DIR)/initmod.webgpu_dawn_arm_32_debug.d
+
+$(BUILD_DIR)/initmod.webgpu_dawn_arm_64_debug.ll: $(SRC_DIR)/runtime/webgpu_dawn_arm.cpp
+	@mkdir -p $(@D)
+	$(CLANG) $(CXX_WARNING_FLAGS) -g -DDEBUG_RUNTIME $(RUNTIME_CXX_FLAGS) -m64 -target $(RUNTIME_TRIPLE_WEBGPU_ARM_64) -DCOMPILING_HALIDE_RUNTIME -DBITS_64 -emit-llvm -S $(SRC_DIR)/runtime/webgpu_dawn_arm.cpp -o $@ -MMD -MP -MF $(BUILD_DIR)/initmod.webgpu_dawn_arm_64_debug.d
+
+$(BUILD_DIR)/initmod.webgpu_dawn_x86_32_debug.ll: $(SRC_DIR)/runtime/webgpu_dawn_x86.cpp
+	@mkdir -p $(@D)
+	$(CLANG) $(CXX_WARNING_FLAGS) -g -DDEBUG_RUNTIME $(RUNTIME_CXX_FLAGS) -m32 -target $(RUNTIME_TRIPLE_32) -DCOMPILING_HALIDE_RUNTIME -DBITS_32 -emit-llvm -S $(SRC_DIR)/runtime/webgpu_dawn_x86.cpp -o $@ -MMD -MP -MF $(BUILD_DIR)/initmod.webgpu_dawn_x86_32_debug.d
+
+$(BUILD_DIR)/initmod.webgpu_dawn_x86_64_debug.ll: $(SRC_DIR)/runtime/webgpu_dawn_x86.cpp
+	@mkdir -p $(@D)
+	$(CLANG) $(CXX_WARNING_FLAGS) -g -DDEBUG_RUNTIME $(RUNTIME_CXX_FLAGS) -m64 -target $(RUNTIME_TRIPLE_WEBGPU_X86_64) -DCOMPILING_HALIDE_RUNTIME -DBITS_64 -emit-llvm -S $(SRC_DIR)/runtime/webgpu_dawn_x86.cpp -o $@ -MMD -MP -MF $(BUILD_DIR)/initmod.webgpu_dawn_x86_64_debug.d
+
 $(BUILD_DIR)/initmod.webgpu_%_32_debug.ll: $(SRC_DIR)/runtime/webgpu_%.cpp
 	@mkdir -p $(@D)
 	$(CLANG) $(CXX_WARNING_FLAGS) -g -DDEBUG_RUNTIME $(RUNTIME_CXX_FLAGS) -m32 -target $(RUNTIME_TRIPLE_WEBGPU_32) -DCOMPILING_HALIDE_RUNTIME -DBITS_32 -emit-llvm -S $(SRC_DIR)/runtime/webgpu_$*.cpp -o $@ -MMD -MP -MF $(BUILD_DIR)/initmod.webgpu_$*_32_debug.d

diff --git a/cmake/vcpkg/triplets/arm64-osx-halide.cmake b/cmake/vcpkg/triplets/arm64-osx-halide.cmake
@@ -0,0 +1,10 @@
+set(VCPKG_TARGET_ARCHITECTURE arm64)
+set(VCPKG_CRT_LINKAGE dynamic)
+set(VCPKG_LIBRARY_LINKAGE static)
+set(VCPKG_CMAKE_SYSTEM_NAME Darwin)
+set(VCPKG_OSX_ARCHITECTURES arm64)
+
+# Dawn must be a shared library so it can be loaded via dlopen at runtime.
+if (PORT STREQUAL "dawn")
+    set(VCPKG_LIBRARY_LINKAGE dynamic)
+endif ()
diff --git a/cmake/vcpkg/triplets/x64-osx-halide.cmake b/cmake/vcpkg/triplets/x64-osx-halide.cmake
@@ -0,0 +1,9 @@
+set(VCPKG_TARGET_ARCHITECTURE x64)
+set(VCPKG_CRT_LINKAGE dynamic)
+set(VCPKG_LIBRARY_LINKAGE static)
+set(VCPKG_CMAKE_SYSTEM_NAME Darwin)
+
+# Dawn must be a shared library so it can be loaded via dlopen at runtime.
+if (PORT STREQUAL "dawn")
+    set(VCPKG_LIBRARY_LINKAGE dynamic)
+endif ()
diff --git a/doc/WebGPU.md b/doc/WebGPU.md
@@ -27,37 +27,40 @@ device codegen may be required before it becomes profitable to use.
 
 ## Running with WebAssembly via Emscripten: `HL_TARGET=wasm-32-wasmrt-webgpu`
 
-> _Tested with top-of-tree Emscripten as of 2023-02-23, against Chrome v113._
+> _Tested with Emscripten 5.0.0 (a7c5deabd7c88ba1c38ebe988112256775f944c6)_
+> _Tested with Node.js 25.5.0_
+> _Requires Node.js 25 or later_
 
 Halide can generate WebGPU code that can be integrated with WASM code using
 Emscripten.
 
 When invoking `emcc` to link Halide-generated objects, include these flags:
-`-s USE_WEBGPU=1 -s ASYNCIFY`.
+`--use-port=emdawnwebgpu -s JSPI`.
 
-Tests that use AOT compilation can be run using a native WebGPU implementation
-that has Node.js bindings, such as [Dawn](https://dawn.googlesource.com/dawn/).
-You must set an environment variable named `HL_WEBGPU_NODE_BINDINGS` that
-has an absolute path to the bindings to run these tests, e.g. `HL_WEBGPU_NODE_BINDINGS=/path/to/dawn.node`.
+Tests that use AOT compilation require a WebGPU implementation with Node.js
+bindings. There are two options:
 
-See [below](#setting-up-dawn) for instructions on building the Dawn Node.js
-bindings.
+- **`webgpu` npm package (recommended)**: Run `npm install` in the Halide
+  source root. The test runner will find and use the package automatically,
+  with no additional environment variables needed. See
+  [below](#webgpu-provider-webgpu-npm-package-recommended) for setup
+  instructions.
+- **Custom `dawn.node`**: Set `HL_WEBGPU_NODE_BINDINGS=/path/to/dawn.node`.
+  See [Setting up Dawn](#setting-up-dawn) for build instructions.
 
 JIT compilation is not supported when using WebGPU with WASM.
 
 ## Running natively: `HL_TARGET=host-webgpu`
 
-> _Tested with top-of-tree Dawn as of 2023-11-27 [commit b5d38fc7dc2a20081312c95e379c4a918df8b7d4]._
+> _Tested with Dawn release branch chromium/7698 (536c572aba)_
 
 For testing purposes, Halide can also target native WebGPU libraries, such as
 [Dawn](https://dawn.googlesource.com/dawn/) or
 [wgpu](https://github.com/gfx-rs/wgpu).
 This is currently the only path that can run the JIT correctness tests.
 See [below](#setting-up-dawn) for instructions on building Dawn.
 
-> Note that as of 2023-11-27, wgpu is not supported due to
-> [lacking `override` support for WGSL](https://github.com/gfx-rs/wgpu/issues/1762)
-> which we require > in order to set GPU block sizes.
+> Note that as of 2026-02-17, wgpu is not supported due to lack of WaitAny timeout support.
 
 When targeting WebGPU with a native target, Halide defaults to looking for a
 build of Dawn (with several common names and suffixes); you can override this
@@ -72,6 +75,107 @@ Note that it is explicitly legal to specify both WEBGPU_NATIVE_LIB and
 WEBGPU_NODE_BINDINGS for the same build; the correct executable environment
 will be selected based on the Halide target specified.
 
+## Setting up for WASM+WebGPU testing
+
+### Installing Node.js 25 via nvm
+
+Install [nvm](https://github.com/nvm-sh/nvm) (Node Version Manager), then use
+it to install and activate Node.js 25:
+
+    curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.40.1/install.sh | bash
+    nvm install 25
+    nvm use 25
+
+To activate Node.js 25 automatically in future shell sessions, add
+`nvm use 25` (or `nvm alias default 25`) to your shell profile.
+
+### Installing Emscripten via emsdk
+
+Install Emscripten using the [emsdk](https://github.com/emscripten-core/emsdk):
+
+    git clone https://github.com/emscripten-core/emsdk.git
+    cd emsdk
+    ./emsdk install latest
+    ./emsdk activate latest
+    source ./emsdk_env.sh
+
+Add `source /path/to/emsdk/emsdk_env.sh` to your shell profile to activate
+Emscripten automatically in future sessions.
+
+### WebGPU provider: `webgpu` npm package (recommended)
+
+The simplest way to provide a WebGPU implementation for Node.js testing is to
+install the [`webgpu`](https://www.npmjs.com/package/webgpu) npm package, which
+bundles pre-built Dawn Node.js bindings. From the Halide source root, run:
+
+    npm install
+
+The `package.json` in the Halide source root lists `webgpu` as a dependency.
+Once installed, the test runner (`tools/launch_wasm_test.js`) will find and use
+it automatically — no `HL_WEBGPU_NODE_BINDINGS` environment variable is needed.
+
+### WebGPU provider: building Dawn Node.js bindings from scratch (alternative)
+
+As an alternative to the `webgpu` npm package, you can build the Dawn Node.js
+bindings from source. See [Setting up Dawn](#setting-up-dawn) for instructions.
+After building, point the test runner at your build by setting:
+
+    HL_WEBGPU_NODE_BINDINGS=/path/to/dawn.node
+
+## Setting up Dawn via vcpkg
+
+> _Tested with Dawn vcpkg port version 20251202.213730 on macOS (arm64)_
+
+The Halide repository includes vcpkg support for automatically downloading and
+building Dawn as part of the CMake configure step. This is the easiest path for
+native WebGPU testing and does not require `depot_tools` or `gclient`.
+
+Note: the vcpkg Dawn port builds only the native `libwebgpu_dawn` shared
+library. It does **not** include Node.js bindings (`dawn.node`), so this path
+only supports `HL_TARGET=host-webgpu` (native JIT). For the WASM+Node.js path,
+see [Setting up Dawn](#setting-up-dawn) below.
+
+### Install vcpkg
+
+Clone and bootstrap vcpkg locally (no system-wide installation needed):
+
+    git clone https://github.com/microsoft/vcpkg <vcpkg_dir>
+    <vcpkg_dir>/bootstrap-vcpkg.sh -disableMetrics
+
+### Configure and build Halide
+
+Use the provided `arm64-osx-halide` (or `x64-osx-halide`) overlay triplet,
+which keeps static linkage for all packages except Dawn (which must be shared
+so it can be loaded via `dlopen` at runtime):
+
+    cmake <halide_root_dir> -G Ninja \
+        -DCMAKE_BUILD_TYPE=Release \
+        -DCMAKE_TOOLCHAIN_FILE=<vcpkg_dir>/scripts/buildsystems/vcpkg.cmake \
+        -DVCPKG_TARGET_TRIPLET=arm64-osx-halide \
+        -DVCPKG_MANIFEST_FEATURES=webgpu-native \
+        -DLLVM_DIR=<llvm_dir>/lib/cmake/llvm \
+        -DClang_DIR=<llvm_dir>/lib/cmake/clang \
+        -DHalide_TARGET=host-webgpu
+
+    cmake --build <build_dir>
+
+vcpkg will download and build Dawn automatically during the configure step,
+installing it into `<build_dir>/vcpkg_installed/`.
+
+You may need to add `-DWITH_SERIALIZATION=OFF`, `-DWITH_PYTHON_BINDINGS=OFF`, and
+`-DHalide_WASM_BACKEND=OFF` flags because vcpkg disables FetchContent, and those 
+features depend on packages (`flatbuffers`, `pybind11`, `wabt`) that are not included 
+in the `webgpu-native` manifest feature.
+
+### Run tests
+
+Set `HL_WEBGPU_NATIVE_LIB` to the vcpkg-installed Dawn library and run via
+CTest:
+
+    HL_JIT_TARGET=host-webgpu \
+    HL_WEBGPU_NATIVE_LIB=<build_dir>/vcpkg_installed/arm64-osx-halide/lib/libwebgpu_dawn.dylib \
+    ctest --test-dir <build_dir> -R correctness_gpu
+
 ## Setting up Dawn
 
 Building Dawn's Node.js bindings currently requires using CMake.
@@ -120,9 +224,44 @@ The recommended method for updating `mini_webgpu.h` is to copy the
 `gen/include/dawn/webgpu.h` file from the Dawn build directory, then:
 - Restore the `// clang-format {off,on}` lines.
 - Comment out the `#include <std*>` lines.
-- Remove the `void` parameter from the `WGPUProc` declaration.
-
-This guarantees a version of the WebGPU header that is compatible with Dawn.
-When the native API eventually stabilizes, it should be possible to obtain a
-header from the `webgpu-native` GitHub organization that will be compatible
-with Dawn, wgpu, and Emscripten.
+- Include the following block to define things that would normally be defined in system headers:
+```
+// BEGIN Halide-specific changes
+//
+// For the Halide runtime, we can't include these headers,
+// so we define NULL, SIZE_MAX, and integer limit macros here.
+// #include <stdint.h>
+// #include <stddef.h>
+// #include <math.h>
+
+#ifndef NULL
+#ifdef __cplusplus
+#define NULL nullptr
+#else
+#define NULL ((void*)0)
+#endif
+#endif
+
+#ifndef SIZE_MAX
+#define SIZE_MAX (~(size_t)0)
+#endif
+
+#ifndef UINT32_MAX
+#define UINT32_MAX (~(uint32_t)0)
+#endif
+
+#ifndef UINT64_MAX
+#define UINT64_MAX (~(uint64_t)0)
+#endif
+
+// This _should_ be correct on all platforms we support, but needs checking.
+#ifndef UINT32_C
+#define UINT32_C(x) ((uint32_t)(x))
+#endif
+
+// END Halide-specific changes
+
+```
+
+This guarantees a version of the WebGPU header that is compatible with how
+Halide builds the runtime.
diff --git a/src/LLVM_Runtime_Linker.cpp b/src/LLVM_Runtime_Linker.cpp
@@ -175,10 +175,6 @@ DECLARE_CPP_INITMOD(timer_profiler)
 DECLARE_CPP_INITMOD(to_string)
 DECLARE_CPP_INITMOD(trace_helper)
 DECLARE_CPP_INITMOD(tracing)
-// TODO(https://github.com/halide/Halide/issues/7248)
-// DECLARE_CPP_INITMOD(webgpu)
-DECLARE_CPP_INITMOD(webgpu_dawn)
-DECLARE_CPP_INITMOD(webgpu_emscripten)
 DECLARE_CPP_INITMOD(windows_clock)
 DECLARE_CPP_INITMOD(windows_cuda)
 DECLARE_CPP_INITMOD(windows_get_symbol)
@@ -284,6 +280,38 @@ DECLARE_NO_INITMOD(vulkan)
 DECLARE_NO_INITMOD(windows_vulkan)
 #endif  // WITH_VULKAN
 
+#ifdef WITH_WEBGPU
+// TODO(https://github.com/halide/Halide/issues/7248)
+#ifdef WITH_X86
+DECLARE_CPP_INITMOD(webgpu_dawn_x86)
+DECLARE_CPP_INITMOD(webgpu_dawn_x86_debug)
+#else
+DECLARE_NO_INITMOD(webgpu_dawn_x86)
+DECLARE_NO_INITMOD(webgpu_dawn_x86_debug)
+#endif
+#ifdef WITH_ARM
+DECLARE_CPP_INITMOD(webgpu_dawn_arm)
+DECLARE_CPP_INITMOD(webgpu_dawn_arm_debug)
+#else
+DECLARE_NO_INITMOD(webgpu_dawn_arm)
+DECLARE_NO_INITMOD(webgpu_dawn_arm_debug)
+#endif
+#ifdef WITH_WEBASSEMBLY
+DECLARE_CPP_INITMOD(webgpu_emscripten)
+DECLARE_CPP_INITMOD(webgpu_emscripten_debug)
+#else
+DECLARE_NO_INITMOD(webgpu_emscripten)
+DECLARE_NO_INITMOD(webgpu_emscripten_debug)
+#endif
+#else
+DECLARE_NO_INITMOD(webgpu_dawn_x86)
+DECLARE_NO_INITMOD(webgpu_dawn_arm)
+DECLARE_NO_INITMOD(webgpu_emscripten)
+DECLARE_NO_INITMOD(webgpu_dawn_x86_debug)
+DECLARE_NO_INITMOD(webgpu_dawn_arm_debug)
+DECLARE_NO_INITMOD(webgpu_emscripten_debug)
+#endif  // WITH_WEBGPU
+
 #ifdef WITH_X86
 DECLARE_LL_INITMOD(x86_amx)
 DECLARE_LL_INITMOD(x86_avx512)
@@ -1340,7 +1368,14 @@ std::unique_ptr<llvm::Module> get_initial_module_for_target(Target t, llvm::LLVM
                 if (t.os == Target::WebAssemblyRuntime) {
                     modules.push_back(get_initmod_webgpu_emscripten(c, bits_64, debug));
                 } else {
-                    modules.push_back(get_initmod_webgpu_dawn(c, bits_64, debug));
+                    user_assert(bits_64) << "Native WebGPU target only available on 64-bit targets for now.\n";
+                    if (t.arch == Target::X86) {
+                        modules.push_back(get_initmod_webgpu_dawn_x86(c, bits_64, debug));
+                    } else if (t.arch == Target::ARM) {
+                        modules.push_back(get_initmod_webgpu_dawn_arm(c, bits_64, debug));
+                    } else {
+                        user_error << "WebGPU can only be used on X86 or ARM architectures.\n";
+                    }
                 }
             }
         }