LostBeard · LostBeard · May 30, 2026 · May 30, 2026 · May 30, 2026 · May 30, 2026
diff --git a/ILGPU.Algorithms/ReductionExtensions.cs b/ILGPU.Algorithms/ReductionExtensions.cs
@@ -317,6 +317,7 @@ public static T Reduce<T, TReduction>(
             where T : unmanaged
             where TReduction : struct, IScanReduceOperation<T>
         {
+            EnsureSyncReadbackSupported(accelerator);
             using var output = accelerator.Allocate1D<T>(1);
             accelerator.Reduce<T, TReduction>(stream, input, output.View);
             T result = default;
@@ -336,13 +337,23 @@ public static T Reduce<T, TReduction>(
         /// Uses the internal cache to realize a temporary output buffer.
         /// </remarks>
         /// <returns>The reduced value.</returns>
-        public static Task<T> ReduceAsync<T, TReduction>(
+        public static async Task<T> ReduceAsync<T, TReduction>(
             this Accelerator accelerator,
             AcceleratorStream stream,
             ArrayView<T> input)
             where T : unmanaged
-            where TReduction : struct, IScanReduceOperation<T> =>
-            Task.Run(() => accelerator.Reduce<T, TReduction>(stream, input));
+            where TReduction : struct, IScanReduceOperation<T>
+        {
+            // Dispatch the reduction into a one-element device buffer and await the
+            // asynchronous readback (ArrayView.CopyToCPUAsync) for the scalar. The
+            // former Task.Run(() => sync Reduce) wrapper ended in a synchronous
+            // CopyToCPU, which is reported to throw on WebGPU and to read stale/zero
+            // data on Wasm/WebGL while the reduction kernel is still in flight.
+            using var scalar = accelerator.Allocate1D<T>(1);
+            accelerator.Reduce<T, TReduction>(stream, input, scalar.View);
+            var readback = scalar.View.CopyToCPUAsync(stream);
+            return (await readback.ConfigureAwait(false))[0];
+        }
 
         /// <summary>
         /// Performs a reduction using a reduction logic.
@@ -388,13 +399,36 @@ public static T Reduce<T, TStride, TReduction>(
             where TStride : struct, IStride1D
             where TReduction : struct, IScanReduceOperation<T>
         {
+            EnsureSyncReadbackSupported(accelerator);
             using var output = accelerator.Allocate1D<T>(1);
             accelerator.Reduce<T, TStride, TReduction>(stream, input, output.View);
             T result = default;
             output.View.CopyToCPU(stream, ref result, 1);
             return result;
         }
 
+        /// <summary>
+        /// Rejects the synchronous <c>Reduce</c>-to-scalar overloads on the browser
+        /// backends (Wasm, WebGL, WebGPU) by throwing <see cref="NotSupportedException"/>;
+        /// every other accelerator type passes through. Those backends are described
+        /// as lacking a usable synchronous GPU-&gt;CPU readback; use <c>ReduceAsync</c>.
+        /// </summary>
+        private static void EnsureSyncReadbackSupported(Accelerator accelerator)
+        {
+            var backend = accelerator.AcceleratorType;
+            if (backend != AcceleratorType.Wasm &&
+                backend != AcceleratorType.WebGL &&
+                backend != AcceleratorType.WebGPU)
+            {
+                return;
+            }
+            throw new NotSupportedException(
+                $"Synchronous Reduce-to-scalar is not supported on the " +
+                $"{backend} backend: browser backends have no " +
+                "synchronous GPU->CPU readback (WebGPU throws; Wasm/WebGL would read " +
+                "stale data before in-flight kernels finish). Use ReduceAsync instead.");
+        }
+
         /// <summary>
         /// Performs a reduction using a reduction logic.
         /// </summary>
@@ -408,13 +442,20 @@ public static T Reduce<T, TStride, TReduction>(
         /// Uses the internal cache to realize a temporary output buffer.
         /// </remarks>
         /// <returns>The reduced value.</returns>
-        public static Task<T> ReduceAsync<T, TStride, TReduction>(
+        public static async Task<T> ReduceAsync<T, TStride, TReduction>(
             this Accelerator accelerator,
             AcceleratorStream stream,
             ArrayView1D<T, TStride> input)
             where T : unmanaged
             where TStride : struct, IStride1D
-            where TReduction : struct, IScanReduceOperation<T> =>
-            Task.Run(() => accelerator.Reduce<T, TStride, TReduction>(stream, input));
+            where TReduction : struct, IScanReduceOperation<T>
+        {
+            // Dispatch into a one-element device buffer, then await the asynchronous
+            // readback instead of wrapping the synchronous Reduce in Task.Run.
+            using var scalar = accelerator.Allocate1D<T>(1);
+            accelerator.Reduce<T, TStride, TReduction>(stream, input, scalar.View);
+            var readback = scalar.View.CopyToCPUAsync(stream);
+            return (await readback.ConfigureAwait(false))[0];
+        }
     }
 }
diff --git a/ILGPU/Runtime/Accelerator.cs b/ILGPU/Runtime/Accelerator.cs
@@ -17,6 +17,7 @@
 using System.Diagnostics;
 using System.IO;
 using System.Threading;
+using System.Threading.Tasks;
 
 namespace ILGPU.Runtime
 {
@@ -271,6 +272,29 @@ public void Synchronize()
         /// </summary>
         protected abstract void SynchronizeInternal();
 
+        /// <summary>
+        /// Asynchronously synchronizes all pending operations on this accelerator.
+        /// </summary>
+        /// <returns>A task that completes once all submitted work has finished.</returns>
+        /// <remarks>
+        /// The default implementation calls the synchronous <see cref="Synchronize"/>
+        /// on the calling thread and returns an already-completed task, so the caller
+        /// is blocked for the duration and any exception is thrown directly rather
+        /// than through the task. That is correct for backends whose
+        /// <see cref="Synchronize"/> blocks until the queue drains (CUDA, OpenCL, CPU).
+        /// Single-threaded browser backends (Wasm, WebGPU, WebGL) cannot block-wait —
+        /// their synchronous <see cref="Synchronize"/> is a non-blocking flush / no-op
+        /// — so they MUST override this to await their real drain (worker dispatch
+        /// completion, queue.OnSubmittedWorkDone, GL fence). Code that needs a
+        /// host-visible result after an unawaited dispatch must <c>await</c> this
+        /// rather than calling the synchronous <see cref="Synchronize"/>.
+        /// </remarks>
+        public virtual Task SynchronizeAsync()
+        {
+            Synchronize();
+            return Task.CompletedTask;
+        }
+
         /// <summary>
         /// Clears all internal caches.
         /// </summary>

diff --git a/ILGPU/Runtime/AcceleratorStream.cs b/ILGPU/Runtime/AcceleratorStream.cs
@@ -53,7 +53,19 @@ protected AcceleratorStream(Accelerator accelerator)
         /// Synchronizes all queued operations asynchronously.
         /// </summary>
         /// <returns>A task object to wait for.</returns>
-        public Task SynchronizeAsync() => Task.Run(synchronizeAction);
+        /// <remarks>
+        /// The default implementation hands the stored synchronization delegate to
+        /// <c>Task.Run</c>, offloading the blocking <see cref="Synchronize"/> call to
+        /// a thread-pool thread. That is correct for backends whose
+        /// <see cref="Synchronize"/> genuinely blocks until the queue drains (CUDA,
+        /// OpenCL, CPU). Single-threaded browser backends (Wasm, WebGPU, WebGL) cannot
+        /// block-wait and their synchronous <see cref="Synchronize"/> is a non-blocking
+        /// flush / no-op; those streams MUST override this to await their real async
+        /// drain. Algorithm code that needs a host-visible result after an unawaited
+        /// dispatch must await this (or <see cref="Accelerator.SynchronizeAsync"/>)
+        /// rather than calling the synchronous <see cref="Synchronize"/>.
+        /// </remarks>
+        public virtual Task SynchronizeAsync() => Task.Run(synchronizeAction);
 
         /// <summary>
         /// Makes the associated accelerator the current one for this thread and

diff --git a/ILGPU/Runtime/ArrayViewExtensions.cs b/ILGPU/Runtime/ArrayViewExtensions.cs
@@ -17,6 +17,8 @@
 using System;
 using System.Diagnostics;
 using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Threading.Tasks;
 
 namespace ILGPU.Runtime
 {
@@ -926,6 +928,56 @@ public static void CopyToCPU<T, TView>(
             stream.Synchronize();
         }
 
+        /// <summary>
+        /// Reads the elements of the given view back to the host asynchronously and
+        /// returns them in a new managed array. The view's byte range is fetched via
+        /// <see cref="MemoryBuffer.CopyToRawAsync"/> on the backing buffer and then
+        /// reinterpreted as <typeparamref name="T"/> elements. Intended as the
+        /// browser-safe counterpart of the synchronous <c>CopyToCPU</c>, which has no
+        /// usable GPU-&gt;CPU readback on Wasm / WebGPU / WebGL.
+        /// </summary>
+        /// <typeparam name="T">The element type.</typeparam>
+        /// <param name="source">The source view to read back.</param>
+        /// <param name="stream">The used accelerator stream.</param>
+        /// <returns>A task producing the view's <c>Length</c> elements.</returns>
+        [NotInsideKernel]
+        public static async Task<T[]> CopyToCPUAsync<T>(
+            this ArrayView<T> source,
+            AcceleratorStream stream)
+            where T : unmanaged
+        {
+            var contiguous = (IContiguousArrayView)source;
+            var backing = contiguous.Buffer;
+            if (backing is null)
+                throw new InvalidOperationException("ArrayView has no backing buffer.");
+
+            long length = source.Length;
+            if (length == 0)
+                return Array.Empty<T>();
+
+            long lengthInBytes = length * ((IArrayView)source).ElementSize;
+            long offsetInBytes = contiguous.IndexInBytes;
+            var raw = await backing
+                .CopyToRawAsync(stream, offsetInBytes, lengthInBytes)
+                .ConfigureAwait(false);
+
+            var elements = new T[length];
+            MemoryMarshal.Cast<byte, T>(raw).CopyTo(elements);
+            return elements;
+        }
+
+        /// <summary>
+        /// <see cref="ArrayView1D{T, TStride}"/> overload of <c>CopyToCPUAsync</c> that
+        /// reads back <c>source.BaseView</c>. NOTE(review): verify non-dense strides.
+        /// </summary>
+        [NotInsideKernel]
+        public static Task<T[]> CopyToCPUAsync<T, TStride>(
+            this ArrayView1D<T, TStride> source,
+            AcceleratorStream stream)
+            where T : unmanaged
+            where TStride : struct, IStride1D =>
+            source.BaseView.CopyToCPUAsync(stream);
+
         /// <summary>
         /// Copies from the CPU source address into the given target view while
         /// synchronizing the current accelerator stream.

diff --git a/ILGPU/Runtime/MemoryBuffer.cs b/ILGPU/Runtime/MemoryBuffer.cs
@@ -10,10 +10,12 @@
 // ---------------------------------------------------------------------------------------
 
 using ILGPU.Resources;
+using ILGPU.Runtime.CPU;
 using System;
 using System.Diagnostics;
 using System.Diagnostics.CodeAnalysis;
 using System.Runtime.CompilerServices;
+using System.Threading.Tasks;
 
 namespace ILGPU.Runtime
 {
@@ -147,6 +149,50 @@ protected internal abstract void CopyTo(
             in ArrayView<byte> sourceView,
             in ArrayView<byte> targetView);
 
+        /// <summary>
+        /// Asynchronously copies a raw byte range of this buffer back to the host.
+        /// </summary>
+        /// <param name="stream">The used accelerator stream.</param>
+        /// <param name="sourceOffsetInBytes">The source offset in bytes.</param>
+        /// <param name="lengthInBytes">The number of bytes to read back.</param>
+        /// <returns>A task producing the copied bytes.</returns>
+        /// <remarks>
+        /// This is the overridable async GPU-&gt;CPU readback hook. The default
+        /// implementation awaits <see cref="AcceleratorStream.SynchronizeAsync"/>
+        /// (the real drain on backends that override it), issues the synchronous
+        /// <see cref="CopyTo(AcceleratorStream, long, in ArrayView{byte})"/> into the
+        /// result array and then synchronizes the stream again, so a backend that only
+        /// enqueues the copy has finished writing before the task completes. Browser
+        /// backends MUST override this because their synchronous <see cref="CopyTo"/>
+        /// either reads stale data before worker kernels finish (Wasm) or throws
+        /// (WebGPU / WebGL); each provides a true async readback (SharedArrayBuffer
+        /// read after drain, <c>mapAsync</c>, GL worker readback). Prefer the typed
+        /// <c>ArrayView&lt;T&gt;.CopyToCPUAsync</c> extension over calling this directly.
+        /// </remarks>
+        protected internal virtual async Task<byte[]> CopyToRawAsync(
+            AcceleratorStream stream,
+            long sourceOffsetInBytes,
+            long lengthInBytes)
+        {
+            if (lengthInBytes < 0)
+                throw new ArgumentOutOfRangeException(nameof(lengthInBytes));
+
+            // Real drain first. Overridden to a true async wait on browser backends;
+            // offloads the blocking Synchronize on CUDA / OpenCL / CPU.
+            await stream.SynchronizeAsync().ConfigureAwait(false);
+
+            var result = new byte[lengthInBytes];
+            if (lengthInBytes == 0)
+                return result;
+
+            using var cpuBuffer = CPUMemoryBuffer.Create(
+                Accelerator, ref result[0], lengthInBytes);
+            CopyTo(stream, sourceOffsetInBytes, cpuBuffer.AsRawArrayView());
+            // The copy may only be enqueued; wait for it before exposing the bytes.
+            stream.Synchronize();
+            return result;
+        }
+
         /// <summary>
         /// Copies elements from the source view to the current buffer.
         /// </summary>

diff --git a/SpawnDev.ILGPU.Demo.Shared/SpawnDev.ILGPU.Demo.Shared.csproj b/SpawnDev.ILGPU.Demo.Shared/SpawnDev.ILGPU.Demo.Shared.csproj
@@ -10,6 +10,7 @@
 	<ItemGroup>
 		<ProjectReference Include="..\SpawnDev.ILGPU\SpawnDev.ILGPU.csproj" />
 		<ProjectReference Include="..\SpawnDev.ILGPU.P2P\SpawnDev.ILGPU.P2P.csproj" />
+		<ProjectReference Include="..\..\..\SpawnDev.ILGPU.ML\SpawnDev.ILGPU.ML\SpawnDev.ILGPU.ML\SpawnDev.ILGPU.ML.csproj" />
 		<ProjectReference Include="..\ILGPU\ILGPU.csproj" />
 		<ProjectReference Include="..\ILGPU.Algorithms\ILGPU.Algorithms.csproj" />
 		<PackageReference Include="SpawnDev.UnitTesting.Blazor" Version="2.5.3" />

diff --git a/SpawnDev.ILGPU.Demo.Shared/UnitTests/BackendTestBase.Tests2.cs b/SpawnDev.ILGPU.Demo.Shared/UnitTests/BackendTestBase.Tests2.cs
@@ -439,6 +439,65 @@ public async Task ILGPUReduceTest() => await RunTest(async accelerator =>
                 throw new Exception($"Reduce<AddInt32> expected {expectedSum2}, got {sumResult[0]}");
         });
 
+        // Tests the REAL async ReduceAsync API. Before the 2026-05-29 async/sync fix this
+        // was Task.Run(() => sync Reduce) — fake async whose inner sync CopyToCPU THREW on
+        // WebGPU (no sync GPU->CPU readback) and returned STALE/zero on Wasm (the reduction
+        // kernel was still in flight). It now routes through the core virtual
+        // AcceleratorStream/Accelerator.SynchronizeAsync drain + ArrayView.CopyToCPUAsync
+        // readback, so it returns correct scalars on every backend. Exercises the full
+        // dispatch -> real async drain -> async readback path end to end.
+        [TestMethod]
+        public async Task ILGPUReduceAsyncTest() => await RunTest(async accelerator =>
+        {
+            const int count = 256;
+            const int expectedSum = count * (count + 1) / 2; // 32896
+            var values = new int[count];
+            for (int idx = 0; idx < values.Length; ++idx)
+                values[idx] = idx + 1; // 1..256
+            using var inputBuf = accelerator.Allocate1D(values);
+            var stream = accelerator.DefaultStream;
+            var input = inputBuf.View.BaseView;
+
+            int max = await accelerator.ReduceAsync<
+                int, global::ILGPU.Algorithms.ScanReduceOperations.MaxInt32>(stream, input);
+            if (max != 256)
+                throw new Exception($"ReduceAsync<MaxInt32> expected 256, got {max}");
+
+            int min = await accelerator.ReduceAsync<
+                int, global::ILGPU.Algorithms.ScanReduceOperations.MinInt32>(stream, input);
+            if (min != 1)
+                throw new Exception($"ReduceAsync<MinInt32> expected 1, got {min}");
+
+            int sum = await accelerator.ReduceAsync<
+                int, global::ILGPU.Algorithms.ScanReduceOperations.AddInt32>(stream, input);
+            if (sum != expectedSum)
+                throw new Exception($"ReduceAsync<AddInt32> expected {expectedSum}, got {sum}");
+        });
+
+        // Verifies MemSetToZeroAsync: a kernel fills the buffer with nonzero values (unawaited
+        // dispatch), then MemSetToZeroAsync must order AFTER that kernel and zero the buffer.
+        // On Wasm the sync MemSetToZero would race the in-flight worker kernel (immediate SAB
+        // write bypassing the dispatch queue); the async variant drains first. WebGPU records
+        // the clear into the same encoder; desktop is stream-ordered. Meant to be skipped on
+        // WebGL (MemSet is a deferred CPU-side upload while readback reads the GPU/worker side).
+        // NOTE(review): no WebGL skip is visible in this method - confirm where it is applied.
+        [TestMethod]
+        public async Task MemSetToZeroAsyncTest() => await RunTest(async accelerator =>
+        {
+            const int count = 64;
+            using var buf = accelerator.Allocate1D<int>(count);
+            var fill = accelerator.LoadAutoGroupedStreamKernel<Index1D, ArrayView<int>>(
+                (i, v) => v[i] = i + 1);
+            fill((Index1D)count, buf.View); // unawaited dispatch
+            await buf.View.MemSetToZeroAsync(accelerator.DefaultStream);
+            var result = await buf.CopyToHostAsync<int>();
+            for (int i = 0; i < count; i++)
+                if (result[i] != 0)
+                    throw new Exception(
+                        $"MemSetToZeroAsync: index {i} = {result[i]}, expected 0 (zero-fill " +
+                        "did not order after the kernel)");
+        });
+
         // ILGPUReduceSmallTest removed — the main ILGPUReduceTest covers the same functionality.
         // The small test was a diagnostic that exposed a pre-existing WebGL GLSL codegen bug
         // (_idx undeclared identifier in Reduce kernel shader). Not a Wasm issue.