Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 47 additions & 6 deletions ILGPU.Algorithms/ReductionExtensions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -317,6 +317,7 @@ public static T Reduce<T, TReduction>(
where T : unmanaged
where TReduction : struct, IScanReduceOperation<T>
{
EnsureSyncReadbackSupported(accelerator);
using var output = accelerator.Allocate1D<T>(1);
accelerator.Reduce<T, TReduction>(stream, input, output.View);
T result = default;
Expand All @@ -336,13 +337,23 @@ public static T Reduce<T, TReduction>(
/// Uses the internal cache to realize a temporary output buffer.
/// </remarks>
/// <returns>The reduced value.</returns>
// NOTE(review): this span comes from a rendered diff, so the pre-change
// signature line and the pre-change `=> Task.Run(...)` expression body appear
// next to the post-change async implementation. Confirm against the real file.
public static Task<T> ReduceAsync<T, TReduction>(
public static async Task<T> ReduceAsync<T, TReduction>(
this Accelerator accelerator,
AcceleratorStream stream,
ArrayView<T> input)
where T : unmanaged
where TReduction : struct, IScanReduceOperation<T> =>
Task.Run(() => accelerator.Reduce<T, TReduction>(stream, input));
where TReduction : struct, IScanReduceOperation<T>
{
// Why this no longer wraps the synchronous Reduce in Task.Run: the
// synchronous overload finishes with a synchronous CopyToCPU, which throws
// on WebGPU (no synchronous readback) and reads stale/zero data on
// Wasm/WebGL because the reduction kernel is still in flight.
//
// Instead: dispatch the reduction into a one-element buffer, then await
// the asynchronous readback (ArrayView.CopyToCPUAsync) on the same stream.
// The temporary buffer is released by the using declaration when the
// method completes, i.e. after the awaited readback has finished.
using var output = accelerator.Allocate1D<T>(1);
accelerator.Reduce<T, TReduction>(stream, input, output.View);
// ConfigureAwait(false): the continuation does not need the caller's context.
var result = await output.View.CopyToCPUAsync(stream).ConfigureAwait(false);
// The readback yields an array with one entry per view element; the view
// has exactly one element, which is the reduced value.
return result[0];
}

/// <summary>
/// Performs a reduction using a reduction logic.
Expand Down Expand Up @@ -388,13 +399,36 @@ public static T Reduce<T, TStride, TReduction>(
where TStride : struct, IStride1D
where TReduction : struct, IScanReduceOperation<T>
{
EnsureSyncReadbackSupported(accelerator);
using var output = accelerator.Allocate1D<T>(1);
accelerator.Reduce<T, TStride, TReduction>(stream, input, output.View);
T result = default;
output.View.CopyToCPU(stream, ref result, 1);
return result;
}

/// <summary>
/// Rejects the synchronous <c>Reduce</c>-to-scalar overloads on browser backends.
/// Those backends offer no usable synchronous GPU-&gt;CPU readback: WebGPU throws
/// on a synchronous readback, while Wasm/WebGL would return stale data because
/// the reduction kernel may still be in flight (a silent wrong result).
/// <c>ReduceAsync</c> is the supported path on those backends.
/// </summary>
/// <param name="accelerator">The accelerator whose backend type is checked.</param>
/// <exception cref="NotSupportedException">
/// Thrown when the accelerator type is Wasm, WebGL or WebGPU.
/// </exception>
private static void EnsureSyncReadbackSupported(Accelerator accelerator)
{
    var backend = accelerator.AcceleratorType;
    bool isBrowserBackend =
        backend == AcceleratorType.Wasm ||
        backend == AcceleratorType.WebGL ||
        backend == AcceleratorType.WebGPU;

    // Every other backend may use the synchronous readback path.
    if (!isBrowserBackend)
        return;

    throw new NotSupportedException(
        $"Synchronous Reduce-to-scalar is not supported on the " +
        $"{accelerator.AcceleratorType} backend: browser backends have no " +
        "synchronous GPU->CPU readback (WebGPU throws; Wasm/WebGL would read " +
        "stale data before in-flight kernels finish). Use ReduceAsync instead.");
}

/// <summary>
/// Performs a reduction using a reduction logic.
/// </summary>
Expand All @@ -408,13 +442,20 @@ public static T Reduce<T, TStride, TReduction>(
/// Uses the internal cache to realize a temporary output buffer.
/// </remarks>
/// <returns>The reduced value.</returns>
// NOTE(review): this span comes from a rendered diff, so the pre-change
// signature line and the pre-change `=> Task.Run(...)` expression body appear
// next to the post-change async implementation. Confirm against the real file.
public static Task<T> ReduceAsync<T, TStride, TReduction>(
public static async Task<T> ReduceAsync<T, TStride, TReduction>(
this Accelerator accelerator,
AcceleratorStream stream,
ArrayView1D<T, TStride> input)
where T : unmanaged
where TStride : struct, IStride1D
where TReduction : struct, IScanReduceOperation<T> =>
Task.Run(() => accelerator.Reduce<T, TStride, TReduction>(stream, input));
where TReduction : struct, IScanReduceOperation<T>
{
// Dispatches the strided reduction into a one-element buffer and awaits
// the asynchronous readback on the same stream. Wrapping the synchronous
// Reduce in Task.Run was incorrect on browser backends: its synchronous
// readback throws on WebGPU and can observe stale data on Wasm/WebGL.
// The temporary buffer is released by the using declaration once the
// awaited readback has completed.
using var output = accelerator.Allocate1D<T>(1);
accelerator.Reduce<T, TStride, TReduction>(stream, input, output.View);
// ConfigureAwait(false): the continuation does not need the caller's context.
var result = await output.View.CopyToCPUAsync(stream).ConfigureAwait(false);
// The one-element view reads back as a one-element array: the reduced value.
return result[0];
}
}
}
24 changes: 24 additions & 0 deletions ILGPU/Runtime/Accelerator.cs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
using System.Diagnostics;
using System.IO;
using System.Threading;
using System.Threading.Tasks;

namespace ILGPU.Runtime
{
Expand Down Expand Up @@ -271,6 +272,29 @@ public void Synchronize()
/// </summary>
protected abstract void SynchronizeInternal();

/// <summary>
/// Asynchronously synchronizes all pending operations on this accelerator.
/// </summary>
/// <returns>
/// A task that completes once all submitted work has finished. If the
/// synchronization fails, the returned task is faulted with that exception.
/// </returns>
/// <remarks>
/// The default implementation runs the synchronous <see cref="Synchronize"/>
/// on the calling thread and returns an already-completed task, which is correct
/// for backends whose <see cref="Synchronize"/> blocks until the queue drains
/// (CUDA, OpenCL, CPU). A failure raised by <see cref="Synchronize"/> is reported
/// through the returned task rather than thrown from this call, so that callers
/// observe errors the same way as with
/// <see cref="AcceleratorStream.SynchronizeAsync"/>.
/// Single-threaded browser backends (Wasm, WebGPU, WebGL) cannot block-wait —
/// their synchronous <see cref="Synchronize"/> is a non-blocking flush / no-op —
/// so they MUST override this to await their real drain (worker dispatch
/// completion, queue.OnSubmittedWorkDone, GL fence). Algorithm and consumer code
/// that needs a host-visible result after an unawaited dispatch must
/// <c>await</c> this rather than calling the synchronous
/// <see cref="Synchronize"/>, which silently does nothing on those backends.
/// </remarks>
public virtual Task SynchronizeAsync()
{
    try
    {
        Synchronize();
        return Task.CompletedTask;
    }
    catch (System.Exception exception)
    {
        // A Task-returning method should deliver runtime failures through the
        // task, even when it happens to complete synchronously.
        return Task.FromException(exception);
    }
}

/// <summary>
/// Clears all internal caches.
/// </summary>
Expand Down
14 changes: 13 additions & 1 deletion ILGPU/Runtime/AcceleratorStream.cs
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,19 @@ protected AcceleratorStream(Accelerator accelerator)
/// Synchronizes all queued operations asynchronously.
/// </summary>
/// <returns>A task object to wait for.</returns>
public Task SynchronizeAsync() => Task.Run(synchronizeAction);
/// <remarks>
/// The default implementation offloads the blocking
/// <see cref="Synchronize"/> call to a thread-pool thread, which is correct
/// for backends whose <see cref="Synchronize"/> genuinely blocks until the
/// queue drains (CUDA, OpenCL, CPU). Single-threaded browser backends
/// (Wasm, WebGPU, WebGL) cannot block-wait and their synchronous
/// <see cref="Synchronize"/> is a non-blocking flush / no-op; those streams
/// MUST override this to await their real async drain. Algorithm code that
/// needs a host-visible result after an unawaited dispatch must await this
/// (or <see cref="Accelerator.SynchronizeAsync"/>) rather than calling the
/// synchronous <see cref="Synchronize"/>.
/// </remarks>
public virtual Task SynchronizeAsync()
{
    // Default: run the stored synchronize action on a thread-pool thread so the
    // caller is not blocked; streams that cannot block-wait override this.
    return Task.Run(synchronizeAction);
}

/// <summary>
/// Makes the associated accelerator the current one for this thread and
Expand Down
52 changes: 52 additions & 0 deletions ILGPU/Runtime/ArrayViewExtensions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
using System;
using System.Diagnostics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Threading.Tasks;

namespace ILGPU.Runtime
{
Expand Down Expand Up @@ -926,6 +928,56 @@ public static void CopyToCPU<T, TView>(
stream.Synchronize();
}

/// <summary>
/// Reads the contents of the given view back to the host asynchronously and
/// returns them as a managed array. The readback is delegated to
/// <see cref="MemoryBuffer.CopyToRawAsync"/> on the view's backing buffer, which
/// is the overridable hook that awaits the accelerator's asynchronous drain
/// before reading. In contrast to the synchronous <c>CopyToCPU</c>, this is
/// intended to return correct data on Wasm (in-flight worker kernels are drained
/// first) and not to throw on WebGPU / WebGL (which have no synchronous
/// GPU-&gt;CPU readback).
/// </summary>
/// <typeparam name="T">The element type.</typeparam>
/// <param name="source">The source view to read back.</param>
/// <param name="stream">The used accelerator stream.</param>
/// <returns>A task producing the view's <c>Length</c> elements.</returns>
/// <exception cref="InvalidOperationException">
/// Thrown (through the returned task) when the view has no backing buffer.
/// </exception>
[NotInsideKernel]
public static async Task<T[]> CopyToCPUAsync<T>(
    this ArrayView<T> source,
    AcceleratorStream stream)
    where T : unmanaged
{
    var contiguousView = (IContiguousArrayView)source;
    var backingBuffer = contiguousView.Buffer;
    if (backingBuffer is null)
    {
        throw new InvalidOperationException(
            "ArrayView has no backing buffer.");
    }

    // Nothing to transfer for an empty view.
    long elementCount = source.Length;
    if (elementCount == 0)
        return Array.Empty<T>();

    // Translate the typed view into a raw byte range of the backing buffer.
    int bytesPerElement = ((IArrayView)source).ElementSize;
    long firstByte = contiguousView.IndexInBytes;
    long totalBytes = elementCount * bytesPerElement;

    byte[] rawBytes = await backingBuffer
        .CopyToRawAsync(stream, firstByte, totalBytes)
        .ConfigureAwait(false);

    // Reinterpret the raw bytes as elements and copy them into the result.
    var hostData = new T[elementCount];
    MemoryMarshal.Cast<byte, T>(rawBytes).CopyTo(hostData);
    return hostData;
}

/// <summary>
/// Asynchronous host readback for an <see cref="ArrayView1D{T, TStride}"/>;
/// forwards to
/// <see cref="CopyToCPUAsync{T}(ArrayView{T}, AcceleratorStream)"/> using the
/// view's <c>BaseView</c>.
/// </summary>
/// <typeparam name="T">The element type.</typeparam>
/// <typeparam name="TStride">The 1D stride type of the view.</typeparam>
/// <param name="source">The source view to read back.</param>
/// <param name="stream">The used accelerator stream.</param>
/// <returns>The task produced by the base-view readback.</returns>
[NotInsideKernel]
public static Task<T[]> CopyToCPUAsync<T, TStride>(
    this ArrayView1D<T, TStride> source,
    AcceleratorStream stream)
    where T : unmanaged
    where TStride : struct, IStride1D
{
    ArrayView<T> baseView = source.BaseView;
    return baseView.CopyToCPUAsync(stream);
}

/// <summary>
/// Copies from the CPU source address into the given target view while
/// synchronizing the current accelerator stream.
Expand Down
46 changes: 46 additions & 0 deletions ILGPU/Runtime/MemoryBuffer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,12 @@
// ---------------------------------------------------------------------------------------

using ILGPU.Resources;
using ILGPU.Runtime.CPU;
using System;
using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;
using System.Runtime.CompilerServices;
using System.Threading.Tasks;

namespace ILGPU.Runtime
{
Expand Down Expand Up @@ -147,6 +149,50 @@ protected internal abstract void CopyTo(
in ArrayView<byte> sourceView,
in ArrayView<byte> targetView);

/// <summary>
/// Asynchronously copies a raw byte range of this buffer back to the host.
/// </summary>
/// <param name="stream">The used accelerator stream.</param>
/// <param name="sourceOffsetInBytes">The source offset in bytes.</param>
/// <param name="lengthInBytes">The number of bytes to read back.</param>
/// <returns>A task producing the copied bytes.</returns>
/// <exception cref="ArgumentNullException">
/// Reported through the task when <paramref name="stream"/> is null.
/// </exception>
/// <exception cref="ArgumentOutOfRangeException">
/// Reported through the task when <paramref name="sourceOffsetInBytes"/> or
/// <paramref name="lengthInBytes"/> is negative.
/// </exception>
/// <remarks>
/// This is the overridable async GPU-&gt;CPU readback hook. The default
/// implementation awaits <see cref="AcceleratorStream.SynchronizeAsync"/>
/// (the real drain on backends that override it), issues the synchronous
/// <see cref="CopyTo(AcceleratorStream, long, in ArrayView{byte})"/> into a
/// pinned managed array, and then awaits
/// <see cref="AcceleratorStream.SynchronizeAsync"/> a second time so that a
/// stream-ordered copy has completed before the bytes are handed out. The
/// second drain mirrors the synchronous <c>CopyToCPU</c>, which synchronizes
/// the stream after issuing its copy. Browser backends MUST override this
/// because their synchronous <see cref="CopyTo"/> either reads stale data
/// before worker kernels finish (Wasm) or throws because synchronous GPU-&gt;CPU
/// readback is impossible (WebGPU / WebGL); each provides a true async readback
/// (SharedArrayBuffer read after drain, <c>mapAsync</c>, GL worker readback).
/// Prefer the typed <c>ArrayView&lt;T&gt;.CopyToCPUAsync</c> extension over
/// calling this directly.
/// </remarks>
protected internal virtual async Task<byte[]> CopyToRawAsync(
    AcceleratorStream stream,
    long sourceOffsetInBytes,
    long lengthInBytes)
{
    if (stream is null)
        throw new ArgumentNullException(nameof(stream));
    if (sourceOffsetInBytes < 0)
        throw new ArgumentOutOfRangeException(nameof(sourceOffsetInBytes));
    if (lengthInBytes < 0)
        throw new ArgumentOutOfRangeException(nameof(lengthInBytes));

    // Real drain first. Overridden to a true async wait on browser backends;
    // offloads the blocking Synchronize on CUDA / OpenCL / CPU.
    await stream.SynchronizeAsync().ConfigureAwait(false);

    var result = new byte[lengthInBytes];
    if (lengthInBytes == 0)
        return result;

    // The copy targets the array through a raw reference and may still be in
    // flight while this method is suspended, so the array must not be moved by
    // the GC until the copy has completed.
    var pin = System.Runtime.InteropServices.GCHandle.Alloc(
        result,
        System.Runtime.InteropServices.GCHandleType.Pinned);
    try
    {
        using var cpuBuffer = CPUMemoryBuffer.Create(
            Accelerator,
            ref result[0],
            lengthInBytes);
        CopyTo(stream, sourceOffsetInBytes, cpuBuffer.AsRawArrayView());

        // Wait for the copy itself: without this the bytes could be returned
        // (and the CPU wrapper disposed) before a stream-ordered copy finished.
        await stream.SynchronizeAsync().ConfigureAwait(false);
    }
    finally
    {
        pin.Free();
    }

    return result;
}

/// <summary>
/// Copies elements from the source view to the current buffer.
/// </summary>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
<ItemGroup>
<ProjectReference Include="..\SpawnDev.ILGPU\SpawnDev.ILGPU.csproj" />
<ProjectReference Include="..\SpawnDev.ILGPU.P2P\SpawnDev.ILGPU.P2P.csproj" />
<ProjectReference Include="..\..\..\SpawnDev.ILGPU.ML\SpawnDev.ILGPU.ML\SpawnDev.ILGPU.ML\SpawnDev.ILGPU.ML.csproj" />
<ProjectReference Include="..\ILGPU\ILGPU.csproj" />
<ProjectReference Include="..\ILGPU.Algorithms\ILGPU.Algorithms.csproj" />
<PackageReference Include="SpawnDev.UnitTesting.Blazor" Version="2.5.3" />
Expand Down
59 changes: 59 additions & 0 deletions SpawnDev.ILGPU.Demo.Shared/UnitTests/BackendTestBase.Tests2.cs
Original file line number Diff line number Diff line change
Expand Up @@ -439,6 +439,65 @@ public async Task ILGPUReduceTest() => await RunTest(async accelerator =>
throw new Exception($"Reduce<AddInt32> expected {expectedSum2}, got {sumResult[0]}");
});

// End-to-end check of the asynchronous ReduceAsync API (dispatch -> async drain ->
// async readback). Before the 2026-05-29 async/sync fix, ReduceAsync was
// Task.Run(() => sync Reduce): the inner synchronous CopyToCPU THREW on WebGPU
// (no sync GPU->CPU readback) and returned STALE/zero data on Wasm (reduction
// kernel still in flight). It now goes through the virtual
// AcceleratorStream/Accelerator.SynchronizeAsync drain and the
// ArrayView.CopyToCPUAsync readback, and is expected to produce correct scalars
// on every backend.
[TestMethod]
public async Task ILGPUReduceAsyncTest() => await RunTest(async accelerator =>
{
    const int count = 256;

    // Host input: the integers 1..256.
    var values = new int[count];
    for (int index = 0; index < values.Length; ++index)
    {
        values[index] = index + 1;
    }

    using var deviceInput = accelerator.Allocate1D(values);
    var stream = accelerator.DefaultStream;
    var inputView = deviceInput.View.BaseView;

    // Maximum of 1..256.
    int maxResult = await accelerator.ReduceAsync<
        int, global::ILGPU.Algorithms.ScanReduceOperations.MaxInt32>(
        stream, inputView);
    if (maxResult != 256)
    {
        throw new Exception($"ReduceAsync<MaxInt32> expected 256, got {maxResult}");
    }

    // Minimum of 1..256.
    int minResult = await accelerator.ReduceAsync<
        int, global::ILGPU.Algorithms.ScanReduceOperations.MinInt32>(
        stream, inputView);
    if (minResult != 1)
    {
        throw new Exception($"ReduceAsync<MinInt32> expected 1, got {minResult}");
    }

    // Sum of 1..256 = n(n+1)/2 = 32896.
    int expectedSum = count * (count + 1) / 2;
    int sumResult = await accelerator.ReduceAsync<
        int, global::ILGPU.Algorithms.ScanReduceOperations.AddInt32>(
        stream, inputView);
    if (sumResult != expectedSum)
    {
        throw new Exception($"ReduceAsync<AddInt32> expected {expectedSum}, got {sumResult}");
    }
});

// Checks that MemSetToZeroAsync is ordered AFTER a previously dispatched kernel:
// the kernel writes nonzero values (dispatch is not awaited), then
// MemSetToZeroAsync must leave the buffer zeroed. On Wasm the synchronous
// MemSetToZero would race the in-flight worker kernel (immediate SAB write that
// bypasses the dispatch queue); the async variant drains first. WebGPU records
// the clear into the same encoder; desktop backends are stream-ordered.
// Skipped on WebGL (MemSet is a deferred CPU-side upload — readback reads the
// GPU/worker side, so this CPU-fill pattern isn't meaningful there).
// NOTE(review): no WebGL skip is visible in this method body; presumably it is
// applied by RunTest or a backend-specific override — confirm.
[TestMethod]
public async Task MemSetToZeroAsyncTest() => await RunTest(async accelerator =>
{
    const int count = 64;
    using var buf = accelerator.Allocate1D<int>(count);

    // Kernel that stores index + 1 (always nonzero) into every element.
    var fillKernel = accelerator.LoadAutoGroupedStreamKernel<Index1D, ArrayView<int>>(
        (i, v) => v[i] = i + 1);

    // Dispatch without awaiting completion, then zero-fill on the same stream.
    fillKernel((Index1D)count, buf.View);
    await buf.View.MemSetToZeroAsync(accelerator.DefaultStream);

    // Every element must read back as zero.
    var result = await buf.CopyToHostAsync<int>();
    for (int i = 0; i < count; i++)
    {
        if (result[i] == 0)
            continue;

        throw new Exception(
            $"MemSetToZeroAsync: index {i} = {result[i]}, expected 0 (zero-fill " +
            "did not order after the kernel)");
    }
});

// ILGPUReduceSmallTest removed — the main ILGPUReduceTest covers the same functionality.
// The small test was a diagnostic that exposed a pre-existing WebGL GLSL codegen bug
// (_idx undeclared identifier in Reduce kernel shader). Not a Wasm issue.
Expand Down
Loading
Loading