Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions backends/webgpu/runtime/WebGPUBackend.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,13 @@ using executorch::runtime::register_backend;
using executorch::runtime::Result;
using executorch::runtime::Span;

// Test-only global; overwritten on each init() call.
// Snapshot of the most recently built graph's memory statistics, captured in
// init() so native tests can inspect buffer-aliasing results without needing
// access to the graph handle itself.
static WebGPUMemoryStats s_last_memory_stats_for_testing;

// Test-only accessor for the stats recorded by the last init() call.
// Returns a copy. Not synchronized — only meaningful while a single graph
// is loaded at a time (see the matching note in WebGPUBackend.h).
WebGPUMemoryStats get_last_memory_stats() {
  return s_last_memory_stats_for_testing;
}

// Unconditionally reports the backend as available.
// NOTE(review): actual WebGPU device/adapter acquisition is presumably
// validated elsewhere (e.g. during init) — confirm, since this never probes
// the hardware.
bool WebGPUBackend::is_available() const {
  return true;
}
Expand Down Expand Up @@ -83,6 +90,8 @@ Result<DelegateHandle*> WebGPUBackend::init(
return Error::DelegateInvalidCompatibility;
}

s_last_memory_stats_for_testing = graph->memory_stats();

processed->Free();

return graph;
Expand Down
5 changes: 5 additions & 0 deletions backends/webgpu/runtime/WebGPUBackend.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,17 @@

#pragma once

#include <executorch/backends/webgpu/runtime/WebGPUGraph.h>
#include <executorch/runtime/backend/interface.h>

namespace executorch {
namespace backends {
namespace webgpu {

// Test-only: returns memory stats from the most recently initialized graph.
// Not thread-safe; only valid when a single graph is loaded at a time.
WebGPUMemoryStats get_last_memory_stats();

class WebGPUBackend final : public ::executorch::runtime::BackendInterface {
public:
~WebGPUBackend() override = default;
Expand Down
99 changes: 76 additions & 23 deletions backends/webgpu/runtime/WebGPUGraph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
#include <executorch/backends/webgpu/runtime/WebGPUDevice.h>
#include <webgpu/wgpu.h>

#include <executorch/runtime/platform/assert.h>

#include <cstring>
#include <stdexcept>

Expand Down Expand Up @@ -50,9 +52,15 @@ size_t vk_datatype_size(vkgraph::VkDataType dtype) {
WebGPUGraph::WebGPUGraph() = default;

WebGPUGraph::~WebGPUGraph() {
for (auto& t : tensors_) {
if (t.buffer) {
wgpuBufferRelease(t.buffer);
for (size_t i = 0; i < tensors_.size(); i++) {
if (tensors_[i].buffer &&
(i >= tensor_mem_obj_ids_.size() || tensor_mem_obj_ids_[i] < 0)) {
wgpuBufferRelease(tensors_[i].buffer);
}
}
for (auto& buf : shared_buffers_) {
if (buf) {
wgpuBufferRelease(buf);
}
}
for (auto& buf : output_staging_buffers_) {
Expand Down Expand Up @@ -94,6 +102,7 @@ void WebGPUGraph::build(
const int num_vals = values ? values->size() : 0;
value_types_.resize(num_vals, ValueType::Null);
tensors_.resize(num_vals);
tensor_mem_obj_ids_.resize(num_vals, -1);
ints_.resize(num_vals, 0);
doubles_.resize(num_vals, 0.0);
bools_.resize(num_vals, false);
Expand Down Expand Up @@ -121,27 +130,41 @@ void WebGPUGraph::build(
}
tensor.nbytes = numel * vk_datatype_size(vk_tensor->datatype());

// Create GPU buffer
WGPUBufferDescriptor buf_desc = {};
buf_desc.size = tensor.nbytes > 0 ? tensor.nbytes : 4;
buf_desc.usage = WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst |
WGPUBufferUsage_CopySrc;
buf_desc.mappedAtCreation = false;
tensor.buffer = wgpuDeviceCreateBuffer(device_, &buf_desc);

// Upload constant data if this tensor has a constant_id
int constant_id = vk_tensor->constant_id();
if (constant_id >= 0 && constant_data) {
const auto* constants = graph->constants();
if (constants && constant_id < static_cast<int>(constants->size())) {
const auto* vk_bytes = constants->Get(constant_id);
// Only upload from embedded bytes (not named data map)
if (vk_bytes->offset() != UINT64_MAX) {
const uint8_t* src = constant_data + vk_bytes->offset();
wgpuQueueWriteBuffer(
queue_, tensor.buffer, 0, src, tensor.nbytes);
int mem_obj_id = vk_tensor->mem_obj_id();

// Constants always get dedicated buffers regardless of mem_obj_id
if (constant_id >= 0 || mem_obj_id < 0) {
tensor_mem_obj_ids_[i] = -1;
WGPUBufferDescriptor buf_desc = {};
ET_CHECK_MSG(tensor.nbytes > 0, "Tensor has zero bytes");
buf_desc.size = tensor.nbytes;
buf_desc.usage = WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst |
WGPUBufferUsage_CopySrc;
buf_desc.mappedAtCreation = false;
tensor.buffer = wgpuDeviceCreateBuffer(device_, &buf_desc);

if (constant_id >= 0 && constant_data) {
const auto* constants = graph->constants();
if (constants &&
constant_id < static_cast<int>(constants->size())) {
const auto* vk_bytes = constants->Get(constant_id);
if (vk_bytes->offset() != UINT64_MAX) {
const uint8_t* src = constant_data + vk_bytes->offset();
wgpuQueueWriteBuffer(
queue_, tensor.buffer, 0, src, tensor.nbytes);
}
}
}
} else {
// Shared buffer: track required size, defer allocation to pass 2
tensor_mem_obj_ids_[i] = mem_obj_id;
size_t id = static_cast<size_t>(mem_obj_id);
if (id >= shared_buffer_sizes_.size()) {
shared_buffer_sizes_.resize(id + 1, 0);
}
shared_buffer_sizes_[id] =
std::max(shared_buffer_sizes_[id], tensor.nbytes);
}
break;
}
Expand All @@ -166,6 +189,24 @@ void WebGPUGraph::build(
}
}

// Allocate shared buffers and assign to tensors
shared_buffers_.resize(shared_buffer_sizes_.size(), nullptr);
for (size_t id = 0; id < shared_buffer_sizes_.size(); id++) {
WGPUBufferDescriptor buf_desc = {};
ET_CHECK_MSG(shared_buffer_sizes_[id] > 0, "Shared buffer has zero bytes");
buf_desc.size = shared_buffer_sizes_[id];
buf_desc.usage = WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst |
WGPUBufferUsage_CopySrc;
buf_desc.mappedAtCreation = false;
shared_buffers_[id] = wgpuDeviceCreateBuffer(device_, &buf_desc);
}
for (int i = 0; i < num_vals; i++) {
int mid = tensor_mem_obj_ids_[i];
if (mid >= 0) {
tensors_[i].buffer = shared_buffers_[mid];
}
}

// Phase 2: Record input and output IDs
const auto* fb_input_ids = graph->input_ids();
if (fb_input_ids) {
Expand All @@ -181,7 +222,8 @@ void WebGPUGraph::build(

// Create staging buffer for output readback
WGPUBufferDescriptor staging_desc = {};
staging_desc.size = tensors_[oid].nbytes > 0 ? tensors_[oid].nbytes : 4;
ET_CHECK_MSG(tensors_[oid].nbytes > 0, "Output tensor has zero bytes");
staging_desc.size = tensors_[oid].nbytes;
staging_desc.usage = WGPUBufferUsage_MapRead | WGPUBufferUsage_CopyDst;
staging_desc.mappedAtCreation = false;
output_staging_buffers_.push_back(
Expand Down Expand Up @@ -315,10 +357,21 @@ WebGPUMemoryStats WebGPUGraph::memory_stats() const {
WebGPUMemoryStats stats;
for (size_t i = 0; i < value_types_.size(); i++) {
if (value_types_[i] == ValueType::Tensor && tensors_[i].nbytes > 0) {
stats.tensor_buffer_bytes += tensors_[i].nbytes;
stats.num_tensors++;
// Shared tensors are tracked via shared_buffer_sizes_
bool is_shared =
i < tensor_mem_obj_ids_.size() && tensor_mem_obj_ids_[i] >= 0;
if (!is_shared) {
stats.unshared_tensor_buffer_bytes += tensors_[i].nbytes;
}
}
}
for (size_t s : shared_buffer_sizes_) {
stats.shared_buffer_bytes += s;
}
stats.num_shared_objects = static_cast<int>(shared_buffers_.size());
stats.tensor_buffer_bytes =
stats.shared_buffer_bytes + stats.unshared_tensor_buffer_bytes;
for (size_t i = 0; i < output_ids_.size(); i++) {
stats.staging_buffer_bytes += tensors_[output_ids_[i]].nbytes;
}
Expand Down
8 changes: 8 additions & 0 deletions backends/webgpu/runtime/WebGPUGraph.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@ struct WebGPUDispatch {

struct WebGPUMemoryStats {
size_t tensor_buffer_bytes = 0;
size_t shared_buffer_bytes = 0;
int num_shared_objects = 0;
size_t unshared_tensor_buffer_bytes = 0;
size_t staging_buffer_bytes = 0;
size_t uniform_buffer_bytes = 0;
int num_tensors = 0;
Expand Down Expand Up @@ -134,6 +137,11 @@ class WebGPUGraph {
std::vector<int> input_ids_;
std::vector<int> output_ids_;

// Memory aliasing: tensors with the same mem_obj_id share a WGPUBuffer.
std::vector<int> tensor_mem_obj_ids_;
std::vector<WGPUBuffer> shared_buffers_;
std::vector<size_t> shared_buffer_sizes_;

// Staging buffers for reading back outputs (MapRead | CopyDst).
std::vector<WGPUBuffer> output_staging_buffers_;

Expand Down
15 changes: 15 additions & 0 deletions backends/webgpu/test/ops/add/test_add.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
z = x + y
z = z + x
z = z + y
z = z + x
z = z + y
return z


Expand Down Expand Up @@ -97,5 +99,18 @@ def export_add_model(output_path: str) -> None:
print(f"Exported {output_path}")


def export_chained_add_model(output_path: str) -> None:
    """Export a chained add model (z=x+y; z=z+x; z=z+y; z=z+x; z=z+y) to .pte for memory aliasing testing."""
    module = AddChainedModule()
    sample_inputs = (torch.randn(1024, 1024), torch.randn(1024, 1024))

    # Export to an ExportedProgram, then lower through the partitioner.
    exported = torch.export.export(module, sample_inputs)
    lowered = to_edge_transform_and_lower(
        exported, partitioner=[VulkanPartitioner()]
    )
    program = lowered.to_executorch()

    with open(output_path, "wb") as f:
        f.write(program.buffer)
    print(f"Exported {output_path}")


if __name__ == "__main__":
unittest.main()
7 changes: 5 additions & 2 deletions backends/webgpu/test/test_build_webgpu.sh
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,14 @@ $PYTHON_EXECUTABLE -m pytest "${SCRIPT_DIR}/ops/add/test_add.py" -v

# ── Step 2: Export .pte model ─────────────────────────────────────────────────

echo "=== Step 2: Export test model ==="
echo "=== Step 2: Export test models ==="
PTE_MODEL="/tmp/webgpu_add_test.pte"
PTE_CHAINED_MODEL="/tmp/webgpu_chained_add_test.pte"
cd "${EXECUTORCH_ROOT}"
$PYTHON_EXECUTABLE -c "
from executorch.backends.webgpu.test.ops.add.test_add import export_add_model
from executorch.backends.webgpu.test.ops.add.test_add import export_add_model, export_chained_add_model
export_add_model('${PTE_MODEL}')
export_chained_add_model('${PTE_CHAINED_MODEL}')
"

# ── Step 3: Native build + test (wgpu-native) ────────────────────────────────
Expand Down Expand Up @@ -60,6 +62,7 @@ cmake --build "${NATIVE_BUILD_DIR}" --target webgpu_native_test -j${NPROC}

echo "=== Step 4: Run native test ==="
WEBGPU_TEST_MODEL="${PTE_MODEL}" \
WEBGPU_TEST_CHAINED_MODEL="${PTE_CHAINED_MODEL}" \
"${NATIVE_BUILD_DIR}/backends/webgpu/webgpu_native_test"

echo "=== Done ==="
96 changes: 96 additions & 0 deletions backends/webgpu/test/test_webgpu_native.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
* LICENSE file in the root directory of this source tree.
*/

#include <executorch/backends/webgpu/runtime/WebGPUBackend.h>
#include <executorch/backends/webgpu/runtime/WebGPUDevice.h>
#include <executorch/extension/module/module.h>
#include <executorch/extension/tensor/tensor.h>
Expand Down Expand Up @@ -75,6 +76,92 @@ static bool test_single_add(const std::string& model_path) {
return true;
}

// Test: loads the chained-add model (computes z = 3x + 3y over 1024x1024
// inputs), verifies numerical correctness on the GPU path, then checks that
// memory aliasing (tensors sharing WGPUBuffers via mem_obj_id) is active and
// actually reduced tensor-buffer allocation versus a naive one-buffer-per-
// tensor baseline.
//
// @param model_path  Path to the exported chained-add .pte file.
// @return true on success; prints FAIL diagnostics and returns false otherwise.
static bool test_chained_add_memory(const std::string& model_path) {
  printf("\n--- Test: chained add memory aliasing (1024x1024) ---\n");

  Module module(model_path);
  auto err = module.load_forward();
  if (err != Error::Ok) {
    printf(
        "FAIL: could not load forward method (error %d)\n",
        static_cast<int>(err));
    return false;
  }
  printf("Model loaded: %s\n", model_path.c_str());

  // Stats were captured by WebGPUBackend::init() during load_forward() above.
  auto stats = get_last_memory_stats();
  printf("Memory stats after build:\n");
  printf("  num_tensors: %d\n", stats.num_tensors);
  printf("  num_shared_objects: %d\n", stats.num_shared_objects);
  printf("  shared_buffer_bytes: %zu\n", stats.shared_buffer_bytes);
  printf(
      "  unshared_tensor_buffer_bytes: %zu\n",
      stats.unshared_tensor_buffer_bytes);
  printf("  tensor_buffer_bytes: %zu\n", stats.tensor_buffer_bytes);
  printf("  total_bytes: %zu\n", stats.total_bytes());

  constexpr int dim = 1024;
  constexpr int size = dim * dim;
  constexpr float kTolerance = 1e-3f; // matches the message printed below

  // Deterministic, non-trivial inputs so the checksum of the result is stable.
  std::vector<float> x_data(size);
  std::vector<float> y_data(size);
  for (int i = 0; i < size; i++) {
    x_data[i] = static_cast<float>(i % 100) * 0.01f;
    y_data[i] = static_cast<float>(i % 50) * 0.02f;
  }

  auto x = make_tensor_ptr({dim, dim}, std::vector<float>(x_data));
  auto y = make_tensor_ptr({dim, dim}, std::vector<float>(y_data));

  auto result = module.forward({EValue(x), EValue(y)});
  if (!result.ok()) {
    printf(
        "FAIL: forward failed (error %d)\n",
        static_cast<int>(result.error()));
    return false;
  }

  const auto& outputs = result.get();
  if (outputs.empty() || !outputs[0].isTensor()) {
    printf("FAIL: no tensor output\n");
    return false;
  }

  // z=x+y; z=z+x=2x+y; z=z+y=2x+2y; z=z+x=3x+2y; z=z+y=3x+3y
  const auto& out_tensor = outputs[0].toTensor();
  const float* out_data = out_tensor.const_data_ptr<float>();

  float max_error = 0.0f;
  for (int i = 0; i < size; i++) {
    float expected = 3.0f * x_data[i] + 3.0f * y_data[i];
    max_error = std::max(max_error, std::abs(out_data[i] - expected));
  }

  printf("Max error: %e\n", max_error);
  if (max_error > kTolerance) {
    printf("FAIL: max error exceeds tolerance 1e-3\n");
    return false;
  }

  // Aliasing must actually be in effect for this test to mean anything.
  if (stats.num_shared_objects <= 0) {
    printf("FAIL: expected shared objects but got none\n");
    return false;
  }
  printf(
      "PASS: memory aliasing is active (%d shared objects)\n",
      stats.num_shared_objects);

  // Naive baseline assumes every graph tensor got its own dedicated
  // 1024x1024 float buffer; aliasing should land strictly below that.
  size_t naive_bytes =
      static_cast<size_t>(stats.num_tensors) * dim * dim * sizeof(float);
  printf("Naive tensor bytes: %zu\n", naive_bytes);
  printf("Actual tensor bytes: %zu\n", stats.tensor_buffer_bytes);
  if (stats.tensor_buffer_bytes >= naive_bytes) {
    printf("FAIL: expected memory savings but actual >= naive\n");
    return false;
  }
  printf("PASS: memory savings from aliasing confirmed\n");

  printf("PASS: chained add memory test\n");
  return true;
}

int main(int argc, char** argv) {
std::string model_path = "webgpu_add_test.pte";
if (argc > 1) {
Expand All @@ -84,6 +171,11 @@ int main(int argc, char** argv) {
model_path = env;
}

std::string chained_model_path;
if (const char* env = std::getenv("WEBGPU_TEST_CHAINED_MODEL")) {
chained_model_path = env;
}

WebGPUContext ctx;
try {
ctx = create_webgpu_context();
Expand All @@ -97,6 +189,10 @@ int main(int argc, char** argv) {

bool ok = test_single_add(model_path);

if (!chained_model_path.empty()) {
ok = test_chained_add_memory(chained_model_path) && ok;
}

set_default_webgpu_context(nullptr);
destroy_webgpu_context(ctx);

Expand Down
Loading