Skip to content

Commit b7ca1a4

Browse files
authored
Prefetch mmap'd weight blobs to eliminate page fault bottleneck (pytorch#18236)
Weight loading via update_constants_from_blob was achieving only 0.3-0.4 GB/s (vs. 8 GB/s hardware capability) because memcpy from mmap'd pages triggers synchronous page faults — each 16K page traps into the kernel for NVMe I/O. The fix is to call madvise(MADV_WILLNEED) on the weights blob early in Metal backend init, before writing/dlopen'ing the .so file. The kernel then prefaults the pages asynchronously during the ~200ms of other init work, so by the time memcpy runs the pages are already resident and throughput reaches 5-8 GB/s. Metal init time: ~25s -> ~9s (2.7x faster) on the int4 Voxtral model.
1 parent e4eeb98 commit b7ca1a4

1 file changed

Lines changed: 39 additions & 4 deletions

File tree

backends/apple/metal/runtime/metal_backend.cpp

Lines changed: 39 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,9 @@
1111
#include <executorch/runtime/core/error.h>
1212
#include <executorch/runtime/core/evalue.h>
1313
#include <executorch/runtime/core/exec_aten/util/tensor_util.h>
14+
#include <sys/mman.h>
1415
#include <unistd.h>
16+
#include <cerrno>
1517
#include <chrono>
1618
#include <cstdio>
1719
#include <cstdlib>
@@ -247,7 +249,41 @@ class ET_EXPERIMENTAL MetalBackend final
247249
ET_LOG(Info, "MetalBackend::init - so_blob_key: %s", so_blob_key.c_str());
248250

249251
const NamedDataMap* named_data_map = context.get_named_data_map();
250-
ET_LOG(Info, "MetalBackend::init - Got named data map: %p", named_data_map);
252+
ET_CHECK_OR_RETURN_ERROR(
253+
named_data_map != nullptr,
254+
Internal,
255+
"MetalBackend requires a NamedDataMap for weight loading");
256+
257+
// Prefetch the weights blob — trigger async readahead so pages are
258+
// resident by the time update_constants_from_blob memcpy's them.
259+
// This overlaps disk I/O with the .so write + dlopen (~200ms).
260+
std::string weights_blob_key =
261+
method_name.empty() ? "weights_blob" : method_name + "_weights_blob";
262+
{
263+
auto prefetch_buf = named_data_map->get_data(weights_blob_key.c_str());
264+
if (prefetch_buf.ok() && prefetch_buf->data() != nullptr) {
265+
// Align address down to page boundary (madvise requires it).
266+
uintptr_t addr = reinterpret_cast<uintptr_t>(prefetch_buf->data());
267+
size_t page_size = getpagesize();
268+
uintptr_t aligned_addr = addr & ~(page_size - 1);
269+
size_t aligned_size = prefetch_buf->size() + (addr - aligned_addr);
270+
int ret = madvise(
271+
reinterpret_cast<void*>(aligned_addr), aligned_size, MADV_WILLNEED);
272+
if (ret != 0) {
273+
ET_LOG(
274+
Info,
275+
"MetalBackend::init - madvise(MADV_WILLNEED) failed for %s: %s",
276+
weights_blob_key.c_str(),
277+
strerror(errno));
278+
} else {
279+
ET_LOG(
280+
Info,
281+
"MetalBackend::init - Prefetching %s (%.1f MB)",
282+
weights_blob_key.c_str(),
283+
prefetch_buf->size() / (1024.0 * 1024.0));
284+
}
285+
}
286+
}
251287

252288
ET_LOG(
253289
Info,
@@ -344,9 +380,8 @@ class ET_EXPERIMENTAL MetalBackend final
344380

345381
handle->container_handle = container_handle;
346382

347-
// Look into named data map for constant data
348-
std::string weights_blob_key =
349-
method_name.empty() ? "weights_blob" : method_name + "_weights_blob";
383+
// Look into named data map for constant data (key computed above for
384+
// prefetch)
350385
auto buffer_res = named_data_map->get_data(weights_blob_key.c_str());
351386
if (buffer_res.ok() && handle->update_constants_from_blob != nullptr) {
352387
ET_LOG(Info, "Found %s in named data map", weights_blob_key.c_str());

0 commit comments

Comments
 (0)