Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions c_api/gpu/DeviceUtils_c.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,3 +44,12 @@ int faiss_gpu_sync_all_devices() {
}
CATCH_AND_HANDLE
}

/// Returns the free memory (in bytes) on the specified device.
/// The value is written to *p_free_bytes; the int return value is the
/// c_api status code produced by CATCH_AND_HANDLE (presumably 0 on
/// success — the macro body is defined elsewhere in the c_api).
int faiss_get_free_memory(int device, size_t* p_free_bytes) {
    try {
        // NOTE(review): p_free_bytes is dereferenced unchecked; callers
        // must pass a valid pointer, consistent with the rest of the c_api.
        size_t freeBytes = faiss::gpu::getFreeMemory(device);
        *p_free_bytes = freeBytes;
    }
    CATCH_AND_HANDLE
}
3 changes: 3 additions & 0 deletions c_api/gpu/DeviceUtils_c.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@ int faiss_gpu_profiler_stop();
/// cudaDeviceSynchronize for each device)
int faiss_gpu_sync_all_devices();

/// Returns the free memory (in bytes) on the specified device.
/// On success the value is written to *p_free_bytes; the int return
/// value is the usual c_api status code (0 on success, non-zero on error).
int faiss_get_free_memory(int device, size_t* p_free_bytes);

#ifdef __cplusplus
}
#endif
Expand Down
5 changes: 3 additions & 2 deletions faiss/gpu/GpuIndex.cu
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,8 @@ GpuIndex::GpuIndex(
: Index(dims, metric),
resources_(resources),
config_(config),
minPagedSize_(kMinPageSize) {
minPagedSize_(kMinPageSize),
pageSize_(kNonPinnedPageSize) {
FAISS_THROW_IF_NOT_FMT(
config_.device < getNumDevices(),
"Invalid GPU device %d",
Expand Down Expand Up @@ -483,7 +484,7 @@ void GpuIndex::searchFromCpuPaged_ex_(
if (!pinnedAlloc.first || pageSizeInVecs < 1) {
// Just page without overlapping copy with compute
idx_t batchSize = utils::nextHighestPowerOf2(
(kNonPinnedPageSize /
(pageSize_ /
(get_numeric_type_size(numeric_type) * this->d)));

for (idx_t cur = 0; cur < n; cur += batchSize) {
Expand Down
3 changes: 3 additions & 0 deletions faiss/gpu/GpuIndex.h
Original file line number Diff line number Diff line change
Expand Up @@ -271,6 +271,9 @@ class GpuIndex : public faiss::Index {

/// Size above which we page copies from the CPU to GPU
size_t minPagedSize_;

/// Size of the pages we use to page copies from the CPU to GPU
size_t pageSize_;
};

/// If the given index is a GPU index, this returns the index instance
Expand Down
129 changes: 113 additions & 16 deletions faiss/gpu/GpuIndexFlat.cu
Original file line number Diff line number Diff line change
Expand Up @@ -312,39 +312,136 @@ void GpuIndexFlat::compute_residual(const float* x, float* residual, idx_t key)
compute_residual_n(1, x, residual, &key);
}

void GpuIndexFlat::compute_residual_n(
idx_t n,
void GpuIndexFlat::compute_residual_n_batch(
idx_t batchSize,
const float* xs,
float* residuals,
const idx_t* keys) const {
DeviceScope scope(config_.device);
auto stream = resources_->getDefaultStream(config_.device);

if (n == 0) {
// nothing to do
return;
}
const idx_t* keys,
bool residualOnHost,
cudaStream_t stream) const {

auto vecsDevice = toDeviceTemporary<float, 2>(
resources_.get(),
config_.device,
const_cast<float*>(xs),
stream,
{n, this->d});
{batchSize, d});

auto idsDevice = toDeviceTemporary<idx_t, 1>(
resources_.get(),
config_.device,
const_cast<idx_t*>(keys),
stream,
{n});
{batchSize});

auto residualDevice = toDeviceTemporary<float, 2>(
resources_.get(), config_.device, residuals, stream, {n, this->d});
resources_.get(),
config_.device,
residuals,
stream,
{batchSize, d});

FAISS_ASSERT(data_);
data_->computeResidual(vecsDevice, idsDevice, residualDevice);

// If the output is on the host, copy back if needed
fromDevice<float, 2>(residualDevice, residuals, stream);
if (residualOnHost) {
fromDevice<float, 2>(residualDevice, residuals, stream);
}
}

void GpuIndexFlat::compute_residual_n_paged(
        idx_t n,
        const float* xs,
        float* residuals,
        const idx_t* keys,
        bool xsOnHost,
        bool resOnHost,
        cudaStream_t stream) const {
    // xs paging is handled per batch inside compute_residual_n_batch (via
    // toDeviceTemporary on each slice), so xsOnHost is not consulted here.
    (void)xsOnHost;

    // Number of vectors per page. Guard against d * sizeof(float) exceeding
    // pageSize_: the division would yield 0, nextHighestPowerOf2(0) would
    // give a zero batch size, and the `cur += batchSize` loops below would
    // never terminate. Always process at least one vector per page.
    idx_t batchSize = utils::nextHighestPowerOf2(
            pageSize_ / (d * sizeof(float)));
    batchSize = std::max(batchSize, (idx_t)1);

    if (!resOnHost) {
        // Residuals are already device-resident: wrap the whole output once
        // and hand each batch a narrowed view, so no per-page device->host
        // copy is needed.
        auto residualDevice = toDeviceTemporary<float, 2>(
                resources_.get(),
                config_.device,
                residuals,
                stream,
                {n, d});

        for (idx_t cur = 0; cur < n; cur += batchSize) {
            idx_t thisBatch = std::min(batchSize, n - cur);

            auto residualBatch =
                    residualDevice.narrowOutermost(cur, thisBatch);

            compute_residual_n_batch(
                    thisBatch,
                    xs + cur * d,
                    residualBatch.data(),
                    keys + cur,
                    false,
                    stream);
        }
    } else {
        // Residuals live on the host: each batch copies its slice back.
        for (idx_t cur = 0; cur < n; cur += batchSize) {
            idx_t thisBatch = std::min(batchSize, n - cur);

            compute_residual_n_batch(
                    thisBatch,
                    xs + cur * d,
                    residuals + cur * d,
                    keys + cur,
                    true,
                    stream);
        }
    }
}

void GpuIndexFlat::compute_residual_n(
        idx_t n,
        const float* xs,
        float* residuals,
        const idx_t* keys) const {
    // Nothing to compute for an empty set of vectors.
    if (n == 0) {
        return;
    }
    FAISS_ASSERT(data_);

    DeviceScope scope(config_.device);
    auto stream = resources_->getDefaultStream(config_.device);

    // n * d floats may exceed device memory, so large host-resident
    // transfers are paged. Page only when:
    // - the total data size is at least minPagedSize_, AND
    // - at least one of xs / residuals lives in host memory
    //   (getDeviceForAddress returns -1 for host pointers).
    bool inputOnHost = getDeviceForAddress(xs) == -1;
    bool outputOnHost = getDeviceForAddress(residuals) == -1;
    size_t totalBytes = sizeof(float) * (size_t)n * d;

    if ((inputOnHost || outputOnHost) && totalBytes >= minPagedSize_) {
        // Paged path: process in pageSize_-sized chunks.
        compute_residual_n_paged(
                n, xs, residuals, keys, inputOnHost, outputOnHost, stream);
    } else {
        // Small or fully device-resident: a single batch suffices.
        compute_residual_n_batch(
                n, xs, residuals, keys, outputOnHost, stream);
    }
}

//
Expand Down
20 changes: 20 additions & 0 deletions faiss/gpu/GpuIndexFlat.h
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,26 @@ class GpuIndexFlat : public GpuIndex {
float* residuals,
const idx_t* keys) const override;

/// Compute residual (batch mode) with paging support: processes the n
/// vectors in pages so that n * d floats need not be resident on the
/// device all at once. xsOnHost / resOnHost indicate whether xs /
/// residuals point to host memory (as reported by getDeviceForAddress).
void compute_residual_n_paged(
        idx_t n,
        const float* xs,
        float* residuals,
        const idx_t* keys,
        bool xsOnHost,
        bool resOnHost,
        cudaStream_t stream) const;

/// Compute residual (batch mode) for a single page of batchSize
/// vectors; when residualOnHost is true the result is copied back to
/// the caller's host buffer on the given stream.
void compute_residual_n_batch(
        idx_t batchSize,
        const float* xs,
        float* residuals,
        const idx_t* keys,
        bool residualOnHost,
        cudaStream_t stream) const;

///
/// For internal access
inline FlatIndex* getGpuData() {
return data_.get();
Expand Down