Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions c_api/gpu/DeviceUtils_c.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,3 +44,12 @@ int faiss_gpu_sync_all_devices() {
}
CATCH_AND_HANDLE
}

/// Returns the free memory (in bytes) on the specified device.
/// The value is written to *p_free_bytes; the int return value is the
/// c_api status code produced by CATCH_AND_HANDLE (presumably 0 on
/// success — the macro body is defined elsewhere in the c_api).
int faiss_get_free_memory(int device, size_t* p_free_bytes) {
    try {
        // NOTE(review): p_free_bytes is dereferenced unchecked; callers
        // must pass a valid pointer, consistent with the rest of the c_api.
        size_t freeBytes = faiss::gpu::getFreeMemory(device);
        *p_free_bytes = freeBytes;
    }
    CATCH_AND_HANDLE
}
3 changes: 3 additions & 0 deletions c_api/gpu/DeviceUtils_c.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@ int faiss_gpu_profiler_stop();
/// cudaDeviceSynchronize for each device)
int faiss_gpu_sync_all_devices();

/// Returns the free memory (in bytes) on the specified device.
/// On success the value is written to *p_free_bytes; the int return
/// value is the usual c_api status code (0 on success, non-zero on error).
int faiss_get_free_memory(int device, size_t* p_free_bytes);

#ifdef __cplusplus
}
#endif
Expand Down
5 changes: 3 additions & 2 deletions faiss/gpu/GpuIndex.cu
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,8 @@ GpuIndex::GpuIndex(
: Index(dims, metric),
resources_(resources),
config_(config),
minPagedSize_(kMinPageSize) {
minPagedSize_(kMinPageSize),
pageSize_(kNonPinnedPageSize) {
FAISS_THROW_IF_NOT_FMT(
config_.device < getNumDevices(),
"Invalid GPU device %d",
Expand Down Expand Up @@ -483,7 +484,7 @@ void GpuIndex::searchFromCpuPaged_ex_(
if (!pinnedAlloc.first || pageSizeInVecs < 1) {
// Just page without overlapping copy with compute
idx_t batchSize = utils::nextHighestPowerOf2(
(kNonPinnedPageSize /
(pageSize_ /
(get_numeric_type_size(numeric_type) * this->d)));

for (idx_t cur = 0; cur < n; cur += batchSize) {
Expand Down
3 changes: 3 additions & 0 deletions faiss/gpu/GpuIndex.h
Original file line number Diff line number Diff line change
Expand Up @@ -271,6 +271,9 @@ class GpuIndex : public faiss::Index {

/// Size above which we page copies from the CPU to GPU
size_t minPagedSize_;

/// Size of the pages we use to page copies from the CPU to GPU
size_t pageSize_;
};

/// If the given index is a GPU index, this returns the index instance
Expand Down
129 changes: 113 additions & 16 deletions faiss/gpu/GpuIndexFlat.cu
Original file line number Diff line number Diff line change
Expand Up @@ -312,39 +312,136 @@ void GpuIndexFlat::compute_residual(const float* x, float* residual, idx_t key)
compute_residual_n(1, x, residual, &key);
}

void GpuIndexFlat::compute_residual_n(
idx_t n,
void GpuIndexFlat::compute_residual_n_batch(
idx_t batchSize,
const float* xs,
float* residuals,
const idx_t* keys) const {
DeviceScope scope(config_.device);
auto stream = resources_->getDefaultStream(config_.device);

if (n == 0) {
// nothing to do
return;
}
const idx_t* keys,
bool residualOnHost,
cudaStream_t stream) const {

auto vecsDevice = toDeviceTemporary<float, 2>(
resources_.get(),
config_.device,
const_cast<float*>(xs),
stream,
{n, this->d});
{batchSize, d});

auto idsDevice = toDeviceTemporary<idx_t, 1>(
resources_.get(),
config_.device,
const_cast<idx_t*>(keys),
stream,
{n});
{batchSize});

auto residualDevice = toDeviceTemporary<float, 2>(
resources_.get(), config_.device, residuals, stream, {n, this->d});
resources_.get(),
config_.device,
residuals,
stream,
{batchSize, d});

FAISS_ASSERT(data_);
data_->computeResidual(vecsDevice, idsDevice, residualDevice);

// If the output is on the host, copy back if needed
fromDevice<float, 2>(residualDevice, residuals, stream);
if (residualOnHost) {
fromDevice<float, 2>(residualDevice, residuals, stream);
}
}

void GpuIndexFlat::compute_residual_n_paged(
        idx_t n,
        const float* xs,
        float* residuals,
        const idx_t* keys,
        bool xsOnHost,
        bool resOnHost,
        cudaStream_t stream) const {
    // xs paging is handled per batch inside compute_residual_n_batch (via
    // toDeviceTemporary on each slice), so xsOnHost is not consulted here.
    (void)xsOnHost;

    // Number of vectors per page. Guard against d * sizeof(float) exceeding
    // pageSize_: the division would yield 0, nextHighestPowerOf2(0) would
    // give a zero batch size, and the `cur += batchSize` loops below would
    // never terminate. Always process at least one vector per page.
    idx_t batchSize = utils::nextHighestPowerOf2(
            pageSize_ / (d * sizeof(float)));
    batchSize = std::max(batchSize, (idx_t)1);

    if (!resOnHost) {
        // Residuals are already device-resident: wrap the whole output once
        // and hand each batch a narrowed view, so no per-page device->host
        // copy is needed.
        auto residualDevice = toDeviceTemporary<float, 2>(
                resources_.get(),
                config_.device,
                residuals,
                stream,
                {n, d});

        for (idx_t cur = 0; cur < n; cur += batchSize) {
            idx_t thisBatch = std::min(batchSize, n - cur);

            auto residualBatch =
                    residualDevice.narrowOutermost(cur, thisBatch);

            compute_residual_n_batch(
                    thisBatch,
                    xs + cur * d,
                    residualBatch.data(),
                    keys + cur,
                    false,
                    stream);
        }
    } else {
        // Residuals live on the host: each batch copies its slice back.
        for (idx_t cur = 0; cur < n; cur += batchSize) {
            idx_t thisBatch = std::min(batchSize, n - cur);

            compute_residual_n_batch(
                    thisBatch,
                    xs + cur * d,
                    residuals + cur * d,
                    keys + cur,
                    true,
                    stream);
        }
    }
}

void GpuIndexFlat::compute_residual_n(
        idx_t n,
        const float* xs,
        float* residuals,
        const idx_t* keys) const {
    // Nothing to compute for an empty set of vectors.
    if (n == 0) {
        return;
    }
    FAISS_ASSERT(data_);

    DeviceScope scope(config_.device);
    auto stream = resources_->getDefaultStream(config_.device);

    // n * d floats may exceed device memory, so large host-resident
    // transfers are paged. Page only when:
    // - the total data size is at least minPagedSize_, AND
    // - at least one of xs / residuals lives in host memory
    //   (getDeviceForAddress returns -1 for host pointers).
    bool inputOnHost = getDeviceForAddress(xs) == -1;
    bool outputOnHost = getDeviceForAddress(residuals) == -1;
    size_t totalBytes = sizeof(float) * (size_t)n * d;

    if ((inputOnHost || outputOnHost) && totalBytes >= minPagedSize_) {
        // Paged path: process in pageSize_-sized chunks.
        compute_residual_n_paged(
                n, xs, residuals, keys, inputOnHost, outputOnHost, stream);
    } else {
        // Small or fully device-resident: a single batch suffices.
        compute_residual_n_batch(
                n, xs, residuals, keys, outputOnHost, stream);
    }
}

//
Expand Down
20 changes: 20 additions & 0 deletions faiss/gpu/GpuIndexFlat.h
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,26 @@ class GpuIndexFlat : public GpuIndex {
float* residuals,
const idx_t* keys) const override;

/// Compute residual (batch mode) with paging support: processes the n
/// vectors in pages so that n * d floats need not be resident on the
/// device all at once. xsOnHost / resOnHost indicate whether xs /
/// residuals point to host memory (as reported by getDeviceForAddress).
void compute_residual_n_paged(
        idx_t n,
        const float* xs,
        float* residuals,
        const idx_t* keys,
        bool xsOnHost,
        bool resOnHost,
        cudaStream_t stream) const;

/// Compute residual (batch mode) for a single page of batchSize
/// vectors; when residualOnHost is true the result is copied back to
/// the caller's host buffer on the given stream.
void compute_residual_n_batch(
        idx_t batchSize,
        const float* xs,
        float* residuals,
        const idx_t* keys,
        bool residualOnHost,
        cudaStream_t stream) const;

///
/// For internal access
inline FlatIndex* getGpuData() {
return data_.get();
Expand Down