Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
ee94971
Fixed vector normalization for Inner Product -- still failing 15 test…
HowardHuang1 Mar 7, 2026
78e47f1
Merge branch 'main' into HH-Vector-Normalization
aamijar Mar 9, 2026
e822460
add logging to compare recall and search speed for raw v.s. normalize…
HowardHuang1 Mar 10, 2026
da32073
revert change comparing normalize and raw in single run -- that was p…
HowardHuang1 Mar 11, 2026
5375bb5
previously normalization was applied to entire IVF-PQ pipeline --> ch…
HowardHuang1 Mar 16, 2026
c320640
revert to raw vectors. No need to normalize here because normalized v…
HowardHuang1 Mar 16, 2026
0486bf5
clean up code
HowardHuang1 Mar 16, 2026
107e2b3
clean up code
HowardHuang1 Mar 16, 2026
5442b89
upload code that resolves linker issue + live csv updates
HowardHuang1 Mar 18, 2026
dc9b6df
remove live_csv
HowardHuang1 Mar 18, 2026
cf3666c
Merge branch 'main' into HH-Vector-Normalization
HowardHuang1 Mar 18, 2026
bdda881
hardcode file path instead of searching multiple directories + fix in…
HowardHuang1 Mar 19, 2026
c237db3
clean up unnecessary checks in data_export.py
HowardHuang1 Mar 19, 2026
3c20377
bring back comma parsing instead of underscore parsing
HowardHuang1 Mar 20, 2026
728c964
bring back parts of plot/__main__.py for clarity
HowardHuang1 Mar 20, 2026
6c9bc36
get rid of incremental JSON->CSV write for clarity
HowardHuang1 Mar 20, 2026
eb6bb88
bring back original plot/__main__.py for clarity
HowardHuang1 Mar 20, 2026
4546a85
fix cuvs-bench generate groundtruth which was sorted incorrectly for …
HowardHuang1 Mar 23, 2026
ea8a6b4
revert normalization outside kernel that requires copy of dataset res…
HowardHuang1 Mar 30, 2026
5800e53
didn't modify kernels themselves but routed IP predict through existi…
HowardHuang1 Mar 31, 2026
945d3ee
add inner_product_cosine_assignment flag which is set when ivf_pq_bui…
HowardHuang1 Apr 2, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions cpp/include/cuvs/cluster/kmeans.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,10 @@ struct balanced_params : base_params {
* Number of training iterations
*/
uint32_t n_iters = 20;

/** If true (only valid with InnerProduct), E-step uses cosine-style fused NN; M-step uses raw
* means. */
bool inner_product_cosine_assignment = false;
};

/**
Expand Down
121 changes: 76 additions & 45 deletions cpp/src/cluster/detail/kmeans_balanced.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,8 @@ constexpr static inline float kAdjustCentersWeight = 7.0f;
* @param[in] n_clusters Number of clusters/centers
* @param[in] dim Dimensionality of the data
* @param[in] dataset Pointer to the data [n_rows, dim]
* @param[in] dataset_norm Pointer to the precomputed norm (for L2 metrics only) [n_rows]
* @param[in] dataset_norm Pointer to precomputed row norms (L2: ||x||^2; cosine / IP: ||x||_2)
* [n_rows]
* @param[in] n_rows Number samples in the `dataset`
* @param[out] labels Output predictions [n_rows]
* @param[inout] mr (optional) Memory resource to use for temporary allocations
Expand Down Expand Up @@ -129,7 +130,44 @@ inline std::enable_if_t<std::is_floating_point_v<MathT>> predict_core(
raft::compose_op<raft::cast_op<LabelT>, raft::key_op>());
break;
}
case cuvs::distance::DistanceType::InnerProduct:
if (!params.inner_product_cosine_assignment) {
// Raw inner product: assign each row to the cluster with largest dot product.
// TODO: pass buffer
rmm::device_uvector<MathT> distances(n_rows * n_clusters, stream, mr);

MathT alpha = -1.0;
MathT beta = 0.0;

raft::linalg::gemm(handle,
true,
false,
n_clusters,
n_rows,
dim,
&alpha,
centers,
dim,
dataset,
dim,
&beta,
distances.data(),
n_clusters,
stream);

auto distances_const_view =
raft::make_device_matrix_view<const MathT, IdxT, raft::row_major>(
distances.data(), n_rows, n_clusters);
auto labels_view = raft::make_device_vector_view<LabelT, IdxT>(labels, n_rows);
raft::matrix::argmin(handle, distances_const_view, labels_view);
break;
}
RAFT_EXPECTS(dataset_norm != nullptr,
"inner_product_cosine_assignment requires precomputed row L2 norms (||x||_2)");
[[fallthrough]];
case cuvs::distance::DistanceType::CosineExpanded: {
// Fused cosine NN (also used for InnerProduct + inner_product_cosine_assignment via
// fallthrough).
auto workspace = raft::make_device_mdarray<char, IdxT>(
handle, mr, raft::make_extents<IdxT>((sizeof(int)) * n_rows));

Expand Down Expand Up @@ -159,7 +197,7 @@ inline std::enable_if_t<std::is_floating_point_v<MathT>> predict_core(
false,
false,
true,
params.metric,
cuvs::distance::DistanceType::CosineExpanded,
0.0f,
stream);
// Copy keys to output labels
Expand All @@ -169,35 +207,6 @@ inline std::enable_if_t<std::is_floating_point_v<MathT>> predict_core(
raft::compose_op<raft::cast_op<LabelT>, raft::key_op>());
break;
}
case cuvs::distance::DistanceType::InnerProduct: {
// TODO: pass buffer
rmm::device_uvector<MathT> distances(n_rows * n_clusters, stream, mr);

MathT alpha = -1.0;
MathT beta = 0.0;

raft::linalg::gemm(handle,
true,
false,
n_clusters,
n_rows,
dim,
&alpha,
centers,
dim,
dataset,
dim,
&beta,
distances.data(),
n_clusters,
stream);

auto distances_const_view = raft::make_device_matrix_view<const MathT, IdxT, raft::row_major>(
distances.data(), n_rows, n_clusters);
auto labels_view = raft::make_device_vector_view<LabelT, IdxT>(labels, n_rows);
raft::matrix::argmin(handle, distances_const_view, labels_view);
break;
}
default: {
RAFT_FAIL("The chosen distance metric is not supported (%d)", int(params.metric));
}
Expand Down Expand Up @@ -232,13 +241,12 @@ constexpr auto calc_minibatch_size(IdxT n_clusters,
// Estimate memory needs per row (i.e element of the batch).
size_t mem_per_row = 0;
switch (metric) {
// fusedL2NN needs a mutex and a key-value pair for each row.
// fused L2 NN needs a mutex and a key-value pair for each row.
case distance::DistanceType::L2Expanded:
case distance::DistanceType::L2SqrtExpanded: {
mem_per_row += sizeof(int);
mem_per_row += sizeof(raft::KeyValuePair<IdxT, MathT>);
} break;
// Other metrics require storing a distance matrix.
default: {
mem_per_row += sizeof(MathT) * n_clusters;
}
Expand Down Expand Up @@ -426,15 +434,19 @@ void predict(const raft::resources& handle,
auto stream = raft::resource::get_cuda_stream(handle);
raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> fun_scope(
"predict(%zu, %u)", static_cast<size_t>(n_rows), n_clusters);
auto mem_res = mr.value_or(raft::resource::get_workspace_resource(handle));
auto mem_res = mr.value_or(raft::resource::get_workspace_resource(handle));
const auto minibatch_metric = params.inner_product_cosine_assignment
? cuvs::distance::DistanceType::CosineExpanded
: params.metric;
auto [max_minibatch_size, _mem_per_row] =
calc_minibatch_size<MathT>(n_clusters, n_rows, dim, params.metric, std::is_same_v<T, MathT>);
calc_minibatch_size<MathT>(n_clusters, n_rows, dim, minibatch_metric, std::is_same_v<T, MathT>);
rmm::device_uvector<MathT> cur_dataset(
std::is_same_v<T, MathT> ? 0 : max_minibatch_size * dim, stream, mem_res);
bool need_compute_norm =
dataset_norm == nullptr && (params.metric == cuvs::distance::DistanceType::L2Expanded ||
params.metric == cuvs::distance::DistanceType::L2SqrtExpanded ||
params.metric == cuvs::distance::DistanceType::CosineExpanded);
params.metric == cuvs::distance::DistanceType::CosineExpanded ||
params.inner_product_cosine_assignment);
rmm::device_uvector<MathT> cur_dataset_norm(
need_compute_norm ? max_minibatch_size : 0, stream, mem_res);
const MathT* dataset_norm_ptr = nullptr;
Expand All @@ -454,7 +466,8 @@ void predict(const raft::resources& handle,

// Compute the norm now if it hasn't been pre-computed.
if (need_compute_norm) {
if (params.metric == cuvs::distance::DistanceType::CosineExpanded)
if (params.metric == cuvs::distance::DistanceType::CosineExpanded ||
params.inner_product_cosine_assignment)
compute_norm(handle,
cur_dataset_norm.data(),
cur_dataset_ptr,
Expand Down Expand Up @@ -654,7 +667,8 @@ auto adjust_centers(MathT* centers,
* @param[in] n_iters Requested number of iterations (can differ from params.n_iter!)
* @param[in] dim Dimensionality of the dataset
* @param[in] dataset Pointer to a managed row-major array [n_rows, dim]
* @param[in] dataset_norm Pointer to the precomputed norm (for L2 metrics only) [n_rows]
* @param[in] dataset_norm Pointer to precomputed row norms (L2: ||x||^2; cosine / IP: ||x||_2)
* [n_rows]
* @param[in] n_rows Number of rows in the dataset
* @param[in] n_cluster Requested number of clusters
* @param[inout] cluster_centers Pointer to a managed row-major array [n_clusters, dim]
Expand Down Expand Up @@ -717,8 +731,10 @@ void balancing_em_iters(const raft::resources& handle,
}
switch (params.metric) {
// For some metrics, cluster calculation and adjustment tends to favor zero center vectors.
// To avoid converging to zero, we normalize the center vectors on every iteration.
// To avoid converging to zero, we normalize the center vectors before each E-step.
case cuvs::distance::DistanceType::InnerProduct:
if (!params.inner_product_cosine_assignment) { break; }
[[fallthrough]];
case cuvs::distance::DistanceType::CosineExpanded:
case cuvs::distance::DistanceType::CorrelationExpanded: {
auto clusters_in_view = raft::make_device_matrix_view<const MathT, IdxT, raft::row_major>(
Expand Down Expand Up @@ -962,14 +978,17 @@ auto build_fine_clusters(const raft::resources& handle,

thrust::transform_iterator<MappingOpT, const T*> mapping_itr(dataset_mptr, mapping_op);
raft::matrix::gather(mapping_itr, dim, n_rows, mc_trainset_ids, k, mc_trainset, stream);
const MathT* fine_subset_norm = nullptr;
if (params.metric == cuvs::distance::DistanceType::L2Expanded ||
params.metric == cuvs::distance::DistanceType::L2SqrtExpanded ||
params.metric == cuvs::distance::DistanceType::CosineExpanded) {
params.metric == cuvs::distance::DistanceType::CosineExpanded ||
params.inner_product_cosine_assignment) {
thrust::gather(raft::resource::get_thrust_policy(handle),
mc_trainset_ids,
mc_trainset_ids + k,
dataset_norm_mptr,
mc_trainset_norm);
fine_subset_norm = mc_trainset_norm;
}

build_clusters(handle,
Expand All @@ -983,7 +1002,7 @@ auto build_fine_clusters(const raft::resources& handle,
mc_trainset_csizes_tmp.data(),
mapping_op,
device_memory,
mc_trainset_norm);
fine_subset_norm);

raft::copy(handle,
raft::make_device_vector_view(cluster_centers + (dim * fine_clusters_csum[i]),
Expand Down Expand Up @@ -1035,23 +1054,35 @@ void build_hierarchical(const raft::resources& handle,

IdxT n_mesoclusters = std::min(n_clusters, static_cast<IdxT>(std::sqrt(n_clusters) + 0.5));
RAFT_LOG_DEBUG("build_hierarchical: n_mesoclusters: %u", n_mesoclusters);
if (params.metric == cuvs::distance::DistanceType::InnerProduct &&
params.inner_product_cosine_assignment) {
RAFT_LOG_INFO(
"kmeans_balanced build_hierarchical: InnerProduct + inner_product_cosine_assignment "
"(cosine-style "
"assignment / norm precompute path)");
}

// TODO: Remove the explicit managed memory- we shouldn't be creating this on the user's behalf.
rmm::mr::managed_memory_resource managed_memory;
rmm::device_async_resource_ref device_memory = raft::resource::get_workspace_resource(handle);
auto [max_minibatch_size, mem_per_row] =
calc_minibatch_size<MathT>(n_clusters, n_rows, dim, params.metric, std::is_same_v<T, MathT>);
const auto hierarchical_minibatch_metric = params.inner_product_cosine_assignment
? cuvs::distance::DistanceType::CosineExpanded
: params.metric;
auto [max_minibatch_size, mem_per_row] = calc_minibatch_size<MathT>(
n_clusters, n_rows, dim, hierarchical_minibatch_metric, std::is_same_v<T, MathT>);

// Precompute the L2 norm of the dataset if relevant and not yet computed.
rmm::device_uvector<MathT> dataset_norm_buf(0, stream, device_memory);
const MathT* dataset_norm = nullptr;
if ((params.metric == cuvs::distance::DistanceType::L2Expanded ||
params.metric == cuvs::distance::DistanceType::L2SqrtExpanded ||
params.metric == cuvs::distance::DistanceType::CosineExpanded)) {
params.metric == cuvs::distance::DistanceType::CosineExpanded ||
params.inner_product_cosine_assignment)) {
dataset_norm_buf.resize(n_rows, stream);
for (IdxT offset = 0; offset < n_rows; offset += max_minibatch_size) {
IdxT minibatch_size = std::min<IdxT>(max_minibatch_size, n_rows - offset);
if (params.metric == cuvs::distance::DistanceType::CosineExpanded)
if (params.metric == cuvs::distance::DistanceType::CosineExpanded ||
params.inner_product_cosine_assignment)
compute_norm(handle,
dataset_norm_buf.data() + offset,
dataset + dim * offset,
Expand Down
9 changes: 9 additions & 0 deletions cpp/src/cluster/kmeans_balanced.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

#include "../neighbors/detail/ann_utils.cuh"
#include "detail/kmeans_balanced.cuh"
#include <raft/core/logger.hpp>
#include <raft/core/mdarray.hpp>
#include <raft/core/resource/device_memory_resource.hpp>
#include <raft/util/cuda_utils.cuh>
Expand Down Expand Up @@ -145,6 +146,14 @@ void predict(const raft::resources& handle,
static_cast<uint64_t>(std::numeric_limits<LabelT>::max()),
"The chosen label type cannot represent all cluster labels");

if (params.metric == cuvs::distance::DistanceType::InnerProduct &&
params.inner_product_cosine_assignment) {
RAFT_LOG_INFO(
"kmeans_balanced predict: InnerProduct + inner_product_cosine_assignment (minibatched "
"predict -> "
"fused cosine NN in predict_core)");
}

cuvs::cluster::kmeans::detail::predict(handle,
params,
centroids.data_handle(),
Expand Down
9 changes: 9 additions & 0 deletions cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -1330,6 +1330,14 @@ auto build(raft::resources const& handle,
cuvs::cluster::kmeans::balanced_params kmeans_params;
kmeans_params.n_iters = params.kmeans_n_iters;
kmeans_params.metric = static_cast<cuvs::distance::DistanceType>((int)impl->metric());
kmeans_params.inner_product_cosine_assignment =
(impl->metric() == distance::DistanceType::InnerProduct);
if (kmeans_params.inner_product_cosine_assignment) {
RAFT_LOG_INFO(
"IVF-PQ build: balanced k-means inner_product_cosine_assignment=true (cosine-style E-step "
"for "
"InnerProduct)");
}

if (impl->metric() == distance::DistanceType::CosineExpanded) {
raft::linalg::row_normalize<raft::linalg::L2Norm>(
Expand All @@ -1341,6 +1349,7 @@ auto build(raft::resources const& handle,
rmm::device_uvector<uint32_t> labels(n_rows_train, stream, big_memory_resource);
auto centers_const_view = raft::make_device_matrix_view<const float, internal_extents_t>(
cluster_centers, impl->n_lists(), impl->dim());

if (impl->metric() == distance::DistanceType::CosineExpanded) {
raft::linalg::row_normalize<raft::linalg::L2Norm>(handle, centers_const_view, centers_view);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,13 @@ def calc_truth(dataset, queries, k, metric="sqeuclidean"):
else:
distances = xp.concatenate([distances, D], axis=1)
indices = xp.concatenate([indices, Ind], axis=1)
idx = xp.argsort(distances, axis=1)[:, :k]
# Euclidean: smaller distance is better → sort ascending.
# Inner product: D holds similarities, larger is better → sort
# descending (equivalent to ascending on -D if library used -sim).
if metric == "inner_product":
idx = xp.argsort(-distances, axis=1)[:, :k]
else:
idx = xp.argsort(distances, axis=1)[:, :k]
distances = xp.take_along_axis(distances, idx, axis=1)
indices = xp.take_along_axis(indices, idx, axis=1)

Expand Down
2 changes: 1 addition & 1 deletion python/cuvs_bench/cuvs_bench/run/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -345,7 +345,7 @@ def get_build_path(executable: str) -> Optional[str]:
build_path = os.getenv("CUVS_HOME")
if build_path:
build_path = os.path.join(
build_path, "cpp", "build", "release", executable
build_path, "cpp", "build", "bench", "ann", executable
)
if os.path.exists(build_path):
print(f"-- Using cuVS bench from repository in {build_path}.")
Expand Down
22 changes: 20 additions & 2 deletions python/cuvs_bench/cuvs_bench/run/runners.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,24 @@
from typing import Dict, List, Optional, Tuple


def _subprocess_env(ann_executable_path: str) -> Dict[str, str]:
"""Build env for C++ benchmark subprocess. When CUVS_HOME is set, force the repo's libcuvs.so to be used (LD_PRELOAD + LD_LIBRARY_PATH) so the correct local build runs."""
env = os.environ.copy()
repo = os.getenv("CUVS_HOME")
if repo:
build_dir = os.path.join(repo, "cpp", "build")
if os.path.isdir(build_dir):
lib = os.path.join(build_dir, "libcuvs.so")
if os.path.isfile(lib):
env["LD_PRELOAD"] = lib + (os.pathsep + env["LD_PRELOAD"] if env.get("LD_PRELOAD") else "")
env["LD_LIBRARY_PATH"] = build_dir + os.pathsep + env.get("LD_LIBRARY_PATH", "")
# So IVF-PQ normalization logging goes to a known path (C++ uses this when set)
log_path = os.path.join(build_dir, "cuvs_ivf_pq_normalization.log")
env["CUVS_IVF_PQ_NORMALIZATION_LOG"] = log_path
print(f"[cuvs_bench] IVF-PQ normalization log (if any) -> {log_path}", flush=True)
return env


def cuvs_bench_cpp(
conf_file: Dict,
conf_filename: str,
Expand Down Expand Up @@ -123,7 +141,7 @@ def cuvs_bench_cpp(
)
else:
try:
subprocess.run(cmd, check=True)
subprocess.run(cmd, check=True, env=_subprocess_env(ann_executable_path))
merge_build_files(
build_folder, build_file, temp_build_file
)
Expand Down Expand Up @@ -163,7 +181,7 @@ def cuvs_bench_cpp(
)
else:
try:
subprocess.run(cmd, check=True)
subprocess.run(cmd, check=True, env=_subprocess_env(ann_executable_path))
except Exception as e:
print(f"Error occurred running benchmark: {e}")
finally:
Expand Down
Loading
Loading