rapidsai · HowardHuang1 · Mar 7, 2026 · Mar 9, 2026 · Mar 10, 2026 · Mar 11, 2026
@@ -125,6 +125,10 @@ struct balanced_params : base_params {
    * Number of training iterations
    */
   uint32_t n_iters = 20;
+
+  /** If true (only valid with InnerProduct), E-step uses cosine-style fused NN; M-step uses raw
+   * means. */
+  bool inner_product_cosine_assignment = false;
 };
 
 /**

@@ -67,7 +67,8 @@ constexpr static inline float kAdjustCentersWeight = 7.0f;
  * @param[in] n_clusters Number of clusters/centers
  * @param[in] dim Dimensionality of the data
  * @param[in] dataset Pointer to the data [n_rows, dim]
- * @param[in] dataset_norm Pointer to the precomputed norm (for L2 metrics only) [n_rows]
+ * @param[in] dataset_norm Pointer to precomputed row norms (L2: ||x||^2; cosine / IP: ||x||_2)
+ * [n_rows]
  * @param[in] n_rows Number samples in the `dataset`
  * @param[out] labels Output predictions [n_rows]
  * @param[inout] mr (optional) Memory resource to use for temporary allocations
@@ -129,7 +130,44 @@ inline std::enable_if_t<std::is_floating_point_v<MathT>> predict_core(
                         raft::compose_op<raft::cast_op<LabelT>, raft::key_op>());
       break;
     }
+    case cuvs::distance::DistanceType::InnerProduct:
+      if (!params.inner_product_cosine_assignment) {
+        // Raw inner product: assign each row to the cluster with largest dot product.
+        // TODO: pass buffer
+        rmm::device_uvector<MathT> distances(n_rows * n_clusters, stream, mr);
+
+        MathT alpha = -1.0;
+        MathT beta  = 0.0;
+
+        raft::linalg::gemm(handle,
+                           true,
+                           false,
+                           n_clusters,
+                           n_rows,
+                           dim,
+                           &alpha,
+                           centers,
+                           dim,
+                           dataset,
+                           dim,
+                           &beta,
+                           distances.data(),
+                           n_clusters,
+                           stream);
+
+        auto distances_const_view =
+          raft::make_device_matrix_view<const MathT, IdxT, raft::row_major>(
+            distances.data(), n_rows, n_clusters);
+        auto labels_view = raft::make_device_vector_view<LabelT, IdxT>(labels, n_rows);
+        raft::matrix::argmin(handle, distances_const_view, labels_view);
+        break;
+      }
+      RAFT_EXPECTS(dataset_norm != nullptr,
+                   "inner_product_cosine_assignment requires precomputed row L2 norms (||x||_2)");
+      [[fallthrough]];
     case cuvs::distance::DistanceType::CosineExpanded: {
+      // Fused cosine NN (also used for InnerProduct + inner_product_cosine_assignment via
+      // fallthrough).
       auto workspace = raft::make_device_mdarray<char, IdxT>(
         handle, mr, raft::make_extents<IdxT>((sizeof(int)) * n_rows));
 
@@ -159,7 +197,7 @@ inline std::enable_if_t<std::is_floating_point_v<MathT>> predict_core(
         false,
         false,
         true,
-        params.metric,
+        cuvs::distance::DistanceType::CosineExpanded,
         0.0f,
         stream);
       // Copy keys to output labels
@@ -169,35 +207,6 @@ inline std::enable_if_t<std::is_floating_point_v<MathT>> predict_core(
                         raft::compose_op<raft::cast_op<LabelT>, raft::key_op>());
       break;
     }
-    case cuvs::distance::DistanceType::InnerProduct: {
-      // TODO: pass buffer
-      rmm::device_uvector<MathT> distances(n_rows * n_clusters, stream, mr);
-
-      MathT alpha = -1.0;
-      MathT beta  = 0.0;
-
-      raft::linalg::gemm(handle,
-                         true,
-                         false,
-                         n_clusters,
-                         n_rows,
-                         dim,
-                         &alpha,
-                         centers,
-                         dim,
-                         dataset,
-                         dim,
-                         &beta,
-                         distances.data(),
-                         n_clusters,
-                         stream);
-
-      auto distances_const_view = raft::make_device_matrix_view<const MathT, IdxT, raft::row_major>(
-        distances.data(), n_rows, n_clusters);
-      auto labels_view = raft::make_device_vector_view<LabelT, IdxT>(labels, n_rows);
-      raft::matrix::argmin(handle, distances_const_view, labels_view);
-      break;
-    }
     default: {
       RAFT_FAIL("The chosen distance metric is not supported (%d)", int(params.metric));
     }
@@ -232,13 +241,12 @@ constexpr auto calc_minibatch_size(IdxT n_clusters,
   // Estimate memory needs per row (i.e element of the batch).
   size_t mem_per_row = 0;
   switch (metric) {
-    // fusedL2NN needs a mutex and a key-value pair for each row.
+    // fused L2 NN needs a mutex and a key-value pair for each row.
     case distance::DistanceType::L2Expanded:
     case distance::DistanceType::L2SqrtExpanded: {
       mem_per_row += sizeof(int);
       mem_per_row += sizeof(raft::KeyValuePair<IdxT, MathT>);
     } break;
-    // Other metrics require storing a distance matrix.
     default: {
       mem_per_row += sizeof(MathT) * n_clusters;
     }
@@ -426,15 +434,19 @@ void predict(const raft::resources& handle,
   auto stream = raft::resource::get_cuda_stream(handle);
   raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> fun_scope(
     "predict(%zu, %u)", static_cast<size_t>(n_rows), n_clusters);
-  auto mem_res = mr.value_or(raft::resource::get_workspace_resource(handle));
+  auto mem_res                = mr.value_or(raft::resource::get_workspace_resource(handle));
+  const auto minibatch_metric = params.inner_product_cosine_assignment
+                                  ? cuvs::distance::DistanceType::CosineExpanded
+                                  : params.metric;
   auto [max_minibatch_size, _mem_per_row] =
-    calc_minibatch_size<MathT>(n_clusters, n_rows, dim, params.metric, std::is_same_v<T, MathT>);
+    calc_minibatch_size<MathT>(n_clusters, n_rows, dim, minibatch_metric, std::is_same_v<T, MathT>);
   rmm::device_uvector<MathT> cur_dataset(
     std::is_same_v<T, MathT> ? 0 : max_minibatch_size * dim, stream, mem_res);
   bool need_compute_norm =
     dataset_norm == nullptr && (params.metric == cuvs::distance::DistanceType::L2Expanded ||
                                 params.metric == cuvs::distance::DistanceType::L2SqrtExpanded ||
-                                params.metric == cuvs::distance::DistanceType::CosineExpanded);
+                                params.metric == cuvs::distance::DistanceType::CosineExpanded ||
+                                params.inner_product_cosine_assignment);
   rmm::device_uvector<MathT> cur_dataset_norm(
     need_compute_norm ? max_minibatch_size : 0, stream, mem_res);
   const MathT* dataset_norm_ptr = nullptr;
@@ -454,7 +466,8 @@ void predict(const raft::resources& handle,
 
     // Compute the norm now if it hasn't been pre-computed.
     if (need_compute_norm) {
-      if (params.metric == cuvs::distance::DistanceType::CosineExpanded)
+      if (params.metric == cuvs::distance::DistanceType::CosineExpanded ||
+          params.inner_product_cosine_assignment)
         compute_norm(handle,
                      cur_dataset_norm.data(),
                      cur_dataset_ptr,
@@ -654,7 +667,8 @@ auto adjust_centers(MathT* centers,
  * @param[in] n_iters Requested number of iterations (can differ from params.n_iter!)
  * @param[in] dim Dimensionality of the dataset
  * @param[in] dataset Pointer to a managed row-major array [n_rows, dim]
- * @param[in] dataset_norm Pointer to the precomputed norm (for L2 metrics only) [n_rows]
+ * @param[in] dataset_norm Pointer to precomputed row norms (L2: ||x||^2; cosine / IP: ||x||_2)
+ * [n_rows]
  * @param[in] n_rows Number of rows in the dataset
  * @param[in] n_cluster Requested number of clusters
  * @param[inout] cluster_centers Pointer to a managed row-major array [n_clusters, dim]
@@ -717,8 +731,10 @@ void balancing_em_iters(const raft::resources& handle,
     }
     switch (params.metric) {
       // For some metrics, cluster calculation and adjustment tends to favor zero center vectors.
-      // To avoid converging to zero, we normalize the center vectors on every iteration.
+      // To avoid converging to zero, we normalize the center vectors before each E-step.
       case cuvs::distance::DistanceType::InnerProduct:
+        if (!params.inner_product_cosine_assignment) { break; }
+        [[fallthrough]];
       case cuvs::distance::DistanceType::CosineExpanded:
       case cuvs::distance::DistanceType::CorrelationExpanded: {
         auto clusters_in_view = raft::make_device_matrix_view<const MathT, IdxT, raft::row_major>(
@@ -962,14 +978,17 @@ auto build_fine_clusters(const raft::resources& handle,
 
     thrust::transform_iterator<MappingOpT, const T*> mapping_itr(dataset_mptr, mapping_op);
     raft::matrix::gather(mapping_itr, dim, n_rows, mc_trainset_ids, k, mc_trainset, stream);
+    const MathT* fine_subset_norm = nullptr;
     if (params.metric == cuvs::distance::DistanceType::L2Expanded ||
         params.metric == cuvs::distance::DistanceType::L2SqrtExpanded ||
-        params.metric == cuvs::distance::DistanceType::CosineExpanded) {
+        params.metric == cuvs::distance::DistanceType::CosineExpanded ||
+        params.inner_product_cosine_assignment) {
       thrust::gather(raft::resource::get_thrust_policy(handle),
                      mc_trainset_ids,
                      mc_trainset_ids + k,
                      dataset_norm_mptr,
                      mc_trainset_norm);
+      fine_subset_norm = mc_trainset_norm;
     }
 
     build_clusters(handle,
@@ -983,7 +1002,7 @@ auto build_fine_clusters(const raft::resources& handle,
                    mc_trainset_csizes_tmp.data(),
                    mapping_op,
                    device_memory,
-                   mc_trainset_norm);
+                   fine_subset_norm);
 
     raft::copy(handle,
                raft::make_device_vector_view(cluster_centers + (dim * fine_clusters_csum[i]),
@@ -1035,23 +1054,35 @@ void build_hierarchical(const raft::resources& handle,
 
   IdxT n_mesoclusters = std::min(n_clusters, static_cast<IdxT>(std::sqrt(n_clusters) + 0.5));
   RAFT_LOG_DEBUG("build_hierarchical: n_mesoclusters: %u", n_mesoclusters);
+  if (params.metric == cuvs::distance::DistanceType::InnerProduct &&
+      params.inner_product_cosine_assignment) {
+    RAFT_LOG_INFO(
+      "kmeans_balanced build_hierarchical: InnerProduct + inner_product_cosine_assignment "
+      "(cosine-style "
+      "assignment / norm precompute path)");
+  }
 
   // TODO: Remove the explicit managed memory- we shouldn't be creating this on the user's behalf.
   rmm::mr::managed_memory_resource managed_memory;
   rmm::device_async_resource_ref device_memory = raft::resource::get_workspace_resource(handle);
-  auto [max_minibatch_size, mem_per_row] =
-    calc_minibatch_size<MathT>(n_clusters, n_rows, dim, params.metric, std::is_same_v<T, MathT>);
+  const auto hierarchical_minibatch_metric     = params.inner_product_cosine_assignment
+                                                   ? cuvs::distance::DistanceType::CosineExpanded
+                                                   : params.metric;
+  auto [max_minibatch_size, mem_per_row]       = calc_minibatch_size<MathT>(
+    n_clusters, n_rows, dim, hierarchical_minibatch_metric, std::is_same_v<T, MathT>);
 
   // Precompute the L2 norm of the dataset if relevant and not yet computed.
   rmm::device_uvector<MathT> dataset_norm_buf(0, stream, device_memory);
   const MathT* dataset_norm = nullptr;
   if ((params.metric == cuvs::distance::DistanceType::L2Expanded ||
        params.metric == cuvs::distance::DistanceType::L2SqrtExpanded ||
-       params.metric == cuvs::distance::DistanceType::CosineExpanded)) {
+       params.metric == cuvs::distance::DistanceType::CosineExpanded ||
+       params.inner_product_cosine_assignment)) {
     dataset_norm_buf.resize(n_rows, stream);
     for (IdxT offset = 0; offset < n_rows; offset += max_minibatch_size) {
       IdxT minibatch_size = std::min<IdxT>(max_minibatch_size, n_rows - offset);
-      if (params.metric == cuvs::distance::DistanceType::CosineExpanded)
+      if (params.metric == cuvs::distance::DistanceType::CosineExpanded ||
+          params.inner_product_cosine_assignment)
         compute_norm(handle,
                      dataset_norm_buf.data() + offset,
                      dataset + dim * offset,

@@ -7,6 +7,7 @@
 
 #include "../neighbors/detail/ann_utils.cuh"
 #include "detail/kmeans_balanced.cuh"
+#include <raft/core/logger.hpp>
 #include <raft/core/mdarray.hpp>
 #include <raft/core/resource/device_memory_resource.hpp>
 #include <raft/util/cuda_utils.cuh>
@@ -145,6 +146,14 @@ void predict(const raft::resources& handle,
                  static_cast<uint64_t>(std::numeric_limits<LabelT>::max()),
                "The chosen label type cannot represent all cluster labels");
 
+  if (params.metric == cuvs::distance::DistanceType::InnerProduct &&
+      params.inner_product_cosine_assignment) {
+    RAFT_LOG_INFO(
+      "kmeans_balanced predict: InnerProduct + inner_product_cosine_assignment (minibatched "
+      "predict -> "
+      "fused cosine NN in predict_core)");
+  }
+
   cuvs::cluster::kmeans::detail::predict(handle,
                                          params,
                                          centroids.data_handle(),

@@ -1330,6 +1330,14 @@ auto build(raft::resources const& handle,
     cuvs::cluster::kmeans::balanced_params kmeans_params;
     kmeans_params.n_iters = params.kmeans_n_iters;
     kmeans_params.metric  = static_cast<cuvs::distance::DistanceType>((int)impl->metric());
+    kmeans_params.inner_product_cosine_assignment =
+      (impl->metric() == distance::DistanceType::InnerProduct);
+    if (kmeans_params.inner_product_cosine_assignment) {
+      RAFT_LOG_INFO(
+        "IVF-PQ build: balanced k-means inner_product_cosine_assignment=true (cosine-style E-step "
+        "for "
+        "InnerProduct)");
+    }
 
     if (impl->metric() == distance::DistanceType::CosineExpanded) {
       raft::linalg::row_normalize<raft::linalg::L2Norm>(
@@ -1341,6 +1349,7 @@ auto build(raft::resources const& handle,
     rmm::device_uvector<uint32_t> labels(n_rows_train, stream, big_memory_resource);
     auto centers_const_view = raft::make_device_matrix_view<const float, internal_extents_t>(
       cluster_centers, impl->n_lists(), impl->dim());
+
     if (impl->metric() == distance::DistanceType::CosineExpanded) {
       raft::linalg::row_normalize<raft::linalg::L2Norm>(handle, centers_const_view, centers_view);
     }

@@ -201,7 +201,13 @@ def calc_truth(dataset, queries, k, metric="sqeuclidean"):
         else:
             distances = xp.concatenate([distances, D], axis=1)
             indices = xp.concatenate([indices, Ind], axis=1)
-            idx = xp.argsort(distances, axis=1)[:, :k]
+            # Euclidean: smaller distance is better → sort ascending.
+            # Inner product: D holds similarities, larger is better → sort
+            # descending (equivalent to ascending on -D if library used -sim).
+            if metric == "inner_product":
+                idx = xp.argsort(-distances, axis=1)[:, :k]
+            else:
+                idx = xp.argsort(distances, axis=1)[:, :k]
             distances = xp.take_along_axis(distances, idx, axis=1)
             indices = xp.take_along_axis(indices, idx, axis=1)
 

@@ -345,7 +345,7 @@ def get_build_path(executable: str) -> Optional[str]:
     build_path = os.getenv("CUVS_HOME")
     if build_path:
         build_path = os.path.join(
-            build_path, "cpp", "build", "release", executable
+            build_path, "cpp", "build", "bench", "ann", executable
         )
         if os.path.exists(build_path):
             print(f"-- Using cuVS bench from repository in {build_path}.")

@@ -11,6 +11,24 @@
 from typing import Dict, List, Optional, Tuple
 
 
+def _subprocess_env(ann_executable_path: str) -> Dict[str, str]:
+    """Build env for C++ benchmark subprocess. When CUVS_HOME is set, force the repo's libcuvs.so to be used (LD_PRELOAD + LD_LIBRARY_PATH) so the correct local build runs."""
+    env = os.environ.copy()
+    repo = os.getenv("CUVS_HOME")
+    if repo:
+        build_dir = os.path.join(repo, "cpp", "build")
+        if os.path.isdir(build_dir):
+            lib = os.path.join(build_dir, "libcuvs.so")
+            if os.path.isfile(lib):
+                env["LD_PRELOAD"] = lib + (os.pathsep + env["LD_PRELOAD"] if env.get("LD_PRELOAD") else "")
+            env["LD_LIBRARY_PATH"] = build_dir + os.pathsep + env.get("LD_LIBRARY_PATH", "")
+            # So IVF-PQ normalization logging goes to a known path (C++ uses this when set)
+            log_path = os.path.join(build_dir, "cuvs_ivf_pq_normalization.log")
+            env["CUVS_IVF_PQ_NORMALIZATION_LOG"] = log_path
+            print(f"[cuvs_bench] IVF-PQ normalization log (if any) -> {log_path}", flush=True)
+    return env
+
+
 def cuvs_bench_cpp(
     conf_file: Dict,
     conf_filename: str,
@@ -123,7 +141,7 @@ def cuvs_bench_cpp(
                 )
             else:
                 try:
-                    subprocess.run(cmd, check=True)
+                    subprocess.run(cmd, check=True, env=_subprocess_env(ann_executable_path))
                     merge_build_files(
                         build_folder, build_file, temp_build_file
                     )
@@ -163,7 +181,7 @@ def cuvs_bench_cpp(
                 )
             else:
                 try:
-                    subprocess.run(cmd, check=True)
+                    subprocess.run(cmd, check=True, env=_subprocess_env(ann_executable_path))
                 except Exception as e:
                     print(f"Error occurred running benchmark: {e}")
                 finally: