85 commits
66d7fd3
combine impls
tarang-jain Apr 10, 2026
07707af
Multi-GPU Batched KMeans
viclafargue Apr 13, 2026
efc270f
Merge branch 'main' into mg-batched-kmeans
viclafargue Apr 13, 2026
0a09e6f
rm inertia_check
tarang-jain Apr 13, 2026
99a5730
change to warning
tarang-jain Apr 13, 2026
a077406
style
tarang-jain Apr 13, 2026
d659875
add init_size param
tarang-jain Apr 13, 2026
ec2e8b7
Merge branch 'main' into combine-batch
tarang-jain Apr 13, 2026
03a6473
docs
tarang-jain Apr 13, 2026
42a8d9d
Merge branch 'combine-batch' of https://github.com/tarang-jain/cuvs i…
tarang-jain Apr 13, 2026
86af2fa
rm direct cuda api calls
tarang-jain Apr 13, 2026
d4e4e2c
std::swap instead of raft::copy
tarang-jain Apr 14, 2026
0819af5
cache batch norms
tarang-jain Apr 14, 2026
e0f079c
centroid norms can also be cached per iteration
tarang-jain Apr 14, 2026
c2f7390
mg n_iter
tarang-jain Apr 14, 2026
b9c3102
pre-commit
tarang-jain Apr 14, 2026
e3956c1
do not break c abi
tarang-jain Apr 14, 2026
986d78a
Merge branch 'main' into combine-batch
tarang-jain Apr 14, 2026
7197b71
cluster_cost on device
viclafargue Apr 14, 2026
84ab315
Updated testing
viclafargue Apr 14, 2026
47d4b94
templating
viclafargue Apr 15, 2026
a8e1d26
Merge branch 'main' into combine-batch
tarang-jain Apr 16, 2026
384d054
fix checkWeight
tarang-jain Apr 21, 2026
455b286
merge upstream:
tarang-jain Apr 21, 2026
5462809
Merge branch 'combine-batch' of https://github.com/tarang-jain/cuvs i…
tarang-jain Apr 21, 2026
6ba759c
fix compilation
tarang-jain Apr 21, 2026
e76eaac
rel_tol
tarang-jain Apr 22, 2026
afbefdf
pass workspace
tarang-jain Apr 22, 2026
e62a63c
Merge branch 'combine-batch' of https://github.com/tarang-jain/cuvs i…
tarang-jain Apr 22, 2026
e4f08bf
style
tarang-jain Apr 22, 2026
6e4a8f0
Merge branch 'main' of https://github.com/rapidsai/cuvs into combine-…
tarang-jain Apr 22, 2026
4a8a85c
do not use batch scratch space; rm update_centroids
tarang-jain Apr 22, 2026
bbf2a9f
move the debug log
tarang-jain Apr 22, 2026
410092c
add new suffixed param struct
tarang-jain Apr 22, 2026
c515c1e
address pr reviews
tarang-jain Apr 22, 2026
e8e63ab
fix docstring
tarang-jain Apr 22, 2026
30c457c
fix wt_sum warning
tarang-jain Apr 22, 2026
ab96623
rm deprecationwarning and instead add FutureWarning:=
tarang-jain Apr 22, 2026
269f23c
unweighted to never materialize batch weights
tarang-jain Apr 22, 2026
80a22ca
add cpp tests
tarang-jain Apr 23, 2026
ac06b05
update cpp tests
tarang-jain Apr 23, 2026
855624a
Merge branch 'main' into mg-batched-kmeans
viclafargue Apr 23, 2026
0a6748d
refactor
viclafargue Apr 23, 2026
7055272
rename to mnmg_fit
viclafargue Apr 23, 2026
0569340
revert batch norms cache
tarang-jain Apr 23, 2026
8cac63a
increase zero cost threshold
tarang-jain Apr 24, 2026
f6df4ae
apply cuda event plus re-add h_norm_cache
tarang-jain Apr 24, 2026
9fc74b1
rm cosine expanded stuff
tarang-jain Apr 24, 2026
dec3dc4
resolve merge conflicts
tarang-jain Apr 28, 2026
0d030a2
change suffix of the params struct
tarang-jain Apr 28, 2026
b1c034e
replace 06 by 08, add todo and note
tarang-jain Apr 28, 2026
a482495
update to v2
tarang-jain Apr 28, 2026
8ecfdc1
avoid stream sync inside weight sum
tarang-jain Apr 29, 2026
1e1525e
Merge branch 'combine-batch' of https://github.com/tarang-jain/cuvs i…
tarang-jain Apr 29, 2026
ec22e07
empty
tarang-jain Apr 29, 2026
d2e410d
empty
tarang-jain Apr 29, 2026
b791c38
Merge branch 'main' into combine-batch
tarang-jain Apr 29, 2026
a05a006
new signatures with new struct
tarang-jain Apr 29, 2026
73293cf
Merge branch 'combine-batch' of https://github.com/tarang-jain/cuvs i…
tarang-jain Apr 29, 2026
880c7b9
Merge branch 'main' of https://github.com/rapidsai/cuvs into combine-…
tarang-jain Apr 30, 2026
e2035ec
revert change to calls in py and rust; add c tests
tarang-jain Apr 30, 2026
e28c200
Merge branch 'main' into combine-batch
tarang-jain May 1, 2026
55bbdad
use to_dlpack
tarang-jain May 5, 2026
9a9b8ee
cache device weights
tarang-jain May 5, 2026
a800b27
rm event
tarang-jain May 5, 2026
3db8582
update names
tarang-jain May 5, 2026
c048352
rename
tarang-jain May 5, 2026
2f968f8
rm docs
tarang-jain May 5, 2026
affe85a
empty
tarang-jain May 5, 2026
c6dea64
fix norm cache
tarang-jain May 5, 2026
7dfab3e
revert changes to minClusterDistanceCompute
tarang-jain May 6, 2026
7a383da
update tests to use mdspan instead of rmm
tarang-jain May 6, 2026
ce6c4b5
Merge branch 'main' into combine-batch
tarang-jain May 6, 2026
5a06a44
Merge branch 'main' into combine-batch
tarang-jain May 6, 2026
28cda6a
Merge branch 'combine-batch' into mg-batched-kmeans
viclafargue May 7, 2026
bfb5290
Addressing review
viclafargue May 7, 2026
add9db1
optimize convergence check
viclafargue May 7, 2026
af606bc
Adressing review
viclafargue May 8, 2026
41c66b8
Merge branch 'main' into mg-batched-kmeans
viclafargue May 8, 2026
f664c2c
results on all ranks for RAFT + small optimization
viclafargue May 8, 2026
6c2c03d
reviews
viclafargue May 11, 2026
7f6d664
Global sampling for init
viclafargue May 11, 2026
f8270e2
SNMG -> MNMG
viclafargue May 11, 2026
bbf0302
Merge branch 'main' into mg-batched-kmeans
viclafargue May 11, 2026
a14a6bc
adding asserts
viclafargue May 11, 2026
10 changes: 9 additions & 1 deletion cpp/include/cuvs/cluster/kmeans.hpp
@@ -179,6 +179,13 @@ enum class kmeans_type { KMeans = 0, KMeansBalanced = 1 };
* on the host. Data is processed in GPU-sized batches, streaming from host to device.
* The batch size is controlled by params.streaming_batch_size.
*
* Multi-GPU dispatch is selected automatically based on the handle state:
* - If `raft::resource::is_multi_gpu(handle)` (cuVS SNMG): the full dataset X
* is split across GPUs internally with an OpenMP parallel region and NCCL.
* - If `raft::resource::comms_initialized(handle)` (Dask/Ray/MPI): X is treated as
* this worker's partition, and RAFT communicators are used for collectives.
* - Otherwise: single-GPU batched k-means.
*
* @code{.cpp}
* #include <raft/core/resources.hpp>
* #include <cuvs/cluster/kmeans.hpp>
@@ -208,7 +215,8 @@ enum class kmeans_type { KMeans = 0, KMeansBalanced = 1 };
* raft::make_host_scalar_view(&n_iter));
* @endcode
*
* @param[in] handle The raft handle.
* @param[in] handle The raft handle. When a multi-GPU resource is
* attached, multi-GPU dispatch is used automatically.
* @param[in] params Parameters for KMeans model. Batch size is read from
* params.streaming_batch_size.
* @param[in] X Training instances on HOST memory. The data must
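The dispatch order documented above can be read off directly from the handle state. Below is a minimal illustrative sketch (the helper name `select_kmeans_dispatch` and the enum are assumptions introduced here, not part of this PR), using only the two resource queries named in the doc comment; the exact header providing `raft::resource::is_multi_gpu` depends on the RAFT version.

```cpp
#include <raft/core/resource/comms.hpp>
#include <raft/core/resources.hpp>
// Note: the header providing raft::resource::is_multi_gpu is version-dependent
// and is assumed to be available alongside the cuVS multi-GPU resources.

// Illustrative only: mirrors the dispatch precedence documented for the
// batched k-means fit overload above.
enum class kmeans_dispatch { multi_gpu_snmg, multi_gpu_comms, single_gpu };

inline kmeans_dispatch select_kmeans_dispatch(raft::resources const& handle)
{
  // cuVS SNMG: the full host dataset is split across local GPUs internally
  // (OpenMP parallel region + NCCL).
  if (raft::resource::is_multi_gpu(handle)) { return kmeans_dispatch::multi_gpu_snmg; }
  // Dask/Ray/MPI: X is this worker's partition; RAFT communicators
  // perform the collectives.
  if (raft::resource::comms_initialized(handle)) { return kmeans_dispatch::multi_gpu_comms; }
  // Otherwise: single-GPU batched k-means.
  return kmeans_dispatch::single_gpu;
}
```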
15 changes: 8 additions & 7 deletions cpp/src/cluster/detail/kmeans_common.cuh
@@ -596,26 +596,27 @@ void compute_centroid_shift(raft::resources const& handle,
* @brief Evaluate convergence criteria entirely on device.
*
* Checks the cost-ratio and centroid-shift stopping conditions and writes
* a boolean result (0 or 1) into @p done_flag. Also advances
* @p prior_clustering_cost to the current cost for the next iteration.
* 0 or 1 into @p done_flag, and advances @p prior_clustering_cost.
* @p FlagT is deduced from @p done_flag (default `int`); MG callers pass
* `int64_t` for NCCL allreduce compatibility.
*/
template <typename DataT>
template <typename DataT, typename FlagT = int>
__device__ void check_convergence(raft::device_scalar_view<const DataT> clustering_cost,
raft::device_scalar_view<DataT> prior_clustering_cost,
raft::device_scalar_view<const DataT> sqrd_norm_error,
DataT tol,
int n_iter,
raft::device_scalar_view<int> done_flag)
raft::device_scalar_view<FlagT> done_flag)
{
DataT cur_cost = *clustering_cost.data_handle();
DataT norm_err = *sqrd_norm_error.data_handle();
int done = 0;
FlagT done = FlagT{0};

if (cur_cost != DataT{0} && n_iter > 1) {
DataT delta = cur_cost / *prior_clustering_cost.data_handle();
if (delta > DataT{1} - tol) done = 1;
if (delta > DataT{1} - tol) done = FlagT{1};
}
if (norm_err < tol) done = 1;
if (norm_err < tol) done = FlagT{1};

*prior_clustering_cost.data_handle() = cur_cost;
*done_flag.data_handle() = done;
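Restated outside of device code, the stopping rule implemented by the hunk above is small; the sketch below is a host-side paraphrase for readability (names are illustrative), not the `__device__` function itself. In the multi-GPU path `FlagT` is instantiated as `int64_t` so the flag can be combined across ranks with an NCCL allreduce, as the new doc comment states.

```cpp
// Host-side paraphrase of the convergence test (illustrative; the actual
// implementation is the __device__ check_convergence in kmeans_common.cuh).
template <typename DataT>
bool kmeans_converged(DataT cur_cost,
                      DataT& prior_cost,      // advanced to cur_cost on return
                      DataT sqrd_norm_error,  // squared centroid shift
                      DataT tol,
                      int n_iter)
{
  bool done = false;
  // Cost-ratio test: stop once the relative improvement in clustering cost
  // drops below tol (skipped on the first iteration or when the cost is 0).
  if (cur_cost != DataT{0} && n_iter > 1) {
    DataT delta = cur_cost / prior_cost;
    if (delta > DataT{1} - tol) { done = true; }
  }
  // Centroid-shift test: stop when the centroids have essentially stopped moving.
  if (sqrd_norm_error < tol) { done = true; }
  prior_cost = cur_cost;
  return done;
}
```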
36 changes: 16 additions & 20 deletions cpp/src/cluster/detail/kmeans_mg.cuh
@@ -463,35 +463,31 @@ void checkWeights(const raft::resources& handle,
raft::device_vector_view<DataT, IndexT> weight)
{
cudaStream_t stream = raft::resource::get_cuda_stream(handle);
rmm::device_scalar<DataT> wt_aggr(stream);
auto d_wt_sum = raft::make_device_scalar<DataT>(handle, DataT{0});

const auto& comm = raft::resource::get_comms(handle);

auto n_samples = weight.extent(0);
raft::linalg::mapThenSumReduce(
wt_aggr.data(), n_samples, raft::identity_op{}, stream, weight.data_handle());
d_wt_sum.data_handle(), n_samples, raft::identity_op{}, stream, weight.data_handle());

comm.allreduce<DataT>(wt_aggr.data(), // sendbuff
wt_aggr.data(), // recvbuff
1, // count
comm.allreduce<DataT>(d_wt_sum.data_handle(), // sendbuff
d_wt_sum.data_handle(), // recvbuff
1, // count
raft::comms::op_t::SUM,
stream);
DataT wt_sum = wt_aggr.value(stream);
raft::resource::sync_stream(handle, stream);
RAFT_EXPECTS(wt_sum > DataT{0}, "invalid parameter (sum of sample weights must be positive)");

if (wt_sum != n_samples) {
CUVS_LOG_KMEANS(handle,
"[Warning!] KMeans: normalizing the user provided sample weights to "
"sum up to %d samples",
n_samples);

raft::linalg::map(handle,
weight,
raft::compose_op(raft::mul_const_op<DataT>{static_cast<DataT>(n_samples)},
raft::div_const_op<DataT>{wt_sum}),
raft::make_const_mdspan(weight));
}
// Normalize weights so they sum to n_samples (per rank). Reading the sum from
// a device pointer avoids a host copy / stream sync. When the sum already
// equals n_samples this is a numerical no-op (matches single-GPU behavior).
const DataT* d_wt_sum_ptr = d_wt_sum.data_handle();
raft::linalg::map(
handle,
weight,
[n_samples, d_wt_sum_ptr] __device__(DataT w) {
return w * static_cast<DataT>(n_samples) / *d_wt_sum_ptr;
},
raft::make_const_mdspan(weight));
}

template <typename DataT, typename IndexT>
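The rewritten `checkWeights` boils down to one rescaling: after the per-rank weight sums are allreduced, each weight is multiplied by `n_samples / wt_sum` so the weights sum to `n_samples` (per rank), with the scale factor read from device memory inside `raft::linalg::map` to avoid a host copy and stream sync. The snippet below is a host-side sketch of that rescaling only (function and variable names are illustrative), not the device code in the diff.

```cpp
#include <cstddef>
#include <vector>

// Host-side sketch of the per-rank weight normalization (illustrative only).
// global_wt_sum is the sum of sample weights after the allreduce across ranks.
template <typename DataT>
void normalize_sample_weights(std::vector<DataT>& weight,
                              std::size_t n_samples,
                              DataT global_wt_sum)
{
  // When global_wt_sum already equals n_samples this is a numerical no-op,
  // matching the single-GPU behavior noted in the diff comment.
  const DataT scale = static_cast<DataT>(n_samples) / global_wt_sum;
  for (auto& w : weight) { w *= scale; }
}
```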