ml-explore · sofinvalery · May 28, 2026
diff --git a/mlx/backend/cuda/reduce/all_reduce.cu b/mlx/backend/cuda/reduce/all_reduce.cu
@@ -68,12 +68,12 @@ void all_reduce(
 
   out.set_data(cu::malloc_async(out.nbytes(), encoder));
 
-  auto get_args = [](int size, int N) {
-    int threads = std::min(512, (size + N - 1) / N);
+  auto get_args = [](size_t size, size_t N) {
+    int threads =
+        static_cast<int>(std::min(size_t{512}, cuda::ceil_div(size, N)));
     threads = ((threads + WARP_SIZE - 1) / WARP_SIZE) * WARP_SIZE;
-    int reductions_per_step = threads * N;
-    size_t steps_needed =
-        (size + reductions_per_step - 1) / reductions_per_step;
+    size_t reductions_per_step = threads * N;
+    size_t steps_needed = cuda::ceil_div(size, reductions_per_step);
 
     int blocks;
     if (steps_needed < 32) {
@@ -88,7 +88,7 @@ void all_reduce(
       blocks = 1024;
     }
 
-    size_t steps_per_block = (steps_needed + blocks - 1) / blocks;
+    size_t steps_per_block = cuda::ceil_div(steps_needed, blocks);
     size_t block_step = steps_per_block * reductions_per_step;
 
     return std::make_tuple(blocks, threads, block_step);