Skip to content

Commit 37aa87e

Browse files
[slimtensor] Enable CUDA tensor copy (pytorch#16800)
This PR was created by the merge bot to help merge the original PR into the main branch. ghstack PR number: pytorch#16771 by @Gasoonjia ^ Please use this as the source of truth for the PR details, comments, and reviews ghstack PR base: https://github.com/pytorch/executorch/tree/gh/gasoonjia/111/base ghstack PR head: https://github.com/pytorch/executorch/tree/gh/gasoonjia/111/head Merge bot PR base: https://github.com/pytorch/executorch/tree/gh/gasoonjia/110/orig Merge bot PR head: https://github.com/pytorch/executorch/tree/gh/gasoonjia/111/orig Differential Revision: [D91202900](https://our.internmc.facebook.com/intern/diff/D91202900/) @diff-train-skip-merge --------- Co-authored-by: gasoonjia <gasoonjia@icloud.com> Co-authored-by: Gasoonjia <gasoonjia@meta.com>
1 parent 8ab593b commit 37aa87e

5 files changed

Lines changed: 427 additions & 66 deletions

File tree

backends/aoti/slim/core/SlimTensor.h

Lines changed: 36 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
#include <cstdint>
1212
#include <cstring>
1313
#include <utility>
14+
#include <vector>
1415

1516
#include <executorch/backends/aoti/slim/c10/core/Contiguity.h>
1617
#include <executorch/backends/aoti/slim/c10/core/Device.h>
@@ -277,69 +278,67 @@ class SlimTensor {
277278
* Copy data from another tensor to this tensor.
278279
*
279280
* Both tensors must have the same numel and dtype.
280-
* Currently only supports CPU-to-CPU copy (contiguous tensors only).
281+
* Supports CPU-to-CPU and cross-device copies (CPU↔CUDA, CUDA↔CUDA).
281282
*
282283
* @param other The source tensor to copy from
283284
* @return Reference to this tensor
284285
*/
285286
SlimTensor& copy_(const SlimTensor& other) {
286287
ET_CHECK_MSG(
287-
this->numel() == other.numel(),
288-
"copy_: numel mismatch (dst=%zu, src=%zu)",
289-
this->numel(),
290-
other.numel());
291-
ET_CHECK_MSG(this->dtype() == other.dtype(), "copy_: dtype mismatch");
288+
this->numel() == other.numel(), "copy_: numel of tensors must match");
289+
ET_CHECK_MSG(this->dtype() == other.dtype(), "copy_: dtype must match");
292290

293291
if (this->numel() == 0) {
294292
return *this;
295293
}
296294

297-
// Current we only support CPU-only tensors
298-
// TODO(gasoonjia): support other device types.
299-
ET_CHECK_MSG(
300-
this->is_cpu() && other.is_cpu(), "copy_: only CPU tensors supported");
301-
295+
// Case 1: Both tensors are contiguous. We can do a fast bulk copy.
302296
if (this->is_contiguous() && other.is_contiguous()) {
303-
// Fast path: both tensors are contiguous, use memcpy
304-
std::memcpy(this->data_ptr(), other.data_ptr(), other.nbytes());
305-
} else {
306-
// Slow path: element-wise copy for non-contiguous tensors
307-
copy_strided_(other);
297+
storage_->copy_(
298+
this->data_ptr(), other.data_ptr(), other.nbytes(), other.device());
299+
return *this;
308300
}
309301

310-
return *this;
311-
}
312-
313-
private:
314-
/**
315-
* Element-wise copy for non-contiguous tensors.
316-
*/
317-
void copy_strided_(const SlimTensor& other) {
302+
// Case 2: At least one tensor is non-contiguous, perform element-wise copy
303+
// that respects both source and destination strides.
318304
const size_t elem_size = c10::elementSize(dtype_);
319305
char* dst_data = static_cast<char*>(this->data_ptr());
320306
const char* src_data = static_cast<const char*>(other.data_ptr());
321307

322308
std::vector<int64_t> counter(this->dim(), 0);
323309
for (size_t i = 0; i < this->numel(); i++) {
324-
// Compute source offset
310+
// Compute src offset in elements
325311
int64_t src_offset = 0;
326312
for (size_t d = 0; d < other.dim(); d++) {
327-
src_offset += counter[d] * other.stride(static_cast<int64_t>(d));
313+
src_offset += counter[d] * other.stride(d);
328314
}
329315

330-
// Compute destination offset
316+
// Compute dst offset in elements
331317
int64_t dst_offset = 0;
332318
for (size_t d = 0; d < this->dim(); d++) {
333-
dst_offset += counter[d] * this->stride(static_cast<int64_t>(d));
319+
dst_offset += counter[d] * this->stride(d);
334320
}
335321

336-
// Copy single element
337-
std::memcpy(
338-
dst_data + dst_offset * static_cast<int64_t>(elem_size),
339-
src_data + src_offset * static_cast<int64_t>(elem_size),
340-
elem_size);
341-
342-
// Increment multi-dimensional counter
322+
// Copy elem_size bytes from src to dst
323+
if (this->device().is_cpu() && other.device().is_cpu()) {
324+
std::memcpy(
325+
dst_data + dst_offset * elem_size,
326+
src_data + src_offset * elem_size,
327+
elem_size);
328+
} else if (this->device().is_cuda() || other.device().is_cuda()) {
329+
#if defined(CUDA_AVAILABLE)
330+
DeviceTraits<c10::DeviceType::CUDA>::memcpy(
331+
dst_data + dst_offset * elem_size,
332+
src_data + src_offset * elem_size,
333+
elem_size,
334+
device(), // dst device
335+
other.device() // src device
336+
);
337+
#else
338+
ET_CHECK_MSG(false, "Failed on copy_ cuda tensors: no CUDA support");
339+
#endif
340+
}
341+
// Increment the multi-dimensional counter
343342
for (int64_t d = static_cast<int64_t>(this->dim()) - 1; d >= 0; --d) {
344343
counter[d]++;
345344
if (counter[d] < this->size(d)) {
@@ -348,8 +347,10 @@ class SlimTensor {
348347
counter[d] = 0;
349348
}
350349
}
350+
return *this;
351351
}
352352

353+
private:
353354
void refresh_numel() {
354355
numel_ = compute_numel(sizes_and_strides_.sizes_arrayref());
355356
}

backends/aoti/slim/core/Storage.h

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -296,12 +296,15 @@ class MaybeOwningStorage {
296296
return;
297297
}
298298

299-
ET_CHECK_MSG(
300-
device_.is_cpu() && src_device.is_cpu(),
301-
"Only CPU-to-CPU copy is currently supported");
302-
303-
DeviceTraits<c10::DeviceType::CPU>::memcpy(
304-
dst_data_ptr, src_data_ptr, nbytes, device_, src_device);
299+
if (device_.is_cpu() && src_device.is_cpu()) {
300+
// CPU to CPU copy
301+
DeviceTraits<c10::DeviceType::CPU>::memcpy(
302+
dst_data_ptr, src_data_ptr, nbytes, device_, src_device);
303+
} else {
304+
// At least one of the devices is CUDA
305+
DeviceTraits<c10::DeviceType::CUDA>::memcpy(
306+
dst_data_ptr, src_data_ptr, nbytes, device_, src_device);
307+
}
305308
}
306309

307310
/// Creates a clone of this storage on the specified device.

backends/aoti/slim/core/targets.bzl

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@ def define_common_targets():
2222
],
2323
)
2424

25-
# Header-only library for SlimTensor (CPU-only for now)
2625
runtime.cxx_library(
2726
name = "slimtensor",
2827
headers = [

backends/aoti/slim/core/test/targets.bzl

Lines changed: 6 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -32,49 +32,31 @@ def define_common_targets():
3232
**backend_kwargs
3333
)
3434

35-
backend_kwargs = {
36-
"external_deps": [("cuda", None, "cuda-lazy")],
37-
"preprocessor_flags": ["-DCUDA_AVAILABLE=1"],
38-
"keep_gpu_sections": True,
39-
"remote_execution": re_test_utils.remote_execution(
40-
platform = "gpu-remote-execution",
41-
),
42-
} if backend_mode == "cuda" else {}
43-
4435
runtime.cxx_test(
45-
name = "test_storage" + backend_suffix,
36+
name = "test_slimtensor_basic" + backend_suffix,
4637
srcs = [
47-
"test_storage.cpp",
38+
"test_slimtensor_basic.cpp",
4839
],
4940
deps = [
41+
"//executorch/backends/aoti/slim/core:slimtensor",
5042
"//executorch/backends/aoti/slim/core:storage",
5143
],
5244
**backend_kwargs
5345
)
5446

5547
runtime.cxx_test(
56-
name = "test_slimtensor_basic" + backend_suffix,
48+
name = "test_slimtensor_copy" + backend_suffix,
5749
srcs = [
58-
"test_slimtensor_basic.cpp",
50+
"test_slimtensor_copy.cpp",
5951
],
6052
deps = [
6153
"//executorch/backends/aoti/slim/core:slimtensor",
6254
"//executorch/backends/aoti/slim/core:storage",
55+
"//executorch/backends/aoti/slim/factory:empty",
6356
],
6457
**backend_kwargs
6558
)
6659

67-
runtime.cxx_test(
68-
name = "test_slimtensor_copy",
69-
srcs = [
70-
"test_slimtensor_copy.cpp",
71-
],
72-
deps = [
73-
"//executorch/backends/aoti/slim/core:slimtensor",
74-
"//executorch/backends/aoti/slim/core:storage",
75-
],
76-
)
77-
7860
runtime.cxx_test(
7961
name = "test_slimtensor_dtypes",
8062
srcs = [

0 commit comments

Comments (0)