Issue/846 - Ensure embedding tensors are on the same device.

gongchensu · gongchensu · commit 5dd2a261d45d · 2025-12-26T08:05:05.000Z
diff --git a/src/infinicore/nn/embedding.cc b/src/infinicore/nn/embedding.cc
@@ -43,9 +43,16 @@ Embedding::Embedding(size_t num_embeddings,
 }
 
 Tensor Embedding::forward(const Tensor &indices) const {
+    // Ensure indices are on the same device as the weight.
+    // Moving them here avoids a synchronous memcpy in the ops layer, which would hurt performance.
+    Tensor indices_on_device = indices;
+    if (indices->device() != device_) {
+        indices_on_device = indices->to(device_);
+    }
+    
     // Ensure indices are contiguous for efficient access
     // op::embedding now supports device-side input for graph recording
-    Tensor indices_contiguous = indices->is_contiguous() ? indices : indices->contiguous();
+    Tensor indices_contiguous = indices_on_device->is_contiguous() ? indices_on_device : indices_on_device->contiguous();
     
     // Use op::embedding which now supports device-side input and batch dimension
     // This enables full graph recording support without synchronization
diff --git a/src/infinicore/ops/embedding/embedding.cc b/src/infinicore/ops/embedding/embedding.cc
@@ -12,8 +12,10 @@ common::OpDispatcher<Embedding::schema> &Embedding::dispatcher() {
 }
 
 void Embedding::execute(Tensor out, Tensor input, Tensor weight) {
-    // Check that output and weight are on the same device
-    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, weight);
+    // Check that all tensors are on the same device
+    // This is critical: if input is on the CPU while out/weight are on the GPU,
+    // passing a CPU pointer to a CUDA kernel will cause memory access errors.
+    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, input, weight);
     
     // Set device context
     infinicore::context::setDevice(out->device());
diff --git a/src/infiniop/ops/embedding/nvidia/embedding_nvidia.cu b/src/infiniop/ops/embedding/nvidia/embedding_nvidia.cu
@@ -23,7 +23,6 @@ infiniStatus_t Descriptor::create(
     infiniopTensorDescriptor_t input_desc,
     infiniopTensorDescriptor_t weight_desc) {
 
-    auto handle_nvidia = reinterpret_cast<device::nvidia::Handle *>(handle);
     auto input_shape = input_desc->shape();
     auto weight_shape = weight_desc->shape();
     
@@ -63,7 +62,7 @@ infiniStatus_t Descriptor::create(
         vocab_size,
         input_dtype,
         weight_dtype,
-        new Opaque{handle_nvidia->internal()},
+        new Opaque{reinterpret_cast<device::nvidia::Handle *>(handle)->internal()},
         handle->device,
         handle->device_id);