Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions include/infinicore/context/context.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ void memcpyH2D(void *dst, const void *src, size_t size, bool async = true);
void memcpyD2H(void *dst, const void *src, size_t size);
void memcpyD2D(void *dst, const void *src, size_t size, bool async = true);
void memcpyH2H(void *dst, const void *src, size_t size);
void memcpyD2DPeer(void *dst, int dst_device, const void *src, int src_device, size_t size, bool async = true);

// Timing APIs for performance measurement
infinirtEvent_t createEvent();
Expand Down
3 changes: 3 additions & 0 deletions include/infinirt.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,9 @@ __C __export infiniStatus_t infinirtFreeHost(void *ptr);
__C __export infiniStatus_t infinirtMemcpy(void *dst, const void *src, size_t size, infinirtMemcpyKind_t kind);
__C __export infiniStatus_t infinirtMemcpyAsync(void *dst, const void *src, size_t size, infinirtMemcpyKind_t kind, infinirtStream_t stream);

__C __export infiniStatus_t infinirtMemcpyPeer(void *dst, int dst_device, const void *src, int src_device, size_t size);
__C __export infiniStatus_t infinirtMemcpyPeerAsync(void *dst, int dst_device, const void *src, int src_device, size_t size, infinirtStream_t stream);

// Stream-ordered memory
__C __export infiniStatus_t infinirtMallocAsync(void **p_ptr, size_t size, infinirtStream_t stream);
__C __export infiniStatus_t infinirtFreeAsync(void *ptr, infinirtStream_t stream);
Expand Down
4 changes: 4 additions & 0 deletions src/infinicore/context/context_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,10 @@ void memcpyH2H(void *dst, const void *src, size_t size) {
return ContextImpl::singleton().getCpuRuntime()->memcpyD2D(dst, src, size);
}

void memcpyD2DPeer(void *dst, int dst_device, const void *src, int src_device, size_t size, bool async) {
return ContextImpl::singleton().getCurrentRuntime()->memcpyD2DPeer(dst, dst_device, src, src_device, size, async);
}

// Timing API implementations
infinirtEvent_t createEvent() {
return ContextImpl::singleton().getCurrentRuntime()->createEvent();
Expand Down
8 changes: 8 additions & 0 deletions src/infinicore/context/runtime/runtime.cc
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,14 @@ void Runtime::memcpyD2D(void *dst, const void *src, size_t size, bool async) {
}
}

void Runtime::memcpyD2DPeer(void *dst, int dst_device, const void *src, int src_device, size_t size, bool async) {
if (async) {
INFINICORE_CHECK_ERROR(infinirtMemcpyPeerAsync(dst, dst_device, src, src_device, size, stream_));
} else {
INFINICORE_CHECK_ERROR(infinirtMemcpyPeer(dst, dst_device, src, src_device, size));
}
}

// Timing method implementations
infinirtEvent_t Runtime::createEvent() {
infinirtEvent_t event;
Expand Down
1 change: 1 addition & 0 deletions src/infinicore/context/runtime/runtime.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ class Runtime {
void memcpyH2D(void *dst, const void *src, size_t size, bool async = true);
void memcpyD2H(void *dst, const void *src, size_t size);
void memcpyD2D(void *dst, const void *src, size_t size, bool async = true);
void memcpyD2DPeer(void *dst, int dst_device, const void *src, int src_device, size_t size, bool async = true);

// Timing methods
infinirtEvent_t createEvent();
Expand Down
44 changes: 31 additions & 13 deletions src/infinicore/tensor/copy.cc
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,9 @@ void TensorImpl::copy_from(Tensor src) {
if (src->shape() != this->shape()) {
throw std::runtime_error("Cannot copy from tensor with different shape");
}
if (this->device() == src->device()) {
auto this_device = this->device();
auto src_device = src->device();
if (this_device == src_device) {
op::rearrange_(Tensor(const_cast<TensorImpl *>(this)->shared_from_this()), src);
} else {
if (!src->is_contiguous()) {
Expand All @@ -30,23 +32,39 @@ void TensorImpl::copy_from(Tensor src) {

// Use nbytes() to get the actual tensor size, not the full memory size
size_t copy_size = std::min(this->nbytes(), src->nbytes());
if (this->device().getType() == Device::Type::CPU) {
context::setDevice(src->device());
if (this_device.getType() == src_device.getType()) {
context::setDevice(this_device);
// Same device type, e.g., NVIDIA to NVIDIA, different indices; won't be CPU
if (this->is_contiguous()) {
context::memcpyD2H(this->data(), src->data(), copy_size);
context::memcpyD2DPeer(this->data(), this_device.getIndex(),
src->data(), src_device.getIndex(),
copy_size);
} else {
auto local_src = Tensor::empty(this->shape(), this->dtype(), this->device());
context::memcpyD2H(local_src->data(), src->data(), this->data_.memory->size());
context::memcpyD2DPeer(local_src->data(), this_device.getIndex(),
src->data(), src_device.getIndex(),
copy_size);
op::rearrange_(Tensor(const_cast<TensorImpl *>(this)->shared_from_this()), local_src);
}
} else if (src->device().getType() == Device::Type::CPU) {
context::setDevice(this->device());
if (this->is_contiguous()) {
context::memcpyH2D(this->data(), src->data(), copy_size);
} else {
auto local_src = Tensor::empty(this->shape(), this->dtype(), this->device());
context::memcpyH2D(local_src->data(), src->data(), copy_size);
op::rearrange_(Tensor(const_cast<TensorImpl *>(this)->shared_from_this()), local_src);
} else {
if (this_device.getType() == Device::Type::CPU) {
context::setDevice(src_device);
if (this->is_contiguous()) {
context::memcpyD2H(this->data(), src->data(), copy_size);
} else {
auto local_src = Tensor::empty(this->shape(), this->dtype(), this->device());
context::memcpyD2H(local_src->data(), src->data(), this->data_.memory->size());
op::rearrange_(Tensor(const_cast<TensorImpl *>(this)->shared_from_this()), local_src);
}
} else if (src_device.getType() == Device::Type::CPU) {
context::setDevice(this_device);
if (this->is_contiguous()) {
context::memcpyH2D(this->data(), src->data(), copy_size);
} else {
auto local_src = Tensor::empty(this->shape(), this->dtype(), this->device());
context::memcpyH2D(local_src->data(), src->data(), copy_size);
op::rearrange_(Tensor(const_cast<TensorImpl *>(this)->shared_from_this()), local_src);
}
}
}
}
Expand Down
27 changes: 27 additions & 0 deletions src/infinirt/ascend/infinirt_ascend.cc
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,9 @@ infiniStatus_t eventDestroy(infinirtEvent_t event) {

infiniStatus_t eventElapsedTime(float *ms_ptr, infinirtEvent_t start, infinirtEvent_t end) {
return INFINI_STATUS_NOT_IMPLEMENTED;
// Commented before validation
// CHECK_ACLRT(aclrtEventElapsedTime(ms_ptr, (aclrtEvent)start, (aclrtEvent)end));
// return INFINI_STATUS_SUCCESS;
}

infiniStatus_t mallocDevice(void **p_ptr, size_t size) {
Expand Down Expand Up @@ -143,6 +146,30 @@ infiniStatus_t memcpyAsync(void *dst, const void *src, size_t size, infinirtMemc
return INFINI_STATUS_SUCCESS;
}

infiniStatus_t memcpyPeer(void *dst, int dst_device, const void *src, int src_device, size_t size) {
return INFINI_STATUS_NOT_IMPLEMENTED;
// Commented before validation
// int32_t can_access_peer = 0;
// CHECK_ACLRT(aclrtDeviceCanAccessPeer(&can_access_peer, dst_device, src_device));
// if (!can_access_peer) {
// CHECK_ACLRT(aclrtDeviceEnablePeerAccess(src_device, 0));
// }
// CHECK_ACLRT(aclrtMemcpy(dst, size, src, size, ACL_MEMCPY_DEVICE_TO_DEVICE));
// return INFINI_STATUS_SUCCESS;
}

infiniStatus_t memcpyPeerAsync(void *dst, int dst_device, const void *src, int src_device, size_t size, infinirtStream_t stream) {
return INFINI_STATUS_NOT_IMPLEMENTED;
// Commented before validation
// int32_t can_access_peer = 0;
// CHECK_ACLRT(aclrtDeviceCanAccessPeer(&can_access_peer, dst_device, src_device));
// if (!can_access_peer) {
// CHECK_ACLRT(aclrtDeviceEnablePeerAccess(src_device, 0));
// }
// CHECK_ACLRT(aclrtMemcpyAsync(dst, size, src, size, ACL_MEMCPY_DEVICE_TO_DEVICE, (aclrtStream)stream));
// return INFINI_STATUS_SUCCESS;
}

infiniStatus_t mallocAsync(void **p_ptr, size_t size, infinirtStream_t stream) {
return mallocDevice(p_ptr, size);
}
Expand Down
27 changes: 25 additions & 2 deletions src/infinirt/bang/infinirt_bang.cc
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,19 @@ infiniStatus_t eventCreate(infinirtEvent_t *event_ptr) {
}

infiniStatus_t eventCreateWithFlags(infinirtEvent_t *event_ptr, uint32_t flags) {
return INFINI_STATUS_NOT_IMPLEMENTED;
cnrtNotifier_t notifier;
unsigned int cnrt_flags = CNRT_NOTIFIER_DEFAULT;

if (flags & INFINIRT_EVENT_DISABLE_TIMING) {
cnrt_flags |= CNRT_NOTIFIER_DISABLE_TIMING_ALL;
}
if (flags & INFINIRT_EVENT_BLOCKING_SYNC) {
cnrt_flags |= CNRT_NOTIFIER_DISABLE_TIMING_SW;
}

CHECK_BANGRT(cnrtNotifierCreateWithFlags(&notifier, cnrt_flags));
*event_ptr = notifier;
return INFINI_STATUS_SUCCESS;
}

infiniStatus_t eventRecord(infinirtEvent_t event, infinirtStream_t stream) {
Expand Down Expand Up @@ -83,7 +95,8 @@ infiniStatus_t eventDestroy(infinirtEvent_t event) {
}

infiniStatus_t eventElapsedTime(float *ms_ptr, infinirtEvent_t start, infinirtEvent_t end) {
return INFINI_STATUS_NOT_IMPLEMENTED;
CHECK_BANGRT(cnrtNotifierElapsedTime((cnrtNotifier_t)start, (cnrtNotifier_t)end, ms_ptr));
return INFINI_STATUS_SUCCESS;
}

infiniStatus_t mallocDevice(void **p_ptr, size_t size) {
Expand Down Expand Up @@ -131,6 +144,16 @@ infiniStatus_t memcpyAsync(void *dst, const void *src, size_t size, infinirtMemc
return INFINI_STATUS_SUCCESS;
}

infiniStatus_t memcpyPeer(void *dst, int dst_device, const void *src, int src_device, size_t size) {
CHECK_BANGRT(cnrtMemcpyPeer(dst, dst_device, (void *)src, src_device, size));
return INFINI_STATUS_SUCCESS;
}

infiniStatus_t memcpyPeerAsync(void *dst, int dst_device, const void *src, int src_device, size_t size, infinirtStream_t stream) {
CHECK_BANGRT(cnrtMemcpyPeerAsync(dst, dst_device, (void *)src, src_device, size, (cnrtQueue_t)stream));
return INFINI_STATUS_SUCCESS;
}

// Does not support async malloc. Use blocking-style malloc instead
infiniStatus_t mallocAsync(void **p_ptr, size_t size, infinirtStream_t stream) {
CHECK_BANGRT(cnrtMalloc(p_ptr, size));
Expand Down
8 changes: 8 additions & 0 deletions src/infinirt/cpu/infinirt_cpu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,14 @@ infiniStatus_t memcpyAsync(void *dst, const void *src, size_t size, infinirtMemc
return memcpy(dst, src, size, kind);
}

infiniStatus_t memcpyPeer(void *dst, int dst_device, const void *src, int src_device, size_t size) {
return INFINI_STATUS_INTERNAL_ERROR;
}

infiniStatus_t memcpyPeerAsync(void *dst, int dst_device, const void *src, int src_device, size_t size, infinirtStream_t stream) {
return INFINI_STATUS_INTERNAL_ERROR;
}

infiniStatus_t mallocAsync(void **p_ptr, size_t size, infinirtStream_t stream) {
return mallocDevice(p_ptr, size);
}
Expand Down
10 changes: 10 additions & 0 deletions src/infinirt/cuda/infinirt_cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,16 @@ infiniStatus_t memcpyAsync(void *dst, const void *src, size_t size, infinirtMemc
return INFINI_STATUS_SUCCESS;
}

infiniStatus_t memcpyPeer(void *dst, int dst_device, const void *src, int src_device, size_t size) {
CHECK_CUDART(cudaMemcpyPeer(dst, dst_device, src, src_device, size));
return INFINI_STATUS_SUCCESS;
}

infiniStatus_t memcpyPeerAsync(void *dst, int dst_device, const void *src, int src_device, size_t size, infinirtStream_t stream) {
CHECK_CUDART(cudaMemcpyPeerAsync(dst, dst_device, src, src_device, size, (cudaStream_t)stream));
return INFINI_STATUS_SUCCESS;
}

infiniStatus_t mallocAsync(void **p_ptr, size_t size, infinirtStream_t stream) {
CHECK_CUDART(cudaMallocAsync(p_ptr, size, (cudaStream_t)stream));
return INFINI_STATUS_SUCCESS;
Expand Down
8 changes: 8 additions & 0 deletions src/infinirt/infinirt.cc
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,14 @@ __C infiniStatus_t infinirtMemcpyAsync(void *dst, const void *src, size_t size,
INFINIRT_CALL_DEVICE_API(memcpyAsync, (dst, src, size, kind, stream));
}

__C infiniStatus_t infinirtMemcpyPeer(void *dst, int dst_device, const void *src, int src_device, size_t size) {
INFINIRT_CALL_DEVICE_API(memcpyPeer, (dst, dst_device, src, src_device, size));
}

__C infiniStatus_t infinirtMemcpyPeerAsync(void *dst, int dst_device, const void *src, int src_device, size_t size, infinirtStream_t stream) {
INFINIRT_CALL_DEVICE_API(memcpyPeerAsync, (dst, dst_device, src, src_device, size, stream));
}

__C infiniStatus_t infinirtMallocAsync(void **p_ptr, size_t size, infinirtStream_t stream) {
INFINIRT_CALL_DEVICE_API(mallocAsync, (p_ptr, size, stream));
}
Expand Down
Loading