Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 18 additions & 2 deletions unified-runtime/source/adapters/cuda/common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
#include <umf/memory_provider.h>
#include <umf/providers/provider_cuda.h>

#include "cuda_call_logger.hpp"

ur_result_t mapErrorUR(CUresult Result);

/// Converts CUDA error into UR error codes, and outputs error information
Expand All @@ -36,8 +38,22 @@ void checkErrorUR(nvmlReturn_t Result, const char *Function, int Line,
void checkErrorUR(ur_result_t Result, const char *Function, int Line,
const char *File);

#define UR_CHECK_ERROR(Result) \
checkErrorUR(Result, __func__, __LINE__, __FILE__)
// Enhanced UR_CHECK_ERROR that automatically logs CUDA calls when tracing is
// enabled
// Enhanced UR_CHECK_ERROR that automatically logs CUDA calls when tracing is
// enabled (UR_CUDA_CALL_TRACE=1).
//
// Before evaluating `expr`, the stringified expression is logged when it looks
// like a direct CUDA driver call (heuristic: stringification starts with
// "cu"). After evaluation the result code is logged, then forwarded to
// checkErrorUR which maps/raises the error.
//
// NOTE: local identifiers deliberately avoid a double-underscore prefix —
// such names are reserved to the implementation (C++ [lex.name]).
#define UR_CHECK_ERROR(expr)                                                   \
  do {                                                                         \
    if (::ur::cuda::call_logger::isEnabled()) {                                \
      constexpr const char *UrCheckCallStr = #expr;                            \
      /* Heuristic: only trace expressions that are direct cu* API calls. */   \
      if (UrCheckCallStr[0] == 'c' && UrCheckCallStr[1] == 'u') {              \
        CUDA_CALL_TRACE_GENERIC(UrCheckCallStr);                               \
      }                                                                        \
    }                                                                          \
    auto UrCheckResult = (expr);                                               \
    /* The logger checks isEnabled() itself; no outer guard needed. */         \
    CUDA_CALL_TRACE_RESULT(UrCheckResult);                                     \
    checkErrorUR(UrCheckResult, __func__, __LINE__, __FILE__);                 \
  } while (0)

std::string getCudaVersionString();

Expand Down
25 changes: 20 additions & 5 deletions unified-runtime/source/adapters/cuda/context.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include "adapter.hpp"
#include "common.hpp"
#include "common/ur_ref_count.hpp"
#include "cuda_call_logger.hpp"
#include "device.hpp"
#include "umf_helpers.hpp"

Expand Down Expand Up @@ -157,27 +158,41 @@ namespace {
class ScopedContext {
public:
ScopedContext(ur_device_handle_t Device) {
if (!Device) {
throw UR_RESULT_ERROR_INVALID_DEVICE;
if (Device) {
setContext(Device->getNativeContext());
}
setContext(Device->getNativeContext());
}

ScopedContext(CUcontext NativeContext) { setContext(NativeContext); }

~ScopedContext() {}
~ScopedContext() {
if (NeedToRecover) {
CUDA_CALL_TRACE_CTX_SET(Original);
(void)cuCtxSetCurrent(Original);
}
}

private:
void setContext(CUcontext Desired) {
CUcontext Original = nullptr;
Original = nullptr;

CUDA_CALL_TRACE_CTX_GET(&Original);
UR_CHECK_ERROR(cuCtxGetCurrent(&Original));

// Make sure the desired context is active on the current thread, setting
// it if necessary
if (Original != Desired) {
CUDA_CALL_TRACE_CTX_SET(Desired);
UR_CHECK_ERROR(cuCtxSetCurrent(Desired));
// Only restore context if we had a valid one before
// (don't restore to nullptr)
if (Original != nullptr) {
NeedToRecover = true;
}
}
}

CUcontext Original = nullptr;
bool NeedToRecover = false;
};
} // namespace
165 changes: 165 additions & 0 deletions unified-runtime/source/adapters/cuda/cuda_call_logger.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
//===--------- cuda_call_logger.hpp - CUDA API Call Logger ---------------===//
//
// Copyright (C) 2024 Intel Corporation
//
// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
// Exceptions. See LICENSE.TXT
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#pragma once

#include "logger/ur_logger.hpp"

#include <cstdint>
#include <cstdlib>
#include <cuda.h>
#include <iostream>
#include <sstream>
#include <string>

namespace ur::cuda::call_logger {

/// Returns true when CUDA call tracing is requested via the
/// UR_CUDA_CALL_TRACE environment variable (first character '1').
/// The environment is read once and cached for the process lifetime.
inline bool isEnabled() {
  static bool enabled = [] {
    const char *env = std::getenv("UR_CUDA_CALL_TRACE");
    return env != nullptr && env[0] == '1';
  }();
  return enabled;
}

/// Formats a host pointer as a "0x..." hexadecimal string.
template <typename T> std::string formatPtr(T *ptr) {
  std::stringstream ss;
  ss << std::hex << "0x" << reinterpret_cast<uintptr_t>(ptr);
  return ss.str();
}

/// Formats a CUdeviceptr (an integer device address) as "0x...".
inline std::string formatDevPtr(CUdeviceptr ptr) {
  std::stringstream ss;
  ss << std::hex << "0x" << ptr;
  return ss.str();
}

/// Formats a byte count as "<size> bytes".
inline std::string formatSize(size_t size) {
  std::stringstream ss;
  ss << size << " bytes";
  return ss.str();
}

/// Logs a context-switch operation (cuCtxSetCurrent / cuCtxGetCurrent).
/// Each message is assembled into a local buffer and written to stderr with
/// a single insertion, so lines from concurrent threads do not interleave.
inline void logContextSwitch(const char *operation, CUcontext ctx,
                             const char *func, int line) {
  if (!isEnabled())
    return;
  std::stringstream ss;
  ss << "[CUDA] " << func << ":" << line << " " << operation
     << " ctx=" << formatPtr(ctx) << "\n";
  std::cerr << ss.str();
}

/// Logs a memory copy: destination, source, size and stream.
inline void logMemcpy(const char *operation, CUdeviceptr dst, CUdeviceptr src,
                      size_t size, CUstream stream, const char *func,
                      int line) {
  if (!isEnabled())
    return;
  std::stringstream ss;
  ss << "[CUDA] " << func << ":" << line << " " << operation
     << " dst=" << formatDevPtr(dst) << " src=" << formatDevPtr(src)
     << " size=" << formatSize(size) << " stream=" << formatPtr(stream)
     << "\n";
  std::cerr << ss.str();
}

/// Logs a kernel launch with its grid/block geometry, dynamic shared-memory
/// size and target stream.
inline void logKernelLaunch(CUfunction kernel, unsigned gridX, unsigned gridY,
                            unsigned gridZ, unsigned blockX, unsigned blockY,
                            unsigned blockZ, unsigned sharedMem,
                            CUstream stream, const char *func, int line) {
  if (!isEnabled())
    return;
  std::stringstream ss;
  ss << "[CUDA] " << func << ":" << line << " cuLaunchKernel"
     << " kernel=" << formatPtr(kernel) << " grid=(" << gridX << ","
     << gridY << "," << gridZ << ") block=(" << blockX << "," << blockY
     << "," << blockZ << ") sharedMem=" << sharedMem
     << " stream=" << formatPtr(stream) << "\n";
  std::cerr << ss.str();
}

/// Logs an arbitrary CUDA call given its stringified form.
inline void logCudaCall(const char *callString, const char *func, int line) {
  if (!isEnabled())
    return;
  std::stringstream ss;
  ss << "[CUDA] " << func << ":" << line << " " << callString << "\n";
  std::cerr << ss.str();
}

/// Logs the result code of a CUDA driver call; failures are expanded to a
/// human-readable message via cuGetErrorString.
inline void logCudaResult(CUresult result, const char *func, int line) {
  if (!isEnabled())
    return;
  std::stringstream ss;
  if (result == CUDA_SUCCESS) {
    ss << "[CUDA] " << func << ":" << line << " -> CUDA_SUCCESS\n";
  } else {
    const char *errStr = nullptr;
    cuGetErrorString(result, &errStr);
    ss << "[CUDA] " << func << ":" << line << " -> ERROR: " << result
       << " (" << (errStr ? errStr : "unknown") << ")\n";
  }
  std::cerr << ss.str();
}

/// Catch-all for non-CUresult result types (nvmlReturn_t, ur_result_t, ...).
/// Intentionally a no-op — only CUresult values are traced. A template is
/// used instead of named overloads so this header does not depend on
/// nvml.h / UR declarations and stays compilable with only <cuda.h>.
/// (Exact-match overload resolution still prefers the CUresult version.)
template <typename T>
inline void logCudaResult(T /*result*/, const char * /*func*/, int /*line*/) {}

} // namespace ur::cuda::call_logger

// Macros for specific CUDA API calls with detailed logging.
// Each macro is a statement-like do/while(0) wrapper so it composes safely
// with if/else at the call site; logging is a no-op unless tracing is
// enabled (see isEnabled()).

// Logs an upcoming cuCtxSetCurrent call together with the context handle
// that is about to be made current.
#define CUDA_CALL_TRACE_CTX_SET(ctx) \
  do { \
    ::ur::cuda::call_logger::logContextSwitch("cuCtxSetCurrent", ctx, \
                                              __func__, __LINE__); \
  } while (0)

// Logs an upcoming cuCtxGetCurrent call. The out-parameter is deliberately
// NOT dereferenced: this macro runs *before* the call, when `*ctx` still
// holds a stale (typically just-nulled) value, so printing it would be
// misleading. The actual context is reported by the subsequent
// CUDA_CALL_TRACE_RESULT / CTX_SET logging at the call site.
#define CUDA_CALL_TRACE_CTX_GET(ctx)                                           \
  do {                                                                         \
    (void)(ctx); /* keep the argument referenced */                            \
    ::ur::cuda::call_logger::logCudaCall("cuCtxGetCurrent", __func__,          \
                                         __LINE__);                            \
  } while (0)

// Logs a device-to-device async copy (cuMemcpyDtoDAsync): dst/src device
// pointers, byte size, and the stream the copy is enqueued on.
#define CUDA_CALL_TRACE_MEMCPY_ASYNC(dst, src, size, stream) \
  do { \
    ::ur::cuda::call_logger::logMemcpy("cuMemcpyDtoDAsync", dst, src, size, \
                                       stream, __func__, __LINE__); \
  } while (0)

// Logs a generic async copy (cuMemcpyAsync); arguments as above. Host
// pointers must be cast to CUdeviceptr by the caller.
#define CUDA_CALL_TRACE_MEMCPY_GENERIC(dst, src, size, stream) \
  do { \
    ::ur::cuda::call_logger::logMemcpy("cuMemcpyAsync", dst, src, size, \
                                       stream, __func__, __LINE__); \
  } while (0)

// Logs a cuLaunchKernel call: function handle, grid/block dimensions,
// dynamic shared-memory bytes, and target stream.
#define CUDA_CALL_TRACE_KERNEL_LAUNCH(kernel, gx, gy, gz, bx, by, bz, sm, \
                                      stream) \
  do { \
    ::ur::cuda::call_logger::logKernelLaunch(kernel, gx, gy, gz, bx, by, bz, \
                                             sm, stream, __func__, __LINE__); \
  } while (0)

// Logs an arbitrary CUDA call given its stringified form (no arguments).
#define CUDA_CALL_TRACE_GENERIC(call_str) \
  do { \
    ::ur::cuda::call_logger::logCudaCall(call_str, __func__, __LINE__); \
  } while (0)

// Logs the CUresult returned by a CUDA driver call; non-CUresult types
// (nvmlReturn_t, ur_result_t) resolve to a no-op overload.
#define CUDA_CALL_TRACE_RESULT(result) \
  do { \
    ::ur::cuda::call_logger::logCudaResult(result, __func__, __LINE__); \
  } while (0)

// Wrapper for UR_CHECK_ERROR kept for backward compatibility.
// UR_CHECK_ERROR (common.hpp) already traces the result code when tracing is
// enabled, so issuing CUDA_CALL_TRACE_RESULT here as well would log the same
// result twice. Delegating also drops the previous `__result` local, whose
// double-underscore name is reserved to the implementation and shadowed the
// identically-named local inside UR_CHECK_ERROR's expansion.
#define UR_CHECK_ERROR_TRACED(result) UR_CHECK_ERROR(result)

// Universal wrapper that logs the CUDA call before executing it.
// UR_CHECK_ERROR already logs expressions whose stringified form starts with
// "cu", so tracing here unconditionally would print such calls twice. Only
// emit the generic trace when UR_CHECK_ERROR's heuristic would miss the call
// (e.g. the expression is wrapped in a cast or helper).
#define UR_CUDA_CALL_LOGGED(call)                                              \
  do {                                                                         \
    constexpr const char *UrLoggedCallStr = #call;                             \
    if (!(UrLoggedCallStr[0] == 'c' && UrLoggedCallStr[1] == 'u')) {           \
      CUDA_CALL_TRACE_GENERIC(UrLoggedCallStr);                                \
    }                                                                          \
    UR_CHECK_ERROR(call);                                                      \
  } while (0)
31 changes: 28 additions & 3 deletions unified-runtime/source/adapters/cuda/enqueue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include "enqueue.hpp"
#include "common.hpp"
#include "context.hpp"
#include "cuda_call_logger.hpp"
#include "event.hpp"
#include "kernel.hpp"
#include "memory.hpp"
Expand Down Expand Up @@ -38,13 +39,30 @@ ur_result_t enqueueEventsWait(ur_queue_handle_t CommandQueue, CUstream Stream,

auto Result = forLatestEvents(
EventWaitList, NumEventsInWaitList,
[Stream](ur_event_handle_t Event) -> ur_result_t {
[Stream, CommandQueue](ur_event_handle_t Event) -> ur_result_t {
if (Event->getStream() == Stream) {
return UR_RESULT_SUCCESS;
} else {
UR_CHECK_ERROR(cuStreamWaitEvent(Stream, Event->get(), 0));
}

// CUDA limitation: cuStreamWaitEvent fails when event and stream are
// from different native CUDA contexts (different physical devices).
// Compare the native CUcontext of the devices to detect this case.
if (Event->getQueue() &&
Event->getQueue()->getDevice()->getNativeContext() !=
CommandQueue->getDevice()->getNativeContext()) {
// Cross-device synchronization requires host involvement:
// 1. Wait for cross-device event to complete (blocks CPU)
UR_CHECK_ERROR(cuEventSynchronize(Event->get()));
// 2. Synchronize target stream to create ordering barrier
// This ensures work enqueued after this call happens after
// the cross-device event has completed
UR_CHECK_ERROR(cuStreamSynchronize(Stream));
return UR_RESULT_SUCCESS;
}

// Same native context: use asynchronous stream wait
UR_CHECK_ERROR(cuStreamWaitEvent(Stream, Event->get(), 0));
return UR_RESULT_SUCCESS;
});
return Result;
} catch (ur_result_t Err) {
Expand Down Expand Up @@ -434,6 +452,10 @@ enqueueKernelLaunch(ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel,
}

auto &ArgPointers = hKernel->getArgPointers();
CUDA_CALL_TRACE_KERNEL_LAUNCH(CuFunc, BlocksPerGrid[0], BlocksPerGrid[1],
BlocksPerGrid[2], ThreadsPerBlock[0],
ThreadsPerBlock[1], ThreadsPerBlock[2],
LocalSize, CuStream);
UR_CHECK_ERROR(cuLaunchKernel(
CuFunc, BlocksPerGrid[0], BlocksPerGrid[1], BlocksPerGrid[2],
ThreadsPerBlock[0], ThreadsPerBlock[1], ThreadsPerBlock[2], LocalSize,
Expand Down Expand Up @@ -918,6 +940,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopy(
auto Dst = std::get<BufferMem>(hBufferDst->Mem)
.getPtrWithOffset(hQueue->getDevice(), dstOffset);

CUDA_CALL_TRACE_MEMCPY_ASYNC(Dst, Src, size, Stream);
UR_CHECK_ERROR(cuMemcpyDtoDAsync(Dst, Src, size, Stream));

if (phEvent) {
Expand Down Expand Up @@ -1578,6 +1601,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy(
hQueue, CuStream);
UR_CHECK_ERROR(EventPtr->start());
}
CUDA_CALL_TRACE_MEMCPY_GENERIC((CUdeviceptr)pDst, (CUdeviceptr)pSrc, size,
CuStream);
UR_CHECK_ERROR(
cuMemcpyAsync((CUdeviceptr)pDst, (CUdeviceptr)pSrc, size, CuStream));
if (phEvent) {
Expand Down
2 changes: 2 additions & 0 deletions unified-runtime/source/adapters/cuda/usm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include "adapter.hpp"
#include "common.hpp"
#include "context.hpp"
#include "cuda_call_logger.hpp"
#include "device.hpp"
#include "event.hpp"
#include "platform.hpp"
Expand Down Expand Up @@ -578,6 +579,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMContextMemcpyExp(ur_context_handle_t,
void *pDst,
const void *pSrc,
size_t Size) {
CUDA_CALL_TRACE_GENERIC("cuMemcpy (synchronous)");
UR_CHECK_ERROR(cuMemcpy((CUdeviceptr)pDst, (CUdeviceptr)pSrc, Size));
return UR_RESULT_SUCCESS;
}
Loading
Loading