Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 18 additions & 2 deletions unified-runtime/source/adapters/cuda/common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
#include <umf/memory_provider.h>
#include <umf/providers/provider_cuda.h>

#include "cuda_call_logger.hpp"

ur_result_t mapErrorUR(CUresult Result);

/// Converts CUDA error into UR error codes, and outputs error information
Expand All @@ -36,8 +38,22 @@ void checkErrorUR(nvmlReturn_t Result, const char *Function, int Line,
void checkErrorUR(ur_result_t Result, const char *Function, int Line,
const char *File);

#define UR_CHECK_ERROR(Result) \
checkErrorUR(Result, __func__, __LINE__, __FILE__)
// Enhanced UR_CHECK_ERROR that automatically logs CUDA calls when tracing is
// enabled
// Enhanced UR_CHECK_ERROR that automatically logs CUDA calls when tracing is
// enabled (UR_CUDA_CALL_TRACE=1).
//
// Before evaluating `expr`, the stringified expression is logged when it looks
// like a direct CUDA driver call (heuristic: stringification starts with
// "cu"). After evaluation the result code is logged, then forwarded to
// checkErrorUR which maps/raises the error.
//
// NOTE: local identifiers deliberately avoid a double-underscore prefix —
// such names are reserved to the implementation (C++ [lex.name]).
#define UR_CHECK_ERROR(expr)                                                   \
  do {                                                                         \
    if (::ur::cuda::call_logger::isEnabled()) {                                \
      constexpr const char *UrCheckCallStr = #expr;                            \
      /* Heuristic: only trace expressions that are direct cu* API calls. */   \
      if (UrCheckCallStr[0] == 'c' && UrCheckCallStr[1] == 'u') {              \
        CUDA_CALL_TRACE_GENERIC(UrCheckCallStr);                               \
      }                                                                        \
    }                                                                          \
    auto UrCheckResult = (expr);                                               \
    /* The logger checks isEnabled() itself; no outer guard needed. */         \
    CUDA_CALL_TRACE_RESULT(UrCheckResult);                                     \
    checkErrorUR(UrCheckResult, __func__, __LINE__, __FILE__);                 \
  } while (0)

std::string getCudaVersionString();

Expand Down
25 changes: 20 additions & 5 deletions unified-runtime/source/adapters/cuda/context.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include "adapter.hpp"
#include "common.hpp"
#include "common/ur_ref_count.hpp"
#include "cuda_call_logger.hpp"
#include "device.hpp"
#include "umf_helpers.hpp"

Expand Down Expand Up @@ -157,27 +158,41 @@ namespace {
class ScopedContext {
public:
ScopedContext(ur_device_handle_t Device) {
if (!Device) {
throw UR_RESULT_ERROR_INVALID_DEVICE;
if (Device) {
setContext(Device->getNativeContext());
}
setContext(Device->getNativeContext());
}

ScopedContext(CUcontext NativeContext) { setContext(NativeContext); }

~ScopedContext() {}
~ScopedContext() {
if (NeedToRecover) {
CUDA_CALL_TRACE_CTX_SET(Original);
(void)cuCtxSetCurrent(Original);
}
}

private:
void setContext(CUcontext Desired) {
CUcontext Original = nullptr;
Original = nullptr;

CUDA_CALL_TRACE_CTX_GET(&Original);
UR_CHECK_ERROR(cuCtxGetCurrent(&Original));

// Make sure the desired context is active on the current thread, setting
// it if necessary
if (Original != Desired) {
CUDA_CALL_TRACE_CTX_SET(Desired);
UR_CHECK_ERROR(cuCtxSetCurrent(Desired));
// Only restore context if we had a valid one before
// (don't restore to nullptr)
if (Original != nullptr) {
NeedToRecover = true;
}
}
}

CUcontext Original = nullptr;
bool NeedToRecover = false;
};
} // namespace
165 changes: 165 additions & 0 deletions unified-runtime/source/adapters/cuda/cuda_call_logger.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
//===--------- cuda_call_logger.hpp - CUDA API Call Logger ---------------===//
//
// Copyright (C) 2024 Intel Corporation
//
// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
// Exceptions. See LICENSE.TXT
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#pragma once

#include "logger/ur_logger.hpp"

#include <cstdint>
#include <cstdlib>
#include <cuda.h>
#include <iostream>
#include <sstream>
#include <string>

namespace ur::cuda::call_logger {

/// Returns true when CUDA call tracing is requested via the
/// UR_CUDA_CALL_TRACE environment variable (first character '1').
/// The environment is read once and cached for the process lifetime.
inline bool isEnabled() {
  static bool enabled = [] {
    const char *env = std::getenv("UR_CUDA_CALL_TRACE");
    return env != nullptr && env[0] == '1';
  }();
  return enabled;
}

/// Formats a host pointer as a "0x..." hexadecimal string.
template <typename T> std::string formatPtr(T *ptr) {
  std::stringstream ss;
  ss << std::hex << "0x" << reinterpret_cast<uintptr_t>(ptr);
  return ss.str();
}

/// Formats a CUdeviceptr (an integer device address) as "0x...".
inline std::string formatDevPtr(CUdeviceptr ptr) {
  std::stringstream ss;
  ss << std::hex << "0x" << ptr;
  return ss.str();
}

/// Formats a byte count as "<size> bytes".
inline std::string formatSize(size_t size) {
  std::stringstream ss;
  ss << size << " bytes";
  return ss.str();
}

/// Logs a context-switch operation (cuCtxSetCurrent / cuCtxGetCurrent).
/// Each message is assembled into a local buffer and written to stderr with
/// a single insertion, so lines from concurrent threads do not interleave.
inline void logContextSwitch(const char *operation, CUcontext ctx,
                             const char *func, int line) {
  if (!isEnabled())
    return;
  std::stringstream ss;
  ss << "[CUDA] " << func << ":" << line << " " << operation
     << " ctx=" << formatPtr(ctx) << "\n";
  std::cerr << ss.str();
}

/// Logs a memory copy: destination, source, size and stream.
inline void logMemcpy(const char *operation, CUdeviceptr dst, CUdeviceptr src,
                      size_t size, CUstream stream, const char *func,
                      int line) {
  if (!isEnabled())
    return;
  std::stringstream ss;
  ss << "[CUDA] " << func << ":" << line << " " << operation
     << " dst=" << formatDevPtr(dst) << " src=" << formatDevPtr(src)
     << " size=" << formatSize(size) << " stream=" << formatPtr(stream)
     << "\n";
  std::cerr << ss.str();
}

/// Logs a kernel launch with its grid/block geometry, dynamic shared-memory
/// size and target stream.
inline void logKernelLaunch(CUfunction kernel, unsigned gridX, unsigned gridY,
                            unsigned gridZ, unsigned blockX, unsigned blockY,
                            unsigned blockZ, unsigned sharedMem,
                            CUstream stream, const char *func, int line) {
  if (!isEnabled())
    return;
  std::stringstream ss;
  ss << "[CUDA] " << func << ":" << line << " cuLaunchKernel"
     << " kernel=" << formatPtr(kernel) << " grid=(" << gridX << ","
     << gridY << "," << gridZ << ") block=(" << blockX << "," << blockY
     << "," << blockZ << ") sharedMem=" << sharedMem
     << " stream=" << formatPtr(stream) << "\n";
  std::cerr << ss.str();
}

/// Logs an arbitrary CUDA call given its stringified form.
inline void logCudaCall(const char *callString, const char *func, int line) {
  if (!isEnabled())
    return;
  std::stringstream ss;
  ss << "[CUDA] " << func << ":" << line << " " << callString << "\n";
  std::cerr << ss.str();
}

/// Logs the result code of a CUDA driver call; failures are expanded to a
/// human-readable message via cuGetErrorString.
inline void logCudaResult(CUresult result, const char *func, int line) {
  if (!isEnabled())
    return;
  std::stringstream ss;
  if (result == CUDA_SUCCESS) {
    ss << "[CUDA] " << func << ":" << line << " -> CUDA_SUCCESS\n";
  } else {
    const char *errStr = nullptr;
    cuGetErrorString(result, &errStr);
    ss << "[CUDA] " << func << ":" << line << " -> ERROR: " << result
       << " (" << (errStr ? errStr : "unknown") << ")\n";
  }
  std::cerr << ss.str();
}

/// Catch-all for non-CUresult result types (nvmlReturn_t, ur_result_t, ...).
/// Intentionally a no-op — only CUresult values are traced. A template is
/// used instead of named overloads so this header does not depend on
/// nvml.h / UR declarations and stays compilable with only <cuda.h>.
/// (Exact-match overload resolution still prefers the CUresult version.)
template <typename T>
inline void logCudaResult(T /*result*/, const char * /*func*/, int /*line*/) {}

} // namespace ur::cuda::call_logger

// Macros for specific CUDA API calls with detailed logging.
// Each macro is a statement-like do/while(0) wrapper so it composes safely
// with if/else at the call site; logging is a no-op unless tracing is
// enabled (see isEnabled()).

// Logs an upcoming cuCtxSetCurrent call together with the context handle
// that is about to be made current.
#define CUDA_CALL_TRACE_CTX_SET(ctx) \
  do { \
    ::ur::cuda::call_logger::logContextSwitch("cuCtxSetCurrent", ctx, \
                                              __func__, __LINE__); \
  } while (0)

// Logs an upcoming cuCtxGetCurrent call. The out-parameter is deliberately
// NOT dereferenced: this macro runs *before* the call, when `*ctx` still
// holds a stale (typically just-nulled) value, so printing it would be
// misleading. The actual context is reported by the subsequent
// CUDA_CALL_TRACE_RESULT / CTX_SET logging at the call site.
#define CUDA_CALL_TRACE_CTX_GET(ctx)                                           \
  do {                                                                         \
    (void)(ctx); /* keep the argument referenced */                            \
    ::ur::cuda::call_logger::logCudaCall("cuCtxGetCurrent", __func__,          \
                                         __LINE__);                            \
  } while (0)

// Logs a device-to-device async copy (cuMemcpyDtoDAsync): dst/src device
// pointers, byte size, and the stream the copy is enqueued on.
#define CUDA_CALL_TRACE_MEMCPY_ASYNC(dst, src, size, stream) \
  do { \
    ::ur::cuda::call_logger::logMemcpy("cuMemcpyDtoDAsync", dst, src, size, \
                                       stream, __func__, __LINE__); \
  } while (0)

// Logs a generic async copy (cuMemcpyAsync); arguments as above. Host
// pointers must be cast to CUdeviceptr by the caller.
#define CUDA_CALL_TRACE_MEMCPY_GENERIC(dst, src, size, stream) \
  do { \
    ::ur::cuda::call_logger::logMemcpy("cuMemcpyAsync", dst, src, size, \
                                       stream, __func__, __LINE__); \
  } while (0)

// Logs a cuLaunchKernel call: function handle, grid/block dimensions,
// dynamic shared-memory bytes, and target stream.
#define CUDA_CALL_TRACE_KERNEL_LAUNCH(kernel, gx, gy, gz, bx, by, bz, sm, \
                                      stream) \
  do { \
    ::ur::cuda::call_logger::logKernelLaunch(kernel, gx, gy, gz, bx, by, bz, \
                                             sm, stream, __func__, __LINE__); \
  } while (0)

// Logs an arbitrary CUDA call given its stringified form (no arguments).
#define CUDA_CALL_TRACE_GENERIC(call_str) \
  do { \
    ::ur::cuda::call_logger::logCudaCall(call_str, __func__, __LINE__); \
  } while (0)

// Logs the CUresult returned by a CUDA driver call; non-CUresult types
// (nvmlReturn_t, ur_result_t) resolve to a no-op overload.
#define CUDA_CALL_TRACE_RESULT(result) \
  do { \
    ::ur::cuda::call_logger::logCudaResult(result, __func__, __LINE__); \
  } while (0)

// Wrapper for UR_CHECK_ERROR kept for backward compatibility.
// UR_CHECK_ERROR (common.hpp) already traces the result code when tracing is
// enabled, so issuing CUDA_CALL_TRACE_RESULT here as well would log the same
// result twice. Delegating also drops the previous `__result` local, whose
// double-underscore name is reserved to the implementation and shadowed the
// identically-named local inside UR_CHECK_ERROR's expansion.
#define UR_CHECK_ERROR_TRACED(result) UR_CHECK_ERROR(result)

// Universal wrapper that logs the CUDA call before executing it.
// UR_CHECK_ERROR already logs expressions whose stringified form starts with
// "cu", so tracing here unconditionally would print such calls twice. Only
// emit the generic trace when UR_CHECK_ERROR's heuristic would miss the call
// (e.g. the expression is wrapped in a cast or helper).
#define UR_CUDA_CALL_LOGGED(call)                                              \
  do {                                                                         \
    constexpr const char *UrLoggedCallStr = #call;                             \
    if (!(UrLoggedCallStr[0] == 'c' && UrLoggedCallStr[1] == 'u')) {           \
      CUDA_CALL_TRACE_GENERIC(UrLoggedCallStr);                                \
    }                                                                          \
    UR_CHECK_ERROR(call);                                                      \
  } while (0)
31 changes: 28 additions & 3 deletions unified-runtime/source/adapters/cuda/enqueue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include "enqueue.hpp"
#include "common.hpp"
#include "context.hpp"
#include "cuda_call_logger.hpp"
#include "event.hpp"
#include "kernel.hpp"
#include "memory.hpp"
Expand Down Expand Up @@ -38,13 +39,30 @@ ur_result_t enqueueEventsWait(ur_queue_handle_t CommandQueue, CUstream Stream,

auto Result = forLatestEvents(
EventWaitList, NumEventsInWaitList,
[Stream](ur_event_handle_t Event) -> ur_result_t {
[Stream, CommandQueue](ur_event_handle_t Event) -> ur_result_t {
if (Event->getStream() == Stream) {
return UR_RESULT_SUCCESS;
} else {
UR_CHECK_ERROR(cuStreamWaitEvent(Stream, Event->get(), 0));
}

// CUDA limitation: cuStreamWaitEvent fails when event and stream are
// from different native CUDA contexts (different physical devices).
// Compare the native CUcontext of the devices to detect this case.
if (Event->getQueue() &&
Event->getQueue()->getDevice()->getNativeContext() !=
CommandQueue->getDevice()->getNativeContext()) {
// Cross-device synchronization requires host involvement:
// 1. Wait for cross-device event to complete (blocks CPU)
UR_CHECK_ERROR(cuEventSynchronize(Event->get()));
// 2. Synchronize target stream to create ordering barrier
// This ensures work enqueued after this call happens after
// the cross-device event has completed
UR_CHECK_ERROR(cuStreamSynchronize(Stream));
return UR_RESULT_SUCCESS;
}

// Same native context: use asynchronous stream wait
UR_CHECK_ERROR(cuStreamWaitEvent(Stream, Event->get(), 0));
return UR_RESULT_SUCCESS;
});
return Result;
} catch (ur_result_t Err) {
Expand Down Expand Up @@ -434,6 +452,10 @@ enqueueKernelLaunch(ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel,
}

auto &ArgPointers = hKernel->getArgPointers();
CUDA_CALL_TRACE_KERNEL_LAUNCH(CuFunc, BlocksPerGrid[0], BlocksPerGrid[1],
BlocksPerGrid[2], ThreadsPerBlock[0],
ThreadsPerBlock[1], ThreadsPerBlock[2],
LocalSize, CuStream);
UR_CHECK_ERROR(cuLaunchKernel(
CuFunc, BlocksPerGrid[0], BlocksPerGrid[1], BlocksPerGrid[2],
ThreadsPerBlock[0], ThreadsPerBlock[1], ThreadsPerBlock[2], LocalSize,
Expand Down Expand Up @@ -918,6 +940,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopy(
auto Dst = std::get<BufferMem>(hBufferDst->Mem)
.getPtrWithOffset(hQueue->getDevice(), dstOffset);

CUDA_CALL_TRACE_MEMCPY_ASYNC(Dst, Src, size, Stream);
UR_CHECK_ERROR(cuMemcpyDtoDAsync(Dst, Src, size, Stream));

if (phEvent) {
Expand Down Expand Up @@ -1578,6 +1601,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy(
hQueue, CuStream);
UR_CHECK_ERROR(EventPtr->start());
}
CUDA_CALL_TRACE_MEMCPY_GENERIC((CUdeviceptr)pDst, (CUdeviceptr)pSrc, size,
CuStream);
UR_CHECK_ERROR(
cuMemcpyAsync((CUdeviceptr)pDst, (CUdeviceptr)pSrc, size, CuStream));
if (phEvent) {
Expand Down
2 changes: 2 additions & 0 deletions unified-runtime/source/adapters/cuda/usm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include "adapter.hpp"
#include "common.hpp"
#include "context.hpp"
#include "cuda_call_logger.hpp"
#include "device.hpp"
#include "event.hpp"
#include "platform.hpp"
Expand Down Expand Up @@ -578,6 +579,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMContextMemcpyExp(ur_context_handle_t,
void *pDst,
const void *pSrc,
size_t Size) {
CUDA_CALL_TRACE_GENERIC("cuMemcpy (synchronous)");
UR_CHECK_ERROR(cuMemcpy((CUdeviceptr)pDst, (CUdeviceptr)pSrc, Size));
return UR_RESULT_SUCCESS;
}
Loading
Loading