KernelTuner · stijnh · Mar 16, 2026 · Apr 7, 2026
diff --git a/kernel_tuner/backends/nvcuda.py b/kernel_tuner/backends/nvcuda.py
@@ -2,11 +2,12 @@
 from warnings import warn
 
 import numpy as np
+import os
 
 from kernel_tuner.backends.backend import GPUBackend
 from kernel_tuner.observers.nvcuda import CudaRuntimeObserver
 from kernel_tuner.util import SkippableFailure
-from kernel_tuner.utils.nvcuda import cuda_error_check, to_valid_nvrtc_gpu_arch_cc
+from kernel_tuner.utils.nvcuda import cuda_error_check, to_valid_nvrtc_gpu_arch_cc, find_cuda_home
 
 # embedded in try block to be able to generate documentation
 # and run tests without cuda-python installed
@@ -74,9 +75,6 @@
         self.current_module = None
         self.func = None
         self.compiler_options = compiler_options or []
-        self.compiler_options_bytes = []
-        for option in self.compiler_options:
-            self.compiler_options_bytes.append(str(option).encode("UTF-8"))
 
         # create a stream and events
         err, self.stream = driver.cuStreamCreate(0)
@@ -154,37 +152,60 @@
         """
         kernel_string = kernel_instance.kernel_string
         kernel_name = kernel_instance.name
+        expression_name = str.encode(kernel_name)
+        compiler_options = list(self.compiler_options)
 
-        # mimic pycuda behavior to wrap kernel_string in extern "C" if not in kernel_string already
-        if 'extern "C"' not in kernel_string:
-            kernel_string = 'extern "C" {\n' + kernel_string + "\n}"
+        # Default to --std=c++11 unless the caller already selected a C++ standard
+        if not any(opt.startswith(("-std=", "--std=")) for opt in self.compiler_options):
+            compiler_options.append("--std=c++11")
 
-        compiler_options = self.compiler_options_bytes
-        if not any([b"--std=" in opt for opt in compiler_options]):
-            compiler_options.append(b"--std=c++11")
-        if not any(["--std=" in opt for opt in self.compiler_options]):
-            self.compiler_options.append("--std=c++11")
-        if not any([b"--gpu-architecture=" in opt or b"-arch" in opt for opt in compiler_options]):
-            compiler_options.append(f"--gpu-architecture=compute_{to_valid_nvrtc_gpu_arch_cc(self.cc)}".encode("UTF-8"))
-        if not any(["--gpu-architecture=" in opt or "-arch" in opt for opt in self.compiler_options]):
-            self.compiler_options.append(f"--gpu-architecture=compute_{to_valid_nvrtc_gpu_arch_cc(self.cc)}")
+        # Default --gpu-architecture from self.cc unless the caller already passed an architecture flag
+        if not any(opt.startswith(("-arch", "--arch", "--gpu-architecture=")) for opt in self.compiler_options):
+            arch_val = to_valid_nvrtc_gpu_arch_cc(self.cc)
+            compiler_options.append(f"--gpu-architecture=compute_{arch_val}")
+
+        # Add <CUDA home>/include to the include path (only when a CUDA home was found)
+        cuda_home = find_cuda_home()
+        if cuda_home:
+            cuda_include = os.path.join(cuda_home, "include")
+            compiler_options.append(f"-I{cuda_include}")
+
+        # nvrtcCompileProgram requires bytes instead of str
+        compiler_options = [str(opt).encode("UTF-8") for opt in compiler_options]
 
         err, program = nvrtc.nvrtcCreateProgram(str.encode(kernel_string), b"CUDAProgram", 0, [], [])
         try:
+            # Add the kernel as an expression. This is necessary for templated kernels to ensure that the
+            # compiler actually instantiates the kernel that we want to compile.
+            cuda_error_check(err)
+            err = nvrtc.nvrtcAddNameExpression(program, expression_name)
+
+            # Compile the program
             cuda_error_check(err)
             err = nvrtc.nvrtcCompileProgram(program, len(compiler_options), compiler_options)
+
+            # Get the PTX
             cuda_error_check(err)
             err, size = nvrtc.nvrtcGetPTXSize(program)
             cuda_error_check(err)
             buff = b" " * size
             err = nvrtc.nvrtcGetPTX(program, buff)
             cuda_error_check(err)
+
+            # Load the module
             err, self.current_module = driver.cuModuleLoadData(np.char.array(buff))
             if err == driver.CUresult.CUDA_ERROR_INVALID_PTX:
                 raise SkippableFailure("uses too much shared data")
             else:
                 cuda_error_check(err)
-            err, self.func = driver.cuModuleGetFunction(self.current_module, str.encode(kernel_name))
+
+            # First, get the "lowered" name of the kernel (i.e., the name inside the PTX).
+            # Afterwards, we can use the lowered name to look up the kernel in the module.
+            err, lowered_name = nvrtc.nvrtcGetLoweredName(program, expression_name)
+            cuda_error_check(err)
+            err, self.func = driver.cuModuleGetFunction(
+                self.current_module, lowered_name
+            )
             cuda_error_check(err)
 
             # get the number of registers per thread used in this kernel

diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py
@@ -707,7 +707,7 @@ def create_kernel_instance(self, kernel_source, kernel_options, params, verbose)
         )
 
         # check for templated kernel
-        if kernel_source.lang in ["CUDA", "NVCUDA", "HIP"] and "<" in name and ">" in name:
+        if kernel_source.lang in ["CUDA", "HIP"] and "<" in name and ">" in name:
             kernel_string, name = wrap_templated_kernel(kernel_string, name)
 
         # Preprocess GPU arguments. Require for handling `Tunable` arguments

diff --git a/kernel_tuner/utils/nvcuda.py b/kernel_tuner/utils/nvcuda.py
@@ -1,6 +1,10 @@
 """Module for kernel tuner cuda-python utility functions."""
 
 import numpy as np
+import os
+import subprocess
+import shutil
+from typing import Optional
 
 try:
     from cuda.bindings import driver, runtime, nvrtc
@@ -56,12 +60,20 @@ def cuda_error_check(error):
         if error != nvrtc.nvrtcResult.NVRTC_SUCCESS:
             _, desc = nvrtc.nvrtcGetErrorString(error)
             raise RuntimeError(f"NVRTC error: {desc.decode()}")
-    elif isinstance(error, tuple) and len(error) > 0:
-        cuda_error_check(error[0])
-    else:
-        raise RuntimeError(f"unknown error type returned by CUDA: {error!r} (type: {type(error).__name__})")
 
 
 def to_valid_nvrtc_gpu_arch_cc(compute_capability: str) -> str:
     """Returns a valid Compute Capability for NVRTC `--gpu-architecture=`, as per https://docs.nvidia.com/cuda/nvrtc/index.html#group__options."""
     return max(NVRTC_VALID_CC[NVRTC_VALID_CC <= compute_capability], default="75")
+
+
+def find_cuda_home() -> Optional[str]:
+    """Find the CUDA home directory by checking environment variables.
+
+    Returns the first of CUDA_HOME, CUDA_PATH, CUDA_ROOT that names an existing directory, else None.
+    """
+    for var in ("CUDA_HOME", "CUDA_PATH", "CUDA_ROOT"):
+        path = os.environ.get(var)
+        if path and os.path.isdir(path):  # a regular file is not a usable CUDA home; try the next variable
+            return path
+    return None
diff --git a/test/test_cuda_functions.py b/test/test_cuda_functions.py
@@ -32,6 +32,12 @@ def test_ready_argument_list():
     assert isinstance(gpu_args[2], driver.CUdeviceptr)
 
 
+def create_kernel_instance(kernel_name, kernel_string):
+    """Wrap ``kernel_string`` in a CUDA ``KernelSource`` and return a bare ``KernelInstance`` for it."""
+    source = KernelSource(kernel_name, kernel_string, "cuda")
+    return KernelInstance(kernel_name, source, kernel_string, [], None, None, dict(), [])
+
+
 @skip_if_no_cuda
 def test_compile():
 
@@ -44,15 +50,47 @@ def test_compile():
     }
     """
 
-    kernel_name = "vector_add"
-    kernel_sources = KernelSource(kernel_name, kernel_string, "cuda")
-    kernel_instance = KernelInstance(kernel_name, kernel_sources, kernel_string, [], None, None, dict(), [])
+    kernel_instance = create_kernel_instance("vector_add", kernel_string)
     dev = nvcuda.CudaFunctions(0)
-    try:
-        dev.compile(kernel_instance)
-    except Exception as e:
-        pytest.fail("Did not expect any exception:" + str(e))
+    dev.compile(kernel_instance)
+
+@skip_if_no_cuda
+def test_compile_template():
+    """Compile a templated kernel in nested namespaces, addressed by its fully qualified name."""
+    kernel_string = """
+    namespace nested::namespaces {
+    template <typename T, int N>
+    __global__ void vector_add(T *c, T *a, T *b) {
+        int i = blockIdx.x * blockDim.x + threadIdx.x;
+        if (i<N) {
+            c[i] = a[i] + b[i];
+        }
+    }
+    }
+    """
+    # The `a::b` nested namespace definition is a C++17 feature, hence the explicit -std flag below.
+    qualified_name = "nested::namespaces::vector_add<float,10>"
+    instance = create_kernel_instance(qualified_name, kernel_string)
+    backend = nvcuda.CudaFunctions(0, compiler_options=["-std=c++17"])
+    backend.compile(instance)
+
+@skip_if_no_cuda
+def test_compile_include():
+    """Compile a half-precision kernel that includes the header <cuda_fp16.h>."""
+    kernel_string = """
+    #include <cuda_fp16.h>
+
+    __global__ void vector_add(__nv_half *c, __nv_half *a, __nv_half *b, int n) {
+        int i = blockIdx.x * blockDim.x + threadIdx.x;
+        if (i<n) {
+            c[i] = __hadd(a[i], b[i]);
+        }
+    }
+    """
 
+    instance = create_kernel_instance("vector_add", kernel_string)
+    backend = nvcuda.CudaFunctions(0, compiler_options=["-std=c++17"])
+    backend.compile(instance)
 
 @skip_if_no_cuda
 def test_tune_kernel(env):