Changes from all commits (20 commits)
c7da77d  test_convnd_fwd (johannes-graner, Jan 7, 2026)
e00ef08  test_convnd_bwd_data (johannes-graner, Jan 7, 2026)
2f83bac  test_conv_bwd_data_scale (johannes-graner, Jan 7, 2026)
2f9b366  test_grouped_convnd_fwd_clamp (johannes-graner, Jan 7, 2026)
0c106d2  test_grouped_convnd_fwd_scale (johannes-graner, Jan 8, 2026)
9e95a2a  multiple A/B tensors and D tensor for fwd GPU ref (johannes-graner, Jan 8, 2026)
7004943  test_grouped_convnd_fwd_scaleadd_ab (johannes-graner, Jan 8, 2026)
3298801  test_grouped_convnd_fwd_bias_clamp (johannes-graner, Jan 8, 2026)
2e36ef8  test_grouped_convnd_fwd_bilinear (johannes-graner, Jan 8, 2026)
2992269  test_grouped_convnd_fwd_gk_bias_clamp (johannes-graner, Jan 8, 2026)
e2f75fa  Extend GPU reference to enable batchnorm epilogue (johannes-graner, Jan 9, 2026)
6da4576  test_grouped_convnd_fwd{,_gk}_bias_bnorm_clamp (johannes-graner, Jan 9, 2026)
64cf835  test_grouped_conv_bwd_data_bilinear (johannes-graner, Jan 9, 2026)
1556359  test_grouped_convnd_bwd_weight_bilinear (johannes-graner, Jan 9, 2026)
9c2a899  Merge branch 'develop' into jograner/extend-gpu-reference (johannes-graner, Jan 9, 2026)
0d5b27d  Add missing template instantiation (johannes-graner, Jan 9, 2026)
85f2d93  Perform operations in float in reference (johannes-graner, Jan 12, 2026)
684c9ed  Merge branch 'develop' into jograner/extend-gpu-reference (johannes-graner, Jan 12, 2026)
c9f0a5c  Merge branch 'develop' into jograner/extend-gpu-reference (johannes-graner, Jan 13, 2026)
a3b2475  Slightly increase tolerance for batchnorm profiler (johannes-graner, Jan 13, 2026)
@@ -1631,6 +1631,13 @@ struct ConvInvscale
e = type_convert<f8_t>(c / scale_in_ / scale_wei_ / scale_out_);
};

template <>
__host__ __device__ void operator()<f8_t, f8_t>(f8_t& e, const f8_t& c) const
{
const float c_float = type_convert<float>(c);
e = type_convert<f8_t>(c_float / scale_in_ / scale_wei_ / scale_out_);
};

float scale_in_;
float scale_wei_;
float scale_out_;
@@ -1656,6 +1663,13 @@ struct ConvScale
e = type_convert<f8_t>(c * scale_in_ * scale_wei_ * scale_out_);
};

template <>
__host__ __device__ void operator()<f8_t, f8_t>(f8_t& e, const f8_t& c) const
{
const float c_float = type_convert<float>(c);
e = type_convert<f8_t>(c_float * scale_in_ * scale_wei_ * scale_out_);
};

float scale_in_;
float scale_wei_;
float scale_out_;
@@ -1683,6 +1697,15 @@ struct ConvScaleRelu
e = type_convert<f8_t>(x * scale_out_);
};

template <>
__host__ __device__ void operator()<f8_t, f8_t>(f8_t& e, const f8_t& c) const
{
const float c_float = type_convert<float>(c);
float x;
Relu{}.template operator()<float>(x, c_float * scale_in_ * scale_wei_);
e = type_convert<f8_t>(x * scale_out_);
};

float scale_in_;
float scale_wei_;
float scale_out_;

Large diffs for 3 files are not rendered by default.

@@ -22,9 +22,39 @@ struct SimpleDeviceMem
HIP_CHECK_ERROR(hipMalloc(static_cast<void**>(&p_mem_), mem_size));
}

// Delete copy operations (resource should not be copied)
SimpleDeviceMem(const SimpleDeviceMem&) = delete;
SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;

// Define move operations
SimpleDeviceMem(SimpleDeviceMem&& other) noexcept : p_mem_(other.p_mem_)
{
other.p_mem_ = nullptr;
}

SimpleDeviceMem& operator=(SimpleDeviceMem&& other) noexcept
{
if(this != &other)
{
if(p_mem_)
{
(void)hipFree(p_mem_);
}
p_mem_ = other.p_mem_;
other.p_mem_ = nullptr;
}
return *this;
}

void* GetDeviceBuffer() { return p_mem_; }

~SimpleDeviceMem() { (void)hipFree(p_mem_); }
~SimpleDeviceMem()
{
if(p_mem_)
{
(void)hipFree(p_mem_);
}
}

void* p_mem_;
};
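
With the copy operations deleted and the moves defined, SimpleDeviceMem becomes a standard move-only RAII type, so double-frees are impossible by construction. A usage sketch, assuming the class exactly as shown in this hunk (the demo function itself is hypothetical):

```cpp
#include <utility>
#include <vector>

void demo()
{
    SimpleDeviceMem a(1024);
    // SimpleDeviceMem b = a;         // ill-formed: copy constructor is deleted
    SimpleDeviceMem b = std::move(a); // ownership moves; a.p_mem_ is now nullptr
    std::vector<SimpleDeviceMem> pool;
    pool.push_back(std::move(b));     // safe to store in containers
}   // each allocation is freed exactly once; moved-from objects free nothing
```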
56 changes: 52 additions & 4 deletions profiler/include/profiler/profile_conv_bwd_data_impl.hpp
@@ -17,6 +17,7 @@
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp"
#include "ck/library/reference_tensor_operation/gpu/naive_conv_bwd_data_gpu.hpp"

namespace ck {
namespace profiler {
@@ -129,7 +130,10 @@ bool profile_conv_bwd_data_impl(int do_verification,
out_device_buf.ToDevice(output.mData.data());
wei_device_buf.ToDevice(weight.mData.data());

if(do_verification)
// profile device Conv instances
bool pass = true;

if(do_verification == 1)
{
auto ref_conv = ck::tensor_operation::host::ReferenceConvBwdData<NDimSpatial,
InDataType,
@@ -154,6 +158,27 @@
ref_invoker.Run(ref_argument);
}
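
Here and in the other profilers, the old boolean check on `do_verification` becomes a three-way switch. A hypothetical enum (not part of CK; values inferred from this diff) documenting the convention:

```cpp
// Inferred convention for do_verification (assumption, not a CK type):
enum VerificationMode
{
    Off    = 0, // timing only, no verification
    CpuRef = 1, // compare against the existing host reference
    GpuRef = 2, // compare against the new naive GPU reference
};
```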

// GPU reference (compute once, compare in kernel loop)
Tensor<InDataType> gpu_ref_input(in_g_n_c_wis_desc);
if(do_verification == 2)
{
DeviceMem gpu_ref_in_dev(sizeof(InDataType) *
input_device_result.mDesc.GetElementSpaceSize());
gpu_ref_in_dev.SetZero(); // bwd data needs zero initialization

ck::ref::naive_conv_bwd_data<InLayout, WeiLayout, OutLayout>(
static_cast<InDataType*>(gpu_ref_in_dev.GetDeviceBuffer()),
static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
conv_param,
in_element_op,
wei_element_op,
out_element_op);

hip_check_error(hipDeviceSynchronize());
gpu_ref_in_dev.FromDevice(gpu_ref_input.mData.data());
}
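
The `SetZero()` above matters because a naive backward-data reference scatters partial products into the input-gradient buffer with `+=`, so stale memory would corrupt the result. A minimal 1D sketch of that accumulation (stride 1, no padding, no groups: all simplifying assumptions, not CK's actual kernel):

```cpp
// dx must be zero-initialized by the caller (cf. gpu_ref_in_dev.SetZero()).
void naive_bwd_data_1d(float* dx, const float* w, const float* dy,
                       int X, int K, int Y)
{
    for(int y = 0; y < Y; ++y)
        for(int k = 0; k < K; ++k)
        {
            const int x = y + k;  // in the forward pass, dy[y] consumed dx position y + k
            if(x < X)
                dx[x] += w[k] * dy[y];  // scatter-accumulate
        }
}
```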

using DeviceOp = ck::tensor_operation::device::DeviceConvBwdData<NDimSpatial,
InLayout,
WeiLayout,
@@ -176,8 +201,6 @@
float best_tflops = 0;
float best_gb_per_sec = 0;
int num_kernel = 0;
// profile device Conv instances
bool pass = true;

for(auto& op_ptr : op_ptrs)
{
@@ -235,7 +258,7 @@
best_gb_per_sec = gb_per_sec;
}

if(do_verification)
if(do_verification == 1)
{
in_device_buf.FromDevice(input_device_result.mData.data());

@@ -255,6 +278,31 @@
show_data_nhwc_layout(input_host_result);
std::cout << std::endl;

std::cout << "out_device: ";
show_data_nhwc_layout(input_device_result);
std::cout << std::endl;
}
}
else if(do_verification == 2)
{
in_device_buf.FromDevice(input_device_result.mData.data());

pass = pass & ck::utils::check_err(input_device_result, gpu_ref_input);

if(do_log)
{
std::cout << "in : ";
show_data_nhwc_layout(output);
std::cout << std::endl;

std::cout << "wei: ";
show_data_nhwc_layout(weight);
std::cout << std::endl;

std::cout << "out_gpu_ref : ";
show_data_nhwc_layout(gpu_ref_input);
std::cout << std::endl;

std::cout << "out_device: ";
show_data_nhwc_layout(input_device_result);
std::cout << std::endl;
45 changes: 41 additions & 4 deletions profiler/include/profiler/profile_conv_fwd_impl.hpp
@@ -21,6 +21,7 @@
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
#include "ck/library/reference_tensor_operation/gpu/naive_conv_fwd_gpu.hpp"

namespace ck {
namespace profiler {
@@ -107,8 +108,11 @@ bool profile_conv_fwd_impl(int do_verification,
in_device_buf.ToDevice(input.mData.data());
wei_device_buf.ToDevice(weight.mData.data());

// profile device op instances
bool pass = true;

// run reference op
if(do_verification)
if(do_verification == 1)
{
auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
InDataType,
@@ -135,6 +139,24 @@

ref_invoker.Run(ref_argument);
}
// GPU reference (compute once, compare in kernel loop)
Tensor<OutDataType> gpu_ref_output(out_g_n_k_wos_desc);
if(do_verification == 2)
{
DeviceMem gpu_ref_out_dev(sizeof(OutDataType) * device_output.mDesc.GetElementSpaceSize());

ck::ref::naive_conv_fwd<InLayout, WeiLayout, OutLayout>(
static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
static_cast<OutDataType*>(gpu_ref_out_dev.GetDeviceBuffer()),
conv_param,
in_element_op,
wei_element_op,
out_element_op);

hip_check_error(hipDeviceSynchronize());
gpu_ref_out_dev.FromDevice(gpu_ref_output.mData.data());
}
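
For orientation, this is the computation the naive GPU reference checks against, collapsed to 1D (stride 1, no padding, single channel: simplifying assumptions for the sketch). Note the float accumulator, matching the "Perform operations in float in reference" commit:

```cpp
void naive_conv_fwd_1d(float* y, const float* x, const float* w, int X, int K)
{
    const int Y = X - K + 1;  // "valid" output length
    for(int i = 0; i < Y; ++i)
    {
        float acc = 0.0f;  // accumulate in float regardless of storage type
        for(int k = 0; k < K; ++k)
            acc += x[i + k] * w[k];
        y[i] = acc;  // single narrowing store at the end
    }
}
```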

using DeviceOp = ck::tensor_operation::device::DeviceConvFwd<NDimSpatial,
InLayout,
@@ -158,8 +180,6 @@
float best_tflops = 0;
float best_gb_per_sec = 0;
int num_kernel = 0;
// profile device op instances
bool pass = true;

for(auto& op_ptr : op_ptrs)
{
@@ -217,7 +237,7 @@
best_gb_per_sec = gb_per_sec;
}

if(do_verification)
if(do_verification == 1)
{
out_device_buf.FromDevice(device_output.mData.data());

@@ -233,6 +253,23 @@
<< std::endl;
}
}
else if(do_verification == 2)
{
out_device_buf.FromDevice(device_output.mData.data());

pass = pass & ck::utils::check_err(device_output, gpu_ref_output);

if(do_log)
{
LogRangeAsType<float>(std::cout << "input : ", input.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "weight: ", weight.mData, ",") << std::endl;
LogRangeAsType<float>(
std::cout << "gpu_ref_output : ", gpu_ref_output.mData, ",")
<< std::endl;
LogRangeAsType<float>(std::cout << "device_output: ", device_output.mData, ",")
<< std::endl;
}
}
}
else
{
@@ -22,6 +22,8 @@
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_batchnorm_infer.hpp"
#include "ck/library/reference_tensor_operation/gpu/naive_conv_fwd_gpu.hpp"
#include "example/48_pool3d_fwd/pool3d_fwd_common.hpp"

namespace ck {
namespace profiler {
@@ -255,9 +257,9 @@ bool profile_grouped_conv_fwd_bias_clamp_impl(int do_verification,
}

// run reference op
if(do_verification)
if(do_verification == 1)
{
// Run Conv and Bnorm seperatly
// CPU reference: Run Conv and Bnorm separately
auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
InDataType,
WeiDataType,
@@ -291,6 +293,86 @@ bool profile_grouped_conv_fwd_bias_clamp_impl(int do_verification,
ref_bnorm_clamp_infer<NDimSpatial>(
host_output, host_output, mean, variance, scale, shift, floor, ceil, epsilon);
}
else if(do_verification == 2)
{
// GPU reference: Two-step approach (Conv+Bias, then BatchNorm+Clamp)

// Prepare bias tensor info for Conv step
std::vector<ck::index_t> bias_lengths_vec(NDimSpatial + 3);
std::vector<ck::index_t> bias_strides_vec(NDimSpatial + 3);

for(size_t i = 0; i < bias_lengths_vec.size(); ++i)
{
bias_lengths_vec[i] = e_g_n_k_wos_lengths[i];
bias_strides_vec[i] = d_g_n_k_wos_strides[i];
}

std::array<const OutDataType*, 1> bias_ptrs = {
reinterpret_cast<const OutDataType*>(bias_device_buf.GetDeviceBuffer())};
std::array<std::vector<ck::index_t>, 1> bias_lengths = {bias_lengths_vec};
std::array<std::vector<ck::index_t>, 1> bias_strides = {bias_strides_vec};

std::array<const InDataType*, 1> in_ptrs = {
reinterpret_cast<const InDataType*>(in_device_buf.GetDeviceBuffer())};
std::array<const WeiDataType*, 1> wei_ptrs = {
reinterpret_cast<const WeiDataType*>(wei_device_buf.GetDeviceBuffer())};

// Step 1: Conv + Bias
ck::ref::naive_conv_fwd_multi_abd<0,
0,
1,
InLayout,
WeiLayout,
OutLayout,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
Add,
OutDataType>(
in_ptrs,
wei_ptrs,
bias_ptrs,
reinterpret_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
conv_param,
bias_lengths,
bias_strides,
in_element_op,
wei_element_op,
Add{});
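
A note on the leading `<0, 0, 1, ...>` template arguments: they appear to encode the extra-tensor counts (zero additional A tensors, zero additional B tensors, one D epilogue tensor, the bias), consistent with the "multiple A/B tensors and D tensor for fwd GPU ref" commit. This reading is inferred from the call site and not confirmed elsewhere in the diff.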

HIP_CHECK_ERROR(hipDeviceSynchronize());

// Step 2: Batch Norm + Clamp (in-place on output buffer)
long_index_t total_out = device_output.mDesc.GetElementSpaceSize();

// Prepare tensor strides (actual output layout)
std::vector<ck::index_t> tensor_strides_vec(NDimSpatial + 3);
for(size_t i = 0; i < tensor_strides_vec.size(); ++i)
{
tensor_strides_vec[i] = e_g_n_k_wos_strides[i];
}

ck::ref::naive_batchnorm_clamp_infer_gpu(
reinterpret_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
reinterpret_cast<const OutDataType*>(out_device_buf.GetDeviceBuffer()),
reinterpret_cast<const OutDataType*>(mean_device_buf.GetDeviceBuffer()),
reinterpret_cast<const OutDataType*>(variance_device_buf.GetDeviceBuffer()),
reinterpret_cast<const OutDataType*>(scale_device_buf.GetDeviceBuffer()),
reinterpret_cast<const OutDataType*>(shift_device_buf.GetDeviceBuffer()),
bias_lengths_vec,
bias_strides_vec,
tensor_strides_vec,
total_out,
epsilon,
floor,
ceil);

HIP_CHECK_ERROR(hipDeviceSynchronize());

out_device_buf.FromDevice(host_output.mData.data());
}
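
Step 2 applies the standard batch-norm inference transform per output channel k, followed by a clamp; the symbols map to the `scale`, `shift`, `mean`, `variance`, `epsilon`, `floor`, and `ceil` arguments in the call above:

\[
y \;=\; \operatorname{clamp}\!\Bigl(
    \gamma_k\,\frac{x - \mu_k}{\sqrt{\sigma_k^2 + \varepsilon}} + \beta_k,
    \;\text{floor},\;\text{ceil}\Bigr)
\]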

std::string best_op_name;
float best_avg_time = 0;
@@ -348,7 +430,17 @@ bool profile_grouped_conv_fwd_bias_clamp_impl(int do_verification,
{
out_device_buf.FromDevice(device_output.mData.data());

pass = pass & ck::utils::check_err(device_output, host_output);
if constexpr(std::is_same_v<OutDataType, ck::half_t>)
{
// The batchnorm involves operations which can cause small numerical
// differences between host and device implementations
static double rtol = 1e-2;
pass = pass & ck::utils::check_err(device_output, host_output, rtol);
}
else
{
pass = pass & ck::utils::check_err(device_output, host_output);
}
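
For fp16 outputs the comparison is relaxed to rtol = 1e-2 (cf. the "Slightly increase tolerance for batchnorm profiler" commit). Under the usual relative-error criterion (the exact formula inside `ck::utils::check_err` is an assumption here, and it may also include an absolute term), the check passes when:

\[
\lvert y_{\text{device}} - y_{\text{host}} \rvert \;\le\; \text{rtol} \cdot \lvert y_{\text{host}} \rvert
\]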

if(do_log)
{