Changes from all commits (20 commits)
c7da77d  test_convnd_fwd (johannes-graner, Jan 7, 2026)
e00ef08  test_convnd_bwd_data (johannes-graner, Jan 7, 2026)
2f83bac  test_conv_bwd_data_scale (johannes-graner, Jan 7, 2026)
2f9b366  test_grouped_convnd_fwd_clamp (johannes-graner, Jan 7, 2026)
0c106d2  test_grouped_convnd_fwd_scale (johannes-graner, Jan 8, 2026)
9e95a2a  multiple A/B tensors and D tensor for fwd GPU ref (johannes-graner, Jan 8, 2026)
7004943  test_grouped_convnd_fwd_scaleadd_ab (johannes-graner, Jan 8, 2026)
3298801  test_grouped_convnd_fwd_bias_clamp (johannes-graner, Jan 8, 2026)
2e36ef8  test_grouped_convnd_fwd_bilinear (johannes-graner, Jan 8, 2026)
2992269  test_grouped_convnd_fwd_gk_bias_clamp (johannes-graner, Jan 8, 2026)
e2f75fa  Extend GPU reference to enable batchnorm epilogue (johannes-graner, Jan 9, 2026)
6da4576  test_grouped_convnd_fwd{,_gk}_bias_bnorm_clamp (johannes-graner, Jan 9, 2026)
64cf835  test_grouped_conv_bwd_data_bilinear (johannes-graner, Jan 9, 2026)
1556359  test_grouped_convnd_bwd_weight_bilinear (johannes-graner, Jan 9, 2026)
9c2a899  Merge branch 'develop' into jograner/extend-gpu-reference (johannes-graner, Jan 9, 2026)
0d5b27d  Add missing template instantiation (johannes-graner, Jan 9, 2026)
85f2d93  Perform operations in float in reference (johannes-graner, Jan 12, 2026)
684c9ed  Merge branch 'develop' into jograner/extend-gpu-reference (johannes-graner, Jan 12, 2026)
c9f0a5c  Merge branch 'develop' into jograner/extend-gpu-reference (johannes-graner, Jan 13, 2026)
a3b2475  Slightly increase tolerance for batchnorm profiler (johannes-graner, Jan 13, 2026)
@@ -1631,6 +1631,13 @@ struct ConvInvscale
e = type_convert<f8_t>(c / scale_in_ / scale_wei_ / scale_out_);
};

template <>
__host__ __device__ void operator()<f8_t, f8_t>(f8_t& e, const f8_t& c) const
{
const float c_float = type_convert<float>(c);
e = type_convert<f8_t>(c_float / scale_in_ / scale_wei_ / scale_out_);
};

float scale_in_;
float scale_wei_;
float scale_out_;
@@ -1656,6 +1663,13 @@ struct ConvScale
e = type_convert<f8_t>(c * scale_in_ * scale_wei_ * scale_out_);
};

template <>
__host__ __device__ void operator()<f8_t, f8_t>(f8_t& e, const f8_t& c) const
{
const float c_float = type_convert<float>(c);
e = type_convert<f8_t>(c_float * scale_in_ * scale_wei_ * scale_out_);
};

float scale_in_;
float scale_wei_;
float scale_out_;
@@ -1683,6 +1697,15 @@ struct ConvScaleRelu
e = type_convert<f8_t>(x * scale_out_);
};

template <>
__host__ __device__ void operator()<f8_t, f8_t>(f8_t& e, const f8_t& c) const
{
const float c_float = type_convert<float>(c);
float x;
Relu{}.template operator()<float>(x, c_float * scale_in_ * scale_wei_);
e = type_convert<f8_t>(x * scale_out_);
};

float scale_in_;
float scale_wei_;
float scale_out_;

Large diffs for 3 files are not rendered by default.

@@ -22,9 +22,39 @@ struct SimpleDeviceMem
HIP_CHECK_ERROR(hipMalloc(static_cast<void**>(&p_mem_), mem_size));
}

// Delete copy operations (resource should not be copied)
SimpleDeviceMem(const SimpleDeviceMem&) = delete;
SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;

// Define move operations
SimpleDeviceMem(SimpleDeviceMem&& other) noexcept : p_mem_(other.p_mem_)
{
other.p_mem_ = nullptr;
}

SimpleDeviceMem& operator=(SimpleDeviceMem&& other) noexcept
{
if(this != &other)
{
if(p_mem_)
{
(void)hipFree(p_mem_);
}
p_mem_ = other.p_mem_;
other.p_mem_ = nullptr;
}
return *this;
}

void* GetDeviceBuffer() { return p_mem_; }

~SimpleDeviceMem() { (void)hipFree(p_mem_); }
~SimpleDeviceMem()
{
if(p_mem_)
{
(void)hipFree(p_mem_);
}
}

void* p_mem_;
};
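
With the copy operations deleted and the moves defined, SimpleDeviceMem becomes a standard move-only RAII type, so double-frees are impossible by construction. A usage sketch, assuming the class exactly as shown in this hunk (the demo function itself is hypothetical):

```cpp
#include <utility>
#include <vector>

void demo()
{
    SimpleDeviceMem a(1024);
    // SimpleDeviceMem b = a;         // ill-formed: copy constructor is deleted
    SimpleDeviceMem b = std::move(a); // ownership moves; a.p_mem_ is now nullptr
    std::vector<SimpleDeviceMem> pool;
    pool.push_back(std::move(b));     // safe to store in containers
}   // each allocation is freed exactly once; moved-from objects free nothing
```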
56 changes: 52 additions & 4 deletions profiler/include/profiler/profile_conv_bwd_data_impl.hpp
@@ -17,6 +17,7 @@
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp"
#include "ck/library/reference_tensor_operation/gpu/naive_conv_bwd_data_gpu.hpp"

namespace ck {
namespace profiler {
@@ -129,7 +130,10 @@ bool profile_conv_bwd_data_impl(int do_verification,
out_device_buf.ToDevice(output.mData.data());
wei_device_buf.ToDevice(weight.mData.data());

if(do_verification)
// profile device Conv instances
bool pass = true;

if(do_verification == 1)
{
auto ref_conv = ck::tensor_operation::host::ReferenceConvBwdData<NDimSpatial,
InDataType,
@@ -154,6 +158,27 @@
ref_invoker.Run(ref_argument);
}
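
Here and in the other profilers, the old boolean check on `do_verification` becomes a three-way switch. A hypothetical enum (not part of CK; values inferred from this diff) documenting the convention:

```cpp
// Inferred convention for do_verification (assumption, not a CK type):
enum VerificationMode
{
    Off    = 0, // timing only, no verification
    CpuRef = 1, // compare against the existing host reference
    GpuRef = 2, // compare against the new naive GPU reference
};
```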

// GPU reference (compute once, compare in kernel loop)
Tensor<InDataType> gpu_ref_input(in_g_n_c_wis_desc);
if(do_verification == 2)
{
DeviceMem gpu_ref_in_dev(sizeof(InDataType) *
input_device_result.mDesc.GetElementSpaceSize());
gpu_ref_in_dev.SetZero(); // bwd data needs zero initialization

ck::ref::naive_conv_bwd_data<InLayout, WeiLayout, OutLayout>(
static_cast<InDataType*>(gpu_ref_in_dev.GetDeviceBuffer()),
static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
conv_param,
in_element_op,
wei_element_op,
out_element_op);

hip_check_error(hipDeviceSynchronize());
gpu_ref_in_dev.FromDevice(gpu_ref_input.mData.data());
}
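
The `SetZero()` above matters because a naive backward-data reference scatters partial products into the input-gradient buffer with `+=`, so stale memory would corrupt the result. A minimal 1D sketch of that accumulation (stride 1, no padding, no groups: all simplifying assumptions, not CK's actual kernel):

```cpp
// dx must be zero-initialized by the caller (cf. gpu_ref_in_dev.SetZero()).
void naive_bwd_data_1d(float* dx, const float* w, const float* dy,
                       int X, int K, int Y)
{
    for(int y = 0; y < Y; ++y)
        for(int k = 0; k < K; ++k)
        {
            const int x = y + k;  // in the forward pass, dy[y] consumed dx position y + k
            if(x < X)
                dx[x] += w[k] * dy[y];  // scatter-accumulate
        }
}
```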

using DeviceOp = ck::tensor_operation::device::DeviceConvBwdData<NDimSpatial,
InLayout,
WeiLayout,
@@ -176,8 +201,6 @@
float best_tflops = 0;
float best_gb_per_sec = 0;
int num_kernel = 0;
// profile device Conv instances
bool pass = true;

for(auto& op_ptr : op_ptrs)
{
@@ -235,7 +258,7 @@
best_gb_per_sec = gb_per_sec;
}

if(do_verification)
if(do_verification == 1)
{
in_device_buf.FromDevice(input_device_result.mData.data());

@@ -255,6 +278,31 @@
show_data_nhwc_layout(input_host_result);
std::cout << std::endl;

std::cout << "out_device: ";
show_data_nhwc_layout(input_device_result);
std::cout << std::endl;
}
}
else if(do_verification == 2)
{
in_device_buf.FromDevice(input_device_result.mData.data());

pass = pass & ck::utils::check_err(input_device_result, gpu_ref_input);

if(do_log)
{
std::cout << "in : ";
show_data_nhwc_layout(output);
std::cout << std::endl;

std::cout << "wei: ";
show_data_nhwc_layout(weight);
std::cout << std::endl;

std::cout << "out_gpu_ref : ";
show_data_nhwc_layout(gpu_ref_input);
std::cout << std::endl;

std::cout << "out_device: ";
show_data_nhwc_layout(input_device_result);
std::cout << std::endl;
45 changes: 41 additions & 4 deletions profiler/include/profiler/profile_conv_fwd_impl.hpp
@@ -21,6 +21,7 @@
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
#include "ck/library/reference_tensor_operation/gpu/naive_conv_fwd_gpu.hpp"

namespace ck {
namespace profiler {
@@ -107,8 +108,11 @@ bool profile_conv_fwd_impl(int do_verification,
in_device_buf.ToDevice(input.mData.data());
wei_device_buf.ToDevice(weight.mData.data());

// profile device op instances
bool pass = true;

// run reference op
if(do_verification)
if(do_verification == 1)
{
auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
InDataType,
@@ -135,6 +139,24 @@

ref_invoker.Run(ref_argument);
}
// GPU reference (compute once, compare in kernel loop)
Tensor<OutDataType> gpu_ref_output(out_g_n_k_wos_desc);
if(do_verification == 2)
{
DeviceMem gpu_ref_out_dev(sizeof(OutDataType) * device_output.mDesc.GetElementSpaceSize());

ck::ref::naive_conv_fwd<InLayout, WeiLayout, OutLayout>(
static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
static_cast<OutDataType*>(gpu_ref_out_dev.GetDeviceBuffer()),
conv_param,
in_element_op,
wei_element_op,
out_element_op);

hip_check_error(hipDeviceSynchronize());
gpu_ref_out_dev.FromDevice(gpu_ref_output.mData.data());
}
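
For orientation, this is the computation the naive GPU reference checks against, collapsed to 1D (stride 1, no padding, single channel: simplifying assumptions for the sketch). Note the float accumulator, matching the "Perform operations in float in reference" commit:

```cpp
void naive_conv_fwd_1d(float* y, const float* x, const float* w, int X, int K)
{
    const int Y = X - K + 1;  // "valid" output length
    for(int i = 0; i < Y; ++i)
    {
        float acc = 0.0f;  // accumulate in float regardless of storage type
        for(int k = 0; k < K; ++k)
            acc += x[i + k] * w[k];
        y[i] = acc;  // single narrowing store at the end
    }
}
```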

using DeviceOp = ck::tensor_operation::device::DeviceConvFwd<NDimSpatial,
InLayout,
@@ -158,8 +180,6 @@
float best_tflops = 0;
float best_gb_per_sec = 0;
int num_kernel = 0;
// profile device op instances
bool pass = true;

for(auto& op_ptr : op_ptrs)
{
@@ -217,7 +237,7 @@
best_gb_per_sec = gb_per_sec;
}

if(do_verification)
if(do_verification == 1)
{
out_device_buf.FromDevice(device_output.mData.data());

@@ -233,6 +253,23 @@
<< std::endl;
}
}
else if(do_verification == 2)
{
out_device_buf.FromDevice(device_output.mData.data());

pass = pass & ck::utils::check_err(device_output, gpu_ref_output);

if(do_log)
{
LogRangeAsType<float>(std::cout << "input : ", input.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "weight: ", weight.mData, ",") << std::endl;
LogRangeAsType<float>(
std::cout << "gpu_ref_output : ", gpu_ref_output.mData, ",")
<< std::endl;
LogRangeAsType<float>(std::cout << "device_output: ", device_output.mData, ",")
<< std::endl;
}
}
}
else
{
@@ -22,6 +22,8 @@
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_batchnorm_infer.hpp"
#include "ck/library/reference_tensor_operation/gpu/naive_conv_fwd_gpu.hpp"
#include "example/48_pool3d_fwd/pool3d_fwd_common.hpp"

namespace ck {
namespace profiler {
@@ -255,9 +257,9 @@ bool profile_grouped_conv_fwd_bias_clamp_impl(int do_verification,
}

// run reference op
if(do_verification)
if(do_verification == 1)
{
// Run Conv and Bnorm seperatly
// CPU reference: Run Conv and Bnorm separately
auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
InDataType,
WeiDataType,
@@ -291,6 +293,86 @@ bool profile_grouped_conv_fwd_bias_clamp_impl(int do_verification,
ref_bnorm_clamp_infer<NDimSpatial>(
host_output, host_output, mean, variance, scale, shift, floor, ceil, epsilon);
}
else if(do_verification == 2)
{
// GPU reference: Two-step approach (Conv+Bias, then BatchNorm+Clamp)

// Prepare bias tensor info for Conv step
std::vector<ck::index_t> bias_lengths_vec(NDimSpatial + 3);
std::vector<ck::index_t> bias_strides_vec(NDimSpatial + 3);

for(size_t i = 0; i < bias_lengths_vec.size(); ++i)
{
bias_lengths_vec[i] = e_g_n_k_wos_lengths[i];
bias_strides_vec[i] = d_g_n_k_wos_strides[i];
}

std::array<const OutDataType*, 1> bias_ptrs = {
reinterpret_cast<const OutDataType*>(bias_device_buf.GetDeviceBuffer())};
std::array<std::vector<ck::index_t>, 1> bias_lengths = {bias_lengths_vec};
std::array<std::vector<ck::index_t>, 1> bias_strides = {bias_strides_vec};

std::array<const InDataType*, 1> in_ptrs = {
reinterpret_cast<const InDataType*>(in_device_buf.GetDeviceBuffer())};
std::array<const WeiDataType*, 1> wei_ptrs = {
reinterpret_cast<const WeiDataType*>(wei_device_buf.GetDeviceBuffer())};

// Step 1: Conv + Bias
ck::ref::naive_conv_fwd_multi_abd<0,
0,
1,
InLayout,
WeiLayout,
OutLayout,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
Add,
OutDataType>(
in_ptrs,
wei_ptrs,
bias_ptrs,
reinterpret_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
conv_param,
bias_lengths,
bias_strides,
in_element_op,
wei_element_op,
Add{});
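
A note on the leading `<0, 0, 1, ...>` template arguments: they appear to encode the extra-tensor counts (zero additional A tensors, zero additional B tensors, one D epilogue tensor, the bias), consistent with the "multiple A/B tensors and D tensor for fwd GPU ref" commit. This reading is inferred from the call site and not confirmed elsewhere in the diff.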

HIP_CHECK_ERROR(hipDeviceSynchronize());

// Step 2: Batch Norm + Clamp (in-place on output buffer)
long_index_t total_out = device_output.mDesc.GetElementSpaceSize();

// Prepare tensor strides (actual output layout)
std::vector<ck::index_t> tensor_strides_vec(NDimSpatial + 3);
for(size_t i = 0; i < tensor_strides_vec.size(); ++i)
{
tensor_strides_vec[i] = e_g_n_k_wos_strides[i];
}

ck::ref::naive_batchnorm_clamp_infer_gpu(
reinterpret_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
reinterpret_cast<const OutDataType*>(out_device_buf.GetDeviceBuffer()),
reinterpret_cast<const OutDataType*>(mean_device_buf.GetDeviceBuffer()),
reinterpret_cast<const OutDataType*>(variance_device_buf.GetDeviceBuffer()),
reinterpret_cast<const OutDataType*>(scale_device_buf.GetDeviceBuffer()),
reinterpret_cast<const OutDataType*>(shift_device_buf.GetDeviceBuffer()),
bias_lengths_vec,
bias_strides_vec,
tensor_strides_vec,
total_out,
epsilon,
floor,
ceil);

HIP_CHECK_ERROR(hipDeviceSynchronize());

out_device_buf.FromDevice(host_output.mData.data());
}
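
Step 2 applies the standard batch-norm inference transform per output channel k, followed by a clamp; the symbols map to the `scale`, `shift`, `mean`, `variance`, `epsilon`, `floor`, and `ceil` arguments in the call above:

\[
y \;=\; \operatorname{clamp}\!\Bigl(
    \gamma_k\,\frac{x - \mu_k}{\sqrt{\sigma_k^2 + \varepsilon}} + \beta_k,
    \;\text{floor},\;\text{ceil}\Bigr)
\]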

std::string best_op_name;
float best_avg_time = 0;
@@ -348,7 +430,17 @@ bool profile_grouped_conv_fwd_bias_clamp_impl(int do_verification,
{
out_device_buf.FromDevice(device_output.mData.data());

pass = pass & ck::utils::check_err(device_output, host_output);
if constexpr(std::is_same_v<OutDataType, ck::half_t>)
{
// The batchnorm involves operations which can cause small numerical
// differences between host and device implementations
static double rtol = 1e-2;
pass = pass & ck::utils::check_err(device_output, host_output, rtol);
}
else
{
pass = pass & ck::utils::check_err(device_output, host_output);
}
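
For fp16 outputs the comparison is relaxed to rtol = 1e-2 (cf. the "Slightly increase tolerance for batchnorm profiler" commit). Under the usual relative-error criterion (the exact formula inside `ck::utils::check_err` is an assumption here, and it may also include an absolute term), the check passes when:

\[
\lvert y_{\text{device}} - y_{\text{host}} \rvert \;\le\; \text{rtol} \cdot \lvert y_{\text{host}} \rvert
\]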

if(do_log)
{