example/62_convnd_activ/convinvscale/CMakeLists.txt (8 additions, 1 deletion)
@@ -5,4 +5,11 @@ if (NOT GPU_TARGETS MATCHES "gfx11")
    add_custom_target(example_convnd_activ_xdl_convinvscale)
    add_example_executable(example_convnd_fwd_xdl_convinvscale_fp8 convnd_fwd_xdl_convinvscale_fp8.cpp)
    add_example_dependencies(example_convnd_activ_xdl_convinvscale example_convnd_fwd_xdl_convinvscale_fp8)
endif()

# WMMA
if (GPU_TARGETS MATCHES "gfx12")
    add_custom_target(example_convnd_activ_wmma_convinvscale)
    add_example_executable(example_convnd_fwd_wmma_convinvscale_fp8 convnd_fwd_wmma_convinvscale_fp8.cpp)
    add_example_dependencies(example_convnd_activ_wmma_convinvscale example_convnd_fwd_wmma_convinvscale_fp8)
endif()
example/62_convnd_activ/convinvscale/convnd_fwd_wmma_convinvscale_fp8.cpp (new file)
@@ -0,0 +1,97 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT

#include "convnd_fwd_convinvscale_common.hpp"

#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_wmma_cshuffle_v3.hpp"

using InDataType = ck::f8_t;
using WeiDataType = ck::f8_t;
using AccDataType = float;
using CShuffleDataType = float;
using DsDataType = ck::Tuple<>;
using OutDataType = ck::f8_t;
using AComputeDataType = ck::f8_t;
using BComputeDataType = ck::f8_t;

template <ck::index_t... Is>
using S = ck::Sequence<Is...>;

using InElementOp = PassThrough;
using WeiElementOp = PassThrough;
using OutElementOp = ConvInvscale;

static constexpr auto ConvSpec =
    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;

static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;

template <ck::index_t NDimSpatial,
          typename InLayout,
          typename WeiLayout,
          typename DsLayout,
          typename OutLayout>
using DeviceGroupedConvNDFwdInstance =
    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Wmma_CShuffle_V3<
        NDimSpatial,      // NDimSpatial
        InLayout,         // ALayout
        WeiLayout,        // BLayout
        DsLayout,         // DsLayout (empty tuple for ConvInvScale)
        OutLayout,        // ELayout
        InDataType,       // ADataType
        WeiDataType,      // BDataType
        AccDataType,      // AccDataType
        CShuffleDataType, // CShuffleDataType
        DsDataType,       // DsDataType (empty tuple)
        OutDataType,      // EDataType
        InElementOp,      // AElementwiseOperation
        WeiElementOp,     // BElementwiseOperation
        OutElementOp,     // CDEElementwiseOperation
        ConvSpec,         // ConvForwardSpecialization
        GemmSpec,         // GemmSpecialization
        64,               // BlockSize
        64,               // MPerBlock
        64,               // NPerBlock
        32,               // KPerBlock
        8,                // AK1
        8,                // BK1
        16,               // MPerWmma
        16,               // NPerWmma
        4,                // MRepeat
        2,                // NRepeat
        S<4, 16, 1>,      // ABlockTransferThreadClusterLengths_AK0_M_AK1
        S<1, 0, 2>,       // ABlockTransferThreadClusterArrangeOrder
        S<1, 0, 2>,       // ABlockTransferSrcAccessOrder
        2,                // ABlockTransferSrcVectorDim
        1,                // ABlockTransferSrcScalarPerVector
        8,                // ABlockTransferDstScalarPerVector_AK1
        1,                // ABlockLdsExtraM
        S<4, 16, 1>,      // BBlockTransferThreadClusterLengths_BK0_N_BK1
        S<1, 0, 2>,       // BBlockTransferThreadClusterArrangeOrder
        S<1, 0, 2>,       // BBlockTransferSrcAccessOrder
        2,                // BBlockTransferSrcVectorDim
        1,                // BBlockTransferSrcScalarPerVector
        8,                // BBlockTransferDstScalarPerVector_BK1
        1,                // BBlockLdsExtraN
        1,                // CShuffleMRepeatPerShuffle
        1,                // CShuffleNRepeatPerShuffle
        S<1, 16, 1, 4>,   // CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
        1,                // CDEBlockTransferScalarPerVector_NPerBlock
        ck::BlockGemmPipelineScheduler::Intrawave, // BlkGemmPipeSched
        ck::BlockGemmPipelineVersion::v1,          // BlkGemmPipelineVer
        AComputeDataType, // AComputeDataType
        BComputeDataType, // BComputeDataType
        1>;               // NumGroupsToMerge

#include "run_convnd_fwd_convinvscale_example.inc"

int main(int argc, char* argv[])
{
    if(!ck::is_gfx12_supported())
    {
        std::cout << "This kernel supports gfx12 only" << std::endl;
        return 0;
    }
    return run_convnd_fwd_example(argc, argv) ? 0 : 1;
}
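For orientation, the only functional difference from the existing XDL convinvscale example is the WMMA device instance; the epilogue is the same ConvInvscale op, which rescales the f32 accumulator before it is quantized to f8. Below is a minimal sketch of what such an epilogue computes, assuming (as the name suggests) division by the product of the input, weight, and output scales; the struct name and members are illustrative stand-ins, not the library's definition of ck::tensor_operation::element_wise::ConvInvscale.

// Hypothetical sketch of a ConvInvscale-style epilogue; the final
// f32 -> f8 conversion is elided to keep the sketch portable.
struct ConvInvscaleSketch
{
    float scale_in;  // dequantization scale of the input tensor
    float scale_wei; // dequantization scale of the weight tensor
    float scale_out; // quantization scale of the output tensor

    float operator()(float c) const // c: f32 GEMM accumulator
    {
        return c / (scale_in * scale_wei * scale_out);
    }
};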
example/62_convnd_activ/convscale/CMakeLists.txt (16 additions)
@@ -15,3 +15,19 @@ if (NOT GPU_TARGETS MATCHES "gfx11")
    add_example_executable(example_convnd_fwd_xdl_convscale_bf8_fp8 convnd_fwd_xdl_convscale_bf8_fp8.cpp)
    add_example_dependencies(example_convnd_activ_xdl_convscale example_convnd_fwd_xdl_convscale_bf8_fp8)
endif()

# WMMA
if (GPU_TARGETS MATCHES "gfx12")
    add_custom_target(example_convnd_activ_wmma_convscale)
    add_example_executable(example_convnd_fwd_wmma_convscale_fp8 convnd_fwd_wmma_convscale_fp8.cpp)
    add_example_dependencies(example_convnd_activ_wmma_convscale example_convnd_fwd_wmma_convscale_fp8)

    add_example_executable(example_convnd_fwd_wmma_convscale_bf8 convnd_fwd_wmma_convscale_bf8.cpp)
    add_example_dependencies(example_convnd_activ_wmma_convscale example_convnd_fwd_wmma_convscale_bf8)

    add_example_executable(example_convnd_fwd_wmma_convscale_fp8_bf8 convnd_fwd_wmma_convscale_fp8_bf8.cpp)
    add_example_dependencies(example_convnd_activ_wmma_convscale example_convnd_fwd_wmma_convscale_fp8_bf8)

    add_example_executable(example_convnd_fwd_wmma_convscale_bf8_fp8 convnd_fwd_wmma_convscale_bf8_fp8.cpp)
    add_example_dependencies(example_convnd_activ_wmma_convscale example_convnd_fwd_wmma_convscale_bf8_fp8)
endif()
example/62_convnd_activ/convscale/convnd_fwd_wmma_convscale_bf8.cpp (new file)
@@ -0,0 +1,97 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT

#include "convnd_fwd_convscale_common.hpp"

#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_wmma_cshuffle_v3.hpp"

using InDataType = ck::bf8_t;
using WeiDataType = ck::bf8_t;
using AccDataType = float;
using CShuffleDataType = float;
using DsDataType = ck::Tuple<>;
using OutDataType = ck::f8_t;
using AComputeDataType = InDataType;
using BComputeDataType = AComputeDataType;

template <ck::index_t... Is>
using S = ck::Sequence<Is...>;

using InElementOp = PassThrough;
using WeiElementOp = PassThrough;
using OutElementOp = ConvScale;

static constexpr auto ConvSpec =
    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;

static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;

template <ck::index_t NDimSpatial,
          typename InLayout,
          typename WeiLayout,
          typename DsLayout,
          typename OutLayout>
using DeviceGroupedConvNDFwdInstance =
    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Wmma_CShuffle_V3<
        NDimSpatial,      // NDimSpatial
        InLayout,         // ALayout
        WeiLayout,        // BLayout
        DsLayout,         // DsLayout (empty tuple for ConvScale)
        OutLayout,        // ELayout
        InDataType,       // ADataType
        WeiDataType,      // BDataType
        AccDataType,      // AccDataType
        CShuffleDataType, // CShuffleDataType
        DsDataType,       // DsDataType (empty tuple)
        OutDataType,      // EDataType
        InElementOp,      // AElementwiseOperation
        WeiElementOp,     // BElementwiseOperation
        OutElementOp,     // CDEElementwiseOperation
        ConvSpec,         // ConvForwardSpecialization
        GemmSpec,         // GemmSpecialization
        64,               // BlockSize
        64,               // MPerBlock
        64,               // NPerBlock
        32,               // KPerBlock
        8,                // AK1
        8,                // BK1
        16,               // MPerWmma
        16,               // NPerWmma
        4,                // MRepeat
        2,                // NRepeat
        S<4, 16, 1>,      // ABlockTransferThreadClusterLengths_AK0_M_AK1
        S<1, 0, 2>,       // ABlockTransferThreadClusterArrangeOrder
        S<1, 0, 2>,       // ABlockTransferSrcAccessOrder
        2,                // ABlockTransferSrcVectorDim
        1,                // ABlockTransferSrcScalarPerVector
        8,                // ABlockTransferDstScalarPerVector_AK1
        1,                // ABlockLdsExtraM
        S<4, 16, 1>,      // BBlockTransferThreadClusterLengths_BK0_N_BK1
        S<1, 0, 2>,       // BBlockTransferThreadClusterArrangeOrder
        S<1, 0, 2>,       // BBlockTransferSrcAccessOrder
        2,                // BBlockTransferSrcVectorDim
        1,                // BBlockTransferSrcScalarPerVector
        8,                // BBlockTransferDstScalarPerVector_BK1
        1,                // BBlockLdsExtraN
        1,                // CShuffleMRepeatPerShuffle
        1,                // CShuffleNRepeatPerShuffle
        S<1, 16, 1, 4>,   // CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
        1,                // CDEBlockTransferScalarPerVector_NPerBlock
        ck::BlockGemmPipelineScheduler::Intrawave, // BlkGemmPipeSched
        ck::BlockGemmPipelineVersion::v1,          // BlkGemmPipelineVer
        AComputeDataType, // AComputeDataType
        BComputeDataType, // BComputeDataType
        1>;               // NumGroupsToMerge

#include "run_convnd_fwd_convscale_example.inc"

int main(int argc, char* argv[])
{
    if(!ck::is_gfx12_supported())
    {
        std::cout << "This kernel supports gfx12 only" << std::endl;
        return 0;
    }
    return run_convnd_fwd_example(argc, argv) ? 0 : 1;
}
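ConvScale is the multiplicative counterpart of the epilogue sketched above: the accumulator is multiplied by the three scales rather than divided. A minimal sketch under the same caveats (illustrative names, f8 conversion elided; not the library's definition):

// Hypothetical sketch of a ConvScale-style epilogue.
struct ConvScaleSketch
{
    float scale_in;  // dequantization scale of the input tensor
    float scale_wei; // dequantization scale of the weight tensor
    float scale_out; // quantization scale of the output tensor

    float operator()(float c) const // c: f32 GEMM accumulator
    {
        return c * scale_in * scale_wei * scale_out;
    }
};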
example/62_convnd_activ/convscale/convnd_fwd_wmma_convscale_bf8_fp8.cpp (new file)
@@ -0,0 +1,97 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT

#include "convnd_fwd_convscale_common.hpp"

#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_wmma_cshuffle_v3.hpp"

using InDataType = ck::bf8_t;
using WeiDataType = ck::f8_t;
using AccDataType = float;
using CShuffleDataType = float;
using DsDataType = ck::Tuple<>;
using OutDataType = ck::f8_t;
using AComputeDataType = ck::bf8_t;
using BComputeDataType = ck::f8_t;

template <ck::index_t... Is>
using S = ck::Sequence<Is...>;

using InElementOp = PassThrough;
using WeiElementOp = PassThrough;
using OutElementOp = ConvScale;

static constexpr auto ConvSpec =
    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;

static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;

template <ck::index_t NDimSpatial,
          typename InLayout,
          typename WeiLayout,
          typename DsLayout,
          typename OutLayout>
using DeviceGroupedConvNDFwdInstance =
    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Wmma_CShuffle_V3<
        NDimSpatial,      // NDimSpatial
        InLayout,         // ALayout
        WeiLayout,        // BLayout
        DsLayout,         // DsLayout (empty tuple for ConvScale)
        OutLayout,        // ELayout
        InDataType,       // ADataType
        WeiDataType,      // BDataType
        AccDataType,      // AccDataType
        CShuffleDataType, // CShuffleDataType
        DsDataType,       // DsDataType (empty tuple)
        OutDataType,      // EDataType
        InElementOp,      // AElementwiseOperation
        WeiElementOp,     // BElementwiseOperation
        OutElementOp,     // CDEElementwiseOperation
        ConvSpec,         // ConvForwardSpecialization
        GemmSpec,         // GemmSpecialization
        64,               // BlockSize
        64,               // MPerBlock
        64,               // NPerBlock
        32,               // KPerBlock
        8,                // AK1
        8,                // BK1
        16,               // MPerWmma
        16,               // NPerWmma
        4,                // MRepeat
        2,                // NRepeat
        S<4, 16, 1>,      // ABlockTransferThreadClusterLengths_AK0_M_AK1
        S<1, 0, 2>,       // ABlockTransferThreadClusterArrangeOrder
        S<1, 0, 2>,       // ABlockTransferSrcAccessOrder
        2,                // ABlockTransferSrcVectorDim
        1,                // ABlockTransferSrcScalarPerVector
        8,                // ABlockTransferDstScalarPerVector_AK1
        1,                // ABlockLdsExtraM
        S<4, 16, 1>,      // BBlockTransferThreadClusterLengths_BK0_N_BK1
        S<1, 0, 2>,       // BBlockTransferThreadClusterArrangeOrder
        S<1, 0, 2>,       // BBlockTransferSrcAccessOrder
        2,                // BBlockTransferSrcVectorDim
        1,                // BBlockTransferSrcScalarPerVector
        8,                // BBlockTransferDstScalarPerVector_BK1
        1,                // BBlockLdsExtraN
        1,                // CShuffleMRepeatPerShuffle
        1,                // CShuffleNRepeatPerShuffle
        S<1, 16, 1, 4>,   // CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
        1,                // CDEBlockTransferScalarPerVector_NPerBlock
        ck::BlockGemmPipelineScheduler::Intrawave, // BlkGemmPipeSched
        ck::BlockGemmPipelineVersion::v1,          // BlkGemmPipelineVer
        AComputeDataType, // AComputeDataType
        BComputeDataType, // BComputeDataType
        1>;               // NumGroupsToMerge

#include "run_convnd_fwd_convscale_example.inc"

int main(int argc, char* argv[])
{
    if(!ck::is_gfx12_supported())
    {
        std::cout << "This kernel supports gfx12 only" << std::endl;
        return 0;
    }
    return run_convnd_fwd_example(argc, argv) ? 0 : 1;
}
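In each of these examples the DeviceGroupedConvNDFwdInstance alias is consumed by the included run_convnd_fwd_*_example.inc file, which is not part of this diff. As a usage sketch only, a 2D instance would be spelled roughly as follows; the NHWGC/GKYXC/NHWGK layouts are an assumption based on CK's other grouped-convolution examples, not taken from the .inc file.

// Hypothetical 2D instantiation of the alias defined above.
using Conv2dFwdInstance =
    DeviceGroupedConvNDFwdInstance<2,                                      // NDimSpatial
                                   ck::tensor_layout::convolution::NHWGC,  // input layout
                                   ck::tensor_layout::convolution::GKYXC,  // weight layout
                                   ck::Tuple<>,                            // no extra D tensors
                                   ck::tensor_layout::convolution::NHWGK>; // output layout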