-
Notifications
You must be signed in to change notification settings - Fork 266
Cleanup and refactoring related to tile loading #3505
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: develop
Are you sure you want to change the base?
Changes from all commits
4d77856
9691569
bda5a7a
825d17c
ca71cd7
994b8f4
74533b4
3a094e2
cfa11f2
9559a93
9633d3f
9af4498
514035e
3d55a1e
63a4559
8fc4030
3216110
ca17ac3
2edd077
b91efe5
0a4388d
e62c96f
ea4e543
c020a42
35c620e
2ab79eb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3,48 +3,43 @@ | |
|
|
||
| #pragma once | ||
|
|
||
| #include "ck_tile/core/config.hpp" | ||
| #include "ck_tile/core.hpp" | ||
| #include "ck_tile/ops/elementwise/unary_element_wise_operation.hpp" | ||
|
|
||
| namespace ck_tile { | ||
|
|
||
| template <typename DstDataType, index_t UnaryOpSize> | ||
| struct InterleavedPKTypeLoader | ||
| struct ConverterLoader | ||
| { | ||
| template <typename WarpWindow, typename WarpTile> | ||
| CK_TILE_DEVICE static void load_interleaved_pk_type(WarpTile& warp_tile, | ||
| const WarpWindow& warp_window) | ||
| CK_TILE_DEVICE static void load_interleaved_pk_type(WarpTile& dst, const WarpWindow& src) | ||
| { | ||
| const element_wise::PassThroughPack8 elementwise_op{}; | ||
|
|
||
| static_assert(WarpTile::get_thread_buffer_size() % UnaryOpSize == 0); | ||
| constexpr index_t thread_buffer_size = WarpTile::get_thread_buffer_size() / UnaryOpSize; | ||
| const auto in_dstr_tensors = load_tile(warp_window); | ||
| const auto tmp = load_tile(src); | ||
|
|
||
| using DstVectorType = DstDataType __attribute__((ext_vector_type(UnaryOpSize))); | ||
| static_for<0, thread_buffer_size, 1>{}([&](auto i) { | ||
| elementwise_op(warp_tile.get_thread_buffer().template get_as<DstVectorType>()(i), | ||
| in_dstr_tensors.get_thread_buffer().template get_as<pk_int4x4_t>()[i]); | ||
| const element_wise::PassThroughPack8 elementwise_op{}; | ||
|
|
||
| elementwise_op(dst.get_thread_buffer().template get_as<DstVectorType>()(i), | ||
| tmp.get_thread_buffer().template get_as<pk_int4x4_t>()[i]); | ||
| }); | ||
| } | ||
| }; | ||
|
|
||
| template <typename SrcDataType, | ||
| typename DstDataType, | ||
| index_t UnaryOpSize, | ||
| bool LoadTranspose = false, | ||
| typename WarpTile, | ||
| typename WarpWindow> | ||
| CK_TILE_DEVICE void load_int4_tile(WarpTile& dst, const WarpWindow& src) | ||
| template <index_t UnaryOpSize, bool LoadTranspose = false, typename WarpTile, typename WarpWindow> | ||
| CK_TILE_DEVICE void load_and_convert_tile(WarpTile& dst, const WarpWindow& src) | ||
joyeamd marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| { | ||
| if constexpr(std::is_same_v<SrcDataType, pk_int4_t>) | ||
| if constexpr(std::is_same_v<typename WarpWindow::Base::DataType, pk_int4_t>) | ||
| { | ||
| static_assert(!LoadTranspose, "LoadTranspose not supported with pk_int4_t"); | ||
| InterleavedPKTypeLoader<DstDataType, UnaryOpSize>::load_interleaved_pk_type(dst, src); | ||
| ConverterLoader<typename WarpTile::DataType, UnaryOpSize>::load_interleaved_pk_type(dst, | ||
| src); | ||
| } | ||
| else if constexpr(LoadTranspose) | ||
| { | ||
| dst = load_tile_transpose(src); | ||
| load_tile_transpose(dst, src); | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Copilot seems to agree with me that assignment cannot be optimized by the compiler in such a way that the creation of temporaries is avoided, when assigning to a complex type. It then ought to make sense to avoid assignment when possible, and pass the object to be assigned to as a reference instead.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @SamiAario-AMD Have you checked the assembly for this change? Does the register usage is different? |
||
| } | ||
| else | ||
| { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
tmp is not the best name. What about naming it
srcand the windowsrc_window?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I will do this in a separate PR as discussed.