Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions src/CodeGen_LLVM.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2325,6 +2325,12 @@ void CodeGen_LLVM::codegen_predicated_store(const Store *op) {
{VPArg(slice_val, 0), VPArg(vec_ptr, 1, alignment)})) {
store = dyn_cast<Instruction>(value);
} else {
if (!slice_val->getType()->isVectorTy()) {
slice_val = create_broadcast(slice_val, 1);
}
if (!slice_mask->getType()->isVectorTy()) {
slice_mask = create_broadcast(slice_mask, 1);
}
store = builder->CreateMaskedStore(slice_val, vec_ptr, llvm::Align(alignment), slice_mask);
}
add_tbaa_metadata(store, op->name, slice_index);
Expand Down Expand Up @@ -2444,6 +2450,9 @@ llvm::Value *CodeGen_LLVM::codegen_vector_load(const Type &type, const std::stri
load_inst = dyn_cast<Instruction>(value);
} else {
if (slice_mask != nullptr) {
if (!slice_mask->getType()->isVectorTy()) {
slice_mask = create_broadcast(slice_mask, 1);
}
load_inst = builder->CreateMaskedLoad(slice_type, vec_ptr, llvm::Align(align_bytes), slice_mask);
} else {
load_inst = builder->CreateAlignedLoad(slice_type, vec_ptr, llvm::Align(align_bytes));
Expand Down
1 change: 1 addition & 0 deletions test/correctness/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,7 @@ tests(GROUPS correctness
plain_c_includes.c
popc_clz_ctz_bounds.cpp
predicated_store_load.cpp
predicated_store_load_single_lane.cpp
prefetch.cpp
print.cpp
print_loop_nest.cpp
Expand Down
35 changes: 35 additions & 0 deletions test/correctness/predicated_store_load_single_lane.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#include "Halide.h"

using namespace Halide;

int main(int argc, char **argv) {
// This test exercises predicated vector loads and stores with a single
// lane. These require special handling because Halide's IR does not
// distinguish between scalars and single-element vectors, while LLVM
// does.

int w = get_jit_target_from_environment().natural_vector_size<float>();

Func f1{"f1"}, f2{"f2"};
Var x{"x"}, xo{"xo"}, xi{"xi"};

ImageParam input(Float(32), 1);

f1(x) = input(x) * 2;
f2(x) = select(x < w, 0, f1(x) + f1(x + 1));

// This schedule creates a situation where f1 is computed with a
// vectorized loop that requires predicated loads/stores for the
// final single element.
f2.split(x, xo, xi, w);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't you explicitly select the TailStrategy::Predicate tail strategy here? TailStrategy::Auto might select GuardWithIf instead.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

GuardWithIf currently generates a predicate, but yes, it might be more future-proof to use TailStrategy::Predicate here.

f1.compute_at(f2, xo).vectorize(x); // effective vector width = w + 1
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Or set the tail strategy in the vectorize call?


// Compile to check that codegen succeeds. This would crash before the fix
// with "Call parameter type does not match function signature" because
// the masked load/store intrinsics received scalar masks instead of
// vector masks.
f2.compile_jit();

printf("Success!\n");
return 0;
}
Loading