Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion apps/iir_blur/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ $(BIN)/%/filter: filter.cpp $(BIN)/%/iir_blur.a $(BIN)/%/iir_blur_auto_schedule.
$(CXX) $(CXXFLAGS) -I$(BIN)/$* -Wall -O3 $^ -o $@ $(LDFLAGS) $(IMAGE_IO_FLAGS) $(CUDA_LDFLAGS) $(OPENCL_LDFLAGS)

$(BIN)/%/out.png: $(BIN)/%/filter
$< ../images/rgba.png $(BIN)/$*/out.png
$< ../images/rgb.png $(BIN)/$*/out.png

clean:
rm -rf $(BIN)
20 changes: 14 additions & 6 deletions apps/iir_blur/iir_blur_generator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,19 +36,27 @@ Func blur_cols_transpose(Func input, Expr height, Expr alpha, bool skip_schedule
if (!skip_schedule) {
if (!target.has_gpu_feature()) {
// CPU schedule.
// 8.2ms on an Intel i9-9960X using 16 threads
// 9.7ms on an Intel i9-9960X at 3.1 GHz using 16 threads
// Split the transpose into tiles of rows. Parallelize over channels
// and strips (Halide supports nested parallelism).
Var xo, yo, t;
// and strips.
Var xo, yo, t, yi;
transpose.compute_root()
.tile(x, y, xo, yo, x, y, vec, vec * 4)
.split(y, y, yi, vec)
.unroll(yi)
.vectorize(x)
.parallel(yo)
.parallel(c);
.fuse(yo, c, t)
.parallel(t);

blur.in(transpose)
.reorder_storage(y, x)
.compute_at(transpose, y)
.vectorize(x)
.unroll(y);

// Run the filter on each row of tiles (which corresponds to a strip of
// columns in the input).
blur.compute_at(transpose, yo);
blur.compute_at(transpose, t);

// Vectorize computations within the strips.
blur.update(0)
Expand Down
4 changes: 0 additions & 4 deletions src/CodeGen_Hexagon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1404,10 +1404,6 @@ Value *CodeGen_Hexagon::vlut256(Value *lut, Value *idx, int min_index,
return slice_vector(concat_vectors(result), 0, idx_elements);
}

// Reports whether x has at most one bit set. Note that this means 0 is
// also reported as a power of two.
bool is_power_of_two(int x) {
    // x - 1 flips the lowest set bit of x and every bit below it, so the
    // AND clears that bit; nothing is left over iff no higher bit was set.
    const int without_lowest_bit = x & (x - 1);
    return without_lowest_bit == 0;
}

// vdelta and vrdelta are instructions that take an input vector and
// pass it through a network made up of levels. Each element x at each
// level i can either take the element from the previous level at the
Expand Down
20 changes: 15 additions & 5 deletions src/CodeGen_LLVM.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1363,10 +1363,6 @@ void CodeGen_LLVM::codegen(const Stmt &s) {
s.accept(this);
}

// Reports whether x has at most one bit set. Note that this means 0 is
// also reported as a power of two.
bool CodeGen_LLVM::is_power_of_two(int x) const {
    // Clearing the lowest set bit leaves zero iff no other bit was set.
    const int remaining_bits = x & (x - 1);
    return remaining_bits == 0;
}

Type CodeGen_LLVM::upgrade_type_for_arithmetic(const Type &t) const {
if (t.is_bfloat() || (t.is_float() && t.bits() < 32)) {
return Float(32, t.lanes());
Expand Down Expand Up @@ -2194,6 +2190,20 @@ void CodeGen_LLVM::visit(const Broadcast *op) {
value = create_broadcast(v, op->lanes);
}

// Wraps v in an arithmetic fence and returns a value of v's original type.
// v is bitcast to a vector of 16-bit floats, passed through
// CreateArithmeticFence, and bitcast back. If v's size in bits is not a
// multiple of 16, v is returned unchanged with no fence applied. Scalable
// vector types trigger an internal assertion.
Value *CodeGen_LLVM::optimization_fence(Value *v) {
llvm::Type *t = v->getType();
// Scalable types are rejected outright rather than silently passed through.
internal_assert(!t->isScalableTy())
<< "optimization_fence does not support scalable vectors yet";
const int bits = t->getPrimitiveSizeInBits();
// The value is reinterpreted below as bits / 16 lanes of 16-bit float, so a
// width that is not a multiple of 16 cannot be expressed that way; in that
// case the input is handed back untouched.
if (bits % 16) {
return v;
}
Comment on lines +2198 to +2200
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This feels like it should be an internal_assert -- I wouldn't want optimization_fence to be a no-op unexpectedly, if I were to use it.

// NOTE(review): the fence is applied to a float16 view of the bits,
// presumably because the arithmetic fence only accepts floating-point
// operands -- confirm against the LLVM documentation.
llvm::Type *float_type = llvm_type_of(Float(16, bits / 16));
v = builder->CreateBitCast(v, float_type);
v = builder->CreateArithmeticFence(v, float_type);
// Restore the caller's original type.
return builder->CreateBitCast(v, t);
}

Value *CodeGen_LLVM::interleave_vectors(const std::vector<Value *> &vecs) {
internal_assert(!vecs.empty());
for (size_t i = 1; i < vecs.size(); i++) {
Expand All @@ -2210,7 +2220,7 @@ Value *CodeGen_LLVM::interleave_vectors(const std::vector<Value *> &vecs) {
for (int i = 0; i < vec_elements * 2; i++) {
indices[i] = i % 2 == 0 ? i / 2 : i / 2 + vec_elements;
}
return shuffle_vectors(a, b, indices);
return optimization_fence(shuffle_vectors(a, b, indices));
} else {
// Grab the even and odd elements of vecs.
vector<Value *> even_vecs;
Expand Down
6 changes: 4 additions & 2 deletions src/CodeGen_LLVM.h
Original file line number Diff line number Diff line change
Expand Up @@ -460,6 +460,10 @@ class CodeGen_LLVM : public IRVisitor {
* an arbitrary number of vectors.*/
virtual llvm::Value *interleave_vectors(const std::vector<llvm::Value *> &);

/** A fence to prevent fusion of ops by llvm. Designed for floats, but we
* abuse it to prevent shufflevector fusion too. Returns a value of the
* same type as the argument. If the argument's size in bits is not a
* multiple of 16 it is returned unchanged, with no fence applied.
* Scalable vector types are not supported and trigger an internal
* assertion. */
llvm::Value *optimization_fence(llvm::Value *);

/** Description of an intrinsic function overload. Overloads are resolved
* using both argument and return types. The scalar types of the arguments
* and return type must match exactly for an overload resolution to succeed. */
Expand Down Expand Up @@ -523,8 +527,6 @@ class CodeGen_LLVM : public IRVisitor {
/** Shorthand for shuffling a single vector. */
llvm::Value *shuffle_vectors(llvm::Value *v, const std::vector<int> &indices);

bool is_power_of_two(int x) const;

bool is_scalable_vector(llvm::Value *v) const;

/** Go looking for a vector version of a runtime function. Will
Expand Down
Loading
Loading