halide · abadams · Jan 26, 2026 · Jan 27, 2026 · Jan 27, 2026 · Jan 27, 2026
diff --git a/apps/iir_blur/Makefile b/apps/iir_blur/Makefile
@@ -25,7 +25,7 @@ $(BIN)/%/filter: filter.cpp $(BIN)/%/iir_blur.a $(BIN)/%/iir_blur_auto_schedule.
 	$(CXX) $(CXXFLAGS) -I$(BIN)/$* -Wall -O3 $^ -o $@ $(LDFLAGS) $(IMAGE_IO_FLAGS) $(CUDA_LDFLAGS) $(OPENCL_LDFLAGS)
 
 $(BIN)/%/out.png: $(BIN)/%/filter
-	$< ../images/rgba.png $(BIN)/$*/out.png
+	$< ../images/rgb.png $(BIN)/$*/out.png
 
 clean:
 	rm -rf $(BIN)
diff --git a/apps/iir_blur/iir_blur_generator.cpp b/apps/iir_blur/iir_blur_generator.cpp
@@ -36,19 +36,27 @@ Func blur_cols_transpose(Func input, Expr height, Expr alpha, bool skip_schedule
     if (!skip_schedule) {
         if (!target.has_gpu_feature()) {
             // CPU schedule.
-            // 8.2ms on an Intel i9-9960X using 16 threads
+            // 9.7ms on an Intel i9-9960X at 3.1 GHz using 16 threads
             // Split the transpose into tiles of rows. Parallelize over channels
-            // and strips (Halide supports nested parallelism).
-            Var xo, yo, t;
+            // and strips.
+            Var xo, yo, t, yi;
             transpose.compute_root()
                 .tile(x, y, xo, yo, x, y, vec, vec * 4)
+                .split(y, y, yi, vec)
+                .unroll(yi)
                 .vectorize(x)
-                .parallel(yo)
-                .parallel(c);
+                .fuse(yo, c, t)
+                .parallel(t);
+
+            blur.in(transpose)
+                .reorder_storage(y, x)
+                .compute_at(transpose, y)
+                .vectorize(x)
+                .unroll(y);
 
             // Run the filter on each row of tiles (which corresponds to a strip of
             // columns in the input).
-            blur.compute_at(transpose, yo);
+            blur.compute_at(transpose, t);
 
             // Vectorize computations within the strips.
             blur.update(0)

diff --git a/src/CodeGen_Hexagon.cpp b/src/CodeGen_Hexagon.cpp
@@ -1404,10 +1404,6 @@ Value *CodeGen_Hexagon::vlut256(Value *lut, Value *idx, int min_index,
     return slice_vector(concat_vectors(result), 0, idx_elements);
 }
 
-bool is_power_of_two(int x) {
-    return (x & (x - 1)) == 0;
-}
-
 // vdelta and vrdelta are instructions that take an input vector and
 // pass it through a network made up of levels. Each element x at each
 // level i can either take the element from the previous level at the

diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp
@@ -1363,10 +1363,6 @@ void CodeGen_LLVM::codegen(const Stmt &s) {
     s.accept(this);
 }
 
-bool CodeGen_LLVM::is_power_of_two(int x) const {
-    return (x & (x - 1)) == 0;
-}
-
 Type CodeGen_LLVM::upgrade_type_for_arithmetic(const Type &t) const {
     if (t.is_bfloat() || (t.is_float() && t.bits() < 32)) {
         return Float(32, t.lanes());
@@ -2194,6 +2190,30 @@ void CodeGen_LLVM::visit(const Broadcast *op) {
     value = create_broadcast(v, op->lanes);
 }
 
+// Wrap v in llvm's arithmetic fence so that llvm does not fuse the ops that
+// produce v with the ops that consume it. The fence is applied to a 16-bit
+// float type spanning the same bits: v is bitcast to that type, fenced, and
+// bitcast back, so the result has v's original type. Values that can't be
+// reinterpreted this way are returned unfenced. Scalable vectors are not
+// supported and trip the internal assert.
+Value *CodeGen_LLVM::optimization_fence(Value *v) {
+    llvm::Type *t = v->getType();
+    internal_assert(!t->isScalableTy())
+        << "optimization_fence does not support scalable vectors yet";
+    const int bits = t->getPrimitiveSizeInBits();
+    // getPrimitiveSizeInBits returns zero for types with no primitive size
+    // (e.g. pointers), which would ask for a zero-lane float type below.
+    // Skip the fence for those, and for any size that is not a multiple of
+    // 16 bits.
+    if (bits == 0 || bits % 16 != 0) {
+        return v;
+    }
+    llvm::Type *float_type = llvm_type_of(Float(16, bits / 16));
+    v = builder->CreateBitCast(v, float_type);
+    v = builder->CreateArithmeticFence(v, float_type);
+    return builder->CreateBitCast(v, t);
+}
+
 Value *CodeGen_LLVM::interleave_vectors(const std::vector<Value *> &vecs) {
     internal_assert(!vecs.empty());
     for (size_t i = 1; i < vecs.size(); i++) {
@@ -2210,7 +2220,7 @@ Value *CodeGen_LLVM::interleave_vectors(const std::vector<Value *> &vecs) {
         for (int i = 0; i < vec_elements * 2; i++) {
             indices[i] = i % 2 == 0 ? i / 2 : i / 2 + vec_elements;
         }
-        return shuffle_vectors(a, b, indices);
+        return optimization_fence(shuffle_vectors(a, b, indices));
     } else {
         // Grab the even and odd elements of vecs.
         vector<Value *> even_vecs;

diff --git a/src/CodeGen_LLVM.h b/src/CodeGen_LLVM.h
@@ -460,6 +460,10 @@ class CodeGen_LLVM : public IRVisitor {
      * an arbitrary number of vectors.*/
     virtual llvm::Value *interleave_vectors(const std::vector<llvm::Value *> &);
 
+    /** A fence to prevent fusion of ops by llvm. Designed for floats, but we
+     * abuse it to prevent shufflevector fusion too. */
+    llvm::Value *optimization_fence(llvm::Value *);
+
     /** Description of an intrinsic function overload. Overloads are resolved
      * using both argument and return types. The scalar types of the arguments
      * and return type must match exactly for an overload resolution to succeed. */
@@ -523,8 +527,6 @@ class CodeGen_LLVM : public IRVisitor {
     /** Shorthand for shuffling a single vector. */
     llvm::Value *shuffle_vectors(llvm::Value *v, const std::vector<int> &indices);
 
-    bool is_power_of_two(int x) const;
-
     bool is_scalable_vector(llvm::Value *v) const;
 
     /** Go looking for a vector version of a runtime function. Will