[ez][ET-VK][q8ta_conv2d_pw] Halve accumulator to lift Adreno occupancy

ssjia · SS-JIA · commit 1ee58ed3b7da · 2026-05-09T02:08:46.000-04:00
Pull Request resolved: pytorch#19396. The pointwise quantized conv shader allocated ivec4 out_accum[4][2] = 32 int32 accumulators per thread, which on Adreno 740 pinned 28 full-precision registers per thread and capped ALU fiber occupancy at 37%. AOC reported 26.7% exposed long-latency stalls, evidence that occupancy was too low to hide texture and SSBO latency. Halve the accumulator to 16 int32 values by reducing TILE_N4 from 2 to 1 (each thread now covers 4 widths × 4 output channels = a single 4×4 output block). The compensating dispatch change is in pick_q8ta_conv2d_pw_global_wg_size: global_wg.x doubles since each thread covers half as many output channel blocks as before. Each thread still loads 1 input ivec4 (4 widths) per K-iter, preserving the natural int8x4 packing alignment, so arithmetic intensity (AI) drops only 25% (2.67 → 2.0 MAC/B, in contrast to the variant where TILE_M is halved, which drops AI by 50%). ghstack-source-id: 379519735 @exported-using-ghexport Differential Revision: [D103770023](https://our.internmc.facebook.com/intern/diff/D103770023/)
diff --git a/backends/vulkan/runtime/graph/ops/glsl/q8ta_conv2d_pw.glsl b/backends/vulkan/runtime/graph/ops/glsl/q8ta_conv2d_pw.glsl
@@ -22,16 +22,18 @@ $if USE_INT8_DOT_PRODUCT_EXT == 1:
 
 ${define_active_storage_type("buffer")}
 
+// Each thread computes a TILE_M (width) x TILE_N (output channel) output block,
+// using an int32 accumulator tile.
 // corresponds to input/output width dim
 #define TILE_M4 1
 // corresponds to input channels dim
 #define TILE_K4 1
 // corresponds to output channels dim
-#define TILE_N4 2
+#define TILE_N4 1
 
 #define TILE_M 4
 #define TILE_K 4
-#define TILE_N 8
+#define TILE_N 4
 
 layout(std430) buffer;
 
@@ -86,9 +88,9 @@ int compute_outp_buffer_idx(
 }
 
 void main() {
-  // Thread mapping: each thread handles TILE_M (4) widths × TILE_N (8) output channels
-  // gl_GlobalInvocationID.x → output channel blocks (TILE_N4 = 2 blocks of 4 channels)
-  // gl_GlobalInvocationID.y → width blocks (TILE_M4 = 1 block of 4 widths)
+  // Thread mapping: each thread handles TILE_M widths x TILE_N output channels.
+  // gl_GlobalInvocationID.x -> output channel blocks.
+  // gl_GlobalInvocationID.y -> width blocks.
   // gl_GlobalInvocationID.z → batch (or height * batch combined)
   const int oc_block_idx = int(gl_GlobalInvocationID.x) * TILE_N4;
   const int ow_block_idx = int(gl_GlobalInvocationID.y) * TILE_M4;
@@ -137,11 +139,11 @@ void main() {
 
   // Main accumulation loop over K dimension
   for (int k4 = 0; k4 < K4_per_group; k4++) {
-    // Load packed int8 input tile (TILE_M4=1, TILE_K4=1)
+    // Load the packed int8 input tile for the current width and K sub-block.
     // Each int contains 4 packed int8s (one per width position in the tile)
     ivec4 int8_input_tile = t_packed_int8_input[input_idx];
 
-    // Load int8 weight tile (TILE_K4=1, TILE_N4=2)
+    // Load the int8 weight tile for the current K and output-channel sub-block.
     ivec4 int8_weight_tile[TILE_N4];
     [[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) {
       int8_weight_tile[n4] = texelFetch(
diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dPW.cpp b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dPW.cpp
@@ -33,19 +33,17 @@ utils::uvec3 pick_q8ta_conv2d_pw_global_wg_size(
   const uint32_t H = graph->size_at<uint32_t>(-2, output);
   const uint32_t C = graph->size_at<uint32_t>(-3, output);
 
-  // The 4W4C shader processes tiles of:
-  // - TILE_N4=2 groups of 4 output channels (8 channels per thread)
-  // - TILE_M4=1 groups of 4 widths (4 widths per thread)
-  // - 1 height per thread
-  constexpr uint32_t TILE_N4 = 2;
+  // Each thread covers a 4-width x 4-channel output block.
+  // Tile constants must match TILE_M4 / TILE_N4 in q8ta_conv2d_pw.glsl.
+  constexpr uint32_t TILE_N4 = 1;
   constexpr uint32_t TILE_M4 = 1;
 
   const uint32_t C4 = utils::div_up_4(C);
   const uint32_t W4 = utils::div_up_4(W);
 
   // Global workgroup size:
-  // x = output channels / (TILE_N4 * 4) = C4 / TILE_N4
-  // y = width / (TILE_M4 * 4) = W4 / TILE_M4
+  // x = output channels / (TILE_N4 * 4) = C4 / TILE_N4 = C4
+  // y = width / (TILE_M4 * 4) = W4 / TILE_M4 = W4
   // z = height
   return {utils::div_up(C4, TILE_N4), utils::div_up(W4, TILE_M4), H};
 }