Skip to content

Commit 0cc005a

Browse files
mcremon-metameta-codesync[bot]
authored and committed
Add dedicated HiFi kernel for max pool 2d (#18240)
Summary: Pull Request resolved: #18240 As titled. Calls into nnlib directly. Differential Revision: D96874522 Reviewed By: hsharma35
1 parent bdd6080 commit 0cc005a

8 files changed

Lines changed: 187 additions & 25 deletions

File tree

backends/cadence/aot/ops_registrations.py

Lines changed: 20 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -2292,27 +2292,27 @@ def quantized_max_pool2d_nchw_meta(
22922292
dilation: list[int],
22932293
ceil_mode: bool,
22942294
) -> torch.Tensor:
2295-
assert len(kernel_size) == 2, f"kernel_size must have 2 elements, got {len(kernel_size)}"
2295+
assert (
2296+
len(kernel_size) == 2
2297+
), f"kernel_size must have 2 elements, got {len(kernel_size)}"
22962298
assert len(stride) == 2, f"stride must have 2 elements, got {len(stride)}"
22972299
assert len(padding) == 2, f"padding must have 2 elements, got {len(padding)}"
22982300
assert len(dilation) == 2, f"dilation must have 2 elements, got {len(dilation)}"
2299-
assert len(input.size()) == 4, f"input must be 4D (N, C, H, W), got {len(input.size())}D"
2301+
assert (
2302+
len(input.size()) == 4
2303+
), f"input must be 4D (N, C, H, W), got {len(input.size())}D"
23002304

23012305
batch = input.size(0)
23022306
channels = input.size(1)
23032307
height_in = input.size(2)
23042308
width_in = input.size(3)
23052309

23062310
height_out_raw = (
2307-
(height_in + 2 * padding[0] - dilation[0] * (kernel_size[0] - 1) - 1)
2308-
/ stride[0]
2309-
+ 1
2310-
)
2311+
height_in + 2 * padding[0] - dilation[0] * (kernel_size[0] - 1) - 1
2312+
) / stride[0] + 1
23112313
width_out_raw = (
2312-
(width_in + 2 * padding[1] - dilation[1] * (kernel_size[1] - 1) - 1)
2313-
/ stride[1]
2314-
+ 1
2315-
)
2314+
width_in + 2 * padding[1] - dilation[1] * (kernel_size[1] - 1) - 1
2315+
) / stride[1] + 1
23162316

23172317
if ceil_mode:
23182318
height_out = ceil(height_out_raw)
@@ -2333,27 +2333,27 @@ def quantized_max_pool2d_nhwc_meta(
23332333
dilation: list[int],
23342334
ceil_mode: bool,
23352335
) -> torch.Tensor:
2336-
assert len(kernel_size) == 2, f"kernel_size must have 2 elements, got {len(kernel_size)}"
2336+
assert (
2337+
len(kernel_size) == 2
2338+
), f"kernel_size must have 2 elements, got {len(kernel_size)}"
23372339
assert len(stride) == 2, f"stride must have 2 elements, got {len(stride)}"
23382340
assert len(padding) == 2, f"padding must have 2 elements, got {len(padding)}"
23392341
assert len(dilation) == 2, f"dilation must have 2 elements, got {len(dilation)}"
2340-
assert len(input.size()) == 4, f"input must be 4D (N, H, W, C), got {len(input.size())}D"
2342+
assert (
2343+
len(input.size()) == 4
2344+
), f"input must be 4D (N, H, W, C), got {len(input.size())}D"
23412345

23422346
batch = input.size(0)
23432347
height_in = input.size(1)
23442348
width_in = input.size(2)
23452349
channels = input.size(3)
23462350

23472351
height_out_raw = (
2348-
(height_in + 2 * padding[0] - dilation[0] * (kernel_size[0] - 1) - 1)
2349-
/ stride[0]
2350-
+ 1
2351-
)
2352+
height_in + 2 * padding[0] - dilation[0] * (kernel_size[0] - 1) - 1
2353+
) / stride[0] + 1
23522354
width_out_raw = (
2353-
(width_in + 2 * padding[1] - dilation[1] * (kernel_size[1] - 1) - 1)
2354-
/ stride[1]
2355-
+ 1
2356-
)
2355+
width_in + 2 * padding[1] - dilation[1] * (kernel_size[1] - 1) - 1
2356+
) / stride[1] + 1
23572357

23582358
if ceil_mode:
23592359
height_out = ceil(height_out_raw)

backends/cadence/aot/quantizer/fusion_pass.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -739,7 +739,9 @@ def call(self, graph_module: fx.GraphModule) -> PassResult: # noqa: C901
739739
dequants_biases,
740740
op_node,
741741
)
742-
elif isinstance(pattern, (MaxPool2dPattern, MaxPool2dWithoutIndicesPattern)):
742+
elif isinstance(
743+
pattern, (MaxPool2dPattern, MaxPool2dWithoutIndicesPattern)
744+
):
743745
args, kwargs = get_args_and_kwargs_max_pool2d(
744746
inputs_inputs,
745747
op_node,

backends/cadence/aot/quantizer/patterns.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -503,6 +503,7 @@ def replacement_op(self) -> OpOverload:
503503

504504
# This is a base class for ReLU
505505

506+
506507
# This is a base class for ReLU, since it can be used with two different aten ops
507508
class ReluBasePattern(QuantizationPattern):
508509
@abstractmethod

backends/cadence/aot/tests/test_quantizer_ops.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -505,7 +505,9 @@ def _build_max_pool2d_graph(self) -> tuple[torch.fx.GraphModule, torch.fx.Node]:
505505
target=torch.ops.aten.max_pool2d_with_indices.default,
506506
)
507507
self.assertEqual(
508-
len(max_pool_nodes), 1, "Should find exactly one max_pool2d_with_indices node"
508+
len(max_pool_nodes),
509+
1,
510+
"Should find exactly one max_pool2d_with_indices node",
509511
)
510512
return gm, max_pool_nodes[0]
511513

backends/cadence/aot/tests/test_replace_ops_passes.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2599,9 +2599,7 @@ def test_replace_max_pool2d_nchw_with_nhwc(self) -> None:
25992599
self.assertEqual(
26002600
count_node(gm, exir_ops.edge.cadence.quantized_max_pool2d_nchw.default), 1
26012601
)
2602-
self.assertEqual(
2603-
count_node(gm, exir_ops.edge.aten.permute_copy.default), 0
2604-
)
2602+
self.assertEqual(count_node(gm, exir_ops.edge.aten.permute_copy.default), 0)
26052603

26062604
# Deepcopy before the pass
26072605
original = copy.deepcopy(gm)

backends/cadence/generic/operators/op_quantized_max_pool2d_nhwc.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
#include <algorithm>
1212
#include <cstdint>
13+
#include <cstring>
1314
#include <limits>
1415

1516
#include <executorch/backends/cadence/generic/operators/cadence_type_util.h>
Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <executorch/backends/cadence/hifi/kernels/kernels.h>
#include <executorch/runtime/kernel/kernel_includes.h>
#include <xa_nnlib_kernels_api.h>

namespace impl {
namespace HiFi {
namespace native {

using ::executorch::aten::IntArrayRef;
using ::executorch::aten::ScalarType;
using ::executorch::aten::Tensor;
using ::executorch::runtime::KernelRuntimeContext;

// Dedicated HiFi max-pool-2d kernel for quantized NHWC tensors.
// Dispatches directly into the Cadence nnlib maxpool routines
// (xa_nn_maxpool_8 for int8, xa_nn_maxpool_asym8 for uint8), processing
// one batch at a time. `output` must already be sized to the expected
// pooled shape [N, H_out, W_out, C]; it is returned for chaining.
Tensor& quantized_max_pool2d_nhwc_out(
    KernelRuntimeContext& ctx,
    const Tensor& input,
    IntArrayRef kernel_size,
    IntArrayRef stride,
    IntArrayRef padding,
    IntArrayRef dilation,
    bool ceil_mode,
    Tensor& output) {
  // NHWC layout: [N, H, W, C]
  const int32_t batch_size = input.size(0);
  const int32_t in_height = input.size(1);
  const int32_t in_width = input.size(2);
  const int32_t channels = input.size(3);

  // Output spatial extents come straight from the pre-sized output tensor,
  // so ceil_mode is already baked into them by the meta/shape function.
  const int32_t out_height = output.size(1);
  const int32_t out_width = output.size(2);

  const int32_t kernel_h = kernel_size[0];
  const int32_t kernel_w = kernel_size[1];
  const int32_t stride_h = stride[0];
  const int32_t stride_w = stride[1];
  const int32_t pad_h = padding[0];
  const int32_t pad_w = padding[1];

  // NOTE(review): `dilation` and `ceil_mode` are not forwarded to nnlib —
  // the xa_nn_maxpool API takes no dilation argument and the output size
  // encodes ceil_mode. This silently assumes dilation == [1, 1]; confirm
  // that upstream lowering guarantees it.
  (void)dilation;
  (void)ceil_mode;

  // Map the tensor dtype onto the nnlib precision constants.
  ScalarType dtype = input.scalar_type();
  int32_t nnlib_precision;
  switch (dtype) {
    case ScalarType::Char: // int8
      nnlib_precision = PREC_SYM8S;
      break;
    case ScalarType::Byte: // uint8
      nnlib_precision = PREC_ASYM8U;
      break;
    default:
      ET_DCHECK_MSG(
          false,
          "Unsupported dtype %s for HiFi quantized_max_pool2d_nhwc",
          torch::executor::toString(dtype));
      return output;
  }

  // Compute scratch buffer size for the nnlib maxpool; a negative return
  // indicates an invalid parameter combination.
  int32_t scratch_size = xa_nn_maxpool_getsize(
      channels,
      nnlib_precision,
      nnlib_precision,
      in_height,
      in_width,
      kernel_h,
      kernel_w,
      stride_w, // x_stride
      stride_h, // y_stride
      pad_w, // x_padding
      pad_h, // y_padding
      out_height,
      out_width,
      0, // inp_data_format: 0 = NHWC
      0); // out_data_format: 0 = NHWC
  ET_DCHECK_MSG(scratch_size >= 0, "xa_nn_maxpool_getsize failed");

  // Allocate aligned scratch memory. Guard against a failed allocation:
  // nnlib would otherwise write through a null scratch pointer.
  void* p_scratch = kernels::allocate_temp_memory(ctx, scratch_size);
  ET_DCHECK_MSG(
      p_scratch != nullptr || scratch_size == 0,
      "Failed to allocate %d bytes of scratch memory",
      scratch_size);

  // Process each batch using the nnlib optimized maxpool kernel. Per-batch
  // offsets are in elements (NHWC is contiguous per batch).
  for (int32_t n = 0; n < batch_size; ++n) {
    const int32_t spatial_size = in_height * in_width * channels;
    const int32_t out_spatial_size = out_height * out_width * channels;

    int32_t ret;
    if (dtype == ScalarType::Char) {
      const int8_t* in_batch =
          input.const_data_ptr<int8_t>() + n * spatial_size;
      int8_t* out_batch =
          output.mutable_data_ptr<int8_t>() + n * out_spatial_size;

      ret = xa_nn_maxpool_8(
          out_batch,
          in_batch,
          in_height,
          in_width,
          channels,
          kernel_h,
          kernel_w,
          stride_w, // x_stride
          stride_h, // y_stride
          pad_w, // x_padding
          pad_h, // y_padding
          out_height,
          out_width,
          0, // inp_data_format: NHWC
          0, // out_data_format: NHWC
          p_scratch);
    } else {
      const uint8_t* in_batch =
          input.const_data_ptr<uint8_t>() + n * spatial_size;
      uint8_t* out_batch =
          output.mutable_data_ptr<uint8_t>() + n * out_spatial_size;

      ret = xa_nn_maxpool_asym8(
          out_batch,
          in_batch,
          in_height,
          in_width,
          channels,
          kernel_h,
          kernel_w,
          stride_w, // x_stride
          stride_h, // y_stride
          pad_w, // x_padding
          pad_h, // y_padding
          out_height,
          out_width,
          0, // inp_data_format: NHWC
          0, // out_data_format: NHWC
          p_scratch);
    }
    ET_DCHECK_MSG(ret == 0, "HiFi xa_nn_maxpool failed");
  }

  return output;
}

} // namespace native
} // namespace HiFi
} // namespace impl

backends/cadence/hifi/operators/targets.bzl

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -632,6 +632,16 @@ def define_common_targets():
632632
compatible_with = ["ovr_config//cpu:xtensa"],
633633
)
634634

635+
runtime.cxx_library(
636+
name = "op_quantized_max_pool2d_nhwc",
637+
srcs = ["op_quantized_max_pool2d_nhwc.cpp"],
638+
exported_headers = ["operators.h"],
639+
platforms = CXX,
640+
deps = COMMON_DEPS,
641+
visibility = ["PUBLIC"],
642+
compatible_with = ["ovr_config//cpu:xtensa"],
643+
)
644+
635645
runtime.cxx_library(
636646
name = "op_quantized_relu_asym8s_asym8s_per_tensor_out",
637647
srcs = ["op_quantized_relu_asym8s_asym8s_per_tensor_out.cpp"],

0 commit comments

Comments
 (0)