Arm backend: Improve permute fusion over elementwise ops (pytorch#19451)

AdrianLundell · web-flow · commit 5363438aa13a · 2026-05-11T15:56:56.000+02:00
Adds handling of all ops handled by the insert_table_ops pass:
- For FP, add all ops to remove_permutes_around_elementwise_tosa_ops
- For INT, ensure that the tosa.TABLE op is treated properly


Signed-off-by: Adrian Lundell <adrian.lundell@arm.com>
diff --git a/backends/arm/_passes/insert_table_ops.py b/backends/arm/_passes/insert_table_ops.py
@@ -278,11 +278,12 @@ def call(self, graph_module: GraphModule) -> PassResult:
                     out_quantargs=output_qparams[0],
                 )
                 # Register buffer in self.exported_program.state_dict
+                # b_ prefix is important to be recognized as a constant in RemovePermutesAroundElementwiseOps
                 const_table_node = create_constant_placeholder(
                     exp_program=self.exported_program,
                     graph=node.graph,
                     kind=InputKind.BUFFER,
-                    name=node.name + "_table_constant",
+                    name="b_" + node.name + "_table_constant",
                     data=buffer,
                     persistent_buffer=True,
                 )
diff --git a/backends/arm/_passes/remove_permutes_around_elementwise_tosa_ops.py b/backends/arm/_passes/remove_permutes_around_elementwise_tosa_ops.py
@@ -3,6 +3,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+from executorch.backends.arm._passes.insert_table_ops import TableOps
 from executorch.backends.transforms.remove_permutes_around_elementwise_ops import (
     RemovePermutesAroundElementwiseOps,
 )
@@ -12,6 +13,21 @@
 class RemovePermutesAroundElementwiseTosaOps(RemovePermutesAroundElementwiseOps):
     permutable_ops = {
         *RemovePermutesAroundElementwiseOps.permutable_ops,
+        *TableOps.unary_table_ops.keys(),
+        *TableOps.special_table_ops,
         exir_ops.backend.tosa.RESCALE.default,
         exir_ops.backend.tosa.TABLE.default,
     }
+
+    def permute_subgraph(self, subgraph):
+        # Original function will always permute constant nodes which is wrong for table ops
+        # Remove constant tosa.TABLE edges before running full function
+        new_constant_edges_in = set()
+        for const_node, user_node in subgraph.constant_edges_in:
+            if user_node.target == exir_ops.backend.tosa.TABLE.default:
+                continue
+            else:
+                new_constant_edges_in.add((const_node, user_node))
+
+        subgraph.constant_edges_in = new_constant_edges_in
+        super().permute_subgraph(subgraph)
diff --git a/backends/arm/test/misc/test_transpose_counts.py b/backends/arm/test/misc/test_transpose_counts.py
@@ -9,7 +9,10 @@
 import torch
 
 from executorch.backends.arm.test import common
-from executorch.backends.arm.test.tester.test_pipeline import TosaPipelineFP
+from executorch.backends.arm.test.tester.test_pipeline import (
+    TosaPipelineFP,
+    TosaPipelineINT,
+)
 
 
 InputT = Tuple[Any, ...]
@@ -330,6 +333,17 @@ def forward(self, x):
         return torch.cat((a, b), dim=-1)
 
 
+class PermuteSiluPermute(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.silu = torch.nn.SiLU()
+
+    def forward(self, x: torch.Tensor):
+        x = torch.permute(x, [0, 2, 3, 1])
+        x = self.silu(x)
+        return torch.permute(x, [0, 3, 1, 2])
+
+
 cases = {
     "conv1d_rank2": TransposeCountCase(Conv1dModule(), (torch.randn(2, 8),), 2),
     "conv1d_rank3": TransposeCountCase(Conv1dModule(), (torch.randn(1, 2, 8),), 2),
@@ -458,6 +472,14 @@ def forward(self, x):
     ),
 }
 
+cases_int = {
+    "permute_silu_permute": TransposeCountCase(
+        PermuteSiluPermute(),
+        (torch.randn(1, 2, 3, 4),),
+        0,
+    ),
+}
+
 
 cases_channels_last = {
     "conv2d_rank4_channels_last": TransposeCountCase(
@@ -531,13 +553,20 @@ def forward(self, x):
 }
 
 
-@common.parametrize("case", cases)
+@common.parametrize("case", cases | cases_int)
 def test_transpose_counts_tosa_FP(case: TransposeCountCase) -> None:
     pipeline = TosaPipelineFP[InputT](case.module, case.inputs, aten_op=[])
     pipeline.count_tosa_ops({"TRANSPOSE": case.expected_transposes})
     pipeline.run()
 
 
+@common.parametrize("case", cases_int)
+def test_transpose_counts_tosa_INT(case: TransposeCountCase) -> None:
+    pipeline = TosaPipelineINT[InputT](case.module, case.inputs, aten_op=[])
+    pipeline.count_tosa_ops({"TRANSPOSE": case.expected_transposes})
+    pipeline.run()
+
+
 xfails = {
     "conv3d_rank5_channels_last": "Numerical error",
     "views_channels_last": "Torch.export: View not supported by torch.export in channels last format",

Original file line number	Diff line number	Diff line change
`@@ -278,11 +278,12 @@ def call(self, graph_module: GraphModule) -> PassResult:`
`278`	`278`	`out_quantargs=output_qparams[0],`
`279`	`279`	`)`
`280`	`280`	`# Register buffer in self.exported_program.state_dict`
	`281`	`+ # b_ prefix is important to be recognized as a constant in RemovePermutesAroundElementwiseOps`
`281`	`282`	`const_table_node = create_constant_placeholder(`
`282`	`283`	`exp_program=self.exported_program,`
`283`	`284`	`graph=node.graph,`
`284`	`285`	`kind=InputKind.BUFFER,`
`285`		`- name=node.name + "_table_constant",`
	`286`	`+ name="b_" + node.name + "_table_constant",`
`286`	`287`	`data=buffer,`
`287`	`288`	`persistent_buffer=True,`
`288`	`289`	`)`