Arm backend: fix the quantized scalar remainder issue (pytorch#18401)

xingguo01 · web-flow · commit 36a1952c5017 · 2026-03-23T16:19:03.000Z
- Keep scalar remainder opaque through quantization and lower it through the LUT/table path, to avoid the inaccurate div/floor/mul/sub decomposition in INT mode.
- Keep tensor-tensor INT remainder as xfail.

cc @digantdesai @freddan80 @per @zingo @oscarandersson8218 @mansnils @Sebastian-Larsson @robell

Signed-off-by: Xingguo Li <xingguo.li@arm.com>
diff --git a/backends/arm/_passes/decompose_remainder_pass.py b/backends/arm/_passes/decompose_remainder_pass.py
@@ -1,4 +1,4 @@
-# Copyright 2025 Arm Limited and/or its affiliates.
+# Copyright 2025-2026 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -59,6 +59,15 @@ def call_operator(self, op, args, kwargs, meta, updated=False):
         )
         if op not in supported_ops:
             return super().call_operator(op, args, kwargs, meta, updated)
+        # Keep scalar remainder opaque during transform-for-annotation so the
+        # quantizer can wrap the original op directly. In the backend pipeline,
+        # also preserve quantized scalar remainder so InsertTableOpsPass can
+        # lower it as a lookup table instead of expanding to div/floor/mul/sub.
+        if op in (
+            exir_ops.edge.aten.remainder.Scalar,
+            torch.ops.aten.remainder.Scalar,
+        ) and (self.is_tfa_pass or self._is_quantized_meta(meta)):
+            return super().call_operator(op, args, kwargs, meta, updated)
 
         div_op, mul_op, sub_op = _decomposition_ops[op]
         x, y = args[0], args[1]
diff --git a/backends/arm/_passes/insert_table_ops.py b/backends/arm/_passes/insert_table_ops.py
@@ -64,6 +64,7 @@ class TableOps:
         exir_ops.edge.aten.pow.Tensor_Scalar,
         exir_ops.edge.aten.gelu.default,
         exir_ops.edge.aten.elu.default,
+        exir_ops.edge.aten.remainder.Scalar,
     }
 
     def __init__(self, exported_program: ExportedProgram):
@@ -102,6 +103,9 @@ def __getitem__(self, node: Node):
                     return lambda x: torch.nn.functional.elu(
                         x, alpha=input_alpha
                     ).flatten()
+                case exir_ops.edge.aten.remainder.Scalar:
+                    divisor = cast(float | int, node.args[1])
+                    return lambda x: torch.remainder(x, divisor).flatten()
                 case _:
                     # Op must be handled if it's inside self.special_ops
                     raise AssertionError("Unhandled table operation")
diff --git a/backends/arm/_passes/replace_scalar_with_tensor_pass.py b/backends/arm/_passes/replace_scalar_with_tensor_pass.py
@@ -1,4 +1,4 @@
-# Copyright 2025 Arm Limited and/or its affiliates.
+# Copyright 2025-2026 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -79,6 +79,11 @@
     _fp_profile_ops | _int_profile_ops
 )
 
+_preserve_in_tfa = {
+    torch.ops.aten.remainder.Scalar,
+    exir_ops.edge.aten.remainder.Scalar,
+}
+
 
 class ReplaceScalarWithTensorByProfilePass(ArmPass, ReplaceScalarWithTensorArgPass):
     """Profile-aware scalar-to-tensor replacement pass for binary ops."""
@@ -94,6 +99,9 @@ def __init__(self, tfa_pass=False, *args, **kwargs):
         super().__init__(tfa_pass, _all_ops, *args, **kwargs)
 
     def call_operator(self, op, args, kwargs, meta):
+        if self.is_tfa_pass and op in _preserve_in_tfa:
+            return ExportPass.call_operator(self, op, args, kwargs, meta)
+
         tosa_spec = get_context_spec()
 
         included_ops = {}
@@ -108,7 +116,7 @@ def call_operator(self, op, args, kwargs, meta):
         if op in TableOps.included_ops():
             # Do not handle quantized table ops; forward unchanged.
             input_qparams = meta.data.get("input_qparams", {})
-            output_qparams = meta.data.get("input_qparams", {})
+            output_qparams = meta.data.get("output_qparams", {})
             if len(input_qparams) > 0 and len(output_qparams) > 0:
                 # Do not handle; forward unchanged.
                 return ExportPass.call_operator(self, op, args, kwargs, meta)
diff --git a/backends/arm/operator_support/tosa_profile_supported_op_lists.py b/backends/arm/operator_support/tosa_profile_supported_op_lists.py
@@ -80,6 +80,7 @@
     exir_ops.edge.aten.repeat.default,
     exir_ops.edge.aten.reciprocal.default,
     exir_ops.edge.aten.relu.default,
+    exir_ops.edge.aten.remainder.Scalar,
     exir_ops.edge.aten.remainder.Tensor,
     exir_ops.edge.aten.rsqrt.default,
     exir_ops.edge.aten.select_copy.int,
diff --git a/backends/arm/quantizer/quantization_annotator.py b/backends/arm/quantizer/quantization_annotator.py
@@ -507,6 +507,7 @@ def _match_pattern(
     torch.ops.aten.asinh.default,
     torch.ops.aten.cosh.default,
     torch.ops.aten.cumsum.default,
+    torch.ops.aten.remainder.Scalar,
     torch.ops.aten.tan.default,
 }
 
diff --git a/backends/arm/test/ops/test_remainder.py b/backends/arm/test/ops/test_remainder.py
@@ -29,6 +29,7 @@ class Remainder(torch.nn.Module):
     exir_op_tensor = "executorch_exir_dialects_edge__ops_aten_remainder_Tensor"
     aten_op_scalar = "torch.ops.aten.remainder.Scalar"
     exir_op_scalar = "executorch_exir_dialects_edge__ops_aten_remainder_Scalar"
+    lowered_exir_ops = [exir_op_scalar, exir_op_tensor]
 
     test_cases_tensor = {
         "rank2_tensors": lambda: (
@@ -97,18 +98,14 @@ def test_remainder_tensor_tosa_INT(test_data):
     pipeline.run()
 
 
-@common.parametrize(
-    "test_data",
-    Remainder.test_cases_scalar,
-    xfails={
-        "scalar_pos": "MLETORCH-1832 - Quantized remainder with scalar divisor produces incorrect results for certain inputs"
-    },
-)
+@common.parametrize("test_data", Remainder.test_cases_scalar)
 def test_remainder_scalar_tosa_INT(test_data):
     pipeline = TosaPipelineINT[Remainder.input_t](
         Remainder(),
         test_data(),
         [],
+        Remainder.lowered_exir_ops,
+        frobenius_threshold=0.4,
     )
     pipeline.run()
 
@@ -131,6 +128,7 @@ def test_remainder_scalar_u55_INT(test_data):
         Remainder(),
         test_data(),
         [],
+        Remainder.lowered_exir_ops,
     )
     pipeline.run()
 
@@ -153,6 +151,7 @@ def test_remainder_scalar_u85_INT(test_data):
         Remainder(),
         test_data(),
         [],
+        Remainder.lowered_exir_ops,
     )
     pipeline.run()
 

Original file line number	Diff line number	Diff line change
`@@ -507,6 +507,7 @@ def _match_pattern(`
`507`	`507`	`torch.ops.aten.asinh.default,`
`508`	`508`	`torch.ops.aten.cosh.default,`
`509`	`509`	`torch.ops.aten.cumsum.default,`
	`510`	`+ torch.ops.aten.remainder.Scalar,`
`510`	`511`	`torch.ops.aten.tan.default,`
`511`	`512`	`}`
`512`	`513`