Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 42 additions & 31 deletions modelopt/onnx/export/fp8_exporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
import torch
from onnx_graphsurgeon.ir.tensor import LazyValues

from modelopt.onnx.utils import is_fp8_constant

from .base_exporter import ONNXQuantExporter


Expand Down Expand Up @@ -61,37 +63,46 @@ def compress_weights(onnx_model: onnx.ModelProto) -> onnx.ModelProto:
graph.cleanup().toposort().fold_constants().cleanup()

for node in graph.nodes:
if node.op == "TRT_FP8QuantizeLinear":
# Should not remove input QDQ
if not isinstance(node.inputs[0], gs.Constant):
continue

weights = node.inputs[0]
scale = node.inputs[1]
torch_weights = torch.from_numpy(weights.values)
torch_scale = torch.from_numpy(scale.values)
quantizer_name = scale.name.rsplit("/", 1)[0]
dq_op = node.outputs[0].outputs[0]
assert dq_op.op == "TRT_FP8DequantizeLinear", (
f"QDQ does not occur in pairs. You reached {dq_op.op}"
)

# Replace it with Dequantize with FP8 weights. This is a WAR because numpy does not support fp8.
numpy_weights = (
(torch_weights / torch_scale).to(torch.float8_e4m3fn).view(torch.uint8).numpy()
)
tensor = onnx.TensorProto()
tensor.data_type = onnx.TensorProto.FLOAT8E4M3FN
tensor.dims.extend(numpy_weights.shape)
tensor.raw_data = numpy_weights.tobytes()
values = LazyValues(tensor)
onnx_weights_fp8 = gs.Constant(quantizer_name + "/fp8_weights", values)

node.outputs.clear()
# DQ Op is separated out
dq_op.inputs[0] = onnx_weights_fp8
dq_op.op = "DequantizeLinear"
dq_op.outputs[0].dtype = dq_op.inputs[1].dtype
is_trt_fp8_q = node.op == "TRT_FP8QuantizeLinear"
is_std_fp8_q = (
node.op == "QuantizeLinear"
and len(node.inputs) >= 3
and isinstance(node.inputs[2], gs.Constant)
and is_fp8_constant(node.inputs[2])
)
if not (is_trt_fp8_q or is_std_fp8_q):
continue

# Should not remove input QDQ
if not isinstance(node.inputs[0], gs.Constant):
continue

weights = node.inputs[0]
scale = node.inputs[1]
torch_weights = torch.from_numpy(weights.values)
torch_scale = torch.from_numpy(scale.values)
quantizer_name = scale.name.rsplit("/", 1)[0]
dq_op = node.outputs[0].outputs[0]
assert dq_op.op in ("TRT_FP8DequantizeLinear", "DequantizeLinear"), (
f"QDQ does not occur in pairs. You reached {dq_op.op}"
)

# Replace it with Dequantize with FP8 weights. This is a WAR because numpy does not support fp8.
numpy_weights = (
(torch_weights / torch_scale).to(torch.float8_e4m3fn).view(torch.uint8).numpy()
)
tensor = onnx.TensorProto()
tensor.data_type = onnx.TensorProto.FLOAT8E4M3FN
tensor.dims.extend(numpy_weights.shape)
tensor.raw_data = numpy_weights.tobytes()
values = LazyValues(tensor)
onnx_weights_fp8 = gs.Constant(quantizer_name + "/fp8_weights", values)

node.outputs.clear()
# DQ Op is separated out
dq_op.inputs[0] = onnx_weights_fp8
dq_op.op = "DequantizeLinear"
dq_op.outputs[0].dtype = dq_op.inputs[1].dtype

graph.cleanup().toposort()
end_time = time.time()
Expand Down
73 changes: 42 additions & 31 deletions modelopt/onnx/llm_export_utils/surgeon_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@
import torch
from onnx_graphsurgeon.ir.tensor import LazyValues

from modelopt.onnx.utils import is_fp8_constant


def clear_inputs(node: gs.Node | gs.Tensor):
"""Clear all inputs for a node or tensor in ONNX."""
Expand Down Expand Up @@ -81,37 +83,46 @@ def fold_fp8_qdq_to_dq(graph: gs.Graph):
graph.cleanup().toposort().fold_constants().cleanup()

for node in graph.nodes:
if node.op == "TRT_FP8QuantizeLinear":
# Should not remove input QDQ
if not isinstance(node.inputs[0], gs.Constant):
continue

weights = node.inputs[0]
scale = node.inputs[1]
torch_weights = torch.from_numpy(weights.values)
torch_scale = torch.from_numpy(scale.values)
quantizer_name = scale.name.rsplit("/", 1)[0]
dq_op = node.outputs[0].outputs[0]
assert dq_op.op == "TRT_FP8DequantizeLinear", (
f"QDQ does not occur in pairs. You reached {dq_op.op}"
)

# Replace it with Dequantize with FP8 weights. This is a WAR because numpy does not support fp8.
numpy_weights = (
(torch_weights / torch_scale).to(torch.float8_e4m3fn).view(torch.uint8).numpy()
)
tensor = onnx.TensorProto()
tensor.data_type = onnx.TensorProto.FLOAT8E4M3FN
tensor.dims.extend(numpy_weights.shape)
tensor.raw_data = numpy_weights.tobytes()
values = LazyValues(tensor)
onnx_weights_fp8 = gs.Constant(quantizer_name + "/fp8_weights", values)

node.outputs.clear()
# DQ Op is separated out
dq_op.inputs[0] = onnx_weights_fp8
dq_op.op = "DequantizeLinear"
dq_op.outputs[0].dtype = dq_op.inputs[1].dtype
is_trt_fp8_q = node.op == "TRT_FP8QuantizeLinear"
is_std_fp8_q = (
node.op == "QuantizeLinear"
and len(node.inputs) >= 3
and isinstance(node.inputs[2], gs.Constant)
and is_fp8_constant(node.inputs[2])
)
if not (is_trt_fp8_q or is_std_fp8_q):
continue

# Should not remove input QDQ
if not isinstance(node.inputs[0], gs.Constant):
continue

weights = node.inputs[0]
scale = node.inputs[1]
torch_weights = torch.from_numpy(weights.values)
torch_scale = torch.from_numpy(scale.values)
quantizer_name = scale.name.rsplit("/", 1)[0]
dq_op = node.outputs[0].outputs[0]
assert dq_op.op in ("TRT_FP8DequantizeLinear", "DequantizeLinear"), (
f"QDQ does not occur in pairs. You reached {dq_op.op}"
)
Comment on lines +105 to +108
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Guard Q output-chain indexing before dereference.

At Line 105, node.outputs[0].outputs[0] can raise IndexError on non-canonical graphs before your pair check at Line 106 runs.

🔧 Proposed fix
-        dq_op = node.outputs[0].outputs[0]
-        assert dq_op.op in ("TRT_FP8DequantizeLinear", "DequantizeLinear"), (
-            f"QDQ does not occur in pairs. You reached {dq_op.op}"
-        )
+        if not node.outputs or not node.outputs[0].outputs:
+            continue
+        dq_op = node.outputs[0].outputs[0]
+        if dq_op.op not in ("TRT_FP8DequantizeLinear", "DequantizeLinear"):
+            continue
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@modelopt/onnx/llm_export_utils/surgeon_utils.py` around lines 105 - 108, The
code dereferences node.outputs[0].outputs[0] into dq_op before checking its
existence, which can raise IndexError; modify the logic in surgeon_utils.py so
you first verify node.outputs is non-empty and node.outputs[0].outputs is
non-empty (e.g., if not node.outputs or not node.outputs[0].outputs: raise a
descriptive error or return) before assigning dq_op, then perform the existing
assert on dq_op.op; reference the variables node and dq_op to locate the fix.


# Replace it with Dequantize with FP8 weights. This is a WAR because numpy does not support fp8.
numpy_weights = (
(torch_weights / torch_scale).to(torch.float8_e4m3fn).view(torch.uint8).numpy()
)
Comment on lines +110 to +113
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
set -euo pipefail

# 1) Locate FP8 QuantizeLinear construction and zero_point setup in exporter paths.
rg -nP --type=py -C3 '\bQuantizeLinear\b|TRT_FP8QuantizeLinear|FLOAT8E4M3FN|zero_point|float8_e4m3fn'

# 2) Locate folding assumptions and any explicit zero_point validation.
rg -nP --type=py -C3 'fold_fp8_qdq_to_dq|is_std_fp8_q|is_fp8_constant|zero_point|axis'

# 3) Check whether tests cover non-zero zero_point / axis cases for FP8 fold/export.
rg -nP --type=py -C3 'test_.*fp8.*(zero_point|axis)|fp8_onnx_export_shape|fold_fp8_qdq_to_dq'

Repository: NVIDIA/Model-Optimizer

Length of output: 50379


🏁 Script executed:

# First, let's read the specific file and lines mentioned in the review
cat -n modelopt/onnx/llm_export_utils/surgeon_utils.py | sed -n '100,125p'

Repository: NVIDIA/Model-Optimizer

Length of output: 1413


🏁 Script executed:

# Search for the function containing this code and zero_point handling
rg -A20 -B5 "Replace it with Dequantize with FP8 weights" modelopt/onnx/llm_export_utils/surgeon_utils.py

Repository: NVIDIA/Model-Optimizer

Length of output: 1146


🏁 Script executed:

# Look for fold_fp8_qdq or similar folding functions
fd surgeon_utils.py -x cat -n {} | head -200

Repository: NVIDIA/Model-Optimizer

Length of output: 5481


🏁 Script executed:

# Search for zero_point handling in the surgeon_utils file and related functions
rg -B10 -A10 "zero_point" modelopt/onnx/llm_export_utils/surgeon_utils.py

Repository: NVIDIA/Model-Optimizer

Length of output: 48


🏁 Script executed:

# Check ONNX standard or comments about FP8 zero_point semantics
rg -C5 "TRT_FP8QuantizeLinear|is_std_fp8_q" modelopt/onnx/ --type=py | head -100

Repository: NVIDIA/Model-Optimizer

Length of output: 3014


🏁 Script executed:

# Look for FP8 quantization logic and zero_point usage
rg -C5 "float8_e4m3fn.*zero|zero.*float8_e4m3fn" modelopt/onnx/ --type=py

Repository: NVIDIA/Model-Optimizer

Length of output: 48


🏁 Script executed:

# Check how FP8 QuantizeLinear nodes are created - what zero_point values are used
rg -B5 -A5 "QuantizeLinear.*fp8|float8.*QuantizeLinear" modelopt/onnx/ --type=py | head -80

Repository: NVIDIA/Model-Optimizer

Length of output: 48


🏁 Script executed:

# Look for zero_point creation/assignment in FP8 contexts
rg "zero_point.*=.*0|zero_point.*fp8|fp8.*zero" modelopt/onnx/ --type=py

Repository: NVIDIA/Model-Optimizer

Length of output: 145


🏁 Script executed:

# Check what the ONNX standard and comments say about FP8 and zero_point
rg -B3 -A3 "FP8|float8_e4m3fn" modelopt/onnx/quantization/fp8.py | head -120

Repository: NVIDIA/Model-Optimizer

Length of output: 2904


🏁 Script executed:

# Look for how zero_point is handled when creating FP8 QuantizeLinear nodes
rg -B10 -A10 "create.*qdq|QuantizeLinear.*zero" modelopt/onnx/quantization/autotune/export_utils.py | head -100

Repository: NVIDIA/Model-Optimizer

Length of output: 2035


🏁 Script executed:

# Check how int8_to_fp8 handles zero_point conversion
rg -B5 -A15 "def int8_to_fp8" modelopt/onnx/quantization/fp8.py

Repository: NVIDIA/Model-Optimizer

Length of output: 797


🏁 Script executed:

# Search for any comments about FP8 and zero_point semantics in the codebase
rg -i "fp8.*zero_point|zero_point.*fp8" modelopt/ --type=py

Repository: NVIDIA/Model-Optimizer

Length of output: 48


🏁 Script executed:

# Look for test files that test FP8 folding with zero_point
fd "test.*fp8|fp8.*test" -t f modelopt/ | xargs rg -l "zero_point|fold_fp8" 2>/dev/null | head -5

Repository: NVIDIA/Model-Optimizer

Length of output: 325


🏁 Script executed:

# Get the full int8_to_fp8 function to see how it handles zero_point
cat -n modelopt/onnx/quantization/fp8.py | sed -n '40,130p'

Repository: NVIDIA/Model-Optimizer

Length of output: 4531


🏁 Script executed:

# Check if there are any tests that verify folding with non-zero zero_point
rg -l "fold_fp8" tests/ --type=py

Repository: NVIDIA/Model-Optimizer

Length of output: 48


🏁 Script executed:

# Look at the actual ONNX spec documentation in comments or code
rg -B5 -A5 "QuantizeLinear.*zero_point|zero_point.*QuantizeLinear" modelopt/onnx/quantization/fp8.py

Repository: NVIDIA/Model-Optimizer

Length of output: 48


🏁 Script executed:

# Look at what zero_point values are used when creating FP8 nodes
rg -B10 -A10 "get_zero_point_for_quant_type" modelopt/onnx/quantization/autotune/export_utils.py

Repository: NVIDIA/Model-Optimizer

Length of output: 1393


🏁 Script executed:

# Check if FP8 quantization ever uses non-zero zero_point
rg -B5 -A5 "quant_type.*fp8|fp8.*quant" modelopt/onnx/quantization/autotune/export_utils.py

Repository: NVIDIA/Model-Optimizer

Length of output: 619


🏁 Script executed:

# Look at the full fold_fp8_qdq_to_dq function and see how zero_point is accessed
cat -n modelopt/onnx/llm_export_utils/surgeon_utils.py | sed -n '66,130p'

Repository: NVIDIA/Model-Optimizer

Length of output: 3096


🏁 Script executed:

# Check if DequantizeLinear with FP8 uses zero_point in its operation
rg -B3 -A3 "DequantizeLinear.*zero_point|zero_point.*DequantizeLinear" modelopt/onnx/ --type=py

Repository: NVIDIA/Model-Optimizer

Length of output: 48


🏁 Script executed:

# Check if FP8 is ever created with non-zero zero_point
rg -B5 -A10 "default_q_zero_point.*fp8|fp8.*default.*zero" modelopt/onnx/ --type=py

Repository: NVIDIA/Model-Optimizer

Length of output: 48


🏁 Script executed:

# Look at tests to see if they verify zero_point handling for FP8
fd "test.*fp8" tests/ --type=py -x grep -l "zero_point" {} \;

Repository: NVIDIA/Model-Optimizer

Length of output: 235


🏁 Script executed:

# Check the ONNX standard comment in create_qdq_nodes for how zero_point is used
cat -n modelopt/onnx/quantization/autotune/export_utils.py | sed -n '149,200p'

Repository: NVIDIA/Model-Optimizer

Length of output: 2278


🏁 Script executed:

# Verify: does FP8 always use zero_point=0 in the export pipeline?
rg -B10 -A10 "export_qdq_onnx.*fp8|fp8.*export_qdq|config.default_quant_type.*fp8" modelopt/onnx/quantization/autotune/ --type=py | head -60

Repository: NVIDIA/Model-Optimizer

Length of output: 2026


🏁 Script executed:

# Check what zero_point is used when creating FP8 QuantizeLinear nodes
cat -n modelopt/onnx/quantization/autotune/export_utils.py | sed -n '215,230p'

Repository: NVIDIA/Model-Optimizer

Length of output: 945


🏁 Script executed:

# Verify how int8_to_fp8 handles zero_point: does it convert INT8 zero_point to FP8?
cat -n modelopt/onnx/quantization/fp8.py | sed -n '98,115p'

Repository: NVIDIA/Model-Optimizer

Length of output: 1146


🏁 Script executed:

# Search for where fold_fp8_qdq_to_dq is called - is there a constraint on zero_point?
rg -B5 -A5 "fold_fp8_qdq_to_dq" modelopt/ --type=py

Repository: NVIDIA/Model-Optimizer

Length of output: 1099


🏁 Script executed:

# Check ONNX QuantizeLinear semantics: does DequantizeLinear with FP8 use zero_point?
rg -B10 -A10 "TRT_FP8Dequantize|FLOAT8E4M3FN.*dequant" modelopt/onnx/ --type=py | head -80

Repository: NVIDIA/Model-Optimizer

Length of output: 3763


🏁 Script executed:

# Check the ONNX DequantizeLinear spec: for FP8, is zero_point used in dequantization?
# Look at what happens with zero_point in DequantizeLinear operations
rg -B5 -A10 "def.*dequantize|DequantizeLinear.*zero" modelopt/onnx/ --type=py | head -100

Repository: NVIDIA/Model-Optimizer

Length of output: 4724


🏁 Script executed:

# Critical: check if FP8 nodes are ALWAYS created with zero_point=0
rg -B3 -A3 "default_quant_type.*fp8" modelopt/onnx/quantization/autotune/common.py

Repository: NVIDIA/Model-Optimizer

Length of output: 656


🏁 Script executed:

# Check if there's any validation that enforces zero_point=0 for FP8
rg -i "fp8.*zero_point|zero_point.*fp8|fp8.*zero" modelopt/onnx/quantization/ --type=py

Repository: NVIDIA/Model-Optimizer

Length of output: 186


🏁 Script executed:

# Final verification: Check ONNX DequantizeLinear spec - does it use zero_point?
# Look at how DequantizeLinear inputs are documented
rg -B5 -A10 "inputs.*scale.*zero_point" modelopt/onnx/ --type=py | head -60

Repository: NVIDIA/Model-Optimizer

Length of output: 2534


🏁 Script executed:

# Check if there's any assertion or validation that zero_point must be 0 for FP8
rg "zero_point.*==.*0|assert.*zero" modelopt/onnx/llm_export_utils/surgeon_utils.py

Repository: NVIDIA/Model-Optimizer

Length of output: 48


🏁 Script executed:

# Verify: can FP8 nodes have non-zero zero_point? Check if fold function handles this
cat -n modelopt/onnx/llm_export_utils/surgeon_utils.py | sed -n '85,105p'

Repository: NVIDIA/Model-Optimizer

Length of output: 972


🏁 Script executed:

# Check ONNX QuantizeLinear/DequantizeLinear formula in documentation
# Look for how zero_point affects the quantization/dequantization
rg -B3 -A3 "saturate.*round|round.*zero_point" modelopt/onnx/ --type=py

Repository: NVIDIA/Model-Optimizer

Length of output: 48


🏁 Script executed:

# Key question: if zero_point != 0, what happens when folding?
# Let's verify what the actual mathematical operation should be
cat -n modelopt/onnx/llm_export_utils/surgeon_utils.py | sed -n '66,130p' > /tmp/surgeon.txt && cat /tmp/surgeon.txt

Repository: NVIDIA/Model-Optimizer

Length of output: 3096


🏁 Script executed:

# Check if there's ever a case where FP8 nodes are created with non-zero zero_point
rg -B10 "int8_to_fp8" modelopt/onnx/quantization/fp8.py | tail -20

Repository: NVIDIA/Model-Optimizer

Length of output: 984


🏁 Script executed:

# Final check: are there any tests for fold_fp8_qdq_to_dq with non-zero zero_point?
find tests/ -name "*.py" -exec grep -l "fold_fp8_qdq_to_dq\|surgeon_utils" {} \;

Repository: NVIDIA/Model-Optimizer

Length of output: 48


🏁 Script executed:

# Verify: What does is_fp8_constant check? Does it ensure zero_point=0?
rg -B5 -A10 "def is_fp8_constant" modelopt/onnx/utils.py

Repository: NVIDIA/Model-Optimizer

Length of output: 619


🏁 Script executed:

# Check if DequantizeLinear semantics actually use zero_point
# Search for ONNX spec or comments about the dequantization formula
rg -B5 -A5 "y = \(x - zero|saturate.*round" modelopt/onnx/ --type=py

Repository: NVIDIA/Model-Optimizer

Length of output: 48


🏁 Script executed:

# Verify the actual behavior: if zero_point is FP8, can it be non-zero?
python3 << 'EOF'
import numpy as np
# FP8 E4M3FN format can represent values - check if zero_point=0 is always used
# Check the range and representation
print("FP8 E4M3FN can represent:")
print("- Zero values: 0.0")
print("- Positive values: up to ~480")
print("- Negative values: down to ~-480")
print("\nFor quantization semantics:")
print("- INT8: zero_point typically 0 (signed) or 128 (unsigned)")
print("- FP8: zero_point would be 0.0 (no unsigned representation)")
EOF

Repository: NVIDIA/Model-Optimizer

Length of output: 320


🏁 Script executed:

# Final: Check if any code path can create FP8 with non-zero zero_point
rg -B15 "default_q_zero_point" modelopt/onnx/quantization/autotune/common.py | grep -A15 "default_quant_type"

Repository: NVIDIA/Model-Optimizer

Length of output: 48


🏁 Script executed:

# Verify: ONNX QuantizeLinear formula - what does zero_point do?
# Also check if code ever sets non-zero zero_point for FP8
rg "default_q_zero_point.*0|zero_point.*=.*0" modelopt/onnx/quantization/autotune/export_utils.py

Repository: NVIDIA/Model-Optimizer

Length of output: 48


🏁 Script executed:

# Critical: Check the int8_to_fp8 conversion - does it preserve zero_point values?
cat -n modelopt/onnx/quantization/fp8.py | sed -n '98,112p'

Repository: NVIDIA/Model-Optimizer

Length of output: 973


🏁 Script executed:

# Verify the formula: if zero_point is in QuantizeLinear, does the fold code handle it?
python3 << 'EOF'
# ONNX standard formulas:
# QuantizeLinear: y = saturate(round(x / scale) + zero_point)
# DequantizeLinear: y = (x - zero_point) * scale

# Current fold operation:
# folded_weights = (original_weights / scale)
# Later dequantize: (folded_weights - zero_point) * scale
#                 = ((original_weights / scale) - zero_point) * scale
#                 = original_weights - (zero_point * scale)  ← WRONG if zero_point != 0

# Correct fold should be:
# folded_weights = (original_weights / scale) + zero_point
# Then dequantize: (folded_weights - zero_point) * scale = original_weights ✓

print("Issue confirmed: if zero_point != 0, folding is mathematically incorrect")
print("Current code ignores zero_point in the folding calculation")
EOF

Repository: NVIDIA/Model-Optimizer

Length of output: 197


FP8 QuantizeLinear folding ignores zero_point in dequantization formula.

At Line 112, the code folds weights / scale without accounting for zero_point. ONNX QuantizeLinear applies the formula saturate(round(x / scale) + zero_point), and the paired DequantizeLinear applies (x - zero_point) * scale. When folding, the stored FP8 weights should encode (x / scale) + zero_point so that subsequent dequantization yields correct results. If zero_point != 0, the current implementation produces numerically incorrect folded weights.

Recommend either:

  1. Enforce zero_point == 0 before folding (add validation), or
  2. Include zero_point in the weight conversion: (torch_weights / torch_scale + torch_zero_point).
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@modelopt/onnx/llm_export_utils/surgeon_utils.py` around lines 110 - 113, The
FP8 folding currently computes numpy_weights from (torch_weights / torch_scale)
but ignores QuantizeLinear's zero_point; update the folding in surgeon_utils.py
to either validate that torch_zero_point == 0 and raise/log if not, or
incorporate the zero point into the conversion by computing (torch_weights /
torch_scale + torch_zero_point) before casting to FP8; modify the numpy_weights
creation (referencing numpy_weights, torch_weights, torch_scale,
torch_zero_point) to include this change and ensure correct rounding/typing for
the subsequent .to(torch.float8_e4m3fn).

tensor = onnx.TensorProto()
tensor.data_type = onnx.TensorProto.FLOAT8E4M3FN
tensor.dims.extend(numpy_weights.shape)
tensor.raw_data = numpy_weights.tobytes()
values = LazyValues(tensor)
onnx_weights_fp8 = gs.Constant(quantizer_name + "/fp8_weights", values)

node.outputs.clear()
# DQ Op is separated out
dq_op.inputs[0] = onnx_weights_fp8
dq_op.op = "DequantizeLinear"
dq_op.outputs[0].dtype = dq_op.inputs[1].dtype

graph.cleanup().toposort()
end_time = time.time()
Expand Down
14 changes: 14 additions & 0 deletions modelopt/onnx/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,27 @@
import onnx_graphsurgeon as gs
from onnx.helper import get_attribute_value
from onnx_graphsurgeon import Constant, Node, Variable
from onnx_graphsurgeon.ir.tensor import LazyValues

from modelopt.onnx.logging_config import logger

# Base minimum opset for quantization (opset 19 is the first to support fp16 scales)
BASE_MIN_OPSET = 19


def is_fp8_constant(const: Constant) -> bool:
    """Check whether a gs.Constant wraps a lazily-loaded FLOAT8E4M3FN tensor.

    Only constants still backed by a ``LazyValues`` wrapper can be FP8, since
    numpy (and thus an eagerly-materialized constant) has no fp8 dtype. The
    underlying TensorProto is fetched via ``getattr`` so a future rename of the
    private ``_tensor`` attribute degrades to ``False`` instead of raising.
    """
    # Non-lazy constants are plain numpy arrays and therefore never FP8.
    if not isinstance(const.values, LazyValues):
        return False
    proto = getattr(const.values, "_tensor", None)
    # NOTE(review): relies on the private LazyValues._tensor attribute — verify
    # against the installed onnx_graphsurgeon version.
    return proto is not None and proto.data_type == onnx.TensorProto.FLOAT8E4M3FN


def get_input_names_from_bytes(model_bytes: bytes, external_inputs_only: bool = True) -> list[str]:
"""This function returns the inputs names of the given onnx model in bytes.

Expand Down
36 changes: 22 additions & 14 deletions modelopt/torch/quantization/export_onnx.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,12 @@
}
mha_valid_precisions = {"Half", "BFloat16"}

torch_dtype_map = {"Float": torch.float32, "Half": torch.float16, "BFloat16": torch.bfloat16}
torch_dtype_map = {
"Float": torch.float32,
"Half": torch.float16,
"BFloat16": torch.bfloat16,
"Float8": torch.float8_e4m3fn,
}


def export_int8(
Expand Down Expand Up @@ -221,8 +226,7 @@ def _fp8_quantize(
"""Helper Function for Quantization."""
output_shape = sym_help._get_tensor_sizes(inputs)

# TRT StronglyType only supports FP16 QDQs
# custom ops, so cast the input if needed.
# Cast the input to the high-precision dtype if needed.
input_type = inputs.type().scalarType()
assert trt_high_precision_dtype in (input_type, "Float"), (
"TRT StronglyType requires both weights and amax to be in the BF16/FP16, or the QDQ in Float."
Expand All @@ -234,9 +238,12 @@ def _fp8_quantize(
"Constant",
value_t=torch.tensor(scale_inv).to(torch_dtype_map[trt_high_precision_dtype]),
)
q_op = g.op("trt::TRT_FP8QuantizeLinear", inputs, scale).setType(
inputs.type().with_dtype(torch.uint8).with_sizes(output_shape)
)
# Use standard ONNX QuantizeLinear with FLOAT8E4M3FN zero_point (opset 19).
# The zero_point dtype determines the output dtype per the ONNX spec.
zero_point = g.op("Constant", value_t=torch.tensor(0.0))
zero_point = g.op("Cast", zero_point, to_i=onnx_dtype_map["Float8"])
q_op = g.op("QuantizeLinear", inputs, scale, zero_point, saturate_i=1)
q_op.setType(inputs.type().with_dtype(torch.float8_e4m3fn).with_sizes(output_shape))
return q_op


Expand All @@ -249,21 +256,22 @@ def _fp8_dequantize(
):
"""Helper Function for Dequantization."""
output_shape = sym_help._get_tensor_sizes(inputs)
assert trt_high_precision_dtype in (otype, "Float"), (
"TRT StronglyType requires both weights and amax to be in the BF16/FP16, or the QDQ in Float."
)
scale = g.op(
"Constant",
value_t=torch.tensor(scale_inv, dtype=torch_dtype_map[otype]), # type: ignore[index]
)
out = g.op("trt::TRT_FP8DequantizeLinear", inputs, scale).setType(
# Use standard ONNX DequantizeLinear with FLOAT8E4M3FN zero_point (opset 19).
# Per the ONNX spec, DequantizeLinear with FLOAT8E4M3FN input outputs float32.
zero_point = g.op("Constant", value_t=torch.tensor(0.0))
zero_point = g.op("Cast", zero_point, to_i=onnx_dtype_map["Float8"])
out = g.op("DequantizeLinear", inputs, scale, zero_point)
out.setType(
inputs.type().with_dtype(torch_dtype_map[trt_high_precision_dtype]).with_sizes(output_shape)
)

# DQ outputs are currently constrained to FP32 due to a similar limitation in ORT
# custom ops, so cast the output if needed.
if trt_high_precision_dtype != otype:
out = g.op("Cast", out, to_i=onnx_dtype_map[otype]) # type: ignore[index]
# DequantizeLinear outputs float32 in opset 19; cast back to original type if needed.
if otype in torch_dtype_map and otype != "Float":
out = g.op("Cast", out, to_i=onnx_dtype_map[otype])
return out


Expand Down
Loading
Loading