Draft
Changes from all commits
Commits
50 commits
1a6308e
Add FP32 operators for MicroLlama on Snitch (untiled)
lee2716 Jan 31, 2026
1bdf9c9
Add tiling support for MicroLlama on Snitch
lee2716 Jan 31, 2026
35d51ef
Fix Snitch tiled platform by unifying mapping to use TilingReadyBindings
lee2716 Feb 3, 2026
76a4678
add full microllama model in ci test
lee2716 Feb 3, 2026
c222810
add comments with information about operations
lee2716 Feb 3, 2026
497e9c1
generalize RMSNorm to support full ONNX spec
lee2716 Feb 3, 2026
bf5ddb7
Fix broadcast stride calculation for inputs with different ranks in A…
lee2716 Feb 3, 2026
7e3659d
delete unused function
lee2716 Feb 3, 2026
28280fb
delete the comment
lee2716 Feb 3, 2026
ac5d541
update year to 2026
lee2716 Feb 3, 2026
c90b35c
Fix: Revert batch_size type to uint32_t based on review
lee2716 Feb 3, 2026
5669c28
update year to 2026
lee2716 Feb 3, 2026
cf4d9bd
update year to 2026
lee2716 Feb 3, 2026
c04bd6a
remove code duplication
lee2716 Feb 3, 2026
bdc550e
remove code duplication
lee2716 Feb 3, 2026
b53ff75
update year to 2026
lee2716 Feb 3, 2026
b355624
update year to 2026
lee2716 Feb 3, 2026
5306134
recover the Gemm_fp32
lee2716 Feb 3, 2026
32d88c0
improve multicore transpose
lee2716 Feb 4, 2026
7092e35
format: run make format on Snitch platform code
lee2716 Feb 4, 2026
b2199cb
pytest: add microLlama model to Snitch test configurations
lee2716 Feb 5, 2026
7ad03a3
style: consolidate imports in Snitch platform
lee2716 Feb 5, 2026
d76f6f1
refactor: restore Snitch framework code to origin/devel
lee2716 Feb 5, 2026
1c62b68
fix: Reshape operator for Snitch platform
lee2716 Feb 5, 2026
32d4bfa
fix: Add broadcasting support and compatible type inference
lee2716 Feb 5, 2026
66c4b4f
make format update
lee2716 Feb 5, 2026
be96413
update test paths for reorganized RMSNorm and microLlama directories,…
lee2716 Feb 6, 2026
89d382a
refactor: general ONNX broadcasting for Div/Mul/Add
lee2716 Feb 6, 2026
e55c7bc
fix: enable tiled deployment for NOP operations and L2 memory management
lee2716 Feb 6, 2026
13a4e64
fix: restore NOPTileConstraint compatibility with Siracusa/Neureka ti…
lee2716 Feb 6, 2026
55b6750
fix: correct integer type inference for all-zero input arrays
lee2716 Feb 6, 2026
06010e4
fix: preserve original dtype for all-zero input type inference
lee2716 Feb 6, 2026
85a68fd
make format update
lee2716 Feb 6, 2026
182a2c3
update rmsnorm test
lee2716 Feb 9, 2026
03125c0
feat: replace trivial all-1.0 weights with true FP32 random initializ…
lee2716 Feb 17, 2026
027ccab
merge HardSwishChecker, rename parser, fix Softmax types, yapf fix
lee2716 Feb 17, 2026
064981a
fix: multi-core safe memory allocation for Snitch platform
lee2716 Feb 18, 2026
9be8768
feat: multi-core MatMul, Softmax kernels and fix Mul template
lee2716 Feb 18, 2026
fdc0c82
refactor: slim Snitch parsers, add MatMul_fp32.c, remove unsupported …
lee2716 Feb 18, 2026
7813684
remove if (snrt_is_compute_core())
lee2716 Feb 18, 2026
4865516
fix:correct RMSNorm op count from 5*inputSize to 6*inputSize
lee2716 Feb 18, 2026
fc8ea3f
refactor: use SkipTransformer with pointer assignment for Reshape, av…
lee2716 Feb 18, 2026
b6b6eb5
simplify: remove unused broadcasting logic from FloatDiv/Mul TileCons…
lee2716 Feb 18, 2026
4e8448b
cleanup: remove unused BasicTransformer and Basic*Bindings dead code
lee2716 Feb 18, 2026
7003801
fix CI test of snitch
lee2716 Feb 19, 2026
e693be7
fix: add int8→int32 MatMul binding to fix Snitch Integer MatMul CI test
lee2716 Feb 19, 2026
1633a71
fix CI test of snitch
lee2716 Feb 19, 2026
0a66cf4
refactor: reuse Generic GatherTemplate and revert NOPTileConstraint
lee2716 Feb 19, 2026
6b357cf
refactor: simplify Snitch parsers, templates, and bindings
lee2716 Feb 19, 2026
c7b9771
refactor: address reviewer comments and reduce code duplication
lee2716 Feb 19, 2026
2 changes: 1 addition & 1 deletion .github/workflows/ci-platform-snitch-tiled.yml
Contributor


You are reverting the changes introduced by PR #144! The tests now reside in DeeployTest/test_snitch_tiled_config.py

@@ -35,4 +35,4 @@ jobs:
with:
runner: ${{ needs.select-env.outputs.runner }}
docker-image: ${{ needs.select-env.outputs.image }}
-pytest-marker: "kernels and singlebuffer and l2"
+pytest-marker: "(kernels or models) and singlebuffer and l2"
10 changes: 9 additions & 1 deletion .github/workflows/ci-platform-snitch.yml
Contributor


You are reverting the changes from PR #144 as well! New tests should go in DeeployTest/test_snitch_config.py

@@ -35,4 +35,12 @@ jobs:
with:
runner: ${{ needs.select-env.outputs.runner }}
docker-image: ${{ needs.select-env.outputs.image }}
-pytest-marker: "kernels"
+pytest-marker: kernels

snitch-models:
needs: select-env
uses: ./.github/workflows/_runner-snitch.yml
with:
runner: ${{ needs.select-env.outputs.runner }}
docker-image: ${{ needs.select-env.outputs.image }}
pytest-marker: models
2 changes: 2 additions & 0 deletions .yamllint
@@ -31,3 +31,5 @@ ignore:
- "**/toolchain/"
# Ignore all files in .git
- "**/.git/**"
# Ignore all files in .venv
- "**/.venv/"
2 changes: 1 addition & 1 deletion Deeploy/DeeployTypes.py
@@ -3107,7 +3107,7 @@ def _exportGraph(self, folderPath, fileName):
# VJUNG: ONNX-Graphsurgeon needs tensors to be in their export types
constTensors = [tensor for tensor in self.graph.tensors().values() if isinstance(tensor, gs.Constant)]
for tensor in constTensors:
-if tensor.dtype != tensor.export_dtype:
+if hasattr(tensor, 'export_dtype') and tensor.dtype != tensor.export_dtype:
tensor.values = tensor.values.astype(tensor.export_dtype)

model = gs.export_onnx(self.graph)
3 changes: 3 additions & 0 deletions Deeploy/Targets/Generic/Bindings.py
@@ -286,6 +286,9 @@
BasicConcatBindings = [
NodeBinding(ConcatChecker([PointerClass(type), PointerClass(type)], [PointerClass(type)]),
ConcatTemplate.referenceTemplate, BasicTransformer) for type in IntegerDataTypes
] + [
NodeBinding(ConcatChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]),
ConcatTemplate.referenceTemplate, BasicTransformer)
]

BasicQuantBindings = [
49 changes: 49 additions & 0 deletions Deeploy/Targets/Generic/Layers.py
@@ -709,3 +709,52 @@ def computeOps(self):
numPx = opRep['dim_im_out_x']

return numPx * opsPerPx


class RMSNormLayer(ONNXLayer):
"""Layer support for the ONNX RMSNormalization operator.

Supported opset: 23

It is computed as follows:
- XSquared = Mul(X, X)
- XSquaredMean = ReduceMean<axes=normalized_axes>(XSquared)
- MeanSquareEpsilon = Add(XSquaredMean, epsilon)
- RMS = Sqrt(MeanSquareEpsilon)
- Normalized = Div(X, RMS)
- Y = Mul(Normalized, Scale)

For more details, see the official ONNX documentation:
https://onnx.ai/onnx/operators/onnx__RMSNormalization.html#rmsnormalization-23
"""

def __init__(self, maps: List[NodeMapper]):
super().__init__(maps)

def computeOps(self):
inputSize = self.mapper.parser.operatorRepresentation['inputSize']
NormalizedAxesSize = self.mapper.parser.operatorRepresentation['NormalizedAxesSize']
scale = self.mapper.parser.operatorRepresentation['scale']

# a. XSquared = Mul(X, X) => inputSize ops
# b. XSquaredMean = ReduceMean<axes=normalized_axes>(XSquared)
# => inputSize ops (additions) + (inputSize - NormalizedAxesSize) ops (divisions)
# c. MeanSquareEpsilon = Add(XSquaredMean, epsilon) => (inputSize - NormalizedAxesSize) ops
# d. RMS = Sqrt(MeanSquareEpsilon) => (inputSize - NormalizedAxesSize) ops
# e. Normalized = Div(X, RMS) => inputSize ops
# f. Y = Mul(Normalized, Scale) => 0 if all(Scale == 1.0), else inputSize ops
scale_ops = 0 if (scale == 1.0).all() else inputSize
ops = 6 * inputSize - 3 * NormalizedAxesSize + scale_ops
return ops
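As a sanity check, the six steps listed in the docstring can be sketched as a NumPy reference implementation (`rmsnorm_reference` is an illustrative helper, not part of Deeploy; `axis` marks the first normalized axis, with all trailing axes normalized, as in ONNX opset 23):

```python
import numpy as np


def rmsnorm_reference(X, scale, axis = -1, epsilon = 1e-5):
    """Reference RMSNormalization following the ONNX opset-23 decomposition."""
    normalized_axes = tuple(range(axis % X.ndim, X.ndim))
    x_squared = X * X  # a. XSquared = Mul(X, X)
    x_squared_mean = x_squared.mean(axis = normalized_axes, keepdims = True)  # b. ReduceMean
    rms = np.sqrt(x_squared_mean + epsilon)  # c. Add epsilon, d. Sqrt
    return (X / rms) * scale  # e. Div, f. Mul by Scale


X = np.random.rand(2, 4, 8).astype(np.float32)
Y = rmsnorm_reference(X, scale = np.ones(8, dtype = np.float32), axis = -1)
assert Y.shape == X.shape
```

For an all-ones input with `scale == 1.0`, the output is approximately all ones (up to `epsilon`), which is a quick way to validate a kernel against this reference.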


class HardSwishLayer(ONNXLayer):

def __init__(self, maps: List[NodeMapper]):
super().__init__(maps)

def computeOps(self):
# HardSwish(x) = x * clip(x/6 + 0.5, 0, 1)
# Operations: div + add + clip + mul
size = self.mapper.parser.operatorRepresentation['size']
return size * 4
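The formula in the comment can be checked with a short NumPy sketch (`hardswish_reference` is illustrative, not part of Deeploy); it performs exactly the four elementwise operations counted above: div, add, clip, mul.

```python
import numpy as np


def hardswish_reference(x: np.ndarray) -> np.ndarray:
    # HardSwish(x) = x * clip(x/6 + 0.5, 0, 1): div, add, clip, mul per element.
    return x * np.clip(x / 6.0 + 0.5, 0.0, 1.0)


# Saturates to 0 below -3 ... -6/6+0.5 clips to 0; passes x through above +3.
out = hardswish_reference(np.array([-6.0, 0.0, 3.0, 6.0]))
```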
78 changes: 69 additions & 9 deletions Deeploy/Targets/Generic/Parsers.py
@@ -11,6 +11,37 @@
from Deeploy.DeeployTypes import ConstantBuffer, NetworkContext, NodeParser, VariableBuffer


def compute_broadcast_strides(shape1, shape2, out_shape):
"""Compute strides for ONNX/NumPy-style broadcasting.

Pads both input shapes from the left to match the output ndim,
then computes strides where broadcast dimensions (size 1) get stride 0.

Example:
shape1=[8,8,8], shape2=[8]
-> strides1=[64,8,1], strides2=[0,0,1]
"""
ndim = len(out_shape)

pad1 = [1] * (ndim - len(shape1)) + shape1
pad2 = [1] * (ndim - len(shape2)) + shape2

def _calc_strides(padded_shape, out_shape):
strides = []
stride = 1
for i in range(ndim - 1, -1, -1):
if padded_shape[i] == 1 and out_shape[i] > 1:
strides.insert(0, 0)
else:
strides.insert(0, stride)
stride *= padded_shape[i] if padded_shape[i] > 1 else 1
return strides

strides1 = _calc_strides(pad1, out_shape)
strides2 = _calc_strides(pad2, out_shape)
return strides1, strides2
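The docstring example can be cross-checked against NumPy's own broadcasting machinery: `np.broadcast_to` produces a view whose byte strides are 0 on broadcast (size-1) dimensions, matching the element strides computed here. `numpy_broadcast_strides` below is an illustrative helper, not Deeploy code:

```python
import numpy as np


def numpy_broadcast_strides(shape, out_shape):
    # int8 items make byte strides equal element strides (itemsize == 1).
    a = np.broadcast_to(np.empty(shape, dtype = np.int8), out_shape)
    return [s // a.itemsize for s in a.strides]


print(numpy_broadcast_strides([8, 8, 8], [8, 8, 8]))  # [64, 8, 1]
print(numpy_broadcast_strides([8], [8, 8, 8]))  # [0, 0, 1]
```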


class ConcatParser(NodeParser):

def __init__(self):
Expand Down Expand Up @@ -55,6 +86,10 @@ def parseNode(self, node: gs.Node) -> (bool):
self.operatorRepresentation['n_levels'] = int(node.attrs['n_levels'])
self.operatorRepresentation['log2D'] = int(math.log2(node.attrs['D']))

stash_type = node.attrs.get('stash_type', 1)
if stash_type != 1:
raise ValueError(f"iRMSNorm: only stash_type=1 (FP32) is supported, got {stash_type}")

return ret

def parseNodeCtxt(self,
@@ -70,8 +105,19 @@ def parseNodeCtxt(self,
for idx, outputNode in enumerate(node.outputs):
self.operatorRepresentation[outputs[idx]] = ctxt.lookup(outputNode.name).name

self.operatorRepresentation['size'] = np.prod(ctxt.lookup(node.inputs[0].name).shape)
self.operatorRepresentation['lastDimLength'] = ctxt.lookup(node.inputs[0].name).shape[-1]
input_shape = list(ctxt.lookup(node.inputs[0].name).shape)

axis = node.attrs.get('axis', -1)
if axis < 0:
axis = len(input_shape) + axis

self.operatorRepresentation['inputSize'] = int(np.prod(input_shape))
self.operatorRepresentation['NormalizedAxesSize'] = int(np.prod(input_shape[axis:]))
self.operatorRepresentation['scale'] = node.inputs[1].values

# Keep old keys for C template compatibility
self.operatorRepresentation['size'] = int(np.prod(input_shape))
self.operatorRepresentation['lastDimLength'] = int(input_shape[-1])

return ctxt, True

@@ -471,23 +517,37 @@ def __init__(self):
super().__init__()

def parseNode(self, node: gs.Node) -> bool:

ret = all([len(node.inputs) == 2, len(node.outputs) == 1])

return ret

def parseNodeCtxt(self,
ctxt: NetworkContext,
node: gs.Node,
channels_first: bool = True) -> Tuple[NetworkContext, bool]:

data_in_1 = ctxt.lookup(node.inputs[0].name)
data_in_2 = ctxt.lookup(node.inputs[1].name)
data_out = ctxt.lookup(node.outputs[0].name)

self.operatorRepresentation['data_in_1'] = data_in_1.name
self.operatorRepresentation['data_in_2'] = data_in_2.name
self.operatorRepresentation['data_out'] = data_out.name
-self.operatorRepresentation['size'] = np.prod(data_in_1.shape)
+self.operatorRepresentation['size'] = np.prod(data_out.shape)

# Check if broadcasting is needed
shape1 = list(data_in_1.shape)
shape2 = list(data_in_2.shape)
out_shape = list(data_out.shape)

need_broadcast = (shape1 != out_shape) or (shape2 != out_shape)
self.operatorRepresentation['need_broadcast'] = need_broadcast

if need_broadcast:
strides1, strides2 = compute_broadcast_strides(shape1, shape2, out_shape)

self.operatorRepresentation['ndim'] = len(out_shape)
self.operatorRepresentation['strides1'] = strides1
self.operatorRepresentation['strides2'] = strides2
self.operatorRepresentation['out_shape'] = out_shape

return ctxt, True
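To illustrate how a kernel might consume this stride metadata, here is a sketch of a strided elementwise Mul (`broadcast_mul` is hypothetical, not one of the Deeploy Snitch kernels): it walks the output index space and maps each multi-index back to a flat offset in each (possibly broadcast) input, exactly as a C loop nest over `ndim` dimensions would.

```python
import itertools

import numpy as np


def broadcast_mul(a_flat, b_flat, out_shape, strides1, strides2):
    """Elementwise Mul over flat buffers using precomputed broadcast strides."""
    out = np.empty(int(np.prod(out_shape)), dtype = a_flat.dtype)
    for flat, idx in enumerate(itertools.product(*(range(d) for d in out_shape))):
        off1 = sum(i * s for i, s in zip(idx, strides1))  # stride 0 pins broadcast dims
        off2 = sum(i * s for i, s in zip(idx, strides2))
        out[flat] = a_flat[off1] * b_flat[off2]
    return out.reshape(out_shape)


a = np.arange(6.0).reshape(2, 3)  # shape [2, 3] -> strides [3, 1]
b = np.array([10.0, 20.0, 30.0])  # shape [3]    -> strides [0, 1]
res = broadcast_mul(a.ravel(), b.ravel(), [2, 3], [3, 1], [0, 1])
assert np.array_equal(res, a * b)
```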

@@ -2096,15 +2156,15 @@ def parseNodeCtxt(self,
node: gs.Node,
channels_first: bool = True) -> Tuple[NetworkContext, bool]:

-inputs = ["input1", "input2"]
-outputs = ["output"]
+inputs = ["A", "B"]
+outputs = ["C"]
for idx, inputNode in enumerate(node.inputs):
if idx < len(inputs):
self.operatorRepresentation[inputs[idx]] = ctxt.lookup(inputNode.name).name
for idx, outputNode in enumerate(node.outputs):
self.operatorRepresentation[outputs[idx]] = ctxt.lookup(outputNode.name).name

-self.operatorRepresentation['size'] = np.prod(ctxt.lookup(self.operatorRepresentation['input1']).shape)
+self.operatorRepresentation['size'] = np.prod(ctxt.lookup(self.operatorRepresentation['A']).shape)

return ctxt, True

2 changes: 1 addition & 1 deletion Deeploy/Targets/Generic/Templates/FloatDivTemplate.py
@@ -6,5 +6,5 @@

referenceTemplate = NodeTemplate("""
// Division (Name: ${nodeName}, Op: ${nodeOp})
-SINGLE_CORE Div_fp${input1_type.referencedType.typeWidth}_fp${input2_type.referencedType.typeWidth}_fp${output_type.referencedType.typeWidth}(${input1}, ${input2}, ${output}, ${size});
+SINGLE_CORE Div_fp${A_type.referencedType.typeWidth}_fp${B_type.referencedType.typeWidth}_fp${C_type.referencedType.typeWidth}(${A}, ${B}, ${C}, ${size});
""")
29 changes: 27 additions & 2 deletions Deeploy/Targets/Generic/TypeCheckers.py
@@ -6,7 +6,7 @@

import numpy as np

-from Deeploy.AbstractDataTypes import Pointer
+from Deeploy.AbstractDataTypes import FloatImmediate, Pointer
from Deeploy.CommonExtensions.TypeCheckers.SignPropTypeChecker import SignPropTypeChecker
from Deeploy.DeeployTypes import ConstantBuffer, OperatorRepresentation, VariableBuffer

@@ -409,7 +409,10 @@ def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[

def _inferNumLevels(self, inputs: List[VariableBuffer],
operatorRepresentation: OperatorRepresentation) -> List[int]:
-return [2**(4 * self.input_types[0].referencedType.typeWidth)]
+input_type = self.input_types[0].referencedType
+if issubclass(input_type, FloatImmediate):
+return [2**(input_type.typeWidth)]
+return [2**(4 * input_type.typeWidth)]

def _inferSignedness(self, inputs: List[VariableBuffer],
operatorRepresentation: OperatorRepresentation) -> List[bool]:
@@ -610,3 +613,25 @@ def _inferNumLevels(self, inputs: List[VariableBuffer],
def _inferSignedness(self, inputs: List[VariableBuffer],
operatorRepresentation: OperatorRepresentation) -> List[bool]:
return [True]


class RMSNormChecker(SignPropTypeChecker):

def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]):
super().__init__(input_types, output_types)

def _inferNumLevels(self, inputs: List[VariableBuffer],
operatorRepresentation: OperatorRepresentation) -> List[int]:
# RMSNorm: square, mean, sqrt, reciprocal, multiply
# Output precision similar to input
return [2**(self.input_types[0].referencedType.typeWidth)]

def _inferSignedness(self, inputs: List[VariableBuffer],
operatorRepresentation: OperatorRepresentation) -> List[bool]:
# RMSNorm output can be signed (depending on input signedness)
if inputs[0]._signed:
return [True]
else:
return [False]

