Enable fused MoE kernel for Qwen 3.5 MoE model (pytorch#18388)

mergennachin · web-flow · commit acf15bfbe0d4 · 2026-03-23T16:27:38.000-04:00
Replace the compute-all-then-gather approach (ConditionalFeedForward with
grouped nn.Linear) with FusedMoEExperts, which calls the fused MoE Triton
kernel directly. Expert weights are quantized to simple packed INT4
using torchao primitives, separate from the tinygemm path used for
the attention and shared-expert linear layers.

For decode (M=1), only 8 of 256 experts' weights are loaded from HBM
per layer (128x less memory traffic vs the old approach).

Depends on the fused MoE Triton kernel (triton::fused_moe).


Decode throughput: 12.41 tokens/s
Prefill throughput: 47.3 tokens/s

Measured on an A100.
diff --git a/examples/models/qwen3_5_moe/export.py b/examples/models/qwen3_5_moe/export.py
@@ -12,7 +12,7 @@
 import torch
 import torch.nn as nn
 
-from executorch.examples.models.qwen3_5_moe.model import Qwen35MoE
+from executorch.examples.models.qwen3_5_moe.model import FusedMoEExperts, Qwen35MoE
 
 
 # ---------------------------------------------------------------------------
@@ -43,6 +43,89 @@ def load_and_quantize(args):
     return model, config
 
 
+def _quantize_experts_int4(model, config, group_size=32, use_hqq=False):
+    """Quantize expert weights to packed INT4 for the fused MoE kernel.
+
+    Two quantization methods:
+      --hqq: HQQ (Half-Quadratic Quantization) iteratively refines scales
+             via least-squares for better accuracy (slower).
+      default: Standard min/max symmetric quantization (faster).
+
+    Converts w1_weight [E, N, K] and w2_weight [E, N, K] to:
+      w1 [E, N, K//2] int8 packed, w1_scale [E, N, K//gs] bf16
+      w2 [E, N, K//2] int8 packed, w2_scale [E, N, K//gs] bf16
+    """
+    if use_hqq:
+        from torchao.quantization.quant_primitives import (
+            _choose_qparams_and_quantize_scale_only_hqq,
+        )
+    else:
+        from torchao.quantization.quant_primitives import (
+            choose_qparams_affine,
+            MappingType,
+            quantize_affine,
+        )
+
+    method = "HQQ" if use_hqq else "min/max"
+
+    for i, layer in enumerate(model.layers):
+        experts = layer.mlp.experts
+        if not isinstance(experts, FusedMoEExperts):
+            continue
+
+        experts.group_size = group_size
+        for name in ("w1_weight", "w2_weight"):
+            w = getattr(experts, name).data.float()
+            E, N, K = w.shape
+
+            if use_hqq:
+                qdata, scale = _choose_qparams_and_quantize_scale_only_hqq(
+                    w.view(E * N, K),
+                    block_size=[1, group_size],
+                    qmin=-8,
+                    qmax=7,
+                )
+                int_data = qdata.to(torch.int8).view(E, N, K)
+                scale = scale.view(E, N, -1)
+            else:
+                block_size = (1, 1, group_size)
+                scale, zero_point = choose_qparams_affine(
+                    w,
+                    MappingType.SYMMETRIC,
+                    block_size,
+                    target_dtype=torch.int8,
+                    quant_min=-8,
+                    quant_max=7,
+                )
+                int_data = quantize_affine(
+                    w,
+                    block_size,
+                    scale,
+                    zero_point,
+                    output_dtype=torch.int8,
+                    quant_min=-8,
+                    quant_max=7,
+                )
+                scale = scale.reshape(E, N, -1)
+
+            # Pack two int4 values per byte: even K -> low nibble, odd K -> high nibble
+            uint4 = (int_data + 8).to(torch.int16)  # shift to unsigned [0, 15]
+            low = uint4[:, :, 0::2]
+            high = uint4[:, :, 1::2]
+            packed = (low | (high << 4)).to(torch.int8)  # [E, N, K//2]
+
+            buf_name = name.replace("_weight", "")
+            experts.register_buffer(buf_name, packed)
+            experts.register_buffer(f"{buf_name}_scale", scale.to(torch.bfloat16))
+            delattr(experts, name)
+
+        print(
+            f"  Quantized experts (INT4 {method}) layer {i + 1}/{config.num_hidden_layers}",
+            end="\r",
+        )
+    print()
+
+
 def _to_device_skip_meta(module, device, dtype=None):
     """Move submodules to device, skipping any that have meta-device buffers.
 
@@ -71,6 +154,10 @@ def _quantize(model, config, args):
     """
     from executorch.extension.llm.export.quantize import quantize_model_
 
+    # Quantize MoE expert weights (packed INT4 for fused_moe kernel)
+    if args.qlinear:
+        _quantize_experts_int4(model, config, args.qlinear_group_size, use_hqq=args.hqq)
+
     # Untie lm_head/embedding so they can be quantized independently:
     # embedding uses index lookup (8w), lm_head uses matmul (4w).
     if model.lm_head.weight.data_ptr() == model.embed_tokens.weight.data_ptr():
@@ -287,8 +374,16 @@ def main():
     parser.add_argument(
         "--qembedding", default=None, choices=["8w"], help="Quantize embedding layers."
     )
+    parser.add_argument(
+        "--hqq",
+        action="store_true",
+        help="Use HQQ scale-only optimization for expert quantization (slower, better accuracy).",
+    )
     args = parser.parse_args()
 
+    if args.hqq and not args.qlinear:
+        parser.error("--hqq requires --qlinear")
+
     # Register FLA Triton kernel
     import executorch.backends.cuda.triton.kernels  # noqa: F401
 
diff --git a/examples/models/qwen3_5_moe/model.py b/examples/models/qwen3_5_moe/model.py
@@ -399,71 +399,54 @@ def forward(self, x, input_pos):
 
 
 # ---------------------------------------------------------------------------
-# MoE: stacked expert weights + index by top-k
+# MoE: expert weights for fused MoE Triton kernel
 
-# 16 experts per group keeps each nn.Linear under ~32K output features,
-# within tinygemm int4 packing limits while keeping the graph small
-# (32 matmul nodes per layer instead of 768 with per-expert linears).
-_EXPERTS_PER_GROUP = 16
 
+class FusedMoEExperts(nn.Module):
+    """Expert weights stored as stacked tensors for the fused MoE Triton kernel.
 
-class ConditionalFeedForward(nn.Module):
-    """Grouped expert weights as nn.Linear for quantization compatibility.
+    Before quantization: w1_weight [E, 2*inter, hidden] and w2_weight [E, hidden, inter]
+    are nn.Parameter tensors loaded from the checkpoint.
 
-    Experts are split into groups of _EXPERTS_PER_GROUP. Each group has:
-      gate_up_projs[g]: nn.Linear(hidden_size, G * intermediate_size * 2)
-      down_projs[g]: nn.Linear(intermediate_size, G * hidden_size)
-    This keeps each nn.Linear small enough for tinygemm int4 packing while
-    allowing quantize_model_() to handle them automatically.
+    After quantization (in export.py): replaced with packed INT4 buffers
+    w1 [E, 2*inter, hidden//2], w1_scale, w2 [E, hidden, inter//2], w2_scale.
     """
 
-    def __init__(self, hidden_size, intermediate_size, num_experts):
+    def __init__(self, config):
         super().__init__()
-        self.num_experts = num_experts
-        self.intermediate_size = intermediate_size
-        self.hidden_size = hidden_size
-        G = _EXPERTS_PER_GROUP
-        assert num_experts % G == 0
-        num_groups = num_experts // G
-
-        self.gate_up_projs = nn.ModuleList(
-            [
-                nn.Linear(hidden_size, G * intermediate_size * 2, bias=False)
-                for _ in range(num_groups)
-            ]
+        self.num_experts = config.num_experts
+        self.intermediate_size = config.moe_intermediate_size
+        self.hidden_size = config.hidden_size
+        self.group_size = 32
+
+        self.w1_weight = nn.Parameter(
+            torch.empty(
+                config.num_experts,
+                2 * config.moe_intermediate_size,
+                config.hidden_size,
+            )
         )
-        self.down_projs = nn.ModuleList(
-            [
-                nn.Linear(intermediate_size, G * hidden_size, bias=False)
-                for _ in range(num_groups)
-            ]
+        self.w2_weight = nn.Parameter(
+            torch.empty(
+                config.num_experts,
+                config.hidden_size,
+                config.moe_intermediate_size,
+            )
         )
 
-    def forward(self, x, expert_indices):
-        # x: (T, D), expert_indices: (T, top_k)
-        T = x.size(0)
-        top_k = expert_indices.size(1)
-        G = _EXPERTS_PER_GROUP
-        H = self.intermediate_size
-        D = self.hidden_size
-
-        # Gate + Up: compute per-group, cat, gather top-k
-        gate_up_parts = [proj(x).view(T, G, 2, H) for proj in self.gate_up_projs]
-        gate_up = torch.cat(gate_up_parts, dim=1)  # (T, E, 2, H)
-
-        idx = expert_indices.unsqueeze(-1).unsqueeze(-1).expand(-1, -1, 2, H)
-        gate_up_sel = gate_up.gather(1, idx)  # (T, top_k, 2, H)
-        intermediate = F.silu(gate_up_sel[:, :, 0, :]) * gate_up_sel[:, :, 1, :]
-
-        # Down: compute per-group, cat, gather correct expert per slot
-        intermediate_flat = intermediate.reshape(T * top_k, H)
-        down_parts = [
-            proj(intermediate_flat).view(T, top_k, G, D) for proj in self.down_projs
-        ]
-        all_down = torch.cat(down_parts, dim=2)  # (T, top_k, E, D)
-
-        eidx = expert_indices.unsqueeze(-1).unsqueeze(-1).expand(-1, -1, 1, D)
-        return all_down.gather(2, eidx).squeeze(2)  # (T, top_k, D)
+    def forward(self, x, expert_weights, expert_indices, top_k):
+        return torch.ops.triton.fused_moe(
+            x,
+            self.w1,
+            self.w1_scale,
+            self.w2,
+            self.w2_scale,
+            expert_weights,
+            expert_indices,
+            top_k,
+            self.num_experts,
+            self.group_size,
+        )
 
 
 class SwiGLU(nn.Module):
@@ -484,12 +467,9 @@ class SparseMoE(nn.Module):
     def __init__(self, config):
         super().__init__()
         self.top_k = config.num_experts_per_tok
+        self.num_experts = config.num_experts
         self.gate = nn.Linear(config.hidden_size, config.num_experts, bias=False)
-        self.cond_ffn = ConditionalFeedForward(
-            config.hidden_size,
-            config.moe_intermediate_size,
-            config.num_experts,
-        )
+        self.experts = FusedMoEExperts(config)
         self.shared_expert = SwiGLU(
             config.hidden_size, config.shared_expert_intermediate_size
         )
@@ -503,8 +483,9 @@ def forward(self, x):
         expert_weights, expert_indices = torch.topk(scores, self.top_k, dim=-1)
         expert_weights = expert_weights.softmax(dim=-1)
 
-        expert_outs = self.cond_ffn(x_flat, expert_indices)
-        routed_out = torch.einsum("tai,ta->ti", expert_outs, expert_weights)
+        routed_out = self.experts(
+            x_flat, expert_weights.float(), expert_indices, self.top_k
+        )
 
         shared_out = self.shared_expert(x_flat)
         shared_gate = torch.sigmoid(self.shared_expert_gate(x_flat))
@@ -641,9 +622,8 @@ def _load_and_remap_checkpoint(model_dir, config):
                     expert_weights,
                 )
 
-    # Stack per-expert weights, split into groups, reshape for nn.Linear
+    # Stack per-expert weights into [E, N, K] tensors for FusedMoEExperts
     if expert_weights:
-        G = _EXPERTS_PER_GROUP
         for layer_idx in range(config.num_hidden_layers):
             gate_list = [
                 expert_weights.get((layer_idx, "gate", e))
@@ -661,21 +641,13 @@ def _load_and_remap_checkpoint(model_dir, config):
             if gate_list[0] is not None:
                 w_gate = torch.stack(gate_list, dim=0)  # (E, H, D)
                 w_up = torch.stack(up_list, dim=0)
-                fused = torch.cat([w_gate, w_up], dim=1)  # (E, 2*H, D)
-                num_groups = config.num_experts // G
-                for g in range(num_groups):
-                    chunk = fused[g * G : (g + 1) * G]
-                    state_dict[
-                        f"layers.{layer_idx}.mlp.cond_ffn.gate_up_projs.{g}.weight"
-                    ] = chunk.reshape(-1, chunk.size(-1))
+                state_dict[f"layers.{layer_idx}.mlp.experts.w1_weight"] = torch.cat(
+                    [w_gate, w_up], dim=1
+                )  # (E, 2*H, D)
             if down_list[0] is not None:
-                w_down = torch.stack(down_list, dim=0)  # (E, D, H)
-                num_groups = config.num_experts // G
-                for g in range(num_groups):
-                    chunk = w_down[g * G : (g + 1) * G]
-                    state_dict[
-                        f"layers.{layer_idx}.mlp.cond_ffn.down_projs.{g}.weight"
-                    ] = chunk.reshape(-1, chunk.size(-1))
+                state_dict[f"layers.{layer_idx}.mlp.experts.w2_weight"] = torch.stack(
+                    down_list, dim=0
+                )  # (E, D, H)
         del expert_weights
 
     # Handle tied embeddings
@@ -697,27 +669,15 @@ def _process_checkpoint_key(ckpt_key, tensor, state_dict, expert_weights):
     if norm_key.startswith(("model.visual.", "model.mtp_")):
         return
 
-    # Fused expert weights: split into groups of _EXPERTS_PER_GROUP
+    # Fused expert weights: store directly as [E, N, K] for FusedMoEExperts
     m = _FUSED_EXPERT_RE.match(norm_key)
     if m:
         layer_idx = int(m.group(1))
         proj_name = m.group(2)
-        G = _EXPERTS_PER_GROUP
-        num_groups = tensor.size(0) // G
         if proj_name == "gate_up_proj":
-            # (E, 2*H, D) → groups of (G, 2*H, D) → each (G*2*H, D)
-            for g in range(num_groups):
-                chunk = tensor[g * G : (g + 1) * G]
-                state_dict[
-                    f"layers.{layer_idx}.mlp.cond_ffn.gate_up_projs.{g}.weight"
-                ] = chunk.reshape(-1, chunk.size(-1)).contiguous()
+            state_dict[f"layers.{layer_idx}.mlp.experts.w1_weight"] = tensor
         else:
-            # down_proj: (E, D, H) → groups of (G, D, H) → each (G*D, H)
-            for g in range(num_groups):
-                chunk = tensor[g * G : (g + 1) * G]
-                state_dict[f"layers.{layer_idx}.mlp.cond_ffn.down_projs.{g}.weight"] = (
-                    chunk.reshape(-1, chunk.size(-1)).contiguous()
-                )
+            state_dict[f"layers.{layer_idx}.mlp.experts.w2_weight"] = tensor
         return
 
     # Per-expert weights