Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions src/maxtext/checkpoint_conversion/to_huggingface.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,10 @@ def _validate_or_update_architecture(hf_config, max_config, override: bool):
if isinstance(mt_value, int):
mt_value = mt_value * 2

# Special handling for Qwen3-MoE: hf.intermediate_size is the aggregated dense MLP dimension, while mt.mlp_dim is the per-expert dimension
if "qwen3" in max_config.model_name and getattr(max_config, "num_experts", 0) > 1 and hf_attr == "intermediate_size":
mt_value = mt_value * getattr(max_config, "num_experts_per_tok", 1)

# Handle vocab size padding
if hf_attr == "vocab_size" and isinstance(mt_value, int) and isinstance(hf_value, int):
# MaxText often pads vocab size to a multiple of 128 or 256 for TPU efficiency
Expand Down
2 changes: 1 addition & 1 deletion src/maxtext/checkpoint_conversion/to_maxtext.py
Original file line number Diff line number Diff line change
Expand Up @@ -813,7 +813,7 @@ def _eager_getter(key):
"--eager_load_method",
type=str,
required=False,
default="transformers",
default="safetensors",
choices=["transformers", "safetensors"],
help="Backend to use for eager loading: `transformers_class.from_pretrained` or `safetensors.safe_open` with pt",
)
Expand Down
2 changes: 1 addition & 1 deletion src/maxtext/checkpoint_conversion/utils/hf_shape.py
Original file line number Diff line number Diff line change
Expand Up @@ -716,7 +716,7 @@ def QWEN_HF_WEIGHTS_TO_SHAPE(config):
}

# Determine if the model is MoE based on config keys
num_experts = config.get("num_experts", 0)
num_experts = config.get("num_experts", config.get("num_local_experts", 0))

for layer_idx in range(num_hidden_layers):
layer_prefix = f"model.layers.{layer_idx}"
Expand Down
4 changes: 2 additions & 2 deletions src/maxtext/checkpoint_conversion/utils/param_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -607,7 +607,7 @@ def QWEN_MAXTEXT_TO_HF_PARAM_MAPPING(config, maxtext_config, scan_layers=False):
or scanned with expert stacking (nested list of strings).
"""
n_layers = config["num_hidden_layers"]
num_experts = config.get("num_experts", 0)
num_experts = config.get("num_experts", config.get("num_local_experts", 0))

mapping = {
"params-token_embedder-embedding": "model.embed_tokens.weight",
Expand Down Expand Up @@ -753,7 +753,7 @@ def QWEN_MAXTEXT_TO_HF_PARAM_HOOK_FN(config, maxtext_config, scan_layers=False,
transformation functions.
"""
n_layers = config["num_hidden_layers"]
num_experts = config.get("num_experts", 0)
num_experts = config.get("num_experts", config.get("num_local_experts", 0))

def pad_embedding_layer(input_tensor, target_shape):
"""Pads or truncates embedding layer to match target vocab size."""
Expand Down
Loading