Enable LFM2.5 MLX export and runner build

seyeong-han · seyeong-han · commit 2e3a50c2e275 · 2026-05-20T12:37:08.000-07:00
Add LFM2.5 350M registration, MLX export config, focused regression coverage, and a make target for building the shared Llama C++ runner with MLX.

Made-with: Cursor
diff --git a/CMakePresets.json b/CMakePresets.json
@@ -350,7 +350,10 @@
         "CMAKE_BUILD_TYPE": "Release",
         "CMAKE_INSTALL_PREFIX": "${sourceDir}/cmake-out",
         "ET_MLX_ENABLE_OP_LOGGING": "OFF",
-        "ET_MIN_LOG_LEVEL": "Error"
+        "ET_MIN_LOG_LEVEL": "Error",
+        "EXECUTORCH_BUILD_KERNELS_LLM": "ON",
+        "EXECUTORCH_BUILD_KERNELS_QUANTIZED": "ON",
+        "EXECUTORCH_BUILD_KERNELS_OPTIMIZED": "ON"
       }
     },
     {
diff --git a/Makefile b/Makefile
@@ -91,7 +91,7 @@
 #
 # ==============================================================================
 
-.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral-mlx voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal voxtral_realtime-mlx voxtral_tts-cpu voxtral_tts-cuda whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal parakeet-mlx parakeet-vulkan dinov2-cuda dinov2-cuda-debug sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu llava-cpu gemma3-cuda gemma3-cpu gemma4_31b-cuda gemma4_31b-mlx qwen3_5_moe-cuda qwen3_5_moe-metal clean help
+.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral-mlx voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal voxtral_realtime-mlx voxtral_tts-cpu voxtral_tts-cuda whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal parakeet-mlx parakeet-vulkan dinov2-cuda dinov2-cuda-debug sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu lfm_2_5-mlx llava-cpu gemma3-cuda gemma3-cpu gemma4_31b-cuda gemma4_31b-mlx qwen3_5_moe-cuda qwen3_5_moe-metal clean help
 
 help:
 	@echo "This Makefile adds targets to build runners for various models on various backends. Run using \`make <target>\`. Available targets:"
@@ -123,6 +123,7 @@ help:
 	@echo "  llama-cuda          - Build Llama runner with CUDA backend"
 	@echo "  llama-cuda-debug    - Build Llama runner with CUDA backend (debug mode)"
 	@echo "  llama-cpu           - Build Llama runner with CPU backend"
+	@echo "  lfm_2_5-mlx         - Build LFM2.5 runner with MLX backend"
 	@echo "  llava-cpu           - Build Llava runner with CPU backend"
 	@echo "  gemma3-cuda         - Build Gemma3 runner with CUDA backend"
 	@echo "  gemma3-cpu          - Build Gemma3 runner with CPU backend"
@@ -373,6 +374,15 @@ llama-cuda-debug:
 	@echo "✓ Build complete!"
 	@echo "  Binary: cmake-out/examples/models/llama/llama_main"
 
+# Build the llama_main C++ runner with the MLX backend for LFM2.5 exports.
+# Step 1 runs the top-level "mlx-release" CMake workflow preset; step 2 runs the
+# "llama-mlx" workflow preset from examples/models/llama. The `cd` is chained
+# with && on one recipe line because each recipe line runs in its own shell.
+lfm_2_5-mlx:
+	@echo "==> Building and installing ExecuTorch with MLX..."
+	cmake --workflow --preset mlx-release
+	@echo "==> Building LFM2.5 runner with MLX..."
+	cd examples/models/llama && cmake --workflow --preset llama-mlx
+	@echo ""
+	@echo "✓ Build complete!"
+	@echo "  Binary: cmake-out/examples/models/llama/llama_main"
+
 llava-cpu:
 	@echo "==> Building and installing ExecuTorch..."
 	cmake --workflow --preset llm-release
diff --git a/examples/models/lfm2/README.md b/examples/models/lfm2/README.md
@@ -47,6 +47,24 @@ python -m extension.llm.export.export_llm \
   +export.output_name="lfm2_5_1_2b_8da4w.pte"
 ```
 
+Export LFM2.5 350M to MLX on Apple Silicon, quantized with 4-bit weights:
+```
+python -m extension.llm.export.export_llm \
+  --config examples/models/lfm2/config/lfm2_mlx_4w.yaml \
+  +base.model_class="lfm2_5_350m" \
+  +base.params="examples/models/lfm2/config/lfm2_5_350m_config.json" \
+  +export.output_name="lfm2_5_350m_mlx_4w.pte"
+```
+
+Export LFM2.5 1.2B to MLX on Apple Silicon, quantized with 4-bit weights:
+```
+python -m extension.llm.export.export_llm \
+  --config examples/models/lfm2/config/lfm2_mlx_4w.yaml \
+  +base.model_class="lfm2_5_1_2b" \
+  +base.params="examples/models/lfm2/config/lfm2_5_1_2b_config.json" \
+  +export.output_name="lfm2_5_1_2b_mlx_4w.pte"
+```
+
 To export with extended context (e.g., 2048 tokens):
 ```
 python -m extension.llm.export.export_llm \
@@ -58,6 +76,17 @@ python -m extension.llm.export.export_llm \
   +export.output_name="lfm2_5_1_2b_8da4w.pte"
 ```
 ### Example run
+For MLX on Apple Silicon, build or install ExecuTorch with MLX enabled. The
+easiest local path is:
+```
+conda activate et-mlx
+python install_executorch.py
+xcrun -sdk macosx --find metal
+```
+
+The `xcrun` lookup must print the path to a `metal` compiler inside an Xcode
+installation; it fails when only the standalone Command Line Tools are installed.
+
 With ExecuTorch pybindings:
 ```
 python -m examples.models.llama.runner.native \
@@ -72,7 +101,31 @@ python -m examples.models.llama.runner.native \
   --temperature 0.3
 ```
 
-With ExecuTorch's sample c++ runner (see the Llama README's [Step 3: Run on your computer to validate](../llama/README.md#step-3-run-on-your-computer-to-validate) to build the runner):
+With ExecuTorch pybindings and an LFM2.5 MLX export:
+```
+python -m examples.models.llama.runner.native \
+  --model lfm2_5_350m \
+  --pte lfm2_5_350m_mlx_4w.pte \
+  --tokenizer ~/.cache/huggingface/hub/models--LiquidAI--LFM2.5-350M/snapshots/<snapshot>/tokenizer.json \
+  --tokenizer_config ~/.cache/huggingface/hub/models--LiquidAI--LFM2.5-350M/snapshots/<snapshot>/tokenizer_config.json \
+  --prompt "<|startoftext|><|im_start|>user\nWho are you?<|im_end|>\n<|im_start|>assistant\n" \
+  --params examples/models/lfm2/config/lfm2_5_350m_config.json \
+  --max_len 128 \
+  -kv \
+  --temperature 0.3
+```
+
+To find the Hugging Face cache snapshot directory (the path ending in `snapshots/<snapshot>` used above), run:
+```
+python - <<'PY'
+from pathlib import Path
+root = Path.home() / ".cache/huggingface/hub/models--LiquidAI--LFM2.5-350M/snapshots"
+for path in root.glob("*/tokenizer.json"):
+    print(path.parent)
+PY
+```
+
+With ExecuTorch's sample c++ runner (see the Llama README's [Step 3: Run on your computer to validate](../llama/README.md#step-3-run-on-your-computer-to-validate) for general runner details):
 ```
 cmake-out/examples/models/llama/llama_main \
   --model_path lfm2_700m_8da4w.pte \
@@ -81,4 +134,18 @@ cmake-out/examples/models/llama/llama_main \
   --temperature 0.3
 ```
 
+Build the C++ runner with MLX support for LFM2.5:
+```
+make lfm_2_5-mlx
+```
+
+Then run an LFM2.5 MLX export with the C++ runner:
+```
+cmake-out/examples/models/llama/llama_main \
+  --model_path lfm2_5_350m_mlx_4w.pte \
+  --tokenizer_path ~/.cache/huggingface/hub/models--LiquidAI--LFM2.5-350M/snapshots/<snapshot>/tokenizer.json \
+  --prompt="<|startoftext|><|im_start|>user\nWho are you?<|im_end|>\n<|im_start|>assistant\n" \
+  --temperature 0.3
+```
+
 To run the model on an example iOS or Android app, see the Llama README's [Step 5: Build Mobile apps](../llama/README.md#step-5-build-mobile-apps) section.
diff --git a/examples/models/lfm2/config/lfm2_5_350m_config.json b/examples/models/lfm2/config/lfm2_5_350m_config.json
@@ -0,0 +1,33 @@
+{
+  "dim": 1024,
+  "ffn_dim_multiplier": 1,
+  "hidden_dim": 4608,
+  "n_heads": 16,
+  "n_kv_heads": 8,
+  "n_layers": 16,
+  "norm_eps": 1e-5,
+  "rope_theta": 1000000.0,
+  "use_scaled_rope": false,
+  "vocab_size": 65536,
+  "use_hf_rope": true,
+  "use_qk_norm": true,
+  "qk_norm_before_rope": true,
+  "layer_types": [
+    "conv",
+    "conv",
+    "full_attention",
+    "conv",
+    "conv",
+    "full_attention",
+    "conv",
+    "conv",
+    "full_attention",
+    "conv",
+    "full_attention",
+    "conv",
+    "full_attention",
+    "conv",
+    "full_attention",
+    "conv"
+  ]
+}
diff --git a/examples/models/lfm2/config/lfm2_mlx_4w.yaml b/examples/models/lfm2/config/lfm2_mlx_4w.yaml
@@ -0,0 +1,15 @@
+base:
+  metadata: '{"get_bos_id": 1, "get_eos_ids":[7]}'
+
+model:
+  use_kv_cache: True
+  use_sdpa_with_kv_cache: True
+  dtype_override: bf16
+
+quantization:
+  qmode: 4w
+  group_size: 64
+
+backend:
+  mlx:
+    enabled: True
diff --git a/examples/models/lfm2/test_lfm2_5_mlx.py b/examples/models/lfm2/test_lfm2_5_mlx.py
@@ -0,0 +1,102 @@
+import ast
+import json
+from pathlib import Path
+
+from omegaconf import OmegaConf
+
+
+REPO_ROOT = Path(__file__).resolve().parents[3]
+CONFIG_DIR = REPO_ROOT / "examples" / "models" / "lfm2" / "config"
+EXPORT_LLAMA_LIB = REPO_ROOT / "examples" / "models" / "llama" / "export_llama_lib.py"
+LLM_CONFIG = REPO_ROOT / "extension" / "llm" / "export" / "config" / "llm_config.py"
+
+
+def _load_json_config(name: str) -> dict:
+    """Read ``name`` from ``CONFIG_DIR`` and return the parsed JSON document.
+
+    Args:
+        name: File name relative to ``CONFIG_DIR``, e.g.
+            ``"lfm2_5_350m_config.json"``.
+
+    Returns:
+        The object produced by ``json.load`` for that file.
+
+    Raises:
+        OSError: If the file cannot be opened.
+        json.JSONDecodeError: If the file does not contain valid JSON.
+    """
+    with open(CONFIG_DIR / name, "r") as f:
+        return json.load(f)
+
+
+def _module_ast(path: Path) -> ast.Module:
+    """Parse the Python source file at ``path`` into an ``ast.Module``.
+
+    The file is only read and parsed; it is never imported or executed.
+    """
+    return ast.parse(path.read_text())
+
+
+def _literal_assignment(module: ast.Module, name: str):
+    """Return the literal value assigned to the module-level variable ``name``.
+
+    Only top-level plain assignments (``ast.Assign``) whose target is a bare
+    name are considered; annotated assignments (``ast.AnnAssign``) and
+    assignments nested inside functions or classes are not matched. The first
+    matching statement wins.
+
+    Args:
+        module: Parsed module to search.
+        name: Variable name to look up.
+
+    Returns:
+        The right-hand side evaluated with ``ast.literal_eval``.
+
+    Raises:
+        AssertionError: If no matching assignment exists.
+        ValueError: If the right-hand side is not a Python literal.
+    """
+    for node in module.body:
+        if isinstance(node, ast.Assign):
+            # A single statement may bind several targets (``a = b = ...``).
+            for target in node.targets:
+                if isinstance(target, ast.Name) and target.id == name:
+                    return ast.literal_eval(node.value)
+    raise AssertionError(f"{name} not found")
+
+
+def _class_string_assignments(module: ast.Module, class_name: str) -> dict[str, str]:
+    """Map attribute names to literal values for a top-level class body.
+
+    Looks at the first top-level class called ``class_name`` and collects every
+    direct single-target assignment to a bare name in its body. Statements with
+    several targets, non-name targets, or annotations are skipped.
+
+    Args:
+        module: Parsed module to search.
+        class_name: Name of the class whose body is inspected.
+
+    Returns:
+        ``{attribute_name: value}`` with each value obtained through
+        ``ast.literal_eval``. Values are not checked to be strings, despite
+        the return annotation.
+
+    Raises:
+        AssertionError: If the module has no top-level class ``class_name``.
+        ValueError: If a collected right-hand side is not a Python literal.
+    """
+    for node in module.body:
+        if isinstance(node, ast.ClassDef) and node.name == class_name:
+            values = {}
+            for stmt in node.body:
+                if (
+                    isinstance(stmt, ast.Assign)
+                    and len(stmt.targets) == 1
+                    and isinstance(stmt.targets[0], ast.Name)
+                ):
+                    values[stmt.targets[0].id] = ast.literal_eval(stmt.value)
+            return values
+    raise AssertionError(f"{class_name} not found")
+
+
+def test_lfm2_5_models_are_registered() -> None:
+    """Check that both LFM2.5 variants are registered in the export code.
+
+    The registries are read from the parsed source of ``EXPORT_LLAMA_LIB`` and
+    ``LLM_CONFIG`` rather than by importing those modules.
+    """
+    export_module = _module_ast(EXPORT_LLAMA_LIB)
+    model_types = _class_string_assignments(_module_ast(LLM_CONFIG), "ModelType")
+    executor_defined_models = _literal_assignment(
+        export_module, "EXECUTORCH_DEFINED_MODELS"
+    )
+    hf_repo_ids = _literal_assignment(export_module, "HUGGING_FACE_REPO_IDS")
+
+    # Each variant must appear in the model list, the ModelType enum (member
+    # name equal to its value), and the Hugging Face repo-id table.
+    assert "lfm2_5_350m" in executor_defined_models
+    assert "lfm2_5_1_2b" in executor_defined_models
+    assert model_types["lfm2_5_350m"] == "lfm2_5_350m"
+    assert model_types["lfm2_5_1_2b"] == "lfm2_5_1_2b"
+    assert hf_repo_ids["lfm2_5_350m"] == "LiquidAI/LFM2.5-350M"
+    assert hf_repo_ids["lfm2_5_1_2b"] == "LiquidAI/LFM2.5-1.2B-Instruct"
+
+
+def test_lfm2_5_architecture_configs_match_expected_shapes() -> None:
+    """Check the LFM2.5 JSON configs against the expected architecture values.
+
+    ``expected`` holds the per-model fields; the remaining assertions in the
+    loop are applied identically to every listed config file.
+    """
+    # Per-file values that differ between the 350M and 1.2B variants.
+    expected = {
+        "lfm2_5_350m_config.json": {
+            "dim": 1024,
+            "hidden_dim": 4608,
+            "n_heads": 16,
+            "n_kv_heads": 8,
+        },
+        "lfm2_5_1_2b_config.json": {
+            "dim": 2048,
+            "hidden_dim": 8192,
+            "n_heads": 32,
+            "n_kv_heads": 8,
+        },
+    }
+
+    for filename, expected_fields in expected.items():
+        cfg = _load_json_config(filename)
+        for key, value in expected_fields.items():
+            assert cfg[key] == value
+        # Shared layout: 16 layers, split into 6 attention and 10 conv layers,
+        # with one "layer_types" entry per layer.
+        assert cfg["n_layers"] == 16
+        assert len(cfg["layer_types"]) == cfg["n_layers"]
+        assert cfg["layer_types"].count("full_attention") == 6
+        assert cfg["layer_types"].count("conv") == 10
+        # Shared vocabulary, RoPE, and QK-norm settings.
+        assert cfg["vocab_size"] == 65536
+        assert cfg["rope_theta"] == 1000000.0
+        assert cfg["use_hf_rope"] is True
+        assert cfg["use_qk_norm"] is True
+        assert cfg["qk_norm_before_rope"] is True
+
+
+def test_lfm2_mlx_config_enables_mlx_backend() -> None:
+    """Check every setting in ``lfm2_mlx_4w.yaml`` as loaded by OmegaConf."""
+    cfg = OmegaConf.load(CONFIG_DIR / "lfm2_mlx_4w.yaml")
+    # The metadata is compared as the raw string, so whitespace changes inside
+    # the embedded JSON would fail this assertion.
+    assert cfg.base.metadata == '{"get_bos_id": 1, "get_eos_ids":[7]}'
+    assert cfg.model.use_kv_cache is True
+    assert cfg.model.use_sdpa_with_kv_cache is True
+    assert cfg.model.dtype_override == "bf16"
+    assert cfg.quantization.qmode == "4w"
+    assert cfg.quantization.group_size == 64
+    assert cfg.backend.mlx.enabled is True
diff --git a/examples/models/llama/CMakePresets.json b/examples/models/llama/CMakePresets.json
@@ -48,6 +48,21 @@
                 "string": "${hostSystemName}",
                 "list": ["Linux", "Windows"]
             }
+        },
+        {
+            "name": "llama-mlx",
+            "displayName": "Llama runner with MLX backend",
+            "binaryDir": "${sourceDir}/../../../cmake-out/examples/models/llama",
+            "cacheVariables": {
+                "CMAKE_BUILD_TYPE": "Release",
+                "CMAKE_FIND_ROOT_PATH": "${sourceDir}/../../../cmake-out",
+                "EXECUTORCH_BUILD_MLX": "ON"
+            },
+            "condition": {
+                "lhs": "${hostSystemName}",
+                "type": "equals",
+                "rhs": "Darwin"
+            }
         }
     ],
     "buildPresets": [
@@ -74,6 +89,12 @@
             "displayName": "Build Llama runner with CUDA backend",
             "configurePreset": "llama-cuda",
             "targets": ["llama_main"]
+        },
+        {
+            "name": "llama-mlx",
+            "displayName": "Build Llama runner with MLX backend",
+            "configurePreset": "llama-mlx",
+            "targets": ["llama_main"]
         }
     ],
     "workflowPresets": [
@@ -132,6 +153,20 @@
                     "name": "llama-cuda"
                 }
             ]
+        },
+        {
+            "name": "llama-mlx",
+            "displayName": "Configure and build Llama runner with MLX backend",
+            "steps": [
+                {
+                    "type": "configure",
+                    "name": "llama-mlx"
+                },
+                {
+                    "type": "build",
+                    "name": "llama-mlx"
+                }
+            ]
         }
     ]
 }
diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
@@ -115,6 +115,7 @@
     "lfm2_350m",  # hybrid
     "lfm2_700m",  # hybrid
     "lfm2_1_2b",  # hybrid
+    "lfm2_5_350m",  # hybrid
     "lfm2_5_1_2b",  # hybrid
 ]
 TORCHTUNE_DEFINED_MODELS = ["llama3_2_vision"]
@@ -133,6 +134,7 @@
     "lfm2_350m": "LiquidAI/LFM2-350M",
     "lfm2_700m": "LiquidAI/LFM2-700M",
     "lfm2_1_2b": "LiquidAI/LFM2-1.2B",
+    "lfm2_5_350m": "LiquidAI/LFM2.5-350M",
     "lfm2_5_1_2b": "LiquidAI/LFM2.5-1.2B-Instruct",
 }
 
diff --git a/extension/llm/export/config/llm_config.py b/extension/llm/export/config/llm_config.py
@@ -52,6 +52,7 @@ class ModelType(str, Enum):
     lfm2_350m = "lfm2_350m"
     lfm2_700m = "lfm2_700m"
     lfm2_1_2b = "lfm2_1_2b"
+    lfm2_5_350m = "lfm2_5_350m"
     lfm2_5_1_2b = "lfm2_5_1_2b"
 
 

Original file line number	Diff line number	Diff line change
`@@ -350,7 +350,10 @@`
`350`	`350`	`"CMAKE_BUILD_TYPE": "Release",`
`351`	`351`	`"CMAKE_INSTALL_PREFIX": "${sourceDir}/cmake-out",`
`352`	`352`	`"ET_MLX_ENABLE_OP_LOGGING": "OFF",`
`353`		`- "ET_MIN_LOG_LEVEL": "Error"`
	`353`	`+ "ET_MIN_LOG_LEVEL": "Error",`
	`354`	`+ "EXECUTORCH_BUILD_KERNELS_LLM": "ON",`
	`355`	`+ "EXECUTORCH_BUILD_KERNELS_QUANTIZED": "ON",`
	`356`	`+ "EXECUTORCH_BUILD_KERNELS_OPTIMIZED": "ON"`
`354`	`357`	`}`
`355`	`358`	`},`
`356`	`359`	`{`
Original file line number	Diff line number	Diff line change
`@@ -115,6 +115,7 @@`
`115`	`115`	`"lfm2_350m", # hybrid`
`116`	`116`	`"lfm2_700m", # hybrid`
`117`	`117`	`"lfm2_1_2b", # hybrid`
	`118`	`+ "lfm2_5_350m", # hybrid`
`118`	`119`	`"lfm2_5_1_2b", # hybrid`
`119`	`120`	`]`
`120`	`121`	`TORCHTUNE_DEFINED_MODELS = ["llama3_2_vision"]`
`@@ -133,6 +134,7 @@`
`133`	`134`	`"lfm2_350m": "LiquidAI/LFM2-350M",`
`134`	`135`	`"lfm2_700m": "LiquidAI/LFM2-700M",`
`135`	`136`	`"lfm2_1_2b": "LiquidAI/LFM2-1.2B",`
	`137`	`+ "lfm2_5_350m": "LiquidAI/LFM2.5-350M",`
`136`	`138`	`"lfm2_5_1_2b": "LiquidAI/LFM2.5-1.2B-Instruct",`
`137`	`139`	`}`
`138`	`140`