Skip to content

Commit 2e3a50c

Browse files
committed
Enable LFM2.5 MLX export and runner build
Add LFM2.5 350M registration, MLX export config, focused regression coverage, and a make target for building the shared Llama C++ runner with MLX. Made-with: Cursor
1 parent 6ba868e commit 2e3a50c

9 files changed

Lines changed: 271 additions & 3 deletions

File tree

CMakePresets.json

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -350,7 +350,10 @@
350350
"CMAKE_BUILD_TYPE": "Release",
351351
"CMAKE_INSTALL_PREFIX": "${sourceDir}/cmake-out",
352352
"ET_MLX_ENABLE_OP_LOGGING": "OFF",
353-
"ET_MIN_LOG_LEVEL": "Error"
353+
"ET_MIN_LOG_LEVEL": "Error",
354+
"EXECUTORCH_BUILD_KERNELS_LLM": "ON",
355+
"EXECUTORCH_BUILD_KERNELS_QUANTIZED": "ON",
356+
"EXECUTORCH_BUILD_KERNELS_OPTIMIZED": "ON"
354357
}
355358
},
356359
{

Makefile

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@
9191
#
9292
# ==============================================================================
9393

94-
.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral-mlx voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal voxtral_realtime-mlx voxtral_tts-cpu voxtral_tts-cuda whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal parakeet-mlx parakeet-vulkan dinov2-cuda dinov2-cuda-debug sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu llava-cpu gemma3-cuda gemma3-cpu gemma4_31b-cuda gemma4_31b-mlx qwen3_5_moe-cuda qwen3_5_moe-metal clean help
94+
.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral-mlx voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal voxtral_realtime-mlx voxtral_tts-cpu voxtral_tts-cuda whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal parakeet-mlx parakeet-vulkan dinov2-cuda dinov2-cuda-debug sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu lfm_2_5-mlx llava-cpu gemma3-cuda gemma3-cpu gemma4_31b-cuda gemma4_31b-mlx qwen3_5_moe-cuda qwen3_5_moe-metal clean help
9595

9696
help:
9797
@echo "This Makefile adds targets to build runners for various models on various backends. Run using \`make <target>\`. Available targets:"
@@ -123,6 +123,7 @@ help:
123123
@echo " llama-cuda - Build Llama runner with CUDA backend"
124124
@echo " llama-cuda-debug - Build Llama runner with CUDA backend (debug mode)"
125125
@echo " llama-cpu - Build Llama runner with CPU backend"
126+
@echo " lfm_2_5-mlx - Build LFM2.5 runner with MLX backend"
126127
@echo " llava-cpu - Build Llava runner with CPU backend"
127128
@echo " gemma3-cuda - Build Gemma3 runner with CUDA backend"
128129
@echo " gemma3-cpu - Build Gemma3 runner with CPU backend"
@@ -373,6 +374,15 @@ llama-cuda-debug:
373374
@echo "✓ Build complete!"
374375
@echo " Binary: cmake-out/examples/models/llama/llama_main"
375376

377+
lfm_2_5-mlx:
378+
@echo "==> Building and installing ExecuTorch with MLX..."
379+
cmake --workflow --preset mlx-release
380+
@echo "==> Building LFM2.5 runner with MLX..."
381+
cd examples/models/llama && cmake --workflow --preset llama-mlx
382+
@echo ""
383+
@echo "✓ Build complete!"
384+
@echo " Binary: cmake-out/examples/models/llama/llama_main"
385+
376386
llava-cpu:
377387
@echo "==> Building and installing ExecuTorch..."
378388
cmake --workflow --preset llm-release

examples/models/lfm2/README.md

Lines changed: 68 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,24 @@ python -m extension.llm.export.export_llm \
4747
+export.output_name="lfm2_5_1_2b_8da4w.pte"
4848
```
4949

50+
Export LFM2.5 350M to MLX on Apple Silicon, quantized with 4-bit weights:
51+
```
52+
python -m extension.llm.export.export_llm \
53+
--config examples/models/lfm2/config/lfm2_mlx_4w.yaml \
54+
+base.model_class="lfm2_5_350m" \
55+
+base.params="examples/models/lfm2/config/lfm2_5_350m_config.json" \
56+
+export.output_name="lfm2_5_350m_mlx_4w.pte"
57+
```
58+
59+
Export LFM2.5 1.2B to MLX on Apple Silicon, quantized with 4-bit weights:
60+
```
61+
python -m extension.llm.export.export_llm \
62+
--config examples/models/lfm2/config/lfm2_mlx_4w.yaml \
63+
+base.model_class="lfm2_5_1_2b" \
64+
+base.params="examples/models/lfm2/config/lfm2_5_1_2b_config.json" \
65+
+export.output_name="lfm2_5_1_2b_mlx_4w.pte"
66+
```
67+
5068
To export with extended context (e.g., 2048 tokens):
5169
```
5270
python -m extension.llm.export.export_llm \
@@ -58,6 +76,17 @@ python -m extension.llm.export.export_llm \
5876
+export.output_name="lfm2_5_1_2b_8da4w.pte"
5977
```
6078
### Example run
79+
For MLX on Apple Silicon, build or install ExecuTorch with MLX enabled. The
80+
easiest local path is:
81+
```
82+
conda activate et-mlx
83+
python install_executorch.py
84+
xcrun -sdk macosx --find metal
85+
```
86+
87+
The `metal` command must resolve to an Xcode path, not fail under standalone
88+
Command Line Tools.
89+
6190
With ExecuTorch pybindings:
6291
```
6392
python -m examples.models.llama.runner.native \
@@ -72,7 +101,31 @@ python -m examples.models.llama.runner.native \
72101
--temperature 0.3
73102
```
74103

75-
With ExecuTorch's sample c++ runner (see the Llama README's [Step 3: Run on your computer to validate](../llama/README.md#step-3-run-on-your-computer-to-validate) to build the runner):
104+
With ExecuTorch pybindings and an LFM2.5 MLX export:
105+
```
106+
python -m examples.models.llama.runner.native \
107+
--model lfm2_5_350m \
108+
--pte lfm2_5_350m_mlx_4w.pte \
109+
--tokenizer ~/.cache/huggingface/hub/models--LiquidAI--LFM2.5-350M/snapshots/<snapshot>/tokenizer.json \
110+
--tokenizer_config ~/.cache/huggingface/hub/models--LiquidAI--LFM2.5-350M/snapshots/<snapshot>/tokenizer_config.json \
111+
--prompt "<|startoftext|><|im_start|>user\nWho are you?<|im_end|>\n<|im_start|>assistant\n" \
112+
--params examples/models/lfm2/config/lfm2_5_350m_config.json \
113+
--max_len 128 \
114+
-kv \
115+
--temperature 0.3
116+
```
117+
118+
Find the Hugging Face cache snapshot directory with:
119+
```
120+
python - <<'PY'
121+
from pathlib import Path
122+
root = Path.home() / ".cache/huggingface/hub/models--LiquidAI--LFM2.5-350M/snapshots"
123+
for path in root.glob("*/tokenizer.json"):
124+
print(path.parent)
125+
PY
126+
```
127+
128+
With ExecuTorch's sample C++ runner (see the Llama README's [Step 3: Run on your computer to validate](../llama/README.md#step-3-run-on-your-computer-to-validate) for general runner details):
76129
```
77130
cmake-out/examples/models/llama/llama_main \
78131
--model_path lfm2_700m_8da4w.pte \
@@ -81,4 +134,18 @@ cmake-out/examples/models/llama/llama_main \
81134
--temperature 0.3
82135
```
83136

137+
Build the C++ runner with MLX support for LFM2.5:
138+
```
139+
make lfm_2_5-mlx
140+
```
141+
142+
Then run an LFM2.5 MLX export with the C++ runner:
143+
```
144+
cmake-out/examples/models/llama/llama_main \
145+
--model_path lfm2_5_350m_mlx_4w.pte \
146+
--tokenizer_path ~/.cache/huggingface/hub/models--LiquidAI--LFM2.5-350M/snapshots/<snapshot>/tokenizer.json \
147+
--prompt="<|startoftext|><|im_start|>user\nWho are you?<|im_end|>\n<|im_start|>assistant\n" \
148+
--temperature 0.3
149+
```
150+
84151
To run the model on an example iOS or Android app, see the Llama README's [Step 5: Build Mobile apps](../llama/README.md#step-5-build-mobile-apps) section.
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
{
2+
"dim": 1024,
3+
"ffn_dim_multiplier": 1,
4+
"hidden_dim": 4608,
5+
"n_heads": 16,
6+
"n_kv_heads": 8,
7+
"n_layers": 16,
8+
"norm_eps": 1e-5,
9+
"rope_theta": 1000000.0,
10+
"use_scaled_rope": false,
11+
"vocab_size": 65536,
12+
"use_hf_rope": true,
13+
"use_qk_norm": true,
14+
"qk_norm_before_rope": true,
15+
"layer_types": [
16+
"conv",
17+
"conv",
18+
"full_attention",
19+
"conv",
20+
"conv",
21+
"full_attention",
22+
"conv",
23+
"conv",
24+
"full_attention",
25+
"conv",
26+
"full_attention",
27+
"conv",
28+
"full_attention",
29+
"conv",
30+
"full_attention",
31+
"conv"
32+
]
33+
}
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
base:
2+
metadata: '{"get_bos_id": 1, "get_eos_ids":[7]}'
3+
4+
model:
5+
use_kv_cache: True
6+
use_sdpa_with_kv_cache: True
7+
dtype_override: bf16
8+
9+
quantization:
10+
qmode: 4w
11+
group_size: 64
12+
13+
backend:
14+
mlx:
15+
enabled: True
Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
import ast
2+
import json
3+
from pathlib import Path
4+
5+
from omegaconf import OmegaConf
6+
7+
8+
# Filesystem anchors shared by the tests in this module, all derived from this
# file's resolved absolute path so they do not depend on the working directory.
# parents[3] is four directory levels up -- presumably the repository root;
# confirm against this file's location in the tree.
REPO_ROOT = Path(__file__).resolve().parents[3]
9+
CONFIG_DIR = REPO_ROOT / "examples" / "models" / "lfm2" / "config"
10+
EXPORT_LLAMA_LIB = REPO_ROOT / "examples" / "models" / "llama" / "export_llama_lib.py"
11+
LLM_CONFIG = REPO_ROOT / "extension" / "llm" / "export" / "config" / "llm_config.py"
12+
13+
14+
def _load_json_config(name: str) -> dict:
    """Load and decode a JSON file from ``CONFIG_DIR``.

    Args:
        name: File name, relative to ``CONFIG_DIR``.

    Returns:
        The decoded JSON document.
    """
    # Decode explicitly as UTF-8 instead of inheriting the platform's locale
    # encoding, so the result is the same on every host.
    with open(CONFIG_DIR / name, "r", encoding="utf-8") as f:
        return json.load(f)
17+
18+
19+
def _module_ast(path: Path) -> ast.Module:
20+
return ast.parse(path.read_text())
21+
22+
23+
def _literal_assignment(module: ast.Module, name: str):
24+
for node in module.body:
25+
if isinstance(node, ast.Assign):
26+
for target in node.targets:
27+
if isinstance(target, ast.Name) and target.id == name:
28+
return ast.literal_eval(node.value)
29+
raise AssertionError(f"{name} not found")
30+
31+
32+
def _class_string_assignments(module: ast.Module, class_name: str) -> dict[str, str]:
33+
for node in module.body:
34+
if isinstance(node, ast.ClassDef) and node.name == class_name:
35+
values = {}
36+
for stmt in node.body:
37+
if (
38+
isinstance(stmt, ast.Assign)
39+
and len(stmt.targets) == 1
40+
and isinstance(stmt.targets[0], ast.Name)
41+
):
42+
values[stmt.targets[0].id] = ast.literal_eval(stmt.value)
43+
return values
44+
raise AssertionError(f"{class_name} not found")
45+
46+
47+
def test_lfm2_5_models_are_registered() -> None:
    """Check both LFM2.5 variants in the three statically parsed registries.

    The registries are ``EXECUTORCH_DEFINED_MODELS`` and
    ``HUGGING_FACE_REPO_IDS`` from the export library source, and the
    ``ModelType`` class from the LLM config source.
    """
    export_lib_tree = _module_ast(EXPORT_LLAMA_LIB)
    model_type_values = _class_string_assignments(
        _module_ast(LLM_CONFIG), "ModelType"
    )
    registered_models = _literal_assignment(
        export_lib_tree, "EXECUTORCH_DEFINED_MODELS"
    )
    repo_ids = _literal_assignment(export_lib_tree, "HUGGING_FACE_REPO_IDS")

    # Model name -> Hugging Face repository it is expected to map to.
    expected_repo_by_model = {
        "lfm2_5_350m": "LiquidAI/LFM2.5-350M",
        "lfm2_5_1_2b": "LiquidAI/LFM2.5-1.2B-Instruct",
    }

    for model_name in expected_repo_by_model:
        assert model_name in registered_models
    for model_name in expected_repo_by_model:
        assert model_type_values[model_name] == model_name
    for model_name, repo in expected_repo_by_model.items():
        assert repo_ids[model_name] == repo
61+
62+
63+
def test_lfm2_5_architecture_configs_match_expected_shapes() -> None:
    """Check per-model shape fields and the settings both LFM2.5 configs share."""
    # Fields that differ between the two model sizes.
    shapes_by_file = {
        "lfm2_5_350m_config.json": dict(
            dim=1024, hidden_dim=4608, n_heads=16, n_kv_heads=8
        ),
        "lfm2_5_1_2b_config.json": dict(
            dim=2048, hidden_dim=8192, n_heads=32, n_kv_heads=8
        ),
    }

    for config_name, shape in shapes_by_file.items():
        loaded = _load_json_config(config_name)

        for field in shape:
            assert loaded[field] == shape[field]

        # Everything below is asserted identically for both configs.
        assert loaded["n_layers"] == 16
        layer_types = loaded["layer_types"]
        assert len(layer_types) == loaded["n_layers"]
        assert layer_types.count("full_attention") == 6
        assert layer_types.count("conv") == 10
        assert loaded["vocab_size"] == 65536
        assert loaded["rope_theta"] == 1000000.0
        assert loaded["use_hf_rope"] is True
        assert loaded["use_qk_norm"] is True
        assert loaded["qk_norm_before_rope"] is True
92+
93+
94+
def test_lfm2_mlx_config_enables_mlx_backend() -> None:
    """Check the metadata, model, quantization and backend values in ``lfm2_mlx_4w.yaml``."""
    export_cfg = OmegaConf.load(CONFIG_DIR / "lfm2_mlx_4w.yaml")

    assert export_cfg.base.metadata == '{"get_bos_id": 1, "get_eos_ids":[7]}'

    model_section = export_cfg.model
    assert model_section.use_kv_cache is True
    assert model_section.use_sdpa_with_kv_cache is True
    assert model_section.dtype_override == "bf16"

    quant_section = export_cfg.quantization
    assert quant_section.qmode == "4w"
    assert quant_section.group_size == 64

    assert export_cfg.backend.mlx.enabled is True

examples/models/llama/CMakePresets.json

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,21 @@
4848
"string": "${hostSystemName}",
4949
"list": ["Linux", "Windows"]
5050
}
51+
},
52+
{
53+
"name": "llama-mlx",
54+
"displayName": "Llama runner with MLX backend",
55+
"binaryDir": "${sourceDir}/../../../cmake-out/examples/models/llama",
56+
"cacheVariables": {
57+
"CMAKE_BUILD_TYPE": "Release",
58+
"CMAKE_FIND_ROOT_PATH": "${sourceDir}/../../../cmake-out",
59+
"EXECUTORCH_BUILD_MLX": "ON"
60+
},
61+
"condition": {
62+
"lhs": "${hostSystemName}",
63+
"type": "equals",
64+
"rhs": "Darwin"
65+
}
5166
}
5267
],
5368
"buildPresets": [
@@ -74,6 +89,12 @@
7489
"displayName": "Build Llama runner with CUDA backend",
7590
"configurePreset": "llama-cuda",
7691
"targets": ["llama_main"]
92+
},
93+
{
94+
"name": "llama-mlx",
95+
"displayName": "Build Llama runner with MLX backend",
96+
"configurePreset": "llama-mlx",
97+
"targets": ["llama_main"]
7798
}
7899
],
79100
"workflowPresets": [
@@ -132,6 +153,20 @@
132153
"name": "llama-cuda"
133154
}
134155
]
156+
},
157+
{
158+
"name": "llama-mlx",
159+
"displayName": "Configure and build Llama runner with MLX backend",
160+
"steps": [
161+
{
162+
"type": "configure",
163+
"name": "llama-mlx"
164+
},
165+
{
166+
"type": "build",
167+
"name": "llama-mlx"
168+
}
169+
]
135170
}
136171
]
137172
}

examples/models/llama/export_llama_lib.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,7 @@
115115
"lfm2_350m", # hybrid
116116
"lfm2_700m", # hybrid
117117
"lfm2_1_2b", # hybrid
118+
"lfm2_5_350m", # hybrid
118119
"lfm2_5_1_2b", # hybrid
119120
]
120121
TORCHTUNE_DEFINED_MODELS = ["llama3_2_vision"]
@@ -133,6 +134,7 @@
133134
"lfm2_350m": "LiquidAI/LFM2-350M",
134135
"lfm2_700m": "LiquidAI/LFM2-700M",
135136
"lfm2_1_2b": "LiquidAI/LFM2-1.2B",
137+
"lfm2_5_350m": "LiquidAI/LFM2.5-350M",
136138
"lfm2_5_1_2b": "LiquidAI/LFM2.5-1.2B-Instruct",
137139
}
138140

extension/llm/export/config/llm_config.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ class ModelType(str, Enum):
5252
lfm2_350m = "lfm2_350m"
5353
lfm2_700m = "lfm2_700m"
5454
lfm2_1_2b = "lfm2_1_2b"
55+
lfm2_5_350m = "lfm2_5_350m"
5556
lfm2_5_1_2b = "lfm2_5_1_2b"
5657

5758

0 commit comments

Comments
 (0)