Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
1710 commits
Select commit Hold shift + click to select a range
5b93480
perf(mlx): eliminate KV restore doubling — native-slab layers skip de…
Snider May 30, 2026
b1bba1f
docs(goal): trim the dated journey-log — keep live goal + current sta…
Snider May 30, 2026
9661b00
merge(ax11): zero-alloc tokenizer decode + 416MB KV-restore doubling fix
Snider May 30, 2026
e0a7714
fix(metal): nil-guard model close paths to surface real Metal load er…
Snider May 30, 2026
14fca7a
fix(admin): use filepath.Rel containment for model-path escape gate (…
Snider May 30, 2026
89877a4
fix(admin): bound download job registry with oldest-first eviction (M…
Snider May 30, 2026
80b73f3
fix(admin): wire size-drift detection to structured warning (Mantis #…
Snider May 30, 2026
66a3d2b
fix(admin): reject dotfile segments in HF entry paths (Mantis #1786)
Snider May 30, 2026
ef1d4ac
fix(admin): sort .sha256 manifest entries for deterministic output (M…
Snider May 30, 2026
6f34293
fix(serve): preserve auto-tuned load opts across hot-swap reload (Man…
Snider May 30, 2026
ef0fbbc
feat(profile): expose decode bandwidth proxy
Snider May 31, 2026
b53e612
test(kv): guard native State payload bytes
Snider May 31, 2026
e5287c5
feat(api): lock official gemma4 e2b snapshots
Snider May 31, 2026
a6767b7
feat(api): select gemma4 e2b quant tier
Snider May 31, 2026
efed371
fix(model): route official gemma4 e2b to text path
Snider May 31, 2026
d35eb39
feat(metal): draft gemma4 ordered assistant logits
Snider May 31, 2026
5f8904e
feat(api): expose gemma4 mtp profile metrics
Snider May 31, 2026
3d16eaa
test(metal): prove gemma4 ordered assistant logits
Snider May 31, 2026
905d99c
feat(bench): label gemma4 assistant mtp metrics
Snider May 31, 2026
902e661
feat(memory): prefer q6 for gemma4 small models
Snider May 31, 2026
07b7be1
feat(api): expose official gemma4 e2b locks
Snider May 31, 2026
26d8173
feat(api): surface gemma4 mtp assistant policy
Snider May 31, 2026
f6becb0
feat(memory): add turboquant kv mode contract
Snider May 31, 2026
4fbcabd
fix(metal): fail closed on turboquant snapshots
Snider May 31, 2026
b021980
feat(api): verify official gemma4 snapshots
Snider May 31, 2026
eb71af6
feat(api): add official gemma4 verifier command
Snider May 31, 2026
7db9e0f
feat(api): default gemma4 lane to q6
Snider May 31, 2026
0f58b34
feat(cli): preflight official gemma4 snapshots
Snider May 31, 2026
6b16ec9
feat(api): verify official gemma4 assistant pairs
Snider May 31, 2026
8bb2639
feat(memory): expose gemma4 quant ladder
Snider May 31, 2026
4995b81
docs(runtime): record official gemma4 e2b locks
Snider May 31, 2026
01ff019
fix(runtime): resolve official gemma4 cache roots
Snider May 31, 2026
851abc1
fix(runtime): resolve official gemma4 pair snapshots
Snider May 31, 2026
d3e4cab
fix(runtime): resolve official gemma4 speculative paths
Snider May 31, 2026
be55fb0
test(runtime): tighten official gemma4 cache roots
Snider May 31, 2026
08a1d39
docs(runtime): align quantization guidance with q6 default
Snider May 31, 2026
844819d
fix(metal): accept matrix token ordering for gemma4 assistant
Snider May 31, 2026
fea5fea
docs(runtime): map turboquant onto kv state
Snider May 31, 2026
f91a54a
feat(metal): define turboquant kv page layout
Snider May 31, 2026
255eaad
feat(api): compare official gemma4 e2b control metadata
Snider May 31, 2026
36c8b98
docs(runtime): record official gemma4 e2b local preflight
Snider May 31, 2026
93bf673
feat(api): expose production quantization selection
Snider May 31, 2026
c3acf1f
feat(bench): label target-only mtp summary metrics
Snider May 31, 2026
5fd5429
fix(bench): align gemma4 mtp prompt controls
Snider May 31, 2026
47ef9c4
perf(metal): normalise gemma4 assistant token ordering
Snider May 31, 2026
51c9ec9
perf(metal): select assistant drafts from sparse logits
Snider May 31, 2026
12b9659
fix(cli): plan production quantisation for retained context
Snider May 31, 2026
118ccdc
fix(memory): keep source snapshot quantisation honest
Snider May 31, 2026
3b9412f
feat(api): gate production mtp promotion
Snider May 31, 2026
9854078
feat(cli): compare production mtp profiles
Snider May 31, 2026
90802c1
fix(cli): align mtp draft token default
Snider May 31, 2026
71197f5
fix(cli): trim official gemma4 json reports
Snider May 31, 2026
94d663a
feat(api): lock production gemma4 quant packs
Snider May 31, 2026
0c780db
feat(cli): report production quant pack locks
Snider May 31, 2026
b1202d2
fix(api): use production mtp draft default
Snider May 31, 2026
83f179b
fix(metal): use production gemma4 assistant draft default
Snider May 31, 2026
e564b7f
fix(speculative): use production mtp profile default
Snider May 31, 2026
c7c0ab5
feat(cli): expose mtp comparison evidence
Snider May 31, 2026
f88c17d
build(darwin): link binaries for macos 26
Snider May 31, 2026
efa9f33
feat(cli): expose official gemma4 locks
Snider May 31, 2026
4963f9c
feat(metal): account turboquant kv payload bytes
Snider May 31, 2026
d1e3de6
feat(metal): add turboquant mse reference codec
Snider May 31, 2026
ab2f759
feat(metal): add turboquant prod reference estimator
Snider May 31, 2026
b80dbff
test(metal): benchmark turboquant reference codec
Snider May 31, 2026
594ec61
feat(metal): add turboquant reference page codec
Snider May 31, 2026
58b9a11
test(metal): centre turboquant qjl estimator error
Snider May 31, 2026
1f37723
feat(metal): pack turboquant reference bitstreams
Snider May 31, 2026
d2550fd
feat(metal): pack turboquant page payloads
Snider May 31, 2026
7df6016
feat(metal): restore turboquant payload arrays
Snider May 31, 2026
aecab98
feat(metal): add turboquant reference cache
Snider May 31, 2026
63ca28c
feat(metal): persist turboquant kv snapshot payloads
Snider May 31, 2026
6539403
feat(kv): preserve turboquant state blocks
Snider May 31, 2026
a4a204d
docs(repo): cite macos 26 metal api floor
Snider May 31, 2026
73ddb40
feat(api): lock q4 production fallback pack
Snider May 31, 2026
a457e21
docs(repo): link macos 26 api floor
Snider May 31, 2026
be2bdd2
fix(api): require explicit mtp draft evidence
Snider May 31, 2026
77d6d98
feat(api): gate turboquant production promotion
Snider May 31, 2026
a66adac
feat(cli): expose turboquant promotion policy
Snider May 31, 2026
3318ec6
docs(operator): link macos 26 metal api floor
Snider May 31, 2026
4df03c6
feat(cli): compare turboquant driver profiles
Snider May 31, 2026
e1fe597
feat(cli): include source locks in quantization profile
Snider May 31, 2026
a37c1f2
docs(api): lock macos 26 metal sources
Snider May 31, 2026
e262d0d
feat(api): require mtp operational evidence
Snider May 31, 2026
6a34883
feat(cli): include derived gemma4 locks
Snider May 31, 2026
0edd2c9
docs(runtime): sync gemma4 source locks
Snider May 31, 2026
413272b
fix(runtime): require turboquant throughput evidence
Snider May 31, 2026
b875113
fix(runtime): require mtp draft accounting
Snider May 31, 2026
710ced4
fix(runtime): require mtp draft schedule evidence
Snider May 31, 2026
9e6d9d4
fix(runtime): require mtp throughput breakdown
Snider May 31, 2026
80f987b
docs(runtime): add metal feature availability source
Snider May 31, 2026
1f73f65
fix(cli): flag missing mtp benchmark metrics
Snider May 31, 2026
8c84802
fix(runtime): expose gemma4 assistant layer mapping
Snider May 31, 2026
faebfb4
fix(cli): flag missing turboquant metrics
Snider May 31, 2026
9a6b104
fix(api): expose quantization ladder evidence
Snider May 31, 2026
f32bcae
fix(cli): flag missing mtp metrics
Snider May 31, 2026
82b61fe
docs(platform): lock metal feature table source
Snider May 31, 2026
591d997
feat(api): expose official assistant tensor evidence
Snider May 31, 2026
bb3ffa2
fix(cli): require mtp draft sweep evidence
Snider May 31, 2026
590e3ce
feat(api): report official state cache parity
Snider May 31, 2026
1e95a5d
fix(cli): restore shared-mask fast lane default
Snider May 31, 2026
9a1e634
feat(api): record quant pack conversion evidence
Snider May 31, 2026
88611a1
feat(api): expose quantisation throughput model
Snider May 31, 2026
c281b7e
fix(api): require mtp draft call evidence
Snider May 31, 2026
9e3c3b9
fix(cli): gate mtp compare on load policy
Snider May 31, 2026
0a38267
fix(api): require mtp load policy metrics
Snider May 31, 2026
dfdd4ba
fix(cli): gate turboquant compare on load policy
Snider May 31, 2026
83c8264
fix(api): expose load policy promotion evidence
Snider May 31, 2026
5c89005
fix(api): enforce load policy promotion evidence
Snider May 31, 2026
8dc43c2
fix(api): report input output throughput evidence
Snider May 31, 2026
1c3c986
docs(platform): lock macOS 26 API source
Snider May 31, 2026
22df269
fix(metal): honour turboquant outlier bit budgets
Snider May 31, 2026
e683804
fix(metal): record turboquant outlier policy
Snider May 31, 2026
20ea267
fix(metal): record turboquant norm policy
Snider May 31, 2026
835f8c5
fix(api): require turboquant active memory evidence
Snider May 31, 2026
7090e9e
fix(api): require mtp active memory evidence
Snider May 31, 2026
457e42c
fix(api): gate turboquant on active memory savings
Snider May 31, 2026
8fbd8d1
fix(api): gate combined mtp turboquant promotion
Snider May 31, 2026
2f4b7fb
feat(cli): combine mtp turboquant promotion reports
Snider May 31, 2026
71212a8
fix(api): require assistant layout evidence for mtp promotion
Snider May 31, 2026
3ee2da5
feat(cli): derive mtp assistant evidence from pair report
Snider May 31, 2026
0cae72b
fix(api): require turboquant layout evidence for promotion
Snider May 31, 2026
4dbe0fb
fix(api): expose macos 26 api provenance
Snider May 31, 2026
4167f1c
fix(cli): carry mtp assistant layout evidence
Snider May 31, 2026
468458b
fix(cli): route bench through attached mtp assistant
Snider May 31, 2026
2069d3e
feat(cli): include mtp evidence in bench json
Snider May 31, 2026
8e8a97d
docs(operator): link macos 26 api floor
Snider May 31, 2026
ae676a7
feat(api): expose quantization step-down evidence
Snider May 31, 2026
dea5f6d
fix(cli): show quantization fallback evidence
Snider May 31, 2026
9efa72b
fix(kv): fail closed on turboquant snapshot metadata
Snider May 31, 2026
50c1a65
docs(repo): document macos 26 native floor
Snider May 31, 2026
d8d1e2f
fix(metal): clarify gemma4 assistant load boundary
Snider May 31, 2026
e97908f
fix(hf): preserve gemma4 assistant preflight identity
Snider May 31, 2026
09c19a3
fix(cli): reject contaminated mtp target-only evidence
Snider May 31, 2026
e101d82
fix(mtp): require four-layer assistant evidence
Snider May 31, 2026
5da7479
docs(repo): link macos 26 api floor
Snider May 31, 2026
d4d9076
docs(api): align examples with q6 default
Snider May 31, 2026
1dc0ebc
fix(metal): admit q6 native matvec
Snider May 31, 2026
1721a8d
docs(repo): lock macos 26 platform symbol
Snider May 31, 2026
748b63e
test(repo): keep source lock platform evidence current
Snider May 31, 2026
933dbd2
test(metal): benchmark q6 dense matvec
Snider Jun 1, 2026
8269831
test(api): tighten combined production metric policy
Snider Jun 1, 2026
a3acf96
test(api): validate official assistant token ordering dtype
Snider Jun 1, 2026
7a399b6
fix(metal): reject malformed assistant token ordering
Snider Jun 1, 2026
352d9a7
feat(api): carry assistant token-ordering evidence
Snider Jun 1, 2026
0107762
docs(runtime): record official e2b mtp smoke
Snider Jun 1, 2026
4f3a419
test(metal): smoke official e2b mtp generation
Snider Jun 1, 2026
90c35b4
feat(cmd): prove mtp parity with token hashes
Snider Jun 1, 2026
f331510
feat(api): advertise turboquant cache mode
Snider Jun 1, 2026
7452d8c
chore(repo): treat macos 26 as fixed floor
Snider Jun 1, 2026
133f322
perf(metal): decode turboquant base payloads directly
Snider Jun 1, 2026
5f933dc
test(metal): split turboquant payload restore benches
Snider Jun 1, 2026
4193b76
perf(metal): tighten turboquant payload packing
Snider Jun 1, 2026
4d58505
perf(api): reuse production policy defaults
Snider Jun 1, 2026
f3d44b9
perf(api): cache production policy structs
Snider Jun 1, 2026
f7a0fd1
perf(metal): decode turboquant pages in one pass
Snider Jun 1, 2026
66d8154
perf(metal): avoid turboquant residual decode allocations
Snider Jun 1, 2026
a7d165f
perf(metal): reuse turboquant encode scratch
Snider Jun 1, 2026
cfc2db4
perf(metal): reuse turboquant decode scratch
Snider Jun 1, 2026
fc2eb65
perf(metal): restore turboquant payloads from page buffers
Snider Jun 1, 2026
9eadcee
perf(metal): pack turboquant payloads in place
Snider Jun 1, 2026
b314f4e
perf(metal): encode turboquant pages into shared buffers
Snider Jun 1, 2026
af51ddb
perf(metal): avoid full turboquant sequence copies
Snider Jun 1, 2026
f4c1903
perf(metal): encode turboquant pages from source strides
Snider Jun 1, 2026
f62ac6f
perf(metal): pin turboquant restored arrays
Snider Jun 1, 2026
4513a2b
perf(metal): pool turboquant decode scratch
Snider Jun 1, 2026
81da32e
perf(metal): guard q6 bitstream matvec lane
Snider Jun 1, 2026
0a51f13
test(metal): isolate q6 native matvec bench
Snider Jun 1, 2026
ecb3711
fix(bench): keep target-only MTP reports clean
Snider Jun 1, 2026
fa72179
test(metal): benchmark q6 bitstream fallback shapes
Snider Jun 1, 2026
68be723
test(bench): require observed mtp sweep evidence
Snider Jun 1, 2026
4b8f510
test(mtp): require verified official pair evidence
Snider Jun 1, 2026
5ac44c0
test(mtp): tighten official token ordering evidence
Snider Jun 1, 2026
b0f3804
test(gemma4): require official I64 token ordering
Snider Jun 1, 2026
cf8fbbf
fix(production): require q8 memory headroom
Snider Jun 1, 2026
00d0fc5
perf(metal): pool turboquant payload decode scratch
Snider Jun 1, 2026
35baa5b
perf(metal): pool turboquant multi-page restore scratch
Snider Jun 1, 2026
9ffa25c
perf(metal): pool turboquant encode scratch
Snider Jun 1, 2026
d9e3f9a
perf(metal): add zero-alloc turboquant estimator path
Snider Jun 1, 2026
0f484f5
perf(metal): add reusable turboquant payload decode
Snider Jun 1, 2026
47f6b9f
perf(metal): add reusable turboquant multipage decode
Snider Jun 1, 2026
3e3ee4d
perf(metal): account turboquant cache payload bytes
Snider Jun 1, 2026
6437a81
perf(metal): report turboquant payload metrics
Snider Jun 1, 2026
dd6ec47
feat(metal): add q8 greedy output head
Snider Jun 1, 2026
1c4abaa
fix(metal): report assistant first-token latency
Snider Jun 1, 2026
f9bb06b
feat(api): require mtp first-token evidence
Snider Jun 1, 2026
b81e9f9
perf(api): keep q6 runtime gates opt-in
Snider Jun 1, 2026
6b7c006
perf(api): promote q6 direct greedy default
Snider Jun 1, 2026
c1806c6
fix(api): copy production policy defaults
Snider Jun 1, 2026
ce1c257
perf(metal): avoid turboquant state slice allocation
Snider Jun 1, 2026
f27a220
docs(goal): top-priority last-run lane — native kernels for all 16 de…
Snider Jun 1, 2026
cdc5c33
perf(metal): avoid dense matvec shape allocations
Snider Jun 1, 2026
c704184
perf(metal): tighten assistant ordered embedding slice
Snider Jun 1, 2026
be5ccbb
feat(metal): route mistral through native dense loader
Snider Jun 1, 2026
e106309
feat(metal): route hermes through native dense loader
Snider Jun 1, 2026
8129efc
feat(metal): route granite through native dense loader
Snider Jun 1, 2026
1f55b55
feat(metal): route phi through native dense loader
Snider Jun 1, 2026
7084ec0
feat(metal): route glm through native dense loader
Snider Jun 1, 2026
695e579
fix(api): stop tuning from selecting mlx_lm fallback
Snider Jun 1, 2026
8f29cd1
docs(api): remove mlx_lm fallback guidance from native gaps
Snider Jun 1, 2026
c4d3205
docs(goal): refresh native architecture progress
Snider Jun 1, 2026
8de70cb
feat(api): mark gemma4 assistant as native attached drafter
Snider Jun 1, 2026
1c21b5b
feat(api): mark minimax m2 as native staged
Snider Jun 1, 2026
9d32f1e
fix(hf): expose bert task fit flags
Snider Jun 1, 2026
dedd1c1
fix(metal): add explicit guards for pending native loaders
Snider Jun 1, 2026
878d3a7
perf(api): remove combined policy evaluation allocations
Snider Jun 1, 2026
a4df821
perf(api): promote paged decode fast concat default
Snider Jun 1, 2026
3da5c5e
docs(runtime): record q6 go-mlx self bench
Snider Jun 1, 2026
35ff105
docs(goal): reconcile official e2b setup gates
Snider Jun 1, 2026
d2ec87f
test(api): benchmark production policy gates
Snider Jun 1, 2026
9943d51
docs(runtime): record official e2b target state smoke
Snider Jun 1, 2026
3ce56d1
docs(runtime): record official e2b mtp diagnostic
Snider Jun 1, 2026
de54a64
docs(runtime): extend official e2b mtp sweep
Snider Jun 1, 2026
033f23d
fix(metal): withhold assistant stop tokens
Snider Jun 1, 2026
f5f3b76
feat(mtp): summarise retained draft metrics
Snider Jun 2, 2026
a7db095
feat(mtp): compare retained state-ramp reports
Snider Jun 2, 2026
9ddc8f0
fix(api): narrow gemma4 q6 default gates
Snider Jun 2, 2026
2efe482
docs(turboquant): mark implementation mapping evidence
Snider Jun 2, 2026
95c19af
docs(runtime): refresh q6 go-mlx self bench
Snider Jun 2, 2026
f13a5d2
feat(mtp): preserve retained target call evidence
Snider Jun 2, 2026
d2517e7
feat(mtp): carry retained assistant profile config
Snider Jun 2, 2026
e8bdc92
feat(mtp): label speculative generation mode
Snider Jun 2, 2026
e4d43be
docs(repo): clarify no automatic python fallback
Snider Jun 2, 2026
4c4bbfc
fix(model): stop advertising python conversion for native gaps
Snider Jun 2, 2026
b87f92a
docs(runtime): refresh q6 go-mlx self bench
Snider Jun 2, 2026
1392e1c
docs(runtime): record q6 retained gate check
Snider Jun 2, 2026
5945ad7
fix(mtp): compare retained state ramp shapes
Snider Jun 2, 2026
c9d5585
docs(runtime): refresh q6 go-mlx self comparator
Snider Jun 2, 2026
806732c
perf(cli): trim retained trace summary allocations
Snider Jun 2, 2026
73a0e43
perf(api): expose no-copy Gemma 4 gate access
Snider Jun 2, 2026
8c5158f
feat(api): expose supported Gemma 4 quant packs
Snider Jun 2, 2026
1cc3dd3
feat(cli): resolve supported quant packs
Snider Jun 2, 2026
60e99e0
feat(cli): report native architecture gaps
Snider Jun 2, 2026
37bebb8
feat(metal): stage native bert loaders
Snider Jun 2, 2026
24102c3
feat(api): lock gemma4 e2b quant matrix
Snider Jun 2, 2026
787516d
feat(metal): stage qwen36 native loader
Snider Jun 2, 2026
66ecfd1
feat(metal): stage qwen3 moe native loader
Snider Jun 2, 2026
c93cbac
feat(metal): complete native architecture checkpoint
Snider Jun 2, 2026
b7c781f
feat(metal): add unit-scale moe router topk
Snider Jun 2, 2026
2c34dc8
feat(metal): share moe router selection
Snider Jun 2, 2026
39d7cbb
feat(metal): share swiglu moe expert dispatch
Snider Jun 2, 2026
339620f
feat(metal): enable shared moe sparse runtimes
Snider Jun 2, 2026
18b69d7
docs(repo): link native gates at macos 26
Snider Jun 2, 2026
356c189
feat(metal): add bert pooling rerank primitive
Snider Jun 2, 2026
001de06
feat(metal): plan qwen36 hybrid attention layers
Snider Jun 2, 2026
4dc23b2
test(metal): cover shared moe generation
Snider Jun 2, 2026
f05ace3
feat(metal): plan deepseek mla staging
Snider Jun 2, 2026
d80ab88
docs(goal): pin native macos floor
Snider Jun 2, 2026
5cca2ad
feat(metal): stage qwen36 hybrid caches
Snider Jun 2, 2026
de95d9d
feat(metal): profile qwen36 cacheless layers
Snider Jun 2, 2026
641fb78
feat(openai): spec-compliant Anthropic Messages streaming for Claude …
Snider Jun 2, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
6 changes: 6 additions & 0 deletions .codex/environments/environment.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# THIS IS AUTOGENERATED. DO NOT EDIT MANUALLY
version = 1
name = "go-mlx"

[setup]
script = ""
20 changes: 17 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,18 +1,27 @@
# Build artifacts
build/
bin/
*.dylib
*.so
*.a

# `go build ./go/cmd/mlx/` without -o lands the binary at repo root.
# Convention is `go build -o bin/mlx` (bin/ already ignored above);
# this catches the shortcut form too.
/mlx

# CMake
CMakeCache.txt
CMakeFiles/
cmake_install.cmake
Makefile

# CMake install output (keep headers for Go module consumers)
dist/*
!dist/include/
# CMake install output
dist/

# Local Go build/test shortcuts
/go/mlx
/*.test

# IDE
.idea/
Expand All @@ -22,6 +31,11 @@ dist/*
# macOS
.DS_Store

# lthn/desktop frontend dist — copied at build time by
# scripts/make-app-bundle.sh, embedded in cmd/mlx via go:embed.
# Single source of truth lives in lthn/desktop/frontend/.
go/cmd/mlx/frontend/dist/

# Knowledge base
KB/
.core/
Expand Down
12 changes: 12 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,15 @@
path = external/go-io
url = https://github.com/dappcore/go-io.git
branch = dev
[submodule "external/go-ai"]
path = external/go-ai
url = https://github.com/dappcore/go-ai.git
branch = dev
[submodule "external/go-ml"]
path = external/go-ml
url = https://github.com/dappcore/go-ml.git
branch = dev
[submodule "external/go-cgo"]
path = external/go-cgo
url = https://github.com/dappcore/go-cgo.git
branch = dev
20 changes: 15 additions & 5 deletions AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ All Go code lives under `go/`:
`nomlxlm` removes it)
- `go/cmd/violet/` and `go/pkg/daemon/` — local Violet Unix-socket sidecar
- `cpp/` — C++ side companion (CLion-side worktree)
- `lib/mlx/` — upstream MLX submodule pinned at `v0.30.1`
- `lib/mlx/` — upstream MLX submodule pinned at `v0.31.1`
- `patches/` — local patches against `lib/mlx` (manual apply only)
- `docs/`, `examples/` — markdown documentation and per-feature usage examples

Expand All @@ -25,6 +25,15 @@ Unsupported builds compile against the `*_stub.go` files and a stub
`MetalAvailable() bool` that returns false. Do not move CGO code out of
`go/internal/metal/`.

The native path targets [macOS Tahoe 26.0+](https://developer.apple.com/documentation/macos-release-notes/macos-26-release-notes)
on Apple Silicon. The floor is intentional: the Metal 4 API generation this
runner is built around shipped with macOS 26, including lower-overhead command
encoding, explicit compilation control, tensor resources, and machine-learning
passes. Keep build and test invocations aligned with that floor by passing
`-ldflags "-extldflags=-mmacosx-version-min=26.0"` when compiling native code.
See `docs/operator/deployment.md` and `docs/operator/metallib-and-variants.md`
for the full reference chain.

## Conventions

- UK English in code, comments, and docs (colour, organisation, behaviour)
Expand All @@ -47,10 +56,11 @@ model downloads.

## Sandboxing Notes

Before handing off, run the repository gates from the brief with `GOWORK=off`.
On sandboxed systems, set `GOCACHE` to a writable directory such as
`/tmp/codex-go-mlx-cache` so Go can compile without touching the user
cache. If the sandbox cannot resolve the bundled `mlx.metallib`, apply
Before handing off, run the repository gates from the checked-in workspace; do
not use `GOWORK=off` unless the user explicitly asks for an isolated module
check. On sandboxed systems, set `GOCACHE` to a writable directory such as
`/tmp/codex-go-mlx-cache` so Go can compile without touching the user cache.
If the sandbox cannot resolve the bundled `mlx.metallib`, apply
`patches/mlx-metallib-path.patch` inside `lib/mlx` to enable the
`MLX_METALLIB_PATH` env-var override (not auto-applied).

Expand Down
7 changes: 4 additions & 3 deletions CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,17 +44,18 @@ After Mantis #1241, all Go code lives under `go/`:
```
go/ Go module root (dappco.re/go/mlx)
*.go Public root API: model, tokenizer, compute, training, eval, distill, GRPO, hf-fit, merge, gguf-quantize, kv-snapshot, lora-fuse
cmd/mlx/ CLI tool (built with `-o core-mlx`; consumers rename: lthn-mlx)
cmd/violet/ Unix-socket sidecar daemon
internal/metal/ All CGO code (mlx-c bindings)
mlxlm/ CGO-free Python subprocess backend
pkg/daemon/ Daemon implementation
pkg/memvid/ Memvid storage CLI
pkg/memvid/ Deprecated State codec compatibility shim
tests/ Integration tests
cpp/ C++ side (CLion-side companion)
docs/ Markdown documentation
examples/ Per-feature usage examples (markdown)
external/ Vendored core libraries
lib/mlx/ Upstream mlx submodule (pinned at v0.30.1)
lib/mlx/ Upstream mlx submodule (pinned at v0.31.1)
patches/ Local patches to lib/mlx (not auto-applied)
```

Expand Down Expand Up @@ -127,7 +128,7 @@ Architecture is detected from `config.json` (`model_type`) for safetensors and f

## Submodule Patches

`lib/mlx` is pinned at upstream tag `v0.30.1`. Local patches that we do not upstream live in `patches/` as standalone diff files (e.g. `patches/mlx-metallib-path.patch` for the `MLX_METALLIB_PATH` env-var override). Patches are not auto-applied — run them inside the submodule manually when their function is needed:
`lib/mlx` is pinned at upstream tag `v0.31.1`. Local patches that we do not upstream live in `patches/` as standalone diff files (e.g. `patches/mlx-metallib-path.patch` for the `MLX_METALLIB_PATH` env-var override). Patches are not auto-applied — run them inside the submodule manually when their function is needed:

```bash
git -C lib/mlx apply ../../patches/mlx-metallib-path.patch
Expand Down
119 changes: 119 additions & 0 deletions CLAUDE.operator.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
# CLAUDE.operator.md

Operator-facing guidance for **running** `lthn-mlx` in production. Companion to `CLAUDE.md` (developer-facing — architecture, build, contribute). If you arrived here mid-session needing to deploy, troubleshoot, or reason about distribution, you're in the right doc. If you arrived needing to add a model decoder or change cgo bindings, go to `CLAUDE.md`.

The operator audience is a future Cladius / Athena / Hephaestus session, *or* a human operator (Snider, ops-side) doing a deploy. Same mental model serves both — the difference is just whether the reader can edit code on the spot.

## Read order

1. **This file**, skim through "Operating principles" — calibrates what the binary is and isn't.
2. **`docs/operator/deployment.md`** — what you ship, how it runs, what to bind to.
3. **`docs/operator/metallib-and-variants.md`** — the variant question, the bundling strategy, the active CWD-resolution panic.
4. **`docs/operator/troubleshooting.md`** — the failure modes in lifecycle order, with fixes.
5. **`docs/operator/index.md`** — the full operator doc set + what's planned.

If you have ~3 minutes, read this file. If you have ~30 minutes, read all five.

## What lthn-mlx is

A single-process boundary that wraps native Apple Metal GPU inference (via mlx-c CGO bindings) and serves it as OpenAI / Anthropic / Ollama-compatible HTTP. Snider's framing, made explicit on 2026-05-25:

> **"The actual model is the binary, the rest is package."**

This is the load-bearing architecture decision. Everything that wants inference — `lthn` desktop, `pkg/lemma` in lthn/desktop, providers in `go-ai`, any OpenAI-compatible Python / TypeScript / curl client — talks to `lthn-mlx` over HTTP. There is no in-process library substitute for production. The binary is the boundary.

**One process. One model. One HTTP listener.** That's the unit. Multi-model deployments mean multiple processes on different ports plus a router in front (the `pkg/lemma` client is the canonical Go-side router).

The binary is built from `dappco.re/go/mlx/cmd/mlx`; its default output name is `core-mlx`, which consumers rename to `lthn-mlx`. The module path is `dappco.re/go/mlx`.

## Operating principles

These are the load-bearing facts an operator needs in working memory. Each one shapes a deployment decision.

### 1. Apple Silicon only

`darwin/arm64`. No Linux. No Intel macOS. The CGO files carry `//go:build darwin && arm64`; a stub returns `MetalAvailable() = false` everywhere else. M1 / M2 / M3 / M4, any chip class, any deployment on macOS ≥26.0 (the native floor the binaries are linked against) — one binary serves them all (modulo the metallib variant matrix; see point 5).

If the deployment target isn't Apple Silicon, you don't want `lthn-mlx` — you want a different go-inference backend (`go-rocm` for AMD GPUs, or the CGO-free `mlxlm` subprocess backend bundled in the same repo for Python-on-anything).

### 2. The binary needs the metallib

`mlx.metallib` (~107 MB, MetalLib v1.2.9, the compiled GPU kernel archive) must be findable at runtime. Today, until the bundling work lands, this means **setting `MLX_METALLIB_PATH` to an absolute path** before invoking. Not setting it is the single most common deployment failure — the binary starts, `/v1/health` passes, then panics inside `mlx_metal_load_library` on the first GPU dispatch.

```bash
export MLX_METALLIB_PATH=/opt/lthn-mlx/lib/mlx.metallib
lthn-mlx serve --model /opt/lthn-mlx/models/lemer-lite --addr :11434
```

The permanent fix is Path B bundling (embed via `//go:embed`, load via `MTLDevice newLibraryWithData:`). Until that ships, treat the env var as mandatory deployment config. See `docs/operator/metallib-and-variants.md` for the why and `docs/operator/troubleshooting.md` for the panic signature.

### 3. Model loads lazily

`lthn-mlx serve` starts in under a second. The model loads on the **first request that needs it**, not at process start. This means:

- Liveness probes against `/v1/health` pass before the model is loaded. They are not readiness probes.
- The first inference request after start takes 2-15 seconds depending on model size and storage speed.
- For consistent first-request latency, pre-warm in the service manager's post-start hook with a one-token completion (see `docs/operator/deployment.md`).

There is no on-disk lock, no PID file, no recovery state. Restart is safe; the new process starts cold and lazy-loads. The service manager is responsible for single-instance enforcement.

### 4. HTTP surface is trusted-network only

`lthn-mlx serve` has no authentication, no rate limiting, no TLS. Default bind is `:11434` (matches Ollama). Bind to `127.0.0.1:11434` for same-machine access, or to `0.0.0.0:11434` for LAN access. **Production LAN exposure sits behind a reverse proxy** that handles auth and TLS (Caddy, nginx).

If you need authenticated remote access, that lives in `pkg/lemma` (the Go client) plus a tunnel / proxy / auth-gateway — not in `lthn-mlx` itself. Don't try to add auth to the serve binary; it would violate the boundary rule and duplicate work already done one layer up.

### 5. Variants matter at the toolchain axis, not the chip axis

Snider's question of 2026-05-25: "if the lib is different for different apple versions, we need to know the variants that need building." The chip family (M1/M2/M3/M4) is **not** a variant axis — Apple's Metal driver handles forward-compatibility from a single archive. What actually varies is the build-host toolchain: Metal language version ≥4.0 + macOS SDK ≥26.2 (Xcode 26+) unlocks the NAX kernel family for M4-class tensor coprocessors.

**Practical ship matrix:**

| Variant | Build host | Runs on | Use case |
|---------|------------|---------|----------|
| `mlx-baseline.metallib` | Any modern Xcode, deployment-min 13 | M1-M4 on macOS 13+ | Default ship today |
| `mlx-nax.metallib` | Xcode 26+, deployment-min 26 | M4-class on macOS 26+ only | Deferred to M4 optimisation lane |

Ship the baseline. The NAX variant is a future M4 fast-path optimisation, not a today-decision. Full evidence and the open questions (driver-side load behaviour for higher `min`, NAX dispatch gating on non-M4) in `docs/operator/metallib-and-variants.md`.

### 6. Unified memory is the budget

On Apple Silicon there is no separate VRAM line item — the GPU and CPU share unified memory. The process budget includes: model weights, KV cache (scales linearly with `--context`), MLX allocator cache, plus everything else macOS is doing. A 7B model in 4-bit needs ~5 GB resident; the KV cache at the default 131k context can add several GB more.

Tuning knobs live in `dappco.re/go/mlx` at the package level (`SetMemoryLimit`, `SetCacheLimit`, `SetWiredLimit`, `ClearCache`, `GetActiveMemory`, `GetPeakMemory`). They are **not** exposed as `serve` flags today — if you need them on the bundled CLI, file a feature ticket against `cmd/mlx/serve.go`. For now, custom integrations on top of `openai.NewMuxWithAdmin` can wire them directly.

Activity Monitor's "Memory" column is the right place to watch the process. `/v1/cache/stats` reports MLX's allocator view.

### 7. Graceful shutdown is signal-driven

SIGINT and SIGTERM both trigger `http.Server.Shutdown` with `--shutdown-timeout` (default 10s) as the drain deadline. After the deadline, the process exits. There is no explicit model-unload step — the OS reclaims Metal allocations on exit.

If you have long-running generations and need them to drain cleanly on bounce, raise `--shutdown-timeout` (30s-60s). If you need explicit teardown for an exotic daemon scenario, wire the `Sleep` admin callback in a custom integration.

## Mental model in one paragraph

`lthn-mlx serve` is a stateless OpenAI-compatible HTTP server backed by Apple Metal GPU inference, single-model per process, lazy-load on first request, signal-driven graceful shutdown, requires a findable `mlx.metallib` (env var until bundling lands), no built-in auth or TLS, designed for trusted-network use, with a `pkg/lemma`-shaped routing layer one level up for multi-model or remote-access patterns. The architecture insists on the binary as the only process boundary — everything else is packages talking to it over HTTP.

That paragraph plus the seven principles is the working mental model. Everything else in `docs/operator/` fills in the operator's view of specific concerns.

## What this doc does not cover

- **How the inference works inside.** That's `docs/architecture.md`, `docs/runtime/`, `docs/memory/`. Developer-side.
- **How to add a model architecture.** That's a decoder under `go/internal/metal/`. Developer-side.
- **How training works.** That's `docs/training.md`, `docs/distillation.md`, `docs/grpo.md`. Production-bench / research-side.
- **GOAL.md production-bench lane.** Separate concern with its own canonical brief.
- **Memory limits & cache tuning as a knob set.** Stubbed in `docs/operator/performance-tuning.md` — not yet written. Source of truth meanwhile: `go/internal/metal/backend.go:10-12` and the `mlx.Set*` package surface.

## When the docs and reality disagree

This doc and `docs/operator/*` describe behaviour. Behaviour changes. If you find a discrepancy between what `lthn-mlx serve` actually does and what these docs claim, **the code is right and the docs are wrong**. Fix the doc, or PR a comment-block on the responsible source file referencing this directory.

The maintenance discipline lives in `docs/operator/index.md` under "Maintenance discipline." Read it if you're about to merge a PR that touches `cmd/mlx/serve.go`, `go/openai/openai.go`, `go/openai/admin.go`, or `go/internal/metal/backend.go` — those four files are the operator-visible surface.

## Files this directory ships

- `CLAUDE.operator.md` (this file) — operator mental model
- `docs/operator/index.md` — operator doc index + planned slots
- `docs/operator/deployment.md` — what you ship + how it runs
- `docs/operator/metallib-and-variants.md` — bundling strategy + variant matrix
- `docs/operator/troubleshooting.md` — lifecycle-phase failure modes
8 changes: 6 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@ cmake_minimum_required(VERSION 3.24)
project(mlx)

set(CMAKE_OSX_DEPLOYMENT_TARGET "26.0" CACHE STRING "Minimum macOS version")
set(CMAKE_CXX_STANDARD 23)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS ON)

if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_SOURCE_DIR}/dist" CACHE PATH "" FORCE)
Expand All @@ -11,13 +14,14 @@ endif()
set(MLX_BUILD_GGUF ON CACHE BOOL "" FORCE)
set(MLX_BUILD_SAFETENSORS ON CACHE BOOL "" FORCE)
set(MLX_C_BUILD_EXAMPLES OFF CACHE BOOL "" FORCE)
set(BUILD_SHARED_LIBS ON CACHE BOOL "" FORCE)
set(BUILD_SHARED_LIBS OFF CACHE BOOL "" FORCE)

set(CMAKE_INSTALL_RPATH "@loader_path")

include(FetchContent)

set(MLX_C_GIT_TAG "v0.4.1" CACHE STRING "")
set(MLX_C_GIT_TAG "v0.6.0" CACHE STRING "")
set(FETCHCONTENT_SOURCE_DIR_MLX "${CMAKE_CURRENT_SOURCE_DIR}/lib/mlx" CACHE PATH "Local patched MLX source")

FetchContent_Declare(
mlx-c
Expand Down
Loading