Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,17 +40,37 @@ brew install mlxcel
# Download an MLX-format checkpoint from Hugging Face.
mlxcel download mlx-community/Qwen3.5-0.8B-4bit

# Check the memory budget before loading anything.
mlxcel inspect -m models/Qwen3.5-0.8B-4bit --max-tokens 32768

# One-off generation.
mlxcel generate \
-m models/Qwen3.5-0.8B-4bit \
-p "Hello, world!" -n 100

# Same prompt with a 32K-token budget, but refuse to start if the model + 32K KV cache will not fit.
mlxcel generate \
-m models/Qwen3.5-0.8B-4bit \
-p "Hello, world!" -n 32768 \
--estimate-memory

# OpenAI-compatible server.
mlxcel-server \
-m models/Qwen3.5-0.8B-4bit \
--port 8080
```

`mlxcel inspect` is read-only and prints a byte-level breakdown of weights /
KV cache / runtime headroom against available unified memory without loading
any tensors. `--estimate-memory` on `mlxcel generate` and `mlxcel serve`
runs the same estimator as a preflight and aborts when the model will not
fit; pass `--force` (alias `--no-memory-check`) to override the abort.
`MLXCEL_MEMORY_LIMIT=NGB` tightens the "available" figure to a chosen soft
cap so the preflight is meaningful even on hosts with plenty of RAM. The
runtime headroom factor defaults to `1.20×` and is overridable via
`MLXCEL_HEADROOM_FACTOR=<f>` for calibration runs — see the in-code recipe
in `src/execution/memory_estimate.rs`.

If you build from source instead, use `./target/release/mlxcel` and
`./target/release/mlxcel-server` in place of the installed commands above.

Expand Down
2 changes: 2 additions & 0 deletions docs/environment-variables.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ and cached on first use. Set them before starting `mlxcel` or `mlxcel-server`.
|----------|--------|---------|-------|
| `MLXCEL_DEVICE` | `gpu`, `metal`, `cpu` | `gpu` hint | `cpu` requests CPU execution. Invalid values are ignored with a warning and treated as `gpu`; if no GPU backend is available, runtime falls back to CPU. |
| `MLXCEL_WIRED_LIMIT` | `max`, `0`, `none`, bytes, `NGB`, `NMB` | `max` | Apple Silicon GPU wired-memory limit. Unset/empty/`max` sets MLX's reported GPU max memory size; `0`/`none` disables the limit; numeric values set an explicit limit. |
| `MLXCEL_MEMORY_LIMIT` | `0`, `none`, bytes, `NGB`, `NMB` | unset | Soft MLX allocator memory cap. Unset/`0`/`none` lets MLX use its backend default; numeric values cap the allocator and make MLX raise an exception once allocations would push the working set past this value. Also feeds the `mlxcel inspect` / `--estimate-memory` preflight as the authoritative "available unified memory" figure when nonzero. |
| `MLXCEL_HEADROOM_FACTOR` | positive `f64` | `1.20` | Runtime/activation headroom multiplier used by the unified memory estimator (`mlxcel inspect`, `--estimate-memory`, `--recommend-quant`). Values `<= 1.0` disable the headroom term; invalid values warn and fall back to the default. Override only for calibration runs — see the in-code recipe in `src/execution/memory_estimate.rs`. |
| `MLXCEL_CACHE_DIR` | directory path | `$HOME/.cache/mlxcel` | Root for the tokenizer language-analysis disk cache used by language-bias features. Files live under `tokenizer-scripts/`. |
| `MLXCEL_SERVER_DECODE_STORAGE` | `auto`, `dense`, `paged` | `auto` | Server continuous-batching decode storage. `--decode-storage-backend` takes precedence. Invalid values warn and fall back to `auto`. |
| `MLXCEL_SURGERY` | YAML file path | unset | Feature-gated weight-load surgery configuration. `--surgery` takes precedence when the `surgery` feature is built. |
Expand Down
157 changes: 156 additions & 1 deletion src/commands/generate.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@ use mlxcel::{
resolve_model_shard_plan, shard_config_from_cli, validate_supported_runtime,
},
initialize_runtime, load_model, load_model_with_adapter, load_model_with_tensor_parallel,
memory_estimate::{
MemoryEstimate, QuantHint, estimate_total_memory, format_bytes, format_estimate,
},
quant_advisor::{advise_quantization, print_quant_advice},
sampling::{ResolvedSamplingParams, build_sampling_config},
server::chat_template::{ChatMessage, ChatTemplateProcessor},
Expand Down Expand Up @@ -108,6 +111,7 @@ fn print_runtime_setup(runtime: &RuntimeSetup) {

fn load_generation_model(
args: &GenerateArgs,
preflight: Option<&MemoryEstimate>,
) -> Result<(mlxcel::LoadedModel, mlxcel::tokenizer::MlxcelTokenizer)> {
println!("Loading model from {:?}...", args.model.model);
let load_start = Instant::now();
Expand Down Expand Up @@ -153,9 +157,151 @@ fn load_generation_model(
load_seconds = load_elapsed.as_secs_f64(),
"Model resident after load",
);

// Issue #56: compare the pre-load estimate against MLX's
// observed active memory once loading is complete. The delta
// feeds future headroom-factor calibration (see the recipe on
// `memory_estimate::DEFAULT_HEADROOM_FACTOR`).
//
// On Linux/CPU MLX returns zero for most memory metrics, so the
// comparison is skipped when `snap.active_bytes == 0` — a delta
// computed against a zero reading would only produce a misleading
// 100% mismatch line. The structural wiring is verified by the
// call site and the unit tests; the numerical delta is meaningful
// only on Apple Silicon (Metal) / CUDA backends that populate the
// active counter.
if let Some(est) = preflight {
log_estimate_vs_actual_delta(est, &snap);
}
Ok(result)
}

/// Log the delta between a pre-load `MemoryEstimate` and the
/// post-load MLX allocator snapshot.
///
/// * `est` — the estimate produced before the model was loaded; its
///   `total_bytes` is the figure compared against the allocator.
/// * `snap` — the allocator snapshot taken after loading; only
///   `active_bytes` is read.
///
/// When `snap.active_bytes` is zero there is nothing to compare
/// against, so a "skipped" line is printed, a tracing event with
/// `skipped = true` is emitted, and the function returns early.
/// Otherwise one summary line is printed and one tracing event is
/// emitted carrying the estimate, the observed value, the absolute
/// delta, the `actual / estimate` ratio, and the estimate's
/// component breakdown, so an off-line collector can chart preflight
/// accuracy across loads.
///
/// The direction label describes the *estimate*: "over-estimated"
/// means the estimate was at least as large as the observed usage,
/// "under-estimated" means the observed usage exceeded the estimate.
fn log_estimate_vs_actual_delta(est: &MemoryEstimate, snap: &mlxcel_core::memory::MemorySnapshot) {
    if snap.active_bytes == 0 {
        // No allocator counter to compare against. Surface the no-op
        // so operators reading the log know the preflight estimate
        // was produced but could not be validated numerically on
        // this host.
        println!(
            "Memory estimate vs actual: skipped (MLX active_memory() is 0 — \
             non-Metal/CUDA backend; estimate was {} and is structurally valid \
             but cannot be verified without a populated allocator counter)",
            format_bytes(est.total_bytes),
        );
        tracing::info!(
            estimate_total = est.total_bytes,
            actual_active = snap.active_bytes,
            skipped = true,
            reason = "active_memory zero on this backend",
            "Memory estimate vs actual delta",
        );
        return;
    }

    let est_bytes = est.total_bytes;
    let actual = snap.active_bytes;

    // The label is phrased from the estimate's point of view: an
    // estimate larger than the observed usage is an over-estimate,
    // a smaller one is an under-estimate. An exact match reports an
    // over-estimate of zero bytes.
    let (delta_label, delta_bytes) = if est_bytes >= actual {
        ("over-estimated by", est_bytes.saturating_sub(actual))
    } else {
        ("under-estimated by", actual.saturating_sub(est_bytes))
    };

    // Ratio > 1.0 means real usage exceeded the estimate. A zero
    // estimate would divide by zero, so it is reported as 0.0.
    let ratio = if est_bytes > 0 {
        actual as f64 / est_bytes as f64
    } else {
        0.0
    };

    println!(
        "Memory estimate vs actual: estimate {} | actual {} | {} {} (ratio {:.3})",
        format_bytes(est_bytes),
        format_bytes(actual),
        delta_label,
        format_bytes(delta_bytes),
        ratio,
    );
    tracing::info!(
        estimate_total = est_bytes,
        actual_active = actual,
        delta_bytes,
        ratio,
        headroom_factor = est.headroom_factor,
        weights_bytes = est.weights_bytes,
        kv_cache_bytes = est.kv_cache_bytes,
        runtime_headroom_bytes = est.runtime_headroom_bytes,
        "Memory estimate vs actual delta",
    );
}

/// Run the `--estimate-memory` preflight for `mlxcel generate`.
///
/// Returns `Ok(None)` when `--estimate-memory` was not passed, and
/// `Ok(Some(estimate))` when the preflight ran and the load is allowed
/// to proceed (so the caller can later log the estimate-vs-actual
/// delta).
///
/// # Errors
///
/// Returns an error when the estimate does not fit (`!estimate.fits`)
/// and `--force` was not set. The message names the total, the
/// available figure, the overflow, and the override flags. The
/// formatted breakdown is always printed before the abort so operators
/// see the same table the estimator formats for any caller.
fn run_memory_preflight(args: &GenerateArgs) -> Result<Option<MemoryEstimate>> {
    if !args.generation.estimate_memory {
        return Ok(None);
    }

    // Derive int8 KV from the existing --cache-type-k / --cache-type-v
    // pair. Each side is checked independently so that mixing the two
    // accepted spellings (`int8` on one flag, `i8` on the other) is
    // still recognised as an all-int8 cache. Any other combination is
    // sized as FP16 because the size formula does not model mixed
    // precision directly.
    let turbo = &args.generation.turbo;
    let kv_int8 = matches!(turbo.cache_type_k.as_deref(), Some("int8" | "i8"))
        && matches!(turbo.cache_type_v.as_deref(), Some("int8" | "i8"));

    // Use the user's `--max-tokens` as the KV context length, clamped
    // to at least one token so the estimator never sees a zero length.
    let ctx_len = args.generation.max_tokens.max(1) as u64;

    // Batch size 1 and the default quantization hint: `generate` runs a
    // single sequence and has no `--quant` override of its own.
    let estimate =
        estimate_total_memory(&args.model.model, ctx_len, 1, QuantHint::Default, kv_int8);

    // Print the breakdown before deciding, so it is visible on every path.
    let banner = format_estimate(&args.model.model, &estimate);
    println!("{banner}");

    if estimate.fits {
        return Ok(Some(estimate));
    }

    if !args.generation.force_memory {
        return Err(anyhow::anyhow!(
            "--estimate-memory: total {} exceeds available {} by {}. \
             Pass --force (or --no-memory-check) to override, or rerun with \
             a smaller --max-tokens / a smaller model.",
            format_bytes(estimate.total_bytes),
            format_bytes(estimate.available_bytes),
            format_bytes(estimate.overflow_bytes()),
        ));
    }

    // Over budget but explicitly overridden: warn on stderr and continue.
    eprintln!(
        "WARNING: --estimate-memory preflight says this load is over budget by {}. \
         Continuing because --force was set.",
        format_bytes(estimate.overflow_bytes()),
    );

    Ok(Some(estimate))
}

/// Report whether the CLI arguments ask for pipeline parallelism:
/// either an explicit layer split was given or the requested pipeline
/// size is greater than one.
fn cli_pipeline_requested(args: &GenerateArgs) -> bool {
    let pp = &args.pipeline_parallel;
    pp.pp_layers.is_some() || pp.pp_size > 1
}
Expand Down Expand Up @@ -906,6 +1052,15 @@ pub(crate) fn run_generate(args: GenerateArgs) -> Result<()> {
}
}

// Memory preflight (issue #56). Runs the unified estimator and
// aborts when total > available. Skipped when --estimate-memory
// was not passed. --force / --no-memory-check downgrades the
// abort to a warning. Sub-issue C's `MLXCEL_MEMORY_LIMIT` env hook
// is honoured transparently by the estimator's
// `resolve_available_memory` step (MLX allocator soft cap wins
// over OS RAM when nonzero).
let preflight_estimate = run_memory_preflight(&args)?;

let pipeline_requested = cli_pipeline_requested(&args);
let tokenizer = load_tokenizer(&args.model.model)?;
let prompt = load_cli_prompt(
Expand Down Expand Up @@ -1047,7 +1202,7 @@ pub(crate) fn run_generate(args: GenerateArgs) -> Result<()> {
&args,
)?
} else {
let (model, _loaded_tokenizer) = load_generation_model(&args)?;
let (model, _loaded_tokenizer) = load_generation_model(&args, preflight_estimate.as_ref())?;
let vlm_embeddings = generate_vlm::compute_vlm_embeddings(
&model,
&mut prompt_tokens,
Expand Down
2 changes: 2 additions & 0 deletions src/commands/generate_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,8 @@ fn sample_generate_args(model_path: PathBuf) -> crate::GenerateArgs {
profile: false,
no_chat_template: false,
recommend_quant: false,
estimate_memory: false,
force_memory: false,
turbo: mlxcel::cli::turbo_args::TurboKvCacheArgs::default(),
},
sampling: crate::SamplingOptions {
Expand Down
121 changes: 121 additions & 0 deletions src/commands/inspect.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
// Copyright 2025-2026 Lablup Inc. and Jeongkyu Shin
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//! CLI handler for `mlxcel inspect`.
//!
//! Read-only entry point that surfaces the unified memory estimator
//! (issue #56, epic #52 capstone). Prints the byte breakdown for
//! weights / KV cache / runtime activation headroom / total vs
//! available unified memory, then exits without loading the model.
//!
//! Used by: operators sizing a model for a given host before
//! launching `mlxcel generate` or `mlxcel serve`.

use anyhow::{Result, anyhow};

use mlxcel::memory_estimate::{QuantHint, estimate_total_memory, format_estimate};

use crate::InspectArgs;

/// Run the `mlxcel inspect` subcommand.
///
/// Builds a memory estimate from the CLI arguments (`--quant`, the
/// K/V cache type pair, `--max-tokens`, batch size), prints the
/// formatted breakdown, and returns.
///
/// # Errors
///
/// Returns an error when the model path does not exist or when
/// `--quant` carries an unknown label. An estimate that does not fit
/// is *not* an error: `inspect` is informational, so it prints a note
/// and still returns `Ok(())`.
pub(crate) fn run_inspect(args: InspectArgs) -> Result<()> {
    if !args.model.exists() {
        return Err(anyhow!(
            "Model directory does not exist: {}",
            args.model.display()
        ));
    }

    // Translate the user-facing `--quant` label into the typed hint.
    let quant = parse_quant_hint(&args.quant)?;

    // Translate the K/V cache flag pair into the int8/fp16 decision the
    // estimator understands. Each flag is checked independently so that
    // mixing the two accepted spellings (`int8` on one flag, `i8` on
    // the other) still counts as an all-int8 cache. Any other
    // combination is treated as fp16 (the default) since mixed-precision
    // KV is not directly modelled in the size formula.
    let kv_int8 = matches!(args.turbo.cache_type_k.as_deref(), Some("int8" | "i8"))
        && matches!(args.turbo.cache_type_v.as_deref(), Some("int8" | "i8"));

    let estimate = estimate_total_memory(&args.model, args.max_tokens, args.batch, quant, kv_int8);

    let banner = format_estimate(&args.model, &estimate);
    println!("{banner}");

    if !estimate.fits {
        // Exit successfully — `inspect` is read-only and informational.
        // Returning Err here would conflate "inspect ran successfully
        // and reported over-capacity" with "inspect itself failed".
        println!(
            "Note: this configuration is expected to fail the `--estimate-memory` \
             preflight on `mlxcel generate` / `mlxcel serve` unless `--force` is set."
        );
    }

    Ok(())
}

/// Convert the `--quant` command-line label into a typed [`QuantHint`].
///
/// Matching is ASCII case-insensitive. Recognised spellings:
///
/// * `default` or the empty string → [`QuantHint::Default`]
/// * `fp16` / `float16` → [`QuantHint::Fp16`]
/// * `int8` / `i8` → [`QuantHint::Int8`]
/// * `int4` / `i4` → [`QuantHint::Int4`]
///
/// # Errors
///
/// Any other label yields an error that echoes the (lowercased) value
/// and lists the expected choices, so the CLI fails fast instead of
/// silently falling back to the default.
fn parse_quant_hint(label: &str) -> Result<QuantHint> {
    let normalized = label.to_ascii_lowercase();
    let hint = match normalized.as_str() {
        "" | "default" => QuantHint::Default,
        "float16" | "fp16" => QuantHint::Fp16,
        "i8" | "int8" => QuantHint::Int8,
        "i4" | "int4" => QuantHint::Int4,
        other => {
            return Err(anyhow!(
                "--quant: unknown value '{other}'; expected one of \
                 default, fp16, int8, int4"
            ));
        }
    };
    Ok(hint)
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Every accepted spelling — canonical names, short aliases, the
    /// empty string, and an upper-case variant — maps to its hint.
    #[test]
    fn parse_quant_hint_accepts_known_labels() {
        let cases = [
            ("default", QuantHint::Default),
            ("", QuantHint::Default),
            ("fp16", QuantHint::Fp16),
            ("float16", QuantHint::Fp16),
            ("int8", QuantHint::Int8),
            ("i8", QuantHint::Int8),
            ("int4", QuantHint::Int4),
            ("i4", QuantHint::Int4),
            ("INT8", QuantHint::Int8),
        ];

        for (label, expected) in cases {
            assert_eq!(
                parse_quant_hint(label).unwrap(),
                expected,
                "label: {label:?}"
            );
        }
    }

    /// An unrecognised label is rejected, and the error text both says
    /// "unknown" and echoes the offending value.
    #[test]
    fn parse_quant_hint_rejects_unknown() {
        let msg = parse_quant_hint("turbo3").unwrap_err().to_string();

        assert!(msg.contains("unknown"), "expected 'unknown' in: {msg}");
        assert!(
            msg.contains("turbo3"),
            "expected label echoed back in: {msg}"
        );
    }
}
2 changes: 2 additions & 0 deletions src/commands/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,10 @@
pub(crate) mod download;
pub(crate) mod generate;
mod generate_vlm;
pub(crate) mod inspect;
mod serve;

pub(crate) use download::run_download;
pub(crate) use generate::run_generate;
pub(crate) use inspect::run_inspect;
pub(crate) use serve::run_serve;
Loading