lablup · inureyes · May 21, 2026 · May 21, 2026
diff --git a/README.md b/README.md
@@ -40,17 +40,37 @@ brew install mlxcel
 # Download an MLX-format checkpoint from Hugging Face.
 mlxcel download mlx-community/Qwen3.5-0.8B-4bit
 
+# Check the memory budget before loading anything.
+mlxcel inspect -m models/Qwen3.5-0.8B-4bit --max-tokens 32768
+
 # One-off generation.
 mlxcel generate \
     -m models/Qwen3.5-0.8B-4bit \
     -p "Hello, world!" -n 100
 
+# Same generation, but refuse to start if the model + 32K KV cache will not fit.
+mlxcel generate \
+    -m models/Qwen3.5-0.8B-4bit \
+    -p "Hello, world!" -n 32768 \
+    --estimate-memory
+
 # OpenAI-compatible server.
 mlxcel-server \
     -m models/Qwen3.5-0.8B-4bit \
     --port 8080
 ```
 
+`mlxcel inspect` is read-only and prints a byte-level breakdown of weights /
+KV cache / runtime headroom against available unified memory without loading
+any tensors. `--estimate-memory` on `mlxcel generate` and `mlxcel serve`
+runs the same estimator as a preflight and aborts when the model will not
+fit; pass `--force` (alias `--no-memory-check`) to override the abort.
+`MLXCEL_MEMORY_LIMIT=NGB` tightens the "available" figure to a chosen soft
+cap so the preflight is meaningful even on hosts with plenty of RAM. The
+runtime headroom factor defaults to `1.20×` and is overridable via
+`MLXCEL_HEADROOM_FACTOR=<f>` for calibration runs — see the in-code recipe
+in `src/execution/memory_estimate.rs`.
+
 If you build from source instead, use `./target/release/mlxcel` and
 `./target/release/mlxcel-server` in place of the installed commands above.
 

diff --git a/docs/environment-variables.md b/docs/environment-variables.md
@@ -31,6 +31,8 @@ and cached on first use. Set them before starting `mlxcel` or `mlxcel-server`.
 |----------|--------|---------|-------|
 | `MLXCEL_DEVICE` | `gpu`, `metal`, `cpu` | `gpu` hint | `cpu` requests CPU execution. Invalid values are ignored with a warning and treated as `gpu`; if no GPU backend is available, runtime falls back to CPU. |
 | `MLXCEL_WIRED_LIMIT` | `max`, `0`, `none`, bytes, `NGB`, `NMB` | `max` | Apple Silicon GPU wired-memory limit. Unset/empty/`max` sets MLX's reported GPU max memory size; `0`/`none` disables the limit; numeric values set an explicit limit. |
+| `MLXCEL_MEMORY_LIMIT` | `0`, `none`, bytes, `NGB`, `NMB` | unset | Soft MLX allocator memory cap. Unset/`0`/`none` lets MLX use its backend default; numeric values cap the allocator and make MLX raise an exception once allocations would push the working set past this value. Also feeds the `mlxcel inspect` / `--estimate-memory` preflight as the authoritative "available unified memory" figure when nonzero. |
+| `MLXCEL_HEADROOM_FACTOR` | positive `f64` | `1.20` | Runtime/activation headroom multiplier used by the unified memory estimator (`mlxcel inspect`, `--estimate-memory`, `--recommend-quant`). Values `<= 1.0` disable the headroom term; invalid values warn and fall back to the default. Override only for calibration runs — see the in-code recipe in `src/execution/memory_estimate.rs`. |
 | `MLXCEL_CACHE_DIR` | directory path | `$HOME/.cache/mlxcel` | Root for the tokenizer language-analysis disk cache used by language-bias features. Files live under `tokenizer-scripts/`. |
 | `MLXCEL_SERVER_DECODE_STORAGE` | `auto`, `dense`, `paged` | `auto` | Server continuous-batching decode storage. `--decode-storage-backend` takes precedence. Invalid values warn and fall back to `auto`. |
 | `MLXCEL_SURGERY` | YAML file path | unset | Feature-gated weight-load surgery configuration. `--surgery` takes precedence when the `surgery` feature is built. |

diff --git a/src/commands/generate.rs b/src/commands/generate.rs
@@ -33,6 +33,9 @@ use mlxcel::{
         resolve_model_shard_plan, shard_config_from_cli, validate_supported_runtime,
     },
     initialize_runtime, load_model, load_model_with_adapter, load_model_with_tensor_parallel,
+    memory_estimate::{
+        MemoryEstimate, QuantHint, estimate_total_memory, format_bytes, format_estimate,
+    },
     quant_advisor::{advise_quantization, print_quant_advice},
     sampling::{ResolvedSamplingParams, build_sampling_config},
     server::chat_template::{ChatMessage, ChatTemplateProcessor},
@@ -108,6 +111,7 @@ fn print_runtime_setup(runtime: &RuntimeSetup) {
 
 fn load_generation_model(
     args: &GenerateArgs,
+    preflight: Option<&MemoryEstimate>,
 ) -> Result<(mlxcel::LoadedModel, mlxcel::tokenizer::MlxcelTokenizer)> {
     println!("Loading model from {:?}...", args.model.model);
     let load_start = Instant::now();
@@ -153,9 +157,151 @@ fn load_generation_model(
         load_seconds = load_elapsed.as_secs_f64(),
         "Model resident after load",
     );
+
+    // Issue #56: compare the pre-load estimate against MLX's
+    // observed active memory once loading is complete. The delta
+    // feeds future headroom-factor calibration (see the recipe on
+    // `memory_estimate::DEFAULT_HEADROOM_FACTOR`).
+    //
+    // On Linux/CPU MLX returns zero for most memory metrics, so we
+    // skip the delta when `snap.active_bytes == 0` — it would just
+    // print a misleading 100%-off delta line. The structural
+    // wiring is verified by the call site and the unit tests; the
+    // numerical delta is meaningful only on Apple Silicon (Metal) /
+    // CUDA backends that populate the active counter.
+    if let Some(est) = preflight {
+        log_estimate_vs_actual_delta(est, &snap);
+    }
     Ok(result)
 }
 
+/// Log the delta between a pre-load `MemoryEstimate` and the
+/// post-load MLX allocator snapshot.
+///
+/// When `snap.active_bytes` is zero the comparison is skipped and a
+/// "skipped" line plus tracing event are emitted instead. Otherwise
+/// prints estimate, actual, the direction (`under-estimated` when
+/// actual >= estimate, `over-estimated` otherwise), the absolute
+/// delta and the actual/estimate ratio, and emits a tracing event
+/// carrying the same figures plus the estimate's components.
+fn log_estimate_vs_actual_delta(est: &MemoryEstimate, snap: &mlxcel_core::memory::MemorySnapshot) {
+    if snap.active_bytes == 0 {
+        // No allocator counter to compare against; report the skip so the
+        // log shows the preflight ran but could not be validated here.
+        println!(
+            "Memory estimate vs actual: skipped (MLX active_memory() is 0 — \
+             non-Metal/CUDA backend; estimate was {} and is structurally valid \
+             but cannot be verified without a populated allocator counter)",
+            format_bytes(est.total_bytes),
+        );
+        tracing::info!(
+            estimate_total = est.total_bytes,
+            actual_active = snap.active_bytes,
+            skipped = true,
+            reason = "active_memory zero on this backend",
+            "Memory estimate vs actual delta",
+        );
+        return;
+    }
+
+    let est_bytes = est.total_bytes;
+    let actual = snap.active_bytes;
+    // The label describes the ESTIMATE: observed usage at or above the
+    // estimate means the estimate was too low, i.e. under-estimated.
+    let (delta_label, delta_bytes) = if actual >= est_bytes {
+        ("under-estimated by", actual.saturating_sub(est_bytes))
+    } else {
+        ("over-estimated by", est_bytes.saturating_sub(actual))
+    };
+    let ratio = if est_bytes > 0 {
+        actual as f64 / est_bytes as f64
+    } else {
+        0.0
+    };
+    println!(
+        "Memory estimate vs actual: estimate {} | actual {} | {} {} (ratio {:.3})",
+        format_bytes(est_bytes),
+        format_bytes(actual),
+        delta_label,
+        format_bytes(delta_bytes),
+        ratio,
+    );
+    tracing::info!(
+        estimate_total = est_bytes,
+        actual_active = actual,
+        delta_bytes,
+        ratio,
+        headroom_factor = est.headroom_factor,
+        weights_bytes = est.weights_bytes,
+        kv_cache_bytes = est.kv_cache_bytes,
+        runtime_headroom_bytes = est.runtime_headroom_bytes,
+        "Memory estimate vs actual delta",
+    );
+}
+
+/// Run the `--estimate-memory` preflight for `mlxcel generate`.
+///
+/// Returns `Ok(None)` when `--estimate-memory` was not passed. Otherwise the
+/// estimate is computed with `estimate_total_memory`, its formatted breakdown
+/// is printed to stdout, and `Ok(Some(estimate))` is returned so the caller
+/// can later log the estimate-vs-actual delta.
+///
+/// The KV context length is taken from `--max-tokens` (clamped to at least 1)
+/// and the batch size is fixed at 1.
+///
+/// # Errors
+///
+/// Returns an error naming the total, available and overflow figures when the
+/// estimate does not fit and `--force` was not set. With `--force` the
+/// overflow is reported as a warning on stderr and the estimate is returned.
+fn run_memory_preflight(args: &GenerateArgs) -> Result<Option<MemoryEstimate>> {
+    if !args.generation.estimate_memory {
+        return Ok(None);
+    }
+
+    // The estimator only takes a single int8-or-not flag for the KV cache, so
+    // both --cache-type-k and --cache-type-v must name int8 for int8 sizing.
+    // The `int8` and `i8` spellings are accepted independently per flag
+    // (e.g. k=int8, v=i8). Every other combination is sized as FP16.
+    let is_int8 = |label: Option<&str>| matches!(label, Some("int8" | "i8"));
+    let kv_int8 = is_int8(args.generation.turbo.cache_type_k.as_deref())
+        && is_int8(args.generation.turbo.cache_type_v.as_deref());
+
+    // Use the user's `--max-tokens` as the KV ctx_len input, clamped to at
+    // least one token. `mlxcel inspect --max-tokens N` feeds the same
+    // estimator argument, so both views size the KV cache the same way.
+    let ctx_len = args.generation.max_tokens.max(1) as u64;
+
+    let estimate =
+        estimate_total_memory(&args.model.model, ctx_len, 1, QuantHint::Default, kv_int8);
+
+    // Print the breakdown before the fit decision so that an abort below
+    // still shows the byte table.
+    let banner = format_estimate(&args.model.model, &estimate);
+    println!("{banner}");
+
+    if !estimate.fits {
+        if args.generation.force_memory {
+            eprintln!(
+                "WARNING: --estimate-memory preflight says this load is over budget by {}. \
+                 Continuing because --force was set.",
+                format_bytes(estimate.overflow_bytes()),
+            );
+        } else {
+            return Err(anyhow::anyhow!(
+                "--estimate-memory: total {} exceeds available {} by {}. \
+                 Pass --force (or --no-memory-check) to override, or rerun with \
+                 a smaller --max-tokens / a smaller model.",
+                format_bytes(estimate.total_bytes),
+                format_bytes(estimate.available_bytes),
+                format_bytes(estimate.overflow_bytes()),
+            ));
+        }
+    }
+
+    Ok(Some(estimate))
+}
+
 fn cli_pipeline_requested(args: &GenerateArgs) -> bool {
     args.pipeline_parallel.pp_size > 1 || args.pipeline_parallel.pp_layers.is_some()
 }
@@ -906,6 +1052,15 @@ pub(crate) fn run_generate(args: GenerateArgs) -> Result<()> {
         }
     }
 
+    // Memory preflight (issue #56). Runs the unified estimator and
+    // aborts when total > available. Skipped when --estimate-memory
+    // was not passed. --force / --no-memory-check downgrades the
+    // abort to a warning. Sub-issue C's `MLXCEL_MEMORY_LIMIT` env hook
+    // is honoured transparently by the estimator's
+    // `resolve_available_memory` step (MLX allocator soft cap wins
+    // over OS RAM when nonzero).
+    let preflight_estimate = run_memory_preflight(&args)?;
+
     let pipeline_requested = cli_pipeline_requested(&args);
     let tokenizer = load_tokenizer(&args.model.model)?;
     let prompt = load_cli_prompt(
@@ -1047,7 +1202,7 @@ pub(crate) fn run_generate(args: GenerateArgs) -> Result<()> {
             &args,
         )?
     } else {
-        let (model, _loaded_tokenizer) = load_generation_model(&args)?;
+        let (model, _loaded_tokenizer) = load_generation_model(&args, preflight_estimate.as_ref())?;
         let vlm_embeddings = generate_vlm::compute_vlm_embeddings(
             &model,
             &mut prompt_tokens,

diff --git a/src/commands/generate_tests.rs b/src/commands/generate_tests.rs
@@ -106,6 +106,8 @@ fn sample_generate_args(model_path: PathBuf) -> crate::GenerateArgs {
             profile: false,
             no_chat_template: false,
             recommend_quant: false,
+            estimate_memory: false,
+            force_memory: false,
             turbo: mlxcel::cli::turbo_args::TurboKvCacheArgs::default(),
         },
         sampling: crate::SamplingOptions {

diff --git a/src/commands/inspect.rs b/src/commands/inspect.rs
@@ -0,0 +1,121 @@
+// Copyright 2025-2026 Lablup Inc. and Jeongkyu Shin
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! CLI handler for `mlxcel inspect`.
+//!
+//! Read-only entry point that surfaces the unified memory estimator
+//! (issue #56, epic #52 capstone). Prints the byte breakdown for
+//! weights / KV cache / runtime activation headroom / total vs
+//! available unified memory, then exits without loading the model.
+//!
+//! Used by: operators sizing a model for a given host before
+//! launching `mlxcel generate` or `mlxcel serve`.
+
+use anyhow::{Result, anyhow};
+
+use mlxcel::memory_estimate::{QuantHint, estimate_total_memory, format_estimate};
+
+use crate::InspectArgs;
+
+/// Run the `mlxcel inspect` subcommand.
+///
+/// Prints the memory estimate for `args.model` and returns `Ok(())` whether
+/// or not the configuration fits. Errors only when the model path does not
+/// exist or `--quant` carries an unrecognised label.
+pub(crate) fn run_inspect(args: InspectArgs) -> Result<()> {
+    if !args.model.exists() {
+        return Err(anyhow!(
+            "Model directory does not exist: {}",
+            args.model.display()
+        ));
+    }
+
+    // Translate the user-facing `--quant` label into the typed hint.
+    let quant = parse_quant_hint(&args.quant)?;
+
+    // Translate the K/V cache flag pair into the int8/fp16 decision the
+    // estimator understands. Both flags must name int8 for KV bytes to be
+    // sized as int8; the `int8` and `i8` spellings are accepted
+    // independently per flag (e.g. k=int8, v=i8). Any other combination is
+    // treated as fp16 because mixed-precision KV is not modelled here.
+    let is_int8 = |label: Option<&str>| matches!(label, Some("int8" | "i8"));
+    let kv_int8 = is_int8(args.turbo.cache_type_k.as_deref())
+        && is_int8(args.turbo.cache_type_v.as_deref());
+
+    let estimate = estimate_total_memory(&args.model, args.max_tokens, args.batch, quant, kv_int8);
+
+    let banner = format_estimate(&args.model, &estimate);
+    println!("{banner}");
+
+    if !estimate.fits {
+        // Exit successfully — `inspect` is read-only and informational.
+        // The caller can pipe this to a script that checks for the
+        // "DOES NOT FIT" marker. Returning Err here would conflate
+        // "inspect ran successfully and reported over-capacity" with
+        // "inspect itself failed".
+        println!(
+            "Note: this configuration is expected to fail the `--estimate-memory` \
+             preflight on `mlxcel generate` / `mlxcel serve` unless `--force` is set."
+        );
+    }
+
+    Ok(())
+}
+
+/// Map the user-facing `--quant` label onto a typed [`QuantHint`].
+///
+/// Labels are compared ASCII case-insensitively: `default` (or empty),
+/// `fp16`/`float16`, `int8`/`i8`, `int4`/`i4`. Anything else is an error.
+fn parse_quant_hint(label: &str) -> Result<QuantHint> {
+    let normalized = label.to_ascii_lowercase();
+    match normalized.as_str() {
+        "" | "default" => Ok(QuantHint::Default),
+        "float16" | "fp16" => Ok(QuantHint::Fp16),
+        "i8" | "int8" => Ok(QuantHint::Int8),
+        "i4" | "int4" => Ok(QuantHint::Int4),
+        unknown => Err(anyhow!(
+            "--quant: unknown value '{unknown}'; expected one of \
+             default, fp16, int8, int4"
+        )),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn parse_quant_hint_accepts_known_labels() {
+        for (label, expected) in [
+            ("default", QuantHint::Default),
+            ("", QuantHint::Default),
+            ("fp16", QuantHint::Fp16),
+            ("float16", QuantHint::Fp16),
+            ("int8", QuantHint::Int8),
+            ("i8", QuantHint::Int8),
+            ("int4", QuantHint::Int4),
+            ("i4", QuantHint::Int4),
+            ("INT8", QuantHint::Int8),
+        ] {
+            assert_eq!(parse_quant_hint(label).unwrap(), expected);
+        }
+    }
+
+    #[test]
+    fn parse_quant_hint_rejects_unknown() {
+        let msg = parse_quant_hint("turbo3").unwrap_err().to_string();
+        assert!(msg.contains("unknown"), "expected 'unknown' in: {msg}");
+        assert!(msg.contains("turbo3"), "expected label echoed back in: {msg}");
+    }
+}
diff --git a/src/commands/mod.rs b/src/commands/mod.rs
@@ -21,8 +21,10 @@
 pub(crate) mod download;
 pub(crate) mod generate;
 mod generate_vlm;
+pub(crate) mod inspect;
 mod serve;
 
 pub(crate) use download::run_download;
 pub(crate) use generate::run_generate;
+pub(crate) use inspect::run_inspect;
 pub(crate) use serve::run_serve;