Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 43 additions & 36 deletions src/bin/mlx_server.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@ use mlxcel::server::{
env_fallback_lang_bias, env_fallback_lang_bias_include_byte_fragments,
env_fallback_prompt_cache_capacity_bytes, env_fallback_prompt_cache_enabled,
env_fallback_prompt_cache_max_entries, env_fallback_prompt_cache_min_prefix,
env_fallback_prompt_cache_ttl, env_fallback_reasoning_budget, start_server,
env_fallback_prompt_cache_ttl, env_fallback_reasoning_budget, long_cli_flag_was_set,
start_server,
};

/// mlxcel-server: llama-server compatible HTTP server for MLX inference
Expand All @@ -49,7 +50,7 @@ use mlxcel::server::{
/// 2. Subcommand mode:
/// `mlxcel-server download <REPO_ID>`
/// `download` fetches a HuggingFace model snapshot using the same
/// downloader the `mlxcel` CLI uses (issue #457). Server flags are
/// downloader the `mlxcel` CLI uses. Server flags are
/// rejected when a subcommand is supplied.
#[derive(Parser, Debug)]
#[command(
Expand Down Expand Up @@ -101,7 +102,7 @@ Thunderbolt mode:
Subcommands:
download <REPO_ID> Fetch a HuggingFace model snapshot into models/<basename>

See also: docs/PIPELINE_PARALLELISM.md"
See also: docs/distributed.md"
)]
struct Cli {
/// Subcommand to run. When omitted, the binary boots the HTTP server
Expand Down Expand Up @@ -269,7 +270,7 @@ struct ServerArgs {
/// When set to `N > 0`, the batch scheduler caps each per-sequence plain
/// `KVCache` to `N` tokens by dropping the oldest entries once `offset`
/// exceeds the bound. Mirrors upstream mlx-lm's
/// `BatchGenerator(max_kv_size=N)` parameter (PR #1106).
/// `BatchGenerator(max_kv_size=N)` parameter.
///
/// Sliding-window models that already build their own `RotatingKVCache`
/// (Gemma 3/4, Exaone 4, RecurrentGemma, Step 3.5, gpt-oss) are
Expand All @@ -288,7 +289,7 @@ struct ServerArgs {
)]
max_kv_size: usize,

/// Issue #622: maximum number of responses persisted by the OpenAI
/// Maximum number of responses persisted by the OpenAI
/// `/v1/responses` store (in-memory). `0` disables persistence
/// entirely. Also reads `LLAMA_ARG_RESPONSES_STORE_MAX_ENTRIES`.
#[arg(
Expand All @@ -299,7 +300,7 @@ struct ServerArgs {
)]
responses_store_max_entries: usize,

/// Issue #622: TTL (seconds) for in-memory Responses-API response
/// TTL (seconds) for in-memory Responses-API response
/// entries. `0` disables TTL.
/// Also reads `LLAMA_ARG_RESPONSES_STORE_TTL_SECS`.
#[arg(
Expand All @@ -310,7 +311,7 @@ struct ServerArgs {
)]
responses_store_ttl_secs: u64,

/// Issue #622: maximum number of conversation transcripts persisted
/// Maximum number of conversation transcripts persisted
/// for the OpenAI Responses API `conversation` field. `0` disables.
/// Also reads `LLAMA_ARG_CONVERSATION_STORE_MAX_ENTRIES`.
#[arg(
Expand All @@ -321,7 +322,7 @@ struct ServerArgs {
)]
conversation_store_max_entries: usize,

/// Issue #622: TTL (seconds) for conversation transcript entries.
/// TTL (seconds) for conversation transcript entries.
/// `0` disables TTL.
/// Also reads `LLAMA_ARG_CONVERSATION_STORE_TTL_SECS`.
#[arg(
Expand Down Expand Up @@ -440,7 +441,7 @@ struct ServerArgs {
#[arg(long, value_name = "PATH")]
distributed_config: Option<PathBuf>,

/// Role this node plays in the cluster (prefill, decode, pipeline_stage, tensor_parallel_rank, hybrid)
/// Role this node plays in the cluster (prefill, decode, pipeline_stage, tensor_parallel_rank, pipeline_tensor_parallel, hybrid)
#[arg(long, value_name = "ROLE")]
node_role: Option<String>,

Expand Down Expand Up @@ -675,11 +676,9 @@ struct ServerArgs {

/// Enable experimental elastic pipeline-parallel repartitioning.
///
/// When set, `mlxcel-server` constructs a repartition coordinator (see
/// `docs_internal/architecture/elastic-pipeline-repartition-20260418.md`)
/// that can drain in-flight requests, recompute the partition plan, and
/// reload layer weights without a full cluster restart. Off by default —
/// v1 is explicitly opt-in.
/// When set, `mlxcel-server` constructs a repartition coordinator that can
/// drain in-flight requests, recompute the partition plan, and reload
/// layer weights without a full cluster restart. Off by default.
#[arg(long = "enable-elastic-pp", default_value_t = false)]
enable_elastic_pp: bool,

Expand Down Expand Up @@ -717,7 +716,7 @@ struct ServerArgs {
/// Currently the Prometheus endpoint is multiplexed onto the same HTTP
/// port as the OpenAI API. Passing this flag enables the endpoint.
/// A warning is logged when the requested port differs from `--port`
/// because a separate socket is deferred to a follow-up rollout.
/// because metrics are currently served on the main HTTP listener.
#[arg(long = "metrics-port", value_name = "PORT")]
metrics_port: Option<u16>,

Expand All @@ -742,7 +741,7 @@ struct ServerArgs {
#[command(flatten)]
turbo: TurboKvCacheArgs,

/// Issue #545: continuous-batching KV quantization flag group
/// Continuous-batching KV quantization flag group
/// (`--kv-bits`, `--kv-group-size`, `--kv-quant-scheme`,
/// `--kv-skip-last-layer`). Defined once in
/// `mlxcel::cli::batch_quant_args` so both server binaries
Expand All @@ -761,16 +760,16 @@ struct ServerArgs {
#[command(flatten)]
speculative: SpeculativeArgs,

/// Axis B Epic #362 (B8): language-bias options for server-wide output
/// Language-bias options for server-wide output
/// steering. See `--lang-bias`, `--lang-bias-config`, `--lang-bias-policy`,
/// and the `--lang-bias-include-*` family of flags.
///
/// The `--lang-bias` flag also reads from the `LLAMA_ARG_LANG_BIAS` env var
/// (plan §6.4, B7). CLI flag takes precedence over the env var.
/// The `--lang-bias` flag also reads from the `LLAMA_ARG_LANG_BIAS` env var.
/// CLI flag takes precedence over the env var.
#[command(flatten)]
lang_bias: LangBiasCliArgs,

/// Issue #409: default thinking-token budget for Qwen3-family models.
/// Default thinking-token budget for Qwen3-family models.
///
/// Caps the number of tokens generated inside the `<think>...</think>`
/// reasoning block. Matches llama.cpp `--reasoning-budget` semantics:
Expand All @@ -792,7 +791,7 @@ struct ServerArgs {
)]
reasoning_budget: i32,

/// Issue #410: default chat-template kwargs (JSON object).
/// Default chat-template kwargs (JSON object).
///
/// Forwarded verbatim as Jinja template kwargs when rendering chat
/// conversations. Matches llama.cpp's `--chat-template-kwargs` shape.
Expand Down Expand Up @@ -829,7 +828,11 @@ struct ServerArgs {
#[arg(
long = "prompt-cache-enabled",
default_value_t = true,
value_name = "BOOL"
value_name = "BOOL",
num_args = 0..=1,
require_equals = true,
default_missing_value = "true",
action = clap::ArgAction::Set
)]
prompt_cache_enabled: bool,

Expand Down Expand Up @@ -886,7 +889,15 @@ struct ServerArgs {
/// into hashable blocks.
///
/// Also reads `APC_ENABLED` (parity with upstream `mlx-vlm`).
#[arg(long = "apc-enabled", default_value_t = false, value_name = "BOOL")]
#[arg(
long = "apc-enabled",
default_value_t = false,
value_name = "BOOL",
num_args = 0..=1,
require_equals = true,
default_missing_value = "true",
action = clap::ArgAction::Set
)]
apc_enabled: bool,

/// Tokens per APC block (default: 16).
Expand Down Expand Up @@ -929,8 +940,7 @@ struct ServerArgs {
///
/// mlxcel-server -m models/foo --surgery surgery.yaml --port 8080
///
/// The YAML schema is documented in
/// `docs_internal/architecture/structural-finetuning-overview-20260419.md`.
/// The supported surgery operations are summarised in the project README.
#[cfg(feature = "surgery")]
#[arg(long = "surgery", value_name = "FILE", env = "MLXCEL_SURGERY")]
surgery: Option<PathBuf>,
Expand Down Expand Up @@ -980,24 +990,21 @@ fn build_startup_input(mut args: ServerArgs) -> anyhow::Result<ServerStartupInpu
// Issue #410 — env-var fallback for the chat-template kwargs default.
env_fallback_chat_template_kwargs(&mut args.chat_template_kwargs);

// Issue #424 — env-var fallbacks for prompt-cache knobs.
// `prompt_cache_enabled` clap default is `true`, so we must detect
// whether the flag was explicitly set. Since clap doesn't expose a
// "was this flag explicitly set" predicate for boolean defaults without
// using an `Option<bool>`, we pass `false` for `cli_was_set` here so
// that the env-var path is always consulted. CLI-sourced `false` is also
// correctly propagated because clap will have already stored `false`
// in `args.prompt_cache_enabled` when the user passes
// `--prompt-cache-enabled=false`.
env_fallback_prompt_cache_enabled(&mut args.prompt_cache_enabled, false);
// Env-var fallbacks for prompt-cache knobs. Detect explicit boolean flags
// from argv so `--prompt-cache-enabled=false` keeps CLI-over-env precedence
// while the compiled-in default still allows env overrides.
env_fallback_prompt_cache_enabled(
&mut args.prompt_cache_enabled,
long_cli_flag_was_set("prompt-cache-enabled"),
);
env_fallback_prompt_cache_capacity_bytes(&mut args.prompt_cache_capacity_bytes);
env_fallback_prompt_cache_max_entries(&mut args.prompt_cache_max_entries);
env_fallback_prompt_cache_ttl(&mut args.prompt_cache_ttl);
env_fallback_prompt_cache_min_prefix(&mut args.prompt_cache_min_prefix);

// Issue #552 — env-var fallbacks for the APC knobs (parity with upstream
// mlx-vlm `APC_*` env vars).
env_fallback_apc_enabled(&mut args.apc_enabled, false);
env_fallback_apc_enabled(&mut args.apc_enabled, long_cli_flag_was_set("apc-enabled"));
env_fallback_apc_block_size(&mut args.apc_block_size);
env_fallback_apc_num_blocks(&mut args.apc_num_blocks);
env_fallback_apc_hash(&mut args.apc_hash);
Expand Down
2 changes: 1 addition & 1 deletion src/cli/speculative_args.rs
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ impl SpeculativeArgs {
///
/// Note that we intentionally **do not** accept the third
/// `internal-mtp` variant of [`DrafterKind`] on the CLI — that
/// variant is auto-detected from the target checkpoint (epic #647)
/// variant is auto-detected from the target checkpoint
/// and is not user-selectable today. The accepted set on the CLI is
/// the upstream `KNOWN_DRAFTER_KINDS = {"dflash", "mtp"}` only;
/// passing `internal-mtp` returns a parse error with a hint.
Expand Down
11 changes: 7 additions & 4 deletions src/commands/serve.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@ use mlxcel::server::{
env_fallback_lang_bias, env_fallback_lang_bias_include_byte_fragments,
env_fallback_prompt_cache_capacity_bytes, env_fallback_prompt_cache_enabled,
env_fallback_prompt_cache_max_entries, env_fallback_prompt_cache_min_prefix,
env_fallback_prompt_cache_ttl, env_fallback_reasoning_budget, resolve_parallel_context_size,
start_server,
env_fallback_prompt_cache_ttl, env_fallback_reasoning_budget, long_cli_flag_was_set,
resolve_parallel_context_size, start_server,
};
use mlxcel_core::cache::KVCacheMode;

Expand Down Expand Up @@ -151,13 +151,16 @@ fn build_startup_input(mut args: crate::ServeArgs) -> anyhow::Result<ServerStart
// Issue #410 — env-var fallback for the chat-template kwargs default.
env_fallback_chat_template_kwargs(&mut args.chat_template_kwargs);
// Issue #424 — env-var fallbacks for prompt-cache knobs.
env_fallback_prompt_cache_enabled(&mut args.prompt_cache_enabled, false);
env_fallback_prompt_cache_enabled(
&mut args.prompt_cache_enabled,
long_cli_flag_was_set("prompt-cache-enabled"),
);
env_fallback_prompt_cache_capacity_bytes(&mut args.prompt_cache_capacity_bytes);
env_fallback_prompt_cache_max_entries(&mut args.prompt_cache_max_entries);
env_fallback_prompt_cache_ttl(&mut args.prompt_cache_ttl);
env_fallback_prompt_cache_min_prefix(&mut args.prompt_cache_min_prefix);
// Issue #552 — env-var fallbacks for the APC knobs.
env_fallback_apc_enabled(&mut args.apc_enabled, false);
env_fallback_apc_enabled(&mut args.apc_enabled, long_cli_flag_was_set("apc-enabled"));
env_fallback_apc_block_size(&mut args.apc_block_size);
env_fallback_apc_num_blocks(&mut args.apc_num_blocks);
env_fallback_apc_hash(&mut args.apc_hash);
Expand Down
20 changes: 10 additions & 10 deletions src/lang_bias.rs
Original file line number Diff line number Diff line change
Expand Up @@ -276,51 +276,51 @@ pub fn load_yaml_config(path: &PathBuf) -> Result<LangBiasYamlConfig, CliError>
#[derive(Args, Debug, Default, Clone)]
#[command(next_help_heading = "Language Bias Options")]
pub struct LangBiasCliArgs {
/// Language bias entries, e.g. `ja=-inf,zh=-10,ko=+5.0` (plan §6.1).
/// Language bias entries, e.g. `ja=-inf,zh=-10,ko=+5.0`.
///
/// Syntax: `<lang_code>=<bias>[,<lang_code>=<bias>]*`
/// where `<bias>` is `-inf`, `+inf`, `inf`, or a float.
/// Supported language codes: ja, zh, ko, en, ru, ar, th, hi, he, el.
#[arg(long = "lang-bias", value_name = "ENTRIES")]
pub lang_bias: Option<String>,

/// Path to a YAML file containing language bias configuration (plan §6.2).
/// Path to a YAML file containing language bias configuration.
///
/// CLI flags take precedence over YAML config values.
#[arg(long = "lang-bias-config", value_name = "PATH")]
pub lang_bias_config: Option<PathBuf>,

/// Language token inclusion policy: `conservative` (default) or `strict` (plan §6.3).
/// Language token inclusion policy: `conservative` (default) or `strict`.
///
/// Conservative: any token containing at least one character of a target script.
/// Strict: only tokens whose entire script set is contained in the target set.
#[arg(long = "lang-bias-policy", value_name = "POLICY")]
pub lang_bias_policy: Option<String>,

/// Include special tokens (BOS/EOS/PAD/…) in language sets (plan §6.3).
/// Include special tokens (BOS/EOS/PAD/…) in language sets.
///
/// By default, special tokens are excluded from all language sets.
#[arg(long = "lang-bias-include-special", default_value_t = false)]
pub include_special: bool,

/// Include purely numeric tokens in language sets (plan §6.3).
/// Include purely numeric tokens in language sets.
///
/// By default, purely numeric tokens are excluded from all language sets.
#[arg(long = "lang-bias-include-numeric", default_value_t = false)]
pub include_numeric: bool,

/// Include purely punctuation tokens in language sets (plan §6.3).
/// Include purely punctuation tokens in language sets.
///
/// By default, purely punctuation tokens are excluded from all language sets.
#[arg(long = "lang-bias-include-punctuation", default_value_t = false)]
pub include_punctuation: bool,

/// Include byte-fragment tokens in language sets (issue #405).
/// Include byte-fragment tokens in language sets.
///
/// Byte-level BPE tokenizers (Qwen, GPT-2, LLaMA, Mistral) represent
/// less-common CJK characters as sequences of individual byte tokens.
/// Each byte decodes to `U+FFFD` on its own and is classified as
/// `Other` by the Phase 1 decode path, bypassing filters like
/// `Other` by the standard decode-path classifier, bypassing filters like
/// `zh=-inf` even though the fragments reassemble into the target
/// character at generation time.
///
Expand All @@ -333,11 +333,11 @@ pub struct LangBiasCliArgs {
/// `mlxcel_lang_bias_byte_fragment_suppressions_total` metric to
/// observe how much suppression comes from byte-fragment entries.
///
/// **Default:** off (behavior bit-exact identical to Phase 1).
/// **Default:** off (behavior is unchanged unless this flag is enabled).
#[arg(long = "lang-bias-include-byte-fragments", default_value_t = false)]
pub include_byte_fragments: bool,

/// Force a rebuild of the `TokenLanguageIndex` cache (plan §6.3).
/// Force a rebuild of the `TokenLanguageIndex` cache.
///
/// Normally the cache is rebuilt only when the tokenizer vocab changes.
/// Use this flag to force a rebuild regardless of cache state.
Expand Down
Loading