Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions docs/commands.md
Original file line number Diff line number Diff line change
Expand Up @@ -371,6 +371,34 @@ This page contains example commands to help you choose models and configure Open
--assistant-confidence-threshold 0.5
```

=== "Model caching"

The `--cache-dir` parameter can be specified to cache compiled models on first start. This can greatly reduce startup time and peak memory usage on subsequent process starts.
On some setups this can reduce peak memory utilization on subsequent restarts by 3x or more, and start time by 7x. For additional details, see the
[OpenVINO model cache documentation](https://docs.openvino.ai/2026/model-server/ovms_docs_model_cache.html).

The cache can be shared by multiple processes (and on shared network filesystems such as NFS or CephFS) provided that only one process updates it at a time.

The cache will be fully or partially invalidated when doing any of the below:
* Changing the utilized device(s) (swapping GPU models, adding or removing a GPU, adding or removing a CPU, etc.)
* Changing `runtime_config` that impacts the model itself (e.g. `PERFORMANCE_HINT: THROUGHPUT` to `PERFORMANCE_HINT: LATENCY`, but not `NUM_STREAMS: 1` to `NUM_STREAMS: 2`)
* Changing any part of the software stack from the firmware up - GPU firmware, OS kernel, kernel modules/drivers, dependency libraries, OpenARC, model versions.

> [!WARNING]
> Due to OpenVINO limitations, unused cache files are never cleaned up and will persist until an operator removes them. The cache can grow large over time. It is recommended
> that operators monitor the cache size and manually clean it up as needed to reduce disk usage.


```
openarc add \
--model-name <model-name> \
--model-path <path/to/model> \
--engine ovgenai \
--model-type llm \
--device GPU \
--cache-dir <path/to/model/cache>
```

=== "list"

Reads added configurations from `openarc_config.json`.
Expand Down
10 changes: 9 additions & 1 deletion src/cli/groups/add.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@
@click.option("--runtime-config", "--rtc",
default=None,
help='OpenVINO runtime configuration as JSON string (e.g., \'{"MODEL_DISTRIBUTION_POLICY": "PIPELINE_PARALLEL"}\').')
@click.option('--cache-dir', '--cd',
required=False,
default=None,
help='Directory for the OpenVINO model cache. Caching compiled model blobs here speeds up subsequent loads of this model. Relative paths are resolved against the config file, like --model-path.')
@click.option('--draft-model-path', '--dmp',
required=False,
default=None,
Expand All @@ -53,7 +57,7 @@
type=float,
help='Confidence threshold for accepting draft tokens.')
@click.pass_context
def add(ctx, model_path, model_name, engine, model_type, device, runtime_config, draft_model_path, draft_device, num_assistant_tokens, assistant_confidence_threshold):
def add(ctx, model_path, model_name, engine, model_type, device, runtime_config, cache_dir, draft_model_path, draft_device, num_assistant_tokens, assistant_confidence_threshold):
"""- Add a model configuration to the config file."""

# Validate model path
Expand Down Expand Up @@ -85,6 +89,10 @@ def add(ctx, model_path, model_name, engine, model_type, device, runtime_config,
"runtime_config": parsed_runtime_config,
}

# Store the cache directory (resolved relative to the config file at load time)
if cache_dir:
load_config["cache_dir"] = cache_dir

# Add speculative decoding options if provided
if draft_model_path:
if not validate_model_path(draft_model_path):
Expand Down
2 changes: 2 additions & 0 deletions src/cli/groups/list.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,8 @@ def list(ctx, model_name, verbose, remove):
config_table.add_row("model_type", f"[magenta]{model_config.get('model_type')}[/magenta]")

# Display optional fields when available
if model_config.get('cache_dir'):
config_table.add_row("cache_dir", f"[yellow]{model_config.get('cache_dir')}[/yellow]")
if model_config.get('draft_model_path'):
config_table.add_row("draft_model_path", f"[red]{model_config.get('draft_model_path')}[/red]")
if model_config.get('draft_device'):
Expand Down
8 changes: 6 additions & 2 deletions src/cli/modules/server_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,8 +145,8 @@ def get_all_models(self) -> Dict[str, Dict[str, Any]]:
return {name: self._resolve_model_paths(cfg) for name, cfg in models.items()}

def _resolve_model_paths(self, model_config: Dict[str, Any]) -> Dict[str, Any]:
"""Return a copy of model_config with a relative model_path made
absolute by joining it onto the config file's directory."""
"""Return a copy of model_config with relative model_path, draft_model_path,
and cache_dir made absolute by joining them onto the config file's directory."""
resolved = dict(model_config)

path = resolved.get("model_path")
Expand All @@ -157,6 +157,10 @@ def _resolve_model_paths(self, model_config: Dict[str, Any]) -> Dict[str, Any]:
if draft_model_path and not Path(draft_model_path).is_absolute():
resolved["draft_model_path"] = str((self.config_file.parent / draft_model_path).resolve())

cache_dir = resolved.get("cache_dir")
if cache_dir and not Path(cache_dir).is_absolute():
resolved["cache_dir"] = str((self.config_file.parent / cache_dir).resolve())

return resolved

def remove_model_config(self, model_name: str) -> bool:
Expand Down
4 changes: 4 additions & 0 deletions src/engine/openvino/kokoro.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,10 @@ def load_model(self, load_config: ModelLoadConfig):
self.context_length = model_config["plbert"]["max_position_embeddings"]

core = ov.Core()
if load_config.cache_dir:
core.set_property({"CACHE_DIR": load_config.cache_dir})
if load_config.runtime_config:
core.set_property(load_config.runtime_config)
self.model = core.compile_model(self.model_path / "openvino_model.xml", self._device)
return self.model

Expand Down
4 changes: 4 additions & 0 deletions src/engine/openvino/qwen3_asr/qwen3_asr.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,10 @@ def __init__(self, load_config: ModelLoadConfig):
self._mel_filters_t = torch.from_numpy(self.mel_filters).float()
self._hann_window = torch.hann_window(WINDOW_SIZE)
self.core = ov.Core()
if load_config.cache_dir:
self.core.set_property({"CACHE_DIR": load_config.cache_dir})
if load_config.runtime_config:
self.core.set_property(load_config.runtime_config)
self.t_model_load = 0.0
self.enc_model = None
self.emb_model = None
Expand Down
7 changes: 5 additions & 2 deletions src/engine/openvino/qwen3_tts/qwen3_tts.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,8 @@ def load_model(self, load_config: ModelLoadConfig) -> None:
p = Path(load_config.model_path)
device = load_config.device
core = ov.Core()
core.set_property({"CACHE_DIR": str(p / ".ov_cache")})
if load_config.cache_dir:
core.set_property({"CACHE_DIR": load_config.cache_dir})

self.tokenizer = AutoTokenizer.from_pretrained(str(p), trust_remote_code=True)

Expand All @@ -123,7 +124,9 @@ def load_model(self, load_config: ModelLoadConfig) -> None:
CP_MAX_POS, CP_HEAD_DIM, CP_ROPE_THETA,
)

_hint = {"PERFORMANCE_HINT": "LATENCY"}
# runtime_config is merged in (and can override the default hint) so it
# reaches every compiled sub-model.
_hint = {"PERFORMANCE_HINT": "LATENCY", **(load_config.runtime_config or {})}
self._text_model_c = core.compile_model(str(p / "text_model.xml"), device, _hint)
self._codec_emb_c = core.compile_model(str(p / "codec_embedding.xml"), device, _hint)
# Code predictor: many tiny inferences per frame; CPU avoids GPU launch/transfer overhead.
Expand Down
13 changes: 10 additions & 3 deletions src/engine/optimum/optimum_emb.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,9 +81,16 @@ def load_model(self, loader: ModelLoadConfig):
loader: ModelLoadConfig containing model_path, device, engine, and runtime_config.
"""

self.model = OVModelForFeatureExtraction.from_pretrained(loader.model_path,
device=loader.device,
export=False)
ov_config = {**(loader.runtime_config or {})}
if loader.cache_dir:
ov_config["CACHE_DIR"] = loader.cache_dir

from_pretrained_kwargs = {"device": loader.device, "export": False}
if ov_config:
from_pretrained_kwargs["ov_config"] = ov_config

self.model = OVModelForFeatureExtraction.from_pretrained(loader.model_path,
**from_pretrained_kwargs)

self.tokenizer = AutoTokenizer.from_pretrained(loader.model_path)
logging.info(f"Model loaded successfully: {loader.model_name}")
Expand Down
14 changes: 10 additions & 4 deletions src/engine/optimum/optimum_rr.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,10 +72,16 @@ def load_model(self, loader: ModelLoadConfig):
loader: ModelLoadConfig containing model_path, device, engine, and runtime_config.
"""

self.model = OVModelForCausalLM.from_pretrained(loader.model_path,
device=loader.device,
export=False,
use_cache=False)
ov_config = {**(loader.runtime_config or {})}
if loader.cache_dir:
ov_config["CACHE_DIR"] = loader.cache_dir

from_pretrained_kwargs = {"device": loader.device, "export": False, "use_cache": False}
if ov_config:
from_pretrained_kwargs["ov_config"] = ov_config

self.model = OVModelForCausalLM.from_pretrained(loader.model_path,
**from_pretrained_kwargs)

self.tokenizer = AutoTokenizer.from_pretrained(loader.model_path)
self.token_false_id = self.tokenizer.convert_tokens_to_ids("no")
Expand Down
10 changes: 9 additions & 1 deletion src/engine/ov_genai/llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,9 +232,15 @@ def load_model(self, loader: ModelLoadConfig):
draft_model = None
if loader.draft_model_path:
try:
# Cache the draft model alongside the main model. OpenVINO keys
# cache blobs by model content, so sharing one CACHE_DIR is safe.
draft_model_properties = {}
if loader.cache_dir:
draft_model_properties['CACHE_DIR'] = loader.cache_dir
draft_model = openvino_genai.draft_model(
loader.draft_model_path,
loader.draft_device
loader.draft_device,
**draft_model_properties
)
logger.info(f"Loaded draft model from {loader.draft_model_path} on {loader.draft_device}")
self.draft_model_loaded = True
Expand Down Expand Up @@ -263,6 +269,8 @@ def load_model(self, loader: ModelLoadConfig):
self.model_assistant_confidence_threshold = None

pipeline_kwargs = {**(loader.runtime_config or {})}
if loader.cache_dir:
pipeline_kwargs['CACHE_DIR'] = loader.cache_dir
if draft_model is not None:
pipeline_kwargs['draft_model'] = draft_model

Expand Down
6 changes: 5 additions & 1 deletion src/engine/ov_genai/vlm.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,10 +271,14 @@ def load_model(self, loader: ModelLoadConfig):
try:
logger.info(f"{loader.model_type} on {loader.device} with {loader.runtime_config}")

pipeline_kwargs = {**(loader.runtime_config or {})}
if loader.cache_dir:
pipeline_kwargs['CACHE_DIR'] = loader.cache_dir

self.model_path = VLMPipeline(
loader.model_path,
loader.device,
**(loader.runtime_config or {})
**pipeline_kwargs
)

self.tokenizer = AutoTokenizer.from_pretrained(loader.model_path)
Expand Down
6 changes: 5 additions & 1 deletion src/engine/ov_genai/whisper.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,10 +75,14 @@ def load_model(self, loader: ModelLoadConfig) -> None:
"""
Load (or reload) a Whisper model into a pipeline for the given device.
"""
pipeline_kwargs = {**(loader.runtime_config or {})}
if loader.cache_dir:
pipeline_kwargs['CACHE_DIR'] = loader.cache_dir

self.whisper_model = WhisperPipeline(
loader.model_path,
loader.device,
**(loader.runtime_config or {})
**pipeline_kwargs
)

async def unload_model(self, registry: ModelRegistry, model_name: str) -> bool:
Expand Down
10 changes: 9 additions & 1 deletion src/server/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,16 @@ async def lifespan(app: FastAPI):
model_path = model_config.get("model_path")
if model_path and not Path(model_path).is_absolute():
model_config["model_path"] = str((config_file.parent / model_path).resolve())


cache_dir = model_config.get("cache_dir")
if cache_dir and not Path(cache_dir).is_absolute():
cache_dir = str((config_file.parent / cache_dir).resolve())
model_config["cache_dir"] = cache_dir

try:
if cache_dir:
# Create the cache directory at startup if it doesn't exist.
Path(cache_dir).mkdir(parents=True, exist_ok=True)
await _registry.register_load(ModelLoadConfig(**model_config))
logger.info(f"Startup: loaded '{name}'")
except Exception as e:
Expand Down
9 changes: 9 additions & 0 deletions src/server/models/registration.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,15 @@ class ModelLoadConfig(BaseModel):
runtime_config: Dict[str, Any] = Field(
default_factory=dict,
description="Optional OpenVINO runtime properties.")
cache_dir: Optional[str] = Field(
default=None,
description="""
Optional directory for the OpenVINO model cache (CACHE_DIR property).

When set, compiled model blobs are cached here so subsequent loads of
this model skip recompilation. Relative paths are resolved against the
config file's directory when the model is loaded, the same as
model_path.""")

draft_model_path: Optional[str] = Field(
default=None,
Expand Down
29 changes: 29 additions & 0 deletions src/tests/test_optimum_emb_unit.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,12 +106,41 @@ def test_load_model_initializes_pipeline(monkeypatch: pytest.MonkeyPatch, load_c
load_config.model_path,
device=load_config.device,
export=False,
ov_config=load_config.runtime_config,
)
emb_module.AutoTokenizer.from_pretrained.assert_called_once_with(load_config.model_path)
assert emb.model is model_instance
assert emb.tokenizer is tokenizer_instance


def test_load_model_forwards_cache_dir(monkeypatch: pytest.MonkeyPatch) -> None:
    """A configured cache_dir is handed to from_pretrained as ov_config["CACHE_DIR"]."""
    cache_path = "/tmp/ov_cache"
    config = ModelLoadConfig(
        model_path="/models/mock",
        model_name="cache-model",
        model_type=ModelType.EMB,
        engine=EngineType.OV_OPTIMUM,
        device="CPU",
        runtime_config={},
        cache_dir=cache_path,
    )
    engine = Optimum_EMB(config)

    # Stub out both loaders so no real model or tokenizer files are touched.
    model_factory = MagicMock(return_value=MagicMock())
    tokenizer_factory = MagicMock(return_value=MagicMock())
    monkeypatch.setattr(emb_module.OVModelForFeatureExtraction, "from_pretrained", model_factory)
    monkeypatch.setattr(emb_module.AutoTokenizer, "from_pretrained", tokenizer_factory)

    engine.load_model(config)

    # With an empty runtime_config, the cache directory is the only ov_config entry.
    expected_ov_config = {"CACHE_DIR": cache_path}
    model_factory.assert_called_once_with(
        config.model_path,
        device=config.device,
        export=False,
        ov_config=expected_ov_config,
    )


def test_unload_model_resets_state(monkeypatch: pytest.MonkeyPatch, load_config: ModelLoadConfig) -> None:
emb = Optimum_EMB(load_config)
emb.model = object()
Expand Down
1 change: 1 addition & 0 deletions src/tests/test_optimum_rr_unit.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@ def test_load_model_initializes_pipeline(monkeypatch: pytest.MonkeyPatch, load_c
device=load_config.device,
export=False,
use_cache=False,
ov_config=load_config.runtime_config,
)
rr_module.AutoTokenizer.from_pretrained.assert_called_once_with(load_config.model_path)
assert rr.model is model_instance
Expand Down
27 changes: 27 additions & 0 deletions src/tests/test_ov_genai_kokoro_unit.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,33 @@ def test_load_model_sets_model_and_metadata(tmp_path: Path, monkeypatch: pytest.
assert kokoro.context_length == 256


def test_load_model_forwards_runtime_config(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
    """runtime_config is applied to the OpenVINO core via a single set_property call."""
    # Lay out a minimal on-disk model directory for the loader to read.
    kokoro_dir = tmp_path / "kokoro"
    kokoro_dir.mkdir()
    model_metadata = {"vocab": ["a"], "plbert": {"max_position_embeddings": 256}}
    (kokoro_dir / "config.json").write_text(json.dumps(model_metadata), encoding="utf-8")
    (kokoro_dir / "openvino_model.xml").write_text("<xml />", encoding="utf-8")

    # Replace ov.Core with a factory that returns a mock core we can inspect.
    fake_core = MagicMock()
    fake_core.compile_model.return_value = "compiled-model"
    monkeypatch.setattr(kokoro_module.ov, "Core", MagicMock(return_value=fake_core))

    runtime_properties = {"NUM_STREAMS": "2"}
    config = ModelLoadConfig(
        model_path=str(kokoro_dir),
        model_name="rtc-kokoro",
        model_type=ModelType.KOKORO,
        engine=EngineType.OPENVINO,
        device="CPU",
        runtime_config=runtime_properties,
    )

    engine = OV_Kokoro(config)
    engine.load_model(config)

    # No cache_dir is configured, so runtime_config is the only property set.
    fake_core.set_property.assert_called_once_with({"NUM_STREAMS": "2"})


def test_chunk_forward_pass_yields_chunks(monkeypatch: pytest.MonkeyPatch, load_config: ModelLoadConfig) -> None:
kokoro = OV_Kokoro(load_config)
kokoro.model = object()
Expand Down
Loading