SearchSavior · solidDoWant · May 22, 2026 · May 23, 2026 · May 23, 2026 · May 23, 2026
diff --git a/docs/commands.md b/docs/commands.md
@@ -371,6 +371,34 @@ This page contains example commands to help you choose models and configure Open
               --assistant-confidence-threshold 0.5
             ```
 
+        === "Model caching"
+
+            The `--cache-dir` parameter can be specified to cache compiled models upon first start. This can greatly reduce startup memory cost and time for subsequent process starts.
+            On some setups this can reduce peak memory utilization on subsequent restarts by 3x or more, and start time by 7x. For additional details, see
+            [here](https://docs.openvino.ai/2026/model-server/ovms_docs_model_cache.html).
+
+            The cache can be shared by multiple processes (and on shared network filesystems such as NFS or CephFS) provided that only one process updates it at a time.
+
+            The cache will be fully or partially invalidated when doing any of the below:
+            * Changing the utilized device(s) (swapping GPU models, adding or removing a GPU, adding or removing a CPU, etc.)
+            * Changing `runtime_config` that impacts the model itself (e.g. `PERFORMANCE_HINT: THROUGHPUT` to `PERFORMANCE_HINT: LATENCY` but not `NUM_STREAMS: 1` to `NUM_STREAMS: 2`)
+            * Changing any part of the software stack from the firmware up - GPU firmware, OS kernel, kernel modules/drivers, dependency libraries, OpenARC, model versions.
+
+            > [!WARNING]
+            > Due to OpenVINO limitations, unused cache files are never cleaned up and will persist until an operator removes them. The cache can grow large over time. It is recommended
+            > that operators monitor the cache size and manually clean it up as needed to reduce disk usage.
+
+
+            ```
+            openarc add \
+              --model-name <model-name> \
+              --model-path <path/to/model> \
+              --engine ovgenai \
+              --model-type llm \
+              --device GPU \
+              --cache-dir <path/to/model/cache>
+            ```
+
 === "list"
 
     Reads added configurations from `openarc_config.json`.

diff --git a/src/cli/groups/add.py b/src/cli/groups/add.py
@@ -34,6 +34,10 @@
 @click.option("--runtime-config", "--rtc",
     default=None,
     help='OpenVINO runtime configuration as JSON string (e.g., \'{"MODEL_DISTRIBUTION_POLICY": "PIPELINE_PARALLEL"}\').')
+@click.option('--cache-dir', '--cd',
+    required=False,
+    default=None,
+    help='Directory for the OpenVINO model cache. Caching compiled model blobs here speeds up subsequent loads of this model. Relative paths are resolved against the config file, like --model-path.')
 @click.option('--draft-model-path', '--dmp',
     required=False,
     default=None,
@@ -53,7 +57,7 @@
     type=float,
     help='Confidence threshold for accepting draft tokens.')
 @click.pass_context
-def add(ctx, model_path, model_name, engine, model_type, device, runtime_config, draft_model_path, draft_device, num_assistant_tokens, assistant_confidence_threshold):
+def add(ctx, model_path, model_name, engine, model_type, device, runtime_config, cache_dir, draft_model_path, draft_device, num_assistant_tokens, assistant_confidence_threshold):
     """- Add a model configuration to the config file."""
 
     # Validate model path
@@ -85,6 +89,10 @@ def add(ctx, model_path, model_name, engine, model_type, device, runtime_config,
         "runtime_config": parsed_runtime_config,
     }
 
+    # Store the cache directory (resolved relative to the config file at load time)
+    if cache_dir:
+        load_config["cache_dir"] = cache_dir
+
     # Add speculative decoding options if provided
     if draft_model_path:
         if not validate_model_path(draft_model_path):

diff --git a/src/cli/groups/list.py b/src/cli/groups/list.py
@@ -120,6 +120,8 @@ def list(ctx, model_name, verbose, remove):
         config_table.add_row("model_type", f"[magenta]{model_config.get('model_type')}[/magenta]")
 
         # Display optional fields when available
+        if model_config.get('cache_dir'):
+            config_table.add_row("cache_dir", f"[yellow]{model_config.get('cache_dir')}[/yellow]")
         if model_config.get('draft_model_path'):
             config_table.add_row("draft_model_path", f"[red]{model_config.get('draft_model_path')}[/red]")
         if model_config.get('draft_device'):

diff --git a/src/cli/modules/server_config.py b/src/cli/modules/server_config.py
@@ -145,8 +145,8 @@ def get_all_models(self) -> Dict[str, Dict[str, Any]]:
         return {name: self._resolve_model_paths(cfg) for name, cfg in models.items()}
 
     def _resolve_model_paths(self, model_config: Dict[str, Any]) -> Dict[str, Any]:
-        """Return a copy of model_config with a relative model_path made
-        absolute by joining it onto the config file's directory."""
+        """Return a copy of model_config with relative model_path, draft_model_path,
+        and cache_dir made absolute by joining them onto the config file's directory."""
         resolved = dict(model_config)
 
         path = resolved.get("model_path")
@@ -157,6 +157,10 @@ def _resolve_model_paths(self, model_config: Dict[str, Any]) -> Dict[str, Any]:
         if draft_model_path and not Path(draft_model_path).is_absolute():
             resolved["draft_model_path"] = str((self.config_file.parent / draft_model_path).resolve())
 
+        cache_dir = resolved.get("cache_dir")
+        if cache_dir and not Path(cache_dir).is_absolute():
+            resolved["cache_dir"] = str((self.config_file.parent / cache_dir).resolve())
+
         return resolved
 
     def remove_model_config(self, model_name: str) -> bool:

diff --git a/src/engine/openvino/kokoro.py b/src/engine/openvino/kokoro.py
@@ -51,6 +51,10 @@ def load_model(self, load_config: ModelLoadConfig):
         self.context_length = model_config["plbert"]["max_position_embeddings"]
 
         core = ov.Core()
+        if load_config.cache_dir:
+            core.set_property({"CACHE_DIR": load_config.cache_dir})
+        if load_config.runtime_config:
+            core.set_property(load_config.runtime_config)
         self.model = core.compile_model(self.model_path / "openvino_model.xml", self._device)
         return self.model
 

diff --git a/src/engine/openvino/qwen3_asr/qwen3_asr.py b/src/engine/openvino/qwen3_asr/qwen3_asr.py
@@ -220,6 +220,10 @@ def __init__(self, load_config: ModelLoadConfig):
         self._mel_filters_t = torch.from_numpy(self.mel_filters).float()
         self._hann_window = torch.hann_window(WINDOW_SIZE)
         self.core = ov.Core()
+        if load_config.cache_dir:
+            self.core.set_property({"CACHE_DIR": load_config.cache_dir})
+        if load_config.runtime_config:
+            self.core.set_property(load_config.runtime_config)
         self.t_model_load = 0.0
         self.enc_model = None
         self.emb_model = None

diff --git a/src/engine/openvino/qwen3_tts/qwen3_tts.py b/src/engine/openvino/qwen3_tts/qwen3_tts.py
@@ -112,7 +112,8 @@ def load_model(self, load_config: ModelLoadConfig) -> None:
         p = Path(load_config.model_path)
         device = load_config.device
         core = ov.Core()
-        core.set_property({"CACHE_DIR": str(p / ".ov_cache")})
+        if load_config.cache_dir:
+            core.set_property({"CACHE_DIR": load_config.cache_dir})
 
         self.tokenizer = AutoTokenizer.from_pretrained(str(p), trust_remote_code=True)
 
@@ -123,7 +124,9 @@ def load_model(self, load_config: ModelLoadConfig) -> None:
             CP_MAX_POS, CP_HEAD_DIM, CP_ROPE_THETA,
         )
 
-        _hint = {"PERFORMANCE_HINT": "LATENCY"}
+        # runtime_config is merged in (and can override the default hint) so it
+        # reaches every compiled sub-model.
+        _hint = {"PERFORMANCE_HINT": "LATENCY", **(load_config.runtime_config or {})}
         self._text_model_c = core.compile_model(str(p / "text_model.xml"), device, _hint)
         self._codec_emb_c = core.compile_model(str(p / "codec_embedding.xml"), device, _hint)
         # Code predictor: many tiny inferences per frame; CPU avoids GPU launch/transfer overhead.

diff --git a/src/engine/optimum/optimum_emb.py b/src/engine/optimum/optimum_emb.py
@@ -81,9 +81,16 @@ def load_model(self, loader: ModelLoadConfig):
             loader: ModelLoadConfig containing model_path, device, engine, and runtime_config.
         """
 
-        self.model = OVModelForFeatureExtraction.from_pretrained(loader.model_path, 
-            device=loader.device, 
-            export=False)
+        ov_config = {**(loader.runtime_config or {})}
+        if loader.cache_dir:
+            ov_config["CACHE_DIR"] = loader.cache_dir
+
+        from_pretrained_kwargs = {"device": loader.device, "export": False}
+        if ov_config:
+            from_pretrained_kwargs["ov_config"] = ov_config
+
+        self.model = OVModelForFeatureExtraction.from_pretrained(loader.model_path,
+            **from_pretrained_kwargs)
 
         self.tokenizer = AutoTokenizer.from_pretrained(loader.model_path)
         logging.info(f"Model loaded successfully: {loader.model_name}")

diff --git a/src/engine/optimum/optimum_rr.py b/src/engine/optimum/optimum_rr.py
@@ -72,10 +72,16 @@ def load_model(self, loader: ModelLoadConfig):
             loader: ModelLoadConfig containing model_path, device, engine, and runtime_config.
         """
 
-        self.model = OVModelForCausalLM.from_pretrained(loader.model_path, 
-            device=loader.device, 
-            export=False,
-            use_cache=False)
+        ov_config = {**(loader.runtime_config or {})}
+        if loader.cache_dir:
+            ov_config["CACHE_DIR"] = loader.cache_dir
+
+        from_pretrained_kwargs = {"device": loader.device, "export": False, "use_cache": False}
+        if ov_config:
+            from_pretrained_kwargs["ov_config"] = ov_config
+
+        self.model = OVModelForCausalLM.from_pretrained(loader.model_path,
+            **from_pretrained_kwargs)
 
         self.tokenizer = AutoTokenizer.from_pretrained(loader.model_path)
         self.token_false_id = self.tokenizer.convert_tokens_to_ids("no")

diff --git a/src/engine/ov_genai/llm.py b/src/engine/ov_genai/llm.py
@@ -232,9 +232,15 @@ def load_model(self, loader: ModelLoadConfig):
         draft_model = None
         if loader.draft_model_path:
             try:
+                # Cache the draft model alongside the main model. OpenVINO keys
+                # cache blobs by model content, so sharing one CACHE_DIR is safe.
+                draft_model_properties = {}
+                if loader.cache_dir:
+                    draft_model_properties['CACHE_DIR'] = loader.cache_dir
                 draft_model = openvino_genai.draft_model(
                     loader.draft_model_path,
-                    loader.draft_device
+                    loader.draft_device,
+                    **draft_model_properties
                 )
                 logger.info(f"Loaded draft model from {loader.draft_model_path} on {loader.draft_device}")
                 self.draft_model_loaded = True
@@ -263,6 +269,8 @@ def load_model(self, loader: ModelLoadConfig):
             self.model_assistant_confidence_threshold = None
 
         pipeline_kwargs = {**(loader.runtime_config or {})}
+        if loader.cache_dir:
+            pipeline_kwargs['CACHE_DIR'] = loader.cache_dir
         if draft_model is not None:
             pipeline_kwargs['draft_model'] = draft_model
 

diff --git a/src/engine/ov_genai/vlm.py b/src/engine/ov_genai/vlm.py
@@ -271,10 +271,14 @@ def load_model(self, loader: ModelLoadConfig):
         try:
             logger.info(f"{loader.model_type} on {loader.device} with {loader.runtime_config}")
 
+            pipeline_kwargs = {**(loader.runtime_config or {})}
+            if loader.cache_dir:
+                pipeline_kwargs['CACHE_DIR'] = loader.cache_dir
+
             self.model_path = VLMPipeline(
                 loader.model_path,
                 loader.device,
-                **(loader.runtime_config or {})
+                **pipeline_kwargs
             )
 
             self.tokenizer = AutoTokenizer.from_pretrained(loader.model_path)

diff --git a/src/engine/ov_genai/whisper.py b/src/engine/ov_genai/whisper.py
@@ -75,10 +75,14 @@ def load_model(self, loader: ModelLoadConfig) -> None:
         """
         Load (or reload) a Whisper model into a pipeline for the given device.
         """
+        pipeline_kwargs = {**(loader.runtime_config or {})}
+        if loader.cache_dir:
+            pipeline_kwargs['CACHE_DIR'] = loader.cache_dir
+
         self.whisper_model = WhisperPipeline(
             loader.model_path,
             loader.device,
-            **(loader.runtime_config or {})
+            **pipeline_kwargs
         )
 
     async def unload_model(self, registry: ModelRegistry, model_name: str) -> bool:

diff --git a/src/server/main.py b/src/server/main.py
@@ -70,8 +70,16 @@ async def lifespan(app: FastAPI):
                 model_path = model_config.get("model_path")
                 if model_path and not Path(model_path).is_absolute():
                     model_config["model_path"] = str((config_file.parent / model_path).resolve())
-
+
+                cache_dir = model_config.get("cache_dir")
+                if cache_dir and not Path(cache_dir).is_absolute():
+                    cache_dir = str((config_file.parent / cache_dir).resolve())
+                    model_config["cache_dir"] = cache_dir
+
                 try:
+                    if cache_dir:
+                        # Create the cache directory at startup if it doesn't exist.
+                        Path(cache_dir).mkdir(parents=True, exist_ok=True)
                     await _registry.register_load(ModelLoadConfig(**model_config))
                     logger.info(f"Startup: loaded '{name}'")
                 except Exception as e:

diff --git a/src/server/models/registration.py b/src/server/models/registration.py
@@ -85,6 +85,15 @@ class ModelLoadConfig(BaseModel):
     runtime_config: Dict[str, Any] = Field(
         default_factory=dict,
         description="Optional OpenVINO runtime properties.")
+    cache_dir: Optional[str] = Field(
+        default=None,
+        description="""
+        Optional directory for the OpenVINO model cache (CACHE_DIR property).
+
+        When set, compiled model blobs are cached here so subsequent loads of
+        this model skip recompilation. Relative paths are resolved against the
+        config file's directory when the model is loaded, the same as
+        model_path.""")
 
     draft_model_path: Optional[str] = Field(
         default=None,

diff --git a/src/tests/test_optimum_emb_unit.py b/src/tests/test_optimum_emb_unit.py
@@ -106,12 +106,41 @@ def test_load_model_initializes_pipeline(monkeypatch: pytest.MonkeyPatch, load_c
         load_config.model_path,
         device=load_config.device,
         export=False,
+        ov_config=load_config.runtime_config,
     )
     emb_module.AutoTokenizer.from_pretrained.assert_called_once_with(load_config.model_path)
     assert emb.model is model_instance
     assert emb.tokenizer is tokenizer_instance
 
 
+def test_load_model_forwards_cache_dir(monkeypatch: pytest.MonkeyPatch) -> None:
+    loader = ModelLoadConfig(
+        model_path="/models/mock",
+        model_name="cache-model",
+        model_type=ModelType.EMB,
+        engine=EngineType.OV_OPTIMUM,
+        device="CPU",
+        runtime_config={},
+        cache_dir="/tmp/ov_cache",
+    )
+    emb = Optimum_EMB(loader)
+    monkeypatch.setattr(
+        emb_module.OVModelForFeatureExtraction,
+        "from_pretrained",
+        MagicMock(return_value=MagicMock()),
+    )
+    monkeypatch.setattr(emb_module.AutoTokenizer, "from_pretrained", MagicMock(return_value=MagicMock()))
+
+    emb.load_model(loader)
+
+    emb_module.OVModelForFeatureExtraction.from_pretrained.assert_called_once_with(
+        loader.model_path,
+        device=loader.device,
+        export=False,
+        ov_config={"CACHE_DIR": "/tmp/ov_cache"},
+    )
+
+
 def test_unload_model_resets_state(monkeypatch: pytest.MonkeyPatch, load_config: ModelLoadConfig) -> None:
     emb = Optimum_EMB(load_config)
     emb.model = object()

diff --git a/src/tests/test_optimum_rr_unit.py b/src/tests/test_optimum_rr_unit.py
@@ -132,6 +132,7 @@ def test_load_model_initializes_pipeline(monkeypatch: pytest.MonkeyPatch, load_c
         device=load_config.device,
         export=False,
         use_cache=False,
+        ov_config=load_config.runtime_config,
     )
     rr_module.AutoTokenizer.from_pretrained.assert_called_once_with(load_config.model_path)
     assert rr.model is model_instance

diff --git a/src/tests/test_ov_genai_kokoro_unit.py b/src/tests/test_ov_genai_kokoro_unit.py
@@ -74,6 +74,33 @@ def test_load_model_sets_model_and_metadata(tmp_path: Path, monkeypatch: pytest.
     assert kokoro.context_length == 256
 
 
+def test_load_model_forwards_runtime_config(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
+    model_dir = tmp_path / "kokoro"
+    model_dir.mkdir()
+    (model_dir / "config.json").write_text(
+        json.dumps({"vocab": ["a"], "plbert": {"max_position_embeddings": 256}}),
+        encoding="utf-8",
+    )
+    (model_dir / "openvino_model.xml").write_text("<xml />", encoding="utf-8")
+
+    core_instance = MagicMock()
+    core_instance.compile_model.return_value = "compiled-model"
+    monkeypatch.setattr(kokoro_module.ov, "Core", MagicMock(return_value=core_instance))
+
+    load_config = ModelLoadConfig(
+        model_path=str(model_dir),
+        model_name="rtc-kokoro",
+        model_type=ModelType.KOKORO,
+        engine=EngineType.OPENVINO,
+        device="CPU",
+        runtime_config={"NUM_STREAMS": "2"},
+    )
+
+    OV_Kokoro(load_config).load_model(load_config)
+
+    core_instance.set_property.assert_called_once_with({"NUM_STREAMS": "2"})
+
+
 def test_chunk_forward_pass_yields_chunks(monkeypatch: pytest.MonkeyPatch, load_config: ModelLoadConfig) -> None:
     kokoro = OV_Kokoro(load_config)
     kokoro.model = object()