volcengine · heaoxiang-ai · May 20, 2026 · May 25, 2026 · May 25, 2026 · May 25, 2026
diff --git a/docs/en/guides/01-configuration.md b/docs/en/guides/01-configuration.md
@@ -686,6 +686,29 @@ For OpenAI-compatible providers that return SSE (Server-Sent Events) format resp
 
 > **Note**: The OpenAI SDK requires `stream=true` to properly parse SSE responses. When using providers that force SSE format, you must set this option to `true`.
 
+### query_planner
+
+Optional lightweight model for retrieval intent analysis and query planning. It uses the same configuration shape as `vlm`, but only affects `search()` intent analysis and query expansion. If `query_planner` is omitted or empty, OpenViking falls back to `vlm` for backward compatibility.
+
+Only add this section when the planner model is already available in your environment. For example, the Ollama model below must be pulled (`ollama pull guoxuter/ov_intent_analysis_sft:v1_q8`) and served locally before use.
+
+```json
+{
+  "query_planner": {
+    "provider": "litellm",
+    "model": "ollama/guoxuter/ov_intent_analysis_sft:v1_q8",
+    "api_base": "http://127.0.0.1:11434",
+    "temperature": 0.0,
+    "timeout": 60,
+    "extra_request_body": {
+      "think": false
+    }
+  }
+}
+```
+
+Use `query_planner` when you want a smaller or cheaper model to handle retrieval planning while keeping a stronger `vlm` for semantic extraction, memory extraction, and multimodal processing.
+
 ### feishu
 
 Configuration for Feishu/Lark cloud document parsing. See [Resources](../api/02-resources.md) for supported URL patterns.

diff --git a/docs/zh/guides/01-configuration.md b/docs/zh/guides/01-configuration.md
@@ -656,6 +656,29 @@ LiteLLM 的 Bedrock bearer-token API-key 鉴权，请设置 `forward_api_key=tru
 
 > **注意**: OpenAI SDK 需要 `stream=true` 才能正确解析 SSE 响应。使用强制返回 SSE 格式的 provider 时，必须将此选项设置为 `true`。
 
+### query_planner
+
+可选的轻量模型配置，用于检索前的意图分析和 query 规划/改写。配置结构与 `vlm` 相同，但只影响 `search()` 的意图分析和 query expansion。未配置或配置为空时，OpenViking 会回退到 `vlm`，保持向后兼容。
+
+只有在环境里已经部署好 planner 模型时才需要添加这一段配置。例如下面的 Ollama 模型需要先在本地 pull（`ollama pull guoxuter/ov_intent_analysis_sft:v1_q8`）并启动后才能使用。
+
+```json
+{
+  "query_planner": {
+    "provider": "litellm",
+    "model": "ollama/guoxuter/ov_intent_analysis_sft:v1_q8",
+    "api_base": "http://127.0.0.1:11434",
+    "temperature": 0.0,
+    "timeout": 60,
+    "extra_request_body": {
+      "think": false
+    }
+  }
+}
+```
+
+当你希望用更小或更便宜的模型承担检索规划，同时保留更强的 `vlm` 处理语义提取、记忆提取和多模态内容时，可以使用 `query_planner`。
+
 ### feishu
 
 飞书/Lark 云端文档解析配置。支持的 URL 格式详见[资源管理](../api/02-resources.md)。

diff --git a/openviking/retrieve/intent_analyzer.py b/openviking/retrieve/intent_analyzer.py
@@ -61,8 +61,10 @@ async def analyze(
             target_abstract,
         )
 
-        # Call LLM
-        response = await get_openviking_config().vlm.get_completion_async(prompt)
+        # Call the lightweight query planner when configured; otherwise keep using VLM.
+        config = get_openviking_config()
+        query_planner = config.get_query_planner()
+        response = await query_planner.get_completion_async(prompt)
 
         # Parse result
         parsed = parse_json_from_response(response)

diff --git a/openviking_cli/utils/config/open_viking_config.py b/openviking_cli/utils/config/open_viking_config.py
@@ -69,6 +69,14 @@ class OpenVikingConfig(BaseModel):
 
     vlm: VLMConfig = Field(default_factory=VLMConfig, description="VLM configuration")
 
+    query_planner: Optional[VLMConfig] = Field(
+        default=None,
+        description=(
+            "Optional lightweight model configuration for retrieval intent analysis and query "
+            "planning. Falls back to vlm when unset or empty."
+        ),
+    )
+
     rerank: RerankConfig = Field(default_factory=RerankConfig, description="Rerank configuration")
 
     retrieval: RetrievalConfig = Field(
@@ -293,6 +301,12 @@ def to_dict(self) -> Dict[str, Any]:
         """Convert configuration to dictionary."""
         return self.model_dump()
 
+    def get_query_planner(self) -> VLMConfig:
+        """Select the model config for retrieval intent analysis and query planning.
+
+        Returns:
+            ``self.query_planner`` when it is set and its ``_has_any_config()``
+            check passes; otherwise ``self.vlm``.
+        """
+        planner = self.query_planner
+        if planner is None or not planner._has_any_config():
+            # No usable dedicated planner: keep using the main VLM config.
+            return self.vlm
+        return planner
+
 
 class OpenVikingConfigSingleton:
     """Global singleton for OpenVikingConfig.

diff --git a/scripts/test_ov_intent_planner.py b/scripts/test_ov_intent_planner.py
@@ -0,0 +1,223 @@
+#!/usr/bin/env python3
+"""Smoke-test OpenViking intent analysis through the configured query_planner.
+
+This calls OpenViking's IntentAnalyzer directly. It does not call Ollama with a
+hand-written payload, so it verifies the same model path used by search()
+intent analysis.
+"""
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import importlib.util
+import json
+import os
+import sys
+import time
+from dataclasses import asdict
+from pathlib import Path
+from typing import Any
+
+from openviking_cli.retrieve.types import ContextType
+from openviking_cli.utils.config import OPENVIKING_CONFIG_ENV
+from openviking_cli.utils.config.open_viking_config import OpenVikingConfigSingleton
+
+DEFAULT_CONFIG = Path.home() / ".openviking" / "ov.conf"  # used when --config is omitted
+EXPECTED_MODEL = "ollama/guoxuter/ov_intent_analysis_sft:v1_q8"  # model the planner is checked against
+
+
+CASES: dict[str, dict[str, Any]] = {  # scenario fixtures; the keys are the --case choices
+    "rfc": {
+        "name": "operational RFC task",  # label printed in the case header
+        "compression_summary": "",
+        "recent_messages": [  # (role, text) pairs, converted by _messages()
+            ("user", "帮我写一份 RFC 文档"),
+            ("assistant", "好的，标题是什么？"),
+        ],
+        "current_message": "标题就叫《支付链路重构》，按公司标准模板来",
+        "context_type": None,  # passed through to analyzer.analyze()
+        "target_abstract": "",
+    },
+    "format": {
+        "name": "informational format question",
+        "compression_summary": "",
+        "recent_messages": [
+            ("user", "我在做新项目的技术选型"),
+            ("assistant", "好的，需要参考什么资料？"),
+        ],
+        "current_message": "RFC 文档的标准格式是什么？",
+        "context_type": None,
+        "target_abstract": "",
+    },
+    "chat": {
+        "name": "conversational small talk",
+        "compression_summary": "",
+        "recent_messages": [
+            ("user", "你好"),
+            ("assistant", "你好，有什么可以帮你？"),
+        ],
+        "current_message": "今天天气挺好的",
+        "context_type": None,
+        "target_abstract": "",
+    },
+    "memory": {
+        "name": "restricted memory query",
+        "compression_summary": "",
+        "recent_messages": [
+            ("user", "帮我对比下这次活动和上月会员日的 ROI"),
+        ],
+        "current_message": "活动周期 7 天，预计 10 万用户参与",
+        "context_type": ContextType.MEMORY,  # the only case that sets a context type
+        "target_abstract": "User's long-term memory; preferences / events / historical KPIs",
+    },
+}
+
+
+def _message(i: int, role: str, text: str):
+    """Build one text-only ``Message`` whose id is ``case-msg-<i>``.
+
+    Args:
+        i: Number embedded in the message id.
+        role: Role passed to ``Message`` unchanged.
+        text: Text wrapped in a single ``TextPart``.
+    """
+    # Imported inside the function rather than at module import time.
+    from openviking.message import Message, TextPart
+
+    text_part = TextPart(text=text)
+    return Message(id=f"case-msg-{i}", role=role, parts=[text_part])
+
+
+def _messages(raw_messages: list[tuple[str, str]]) -> list[Any]:
+    """Convert ``(role, text)`` pairs into messages numbered from 1, in order."""
+    built = []
+    for position, pair in enumerate(raw_messages, start=1):
+        role, text = pair
+        built.append(_message(position, role, text))
+    return built
+
+
+def _load_config(config_path: Path, *, preserve_log_output: bool = False):
+    """Read ``config_path`` as JSON and initialize the config singleton from it.
+
+    The ``OPENVIKING_CONFIG_ENV`` environment variable is pointed at
+    ``config_path`` before the file is read.
+
+    Args:
+        config_path: Path to the JSON config file. It is decoded as
+            ``utf-8-sig``, so a leading BOM is tolerated.
+        preserve_log_output: When false (the default), ``log.output`` is forced
+            to ``"stdout"`` in the loaded data before initialization.
+
+    Returns:
+        Whatever ``OpenVikingConfigSingleton.initialize`` returns.
+    """
+    os.environ[OPENVIKING_CONFIG_ENV] = str(config_path)
+    raw_text = config_path.read_text(encoding="utf-8-sig")
+    settings = json.loads(raw_text)
+    if preserve_log_output:
+        return OpenVikingConfigSingleton.initialize(config_dict=settings)
+    # Create the "log" section if it is missing, then override its output target.
+    log_section = settings.setdefault("log", {})
+    log_section["output"] = "stdout"
+    return OpenVikingConfigSingleton.initialize(config_dict=settings)
+
+
+def _load_intent_analyzer_class():
+    """Load ``IntentAnalyzer`` directly from its source file.
+
+    The file is ``openviking/retrieve/intent_analyzer.py`` under the directory
+    one level above this script's directory. It is executed as a standalone
+    module named ``_ov_intent_analyzer_direct``.
+
+    Raises:
+        RuntimeError: If no import spec or loader can be created for the file.
+    """
+    base_dir = Path(__file__).resolve().parents[1]
+    source_file = base_dir / "openviking" / "retrieve" / "intent_analyzer.py"
+    module_spec = importlib.util.spec_from_file_location("_ov_intent_analyzer_direct", source_file)
+    loader = None if module_spec is None else module_spec.loader
+    if loader is None:
+        raise RuntimeError(f"Could not load IntentAnalyzer from {source_file}")
+    loaded = importlib.util.module_from_spec(module_spec)
+    loader.exec_module(loaded)
+    return loaded.IntentAnalyzer
+
+
+def _planner_summary(config: Any) -> dict[str, Any]:
+    """Summarize the planner config that ``config.get_query_planner()`` returns.
+
+    Returns:
+        A dict with the planner's ``provider``, ``model``, ``api_base``,
+        ``max_tokens``, ``timeout`` and ``extra_request_body``, plus
+        ``uses_query_planner``, which is true only when the returned planner is
+        the very object stored in ``config.query_planner``.
+    """
+    planner = config.get_query_planner()
+    reported_fields = (
+        "provider",
+        "model",
+        "api_base",
+        "max_tokens",
+        "timeout",
+        "extra_request_body",
+    )
+    summary: dict[str, Any] = {name: getattr(planner, name) for name in reported_fields}
+    # Identity check: a fallback to another config object must report False.
+    summary["uses_query_planner"] = planner is config.query_planner
+    return summary
+
+
+def _print_plan(case_key: str, case_name: str, elapsed: float, plan: Any) -> None:
+    """Print a human-readable report for one analyzed case to stdout.
+
+    Args:
+        case_key: Short case identifier shown in the header.
+        case_name: Descriptive case name shown in the header.
+        elapsed: Duration in seconds, printed with three decimals.
+        plan: Object exposing ``reasoning`` and ``queries``. Only the first 400
+            characters of ``reasoning`` are printed.
+    """
+    print(f"\n=== {case_key}: {case_name} ===")
+    print(f"elapsed_seconds: {elapsed:.3f}")
+    print(f"reasoning: {plan.reasoning[:400]}")
+    if plan.queries:
+        print("queries:")
+        for number, item in enumerate(plan.queries, start=1):
+            type_label = None if not item.context_type else item.context_type.value
+            print(f"  {number}. [{type_label}] p={item.priority} {item.query}")
+            if item.intent:
+                print(f"     intent: {item.intent}")
+    else:
+        print("queries: []")
+
+
+async def _run_case(
+    intent_analyzer_class: Any,
+    case_key: str,
+    case: dict[str, Any],
+) -> dict[str, Any]:
+    """Run one scenario through a fresh analyzer, print it, and return a summary.
+
+    Args:
+        intent_analyzer_class: Class instantiated with ``max_recent_messages=5``.
+        case_key: Identifier of the scenario, echoed in the result.
+        case: Scenario fixture providing ``name``, ``compression_summary``,
+            ``recent_messages``, ``current_message``, ``context_type`` and
+            ``target_abstract``.
+
+    Returns:
+        A dict with the case key, the elapsed seconds, and the plan's reasoning
+        and queries (each query converted with ``dataclasses.asdict``).
+    """
+    analyzer = intent_analyzer_class(max_recent_messages=5)
+    # The timer starts before the arguments are built, so message construction is timed too.
+    started_at = time.perf_counter()
+    analyze_kwargs = {
+        "compression_summary": case["compression_summary"],
+        "messages": _messages(case["recent_messages"]),
+        "current_message": case["current_message"],
+        "context_type": case["context_type"],
+        "target_abstract": case["target_abstract"],
+    }
+    plan = await analyzer.analyze(**analyze_kwargs)
+    elapsed = time.perf_counter() - started_at
+
+    _print_plan(case_key, case["name"], elapsed, plan)
+
+    plan_payload = {
+        "reasoning": plan.reasoning,
+        "queries": list(map(asdict, plan.queries)),
+    }
+    return {"case": case_key, "elapsed_seconds": elapsed, "plan": plan_payload}
+
+
+async def _main() -> int:
+    """Parse CLI arguments, load the config, and run the selected smoke-test cases.
+
+    The planner summary is printed first. The run is aborted before any case is
+    executed when the config did not yield a dedicated ``query_planner`` or when
+    the planner's model differs from ``--expected-model``.
+
+    Returns:
+        ``0`` when every selected case ran, ``2`` when one of the planner checks
+        failed. Exceptions raised while loading the config or running a case are
+        not handled here and propagate to the caller.
+    """
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--config",
+        default=str(DEFAULT_CONFIG),
+        help="Path to ov.conf. Defaults to ~/.openviking/ov.conf.",
+    )
+    parser.add_argument(
+        "--case",
+        choices=["all", *CASES.keys()],
+        default="all",
+        help="Scenario to run.",
+    )
+    parser.add_argument(
+        "--json",
+        action="store_true",
+        help="Print a machine-readable summary at the end.",
+    )
+    parser.add_argument(
+        "--preserve-log-output",
+        action="store_true",
+        help="Use ov.conf log.output as-is. By default this smoke test logs to stdout.",
+    )
+    # Previously the expected model was hard-coded, which made the smoke test
+    # unusable with any other planner model. The default keeps the old behavior.
+    parser.add_argument(
+        "--expected-model",
+        default=EXPECTED_MODEL,
+        help=(
+            "Model name the configured query_planner must use. "
+            f"Defaults to {EXPECTED_MODEL!r}; pass an empty string to skip this check."
+        ),
+    )
+    args = parser.parse_args()
+
+    config_path = Path(args.config).expanduser()
+    config = _load_config(config_path, preserve_log_output=args.preserve_log_output)
+    intent_analyzer_class = _load_intent_analyzer_class()
+    planner = _planner_summary(config)
+
+    print(f"config: {config_path}")
+    print("query_planner:")
+    print(json.dumps(planner, ensure_ascii=False, indent=2))
+
+    if not planner["uses_query_planner"]:
+        print("ERROR: ov.conf did not load a dedicated query_planner.", file=sys.stderr)
+        return 2
+    expected_model = args.expected_model
+    if expected_model and planner["model"] != expected_model:
+        print(
+            f"ERROR: expected model {expected_model!r}, got {planner['model']!r}.",
+            file=sys.stderr,
+        )
+        return 2
+
+    selected = list(CASES) if args.case == "all" else [args.case]
+    results = []
+    # Cases run one after another so each elapsed time is measured in isolation.
+    for case_key in selected:
+        results.append(await _run_case(intent_analyzer_class, case_key, CASES[case_key]))
+
+    if args.json:
+        print("\n=== json_summary ===")
+        print(json.dumps({"query_planner": planner, "results": results}, ensure_ascii=False))
+
+    return 0
+
+
+if __name__ == "__main__":
+    # Top-level boundary: any ordinary exception becomes a one-line error and exit code 1.
+    try:
+        exit_code = asyncio.run(_main())
+    except Exception as exc:
+        print(f"ERROR: {type(exc).__name__}: {exc}", file=sys.stderr)
+        raise SystemExit(1)
+    else:
+        raise SystemExit(exit_code)