volcengine · huangruiteng · May 22, 2026 · May 22, 2026 · May 22, 2026 · May 23, 2026
diff --git a/benchmark/tau2/README.md b/benchmark/tau2/README.md
@@ -173,6 +173,12 @@ Start the OpenViking service before executing memory cells, and verify it with
 Memory V2 baseline. For trajectory memory evidence, start the service from this
 branch and inspect generated trajectory files; changing `search_uri` alone does
 not prove the new trajectory prompt was used.
+Agent Harness / TAU-2 corpus preparation defaults to batch experience
+consolidation. Configure the running OpenViking server with
+`memory.agent_experience_consolidation_mode="batch"` and
+`memory.agent_experience_batch_max_trajectories=5`; `--strict-preflight` reads
+the server config from `openviking.server_config_file` if set, otherwise from
+`OPENVIKING_CONFIG_FILE`, otherwise from `~/.openviking/ov.conf`, and fails
+fast if the server-side memory config does not match the experiment config.
 
 ## Memory Adapter
 

diff --git a/benchmark/tau2/config/baseline.yaml b/benchmark/tau2/config/baseline.yaml
@@ -48,6 +48,11 @@ openviking:
   url: ${OPENVIKING_URL:-http://localhost:1933}
   account: ${OPENVIKING_ACCOUNT:-default}
   agent_id: ${OPENVIKING_AGENT_ID:-tau2-openviking-agent}
+  # Agent Harness / TAU-2 experiment corpus preparation defaults to the
+  # batch consolidation path. OpenViking's product default remains configured
+  # by the running server's ov.conf; strict preflight verifies they match.
+  agent_experience_consolidation_mode: batch
+  agent_experience_batch_max_trajectories: 5
   reuse_corpus_across_runs: true
   retrieval_top_k: 4
   prewrite_retrieval_top_k: 6

diff --git a/benchmark/tau2/scripts/run_eval.py b/benchmark/tau2/scripts/run_eval.py
@@ -101,6 +101,121 @@ def _enabled(value: Any) -> bool:
     return str(value).strip().lower() in {"1", "true", "yes", "on"}
 
 
+def _has_openviking_train_strategy(config: dict[str, Any]) -> bool:
+    """Return True when any configured strategy trains an OpenViking corpus.
+
+    A strategy qualifies when its ``memory_backend`` equals ``"openviking"`` and
+    its ``train_required`` value is truthy. A missing or empty ``strategies``
+    entry yields False.
+    """
+    for entry in config.get("strategies") or []:
+        if entry.get("memory_backend") != "openviking":
+            continue
+        if entry.get("train_required"):
+            return True
+    return False
+
+
+def _openviking_agent_experience_config(config: dict[str, Any]) -> dict[str, Any]:
+    """Read and validate the expected agent-experience consolidation settings.
+
+    Both values come from the ``openviking`` section of the experiment config
+    and are optional; an unset value is reported as ``None``.
+
+    Args:
+        config: Parsed experiment configuration.
+
+    Returns:
+        A dict with the keys ``expected_agent_experience_consolidation_mode``
+        (``str`` or ``None``) and
+        ``expected_agent_experience_batch_max_trajectories`` (``int`` or
+        ``None``).
+
+    Raises:
+        ValueError: If the mode is neither ``'per_trajectory'`` nor ``'batch'``,
+            or if the batch size is not an integer >= 1.
+    """
+    openviking = config.get("openviking") or {}
+    mode = openviking.get("agent_experience_consolidation_mode")
+    batch_max = openviking.get("agent_experience_batch_max_trajectories")
+
+    expected_batch_max = None
+    if batch_max is not None:
+        # bool is an int subclass, so a YAML `true` would otherwise be accepted
+        # as 1; treat it as a configuration mistake instead.
+        if isinstance(batch_max, bool):
+            raise ValueError(
+                "openviking.agent_experience_batch_max_trajectories must be an integer >= 1"
+            )
+        try:
+            expected_batch_max = int(batch_max)
+        except (TypeError, ValueError) as exc:
+            # Name the offending key rather than leaking int()'s generic message.
+            raise ValueError(
+                "openviking.agent_experience_batch_max_trajectories must be an integer >= 1"
+            ) from exc
+
+    result: dict[str, Any] = {
+        "expected_agent_experience_consolidation_mode": str(mode) if mode is not None else None,
+        "expected_agent_experience_batch_max_trajectories": expected_batch_max,
+    }
+    if result["expected_agent_experience_consolidation_mode"] not in {
+        None,
+        "per_trajectory",
+        "batch",
+    }:
+        raise ValueError(
+            "openviking.agent_experience_consolidation_mode must be 'per_trajectory' or 'batch'"
+        )
+    if expected_batch_max is not None and expected_batch_max < 1:
+        raise ValueError("openviking.agent_experience_batch_max_trajectories must be >= 1")
+    return result
+
+
+def _openviking_server_config_path(config: dict[str, Any]) -> Path:
+    """Locate the OpenViking server config file to inspect.
+
+    Lookup order: ``openviking.server_config_file`` in the experiment config,
+    then the ``OPENVIKING_CONFIG_FILE`` environment variable, then
+    ``~/.openviking/ov.conf``. Explicitly configured values are passed through
+    ``resolve_path``; the home-directory fallback is returned as-is.
+    """
+    section = config.get("openviking") or {}
+    configured = section.get("server_config_file")
+    if not configured:
+        configured = os.environ.get("OPENVIKING_CONFIG_FILE")
+    if not configured:
+        return Path.home() / ".openviking" / "ov.conf"
+    return resolve_path(str(configured))
+
+
+def _openviking_server_memory_config_report(
+    config: dict[str, Any], *, strict: bool
+) -> tuple[dict[str, Any], list[str]]:
+    """Compare the experiment's expected memory settings with the server config file.
+
+    The check is skipped (``checked`` stays False) when no strategy trains an
+    OpenViking corpus or when the experiment config sets neither expectation.
+
+    Args:
+        config: Parsed experiment configuration.
+        strict: When True, problems are returned as error strings. When False
+            the report is still filled in but the error list is always empty.
+
+    Returns:
+        ``(report, errors)``. ``report`` carries ``expected``, ``config_path``,
+        ``exists``, ``actual``, ``checked`` and, when the file could not be
+        read or parsed, ``error``.
+
+    Raises:
+        ValueError: Propagated from ``_openviking_agent_experience_config`` when
+            the experiment config itself is invalid.
+    """
+    expected = _openviking_agent_experience_config(config)
+    report: dict[str, Any] = {
+        "expected": expected,
+        "config_path": None,
+        "exists": False,
+        "actual": None,
+        "checked": False,
+    }
+    errors: list[str] = []
+    if not _has_openviking_train_strategy(config):
+        return report, errors
+    if not any(value is not None for value in expected.values()):
+        return report, errors
+
+    config_path = _openviking_server_config_path(config)
+    report["config_path"] = str(config_path)
+    # Evaluate existence once so the report and the branch below cannot disagree.
+    exists = config_path.is_file()
+    report["exists"] = exists
+    if not exists:
+        if strict:
+            errors.append(
+                "cannot verify OpenViking server memory config for agent-experience "
+                f"consolidation; set OPENVIKING_CONFIG_FILE or openviking.server_config_file "
+                f"(checked {config_path})"
+            )
+        return report, errors
+
+    try:
+        raw = json.loads(config_path.read_text(encoding="utf-8-sig"))
+    except json.JSONDecodeError as exc:
+        if strict:
+            errors.append(f"invalid OpenViking server config JSON at {config_path}: {exc}")
+        report["error"] = str(exc)
+        return report, errors
+    except (OSError, UnicodeDecodeError) as exc:
+        # An unreadable or mis-encoded file is a preflight finding, not a crash.
+        if strict:
+            errors.append(f"cannot read OpenViking server config at {config_path}: {exc}")
+        report["error"] = str(exc)
+        return report, errors
+
+    memory = raw.get("memory") if isinstance(raw, dict) else {}
+    if not isinstance(memory, dict):
+        memory = {}
+    # Fallbacks used when the server config omits a key; presumably these mirror
+    # the server's own defaults -- verify against the OpenViking config schema.
+    actual = {
+        "agent_experience_consolidation_mode": memory.get(
+            "agent_experience_consolidation_mode", "per_trajectory"
+        ),
+        "agent_experience_batch_max_trajectories": memory.get(
+            "agent_experience_batch_max_trajectories", 5
+        ),
+    }
+    report["actual"] = actual
+    report["checked"] = True
+
+    expected_mode = expected["expected_agent_experience_consolidation_mode"]
+    if expected_mode is not None and actual["agent_experience_consolidation_mode"] != expected_mode:
+        errors.append(
+            "OpenViking server memory.agent_experience_consolidation_mode mismatch: "
+            f"expected {expected_mode!r}, actual "
+            f"{actual['agent_experience_consolidation_mode']!r} in {config_path}"
+        )
+    expected_batch_max = expected["expected_agent_experience_batch_max_trajectories"]
+    try:
+        actual_batch_max = int(actual["agent_experience_batch_max_trajectories"])
+    except (TypeError, ValueError):
+        # A non-numeric server value can never equal the expected integer.
+        actual_batch_max = None
+    if expected_batch_max is not None and actual_batch_max != expected_batch_max:
+        errors.append(
+            "OpenViking server memory.agent_experience_batch_max_trajectories mismatch: "
+            f"expected {expected_batch_max!r}, actual "
+            f"{actual['agent_experience_batch_max_trajectories']!r} in {config_path}"
+        )
+    if not strict:
+        # Non-strict preflight only records the comparison in the report.
+        errors = []
+    return report, errors
+
+
 def _require_fixed_first_user(config: dict[str, Any]) -> bool:
     """Report whether ``eval.require_fixed_first_user`` is switched on in *config*."""
     eval_section = config.get("eval", {})
     return _enabled(eval_section.get("require_fixed_first_user"))
 
@@ -312,6 +427,7 @@ def _tau2_command(
         if not search_uri:
             search_uri = _search_uri(agent_id, search_memory_type)
         budget = _retrieval_budget(config, strategy)
+        agent_experience_config = _openviking_agent_experience_config(config)
         command = [
             sys.executable,
             str(Path(__file__).with_name("run_memory_v2_eval.py")),
@@ -372,6 +488,22 @@ def _tau2_command(
             "--seed",
             str(seed),
         ]
+        if agent_experience_config["expected_agent_experience_consolidation_mode"] is not None:
+            command.extend(
+                [
+                    "--expected-agent-experience-consolidation-mode",
+                    agent_experience_config["expected_agent_experience_consolidation_mode"],
+                ]
+            )
+        if agent_experience_config["expected_agent_experience_batch_max_trajectories"] is not None:
+            command.extend(
+                [
+                    "--expected-agent-experience-batch-max-trajectories",
+                    str(
+                        agent_experience_config["expected_agent_experience_batch_max_trajectories"]
+                    ),
+                ]
+            )
         if budget["memory_inject_max_chars"] is not None:
             command.extend(["--memory-inject-max-chars", str(budget["memory_inject_max_chars"])])
         if budget["first_user_memory_inject_max_chars"] is not None:
@@ -605,6 +737,9 @@ def _build_plan(
                         ),
                         "train_skip_failed_sessions": _train_skip_failed_sessions(strategy),
                         "train_tool_output_max_chars": _train_tool_output_max_chars(strategy),
+                        "openviking_memory_config": _openviking_agent_experience_config(config)
+                        if strategy.get("memory_backend") == "openviking"
+                        else None,
                         "retrieval_budget": _retrieval_budget(config, strategy),
                         "search_memory_type": strategy.get("search_memory_type", "experiences"),
                         "adapter_status": strategy.get("adapter_status", "ready"),
@@ -631,6 +766,7 @@ def _build_plan(
         "eval_protocol": config.get("eval", {}).get("protocol"),
         "require_fixed_first_user": require_fixed_first_user,
         "simulator_policy": policy_report,
+        "openviking_memory_config": _openviking_agent_experience_config(config),
         "cell_count": len(cells),
         "executable_cell_count": executable_cell_count,
         "pending_cell_count": len(cells) - executable_cell_count,
@@ -730,6 +866,14 @@ def _prepare_memory_corpus(cell: dict[str, Any], repo: Path, out: Path) -> dict[
                 f"{key}: {cached_skip_failed!r} != {requested_skip_failed!r}; "
                 "use a distinct corpus_id or rebuild the corpus"
             )
+        cached_memory_config = manifest.get("openviking_memory_config")
+        requested_memory_config = cell.get("openviking_memory_config")
+        if cached_memory_config != requested_memory_config:
+            raise RuntimeError(
+                "cached corpus openviking_memory_config mismatch for "
+                f"{key}: {cached_memory_config!r} != {requested_memory_config!r}; "
+                "use a distinct corpus_id or rebuild the corpus"
+            )
         row = {
             "domain": cell["domain"],
             "strategy_id": cell["strategy_id"],
@@ -956,6 +1100,10 @@ def _preflight(config: dict[str, Any], out: Path, *, strict: bool) -> int:
     llm_env = normalize_litellm_env()
     tau2_info = tau2_context(config)
     policy_report = simulator_policy_report(config)
+    openviking_memory_config_report, openviking_memory_config_errors = (
+        _openviking_server_memory_config_report(config, strict=strict)
+    )
+    errors.extend(openviking_memory_config_errors)
     if strict and not tau2_info["tau2_repo_exists"]:
         errors.append(f"missing TAU-2 repo: {tau2_info['tau2_repo']}")
     if strict and not tau2_info["tau2_cli_resolved"]:
@@ -1009,6 +1157,7 @@ def _preflight(config: dict[str, Any], out: Path, *, strict: bool) -> int:
         "require_fixed_first_user": _require_fixed_first_user(config),
         "llm_env": llm_env,
         "simulator_policy": policy_report,
+        "openviking_memory_config": openviking_memory_config_report,
         "domains": domains(config),
         "strategies": strategy_ids(config),
         "imports": import_rows,

diff --git a/benchmark/tau2/scripts/run_memory_v2_eval.py b/benchmark/tau2/scripts/run_memory_v2_eval.py
@@ -604,6 +604,7 @@ def _train(args: argparse.Namespace, train_results: Path, corpus_manifest: Path)
                     )
             result = client.commit_session(sid, telemetry=True)
             task = _wait_task(client, result.get("task_id"), args.openviking_wait_timeout)
+            task_result = task.get("result") if isinstance(task.get("result"), dict) else {}
             committed.append(
                 {
                     "session_id": sid,
@@ -614,6 +615,7 @@ def _train(args: argparse.Namespace, train_results: Path, corpus_manifest: Path)
                     "commit_status": result.get("status"),
                     "openviking_task_id": result.get("task_id"),
                     "openviking_task_status": task.get("status"),
+                    "openviking_task_telemetry": task_result.get("telemetry"),
                 }
             )
     finally:
@@ -639,6 +641,14 @@ def _train(args: argparse.Namespace, train_results: Path, corpus_manifest: Path)
         "train_include_system_prompt": bool(args.train_include_system_prompt),
         "train_skip_failed_sessions": bool(args.train_skip_failed_sessions),
         "train_tool_output_max_chars": args.train_tool_output_max_chars,
+        "openviking_memory_config": {
+            "expected_agent_experience_consolidation_mode": (
+                args.expected_agent_experience_consolidation_mode
+            ),
+            "expected_agent_experience_batch_max_trajectories": (
+                args.expected_agent_experience_batch_max_trajectories
+            ),
+        },
         "committed_sessions": committed,
         "committed_session_count": len(committed),
         "skipped_failed_sessions": skipped_failed_sessions,
@@ -944,6 +954,23 @@ def main() -> int:
     parser.add_argument("--openviking-agent-id")
     parser.add_argument("--openviking-timeout", type=float, default=600.0)
     parser.add_argument("--openviking-wait-timeout", type=int, default=600)
+    parser.add_argument(
+        "--expected-agent-experience-consolidation-mode",
+        choices=["per_trajectory", "batch"],
+        help=(
+            "Expected server-side memory.agent_experience_consolidation_mode. "
+            "This runner records it for corpus identity; the running OpenViking "
+            "server must be configured separately."
+        ),
+    )
+    parser.add_argument(
+        "--expected-agent-experience-batch-max-trajectories",
+        type=int,
+        help=(
+            "Expected server-side memory.agent_experience_batch_max_trajectories. "
+            "Recorded in corpus manifests for reproducibility."
+        ),
+    )
     parser.add_argument("--search-uri")
     parser.add_argument("--retrieval-top-k", type=int, default=4)
     parser.add_argument("--first-user-retrieval-top-k", type=int)
@@ -1005,6 +1032,11 @@ def main() -> int:
     normalize_litellm_env()
     if args.train_tool_output_max_chars <= 0:
         parser.error("--train-tool-output-max-chars must be positive")
+    if (
+        args.expected_agent_experience_batch_max_trajectories is not None
+        and args.expected_agent_experience_batch_max_trajectories <= 0
+    ):
+        parser.error("--expected-agent-experience-batch-max-trajectories must be positive")
     for name in (
         "memory_inject_max_chars",
         "first_user_memory_inject_max_chars",

diff --git a/openviking/prompts/templates/memory/experiences.yaml b/openviking/prompts/templates/memory/experiences.yaml
@@ -14,32 +14,47 @@ fields:
     type: string
     description: |
       Name the generalizable pattern, not the specific instance.
-      Must be written in {{ language }}.
-      {% if language == 'en' %}Use lowercase snake_case, max 5 words.{% else %}Use a concise noun phrase, max 15 characters.{% endif %}
+      Choose ONE language and stick to it — do not create both a Chinese and an English version of the same concept.
+      If English: lowercase snake_case, max 5 words.
+      If Chinese: concise noun phrase, max 15 characters.
       Good: "booking_duplicate_handling", "pytest_asyncio_cancel_hang_fix", "重复预订处理".
     merge_op: immutable
 
   - name: content
     type: string
     description: |
-      Structured experience extraction in EXACTLY this 3-section format. This output will be injected directly into an autonomous agent's system prompt, so it MUST be written as strict, executable machine instructions:
+      Structured experience extraction in EXACTLY this 3-section format. This output will be
+      injected directly into an autonomous agent's system prompt, so it MUST be written as
+      strict, executable machine instructions:
 
       ## Situation
-      <markdown bullets: Entry Conditions. Describe the generalized context, user intent, or overarching scenario that dictates when this entire rule block becomes relevant>
+      <markdown bullets: Entry Conditions>
 
       ## Approach
-      <markdown bullets: Active Execution Logic (The "DOs"). The step-by-step optimized path to success. Use direct imperative commands and explicit IF/THEN/ELSE statements for conditional execution. Do NOT place negative constraints or failure warnings here.>
+      <markdown bullets: Active Execution Logic (The "DOs")>
 
       ## Reflect
-      <markdown bullets: Hard Guardrails & Principles (The "DON'Ts"). Strict negative rules (e.g., "NEVER do Z"), boundary conditions, and failure-prevention heuristics to avoid past mistakes.>
+      <markdown bullets: Hard Guardrails & Principles (The "DON'Ts")>
 
       Rules:
-      - MUTUAL EXCLUSIVITY (NO REDUNDANCY): Strictly separate active steps from constraints to eliminate redundant information. 'Approach' is ONLY for actionable, positive execution steps to advance the task. 'Reflect' is ONLY for negative boundaries, limits, and "what not to do." Do not repeat the same concept across both sections.
-      - OPTIMIZED EXECUTION PATH: You MUST critically analyze the original trajectory and aggressively trim away conversational noise, redundant retry loops, false starts, and irrelevant setup actions. Outline only the essential, efficient path in 'Approach'.
-      - MACHINE READABILITY (IMPERATIVE VOICE): Address the future agent directly using commanding imperatives (e.g., "Ask the user for X", "Call tool Y").
-      - ABSTRACTION MANDATE: Strip away specific entities, IDs, user names, or raw text from the past trajectory. Use generalized abstract descriptions so the rule applies universally.
-      - FAILURE INTEGRATION: Translate past mistakes into strict negative constraints. These MUST be placed exclusively in the 'Reflect' section.
-      - STRICT FORMATTING: Use exactly the 3 headings above in this order. Use concise markdown bullets (-) for every section. No numbered lists, no introductory sentences, no conversational filler, and no closing paragraphs. Token efficiency is critical.
+      - MUTUAL EXCLUSIVITY (NO REDUNDANCY): Strictly separate active steps from constraints.
+        'Approach' is ONLY for actionable, positive execution steps.
+        'Reflect' is ONLY for negative boundaries and "what not to do."
+      - OPTIMIZED EXECUTION PATH: Aggressively trim conversational noise, redundant retries,
+        false starts. Outline only the essential, efficient path in 'Approach'.
+      - EXECUTION-FIRST PRINCIPLE: 'Approach' steps MUST describe direct tool invocations only.
+        Strip all "I will now do X" / "I have completed X" communication steps.
+      - CONDITIONAL BRANCH PRESERVATION: Capture full decision trees as explicit IF/THEN/ELSE
+        branches. NEVER collapse divergent user-driven branches into a single terminal action.
+      - ATOMIC SCOPE: Each experience MUST cover exactly ONE user intent and its associated
+        tool-invocation sequence. Multiple distinct user intents → SEPARATE experiences.
+        A 'Situation' listing more than one user goal is a violation: split immediately.
+        Hard limit: if 'Approach' would exceed 8 bullets, STOP and split.
+      - MACHINE READABILITY (IMPERATIVE VOICE): Use commanding imperatives ("Ask the user for X").
+      - ABSTRACTION MANDATE: Strip specific entities, IDs, user names. Use generalized descriptions.
+      - FAILURE INTEGRATION: Translate past mistakes into strict negative constraints in 'Reflect'.
+      - STRICT FORMATTING: Exactly 3 headings, concise markdown bullets, no numbered lists,
+        no introductory sentences, no closing paragraphs. Token efficiency is critical.
 
     merge_op: replace
 
@@ -48,6 +63,5 @@ fields:
     description: |
       The experience_name of an existing experience that this one supersedes.
       Set ONLY when this experience replaces a narrower existing experience with a DIFFERENT name.
-      The system will automatically delete the old experience and inherit its trajectory history.
       Leave empty for a new experience or when updating an experience with the same name.
     merge_op: replace