Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
f690c88
feat: batch agent experience consolidation
huangruiteng May 22, 2026
f33643e
style: format batch experience tests
huangruiteng May 22, 2026
309f916
test: cover batch experience chunk sizing
huangruiteng May 22, 2026
bf6acfb
fix: derive batch experience prompt from single provider
huangruiteng May 23, 2026
098e50b
style: format batch experience prompt adapter
huangruiteng May 23, 2026
f0de40e
feat(memory): align experience prompt with atomic intent
huangruiteng May 23, 2026
f8dacfa
fix(memory): match atomic experience prompt archive
huangruiteng May 23, 2026
76ebbf6
fix: satisfy batch experience lint
huangruiteng May 23, 2026
d13ccbf
fix: preserve batch experience granularity
huangruiteng May 23, 2026
dc03db1
style: format batch experience test
huangruiteng May 23, 2026
941fa18
feat: expose agent memory phase telemetry
huangruiteng May 23, 2026
abe602b
feat: surface commit telemetry in benchmark manifests
huangruiteng May 23, 2026
8cce894
fix: preserve batch action boundaries
huangruiteng May 23, 2026
3fe7bc5
feat: audit experience corpus quality
huangruiteng May 23, 2026
223a473
chore: trim batch experience diagnostics
huangruiteng May 23, 2026
cf6a7c7
chore: tighten batch prompt adapter
huangruiteng May 23, 2026
beaf4b9
chore: default TAU corpus prep to batch mode
huangruiteng May 23, 2026
1ae63ba
chore: format TAU batch eval config
huangruiteng May 23, 2026
9b109ce
chore: keep batch PR focused on core memory
huangruiteng May 23, 2026
397764e
Revert "chore: keep batch PR focused on core memory"
huangruiteng May 23, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions benchmark/tau2/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,12 @@ Start the OpenViking service before executing memory cells, and verify it with
Memory V2 baseline. For trajectory memory evidence, start the service from this
branch and inspect generated trajectory files; changing `search_uri` alone does
not prove the new trajectory prompt was used.
Agent Harness / TAU-2 corpus preparation defaults to batch experience
consolidation. Configure the running OpenViking server with
`memory.agent_experience_consolidation_mode="batch"` and
`memory.agent_experience_batch_max_trajectories=5`; `--strict-preflight` reads the
server config from `openviking.server_config_file` or `OPENVIKING_CONFIG_FILE`
(falling back to `~/.openviking/ov.conf`) and fails fast if the server-side
memory config does not match the experiment config.

## Memory Adapter

Expand Down
5 changes: 5 additions & 0 deletions benchmark/tau2/config/baseline.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,11 @@ openviking:
url: ${OPENVIKING_URL:-http://localhost:1933}
account: ${OPENVIKING_ACCOUNT:-default}
agent_id: ${OPENVIKING_AGENT_ID:-tau2-openviking-agent}
# Agent Harness / TAU-2 experiment corpus preparation defaults to the
# batch consolidation path. OpenViking's product default remains configured
# by the running server's ov.conf; strict preflight verifies they match.
agent_experience_consolidation_mode: batch
agent_experience_batch_max_trajectories: 5
reuse_corpus_across_runs: true
retrieval_top_k: 4
prewrite_retrieval_top_k: 6
Expand Down
149 changes: 149 additions & 0 deletions benchmark/tau2/scripts/run_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,121 @@ def _enabled(value: Any) -> bool:
return str(value).strip().lower() in {"1", "true", "yes", "on"}


def _has_openviking_train_strategy(config: dict[str, Any]) -> bool:
    """Report whether any configured strategy trains an OpenViking corpus.

    A strategy qualifies when its ``memory_backend`` equals ``"openviking"``
    and its ``train_required`` value is truthy. A missing or empty
    ``strategies`` entry yields ``False``.
    """
    for entry in config.get("strategies") or []:
        if entry.get("memory_backend") != "openviking":
            continue
        if entry.get("train_required"):
            return True
    return False


def _openviking_agent_experience_config(config: dict[str, Any]) -> dict[str, Any]:
    """Extract and validate the expected agent-experience consolidation settings.

    Reads ``agent_experience_consolidation_mode`` and
    ``agent_experience_batch_max_trajectories`` from the ``openviking``
    section of ``config``.

    Returns:
        A dict with the keys ``expected_agent_experience_consolidation_mode``
        (``str`` or ``None``) and
        ``expected_agent_experience_batch_max_trajectories`` (``int`` or
        ``None``). ``None`` means the setting is absent from the config.

    Raises:
        ValueError: If the mode is neither ``'per_trajectory'`` nor
            ``'batch'``, or if the batch maximum is not a whole number >= 1.
    """
    openviking = config.get("openviking") or {}
    mode = openviking.get("agent_experience_consolidation_mode")
    batch_max = openviking.get("agent_experience_batch_max_trajectories")

    expected_mode = str(mode) if mode is not None else None
    if expected_mode not in {None, "per_trajectory", "batch"}:
        raise ValueError(
            "openviking.agent_experience_consolidation_mode must be 'per_trajectory' or 'batch'"
        )

    expected_batch_max = None
    if batch_max is not None:
        # int() would silently turn True into 1 and truncate 2.7 to 2, and it
        # raises an error that never mentions the config key for values like
        # "five" or [5]. Reject all of those with one explicit message.
        is_whole_number = not isinstance(batch_max, bool) and not (
            isinstance(batch_max, float) and not batch_max.is_integer()
        )
        if is_whole_number:
            try:
                expected_batch_max = int(batch_max)
            except (TypeError, ValueError, OverflowError):
                expected_batch_max = None
        if expected_batch_max is None:
            raise ValueError(
                "openviking.agent_experience_batch_max_trajectories must be an integer, "
                f"got {batch_max!r}"
            )
        if expected_batch_max < 1:
            raise ValueError("openviking.agent_experience_batch_max_trajectories must be >= 1")

    return {
        "expected_agent_experience_consolidation_mode": expected_mode,
        "expected_agent_experience_batch_max_trajectories": expected_batch_max,
    }


def _openviking_server_config_path(config: dict[str, Any]) -> Path:
    """Locate the OpenViking server config file to inspect.

    Precedence: ``openviking.server_config_file`` from ``config``, then the
    ``OPENVIKING_CONFIG_FILE`` environment variable, then
    ``~/.openviking/ov.conf``. An explicitly supplied value is passed through
    ``resolve_path``; the home-directory fallback is returned as-is.
    """
    section = config.get("openviking") or {}
    override = section.get("server_config_file")
    if not override:
        override = os.environ.get("OPENVIKING_CONFIG_FILE")
    if not override:
        return Path.home() / ".openviking" / "ov.conf"
    return resolve_path(str(override))


def _openviking_server_memory_config_report(
    config: dict[str, Any], *, strict: bool
) -> tuple[dict[str, Any], list[str]]:
    """Compare expected agent-experience settings with the server config file.

    Args:
        config: Experiment config; expectations come from its ``openviking``
            section via ``_openviking_agent_experience_config``.
        strict: When true, every problem found (missing file, unreadable or
            invalid file, setting mismatch) is returned as an error message.
            When false, the report is still filled in but the returned error
            list is empty.

    Returns:
        ``(report, errors)``. ``report`` always carries ``expected``,
        ``config_path``, ``exists``, ``actual`` and ``checked``; an ``error``
        key is added when the file could not be read or parsed.

    Raises:
        ValueError: Propagated from ``_openviking_agent_experience_config``
            when the experiment config itself is invalid.
    """
    expected = _openviking_agent_experience_config(config)
    report: dict[str, Any] = {
        "expected": expected,
        "config_path": None,
        "exists": False,
        "actual": None,
        "checked": False,
    }
    errors: list[str] = []
    # Nothing to verify unless a strategy trains an OpenViking corpus and the
    # experiment config states at least one expectation.
    if not _has_openviking_train_strategy(config):
        return report, errors
    if all(value is None for value in expected.values()):
        return report, errors

    config_path = _openviking_server_config_path(config)
    report["config_path"] = str(config_path)
    exists = config_path.is_file()
    report["exists"] = exists
    if not exists:
        if strict:
            errors.append(
                "cannot verify OpenViking server memory config for agent-experience "
                f"consolidation; set OPENVIKING_CONFIG_FILE or openviking.server_config_file "
                f"(checked {config_path})"
            )
        return report, errors

    # is_file() succeeding does not guarantee the read succeeds (permissions,
    # non-UTF-8 bytes, file removed in between); report instead of crashing.
    try:
        text = config_path.read_text(encoding="utf-8-sig")
    except (OSError, UnicodeDecodeError) as exc:
        if strict:
            errors.append(f"cannot read OpenViking server config at {config_path}: {exc}")
        report["error"] = str(exc)
        return report, errors

    try:
        raw = json.loads(text)
    except json.JSONDecodeError as exc:
        if strict:
            errors.append(f"invalid OpenViking server config JSON at {config_path}: {exc}")
        report["error"] = str(exc)
        return report, errors

    memory = raw.get("memory") if isinstance(raw, dict) else {}
    if not isinstance(memory, dict):
        memory = {}
    # Fallbacks used when the file omits a key; presumably these mirror the
    # server's own defaults -- confirm against the server implementation.
    actual = {
        "agent_experience_consolidation_mode": memory.get(
            "agent_experience_consolidation_mode", "per_trajectory"
        ),
        "agent_experience_batch_max_trajectories": memory.get(
            "agent_experience_batch_max_trajectories", 5
        ),
    }
    report["actual"] = actual
    report["checked"] = True

    expected_mode = expected["expected_agent_experience_consolidation_mode"]
    if expected_mode is not None and actual["agent_experience_consolidation_mode"] != expected_mode:
        errors.append(
            "OpenViking server memory.agent_experience_consolidation_mode mismatch: "
            f"expected {expected_mode!r}, actual "
            f"{actual['agent_experience_consolidation_mode']!r} in {config_path}"
        )
    expected_batch_max = expected["expected_agent_experience_batch_max_trajectories"]
    try:
        actual_batch_max = int(actual["agent_experience_batch_max_trajectories"])
    except (TypeError, ValueError):
        # A non-numeric server value can never equal an expected integer.
        actual_batch_max = None
    if expected_batch_max is not None and actual_batch_max != expected_batch_max:
        errors.append(
            "OpenViking server memory.agent_experience_batch_max_trajectories mismatch: "
            f"expected {expected_batch_max!r}, actual "
            f"{actual['agent_experience_batch_max_trajectories']!r} in {config_path}"
        )
    # Non-strict runs only record what was found; they never fail preflight.
    if not strict:
        errors = []
    return report, errors


def _require_fixed_first_user(config: dict[str, Any]) -> bool:
return _enabled(config.get("eval", {}).get("require_fixed_first_user"))

Expand Down Expand Up @@ -312,6 +427,7 @@ def _tau2_command(
if not search_uri:
search_uri = _search_uri(agent_id, search_memory_type)
budget = _retrieval_budget(config, strategy)
agent_experience_config = _openviking_agent_experience_config(config)
command = [
sys.executable,
str(Path(__file__).with_name("run_memory_v2_eval.py")),
Expand Down Expand Up @@ -372,6 +488,22 @@ def _tau2_command(
"--seed",
str(seed),
]
if agent_experience_config["expected_agent_experience_consolidation_mode"] is not None:
command.extend(
[
"--expected-agent-experience-consolidation-mode",
agent_experience_config["expected_agent_experience_consolidation_mode"],
]
)
if agent_experience_config["expected_agent_experience_batch_max_trajectories"] is not None:
command.extend(
[
"--expected-agent-experience-batch-max-trajectories",
str(
agent_experience_config["expected_agent_experience_batch_max_trajectories"]
),
]
)
if budget["memory_inject_max_chars"] is not None:
command.extend(["--memory-inject-max-chars", str(budget["memory_inject_max_chars"])])
if budget["first_user_memory_inject_max_chars"] is not None:
Expand Down Expand Up @@ -605,6 +737,9 @@ def _build_plan(
),
"train_skip_failed_sessions": _train_skip_failed_sessions(strategy),
"train_tool_output_max_chars": _train_tool_output_max_chars(strategy),
"openviking_memory_config": _openviking_agent_experience_config(config)
if strategy.get("memory_backend") == "openviking"
else None,
"retrieval_budget": _retrieval_budget(config, strategy),
"search_memory_type": strategy.get("search_memory_type", "experiences"),
"adapter_status": strategy.get("adapter_status", "ready"),
Expand All @@ -631,6 +766,7 @@ def _build_plan(
"eval_protocol": config.get("eval", {}).get("protocol"),
"require_fixed_first_user": require_fixed_first_user,
"simulator_policy": policy_report,
"openviking_memory_config": _openviking_agent_experience_config(config),
"cell_count": len(cells),
"executable_cell_count": executable_cell_count,
"pending_cell_count": len(cells) - executable_cell_count,
Expand Down Expand Up @@ -730,6 +866,14 @@ def _prepare_memory_corpus(cell: dict[str, Any], repo: Path, out: Path) -> dict[
f"{key}: {cached_skip_failed!r} != {requested_skip_failed!r}; "
"use a distinct corpus_id or rebuild the corpus"
)
cached_memory_config = manifest.get("openviking_memory_config")
requested_memory_config = cell.get("openviking_memory_config")
if cached_memory_config != requested_memory_config:
raise RuntimeError(
"cached corpus openviking_memory_config mismatch for "
f"{key}: {cached_memory_config!r} != {requested_memory_config!r}; "
"use a distinct corpus_id or rebuild the corpus"
)
row = {
"domain": cell["domain"],
"strategy_id": cell["strategy_id"],
Expand Down Expand Up @@ -956,6 +1100,10 @@ def _preflight(config: dict[str, Any], out: Path, *, strict: bool) -> int:
llm_env = normalize_litellm_env()
tau2_info = tau2_context(config)
policy_report = simulator_policy_report(config)
openviking_memory_config_report, openviking_memory_config_errors = (
_openviking_server_memory_config_report(config, strict=strict)
)
errors.extend(openviking_memory_config_errors)
if strict and not tau2_info["tau2_repo_exists"]:
errors.append(f"missing TAU-2 repo: {tau2_info['tau2_repo']}")
if strict and not tau2_info["tau2_cli_resolved"]:
Expand Down Expand Up @@ -1009,6 +1157,7 @@ def _preflight(config: dict[str, Any], out: Path, *, strict: bool) -> int:
"require_fixed_first_user": _require_fixed_first_user(config),
"llm_env": llm_env,
"simulator_policy": policy_report,
"openviking_memory_config": openviking_memory_config_report,
"domains": domains(config),
"strategies": strategy_ids(config),
"imports": import_rows,
Expand Down
32 changes: 32 additions & 0 deletions benchmark/tau2/scripts/run_memory_v2_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -604,6 +604,7 @@ def _train(args: argparse.Namespace, train_results: Path, corpus_manifest: Path)
)
result = client.commit_session(sid, telemetry=True)
task = _wait_task(client, result.get("task_id"), args.openviking_wait_timeout)
task_result = task.get("result") if isinstance(task.get("result"), dict) else {}
committed.append(
{
"session_id": sid,
Expand All @@ -614,6 +615,7 @@ def _train(args: argparse.Namespace, train_results: Path, corpus_manifest: Path)
"commit_status": result.get("status"),
"openviking_task_id": result.get("task_id"),
"openviking_task_status": task.get("status"),
"openviking_task_telemetry": task_result.get("telemetry"),
}
)
finally:
Expand All @@ -639,6 +641,14 @@ def _train(args: argparse.Namespace, train_results: Path, corpus_manifest: Path)
"train_include_system_prompt": bool(args.train_include_system_prompt),
"train_skip_failed_sessions": bool(args.train_skip_failed_sessions),
"train_tool_output_max_chars": args.train_tool_output_max_chars,
"openviking_memory_config": {
"expected_agent_experience_consolidation_mode": (
args.expected_agent_experience_consolidation_mode
),
"expected_agent_experience_batch_max_trajectories": (
args.expected_agent_experience_batch_max_trajectories
),
},
"committed_sessions": committed,
"committed_session_count": len(committed),
"skipped_failed_sessions": skipped_failed_sessions,
Expand Down Expand Up @@ -944,6 +954,23 @@ def main() -> int:
parser.add_argument("--openviking-agent-id")
parser.add_argument("--openviking-timeout", type=float, default=600.0)
parser.add_argument("--openviking-wait-timeout", type=int, default=600)
parser.add_argument(
"--expected-agent-experience-consolidation-mode",
choices=["per_trajectory", "batch"],
help=(
"Expected server-side memory.agent_experience_consolidation_mode. "
"This runner records it for corpus identity; the running OpenViking "
"server must be configured separately."
),
)
parser.add_argument(
"--expected-agent-experience-batch-max-trajectories",
type=int,
help=(
"Expected server-side memory.agent_experience_batch_max_trajectories. "
"Recorded in corpus manifests for reproducibility."
),
)
parser.add_argument("--search-uri")
parser.add_argument("--retrieval-top-k", type=int, default=4)
parser.add_argument("--first-user-retrieval-top-k", type=int)
Expand Down Expand Up @@ -1005,6 +1032,11 @@ def main() -> int:
normalize_litellm_env()
if args.train_tool_output_max_chars <= 0:
parser.error("--train-tool-output-max-chars must be positive")
if (
args.expected_agent_experience_batch_max_trajectories is not None
and args.expected_agent_experience_batch_max_trajectories <= 0
):
parser.error("--expected-agent-experience-batch-max-trajectories must be positive")
for name in (
"memory_inject_max_chars",
"first_user_memory_inject_max_chars",
Expand Down
40 changes: 27 additions & 13 deletions openviking/prompts/templates/memory/experiences.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,32 +14,47 @@ fields:
type: string
description: |
Name the generalizable pattern, not the specific instance.
Must be written in {{ language }}.
{% if language == 'en' %}Use lowercase snake_case, max 5 words.{% else %}Use a concise noun phrase, max 15 characters.{% endif %}
Choose ONE language and stick to it — do not create both a Chinese and an English version of the same concept.
If English: lowercase snake_case, max 5 words.
If Chinese: concise noun phrase, max 15 characters.
Good: "booking_duplicate_handling", "pytest_asyncio_cancel_hang_fix", "重复预订处理".
merge_op: immutable

- name: content
type: string
description: |
Structured experience extraction in EXACTLY this 3-section format. This output will be injected directly into an autonomous agent's system prompt, so it MUST be written as strict, executable machine instructions:
Structured experience extraction in EXACTLY this 3-section format. This output will be
injected directly into an autonomous agent's system prompt, so it MUST be written as
strict, executable machine instructions:

## Situation
<markdown bullets: Entry Conditions. Describe the generalized context, user intent, or overarching scenario that dictates when this entire rule block becomes relevant>
<markdown bullets: Entry Conditions>

## Approach
<markdown bullets: Active Execution Logic (The "DOs"). The step-by-step optimized path to success. Use direct imperative commands and explicit IF/THEN/ELSE statements for conditional execution. Do NOT place negative constraints or failure warnings here.>
<markdown bullets: Active Execution Logic (The "DOs")>

## Reflect
<markdown bullets: Hard Guardrails & Principles (The "DON'Ts"). Strict negative rules (e.g., "NEVER do Z"), boundary conditions, and failure-prevention heuristics to avoid past mistakes.>
<markdown bullets: Hard Guardrails & Principles (The "DON'Ts")>

Rules:
- MUTUAL EXCLUSIVITY (NO REDUNDANCY): Strictly separate active steps from constraints to eliminate redundant information. 'Approach' is ONLY for actionable, positive execution steps to advance the task. 'Reflect' is ONLY for negative boundaries, limits, and "what not to do." Do not repeat the same concept across both sections.
- OPTIMIZED EXECUTION PATH: You MUST critically analyze the original trajectory and aggressively trim away conversational noise, redundant retry loops, false starts, and irrelevant setup actions. Outline only the essential, efficient path in 'Approach'.
- MACHINE READABILITY (IMPERATIVE VOICE): Address the future agent directly using commanding imperatives (e.g., "Ask the user for X", "Call tool Y").
- ABSTRACTION MANDATE: Strip away specific entities, IDs, user names, or raw text from the past trajectory. Use generalized abstract descriptions so the rule applies universally.
- FAILURE INTEGRATION: Translate past mistakes into strict negative constraints. These MUST be placed exclusively in the 'Reflect' section.
- STRICT FORMATTING: Use exactly the 3 headings above in this order. Use concise markdown bullets (-) for every section. No numbered lists, no introductory sentences, no conversational filler, and no closing paragraphs. Token efficiency is critical.
- MUTUAL EXCLUSIVITY (NO REDUNDANCY): Strictly separate active steps from constraints.
'Approach' is ONLY for actionable, positive execution steps.
'Reflect' is ONLY for negative boundaries and "what not to do."
- OPTIMIZED EXECUTION PATH: Aggressively trim conversational noise, redundant retries,
false starts. Outline only the essential, efficient path in 'Approach'.
- EXECUTION-FIRST PRINCIPLE: 'Approach' steps MUST describe direct tool invocations only.
Strip all "I will now do X" / "I have completed X" communication steps.
- CONDITIONAL BRANCH PRESERVATION: Capture full decision trees as explicit IF/THEN/ELSE
branches. NEVER collapse divergent user-driven branches into a single terminal action.
- ATOMIC SCOPE: Each experience MUST cover exactly ONE user intent and its associated
tool-invocation sequence. Multiple distinct user intents → SEPARATE experiences.
A 'Situation' listing more than one user goal is a violation: split immediately.
Hard limit: if 'Approach' would exceed 8 bullets, STOP and split.
- MACHINE READABILITY (IMPERATIVE VOICE): Use commanding imperatives ("Ask the user for X").
- ABSTRACTION MANDATE: Strip specific entities, IDs, user names. Use generalized descriptions.
- FAILURE INTEGRATION: Translate past mistakes into strict negative constraints in 'Reflect'.
- STRICT FORMATTING: Exactly 3 headings, concise markdown bullets, no numbered lists,
no introductory sentences, no closing paragraphs. Token efficiency is critical.

merge_op: replace

Expand All @@ -48,6 +63,5 @@ fields:
description: |
The experience_name of an existing experience that this one supersedes.
Set ONLY when this experience replaces a narrower existing experience with a DIFFERENT name.
The system will automatically delete the old experience and inherit its trajectory history.
Leave empty for a new experience or when updating an experience with the same name.
merge_op: replace
Loading
Loading