add message token count inspector

Tarquinen · Tarquinen · commit 6ddb0f2b2ebb · 2026-03-22T20:23:54.000-04:00
diff --git a/scripts/opencode-message-token-counts b/scripts/opencode-message-token-counts
@@ -0,0 +1,393 @@
+#!/usr/bin/env python3
+"""
+Show countAllMessageTokens-style token counts for each message in an OpenCode session.
+
+Usage: opencode-message-token-counts [--session ID] [--json] [--no-color] [--db PATH]
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import shutil
+import subprocess
+from pathlib import Path
+from typing import Optional
+
+from opencode_api import APIError, add_api_arguments, create_client_from_args, list_sessions_across_projects
+
+
+# Absolute directory that contains this script (symlinks resolved).
+SCRIPT_DIR = Path(__file__).resolve().parent
+# Parent of the script directory; used as the working directory of the Node tokenizer subprocess.
+REPO_ROOT = SCRIPT_DIR.parent
+
+
+class Colors:
+    """ANSI escape sequences used to style the terminal report."""
+
+    # Attribute reset and text weight.
+    RESET = "\033[0m"
+    BOLD = "\033[1m"
+    DIM = "\033[2m"
+    # Foreground colors.
+    GREEN = "\033[32m"
+    YELLOW = "\033[33m"
+    CYAN = "\033[36m"
+
+
+NO_COLOR = Colors()
+for attr in dir(NO_COLOR):
+    if not attr.startswith("_"):
+        setattr(NO_COLOR, attr, "")
+
+
+def stringify_json(value) -> str:
+    return json.dumps(value, separators=(",", ":"), ensure_ascii=False)
+
+
+def collapse_whitespace(text: str) -> str:
+    return " ".join(text.split())
+
+
+def truncate(text: str, limit: int = 64) -> str:
+    if len(text) <= limit:
+        return text
+    return text[: limit - 3] + "..."
+
+
+def get_terminal_width(default: int = 120) -> int:
+    return max(80, shutil.get_terminal_size((default, 20)).columns)
+
+
+def short_message_id(message_id: str, limit: int = 14) -> str:
+    return truncate(message_id or "-", limit)
+
+
+def preview_message(parts: list[dict]) -> str:
+    for part in parts:
+        if part.get("type") != "text":
+            continue
+        text = collapse_whitespace(part.get("text", ""))
+        if not text:
+            continue
+        prefix = "[ignored] " if part.get("ignored", False) else ""
+        return truncate(prefix + text)
+
+    tool_names = [part.get("tool", "tool") for part in parts if part.get("type") == "tool"]
+    if tool_names:
+        return truncate(f"[tools: {', '.join(tool_names[:3])}]")
+
+    for part in parts:
+        part_type = part.get("type", "unknown")
+        if part_type in {"step-start", "step-finish"}:
+            continue
+        if part_type == "tool":
+            tool_name = part.get("tool", "tool")
+            status = (part.get("state") or {}).get("status")
+            suffix = f" {status}" if status else ""
+            return f"[tool:{tool_name}{suffix}]"
+        return f"[{part_type}]"
+
+    for part in parts:
+        part_type = part.get("type", "unknown")
+        return f"[{part_type}]"
+
+    return "[no content]"
+
+
+def extract_tool_content(part: dict) -> list[str]:
+    contents: list[str] = []
+    tool_name = part.get("tool")
+    state = part.get("state") or {}
+
+    if tool_name == "question":
+        questions = (state.get("input") or {}).get("questions")
+        if questions is not None:
+            content = questions if isinstance(questions, str) else stringify_json(questions)
+            contents.append(content)
+        return contents
+
+    if tool_name in {"edit", "write"}:
+        if state.get("input") is not None:
+            input_content = state["input"] if isinstance(state["input"], str) else stringify_json(state["input"])
+            contents.append(input_content)
+
+    if state.get("status") == "completed" and state.get("output") is not None:
+        output = state["output"]
+        contents.append(output if isinstance(output, str) else stringify_json(output))
+    elif state.get("status") == "error" and state.get("error") is not None:
+        error = state["error"]
+        contents.append(error if isinstance(error, str) else stringify_json(error))
+
+    return contents
+
+
+def collect_message_segments(message: dict) -> tuple[list[str], int, int, list[str]]:
+    segments: list[str] = []
+    text_segments = 0
+    tool_segments = 0
+    part_types: list[str] = []
+
+    for part in message.get("parts", []):
+        part_type = part.get("type", "unknown")
+        part_types.append(part_type)
+        if part_type == "text":
+            text = part.get("text", "")
+            if text:
+                segments.append(text)
+                text_segments += 1
+            continue
+
+        tool_contents = extract_tool_content(part)
+        segments.extend(tool_contents)
+        tool_segments += len(tool_contents)
+
+    return segments, text_segments, tool_segments, part_types
+
+
+def fallback_count_tokens(text: str) -> int:
+    if not text:
+        return 0
+    return round(len(text) / 4)
+
+
+def count_tokens_batch(texts: list[str]) -> tuple[list[int], str]:
+    if not texts:
+        return [], "anthropic"
+
+    node_script = """
+import { countTokens } from \"@anthropic-ai/tokenizer\";
+import { readFileSync } from \"node:fs\";
+
+const texts = JSON.parse(readFileSync(0, \"utf8\"));
+const counts = texts.map((text) => countTokens(text || \"\"));
+process.stdout.write(JSON.stringify(counts));
+""".strip()
+
+    try:
+        proc = subprocess.run(
+            ["node", "--input-type=module", "-e", node_script],
+            input=stringify_json(texts),
+            capture_output=True,
+            text=True,
+            cwd=REPO_ROOT,
+            check=True,
+            timeout=15,
+        )
+        counts = json.loads(proc.stdout)
+        if isinstance(counts, list) and len(counts) == len(texts):
+            return [int(count) for count in counts], "anthropic"
+    except (subprocess.SubprocessError, FileNotFoundError, json.JSONDecodeError, ValueError):
+        pass
+
+    return [fallback_count_tokens(text) for text in texts], "approximate"
+
+
+def get_most_recent_session(client, session_list_limit: int) -> Optional[dict]:
+    sessions = list_sessions_across_projects(client, per_project_limit=session_list_limit)
+    return sessions[0] if sessions else None
+
+
+def analyze_session(client, session: dict) -> dict:
+    session_id = session["id"]
+    messages = client.get_session_messages(session_id, directory=session.get("directory"))
+
+    analyzed_messages = []
+    count_inputs: list[str] = []
+    for index, message in enumerate(messages, 1):
+        info = message.get("info", {})
+        segments, text_segments, tool_segments, part_types = collect_message_segments(message)
+        count_inputs.append(" ".join(segments))
+        analyzed_messages.append(
+            {
+                "index": index,
+                "message_id": info.get("id", ""),
+                "role": info.get("role", "unknown"),
+                "part_count": len(message.get("parts", [])),
+                "part_types": part_types,
+                "counted_segments": len(segments),
+                "text_segments": text_segments,
+                "tool_segments": tool_segments,
+                "preview": preview_message(message.get("parts", [])),
+            }
+        )
+
+    counts, tokenizer = count_tokens_batch(count_inputs)
+    total_tokens = 0
+    nonzero_messages = 0
+    max_tokens = 0
+
+    for message_data, count in zip(analyzed_messages, counts):
+        message_data["tokens"] = count
+        total_tokens += count
+        if count > 0:
+            nonzero_messages += 1
+            max_tokens = max(max_tokens, count)
+
+    return {
+        "session_id": session_id,
+        "title": session.get("title", "Unknown"),
+        "tokenizer": tokenizer,
+        "messages": analyzed_messages,
+        "total_messages": len(analyzed_messages),
+        "messages_with_tokens": nonzero_messages,
+        "messages_without_tokens": len(analyzed_messages) - nonzero_messages,
+        "total_tokens": total_tokens,
+        "max_message_tokens": max_tokens,
+    }
+
+
+def format_token_count(count: int, colors: Colors) -> str:
+    c = colors
+    if count == 0:
+        return f"{c.DIM}{count:>10,}{c.RESET}"
+    return f"{count:>10,}"
+
+
+def format_role(role: str, colors: Colors, width: int = 9) -> str:
+    c = colors
+    label = f"{role:<{width}}"
+    if role == "user":
+        return f"{c.CYAN}{label}{c.RESET}"
+    if role == "assistant":
+        return f"{c.GREEN}{label}{c.RESET}"
+    return f"{c.YELLOW}{label}{c.RESET}"
+
+
+def format_size_indicator(count: int, max_count: int, width: int = 8) -> str:
+    if max_count <= 0:
+        return f"{'.' * width}   0%"
+
+    pct = round((count / max_count) * 100)
+    if count <= 0:
+        filled = 0
+    else:
+        filled = max(1, round((count / max_count) * width))
+    filled = min(width, filled)
+    return f"{'#' * filled}{'.' * (width - filled)} {pct:>3}%"
+
+
+def largest_messages(messages: list[dict], limit: int = 5) -> list[dict]:
+    return sorted(messages, key=lambda message: message.get("tokens", 0), reverse=True)[:limit]
+
+
+def print_wide_message_table(result: dict, colors: Colors, width: int):
+    c = colors
+    messages = result["messages"]
+    preview_width = max(24, width - 72)
+
+    print(
+        f"{c.BOLD}{'#':>3} {'Role':<9} {'Tokens':>10} {'Size':<12} {'Seg/Part':<8} {'ID':<14} Preview{c.RESET}"
+    )
+    print("-" * width)
+
+    for message in messages:
+        preview = truncate(message["preview"], preview_width)
+        mix = f"{message['counted_segments']}/{message['part_count']}"
+        print(
+            f"{message['index']:>3} "
+            f"{format_role(message['role'], c, 9)} "
+            f"{format_token_count(message['tokens'], c)} "
+            f"{format_size_indicator(message['tokens'], result['max_message_tokens']):<12} "
+            f"{mix:<8} "
+            f"{c.DIM}{short_message_id(message['message_id']):<14}{c.RESET} "
+            f"{preview}"
+        )
+
+
+def print_compact_message_list(result: dict, colors: Colors, width: int):
+    c = colors
+    messages = result["messages"]
+    meta_width = max(18, width - 6)
+    preview_width = max(32, width - 8)
+
+    print(f"{c.BOLD}Messages{c.RESET}")
+    print("-" * width)
+
+    for message in messages:
+        tokens = f"{message['tokens']:,} tokens"
+        size = format_size_indicator(message["tokens"], result["max_message_tokens"])
+        mix = f"{message['counted_segments']}/{message['part_count']} seg/part"
+        meta = truncate(f"{tokens}  {size}  {mix}", meta_width)
+        preview = truncate(message["preview"], preview_width)
+
+        print(f"{message['index']:>3}  {format_role(message['role'], c, 9)} {meta}")
+        print(f"     {c.DIM}{short_message_id(message['message_id'])}{c.RESET}  {preview}")
+
+
+def print_highlights(result: dict, colors: Colors, width: int):
+    c = colors
+    heavy_messages = [message for message in largest_messages(result["messages"]) if message.get("tokens", 0) > 0]
+    if not heavy_messages:
+        return
+
+    print(f"\n{c.BOLD}Largest messages{c.RESET}")
+    print("-" * width)
+    for message in heavy_messages:
+        print(
+            f"  #{message['index']:<3} {format_role(message['role'], c, 9)} "
+            f"{message['tokens']:>10,}  {truncate(message['preview'], max(30, width - 33))}"
+        )
+
+
+def print_message_tokens(result: dict, colors: Colors):
+    c = colors
+    width = get_terminal_width()
+    print(f"{c.BOLD}{'=' * width}{c.RESET}")
+    print(f"{c.BOLD}SESSION MESSAGE TOKEN COUNTS{c.RESET}")
+    print(f"{c.BOLD}{'=' * width}{c.RESET}\n")
+    print(f"  Session:   {c.CYAN}{result['session_id']}{c.RESET}")
+    print(f"  Title:     {result['title']}")
+    print(f"  Messages:  {result['total_messages']}")
+    print(f"  Tokenizer: {result['tokenizer']}")
+    print(f"  Total:     {result['total_tokens']:,} tokens")
+    print(f"  Largest:   {result['max_message_tokens']:,} tokens\n")
+
+    if not result["messages"]:
+        print("  No messages found in this session.")
+        return
+
+    if width >= 110:
+        print_wide_message_table(result, c, width)
+    else:
+        print_compact_message_list(result, c, width)
+
+    print("-" * width)
+    print_highlights(result, c, width)
+    print(f"\n{c.BOLD}SESSION SUMMARY{c.RESET}")
+    print(f"  Total message tokens: {result['total_tokens']:,}")
+    print(f"  Messages with tokens: {result['messages_with_tokens']:,}")
+    print(f"  Empty messages:       {result['messages_without_tokens']:,}")
+    print(f"  Largest message:      {result['max_message_tokens']:,}")
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(
+        description="Show countAllMessageTokens-style token counts for each message in an OpenCode session"
+    )
+    parser.add_argument("--session", "-s", type=str, default=None, help="Session ID to analyze (default: most recent)")
+    parser.add_argument("--json", "-j", action="store_true", help="Output as JSON")
+    parser.add_argument("--no-color", action="store_true", help="Disable colored output")
+    add_api_arguments(parser)
+    args = parser.parse_args()
+
+    try:
+        with create_client_from_args(args) as client:
+            if args.session is None:
+                session = get_most_recent_session(client, args.session_list_limit)
+                if session is None:
+                    print("Error: No sessions found")
+                    return 1
+            else:
+                session = client.get_session(args.session)
+            result = analyze_session(client, session)
+    except APIError as err:
+        print(f"Error: {err}")
+        return 1
+
+    if args.json:
+        print(json.dumps(result, indent=2, ensure_ascii=False))
+    else:
+        colors = NO_COLOR if args.no_color else Colors()
+        print_message_tokens(result, colors)
+
+    return 0
+
+
+# Run main() only when executed as a script; its return value becomes the process exit status.
+if __name__ == "__main__":
+    raise SystemExit(main())