Skip to content

Commit 8103916

Browse files
author
SentienceDEV
committed
token usage & video recording
1 parent 8263270 commit 8103916

File tree

5 files changed

+328
-4
lines changed

5 files changed

+328
-4
lines changed

CHANGELOG.md

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,28 @@ from predicate.llm_provider import LocalLLMProvider
8484
llm = LocalLLMProvider(model_name="Qwen/Qwen2.5-3B-Instruct", device="auto", load_in_4bit=True)
8585
```
8686

87+
##### Opt-in token usage accounting (best-effort)
88+
89+
If you want to measure token spend, you can enable best-effort accounting (depends on provider reporting `prompt_tokens` / `completion_tokens` / `total_tokens` in `LLMResponse`):
90+
91+
```python
92+
from predicate import PredicateBrowserAgentConfig
93+
94+
config = PredicateBrowserAgentConfig(token_usage_enabled=True)
95+
96+
# Later:
97+
usage = agent.get_token_usage()
98+
agent.reset_token_usage()
99+
```
100+
101+
##### RuntimeAgent: act once without step lifecycle (orchestrators)
102+
103+
`RuntimeAgent` now exposes `act_once(...)` helpers that execute exactly one action **without** calling `runtime.begin_step()` / `runtime.emit_step_end()`. This is intended for external orchestrators (e.g. WebBench) that already own step lifecycle and just want the SDK’s snapshot-first propose+execute block.
104+
105+
- `await agent.act_once(...) -> str`
106+
- `await agent.act_once_with_snapshot(...) -> (action, snap)`
107+
- `await agent.act_once_result(...) -> { action, snap, used_vision }`
108+
87109
### 2026-02-13
88110

89111
#### Expanded deterministic verifications (adaptive resnapshotting)

examples/agent/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,5 @@ Predicate agent examples.
22

33
- `predicate_browser_agent_minimal.py`: minimal `PredicateBrowserAgent` usage.
44
- `predicate_browser_agent_custom_prompt.py`: customize the compact prompt builder.
5+
- `predicate_browser_agent_video_recording_playwright.py`: enable Playwright video recording via context options (recommended).
56

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
"""
2+
Example: PredicateBrowserAgent + Playwright video recording (recommended approach).
3+
4+
Video recording is a *Playwright context feature* (record_video_dir), not a PredicateBrowserAgent knob.
5+
This example shows how to:
6+
1) create a Playwright context with video recording enabled
7+
2) wrap the existing page with AsyncSentienceBrowser.from_page(...)
8+
3) use AgentRuntime + PredicateBrowserAgent normally
9+
10+
Usage:
11+
python examples/agent/predicate_browser_agent_video_recording_playwright.py
12+
"""
13+
14+
import asyncio
15+
import os
16+
from pathlib import Path
17+
18+
from playwright.async_api import async_playwright
19+
20+
from predicate import AsyncSentienceBrowser, PredicateBrowserAgent, PredicateBrowserAgentConfig
21+
from predicate.agent_runtime import AgentRuntime
22+
from predicate.llm_provider import LLMProvider, LLMResponse
23+
from predicate.runtime_agent import RuntimeStep
24+
25+
26+
class FixedActionProvider(LLMProvider):
    """Deterministic LLM provider: every generate() call yields the same action string.

    Used by the example so the agent finishes immediately without a real model.
    """

    def __init__(self, action: str):
        super().__init__(model="fixed-action")
        self._canned = action

    def generate(self, system_prompt: str, user_prompt: str, **kwargs) -> LLMResponse:
        # Prompts are deliberately ignored; the response is fixed.
        _ = system_prompt, user_prompt, kwargs
        return LLMResponse(content=self._canned, model_name=self.model_name)

    def supports_json_mode(self) -> bool:
        return False

    @property
    def model_name(self) -> str:
        return "fixed-action"
41+
42+
43+
async def main() -> None:
    """Run PredicateBrowserAgent on a Playwright context with video recording enabled.

    Video recording is configured on the Playwright context (record_video_dir),
    not on the agent; the video is flushed to disk when the context is closed.
    """
    # Either env var may carry the API key; None is passed through if neither is set.
    api_key = os.environ.get("PREDICATE_API_KEY") or os.environ.get("SENTIENCE_API_KEY")

    recordings_dir = Path("recordings")
    recordings_dir.mkdir(parents=True, exist_ok=True)

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        # record_video_dir enables recording for every page in this context.
        context = await browser.new_context(
            record_video_dir=str(recordings_dir),
            record_video_size={"width": 1280, "height": 720},
        )
        page = await context.new_page()

        # Wrap existing Playwright page.
        sentience_browser = await AsyncSentienceBrowser.from_page(
            page, api_key=api_key
        )

        try:
            await page.goto("https://example.com")
            await page.wait_for_load_state("networkidle")

            runtime = await AgentRuntime.from_sentience_browser(
                browser=sentience_browser, page=page, tracer=None
            )

            # FixedActionProvider returns FINISH() so the step ends immediately.
            agent = PredicateBrowserAgent(
                runtime=runtime,
                executor=FixedActionProvider("FINISH()"),
                config=PredicateBrowserAgentConfig(history_last_n=0),
            )

            out = await agent.step(
                task_goal="Open example.com",
                step=RuntimeStep(goal="Finish immediately"),
            )
            print(f"step ok: {out.ok}")
            print(f"videos will be saved under: {recordings_dir.resolve()}")
        finally:
            # Close the Playwright context to flush the video.
            try:
                await context.close()
            finally:
                await browser.close()


if __name__ == "__main__":
    asyncio.run(main())
92+

predicate/agents/browser_agent.py

Lines changed: 154 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from ..captcha import CaptchaHandler, CaptchaOptions
1111
from ..captcha_strategies import ExternalSolver, HumanHandoffSolver, VisionSolver
1212
from ..llm_interaction_handler import LLMInteractionHandler
13-
from ..llm_provider import LLMProvider
13+
from ..llm_provider import LLMProvider, LLMResponse
1414
from ..models import Snapshot, StepHookContext
1515
from ..permissions import PermissionPolicy
1616
from ..runtime_agent import RuntimeAgent, RuntimeStep
@@ -84,6 +84,9 @@ class PredicateBrowserAgentConfig:
8484
# Prompt / token controls
8585
history_last_n: int = 0 # 0 disables LLM-facing step history (lowest token usage)
8686

87+
# Opt-in: track token usage from LLM provider responses (best-effort; depends on provider reporting).
88+
token_usage_enabled: bool = False
89+
8790
# Compact prompt customization
8891
# Signature: builder(task_goal, step_goal, dom_context, snapshot, history_summary) -> (system, user)
8992
compact_prompt_builder: Callable[
@@ -146,6 +149,112 @@ def apply_captcha_config_to_runtime(
146149
)
147150

148151

152+
@dataclass
class TokenUsageTotals:
    """Running token-usage counters accumulated over LLM calls (best-effort)."""

    calls: int = 0
    prompt_tokens: int = 0
    completion_tokens: int = 0
    total_tokens: int = 0

    def add(self, resp: LLMResponse) -> None:
        """Fold one response's reported usage into the running totals.

        Missing (non-int) counts are treated as 0; a missing total falls back
        to prompt + completion. Negative reported values are clamped to 0.
        """
        self.calls += 1
        prompt = resp.prompt_tokens if isinstance(resp.prompt_tokens, int) else 0
        completion = resp.completion_tokens if isinstance(resp.completion_tokens, int) else 0
        if isinstance(resp.total_tokens, int):
            total = resp.total_tokens
        else:
            total = prompt + completion
        self.prompt_tokens += max(0, int(prompt))
        self.completion_tokens += max(0, int(completion))
        self.total_tokens += max(0, int(total))
167+
168+
169+
class _TokenUsageCollector:
    """Aggregates best-effort token usage, bucketed by caller role and by model name."""

    def __init__(self) -> None:
        self._by_role: dict[str, TokenUsageTotals] = {}
        self._by_model: dict[str, TokenUsageTotals] = {}

    def record(self, *, role: str, resp: LLMResponse) -> None:
        """Attribute one response's usage to both its role bucket and its model bucket."""
        self._by_role.setdefault(role, TokenUsageTotals()).add(resp)
        model = str(resp.model_name or "").strip() or "unknown"
        self._by_model.setdefault(model, TokenUsageTotals()).add(resp)

    def reset(self) -> None:
        """Drop all accumulated usage."""
        self._by_role.clear()
        self._by_model.clear()

    @staticmethod
    def _as_dict(totals: TokenUsageTotals) -> dict[str, int]:
        # Single place that defines the serialized shape of one bucket.
        return {
            "calls": totals.calls,
            "prompt_tokens": totals.prompt_tokens,
            "completion_tokens": totals.completion_tokens,
            "total_tokens": totals.total_tokens,
        }

    def summary(self) -> dict[str, Any]:
        """Return nested totals: overall ("total"), per role, and per model."""
        grand = TokenUsageTotals()
        for bucket in self._by_role.values():
            grand.calls += bucket.calls
            grand.prompt_tokens += bucket.prompt_tokens
            grand.completion_tokens += bucket.completion_tokens
            grand.total_tokens += bucket.total_tokens
        return {
            "total": self._as_dict(grand),
            "by_role": {role: self._as_dict(t) for role, t in self._by_role.items()},
            "by_model": {model: self._as_dict(t) for model, t in self._by_model.items()},
        }
220+
221+
222+
class _TokenAccountingProvider(LLMProvider):
    """Transparent LLMProvider wrapper that records token usage of every call.

    All generation is delegated to the wrapped provider; accounting failures are
    swallowed so usage tracking can never break the agent.
    """

    def __init__(self, *, inner: LLMProvider, collector: _TokenUsageCollector, role: str):
        super().__init__(model=getattr(inner, "model_name", "wrapped"))
        self._inner = inner
        self._collector = collector
        self._role = role

    def _track(self, resp: LLMResponse) -> LLMResponse:
        # Best-effort: accounting must never raise into the caller.
        try:
            self._collector.record(role=self._role, resp=resp)
        except Exception:
            pass
        return resp

    def generate(self, system_prompt: str, user_prompt: str, **kwargs) -> LLMResponse:
        return self._track(self._inner.generate(system_prompt, user_prompt, **kwargs))

    def supports_json_mode(self) -> bool:
        return self._inner.supports_json_mode()

    def supports_vision(self) -> bool:
        return self._inner.supports_vision()

    def generate_with_image(
        self, system_prompt: str, user_prompt: str, image_base64: str, **kwargs
    ) -> LLMResponse:
        return self._track(
            self._inner.generate_with_image(system_prompt, user_prompt, image_base64, **kwargs)
        )

    @property
    def model_name(self) -> str:
        return self._inner.model_name
256+
257+
149258
class _RuntimeAgentWithPromptOverrides(RuntimeAgent):
150259
def __init__(
151260
self,
@@ -227,9 +336,33 @@ def __init__(
227336
config: PredicateBrowserAgentConfig = PredicateBrowserAgentConfig(),
228337
) -> None:
229338
self.runtime = runtime
230-
self.executor = executor
231-
self.vision_executor = vision_executor
232-
self.vision_verifier = vision_verifier
339+
self._token_usage: _TokenUsageCollector | None = (
340+
_TokenUsageCollector() if bool(config.token_usage_enabled) else None
341+
)
342+
343+
# Optionally wrap providers for best-effort token usage accounting.
344+
if self._token_usage is not None:
345+
self.executor = _TokenAccountingProvider(
346+
inner=executor, collector=self._token_usage, role="executor"
347+
)
348+
self.vision_executor = (
349+
_TokenAccountingProvider(
350+
inner=vision_executor, collector=self._token_usage, role="vision_executor"
351+
)
352+
if vision_executor is not None
353+
else None
354+
)
355+
self.vision_verifier = (
356+
_TokenAccountingProvider(
357+
inner=vision_verifier, collector=self._token_usage, role="vision_verifier"
358+
)
359+
if vision_verifier is not None
360+
else None
361+
)
362+
else:
363+
self.executor = executor
364+
self.vision_executor = vision_executor
365+
self.vision_verifier = vision_verifier
233366
self.config = config
234367

235368
# LLM-facing step history summaries (bounded)
@@ -252,6 +385,23 @@ def __init__(
252385
history_summary_provider=self._get_history_summary,
253386
)
254387

388+
def get_token_usage(self) -> dict[str, Any]:
    """
    Best-effort token usage summary.

    Only available when `PredicateBrowserAgentConfig.token_usage_enabled=True`.
    """
    collector = self._token_usage
    if collector is None:
        # Accounting was never enabled, so there is nothing to report.
        return {"enabled": False, "reason": "token_usage_enabled is False"}
    summary = collector.summary()
    summary["enabled"] = True
    return summary
399+
400+
def reset_token_usage(self) -> None:
    """Clear accumulated token usage; no-op when accounting is disabled."""
    if self._token_usage is not None:
        self._token_usage.reset()
404+
255405
def _get_history_summary(self) -> str:
256406
if int(self.config.history_last_n) <= 0:
257407
return ""

tests/unit/test_predicate_browser_agent.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,29 @@ def model_name(self) -> str:
9595
return self._model_name
9696

9797

98+
class TokenProviderStub(LLMProvider):
    """Test provider that returns a fixed reply with known token counts (11/7/18)."""

    def __init__(self, *, model: str = "stub", response: str = "FINISH()"):
        super().__init__(model)
        self._reply = response

    def generate(self, system_prompt: str, user_prompt: str, **kwargs) -> LLMResponse:
        # Prompts are irrelevant; the counts below are what the tests assert on.
        _ = system_prompt, user_prompt, kwargs
        return LLMResponse(
            content=self._reply,
            model_name=self.model_name,
            prompt_tokens=11,
            completion_tokens=7,
            total_tokens=18,
        )

    def supports_json_mode(self) -> bool:
        return True

    @property
    def model_name(self) -> str:
        return self._model_name
119+
120+
98121
def make_snapshot(*, url: str, elements: list[Element], confidence: float | None = None) -> Snapshot:
99122
diagnostics = SnapshotDiagnostics(confidence=confidence) if confidence is not None else None
100123
return Snapshot(
@@ -162,3 +185,39 @@ def builder(
162185

163186
asyncio.run(_run())
164187

188+
189+
def test_predicate_browser_agent_token_usage_is_opt_in_and_best_effort() -> None:
    # Verifies that token accounting activates via config and aggregates the
    # provider-reported counts per role, and that reset zeroes the totals.
    async def _run() -> None:
        backend = MockBackend()
        tracer = MockTracer()
        runtime = AgentRuntime(backend=backend, tracer=tracer)

        s0 = make_snapshot(url="https://example.com/start", elements=[make_clickable_element(1)])

        async def fake_snapshot(**_kwargs):
            # Always serve the same pre-built snapshot; no real browser involved.
            runtime.last_snapshot = s0
            return runtime.last_snapshot

        runtime.snapshot = AsyncMock(side_effect=fake_snapshot)  # type: ignore[method-assign]

        step = RuntimeStep(goal="No-op", verifications=[])
        executor = TokenProviderStub(response="FINISH()")

        agent = PredicateBrowserAgent(
            runtime=runtime,
            executor=executor,
            config=PredicateBrowserAgentConfig(token_usage_enabled=True),
        )

        out = await agent.step(task_goal="test", step=step)
        assert out.ok is True

        # TokenProviderStub reports 11 prompt + 7 completion = 18 total per call.
        usage = agent.get_token_usage()
        assert usage["enabled"] is True
        assert usage["total"]["total_tokens"] >= 18
        assert usage["by_role"]["executor"]["calls"] >= 1

        # After reset the summary must report zero usage again.
        agent.reset_token_usage()
        usage2 = agent.get_token_usage()
        assert usage2["total"]["total_tokens"] == 0

    asyncio.run(_run())
223+

0 commit comments

Comments
 (0)