Skip to content

Commit a8bb166

Browse files
author
SentienceDEV
committed
P7: runtime safety net testing
1 parent de7d9bc commit a8bb166

File tree

1 file changed

+175
-0
lines changed

1 file changed

+175
-0
lines changed
Lines changed: 175 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,175 @@
1+
from __future__ import annotations
2+
3+
from unittest.mock import AsyncMock, MagicMock
4+
5+
import pytest
6+
7+
from sentience.agent_runtime import AgentRuntime
8+
from sentience.models import BBox, Element, Snapshot, Viewport, VisualCues
9+
from sentience.verification import AssertContext, AssertOutcome, is_checked, is_disabled, is_enabled, value_contains
10+
11+
12+
class MockBackend:
    """Minimal stand-in backend handed to ``AgentRuntime`` in these tests."""

    async def screenshot_png(self) -> bytes:
        """Return an empty byte string in place of real PNG data."""
        return b""
15+
16+
17+
class MockTracer:
    """In-memory tracer that records every emitted event for later inspection."""

    def __init__(self) -> None:
        # Events are kept in emission order, one dict per ``emit`` call.
        self.events: list[dict] = []

    def emit(self, event_type: str, data: dict, step_id: str | None = None) -> None:
        """Append one event record to ``self.events``.

        Args:
            event_type: Stored under the ``"type"`` key.
            data: Stored as-is (not copied) under the ``"data"`` key.
            step_id: Stored under the ``"step_id"`` key; ``None`` when omitted.
        """
        self.events.append({"type": event_type, "data": data, "step_id": step_id})
23+
24+
25+
def make_element(
    element_id: int,
    *,
    role: str,
    text: str | None,
    disabled: bool | None = None,
    checked: bool | None = None,
    value: str | None = None,
    input_type: str | None = None,
) -> Element:
    """Build an ``Element`` fixture with fixed geometry and visibility.

    Only the identity and state fields vary per call; importance, bounding
    box, visual cues, and viewport/occlusion flags are hard-coded below.

    Args:
        element_id: Passed to ``Element`` as ``id``.
        role: Element role, e.g. ``"button"``.
        text: Element text, or ``None``.
        disabled: Optional disabled state.
        checked: Optional checked state.
        value: Optional current value.
        input_type: Optional input type, e.g. ``"text"``.

    Returns:
        The constructed ``Element``.
    """
    return Element(
        id=element_id,
        role=role,
        text=text,
        importance=10,
        bbox=BBox(x=0, y=0, width=100, height=40),
        visual_cues=VisualCues(is_primary=False, is_clickable=True, background_color_name=None),
        in_viewport=True,
        is_occluded=False,
        disabled=disabled,
        checked=checked,
        value=value,
        input_type=input_type,
    )
49+
50+
51+
def make_snapshot(elements: list[Element], url: str) -> Snapshot:
    """Build a ``Snapshot`` fixture with status ``"success"`` and a 1280x720 viewport.

    Args:
        elements: Elements to place in the snapshot.
        url: URL recorded on the snapshot.

    Returns:
        The constructed ``Snapshot``.
    """
    return Snapshot(
        status="success",
        url=url,
        elements=elements,
        viewport=Viewport(width=1280, height=720),
    )
58+
59+
60+
def test_v1_state_assertions_enabled_disabled_checked_value() -> None:
    """Check the enabled/disabled/checked/value predicates against one snapshot.

    The snapshot is assigned directly to ``runtime.last_snapshot`` and holds
    one element per state being asserted; each ``assert_`` call is expected
    to return ``True``.
    """
    runtime = AgentRuntime(backend=MockBackend(), tracer=MockTracer())
    runtime.begin_step(goal="Test")

    elements = [
        make_element(1, role="button", text="Submit", disabled=False),
        make_element(2, role="checkbox", text=None, checked=True),
        make_element(3, role="textbox", text=None, value="hello", input_type="text"),
        make_element(4, role="button", text="Disabled", disabled=True),
    ]
    runtime.last_snapshot = make_snapshot(elements, url="https://example.com")

    assert runtime.assert_(is_enabled("text~'Submit'"), label="enabled") is True
    assert runtime.assert_(is_disabled("text~'Disabled'"), label="disabled") is True
    assert runtime.assert_(is_checked("role=checkbox"), label="checked") is True
    assert runtime.assert_(value_contains("role=textbox", "hello"), label="value") is True
76+
77+
78+
@pytest.mark.asyncio
async def test_eventually_retry_loop_succeeds() -> None:
    """Expect ``eventually`` to return ``True`` once a snapshot URL ends in ``/done``.

    ``runtime.snapshot`` is replaced by a fake that serves three queued
    snapshots in order; only the third has a URL satisfying the predicate.
    """
    runtime = AgentRuntime(backend=MockBackend(), tracer=MockTracer())
    runtime.begin_step(goal="Test")

    snaps = [
        make_snapshot([], url="https://example.com"),
        make_snapshot([], url="https://example.com"),
        make_snapshot([], url="https://example.com/done"),
    ]

    async def fake_snapshot(**_kwargs):
        # Serve queued snapshots front-first and mirror the result onto
        # ``last_snapshot``; an exhausted queue raises IndexError.
        runtime.last_snapshot = snaps.pop(0)
        return runtime.last_snapshot

    runtime.snapshot = AsyncMock(side_effect=fake_snapshot)  # type: ignore[method-assign]

    def pred(ctx: AssertContext) -> AssertOutcome:
        ok = (ctx.url or "").endswith("/done")
        return AssertOutcome(passed=ok, reason="" if ok else "not done", details={})

    ok = await runtime.check(pred, label="eventually_done").eventually(timeout_s=2.0, poll_s=0.0)
    assert ok is True
102+
103+
104+
@pytest.mark.asyncio
async def test_min_confidence_snapshot_exhausted() -> None:
    """Expect ``eventually`` to fail with ``snapshot_exhausted`` on low confidence.

    Both queued snapshots carry diagnostics with confidence 0.1, below the
    requested ``min_confidence=0.7``, and ``max_snapshot_attempts`` is 2.
    The predicate itself always passes, so the expected ``False`` result and
    ``reason_code`` cannot come from the predicate.
    """
    runtime = AgentRuntime(backend=MockBackend(), tracer=MockTracer())
    runtime.begin_step(goal="Test")

    # Diagnostics stub shared by both snapshots: a confidence attribute plus
    # a ``model_dump`` callable returning the same value as a dict.
    low_diag = MagicMock()
    low_diag.confidence = 0.1
    low_diag.model_dump = lambda: {"confidence": 0.1}

    snaps = [
        MagicMock(url="https://example.com", elements=[], diagnostics=low_diag),
        MagicMock(url="https://example.com", elements=[], diagnostics=low_diag),
    ]

    async def fake_snapshot(**_kwargs):
        # Serve queued snapshots front-first and mirror the result onto
        # ``last_snapshot``; an exhausted queue raises IndexError.
        runtime.last_snapshot = snaps.pop(0)
        return runtime.last_snapshot

    runtime.snapshot = AsyncMock(side_effect=fake_snapshot)  # type: ignore[method-assign]

    def pred(_ctx: AssertContext) -> AssertOutcome:
        return AssertOutcome(passed=True, reason="would pass", details={})

    ok = await runtime.check(pred, label="min_confidence_gate").eventually(
        timeout_s=2.0,
        poll_s=0.0,
        min_confidence=0.7,
        max_snapshot_attempts=2,
    )
    assert ok is False
    # NOTE(review): reads a private runtime attribute; assumes the first
    # recorded assertion of this step is the one made above.
    details = runtime._assertions_this_step[0]["details"]
    assert details["reason_code"] == "snapshot_exhausted"
137+
138+
139+
@pytest.mark.asyncio
async def test_golden_flow_same_snapshots_actions_no_captcha() -> None:
    """Record two scripted actions, then expect ``eventually`` to reach ``/done``.

    The test has two independent parts: a local executor that only records
    the action strings it is given, and a runtime whose ``snapshot`` is
    replaced by a fake serving three queued snapshots, the last of which has
    a URL ending in ``/done``.
    """
    runtime = AgentRuntime(backend=MockBackend(), tracer=MockTracer())
    runtime.begin_step(goal="Test")

    class FakeActionExecutor:
        """Records each action string and always reports success."""

        def __init__(self) -> None:
            self.actions: list[str] = []

        def execute(self, action: str) -> dict:
            self.actions.append(action)
            return {"success": True}

    # NOTE(review): the executor is never wired to ``runtime``; this part
    # only checks that actions are recorded in call order.
    executor = FakeActionExecutor()
    executor.execute("CLICK(1)")
    executor.execute('TYPE(2, "hello")')
    assert executor.actions == ["CLICK(1)", 'TYPE(2, "hello")']

    snaps = [
        make_snapshot([], url="https://example.com"),
        make_snapshot([], url="https://example.com/after"),
        make_snapshot([], url="https://example.com/done"),
    ]

    async def fake_snapshot(**_kwargs):
        # Serve queued snapshots front-first and mirror the result onto
        # ``last_snapshot``; an exhausted queue raises IndexError.
        runtime.last_snapshot = snaps.pop(0)
        return runtime.last_snapshot

    runtime.snapshot = AsyncMock(side_effect=fake_snapshot)  # type: ignore[method-assign]

    def pred(ctx: AssertContext) -> AssertOutcome:
        ok = (ctx.url or "").endswith("/done")
        return AssertOutcome(passed=ok, reason="" if ok else "not done", details={})

    ok = await runtime.check(pred, label="golden_flow").eventually(timeout_s=2.0, poll_s=0.0)
    assert ok is True

0 commit comments

Comments
 (0)