Skip to content

Commit 5eb38cc

Browse files
authored
Merge pull request #103 from SentienceAPI/hardening2.2
Phase 2.2 - 2.3: hardening & cleaning
2 parents 0316503 + 300ab4d commit 5eb38cc

26 files changed

+2144
-915
lines changed

sentience/__init__.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,6 @@
1414
from .cloud_tracing import CloudTraceSink, SentienceLogger
1515
from .conversational_agent import ConversationalAgent
1616
from .expect import expect
17-
18-
# Formatting (v0.12.0+)
19-
from .formatting import format_snapshot_for_llm
2017
from .generator import ScriptGenerator, generate
2118
from .inspector import Inspector, inspect
2219
from .llm_provider import (
@@ -62,13 +59,17 @@
6259
from .tracing import JsonlTraceSink, TraceEvent, Tracer, TraceSink
6360

6461
# Utilities (v0.12.0+)
62+
# Import from utils package (re-exports from submodules for backward compatibility)
6563
from .utils import (
6664
canonical_snapshot_loose,
6765
canonical_snapshot_strict,
6866
compute_snapshot_digests,
6967
save_storage_state,
7068
sha256_digest,
7169
)
70+
71+
# Formatting (v0.12.0+)
72+
from .utils.formatting import format_snapshot_for_llm
7273
from .wait import wait_for
7374

7475
__version__ = "0.91.1"

sentience/action_executor.py

Lines changed: 191 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,191 @@
1+
"""
2+
Action Executor for Sentience Agent.
3+
4+
Handles parsing and execution of action commands (CLICK, TYPE, PRESS, FINISH).
5+
This separates action execution concerns from LLM interaction.
6+
"""
7+
8+
import re
9+
from typing import Any
10+
11+
from .actions import click, click_async, press, press_async, type_text, type_text_async
12+
from .browser import AsyncSentienceBrowser, SentienceBrowser
13+
from .models import Snapshot
14+
15+
16+
class ActionExecutor:
    """
    Executes actions and handles parsing of action command strings.

    This class encapsulates all action execution logic, making it easier to:
    - Test action execution independently
    - Add new action types in one place
    - Handle action parsing errors consistently

    Supported commands (case-insensitive, matched from the start of the string;
    anything after the closing parenthesis is ignored):
    - CLICK(id)
    - TYPE(id, "text")
    - PRESS("key")
    - FINISH()
    """

    # Compiled once at class creation and shared by the sync and async paths.
    _CLICK_RE = re.compile(r"CLICK\s*\(\s*(\d+)\s*\)", re.IGNORECASE)
    # Original TYPE grammar: the text may not contain any quote character.
    _TYPE_RE = re.compile(
        r'TYPE\s*\(\s*(\d+)\s*,\s*["\']([^"\']*)["\']\s*\)',
        re.IGNORECASE,
    )
    # Fallback TYPE grammar, tried only when the original one fails: opening and
    # closing quote must be the same character, the other quote character may
    # appear freely inside, and the delimiter itself may appear when escaped
    # with a backslash. DOTALL lets the text span multiple lines, as the
    # original character class already did.
    _TYPE_QUOTED_RE = re.compile(
        r'TYPE\s*\(\s*(\d+)\s*,\s*(["\'])((?:\\.|(?!\2).)*)\2\s*\)',
        re.IGNORECASE | re.DOTALL,
    )
    _PRESS_RE = re.compile(r'PRESS\s*\(\s*["\']([^"\']+)["\']\s*\)', re.IGNORECASE)
    _FINISH_RE = re.compile(r"FINISH\s*\(\s*\)", re.IGNORECASE)
    # Matches a backslash-escaped quote so it can be reduced to the bare quote.
    _ESCAPED_QUOTE_RE = re.compile(r'\\(["\'])')

    def __init__(self, browser: SentienceBrowser | AsyncSentienceBrowser):
        """
        Initialize action executor.

        Args:
            browser: SentienceBrowser or AsyncSentienceBrowser instance
        """
        self.browser = browser
        # Decides which of execute() / execute_async() is allowed to run.
        self._is_async = isinstance(browser, AsyncSentienceBrowser)

    @classmethod
    def _parse_action(cls, action_str: str) -> tuple[str, dict[str, Any]]:
        """
        Parse an action command string into an action name and its parameters.

        Args:
            action_str: Action string from LLM (e.g., "CLICK(42)", "TYPE(15, \"text\")")

        Returns:
            Tuple of (action, params) where action is one of "click", "type",
            "press" or "finish", and params holds the parsed arguments in the
            order they appear in the result dictionary:
            - click: {"element_id": int}
            - type: {"element_id": int, "text": str}
            - press: {"key": str}
            - finish: {}

        Raises:
            ValueError: If action format is unknown
        """
        if match := cls._CLICK_RE.match(action_str):
            return "click", {"element_id": int(match.group(1))}

        # The original pattern goes first so every input it accepted keeps
        # producing exactly the same text.
        if match := cls._TYPE_RE.match(action_str):
            return "type", {"element_id": int(match.group(1)), "text": match.group(2)}

        # Fallback for text containing quote characters, e.g. TYPE(5, "don't").
        if match := cls._TYPE_QUOTED_RE.match(action_str):
            # Only escaped quotes are unescaped; other backslashes stay literal.
            text = cls._ESCAPED_QUOTE_RE.sub(r"\1", match.group(3))
            return "type", {"element_id": int(match.group(1)), "text": text}

        if match := cls._PRESS_RE.match(action_str):
            return "press", {"key": match.group(1)}

        if cls._FINISH_RE.match(action_str):
            return "finish", {}

        raise ValueError(
            f"Unknown action format: {action_str}\n"
            f'Expected: CLICK(id), TYPE(id, "text"), PRESS("key"), or FINISH()'
        )

    @staticmethod
    def _build_result(action: str, params: dict[str, Any], result: Any) -> dict[str, Any]:
        """
        Build the result dictionary for a click, type or press action.

        Args:
            action: Action name ("click", "type" or "press")
            params: Parsed parameters as returned by _parse_action()
            result: Object returned by the SDK call; its success and outcome
                attributes are read, plus url_changed for click actions

        Returns:
            Execution result dictionary (see execute() for the keys)
        """
        payload: dict[str, Any] = {
            "success": result.success,
            "action": action,
            **params,
            "outcome": result.outcome,
        }
        if action == "click":
            payload["url_changed"] = result.url_changed
        return payload

    def execute(self, action_str: str, snap: Snapshot) -> dict[str, Any]:
        """
        Parse action string and execute SDK call (synchronous).

        Args:
            action_str: Action string from LLM (e.g., "CLICK(42)", "TYPE(15, \"text\")")
            snap: Current snapshot (for context, currently unused but kept for API consistency)

        Returns:
            Execution result dictionary with keys:
            - success: bool
            - action: str (e.g., "click", "type", "press", "finish")
            - element_id: Optional[int] (for click/type actions)
            - text: Optional[str] (for type actions)
            - key: Optional[str] (for press actions)
            - outcome: Optional[str] (action outcome)
            - url_changed: Optional[bool] (for click actions)
            - error: Optional[str] (if action failed)
            - message: Optional[str] (for finish action)

        Raises:
            ValueError: If action format is unknown
            RuntimeError: If called on async browser (use execute_async instead)
        """
        # The browser-kind check runs before parsing, so a wrong-kind call
        # raises RuntimeError even when the action string is also malformed.
        if self._is_async:
            raise RuntimeError(
                "ActionExecutor.execute() called on async browser. Use execute_async() instead."
            )

        action, params = self._parse_action(action_str)

        if action == "click":
            result = click(self.browser, params["element_id"])  # type: ignore
        elif action == "type":
            result = type_text(self.browser, params["element_id"], params["text"])  # type: ignore
        elif action == "press":
            result = press(self.browser, params["key"])  # type: ignore
        else:
            # "finish" makes no SDK call.
            return {
                "success": True,
                "action": "finish",
                "message": "Task marked as complete",
            }

        return self._build_result(action, params, result)

    async def execute_async(self, action_str: str, snap: Snapshot) -> dict[str, Any]:
        """
        Parse action string and execute SDK call (asynchronous).

        Args:
            action_str: Action string from LLM (e.g., "CLICK(42)", "TYPE(15, \"text\")")
            snap: Current snapshot (for context, currently unused but kept for API consistency)

        Returns:
            Execution result dictionary (same format as execute())

        Raises:
            ValueError: If action format is unknown
            RuntimeError: If called on sync browser (use execute() instead)
        """
        # The browser-kind check runs before parsing, so a wrong-kind call
        # raises RuntimeError even when the action string is also malformed.
        if not self._is_async:
            raise RuntimeError(
                "ActionExecutor.execute_async() called on sync browser. Use execute() instead."
            )

        action, params = self._parse_action(action_str)

        if action == "click":
            result = await click_async(self.browser, params["element_id"])  # type: ignore
        elif action == "type":
            result = await type_text_async(  # type: ignore
                self.browser, params["element_id"], params["text"]
            )
        elif action == "press":
            result = await press_async(self.browser, params["key"])  # type: ignore
        else:
            # "finish" makes no SDK call.
            return {
                "success": True,
                "action": "finish",
                "message": "Task marked as complete",
            }

        return self._build_result(action, params, result)

0 commit comments

Comments
 (0)