55
66import re
77import time
8- from typing import TYPE_CHECKING , Any , Dict , List , Optional , Union
8+ from typing import TYPE_CHECKING , Any , Optional
99
1010from .actions import click , press , type_text
1111from .base_agent import BaseAgent
@@ -93,8 +93,11 @@ def __init__(
9393 # Step counter for tracing
9494 self ._step_count = 0
9595
96- def act (
97- self , goal : str , max_retries : int = 2 , snapshot_options : SnapshotOptions | None = None
96+ def act ( # noqa: C901
97+ self ,
98+ goal : str ,
99+ max_retries : int = 2 ,
100+ snapshot_options : SnapshotOptions | None = None ,
98101 ) -> AgentActionResult :
99102 """
100103 Execute a high-level goal using observe → think → act loop
@@ -116,9 +119,9 @@ def act(
116119 42
117120 """
118121 if self .verbose :
119- print (f"\n { '=' * 70 } " )
122+ print (f"\n { '=' * 70 } " )
120123 print (f"🤖 Agent Goal: { goal } " )
121- print (f"{ '=' * 70 } " )
124+ print (f"{ '=' * 70 } " )
122125
123126 # Generate step ID for tracing
124127 self ._step_count += 1
@@ -234,7 +237,7 @@ def act(
234237 self ._track_tokens (goal , llm_response )
235238
236239 # Parse action from LLM response
237- action_str = llm_response .content . strip ( )
240+ action_str = self . _extract_action_from_response ( llm_response .content )
238241
239242 # 4. EXECUTE: Parse and run action
240243 result_dict = self ._execute_action (action_str , filtered_snap )
@@ -392,6 +395,34 @@ def _build_context(self, snap: Snapshot, goal: str) -> str:
392395
393396 return "\n " .join (lines )
394397
398+ def _extract_action_from_response (self , response : str ) -> str :
399+ """
400+ Extract action command from LLM response, handling cases where
401+ the LLM adds extra explanation despite instructions.
402+
403+ Args:
404+ response: Raw LLM response text
405+
406+ Returns:
407+ Cleaned action command string
408+ """
409+ import re
410+
411+ # Remove markdown code blocks if present
412+ response = re .sub (r"```[\w]*\n?" , "" , response )
413+ response = response .strip ()
414+
415+ # Try to find action patterns in the response
416+ # Pattern matches: CLICK(123), TYPE(123, "text"), PRESS("key"), FINISH()
417+ action_pattern = r'(CLICK\s*\(\s*\d+\s*\)|TYPE\s*\(\s*\d+\s*,\s*["\'].*?["\']\s*\)|PRESS\s*\(\s*["\'].*?["\']\s*\)|FINISH\s*\(\s*\))'
418+
419+ match = re .search (action_pattern , response , re .IGNORECASE )
420+ if match :
421+ return match .group (1 )
422+
423+ # If no pattern match, return the original response (will likely fail parsing)
424+ return response
425+
395426 def _query_llm (self , dom_context : str , goal : str ) -> LLMResponse :
396427 """
397428 Query LLM with standardized prompt template
@@ -415,23 +446,30 @@ def _query_llm(self, dom_context: str, goal: str) -> LLMResponse:
415446- {{CLICKABLE}}: Element is clickable
416447- {{color:X}}: Background color name
417448
418- RESPONSE FORMAT:
419- Return ONLY the function call, no explanation or markdown.
420-
421- Available actions:
449+ CRITICAL RESPONSE FORMAT:
450+ You MUST respond with ONLY ONE of these exact action formats:
422451- CLICK(id) - Click element by ID
423452- TYPE(id, "text") - Type text into element
424453- PRESS("key") - Press keyboard key (Enter, Escape, Tab, ArrowDown, etc)
425454- FINISH() - Task complete
426455
427- Examples:
428- - CLICK(42)
429- - TYPE(15, "magic mouse")
430- - PRESS("Enter")
431- - FINISH()
456+ DO NOT include any explanation, reasoning, or natural language.
457+ DO NOT use markdown formatting or code blocks.
458+ DO NOT say "The next step is..." or anything similar.
459+
460+ CORRECT Examples:
461+ CLICK(42)
462+ TYPE(15, "magic mouse")
463+ PRESS("Enter")
464+ FINISH()
465+
466+ INCORRECT Examples (DO NOT DO THIS):
467+ "The next step is to click..."
468+ "I will type..."
469+ ```CLICK(42)```
432470"""
433471
434- user_prompt = "What is the next step to achieve the goal? "
472+ user_prompt = "Return the single action command: "
435473
436474 return self .llm .generate (system_prompt , user_prompt , temperature = 0.0 )
437475
@@ -460,7 +498,9 @@ def _execute_action(self, action_str: str, snap: Snapshot) -> dict[str, Any]:
460498
461499 # Parse TYPE(42, "hello world")
462500 elif match := re .match (
463- r'TYPE\s*\(\s*(\d+)\s*,\s*["\']([^"\']*)["\']\s*\)' , action_str , re .IGNORECASE
501+ r'TYPE\s*\(\s*(\d+)\s*,\s*["\']([^"\']*)["\']\s*\)' ,
502+ action_str ,
503+ re .IGNORECASE ,
464504 ):
465505 element_id = int (match .group (1 ))
466506 text = match .group (2 )
@@ -486,7 +526,11 @@ def _execute_action(self, action_str: str, snap: Snapshot) -> dict[str, Any]:
486526
487527 # Parse FINISH()
488528 elif re .match (r"FINISH\s*\(\s*\)" , action_str , re .IGNORECASE ):
489- return {"success" : True , "action" : "finish" , "message" : "Task marked as complete" }
529+ return {
530+ "success" : True ,
531+ "action" : "finish" ,
532+ "message" : "Task marked as complete" ,
533+ }
490534
491535 else :
492536 raise ValueError (
0 commit comments