66"""
77
88import re
9- from typing import Optional
109
1110from .llm_provider import LLMProvider , LLMResponse
1211from .models import Snapshot
@@ -35,7 +34,7 @@ def build_context(self, snap: Snapshot, goal: str | None = None) -> str:
3534 """
3635 Convert snapshot elements to token-efficient prompt string.
3736
38- Format: [ID] <role> "text" {cues} @ (x,y) (Imp: score)
37+ Format: [ID] <role> "text" {cues} @ (x,y) size:WxH importance:score [status]
3938
4039 Args:
4140 snap: Snapshot object
@@ -46,24 +45,50 @@ def build_context(self, snap: Snapshot, goal: str | None = None) -> str:
4645 """
4746 lines = []
4847 for el in snap .elements :
48+ # Skip REMOVED elements - they're not actionable and shouldn't be in LLM context
49+ if el .diff_status == "REMOVED" :
50+ continue
4951 # Extract visual cues
50- cues = []
52+ cues : list [ str ] = []
5153 if el .visual_cues .is_primary :
5254 cues .append ("PRIMARY" )
5355 if el .visual_cues .is_clickable :
5456 cues .append ("CLICKABLE" )
5557 if el .visual_cues .background_color_name :
5658 cues .append (f"color:{ el .visual_cues .background_color_name } " )
5759
58- # Format element line
60+ # Format element line with improved readability
61+ # Wrap the comma-joined cues in literal braces; omit the segment entirely when there are no cues
5962 cues_str = f" {{{ ',' .join (cues )} }}" if cues else ""
60- text_preview = (
61- (el .text [:50 ] + "..." ) if el .text and len (el .text ) > 50 else (el .text or "" )
62- )
63-
63+
64+ # Better text handling - show truncation indicator
65+ text_preview = ""
66+ if el .text :
67+ if len (el .text ) > 50 :
68+ text_preview = f'"{ el .text [:50 ]} ..."'
69+ else :
70+ text_preview = f'"{ el .text } "'
71+
72+ # Build position and size info
73+ x , y = int (el .bbox .x ), int (el .bbox .y )
74+ width , height = int (el .bbox .width ), int (el .bbox .height )
75+ position_str = f"@ ({ x } ,{ y } )"
76+ size_str = f"size:{ width } x{ height } "
77+
78+ # Build status indicators (only include if relevant)
79+ status_parts = []
80+ if not el .in_viewport :
81+ status_parts .append ("not_in_viewport" )
82+ if el .is_occluded :
83+ status_parts .append ("occluded" )
84+ if el .diff_status :
85+ status_parts .append (f"diff:{ el .diff_status } " )
86+ status_str = f" [{ ',' .join (status_parts )} ]" if status_parts else ""
87+
88+ # Format: [ID] <role> "text" {cues} @ (x,y) size:WxH importance:score [status]
6489 lines .append (
65- f'[{ el .id } ] <{ el .role } > " { text_preview } " { cues_str } '
66- f"@ ( { int ( el . bbox . x ) } , { int ( el . bbox . y ) } ) (Imp :{ el .importance } ) "
90+ f'[{ el .id } ] <{ el .role } > { text_preview } { cues_str } '
91+ f"{ position_str } { size_str } importance :{ el .importance } { status_str } "
6792 )
6893
6994 return "\n " .join (lines )
@@ -87,24 +112,44 @@ def query_llm(self, dom_context: str, goal: str) -> LLMResponse:
87112{ dom_context }
88113
89114VISUAL CUES EXPLAINED:
90- - {{PRIMARY}}: Main call-to-action element on the page
91- - {{CLICKABLE}}: Element is clickable
92- - {{color:X}}: Background color name
115+ After the text, you may see visual cues in curly braces like {{CLICKABLE}} or {{PRIMARY,CLICKABLE,color:white}}:
116+ - PRIMARY: Main call-to-action element on the page
117+ - CLICKABLE: Element is clickable/interactive
118+ - color:X: Background color name (e.g., color:white, color:blue)
119+ Multiple cues are comma-separated inside the braces: {{CLICKABLE,color:white}}
120+
121+ ELEMENT FORMAT EXPLAINED:
122+ Each element line follows this format:
123+ [ID] <role> "text" {{cues}} @ (x,y) size:WxH importance:score [status]
124+
125+ Example: [346] <button> "Computer Accessories" {{CLICKABLE,color:white}} @ (664,100) size:150x40 importance:811
126+
127+ Breaking down each part:
128+ - [ID]: The number in brackets is the element ID - use this EXACT number in CLICK/TYPE commands
129+ Example: If you see [346], use CLICK(346) or TYPE(346, "text")
130+ - <role>: Element type (button, link, textbox, etc.)
131+ - "text": Visible text content (truncated with "..." if long)
132+ - {{cues}}: Optional visual cues in curly braces (e.g., {{CLICKABLE}}, {{PRIMARY,CLICKABLE}}, {{CLICKABLE,color:white}})
133+ If no cues, this part is omitted entirely
134+ - @ (x,y): Element position in pixels from top-left corner
135+ - size:WxH: Element dimensions (width x height in pixels)
136+ - importance: Score indicating element relevance (higher = more important)
137+ - [status]: Optional status flags in brackets (not_in_viewport, occluded, diff:ADDED/MODIFIED/etc)
93138
94139CRITICAL RESPONSE FORMAT:
95140You MUST respond with ONLY ONE of these exact action formats:
96- - CLICK(id) - Click element by ID
97- - TYPE(id, "text") - Type text into element
141+ - CLICK(id) - Click element by ID (use the number from [ID] brackets)
142+ - TYPE(id, "text") - Type text into element (use the number from [ID] brackets)
98143- PRESS("key") - Press keyboard key (Enter, Escape, Tab, ArrowDown, etc)
99144- FINISH() - Task complete
100145
101146DO NOT include any explanation, reasoning, or natural language.
102147DO NOT use markdown formatting or code blocks.
103148DO NOT say "The next step is..." or anything similar.
104149
105- CORRECT Examples:
106- CLICK(42 )
107- TYPE(15, "magic mouse")
150+ CORRECT Examples (matching element IDs from the list above) :
151+ If element is [346] <button> "Click me" → respond: CLICK(346 )
152+ If element is [15] <textbox> "Search" → respond: TYPE(15, "magic mouse")
108153PRESS("Enter")
109154FINISH()
110155
0 commit comments