optimize LLM agent efficiency

rcholic · rcholic · commit fa4e49928c07 · 2026-01-03T12:35:19.000-08:00
diff --git a/sentience/element_filter.py b/sentience/element_filter.py
@@ -57,7 +57,8 @@ def filter_by_importance(
         Returns:
             Top N elements sorted by importance score
         """
-        elements = snapshot.elements
+        # Filter out REMOVED elements - they're not actionable and shouldn't be in LLM context
+        elements = [el for el in snapshot.elements if el.diff_status != "REMOVED"]
         # Elements are already sorted by importance in snapshot
         return elements[:max_elements]
 
@@ -81,7 +82,8 @@ def filter_by_goal(
         Returns:
             Filtered list of elements sorted by boosted importance score
         """
-        elements = snapshot.elements
+        # Filter out REMOVED elements - they're not actionable and shouldn't be in LLM context
+        elements = [el for el in snapshot.elements if el.diff_status != "REMOVED"]
 
         # If no goal provided, return all elements (up to limit)
         if not goal:
diff --git a/sentience/llm_interaction_handler.py b/sentience/llm_interaction_handler.py
@@ -6,7 +6,6 @@
 """
 
 import re
-from typing import Optional
 
 from .llm_provider import LLMProvider, LLMResponse
 from .models import Snapshot
@@ -35,7 +34,7 @@ def build_context(self, snap: Snapshot, goal: str | None = None) -> str:
         """
         Convert snapshot elements to token-efficient prompt string.
 
-        Format: [ID] <role> "text" {cues} @ (x,y) (Imp:score)
+        Format: [ID] <role> "text" {cues} @ (x,y) size:WxH importance:score [status]
 
         Args:
             snap: Snapshot object
@@ -46,24 +45,50 @@ def build_context(self, snap: Snapshot, goal: str | None = None) -> str:
         """
         lines = []
         for el in snap.elements:
+            # Skip REMOVED elements - they're not actionable and shouldn't be in LLM context
+            if el.diff_status == "REMOVED":
+                continue
             # Extract visual cues
-            cues = []
+            cues: list[str] = []
             if el.visual_cues.is_primary:
                 cues.append("PRIMARY")
             if el.visual_cues.is_clickable:
                 cues.append("CLICKABLE")
             if el.visual_cues.background_color_name:
                 cues.append(f"color:{el.visual_cues.background_color_name}")
 
-            # Format element line
+            # Format element line with improved readability
+            # Cues render as " {A,B}" with a leading space; empty when there are none
             cues_str = f" {{{','.join(cues)}}}" if cues else ""
-            text_preview = (
-                (el.text[:50] + "...") if el.text and len(el.text) > 50 else (el.text or "")
-            )
-
+            
+            # Quote the text, truncating to 50 chars plus "..."; stays empty when no text
+            text_preview = ""
+            if el.text:
+                if len(el.text) > 50:
+                    text_preview = f'"{el.text[:50]}..."'
+                else:
+                    text_preview = f'"{el.text}"'
+            
+            # Build position and size info
+            x, y = int(el.bbox.x), int(el.bbox.y)
+            width, height = int(el.bbox.width), int(el.bbox.height)
+            position_str = f"@ ({x},{y})"
+            size_str = f"size:{width}x{height}"
+            
+            # Build status indicators (only include if relevant)
+            status_parts = []
+            if not el.in_viewport:
+                status_parts.append("not_in_viewport")
+            if el.is_occluded:
+                status_parts.append("occluded")
+            if el.diff_status:
+                status_parts.append(f"diff:{el.diff_status}")
+            status_str = f" [{','.join(status_parts)}]" if status_parts else ""
+            
+            # Format: [ID] <role> "text" {cues} @ (x,y) size:WxH importance:score [status]
             lines.append(
-                f'[{el.id}] <{el.role}> "{text_preview}"{cues_str} '
-                f"@ ({int(el.bbox.x)},{int(el.bbox.y)}) (Imp:{el.importance})"
+                f'[{el.id}] <{el.role}> {text_preview}{cues_str} '
+                f"{position_str} {size_str} importance:{el.importance}{status_str}"
             )
 
         return "\n".join(lines)
@@ -87,24 +112,44 @@ def query_llm(self, dom_context: str, goal: str) -> LLMResponse:
 {dom_context}
 
 VISUAL CUES EXPLAINED:
-- {{PRIMARY}}: Main call-to-action element on the page
-- {{CLICKABLE}}: Element is clickable
-- {{color:X}}: Background color name
+After the text, you may see visual cues in curly braces like {{CLICKABLE}} or {{PRIMARY,CLICKABLE,color:white}}:
+- PRIMARY: Main call-to-action element on the page
+- CLICKABLE: Element is clickable/interactive
+- color:X: Background color name (e.g., color:white, color:blue)
+Multiple cues are comma-separated inside the braces: {{CLICKABLE,color:white}}
+
+ELEMENT FORMAT EXPLAINED:
+Each element line follows this format:
+[ID] <role> "text" {{cues}} @ (x,y) size:WxH importance:score [status]
+
+Example: [346] <button> "Computer Accessories" {{CLICKABLE,color:white}} @ (664,100) size:150x40 importance:811
+
+Breaking down each part:
+- [ID]: The number in brackets is the element ID - use this EXACT number in CLICK/TYPE commands
+  Example: If you see [346], use CLICK(346) or TYPE(346, "text")
+- <role>: Element type (button, link, textbox, etc.)
+- "text": Visible text content (truncated with "..." if long)
+- {{cues}}: Optional visual cues in curly braces (e.g., {{CLICKABLE}}, {{PRIMARY,CLICKABLE}}, {{CLICKABLE,color:white}})
+  If no cues, this part is omitted entirely
+- @ (x,y): Element position in pixels from top-left corner
+- size:WxH: Element dimensions (width x height in pixels)
+- importance: Score indicating element relevance (higher = more important)
+- [status]: Optional status flags in brackets (not_in_viewport, occluded, diff:ADDED/MODIFIED/etc)
 
 CRITICAL RESPONSE FORMAT:
 You MUST respond with ONLY ONE of these exact action formats:
-- CLICK(id) - Click element by ID
-- TYPE(id, "text") - Type text into element
+- CLICK(id) - Click element by ID (use the number from [ID] brackets)
+- TYPE(id, "text") - Type text into element (use the number from [ID] brackets)
 - PRESS("key") - Press keyboard key (Enter, Escape, Tab, ArrowDown, etc)
 - FINISH() - Task complete
 
 DO NOT include any explanation, reasoning, or natural language.
 DO NOT use markdown formatting or code blocks.
 DO NOT say "The next step is..." or anything similar.
 
-CORRECT Examples:
-CLICK(42)
-TYPE(15, "magic mouse")
+CORRECT Examples (matching element IDs from the list above):
+If element is [346] <button> "Click me" → respond: CLICK(346)
+If element is [15] <textbox> "Search" → respond: TYPE(15, "magic mouse")
 PRESS("Enter")
 FINISH()
 
diff --git a/tests/test_agent.py b/tests/test_agent.py
@@ -185,7 +185,7 @@ def test_agent_build_context():
     assert "PRIMARY" in context
     assert "CLICKABLE" in context
     assert "color:blue" in context
-    assert "(Imp:900)" in context
+    assert "importance:900" in context
 
 
 def test_agent_execute_click_action():