Fix cloud agent tracing bugs: token stats attribute access, LLM action extraction, tracer close() kwargs

rcholic · rcholic · commit 0805ab90581d · 2025-12-26T23:23:39.000-08:00
diff --git a/examples/cloud_tracing_agent.py b/examples/cloud_tracing_agent.py
@@ -85,9 +85,9 @@ def main():
         # 6. Get token usage stats
         stats = agent.get_token_stats()
         print("\n📊 Token Usage:")
-        print(f"   Total tokens: {stats['total_tokens']}")
-        print(f"   Prompt tokens: {stats['total_prompt_tokens']}")
-        print(f"   Completion tokens: {stats['total_completion_tokens']}")
+        print(f"   Total tokens: {stats.total_tokens}")
+        print(f"   Prompt tokens: {stats.total_prompt_tokens}")
+        print(f"   Completion tokens: {stats.total_completion_tokens}")
 
     except Exception as e:
         print(f"\n❌ Error during execution: {e}")
diff --git a/sentience/agent.py b/sentience/agent.py
@@ -237,7 +237,7 @@ def act(  # noqa: C901
                 self._track_tokens(goal, llm_response)
 
                 # Parse action from LLM response
-                action_str = llm_response.content.strip()
+                action_str = self._extract_action_from_response(llm_response.content)
 
                 # 4. EXECUTE: Parse and run action
                 result_dict = self._execute_action(action_str, filtered_snap)
@@ -395,6 +395,34 @@ def _build_context(self, snap: Snapshot, goal: str) -> str:
 
         return "\n".join(lines)
 
+    def _extract_action_from_response(self, response: str) -> str:
+        """
+        Extract the action command from a raw LLM response, tolerating extra
+        prose or markdown code fences around it.
+
+        Args:
+            response: Raw LLM response text
+
+        Returns:
+            The first CLICK/TYPE/PRESS/FINISH call found, else the cleaned text
+        """
+        import re
+
+        # Drop code fences. The optional group must end in a newline so that a
+        # language tag ("```python\n") is removed, while an inline fence such as
+        # "```CLICK(42)```" loses only the backticks and keeps the action name.
+        response = re.sub(r"```(?:\w*\n)?", "", response).strip()
+
+        # Pattern matches: CLICK(123), TYPE(123, "text"), PRESS("key"), FINISH()
+        action_pattern = r'(CLICK\s*\(\s*\d+\s*\)|TYPE\s*\(\s*\d+\s*,\s*["\'].*?["\']\s*\)|PRESS\s*\(\s*["\'].*?["\']\s*\)|FINISH\s*\(\s*\))'
+
+        match = re.search(action_pattern, response, re.IGNORECASE)
+        if match:
+            return match.group(1)
+
+        # If no pattern match, return the cleaned response (will likely fail parsing)
+        return response
+
     def _query_llm(self, dom_context: str, goal: str) -> LLMResponse:
         """
         Query LLM with standardized prompt template
@@ -418,23 +446,30 @@ def _query_llm(self, dom_context: str, goal: str) -> LLMResponse:
 - {{CLICKABLE}}: Element is clickable
 - {{color:X}}: Background color name
 
-RESPONSE FORMAT:
-Return ONLY the function call, no explanation or markdown.
-
-Available actions:
+CRITICAL RESPONSE FORMAT:
+You MUST respond with ONLY ONE of these exact action formats:
 - CLICK(id) - Click element by ID
 - TYPE(id, "text") - Type text into element
 - PRESS("key") - Press keyboard key (Enter, Escape, Tab, ArrowDown, etc)
 - FINISH() - Task complete
 
-Examples:
-- CLICK(42)
-- TYPE(15, "magic mouse")
-- PRESS("Enter")
-- FINISH()
+DO NOT include any explanation, reasoning, or natural language.
+DO NOT use markdown formatting or code blocks.
+DO NOT say "The next step is..." or anything similar.
+
+CORRECT Examples:
+CLICK(42)
+TYPE(15, "magic mouse")
+PRESS("Enter")
+FINISH()
+
+INCORRECT Examples (DO NOT DO THIS):
+"The next step is to click..."
+"I will type..."
+```CLICK(42)```
 """
 
-        user_prompt = "What is the next step to achieve the goal?"
+        user_prompt = "Return the single action command:"
 
         return self.llm.generate(system_prompt, user_prompt, temperature=0.0)
 
diff --git a/sentience/tracing.py b/sentience/tracing.py
@@ -9,7 +9,7 @@
 from abc import ABC, abstractmethod
 from dataclasses import dataclass, field
 from pathlib import Path
-from typing import Any, Dict, Optional, Union
+from typing import Any
 
 
 @dataclass
@@ -243,9 +243,24 @@ def emit_error(
         }
         self.emit("error", data, step_id=step_id)
 
-    def close(self) -> None:
-        """Close the underlying sink."""
-        self.sink.close()
+    def close(self, **kwargs) -> None:
+        """
+        Close the underlying sink.
+
+        Args:
+            **kwargs: Forwarded to sink.close() when it accepts them (e.g. blocking=True)
+        """
+        import inspect
+
+        # Forward only what sink.close() can take: everything if it declares **kwargs,
+        # otherwise just the names it declares (regular or keyword-only parameters).
+        params = inspect.signature(self.sink.close).parameters.values()
+        if not any(p.kind is p.VAR_KEYWORD for p in params):
+            accepted = {
+                p.name for p in params if p.kind in (p.POSITIONAL_OR_KEYWORD, p.KEYWORD_ONLY)
+            }
+            kwargs = {key: value for key, value in kwargs.items() if key in accepted}
+        self.sink.close(**kwargs)
 
     def __enter__(self):
         """Context manager support."""
diff --git a/tests/test_agent.py b/tests/test_agent.py
@@ -3,7 +3,7 @@
 Tests LLM providers and SentienceAgent without requiring browser
 """
 
-from unittest.mock import MagicMock, Mock, patch
+from unittest.mock import Mock, patch
 
 import pytest
 
@@ -432,3 +432,38 @@ def test_agent_action_parsing_variations():
         assert mock_click.call_count == 2
         assert mock_type.call_count == 1
         assert mock_press.call_count == 1
+
+
+def test_agent_extract_action_from_llm_response():
+    """Check that _extract_action_from_response isolates the action call from noisy LLM output"""
+    browser = create_mock_browser()
+    llm = MockLLMProvider()
+    agent = SentienceAgent(browser, llm, verbose=False)
+
+    # Clean actions are returned unchanged
+    assert agent._extract_action_from_response("CLICK(42)") == "CLICK(42)"
+    assert agent._extract_action_from_response('TYPE(15, "test")') == 'TYPE(15, "test")'
+    assert agent._extract_action_from_response('PRESS("Enter")') == 'PRESS("Enter")'
+    assert agent._extract_action_from_response("FINISH()") == "FINISH()"
+
+    # Natural-language explanation before the action (the bug case)
+    assert (
+        agent._extract_action_from_response("The next step is to click the button. CLICK(42)")
+        == "CLICK(42)"
+    )
+    assert (
+        agent._extract_action_from_response(
+            'The next step is to type "Sentience AI agent SDK" into the search field. TYPE(15, "Sentience AI agent SDK")'
+        )
+        == 'TYPE(15, "Sentience AI agent SDK")'
+    )
+
+    # Markdown code fences on their own lines, without and with a language tag
+    assert agent._extract_action_from_response("```\nCLICK(42)\n```") == "CLICK(42)"
+    assert (
+        agent._extract_action_from_response('```python\nTYPE(15, "test")\n```')
+        == 'TYPE(15, "test")'
+    )
+
+    # Explanation trailing the action is discarded
+    assert agent._extract_action_from_response("CLICK(42) to submit the form") == "CLICK(42)"