Merge pull request #174 from SentienceAPI/fix_agentruntime_step_id

rcholic · web-flow · commit 1b72c4ba4345 · 2026-01-20T19:02:30.000-08:00
Fix AgentRuntime step_id from UUID to step-N
diff --git a/sentience/agent_runtime.py b/sentience/agent_runtime.py
@@ -66,7 +66,6 @@
 import asyncio
 import difflib
 import time
-import uuid
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any
 
@@ -504,20 +503,20 @@ def begin_step(self, goal: str, step_index: int | None = None) -> str:
             step_index: Optional explicit step index (otherwise auto-increments)
 
         Returns:
-            Generated step_id
+            Generated step_id in format 'step-N' where N is the step index
         """
         # Clear previous step state
         self._assertions_this_step = []
 
-        # Generate new step_id
-        self.step_id = str(uuid.uuid4())
-
         # Update step index
         if step_index is not None:
             self.step_index = step_index
         else:
             self.step_index += 1
 
+        # Generate step_id in 'step-N' format for Studio compatibility
+        self.step_id = f"step-{self.step_index}"
+
         return self.step_id
 
     def assert_(
@@ -583,7 +582,7 @@ def assert_done(
             True if task is complete (assertion passed), False otherwise
         """
         # Convenience wrapper for assert_ with required=True
-        ok = self.assertTrue(predicate, label=label, required=True)
+        ok = self.assert_(predicate, label=label, required=True)
         if ok:
             self._task_done = True
             self._task_done_label = label
diff --git a/sentience/schemas/trace_v1.json b/sentience/schemas/trace_v1.json
@@ -37,8 +37,12 @@
     },
     "step_id": {
       "type": ["string", "null"],
-      "pattern": "^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$",
-      "description": "UUID for the step (present for step-scoped events)"
+      "description": "Step identifier in 'step-N' format where N is the step index (present for step-scoped events)"
+    },
+    "step_index": {
+      "type": ["integer", "null"],
+      "minimum": 0,
+      "description": "Step index (0-based), present for step-scoped events"
     },
     "data": {
       "type": "object",
@@ -67,6 +71,7 @@
           "description": "snapshot or snapshot_taken data",
           "properties": {
             "step_id": {"type": ["string", "null"]},
+            "step_index": {"type": ["integer", "null"], "minimum": 0, "description": "Step index for Studio compatibility"},
             "snapshot_id": {"type": ["string", "null"]},
             "snapshot_digest": {"type": "string", "pattern": "^sha256:[0-9a-f]{64}$"},
             "snapshot_digest_loose": {"type": "string", "pattern": "^sha256:[0-9a-f]{64}$"},
diff --git a/sentience/trace_event_builder.py b/sentience/trace_event_builder.py
@@ -23,6 +23,7 @@ class TraceEventBuilder:
     def build_snapshot_event(
         snapshot: Snapshot,
         include_all_elements: bool = True,
+        step_index: int | None = None,
     ) -> dict[str, Any]:
         """
         Build snapshot_taken trace event data.
@@ -31,6 +32,8 @@ def build_snapshot_event(
             snapshot: Snapshot to build event from
             include_all_elements: If True, include all elements (for DOM tree display).
                                  If False, use filtered elements only.
+            step_index: Optional step index (0-based) for Studio compatibility.
+                       Required when step_id is not in 'step-N' format (e.g., UUIDs).
 
         Returns:
             Dictionary with snapshot event data
@@ -64,13 +67,19 @@ def build_snapshot_event(
             el_dict["importance_score"] = importance_score
             elements_data.append(el_dict)
 
-        return {
+        result = {
             "url": snapshot.url,
             "element_count": len(snapshot.elements),
             "timestamp": snapshot.timestamp,
             "elements": elements_data,  # Full element data for DOM tree
         }
 
+        # Include step_index if provided (required for UUID step_ids)
+        if step_index is not None:
+            result["step_index"] = step_index
+
+        return result
+
     @staticmethod
     def build_step_end_event(
         step_id: str,
diff --git a/tests/test_agent_runtime.py b/tests/test_agent_runtime.py
@@ -152,15 +152,34 @@ class TestAgentRuntimeBeginStep:
     """Tests for begin_step method."""
 
     def test_begin_step_generates_step_id(self) -> None:
-        """Test begin_step generates a UUID step_id."""
+        """Test begin_step generates a step_id in 'step-N' format."""
         backend = MockBackend()
         tracer = MockTracer()
         runtime = AgentRuntime(backend=backend, tracer=tracer)
 
         step_id = runtime.begin_step(goal="Test step")
 
         assert step_id is not None
-        assert len(step_id) == 36  # UUID length with dashes
+        assert step_id == "step-1"  # First step should be step-1
+
+    def test_begin_step_id_matches_index(self) -> None:
+        """Test step_id format matches step_index for Studio compatibility."""
+        backend = MockBackend()
+        tracer = MockTracer()
+        runtime = AgentRuntime(backend=backend, tracer=tracer)
+
+        step_id_1 = runtime.begin_step(goal="Step 1")
+        assert step_id_1 == "step-1"
+        assert runtime.step_index == 1
+
+        step_id_2 = runtime.begin_step(goal="Step 2")
+        assert step_id_2 == "step-2"
+        assert runtime.step_index == 2
+
+        # With explicit index
+        step_id_10 = runtime.begin_step(goal="Step 10", step_index=10)
+        assert step_id_10 == "step-10"
+        assert runtime.step_index == 10
 
     def test_begin_step_increments_index(self) -> None:
         """Test begin_step auto-increments step_index."""
diff --git a/tests/test_screenshot_storage.py b/tests/test_screenshot_storage.py
@@ -42,13 +42,10 @@ def test_extract_screenshots_from_trace(self):
             }
         )
 
-        # Close to write file
-        sink.close(blocking=False)
-
-        # Wait a bit for file to be written
-        import time
-
-        time.sleep(0.1)
+        # Finalize trace file synchronously (closes file handle properly)
+        # Using _finalize_trace_file_for_upload instead of close(blocking=False)
+        # to avoid Windows file locking issues in tests
+        sink._finalize_trace_file_for_upload()
 
         # Extract screenshots
         screenshots = sink._extract_screenshots_from_trace()
@@ -59,7 +56,7 @@ def test_extract_screenshots_from_trace(self):
         assert screenshots[1]["format"] == "png"
         assert screenshots[1]["step_id"] == "step-1"
 
-        # Cleanup
+        # Cleanup - file handle is already closed
         cache_dir = Path.home() / ".sentience" / "traces" / "pending"
         trace_path = cache_dir / f"{run_id}.jsonl"
         if trace_path.exists():
@@ -93,15 +90,15 @@ def test_extract_screenshots_handles_multiple(self):
                 }
             )
 
-        sink.close(blocking=False)
-        import time
-
-        time.sleep(0.1)
+        # Finalize trace file synchronously (closes file handle properly)
+        # Using _finalize_trace_file_for_upload instead of close(blocking=False)
+        # to avoid Windows file locking issues in tests
+        sink._finalize_trace_file_for_upload()
 
         screenshots = sink._extract_screenshots_from_trace()
         assert len(screenshots) == 3
 
-        # Cleanup
+        # Cleanup - file handle is already closed
         cache_dir = Path.home() / ".sentience" / "traces" / "pending"
         trace_path = cache_dir / f"{run_id}.jsonl"
         if trace_path.exists():
@@ -130,15 +127,15 @@ def test_extract_screenshots_skips_events_without_screenshots(self):
             }
         )
 
-        sink.close(blocking=False)
-        import time
-
-        time.sleep(0.1)
+        # Finalize trace file synchronously (closes file handle properly)
+        # Using _finalize_trace_file_for_upload instead of close(blocking=False)
+        # to avoid Windows file locking issues in tests
+        sink._finalize_trace_file_for_upload()
 
         screenshots = sink._extract_screenshots_from_trace()
         assert len(screenshots) == 0
 
-        # Cleanup
+        # Cleanup - file handle is already closed
         cache_dir = Path.home() / ".sentience" / "traces" / "pending"
         trace_path = cache_dir / f"{run_id}.jsonl"
         if trace_path.exists():
@@ -174,10 +171,8 @@ def test_create_cleaned_trace_removes_screenshot_fields(self):
             }
         )
 
-        sink.close(blocking=False)
-        import time
-
-        time.sleep(0.1)
+        # Finalize trace file synchronously to avoid Windows file locking issues
+        sink._finalize_trace_file_for_upload()
 
         # Create cleaned trace
         cache_dir = Path.home() / ".sentience" / "traces" / "pending"
@@ -223,10 +218,8 @@ def test_create_cleaned_trace_preserves_other_events(self):
             }
         )
 
-        sink.close(blocking=False)
-        import time
-
-        time.sleep(0.1)
+        # Finalize trace file synchronously to avoid Windows file locking issues
+        sink._finalize_trace_file_for_upload()
 
         # Create cleaned trace
         cache_dir = Path.home() / ".sentience" / "traces" / "pending"
@@ -436,10 +429,8 @@ def test_upload_removes_screenshot_base64_from_trace(self):
             }
         )
 
-        sink.close(blocking=False)
-        import time
-
-        time.sleep(0.1)
+        # Finalize trace file synchronously to avoid Windows file locking issues
+        sink._finalize_trace_file_for_upload()
 
         # Mock gateway and upload responses
         mock_upload_urls = {
diff --git a/tests/test_trace_event_builder.py b/tests/test_trace_event_builder.py
@@ -320,3 +320,25 @@ def test_build_step_end_event_with_none_verify_data():
 
     # Verify should be empty dict when verify_data is None
     assert result["verify"] == {}
+
+
+def test_build_snapshot_event_with_step_index():
+    """Test that build_snapshot_event includes step_index when provided.
+
+    This is required for callers whose step_ids are not in 'step-N' format
+    (e.g., UUIDs), which Studio's trace-parser can't parse to extract step_index.
+    """
+    elements = [create_element(1, text="Test element")]
+    snapshot = create_snapshot(elements)
+
+    # Without step_index
+    result_without = TraceEventBuilder.build_snapshot_event(snapshot)
+    assert "step_index" not in result_without
+
+    # With step_index=0
+    result_with_zero = TraceEventBuilder.build_snapshot_event(snapshot, step_index=0)
+    assert result_with_zero["step_index"] == 0
+
+    # With step_index=5
+    result_with_five = TraceEventBuilder.build_snapshot_event(snapshot, step_index=5)
+    assert result_with_five["step_index"] == 5