P2 with mutation observer

SentienceDEV · SentienceDEV · commit a421c68975dd · 2026-01-14T21:08:10.000-08:00
diff --git a/sentience/agent_runtime.py b/sentience/agent_runtime.py
@@ -71,7 +71,7 @@
 from typing import TYPE_CHECKING, Any
 
 from .models import Snapshot, SnapshotOptions
-from .verification import AssertContext, Predicate
+from .verification import AssertContext, AssertOutcome, Predicate
 
 if TYPE_CHECKING:
     from playwright.async_api import Page
@@ -341,7 +341,7 @@ def assert_done(
             True if task is complete (assertion passed), False otherwise
         """
         # Convenience wrapper for assert_ with required=True
-        ok = self.assert_(predicate, label=label, required=True)
+        ok = self.assert_(predicate, label=label, required=True)
         if ok:
             self._task_done = True
             self._task_done_label = label
@@ -496,7 +496,12 @@ async def eventually(
         *,
         timeout_s: float = 10.0,
         poll_s: float = 0.25,
+        min_confidence: float | None = None,
+        max_snapshot_attempts: int = 3,
         snapshot_kwargs: dict[str, Any] | None = None,
+        vision_provider: Any | None = None,
+        vision_system_prompt: str | None = None,
+        vision_user_prompt: str | None = None,
     ) -> bool:
         """
         Retry until the predicate passes or timeout is reached.
@@ -506,11 +511,165 @@ async def eventually(
         """
         deadline = time.monotonic() + timeout_s
         attempt = 0
+        snapshot_attempt = 0
         last_outcome = None
 
         while True:
             attempt += 1
             await self.runtime.snapshot(**(snapshot_kwargs or {}))
+            snapshot_attempt += 1
+
+            # Optional: gate predicate evaluation on snapshot confidence.
+            # If diagnostics are missing, we don't block (backward compatible).
+            confidence = None
+            diagnostics = None
+            if self.runtime.last_snapshot is not None:
+                diagnostics = getattr(self.runtime.last_snapshot, "diagnostics", None)
+                if diagnostics is not None:
+                    confidence = getattr(diagnostics, "confidence", None)
+
+            if (
+                min_confidence is not None
+                and confidence is not None
+                and isinstance(confidence, (int, float))
+                and confidence < min_confidence
+            ):
+                last_outcome = AssertOutcome(
+                    passed=False,
+                    reason=f"Snapshot confidence {confidence:.3f} < min_confidence {min_confidence:.3f}",
+                    details={
+                        "reason_code": "snapshot_low_confidence",
+                        "confidence": confidence,
+                        "min_confidence": min_confidence,
+                        "snapshot_attempt": snapshot_attempt,
+                        "diagnostics": (
+                            diagnostics.model_dump()
+                            if hasattr(diagnostics, "model_dump")
+                            else diagnostics
+                        ),
+                    },
+                )
+
+                # Emit attempt event (not recorded in step_end)
+                self.runtime._record_outcome(
+                    outcome=last_outcome,
+                    label=self.label,
+                    required=self.required,
+                    kind="assert",
+                    record_in_step=False,
+                    extra={
+                        "eventually": True,
+                        "attempt": attempt,
+                        "snapshot_attempt": snapshot_attempt,
+                    },
+                )
+
+                if snapshot_attempt >= max_snapshot_attempts:
+                    # Optional: vision fallback as last resort (Phase 2-lite).
+                    # This keeps the assertion surface invariant; only the perception layer changes.
+                    if (
+                        vision_provider is not None
+                        and getattr(vision_provider, "supports_vision", lambda: False)()
+                    ):
+                        try:
+                            import base64
+
+                            png_bytes = await self.runtime.backend.screenshot_png()
+                            image_b64 = base64.b64encode(png_bytes).decode("utf-8")
+
+                            sys_prompt = vision_system_prompt or (
+                                "You are a strict visual verifier. Answer only YES or NO."
+                            )
+                            user_prompt = vision_user_prompt or (
+                                f"Given the screenshot, is the following condition satisfied?\n\n{self.label}\n\nAnswer YES or NO."
+                            )
+
+                            resp = vision_provider.generate_with_image(
+                                sys_prompt,
+                                user_prompt,
+                                image_base64=image_b64,
+                                temperature=0.0,
+                            )
+                            text = (resp.content or "").strip().lower()
+                            passed = text.startswith("yes")
+
+                            final_outcome = AssertOutcome(
+                                passed=passed,
+                                reason="vision_fallback_yes" if passed else "vision_fallback_no",
+                                details={
+                                    "reason_code": (
+                                        "vision_fallback_pass" if passed else "vision_fallback_fail"
+                                    ),
+                                    "vision_response": resp.content,
+                                    "min_confidence": min_confidence,
+                                    "snapshot_attempts": snapshot_attempt,
+                                },
+                            )
+                            self.runtime._record_outcome(
+                                outcome=final_outcome,
+                                label=self.label,
+                                required=self.required,
+                                kind="assert",
+                                record_in_step=True,
+                                extra={
+                                    "eventually": True,
+                                    "attempt": attempt,
+                                    "snapshot_attempt": snapshot_attempt,
+                                    "final": True,
+                                    "vision_fallback": True,
+                                },
+                            )
+                            return passed
+                        except Exception as e:
+                            # If vision fallback fails, fall through to snapshot_exhausted.
+                            last_outcome.details["vision_error"] = str(e)
+
+                    final_outcome = AssertOutcome(
+                        passed=False,
+                        reason=f"Snapshot exhausted after {snapshot_attempt} attempt(s) below min_confidence {min_confidence:.3f}",
+                        details={
+                            "reason_code": "snapshot_exhausted",
+                            "confidence": confidence,
+                            "min_confidence": min_confidence,
+                            "snapshot_attempts": snapshot_attempt,
+                            "diagnostics": last_outcome.details.get("diagnostics"),
+                        },
+                    )
+                    self.runtime._record_outcome(
+                        outcome=final_outcome,
+                        label=self.label,
+                        required=self.required,
+                        kind="assert",
+                        record_in_step=True,
+                        extra={
+                            "eventually": True,
+                            "attempt": attempt,
+                            "snapshot_attempt": snapshot_attempt,
+                            "final": True,
+                            "exhausted": True,
+                        },
+                    )
+                    return False
+
+                if time.monotonic() >= deadline:
+                    self.runtime._record_outcome(
+                        outcome=last_outcome,
+                        label=self.label,
+                        required=self.required,
+                        kind="assert",
+                        record_in_step=True,
+                        extra={
+                            "eventually": True,
+                            "attempt": attempt,
+                            "snapshot_attempt": snapshot_attempt,
+                            "final": True,
+                            "timeout": True,
+                        },
+                    )
+                    return False
+
+                await asyncio.sleep(poll_s)
+                continue
 
             last_outcome = self.predicate(self.runtime._ctx())
 
@@ -549,66 +708,3 @@ async def eventually(
                 return False
 
             await asyncio.sleep(poll_s)
-
-    def get_assertions_for_step_end(self) -> dict[str, Any]:
-        """
-        Get assertions data for inclusion in step_end.data.verify.signals.
-
-        This is called when building the step_end event to include
-        assertion results in the trace.
-
-        Returns:
-            Dictionary with 'assertions', 'task_done', 'task_done_label' keys
-        """
-        result: dict[str, Any] = {
-            "assertions": self._assertions_this_step.copy(),
-        }
-
-        if self._task_done:
-            result["task_done"] = True
-            result["task_done_label"] = self._task_done_label
-
-        return result
-
-    def flush_assertions(self) -> list[dict[str, Any]]:
-        """
-        Get and clear assertions for current step.
-
-        Call this at step end to get accumulated assertions
-        for the step_end event, then clear for next step.
-
-        Returns:
-            List of assertion records from this step
-        """
-        assertions = self._assertions_this_step.copy()
-        self._assertions_this_step = []
-        return assertions
-
-    @property
-    def is_task_done(self) -> bool:
-        """Check if task has been marked as done via assert_done()."""
-        return self._task_done
-
-    def reset_task_done(self) -> None:
-        """Reset task_done state (for multi-task runs)."""
-        self._task_done = False
-        self._task_done_label = None
-
-    def all_assertions_passed(self) -> bool:
-        """
-        Check if all assertions in current step passed.
-
-        Returns:
-            True if all assertions passed (or no assertions made)
-        """
-        return all(a["passed"] for a in self._assertions_this_step)
-
-    def required_assertions_passed(self) -> bool:
-        """
-        Check if all required assertions in current step passed.
-
-        Returns:
-            True if all required assertions passed (or no required assertions)
-        """
-        required = [a for a in self._assertions_this_step if a.get("required")]
-        return all(a["passed"] for a in required)
diff --git a/sentience/models.py b/sentience/models.py
@@ -2,6 +2,8 @@
 Pydantic models for Sentience SDK - matches spec/snapshot.schema.json
 """
 
+from __future__ import annotations
+
 from dataclasses import dataclass
 from typing import Any, Literal
 
@@ -90,7 +92,23 @@ class Element(BaseModel):
     # Layout-derived metadata (internal-only in v0, not exposed in API responses)
     # Per ChatGPT feedback: explicitly optional to prevent users assuming layout is always present
     # Note: This field is marked with skip_serializing_if in Rust, so it won't appear in API responses
-    layout: "LayoutHints | None" = None
+    layout: LayoutHints | None = None
 
 
+class SnapshotDiagnosticsMetrics(BaseModel):
+    ready_state: str | None = None
+    quiet_ms: float | None = None
+    node_count: int | None = None
+    interactive_count: int | None = None
+    raw_elements_count: int | None = None
+
+
+class SnapshotDiagnostics(BaseModel):
+    """Runtime stability/debug information (reserved for diagnostics, not ML metadata)."""
+
+    confidence: float | None = None
+    reasons: list[str] = []
+    metrics: SnapshotDiagnosticsMetrics | None = None
+
+
 class GridPosition(BaseModel):
@@ -153,6 +171,8 @@ class Snapshot(BaseModel):
     requires_license: bool | None = None
     # Phase 2: Dominant group key for ordinal selection
     dominant_group_key: str | None = None  # The most common group_key (main content group)
+    # Phase 2: Runtime stability/debug info (confidence/reasons/metrics)
+    diagnostics: SnapshotDiagnostics | None = None
 
     def save(self, filepath: str) -> None:
         """Save snapshot as JSON file"""
@@ -290,7 +310,7 @@ def get_grid_bounds(self, grid_id: int | None = None) -> list[GridInfo]:
         return grid_infos
 
     @staticmethod
-    def _infer_grid_label(elements: list["Element"]) -> str | None:
+    def _infer_grid_label(elements: list[Element]) -> str | None:
         """
         Infer grid label from element patterns using text fingerprinting (best-effort heuristic).
 
@@ -685,7 +705,7 @@ class StorageState(BaseModel):
     )
 
     @classmethod
-    def from_dict(cls, data: dict) -> "StorageState":
+    def from_dict(cls, data: dict) -> StorageState:
         """
         Create StorageState from dictionary (e.g., loaded from JSON).
 
diff --git a/sentience/snapshot.py b/sentience/snapshot.py
@@ -29,6 +29,13 @@ def _build_snapshot_payload(
 
     Shared helper used by both sync and async snapshot implementations.
     """
+    diagnostics = raw_result.get("diagnostics") or {}
+    client_metrics = None
+    try:
+        client_metrics = diagnostics.get("metrics")
+    except Exception:
+        client_metrics = None
+
     return {
         "raw_elements": raw_result.get("raw_elements", []),
         "url": raw_result.get("url", ""),
@@ -38,6 +45,7 @@ def _build_snapshot_payload(
             "limit": options.limit,
             "filter": options.filter.model_dump() if options.filter else None,
         },
+        "client_metrics": client_metrics,
     }
 
 
@@ -133,6 +141,8 @@ def _merge_api_result_with_local(
         "screenshot": raw_result.get("screenshot"),  # Keep local screenshot
         "screenshot_format": raw_result.get("screenshot_format"),
         "error": api_result.get("error"),
+        # Phase 2: Runtime stability/debug info
+        "diagnostics": api_result.get("diagnostics", raw_result.get("diagnostics")),
         # Phase 2: Ordinal support - dominant group key from Gateway
         "dominant_group_key": api_result.get("dominant_group_key"),
     }
diff --git a/tests/test_agent_runtime.py b/tests/test_agent_runtime.py