7171from typing import TYPE_CHECKING , Any
7272
7373from .models import Snapshot , SnapshotOptions
74- from .verification import AssertContext , Predicate
74+ from .verification import AssertContext , AssertOutcome , Predicate
7575
7676if TYPE_CHECKING :
7777 from playwright .async_api import Page
@@ -341,7 +341,7 @@ def assert_done(
341341 True if task is complete (assertion passed), False otherwise
342342 """
343343 # Convenience wrapper for assert_ with required=True
344- ok = self .assert_ (predicate , label = label , required = True )
344+ ok = self .assertTrue (predicate , label = label , required = True )
345345 if ok :
346346 self ._task_done = True
347347 self ._task_done_label = label
@@ -496,7 +496,12 @@ async def eventually(
496496 * ,
497497 timeout_s : float = 10.0 ,
498498 poll_s : float = 0.25 ,
499+ min_confidence : float | None = None ,
500+ max_snapshot_attempts : int = 3 ,
499501 snapshot_kwargs : dict [str , Any ] | None = None ,
502+ vision_provider : Any | None = None ,
503+ vision_system_prompt : str | None = None ,
504+ vision_user_prompt : str | None = None ,
500505 ) -> bool :
501506 """
502507 Retry until the predicate passes or timeout is reached.
@@ -506,11 +511,165 @@ async def eventually(
506511 """
507512 deadline = time .monotonic () + timeout_s
508513 attempt = 0
514+ snapshot_attempt = 0
509515 last_outcome = None
510516
511517 while True :
512518 attempt += 1
513519 await self .runtime .snapshot (** (snapshot_kwargs or {}))
520+ snapshot_attempt += 1
521+
522+ # Optional: gate predicate evaluation on snapshot confidence.
523+ # If diagnostics are missing, we don't block (backward compatible).
524+ confidence = None
525+ diagnostics = None
526+ if self .runtime .last_snapshot is not None :
527+ diagnostics = getattr (self .runtime .last_snapshot , "diagnostics" , None )
528+ if diagnostics is not None :
529+ confidence = getattr (diagnostics , "confidence" , None )
530+
531+ if (
532+ min_confidence is not None
533+ and confidence is not None
534+ and isinstance (confidence , (int , float ))
535+ and confidence < min_confidence
536+ ):
537+ last_outcome = AssertOutcome (
538+ passed = False ,
539+ reason = f"Snapshot confidence { confidence :.3f} < min_confidence { min_confidence :.3f} " ,
540+ details = {
541+ "reason_code" : "snapshot_low_confidence" ,
542+ "confidence" : confidence ,
543+ "min_confidence" : min_confidence ,
544+ "snapshot_attempt" : snapshot_attempt ,
545+ "diagnostics" : (
546+ diagnostics .model_dump ()
547+ if hasattr (diagnostics , "model_dump" )
548+ else diagnostics
549+ ),
550+ },
551+ )
552+
553+ # Emit attempt event (not recorded in step_end)
554+ self .runtime ._record_outcome (
555+ outcome = last_outcome ,
556+ label = self .label ,
557+ required = self .required ,
558+ kind = "assert" ,
559+ record_in_step = False ,
560+ extra = {
561+ "eventually" : True ,
562+ "attempt" : attempt ,
563+ "snapshot_attempt" : snapshot_attempt ,
564+ },
565+ )
566+
567+ if snapshot_attempt >= max_snapshot_attempts :
568+ # Optional: vision fallback as last resort (Phase 2-lite).
569+ # This keeps the assertion surface invariant; only the perception layer changes.
570+ if (
571+ vision_provider is not None
572+ and getattr (vision_provider , "supports_vision" , lambda : False )()
573+ ):
574+ try :
575+ import base64
576+
577+ png_bytes = await self .runtime .backend .screenshot_png ()
578+ image_b64 = base64 .b64encode (png_bytes ).decode ("utf-8" )
579+
580+ sys_prompt = vision_system_prompt or (
581+ "You are a strict visual verifier. Answer only YES or NO."
582+ )
583+ user_prompt = vision_user_prompt or (
584+ f"Given the screenshot, is the following condition satisfied?\n \n { self .label } \n \n Answer YES or NO."
585+ )
586+
587+ resp = vision_provider .generate_with_image (
588+ sys_prompt ,
589+ user_prompt ,
590+ image_base64 = image_b64 ,
591+ temperature = 0.0 ,
592+ )
593+ text = (resp .content or "" ).strip ().lower ()
594+ passed = text .startswith ("yes" )
595+
596+ final_outcome = AssertOutcome (
597+ passed = passed ,
598+ reason = "vision_fallback_yes" if passed else "vision_fallback_no" ,
599+ details = {
600+ "reason_code" : (
601+ "vision_fallback_pass" if passed else "vision_fallback_fail"
602+ ),
603+ "vision_response" : resp .content ,
604+ "min_confidence" : min_confidence ,
605+ "snapshot_attempts" : snapshot_attempt ,
606+ },
607+ )
608+ self .runtime ._record_outcome (
609+ outcome = final_outcome ,
610+ label = self .label ,
611+ required = self .required ,
612+ kind = "assert" ,
613+ record_in_step = True ,
614+ extra = {
615+ "eventually" : True ,
616+ "attempt" : attempt ,
617+ "snapshot_attempt" : snapshot_attempt ,
618+ "final" : True ,
619+ "vision_fallback" : True ,
620+ },
621+ )
622+ return passed
623+ except Exception as e :
624+ # If vision fallback fails, fall through to snapshot_exhausted.
625+ last_outcome .details ["vision_error" ] = str (e )
626+
627+ final_outcome = AssertOutcome (
628+ passed = False ,
629+ reason = f"Snapshot exhausted after { snapshot_attempt } attempt(s) below min_confidence { min_confidence :.3f} " ,
630+ details = {
631+ "reason_code" : "snapshot_exhausted" ,
632+ "confidence" : confidence ,
633+ "min_confidence" : min_confidence ,
634+ "snapshot_attempts" : snapshot_attempt ,
635+ "diagnostics" : last_outcome .details .get ("diagnostics" ),
636+ },
637+ )
638+ self .runtime ._record_outcome (
639+ outcome = final_outcome ,
640+ label = self .label ,
641+ required = self .required ,
642+ kind = "assert" ,
643+ record_in_step = True ,
644+ extra = {
645+ "eventually" : True ,
646+ "attempt" : attempt ,
647+ "snapshot_attempt" : snapshot_attempt ,
648+ "final" : True ,
649+ "exhausted" : True ,
650+ },
651+ )
652+ return False
653+
654+ if time .monotonic () >= deadline :
655+ self .runtime ._record_outcome (
656+ outcome = last_outcome ,
657+ label = self .label ,
658+ required = self .required ,
659+ kind = "assert" ,
660+ record_in_step = True ,
661+ extra = {
662+ "eventually" : True ,
663+ "attempt" : attempt ,
664+ "snapshot_attempt" : snapshot_attempt ,
665+ "final" : True ,
666+ "timeout" : True ,
667+ },
668+ )
669+ return False
670+
671+ await asyncio .sleep (poll_s )
672+ continue
514673
515674 last_outcome = self .predicate (self .runtime ._ctx ())
516675
@@ -549,66 +708,3 @@ async def eventually(
549708 return False
550709
551710 await asyncio .sleep (poll_s )
552-
def get_assertions_for_step_end(self) -> dict[str, Any]:
    """
    Build the assertion payload for step_end.data.verify.signals.

    The payload always carries a shallow copy of the assertion records
    accumulated for the current step. The task-completion keys are only
    added once the task has been flagged as done.

    Returns:
        Dictionary with an 'assertions' key, plus 'task_done' and
        'task_done_label' when the task is marked done
    """
    payload: dict[str, Any] = {"assertions": self._assertions_this_step.copy()}

    if not self._task_done:
        return payload

    # Task completion is reported only when it actually happened.
    payload["task_done"] = True
    payload["task_done_label"] = self._task_done_label
    return payload
572-
def flush_assertions(self) -> list[dict[str, Any]]:
    """
    Drain the assertion records collected for the current step.

    Intended to be called at step end: the accumulated records are
    handed back for the step_end event and the internal buffer is
    replaced with a fresh empty list for the next step.

    Returns:
        List of assertion records from this step
    """
    # Snapshot the buffer and reset it in a single statement.
    drained, self._assertions_this_step = self._assertions_this_step.copy(), []
    return drained
586-
@property
def is_task_done(self) -> bool:
    """Whether the task-done flag is currently set (see assert_done())."""
    return self._task_done
591-
def reset_task_done(self) -> None:
    """Clear the task-done flag and its label (for multi-task runs)."""
    self._task_done, self._task_done_label = False, None
596-
def all_assertions_passed(self) -> bool:
    """
    Report whether every assertion recorded in the current step passed.

    Returns:
        True if all assertions passed (or no assertions were made)
    """
    for record in self._assertions_this_step:
        # Stop at the first failing record, as all() would.
        if not record["passed"]:
            return False
    return True
605-
def required_assertions_passed(self) -> bool:
    """
    Report whether every required assertion in the current step passed.

    Records without a truthy "required" entry are ignored.

    Returns:
        True if all required assertions passed (or there are none)
    """
    # Collect the required records first, then look for any failure.
    mandatory = [rec for rec in self._assertions_this_step if rec.get("required")]
    return not any(not rec["passed"] for rec in mandatory)
0 commit comments