6363
6464from __future__ import annotations
6565
66+ import asyncio
67+ import difflib
68+ import time
6669import uuid
70+ from dataclasses import dataclass
6771from typing import TYPE_CHECKING , Any
6872
6973from .models import Snapshot , SnapshotOptions
@@ -298,29 +302,23 @@ def assert_(
298302 True if assertion passed, False otherwise
299303 """
300304 outcome = predicate (self ._ctx ())
305+ self ._record_outcome (
306+ outcome = outcome ,
307+ label = label ,
308+ required = required ,
309+ kind = "assert" ,
310+ record_in_step = True ,
311+ )
312+ return outcome .passed
301313
302- record = {
303- "label" : label ,
304- "passed" : outcome .passed ,
305- "required" : required ,
306- "reason" : outcome .reason ,
307- "details" : outcome .details ,
308- }
309- self ._assertions_this_step .append (record )
def check(self, predicate: Predicate, label: str, required: bool = False) -> AssertionHandle:
    """
    Build a deferred assertion for fluent `.once()` / `.eventually()` usage.

    The predicate is NOT evaluated here; it is only stored on the returned
    handle together with this runtime, the label and the required flag.

    Args:
        predicate: Predicate stored on the handle for later evaluation
        label: Label stored on the handle
        required: Required flag stored on the handle (default False)

    Returns:
        AssertionHandle bound to this runtime
    """
    handle = AssertionHandle(
        runtime=self,
        predicate=predicate,
        label=label,
        required=required,
    )
    return handle
324322
325323 def assert_done (
326324 self ,
@@ -342,6 +340,7 @@ def assert_done(
342340 Returns:
343341 True if task is complete (assertion passed), False otherwise
344342 """
343+ # Convenience wrapper for assert_ with required=True
345344 ok = self .assertTrue (predicate , label = label , required = True )
346345 if ok :
347346 self ._task_done = True
@@ -360,6 +359,197 @@ def assert_done(
360359
361360 return ok
362361
def _record_outcome(
    self,
    *,
    outcome: Any,
    label: str,
    required: bool,
    kind: str,
    record_in_step: bool,
    extra: dict[str, Any] | None = None,
) -> None:
    """
    Internal helper: build one assertion record, optionally keep it for
    step_end, and emit it as a "verification" tracer event.

    Args:
        outcome: Object exposing `passed`, `reason` and `details`
        label: Label stored in the record
        required: Required flag stored in the record
        kind: Value of the "kind" field of the emitted event
        record_in_step: When True, the record is appended to
            `self._assertions_this_step`
        extra: Optional keys merged into the record (and thus the event)
    """
    # Work on a copy so the outcome's own details mapping is never mutated.
    info = dict(outcome.details or {})

    # Failure intelligence: for a failed selector-driven assertion with a
    # snapshot available, attach the closest-looking elements (unless the
    # details already carry a "nearest_matches" entry).
    failed = not outcome.passed
    if failed and self.last_snapshot is not None and "selector" in info:
        hints = self._nearest_matches(str(info.get("selector") or ""), limit=3)
        info.setdefault("nearest_matches", hints)

    passed = bool(outcome.passed)
    entry = {
        "label": label,
        "passed": passed,
        "required": required,
        "reason": str(outcome.reason or ""),
        "details": info,
    }
    entry.update(extra or {})

    if record_in_step:
        self._assertions_this_step.append(entry)

    # The event carries "kind" plus every field of the record.
    payload = {"kind": kind, "passed": passed}
    payload.update(entry)
    self.tracer.emit("verification", data=payload, step_id=self.step_id)
404+
def _nearest_matches(self, selector: str, *, limit: int = 3) -> list[dict[str, Any]]:
    """
    Best-effort nearest match suggestions for debugging failed selector assertions.

    Every element of the last snapshot that has a non-blank name (or, failing
    that, text) is scored with the difflib similarity ratio between the
    lowercased selector and that lowercased string.

    Args:
        selector: Selector string taken from the failed assertion
        limit: Maximum number of suggestions; zero or negative yields none

    Returns:
        Up to `limit` dicts, best score first, each with 'id', 'role',
        'text' and 'name' (both cut to 80 characters) and 'score' (rounded
        to 4 decimals). Empty when there is no snapshot, the selector is
        blank, or `limit` is not positive.
    """
    # A negative limit would otherwise turn the slice below into
    # scored[:-k], returning almost every element instead of none.
    if self.last_snapshot is None or limit <= 0:
        return []

    needle = selector.lower().strip()
    if not needle:
        return []

    scored: list[tuple[float, Any]] = []
    for el in self.last_snapshot.elements:
        hay = (getattr(el, "name", None) or getattr(el, "text", None) or "").strip()
        if not hay:
            continue
        ratio = difflib.SequenceMatcher(None, needle, hay.lower()).ratio()
        scored.append((ratio, el))

    # Sort on the score only (stable), so elements themselves are never compared.
    scored.sort(key=lambda pair: pair[0], reverse=True)
    return [
        {
            "id": getattr(el, "id", None),
            "role": getattr(el, "role", None),
            "text": (getattr(el, "text", "") or "")[:80],
            "name": (getattr(el, "name", "") or "")[:80],
            "score": round(float(ratio), 4),
        }
        for ratio, el in scored[:limit]
    ]
437+
def get_assertions_for_step_end(self) -> dict[str, Any]:
    """
    Collect assertion data for inclusion in step_end.data.verify.signals.

    Returns:
        Dictionary whose 'assertions' key holds a copy of the assertions
        accumulated for the current step. 'task_done' and 'task_done_label'
        are added only when the task has been marked done.
    """
    payload: dict[str, Any] = {"assertions": list(self._assertions_this_step)}
    if not self._task_done:
        return payload

    payload.update(task_done=True, task_done_label=self._task_done_label)
    return payload
454+
def flush_assertions(self) -> list[dict[str, Any]]:
    """
    Return the current step's assertions and start a fresh, empty list.

    Returns:
        A copy of the assertions accumulated so far for the current step
    """
    drained = list(self._assertions_this_step)
    # Rebind rather than clear in place, leaving the previous list untouched.
    self._assertions_this_step = []
    return drained
462+
@property
def is_task_done(self) -> bool:
    """Whether the task is currently flagged as done (the `_task_done` flag)."""
    done = self._task_done
    return done
467+
def reset_task_done(self) -> None:
    """Clear the task-done flag and its label (for multi-task runs)."""
    self._task_done, self._task_done_label = False, None
472+
def all_assertions_passed(self) -> bool:
    """Return True when no assertion recorded in the current step failed (or none exist)."""
    for entry in self._assertions_this_step:
        if not entry["passed"]:
            return False
    return True
476+
def required_assertions_passed(self) -> bool:
    """Return True when every required assertion in the current step passed (or none are required)."""
    mandatory = [entry for entry in self._assertions_this_step if entry.get("required")]
    for entry in mandatory:
        if not entry["passed"]:
            return False
    return True
481+
482+
@dataclass
class AssertionHandle:
    """
    Deferred assertion bound to a runtime.

    Nothing is evaluated on construction; call `.once()` for a single
    evaluation or await `.eventually()` to retry until a timeout.
    """

    # Runtime used to take snapshots, build the predicate context and record outcomes.
    runtime: AgentRuntime
    # Callable invoked with the runtime context; its result must expose `.passed`.
    predicate: Predicate
    # Label attached to every recorded outcome.
    label: str
    # Forwarded as the `required` flag of every recorded outcome.
    required: bool = False

    def once(self) -> bool:
        """Evaluate a single time by delegating to `runtime.assert_`."""
        return self.runtime.assert_(
            self.predicate,
            label=self.label,
            required=self.required,
        )

    async def eventually(
        self,
        *,
        timeout_s: float = 10.0,
        poll_s: float = 0.25,
        snapshot_kwargs: dict[str, Any] | None = None,
    ) -> bool:
        """
        Re-evaluate the predicate until it passes or the timeout elapses.

        Each attempt takes a fresh snapshot, evaluates the predicate and emits
        a verification event that is NOT accumulated for step_end. The
        deciding attempt is then recorded once more with `final=True` (plus
        `timeout=True` on failure), and that record IS accumulated.

        Args:
            timeout_s: Seconds after which a failing attempt becomes final
            poll_s: Seconds to sleep between attempts
            snapshot_kwargs: Keyword arguments forwarded to `runtime.snapshot`

        Returns:
            True if the predicate passed, False if the timeout was reached
        """
        give_up_at = time.monotonic() + timeout_s
        attempt = 0

        def report(result: Any, *, accumulate: bool, **flags: Any) -> None:
            # Single funnel for every record; `attempt` is read at call time.
            self.runtime._record_outcome(
                outcome=result,
                label=self.label,
                required=self.required,
                kind="assert",
                record_in_step=accumulate,
                extra={"eventually": True, "attempt": attempt, **flags},
            )

        while True:
            attempt += 1
            await self.runtime.snapshot(**(snapshot_kwargs or {}))
            result = self.predicate(self.runtime._ctx())

            # Per-attempt event (not recorded in step_end).
            report(result, accumulate=False)

            if result.passed:
                # Final success, accumulated once.
                report(result, accumulate=True, final=True)
                return True

            if time.monotonic() >= give_up_at:
                # Final failure, accumulated once.
                report(result, accumulate=True, final=True, timeout=True)
                return False

            await asyncio.sleep(poll_s)
552+
363553 def get_assertions_for_step_end (self ) -> dict [str , Any ]:
364554 """
365555 Get assertions data for inclusion in step_end.data.verify.signals.
0 commit comments