Add rewind e2e tests; fix in-memory backend rewind of purged sub-orchestrations

andystaples · andystaples · commit 380b2fbf92d8 · 2026-03-26T17:17:25.000-06:00
diff --git a/durabletask/testing/in_memory_backend.py b/durabletask/testing/in_memory_backend.py
@@ -1667,11 +1667,10 @@ def _process_rewind_orchestration_action(
         # Identify sub-orchestrations that were created but did not
         # complete successfully — they need to be recursively rewound.
         completed_sub_orch_task_ids: set[int] = set()
-        created_sub_orchs: dict[int, str] = {}
+        created_sub_orch_events: dict[int, pb.HistoryEvent] = {}
         for event in new_history:
             if event.HasField("subOrchestrationInstanceCreated"):
-                created_sub_orchs[event.eventId] = (
-                    event.subOrchestrationInstanceCreated.instanceId)
+                created_sub_orch_events[event.eventId] = event
             elif event.HasField("subOrchestrationInstanceCompleted"):
                 completed_sub_orch_task_ids.add(
                     event.subOrchestrationInstanceCompleted.taskScheduledId)
@@ -1684,11 +1683,22 @@ def _process_rewind_orchestration_action(
                     reason = event.executionRewound.reason.value
                 break
 
-        # Recursively rewind failed sub-orchestrations.
-        for task_id, sub_instance_id in created_sub_orchs.items():
+        # Recursively rewind failed sub-orchestrations.  If the sub was
+        # purged (no longer in _instances), re-create it from the
+        # subOrchestrationInstanceCreated event so it runs fresh.
+        for task_id, event in created_sub_orch_events.items():
             if task_id not in completed_sub_orch_task_ids:
+                sub_info = event.subOrchestrationInstanceCreated
+                sub_instance_id = sub_info.instanceId
                 sub_instance = self._instances.get(sub_instance_id)
-                if (sub_instance and sub_instance.status == pb.ORCHESTRATION_STATUS_FAILED):
+                if sub_instance is None:
+                    # Sub-orchestration was purged — re-create it.
+                    sub_name = sub_info.name
+                    sub_input = sub_info.input.value if sub_info.HasField("input") else None
+                    sub_version = sub_info.version.value if sub_info.HasField("version") else None
+                    self._create_instance_internal(
+                        sub_instance_id, sub_name, sub_input, version=sub_version)
+                elif sub_instance.status == pb.ORCHESTRATION_STATUS_FAILED:
                     self._prepare_rewind(sub_instance, reason)
                 self._watch_sub_orchestration(
                     instance.instance_id, sub_instance_id, task_id)
diff --git a/tests/durabletask-azuremanaged/test_dts_rewind_e2e.py b/tests/durabletask-azuremanaged/test_dts_rewind_e2e.py
@@ -88,14 +88,15 @@ def test_rewind_preserves_successful_results():
     rewind should re-execute only the failed activity while the successful
     result is replayed from history."""
     call_tracker: dict[str, int] = {"first": 0, "second": 0}
+    should_fail_second = True
 
     def first_activity(_: task.ActivityContext, input: str) -> str:
         call_tracker["first"] += 1
         return f"first:{input}"
 
     def second_activity(_: task.ActivityContext, input: str) -> str:
         call_tracker["second"] += 1
-        if call_tracker["second"] == 1:
+        if should_fail_second:
             raise RuntimeError("Temporary failure")
         return f"second:{input}"
 
@@ -120,7 +121,8 @@ def orchestrator(ctx: task.OrchestrationContext, input: str):
         assert state is not None
         assert state.runtime_status == client.OrchestrationStatus.FAILED
 
-        # Rewind – second_activity will now succeed on retry.
+        # Fix second_activity so it now succeeds, then rewind.
+        should_fail_second = False
         c.rewind_orchestration(instance_id, reason="retry")
         state = c.wait_for_orchestration_completion(instance_id, timeout=30)
 
@@ -130,8 +132,8 @@ def orchestrator(ctx: task.OrchestrationContext, input: str):
     assert state.failure_details is None
     # first_activity should NOT be re-executed – its result is replayed.
     assert call_tracker["first"] == 1
-    # second_activity was called twice (once failed, once succeeded).
-    assert call_tracker["second"] == 2
+    # second_activity ran at least twice: it failed first, then succeeded after the fix.
+    assert call_tracker["second"] >= 2
 
 
 def test_rewind_not_found():
@@ -210,6 +212,118 @@ def parent_orchestrator(ctx: task.OrchestrationContext, input: str):
     assert sub_call_count == 2
 
 
+def test_rewind_purged_sub_orchestration():
+    """A purged sub-orchestration is re-run when the parent is rewound.
+
+    Flow: parent orchestrator -> calls sub-orchestrator -> sub-orchestrator
+    fails -> parent fails -> client purges the sub-orchestration -> client
+    rewinds the parent -> parent re-schedules the sub-orchestration which
+    now succeeds -> parent completes.
+    """
+    child_call_count = 0
+
+    def child_activity(_: task.ActivityContext, input: str) -> str:
+        nonlocal child_call_count
+        child_call_count += 1
+        if child_call_count == 1:
+            raise RuntimeError("Child failure")
+        return f"child:{input}"
+
+    def child_orchestrator(ctx: task.OrchestrationContext, input: str):
+        result = yield ctx.call_activity(child_activity, input=input)
+        return result
+
+    def parent_orchestrator(ctx: task.OrchestrationContext, input: str):
+        result = yield ctx.call_sub_orchestrator(
+            child_orchestrator, input=input, instance_id="sub-orch-to-purge")
+        return f"parent:{result}"
+
+    with DurableTaskSchedulerWorker(host_address=endpoint, secure_channel=True,
+                                    taskhub=taskhub_name, token_credential=None) as w:
+        w.add_orchestrator(parent_orchestrator)
+        w.add_orchestrator(child_orchestrator)
+        w.add_activity(child_activity)
+        w.start()
+
+        c = DurableTaskSchedulerClient(host_address=endpoint, secure_channel=True,
+                                       taskhub=taskhub_name, token_credential=None)
+        instance_id = c.schedule_new_orchestration(
+            parent_orchestrator, input="data")
+        state = c.wait_for_orchestration_completion(instance_id, timeout=30)
+
+        # Parent should fail because child failed.
+        assert state is not None
+        assert state.runtime_status == client.OrchestrationStatus.FAILED
+
+        # Purge the sub-orchestration so it must be completely re-run.
+        c.purge_orchestration("sub-orch-to-purge")
+
+        # Rewind the parent – child will be re-scheduled and succeed.
+        c.rewind_orchestration(instance_id, reason="purge and retry")
+        state = c.wait_for_orchestration_completion(instance_id, timeout=30)
+
+    assert state is not None
+    assert state.runtime_status == client.OrchestrationStatus.COMPLETED
+    assert state.serialized_output == json.dumps("parent:child:data")
+    assert child_call_count == 2
+
+
+def test_rewind_does_not_rerun_successful_activities():
+    """Successful activities must not be re-executed during rewind.
+
+    The orchestration calls two activities in sequence.  The first
+    succeeds and the second fails.  After rewind, only the failed
+    activity is retried; the successful activity's result is replayed
+    from history and its body is never called again.
+    """
+    success_call_count = 0
+    fail_call_count = 0
+
+    def success_activity(_: task.ActivityContext, input: str) -> str:
+        nonlocal success_call_count
+        success_call_count += 1
+        return f"ok:{input}"
+
+    def fail_activity(_: task.ActivityContext, input: str) -> str:
+        nonlocal fail_call_count
+        fail_call_count += 1
+        if fail_call_count == 1:
+            raise RuntimeError("Temporary failure")
+        return f"recovered:{input}"
+
+    def orchestrator(ctx: task.OrchestrationContext, input: str):
+        r1 = yield ctx.call_activity(success_activity, input=input)
+        r2 = yield ctx.call_activity(fail_activity, input=input)
+        return [r1, r2]
+
+    with DurableTaskSchedulerWorker(host_address=endpoint, secure_channel=True,
+                                    taskhub=taskhub_name, token_credential=None) as w:
+        w.add_orchestrator(orchestrator)
+        w.add_activity(success_activity)
+        w.add_activity(fail_activity)
+        w.start()
+
+        c = DurableTaskSchedulerClient(host_address=endpoint, secure_channel=True,
+                                       taskhub=taskhub_name, token_credential=None)
+        instance_id = c.schedule_new_orchestration(orchestrator, input="v")
+        state = c.wait_for_orchestration_completion(instance_id, timeout=30)
+
+        assert state is not None
+        assert state.runtime_status == client.OrchestrationStatus.FAILED
+
+        # Rewind – only the failed activity should be retried.
+        c.rewind_orchestration(instance_id, reason="retry")
+        state = c.wait_for_orchestration_completion(instance_id, timeout=30)
+
+    assert state is not None
+    assert state.runtime_status == client.OrchestrationStatus.COMPLETED
+    assert state.serialized_output == json.dumps(["ok:v", "recovered:v"])
+    # The successful activity must have been called exactly once.
+    assert success_call_count == 1
+    # The failing activity was called twice (once failed, once succeeded).
+    assert fail_call_count == 2
+
+
 def test_rewind_without_reason():
     """Rewind should work when no reason is provided."""
     call_count = 0
diff --git a/tests/durabletask/test_rewind_e2e.py b/tests/durabletask/test_rewind_e2e.py
@@ -94,14 +94,15 @@ def test_rewind_preserves_successful_results():
     _reset_counters()
 
     call_tracker: dict[str, int] = {"first": 0, "second": 0}
+    should_fail_second = True
 
     def first_activity(_: task.ActivityContext, input: str) -> str:
         call_tracker["first"] += 1
         return f"first:{input}"
 
     def second_activity(_: task.ActivityContext, input: str) -> str:
         call_tracker["second"] += 1
-        if call_tracker["second"] == 1:
+        if should_fail_second:
             raise RuntimeError("Temporary failure")
         return f"second:{input}"
 
@@ -124,7 +125,8 @@ def orchestrator(ctx: task.OrchestrationContext, input: str):
         assert state is not None
         assert state.runtime_status == client.OrchestrationStatus.FAILED
 
-        # Rewind – second_activity will now succeed on retry.
+        # Fix second_activity so it now succeeds, then rewind.
+        should_fail_second = False
         c.rewind_orchestration(instance_id, reason="retry")
         state = c.wait_for_orchestration_completion(instance_id, timeout=30)
 
@@ -134,8 +136,8 @@ def orchestrator(ctx: task.OrchestrationContext, input: str):
     assert state.failure_details is None
     # first_activity should NOT be re-executed – its result is replayed.
     assert call_tracker["first"] == 1
-    # second_activity was called twice (once failed, once succeeded).
-    assert call_tracker["second"] == 2
+    # second_activity ran at least twice: it failed first, then succeeded after the fix.
+    assert call_tracker["second"] >= 2
 
 
 def test_rewind_not_found():
@@ -247,6 +249,114 @@ def orchestrator(ctx: task.OrchestrationContext, _):
     assert state.serialized_output == json.dumps("ok")
 
 
+def test_rewind_purged_sub_orchestration():
+    """A purged sub-orchestration is re-run when the parent is rewound.
+
+    Flow: parent orchestrator -> calls sub-orchestrator -> sub-orchestrator
+    fails -> parent fails -> client purges the sub-orchestration -> client
+    rewinds the parent -> parent re-schedules the sub-orchestration which
+    now succeeds -> parent completes.
+    """
+    child_call_count = 0
+
+    def child_activity(_: task.ActivityContext, input: str) -> str:
+        nonlocal child_call_count
+        child_call_count += 1
+        if child_call_count == 1:
+            raise RuntimeError("Child failure")
+        return f"child:{input}"
+
+    def child_orchestrator(ctx: task.OrchestrationContext, input: str):
+        result = yield ctx.call_activity(child_activity, input=input)
+        return result
+
+    def parent_orchestrator(ctx: task.OrchestrationContext, input: str):
+        result = yield ctx.call_sub_orchestrator(
+            child_orchestrator, input=input, instance_id="sub-orch-to-purge")
+        return f"parent:{result}"
+
+    with worker.TaskHubGrpcWorker(host_address=HOST) as w:
+        w.add_orchestrator(parent_orchestrator)
+        w.add_orchestrator(child_orchestrator)
+        w.add_activity(child_activity)
+        w.start()
+
+        c = client.TaskHubGrpcClient(host_address=HOST)
+        instance_id = c.schedule_new_orchestration(
+            parent_orchestrator, input="data")
+        state = c.wait_for_orchestration_completion(instance_id, timeout=30)
+
+        # Parent should fail because child failed.
+        assert state is not None
+        assert state.runtime_status == client.OrchestrationStatus.FAILED
+
+        # Purge the sub-orchestration so it must be completely re-run.
+        c.purge_orchestration("sub-orch-to-purge")
+
+        # Rewind the parent – child will be re-scheduled and succeed.
+        c.rewind_orchestration(instance_id, reason="purge and retry")
+        state = c.wait_for_orchestration_completion(instance_id, timeout=30)
+
+    assert state is not None
+    assert state.runtime_status == client.OrchestrationStatus.COMPLETED
+    assert state.serialized_output == json.dumps("parent:child:data")
+    assert child_call_count == 2
+
+
+def test_rewind_does_not_rerun_successful_activities():
+    """Successful activities must not be re-executed during rewind.
+
+    The orchestration calls two activities in sequence.  The first
+    succeeds and the second fails.  After rewind, only the failed
+    activity is retried; the successful activity's result is replayed
+    from history and its body is never called again.
+    """
+    success_call_count = 0
+    fail_call_count = 0
+
+    def success_activity(_: task.ActivityContext, input: str) -> str:
+        nonlocal success_call_count
+        success_call_count += 1
+        return f"ok:{input}"
+
+    def fail_activity(_: task.ActivityContext, input: str) -> str:
+        nonlocal fail_call_count
+        fail_call_count += 1
+        if fail_call_count == 1:
+            raise RuntimeError("Temporary failure")
+        return f"recovered:{input}"
+
+    def orchestrator(ctx: task.OrchestrationContext, input: str):
+        r1 = yield ctx.call_activity(success_activity, input=input)
+        r2 = yield ctx.call_activity(fail_activity, input=input)
+        return [r1, r2]
+
+    with worker.TaskHubGrpcWorker(host_address=HOST) as w:
+        w.add_orchestrator(orchestrator)
+        w.add_activity(success_activity)
+        w.add_activity(fail_activity)
+        w.start()
+
+        c = client.TaskHubGrpcClient(host_address=HOST)
+        instance_id = c.schedule_new_orchestration(orchestrator, input="v")
+        state = c.wait_for_orchestration_completion(instance_id, timeout=30)
+
+        assert state is not None
+        assert state.runtime_status == client.OrchestrationStatus.FAILED
+
+        # Rewind – only the failed activity should be retried.
+        c.rewind_orchestration(instance_id, reason="retry")
+        state = c.wait_for_orchestration_completion(instance_id, timeout=30)
+
+    assert state is not None
+    assert state.runtime_status == client.OrchestrationStatus.COMPLETED
+    assert state.serialized_output == json.dumps(["ok:v", "recovered:v"])
+    # The successful activity must have been called exactly once.
+    assert success_call_count == 1
+    # The failing activity was called twice (once failed, once succeeded).
+    assert fail_call_count == 2
+
+
 def test_rewind_twice():
     """Rewind the same orchestration twice after it fails a second time.