From d7eb56024845e857e0f0ee18c959e1ddcc36475e Mon Sep 17 00:00:00 2001
From: Alessandro Bologna <alessandro.bologna@gmail.com>
Date: Sun, 4 Jan 2026 13:59:27 -0500
Subject: [PATCH] Fix sync checkpoint race by ordering operation fetch

Move completion event signaling to after the execution state is updated
from the checkpoint response. This prevents a waiting user thread from
running the second status check before new operations are added, which
could lead to a duplicate START and stalled checkpoint thread.

This preserves the existing checkpoint API semantics while closing the
race window between completion signaling and state refresh.
---
 src/aws_durable_execution_sdk_python/state.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/aws_durable_execution_sdk_python/state.py b/src/aws_durable_execution_sdk_python/state.py
index a6fc0c7..8553a7d 100644
--- a/src/aws_durable_execution_sdk_python/state.py
+++ b/src/aws_durable_execution_sdk_python/state.py
@@ -613,20 +613,20 @@ def checkpoint_batches_forever(self) -> None:
 
                     logger.debug("Checkpoint batch processed successfully")
 
-                    # Signal completion for any synchronous operations
-                    for queued_op in batch:
-                        if queued_op.completion_event is not None:
-                            queued_op.completion_event.set()
-
                     # Update local token for next iteration
                     current_checkpoint_token = output.checkpoint_token
 
-                    # Fetch new operations from the API
+                    # Fetch new operations from the API before unblocking sync waiters
                     self.fetch_paginated_operations(
                         output.new_execution_state.operations,
                         output.checkpoint_token,
                         output.new_execution_state.next_marker,
                     )
+
+                    # Signal completion for any synchronous operations
+                    for queued_op in batch:
+                        if queued_op.completion_event is not None:
+                            queued_op.completion_event.set()
                 except Exception as e:
                     # Checkpoint failed - wake all blocked threads so they can raise error
                     # Drain both queues and signal all completion events