From d7eb56024845e857e0f0ee18c959e1ddcc36475e Mon Sep 17 00:00:00 2001 From: Alessandro Bologna Date: Sun, 4 Jan 2026 13:59:27 -0500 Subject: [PATCH] Fix sync checkpoint race by ordering operation fetch Move completion event signaling to after the execution state is updated from the checkpoint response. This prevents a waiting user thread from running the second status check before new operations are added, which could lead to a duplicate START and stalled checkpoint thread. This preserves the existing checkpoint API semantics while closing the race window between completion signaling and state refresh. --- src/aws_durable_execution_sdk_python/state.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/aws_durable_execution_sdk_python/state.py b/src/aws_durable_execution_sdk_python/state.py index a6fc0c7..8553a7d 100644 --- a/src/aws_durable_execution_sdk_python/state.py +++ b/src/aws_durable_execution_sdk_python/state.py @@ -613,20 +613,20 @@ def checkpoint_batches_forever(self) -> None: logger.debug("Checkpoint batch processed successfully") - # Signal completion for any synchronous operations - for queued_op in batch: - if queued_op.completion_event is not None: - queued_op.completion_event.set() - # Update local token for next iteration current_checkpoint_token = output.checkpoint_token - # Fetch new operations from the API + # Fetch new operations from the API before unblocking sync waiters self.fetch_paginated_operations( output.new_execution_state.operations, output.checkpoint_token, output.new_execution_state.next_marker, ) + + # Signal completion for any synchronous operations + for queued_op in batch: + if queued_op.completion_event is not None: + queued_op.completion_event.set() except Exception as e: # Checkpoint failed - wake all blocked threads so they can raise error # Drain both queues and signal all completion events