From cdf331cec117881b1392520c99f239f08acc17db Mon Sep 17 00:00:00 2001 From: Shane Neuville Date: Sat, 28 Mar 2026 09:13:38 -0500 Subject: [PATCH 1/9] Add diagnostic logging to SyncRemoteSessions IsProcessing guard Logs when IsProcessing changes, when the TurnEnd guard blocks a stale snapshot, and when the streaming guard skips a session. Helps diagnose future stale-state issues on mobile. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- PolyPilot/Services/CopilotService.Bridge.cs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/PolyPilot/Services/CopilotService.Bridge.cs b/PolyPilot/Services/CopilotService.Bridge.cs index a3ff0fbbd..c9f9893f5 100644 --- a/PolyPilot/Services/CopilotService.Bridge.cs +++ b/PolyPilot/Services/CopilotService.Bridge.cs @@ -528,13 +528,23 @@ internal void SyncRemoteSessions() if (!turnEndGuardActive) { + if (state.Info.IsProcessing != rs.IsProcessing) + Debug($"SyncRemoteSessions: '{rs.Name}' IsProcessing {state.Info.IsProcessing} -> {rs.IsProcessing}"); state.Info.IsProcessing = rs.IsProcessing; state.Info.ProcessingStartedAt = rs.ProcessingStartedAt; state.Info.ToolCallCount = rs.ToolCallCount; state.Info.ProcessingPhase = rs.ProcessingPhase; } + else + { + Debug($"SyncRemoteSessions: '{rs.Name}' TurnEnd guard blocked IsProcessing=true"); + } state.Info.MessageCount = rs.MessageCount; } + else + { + Debug($"SyncRemoteSessions: '{rs.Name}' skipped — streaming guard active"); + } if (!string.IsNullOrEmpty(rs.Model)) state.Info.Model = rs.Model; } From f75e5c4cec871d429e1f39cbd7c9b71a76767c6e Mon Sep 17 00:00:00 2001 From: Shane Neuville Date: Sat, 28 Mar 2026 14:40:29 -0500 Subject: [PATCH 2/9] Fix force-sync not clearing stale IsProcessing when streaming guard is stuck The streaming guard (_remoteStreamingSessions) blocks SyncRemoteSessions from updating IsProcessing. If TurnStart fires but TurnEnd is lost (connection drop), the guard stays active forever, causing permanent stale 'busy/sending' state that even the sync button can't fix. ForceRefreshRemoteAsync now: - Applies server's authoritative IsProcessing to ALL sessions (bypasses guards) - Clears stuck streaming guards for sessions the server reports as idle Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- PolyPilot.Tests/BridgeDisconnectTests.cs | 28 +++++++++++++++++++++ PolyPilot/Services/CopilotService.Bridge.cs | 21 ++++++++++++++++ 2 files changed, 49 insertions(+) diff --git a/PolyPilot.Tests/BridgeDisconnectTests.cs b/PolyPilot.Tests/BridgeDisconnectTests.cs index ad56ffb4d..fdca1b24a 100644 --- a/PolyPilot.Tests/BridgeDisconnectTests.cs +++ b/PolyPilot.Tests/BridgeDisconnectTests.cs @@ -348,4 +348,32 @@ public async Task SyncRemoteSessions_AllowsSessionsListToClearProcessing() Assert.False(session.IsProcessing); } + + [Fact] + public async Task ForceSync_ClearsIsProcessing_EvenWithStreamingGuard() + { + // Scenario: Streaming guard is stuck (TurnStart received but TurnEnd lost). + // SyncRemoteSessions skips the session. But ForceRefreshRemoteAsync should + // always apply the server's authoritative IsProcessing state. + var svc = CreateRemoteService(); + await AddRemoteSession(svc, "stuck-session"); + var session = svc.GetSession("stuck-session")!; + + // Session appears processing with a stuck streaming guard + session.IsProcessing = true; + svc.SetRemoteStreamingGuardForTesting("stuck-session", true); + + // SyncRemoteSessions should skip (streaming guard active) + _bridgeClient.Sessions = new() { new SessionSummary { Name = "stuck-session", IsProcessing = false } }; + svc.SyncRemoteSessions(); + Assert.True(session.IsProcessing); // Guard blocks the update + + // Force sync should override the streaming guard + _bridgeClient.SessionHistories["stuck-session"] = new List(); + var result = await svc.ForceRefreshRemoteAsync("stuck-session"); + + Assert.True(result.Success); + Assert.False(session.IsProcessing); + Assert.False(svc.IsRemoteStreamingGuardActive("stuck-session")); + } } diff --git a/PolyPilot/Services/CopilotService.Bridge.cs b/PolyPilot/Services/CopilotService.Bridge.cs index c9f9893f5..4e2d90ea2 100644 --- a/PolyPilot/Services/CopilotService.Bridge.cs +++ b/PolyPilot/Services/CopilotService.Bridge.cs @@ -740,6 +740,27 @@ public async Task ForceRefreshRemoteAsync(string? activeSessionName } } + // Force-sync processing state for ALL sessions from the server snapshot. + // SyncRemoteSessions skips sessions in _remoteStreamingSessions, but a user-initiated + // force sync should always apply the server's authoritative IsProcessing state. + // Also clear stuck streaming guards — if the server says a session is idle, + // any lingering guard from a dropped connection should be cleared. + foreach (var rs in _bridgeClient.Sessions) + { + if (_sessions.TryGetValue(rs.Name, out var syncState)) + { + if (syncState.Info.IsProcessing != rs.IsProcessing) + Debug($"[SYNC] '{rs.Name}' IsProcessing {syncState.Info.IsProcessing} -> {rs.IsProcessing}"); + syncState.Info.IsProcessing = rs.IsProcessing; + syncState.Info.ProcessingStartedAt = rs.ProcessingStartedAt; + syncState.Info.ToolCallCount = rs.ToolCallCount; + syncState.Info.ProcessingPhase = rs.ProcessingPhase; + // Clear stuck streaming guard if server says session is idle + if (!rs.IsProcessing) + _remoteStreamingSessions.TryRemove(rs.Name, out _); + } + } + // Snapshot post-sync state var postSyncSessionCount = _sessions.Count; var postSyncMessageCount = 0; From 70ae3e1efef5f8d7b89abbae9456bdea9fb51881 Mon Sep 17 00:00:00 2001 From: Shane Neuville Date: Sat, 28 Mar 2026 15:18:55 -0500 Subject: [PATCH 3/9] Fix: eagerly resume sessions still active on headless server after restart MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, eager resume only triggered when LastPrompt was saved (debounced). If the app was killed before the debounce fired, actively-running sessions were only loaded as lazy placeholders with no SDK connection — appearing idle/stuck even though the headless server was still processing them. Now checks events.jsonl via IsSessionStillProcessing() to detect sessions that are genuinely still active, regardless of whether LastPrompt was persisted. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- PolyPilot/Services/CopilotService.Persistence.cs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/PolyPilot/Services/CopilotService.Persistence.cs b/PolyPilot/Services/CopilotService.Persistence.cs index 3d73b18e3..3da97e247 100644 --- a/PolyPilot/Services/CopilotService.Persistence.cs +++ b/PolyPilot/Services/CopilotService.Persistence.cs @@ -601,10 +601,16 @@ public async Task RestorePreviousSessionsAsync(CancellationToken cancellationTok _sessions[entry.DisplayName] = lazyState; _activeSessionName ??= entry.DisplayName; RestoreUsageStats(entry); - if (!string.IsNullOrWhiteSpace(entry.LastPrompt)) + // Eagerly resume sessions that are still actively processing on the + // headless server. Check events.jsonl (authoritative) first, then fall + // back to LastPrompt (saved when IsProcessing=true at debounce time). + // Without this, actively-running sessions appear idle after app restart + // because they're only loaded as lazy placeholders with no SDK connection. + var isStillActive = IsSessionStillProcessing(entry.SessionId); + if (isStillActive || !string.IsNullOrWhiteSpace(entry.LastPrompt)) { eagerResumeCandidates.Add((entry.DisplayName, lazyState)); - Debug($"Queued eager resume for interrupted session: {entry.DisplayName}"); + Debug($"Queued eager resume for interrupted session: {entry.DisplayName} (active={isStillActive}, hasLastPrompt={!string.IsNullOrWhiteSpace(entry.LastPrompt)})"); } Debug($"Loaded session placeholder: {entry.DisplayName} ({lazyHistory.Count} messages)"); } From 8225a6b23b1a1b2f6de19232c96cc70dac9af159 Mon Sep 17 00:00:00 2001 From: Shane Neuville Date: Sat, 28 Mar 2026 15:57:42 -0500 Subject: [PATCH 4/9] fix: skip abort for sessions where CLI is still actively processing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When resuming a session after app restart, the RESUME-ABORT logic detected unmatched tool.execution_start events and aborted the session to clear pending state. But in persistent mode, the headless CLI keeps running tools while PolyPilot is down — those tools WILL complete. Now checks IsSessionStillProcessing() before aborting. If the CLI is still active (events.jsonl fresh + last event is a tool/active event), we skip the abort and instead set IsProcessing=true with watchdog flags so the session correctly shows as working. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../Services/CopilotService.Persistence.cs | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/PolyPilot/Services/CopilotService.Persistence.cs b/PolyPilot/Services/CopilotService.Persistence.cs index 3da97e247..166b93f79 100644 --- a/PolyPilot/Services/CopilotService.Persistence.cs +++ b/PolyPilot/Services/CopilotService.Persistence.cs @@ -407,7 +407,12 @@ private async Task EnsureSessionConnectedAsync(string sessionName, SessionState // waiting for tool results that will never arrive. It silently queues/ignores // new SendAsync calls until the pending tools are resolved. An explicit abort // clears this state and allows new messages to flow. - if (wasResumed && HasInterruptedToolExecution(sessionId)) + // + // IMPORTANT: Only abort if the CLI has actually stopped working. In persistent + // mode, the headless server keeps running tools even while PolyPilot is down. + // If IsSessionStillProcessing() says the CLI is active, the tool results WILL + // arrive — aborting would kill legitimate in-progress work. + if (wasResumed && HasInterruptedToolExecution(sessionId) && !IsSessionStillProcessing(sessionId)) { Debug($"[RESUME-ABORT] '{sessionName}' has interrupted tool execution — sending abort to clear pending state"); try @@ -420,6 +425,18 @@ private async Task EnsureSessionConnectedAsync(string sessionName, SessionState Debug($"[RESUME-ABORT] '{sessionName}' abort failed (non-fatal): {abortEx.Message}"); } } + else if (wasResumed && HasInterruptedToolExecution(sessionId)) + { + Debug($"[RESUME-SKIP-ABORT] '{sessionName}' has unmatched tool starts but CLI is still active — NOT aborting"); + // The CLI is still running tools — mark the session as processing so the UI + // shows it as busy. Set watchdog flags so it gets the longer tool timeout. + state.Info.IsProcessing = true; + state.Info.IsResumed = true; + state.HasUsedToolsThisTurn = true; + state.Info.ProcessingPhase = 3; // Working + state.Info.ProcessingStartedAt = DateTime.UtcNow; + NotifyStateChanged(); + } Debug($"Lazy-resume complete: '{sessionName}'"); } From b16625b8fa959ea47d85700b55d879f4c37c0154 Mon Sep 17 00:00:00 2001 From: Shane Neuville Date: Sat, 28 Mar 2026 16:03:09 -0500 Subject: [PATCH 5/9] fix: watchdog uses longer freshness window for sessions with background tasks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sessions that received IDLE-DEFER (background agents/shells active) were getting killed by the watchdog after 300s because they weren't flagged as multi-agent. The 300s freshness window was too short — subagents can run for 10+ minutes without producing events.jsonl writes. Added HasDeferredIdle flag on SessionState, set when IDLE-DEFER fires. The watchdog Case B now uses the 1800s multi-agent freshness window for any session with HasDeferredIdle=true, not just multi-agent groups. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- PolyPilot/Services/CopilotService.Events.cs | 3 ++- PolyPilot/Services/CopilotService.cs | 6 ++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/PolyPilot/Services/CopilotService.Events.cs b/PolyPilot/Services/CopilotService.Events.cs index 9e8246f45..3e4c703c5 100644 --- a/PolyPilot/Services/CopilotService.Events.cs +++ b/PolyPilot/Services/CopilotService.Events.cs @@ -643,6 +643,7 @@ void Invoke(Action action) // Do NOT treat this as terminal — flush text and wait for the real idle. if (HasActiveBackgroundTasks(idle)) { + state.HasDeferredIdle = true; // Track for watchdog freshness window Debug($"[IDLE-DEFER] '{sessionName}' session.idle received with active background tasks — " + $"deferring completion (IsProcessing={state.Info.IsProcessing}, " + $"response={state.CurrentResponse.Length}+{state.FlushedResponse.Length} chars)"); @@ -2173,7 +2174,7 @@ private async Task RunProcessingWatchdogAsync(SessionState state, string session // - "after turn start" alone stays true forever once any event is written // - "recent" alone could match stale files from a previous turn var caseBEventsActive = false; - var freshnessSeconds = isMultiAgentSession + var freshnessSeconds = (isMultiAgentSession || state.HasDeferredIdle) ? WatchdogMultiAgentCaseBFreshnessSeconds : WatchdogCaseBFreshnessSeconds; try diff --git a/PolyPilot/Services/CopilotService.cs b/PolyPilot/Services/CopilotService.cs index 7dc691855..17355156b 100644 --- a/PolyPilot/Services/CopilotService.cs +++ b/PolyPilot/Services/CopilotService.cs @@ -534,6 +534,11 @@ private class SessionState /// When this reaches WatchdogCaseBMaxStaleChecks, deferral is stopped even if the file /// modification time is within the freshness window (dead connection detected). public int WatchdogCaseBStaleCount; + /// True when an IDLE-DEFER has been observed for this session — the CLI reported + /// active background tasks (subagents/shells). The watchdog uses this to apply the longer + /// multi-agent freshness window even for non-multi-agent-group sessions, because the CLI + /// has confirmed it's running background work that won't produce events.jsonl writes. + public volatile bool HasDeferredIdle; /// True if the TurnEnd→Idle fallback was canceled by an AssistantTurnStartEvent. /// Used for diagnostic logging: when the next TurnEnd re-arms the fallback, the log shows /// the self-healing loop in action (TurnEnd → TurnStart cancel → TurnEnd re-arm). @@ -3049,6 +3054,7 @@ public async Task SendPromptAsync(string sessionName, string prompt, Lis state.Info.ClearPermissionDenials(); Interlocked.Exchange(ref state.ActiveToolCallCount, 0); // Reset stale tool count from previous turn state.HasUsedToolsThisTurn = false; // Reset stale tool flag from previous turn + state.HasDeferredIdle = false; // Reset deferred idle flag from previous turn state.IsReconnectedSend = false; // Clear reconnect flag — new turn starts fresh (see watchdog reconnect timeout) state.PrematureIdleSignal.Reset(); // Clear premature idle detection from previous turn state.FallbackCanceledByTurnStart = false; From 6d4b1f0a3fd6b6ff47bf6bef2b63a2a9094ff416 Mon Sep 17 00:00:00 2001 From: Shane Neuville Date: Sat, 28 Mar 2026 17:46:30 -0500 Subject: [PATCH 6/9] fix: WsBridge retries port binding on startup instead of silently failing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When relaunch.sh kills the old instance and immediately starts a new one, port 4322 may still be in TIME_WAIT. Previously, Start() would try once and silently give up, leaving the bridge dead. Mobile clients could never reconnect because there was no server listening. Now Start() tries to bind immediately and, if the port is busy, starts the accept loop anyway — it retries via TryRestartListenerAsync with exponential backoff (2s, 4s, 8s... up to 30s). Also increased the retry delay from 500ms to 2s to better match macOS TIME_WAIT behavior. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- PolyPilot/Services/WsBridgeServer.cs | 48 +++++++++++++++++----------- 1 file changed, 30 insertions(+), 18 deletions(-) diff --git a/PolyPilot/Services/WsBridgeServer.cs b/PolyPilot/Services/WsBridgeServer.cs index 8c751d793..2abed33c9 100644 --- a/PolyPilot/Services/WsBridgeServer.cs +++ b/PolyPilot/Services/WsBridgeServer.cs @@ -55,33 +55,44 @@ public void Start(int bridgePort, int targetPort) _bridgePort = bridgePort; _cts = new CancellationTokenSource(); - _listener = new HttpListener(); - _listener.Prefixes.Add($"http://+:{bridgePort}/"); - - try + if (TryBindListener(bridgePort)) { - _listener.Start(); - Console.WriteLine($"[WsBridge] Listening on port {bridgePort} (state-sync mode)"); _acceptTask = AcceptLoopAsync(_cts.Token); OnStateChanged?.Invoke(); } - catch (Exception ex) + else + { + // Port likely in TIME_WAIT from a previous instance (relaunch). + // Start the accept loop anyway — it will retry via TryRestartListenerAsync + // with exponential backoff until the port is released (typically 5-15s). + Console.WriteLine($"[WsBridge] Port {bridgePort} busy — will retry in accept loop"); + _acceptTask = AcceptLoopAsync(_cts.Token); + } + } + + /// + /// Try to bind the HttpListener on the given port. Tries wildcard first (LAN access), + /// falls back to localhost. Returns true if the listener is now listening. + /// + private bool TryBindListener(int port) + { + foreach (var prefix in new[] { $"http://+:{port}/", $"http://localhost:{port}/" }) { - Console.WriteLine($"[WsBridge] Failed to start on wildcard: {ex.Message}"); try { - _listener = new HttpListener(); - _listener.Prefixes.Add($"http://localhost:{bridgePort}/"); - _listener.Start(); - Console.WriteLine($"[WsBridge] Listening on localhost:{bridgePort} (state-sync mode)"); - _acceptTask = AcceptLoopAsync(_cts.Token); - OnStateChanged?.Invoke(); + var listener = new HttpListener(); + listener.Prefixes.Add(prefix); + listener.Start(); + _listener = listener; + Console.WriteLine($"[WsBridge] Listening on port {port} (state-sync mode)"); + return true; } - catch (Exception ex2) + catch (Exception ex) { - Console.WriteLine($"[WsBridge] Failed to start on localhost: {ex2.Message}"); + Console.WriteLine($"[WsBridge] Bind on {prefix} failed: {ex.Message}"); } } + return false; } /// @@ -293,8 +304,9 @@ private async Task TryRestartListenerAsync(CancellationToken ct) try { _listener?.Stop(); } catch { } _listener = null; - // Brief pause so the OS has time to release the port after a crash. - try { await Task.Delay(500, ct); } catch (OperationCanceledException) { return false; } + // Wait for the OS to release the port after the old process died. + // macOS TIME_WAIT can hold the port for several seconds after kill. + try { await Task.Delay(2000, ct); } catch (OperationCanceledException) { return false; } // Try wildcard binding first (allows LAN / Tailscale access). foreach (var prefix in new[] { $"http://+:{_bridgePort}/", $"http://localhost:{_bridgePort}/" }) From 8c3424c132384e9d31ee1efd9d6babe0c4ebc120 Mon Sep 17 00:00:00 2001 From: Shane Neuville Date: Sat, 28 Mar 2026 19:05:50 -0500 Subject: [PATCH 7/9] =?UTF-8?q?fix:=20address=20PR=20review=20feedback=20?= =?UTF-8?q?=E2=80=94=20watchdog,=20cleanup,=20and=20dedup?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit M5: Start processing watchdog after RESUME-SKIP-ABORT sets IsProcessing=true. Without this, a session marked as processing had no recovery if the CLI finishes without emitting session.idle. R1-M1: Add IsResumed=false to SessionComplete handler in Bridge.cs. Missing from the belt-and-suspenders cleanup that clears 4 other fields. M6: Clear HasDeferredIdle in CompleteResponse, AbortSessionAsync, error handler, and all watchdog completion paths. Prevents stale flag from granting an unwarranted 1800s freshness window on the next turn. R1-M2: Clear _recentTurnEndSessions in ReconnectAsync and server restart. Entries were only removed on TurnStart — after reconnect, stale entries could block legitimate IsProcessing updates. Minor: Deduplicate TryBindListener/TryRestartListenerAsync in WsBridgeServer. Both had identical prefix iteration loops. TryRestartListenerAsync now delegates to TryBindListener. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- PolyPilot/Services/CopilotService.Bridge.cs | 1 + PolyPilot/Services/CopilotService.Events.cs | 5 +++++ .../Services/CopilotService.Persistence.cs | 1 + PolyPilot/Services/CopilotService.cs | 2 ++ PolyPilot/Services/WsBridgeServer.cs | 20 ++++--------------- 5 files changed, 13 insertions(+), 16 deletions(-) diff --git a/PolyPilot/Services/CopilotService.Bridge.cs b/PolyPilot/Services/CopilotService.Bridge.cs index 4e2d90ea2..068b01f8c 100644 --- a/PolyPilot/Services/CopilotService.Bridge.cs +++ b/PolyPilot/Services/CopilotService.Bridge.cs @@ -294,6 +294,7 @@ private async Task InitializeRemoteAsync(ConnectionSettings settings, Cancellati { Debug($"[BRIDGE-SESSION-COMPLETE] '{session.Name}' clearing stale IsProcessing"); session.IsProcessing = false; + session.IsResumed = false; session.ProcessingStartedAt = null; session.ToolCallCount = 0; session.ProcessingPhase = 0; diff --git a/PolyPilot/Services/CopilotService.Events.cs b/PolyPilot/Services/CopilotService.Events.cs index 3e4c703c5..4800c3bf9 100644 --- a/PolyPilot/Services/CopilotService.Events.cs +++ b/PolyPilot/Services/CopilotService.Events.cs @@ -782,6 +782,7 @@ await notifService.SendNotificationAsync( CancelToolHealthCheck(state); Interlocked.Exchange(ref state.ActiveToolCallCount, 0); state.HasUsedToolsThisTurn = false; + state.HasDeferredIdle = false; Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0); Interlocked.Exchange(ref state.ToolHealthStaleChecks, 0); Interlocked.Exchange(ref state.EventCountThisTurn, 0); @@ -1028,6 +1029,7 @@ private void CompleteResponse(SessionState state, long? expectedGeneration = nul CancelToolHealthCheck(state); Interlocked.Exchange(ref state.ActiveToolCallCount, 0); state.HasUsedToolsThisTurn = false; + state.HasDeferredIdle = false; state.IsReconnectedSend = false; // Clear reconnect flag on turn completion (defense-in-depth) state.FallbackCanceledByTurnStart = false; Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0); @@ -1776,6 +1778,7 @@ private void TriggerToolHealthRecovery(SessionState state, string sessionName, s // Full cleanup mirroring CompleteResponse — missing fields here caused stuck sessions Interlocked.Exchange(ref state.ActiveToolCallCount, 0); state.HasUsedToolsThisTurn = false; + state.HasDeferredIdle = false; state.FallbackCanceledByTurnStart = false; Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0); Interlocked.Exchange(ref state.WatchdogCaseAResets, 0); @@ -2349,6 +2352,7 @@ private async Task RunProcessingWatchdogAsync(SessionState state, string session CancelToolHealthCheck(state); Interlocked.Exchange(ref state.ActiveToolCallCount, 0); state.HasUsedToolsThisTurn = false; + state.HasDeferredIdle = false; Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0); Interlocked.Exchange(ref state.ToolHealthStaleChecks, 0); Interlocked.Exchange(ref state.EventCountThisTurn, 0); @@ -2443,6 +2447,7 @@ private async Task RunProcessingWatchdogAsync(SessionState state, string session Interlocked.Exchange(ref state.SendingFlag, 0); Interlocked.Exchange(ref state.ActiveToolCallCount, 0); state.HasUsedToolsThisTurn = false; + state.HasDeferredIdle = false; Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0); Interlocked.Exchange(ref state.ToolHealthStaleChecks, 0); Interlocked.Exchange(ref state.EventCountThisTurn, 0); diff --git a/PolyPilot/Services/CopilotService.Persistence.cs b/PolyPilot/Services/CopilotService.Persistence.cs index 166b93f79..66effb413 100644 --- a/PolyPilot/Services/CopilotService.Persistence.cs +++ b/PolyPilot/Services/CopilotService.Persistence.cs @@ -435,6 +435,7 @@ private async Task EnsureSessionConnectedAsync(string sessionName, SessionState state.HasUsedToolsThisTurn = true; state.Info.ProcessingPhase = 3; // Working state.Info.ProcessingStartedAt = DateTime.UtcNow; + StartProcessingWatchdog(state, sessionName); NotifyStateChanged(); } diff --git a/PolyPilot/Services/CopilotService.cs b/PolyPilot/Services/CopilotService.cs index 17355156b..4d0944ce7 100644 --- a/PolyPilot/Services/CopilotService.cs +++ b/PolyPilot/Services/CopilotService.cs @@ -1092,6 +1092,7 @@ public async Task ReconnectAsync(ConnectionSettings settings, CancellationToken _sessions.Clear(); _closedSessionIds.Clear(); _closedSessionNames.Clear(); + _recentTurnEndSessions.Clear(); lock (_imageQueueLock) { _queuedImagePaths.Clear(); @@ -1305,6 +1306,7 @@ public async Task RestartServerAsync(CancellationToken cancellationToken = defau _sessions.Clear(); _closedSessionIds.Clear(); _closedSessionNames.Clear(); + _recentTurnEndSessions.Clear(); // 2. Dispose old client if (_client != null) diff --git a/PolyPilot/Services/WsBridgeServer.cs b/PolyPilot/Services/WsBridgeServer.cs index 2abed33c9..b26dc6c4d 100644 --- a/PolyPilot/Services/WsBridgeServer.cs +++ b/PolyPilot/Services/WsBridgeServer.cs @@ -308,23 +308,11 @@ private async Task TryRestartListenerAsync(CancellationToken ct) // macOS TIME_WAIT can hold the port for several seconds after kill. try { await Task.Delay(2000, ct); } catch (OperationCanceledException) { return false; } - // Try wildcard binding first (allows LAN / Tailscale access). - foreach (var prefix in new[] { $"http://+:{_bridgePort}/", $"http://localhost:{_bridgePort}/" }) + if (TryBindListener(_bridgePort)) { - try - { - var listener = new HttpListener(); - listener.Prefixes.Add(prefix); - listener.Start(); - _listener = listener; - Console.WriteLine($"[WsBridge] Restarted listening on {prefix}"); - OnStateChanged?.Invoke(); - return true; - } - catch (Exception ex) - { - Console.WriteLine($"[WsBridge] Restart on {prefix} failed: {ex.Message}"); - } + Console.WriteLine($"[WsBridge] Restarted listening on port {_bridgePort}"); + OnStateChanged?.Invoke(); + return true; } return false; } From 8656f44360d934f9bc996d05ed1bb8836a14b381 Mon Sep 17 00:00:00 2001 From: Shane Neuville Date: Sat, 28 Mar 2026 20:27:38 -0500 Subject: [PATCH 8/9] fix: clear HasDeferredIdle in all IsProcessing=false paths (INV-1) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mechanical sweep: every path that clears HasUsedToolsThisTurn now also clears HasDeferredIdle. This covers 18 total paths across CopilotService, Events, and Organization — including AbortSessionAsync, SendAsync error handlers, reconnect paths, steer errors, and session replacement. Maintains INV-1 consistency: every IsProcessing=false transition clears all companion fields to prevent stale state from affecting the next turn. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- PolyPilot/Services/CopilotService.Events.cs | 3 +++ PolyPilot/Services/CopilotService.Organization.cs | 1 + PolyPilot/Services/CopilotService.cs | 8 ++++++++ 3 files changed, 12 insertions(+) diff --git a/PolyPilot/Services/CopilotService.Events.cs b/PolyPilot/Services/CopilotService.Events.cs index 4800c3bf9..b4b6cf9fd 100644 --- a/PolyPilot/Services/CopilotService.Events.cs +++ b/PolyPilot/Services/CopilotService.Events.cs @@ -2503,6 +2503,7 @@ private void ClearProcessingStateForRecoveryFailure(SessionState state, string s state.Info.IsProcessing = false; state.Info.IsResumed = false; state.HasUsedToolsThisTurn = false; + state.HasDeferredIdle = false; Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0); Interlocked.Exchange(ref state.ActiveToolCallCount, 0); Interlocked.Exchange(ref state.ToolHealthStaleChecks, 0); @@ -2601,6 +2602,7 @@ private async Task TryRecoverPermissionAsync(SessionState state, string sessionN Interlocked.Exchange(ref state.SendingFlag, 0); // Clear stale tool flag so watchdog uses normal timeout if resend is skipped newState.HasUsedToolsThisTurn = false; + newState.HasDeferredIdle = false; // Replace in sessions dictionary BEFORE registering event handler // so HandleSessionEvent's isCurrentState check passes for the new state. @@ -2638,6 +2640,7 @@ private async Task TryRecoverPermissionAsync(SessionState state, string sessionN state.Info.IsProcessing = false; state.Info.IsResumed = false; state.HasUsedToolsThisTurn = false; + state.HasDeferredIdle = false; Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0); Interlocked.Exchange(ref state.ActiveToolCallCount, 0); Interlocked.Exchange(ref state.ToolHealthStaleChecks, 0); diff --git a/PolyPilot/Services/CopilotService.Organization.cs b/PolyPilot/Services/CopilotService.Organization.cs index 394f2d144..31c410505 100644 --- a/PolyPilot/Services/CopilotService.Organization.cs +++ b/PolyPilot/Services/CopilotService.Organization.cs @@ -2084,6 +2084,7 @@ private async Task ForceCompleteProcessingAsync(string sessionName, SessionState Interlocked.Exchange(ref state.WatchdogCaseBLastFileSize, 0); Interlocked.Exchange(ref state.WatchdogCaseBStaleCount, 0); state.HasUsedToolsThisTurn = false; + state.HasDeferredIdle = false; state.FallbackCanceledByTurnStart = false; state.Info.IsResumed = false; state.Info.ProcessingStartedAt = null; diff --git a/PolyPilot/Services/CopilotService.cs b/PolyPilot/Services/CopilotService.cs index 4d0944ce7..3747ce3a4 100644 --- a/PolyPilot/Services/CopilotService.cs +++ b/PolyPilot/Services/CopilotService.cs @@ -3345,6 +3345,7 @@ public async Task SendPromptAsync(string sessionName, string prompt, Lis }; // Mirror primary reconnect: reset tool tracking for new connection siblingState.HasUsedToolsThisTurn = false; + siblingState.HasDeferredIdle = false; Interlocked.Exchange(ref siblingState.ActiveToolCallCount, 0); Interlocked.Exchange(ref siblingState.SuccessfulToolCountThisTurn, 0); Interlocked.Exchange(ref siblingState.ToolHealthStaleChecks, 0); @@ -3536,6 +3537,7 @@ public async Task SendPromptAsync(string sessionName, string prompt, Lis // inflates the watchdog timeout from 120s to 600s, making stuck // sessions wait 5x longer than necessary to recover. newState.HasUsedToolsThisTurn = false; + newState.HasDeferredIdle = false; Interlocked.Exchange(ref newState.ActiveToolCallCount, 0); Interlocked.Exchange(ref newState.SuccessfulToolCountThisTurn, 0); newState.IsMultiAgentSession = state.IsMultiAgentSession; @@ -3571,6 +3573,7 @@ public async Task SendPromptAsync(string sessionName, string prompt, Lis // Reset HasUsedToolsThisTurn so the retried turn starts with the default // 120s watchdog tier instead of the inflated 600s from stale tool state. state.HasUsedToolsThisTurn = false; + state.HasDeferredIdle = false; // Schedule persistence of the new session ID so it survives app restart. // Without this, the debounced save captures the pre-reconnect snapshot @@ -3632,6 +3635,7 @@ public async Task SendPromptAsync(string sessionName, string prompt, Lis Debug($"[ERROR] '{sessionName}' reconnect+retry failed, clearing IsProcessing"); Interlocked.Exchange(ref state.ActiveToolCallCount, 0); state.HasUsedToolsThisTurn = false; + state.HasDeferredIdle = false; Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0); state.Info.IsResumed = false; state.Info.IsProcessing = false; @@ -3654,6 +3658,7 @@ public async Task SendPromptAsync(string sessionName, string prompt, Lis Debug($"[ERROR] '{sessionName}' SendAsync failed, clearing IsProcessing (error={ex.Message})"); Interlocked.Exchange(ref state.ActiveToolCallCount, 0); state.HasUsedToolsThisTurn = false; + state.HasDeferredIdle = false; Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0); state.Info.IsResumed = false; state.Info.IsProcessing = false; @@ -3804,6 +3809,7 @@ public async Task AbortSessionAsync(string sessionName, bool markAsInterrupted = state.Info.ProcessingPhase = 0; Interlocked.Exchange(ref state.ActiveToolCallCount, 0); state.HasUsedToolsThisTurn = false; + state.HasDeferredIdle = false; state.IsReconnectedSend = false; // INV-1: clear all per-turn flags on abort Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0); // Release send lock — allows a subsequent SteerSessionAsync to acquire it immediately @@ -3911,6 +3917,7 @@ await InvokeOnUIAsync(() => Debug($"[STEER-ERROR] '{sessionName}' soft steer SendAsync failed, clearing IsProcessing (error={ex.Message})"); Interlocked.Exchange(ref state.ActiveToolCallCount, 0); state.HasUsedToolsThisTurn = false; + state.HasDeferredIdle = false; Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0); state.Info.IsResumed = false; Interlocked.Exchange(ref state.SendingFlag, 0); @@ -4196,6 +4203,7 @@ await InvokeOnUIAsync(() => state.Info.IsProcessing = false; state.Info.IsResumed = false; state.HasUsedToolsThisTurn = false; + state.HasDeferredIdle = false; Interlocked.Exchange(ref state.ActiveToolCallCount, 0); Interlocked.Exchange(ref state.SendingFlag, 0); state.Info.ProcessingStartedAt = null; From 85c1c09266c96f00389d2092cba3325f96b1a1f7 Mon Sep 17 00:00:00 2001 From: Shane Neuville Date: Sat, 28 Mar 2026 22:15:02 -0500 Subject: [PATCH 9/9] fix: marshal RESUME-SKIP-ABORT state mutations to UI thread (INV-2) EnsureSessionConnectedAsync runs from Task.Run during eager resume. The RESUME-SKIP-ABORT branch was setting IsProcessing and companion fields directly on a thread-pool thread, violating INV-2. Wrap in InvokeOnUI to prevent torn state during Blazor render. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../Services/CopilotService.Persistence.cs | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/PolyPilot/Services/CopilotService.Persistence.cs b/PolyPilot/Services/CopilotService.Persistence.cs index 66effb413..432c342a2 100644 --- a/PolyPilot/Services/CopilotService.Persistence.cs +++ b/PolyPilot/Services/CopilotService.Persistence.cs @@ -430,13 +430,17 @@ private async Task EnsureSessionConnectedAsync(string sessionName, SessionState Debug($"[RESUME-SKIP-ABORT] '{sessionName}' has unmatched tool starts but CLI is still active — NOT aborting"); // The CLI is still running tools — mark the session as processing so the UI // shows it as busy. Set watchdog flags so it gets the longer tool timeout. - state.Info.IsProcessing = true; - state.Info.IsResumed = true; - state.HasUsedToolsThisTurn = true; - state.Info.ProcessingPhase = 3; // Working - state.Info.ProcessingStartedAt = DateTime.UtcNow; - StartProcessingWatchdog(state, sessionName); - NotifyStateChanged(); + // INV-2: marshal to UI thread — EnsureSessionConnectedAsync runs from Task.Run. + InvokeOnUI(() => + { + state.Info.IsProcessing = true; + state.Info.IsResumed = true; + state.HasUsedToolsThisTurn = true; + state.Info.ProcessingPhase = 3; // Working + state.Info.ProcessingStartedAt = DateTime.UtcNow; + StartProcessingWatchdog(state, sessionName); + NotifyStateChanged(); + }); } Debug($"Lazy-resume complete: '{sessionName}'");