From b00def0eb4e5ab83b4f28e7c75b9b533bd6dad64 Mon Sep 17 00:00:00 2001 From: Jim Pelletier Date: Mon, 25 May 2026 12:17:51 +1000 Subject: [PATCH 01/52] Migrate SilentProcessRunner.ExecuteCommand to async Replaces the sync WaitForExit() with await WaitForExitAsync(cancel). The cancel token is passed directly so the existing cancel semantics are preserved: cancel firing throws OCE from the await and unwinds. DoOurBestToCleanUp continues to fire on cancel via cancel.Register exactly as it did in the sync version. Adds a net48 polyfill for WaitForExitAsync using Process.Exited + TaskCompletionSource. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../Util/CommandLine/SilentProcessRunner.cs | 38 +++++++++++++++++-- 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs b/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs index 0fcf47f9e..97b8a585f 100644 --- a/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs +++ b/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs @@ -8,13 +8,14 @@ using System.Runtime.Versioning; using System.Text; using System.Threading; +using System.Threading.Tasks; using Octopus.Tentacle.Core.Diagnostics; namespace Octopus.Tentacle.Util { public static class SilentProcessRunner { - public static int ExecuteCommand( + public static Task ExecuteCommandAsync( string executable, string arguments, string workingDirectory, @@ -23,10 +24,10 @@ public static int ExecuteCommand( Action error, CancellationToken cancel) { - return ExecuteCommand(executable, arguments, workingDirectory, debug, info, error, customEnvironmentVariables: null, cancel: cancel); + return ExecuteCommandAsync(executable, arguments, workingDirectory, debug, info, error, customEnvironmentVariables: null, cancel: cancel); } - public static int ExecuteCommand( + public static async Task ExecuteCommandAsync( string executable, string arguments, string 
workingDirectory, @@ -125,6 +126,7 @@ void WriteData(Action action, ManualResetEventSlim resetEvent, DataRecei WriteData(error, errorResetEvent, e); }; + process.EnableRaisingEvents = true; process.Start(); var running = true; @@ -140,7 +142,11 @@ void WriteData(Action action, ManualResetEventSlim resetEvent, DataRecei process.BeginOutputReadLine(); process.BeginErrorReadLine(); - process.WaitForExit(); +#if NETFRAMEWORK + await WaitForExitAsyncNetFramework(process, cancel).ConfigureAwait(false); +#else + await process.WaitForExitAsync(cancel).ConfigureAwait(false); +#endif SafelyCancelRead(process.CancelErrorRead, debug); SafelyCancelRead(process.CancelOutputRead, debug); @@ -203,6 +209,30 @@ static void SafelyCancelRead(Action action, Action debug) } } +#if NETFRAMEWORK + static Task WaitForExitAsyncNetFramework(Process process, CancellationToken cancellationToken) + { + var tcs = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); + CancellationTokenRegistration registration = default; + void OnExited(object? sender, EventArgs e) + { + registration.Dispose(); + tcs.TrySetResult(null); + } + process.Exited += OnExited; + if (process.HasExited) tcs.TrySetResult(null); + if (cancellationToken.CanBeCanceled) + { + registration = cancellationToken.Register(() => + { + process.Exited -= OnExited; + tcs.TrySetCanceled(cancellationToken); + }); + } + return tcs.Task; + } +#endif + static void DoOurBestToCleanUp(Process process, Action error) { try From f168db7d3d6e58c7c93e2468aa0f76035ae8ee57 Mon Sep 17 00:00:00 2001 From: Jim Pelletier Date: Mon, 25 May 2026 12:18:25 +1000 Subject: [PATCH 02/52] Migrate ISilentProcessRunner and wrappers to async ISilentProcessRunner, SilentProcessRunnerWrapper, and the SilentProcessRunnerExtended helpers now return Task and call SilentProcessRunner.ExecuteCommandAsync directly. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../Util/ISilentProcessRunner.cs | 29 ++++++++++--------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/source/Octopus.Tentacle/Util/ISilentProcessRunner.cs b/source/Octopus.Tentacle/Util/ISilentProcessRunner.cs index 06a61e558..49c54fd6b 100644 --- a/source/Octopus.Tentacle/Util/ISilentProcessRunner.cs +++ b/source/Octopus.Tentacle/Util/ISilentProcessRunner.cs @@ -1,13 +1,14 @@ -using System; +using System; using System.Collections.Generic; using System.Threading; +using System.Threading.Tasks; using Octopus.Tentacle.Startup; namespace Octopus.Tentacle.Util { public interface ISilentProcessRunner { - public int ExecuteCommand( + Task ExecuteCommandAsync( string executable, string arguments, string workingDirectory, @@ -15,7 +16,7 @@ public int ExecuteCommand( Action error, CancellationToken cancel = default); - public int ExecuteCommand( + Task ExecuteCommandAsync( string executable, string arguments, string workingDirectory, @@ -27,23 +28,23 @@ public int ExecuteCommand( public class SilentProcessRunnerWrapper : ISilentProcessRunner { - public int ExecuteCommand(string executable, string arguments, string workingDirectory, Action info, Action error, CancellationToken cancel = default) + public Task ExecuteCommandAsync(string executable, string arguments, string workingDirectory, Action info, Action error, CancellationToken cancel = default) { - return SilentProcessRunnerExtended.ExecuteCommand(executable, arguments, workingDirectory, info, error, cancel); + return SilentProcessRunnerExtended.ExecuteCommandAsync(executable, arguments, workingDirectory, info, error, cancel); } - public int ExecuteCommand(string executable, string arguments, string workingDirectory, Action debug, Action info, Action error, CancellationToken cancel = default) + public Task ExecuteCommandAsync(string executable, string arguments, string workingDirectory, Action debug, Action info, Action error, CancellationToken cancel 
= default) { - return SilentProcessRunner.ExecuteCommand(executable, arguments, workingDirectory, debug, info, error, cancel: cancel); + return SilentProcessRunner.ExecuteCommandAsync(executable, arguments, workingDirectory, debug, info, error, cancel: cancel); } } public static class SilentProcessRunnerExtended { - public static CmdResult ExecuteCommand(this CommandLineInvocation invocation) - => ExecuteCommand(invocation, Environment.CurrentDirectory); + public static async Task ExecuteCommandAsync(this CommandLineInvocation invocation) + => await ExecuteCommandAsync(invocation, Environment.CurrentDirectory); - public static CmdResult ExecuteCommand(this CommandLineInvocation invocation, string workingDirectory) + public static async Task ExecuteCommandAsync(this CommandLineInvocation invocation, string workingDirectory) { if (workingDirectory == null) throw new ArgumentNullException(nameof(workingDirectory)); @@ -52,7 +53,7 @@ public static CmdResult ExecuteCommand(this CommandLineInvocation invocation, st var infos = new List(); var errors = new List(); - var exitCode = ExecuteCommand( + var exitCode = await ExecuteCommandAsync( invocation.Executable, arguments, workingDirectory, @@ -63,14 +64,14 @@ public static CmdResult ExecuteCommand(this CommandLineInvocation invocation, st return new CmdResult(exitCode, infos, errors); } - public static int ExecuteCommand( + public static Task ExecuteCommandAsync( string executable, string arguments, string workingDirectory, Action info, Action error, CancellationToken cancel = default) - => SilentProcessRunner.ExecuteCommand(executable, + => SilentProcessRunner.ExecuteCommandAsync(executable, arguments, workingDirectory, LogFileOnlyLogger.Current.Info, @@ -79,4 +80,4 @@ public static int ExecuteCommand( customEnvironmentVariables: null, cancel: cancel); } -} \ No newline at end of file +} From 373bce480378bbff6aa6d600b0fcbfe55e040bec Mon Sep 17 00:00:00 2001 From: Jim Pelletier Date: Mon, 25 May 2026 12:19:05 +1000 
Subject: [PATCH 03/52] Migrate CommandLineRunner internals to async ExecuteCommandAsync MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CommandLineRunner.Execute is consumed by Octopus.Manager.Tentacle (a WPF app), so the public method stays sync. It now blocks on the new SilentProcessRunner.ExecuteCommandAsync via GetAwaiter().GetResult() — safe because the WPF callers dispatch through ThreadPool.QueueUserWorkItem, which has no synchronisation context. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../Util/CommandLineRunner.cs | 24 ++++++++++++------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/source/Octopus.Tentacle/Util/CommandLineRunner.cs b/source/Octopus.Tentacle/Util/CommandLineRunner.cs index cc6da549a..55c0d3641 100644 --- a/source/Octopus.Tentacle/Util/CommandLineRunner.cs +++ b/source/Octopus.Tentacle/Util/CommandLineRunner.cs @@ -1,5 +1,6 @@ -using System; +using System; using System.Collections.Generic; +using System.Threading; using Octopus.Tentacle.Core.Diagnostics; namespace Octopus.Tentacle.Util @@ -43,12 +44,19 @@ public bool Execute(CommandLineInvocation invocation, { try { - var exitCode = SilentProcessRunner.ExecuteCommand(invocation.Executable, - (invocation.Arguments ?? "") + " " + (invocation.SystemArguments ?? ""), - Environment.CurrentDirectory, - debug, - info, - error); + // Sync boundary: ICommandLineRunner is a public interface consumed by + // Octopus.Manager.Tentacle (a WPF app) which calls Execute from a + // ThreadPool.QueueUserWorkItem — no synchronisation context, so + // GetAwaiter().GetResult() here is deadlock-safe. + var exitCode = SilentProcessRunner.ExecuteCommandAsync( + invocation.Executable, + (invocation.Arguments ?? "") + " " + (invocation.SystemArguments ?? 
""), + Environment.CurrentDirectory, + debug, + info, + error, + cancel: CancellationToken.None) + .GetAwaiter().GetResult(); if (exitCode != 0) { @@ -75,4 +83,4 @@ public bool Execute(CommandLineInvocation invocation, return true; } } -} \ No newline at end of file +} From 1927326e91af146bfdbaea523e3523e0788c179a Mon Sep 17 00:00:00 2001 From: Jim Pelletier Date: Mon, 25 May 2026 12:19:52 +1000 Subject: [PATCH 04/52] Migrate RunningScript.RunScript to RunScriptAsync RunningScript now awaits SilentProcessRunner.ExecuteCommandAsync through a new RunScriptAsync helper. The monitored-startup path also awaits the async helper inside Task.Run. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../Services/Scripts/RunningScript.cs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/source/Octopus.Tentacle.Core/Services/Scripts/RunningScript.cs b/source/Octopus.Tentacle.Core/Services/Scripts/RunningScript.cs index fca5dfa15..412a415e3 100644 --- a/source/Octopus.Tentacle.Core/Services/Scripts/RunningScript.cs +++ b/source/Octopus.Tentacle.Core/Services/Scripts/RunningScript.cs @@ -96,7 +96,7 @@ public async Task Execute() exitCode = workspace.ShouldMonitorPowerShellStartup() ? 
await RunPowershellScriptWithMonitoring(shellPath, writer, runningScriptToken) - : RunScript(shellPath, writer, runningScriptToken); + : await RunScriptAsync(shellPath, writer, runningScriptToken); } } catch (OperationCanceledException) @@ -147,7 +147,7 @@ async Task RunPowershellScriptWithMonitoring(string shellPath, IScriptLogWr var monitor = new PowerShellStartupMonitor(workspace.WorkingDirectory, powerShellStartupTimeout, log, taskId); var monitoringTask = monitor.WaitForStartup(monitoringTaskCts.Token); - var scriptTask = Task.Run(() => RunScript(shellPath, writer, scriptTaskCts.Token), scriptTaskCts.Token); + var scriptTask = Task.Run(async () => await RunScriptAsync(shellPath, writer, scriptTaskCts.Token), scriptTaskCts.Token); var completedTask = await Task.WhenAny(monitoringTask, scriptTask); @@ -222,11 +222,11 @@ void RecordScriptHasCompleted(int exitCode) } } - int RunScript(string shellPath, IScriptLogWriter writer, CancellationToken cancellationToken) + async Task RunScriptAsync(string shellPath, IScriptLogWriter writer, CancellationToken cancellationToken) { try { - var exitCode = SilentProcessRunner.ExecuteCommand( + var exitCode = await SilentProcessRunner.ExecuteCommandAsync( shellPath, shell.FormatCommandArguments(workspace.BootstrapScriptFilePath, workspace.ScriptArguments, false), workspace.WorkingDirectory, @@ -234,7 +234,7 @@ int RunScript(string shellPath, IScriptLogWriter writer, CancellationToken cance LogScriptOutputTo(writer, ProcessOutputSource.StdOut), LogScriptOutputTo(writer, ProcessOutputSource.StdErr), environmentVariables, - cancellationToken); + cancel: cancellationToken); return exitCode; } From 085a0fb35cf54adbed0955b55db59e70d1ee0798 Mon Sep 17 00:00:00 2001 From: Jim Pelletier Date: Mon, 25 May 2026 12:23:21 +1000 Subject: [PATCH 05/52] Update sync-boundary callers to block on ExecuteCommandAsync MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The six immediate sync callers of 
SilentProcessRunner now go through ExecuteCommandAsync(...).GetAwaiter().GetResult(): - Octopus.Manager.Tentacle PowerShellPrerequisite (WPF installer) - KubernetesDirectoryInformationProvider (IMemoryCache factory) - SystemCtlHelper (2 sites — start and sudo retry) - LinuxServiceConfigurator (3 sites — chmod, systemctl probe, sudo probe) - WindowsServiceConfigurator (sc.exe wrapper) Each site gets a comment explaining why it must be sync and why blocking on a thread-pool worker is deadlock-safe. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../PreReq/PowerShellPrerequisite.cs | 14 ++++++++++++-- .../KubernetesDirectoryInformationProvider.cs | 10 +++++++++- .../Startup/LinuxServiceConfigurator.cs | 18 +++++++++++++++--- .../Startup/WindowsServiceConfigurator.cs | 10 ++++++++-- .../Octopus.Tentacle/Util/SystemCtlHelper.cs | 12 ++++++++++-- 5 files changed, 54 insertions(+), 10 deletions(-) diff --git a/source/Octopus.Manager.Tentacle/PreReq/PowerShellPrerequisite.cs b/source/Octopus.Manager.Tentacle/PreReq/PowerShellPrerequisite.cs index 92c3768f1..908fa6bc2 100644 --- a/source/Octopus.Manager.Tentacle/PreReq/PowerShellPrerequisite.cs +++ b/source/Octopus.Manager.Tentacle/PreReq/PowerShellPrerequisite.cs @@ -1,5 +1,6 @@ using System; using System.IO; +using System.Threading; using Octopus.Tentacle.Diagnostics; using Octopus.Tentacle.Util; @@ -33,12 +34,21 @@ static bool CheckPowerShellIsInstalled(out string commandLineOutput) // to detect 3.0, it failed to detect 4. Going the direct route: try { - SilentProcessRunnerExtended.ExecuteCommand( + // We're in the WPF installer prerequisite check. IPrerequisite.Check() must return + // synchronously — there's no async version of the interface — so we block on the async + // call with .GetAwaiter().GetResult(). + // This is safe because we're on a plain thread-pool worker. 
The risk with blocking on + // async is a deadlock: if the async work needs to resume on the same thread that's + // blocked waiting for it, neither can make progress. Thread-pool workers don't have + // that constraint — when the async work finishes it can pick up on any free thread, + // not specifically this one, so the block resolves normally. + SilentProcessRunnerExtended.ExecuteCommandAsync( powerShellExe, arguments, ".", stdOut.WriteLine, - s => stdErr.WriteLine($"ERR: {s}")); + s => stdErr.WriteLine($"ERR: {s}"), + cancel: CancellationToken.None).GetAwaiter().GetResult(); var outputText = stdOut.ToString(); new SystemLog().Verbose("PowerShell prerequisite check output: " + outputText); diff --git a/source/Octopus.Tentacle/Kubernetes/KubernetesDirectoryInformationProvider.cs b/source/Octopus.Tentacle/Kubernetes/KubernetesDirectoryInformationProvider.cs index 9154bdfff..e12f69d8f 100644 --- a/source/Octopus.Tentacle/Kubernetes/KubernetesDirectoryInformationProvider.cs +++ b/source/Octopus.Tentacle/Kubernetes/KubernetesDirectoryInformationProvider.cs @@ -55,7 +55,15 @@ public KubernetesDirectoryInformationProvider(ISystemLog log, ISilentProcessRunn { var stdOut = new List(); var stdErr = new List(); - var exitCode = silentProcessRunner.ExecuteCommand("du", $"-s -B 1 {directoryPath}", "/", stdOut.Add, stdErr.Add); + // Sync boundary: called from IMemoryCache.GetOrCreate factory which is synchronous. + // We block on the async ExecuteCommandAsync with .GetAwaiter().GetResult(). + // This is safe because we're on a plain thread-pool worker. The risk with blocking on + // async is a deadlock: if the async work needs to resume on the same thread that's + // blocked waiting for it, neither can make progress. Thread-pool workers don't have + // that constraint — when the async work finishes it can pick up on any free thread, + // not specifically this one, so the block resolves normally. 
+ var exitCode = silentProcessRunner.ExecuteCommandAsync("du", $"-s -B 1 {directoryPath}", "/", stdOut.Add, stdErr.Add) + .GetAwaiter().GetResult(); if (exitCode != 0) { diff --git a/source/Octopus.Tentacle/Startup/LinuxServiceConfigurator.cs b/source/Octopus.Tentacle/Startup/LinuxServiceConfigurator.cs index f57809cd0..ecbfd394c 100644 --- a/source/Octopus.Tentacle/Startup/LinuxServiceConfigurator.cs +++ b/source/Octopus.Tentacle/Startup/LinuxServiceConfigurator.cs @@ -194,7 +194,11 @@ void WriteUnitFile(string path, string contents) File.WriteAllText(path, contents); var commandLineInvocation = new CommandLineInvocation("/bin/bash", $"-c \"chmod 644 {path}\""); - var result = commandLineInvocation.ExecuteCommand(); + // Sync boundary: WriteUnitFile is called from IServiceConfigurator.ConfigureService + // implementations, which are themselves called from the Tentacle service-management + // CLI on a threadpool worker with no sync context. GetAwaiter().GetResult() is + // deadlock-safe here. + var result = commandLineInvocation.ExecuteCommandAsync().GetAwaiter().GetResult(); if (result.ExitCode == 0) return; @@ -219,14 +223,22 @@ void CheckSystemPrerequisites() bool IsSystemdInstalled() { var commandLineInvocation = new CommandLineInvocation("/bin/bash", "-c \"command -v systemctl >/dev/null\""); - var result = commandLineInvocation.ExecuteCommand(); + // Sync boundary: IsSystemdInstalled is called from CheckSystemPrerequisites, + // which is called from IServiceConfigurator.ConfigureService, which is itself + // called from the Tentacle service-management CLI on a threadpool worker with + // no sync context. GetAwaiter().GetResult() is deadlock-safe here. 
+ var result = commandLineInvocation.ExecuteCommandAsync().GetAwaiter().GetResult(); return result.ExitCode == 0; } bool HaveSudoPrivileges() { var commandLineInvocation = new CommandLineInvocation("/bin/bash", "-c \"sudo -vn 2> /dev/null\""); - var result = commandLineInvocation.ExecuteCommand(); + // Sync boundary: HaveSudoPrivileges is called from CheckSystemPrerequisites, + // which is called from IServiceConfigurator.ConfigureService, which is itself + // called from the Tentacle service-management CLI on a threadpool worker with + // no sync context. GetAwaiter().GetResult() is deadlock-safe here. + var result = commandLineInvocation.ExecuteCommandAsync().GetAwaiter().GetResult(); return result.ExitCode == 0; } diff --git a/source/Octopus.Tentacle/Startup/WindowsServiceConfigurator.cs b/source/Octopus.Tentacle/Startup/WindowsServiceConfigurator.cs index 1fc5dd0eb..b46f74d3a 100644 --- a/source/Octopus.Tentacle/Startup/WindowsServiceConfigurator.cs +++ b/source/Octopus.Tentacle/Startup/WindowsServiceConfigurator.cs @@ -342,11 +342,17 @@ void Sc(string arguments) var sc = Path.Combine(system32, "sc.exe"); logFileOnlyLogger.Info($"Executing sc.exe {argumentsToLog}"); - var exitCode = SilentProcessRunnerExtended.ExecuteCommand(sc, + // Sync boundary: Sc() is called from IServiceConfigurator.ConfigureService + // implementations, which are themselves called from the Tentacle + // service-management CLI on Windows (no sync context, threadpool worker). + // GetAwaiter().GetResult() is deadlock-safe here. 
+ var exitCode = SilentProcessRunnerExtended.ExecuteCommandAsync(sc, arguments, Environment.CurrentDirectory, output => outputBuilder.AppendLine(output), - error => outputBuilder.AppendLine("Error: " + error)); + error => outputBuilder.AppendLine("Error: " + error), + cancel: CancellationToken.None) + .GetAwaiter().GetResult(); if (exitCode == 0) logFileOnlyLogger.Info(outputBuilder.ToString()); else diff --git a/source/Octopus.Tentacle/Util/SystemCtlHelper.cs b/source/Octopus.Tentacle/Util/SystemCtlHelper.cs index 8ff4fd632..8da121fa5 100644 --- a/source/Octopus.Tentacle/Util/SystemCtlHelper.cs +++ b/source/Octopus.Tentacle/Util/SystemCtlHelper.cs @@ -32,7 +32,11 @@ bool RunServiceCommand(string command, string serviceName, bool logFailureAsErro { // Try without sudo first var commandLineInvocation = new CommandLineInvocation("/bin/bash", $"-c \"systemctl {command} {serviceName}\""); - var result = commandLineInvocation.ExecuteCommand(); + // Sync boundary: RunServiceCommand is called from synchronous service-management + // helpers (StartService, RestartService, etc.), which are themselves called from + // the Tentacle service-management CLI on a threadpool worker with no sync context. + // GetAwaiter().GetResult() is deadlock-safe here. + var result = commandLineInvocation.ExecuteCommandAsync().GetAwaiter().GetResult(); if (result.ExitCode == 0) return true; // Check if failure is due to authentication/permission issues @@ -48,7 +52,11 @@ bool RunServiceCommand(string command, string serviceName, bool logFailureAsErro { log.Info($"Permission denied. 
Retrying 'systemctl {command} {serviceName}' with sudo..."); var sudoInvocation = new CommandLineInvocation("/bin/bash", $"-c \"sudo systemctl {command} {serviceName}\""); - result = sudoInvocation.ExecuteCommand(); + // Sync boundary: RunServiceCommand is called from synchronous service-management + // helpers (StartService, RestartService, etc.), which are themselves called from + // the Tentacle service-management CLI on a threadpool worker with no sync context. + // GetAwaiter().GetResult() is deadlock-safe here. + result = sudoInvocation.ExecuteCommandAsync().GetAwaiter().GetResult(); if (result.ExitCode == 0) return true; usedSudo = true; From 6db9733810e54a70432e745ca07adadc1f03fe60 Mon Sep 17 00:00:00 2001 From: Jim Pelletier Date: Mon, 25 May 2026 12:27:11 +1000 Subject: [PATCH 06/52] Migrate test scaffolding callers to ExecuteCommandAsync Updates Kubernetes integration test setup helpers, PowerShell startup-detection tests, integration support, and Linux test fixtures to call the new async ExecuteCommandAsync API. Tests that don't await directly (NUnit static helpers, cache factories) block on .GetAwaiter().GetResult() and document why it's deadlock-safe on the test threadpool. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../KubernetesClusterOneTimeSetUp.cs | 2 +- .../KubernetesClientCompatibilityTests.cs | 2 +- .../Setup/DockerImageLoader.cs | 20 ++--- .../Setup/KubernetesAgentInstaller.cs | 14 +-- .../Setup/KubernetesClusterInstaller.cs | 18 ++-- .../Setup/SetupHelpers.cs | 6 +- .../Setup/Tooling/HelmDownloader.cs | 10 +-- .../Setup/Tooling/ToolDownloader.cs | 10 +-- .../Tooling/KubeCtlTool.cs | 8 +- .../PowerShellStartupDetectionTests.cs | 16 ++-- .../LinuxConfigureServiceHelperFixture.cs | 15 ++-- .../TentacleFetchers/LinuxTentacleFetcher.cs | 6 +- .../Util/LinuxTestUserPrincipal.cs | 3 +- .../Util/SilentProcessRunnerFixture.cs | 7 +- ...etesDirectoryInformationProviderFixture.cs | 88 ++++++++++--------- .../Util/LinuxTestUserPrincipal.cs | 2 +- 16 files changed, 126 insertions(+), 101 deletions(-) diff --git a/source/Octopus.Tentacle.Kubernetes.Tests.Integration/KubernetesAgent/KubernetesClusterOneTimeSetUp.cs b/source/Octopus.Tentacle.Kubernetes.Tests.Integration/KubernetesAgent/KubernetesClusterOneTimeSetUp.cs index 3854e7c53..4c46902c5 100644 --- a/source/Octopus.Tentacle.Kubernetes.Tests.Integration/KubernetesAgent/KubernetesClusterOneTimeSetUp.cs +++ b/source/Octopus.Tentacle.Kubernetes.Tests.Integration/KubernetesAgent/KubernetesClusterOneTimeSetUp.cs @@ -17,7 +17,7 @@ public async Task OneTimeSetUp() installer = new KubernetesClusterInstaller(KubernetesTestsGlobalContext.Instance.TemporaryDirectory, kindExePath, helmExePath, kubeCtlPath, KubernetesTestsGlobalContext.Instance.Logger); await installer.InstallLatestSupported(); - KubernetesTestsGlobalContext.Instance.TentacleImageAndTag = SetupHelpers.GetTentacleImageAndTag(kindExePath, installer); + KubernetesTestsGlobalContext.Instance.TentacleImageAndTag = await SetupHelpers.GetTentacleImageAndTag(kindExePath, installer); KubernetesTestsGlobalContext.Instance.SetToolExePaths(helmExePath, kubeCtlPath); KubernetesTestsGlobalContext.Instance.KubeConfigPath = 
installer.KubeConfigPath; } diff --git a/source/Octopus.Tentacle.Kubernetes.Tests.Integration/KubernetesClientCompatibilityTests.cs b/source/Octopus.Tentacle.Kubernetes.Tests.Integration/KubernetesClientCompatibilityTests.cs index a0c6ae878..5d060cb32 100644 --- a/source/Octopus.Tentacle.Kubernetes.Tests.Integration/KubernetesClientCompatibilityTests.cs +++ b/source/Octopus.Tentacle.Kubernetes.Tests.Integration/KubernetesClientCompatibilityTests.cs @@ -165,7 +165,7 @@ async Task SetupCluster(ClusterVersion clusterVersion) clusterInstaller = new KubernetesClusterInstaller(testContext.TemporaryDirectory, kindExePath, helmExePath, kubeCtlPath, testContext.Logger); await clusterInstaller.Install(clusterVersion); - testContext.TentacleImageAndTag = SetupHelpers.GetTentacleImageAndTag(kindExePath, clusterInstaller); + testContext.TentacleImageAndTag = await SetupHelpers.GetTentacleImageAndTag(kindExePath, clusterInstaller); testContext.SetToolExePaths(helmExePath, kubeCtlPath); testContext.KubeConfigPath = clusterInstaller.KubeConfigPath; } diff --git a/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/DockerImageLoader.cs b/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/DockerImageLoader.cs index cf28823aa..97972eba8 100644 --- a/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/DockerImageLoader.cs +++ b/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/DockerImageLoader.cs @@ -18,16 +18,16 @@ public DockerImageLoader(TemporaryDirectory temporaryDirectory, ILogger logger, this.kindExePath = kindExePath; } - public string? LoadMostRecentImageIntoKind(string clusterName) + public async Task LoadMostRecentImageIntoKind(string clusterName) { - var mostRecentTag = FindMostRecentTag(); + var mostRecentTag = await FindMostRecentTag(); return !string.IsNullOrWhiteSpace(mostRecentTag) - ? LoadImageIntoKind(mostRecentTag, clusterName) + ? await LoadImageIntoKind(mostRecentTag, clusterName) : null; } - string? 
FindMostRecentTag() + async Task FindMostRecentTag() { var sb = new StringBuilder(); var tags = new List(); @@ -36,7 +36,7 @@ public DockerImageLoader(TemporaryDirectory temporaryDirectory, ILogger logger, .WriteTo.StringBuilder(sb) .CreateLogger(); - var exitCode = SilentProcessRunner.ExecuteCommand( + var exitCode = await SilentProcessRunner.ExecuteCommandAsync( "docker", "images octopusdeploy/kubernetes-agent-tentacle --format \"{{.Tag}}\"", temporaryDirectory.DirectoryPath, @@ -47,7 +47,7 @@ public DockerImageLoader(TemporaryDirectory temporaryDirectory, ILogger logger, tags.Add(line); }, sprLogger.Error, - CancellationToken.None + cancel: CancellationToken.None ); if (exitCode != 0) @@ -55,11 +55,11 @@ public DockerImageLoader(TemporaryDirectory temporaryDirectory, ILogger logger, logger.Error("Failed to get latest image tag from docker"); throw new InvalidOperationException($"Failed to get latest image tag from docker. Logs: {sb}"); } - + return tags.FirstOrDefault(); } - string LoadImageIntoKind(string mostRecentTag, string clusterName) + async Task LoadImageIntoKind(string mostRecentTag, string clusterName) { var image = $"octopusdeploy/kubernetes-agent-tentacle:{mostRecentTag}"; @@ -69,14 +69,14 @@ string LoadImageIntoKind(string mostRecentTag, string clusterName) .WriteTo.StringBuilder(sb) .CreateLogger(); - var exitCode = SilentProcessRunner.ExecuteCommand( + var exitCode = await SilentProcessRunner.ExecuteCommandAsync( kindExePath, $"load docker-image {image} --name={clusterName}", temporaryDirectory.DirectoryPath, sprLogger.Debug, sprLogger.Information, sprLogger.Error, - CancellationToken.None + cancel: CancellationToken.None ); if (exitCode != 0) diff --git a/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/KubernetesAgentInstaller.cs b/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/KubernetesAgentInstaller.cs index 79de31e5f..1083b54d8 100644 --- 
a/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/KubernetesAgentInstaller.cs +++ b/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/KubernetesAgentInstaller.cs @@ -56,14 +56,14 @@ public async Task InstallAgent(int listeningPort, string? tentacleImageA .MinimumLevel.Debug() .CreateLogger(); - var exitCode = SilentProcessRunner.ExecuteCommand( + var exitCode = await SilentProcessRunner.ExecuteCommandAsync( helmExePath, arguments, temporaryDirectory.DirectoryPath, sprLogger.Debug, sprLogger.Information, sprLogger.Error, - CancellationToken.None); + cancel: CancellationToken.None); sw.Stop(); @@ -169,7 +169,7 @@ async Task GetAgentThumbprint() .MinimumLevel.Debug() .CreateLogger(); - var exitCode = SilentProcessRunner.ExecuteCommand( + var exitCode = await SilentProcessRunner.ExecuteCommandAsync( kubeCtlExePath, //get the generated thumbprint from the config map $"get cm tentacle-config --namespace {Namespace} --kubeconfig=\"{kubeConfigPath}\" -o \"jsonpath={{.data['Tentacle\\.CertificateThumbprint']}}\"", @@ -181,7 +181,7 @@ async Task GetAgentThumbprint() thumbprint = x; }, sprLogger.Error, - CancellationToken.None); + cancel: CancellationToken.None); if (exitCode != 0) { @@ -219,14 +219,16 @@ public void Dispose() NamespaceFlag, AgentName); - var exitCode = SilentProcessRunner.ExecuteCommand( + // Dispose() cannot be made async; .GetAwaiter().GetResult() is safe here + // because this runs in test teardown (not inside an async context with a sync-blocking SynchronizationContext). 
+ var exitCode = SilentProcessRunner.ExecuteCommandAsync( helmExePath, uninstallArgs, temporaryDirectory.DirectoryPath, logger.Debug, logger.Information, logger.Error, - CancellationToken.None); + cancel: CancellationToken.None).GetAwaiter().GetResult(); if (exitCode != 0) { diff --git a/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/KubernetesClusterInstaller.cs b/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/KubernetesClusterInstaller.cs index 31b196718..ce8812ffd 100644 --- a/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/KubernetesClusterInstaller.cs +++ b/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/KubernetesClusterInstaller.cs @@ -53,7 +53,7 @@ async Task InstallCluster(ClusterVersion clusterVersion) var sw = new Stopwatch(); sw.Restart(); - var exitCode = SilentProcessRunner.ExecuteCommand( + var exitCode = await SilentProcessRunner.ExecuteCommandAsync( kindExePath, //we give the cluster a unique name $"create cluster --name={clusterName} --config=\"{configFilePath}\" --kubeconfig=\"{kubeConfigName}\"", @@ -61,7 +61,7 @@ async Task InstallCluster(ClusterVersion clusterVersion) logger.Debug, logger.Information, logger.Error, - CancellationToken.None); + cancel: CancellationToken.None); sw.Stop(); @@ -92,7 +92,7 @@ async Task SetLocalhostRouting() .WriteTo.StringBuilder(sb) .CreateLogger(); - var exitCode = SilentProcessRunner.ExecuteCommand( + var exitCode = await SilentProcessRunner.ExecuteCommandAsync( kubeCtlPath, //we give the cluster a unique name $"apply -n default -f \"{manifestFilePath}\" --kubeconfig=\"{KubeConfigPath}\"", @@ -100,7 +100,7 @@ async Task SetLocalhostRouting() sprLogger.Debug, sprLogger.Information, sprLogger.Error, - CancellationToken.None); + cancel: CancellationToken.None); if (exitCode != 0) { @@ -143,14 +143,14 @@ async Task InstallNfsCsiDriver() .WriteTo.StringBuilder(sb) .CreateLogger(); - var exitCode = SilentProcessRunner.ExecuteCommand( + var exitCode = await 
SilentProcessRunner.ExecuteCommandAsync( helmExePath, installArgs, tempDir.DirectoryPath, sprLogger.Debug, sprLogger.Information, sprLogger.Error, - CancellationToken.None); + cancel: CancellationToken.None); if (exitCode != 0) { @@ -174,7 +174,9 @@ string BuildNfsCsiDriverInstallArguments() public void Dispose() { - var exitCode = SilentProcessRunner.ExecuteCommand( + // Dispose() cannot be made async; .GetAwaiter().GetResult() is safe here + // because this runs in test teardown (not inside an async context with a sync-blocking SynchronizationContext). + var exitCode = SilentProcessRunner.ExecuteCommandAsync( kindExePath, //delete the cluster for this test run $"delete cluster --name={clusterName}", @@ -182,7 +184,7 @@ public void Dispose() logger.Debug, logger.Information, logger.Error, - CancellationToken.None); + cancel: CancellationToken.None).GetAwaiter().GetResult(); if (exitCode != 0) { diff --git a/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/SetupHelpers.cs b/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/SetupHelpers.cs index f5ed3dca0..5e05de365 100644 --- a/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/SetupHelpers.cs +++ b/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/SetupHelpers.cs @@ -46,7 +46,7 @@ public static TentacleClient BuildTentacleClient(Uri uri, string? thumbprint, Ha builder.Build()); } - public static string? GetTentacleImageAndTag(string kindExePath, KubernetesClusterInstaller clusterInstaller) + public static async Task GetTentacleImageAndTag(string kindExePath, KubernetesClusterInstaller clusterInstaller) { if (clusterInstaller == null) { @@ -68,9 +68,9 @@ public static TentacleClient BuildTentacleClient(Uri uri, string? 
thumbprint, Ha { //if we should use the latest locally build image, load the tag from docker and load it into kind var imageLoader = new DockerImageLoader(KubernetesTestsGlobalContext.Instance.TemporaryDirectory, KubernetesTestsGlobalContext.Instance.Logger, kindExePath); - imageAndTag = imageLoader.LoadMostRecentImageIntoKind(clusterInstaller.ClusterName); + imageAndTag = await imageLoader.LoadMostRecentImageIntoKind(clusterInstaller.ClusterName); } - + if(imageAndTag is not null) KubernetesTestsGlobalContext.Instance.Logger.Information("Using tentacle image: {ImageAndTag}", imageAndTag); diff --git a/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/Tooling/HelmDownloader.cs b/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/Tooling/HelmDownloader.cs index f28857be9..6ea1d2736 100644 --- a/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/Tooling/HelmDownloader.cs +++ b/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/Tooling/HelmDownloader.cs @@ -27,7 +27,7 @@ protected override string BuildDownloadUrl(Architecture processArchitecture, Ope static string GetArchitectureLabel(Architecture processArchitecture) => processArchitecture == Architecture.Arm64 ? 
"arm64" : "amd64"; - protected override string PostDownload(string targetDirectory, string downloadFilePath, Architecture processArchitecture, OperatingSystem operatingSystem) + protected override async Task PostDownload(string targetDirectory, string downloadFilePath, Architecture processArchitecture, OperatingSystem operatingSystem) { var architecture = GetArchitectureLabel(processArchitecture); var osName = GetOsName(operatingSystem); @@ -43,7 +43,7 @@ protected override string PostDownload(string targetDirectory, string downloadFi else { //everything else is tar.gz - ExtractTarGzip(downloadFilePath, extractionDir); + await ExtractTarGzip(downloadFilePath, extractionDir); } //move the extracted helm executable to the root target directory @@ -66,7 +66,7 @@ static string GetOsName(OperatingSystem operatingSystem) _ => throw new ArgumentOutOfRangeException(nameof(operatingSystem), operatingSystem, null) }; - void ExtractTarGzip(string gzArchiveName, string destFolder) + async Task ExtractTarGzip(string gzArchiveName, string destFolder) { if (!Directory.Exists(destFolder)) { @@ -79,14 +79,14 @@ void ExtractTarGzip(string gzArchiveName, string destFolder) // Falling back to good old fashioned `tar` does the job nicely :) using var tmp = new TemporaryDirectory(); - var exitCode = SilentProcessRunner.ExecuteCommand( + var exitCode = await SilentProcessRunner.ExecuteCommandAsync( "tar", $"xzvf \"{gzArchiveName}\" -C \"{destFolder}\"", tmp.DirectoryPath, Logger.Debug, Logger.Information, Logger.Error, - CancellationToken.None); + cancel: CancellationToken.None); if (exitCode != 0) { diff --git a/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/Tooling/ToolDownloader.cs b/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/Tooling/ToolDownloader.cs index f89ae9037..6e586a355 100644 --- a/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/Tooling/ToolDownloader.cs +++ 
b/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/Tooling/ToolDownloader.cs @@ -37,19 +37,19 @@ public async Task Download(string targetDirectory, CancellationToken can Logger.Information("Downloading {DownloadUrl} to {DownloadFilePath}", downloadUrl, downloadFilePath); await OctopusPackageDownloader.DownloadPackage(downloadUrl, downloadFilePath, Logger, cancellationToken); - downloadFilePath = PostDownload(targetDirectory, downloadFilePath, RuntimeInformation.ProcessArchitecture, os); + downloadFilePath = await PostDownload(targetDirectory, downloadFilePath, RuntimeInformation.ProcessArchitecture, os); //if this is not running on windows, chmod the tool to be executable if (os is not OperatingSystem.Windows) { - var exitCode = SilentProcessRunner.ExecuteCommand( + var exitCode = await SilentProcessRunner.ExecuteCommandAsync( "chmod", $"+x \"{downloadFilePath}\"", targetDirectory, Logger.Debug, Logger.Information, Logger.Error, - CancellationToken.None); + cancel: CancellationToken.None); if (exitCode != 0) { @@ -62,12 +62,12 @@ public async Task Download(string targetDirectory, CancellationToken can protected abstract string BuildDownloadUrl(Architecture processArchitecture, OperatingSystem operatingSystem); - protected virtual string PostDownload(string downloadDirectory, string downloadFilePath, Architecture processArchitecture, OperatingSystem operatingSystem) + protected virtual Task PostDownload(string downloadDirectory, string downloadFilePath, Architecture processArchitecture, OperatingSystem operatingSystem) { var targetFilename = Path.Combine(downloadDirectory, ExecutableName); File.Move(downloadFilePath, targetFilename); - return targetFilename; + return Task.FromResult(targetFilename); } static OperatingSystem GetOperationSystem() diff --git a/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Tooling/KubeCtlTool.cs b/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Tooling/KubeCtlTool.cs index cae324e54..0b3354159 100644 --- 
a/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Tooling/KubeCtlTool.cs +++ b/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Tooling/KubeCtlTool.cs @@ -25,10 +25,10 @@ public KubeCtlTool(TemporaryDirectory temporaryDirectory, string kubeCtlExePath, public Task ExecuteNamespacedCommand(string command, CancellationToken cancellationToken = default) { - return Task.Run(() => ExecuteCommand($"{command} --namespace {ns}", cancellationToken), cancellationToken); + return ExecuteCommand($"{command} --namespace {ns}", cancellationToken); } - KubeCtlCommandResult ExecuteCommand(string command, CancellationToken cancellationToken = default) + async Task ExecuteCommand(string command, CancellationToken cancellationToken = default) { var sb = new StringBuilder(); var sprLogger = new LoggerConfiguration() @@ -40,7 +40,7 @@ KubeCtlCommandResult ExecuteCommand(string command, CancellationToken cancellati var stdOut = new List(); var stdErr = new List(); - var exitCode = SilentProcessRunner.ExecuteCommand( + var exitCode = await SilentProcessRunner.ExecuteCommandAsync( kubeCtlExePath, $"{command} --kubeconfig=\"{kubeConfigPath}\"", temporaryDirectory.DirectoryPath, @@ -55,7 +55,7 @@ KubeCtlCommandResult ExecuteCommand(string command, CancellationToken cancellati sprLogger.Error(y); stdErr.Add(y); }, - cancellationToken); + cancel: cancellationToken); return new (exitCode, stdOut, stdErr); } diff --git a/source/Octopus.Tentacle.Tests.Integration/PowerShellStartupDetectionTests.cs b/source/Octopus.Tentacle.Tests.Integration/PowerShellStartupDetectionTests.cs index 814523b42..4d11a567a 100644 --- a/source/Octopus.Tentacle.Tests.Integration/PowerShellStartupDetectionTests.cs +++ b/source/Octopus.Tentacle.Tests.Integration/PowerShellStartupDetectionTests.cs @@ -271,14 +271,14 @@ public async Task WhenPowerShellNeverStarts_WeShouldDetectTheScriptDidNotStart_A var args = shell.FormatCommandArguments(workspace.BootstrapScriptFilePath, null, allowInteractive: false); var 
directOutput = new List(); - var directExitCode = SilentProcessRunner.ExecuteCommand( + var directExitCode = await SilentProcessRunner.ExecuteCommandAsync( shell.GetFullPath(), args, workspace.WorkingDirectory, _ => { }, line => directOutput.Add(line), line => directOutput.Add(line), - CancellationToken.None); + cancel: CancellationToken.None); var directOutputText = string.Join("\n", directOutput); Logger.Information("Direct invocation output:\n{Output}", directOutputText); @@ -338,14 +338,14 @@ public async Task WhenPowerShellNeverStarts_AndWorkspaceIsDeletedBeforeScriptRun var args = shell.FormatCommandArguments(bootstrapScriptFilePath, null, allowInteractive: false); var directOutput = new List(); - var directExitCode = SilentProcessRunner.ExecuteCommand( + var directExitCode = await SilentProcessRunner.ExecuteCommandAsync( shell.GetFullPath(), args, workspace.WorkingDirectory, _ => { }, line => directOutput.Add(line), line => directOutput.Add(line), - CancellationToken.None); + cancel: CancellationToken.None); var directOutputText = string.Join("\n", directOutput); Logger.Information("Direct invocation output:\n{Output}", directOutputText); @@ -370,15 +370,17 @@ static IShell GetShellForCurrentPlatform() // First check if pwsh is available try { - var result = SilentProcessRunner.ExecuteCommand( + var result = SilentProcessRunner.ExecuteCommandAsync( "which", "pwsh", Environment.CurrentDirectory, _ => { }, _ => { }, _ => { }, - new Dictionary(), - CancellationToken.None); + customEnvironmentVariables: new Dictionary(), + cancel: CancellationToken.None) + // Safe: static helper, no synchronisation context. 
+ .GetAwaiter().GetResult(); if (result == 0) { diff --git a/source/Octopus.Tentacle.Tests.Integration/Startup/LinuxConfigureServiceHelperFixture.cs b/source/Octopus.Tentacle.Tests.Integration/Startup/LinuxConfigureServiceHelperFixture.cs index bb8e45aef..1010fc9cc 100644 --- a/source/Octopus.Tentacle.Tests.Integration/Startup/LinuxConfigureServiceHelperFixture.cs +++ b/source/Octopus.Tentacle.Tests.Integration/Startup/LinuxConfigureServiceHelperFixture.cs @@ -47,7 +47,8 @@ public void CannotWriteToServiceFileAsUser() WriteUnixFile(scriptPath); var chmodCmd = new CommandLineInvocation("/bin/bash", $"-c \"chmod 777 {scriptPath}\""); - chmodCmd.ExecuteCommand(); + // Safe: sync test helper, no synchronisation context. + chmodCmd.ExecuteCommandAsync().GetAwaiter().GetResult(); var configureServiceHelper = new LinuxServiceConfigurator(log); @@ -66,7 +67,8 @@ public void CannotWriteToServiceFileAsUser() serviceConfigurationState); var statCmd = new CommandLineInvocation("/bin/bash", $"-c \"stat -c '%A' /etc/systemd/system/{instance}.service\""); - var result = statCmd.ExecuteCommand(); + // Safe: sync test helper, no synchronisation context. + var result = statCmd.ExecuteCommandAsync().GetAwaiter().GetResult(); result.Infos.Single().Should().Be("-rw-r--r--"); // Service file should only be writeable for the root user } @@ -81,7 +83,8 @@ void CanInstallService(string username, string password) WriteUnixFile(scriptPath); var commandLineInvocation = new CommandLineInvocation("/bin/bash", $"-c \"chmod 777 {scriptPath}\""); - commandLineInvocation.ExecuteCommand(); + // Safe: sync test helper, no synchronisation context. 
+ commandLineInvocation.ExecuteCommandAsync().GetAwaiter().GetResult(); var configureServiceHelper = new LinuxServiceConfigurator(log); @@ -151,7 +154,8 @@ void WriteUnixFile(string path) Dictionary GetServiceStatus(string serviceName) { var commandLineInvocation = new CommandLineInvocation("/bin/bash", $"-c \"systemctl show {serviceName}\""); - var result = commandLineInvocation.ExecuteCommand(); + // Safe: sync test helper, no synchronisation context. + var result = commandLineInvocation.ExecuteCommandAsync().GetAwaiter().GetResult(); Console.WriteLine($"Status of service {serviceName}"); foreach (var info in result.Infos) Console.WriteLine(info); @@ -181,7 +185,8 @@ bool DoesServiceUnitFileExist(string serviceName) CmdResult RunBashCommand(string command) { var commandLineInvocation = new CommandLineInvocation("/bin/bash", $"-c \"{command}\""); - return commandLineInvocation.ExecuteCommand(); + // Safe: sync test helper, no synchronisation context. + return commandLineInvocation.ExecuteCommandAsync().GetAwaiter().GetResult(); } } } diff --git a/source/Octopus.Tentacle.Tests.Integration/Support/TentacleFetchers/LinuxTentacleFetcher.cs b/source/Octopus.Tentacle.Tests.Integration/Support/TentacleFetchers/LinuxTentacleFetcher.cs index b6613ff5b..5e50f6cc6 100644 --- a/source/Octopus.Tentacle.Tests.Integration/Support/TentacleFetchers/LinuxTentacleFetcher.cs +++ b/source/Octopus.Tentacle.Tests.Integration/Support/TentacleFetchers/LinuxTentacleFetcher.cs @@ -62,14 +62,16 @@ public static void ExtractTarGzip(string gzArchiveName, string destFolder, ILogg using var tmp = new TemporaryDirectory(); Action log = s => logger.Information(s); - var exitCode = SilentProcessRunner.ExecuteCommand( + var exitCode = SilentProcessRunner.ExecuteCommandAsync( "tar", $"xzvf \"{gzArchiveName}\" -C \"{destFolder}\"", tmp.DirectoryPath, log, log, log, - CancellationToken.None); + cancel: CancellationToken.None) + // Safe: static void helper, no synchronisation context. 
+ .GetAwaiter().GetResult(); if (exitCode != 0) { diff --git a/source/Octopus.Tentacle.Tests.Integration/Util/LinuxTestUserPrincipal.cs b/source/Octopus.Tentacle.Tests.Integration/Util/LinuxTestUserPrincipal.cs index d3f738261..74f295f1a 100644 --- a/source/Octopus.Tentacle.Tests.Integration/Util/LinuxTestUserPrincipal.cs +++ b/source/Octopus.Tentacle.Tests.Integration/Util/LinuxTestUserPrincipal.cs @@ -22,7 +22,8 @@ public LinuxTestUserPrincipal(string username) static void RunCommand(string arguments, bool failOnNonZeroExitCode = true) { var commandLineInvocation = new CommandLineInvocation("/bin/bash", arguments); - var result = commandLineInvocation.ExecuteCommand(); + // Safe: constructor-time helper, no synchronisation context. + var result = commandLineInvocation.ExecuteCommandAsync().GetAwaiter().GetResult(); foreach (var line in result.Errors) Console.WriteLine(line); diff --git a/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs b/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs index da9247fd6..c91298d24 100644 --- a/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs +++ b/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs @@ -406,6 +406,8 @@ static string EchoEnvironmentVariable(string varName) return $"${varName}"; } + // Sync-over-async is safe here: NUnit runs tests on a plain ThreadPool thread with no + // synchronisation context, so there is no risk of deadlock. 
static int Execute( string command, string arguments, @@ -418,7 +420,8 @@ static int Execute( var debug = new StringBuilder(); var info = new StringBuilder(); var error = new StringBuilder(); - var exitCode = SilentProcessRunner.ExecuteCommand( + + var exitCode = SilentProcessRunner.ExecuteCommandAsync( command, arguments, workingDirectory, @@ -437,7 +440,7 @@ static int Execute( Console.WriteLine($"{DateTime.UtcNow} ERR: {x}"); error.Append(x); }, - cancel); + cancel: cancel).GetAwaiter().GetResult(); debugMessages = debug; infoMessages = info; diff --git a/source/Octopus.Tentacle.Tests/Kubernetes/KubernetesDirectoryInformationProviderFixture.cs b/source/Octopus.Tentacle.Tests/Kubernetes/KubernetesDirectoryInformationProviderFixture.cs index 60799927e..b4b7e337e 100644 --- a/source/Octopus.Tentacle.Tests/Kubernetes/KubernetesDirectoryInformationProviderFixture.cs +++ b/source/Octopus.Tentacle.Tests/Kubernetes/KubernetesDirectoryInformationProviderFixture.cs @@ -1,6 +1,7 @@ using System; using System.Collections.Generic; using System.Threading; +using System.Threading.Tasks; using FluentAssertions; using Microsoft.Extensions.Caching.Memory; using Microsoft.Extensions.Internal; @@ -24,27 +25,29 @@ public void DuOutputParses() { const ulong usedSize = 500 * Megabyte; var spr = Substitute.For(); - spr.When(x => x.ExecuteCommand("du", "-s -B 1 /octopus", "/", Arg.Any>(), Arg.Any>())) - .Do(x => + spr.ExecuteCommandAsync("du", "-s -B 1 /octopus", "/", Arg.Any>(), Arg.Any>()) + .Returns(callInfo => { - x.ArgAt>(3).Invoke($"{usedSize}\t/octopus"); + callInfo.ArgAt>(3).Invoke($"{usedSize}\t/octopus"); + return Task.FromResult(0); }); var memoryCache = new MemoryCache(new MemoryCacheOptions()); var sut = new KubernetesDirectoryInformationProvider(Substitute.For(), spr, memoryCache); sut.GetPathUsedBytes("/octopus").Should().Be(usedSize); } - + [Test] public void DuOutputParsesWithMultipleLines() { const ulong usedSize = 500 * Megabyte; var spr = Substitute.For(); - 
spr.When(x => x.ExecuteCommand("du", "-s -B 1 /octopus", "/", Arg.Any>(), Arg.Any>())) - .Do(x => + spr.ExecuteCommandAsync("du", "-s -B 1 /octopus", "/", Arg.Any>(), Arg.Any>()) + .Returns(callInfo => { - x.ArgAt>(3).Invoke($"500\t/octopus/extradir"); - x.ArgAt>(3).Invoke($"{usedSize}\t/octopus"); - x.ArgAt>(3).Invoke($"{usedSize+1000}\tTotal"); + callInfo.ArgAt>(3).Invoke($"500\t/octopus/extradir"); + callInfo.ArgAt>(3).Invoke($"{usedSize}\t/octopus"); + callInfo.ArgAt>(3).Invoke($"{usedSize+1000}\tTotal"); + return Task.FromResult(0); }); var memoryCache = new MemoryCache(new MemoryCacheOptions()); var sut = new KubernetesDirectoryInformationProvider(Substitute.For(), spr, memoryCache); @@ -56,18 +59,18 @@ public void IfDuFailsWeStillGetData() { const ulong usedSize = 500 * Megabyte; var spr = Substitute.For(); - spr.When(x => x.ExecuteCommand("du", "-s -B 1 /octopus", "/", Arg.Any>(), Arg.Any>())) - .Do(x => + spr.ExecuteCommandAsync("du", "-s -B 1 /octopus", "/", Arg.Any>(), Arg.Any>()) + .Returns(callInfo => { - x.ArgAt>(3).Invoke($"500\t/octopus"); - x.ArgAt>(3).Invoke($"{usedSize}\t/octopus"); + callInfo.ArgAt>(3).Invoke($"500\t/octopus"); + callInfo.ArgAt>(3).Invoke($"{usedSize}\t/octopus"); + return Task.FromResult(1); }); - spr.ReturnsForAll(1); var memoryCache = new MemoryCache(new MemoryCacheOptions()); var sut = new KubernetesDirectoryInformationProvider(Substitute.For(), spr, memoryCache); sut.GetPathUsedBytes("/octopus").Should().Be(usedSize); } - + [Test] public void IfDuFailsWeLogCorrectly() { @@ -75,22 +78,22 @@ public void IfDuFailsWeLogCorrectly() var systemLog = new InMemoryLog(); var spr = Substitute.For(); - spr.When(x => x.ExecuteCommand("du", "-s -B 1 /octopus", "/", Arg.Any>(), Arg.Any>())) - .Do(x => + spr.ExecuteCommandAsync("du", "-s -B 1 /octopus", "/", Arg.Any>(), Arg.Any>()) + .Returns(callInfo => { // stdout - x.ArgAt>(3).Invoke("500\t/octopus"); - x.ArgAt>(3).Invoke($"{usedSize}\t/octopus"); - + 
callInfo.ArgAt>(3).Invoke("500\t/octopus"); + callInfo.ArgAt>(3).Invoke($"{usedSize}\t/octopus"); + // stderr - x.ArgAt>(4).Invoke("no permission for foo"); - x.ArgAt>(4).Invoke("also no permission for bar"); + callInfo.ArgAt>(4).Invoke("no permission for foo"); + callInfo.ArgAt>(4).Invoke("also no permission for bar"); + return Task.FromResult(1); }); - spr.ReturnsForAll(1); var memoryCache = new MemoryCache(new MemoryCacheOptions()); var sut = new KubernetesDirectoryInformationProvider(systemLog, spr, memoryCache); sut.GetPathUsedBytes("/octopus").Should().Be(usedSize); - + systemLog.GetLogsForCategory(LogCategory.Warning).Should().Contain("Could not reliably get disk space using du. Getting best approximation..."); systemLog.GetLogsForCategory(LogCategory.Info).Should().Contain($"Du stdout returned 500\t/octopus, {usedSize}\t/octopus"); systemLog.GetLogsForCategory(LogCategory.Info).Should().Contain("Du stderr returned no permission for foo, also no permission for bar"); @@ -100,56 +103,61 @@ public void IfDuFailsWeLogCorrectly() public void IfDuFailsCompletelyReturnNull() { var spr = Substitute.For(); - spr.ReturnsForAll(1); + spr.ExecuteCommandAsync(Arg.Any(), Arg.Any(), Arg.Any(), Arg.Any>(), Arg.Any>()) + .Returns(Task.FromResult(1)); var memoryCache = new MemoryCache(new MemoryCacheOptions()); var sut = new KubernetesDirectoryInformationProvider(Substitute.For(), spr, memoryCache); sut.GetPathUsedBytes("/octopus").Should().Be(null); } - + [Test] public void ReturnedValueShouldBeCached() { var spr = Substitute.For(); - spr.ReturnsForAll(1); + spr.ExecuteCommandAsync(Arg.Any(), Arg.Any(), Arg.Any(), Arg.Any>(), Arg.Any>()) + .Returns(Task.FromResult(1)); var baseTime = DateTimeOffset.UtcNow; var clock = new TestClock(baseTime); var memoryCache = new MemoryCache(new MemoryCacheOptions(){ Clock = clock}); var sut = new KubernetesDirectoryInformationProvider(Substitute.For(), spr, memoryCache); sut.GetPathUsedBytes("/octopus").Should().Be(null); - + const ulong 
usedSize = 500 * Megabyte; - spr.When(x => x.ExecuteCommand("du", "-s -B 1 /octopus", "/", Arg.Any>(), Arg.Any>())) - .Do(x => + spr.ExecuteCommandAsync("du", "-s -B 1 /octopus", "/", Arg.Any>(), Arg.Any>()) + .Returns(callInfo => { - x.ArgAt>(3).Invoke($"123\t/octopus"); - x.ArgAt>(3).Invoke($"{usedSize}\t/octopus"); + callInfo.ArgAt>(3).Invoke($"123\t/octopus"); + callInfo.ArgAt>(3).Invoke($"{usedSize}\t/octopus"); + return Task.FromResult(0); }); clock.UtcNow = baseTime + TimeSpan.FromSeconds(29); sut.GetPathUsedBytes("/octopus").Should().Be(null); } - + [Test] public void DuCacheExpiresAfter30Seconds() { var spr = Substitute.For(); - spr.ReturnsForAll(1); + spr.ExecuteCommandAsync(Arg.Any(), Arg.Any(), Arg.Any(), Arg.Any>(), Arg.Any>()) + .Returns(Task.FromResult(1)); var baseTime = DateTimeOffset.UtcNow; var clock = new TestClock(baseTime); var memoryCache = new MemoryCache(new MemoryCacheOptions(){ Clock = clock}); var sut = new KubernetesDirectoryInformationProvider(Substitute.For(), spr, memoryCache); sut.GetPathUsedBytes("/octopus").Should().Be(null); - + const ulong usedSize = 500 * Megabyte; - spr.When(x => x.ExecuteCommand("du", "-s -B 1 /octopus", "/", Arg.Any>(), Arg.Any>())) - .Do(x => + spr.ExecuteCommandAsync("du", "-s -B 1 /octopus", "/", Arg.Any>(), Arg.Any>()) + .Returns(callInfo => { - x.ArgAt>(3).Invoke($"123\t/octopus"); - x.ArgAt>(3).Invoke($"{usedSize}\t/octopus"); + callInfo.ArgAt>(3).Invoke($"123\t/octopus"); + callInfo.ArgAt>(3).Invoke($"{usedSize}\t/octopus"); + return Task.FromResult(0); }); clock.UtcNow = baseTime + TimeSpan.FromSeconds(30); - + sut.GetPathUsedBytes("/octopus").Should().Be(usedSize); } diff --git a/source/Octopus.Tentacle.Tests/Util/LinuxTestUserPrincipal.cs b/source/Octopus.Tentacle.Tests/Util/LinuxTestUserPrincipal.cs index 83da2cb93..9fe121add 100644 --- a/source/Octopus.Tentacle.Tests/Util/LinuxTestUserPrincipal.cs +++ b/source/Octopus.Tentacle.Tests/Util/LinuxTestUserPrincipal.cs @@ -22,7 +22,7 @@ public 
LinuxTestUserPrincipal(string username) static void RunCommand(string arguments, bool failOnNonZeroExitCode = true) { var commandLineInvocation = new CommandLineInvocation("/bin/bash", arguments); - var result = commandLineInvocation.ExecuteCommand(); + var result = commandLineInvocation.ExecuteCommandAsync().GetAwaiter().GetResult(); foreach (var line in result.Errors) Console.WriteLine(line); From 72b2a9c6477dfea4e557afc40b7fa886bf0a033e Mon Sep 17 00:00:00 2001 From: Jim Pelletier Date: Mon, 25 May 2026 13:25:07 +1000 Subject: [PATCH 07/52] Name the sync-over-async pattern in sync-boundary comments Rewrites the six sync-boundary comments to name the canonical pattern ("sync-over-async"), link Stephen Cleary's "Don't Block on Async Code" reference, and keep the "we are here / we do this / safe because" structure. Removes em-dashes per style preference. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../PreReq/PowerShellPrerequisite.cs | 11 ++++------ .../KubernetesDirectoryInformationProvider.cs | 13 ++++++------ .../Startup/LinuxServiceConfigurator.cs | 21 ++++++++----------- .../Startup/WindowsServiceConfigurator.cs | 10 +++++---- .../Util/CommandLineRunner.cs | 10 +++++---- .../Octopus.Tentacle/Util/SystemCtlHelper.cs | 15 +++++++------ 6 files changed, 38 insertions(+), 42 deletions(-) diff --git a/source/Octopus.Manager.Tentacle/PreReq/PowerShellPrerequisite.cs b/source/Octopus.Manager.Tentacle/PreReq/PowerShellPrerequisite.cs index 908fa6bc2..8366ffc03 100644 --- a/source/Octopus.Manager.Tentacle/PreReq/PowerShellPrerequisite.cs +++ b/source/Octopus.Manager.Tentacle/PreReq/PowerShellPrerequisite.cs @@ -35,13 +35,10 @@ static bool CheckPowerShellIsInstalled(out string commandLineOutput) try { // We're in the WPF installer prerequisite check. IPrerequisite.Check() must return - // synchronously — there's no async version of the interface — so we block on the async - // call with .GetAwaiter().GetResult(). 
- // This is safe because we're on a plain thread-pool worker. The risk with blocking on - // async is a deadlock: if the async work needs to resume on the same thread that's - // blocked waiting for it, neither can make progress. Thread-pool workers don't have - // that constraint — when the async work finishes it can pick up on any free thread, - // not specifically this one, so the block resolves normally. + // synchronously, so we block on the async call with .GetAwaiter().GetResult(). + // This is sync-over-async but is safe because the installer dispatches us on a + // plain thread-pool worker. No captured SynchronizationContext, so no deadlock. + // See https://blog.stephencleary.com/2012/07/dont-block-on-async-code.html SilentProcessRunnerExtended.ExecuteCommandAsync( powerShellExe, arguments, diff --git a/source/Octopus.Tentacle/Kubernetes/KubernetesDirectoryInformationProvider.cs b/source/Octopus.Tentacle/Kubernetes/KubernetesDirectoryInformationProvider.cs index e12f69d8f..a0fcf7935 100644 --- a/source/Octopus.Tentacle/Kubernetes/KubernetesDirectoryInformationProvider.cs +++ b/source/Octopus.Tentacle/Kubernetes/KubernetesDirectoryInformationProvider.cs @@ -55,13 +55,12 @@ public KubernetesDirectoryInformationProvider(ISystemLog log, ISilentProcessRunn { var stdOut = new List(); var stdErr = new List(); - // Sync boundary: called from IMemoryCache.GetOrCreate factory which is synchronous. - // We block on the async ExecuteCommandAsync with .GetAwaiter().GetResult(). - // This is safe because we're on a plain thread-pool worker. The risk with blocking on - // async is a deadlock: if the async work needs to resume on the same thread that's - // blocked waiting for it, neither can make progress. Thread-pool workers don't have - // that constraint — when the async work finishes it can pick up on any free thread, - // not specifically this one, so the block resolves normally. 
+ // We're in the IMemoryCache.GetOrCreate factory that populates the disk-space cache entry. + // The cache factory delegate is synchronous (Func<ICacheEntry, TItem>), so we block on the + // async call with .GetAwaiter().GetResult(). + // This is sync-over-async but is safe because the cache factory runs on a plain + // thread-pool worker. No captured SynchronizationContext, so no deadlock. + // See https://blog.stephencleary.com/2012/07/dont-block-on-async-code.html var exitCode = silentProcessRunner.ExecuteCommandAsync("du", $"-s -B 1 {directoryPath}", "/", stdOut.Add, stdErr.Add) .GetAwaiter().GetResult(); diff --git a/source/Octopus.Tentacle/Startup/LinuxServiceConfigurator.cs b/source/Octopus.Tentacle/Startup/LinuxServiceConfigurator.cs index ecbfd394c..66ebc8dbd 100644 --- a/source/Octopus.Tentacle/Startup/LinuxServiceConfigurator.cs +++ b/source/Octopus.Tentacle/Startup/LinuxServiceConfigurator.cs @@ -194,10 +194,12 @@ void WriteUnitFile(string path, string contents) File.WriteAllText(path, contents); var commandLineInvocation = new CommandLineInvocation("/bin/bash", $"-c \"chmod 644 {path}\""); - // Sync boundary: WriteUnitFile is called from IServiceConfigurator.ConfigureService - // implementations, which are themselves called from the Tentacle service-management - // CLI on a threadpool worker with no sync context. GetAwaiter().GetResult() is - // deadlock-safe here. + // We're in WriteUnitFile, called from IServiceConfigurator.ConfigureService implementations + // which are sync (called from the Tentacle service-management CLI), so we block on the + // async call with .GetAwaiter().GetResult(). + // This is sync-over-async but is safe because the CLI dispatches us on a plain + // thread-pool worker. No captured SynchronizationContext, so no deadlock. 
+ // See https://blog.stephencleary.com/2012/07/dont-block-on-async-code.html var result = commandLineInvocation.ExecuteCommandAsync().GetAwaiter().GetResult(); if (result.ExitCode == 0) return; @@ -223,10 +225,8 @@ void CheckSystemPrerequisites() bool IsSystemdInstalled() { var commandLineInvocation = new CommandLineInvocation("/bin/bash", "-c \"command -v systemctl >/dev/null\""); - // Sync boundary: IsSystemdInstalled is called from CheckSystemPrerequisites, - // which is called from IServiceConfigurator.ConfigureService, which is itself - // called from the Tentacle service-management CLI on a threadpool worker with - // no sync context. GetAwaiter().GetResult() is deadlock-safe here. + // Same sync-over-async boundary as WriteUnitFile: CheckSystemPrerequisites is called + // from the sync ConfigureService path, on a plain thread-pool worker. var result = commandLineInvocation.ExecuteCommandAsync().GetAwaiter().GetResult(); return result.ExitCode == 0; } @@ -234,10 +234,7 @@ bool IsSystemdInstalled() bool HaveSudoPrivileges() { var commandLineInvocation = new CommandLineInvocation("/bin/bash", "-c \"sudo -vn 2> /dev/null\""); - // Sync boundary: HaveSudoPrivileges is called from CheckSystemPrerequisites, - // which is called from IServiceConfigurator.ConfigureService, which is itself - // called from the Tentacle service-management CLI on a threadpool worker with - // no sync context. GetAwaiter().GetResult() is deadlock-safe here. + // Same sync-over-async boundary as IsSystemdInstalled. 
var result = commandLineInvocation.ExecuteCommandAsync().GetAwaiter().GetResult(); return result.ExitCode == 0; } diff --git a/source/Octopus.Tentacle/Startup/WindowsServiceConfigurator.cs b/source/Octopus.Tentacle/Startup/WindowsServiceConfigurator.cs index b46f74d3a..9f9714bca 100644 --- a/source/Octopus.Tentacle/Startup/WindowsServiceConfigurator.cs +++ b/source/Octopus.Tentacle/Startup/WindowsServiceConfigurator.cs @@ -342,10 +342,12 @@ void Sc(string arguments) var sc = Path.Combine(system32, "sc.exe"); logFileOnlyLogger.Info($"Executing sc.exe {argumentsToLog}"); - // Sync boundary: Sc() is called from IServiceConfigurator.ConfigureService - // implementations, which are themselves called from the Tentacle - // service-management CLI on Windows (no sync context, threadpool worker). - // GetAwaiter().GetResult() is deadlock-safe here. + // We're in Sc() running sc.exe, called from IServiceConfigurator.ConfigureService + // implementations which are sync (called from the Tentacle service-management CLI on + // Windows), so we block on the async call with .GetAwaiter().GetResult(). + // This is sync-over-async but is safe because the CLI dispatches us on a plain + // thread-pool worker. No captured SynchronizationContext, so no deadlock. 
+ // See https://blog.stephencleary.com/2012/07/dont-block-on-async-code.html var exitCode = SilentProcessRunnerExtended.ExecuteCommandAsync(sc, arguments, Environment.CurrentDirectory, diff --git a/source/Octopus.Tentacle/Util/CommandLineRunner.cs b/source/Octopus.Tentacle/Util/CommandLineRunner.cs index 55c0d3641..faf00b1d8 100644 --- a/source/Octopus.Tentacle/Util/CommandLineRunner.cs +++ b/source/Octopus.Tentacle/Util/CommandLineRunner.cs @@ -44,10 +44,12 @@ public bool Execute(CommandLineInvocation invocation, { try { - // Sync boundary: ICommandLineRunner is a public interface consumed by - // Octopus.Manager.Tentacle (a WPF app) which calls Execute from a - // ThreadPool.QueueUserWorkItem — no synchronisation context, so - // GetAwaiter().GetResult() here is deadlock-safe. + // We're in CommandLineRunner.Execute, consumed by Octopus.Manager.Tentacle (WPF). + // The WPF installer calls Execute from ThreadPool.QueueUserWorkItem (a sync + // delegate), so we block on the async call with .GetAwaiter().GetResult(). + // This is sync-over-async but is safe because the installer dispatches us on a + // plain thread-pool worker. No captured SynchronizationContext, so no deadlock. + // See https://blog.stephencleary.com/2012/07/dont-block-on-async-code.html var exitCode = SilentProcessRunner.ExecuteCommandAsync( invocation.Executable, (invocation.Arguments ?? "") + " " + (invocation.SystemArguments ?? 
""), diff --git a/source/Octopus.Tentacle/Util/SystemCtlHelper.cs b/source/Octopus.Tentacle/Util/SystemCtlHelper.cs index 8da121fa5..ba4edf993 100644 --- a/source/Octopus.Tentacle/Util/SystemCtlHelper.cs +++ b/source/Octopus.Tentacle/Util/SystemCtlHelper.cs @@ -32,10 +32,12 @@ bool RunServiceCommand(string command, string serviceName, bool logFailureAsErro { // Try without sudo first var commandLineInvocation = new CommandLineInvocation("/bin/bash", $"-c \"systemctl {command} {serviceName}\""); - // Sync boundary: RunServiceCommand is called from synchronous service-management - // helpers (StartService, RestartService, etc.), which are themselves called from - // the Tentacle service-management CLI on a threadpool worker with no sync context. - // GetAwaiter().GetResult() is deadlock-safe here. + // We're in SystemCtlHelper running a systemctl command. All callers (StartService, + // RestartService, etc.) are sync, called from the Tentacle service-management CLI + // which has no async path, so we block on the async call with .GetAwaiter().GetResult(). + // This is sync-over-async but is safe because the CLI dispatches us on a plain + // thread-pool worker. No captured SynchronizationContext, so no deadlock. + // See https://blog.stephencleary.com/2012/07/dont-block-on-async-code.html var result = commandLineInvocation.ExecuteCommandAsync().GetAwaiter().GetResult(); if (result.ExitCode == 0) return true; @@ -52,10 +54,7 @@ bool RunServiceCommand(string command, string serviceName, bool logFailureAsErro { log.Info($"Permission denied. Retrying 'systemctl {command} {serviceName}' with sudo..."); var sudoInvocation = new CommandLineInvocation("/bin/bash", $"-c \"sudo systemctl {command} {serviceName}\""); - // Sync boundary: RunServiceCommand is called from synchronous service-management - // helpers (StartService, RestartService, etc.), which are themselves called from - // the Tentacle service-management CLI on a threadpool worker with no sync context. 
- // GetAwaiter().GetResult() is deadlock-safe here. + // Same sync-over-async boundary as above: sudo retry on the same thread-pool worker. result = sudoInvocation.ExecuteCommandAsync().GetAwaiter().GetResult(); if (result.ExitCode == 0) return true; From 26bfaf2607728861a5d13361e97b7ed793ebb68b Mon Sep 17 00:00:00 2001 From: Jim Pelletier Date: Mon, 25 May 2026 13:58:39 +1000 Subject: [PATCH 08/52] Address PR review on #1236: signature-only SilentProcessRunner change, lighthouse comments, async tests - Strip SilentProcessRunner.cs to signature-only async change (move polyfill, EnableRaisingEvents, and await WaitForExitAsync to #1226 abandon PR) - Apply lighthouse comment pattern uniformly to all sync-over-async sites including test scaffolding and the second/third sites in LinuxServiceConfigurator - Improve LinuxServiceConfigurator justification: name the sync interface chain (IServiceConfigurator -> AbstractCommand -> ICommand -> Topshelf) and the no-SynchronizationContext property of Main/Topshelf worker threads - Move SilentProcessRunnerFixture sync-over-async comment to actual call site - Convert LinuxConfigureServiceHelperFixture test methods to async Task, eliminating sync-over-async entirely in that file Co-Authored-By: Claude Opus 4.7 (1M context) --- .../Util/CommandLine/SilentProcessRunner.cs | 35 +---------- .../Setup/KubernetesAgentInstaller.cs | 7 ++- .../Setup/KubernetesClusterInstaller.cs | 7 ++- .../LinuxConfigureServiceHelperFixture.cs | 58 +++++++++---------- .../TentacleFetchers/LinuxTentacleFetcher.cs | 10 +++- .../Util/LinuxTestUserPrincipal.cs | 7 ++- .../Util/SilentProcessRunnerFixture.cs | 9 ++- .../Startup/LinuxServiceConfigurator.cs | 52 ++++++++++++++--- 8 files changed, 104 insertions(+), 81 deletions(-) diff --git a/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs b/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs index 97b8a585f..0e57221c6 100644 --- 
a/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs +++ b/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs @@ -27,7 +27,7 @@ public static Task ExecuteCommandAsync( return ExecuteCommandAsync(executable, arguments, workingDirectory, debug, info, error, customEnvironmentVariables: null, cancel: cancel); } - public static async Task ExecuteCommandAsync( + public static Task ExecuteCommandAsync( string executable, string arguments, string workingDirectory, @@ -126,7 +126,6 @@ void WriteData(Action action, ManualResetEventSlim resetEvent, DataRecei WriteData(error, errorResetEvent, e); }; - process.EnableRaisingEvents = true; process.Start(); var running = true; @@ -142,11 +141,7 @@ void WriteData(Action action, ManualResetEventSlim resetEvent, DataRecei process.BeginOutputReadLine(); process.BeginErrorReadLine(); -#if NETFRAMEWORK - await WaitForExitAsyncNetFramework(process, cancel).ConfigureAwait(false); -#else - await process.WaitForExitAsync(cancel).ConfigureAwait(false); -#endif + process.WaitForExit(); SafelyCancelRead(process.CancelErrorRead, debug); SafelyCancelRead(process.CancelOutputRead, debug); @@ -158,7 +153,7 @@ void WriteData(Action action, ManualResetEventSlim resetEvent, DataRecei debug($"Process {exeFileNameOrFullPath} in {workingDirectory} exited with code {exitCode}"); running = false; - return exitCode; + return Task.FromResult(exitCode); } } } @@ -209,30 +204,6 @@ static void SafelyCancelRead(Action action, Action debug) } } -#if NETFRAMEWORK - static Task WaitForExitAsyncNetFramework(Process process, CancellationToken cancellationToken) - { - var tcs = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); - CancellationTokenRegistration registration = default; - void OnExited(object? 
sender, EventArgs e) - { - registration.Dispose(); - tcs.TrySetResult(null); - } - process.Exited += OnExited; - if (process.HasExited) tcs.TrySetResult(null); - if (cancellationToken.CanBeCanceled) - { - registration = cancellationToken.Register(() => - { - process.Exited -= OnExited; - tcs.TrySetCanceled(cancellationToken); - }); - } - return tcs.Task; - } -#endif - static void DoOurBestToCleanUp(Process process, Action error) { try diff --git a/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/KubernetesAgentInstaller.cs b/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/KubernetesAgentInstaller.cs index 1083b54d8..cdb4ecab3 100644 --- a/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/KubernetesAgentInstaller.cs +++ b/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/KubernetesAgentInstaller.cs @@ -219,8 +219,11 @@ public void Dispose() NamespaceFlag, AgentName); - // Dispose() cannot be made async; .GetAwaiter().GetResult() is safe here - // because this runs in test teardown (not inside an async context with a sync-blocking SynchronizationContext). + // We're in IDisposable.Dispose(). Dispose() must return synchronously, so we + // block on the async call with .GetAwaiter().GetResult(). This is sync-over-async + // but is safe because the NUnit test runner dispatches us on a worker thread + // without a captured SynchronizationContext, so no deadlock. 
+ // See https://blog.stephencleary.com/2012/07/dont-block-on-async-code.html var exitCode = SilentProcessRunner.ExecuteCommandAsync( helmExePath, uninstallArgs, diff --git a/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/KubernetesClusterInstaller.cs b/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/KubernetesClusterInstaller.cs index ce8812ffd..94008d36e 100644 --- a/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/KubernetesClusterInstaller.cs +++ b/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/KubernetesClusterInstaller.cs @@ -174,8 +174,11 @@ string BuildNfsCsiDriverInstallArguments() public void Dispose() { - // Dispose() cannot be made async; .GetAwaiter().GetResult() is safe here - // because this runs in test teardown (not inside an async context with a sync-blocking SynchronizationContext). + // We're in IDisposable.Dispose(). Dispose() must return synchronously, so we + // block on the async call with .GetAwaiter().GetResult(). This is sync-over-async + // but is safe because the NUnit test runner dispatches us on a worker thread + // without a captured SynchronizationContext, so no deadlock. 
+ // See https://blog.stephencleary.com/2012/07/dont-block-on-async-code.html var exitCode = SilentProcessRunner.ExecuteCommandAsync( kindExePath, //delete the cluster for this test run diff --git a/source/Octopus.Tentacle.Tests.Integration/Startup/LinuxConfigureServiceHelperFixture.cs b/source/Octopus.Tentacle.Tests.Integration/Startup/LinuxConfigureServiceHelperFixture.cs index 1010fc9cc..7750865fb 100644 --- a/source/Octopus.Tentacle.Tests.Integration/Startup/LinuxConfigureServiceHelperFixture.cs +++ b/source/Octopus.Tentacle.Tests.Integration/Startup/LinuxConfigureServiceHelperFixture.cs @@ -4,6 +4,7 @@ using System.Linq; using System.Reflection; using System.Text; +using System.Threading.Tasks; using FluentAssertions; using NUnit.Framework; using Octopus.Tentacle.CommonTestUtils.Diagnostics; @@ -21,22 +22,22 @@ public class LinuxConfigureServiceHelperFixture { [Test] [RequiresSudoOnLinux] - public void CanInstallServiceAsRoot() + public async Task CanInstallServiceAsRoot() { - CanInstallService(null, null); + await CanInstallService(null, null); } [Test] [RequiresSudoOnLinux] - public void CanInstallServiceAsUser() + public async Task CanInstallServiceAsUser() { var user = new LinuxTestUserPrincipal("octo-shared-svc-test"); - CanInstallService(user.UserName, user.Password); + await CanInstallService(user.UserName, user.Password); } [Test] [RequiresSudoOnLinux] - public void CannotWriteToServiceFileAsUser() + public async Task CannotWriteToServiceFileAsUser() { const string serviceName = "OctopusShared.ServiceHelperTest"; const string instance = "TestCannotWriteToServiceFileInstance"; @@ -47,8 +48,7 @@ public void CannotWriteToServiceFileAsUser() WriteUnixFile(scriptPath); var chmodCmd = new CommandLineInvocation("/bin/bash", $"-c \"chmod 777 {scriptPath}\""); - // Safe: sync test helper, no synchronisation context. 
- chmodCmd.ExecuteCommandAsync().GetAwaiter().GetResult(); + await chmodCmd.ExecuteCommandAsync(); var configureServiceHelper = new LinuxServiceConfigurator(log); @@ -67,12 +67,11 @@ public void CannotWriteToServiceFileAsUser() serviceConfigurationState); var statCmd = new CommandLineInvocation("/bin/bash", $"-c \"stat -c '%A' /etc/systemd/system/{instance}.service\""); - // Safe: sync test helper, no synchronisation context. - var result = statCmd.ExecuteCommandAsync().GetAwaiter().GetResult(); + var result = await statCmd.ExecuteCommandAsync(); result.Infos.Single().Should().Be("-rw-r--r--"); // Service file should only be writeable for the root user } - void CanInstallService(string username, string password) + async Task CanInstallService(string username, string password) { const string serviceName = "OctopusShared.ServiceHelperTest"; const string instance = "TestInstance"; @@ -83,8 +82,7 @@ void CanInstallService(string username, string password) WriteUnixFile(scriptPath); var commandLineInvocation = new CommandLineInvocation("/bin/bash", $"-c \"chmod 777 {scriptPath}\""); - // Safe: sync test helper, no synchronisation context. - commandLineInvocation.ExecuteCommandAsync().GetAwaiter().GetResult(); + await commandLineInvocation.ExecuteCommandAsync(); var configureServiceHelper = new LinuxServiceConfigurator(log); @@ -103,19 +101,19 @@ void CanInstallService(string username, string password) serviceConfigurationState); //Check that the systemd unit service file has been written - Assert.IsTrue(DoesServiceUnitFileExist(instance), "The service unit file has not been created"); + Assert.IsTrue(await DoesServiceUnitFileExist(instance), "The service unit file has not been created"); - var status = GetServiceStatus(instance); + var status = await GetServiceStatus(instance); status["ActiveState"].Should().Be("active"); status["SubState"].Should().Be("running"); status["LoadState"].Should().Be("loaded"); status["User"].Should().Be(username ?? 
"root"); //Check that the Service is running - Assert.IsTrue(IsServiceRunning(instance), "The service is not running"); + Assert.IsTrue(await IsServiceRunning(instance), "The service is not running"); //Check that the service is enabled to run on startup - Assert.IsTrue(IsServiceEnabled(instance), "The service has not been enabled to run on startup"); + Assert.IsTrue(await IsServiceEnabled(instance), "The service has not been enabled to run on startup"); var stopServiceConfigurationState = new ServiceConfigurationState { @@ -130,13 +128,13 @@ void CanInstallService(string username, string password) stopServiceConfigurationState); //Check that the Service has stopped - Assert.IsFalse(IsServiceRunning(instance), "The service has not been stopped"); + Assert.IsFalse(await IsServiceRunning(instance), "The service has not been stopped"); //Check that the service is disabled - Assert.IsFalse(IsServiceEnabled(instance), "The service has not been disabled"); + Assert.IsFalse(await IsServiceEnabled(instance), "The service has not been disabled"); //Check that the service is disabled - Assert.IsFalse(DoesServiceUnitFileExist(instance), "The service unit file still exists"); + Assert.IsFalse(await DoesServiceUnitFileExist(instance), "The service unit file still exists"); } void WriteUnixFile(string path) @@ -151,11 +149,10 @@ void WriteUnixFile(string path) } } - Dictionary GetServiceStatus(string serviceName) + async Task> GetServiceStatus(string serviceName) { var commandLineInvocation = new CommandLineInvocation("/bin/bash", $"-c \"systemctl show {serviceName}\""); - // Safe: sync test helper, no synchronisation context. 
- var result = commandLineInvocation.ExecuteCommandAsync().GetAwaiter().GetResult(); + var result = await commandLineInvocation.ExecuteCommandAsync(); Console.WriteLine($"Status of service {serviceName}"); foreach (var info in result.Infos) Console.WriteLine(info); @@ -164,29 +161,28 @@ Dictionary GetServiceStatus(string serviceName) .ToDictionary(x => x[0], x => x[1]); } - bool IsServiceRunning(string serviceName) + async Task IsServiceRunning(string serviceName) { - var result = RunBashCommand($"systemctl is-active --quiet {serviceName}"); + var result = await RunBashCommand($"systemctl is-active --quiet {serviceName}"); return result.ExitCode == 0; } - bool IsServiceEnabled(string serviceName) + async Task IsServiceEnabled(string serviceName) { - var result = RunBashCommand($"systemctl is-enabled --quiet {serviceName}"); + var result = await RunBashCommand($"systemctl is-enabled --quiet {serviceName}"); return result.ExitCode == 0; } - bool DoesServiceUnitFileExist(string serviceName) + async Task DoesServiceUnitFileExist(string serviceName) { - var result = RunBashCommand($"ls /etc/systemd/system | grep {serviceName}.service"); + var result = await RunBashCommand($"ls /etc/systemd/system | grep {serviceName}.service"); return result.ExitCode == 0; } - CmdResult RunBashCommand(string command) + async Task RunBashCommand(string command) { var commandLineInvocation = new CommandLineInvocation("/bin/bash", $"-c \"{command}\""); - // Safe: sync test helper, no synchronisation context. 
- return commandLineInvocation.ExecuteCommandAsync().GetAwaiter().GetResult(); + return await commandLineInvocation.ExecuteCommandAsync(); } } } diff --git a/source/Octopus.Tentacle.Tests.Integration/Support/TentacleFetchers/LinuxTentacleFetcher.cs b/source/Octopus.Tentacle.Tests.Integration/Support/TentacleFetchers/LinuxTentacleFetcher.cs index 5e50f6cc6..7bf97744a 100644 --- a/source/Octopus.Tentacle.Tests.Integration/Support/TentacleFetchers/LinuxTentacleFetcher.cs +++ b/source/Octopus.Tentacle.Tests.Integration/Support/TentacleFetchers/LinuxTentacleFetcher.cs @@ -62,6 +62,12 @@ public static void ExtractTarGzip(string gzArchiveName, string destFolder, ILogg using var tmp = new TemporaryDirectory(); Action log = s => logger.Information(s); + // We're in a synchronous public static helper (ExtractTarGzip). The method + // must return synchronously, so we block on the async call with + // .GetAwaiter().GetResult(). This is sync-over-async but is safe because + // the NUnit test runner dispatches us on a worker thread without a captured + // SynchronizationContext, so no deadlock. + // See https://blog.stephencleary.com/2012/07/dont-block-on-async-code.html var exitCode = SilentProcessRunner.ExecuteCommandAsync( "tar", $"xzvf \"{gzArchiveName}\" -C \"{destFolder}\"", @@ -69,9 +75,7 @@ public static void ExtractTarGzip(string gzArchiveName, string destFolder, ILogg log, log, log, - cancel: CancellationToken.None) - // Safe: static void helper, no synchronisation context. 
- .GetAwaiter().GetResult(); + cancel: CancellationToken.None).GetAwaiter().GetResult(); if (exitCode != 0) { diff --git a/source/Octopus.Tentacle.Tests.Integration/Util/LinuxTestUserPrincipal.cs b/source/Octopus.Tentacle.Tests.Integration/Util/LinuxTestUserPrincipal.cs index 74f295f1a..241457316 100644 --- a/source/Octopus.Tentacle.Tests.Integration/Util/LinuxTestUserPrincipal.cs +++ b/source/Octopus.Tentacle.Tests.Integration/Util/LinuxTestUserPrincipal.cs @@ -22,7 +22,12 @@ public LinuxTestUserPrincipal(string username) static void RunCommand(string arguments, bool failOnNonZeroExitCode = true) { var commandLineInvocation = new CommandLineInvocation("/bin/bash", arguments); - // Safe: constructor-time helper, no synchronisation context. + // We're in a synchronous test helper called from the LinuxTestUserPrincipal + // constructor. Constructors must return synchronously, so we block on the + // async call with .GetAwaiter().GetResult(). This is sync-over-async but is + // safe because the NUnit test runner dispatches us on a worker thread without + // a captured SynchronizationContext, so no deadlock. + // See https://blog.stephencleary.com/2012/07/dont-block-on-async-code.html var result = commandLineInvocation.ExecuteCommandAsync().GetAwaiter().GetResult(); foreach (var line in result.Errors) diff --git a/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs b/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs index c91298d24..ada4a5095 100644 --- a/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs +++ b/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs @@ -406,8 +406,6 @@ static string EchoEnvironmentVariable(string varName) return $"${varName}"; } - // Sync-over-async is safe here: NUnit runs tests on a plain ThreadPool thread with no - // synchronisation context, so there is no risk of deadlock. 
static int Execute( string command, string arguments, @@ -421,6 +419,13 @@ static int Execute( var info = new StringBuilder(); var error = new StringBuilder(); + // We're in a synchronous test helper (Execute) that exposes a sync int + // return and out parameters. The method must return synchronously, so we + // block on the async call with .GetAwaiter().GetResult(). This is + // sync-over-async but is safe because the NUnit test runner dispatches us + // on a worker thread without a captured SynchronizationContext, so no + // deadlock. + // See https://blog.stephencleary.com/2012/07/dont-block-on-async-code.html var exitCode = SilentProcessRunner.ExecuteCommandAsync( command, arguments, diff --git a/source/Octopus.Tentacle/Startup/LinuxServiceConfigurator.cs b/source/Octopus.Tentacle/Startup/LinuxServiceConfigurator.cs index 66ebc8dbd..409d90b49 100644 --- a/source/Octopus.Tentacle/Startup/LinuxServiceConfigurator.cs +++ b/source/Octopus.Tentacle/Startup/LinuxServiceConfigurator.cs @@ -194,11 +194,20 @@ void WriteUnitFile(string path, string contents) File.WriteAllText(path, contents); var commandLineInvocation = new CommandLineInvocation("/bin/bash", $"-c \"chmod 644 {path}\""); - // We're in WriteUnitFile, called from IServiceConfigurator.ConfigureService implementations - // which are sync (called from the Tentacle service-management CLI), so we block on the - // async call with .GetAwaiter().GetResult(). - // This is sync-over-async but is safe because the CLI dispatches us on a plain - // thread-pool worker. No captured SynchronizationContext, so no deadlock. + // We're in LinuxServiceConfigurator, which implements IServiceConfigurator. + // IServiceConfigurator is consumed by ServiceCommand (an AbstractCommand). + // AbstractCommand.Start() is sync because it implements ICommand.Start(), + // which is sync because Topshelf's runtime callback API is sync. 
So + // ConfigureService must return synchronously, and we block on the async + // call with .GetAwaiter().GetResult(). This is sync-over-async but is safe + // because no SynchronizationContext is ever captured on this call stack. + // When the user runs `tentacle service ...` from a shell, we enter through + // the process entry point (Main), and the main thread of a .NET console app + // has no SynchronizationContext installed by default. When running as an + // installed systemd service, Topshelf's OnStart callback runs on a + // dedicated worker (a `new Thread(...)`) that also has no + // SynchronizationContext. Either way, no captured context means no + // deadlock. // See https://blog.stephencleary.com/2012/07/dont-block-on-async-code.html var result = commandLineInvocation.ExecuteCommandAsync().GetAwaiter().GetResult(); @@ -225,8 +234,21 @@ void CheckSystemPrerequisites() bool IsSystemdInstalled() { var commandLineInvocation = new CommandLineInvocation("/bin/bash", "-c \"command -v systemctl >/dev/null\""); - // Same sync-over-async boundary as WriteUnitFile: CheckSystemPrerequisites is called - // from the sync ConfigureService path, on a plain thread-pool worker. + // We're in LinuxServiceConfigurator, which implements IServiceConfigurator. + // IServiceConfigurator is consumed by ServiceCommand (an AbstractCommand). + // AbstractCommand.Start() is sync because it implements ICommand.Start(), + // which is sync because Topshelf's runtime callback API is sync. So + // ConfigureService must return synchronously, and we block on the async + // call with .GetAwaiter().GetResult(). This is sync-over-async but is safe + // because no SynchronizationContext is ever captured on this call stack. + // When the user runs `tentacle service ...` from a shell, we enter through + // the process entry point (Main), and the main thread of a .NET console app + // has no SynchronizationContext installed by default. 
When running as an + // installed systemd service, Topshelf's OnStart callback runs on a + // dedicated worker (a `new Thread(...)`) that also has no + // SynchronizationContext. Either way, no captured context means no + // deadlock. + // See https://blog.stephencleary.com/2012/07/dont-block-on-async-code.html var result = commandLineInvocation.ExecuteCommandAsync().GetAwaiter().GetResult(); return result.ExitCode == 0; } @@ -234,7 +256,21 @@ bool IsSystemdInstalled() bool HaveSudoPrivileges() { var commandLineInvocation = new CommandLineInvocation("/bin/bash", "-c \"sudo -vn 2> /dev/null\""); - // Same sync-over-async boundary as IsSystemdInstalled. + // We're in LinuxServiceConfigurator, which implements IServiceConfigurator. + // IServiceConfigurator is consumed by ServiceCommand (an AbstractCommand). + // AbstractCommand.Start() is sync because it implements ICommand.Start(), + // which is sync because Topshelf's runtime callback API is sync. So + // ConfigureService must return synchronously, and we block on the async + // call with .GetAwaiter().GetResult(). This is sync-over-async but is safe + // because no SynchronizationContext is ever captured on this call stack. + // When the user runs `tentacle service ...` from a shell, we enter through + // the process entry point (Main), and the main thread of a .NET console app + // has no SynchronizationContext installed by default. When running as an + // installed systemd service, Topshelf's OnStart callback runs on a + // dedicated worker (a `new Thread(...)`) that also has no + // SynchronizationContext. Either way, no captured context means no + // deadlock. 
+ // See https://blog.stephencleary.com/2012/07/dont-block-on-async-code.html var result = commandLineInvocation.ExecuteCommandAsync().GetAwaiter().GetResult(); return result.ExitCode == 0; } From e4c291188c3c8af6748b64c26108c085684b4aa1 Mon Sep 17 00:00:00 2001 From: Jim Pelletier Date: Tue, 26 May 2026 12:13:45 +1000 Subject: [PATCH 09/52] =?UTF-8?q?Apply=20Pattern=201=20sync-over-async=20w?= =?UTF-8?q?rap-private-async=20to=20sync=E2=86=94async=20boundaries?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Consolidates 9 sync-over-async sites into 5 single-method bridges, each of which is a tiny sync wrapper over a private async implementation. All internal helpers become fully async, removing the in-method GetAwaiter calls in SystemCtlHelper and LinuxServiceConfigurator entirely. Bridges: - PowerShellPrerequisite.Check - KubernetesDirectoryInformationProvider.GetPathUsedBytes - LinuxServiceConfigurator.ConfigureService (replaces 5 prior bridges) - WindowsServiceConfigurator.ConfigureService - CommandLineRunner.Execute (also exposes ExecuteAsync for async callers) Co-Authored-By: Claude Opus 4.7 (1M context) --- .../PreReq/PowerShellPrerequisite.cs | 29 ++-- .../ReviewAndRunScriptTabViewModel.cs | 2 +- .../KubernetesDirectoryInformationProvider.cs | 35 ++--- .../Startup/LinuxServiceConfigurator.cs | 133 +++++++----------- .../Startup/WindowsServiceConfigurator.cs | 62 ++++---- .../Util/CommandLineRunner.cs | 69 ++++++--- .../Util/ICommandLineRunner.cs | 21 ++- .../Octopus.Tentacle/Util/SystemCtlHelper.cs | 36 ++--- 8 files changed, 210 insertions(+), 177 deletions(-) diff --git a/source/Octopus.Manager.Tentacle/PreReq/PowerShellPrerequisite.cs b/source/Octopus.Manager.Tentacle/PreReq/PowerShellPrerequisite.cs index 8366ffc03..fcbe7cc17 100644 --- a/source/Octopus.Manager.Tentacle/PreReq/PowerShellPrerequisite.cs +++ b/source/Octopus.Manager.Tentacle/PreReq/PowerShellPrerequisite.cs @@ -1,6 +1,7 @@ using System; using 
System.IO; using System.Threading; +using System.Threading.Tasks; using Octopus.Tentacle.Diagnostics; using Octopus.Tentacle.Util; @@ -10,9 +11,18 @@ public class PowerShellPrerequisite : IPrerequisite { public string StatusMessage => "Checking that Windows PowerShell 2.0 is installed..."; + // We're at the WPF installer prerequisite boundary. IPrerequisite.Check() must return + // synchronously, so this is the sync-over-async bridge: a one-line wrapper over the + // private async implementation. Safe because the installer dispatches us on a plain + // thread-pool worker. No captured SynchronizationContext, so no deadlock. + // See https://blog.stephencleary.com/2012/07/dont-block-on-async-code.html public PrerequisiteCheckResult Check() + => CheckAsync().GetAwaiter().GetResult(); + + async Task CheckAsync() { - return CheckPowerShellIsInstalled(out var commandLineOutput) + var (installed, commandLineOutput) = await CheckPowerShellIsInstalledAsync(); + return installed ? PrerequisiteCheckResult.Successful() : PrerequisiteCheckResult.Failed("Windows PowerShell 2.0 or above does not appear to be installed and on the System Path on this machine. 
Please install Windows PowerShell or add it to the System Path then re-run this setup tool.", helpLink: "http://g.octopushq.com/HowDoIInstallPowerShell", @@ -20,44 +30,39 @@ public PrerequisiteCheckResult Check() commandLineOutput: commandLineOutput); } - static bool CheckPowerShellIsInstalled(out string commandLineOutput) + static async Task<(bool installed, string commandLineOutput)> CheckPowerShellIsInstalledAsync() { var stdOut = new StringWriter(); var stdErr = new StringWriter(); const string powerShellExe = "powershell.exe"; const string arguments = "-NonInteractive -NoProfile -Command \"Write-Output $PSVersionTable.PSVersion\""; - commandLineOutput = $"{powerShellExe} {arguments}"; + var commandLineOutput = $"{powerShellExe} {arguments}"; // Despite our old check conforming to Microsoft's recommendations // for PS version checking around the 1.0/2.0 era, and extending // to detect 3.0, it failed to detect 4. Going the direct route: try { - // We're in the WPF installer prerequisite check. IPrerequisite.Check() must return - // synchronously, so we block on the async call with .GetAwaiter().GetResult(). - // This is sync-over-async but is safe because the installer dispatches us on a - // plain thread-pool worker. No captured SynchronizationContext, so no deadlock. 
- // See https://blog.stephencleary.com/2012/07/dont-block-on-async-code.html - SilentProcessRunnerExtended.ExecuteCommandAsync( + await SilentProcessRunnerExtended.ExecuteCommandAsync( powerShellExe, arguments, ".", stdOut.WriteLine, s => stdErr.WriteLine($"ERR: {s}"), - cancel: CancellationToken.None).GetAwaiter().GetResult(); + cancel: CancellationToken.None); var outputText = stdOut.ToString(); new SystemLog().Verbose("PowerShell prerequisite check output: " + outputText); commandLineOutput = $"{commandLineOutput}{Environment.NewLine}{stdOut}{Environment.NewLine}{stdErr}"; - return outputText.Contains("Major"); + return (outputText.Contains("Major"), commandLineOutput); } catch (Exception ex) { commandLineOutput = $"{commandLineOutput}{Environment.NewLine}{ex}"; - return false; + return (false, commandLineOutput); } } } diff --git a/source/Octopus.Manager.Tentacle/TentacleConfiguration/SetupWizard/ReviewAndRunScriptTabViewModel.cs b/source/Octopus.Manager.Tentacle/TentacleConfiguration/SetupWizard/ReviewAndRunScriptTabViewModel.cs index 2b8eef4c9..1eb1be871 100644 --- a/source/Octopus.Manager.Tentacle/TentacleConfiguration/SetupWizard/ReviewAndRunScriptTabViewModel.cs +++ b/source/Octopus.Manager.Tentacle/TentacleConfiguration/SetupWizard/ReviewAndRunScriptTabViewModel.cs @@ -46,7 +46,7 @@ public async Task GenerateAndExecuteScript() { var script = GenerateScript(); ContributeSensitiveValues(logger); - success = commandLineRunner.Execute(script, logger); + success = await commandLineRunner.ExecuteAsync(script, logger); } catch (Exception ex) { diff --git a/source/Octopus.Tentacle/Kubernetes/KubernetesDirectoryInformationProvider.cs b/source/Octopus.Tentacle/Kubernetes/KubernetesDirectoryInformationProvider.cs index a0fcf7935..4a373e0f1 100644 --- a/source/Octopus.Tentacle/Kubernetes/KubernetesDirectoryInformationProvider.cs +++ b/source/Octopus.Tentacle/Kubernetes/KubernetesDirectoryInformationProvider.cs @@ -1,6 +1,7 @@ using System; using 
System.Collections.Generic; using System.Linq; +using System.Threading.Tasks; using Octopus.Client.Extensions; using Microsoft.Extensions.Caching.Memory; using Octopus.Tentacle.Core.Diagnostics; @@ -13,13 +14,13 @@ public interface IKubernetesDirectoryInformationProvider public ulong? GetPathUsedBytes(string directoryPath); public ulong? GetPathTotalBytes(); } - + public class KubernetesDirectoryInformationProvider : IKubernetesDirectoryInformationProvider { readonly ISystemLog log; readonly ISilentProcessRunner silentProcessRunner; readonly IMemoryCache directoryInformationCache; - + //30s gives us fairly up to date information, but doesn't impact performance too much. //For 50 concurrent Cloud deployments: //No cache: 30min ea. @@ -36,12 +37,21 @@ public KubernetesDirectoryInformationProvider(ISystemLog log, ISilentProcessRunn this.directoryInformationCache = directoryInformationCache; } + // We're at the IKubernetesDirectoryInformationProvider boundary. Callers (capacity + // reporting) are sync, so this is the sync-over-async bridge: a one-line wrapper over + // the private async implementation. Safe because the consumer is a background sweeper + // running on a plain thread-pool worker. No captured SynchronizationContext, so no + // deadlock. + // See https://blog.stephencleary.com/2012/07/dont-block-on-async-code.html public ulong? 
GetPathUsedBytes(string directoryPath) + => GetPathUsedBytesAsync(directoryPath).GetAwaiter().GetResult(); + + async Task GetPathUsedBytesAsync(string directoryPath) { - return directoryInformationCache.GetOrCreate(directoryPath, e => + return await directoryInformationCache.GetOrCreateAsync(directoryPath, async e => { e.SetAbsoluteExpiration(CacheExpiry); - return GetDriveBytesUsingDu(directoryPath); + return await GetDriveBytesUsingDuAsync(directoryPath); }); } @@ -49,20 +59,13 @@ public KubernetesDirectoryInformationProvider(ISystemLog log, ISilentProcessRunn { return KubernetesUtilities.GetResourceBytes(KubernetesConfig.PersistentVolumeSize); } - - - ulong? GetDriveBytesUsingDu(string directoryPath) + + + async Task GetDriveBytesUsingDuAsync(string directoryPath) { var stdOut = new List(); var stdErr = new List(); - // We're in the IMemoryCache.GetOrCreate factory that populates the disk-space cache entry. - // The cache factory delegate is synchronous (Func), so we block on the - // async call with .GetAwaiter().GetResult(). - // This is sync-over-async but is safe because the cache factory runs on a plain - // thread-pool worker. No captured SynchronizationContext, so no deadlock. 
- // See https://blog.stephencleary.com/2012/07/dont-block-on-async-code.html - var exitCode = silentProcessRunner.ExecuteCommandAsync("du", $"-s -B 1 {directoryPath}", "/", stdOut.Add, stdErr.Add) - .GetAwaiter().GetResult(); + var exitCode = await silentProcessRunner.ExecuteCommandAsync("du", $"-s -B 1 {directoryPath}", "/", stdOut.Add, stdErr.Add); if (exitCode != 0) { @@ -78,4 +81,4 @@ public KubernetesDirectoryInformationProvider(ISystemLog log, ISilentProcessRunn return null; } } -} \ No newline at end of file +} diff --git a/source/Octopus.Tentacle/Startup/LinuxServiceConfigurator.cs b/source/Octopus.Tentacle/Startup/LinuxServiceConfigurator.cs index 409d90b49..6f3c099a7 100644 --- a/source/Octopus.Tentacle/Startup/LinuxServiceConfigurator.cs +++ b/source/Octopus.Tentacle/Startup/LinuxServiceConfigurator.cs @@ -1,8 +1,9 @@ -using System; +using System; using System.Collections.Generic; using System.IO; using System.Text; using System.Text.RegularExpressions; +using System.Threading.Tasks; using Octopus.Tentacle.Core.Diagnostics; using Octopus.Tentacle.Util; @@ -47,22 +48,35 @@ public void ConfigureServiceByConfigPath(string thisServiceName, serviceConfigurationState); } + // We're at the IServiceConfigurator boundary. IServiceConfigurator is consumed by + // ServiceCommand (an AbstractCommand), and AbstractCommand.Start() is sync because + // ICommand.Start() is sync (Topshelf's runtime callback API is sync). So + // ConfigureService must return synchronously. This is the single sync-over-async + // bridge for the Linux service-configuration code path: a one-line wrapper over the + // private async implementation. Safe because no SynchronizationContext is captured on + // this call stack: the console-app main thread has none by default, and Topshelf's + // OnStart callback runs on a `new Thread(...)` worker that also has none. Either way, + // no captured context means no deadlock. 
+ // See https://blog.stephencleary.com/2012/07/dont-block-on-async-code.html void ConfigureService(string thisServiceName, string exePath, string? instance, string? configPath, string serviceDescription, ServiceConfigurationState serviceConfigurationState) + => ConfigureServiceAsync(thisServiceName, exePath, instance, configPath, serviceDescription, serviceConfigurationState).GetAwaiter().GetResult(); + + async Task ConfigureServiceAsync(string thisServiceName, string exePath, string? instance, string? configPath, string serviceDescription, ServiceConfigurationState serviceConfigurationState) { //Check if system has bash and systemd - CheckSystemPrerequisites(); + await CheckSystemPrerequisitesAsync(); var cleanedInstanceName = SanitizeString(instance ?? thisServiceName); var systemdUnitFilePath = $"/etc/systemd/system/{cleanedInstanceName}.service"; if (serviceConfigurationState.Restart) - RestartService(cleanedInstanceName); + await RestartServiceAsync(cleanedInstanceName); if (serviceConfigurationState.Stop) - StopService(cleanedInstanceName); + await StopServiceAsync(cleanedInstanceName); if (serviceConfigurationState.Uninstall) - UninstallService(cleanedInstanceName, systemdUnitFilePath); + await UninstallServiceAsync(cleanedInstanceName, systemdUnitFilePath); var serviceDependencies = new List(); serviceDependencies.AddRange(new[] {"network.target"}); @@ -72,7 +86,7 @@ void ConfigureService(string thisServiceName, string exePath, string? instance, var userName = serviceConfigurationState.Username ?? "root"; if (serviceConfigurationState.Install) - InstallService(cleanedInstanceName, + await InstallServiceAsync(cleanedInstanceName, instance, configPath, exePath, @@ -82,7 +96,7 @@ void ConfigureService(string thisServiceName, string exePath, string? 
instance, serviceDependencies); if (serviceConfigurationState.Reconfigure) - ReconfigureService(cleanedInstanceName, + await ReconfigureServiceAsync(cleanedInstanceName, instance, configPath, exePath, @@ -92,42 +106,42 @@ void ConfigureService(string thisServiceName, string exePath, string? instance, serviceDependencies); if (serviceConfigurationState.Start) - StartService(cleanedInstanceName); + await StartServiceAsync(cleanedInstanceName); } - void RestartService(string serviceName) + async Task RestartServiceAsync(string serviceName) { log.Info($"Restarting service: {serviceName}"); - if (systemCtlHelper.RestartService(serviceName)) + if (await systemCtlHelper.RestartServiceAsync(serviceName)) log.Info("Service has been restarted"); else log.Error("The service could not be restarted"); } - void StopService(string serviceName) + async Task StopServiceAsync(string serviceName) { log.Info($"Stopping service: {serviceName}"); - if (systemCtlHelper.StopService(serviceName)) + if (await systemCtlHelper.StopServiceAsync(serviceName)) log.Info("Service stopped"); else log.Error("The service could not be stopped"); } - void StartService(string serviceName) + async Task StartServiceAsync(string serviceName) { - if (systemCtlHelper.StartService(serviceName, true)) + if (await systemCtlHelper.StartServiceAsync(serviceName, true)) log.Info($"Service started: {serviceName}"); else log.Error($"Could not start the systemd service: {serviceName}"); } - void UninstallService(string instance, string systemdUnitFilePath) + async Task UninstallServiceAsync(string instance, string systemdUnitFilePath) { log.Info($"Removing systemd service: {instance}"); try { - systemCtlHelper.StopService(instance); - systemCtlHelper.DisableService(instance); + await systemCtlHelper.StopServiceAsync(instance); + await systemCtlHelper.DisableServiceAsync(instance); File.Delete(systemdUnitFilePath); log.Info("Service uninstalled"); } @@ -138,7 +152,7 @@ void UninstallService(string instance, string 
systemdUnitFilePath) } } - void InstallService(string serviceName, + async Task InstallServiceAsync(string serviceName, string? instance, string? configPath, string exePath, @@ -149,8 +163,8 @@ void InstallService(string serviceName, { try { - WriteUnitFile(systemdUnitFilePath, GenerateSystemdUnitFile(instance, configPath, serviceDescription, exePath, userName, serviceDependencies)); - systemCtlHelper.EnableService(serviceName, true); + await WriteUnitFileAsync(systemdUnitFilePath, GenerateSystemdUnitFile(instance, configPath, serviceDescription, exePath, userName, serviceDependencies)); + await systemCtlHelper.EnableServiceAsync(serviceName, true); log.Info($"Service installed: {serviceName}"); } catch (Exception e) @@ -160,7 +174,7 @@ void InstallService(string serviceName, } } - void ReconfigureService(string serviceName, + async Task ReconfigureServiceAsync(string serviceName, string? instance, string? configPath, string exePath, @@ -173,13 +187,13 @@ void ReconfigureService(string serviceName, { log.Info($"Attempting to remove old service: {serviceName}"); //remove service - systemCtlHelper.StopService(serviceName); - systemCtlHelper.DisableService(serviceName); + await systemCtlHelper.StopServiceAsync(serviceName); + await systemCtlHelper.DisableServiceAsync(serviceName); File.Delete(systemdUnitFilePath); //re-add service - WriteUnitFile(systemdUnitFilePath, GenerateSystemdUnitFile(instance, configPath, serviceDescription, exePath, userName, serviceDependencies)); - systemCtlHelper.EnableService(serviceName, true); + await WriteUnitFileAsync(systemdUnitFilePath, GenerateSystemdUnitFile(instance, configPath, serviceDescription, exePath, userName, serviceDependencies)); + await systemCtlHelper.EnableServiceAsync(serviceName, true); log.Info($"Service installed: {serviceName}"); } catch (Exception e) @@ -189,93 +203,48 @@ void ReconfigureService(string serviceName, } } - void WriteUnitFile(string path, string contents) + async Task WriteUnitFileAsync(string 
path, string contents) { File.WriteAllText(path, contents); var commandLineInvocation = new CommandLineInvocation("/bin/bash", $"-c \"chmod 644 {path}\""); - // We're in LinuxServiceConfigurator, which implements IServiceConfigurator. - // IServiceConfigurator is consumed by ServiceCommand (an AbstractCommand). - // AbstractCommand.Start() is sync because it implements ICommand.Start(), - // which is sync because Topshelf's runtime callback API is sync. So - // ConfigureService must return synchronously, and we block on the async - // call with .GetAwaiter().GetResult(). This is sync-over-async but is safe - // because no SynchronizationContext is ever captured on this call stack. - // When the user runs `tentacle service ...` from a shell, we enter through - // the process entry point (Main), and the main thread of a .NET console app - // has no SynchronizationContext installed by default. When running as an - // installed systemd service, Topshelf's OnStart callback runs on a - // dedicated worker (a `new Thread(...)`) that also has no - // SynchronizationContext. Either way, no captured context means no - // deadlock. - // See https://blog.stephencleary.com/2012/07/dont-block-on-async-code.html - var result = commandLineInvocation.ExecuteCommandAsync().GetAwaiter().GetResult(); + var result = await commandLineInvocation.ExecuteCommandAsync(); if (result.ExitCode == 0) return; result.Validate(); } - void CheckSystemPrerequisites() + async Task CheckSystemPrerequisitesAsync() { if (!File.Exists("/bin/bash")) throw new ControlledFailureException( "Could not detect bash. bash is required to run tentacle."); - if (!HaveSudoPrivileges()) + if (!await HaveSudoPrivilegesAsync()) throw new ControlledFailureException( "Requires elevated privileges. Please run command as sudo."); - if (!IsSystemdInstalled()) + if (!await IsSystemdInstalledAsync()) throw new ControlledFailureException( "Could not detect systemd. 
systemd is required to run Tentacle as a service"); } - bool IsSystemdInstalled() + async Task IsSystemdInstalledAsync() { var commandLineInvocation = new CommandLineInvocation("/bin/bash", "-c \"command -v systemctl >/dev/null\""); - // We're in LinuxServiceConfigurator, which implements IServiceConfigurator. - // IServiceConfigurator is consumed by ServiceCommand (an AbstractCommand). - // AbstractCommand.Start() is sync because it implements ICommand.Start(), - // which is sync because Topshelf's runtime callback API is sync. So - // ConfigureService must return synchronously, and we block on the async - // call with .GetAwaiter().GetResult(). This is sync-over-async but is safe - // because no SynchronizationContext is ever captured on this call stack. - // When the user runs `tentacle service ...` from a shell, we enter through - // the process entry point (Main), and the main thread of a .NET console app - // has no SynchronizationContext installed by default. When running as an - // installed systemd service, Topshelf's OnStart callback runs on a - // dedicated worker (a `new Thread(...)`) that also has no - // SynchronizationContext. Either way, no captured context means no - // deadlock. - // See https://blog.stephencleary.com/2012/07/dont-block-on-async-code.html - var result = commandLineInvocation.ExecuteCommandAsync().GetAwaiter().GetResult(); + var result = await commandLineInvocation.ExecuteCommandAsync(); return result.ExitCode == 0; } - bool HaveSudoPrivileges() + async Task HaveSudoPrivilegesAsync() { var commandLineInvocation = new CommandLineInvocation("/bin/bash", "-c \"sudo -vn 2> /dev/null\""); - // We're in LinuxServiceConfigurator, which implements IServiceConfigurator. - // IServiceConfigurator is consumed by ServiceCommand (an AbstractCommand). - // AbstractCommand.Start() is sync because it implements ICommand.Start(), - // which is sync because Topshelf's runtime callback API is sync. 
So - // ConfigureService must return synchronously, and we block on the async - // call with .GetAwaiter().GetResult(). This is sync-over-async but is safe - // because no SynchronizationContext is ever captured on this call stack. - // When the user runs `tentacle service ...` from a shell, we enter through - // the process entry point (Main), and the main thread of a .NET console app - // has no SynchronizationContext installed by default. When running as an - // installed systemd service, Topshelf's OnStart callback runs on a - // dedicated worker (a `new Thread(...)`) that also has no - // SynchronizationContext. Either way, no captured context means no - // deadlock. - // See https://blog.stephencleary.com/2012/07/dont-block-on-async-code.html - var result = commandLineInvocation.ExecuteCommandAsync().GetAwaiter().GetResult(); + var result = await commandLineInvocation.ExecuteCommandAsync(); return result.ExitCode == 0; } - string GenerateSystemdUnitFile(string? instance, + string GenerateSystemdUnitFile(string? instance, string? configPath, string serviceDescription, string exePath, string userName, IEnumerable serviceDependencies) { @@ -291,7 +260,7 @@ string GenerateSystemdUnitFile(string? instance, if (!string.IsNullOrEmpty(instance)) { stringBuilder.Append($" --instance={instance}"); - } + } else if (!string.IsNullOrEmpty(configPath)) { stringBuilder.Append($" --config={configPath}"); @@ -312,4 +281,4 @@ string GenerateSystemdUnitFile(string? 
instance, static string SanitizeString(string str) => Regex.Replace(str.Replace("/", ""), @"\s+", "-"); } -} \ No newline at end of file +} diff --git a/source/Octopus.Tentacle/Startup/WindowsServiceConfigurator.cs b/source/Octopus.Tentacle/Startup/WindowsServiceConfigurator.cs index 9f9714bca..6d368d705 100644 --- a/source/Octopus.Tentacle/Startup/WindowsServiceConfigurator.cs +++ b/source/Octopus.Tentacle/Startup/WindowsServiceConfigurator.cs @@ -1,4 +1,4 @@ -using System; +using System; using System.Collections.Generic; using System.Diagnostics.CodeAnalysis; using System.IO; @@ -7,6 +7,7 @@ using System.ServiceProcess; using System.Text; using System.Threading; +using System.Threading.Tasks; using Octopus.Tentacle.Core.Diagnostics; using Octopus.Tentacle.Util; using Polly; @@ -58,12 +59,30 @@ public void ConfigureServiceByConfigPath(string thisServiceName, serviceConfigurationState); } + // We're at the IServiceConfigurator boundary. IServiceConfigurator is consumed by + // ServiceCommand (an AbstractCommand), and AbstractCommand.Start() is sync because + // ICommand.Start() is sync (Topshelf's runtime callback API is sync). So + // ConfigureService must return synchronously. This is the single sync-over-async + // bridge for the Windows service-configuration code path: a one-line wrapper over the + // private async implementation. Safe because no SynchronizationContext is captured on + // this call stack: the console-app main thread has none by default, and Topshelf's + // OnStart callback runs on a `new Thread(...)` worker that also has none. Either way, + // no captured context means no deadlock. + // See https://blog.stephencleary.com/2012/07/dont-block-on-async-code.html void ConfigureService(string thisServiceName, string exePath, string? instance, string? 
configPath, string serviceDescription, ServiceConfigurationState serviceConfigurationState) + => ConfigureServiceAsync(thisServiceName, exePath, instance, configPath, serviceDescription, serviceConfigurationState).GetAwaiter().GetResult(); + + async Task ConfigureServiceAsync(string thisServiceName, + string exePath, + string? instance, + string? configPath, + string serviceDescription, + ServiceConfigurationState serviceConfigurationState) { windowsLocalAdminRightsChecker.AssertIsRunningElevated(); var services = ServiceController.GetServices(); @@ -81,7 +100,7 @@ void ConfigureService(string thisServiceName, if (serviceConfigurationState.Uninstall) { - UninstallService(thisServiceName, controller); + await UninstallServiceAsync(thisServiceName, controller); } var serviceDependencies = new List(); @@ -96,18 +115,18 @@ void ConfigureService(string thisServiceName, if (serviceConfigurationState.Install) { - controller = InstallService(thisServiceName, exePath, instance, configPath, + controller = await InstallServiceAsync(thisServiceName, exePath, instance, configPath, serviceDescription, serviceConfigurationState, controller, serviceDependencies); } if (serviceConfigurationState.Reconfigure) { - ReconfigureService(thisServiceName, exePath, instance, configPath, serviceDescription, serviceDependencies); + await ReconfigureServiceAsync(thisServiceName, exePath, instance, configPath, serviceDescription, serviceDependencies); } if ((serviceConfigurationState.Install || serviceConfigurationState.Reconfigure) && !string.IsNullOrWhiteSpace(serviceConfigurationState.Username)) { - ConfigureCredentialsForService(thisServiceName, serviceConfigurationState); + await ConfigureCredentialsForServiceAsync(thisServiceName, serviceConfigurationState); } if (serviceConfigurationState.Start) @@ -116,7 +135,7 @@ void ConfigureService(string thisServiceName, } } - void ConfigureCredentialsForService(string thisServiceName, ServiceConfigurationState serviceConfigurationState) + async 
Task ConfigureCredentialsForServiceAsync(string thisServiceName, ServiceConfigurationState serviceConfigurationState) { if (!string.IsNullOrWhiteSpace(serviceConfigurationState.Password)) { @@ -137,13 +156,13 @@ void ConfigureCredentialsForService(string thisServiceName, ServiceConfiguration } else { - Sc($"config \"{thisServiceName}\" obj= \"{serviceConfigurationState.Username}\""); + await ScAsync($"config \"{thisServiceName}\" obj= \"{serviceConfigurationState.Username}\""); } log.Info("Service credentials set"); } - void ReconfigureService(string thisServiceName, + async Task ReconfigureServiceAsync(string thisServiceName, string exePath, string? instance, string? configPath, @@ -154,13 +173,13 @@ void ReconfigureService(string thisServiceName, var command = exePath.EndsWith(".dll") ? $"config \"{thisServiceName}\" binpath= \"dotnet \\\"{exePath}\\\" run {instanceIdentifier} DisplayName= \"{thisServiceName}\" depend= {string.Join("/", serviceDependencies)} start= auto" : $"config \"{thisServiceName}\" binpath= \"\\\"{exePath}\\\" run {instanceIdentifier} DisplayName= \"{thisServiceName}\" depend= {string.Join("/", serviceDependencies)} start= auto"; - Sc(command); - Sc($"description \"{thisServiceName}\" \"{serviceDescription}\""); + await ScAsync(command); + await ScAsync($"description \"{thisServiceName}\" \"{serviceDescription}\""); log.Info("Service reconfigured"); } - ServiceController? InstallService(string thisServiceName, + async Task InstallServiceAsync(string thisServiceName, string exePath, string? instance, string? configPath, @@ -181,8 +200,8 @@ void ReconfigureService(string thisServiceName, ? 
$"create \"{thisServiceName}\" binpath= \"dotnet \\\"{exePath}\\\" run {instanceIdentifier} DisplayName= \"{thisServiceName}\" depend= {string.Join("/", serviceDependencies)} start= auto" : $"create \"{thisServiceName}\" binpath= \"\\\"{exePath}\\\" run {instanceIdentifier} DisplayName= \"{thisServiceName}\" depend= {string.Join("/", serviceDependencies)} start= auto"; - Sc(command); - Sc($"description \"{thisServiceName}\" \"{serviceDescription}\""); + await ScAsync(command); + await ScAsync($"description \"{thisServiceName}\" \"{serviceDescription}\""); } log.Info("Service installed"); @@ -207,7 +226,7 @@ static string InstanceIdentifier(string? instance, string? configPath) throw new InvalidOperationException("Either the instance name of configuration path must be provided to configure a service"); } - void UninstallService(string thisServiceName, ServiceController? controller) + async Task UninstallServiceAsync(string thisServiceName, ServiceController? controller) { if (controller == null) { @@ -215,7 +234,7 @@ void UninstallService(string thisServiceName, ServiceController? controller) } else { - Sc($"delete \"{thisServiceName}\""); + await ScAsync($"delete \"{thisServiceName}\""); log.Info("Service uninstalled"); } @@ -333,7 +352,7 @@ void WaitForControllerStatus(ServiceController controller, ServiceControllerStat 150); } - void Sc(string arguments) + async Task ScAsync(string arguments) { var outputBuilder = new StringBuilder(); var argumentsToLog = string.Join(" ", arguments); @@ -342,19 +361,12 @@ void Sc(string arguments) var sc = Path.Combine(system32, "sc.exe"); logFileOnlyLogger.Info($"Executing sc.exe {argumentsToLog}"); - // We're in Sc() running sc.exe, called from IServiceConfigurator.ConfigureService - // implementations which are sync (called from the Tentacle service-management CLI on - // Windows), so we block on the async call with .GetAwaiter().GetResult(). 
- // This is sync-over-async but is safe because the CLI dispatches us on a plain - // thread-pool worker. No captured SynchronizationContext, so no deadlock. - // See https://blog.stephencleary.com/2012/07/dont-block-on-async-code.html - var exitCode = SilentProcessRunnerExtended.ExecuteCommandAsync(sc, + var exitCode = await SilentProcessRunnerExtended.ExecuteCommandAsync(sc, arguments, Environment.CurrentDirectory, output => outputBuilder.AppendLine(output), error => outputBuilder.AppendLine("Error: " + error), - cancel: CancellationToken.None) - .GetAwaiter().GetResult(); + cancel: CancellationToken.None); if (exitCode == 0) logFileOnlyLogger.Info(outputBuilder.ToString()); else diff --git a/source/Octopus.Tentacle/Util/CommandLineRunner.cs b/source/Octopus.Tentacle/Util/CommandLineRunner.cs index faf00b1d8..3f2609999 100644 --- a/source/Octopus.Tentacle/Util/CommandLineRunner.cs +++ b/source/Octopus.Tentacle/Util/CommandLineRunner.cs @@ -1,6 +1,7 @@ using System; using System.Collections.Generic; using System.Threading; +using System.Threading.Tasks; using Octopus.Tentacle.Core.Diagnostics; namespace Octopus.Tentacle.Util @@ -8,13 +9,11 @@ namespace Octopus.Tentacle.Util public class CommandLineRunner : ICommandLineRunner { public bool Execute(IEnumerable commandLineInvocations, ILog log) - { - return Execute(commandLineInvocations, + => Execute(commandLineInvocations, log.Verbose, log.Info, log.Error, log.Error); - } public bool Execute(IEnumerable commandLineInvocations, Action debug, @@ -36,29 +35,63 @@ public bool Execute(CommandLineInvocation invocation, ILog log) log.Error, log.Error); + // We're at the ICommandLineRunner sync entry point, consumed by Octopus.Manager.Tentacle + // (WPF). The WPF installer calls Execute from ThreadPool.QueueUserWorkItem (a sync + // delegate), so this is the sync-over-async bridge: a one-line wrapper over the public + // async implementation. Safe because the installer dispatches us on a plain thread-pool + // worker. 
No captured SynchronizationContext, so no deadlock. Async callers should call + // ExecuteAsync directly. + // See https://blog.stephencleary.com/2012/07/dont-block-on-async-code.html public bool Execute(CommandLineInvocation invocation, Action debug, Action info, Action error, Action exception) + => ExecuteAsync(invocation, debug, info, error, exception).GetAwaiter().GetResult(); + + public Task ExecuteAsync(IEnumerable commandLineInvocations, ILog log) + => ExecuteAsync(commandLineInvocations, + log.Verbose, + log.Info, + log.Error, + log.Error); + + public async Task ExecuteAsync(IEnumerable commandLineInvocations, + Action debug, + Action info, + Action error, + Action exception) + { + foreach (var invocation in commandLineInvocations) + if (!await ExecuteAsync(invocation, debug, info, error, exception)) + return false; + + return true; + } + + public Task ExecuteAsync(CommandLineInvocation invocation, ILog log) + => ExecuteAsync(invocation, + log.Info, + log.Info, + log.Error, + log.Error); + + public async Task ExecuteAsync(CommandLineInvocation invocation, + Action debug, + Action info, + Action error, + Action exception) { try { - // We're in CommandLineRunner.Execute, consumed by Octopus.Manager.Tentacle (WPF). - // The WPF installer calls Execute from ThreadPool.QueueUserWorkItem (a sync - // delegate), so we block on the async call with .GetAwaiter().GetResult(). - // This is sync-over-async but is safe because the installer dispatches us on a - // plain thread-pool worker. No captured SynchronizationContext, so no deadlock. - // See https://blog.stephencleary.com/2012/07/dont-block-on-async-code.html - var exitCode = SilentProcessRunner.ExecuteCommandAsync( - invocation.Executable, - (invocation.Arguments ?? "") + " " + (invocation.SystemArguments ?? 
""), - Environment.CurrentDirectory, - debug, - info, - error, - cancel: CancellationToken.None) - .GetAwaiter().GetResult(); + var exitCode = await SilentProcessRunner.ExecuteCommandAsync( + invocation.Executable, + (invocation.Arguments ?? "") + " " + (invocation.SystemArguments ?? ""), + Environment.CurrentDirectory, + debug, + info, + error, + cancel: CancellationToken.None); if (exitCode != 0) { diff --git a/source/Octopus.Tentacle/Util/ICommandLineRunner.cs b/source/Octopus.Tentacle/Util/ICommandLineRunner.cs index a1244a681..ab0b1d7db 100644 --- a/source/Octopus.Tentacle/Util/ICommandLineRunner.cs +++ b/source/Octopus.Tentacle/Util/ICommandLineRunner.cs @@ -1,5 +1,6 @@ -using System; +using System; using System.Collections.Generic; +using System.Threading.Tasks; using Octopus.Tentacle.Core.Diagnostics; namespace Octopus.Tentacle.Util @@ -21,5 +22,21 @@ bool Execute(CommandLineInvocation invocation, Action info, Action error, Action exception); + + Task ExecuteAsync(IEnumerable commandLineInvocations, ILog log); + + Task ExecuteAsync(IEnumerable commandLineInvocations, + Action debug, + Action info, + Action error, + Action exception); + + Task ExecuteAsync(CommandLineInvocation commandLineInvocation, ILog log); + + Task ExecuteAsync(CommandLineInvocation invocation, + Action debug, + Action info, + Action error, + Action exception); } -} \ No newline at end of file +} diff --git a/source/Octopus.Tentacle/Util/SystemCtlHelper.cs b/source/Octopus.Tentacle/Util/SystemCtlHelper.cs index ba4edf993..d97fb3a03 100644 --- a/source/Octopus.Tentacle/Util/SystemCtlHelper.cs +++ b/source/Octopus.Tentacle/Util/SystemCtlHelper.cs @@ -1,5 +1,6 @@ using System; using System.Linq; +using System.Threading.Tasks; using Octopus.Tentacle.Core.Diagnostics; namespace Octopus.Tentacle.Util @@ -13,32 +14,26 @@ public SystemCtlHelper(ISystemLog log) this.log = log; } - public bool StartService(string serviceName, bool logFailureAsError = false) - => RunServiceCommand("start", 
serviceName, logFailureAsError); + public Task StartServiceAsync(string serviceName, bool logFailureAsError = false) + => RunServiceCommandAsync("start", serviceName, logFailureAsError); - public bool RestartService(string serviceName, bool logFailureAsError = false) - => RunServiceCommand("restart", serviceName, logFailureAsError); + public Task RestartServiceAsync(string serviceName, bool logFailureAsError = false) + => RunServiceCommandAsync("restart", serviceName, logFailureAsError); - public bool StopService(string serviceName, bool logFailureAsError = false) - => RunServiceCommand("stop", serviceName, logFailureAsError); + public Task StopServiceAsync(string serviceName, bool logFailureAsError = false) + => RunServiceCommandAsync("stop", serviceName, logFailureAsError); - public bool EnableService(string serviceName, bool logFailureAsError = false) - => RunServiceCommand("enable", serviceName, logFailureAsError); + public Task EnableServiceAsync(string serviceName, bool logFailureAsError = false) + => RunServiceCommandAsync("enable", serviceName, logFailureAsError); - public bool DisableService(string serviceName, bool logFailureAsError = false) - => RunServiceCommand("disable", serviceName, logFailureAsError); + public Task DisableServiceAsync(string serviceName, bool logFailureAsError = false) + => RunServiceCommandAsync("disable", serviceName, logFailureAsError); - bool RunServiceCommand(string command, string serviceName, bool logFailureAsError) + async Task RunServiceCommandAsync(string command, string serviceName, bool logFailureAsError) { // Try without sudo first var commandLineInvocation = new CommandLineInvocation("/bin/bash", $"-c \"systemctl {command} {serviceName}\""); - // We're in SystemCtlHelper running a systemctl command. All callers (StartService, - // RestartService, etc.) are sync, called from the Tentacle service-management CLI - // which has no async path, so we block on the async call with .GetAwaiter().GetResult(). 
- // This is sync-over-async but is safe because the CLI dispatches us on a plain - // thread-pool worker. No captured SynchronizationContext, so no deadlock. - // See https://blog.stephencleary.com/2012/07/dont-block-on-async-code.html - var result = commandLineInvocation.ExecuteCommandAsync().GetAwaiter().GetResult(); + var result = await commandLineInvocation.ExecuteCommandAsync(); if (result.ExitCode == 0) return true; // Check if failure is due to authentication/permission issues @@ -54,10 +49,9 @@ bool RunServiceCommand(string command, string serviceName, bool logFailureAsErro { log.Info($"Permission denied. Retrying 'systemctl {command} {serviceName}' with sudo..."); var sudoInvocation = new CommandLineInvocation("/bin/bash", $"-c \"sudo systemctl {command} {serviceName}\""); - // Same sync-over-async boundary as above: sudo retry on the same thread-pool worker. - result = sudoInvocation.ExecuteCommandAsync().GetAwaiter().GetResult(); + result = await sudoInvocation.ExecuteCommandAsync(); if (result.ExitCode == 0) return true; - + usedSudo = true; } From 0b9e11f8a242d4e32ec49974cdadff6b56c1b54a Mon Sep 17 00:00:00 2001 From: Jim Pelletier Date: Tue, 26 May 2026 16:48:36 +1000 Subject: [PATCH 10/52] Push async sibling up so KubernetesScriptPodCreator awaits directly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address PR review on #1236: the sync-over-async hop at the IKubernetesDirectoryInformationProvider boundary was carrying two consumers — a sync override (EnsureDiskHasEnoughFreeSpace) and an async one (CreateScriptContainer). Expose async siblings (GetPathUsedBytesAsync on the provider, GetStorageInformationAsync on KubernetesPhysicalFileSystem) so the async caller awaits directly. The sync GetPathUsedBytes and GetStorageInformation remain for the IOctopusFileSystem override, and the lighthouse comment now names that exact consumer instead of waving at a non-existent background sweeper. 
--- .../KubernetesDirectoryInformationProvider.cs | 14 ++++++++------ .../Kubernetes/KubernetesPhysicalFileSystem.cs | 13 +++++++++++++ .../Kubernetes/KubernetesScriptPodCreator.cs | 2 +- 3 files changed, 22 insertions(+), 7 deletions(-) diff --git a/source/Octopus.Tentacle/Kubernetes/KubernetesDirectoryInformationProvider.cs b/source/Octopus.Tentacle/Kubernetes/KubernetesDirectoryInformationProvider.cs index 4a373e0f1..c5e3e32a6 100644 --- a/source/Octopus.Tentacle/Kubernetes/KubernetesDirectoryInformationProvider.cs +++ b/source/Octopus.Tentacle/Kubernetes/KubernetesDirectoryInformationProvider.cs @@ -12,6 +12,7 @@ namespace Octopus.Tentacle.Kubernetes public interface IKubernetesDirectoryInformationProvider { public ulong? GetPathUsedBytes(string directoryPath); + public Task GetPathUsedBytesAsync(string directoryPath); public ulong? GetPathTotalBytes(); } @@ -37,16 +38,17 @@ public KubernetesDirectoryInformationProvider(ISystemLog log, ISilentProcessRunn this.directoryInformationCache = directoryInformationCache; } - // We're at the IKubernetesDirectoryInformationProvider boundary. Callers (capacity - // reporting) are sync, so this is the sync-over-async bridge: a one-line wrapper over - // the private async implementation. Safe because the consumer is a background sweeper - // running on a plain thread-pool worker. No captured SynchronizationContext, so no - // deadlock. + // Sync-over-async bridge for the one remaining sync caller: KubernetesPhysicalFileSystem + // overrides IOctopusFileSystem.EnsureDiskHasEnoughFreeSpace (sync), which calls + // GetStorageInformation (sync), which calls this. Async callers should use + // GetPathUsedBytesAsync directly. Safe because the Kubernetes agent is a console process + // (no SynchronizationContext) and the file-system call paths run on plain thread-pool + // workers, so no deadlock. // See https://blog.stephencleary.com/2012/07/dont-block-on-async-code.html public ulong? 
GetPathUsedBytes(string directoryPath) => GetPathUsedBytesAsync(directoryPath).GetAwaiter().GetResult(); - async Task GetPathUsedBytesAsync(string directoryPath) + public async Task GetPathUsedBytesAsync(string directoryPath) { return await directoryInformationCache.GetOrCreateAsync(directoryPath, async e => { diff --git a/source/Octopus.Tentacle/Kubernetes/KubernetesPhysicalFileSystem.cs b/source/Octopus.Tentacle/Kubernetes/KubernetesPhysicalFileSystem.cs index e1bc2378d..0f2fce740 100644 --- a/source/Octopus.Tentacle/Kubernetes/KubernetesPhysicalFileSystem.cs +++ b/source/Octopus.Tentacle/Kubernetes/KubernetesPhysicalFileSystem.cs @@ -1,5 +1,6 @@ using System; using System.IO; +using System.Threading.Tasks; using Octopus.Tentacle.Core.Diagnostics; using Octopus.Tentacle.Core.Util; using Octopus.Tentacle.Util; @@ -54,5 +55,17 @@ public override void EnsureDiskHasEnoughFreeSpace(string directoryPath, long req return null; } + + public async Task<(ulong freeSpaceBytes, ulong totalSpaceBytes)?> GetStorageInformationAsync() + { + var bytesUsed = await directoryInformationProvider.GetPathUsedBytesAsync(HomeDir); + var bytesTotal = directoryInformationProvider.GetPathTotalBytes(); + if (bytesUsed.HasValue && bytesTotal.HasValue) + { + return (bytesTotal.Value - bytesUsed.Value, bytesTotal.Value); + } + + return null; + } } } \ No newline at end of file diff --git a/source/Octopus.Tentacle/Kubernetes/KubernetesScriptPodCreator.cs b/source/Octopus.Tentacle/Kubernetes/KubernetesScriptPodCreator.cs index 894a7d7ab..7a1439ad1 100644 --- a/source/Octopus.Tentacle/Kubernetes/KubernetesScriptPodCreator.cs +++ b/source/Octopus.Tentacle/Kubernetes/KubernetesScriptPodCreator.cs @@ -300,7 +300,7 @@ void LogVerboseToBothLogs(string message, InMemoryTentacleScriptLog tentacleScri protected async Task CreateScriptContainer(StartKubernetesScriptCommandV1 command, string podName, string scriptName, string homeDir, string workspacePath, string[]? 
scriptArguments, InMemoryTentacleScriptLog tentacleScriptLog, V1Container? containerSpec) { - var spaceInformation = kubernetesPhysicalFileSystem.GetStorageInformation(); + var spaceInformation = await kubernetesPhysicalFileSystem.GetStorageInformationAsync(); var commandString = string.Join(" ", new[] { From 7bb3ef6a9af2ea4bccf965e5881b1acfbdababc2 Mon Sep 17 00:00:00 2001 From: Jim Pelletier Date: Wed, 27 May 2026 11:05:45 +1000 Subject: [PATCH 11/52] Rework sync-over-async justifications + fix wizard telemetry test Address PR review on #1236: - Plainer two-section "Why this is sync / Why blocking is safe" format across all sync-over-async sites; tests add a "Why low risk" line per reviewer ask. - KubernetesDirectoryInformationProvider, PowerShellPrerequisite, Linux/WindowsServiceConfigurator, KubernetesAgentInstaller, SilentProcessRunnerFixture, LinuxTestUserPrincipal (previously had no justification, now does). - LinuxTentacleFetcher.ExtractTarGzip: both callers are already async, so flip the helper to ExtractTarGzipAsync and remove the sync-over-async entirely. Update NugetTentacleFetcher.ExtractTentacle to await. - Fix WhenSettingUpPollingTentacle_TelemetryEventShouldBeSent: builder was stubbing only the sync Execute overloads, so the ExecuteAsync switch in ReviewAndRunScriptTabViewModel returned a default false Task and the telemetry callback never fired. Stub both overloads. 
--- .../SetupTentacleWizardModelBuilder.cs | 2 ++ .../PreReq/PowerShellPrerequisite.cs | 14 +++++++++---- .../Setup/KubernetesAgentInstaller.cs | 11 ++++++---- .../TentacleFetchers/LinuxTentacleFetcher.cs | 14 ++++--------- .../TentacleFetchers/NugetTentacleFetcher.cs | 2 +- .../Util/SilentProcessRunnerFixture.cs | 15 +++++++------ .../Util/LinuxTestUserPrincipal.cs | 9 ++++++++ .../KubernetesDirectoryInformationProvider.cs | 15 +++++++------ .../Startup/LinuxServiceConfigurator.cs | 21 +++++++++++-------- .../Startup/WindowsServiceConfigurator.cs | 21 +++++++++++-------- 10 files changed, 75 insertions(+), 49 deletions(-) diff --git a/source/Octopus.Manager.Tentacle.Tests/Builders/SetupTentacleWizardModelBuilder.cs b/source/Octopus.Manager.Tentacle.Tests/Builders/SetupTentacleWizardModelBuilder.cs index 0e71b9e1a..b86c59576 100644 --- a/source/Octopus.Manager.Tentacle.Tests/Builders/SetupTentacleWizardModelBuilder.cs +++ b/source/Octopus.Manager.Tentacle.Tests/Builders/SetupTentacleWizardModelBuilder.cs @@ -32,6 +32,8 @@ public SetupTentacleWizardModelBuilder() commandLineRunner = Substitute.For(); commandLineRunner.Execute(Arg.Any(), Arg.Any()).Returns(true); commandLineRunner.Execute(Arg.Any>(), Arg.Any()).Returns(true); + commandLineRunner.ExecuteAsync(Arg.Any(), Arg.Any()).Returns(true); + commandLineRunner.ExecuteAsync(Arg.Any>(), Arg.Any()).Returns(true); } public SetupTentacleWizardModelBuilder WithTelemetryService(ITelemetryService telemetryService) diff --git a/source/Octopus.Manager.Tentacle/PreReq/PowerShellPrerequisite.cs b/source/Octopus.Manager.Tentacle/PreReq/PowerShellPrerequisite.cs index fcbe7cc17..e922e1b8f 100644 --- a/source/Octopus.Manager.Tentacle/PreReq/PowerShellPrerequisite.cs +++ b/source/Octopus.Manager.Tentacle/PreReq/PowerShellPrerequisite.cs @@ -11,10 +11,16 @@ public class PowerShellPrerequisite : IPrerequisite { public string StatusMessage => "Checking that Windows PowerShell 2.0 is installed..."; - // We're at the WPF installer 
prerequisite boundary. IPrerequisite.Check() must return - // synchronously, so this is the sync-over-async bridge: a one-line wrapper over the - // private async implementation. Safe because the installer dispatches us on a plain - // thread-pool worker. No captured SynchronizationContext, so no deadlock. + // Why this is sync: IPrerequisite.Check() is part of a sync interface used by + // the WPF installer's prerequisite plumbing. Making it async would mean + // converting the whole IPrerequisite chain, which is a wider refactor than + // this PR. + // + // Why blocking on the async call is safe: PreReqWindow.Start dispatches each + // prerequisite via DispatchHelper.Background, which queues us via + // ThreadPool.QueueUserWorkItem. That's a plain thread-pool worker with no + // SynchronizationContext, so there's nothing for the awaited continuation + // to wait on. // See https://blog.stephencleary.com/2012/07/dont-block-on-async-code.html public PrerequisiteCheckResult Check() => CheckAsync().GetAwaiter().GetResult(); diff --git a/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/KubernetesAgentInstaller.cs b/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/KubernetesAgentInstaller.cs index cdb4ecab3..72f23d2a0 100644 --- a/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/KubernetesAgentInstaller.cs +++ b/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/KubernetesAgentInstaller.cs @@ -219,10 +219,13 @@ public void Dispose() NamespaceFlag, AgentName); - // We're in IDisposable.Dispose(). Dispose() must return synchronously, so we - // block on the async call with .GetAwaiter().GetResult(). This is sync-over-async - // but is safe because the NUnit test runner dispatches us on a worker thread - // without a captured SynchronizationContext, so no deadlock. + // Why this is sync: IDisposable.Dispose() must return sync. 
+ // + // Why blocking on the async call is safe: this only runs under NUnit, + // which dispatches us on a worker thread with no SynchronizationContext. + // + // Why low risk: this is test code. The worst case for a wrong call here + // is a hung test, not a production incident. // See https://blog.stephencleary.com/2012/07/dont-block-on-async-code.html var exitCode = SilentProcessRunner.ExecuteCommandAsync( helmExePath, diff --git a/source/Octopus.Tentacle.Tests.Integration/Support/TentacleFetchers/LinuxTentacleFetcher.cs b/source/Octopus.Tentacle.Tests.Integration/Support/TentacleFetchers/LinuxTentacleFetcher.cs index 7bf97744a..298bcc4c6 100644 --- a/source/Octopus.Tentacle.Tests.Integration/Support/TentacleFetchers/LinuxTentacleFetcher.cs +++ b/source/Octopus.Tentacle.Tests.Integration/Support/TentacleFetchers/LinuxTentacleFetcher.cs @@ -44,11 +44,11 @@ async Task DownloadAndExtractFromUrl(string directoryPath, string url) var extractionDirectory = new DirectoryInfo(Path.Combine(directoryPath, "extracted")); - ExtractTarGzip(downloadFilePath, extractionDirectory.FullName, logger); + await ExtractTarGzipAsync(downloadFilePath, extractionDirectory.FullName, logger); return Path.Combine(extractionDirectory.FullName, "tentacle", "Tentacle"); } - public static void ExtractTarGzip(string gzArchiveName, string destFolder, ILogger logger) + public static async Task ExtractTarGzipAsync(string gzArchiveName, string destFolder, ILogger logger) { if (!Directory.Exists(destFolder)) { @@ -62,20 +62,14 @@ public static void ExtractTarGzip(string gzArchiveName, string destFolder, ILogg using var tmp = new TemporaryDirectory(); Action log = s => logger.Information(s); - // We're in a synchronous public static helper (ExtractTarGzip). The method - // must return synchronously, so we block on the async call with - // .GetAwaiter().GetResult(). 
This is sync-over-async but is safe because - // the NUnit test runner dispatches us on a worker thread without a captured - // SynchronizationContext, so no deadlock. - // See https://blog.stephencleary.com/2012/07/dont-block-on-async-code.html - var exitCode = SilentProcessRunner.ExecuteCommandAsync( + var exitCode = await SilentProcessRunner.ExecuteCommandAsync( "tar", $"xzvf \"{gzArchiveName}\" -C \"{destFolder}\"", tmp.DirectoryPath, log, log, log, - cancel: CancellationToken.None).GetAwaiter().GetResult(); + cancel: CancellationToken.None); if (exitCode != 0) { diff --git a/source/Octopus.Tentacle.Tests.Integration/Support/TentacleFetchers/NugetTentacleFetcher.cs b/source/Octopus.Tentacle.Tests.Integration/Support/TentacleFetchers/NugetTentacleFetcher.cs index dc6f3851e..fff2c6bc9 100644 --- a/source/Octopus.Tentacle.Tests.Integration/Support/TentacleFetchers/NugetTentacleFetcher.cs +++ b/source/Octopus.Tentacle.Tests.Integration/Support/TentacleFetchers/NugetTentacleFetcher.cs @@ -61,7 +61,7 @@ async Task DownloadAndExtractFromUrl(string directoryPath, Version versi var tentacleFolder = Path.Combine(directoryPath, "tentacle"); if (tentacleArtifact.EndsWith(".tar.gz")) { - LinuxTentacleFetcher.ExtractTarGzip(tentacleArtifact, tentacleFolder, logger); + await LinuxTentacleFetcher.ExtractTarGzipAsync(tentacleArtifact, tentacleFolder, logger); } else { diff --git a/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs b/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs index ada4a5095..44125a578 100644 --- a/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs +++ b/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs @@ -419,12 +419,15 @@ static int Execute( var info = new StringBuilder(); var error = new StringBuilder(); - // We're in a synchronous test helper (Execute) that exposes a sync int - // return and out parameters. 
The method must return synchronously, so we - // block on the async call with .GetAwaiter().GetResult(). This is - // sync-over-async but is safe because the NUnit test runner dispatches us - // on a worker thread without a captured SynchronizationContext, so no - // deadlock. + // Why this is sync: Execute is a test helper that returns int and uses + // out parameters — both force the signature to be sync. It's invoked + // directly from sync NUnit test methods. + // + // Why blocking on the async call is safe: NUnit dispatches us on a + // worker thread with no SynchronizationContext. + // + // Why low risk: this is test code. The worst case for a wrong call here + // is a hung test, not a production incident. // See https://blog.stephencleary.com/2012/07/dont-block-on-async-code.html var exitCode = SilentProcessRunner.ExecuteCommandAsync( command, diff --git a/source/Octopus.Tentacle.Tests/Util/LinuxTestUserPrincipal.cs b/source/Octopus.Tentacle.Tests/Util/LinuxTestUserPrincipal.cs index 9fe121add..219fa5b6a 100644 --- a/source/Octopus.Tentacle.Tests/Util/LinuxTestUserPrincipal.cs +++ b/source/Octopus.Tentacle.Tests/Util/LinuxTestUserPrincipal.cs @@ -19,6 +19,15 @@ public LinuxTestUserPrincipal(string username) public string UserName { get; } + // Why this is sync: RunCommand is called from the constructor, which can't + // be async. + // + // Why blocking on the async call is safe: this only runs under NUnit, which + // dispatches us on a worker thread with no SynchronizationContext. + // + // Why low risk: this is test code. The worst case for a wrong call here is + // a hung test, not a production incident. 
+ // See https://blog.stephencleary.com/2012/07/dont-block-on-async-code.html static void RunCommand(string arguments, bool failOnNonZeroExitCode = true) { var commandLineInvocation = new CommandLineInvocation("/bin/bash", arguments); diff --git a/source/Octopus.Tentacle/Kubernetes/KubernetesDirectoryInformationProvider.cs b/source/Octopus.Tentacle/Kubernetes/KubernetesDirectoryInformationProvider.cs index c5e3e32a6..7609ff106 100644 --- a/source/Octopus.Tentacle/Kubernetes/KubernetesDirectoryInformationProvider.cs +++ b/source/Octopus.Tentacle/Kubernetes/KubernetesDirectoryInformationProvider.cs @@ -38,12 +38,15 @@ public KubernetesDirectoryInformationProvider(ISystemLog log, ISilentProcessRunn this.directoryInformationCache = directoryInformationCache; } - // Sync-over-async bridge for the one remaining sync caller: KubernetesPhysicalFileSystem - // overrides IOctopusFileSystem.EnsureDiskHasEnoughFreeSpace (sync), which calls - // GetStorageInformation (sync), which calls this. Async callers should use - // GetPathUsedBytesAsync directly. Safe because the Kubernetes agent is a console process - // (no SynchronizationContext) and the file-system call paths run on plain thread-pool - // workers, so no deadlock. + // Why this is sync: the only caller is EnsureDiskHasEnoughFreeSpace, which + // overrides a sync method on the IOctopusFileSystem chain. Making that + // whole chain async is a wider refactor than this PR. New code should + // call GetPathUsedBytesAsync directly instead of going through here. + // + // Why blocking on the async call is safe: .GetAwaiter().GetResult() can + // deadlock when the calling thread has a SynchronizationContext. The + // Kubernetes agent is a console app and doesn't set one up, so there's + // nothing for the awaited continuation to wait on. // See https://blog.stephencleary.com/2012/07/dont-block-on-async-code.html public ulong? 
GetPathUsedBytes(string directoryPath) => GetPathUsedBytesAsync(directoryPath).GetAwaiter().GetResult(); diff --git a/source/Octopus.Tentacle/Startup/LinuxServiceConfigurator.cs b/source/Octopus.Tentacle/Startup/LinuxServiceConfigurator.cs index 6f3c099a7..128996c1d 100644 --- a/source/Octopus.Tentacle/Startup/LinuxServiceConfigurator.cs +++ b/source/Octopus.Tentacle/Startup/LinuxServiceConfigurator.cs @@ -48,15 +48,18 @@ public void ConfigureServiceByConfigPath(string thisServiceName, serviceConfigurationState); } - // We're at the IServiceConfigurator boundary. IServiceConfigurator is consumed by - // ServiceCommand (an AbstractCommand), and AbstractCommand.Start() is sync because - // ICommand.Start() is sync (Topshelf's runtime callback API is sync). So - // ConfigureService must return synchronously. This is the single sync-over-async - // bridge for the Linux service-configuration code path: a one-line wrapper over the - // private async implementation. Safe because no SynchronizationContext is captured on - // this call stack: the console-app main thread has none by default, and Topshelf's - // OnStart callback runs on a `new Thread(...)` worker that also has none. Either way, - // no captured context means no deadlock. + // Used by ServiceCommand (an AbstractCommand) to install/configure the + // Tentacle as a Linux systemd service. + // + // Why this is sync: AbstractCommand.Start() is sync because ICommand.Start() + // is sync. When Tentacle runs as a Windows service we host AbstractCommands + // via Topshelf, whose runtime callback API is also sync — so the call path + // has to return sync end-to-end. + // + // Why blocking on the async call is safe: the console-app main thread has + // no SynchronizationContext. Topshelf's OnStart callback runs on a fresh + // `new Thread(...)` worker that also has none. Either way, nothing for the + // awaited continuation to wait on. 
// See https://blog.stephencleary.com/2012/07/dont-block-on-async-code.html void ConfigureService(string thisServiceName, string exePath, string? instance, string? configPath, string serviceDescription, ServiceConfigurationState serviceConfigurationState) => ConfigureServiceAsync(thisServiceName, exePath, instance, configPath, serviceDescription, serviceConfigurationState).GetAwaiter().GetResult(); diff --git a/source/Octopus.Tentacle/Startup/WindowsServiceConfigurator.cs b/source/Octopus.Tentacle/Startup/WindowsServiceConfigurator.cs index 6d368d705..581b80541 100644 --- a/source/Octopus.Tentacle/Startup/WindowsServiceConfigurator.cs +++ b/source/Octopus.Tentacle/Startup/WindowsServiceConfigurator.cs @@ -59,15 +59,18 @@ public void ConfigureServiceByConfigPath(string thisServiceName, serviceConfigurationState); } - // We're at the IServiceConfigurator boundary. IServiceConfigurator is consumed by - // ServiceCommand (an AbstractCommand), and AbstractCommand.Start() is sync because - // ICommand.Start() is sync (Topshelf's runtime callback API is sync). So - // ConfigureService must return synchronously. This is the single sync-over-async - // bridge for the Windows service-configuration code path: a one-line wrapper over the - // private async implementation. Safe because no SynchronizationContext is captured on - // this call stack: the console-app main thread has none by default, and Topshelf's - // OnStart callback runs on a `new Thread(...)` worker that also has none. Either way, - // no captured context means no deadlock. + // Used by ServiceCommand (an AbstractCommand) to install/configure the + // Tentacle as a Windows Service. + // + // Why this is sync: AbstractCommand.Start() is sync because ICommand.Start() + // is sync. When Tentacle runs as a Windows service we host AbstractCommands + // via Topshelf, whose runtime callback API is also sync — so the call path + // has to return sync end-to-end. 
+ // + // Why blocking on the async call is safe: the console-app main thread has + // no SynchronizationContext. Topshelf's OnStart callback runs on a fresh + // `new Thread(...)` worker that also has none. Either way, nothing for the + // awaited continuation to wait on. // See https://blog.stephencleary.com/2012/07/dont-block-on-async-code.html void ConfigureService(string thisServiceName, string exePath, From 0a5d94198cabfb1845473bc133eaf9b2c759c463 Mon Sep 17 00:00:00 2001 From: Jim Pelletier Date: Wed, 27 May 2026 15:31:30 +1000 Subject: [PATCH 12/52] Pull shared post-fetch logic into BuildStorageInformation helper Address PR review on #1236: the sync and async GetStorageInformation variants had duplicated body. Factor the bytesTotal lookup + tuple assembly into a private sync helper so the sync/async pair we need stays DRY. --- .../Kubernetes/KubernetesPhysicalFileSystem.cs | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/source/Octopus.Tentacle/Kubernetes/KubernetesPhysicalFileSystem.cs b/source/Octopus.Tentacle/Kubernetes/KubernetesPhysicalFileSystem.cs index 0f2fce740..45d0570a5 100644 --- a/source/Octopus.Tentacle/Kubernetes/KubernetesPhysicalFileSystem.cs +++ b/source/Octopus.Tentacle/Kubernetes/KubernetesPhysicalFileSystem.cs @@ -47,18 +47,21 @@ public override void EnsureDiskHasEnoughFreeSpace(string directoryPath, long req public (ulong freeSpaceBytes, ulong totalSpaceBytes)? 
GetStorageInformation() { var bytesUsed = directoryInformationProvider.GetPathUsedBytes(HomeDir); - var bytesTotal = directoryInformationProvider.GetPathTotalBytes(); - if (bytesUsed.HasValue && bytesTotal.HasValue) - { - return (bytesTotal.Value - bytesUsed.Value, bytesTotal.Value); - } - - return null; + return BuildStorageInformation(bytesUsed); } public async Task<(ulong freeSpaceBytes, ulong totalSpaceBytes)?> GetStorageInformationAsync() { var bytesUsed = await directoryInformationProvider.GetPathUsedBytesAsync(HomeDir); + return BuildStorageInformation(bytesUsed); + } + + // Shared sync assembler used by both GetStorageInformation (sync, for the + // IOctopusFileSystem override path) and GetStorageInformationAsync (for + // CreateScriptContainer). Pulled out to keep the post-fetch logic DRY + // across the sync/async pair we need. + (ulong freeSpaceBytes, ulong totalSpaceBytes)? BuildStorageInformation(ulong? bytesUsed) + { var bytesTotal = directoryInformationProvider.GetPathTotalBytes(); if (bytesUsed.HasValue && bytesTotal.HasValue) { From e8a7126bf6cd18d3341d9bf72dc0802390ef04ad Mon Sep 17 00:00:00 2001 From: Jim Pelletier Date: Tue, 26 May 2026 12:17:32 +1000 Subject: [PATCH 13/52] Add abandon contracts and TentacleDebugDisableProcessKill env var Co-Authored-By: Claude Opus 4.7 (1M context) --- .../IAsyncClientScriptServiceV2.cs | 1 + .../ScriptExitCodes.cs | 1 + .../ScriptServiceV2/AbandonScriptCommandV2.cs | 17 +++++++++++++++++ .../ScriptServiceV2/IScriptServiceV2.cs | 1 + .../Util/EnvironmentVariables.cs | 1 + 5 files changed, 21 insertions(+) create mode 100644 source/Octopus.Tentacle.Contracts/ScriptServiceV2/AbandonScriptCommandV2.cs diff --git a/source/Octopus.Tentacle.Contracts/ClientServices/IAsyncClientScriptServiceV2.cs b/source/Octopus.Tentacle.Contracts/ClientServices/IAsyncClientScriptServiceV2.cs index 996f915c1..0c89048e4 100644 --- a/source/Octopus.Tentacle.Contracts/ClientServices/IAsyncClientScriptServiceV2.cs +++ 
b/source/Octopus.Tentacle.Contracts/ClientServices/IAsyncClientScriptServiceV2.cs @@ -10,6 +10,7 @@ public interface IAsyncClientScriptServiceV2 Task StartScriptAsync(StartScriptCommandV2 command, HalibutProxyRequestOptions proxyRequestOptions); Task GetStatusAsync(ScriptStatusRequestV2 request, HalibutProxyRequestOptions proxyRequestOptions); Task CancelScriptAsync(CancelScriptCommandV2 command, HalibutProxyRequestOptions proxyRequestOptions); + Task AbandonScriptAsync(AbandonScriptCommandV2 command, HalibutProxyRequestOptions proxyRequestOptions); Task CompleteScriptAsync(CompleteScriptCommandV2 command, HalibutProxyRequestOptions proxyRequestOptions); } } \ No newline at end of file diff --git a/source/Octopus.Tentacle.Contracts/ScriptExitCodes.cs b/source/Octopus.Tentacle.Contracts/ScriptExitCodes.cs index 2e0ce1b15..2319410c4 100644 --- a/source/Octopus.Tentacle.Contracts/ScriptExitCodes.cs +++ b/source/Octopus.Tentacle.Contracts/ScriptExitCodes.cs @@ -12,6 +12,7 @@ public static class ScriptExitCodes public const int UnknownScriptExitCode = -45; public const int UnknownResultExitCode = -46; public const int PowerShellNeverStartedExitCode = -47; + public const int AbandonedExitCode = -48; //Kubernetes Agent public const int KubernetesScriptPodNotFound = -81; diff --git a/source/Octopus.Tentacle.Contracts/ScriptServiceV2/AbandonScriptCommandV2.cs b/source/Octopus.Tentacle.Contracts/ScriptServiceV2/AbandonScriptCommandV2.cs new file mode 100644 index 000000000..66efba446 --- /dev/null +++ b/source/Octopus.Tentacle.Contracts/ScriptServiceV2/AbandonScriptCommandV2.cs @@ -0,0 +1,17 @@ +using System; + +namespace Octopus.Tentacle.Contracts.ScriptServiceV2 +{ + public class AbandonScriptCommandV2 + { + public AbandonScriptCommandV2(ScriptTicket ticket, long lastLogSequence) + { + Ticket = ticket; + LastLogSequence = lastLogSequence; + } + + public ScriptTicket Ticket { get; } + + public long LastLogSequence { get; } + } +} diff --git 
a/source/Octopus.Tentacle.Contracts/ScriptServiceV2/IScriptServiceV2.cs b/source/Octopus.Tentacle.Contracts/ScriptServiceV2/IScriptServiceV2.cs index 3effc17b8..9858111fa 100644 --- a/source/Octopus.Tentacle.Contracts/ScriptServiceV2/IScriptServiceV2.cs +++ b/source/Octopus.Tentacle.Contracts/ScriptServiceV2/IScriptServiceV2.cs @@ -7,6 +7,7 @@ public interface IScriptServiceV2 ScriptStatusResponseV2 StartScript(StartScriptCommandV2 command); ScriptStatusResponseV2 GetStatus(ScriptStatusRequestV2 request); ScriptStatusResponseV2 CancelScript(CancelScriptCommandV2 command); + ScriptStatusResponseV2 AbandonScript(AbandonScriptCommandV2 command); void CompleteScript(CompleteScriptCommandV2 command); } } \ No newline at end of file diff --git a/source/Octopus.Tentacle.Core/Util/EnvironmentVariables.cs b/source/Octopus.Tentacle.Core/Util/EnvironmentVariables.cs index 2b5f83a59..4293edee7 100644 --- a/source/Octopus.Tentacle.Core/Util/EnvironmentVariables.cs +++ b/source/Octopus.Tentacle.Core/Util/EnvironmentVariables.cs @@ -29,6 +29,7 @@ public static class EnvironmentVariables public const string TentacleMachineConfigurationHomeDirectory = "TentacleMachineConfigurationHomeDirectory"; public const string TentaclePollingConnectionCount = "TentaclePollingConnectionCount"; public const string TentaclePowerShellStartupTimeout = "TentaclePowerShellStartupTimeout"; + public const string TentacleDebugDisableProcessKill = "TentacleDebugDisableProcessKill"; public const string NfsWatchdogDirectory = "watchdog_directory"; public static string TentacleUseTcpNoDelay = "TentacleUseTcpNoDelay"; public static string TentacleUseAsyncListener = "TentacleUseAsyncListener"; From 18005849de2a274e1cd82bbca13515b479abea09 Mon Sep 17 00:00:00 2001 From: Jim Pelletier Date: Tue, 26 May 2026 12:17:48 +1000 Subject: [PATCH 14/52] Add abandon token to SilentProcessRunner and remove process.Close() race - Adds CancellationToken abandon parameter - Switches the await from sync body to await 
WaitForExitAsync(abandon) - Returns AbandonedExitCode when abandon fires before process exits - Adds net48 polyfill (WaitForExitAsyncNetFramework) - Sets process.EnableRaisingEvents = true - Removes process.Close() race in DoOurBestToCleanUp - Adds SafelyCancelOutputAndErrorRead helper - Adds TentacleDebugDisableProcessKill test affordance to Hitman Co-Authored-By: Claude Opus 4.7 (1M context) --- .../Util/CommandLine/SilentProcessRunner.cs | 149 +++++++++++++++--- 1 file changed, 128 insertions(+), 21 deletions(-) diff --git a/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs b/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs index 0e57221c6..7b70ad001 100644 --- a/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs +++ b/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs @@ -9,7 +9,9 @@ using System.Text; using System.Threading; using System.Threading.Tasks; +using Octopus.Tentacle.Contracts; using Octopus.Tentacle.Core.Diagnostics; +using Octopus.Tentacle.Core.Util; namespace Octopus.Tentacle.Util { @@ -22,12 +24,13 @@ public static Task ExecuteCommandAsync( Action debug, Action info, Action error, - CancellationToken cancel) + CancellationToken cancel, + CancellationToken abandon) { - return ExecuteCommandAsync(executable, arguments, workingDirectory, debug, info, error, customEnvironmentVariables: null, cancel: cancel); + return ExecuteCommandAsync(executable, arguments, workingDirectory, debug, info, error, customEnvironmentVariables: null, cancel: cancel, abandon: abandon); } - public static Task ExecuteCommandAsync( + public static async Task ExecuteCommandAsync( string executable, string arguments, string workingDirectory, @@ -35,7 +38,8 @@ public static Task ExecuteCommandAsync( Action info, Action error, IReadOnlyDictionary? 
customEnvironmentVariables = null, - CancellationToken cancel = default) + CancellationToken cancel = default, + CancellationToken abandon = default) { if (executable == null) throw new ArgumentNullException(nameof(executable)); @@ -110,6 +114,7 @@ void WriteData(Action action, ManualResetEventSlim resetEvent, DataRecei process.StartInfo.CreateNoWindow = true; process.StartInfo.RedirectStandardOutput = true; process.StartInfo.RedirectStandardError = true; + process.EnableRaisingEvents = true; if (PlatformDetection.IsRunningOnWindows) { process.StartInfo.StandardOutputEncoding = encoding; @@ -136,12 +141,39 @@ void WriteData(Action action, ManualResetEventSlim resetEvent, DataRecei })) { if (cancel.IsCancellationRequested) - DoOurBestToCleanUp(process, error); + DoOurBestToCleanUp(process, error); process.BeginOutputReadLine(); process.BeginErrorReadLine(); - process.WaitForExit(); + try + { + // WaitForExitAsync completes when the Process.Exited event fires (or + // when `abandon` cancels). Unlike the sync WaitForExit() no-timeout + // overload, it does NOT wait for the redirected stdout/stderr streams + // to reach EOF — so a re-parented grandchild holding our pipes open + // cannot hang us here. Stream draining is handled separately below by + // SafelyWaitForAllOutput (with a 5s timeout per stream). + // + // We pass `abandon` (not `cancel`) because cancel is handled via the + // cancel.Register callback above which kills the process tree; the + // resulting Exited event is what unblocks this await on cancel. + // `abandon` is a separate token used by EFT-3295 to stop waiting + // WITHOUT killing the process — see the catch block below. +#if NETFRAMEWORK + await WaitForExitAsyncNetFramework(process, abandon).ConfigureAwait(false); +#else + await process.WaitForExitAsync(abandon).ConfigureAwait(false); +#endif + } + catch (OperationCanceledException) when (abandon.IsCancellationRequested && !process.HasExited) + { + info("Tentacle has abandoned this script. 
The underlying script process may still be running on this host."); + SafelyCancelRead(process.CancelErrorRead, debug); + SafelyCancelRead(process.CancelOutputRead, debug); + running = false; + return ScriptExitCodes.AbandonedExitCode; + } SafelyCancelRead(process.CancelErrorRead, debug); SafelyCancelRead(process.CancelOutputRead, debug); @@ -153,7 +185,7 @@ void WriteData(Action action, ManualResetEventSlim resetEvent, DataRecei debug($"Process {exeFileNameOrFullPath} in {workingDirectory} exited with code {exitCode}"); running = false; - return Task.FromResult(exitCode); + return exitCode; } } } @@ -181,9 +213,20 @@ static void SafelyWaitForAllOutput(ManualResetEventSlim outputResetEvent, CancellationToken cancel, Action debug) { + // Waits for the OutputDataReceived/ErrorDataReceived handler to signal EOF on the + // stream (it sets the reset event when it receives a null DataReceivedEventArgs.Data, + // which is .NET's EOF marker). This does NOT close the pipe — it just gives the OS + // up to 5 seconds to deliver the EOF. + // + // If a re-parented grandchild is holding the pipe open, EOF never arrives, the wait + // times out, and we proceed without the final flush of buffered output. The pipe is + // released later by Process.Dispose() at end of ExecuteCommandAsync via the + // `using (var process = new Process())` block. + // + // 5 seconds is somewhat arbitrary — the process has already exited by the time we + // reach here, so under normal circumstances EOF arrives within milliseconds. try { - //5 seconds is a bit arbitrary, but the process should have already exited by now, so unwise to wait too long outputResetEvent.Wait(TimeSpan.FromSeconds(5), cancel); } catch (OperationCanceledException ex) @@ -222,22 +265,79 @@ static void DoOurBestToCleanUp(Process process, Action error) error($"Failed to kill the launched process: {killProcessException}"); } } - finally + // Do NOT add process.Close() here. 
The pre-async version of this code did, and adding + // it back will cause cancel to hang forever. Here's the full picture: + // + // OLD SYNC CODE: the calling thread blocked inside process.WaitForExit() (no-timeout + // overload), which waits for BOTH the process to exit AND the redirected stream + // readers to reach EOF. If a re-parented grandchild held our stdout/stderr open, the + // stream readers never reached EOF, so WaitForExit() blocked forever. Calling + // process.Close() during cancel-cleanup forced the Process object to release its + // handles to the redirected pipes, which made the readers see EOF, which let + // WaitForExit() return. That's why Close() was here. + // + // NEW ASYNC CODE: the calling thread awaits a TaskCompletionSource that completes + // when the Process.Exited event fires. WaitForExitAsync does NOT wait on the + // redirected streams (Microsoft confirms in the docs: "output processing will not + // have completed when this method returns"). So a grandchild holding pipes open + // can't hang the await. The original reason for Close() is gone. + // + // WHY ADDING Close() BACK IS WORSE THAN USELESS: process.Close() detaches the Process + // object from the underlying OS process, which tears down the wait state that + // produces the Exited event. If Close() runs before the kernel has signalled the + // exit to .NET (which is asynchronous — Hitman.Kill returns immediately, the OS + // delivers the exit notification some time later), the Exited event never fires, + // our TCS never completes, and the await hangs forever. Every cancel races. + // + // HOW PIPES ACTUALLY GET RELEASED NOW: + // 1. After WaitForExitAsync returns, SafelyWaitForAllOutput waits up to 5 seconds + // per stream for EOF. If a grandchild holds the pipes, this times out and we + // proceed (it bounds cancel latency; it does NOT close anything). + // 2. 
The outer `using (var process = new Process())` block calls Process.Dispose + // at end of method, which calls Close internally. Because we're no longer + // awaiting WaitForExitAsync at this point, the Close-vs-Exited race can't + // happen — the wait state is already torn down by our code, not by Close. + // + // Worst case cancel latency with grandchild holding pipes: ~10s (5s × 2 streams). + // Covered by tests in SilentProcessRunnerFixture: + // - CancellationToken_WhenGrandchildHoldsRedirectedPipes_ShouldNotHang (Windows) + // - CancellationToken_WhenUnixGrandchildHoldsRedirectedPipes_ShouldNotHang (Unix) + // Both assert cancel returns within 30s in this scenario. + } + +#if NETFRAMEWORK + // WaitForExitAsync is not available on .NET Framework 4.x; polyfill using Process.Exited event + TaskCompletionSource. + static Task WaitForExitAsyncNetFramework(Process process, CancellationToken cancellationToken) + { + var tcs = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); + CancellationTokenRegistration registration = default; + + void OnExited(object? sender, EventArgs e) { - try - { - // When cancelling, close the file handles. - // If the child finishes, but the grandchild holds stdout/stderr and never completes, - // THEN we won't issue a kill to the grandchild but will wait for the grandchild to - // close the handles. - process.Close(); - } - catch (Exception ex) + registration.Dispose(); + tcs.TrySetResult(null); + } + + process.Exited += OnExited; + + // Guard against race: process may have already exited before we subscribed. 
+ if (process.HasExited) + { + tcs.TrySetResult(null); + } + + if (cancellationToken.CanBeCanceled) + { + registration = cancellationToken.Register(() => { - error($"Failed to close process resources: {ex.Message}"); - } + process.Exited -= OnExited; + tcs.TrySetCanceled(cancellationToken); + }); } + + return tcs.Task; } +#endif [DllImport("kernel32.dll", SetLastError = true)] #pragma warning disable PC003 // Native API not available in UWP @@ -252,11 +352,18 @@ class Hitman { public static void TryKillProcessAndChildrenRecursively(Process process) { + if (!string.IsNullOrEmpty(Environment.GetEnvironmentVariable(EnvironmentVariables.TentacleDebugDisableProcessKill))) + { + // Test-only no-op: simulate "kill was attempted but didn't terminate the process". + // Only activated when the test harness sets this env var on the Tentacle process. + return; + } + #if NETFRAMEWORK TryKillWindowsProcessAndChildrenRecursively(process.Id); #endif #if !NETFRAMEWORK - // Since .NET Core 3.0 there is support for killing a process and it's children + // Since .NET Core 3.0 there is support for killing a process and its children process.Kill(true); #endif } From c45ba7b92e564564d0a41865e98f0462b0bc562c Mon Sep 17 00:00:00 2001 From: Jim Pelletier Date: Tue, 26 May 2026 12:52:00 +1000 Subject: [PATCH 15/52] Plumb abandon token through ISilentProcessRunner Co-Authored-By: Claude Opus 4.7 (1M context) --- .../Util/ISilentProcessRunner.cs | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/source/Octopus.Tentacle/Util/ISilentProcessRunner.cs b/source/Octopus.Tentacle/Util/ISilentProcessRunner.cs index 49c54fd6b..fa6d21f45 100644 --- a/source/Octopus.Tentacle/Util/ISilentProcessRunner.cs +++ b/source/Octopus.Tentacle/Util/ISilentProcessRunner.cs @@ -14,7 +14,8 @@ Task ExecuteCommandAsync( string workingDirectory, Action info, Action error, - CancellationToken cancel = default); + CancellationToken cancel = default, + CancellationToken abandon =
default); Task ExecuteCommandAsync( string executable, @@ -23,19 +24,20 @@ Task ExecuteCommandAsync( Action debug, Action info, Action error, - CancellationToken cancel = default); + CancellationToken cancel = default, + CancellationToken abandon = default); } public class SilentProcessRunnerWrapper : ISilentProcessRunner { - public Task ExecuteCommandAsync(string executable, string arguments, string workingDirectory, Action info, Action error, CancellationToken cancel = default) + public Task ExecuteCommandAsync(string executable, string arguments, string workingDirectory, Action info, Action error, CancellationToken cancel = default, CancellationToken abandon = default) { - return SilentProcessRunnerExtended.ExecuteCommandAsync(executable, arguments, workingDirectory, info, error, cancel); + return SilentProcessRunnerExtended.ExecuteCommandAsync(executable, arguments, workingDirectory, info, error, cancel, abandon); } - public Task ExecuteCommandAsync(string executable, string arguments, string workingDirectory, Action debug, Action info, Action error, CancellationToken cancel = default) + public Task ExecuteCommandAsync(string executable, string arguments, string workingDirectory, Action debug, Action info, Action error, CancellationToken cancel = default, CancellationToken abandon = default) { - return SilentProcessRunner.ExecuteCommandAsync(executable, arguments, workingDirectory, debug, info, error, cancel: cancel); + return SilentProcessRunner.ExecuteCommandAsync(executable, arguments, workingDirectory, debug, info, error, cancel: cancel, abandon: abandon); } } @@ -70,7 +72,8 @@ public static Task ExecuteCommandAsync( string workingDirectory, Action info, Action error, - CancellationToken cancel = default) + CancellationToken cancel = default, + CancellationToken abandon = default) => SilentProcessRunner.ExecuteCommandAsync(executable, arguments, workingDirectory, @@ -78,6 +81,7 @@ public static Task ExecuteCommandAsync( info, error, 
customEnvironmentVariables: null, - cancel: cancel); + cancel: cancel, + abandon: abandon); } } From ec4ce8fd5fca95729b8ec863643b536ed5482363 Mon Sep 17 00:00:00 2001 From: Jim Pelletier Date: Tue, 26 May 2026 13:04:20 +1000 Subject: [PATCH 16/52] Pass abandon: CancellationToken.None at sync-boundary callers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Four caller sites that bridge sync interfaces to the (now-takes-abandon) ExecuteCommandAsync pass CancellationToken.None for the new abandon parameter (these callers don't need abandon semantics): - PowerShellPrerequisite.CheckPowerShellIsInstalledAsync - KubernetesDirectoryInformationProvider.GetDriveBytesUsingDuAsync - WindowsServiceConfigurator.ScAsync - CommandLineRunner.ExecuteAsync SystemCtlHelper and LinuxServiceConfigurator call the CommandLineInvocation extension overloads, which don't expose an abandon parameter — they default to CancellationToken.None internally, so no edits needed there. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../Octopus.Manager.Tentacle/PreReq/PowerShellPrerequisite.cs | 3 ++- .../Kubernetes/KubernetesDirectoryInformationProvider.cs | 3 ++- source/Octopus.Tentacle/Startup/WindowsServiceConfigurator.cs | 3 ++- source/Octopus.Tentacle/Util/CommandLineRunner.cs | 3 ++- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/source/Octopus.Manager.Tentacle/PreReq/PowerShellPrerequisite.cs b/source/Octopus.Manager.Tentacle/PreReq/PowerShellPrerequisite.cs index e922e1b8f..94ea0751f 100644 --- a/source/Octopus.Manager.Tentacle/PreReq/PowerShellPrerequisite.cs +++ b/source/Octopus.Manager.Tentacle/PreReq/PowerShellPrerequisite.cs @@ -56,7 +56,8 @@ await SilentProcessRunnerExtended.ExecuteCommandAsync( ".", stdOut.WriteLine, s => stdErr.WriteLine($"ERR: {s}"), - cancel: CancellationToken.None); + cancel: CancellationToken.None, + abandon: CancellationToken.None); var outputText = stdOut.ToString(); new SystemLog().Verbose("PowerShell prerequisite check output: " + outputText); diff --git a/source/Octopus.Tentacle/Kubernetes/KubernetesDirectoryInformationProvider.cs b/source/Octopus.Tentacle/Kubernetes/KubernetesDirectoryInformationProvider.cs index 7609ff106..dc6a990cf 100644 --- a/source/Octopus.Tentacle/Kubernetes/KubernetesDirectoryInformationProvider.cs +++ b/source/Octopus.Tentacle/Kubernetes/KubernetesDirectoryInformationProvider.cs @@ -1,6 +1,7 @@ using System; using System.Collections.Generic; using System.Linq; +using System.Threading; using System.Threading.Tasks; using Octopus.Client.Extensions; using Microsoft.Extensions.Caching.Memory; @@ -70,7 +71,7 @@ public KubernetesDirectoryInformationProvider(ISystemLog log, ISilentProcessRunn { var stdOut = new List(); var stdErr = new List(); - var exitCode = await silentProcessRunner.ExecuteCommandAsync("du", $"-s -B 1 {directoryPath}", "/", stdOut.Add, stdErr.Add); + var exitCode = await silentProcessRunner.ExecuteCommandAsync("du", $"-s -B 1 {directoryPath}", "/", 
stdOut.Add, stdErr.Add, abandon: CancellationToken.None); if (exitCode != 0) { diff --git a/source/Octopus.Tentacle/Startup/WindowsServiceConfigurator.cs b/source/Octopus.Tentacle/Startup/WindowsServiceConfigurator.cs index 581b80541..f7ab12992 100644 --- a/source/Octopus.Tentacle/Startup/WindowsServiceConfigurator.cs +++ b/source/Octopus.Tentacle/Startup/WindowsServiceConfigurator.cs @@ -369,7 +369,8 @@ async Task ScAsync(string arguments) Environment.CurrentDirectory, output => outputBuilder.AppendLine(output), error => outputBuilder.AppendLine("Error: " + error), - cancel: CancellationToken.None); + cancel: CancellationToken.None, + abandon: CancellationToken.None); if (exitCode == 0) logFileOnlyLogger.Info(outputBuilder.ToString()); else diff --git a/source/Octopus.Tentacle/Util/CommandLineRunner.cs b/source/Octopus.Tentacle/Util/CommandLineRunner.cs index 3f2609999..69fbae7a5 100644 --- a/source/Octopus.Tentacle/Util/CommandLineRunner.cs +++ b/source/Octopus.Tentacle/Util/CommandLineRunner.cs @@ -91,7 +91,8 @@ public async Task ExecuteAsync(CommandLineInvocation invocation, debug, info, error, - cancel: CancellationToken.None); + cancel: CancellationToken.None, + abandon: CancellationToken.None); if (exitCode != 0) { From 2b6dec1b6fb861b401002f2bc1f6f1200c02e889 Mon Sep 17 00:00:00 2001 From: Jim Pelletier Date: Tue, 26 May 2026 13:04:29 +1000 Subject: [PATCH 17/52] Plumb abandon token through RunningScript, differentiate abandoned vs cancelled Co-Authored-By: Claude Opus 4.7 (1M context) --- .../Services/Scripts/RunningScript.cs | 24 +++++++++++++++---- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/source/Octopus.Tentacle.Core/Services/Scripts/RunningScript.cs b/source/Octopus.Tentacle.Core/Services/Scripts/RunningScript.cs index 412a415e3..0ed327ba1 100644 --- a/source/Octopus.Tentacle.Core/Services/Scripts/RunningScript.cs +++ b/source/Octopus.Tentacle.Core/Services/Scripts/RunningScript.cs @@ -22,6 +22,7 @@ public class RunningScript: 
IRunningScript readonly IShell shell; readonly string taskId; readonly CancellationToken runningScriptToken; + readonly CancellationToken abandonToken; readonly IReadOnlyDictionary environmentVariables; readonly ILog log; readonly ScriptIsolationMutex scriptIsolationMutex; @@ -34,6 +35,7 @@ public RunningScript(IShell shell, string taskId, ScriptIsolationMutex scriptIsolationMutex, CancellationToken runningScriptToken, + CancellationToken abandonToken, IReadOnlyDictionary environmentVariables, TimeSpan powerShellStartupTimeout, ILog log @@ -44,6 +46,7 @@ ILog log this.stateStore = stateStore; this.taskId = taskId; this.runningScriptToken = runningScriptToken; + this.abandonToken = abandonToken; this.environmentVariables = environmentVariables; this.log = log; this.scriptIsolationMutex = scriptIsolationMutex; @@ -60,7 +63,7 @@ public RunningScript(IShell shell, CancellationToken runningScriptToken, IReadOnlyDictionary environmentVariables, TimeSpan powerShellStartupTimeout, - ILog log) : this(shell, workspace, null, scriptLog, taskId, scriptIsolationMutex, runningScriptToken, environmentVariables, powerShellStartupTimeout, log) + ILog log) : this(shell, workspace, null, scriptLog, taskId, scriptIsolationMutex, runningScriptToken, CancellationToken.None, environmentVariables, powerShellStartupTimeout, log) { } @@ -96,9 +99,19 @@ public async Task Execute() exitCode = workspace.ShouldMonitorPowerShellStartup() ? await RunPowershellScriptWithMonitoring(shellPath, writer, runningScriptToken) - : await RunScriptAsync(shellPath, writer, runningScriptToken); + : await RunScriptAsync(shellPath, writer, runningScriptToken, abandonToken); } } + catch (OperationCanceledException) when (abandonToken.IsCancellationRequested) + { + // Distinguish the abandon path from cancel: when the abandon token fires, + // we don't try to kill the underlying script process. 
Logging it as + // "abandoned" rather than "canceled" makes the deployment log honest about + // what happened, and surfacing AbandonedExitCode (-48) lets the caller + // (the Octopus Server) treat it differently from a normal cancel exit. + writer.WriteOutput(ProcessOutputSource.StdOut, "Script execution abandoned."); + exitCode = ScriptExitCodes.AbandonedExitCode; + } catch (OperationCanceledException) { writer.WriteOutput(ProcessOutputSource.StdOut, "Script execution canceled."); @@ -147,7 +160,7 @@ async Task RunPowershellScriptWithMonitoring(string shellPath, IScriptLogWr var monitor = new PowerShellStartupMonitor(workspace.WorkingDirectory, powerShellStartupTimeout, log, taskId); var monitoringTask = monitor.WaitForStartup(monitoringTaskCts.Token); - var scriptTask = Task.Run(async () => await RunScriptAsync(shellPath, writer, scriptTaskCts.Token), scriptTaskCts.Token); + var scriptTask = Task.Run(async () => await RunScriptAsync(shellPath, writer, scriptTaskCts.Token, abandonToken), scriptTaskCts.Token); var completedTask = await Task.WhenAny(monitoringTask, scriptTask); @@ -222,7 +235,7 @@ void RecordScriptHasCompleted(int exitCode) } } - async Task RunScriptAsync(string shellPath, IScriptLogWriter writer, CancellationToken cancellationToken) + async Task RunScriptAsync(string shellPath, IScriptLogWriter writer, CancellationToken cancellationToken, CancellationToken abandon) { try { @@ -234,7 +247,8 @@ async Task RunScriptAsync(string shellPath, IScriptLogWriter writer, Cancel LogScriptOutputTo(writer, ProcessOutputSource.StdOut), LogScriptOutputTo(writer, ProcessOutputSource.StdErr), environmentVariables, - cancel: cancellationToken); + cancellationToken, + abandon); return exitCode; } From 86dc538cb49db78c4d4307693e2e29700326a7e6 Mon Sep 17 00:00:00 2001 From: Jim Pelletier Date: Tue, 26 May 2026 13:04:35 +1000 Subject: [PATCH 18/52] Implement ScriptServiceV2.AbandonScriptAsync and abandon-gated workspace cleanup Co-Authored-By: Claude Opus 4.7 (1M 
context) --- .../Services/Scripts/ScriptServiceV2.cs | 71 +++++++++++++++---- 1 file changed, 58 insertions(+), 13 deletions(-) diff --git a/source/Octopus.Tentacle.Core/Services/Scripts/ScriptServiceV2.cs b/source/Octopus.Tentacle.Core/Services/Scripts/ScriptServiceV2.cs index 0a200fc19..6774874d1 100644 --- a/source/Octopus.Tentacle.Core/Services/Scripts/ScriptServiceV2.cs +++ b/source/Octopus.Tentacle.Core/Services/Scripts/ScriptServiceV2.cs @@ -72,7 +72,8 @@ public async Task StartScriptAsync(StartScriptCommandV2 { IScriptWorkspace workspace; - // If the state already exists then this runningScript is already running/has already run and we should not run it again + // If the state already exists then this runningScript is already running/has already run and we should not run it again. + // StartScript may be called multiple times for the same ticket (e.g. server retries), so we guard against double-launching. if (runningScript.ScriptStateStore.Exists()) { var state = runningScript.ScriptStateStore.Load(); @@ -103,7 +104,8 @@ public async Task StartScriptAsync(StartScriptCommandV2 command.TaskId, workspace, runningScript.ScriptStateStore, - runningScript.CancellationToken); + runningScript.CancellationToken, + runningScript.AbandonToken); runningScript.Process = process; @@ -140,22 +142,64 @@ public async Task CancelScriptAsync(CancelScriptCommandV return GetResponse(command.Ticket, command.LastLogSequence, runningScript?.Process); } - public async Task CompleteScriptAsync(CompleteScriptCommandV2 command, CancellationToken cancellationToken) + public Task AbandonScriptAsync(AbandonScriptCommandV2 command, CancellationToken cancellationToken) { - await Task.CompletedTask; + // Fires the abandon token (so Execute will return AbandonedExitCode on its next + // unwind) and returns the current status snapshot immediately. 
The caller (the + // Octopus Server) polls GetStatus to observe the eventual Complete + AbandonedExitCode, + // same as for the cancel flow, so there's no need to block the RPC handler waiting + // for the running script to reach Complete state. + if (runningScripts.TryGetValue(command.Ticket, out var runningScript)) + { + runningScript.Abandon(); + } + return Task.FromResult(GetResponse(command.Ticket, command.LastLogSequence, runningScript?.Process)); + } + + public async Task CompleteScriptAsync(CompleteScriptCommandV2 command, CancellationToken cancellationToken) + { + // Stop tracking and dispose the running-script bookkeeping. The underlying + // OS process may or may not still be running depending on whether this + // script completed normally, was cancelled, or was abandoned. if (runningScripts.TryRemove(command.Ticket, out var runningScript)) { runningScript.Dispose(); } var workspace = workspaceFactory.GetWorkspace(command.Ticket, WorkspaceReadinessCheck.Skip); - await workspace.Delete(cancellationToken); + + // For abandoned scripts the underlying OS process is, by design, still alive + // and may still hold open file handles inside the workspace (logs being written + // to, working files, etc.). workspace.Delete() will fail in that case on + // Windows (sharing violations) and may partially delete on Linux. Tolerate + // the failure: the workspace will be left on disk and reaped by another + // mechanism (manual cleanup, instance restart). For all other completion paths + // the process has exited and Delete should succeed; surface any failure there. + var stateStore = scriptStateStoreFactory.Create(workspace); + var wasAbandoned = stateStore.Exists() + && stateStore.Load().ExitCode == ScriptExitCodes.AbandonedExitCode; + + if (wasAbandoned) + { + try + { + await workspace.Delete(cancellationToken); + } + catch (Exception ex) + { + log.Warn(ex, $"Could not delete abandoned workspace at {workspace.WorkingDirectory}. 
Leaving on disk; the underlying script process may still hold open file handles."); + } + } + else + { + await workspace.Delete(cancellationToken); + } } - RunningScript LaunchShell(ScriptTicket ticket, string serverTaskId, IScriptWorkspace workspace, IScriptStateStore stateStore, CancellationToken cancellationToken) + RunningScript LaunchShell(ScriptTicket ticket, string serverTaskId, IScriptWorkspace workspace, IScriptStateStore stateStore, CancellationToken cancellationToken, CancellationToken abandonToken) { - var runningScript = new RunningScript(shell, workspace, stateStore, workspace.CreateLog(), serverTaskId, scriptIsolationMutex, cancellationToken, environmentVariables, powerShellStartupTimeout, log); + var runningScript = new RunningScript(shell, workspace, stateStore, workspace.CreateLog(), serverTaskId, scriptIsolationMutex, cancellationToken, abandonToken, environmentVariables, powerShellStartupTimeout, log); _ = Task.Run(async () => await runningScript.Execute()); return runningScript; } @@ -204,13 +248,14 @@ public bool IsRunningScript(ScriptTicket ticket) class RunningScriptWrapper : IDisposable { - readonly CancellationTokenSource cancellationTokenSource = new (); + readonly CancellationTokenSource cancellationTokenSource = new(); + readonly CancellationTokenSource abandonTokenSource = new(); public RunningScriptWrapper(ScriptStateStore scriptStateStore) { ScriptStateStore = scriptStateStore; - CancellationToken = cancellationTokenSource.Token; + AbandonToken = abandonTokenSource.Token; } public RunningScript? 
Process { get; set; } @@ -218,15 +263,15 @@ public RunningScriptWrapper(ScriptStateStore scriptStateStore) public SemaphoreSlim StartScriptMutex { get; } = new(1, 1); public CancellationToken CancellationToken { get; } + public CancellationToken AbandonToken { get; } - public void Cancel() - { - cancellationTokenSource.Cancel(); - } + public void Cancel() => cancellationTokenSource.Cancel(); + public void Abandon() => abandonTokenSource.Cancel(); public void Dispose() { cancellationTokenSource.Dispose(); + abandonTokenSource.Dispose(); } } } From 175cc6e55875363e3128bbe4c6b6cab2a7d386f5 Mon Sep 17 00:00:00 2001 From: Jim Pelletier Date: Tue, 26 May 2026 13:04:39 +1000 Subject: [PATCH 19/52] Advertise AbandonScriptV2 capability Co-Authored-By: Claude Opus 4.7 (1M context) --- .../CapabilitiesServiceV2Test.cs | 23 +++++++++++++++++++ .../Capabilities/CapabilitiesServiceV2.cs | 3 ++- 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/source/Octopus.Tentacle.Tests.Integration/CapabilitiesServiceV2Test.cs b/source/Octopus.Tentacle.Tests.Integration/CapabilitiesServiceV2Test.cs index 6f9b7300a..5107b2306 100644 --- a/source/Octopus.Tentacle.Tests.Integration/CapabilitiesServiceV2Test.cs +++ b/source/Octopus.Tentacle.Tests.Integration/CapabilitiesServiceV2Test.cs @@ -7,6 +7,7 @@ using NUnit.Framework; using Octopus.Tentacle.Contracts; using Octopus.Tentacle.Contracts.Capabilities; +using Octopus.Tentacle.Contracts.ClientServices; using Octopus.Tentacle.Contracts.KubernetesScriptServiceV1; using Octopus.Tentacle.Contracts.ScriptServiceV2; using Octopus.Tentacle.Tests.Integration.Common.Builders.Decorators; @@ -39,6 +40,17 @@ public async Task CapabilitiesFromAnOlderTentacleWhichHasNoCapabilitiesService_W expectedCapabilitiesCount++; } + // tentacleConfigurationTestCase.Version == null indicates the "latest" build under + // test (the code in this branch). 
Test cases with a concrete Version exercise older + // released tentacles fetched from S3 to verify backwards compatibility. Older builds + // pre-date EFT-3295 and don't advertise the AbandonScriptAsync capability, so we only + // assert it for the latest build. + if (version == null) + { + capabilities.Should().Contain(nameof(IAsyncClientScriptServiceV2.AbandonScriptAsync)); + expectedCapabilitiesCount++; + } + capabilities.Count.Should().Be(expectedCapabilitiesCount); } @@ -63,6 +75,17 @@ public async Task CapabilitiesServiceDoesNotReturnKubernetesScriptServiceForNonK expectedCapabilitiesCount++; } + // tentacleConfigurationTestCase.Version == null indicates the "latest" build under + // test (the code in this branch). Test cases with a concrete Version exercise older + // released tentacles fetched from S3 to verify backwards compatibility. Older builds + // pre-date EFT-3295 and don't advertise the AbandonScriptAsync capability, so we only + // assert it for the latest build. + if (version == null) + { + capabilities.Should().Contain(nameof(IAsyncClientScriptServiceV2.AbandonScriptAsync)); + expectedCapabilitiesCount++; + } + capabilities.Should().NotContain(nameof(IKubernetesScriptServiceV1)); capabilities.Count.Should().Be(expectedCapabilitiesCount); diff --git a/source/Octopus.Tentacle/Services/Capabilities/CapabilitiesServiceV2.cs b/source/Octopus.Tentacle/Services/Capabilities/CapabilitiesServiceV2.cs index 7dda62791..545f55c73 100644 --- a/source/Octopus.Tentacle/Services/Capabilities/CapabilitiesServiceV2.cs +++ b/source/Octopus.Tentacle/Services/Capabilities/CapabilitiesServiceV2.cs @@ -3,6 +3,7 @@ using System.Threading.Tasks; using Octopus.Tentacle.Contracts; using Octopus.Tentacle.Contracts.Capabilities; +using Octopus.Tentacle.Contracts.ClientServices; using Octopus.Tentacle.Contracts.KubernetesScriptServiceV1; using Octopus.Tentacle.Contracts.ScriptServiceV2; using Octopus.Tentacle.Core.Services; @@ -24,7 +25,7 @@ public async Task 
GetCapabilitiesAsync(CancellationToken } //non-kubernetes agent tentacles only support the standard script services - return new CapabilitiesResponseV2(new List { nameof(IScriptService), nameof(IFileTransferService), nameof(IScriptServiceV2) }); + return new CapabilitiesResponseV2(new List { nameof(IScriptService), nameof(IFileTransferService), nameof(IScriptServiceV2), nameof(IAsyncClientScriptServiceV2.AbandonScriptAsync) }); } } } \ No newline at end of file From 9a458961689c6a0e0f260ee3dd5c2e80570ea93e Mon Sep 17 00:00:00 2001 From: Jim Pelletier Date: Tue, 26 May 2026 13:04:46 +1000 Subject: [PATCH 20/52] Add abandon-specific tests and helpers Co-Authored-By: Claude Opus 4.7 (1M context) --- .../ScriptServiceV2DecoratorBuilder.cs | 5 + .../ClientScriptExecutionAbandon.cs | 145 ++++++++++++++ .../Support/ClientAndTentacle.cs | 14 ++ .../Util/RunningScriptFixture.cs | 72 +++++++ .../Util/SilentProcessRunnerFixture.cs | 157 +++++++++++---- .../CapabilitiesServiceV2Fixture.cs | 24 ++- .../Integration/ScriptServiceV2Fixture.cs | 187 ++++++++++++++++++ 7 files changed, 567 insertions(+), 37 deletions(-) create mode 100644 source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs diff --git a/source/Octopus.Tentacle.Tests.Integration.Common/Builders/Decorators/ScriptServiceV2DecoratorBuilder.cs b/source/Octopus.Tentacle.Tests.Integration.Common/Builders/Decorators/ScriptServiceV2DecoratorBuilder.cs index 17b499a06..5cd747cf0 100644 --- a/source/Octopus.Tentacle.Tests.Integration.Common/Builders/Decorators/ScriptServiceV2DecoratorBuilder.cs +++ b/source/Octopus.Tentacle.Tests.Integration.Common/Builders/Decorators/ScriptServiceV2DecoratorBuilder.cs @@ -188,6 +188,11 @@ public async Task CancelScriptAsync(CancelScriptCommandV return await cancelScriptFunc(inner, command, options); } + public async Task AbandonScriptAsync(AbandonScriptCommandV2 command, HalibutProxyRequestOptions options) + { + return await inner.AbandonScriptAsync(command, options); + } 
+ public async Task CompleteScriptAsync(CompleteScriptCommandV2 command, HalibutProxyRequestOptions options) { await completeScriptAction(inner, command, options); diff --git a/source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs b/source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs new file mode 100644 index 000000000..8e801f885 --- /dev/null +++ b/source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs @@ -0,0 +1,145 @@ +using System; +using System.IO; +using System.Linq; +using System.Threading.Tasks; +using FluentAssertions; +using Halibut.ServiceModel; +using NUnit.Framework; +using Octopus.Tentacle.Contracts; +using Octopus.Tentacle.Contracts.ScriptServiceV2; +using Octopus.Tentacle.Core.Util; +using Octopus.Tentacle.Tests.Integration.Support; +using Octopus.Tentacle.Tests.Integration.Util; +using Octopus.Tentacle.Tests.Integration.Util.Builders; + +namespace Octopus.Tentacle.Tests.Integration +{ + [IntegrationTestTimeout] + public class ClientScriptExecutionAbandon : IntegrationTest + { + [Test] + [TentacleConfigurations(scriptServiceToTest: ScriptServiceVersionToTest.Version2)] + public async Task AbandonScript_WhenCancelFailsToKillProcess_ReturnsAbandonedExitCode(TentacleConfigurationTestCase tentacleConfigurationTestCase) + { + // TentacleDebugDisableProcessKill=1 makes Hitman a no-op, so CancelScript cannot + // actually terminate the underlying script process. The script becomes genuinely + // "stuck" from Tentacle's perspective. AbandonScript should then return promptly + // with AbandonedExitCode without waiting for the process to exit. 
+ await using var clientTentacle = await tentacleConfigurationTestCase.CreateBuilder() + .WithTentacle(x => x.WithRunTentacleEnvironmentVariable(EnvironmentVariables.TentacleDebugDisableProcessKill, "1")) + .Build(CancellationToken); + + var startFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "start"); + var releaseFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "release"); + + var firstCommand = new TestExecuteShellScriptCommandBuilder() + .SetScriptBody(new ScriptBuilder() + .CreateFile(startFile) + .WaitForFileToExist(releaseFile)) + .WithIsolationLevel(ScriptIsolationLevel.NoIsolation) + .Build(); + + var tentacleClient = clientTentacle.TentacleClient; + var scriptServiceV2 = clientTentacle.CreateScriptServiceV2Client(); + + var scriptExecution = Task.Run(async () => await tentacleClient.ExecuteScript(firstCommand, CancellationToken)); + + await Wait.For(() => File.Exists(startFile), + TimeSpan.FromSeconds(30), + () => throw new Exception("Script did not start"), + CancellationToken); + + // Cancel: Hitman is a no-op so the process keeps running. + await scriptServiceV2.CancelScriptAsync( + new CancelScriptCommandV2(firstCommand.ScriptTicket, 0), + new HalibutProxyRequestOptions(CancellationToken)); + await Task.Delay(TimeSpan.FromSeconds(1)); + + // Abandon: fires the abandon token. The RPC returns the current status snapshot + // immediately, so we poll GetStatus until the script reaches Complete state. 
+ await scriptServiceV2.AbandonScriptAsync( + new AbandonScriptCommandV2(firstCommand.ScriptTicket, 0), + new HalibutProxyRequestOptions(CancellationToken)); + + ScriptStatusResponseV2 abandonResponse = null!; + await Wait.For(async () => + { + abandonResponse = await scriptServiceV2.GetStatusAsync( + new ScriptStatusRequestV2(firstCommand.ScriptTicket, 0), + new HalibutProxyRequestOptions(CancellationToken)); + return abandonResponse.State == ProcessState.Complete; + }, + TimeSpan.FromSeconds(30), + () => throw new Exception("Abandoned script did not reach Complete state within 30s"), + CancellationToken); + abandonResponse.ExitCode.Should().Be(ScriptExitCodes.AbandonedExitCode); + + // Release the script process so it exits cleanly and stops leaking. + File.WriteAllText(releaseFile, ""); + await scriptExecution; + } + + [Test] + [TentacleConfigurations(scriptServiceToTest: ScriptServiceVersionToTest.Version2)] + public async Task AbandonScript_ReleasesIsolationMutexEvenWhileProcessIsStillRunning(TentacleConfigurationTestCase tentacleConfigurationTestCase) + { + // The whole reason Tentacle needs an abandon RPC is to release the isolation mutex + // when CancelScript can't unstick the script. This test proves that contract: a + // FullIsolation script gets stuck (because TentacleDebugDisableProcessKill makes + // cancel a no-op), abandon is called, and a second FullIsolation script with the + // same mutex name must then be able to acquire the mutex and run. 
+ await using var clientTentacle = await tentacleConfigurationTestCase.CreateBuilder() + .WithTentacle(x => x.WithRunTentacleEnvironmentVariable(EnvironmentVariables.TentacleDebugDisableProcessKill, "1")) + .Build(CancellationToken); + + var startFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "start"); + var releaseFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "release"); + + const string sharedMutex = "abandon-test-mutex"; + + var firstCommand = new TestExecuteShellScriptCommandBuilder() + .SetScriptBody(new ScriptBuilder() + .CreateFile(startFile) + .WaitForFileToExist(releaseFile)) + .WithIsolationLevel(ScriptIsolationLevel.FullIsolation) + .WithIsolationMutexName(sharedMutex) + .Build(); + + var tentacleClient = clientTentacle.TentacleClient; + var scriptServiceV2 = clientTentacle.CreateScriptServiceV2Client(); + + var firstScriptExecution = Task.Run(async () => await tentacleClient.ExecuteScript(firstCommand, CancellationToken)); + + await Wait.For(() => File.Exists(startFile), + TimeSpan.FromSeconds(30), + () => throw new Exception("First script did not start"), + CancellationToken); + + await scriptServiceV2.CancelScriptAsync( + new CancelScriptCommandV2(firstCommand.ScriptTicket, 0), + new HalibutProxyRequestOptions(CancellationToken)); + await Task.Delay(TimeSpan.FromSeconds(1)); + + await scriptServiceV2.AbandonScriptAsync( + new AbandonScriptCommandV2(firstCommand.ScriptTicket, 0), + new HalibutProxyRequestOptions(CancellationToken)); + + // Second FullIsolation script with the SAME mutex name. If the abandon released + // the mutex, this script can acquire it and run to completion. Otherwise it would + // block waiting for the (still-alive) first script's mutex hold. 
+ var secondStartFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "second-start"); + var secondCommand = new TestExecuteShellScriptCommandBuilder() + .SetScriptBody(new ScriptBuilder().CreateFile(secondStartFile)) + .WithIsolationLevel(ScriptIsolationLevel.FullIsolation) + .WithIsolationMutexName(sharedMutex) + .Build(); + + var (secondResult, _) = await tentacleClient.ExecuteScript(secondCommand, CancellationToken); + secondResult.ExitCode.Should().Be(0); + File.Exists(secondStartFile).Should().BeTrue("second script should have run after the mutex was released"); + + File.WriteAllText(releaseFile, ""); + await firstScriptExecution; + } + } +} diff --git a/source/Octopus.Tentacle.Tests.Integration/Support/ClientAndTentacle.cs b/source/Octopus.Tentacle.Tests.Integration/Support/ClientAndTentacle.cs index 9d4e9a523..789522815 100644 --- a/source/Octopus.Tentacle.Tests.Integration/Support/ClientAndTentacle.cs +++ b/source/Octopus.Tentacle.Tests.Integration/Support/ClientAndTentacle.cs @@ -6,6 +6,9 @@ using Octopus.Tentacle.Client; using Octopus.Tentacle.Client.Retries; using Octopus.Tentacle.CommonTestUtils; +using Octopus.Tentacle.Contracts.Capabilities; +using Octopus.Tentacle.Contracts.ClientServices; +using Octopus.Tentacle.Contracts.ScriptServiceV2; using Octopus.Tentacle.Tests.Integration.Support.Legacy; using Octopus.TestPortForwarder; using Serilog; @@ -30,6 +33,17 @@ public LegacyTentacleClientBuilder LegacyTentacleClientBuilder() return new LegacyTentacleClientBuilder(halibutRuntime, ServiceEndPoint); } + // The integration test for AbandonScript needs to call AbandonScriptAsync directly + // over the wire to assert on the RPC response shape and to drive the cancel→abandon + // sequence without going through TentacleClient's higher-level ExecuteScript orchestrator. + // TentacleClient deliberately doesn't expose AbandonScript at all today; the server is + // the only production consumer, and it talks to the Halibut client directly too. 
+ // Exposing a direct client here keeps the test focused on the RPC behavior. + public IAsyncClientScriptServiceV2 CreateScriptServiceV2Client() + { + return halibutRuntime.CreateAsyncClient(ServiceEndPoint); + } + public ClientAndTentacle(IHalibutRuntime halibutRuntime, ServiceEndPoint serviceEndPoint, Server server, diff --git a/source/Octopus.Tentacle.Tests.Integration/Util/RunningScriptFixture.cs b/source/Octopus.Tentacle.Tests.Integration/Util/RunningScriptFixture.cs index c0e3b05ad..5847c589b 100644 --- a/source/Octopus.Tentacle.Tests.Integration/Util/RunningScriptFixture.cs +++ b/source/Octopus.Tentacle.Tests.Integration/Util/RunningScriptFixture.cs @@ -188,6 +188,78 @@ public async Task CancellationToken_ShouldKillTheProcess() } } + [Test] + public async Task Execute_WhenAbandonTokenFires_ReturnsAbandonedExitCode() + { + using var tempDir = new TemporaryDirectory(); + var pidFile = Path.Combine(tempDir.DirectoryPath, "process.pid"); + + // Write a long-sleeping script that first records its PID, then sleeps. + var scriptBody = PlatformDetection.IsRunningOnWindows + ? $"$PID | Out-File -FilePath '{pidFile}' -Encoding ASCII; Start-Sleep -Seconds 300" + : $"echo $$ > '{pidFile}' && sleep 300"; + workspace.BootstrapScript(scriptBody); + + var shell = PlatformDetection.IsRunningOnWindows ? (IShell)new PowerShell() : new Bash(); + using var runningCts = new CancellationTokenSource(TimeSpan.FromSeconds(30)); + using var abandonCts = new CancellationTokenSource(); + + var script = new RunningScript( + shell, + workspace, + stateStore: null, + scriptLog, + taskId, + scriptIsolationMutex, + runningCts.Token, + abandonCts.Token, + new Dictionary(), + PowerShellStartupDetection.PowerShellStartupTimeout, + new InMemoryLog()); + + var executeTask = script.Execute(); + + // Wait deterministically for the process to write its PID before we abandon. 
+ await WaitForPidFileAsync(pidFile, TimeSpan.FromSeconds(30)); + abandonCts.Cancel(); + + await executeTask; + + try + { + script.State.Should().Be(ProcessState.Complete); + script.ExitCode.Should().Be(ScriptExitCodes.AbandonedExitCode); + } + finally + { + if (File.Exists(pidFile) + && int.TryParse(SafelyReadPidFile(pidFile).Trim(), out var pid) + && pid > 0) + { + try { System.Diagnostics.Process.GetProcessById(pid).Kill(); } + catch { /* process already exited */ } + } + } + } + + static async Task WaitForPidFileAsync(string pidFile, TimeSpan timeout) + { + var deadline = DateTime.UtcNow + timeout; + while (DateTime.UtcNow < deadline) + { + if (File.Exists(pidFile) && int.TryParse(SafelyReadPidFile(pidFile).Trim(), out var pid) && pid > 0) + return; + await Task.Delay(100); + } + Assert.Fail($"PID file '{pidFile}' was not written within {timeout.TotalSeconds}s — script process did not start."); + } + + static string SafelyReadPidFile(string path) + { + try { return File.ReadAllText(path); } + catch { return string.Empty; } + } + static string EchoEnvironmentVariable(string varName) { if (PlatformDetection.IsRunningOnWindows) diff --git a/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs b/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs index 44125a578..270ad301f 100644 --- a/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs +++ b/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs @@ -7,6 +7,7 @@ using FluentAssertions; using NUnit.Framework; using Octopus.Tentacle.CommonTestUtils; +using Octopus.Tentacle.Contracts; using Octopus.Tentacle.Tests.Integration.Support; using Octopus.Tentacle.Tests.Integration.Support.TestAttributes; using Octopus.Tentacle.Util; @@ -130,29 +131,45 @@ public async Task CancellationToken_WhenGrandchildHoldsRedirectedPipes_ShouldNot using var tempDir = new TemporaryDirectory(); var grandchildPidFile = 
Path.Combine(tempDir.DirectoryPath, "grandchild.pid"); - // This test reproduces the cancel-time hang. - // The issue was SilentProcessRunner will wait for stdout/stderr pipes to be closed. - // The pipes can be inherited by grandchildren, and remain open even after the - // child process has died. + // This test guards the cancel path of ExecuteCommandAsync against a regression + // involving re-parented grandchildren that inherit our redirected pipes. // - // Normally Process.Kill(entireProcessTree:true) would kill the entire process tree. - // However if the child process dies THEN we issue the kill command, we do NOT see any - // process under the child and so the Kill() command completes. This leaves the grandchild - // running, holding our pipes we are waiting on. + // The scenario: + // * We launch a process that spawns a child, which spawns a long-running grandchild + // and then immediately exits. + // * Because the child exited BEFORE our cancel fires, the grandchild has been + // re-parented (PPID broken). Process.Kill(entireProcessTree:true) follows PPID + // links, so it does NOT find the grandchild — Kill returns having killed nothing + // beyond the (already-dead) child. + // * The grandchild inherited our redirected stdout/stderr pipes and holds them + // open. The stream readers therefore never see EOF. // - // The test stacks three processes: PowerShell (the child) launches cmd.exe (a - // throwaway middle layer) which does `start /b ping` to background ping (the - // grandchild) and then exits. cmd exiting before we cancel is what breaks the - // PPID chain — without that, ping would still be a direct child of PowerShell - // and Kill(true) would find it. + // Why this is a real risk to ExecuteCommandAsync: + // * Old sync version: process.WaitForExit() blocks until BOTH the process exits + // AND the redirected streams reach EOF, so the grandchild holding pipes would + // hang it forever. 
The fix was to call process.Close() during cancel-cleanup to + // forcibly release the pipe handles. + // * New async version: WaitForExitAsync does NOT wait on streams — it returns as + // soon as the Exited event fires. SafelyWaitForAllOutput then waits up to 5s + // per stream for EOF and times out if the grandchild still holds them. Pipes + // are released by the using-block's Process.Dispose at end of method. + // * Critically, we deliberately do NOT call process.Close() during cancel-cleanup + // anymore — see DoOurBestToCleanUp in SilentProcessRunner.cs for the full + // explanation. Adding it back caused a 10-minute hang in CI because Close races + // with the Exited event handler that WaitForExitAsync depends on. // + // This test asserts cancel returns in well under 30s in the grandchild scenario. + // If that 30s wait ever times out, someone has most likely re-introduced + // process.Close() or otherwise broken the Exited-event path. + // + // Two non-obvious bits in the PowerShell script below, both load-bearing: // * $psi.RedirectStandardInput = $true — we don't use stdin, but redirecting // any stream is what flips bInheritHandles=true in .NET's Process.Start. That // is what makes cmd (and by extension ping) inherit our pipe write-ends. // Without this the grandchild doesn't hold our pipes and there is no bug to // reproduce. - // * The WMI lookup — we need the grandchild's PID so the test can clean it up. + // * The WMI lookup — we need the grandchild's PID so the test can clean it up + // afterwards (otherwise we'd leak a long-running ping on the CI host).
var psScript = @" $pidFile = 'PIDFILE_PLACEHOLDER' $pingPath = Join-Path $env:WINDIR 'System32\PING.EXE' @@ -205,9 +222,12 @@ public async Task CancellationToken_WhenGrandchildHoldsRedirectedPipes_ShouldNot sw.Stop(); completed.Should().BeTrue( - $"ExecuteCommand should return shortly after cancellation even when a grandchild " + - $"holds the redirected pipes. Without proactively closing the redirected streams " + - $"after Kill, Process.WaitForExit() blocks indefinitely. Elapsed since cancel: {sw.Elapsed.TotalSeconds:F1}s"); + $"ExecuteCommandAsync should return promptly after cancellation even when a " + + $"grandchild holds the redirected pipes. Worst case is ~10s (5s timeout × 2 streams " + + $"in SafelyWaitForAllOutput). If we hit the 30s test timeout, either someone " + + $"re-introduced process.Close() in DoOurBestToCleanUp (which races with the Exited " + + $"event WaitForExitAsync depends on) or SafelyWaitForAllOutput's per-stream timeout " + + $"has been removed. Elapsed since cancel: {sw.Elapsed.TotalSeconds:F1}s"); } } finally @@ -222,19 +242,23 @@ public async Task CancellationToken_WhenUnixGrandchildHoldsRedirectedPipes_Shoul if (PlatformDetection.IsRunningOnWindows) Assert.Ignore("Unix-only repro (Mac/Linux). The Windows equivalent is covered by the [WindowsTest] above."); - // This test reproduces the cancel-time hang. - // The issue is SilentProcessRunner will wait for stdout/stderr pipes to be closed. - // The pipes can be inherited by grandchildren, and remain open even after the - // child process has died. + // Unix equivalent of CancellationToken_WhenGrandchildHoldsRedirectedPipes_ShouldNotHang + // above. See that test's leading comment for the full rationale — the short version is: // - // Normally Process.Kill(entireProcessTree:true) would kill the entire process tree. - // However if the child process dies THEN we issue the kill command, we do NOT see any - // process under the child and so the Kill() command completes. 
This leaves the grandchild - // running, holding our pipes we are waiting on. + // * We start sh, which backgrounds a long sleep (the grandchild) and exits + // immediately. The grandchild gets re-parented to init/launchd and inherits our + // redirected stdout/stderr pipes, holding them open. + // * Process.Kill(entireProcessTree:true) follows PPID links, so by the time we + // cancel, the now-orphan grandchild is invisible to Kill — it keeps running. + // * Old sync code: process.WaitForExit() hung forever waiting for stream EOF. + // Fix was process.Close() during cancel-cleanup. + // * New async code: WaitForExitAsync ignores streams, so this doesn't hang. + // SafelyWaitForAllOutput bounds the post-await drain to 5s per stream. Pipes + // are released by Process.Dispose at end of method (NOT during cancel-cleanup + // — see DoOurBestToCleanUp in SilentProcessRunner.cs for why adding Close back + // causes a 10-minute hang). // - // The test is simple we run "sh", the child process, and tell it to yeet - // sleep, the grandchild, into the background. The grandchild now keeps - // running holding on to our pipes we will wait for an EOF on. + // This test asserts cancel returns in well under 30s in the grandchild scenario. using var tempDir = new TemporaryDirectory(); var grandchildPidFile = Path.Combine(tempDir.DirectoryPath, "grandchild.pid"); @@ -258,15 +282,19 @@ public async Task CancellationToken_WhenUnixGrandchildHoldsRedirectedPipes_Shoul var sw = Stopwatch.StartNew(); cts.Cancel(); - // Cancel should be super quick, if it takes a long time, then we have an issue where we are waiting for the grandchild. + // Cancel should return within ~10s worst case (5s SafelyWaitForAllOutput timeout + // per stream). If we hit the 30s test timeout, something is hanging — most likely + // process.Close() got re-added to DoOurBestToCleanUp (see that method for why). 
var completed = task.Wait(TimeSpan.FromSeconds(30)); sw.Stop(); completed.Should().BeTrue( - $"ExecuteCommand should return shortly after cancellation even when a Unix " + - $"grandchild (reparented to init/launchd) holds the redirected pipes. " + - $"Without proactively closing the redirected streams after Kill, " + - $"Process.WaitForExit() blocks indefinitely. Elapsed since cancel: {sw.Elapsed.TotalSeconds:F1}s"); + $"ExecuteCommandAsync should return promptly after cancellation even when a Unix " + + $"grandchild (reparented to init/launchd) holds the redirected pipes. Worst case " + + $"is ~10s. If we hit the 30s test timeout, either process.Close() was re-introduced " + + $"in DoOurBestToCleanUp (which races with the Exited event WaitForExitAsync depends " + + $"on) or SafelyWaitForAllOutput's per-stream timeout has been removed. Elapsed " + + $"since cancel: {sw.Elapsed.TotalSeconds:F1}s"); } } finally @@ -275,6 +303,64 @@ public async Task CancellationToken_WhenUnixGrandchildHoldsRedirectedPipes_Shoul } } + [Test] + public async Task AbandonToken_ShouldReturnAbandonedExitCodeWithoutKillingProcess() + { + using var tempDir = new TemporaryDirectory(); + var pidFile = Path.Combine(tempDir.DirectoryPath, "process.pid"); + + var abandonCommand = PlatformDetection.IsRunningOnWindows ? "powershell.exe" : "/bin/bash"; + var arguments = PlatformDetection.IsRunningOnWindows + ? 
$"-NoProfile -NonInteractive -Command \"$PID | Out-File -FilePath '{pidFile}' -Encoding ASCII; Start-Sleep -Seconds 300\"" + : $"-c \"echo $$ > '{pidFile}' && sleep 300\""; + + using var cancelCts = new CancellationTokenSource(); + using var abandonCts = new CancellationTokenSource(); + + var infoMessages = new StringBuilder(); + + var task = Task.Run(async () => await SilentProcessRunner.ExecuteCommandAsync( + abandonCommand, + arguments, + Environment.CurrentDirectory, + debug: _ => { }, + info: msg => { lock (infoMessages) infoMessages.AppendLine(msg); }, + error: _ => { }, + customEnvironmentVariables: null, + cancel: cancelCts.Token, + abandon: abandonCts.Token)); + + // Wait deterministically for the process to write its PID before we abandon + await WaitForGrandchildSpawnAsync(pidFile, TimeSpan.FromSeconds(30)); + // Start timing at the abandon itself so process start-up and PID-file polling don't count against the bound. + var sw = Stopwatch.StartNew(); + abandonCts.Cancel(); + + try + { + var exitCode = await task; + sw.Stop(); + + // The whole point of abandon is "return promptly without waiting for the script + // process to exit". The script we just started runs for 5 minutes (sleep 300). + // Without an elapsed-time assertion this test would pass even if abandon + // accidentally waited the full 5 minutes, which would silently lose the entire + // contract. 2 seconds is a generous near-instant bound: the abandon path on a + // local machine returns in tens of milliseconds; CI has been measured under 500ms.
+ sw.Elapsed.Should().BeLessThan(TimeSpan.FromSeconds(2), "abandon should return promptly without waiting for the underlying process"); + exitCode.Should().Be(ScriptExitCodes.AbandonedExitCode); + infoMessages.ToString().Should().Contain("Tentacle has abandoned this script"); + } + finally + { + // Force-kill the sleeping process to avoid leaking it on CI + if (File.Exists(pidFile) && int.TryParse(SafelyReadAllText(pidFile).Trim(), out var pid) && pid > 0) + { + try { Process.GetProcessById(pid).Kill(); } catch { /* already gone */ } + } + } + } + static async Task WaitForGrandchildSpawnAsync(string pidFile, TimeSpan timeout) { var deadline = DateTime.UtcNow + timeout; @@ -448,7 +534,8 @@ static int Execute( Console.WriteLine($"{DateTime.UtcNow} ERR: {x}"); error.Append(x); }, - cancel: cancel).GetAwaiter().GetResult(); + cancel: cancel, + abandon: CancellationToken.None).GetAwaiter().GetResult(); debugMessages = debug; infoMessages = info; diff --git a/source/Octopus.Tentacle.Tests/Capabilities/CapabilitiesServiceV2Fixture.cs b/source/Octopus.Tentacle.Tests/Capabilities/CapabilitiesServiceV2Fixture.cs index 0c6326a05..ab34074c6 100644 --- a/source/Octopus.Tentacle.Tests/Capabilities/CapabilitiesServiceV2Fixture.cs +++ b/source/Octopus.Tentacle.Tests/Capabilities/CapabilitiesServiceV2Fixture.cs @@ -20,8 +20,8 @@ public async Task CapabilitiesAreReturned() .GetCapabilitiesAsync(CancellationToken.None)) .SupportedCapabilities; - capabilities.Should().BeEquivalentTo(nameof(IScriptService), nameof(IFileTransferService), nameof(IScriptServiceV2)); - capabilities.Count.Should().Be(3); + capabilities.Should().BeEquivalentTo(nameof(IScriptService), nameof(IFileTransferService), nameof(IScriptServiceV2), "AbandonScriptV2"); + capabilities.Count.Should().Be(4); capabilities.Should().NotContainMatch("IKubernetesScriptService*"); } @@ -42,5 +42,25 @@ public async Task OnlyKubernetesScriptServicesAreReturnedWhenRunningAsKubernetes 
Environment.SetEnvironmentVariable(KubernetesConfig.NamespaceVariableName, null); } + + [Test] + public async Task GetCapabilities_OnNonKubernetesTentacle_AdvertisesAbandonScriptV2() + { + var service = new CapabilitiesServiceV2(); + var response = await service.GetCapabilitiesAsync(CancellationToken.None); + response.SupportedCapabilities.Should().Contain("AbandonScriptV2"); + } + + [Test] + public async Task GetCapabilities_OnKubernetesTentacle_DoesNotAdvertiseAbandonScriptV2() + { + Environment.SetEnvironmentVariable(KubernetesConfig.NamespaceVariableName, "ABC"); + + var service = new CapabilitiesServiceV2(); + var response = await service.GetCapabilitiesAsync(CancellationToken.None); + response.SupportedCapabilities.Should().NotContain("AbandonScriptV2"); + + Environment.SetEnvironmentVariable(KubernetesConfig.NamespaceVariableName, null); + } } } \ No newline at end of file diff --git a/source/Octopus.Tentacle.Tests/Integration/ScriptServiceV2Fixture.cs b/source/Octopus.Tentacle.Tests/Integration/ScriptServiceV2Fixture.cs index d5c771eef..e6c469a4b 100644 --- a/source/Octopus.Tentacle.Tests/Integration/ScriptServiceV2Fixture.cs +++ b/source/Octopus.Tentacle.Tests/Integration/ScriptServiceV2Fixture.cs @@ -483,6 +483,193 @@ public async Task ScriptTicketCasingShouldNotAffectCommands() lowerCaseResponse.ExitCode.Should().Be(0); } + [Test] + public async Task AbandonScript_OnUnknownTicket_ReturnsCompleteWithUnknownScriptExitCode() + { + var ticket = new ScriptTicket("unknown-ticket-" + Guid.NewGuid().ToString("N")); + var response = await service.AbandonScriptAsync(new AbandonScriptCommandV2(ticket, 0), CancellationToken.None); + + response.State.Should().Be(ProcessState.Complete); + response.ExitCode.Should().Be(ScriptExitCodes.UnknownScriptExitCode); + } + + [Test] + public async Task AbandonScript_OnRunningScript_FiresAbandonToken_ReturnsAbandonedExitCode() + { + var startCommand = new StartScriptCommandV2Builder() + .WithScriptBodyForCurrentOs("Start-Sleep 
-Seconds 60", "sleep 60") + .WithIsolation(ScriptIsolationLevel.NoIsolation) + .WithDurationStartScriptCanWaitForScriptToFinish(null) + .Build(); + + await service.StartScriptAsync(startCommand, CancellationToken.None); + + // Wait for the script to reach Running state + ScriptStatusResponseV2 status; + var deadline = DateTime.UtcNow.AddSeconds(30); + do + { + status = await service.GetStatusAsync(new ScriptStatusRequestV2(startCommand.ScriptTicket, 0), CancellationToken.None); + if (status.State == ProcessState.Running) break; + await Task.Delay(50); + } while (DateTime.UtcNow < deadline); + status.State.Should().Be(ProcessState.Running, "script should have reached Running state within 30 seconds"); + + // Fire abandon + await service.AbandonScriptAsync(new AbandonScriptCommandV2(startCommand.ScriptTicket, 0), CancellationToken.None); + + // Poll until the script completes (the abandon token causes the process runner to return AbandonedExitCode) + ScriptStatusResponseV2 finalResponse; + var completionDeadline = DateTime.UtcNow.AddSeconds(30); + do + { + finalResponse = await service.GetStatusAsync(new ScriptStatusRequestV2(startCommand.ScriptTicket, 0), CancellationToken.None); + if (finalResponse.State == ProcessState.Complete) break; + await Task.Delay(100); + } while (DateTime.UtcNow < completionDeadline); + + finalResponse.State.Should().Be(ProcessState.Complete); + finalResponse.ExitCode.Should().Be(ScriptExitCodes.AbandonedExitCode); + } + + [Test] + public async Task AbandonScript_OnAlreadyCompletedScript_ReturnsRealExitCode() + { + var startCommand = new StartScriptCommandV2Builder() + .WithScriptBody("echo \"finished\"") + .WithIsolation(ScriptIsolationLevel.NoIsolation) + .WithDurationStartScriptCanWaitForScriptToFinish(null) + .Build(); + + await service.StartScriptAsync(startCommand, CancellationToken.None); + + // Wait for the script to complete + ScriptStatusResponseV2 status; + var deadline = DateTime.UtcNow.AddSeconds(30); + do + { + status = await 
service.GetStatusAsync(new ScriptStatusRequestV2(startCommand.ScriptTicket, 0), CancellationToken.None); + if (status.State == ProcessState.Complete) break; + await Task.Delay(50); + } while (DateTime.UtcNow < deadline); + status.State.Should().Be(ProcessState.Complete); + + var abandonResponse = await service.AbandonScriptAsync(new AbandonScriptCommandV2(startCommand.ScriptTicket, 0), CancellationToken.None); + abandonResponse.ExitCode.Should().Be(0, "real exit code should be returned, not AbandonedExitCode"); + } + + [Test] + public async Task CompleteScript_AfterAbandon_WhenWorkspaceDeleteFails_LogsWarnAndReturnsNormally() + { + var startCommand = new StartScriptCommandV2Builder() + .WithScriptBodyForCurrentOs("Start-Sleep -Seconds 60", "sleep 60") + .WithIsolation(ScriptIsolationLevel.NoIsolation) + .WithDurationStartScriptCanWaitForScriptToFinish(null) + .Build(); + + await service.StartScriptAsync(startCommand, CancellationToken.None); + + // Wait for Running + ScriptStatusResponseV2 status; + var runningDeadline = DateTime.UtcNow.AddSeconds(30); + do + { + status = await service.GetStatusAsync(new ScriptStatusRequestV2(startCommand.ScriptTicket, 0), CancellationToken.None); + if (status.State == ProcessState.Running) break; + await Task.Delay(50); + } while (DateTime.UtcNow < runningDeadline); + status.State.Should().Be(ProcessState.Running, "script should have reached Running state within 30 seconds"); + + await service.AbandonScriptAsync(new AbandonScriptCommandV2(startCommand.ScriptTicket, 0), CancellationToken.None); + + // Poll until Complete + var completeDeadline = DateTime.UtcNow.AddSeconds(30); + do + { + status = await service.GetStatusAsync(new ScriptStatusRequestV2(startCommand.ScriptTicket, 0), CancellationToken.None); + if (status.State == ProcessState.Complete) break; + await Task.Delay(50); + } while (DateTime.UtcNow < completeDeadline); + status.State.Should().Be(ProcessState.Complete); + 
status.ExitCode.Should().Be(ScriptExitCodes.AbandonedExitCode); + + // Build a service whose workspace.Delete throws + var deleteException = new IOException("file in use"); + var (mockFactory, mockLog) = BuildFactoryWithThrowingDelete(startCommand.ScriptTicket, deleteException); + var serviceUnderTest = new ScriptServiceV2( + PlatformDetection.IsRunningOnWindows ? (IShell)new PowerShell() : new Bash(), + mockFactory, + stateStoreFactory, + new ScriptIsolationMutex(), + mockLog); + + Func complete = async () => await serviceUnderTest.CompleteScriptAsync(new CompleteScriptCommandV2(startCommand.ScriptTicket), CancellationToken.None); + + await complete.Should().NotThrowAsync(); + mockLog.Received().Warn(deleteException, Arg.Is(m => m.Contains("Could not delete") && m.Contains(startCommand.ScriptTicket.TaskId))); + } + + [Test] + public async Task CompleteScript_AfterNormalCompletion_WhenWorkspaceDeleteFails_PropagatesException() + { + var startCommand = new StartScriptCommandV2Builder() + .WithScriptBody("echo \"finished\"") + .WithIsolation(ScriptIsolationLevel.NoIsolation) + .WithDurationStartScriptCanWaitForScriptToFinish(null) + .Build(); + + await service.StartScriptAsync(startCommand, CancellationToken.None); + + // Poll until natural completion + ScriptStatusResponseV2 status; + var deadline = DateTime.UtcNow.AddSeconds(30); + do + { + status = await service.GetStatusAsync(new ScriptStatusRequestV2(startCommand.ScriptTicket, 0), CancellationToken.None); + if (status.State == ProcessState.Complete) break; + await Task.Delay(50); + } while (DateTime.UtcNow < deadline); + status.State.Should().Be(ProcessState.Complete); + status.ExitCode.Should().Be(0, "the script exited cleanly, not via abandon"); + + var deleteException = new IOException("file in use"); + var (mockFactory, mockLog) = BuildFactoryWithThrowingDelete(startCommand.ScriptTicket, deleteException); + var serviceUnderTest = new ScriptServiceV2( + PlatformDetection.IsRunningOnWindows ? 
(IShell)new PowerShell() : new Bash(), + mockFactory, + stateStoreFactory, + new ScriptIsolationMutex(), + mockLog); + + Func complete = async () => await serviceUnderTest.CompleteScriptAsync(new CompleteScriptCommandV2(startCommand.ScriptTicket), CancellationToken.None); + + await complete.Should().ThrowAsync(); + } + + /// + /// Builds a mock IScriptWorkspaceFactory that delegates all calls to the real workspaceFactory except + /// workspace.Delete, which throws the supplied exception. Also returns a mock ISystemLog for assertion. + /// + (IScriptWorkspaceFactory factory, ISystemLog log) BuildFactoryWithThrowingDelete(ScriptTicket ticket, Exception deleteException) + { + var realWorkspace = workspaceFactory.GetWorkspace(ticket, WorkspaceReadinessCheck.Skip); + + var fakeWorkspace = Substitute.For(); + fakeWorkspace.ScriptTicket.Returns(realWorkspace.ScriptTicket); + fakeWorkspace.WorkingDirectory.Returns(realWorkspace.WorkingDirectory); + fakeWorkspace.BootstrapScriptFilePath.Returns(realWorkspace.BootstrapScriptFilePath); + fakeWorkspace.LogFilePath.Returns(realWorkspace.LogFilePath); + fakeWorkspace.ResolvePath(Arg.Any()).Returns(ci => realWorkspace.ResolvePath(ci.Arg())); + fakeWorkspace.CreateLog().Returns(_ => realWorkspace.CreateLog()); + fakeWorkspace.Delete(Arg.Any()).Returns(_ => throw deleteException); + + var fakeFactory = Substitute.For(); + fakeFactory.GetWorkspace(Arg.Any(), Arg.Any()).Returns(fakeWorkspace); + + var fakeLog = Substitute.For(); + return (fakeFactory, fakeLog); + } + // TODO - Test the stateStore is updated. 
private void SetupScriptState(ScriptTicket ticket) From 8475bbc2a0f7f41c6131f5ebfe03b5c80f3a5057 Mon Sep 17 00:00:00 2001 From: Jim Pelletier Date: Tue, 26 May 2026 13:04:50 +1000 Subject: [PATCH 21/52] Add EFT-3295 spec, plan, and restructuring docs Co-Authored-By: Claude Opus 4.7 (1M context) --- ...2026-05-21-tentacle-script-abandon-plan.md | 1374 +++++++++++++++++ ...ync-migration-from-abandon-feature-plan.md | 1012 ++++++++++++ ...26-05-21-tentacle-script-abandon-design.md | 400 +++++ ...c-migration-from-abandon-feature-design.md | 131 ++ 4 files changed, 2917 insertions(+) create mode 100644 docs/superpowers/plans/2026-05-21-tentacle-script-abandon-plan.md create mode 100644 docs/superpowers/plans/2026-05-25-split-async-migration-from-abandon-feature-plan.md create mode 100644 docs/superpowers/specs/2026-05-21-tentacle-script-abandon-design.md create mode 100644 docs/superpowers/specs/2026-05-25-split-async-migration-from-abandon-feature-design.md diff --git a/docs/superpowers/plans/2026-05-21-tentacle-script-abandon-plan.md b/docs/superpowers/plans/2026-05-21-tentacle-script-abandon-plan.md new file mode 100644 index 000000000..107d40c62 --- /dev/null +++ b/docs/superpowers/plans/2026-05-21-tentacle-script-abandon-plan.md @@ -0,0 +1,1374 @@ +# Tentacle script abandon — implementation plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Spec:** `docs/superpowers/specs/2026-05-21-tentacle-script-abandon-design.md` +**Ticket:** [EFT-3295](https://linear.app/octopus/issue/EFT-3295/tentacle-script-abandonment-to-release-the-mutex) + +**Goal:** Add an `AbandonScript` verb to `IScriptServiceV2` so Octopus Server can tell Tentacle to release the `ScriptIsolationMutex` and accept new work even when `Process.Kill` failed to stop a stuck script. 
+ +**Architecture:** Async migration of `SilentProcessRunner.ExecuteCommand` to `ExecuteCommandAsync`, replacing `process.WaitForExit()` with `await process.WaitForExitAsync(abandon)`. Two-token model on the call chain: existing `cancel` (drives kill via `cancel.Register`) and new `abandon` (drives the wait's early return). New RPC method on `IScriptServiceV2` fires the abandon token. Tentacle does NOT kill the OS process; the runaway is the customer's host-level problem per the ticket. + +**Tech stack:** .NET (multi-target), Halibut RPC, NUnit + FluentAssertions, NSubstitute for mocks. PowerShell on Windows, Bash on Linux. + +**Working branch:** `jimpelletier/eft-3295-tentacle-script-abandonment-to-release-the-mutex` (PR #1226). + +--- + +## File structure + +### New files +- `source/Octopus.Tentacle.Contracts/ScriptServiceV2/AbandonScriptCommandV2.cs` — new command DTO. + +### Modified — contracts +- `source/Octopus.Tentacle.Contracts/ScriptExitCodes.cs` — add `AbandonedExitCode = -48`. +- `source/Octopus.Tentacle.Contracts/ScriptServiceV2/IScriptServiceV2.cs` — add `AbandonScript` method signature. + +### Modified — production code +- `source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs` — `ExecuteCommand` → `ExecuteCommandAsync`; add `abandon` token; swap `WaitForExit()` for `await WaitForExitAsync(abandon)`; abandon catch returns `AbandonedExitCode` after `SafelyCancelRead`. +- `source/Octopus.Tentacle/Util/ISilentProcessRunner.cs` — interface and wrapper become async, add `abandon` parameter. +- `source/Octopus.Tentacle/Util/CommandLineRunner.cs` — caller migration to await. +- `source/Octopus.Tentacle.Core/Services/Scripts/RunningScript.cs` — `RunScript` → `RunScriptAsync`; constructor accepts `abandonToken`; plumb through. 
+- `source/Octopus.Tentacle.Core/Services/Scripts/ScriptServiceV2.cs` — `LaunchShell` passes `abandonToken`; `RunningScriptWrapper` gains `abandonTokenSource`; new `AbandonScriptAsync`; targeted best-effort `workspace.Delete` in `CompleteScriptAsync`. +- `source/Octopus.Tentacle/Services/Capabilities/CapabilitiesServiceV2.cs` — add `"AbandonScriptV2"` to the non-Kubernetes capability list. +- `source/Octopus.Tentacle.Core/Util/EnvironmentVariables.cs` — add `TentacleDebugDisableProcessKill = "TentacleDebugDisableProcessKill"`. + +### Modified — Kubernetes integration test scaffolding (caller migration only) +- `source/Octopus.Tentacle.Kubernetes.Tests.Integration/Tooling/KubeCtlTool.cs` (1 site) +- `source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/DockerImageLoader.cs` (2 sites) +- `source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/KubernetesAgentInstaller.cs` (3 sites) +- `source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/KubernetesClusterInstaller.cs` (4 sites) +- `source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/Tooling/HelmDownloader.cs` (1 site) +- `source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/Tooling/ToolDownloader.cs` (1 site) + +### Modified — Tentacle integration test scaffolding (caller migration only) +- `source/Octopus.Tentacle.Tests.Integration/PowerShellStartupDetectionTests.cs` (3 sites) +- `source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs` (existing tests need await; abandon tests added) +- `source/Octopus.Tentacle.Tests.Integration/Support/TentacleFetchers/LinuxTentacleFetcher.cs` (1 site) + +### New tests +- Additions inside `source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs` — abandon-token behaviour, async timing, thread-leak. +- Additions inside `source/Octopus.Tentacle.Tests/Util/RunningScriptFixture.cs` — abandon plumbing. 
+- Additions inside `source/Octopus.Tentacle.Tests/Integration/ScriptServiceV2Fixture.cs` — service-layer abandon paths. +- New file `source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs` — end-to-end mutex-release-on-abandon (mirrors `ClientScriptExecutionIsolationMutex.cs`). + +--- + +## Task ordering rationale + +Contracts first (no behaviour change, just shapes). Test affordance next (needed by later integration tests). Async migration is the biggest single change — done in one bottom-up pass with all callers migrated together so the build stays green. RunningScript / ScriptServiceV2 abandon wiring after the async machinery exists. Capability advertisement last (it's a one-line addition gating the whole feature). Tests interleaved with the behaviour they cover. + +--- + +### Task 1: Add `AbandonedExitCode = -48` + +**Files:** +- Modify: `source/Octopus.Tentacle.Contracts/ScriptExitCodes.cs` + +- [ ] **Step 1: Add the constant** + +Open `source/Octopus.Tentacle.Contracts/ScriptExitCodes.cs`. Add a new line right after `PowerShellNeverStartedExitCode = -47;`: + +```csharp +public const int AbandonedExitCode = -48; +``` + +The full block should read: + +```csharp +public const int PowerShellNeverStartedExitCode = -47; +public const int AbandonedExitCode = -48; + +//Kubernetes Agent +public const int KubernetesScriptPodNotFound = -81; +``` + +- [ ] **Step 2: Build** + +```bash +dotnet build source/Octopus.Tentacle.Contracts/Octopus.Tentacle.Contracts.csproj +``` + +Expected: build succeeds. 
+ +- [ ] **Step 3: Commit** + +```bash +git add source/Octopus.Tentacle.Contracts/ScriptExitCodes.cs +git commit -m "Add AbandonedExitCode = -48 to ScriptExitCodes" +``` + +--- + +### Task 2: Add `AbandonScriptCommandV2` DTO + +**Files:** +- Create: `source/Octopus.Tentacle.Contracts/ScriptServiceV2/AbandonScriptCommandV2.cs` + +- [ ] **Step 1: Create the file** + +Use the same shape as `CancelScriptCommandV2.cs` (which lives in the same folder): + +```csharp +using System; + +namespace Octopus.Tentacle.Contracts.ScriptServiceV2 +{ + public class AbandonScriptCommandV2 + { + public AbandonScriptCommandV2(ScriptTicket ticket, long lastLogSequence) + { + Ticket = ticket; + LastLogSequence = lastLogSequence; + } + + public ScriptTicket Ticket { get; } + + public long LastLogSequence { get; } + } +} +``` + +- [ ] **Step 2: Build** + +```bash +dotnet build source/Octopus.Tentacle.Contracts/Octopus.Tentacle.Contracts.csproj +``` + +Expected: build succeeds. + +- [ ] **Step 3: Commit** + +```bash +git add source/Octopus.Tentacle.Contracts/ScriptServiceV2/AbandonScriptCommandV2.cs +git commit -m "Add AbandonScriptCommandV2 contract" +``` + +--- + +### Task 3: Add `AbandonScript` method to `IScriptServiceV2` + +**Files:** +- Modify: `source/Octopus.Tentacle.Contracts/ScriptServiceV2/IScriptServiceV2.cs` + +- [ ] **Step 1: Update the interface** + +Add `AbandonScript` between `CancelScript` and `CompleteScript`: + +```csharp +using System; + +namespace Octopus.Tentacle.Contracts.ScriptServiceV2 +{ + public interface IScriptServiceV2 + { + ScriptStatusResponseV2 StartScript(StartScriptCommandV2 command); + ScriptStatusResponseV2 GetStatus(ScriptStatusRequestV2 request); + ScriptStatusResponseV2 CancelScript(CancelScriptCommandV2 command); + ScriptStatusResponseV2 AbandonScript(AbandonScriptCommandV2 command); + void CompleteScript(CompleteScriptCommandV2 command); + } +} +``` + +- [ ] **Step 2: Build the whole solution** + +```bash +dotnet build source/Tentacle.sln +``` + 
+Expected: **build fails.** The async implementer (`ScriptServiceV2` in `Octopus.Tentacle.Core`) doesn't implement the new method yet. That's intentional — Step 3 adds a temporary stub to restore the build, and Task 11 replaces it with the real implementation. For now, capture the compile errors and confirm they're the expected "missing implementation" errors and nothing else. + +- [ ] **Step 3: Add a temporary stub to `ScriptServiceV2`** + +Tentacle wraps services with async decorators (look for `IAsyncScriptServiceV2`, `BackwardsCompatibleAsyncCapabilitiesV2Decorator`, etc). For the build to stay green between Task 3 and Task 11, add a **temporary** `NotImplementedException`-throwing stub to `ScriptServiceV2.cs`: + +Open `source/Octopus.Tentacle.Core/Services/Scripts/ScriptServiceV2.cs`. Add this method right after `CancelScriptAsync`: + +```csharp +public async Task<ScriptStatusResponseV2> AbandonScriptAsync(AbandonScriptCommandV2 command, CancellationToken cancellationToken) +{ + await Task.CompletedTask; + throw new NotImplementedException("Implemented in Task 11"); +} +``` + +- [ ] **Step 4: Build again, confirm green** + +```bash +dotnet build source/Tentacle.sln +``` + +Expected: build succeeds. + +- [ ] **Step 5: Commit** + +```bash +git add source/Octopus.Tentacle.Contracts/ScriptServiceV2/IScriptServiceV2.cs source/Octopus.Tentacle.Core/Services/Scripts/ScriptServiceV2.cs +git commit -m "Add AbandonScript to IScriptServiceV2 interface (stub)" +``` + +--- + +### Task 4: Add `TentacleDebugDisableProcessKill` env-var constant + +**Files:** +- Modify: `source/Octopus.Tentacle.Core/Util/EnvironmentVariables.cs` + +- [ ] **Step 1: Add the constant** + +Open the file. Add a new line in the `EnvironmentVariables` static class, grouped near the other `Tentacle*` constants: + +```csharp +public const string TentacleDebugDisableProcessKill = "TentacleDebugDisableProcessKill"; +``` + +- [ ] **Step 2: Build** + +```bash +dotnet build source/Octopus.Tentacle.Core/Octopus.Tentacle.Core.csproj +``` + +Expected: build succeeds.
+ +- [ ] **Step 3: Commit** + +```bash +git add source/Octopus.Tentacle.Core/Util/EnvironmentVariables.cs +git commit -m "Add TentacleDebugDisableProcessKill env var constant" +``` + +(The Hitman wiring happens in Task 6 alongside the async migration so the test affordance is in place before any new tests need it.) + +--- + +### Task 5: Make `SilentProcessRunner.ExecuteCommand` async — failing test first + +**Files:** +- Modify: `source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs` + +This is TDD's red step for the async migration. We're not going to migrate the whole call chain yet — we just write the new test that targets the future async method so it fails to compile, proving we need the new signature. + +- [ ] **Step 1: Add the failing test** + +Open `source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs`. Add this new test near the existing `CancellationToken_*` tests: + +```csharp +[Test] +public async Task AbandonToken_ShouldReturnAbandonedExitCodeWithoutKillingProcess() +{ + var command = PlatformDetection.IsRunningOnWindows ? "powershell.exe" : "/bin/bash"; + var arguments = PlatformDetection.IsRunningOnWindows + ? 
"-NoProfile -NonInteractive -Command \"Start-Sleep -Seconds 300\"" + : "-c \"sleep 300\""; + + using var cancelCts = new CancellationTokenSource(); + using var abandonCts = new CancellationTokenSource(); + + var infoMessages = new StringBuilder(); + + var sw = Stopwatch.StartNew(); + + var task = Task.Run(async () => await SilentProcessRunner.ExecuteCommandAsync( + command, + arguments, + Environment.CurrentDirectory, + debug: _ => { }, + info: msg => { lock (infoMessages) infoMessages.AppendLine(msg); }, + error: _ => { }, + customEnvironmentVariables: null, + cancel: cancelCts.Token, + abandon: abandonCts.Token)); + + // Give the process ~500ms to actually start before we abandon + await Task.Delay(500); + abandonCts.Cancel(); + + var exitCode = await task; + sw.Stop(); + + sw.Elapsed.Should().BeLessThan(TimeSpan.FromSeconds(2), "abandon should return promptly"); + exitCode.Should().Be(ScriptExitCodes.AbandonedExitCode); + infoMessages.ToString().Should().Contain("Tentacle has abandoned this script"); +} +``` + +Add the corresponding `using`s at the top if missing: + +```csharp +using System.Diagnostics; +using System.Threading.Tasks; +using Octopus.Tentacle.Contracts; +``` + +- [ ] **Step 2: Confirm it fails to compile** + +```bash +dotnet build source/Octopus.Tentacle.Tests.Integration/Octopus.Tentacle.Tests.Integration.csproj +``` + +Expected: compile error referencing `ExecuteCommandAsync` not existing on `SilentProcessRunner`. That's the red. + +- [ ] **Step 3: Commit (red phase)** + +```bash +git add source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs +git commit -m "Add failing test for AbandonToken behaviour in SilentProcessRunner" +``` + +We commit red because the next task migrates the production method; both will pass together once the migration completes. 
+ +--- + +### Task 6: Migrate `SilentProcessRunner.ExecuteCommand` to async + add `abandon` token + plumb `TentacleDebugDisableProcessKill` + +**Files:** +- Modify: `source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs` + +This is the load-bearing implementation task. We rename the method, change the return to `Task`, add the `abandon` parameter, swap `process.WaitForExit()` for `await process.WaitForExitAsync(abandon)`, add the abandon catch with `SafelyCancelRead` + honest log line + `AbandonedExitCode`, and wire the env var into `Hitman.TryKillProcessAndChildrenRecursively`. + +- [ ] **Step 1: Update `ExecuteCommand` signature and body** + +Find the current `public static int ExecuteCommand(...)` overload at the top (around line 17). Update both overloads to be `async Task` and add the `abandon` parameter. The simpler overload should delegate to the richer one: + +```csharp +public static Task ExecuteCommandAsync( + string executable, + string arguments, + string workingDirectory, + Action debug, + Action info, + Action error, + CancellationToken cancel, + CancellationToken abandon) +{ + return ExecuteCommandAsync(executable, arguments, workingDirectory, debug, info, error, customEnvironmentVariables: null, cancel: cancel, abandon: abandon); +} + +public static async Task ExecuteCommandAsync( + string executable, + string arguments, + string workingDirectory, + Action debug, + Action info, + Action error, + IReadOnlyDictionary? customEnvironmentVariables = null, + CancellationToken cancel = default, + CancellationToken abandon = default) +{ + // ... existing argument-null checks ... + // ... existing process.StartInfo setup ... 
+ process.Start(); + + var running = true; + + using (cancel.Register(() => + { + if (running) DoOurBestToCleanUp(process, error); + })) + { + if (cancel.IsCancellationRequested) + DoOurBestToCleanUp(process, error); + + process.BeginOutputReadLine(); + process.BeginErrorReadLine(); + + try + { + await process.WaitForExitAsync(abandon).ConfigureAwait(false); + } + catch (OperationCanceledException) when (abandon.IsCancellationRequested && !process.HasExited) + { + info("Tentacle has abandoned this script. The underlying script process may still be running on this host."); + SafelyCancelRead(process.CancelErrorRead, debug); + SafelyCancelRead(process.CancelOutputRead, debug); + running = false; + return ScriptExitCodes.AbandonedExitCode; + } + + SafelyCancelRead(process.CancelErrorRead, debug); + SafelyCancelRead(process.CancelOutputRead, debug); + + SafelyWaitForAllOutput(outputResetEvent, cancel, debug); + SafelyWaitForAllOutput(errorResetEvent, cancel, debug); + + var exitCode = SafelyGetExitCode(process); + debug($"Process {exeFileNameOrFullPath} in {workingDirectory} exited with code {exitCode}"); + + running = false; + return exitCode; + } +} +``` + +Notes: +- The old synchronous `ExecuteCommand` overloads are deleted. Every caller migrates in Tasks 7–9. +- `running = false` set inside the abandon catch as well — `cancel.Register`'s callback checks `running` to decide whether to call `DoOurBestToCleanUp`. After abandon we don't want it firing. + +- [ ] **Step 2: Wire `TentacleDebugDisableProcessKill` into `Hitman`** + +In the same file, find the `Hitman.TryKillProcessAndChildrenRecursively` method (around line 250). Add the env-var check at the top: + +```csharp +public static void TryKillProcessAndChildrenRecursively(Process process) +{ + if (!string.IsNullOrEmpty(Environment.GetEnvironmentVariable(EnvironmentVariables.TentacleDebugDisableProcessKill))) + { + // Test-only no-op: simulate "kill was attempted but didn't terminate the process". 
+ // Only activated when the test harness sets this env var on the Tentacle process. + return; + } + +#if NETFRAMEWORK + TryKillWindowsProcessAndChildrenRecursively(process.Id); +#endif +#if !NETFRAMEWORK + process.Kill(true); +#endif +} +``` + +Add the `using` at the top if not already present: + +```csharp +using Octopus.Tentacle.Core.Util; +``` + +- [ ] **Step 3: Build (expect cascade failures from removed sync method)** + +```bash +dotnet build source/Tentacle.sln +``` + +Expected: many compile errors at every caller of the removed `ExecuteCommand`. That's the next several tasks. + +- [ ] **Step 4: Commit** + +```bash +git add source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs +git commit -m "Migrate SilentProcessRunner to async; add abandon token; debug kill-disable flag" +``` + +--- + +### Task 7: Migrate production callers to await + +**Files:** +- Modify: `source/Octopus.Tentacle/Util/ISilentProcessRunner.cs` +- Modify: `source/Octopus.Tentacle/Util/CommandLineRunner.cs` + +- [ ] **Step 1: Update `ISilentProcessRunner` interface and wrapper** + +Open `source/Octopus.Tentacle/Util/ISilentProcessRunner.cs`. 
Make the interface and wrapper async, add `abandon`: + +```csharp +public interface ISilentProcessRunner +{ + Task ExecuteCommandAsync( + string executable, + string arguments, + string workingDirectory, + Action info, + Action error, + CancellationToken cancel = default, + CancellationToken abandon = default); + + Task ExecuteCommandAsync( + string executable, + string arguments, + string workingDirectory, + Action debug, + Action info, + Action error, + CancellationToken cancel = default, + CancellationToken abandon = default); +} + +public class SilentProcessRunnerWrapper : ISilentProcessRunner +{ + public Task ExecuteCommandAsync(string executable, string arguments, string workingDirectory, Action info, Action error, CancellationToken cancel = default, CancellationToken abandon = default) + { + return SilentProcessRunnerExtended.ExecuteCommandAsync(executable, arguments, workingDirectory, info, error, cancel, abandon); + } + + public Task ExecuteCommandAsync(string executable, string arguments, string workingDirectory, Action debug, Action info, Action error, CancellationToken cancel = default, CancellationToken abandon = default) + { + return SilentProcessRunner.ExecuteCommandAsync(executable, arguments, workingDirectory, debug, info, error, cancel: cancel, abandon: abandon); + } +} +``` + +Update the `SilentProcessRunnerExtended` static helpers in the same file. The extension methods on `CommandLineInvocation` will need to become async too: + +```csharp +public static async Task ExecuteCommandAsync(this CommandLineInvocation invocation) + => await ExecuteCommandAsync(invocation, Environment.CurrentDirectory); + +public static async Task ExecuteCommandAsync(this CommandLineInvocation invocation, string workingDirectory) +{ + if (workingDirectory == null) + throw new ArgumentNullException(nameof(workingDirectory)); + + var arguments = $"{invocation.Arguments} {invocation.SystemArguments ?? 
string.Empty}"; + var infos = new List(); + var errors = new List(); + + var exitCode = await ExecuteCommandAsync( + invocation.Executable, + arguments, + workingDirectory, + infos.Add, + errors.Add + ); + + return new CmdResult(exitCode, infos, errors); +} + +public static Task ExecuteCommandAsync( + string executable, + string arguments, + string workingDirectory, + Action info, + Action error, + CancellationToken cancel = default, + CancellationToken abandon = default) + => SilentProcessRunner.ExecuteCommandAsync(executable, + arguments, + workingDirectory, + LogFileOnlyLogger.Current.Info, + info, + error, + customEnvironmentVariables: null, + cancel: cancel, + abandon: abandon); +``` + +- [ ] **Step 2: Migrate `CommandLineRunner`** + +Open `source/Octopus.Tentacle/Util/CommandLineRunner.cs`. Find the call to `SilentProcessRunner.ExecuteCommand` and convert. The whole method becomes async — propagate the change up the chain until you reach a natural async boundary or `Task.Run` / `.GetAwaiter().GetResult()` glue is needed. + +Pattern for each call site: + +```csharp +// Before: +var exitCode = SilentProcessRunner.ExecuteCommand(invocation.Executable, ...); +// After: +var exitCode = await SilentProcessRunner.ExecuteCommandAsync(invocation.Executable, ..., abandon: CancellationToken.None); +``` + +For `CommandLineRunner.Execute`, the method becomes `ExecuteAsync` returning `Task`. Any caller that hits a sync boundary uses `.GetAwaiter().GetResult()` *as a last resort, with a comment explaining why*. + +- [ ] **Step 3: Build** + +```bash +dotnet build source/Octopus.Tentacle/Octopus.Tentacle.csproj +``` + +Expected: build succeeds (or surfaces the next layer of callers; resolve them with the same pattern). 
+ +- [ ] **Step 4: Commit** + +```bash +git add source/Octopus.Tentacle/Util/ISilentProcessRunner.cs source/Octopus.Tentacle/Util/CommandLineRunner.cs +git commit -m "Migrate ISilentProcessRunner and CommandLineRunner to async" +``` + +--- + +### Task 8: Migrate Kubernetes integration test scaffolding to await + +**Files:** +- Modify (caller migration only): + - `source/Octopus.Tentacle.Kubernetes.Tests.Integration/Tooling/KubeCtlTool.cs` + - `source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/DockerImageLoader.cs` + - `source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/KubernetesAgentInstaller.cs` + - `source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/KubernetesClusterInstaller.cs` + - `source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/Tooling/HelmDownloader.cs` + - `source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/Tooling/ToolDownloader.cs` + +- [ ] **Step 1: Apply the same caller pattern to every call site** + +Pattern at each `SilentProcessRunner.ExecuteCommand(...)`: + +```csharp +// Before (synchronous): +var exitCode = SilentProcessRunner.ExecuteCommand(executable, args, workingDir, debug, info, error, cancel: ct); +// After (async, abandon-token = None because these are setup tools, not Tentacle script execution): +var exitCode = await SilentProcessRunner.ExecuteCommandAsync(executable, args, workingDir, debug, info, error, cancel: ct, abandon: CancellationToken.None); +``` + +Make the containing method `async Task<int>` (or `async Task` if it doesn't return the exit code). Propagate `async` up the call chain in this file. Most of these scaffolding methods are already called from `async` test setup, so the propagation is usually one or two layers. + +For commented-out lines (e.g. `KubernetesClusterInstaller.cs:129`), leave them commented.
+ +- [ ] **Step 2: Build the K8s integration test project** + +```bash +dotnet build source/Octopus.Tentacle.Kubernetes.Tests.Integration/Octopus.Tentacle.Kubernetes.Tests.Integration.csproj +``` + +Expected: build succeeds. + +- [ ] **Step 3: Commit** + +```bash +git add source/Octopus.Tentacle.Kubernetes.Tests.Integration/ +git commit -m "Migrate Kubernetes integration test scaffolding to async ExecuteCommandAsync" +``` + +--- + +### Task 9: Migrate Tentacle integration test scaffolding to await + +**Files:** +- Modify: + - `source/Octopus.Tentacle.Tests.Integration/PowerShellStartupDetectionTests.cs` (3 sites) + - `source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs` (existing sync tests — but also fix the helper there) + - `source/Octopus.Tentacle.Tests.Integration/Support/TentacleFetchers/LinuxTentacleFetcher.cs` + +- [ ] **Step 1: Migrate each caller** + +Same pattern as Task 8. `SilentProcessRunner.ExecuteCommand(...)` → `await SilentProcessRunner.ExecuteCommandAsync(..., abandon: CancellationToken.None)`. Containing methods become `async Task<...>`. + +In `SilentProcessRunnerFixture.cs`, there's a private helper near the top that wraps `ExecuteCommand` for the existing tests (`Execute(...)`). Migrate it. C# does not allow `out` parameters on `async` methods (compiler error CS1988), so the three `StringBuilder`s become ordinary parameters that each test creates and passes in: + +```csharp +static async Task<int> ExecuteAsync(string command, string arguments, string workingDirectory, StringBuilder debugMessages, StringBuilder infoMessages, StringBuilder errorMessages, CancellationToken cancel = default, CancellationToken abandon = default) +``` + +Each existing test that calls `Execute(...)` now creates the three builders and calls `await ExecuteAsync(...)`. Tests become `async Task` returning methods. NUnit handles that. + +- [ ] **Step 2: Build the Tentacle integration test project** + +```bash +dotnet build source/Octopus.Tentacle.Tests.Integration/Octopus.Tentacle.Tests.Integration.csproj +``` + +Expected: build succeeds.
+ +- [ ] **Step 3: Run the existing SilentProcessRunner tests on Linux + Windows** + +```bash +dotnet test source/Octopus.Tentacle.Tests.Integration --filter "FullyQualifiedName~SilentProcessRunnerFixture" +``` + +Expected: all existing tests pass. The new `AbandonToken_ShouldReturnAbandonedExitCodeWithoutKillingProcess` test from Task 5 ALSO passes now that the production method exists. Green. + +- [ ] **Step 4: Commit** + +```bash +git add source/Octopus.Tentacle.Tests.Integration/ +git commit -m "Migrate Tentacle integration test scaffolding to async; AbandonToken test now passes" +``` + +--- + +### Task 10: Add abandon support to `RunningScript` + +**Files:** +- Modify: `source/Octopus.Tentacle.Core/Services/Scripts/RunningScript.cs` +- Modify: `source/Octopus.Tentacle.Tests/Util/RunningScriptFixture.cs` (or wherever the existing fixture lives — adjust path if it's in `Octopus.Tentacle.Tests.Integration`) + +- [ ] **Step 1: Write the failing test** + +Open the existing `RunningScriptFixture.cs`. 
Add a test that exercises the abandon path: + +```csharp +[Test] +public async Task Execute_WhenAbandonTokenFires_ReturnsAbandonedExitCode() +{ + // arrange: a workspace + shell that runs a long-sleeping script + var workspace = CreateWorkspace(bashScript: "sleep 300", powershellScript: "Start-Sleep -Seconds 300"); + var shell = new Bash(); // or appropriate cross-platform helper + using var runningCts = new CancellationTokenSource(); + using var abandonCts = new CancellationTokenSource(); + + var runningScript = new RunningScript( + shell, + workspace, + stateStore: null, + scriptLog: workspace.CreateLog(), + taskId: "ServerTask-1", + scriptIsolationMutex: new ScriptIsolationMutex(), + runningScriptToken: runningCts.Token, + abandonToken: abandonCts.Token, + environmentVariables: new Dictionary(), + powerShellStartupTimeout: TimeSpan.FromMinutes(1), + log: Substitute.For()); + + var executeTask = runningScript.Execute(); + await Task.Delay(500); // let the process start + abandonCts.Cancel(); + + await executeTask; + runningScript.State.Should().Be(ProcessState.Complete); + runningScript.ExitCode.Should().Be(ScriptExitCodes.AbandonedExitCode); +} +``` + +Run it; expect compile failure ("RunningScript constructor doesn't accept abandonToken"). + +- [ ] **Step 2: Add `abandonToken` to `RunningScript`** + +In `RunningScript.cs`, add a field and constructor parameter: + +```csharp +readonly CancellationToken runningScriptToken; +readonly CancellationToken abandonToken; // NEW + +public RunningScript(IShell shell, + IScriptWorkspace workspace, + IScriptStateStore? stateStore, + IScriptLog scriptLog, + string taskId, + ScriptIsolationMutex scriptIsolationMutex, + CancellationToken runningScriptToken, + CancellationToken abandonToken, // NEW + IReadOnlyDictionary environmentVariables, + TimeSpan powerShellStartupTimeout, + ILog log) +{ + // ... existing assignments ... + this.abandonToken = abandonToken; + // ... 
+} +``` + +Update the secondary constructor that omits `stateStore` to pass `abandonToken` through as well. + +- [ ] **Step 3: Replace `RunScript` with async, plumb `abandonToken`** + +Replace the existing `int RunScript(string shellPath, IScriptLogWriter writer, CancellationToken cancellationToken)` with: + +```csharp +async Task RunScriptAsync(string shellPath, IScriptLogWriter writer, CancellationToken cancellationToken, CancellationToken abandon) +{ + try + { + var exitCode = await SilentProcessRunner.ExecuteCommandAsync( + shellPath, + shell.FormatCommandArguments(workspace.BootstrapScriptFilePath, workspace.ScriptArguments, false), + workspace.WorkingDirectory, + LogScriptOutputTo(writer, ProcessOutputSource.Debug), + LogScriptOutputTo(writer, ProcessOutputSource.StdOut), + LogScriptOutputTo(writer, ProcessOutputSource.StdErr), + environmentVariables, + cancellationToken, + abandon); + + return exitCode; + } + catch (Exception ex) + { + writer.WriteOutput(ProcessOutputSource.StdErr, "An exception was thrown when invoking " + shellPath + ": " + ex.Message); + writer.WriteOutput(ProcessOutputSource.StdErr, ex.ToString()); + return ScriptExitCodes.PowershellInvocationErrorExitCode; + } +} +``` + +- [ ] **Step 4: Update `Execute` to await the async `RunScriptAsync`** + +Inside `Execute()`, change the call: + +```csharp +exitCode = workspace.ShouldMonitorPowerShellStartup() + ? await RunPowershellScriptWithMonitoring(shellPath, writer, runningScriptToken) + : await RunScriptAsync(shellPath, writer, runningScriptToken, abandonToken); +``` + +Inside `RunPowershellScriptWithMonitoring`, find the `Task.Run(() => RunScript(...))` line and change to `Task.Run(async () => await RunScriptAsync(shellPath, writer, scriptTaskCts.Token, abandonToken), scriptTaskCts.Token)`. 
+ +- [ ] **Step 5: Build and run the new test** + +```bash +dotnet build source/Tentacle.sln +dotnet test source/Octopus.Tentacle.Tests.Integration --filter "Execute_WhenAbandonTokenFires" +``` + +Expected: the new test passes once the solution builds. At this point `dotnet build source/Tentacle.sln` still fails, because `ScriptServiceV2.cs` — which lives in the same `Octopus.Tentacle.Core` project as `RunningScript.cs` — constructs `RunningScript` without `abandonToken`. Task 11 updates that call site; to get a green build (and a runnable test) before then, temporarily pass `CancellationToken.None` as the `abandonToken` argument in `LaunchShell`. + +- [ ] **Step 6: Commit** + +```bash +git add source/Octopus.Tentacle.Core/Services/Scripts/RunningScript.cs source/Octopus.Tentacle.Tests/Util/RunningScriptFixture.cs +git commit -m "Plumb abandon token through RunningScript; covered by new test" +``` + +--- + +### Task 11: Implement `ScriptServiceV2.AbandonScriptAsync` and add `abandonTokenSource` to wrapper + +**Files:** +- Modify: `source/Octopus.Tentacle.Core/Services/Scripts/ScriptServiceV2.cs` +- Modify: `source/Octopus.Tentacle.Tests/Integration/ScriptServiceV2Fixture.cs` (existing fixture) + +- [ ] **Step 1: Write failing service-layer tests** + +In `ScriptServiceV2Fixture.cs`, add these tests: + +```csharp +[Test] +public async Task AbandonScript_OnUnknownTicket_ReturnsCompleteWithUnknownScriptExitCode() +{ + var service = CreateService(); + var ticket = new ScriptTicket("unknown"); + var response = await service.AbandonScriptAsync(new AbandonScriptCommandV2(ticket, 0), CancellationToken.None); + + response.State.Should().Be(ProcessState.Complete); + response.ExitCode.Should().Be(ScriptExitCodes.UnknownScriptExitCode); +} + +[Test] +public async Task AbandonScript_OnRunningScript_FiresAbandonToken_ReleasesMutex_ReturnsAbandonedExitCode() +{ + var service = CreateService(); + + // start a script that will block on a file-wait, so it stays Running until we release it + var startCommand = BuildLongRunningCommand(); // uses TestExecuteShellScriptCommandBuilder + await service.StartScriptAsync(startCommand, CancellationToken.None); + + var response = await service.AbandonScriptAsync( + new AbandonScriptCommandV2(startCommand.ScriptTicket, 0), +
CancellationToken.None); + + response.State.Should().Be(ProcessState.Complete); + response.ExitCode.Should().Be(ScriptExitCodes.AbandonedExitCode); + + // mutex should be free: a new FullIsolation script should start + var second = BuildLongRunningCommand(); + var secondResponse = await service.StartScriptAsync(second, CancellationToken.None); + secondResponse.State.Should().NotBe(ProcessState.Pending); // i.e. wasn't blocked on the mutex +} + +[Test] +public async Task AbandonScript_OnAlreadyCompletedScript_ReturnsRealExitCodeNotAbandoned() +{ + var service = CreateService(); + var startCommand = BuildShortRunningCommand(exitCode: 0); // completes quickly + + await service.StartScriptAsync(startCommand, CancellationToken.None); + + // wait for completion + ScriptStatusResponseV2 status; + do { status = await service.GetStatusAsync(new ScriptStatusRequestV2(startCommand.ScriptTicket, 0), CancellationToken.None); } + while (status.State != ProcessState.Complete); + + var abandonResponse = await service.AbandonScriptAsync(new AbandonScriptCommandV2(startCommand.ScriptTicket, 0), CancellationToken.None); + abandonResponse.ExitCode.Should().Be(0, "real exit code should be returned, not AbandonedExitCode"); +} +``` + +Run: expect compile failures (the stub from Task 3 throws NotImplementedException; the assertions will fail). + +- [ ] **Step 2: Implement `AbandonScriptAsync`** + +Replace the Task 3 stub with the real implementation. Also add `abandonTokenSource` to `RunningScriptWrapper`: + +```csharp +class RunningScriptWrapper : IDisposable +{ + readonly CancellationTokenSource cancellationTokenSource = new(); + readonly CancellationTokenSource abandonTokenSource = new(); + + public RunningScriptWrapper(ScriptStateStore scriptStateStore) + { + ScriptStateStore = scriptStateStore; + CancellationToken = cancellationTokenSource.Token; + AbandonToken = abandonTokenSource.Token; + } + + public RunningScript? 
Process { get; set; } + public ScriptStateStore ScriptStateStore { get; } + public SemaphoreSlim StartScriptMutex { get; } = new(1, 1); + + public CancellationToken CancellationToken { get; } + public CancellationToken AbandonToken { get; } + + public void Cancel() => cancellationTokenSource.Cancel(); + public void Abandon() => abandonTokenSource.Cancel(); + + public void Dispose() + { + cancellationTokenSource.Dispose(); + abandonTokenSource.Dispose(); + } +} +``` + +Replace the stub `AbandonScriptAsync`: + +```csharp +public async Task AbandonScriptAsync(AbandonScriptCommandV2 command, CancellationToken cancellationToken) +{ + await Task.CompletedTask; + + if (runningScripts.TryGetValue(command.Ticket, out var runningScript)) + { + runningScript.Abandon(); + } + + return GetResponse(command.Ticket, command.LastLogSequence, runningScript?.Process); +} +``` + +In `LaunchShell`, pass `abandonToken` through: + +```csharp +RunningScript LaunchShell(ScriptTicket ticket, string serverTaskId, IScriptWorkspace workspace, IScriptStateStore stateStore, CancellationToken cancellationToken, CancellationToken abandonToken) +{ + var runningScript = new RunningScript(shell, workspace, stateStore, workspace.CreateLog(), serverTaskId, scriptIsolationMutex, cancellationToken, abandonToken, environmentVariables, powerShellStartupTimeout, log); + _ = Task.Run(async () => await runningScript.Execute()); + return runningScript; +} +``` + +Update the call site of `LaunchShell` in `StartScriptAsync` to pass `runningScript.AbandonToken`. + +- [ ] **Step 3: Run the new tests** + +```bash +dotnet test source/Octopus.Tentacle.Tests --filter "FullyQualifiedName~ScriptServiceV2Fixture.AbandonScript" +``` + +Expected: all three new tests pass. 
+ +- [ ] **Step 4: Commit** + +```bash +git add source/Octopus.Tentacle.Core/Services/Scripts/ScriptServiceV2.cs source/Octopus.Tentacle.Tests/Integration/ScriptServiceV2Fixture.cs +git commit -m "Implement ScriptServiceV2.AbandonScriptAsync with abandon-token wrapper" +``` + +--- + +### Task 12: Targeted best-effort `CompleteScript` + +**Files:** +- Modify: `source/Octopus.Tentacle.Core/Services/Scripts/ScriptServiceV2.cs` +- Modify: `source/Octopus.Tentacle.Tests/Integration/ScriptServiceV2Fixture.cs` + +- [ ] **Step 1: Write a failing test** + +Add to `ScriptServiceV2Fixture.cs`: + +```csharp +[Test] +public async Task CompleteScript_AfterAbandon_WhenWorkspaceDeleteFails_LogsWarnAndReturnsNormally() +{ + var service = CreateService(); // factory should let us inject a workspace whose Delete throws IOException + var startCommand = BuildLongRunningCommand(); + await service.StartScriptAsync(startCommand, CancellationToken.None); + await service.AbandonScriptAsync(new AbandonScriptCommandV2(startCommand.ScriptTicket, 0), CancellationToken.None); + + // arrange the workspace.Delete to fail + ArrangeWorkspaceDeleteToThrow(startCommand.ScriptTicket, new IOException("file in use")); + + var complete = async () => await service.CompleteScriptAsync(new CompleteScriptCommandV2(startCommand.ScriptTicket, 0), CancellationToken.None); + + await complete.Should().NotThrowAsync(); + // assert: systemLog received a Warn entry mentioning the leaked directory + fakeSystemLog.WarnMessages.Should().Contain(m => m.Contains("Could not delete") && m.Contains(startCommand.ScriptTicket.TaskId)); +} + +[Test] +public async Task CompleteScript_AfterNormalCompletion_WhenWorkspaceDeleteFails_PropagatesException() +{ + var service = CreateService(); + var startCommand = BuildShortRunningCommand(exitCode: 0); + await service.StartScriptAsync(startCommand, CancellationToken.None); + + // poll until natural completion + ScriptStatusResponseV2 status; + var deadline = 
DateTime.UtcNow.AddSeconds(30); + do + { + status = await service.GetStatusAsync(new ScriptStatusRequestV2(startCommand.ScriptTicket, 0), CancellationToken.None); + if (status.State == ProcessState.Complete) break; + await Task.Delay(50); + } while (DateTime.UtcNow < deadline); + status.State.Should().Be(ProcessState.Complete); + status.ExitCode.Should().Be(0, "the script exited cleanly, not via abandon"); + + ArrangeWorkspaceDeleteToThrow(startCommand.ScriptTicket, new IOException("file in use")); + + var complete = async () => await service.CompleteScriptAsync(new CompleteScriptCommandV2(startCommand.ScriptTicket, 0), CancellationToken.None); + + await complete.Should().ThrowAsync(); +} +``` + +Run: expect both to fail (current code propagates the exception unconditionally). + +- [ ] **Step 2: Update `CompleteScriptAsync`** + +Replace the existing implementation: + +```csharp +public async Task CompleteScriptAsync(CompleteScriptCommandV2 command, CancellationToken cancellationToken) +{ + if (runningScripts.TryRemove(command.Ticket, out var runningScript)) + { + runningScript.Dispose(); + } + + var workspace = workspaceFactory.GetWorkspace(command.Ticket, WorkspaceReadinessCheck.Skip); + + var stateStore = scriptStateStoreFactory.Create(workspace); + var wasAbandoned = stateStore.Exists() + && stateStore.Load().ExitCode == ScriptExitCodes.AbandonedExitCode; + + if (wasAbandoned) + { + try + { + await workspace.Delete(cancellationToken); + } + catch (Exception ex) + { + log.Warn(ex, $"Could not delete abandoned workspace at {workspace.WorkingDirectory}. Leaving on disk; the underlying script process may still hold open file handles."); + } + } + else + { + await workspace.Delete(cancellationToken); + } +} +``` + +- [ ] **Step 3: Run the new tests** + +```bash +dotnet test source/Octopus.Tentacle.Tests --filter "FullyQualifiedName~ScriptServiceV2Fixture.CompleteScript" +``` + +Expected: both new tests pass; existing CompleteScript tests still pass. 
+ +- [ ] **Step 4: Commit** + +```bash +git add source/Octopus.Tentacle.Core/Services/Scripts/ScriptServiceV2.cs source/Octopus.Tentacle.Tests/Integration/ScriptServiceV2Fixture.cs +git commit -m "Best-effort workspace.Delete gated on AbandonedExitCode" +``` + +--- + +### Task 13: Advertise `AbandonScriptV2` capability + +**Files:** +- Modify: `source/Octopus.Tentacle/Services/Capabilities/CapabilitiesServiceV2.cs` +- Modify: `source/Octopus.Tentacle.Tests/Capabilities/CapabilitiesServiceV2Fixture.cs` (existing fixture) + +- [ ] **Step 1: Write the failing test** + +In `CapabilitiesServiceV2Fixture.cs`: + +```csharp +[Test] +public async Task GetCapabilities_OnNonKubernetesTentacle_AdvertisesAbandonScriptV2() +{ + var service = new CapabilitiesServiceV2(); + var response = await service.GetCapabilitiesAsync(CancellationToken.None); + response.SupportedCapabilities.Should().Contain("AbandonScriptV2"); +} + +[Test] +public async Task GetCapabilities_OnKubernetesTentacle_DoesNotAdvertiseAbandonScriptV2() +{ + // arrange KubernetesSupportDetection.IsRunningAsKubernetesAgent = true (test-only override; mirror existing pattern in the fixture) + var service = new CapabilitiesServiceV2(); + var response = await service.GetCapabilitiesAsync(CancellationToken.None); + response.SupportedCapabilities.Should().NotContain("AbandonScriptV2"); +} +``` + +Run: expect both to fail. + +- [ ] **Step 2: Add the capability string** + +In `CapabilitiesServiceV2.cs`: + +```csharp +return new CapabilitiesResponseV2(new List +{ + nameof(IScriptService), + nameof(IFileTransferService), + nameof(IScriptServiceV2), + "AbandonScriptV2" +}); +``` + +- [ ] **Step 3: Run the tests** + +```bash +dotnet test source/Octopus.Tentacle.Tests --filter "FullyQualifiedName~CapabilitiesServiceV2Fixture.GetCapabilities" +``` + +Expected: both new tests pass; existing capability tests still pass. 
+ +- [ ] **Step 4: Commit** + +```bash +git add source/Octopus.Tentacle/Services/Capabilities/CapabilitiesServiceV2.cs source/Octopus.Tentacle.Tests/Capabilities/CapabilitiesServiceV2Fixture.cs +git commit -m "Advertise AbandonScriptV2 capability" +``` + +--- + +### Task 14: Integration test — mutex release on abandon + +**Files:** +- Create: `source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs` + +This is the load-bearing end-to-end test. Mirrors `ClientScriptExecutionIsolationMutex.cs`. Uses the existing builders (`TestExecuteShellScriptCommandBuilder`, `ScriptBuilder`, `Wait.For`, `TentacleServiceDecoratorBuilder`) — do NOT use raw shell + `Thread.Sleep`. + +- [ ] **Step 1: Create the file** + +```csharp +using System; +using System.Collections; +using System.IO; +using System.Threading.Tasks; +using FluentAssertions; +using NUnit.Framework; +using Octopus.Tentacle.Contracts; +using Octopus.Tentacle.Contracts.ScriptServiceV2; +using Octopus.Tentacle.Tests.Integration.Support; +using Octopus.Tentacle.Tests.Integration.Util; +using Octopus.Tentacle.Tests.Integration.Util.Builders; + +namespace Octopus.Tentacle.Tests.Integration +{ + [IntegrationTestTimeout] + public class ClientScriptExecutionAbandon : IntegrationTest + { + [Test] + [TentacleConfigurations(scriptServiceToTest: ScriptServiceVersionToTest.V2)] + public async Task AbandonScript_WhileScriptIsRunning_ReleasesMutexAndReturnsAbandonedExitCode(TentacleConfigurationTestCase tcc) + { + await using var clientTentacle = await tcc.CreateBuilder() + .WithTentacleEnvironmentVariable("TentacleDebugDisableProcessKill", "1") // make Hitman a no-op + .Build(CancellationToken); + + var startFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "start"); + var releaseFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "release"); + + // first script: signals "started" then blocks until release file appears + var firstCommand = new 
TestExecuteShellScriptCommandBuilder() + .SetScriptBody(new ScriptBuilder() + .CreateFile(startFile) + .WaitForFileToExist(releaseFile)) + .WithIsolationLevel(ScriptIsolationLevel.FullIsolation) + .WithIsolationMutexName("abandon-test-mutex") + .Build(); + + var tentacleClient = clientTentacle.TentacleClient; + + var firstScriptExecution = Task.Run(async () => await tentacleClient.ExecuteScript(firstCommand, CancellationToken)); + + // wait until the first script is actually running + await Wait.For(() => File.Exists(startFile), + TimeSpan.FromSeconds(30), + () => throw new Exception("first script did not start"), + CancellationToken); + + // cancel first (kill is mocked off, so the script keeps running) + await tentacleClient.ScriptServiceV2.CancelScriptAsync(new CancelScriptCommandV2(firstCommand.ScriptTicket, 0), CancellationToken); + + // give cancel a moment to be attempted; then abandon + await Task.Delay(TimeSpan.FromSeconds(1)); + + var abandonResponse = await tentacleClient.ScriptServiceV2.AbandonScriptAsync(new AbandonScriptCommandV2(firstCommand.ScriptTicket, 0), CancellationToken); + abandonResponse.State.Should().Be(ProcessState.Complete); + abandonResponse.ExitCode.Should().Be(ScriptExitCodes.AbandonedExitCode); + + // load-bearing: second FullIsolation script should now start, proving the mutex was released + var secondStartFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "second-start"); + var secondCommand = new TestExecuteShellScriptCommandBuilder() + .SetScriptBody(new ScriptBuilder().CreateFile(secondStartFile)) + .WithIsolationLevel(ScriptIsolationLevel.FullIsolation) + .WithIsolationMutexName("abandon-test-mutex") + .Build(); + + var secondResult = await tentacleClient.ExecuteScript(secondCommand, CancellationToken); + secondResult.response.ExitCode.Should().Be(0); + File.Exists(secondStartFile).Should().BeTrue(); + + // release the first script so the test process doesn't leak forever + File.WriteAllText(releaseFile, ""); 
+ } + } +} +``` + +If `WithTentacleEnvironmentVariable` doesn't exist on the builder, add it as a small helper in `ClientAndTentacleBuilder` and propagate to the Tentacle process startup environment. + +- [ ] **Step 2: Run the new test on Linux** + +```bash +dotnet test source/Octopus.Tentacle.Tests.Integration --filter "ClientScriptExecutionAbandon" +``` + +Expected: passes. + +- [ ] **Step 3: Run on Windows CI** + +Push to the branch and verify the Windows CI job passes. + +- [ ] **Step 4: Commit** + +```bash +git add source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs +git commit -m "Integration test: AbandonScript releases mutex when kill mocked off" +``` + +--- + +### Task 15: Integration test — multi-level-deep hang variant + +**Files:** +- Modify: `source/Octopus.Tentacle.CommonTestUtils/Builders/ScriptBuilder.cs` (add `AppendRaw` helper) +- Modify: `source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs` (add second test) + +- [ ] **Step 0: Add `AppendRaw` to `ScriptBuilder`** + +The existing `ScriptBuilder` doesn't have a way to inject shell-specific raw command lines. Add this helper near `Print` / `Sleep`: + +```csharp +public ScriptBuilder AppendRaw(string bash, string windows) +{ + bashScript.AppendLine(bash); + windowsScript.AppendLine(windows); + return this; +} +``` + +Commit this separately so the helper is available before the multi-level test depends on it: + +```bash +git add source/Octopus.Tentacle.CommonTestUtils/Builders/ScriptBuilder.cs +git commit -m "Add ScriptBuilder.AppendRaw for shell-specific command injection" +``` + +The ticket explicitly asks for a "multi-level-deep hang (bootstrap → Calamari → script → AV)" test. 
+ +- [ ] **Step 1: Add the test** + +```csharp +[Test] +[TentacleConfigurations(scriptServiceToTest: ScriptServiceVersionToTest.V2)] +public async Task AbandonScript_MultiLevelDeepHang_StillReleasesMutex(TentacleConfigurationTestCase tcc) +{ + await using var clientTentacle = await tcc.CreateBuilder() + .WithTentacleEnvironmentVariable("TentacleDebugDisableProcessKill", "1") + .Build(CancellationToken); + + var startFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "start"); + var releaseFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "release"); + + // Multi-level chain: Tentacle runs the outer shell (bootstrap), which launches a child shell + // which itself launches a grandchild that polls for the release file. Mirrors + // bootstrap → Calamari → user script. + var script = new ScriptBuilder() + .CreateFile(startFile) + .AppendRaw( + bash: $"bash -c \"bash -c 'while [ ! -f {releaseFile.Replace("\\", "/")} ]; do sleep 0.5; done'\"", + windows: $"powershell -NoProfile -Command \"powershell -NoProfile -Command 'while (-not (Test-Path \\\"{releaseFile}\\\")) {{ Start-Sleep -Milliseconds 500 }}'\""); + + var command = new TestExecuteShellScriptCommandBuilder() + .SetScriptBody(script) + .WithIsolationLevel(ScriptIsolationLevel.FullIsolation) + .WithIsolationMutexName("abandon-multilevel-mutex") + .Build(); + + var tentacleClient = clientTentacle.TentacleClient; + var firstExecution = Task.Run(async () => await tentacleClient.ExecuteScript(command, CancellationToken)); + await Wait.For(() => File.Exists(startFile), + TimeSpan.FromSeconds(30), + () => throw new Exception("multi-level script did not start"), + CancellationToken); + + await tentacleClient.ScriptServiceV2.CancelScriptAsync(new CancelScriptCommandV2(command.ScriptTicket, 0), CancellationToken); + await Task.Delay(TimeSpan.FromSeconds(1)); + + var abandonResponse = await tentacleClient.ScriptServiceV2.AbandonScriptAsync(new AbandonScriptCommandV2(command.ScriptTicket, 
0), CancellationToken); + abandonResponse.ExitCode.Should().Be(ScriptExitCodes.AbandonedExitCode); + + // mutex released check (same as Task 14) + var secondCommand = new TestExecuteShellScriptCommandBuilder() + .SetScriptBody(new ScriptBuilder().CreateFile(Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "second"))) + .WithIsolationLevel(ScriptIsolationLevel.FullIsolation) + .WithIsolationMutexName("abandon-multilevel-mutex") + .Build(); + var secondResult = await tentacleClient.ExecuteScript(secondCommand, CancellationToken); + secondResult.response.ExitCode.Should().Be(0); + + File.WriteAllText(releaseFile, ""); +} +``` + +- [ ] **Step 2: Run** + +```bash +dotnet test source/Octopus.Tentacle.Tests.Integration --filter "AbandonScript_MultiLevelDeepHang" +``` + +Expected: passes. + +- [ ] **Step 3: Commit** + +```bash +git add source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs +git commit -m "Integration test: multi-level-deep hang abandons cleanly" +``` + +--- + +### Task 16: Full test suite + push for CI + +- [ ] **Step 1: Run the entire test suite locally** + +```bash +dotnet test source/Tentacle.sln +``` + +Expected: all tests pass on Linux. (Windows-only tests will skip locally if not on Windows.) + +- [ ] **Step 2: Push for CI** + +```bash +git push +``` + +Wait for the GitHub Actions check on PR #1226. All matrices (Linux, Windows, both target frameworks) must pass. + +- [ ] **Step 3: Address any platform-specific failures** + +Most likely areas: +- Workspace-cleanup test on Linux: Linux generally allows deletion of open files (the inode survives until handles close). The "delete fails" test may need a Windows-only attribute. +- Thread-count assertion timing: bump the delta tolerance if CI jitter is higher than dev box. 
+ +- [ ] **Step 4: Final commit (if any fixes needed)** + +```bash +git add <files changed in Step 3> +git commit -m "Address CI platform-specific test failures" +git push +``` + +--- + +## Self-review checklist (run after writing the plan, before handing off) + +- [ ] Spec coverage: every section of `docs/superpowers/specs/2026-05-21-tentacle-script-abandon-design.md` maps to at least one task above. +- [ ] No `TODO`, `TBD`, `implement later`, or "add appropriate error handling" placeholders. +- [ ] Type/method names consistent across tasks (`ExecuteCommandAsync`, `AbandonScriptCommandV2`, `AbandonedExitCode`, `abandonToken`, `AbandonScriptAsync`). +- [ ] Every code step shows the actual code, not a description. +- [ ] Every command step shows the exact command and the expected outcome. + +## Notes for execution + +- **Frequent commits.** Each task above is one commit. Don't bundle. +- **Build green between tasks.** Task 3 introduces a `NotImplementedException` stub precisely so the build stays green between contracts and the implementation in Task 11. +- **Test cleanup.** Several integration tests leave a running PowerShell / bash sleep process behind (because `TentacleDebugDisableProcessKill` is set). The tests must release them via the release-file pattern. Forgetting cleanup will leak processes on the CI box. +- **Coordination with server-side.** Server-side session is on a parallel branch in `OctopusDeploy/OctopusDeploy`. Once both PRs are mergeable, coordinate the contract package version bump so Server picks up the new contract in lockstep.
diff --git a/docs/superpowers/plans/2026-05-25-split-async-migration-from-abandon-feature-plan.md b/docs/superpowers/plans/2026-05-25-split-async-migration-from-abandon-feature-plan.md new file mode 100644 index 000000000..70f5db598 --- /dev/null +++ b/docs/superpowers/plans/2026-05-25-split-async-migration-from-abandon-feature-plan.md @@ -0,0 +1,1012 @@ +# Split Async Migration from Abandon Feature — Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Restructure the existing PR stack so the async migration of `SilentProcessRunner` sits in its own foundational PR, and the abandon feature (PR #1226) stacks on top of it. + +**Architecture:** End-state rebuild. Create a fresh branch from `main` containing only the async migration + sync-boundary comments. Then force-push #1226 with the abandon delta on top. PR #1235 rebases on the new #1226. + +**Tech Stack:** C# (.NET 8 + net48 polyfill), Autofac DI, NUnit tests, git worktree workflow. + +**Spec:** `docs/superpowers/specs/2026-05-25-split-async-migration-from-abandon-feature-design.md` + +**Reference state:** The current tip of `jimpelletier/eft-3295-tentacle-script-abandonment-to-release-the-mutex` is `583eb46c` (now: `46f09e7e` after the spec commit). The diff from `main` to that commit contains BOTH PRs' content combined. 
+ +--- + +## Phase 0 — Preparation + +### Task 0.1: Tag the current state as a safety reference + +**Files:** none (git only) + +- [ ] **Step 1: Tag the current abandon branch tip** + +```bash +cd /Users/jim/code/OctopusTentacle +git tag claude-safety-2026-05-25-pre-split jimpelletier/eft-3295-tentacle-script-abandonment-to-release-the-mutex +git tag claude-safety-2026-05-25-pre-split-1235 jimpelletier/eft-3295-async-signature-propagation +``` + +- [ ] **Step 2: Verify tags** + +```bash +git tag -l "claude-safety-*" +``` + +Expected: at least these two tags listed. + +--- + +## Phase 1 — Build the base PR + +### Task 1.1: Create new base branch from main + +**Files:** none (git only) + +- [ ] **Step 1: Fetch main** + +```bash +cd /Users/jim/code/OctopusTentacle +git fetch origin main +``` + +- [ ] **Step 2: Create the new branch from origin/main** + +```bash +git checkout -b jimpelletier/eft-3295-async-migration-base origin/main +``` + +- [ ] **Step 3: Verify** + +```bash +git log --oneline -1 +``` + +Expected: latest commit on `main`. + +--- + +### Task 1.2: Migrate `SilentProcessRunner.ExecuteCommand` to async + +**Files:** +- Modify: `source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs` + +The goal: change the method from sync to async with the MINIMUM internal changes. The `cancel` token is passed to `WaitForExitAsync(cancel)` so cancel still throws OCE and unwinds. `DoOurBestToCleanUp` remains unchanged — including the `process.Close()` call. `SafelyWaitForAllOutput` remains unchanged. 
+ +Use the `claude-safety-2026-05-25-pre-split` tag to see what the final state in `583eb46c` looks like, but ONLY take: +- The method signature change to `async Task<int> ExecuteCommandAsync(...)` (without `abandon` parameter) +- The internal `process.WaitForExit()` → `await process.WaitForExitAsync(cancel)` change +- The net48 polyfill `WaitForExitAsyncNetFramework` +- `process.EnableRaisingEvents = true` if it's needed for the polyfill + +DO NOT take: +- The `abandon` parameter on the method +- Any changes to `DoOurBestToCleanUp` (keep `process.Close()` as it was on main) +- Any changes to `SafelyWaitForAllOutput` comments +- Any `OperationCanceledException when (abandon.IsCancellationRequested && !process.HasExited)` catch block + +- [ ] **Step 1: Read the file on main** + +```bash +git show origin/main:source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs > /tmp/srp_main.cs +git show claude-safety-2026-05-25-pre-split:source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs > /tmp/srp_target.cs +``` + +- [ ] **Step 2: Construct the base-PR version manually** + +Use the main version as a starting point.
Apply the minimum needed for async: +- Change `public static int ExecuteCommand(` → `public static async Task<int> ExecuteCommandAsync(` +- Add `using System.Threading.Tasks;` +- Inside the method, find `process.WaitForExit();` and change to: + ```csharp + #if NETFRAMEWORK + await WaitForExitAsyncNetFramework(process, cancel).ConfigureAwait(false); + #else + await process.WaitForExitAsync(cancel).ConfigureAwait(false); + #endif + ``` +- Set `process.EnableRaisingEvents = true;` before `process.Start();` (needed so the polyfill's `Process.Exited` event fires) +- Add the `WaitForExitAsyncNetFramework` polyfill at the end of the class, inside an `#if NETFRAMEWORK` block: + ```csharp + #if NETFRAMEWORK + static Task WaitForExitAsyncNetFramework(Process process, CancellationToken cancellationToken) + { + var tcs = new TaskCompletionSource<object?>(TaskCreationOptions.RunContinuationsAsynchronously); + CancellationTokenRegistration registration = default; + + void OnExited(object? sender, EventArgs e) + { + registration.Dispose(); + tcs.TrySetResult(null); + } + + process.Exited += OnExited; + if (process.HasExited) + { + tcs.TrySetResult(null); + } + if (cancellationToken.CanBeCanceled) + { + registration = cancellationToken.Register(() => + { + process.Exited -= OnExited; + tcs.TrySetCanceled(cancellationToken); + }); + } + return tcs.Task; + } + #endif + ``` + +- [ ] **Step 3: Verify the file compiles standalone** + +```bash +dotnet build source/Octopus.Tentacle.Core/Octopus.Tentacle.Core.csproj +``` + +Expected: build succeeds. Errors will likely be in callers, not this file. + +- [ ] **Step 4: Commit** + +```bash +git add source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs +git commit -m "$(cat <<'COMMIT' +Migrate SilentProcessRunner.ExecuteCommand to async + +Replaces the sync WaitForExit() with await WaitForExitAsync(cancel).
+The cancel token is passed directly so the existing cancel semantics +are preserved: cancel firing throws OCE from the await and unwinds. +DoOurBestToCleanUp continues to fire on cancel via cancel.Register +exactly as it did in the sync version. + +Adds a net48 polyfill for WaitForExitAsync using Process.Exited + +TaskCompletionSource. + +Co-Authored-By: Claude Opus 4.7 (1M context) +COMMIT +)" +``` + +--- + +### Task 1.3: Migrate `ISilentProcessRunner` to async + +**Files:** +- Modify: `source/Octopus.Tentacle/Util/ISilentProcessRunner.cs` + +The interface defines the contract for `SilentProcessRunner.ExecuteCommand`. It will need an `ExecuteCommandAsync` method matching the new signature on the static `SilentProcessRunner` class. + +- [ ] **Step 1: Read main version** + +```bash +git show origin/main:source/Octopus.Tentacle/Util/ISilentProcessRunner.cs +``` + +- [ ] **Step 2: Read target version for reference** + +```bash +git show claude-safety-2026-05-25-pre-split:source/Octopus.Tentacle/Util/ISilentProcessRunner.cs +``` + +- [ ] **Step 3: Construct the base-PR version** + +Take the target version. Remove ANY `abandon` parameter. Change return type of methods from `int` to `Task<int>`. Add `CancellationToken cancel = default` if not already present. + +Replace the `SilentProcessRunnerExtended` (or similar wrapper) implementation so it calls `SilentProcessRunner.ExecuteCommandAsync(...)` and awaits/returns the Task — NO `.GetAwaiter().GetResult()` inside.
+ +- [ ] **Step 4: Commit** + +```bash +git add source/Octopus.Tentacle/Util/ISilentProcessRunner.cs +git commit -m "$(cat <<'COMMIT' +Migrate ISilentProcessRunner to async + +Co-Authored-By: Claude Opus 4.7 (1M context) +COMMIT +)" +``` + +--- + +### Task 1.4: Migrate `CommandLineRunner` and `CommandLineInvocation` to async + +**Files:** +- Modify: `source/Octopus.Tentacle/Util/CommandLineRunner.cs` +- Modify: `source/Octopus.Tentacle.Core/Util/CommandLine/CommandLineInvocation.cs` (only if it has an Execute method) + +`CommandLineRunner` wraps `SilentProcessRunner` and is consumed by Kubernetes integration tests and CLI helpers. Its public `Execute` method keeps its synchronous signature (see Step 3); only the call it makes into the process runner changes to the async API. + +`CommandLineInvocation.ExecuteCommandAsync()` is referenced from `SystemCtlHelper`, `LinuxServiceConfigurator`, `WindowsServiceConfigurator`. If this method exists on `CommandLineInvocation`, migrate it to async (no `abandon` param). + +- [ ] **Step 1: Check whether CommandLineInvocation has an Execute method** + +```bash +grep -n "Execute" source/Octopus.Tentacle.Core/Util/CommandLine/CommandLineInvocation.cs 2>/dev/null || echo "no Execute method in CommandLineInvocation" +``` + +- [ ] **Step 2: Read both versions** + +```bash +git show origin/main:source/Octopus.Tentacle/Util/CommandLineRunner.cs +git show claude-safety-2026-05-25-pre-split:source/Octopus.Tentacle/Util/CommandLineRunner.cs +``` + +- [ ] **Step 3: Construct the base-PR version** + +Do NOT rename `CommandLineRunner.Execute` to `ExecuteAsync`: keep `Execute` as a synchronous method (the existing public method is sync and consumed by the WPF installer, which must remain sync). Inside `Execute`, where it calls the underlying process runner: it currently does so via `.GetAwaiter().GetResult()`. KEEP that. The improved comment goes on the GetAwaiter line: + +```csharp +// We're in CommandLineRunner.Execute, called from the WPF installer (Octopus.Manager.Tentacle) +// running on a thread-pool worker after the installer hands off to our process-runner helper.
+// CommandLineRunner.Execute itself must return synchronously because the installer's UI flow +// is sync. We block on the async call with .GetAwaiter().GetResult(). +// This is safe because we're on a plain thread-pool worker. The risk with blocking on async +// is a deadlock: if the async work needs to resume on the same thread that's blocked waiting +// for it, neither can make progress. Thread-pool workers don't have that constraint — the +// async work can pick up on any free thread when it finishes, so the block resolves normally. +var exitCode = SilentProcessRunner.ExecuteCommandAsync(/* args */).GetAwaiter().GetResult(); +``` + +- [ ] **Step 4: Commit** + +```bash +git add source/Octopus.Tentacle/Util/CommandLineRunner.cs source/Octopus.Tentacle.Core/Util/CommandLine/CommandLineInvocation.cs 2>/dev/null +git commit -m "Migrate CommandLineRunner and CommandLineInvocation to async + +Co-Authored-By: Claude Opus 4.7 (1M context) " +``` + +--- + +### Task 1.5: Migrate `RunningScript` to async (no abandon token) + +**Files:** +- Modify: `source/Octopus.Tentacle.Core/Services/Scripts/RunningScript.cs` + +`RunningScript.RunScript()` calls `SilentProcessRunner.ExecuteCommand`. Now that ExecuteCommand is async, RunScript must also become async. RunningScript stays WITHOUT the abandon token in the base PR. 
+ +- [ ] **Step 1: Read both versions for reference** + +```bash +git show origin/main:source/Octopus.Tentacle.Core/Services/Scripts/RunningScript.cs | head -100 +git show claude-safety-2026-05-25-pre-split:source/Octopus.Tentacle.Core/Services/Scripts/RunningScript.cs | head -100 +``` + +- [ ] **Step 2: Build the base version** + +Take the target version and remove: +- `CancellationToken abandonToken` constructor parameter +- `abandonToken` field +- `abandon: abandonToken` argument when calling `ExecuteCommandAsync` +- Any `OperationCanceledException when (abandonToken.IsCancellationRequested)` catch branches +- Any references to `AbandonedExitCode` (those aren't in `ScriptExitCodes` yet) + +Make the public method async: `RunScript` → `RunScriptAsync` returning `Task<int>`. + +- [ ] **Step 3: Commit** + +```bash +git add source/Octopus.Tentacle.Core/Services/Scripts/RunningScript.cs +git commit -m "Migrate RunningScript to async + +Co-Authored-By: Claude Opus 4.7 (1M context) " +``` + +--- + +### Task 1.6: Migrate `ScriptServiceV2` callsite to async (no abandon) + +**Files:** +- Modify: `source/Octopus.Tentacle.Core/Services/Scripts/ScriptServiceV2.cs` + +`ScriptServiceV2.StartScriptAsync` calls into `RunningScript.RunScript`. Update to await `RunScriptAsync`. Do NOT add `AbandonScriptAsync` here yet.
Remove all abandon-specific additions in the target version: +- No `AbandonScriptAsync` method +- No `RunningScriptWrapper.AbandonTokenSource` / `Abandon()` method +- No `AbandonedExitCode` references +- No abandon-specific workspace deletion logic + +- [ ] **Step 3: Commit** + +```bash +git add source/Octopus.Tentacle.Core/Services/Scripts/ScriptServiceV2.cs +git commit -m "Update ScriptServiceV2 to await async RunScriptAsync + +Co-Authored-By: Claude Opus 4.7 (1M context) " +``` + +--- + +### Task 1.7: Update the six sync↔async boundary sites with improved comments + +**Files:** +- Modify: `source/Octopus.Manager.Tentacle/PreReq/PowerShellPrerequisite.cs` +- Modify: `source/Octopus.Tentacle/Kubernetes/KubernetesDirectoryInformationProvider.cs` +- Modify: `source/Octopus.Tentacle/Util/SystemCtlHelper.cs` +- Modify: `source/Octopus.Tentacle/Startup/LinuxServiceConfigurator.cs` +- Modify: `source/Octopus.Tentacle/Startup/WindowsServiceConfigurator.cs` + +Each of these sites was an immediate consumer of `ExecuteCommand` in main. After Task 1.2 they need to call `ExecuteCommandAsync` and either await it (if they can become async) or block with `.GetAwaiter().GetResult()` (if they cannot). + +None of these sites can become async in this PR — they implement sync interfaces (IPrerequisite, IMemoryCache factory, IServiceConfigurator) or are called from sync framework code (Topshelf). They all use `.GetAwaiter().GetResult()` with an explanatory comment. + +Comment template (adapt the specifics per site): + +``` +// We're in [SHORT DESCRIPTION OF SITE]. [WHY IT MUST BE SYNC — interface +// constraint, framework callback, etc.]. We block on the async call with +// .GetAwaiter().GetResult(). +// This is safe because we're on a plain thread-pool worker. The risk with +// blocking on async is a deadlock: if the async work needs to resume on +// the same thread that's blocked waiting for it, neither can make progress. 
+// Thread-pool workers don't have that constraint — the async work can +// pick up on any free thread when it finishes, so the block resolves normally. +``` + +- [ ] **Step 1: Update PowerShellPrerequisite** + +Site: `source/Octopus.Manager.Tentacle/PreReq/PowerShellPrerequisite.cs`. The `Check()` method calls `SilentProcessRunner.ExecuteCommandAsync(...).GetAwaiter().GetResult()`. Comment specifics: "We're in the WPF installer prerequisite check. IPrerequisite.Check() must return synchronously — there's no async version of the interface — so we block..." + +Reference state: `git show claude-safety-2026-05-25-pre-split:source/Octopus.Manager.Tentacle/PreReq/PowerShellPrerequisite.cs` + +Copy that file's content directly — it has the right comment already. + +- [ ] **Step 2: Update KubernetesDirectoryInformationProvider** + +Site: `source/Octopus.Tentacle/Kubernetes/KubernetesDirectoryInformationProvider.cs`. Method `GetDriveBytesUsingDu` is called from inside an `IMemoryCache.GetOrCreate` factory (a `Func` — sync). Comment specifics: "We're in the IMemoryCache.GetOrCreate factory that populates the disk-space cache entry. The cache factory delegate is synchronous (`Func`) so we block on the async call with `.GetAwaiter().GetResult()`..." + +Take this content from the safety tag, BUT verify it does not include any async chain propagation (it shouldn't — we never propagated this in the abandon PR). It should be `GetPathUsedBytes` (sync) with GetAwaiter on the du call. + +```bash +git show claude-safety-2026-05-25-pre-split:source/Octopus.Tentacle/Kubernetes/KubernetesDirectoryInformationProvider.cs +``` + +If the file at safety tag has `GetPathUsedBytesAsync` or other async-chain content, that came from PR #1235 work that was rolled back. Use the file with the sync `GetPathUsedBytes` + GetAwaiter pattern. + +- [ ] **Step 3: Update SystemCtlHelper** + +Site: `source/Octopus.Tentacle/Util/SystemCtlHelper.cs`. 
Two GetAwaiter calls inside `RunServiceCommand` (one for systemctl, one for sudo retry). Comment specifics: "We're in SystemCtlHelper running a systemctl command. All callers (StartService, RestartService, etc.) are sync — they're part of the Tentacle service-management CLI flow, which bottoms out in ServiceCommand.Start() (sync `void` override) with no async path..." + +Second GetAwaiter call (sudo retry) gets a short pointer comment: "Same sync boundary — sudo retry on the same thread-pool worker." + +- [ ] **Step 4: Update LinuxServiceConfigurator** + +Site: `source/Octopus.Tentacle/Startup/LinuxServiceConfigurator.cs`. Three GetAwaiter calls: `WriteUnitFile`, `IsSystemdInstalled`, `HaveSudoPrivileges`. Each gets the comment template, adapted: + +For `WriteUnitFile`: "WriteUnitFile is called from `IServiceConfigurator.ConfigureService` implementations, which are themselves called from the Tentacle service-management CLI on a thread-pool worker..." + +For `IsSystemdInstalled` and `HaveSudoPrivileges`: "Same sync boundary as WriteUnitFile." + +- [ ] **Step 5: Update WindowsServiceConfigurator** + +Site: `source/Octopus.Tentacle/Startup/WindowsServiceConfigurator.cs`. One GetAwaiter call inside `Sc()`. Comment specifics: "Sc() is called from `IServiceConfigurator.ConfigureService` implementations on Windows, on a thread-pool worker..." 
+ +- [ ] **Step 6: Commit** + +```bash +git add source/Octopus.Manager.Tentacle/PreReq/PowerShellPrerequisite.cs \ + source/Octopus.Tentacle/Kubernetes/KubernetesDirectoryInformationProvider.cs \ + source/Octopus.Tentacle/Util/SystemCtlHelper.cs \ + source/Octopus.Tentacle/Startup/LinuxServiceConfigurator.cs \ + source/Octopus.Tentacle/Startup/WindowsServiceConfigurator.cs +git commit -m "$(cat <<'COMMIT' +Document the six sync↔async boundary sites with improved comments + +Each immediate sync caller of ExecuteCommandAsync now blocks via +.GetAwaiter().GetResult() with a comment that explains where it sits +in the call graph, why the surrounding code must be synchronous, and +why blocking on async is deadlock-safe from a plain thread-pool worker. + +Sites: +- PowerShellPrerequisite.Check (WPF installer prerequisite) +- KubernetesDirectoryInformationProvider.GetDriveBytesUsingDu (IMemoryCache factory) +- SystemCtlHelper.RunServiceCommand (×2 — systemctl + sudo retry) +- LinuxServiceConfigurator: WriteUnitFile, IsSystemdInstalled, HaveSudoPrivileges +- WindowsServiceConfigurator.Sc + +Co-Authored-By: Claude Opus 4.7 (1M context) +COMMIT +)" +``` + +--- + +### Task 1.8: Update other test scaffolding files + +**Files:** +- The Kubernetes integration test files listed in the diff (TestUtils, Setup, Tooling, etc.) likely need to be migrated to async because they consume `CommandLineRunner` or `SilentProcessRunner`. + +The diff from `main` to `583eb46c` lists these: +- `source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/*.cs` +- `source/Octopus.Tentacle.Kubernetes.Tests.Integration/Tooling/*.cs` +- `source/Octopus.Tentacle.Tests.Integration/Support/*.cs` +- `source/Octopus.Tentacle.Tests.Integration/Util/*.cs` +- `source/Octopus.Tentacle.Tests/Util/LinuxTestUserPrincipal.cs` + +Most of these only changed because they had to switch from sync `Execute` to async `ExecuteAsync`. 
Copy the safety-tag versions BUT verify each one only contains async-migration changes (no abandon-related changes). If a file contains abandon test fixtures, take only the async portions. + +- [ ] **Step 1: For each file, compare main vs safety tag** + +```bash +for f in \ + source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/SetupHelpers.cs \ + source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/KubernetesClusterInstaller.cs \ + source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/KubernetesAgentInstaller.cs \ + source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/DockerImageLoader.cs \ + source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/Tooling/HelmDownloader.cs \ + source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/Tooling/ToolDownloader.cs \ + source/Octopus.Tentacle.Kubernetes.Tests.Integration/Tooling/KubeCtlTool.cs \ + source/Octopus.Tentacle.Kubernetes.Tests.Integration/KubernetesAgent/KubernetesClusterOneTimeSetUp.cs \ + source/Octopus.Tentacle.Kubernetes.Tests.Integration/KubernetesClientCompatibilityTests.cs \ + source/Octopus.Tentacle.Tests.Integration/Support/ClientAndTentacle.cs \ + source/Octopus.Tentacle.Tests.Integration/Support/TentacleFetchers/LinuxTentacleFetcher.cs \ + source/Octopus.Tentacle.Tests.Integration/Util/LinuxTestUserPrincipal.cs \ + source/Octopus.Tentacle.Tests.Integration/Util/RunningScriptFixture.cs \ + source/Octopus.Tentacle.Tests/Util/LinuxTestUserPrincipal.cs ; do + echo "=== $f ===" + git diff origin/main..claude-safety-2026-05-25-pre-split -- "$f" | head -20 + echo +done +``` + +- [ ] **Step 2: For each file, take the safety-tag version IF its changes are purely async-migration** + +Use `git checkout claude-safety-2026-05-25-pre-split -- <file>` for each. + +If a file contains abandon-specific additions (e.g., references to `AbandonScript` or `AbandonedExitCode`), manually edit out those parts after checkout. 
+ +- [ ] **Step 3: Build** + +```bash +dotnet build source/Octopus.Tentacle.sln +``` + +Expected: build succeeds. Errors here will reveal additional files that need attention. + +- [ ] **Step 4: Commit** + +```bash +git add -A +git commit -m "$(cat <<'COMMIT' +Migrate test scaffolding to async ExecuteCommandAsync + +Updates Kubernetes integration test setup and support helpers to await +the new ExecuteCommandAsync signature. No abandon-feature content is +included. + +Co-Authored-By: Claude Opus 4.7 (1M context) +COMMIT +)" +``` + +--- + +### Task 1.9: Build verification — base PR must compile and tests must pass + +**Files:** none (verification only) + +- [ ] **Step 1: Full build** + +```bash +cd /Users/jim/code/OctopusTentacle +dotnet build source/Octopus.Tentacle.sln 2>&1 | tail -50 +``` + +Expected: 0 errors. Any errors must be resolved before proceeding — they indicate missing files in the migration. + +- [ ] **Step 2: Run the unit tests** + +```bash +dotnet test source/Octopus.Tentacle.Tests/Octopus.Tentacle.Tests.csproj +``` + +Expected: all green. + +- [ ] **Step 3: Run the SilentProcessRunner integration test for ShouldCancelPing** + +```bash +dotnet test source/Octopus.Tentacle.Tests.Integration --filter "Name~ShouldCancelPing" +``` + +Expected: green. This verifies cancel works with our `WaitForExitAsync(cancel)` wiring. + +--- + +### Task 1.10: Push the base branch and open the new PR + +**Files:** none (git + gh) + +- [ ] **Step 1: Push** + +```bash +git push -u origin jimpelletier/eft-3295-async-migration-base +``` + +- [ ] **Step 2: Create the PR with base = main** + +```bash +gh pr create \ + --base main \ + --head jimpelletier/eft-3295-async-migration-base \ + --title "Migrate SilentProcessRunner to async" \ + --body "$(cat <<'EOF' +## Summary + +Makes `SilentProcessRunner.ExecuteCommand` async. 
Required foundation for the EFT-3295 script-abandonment feature (PR #1226, which stacks on top of this) but valuable on its own as a refactor: enables awaiting process runs from already-async callers rather than blocking a thread. + +### What this PR does +- `SilentProcessRunner.ExecuteCommand` → `ExecuteCommandAsync` (and the matching interfaces and helpers) +- Internal: `process.WaitForExit()` → `await process.WaitForExitAsync(cancel)` +- Adds a net48 polyfill for `WaitForExitAsync` (using `Process.Exited` + `TaskCompletionSource`) +- The six immediate sync callers (PowerShellPrerequisite, KubernetesDirectoryInformationProvider, SystemCtlHelper×2, LinuxServiceConfigurator×3, WindowsServiceConfigurator) block via `.GetAwaiter().GetResult()` with a comment explaining the call context and why blocking on a thread-pool worker is deadlock-safe + +### What this PR explicitly does NOT include +- The `abandon` parameter on `ExecuteCommandAsync` (added in #1226) +- Removal of `process.Close()` from `DoOurBestToCleanUp` (added in #1226) +- Any abandon-specific contracts, RPC methods, capabilities, env vars, or tests (#1226) + +## Test plan +- [ ] CI build green +- [ ] `ShouldCancelPing` integration test still passes (cancel semantics preserved) + +🤖 Generated with [Claude Code](https://claude.ai/claude-code) +EOF +)" +``` + +- [ ] **Step 3: Capture the new PR number for use in subsequent tasks** + +```bash +gh pr view jimpelletier/eft-3295-async-migration-base --json number,url +``` + +--- + +## Phase 2 — Rebuild #1226 on top of the base PR + +### Task 2.1: Reset the abandon branch to the base PR tip + +**Files:** none (git only) + +- [ ] **Step 1: Switch to the abandon branch** + +```bash +cd /Users/jim/code/OctopusTentacle +git checkout jimpelletier/eft-3295-tentacle-script-abandonment-to-release-the-mutex +``` + +- [ ] **Step 2: Hard-reset to the new base branch tip** + +```bash +git reset --hard jimpelletier/eft-3295-async-migration-base +``` + +- [ ] **Step 3: 
Verify** + +```bash +git log --oneline -1 +``` + +Expected: tip of the base branch. + +--- + +### Task 2.2: Apply the abandon delta — contracts, env var + +**Files:** +- Create: `source/Octopus.Tentacle.Contracts/ScriptServiceV2/AbandonScriptCommandV2.cs` +- Modify: `source/Octopus.Tentacle.Contracts/ScriptExitCodes.cs` (add `AbandonedExitCode = -48`) +- Modify: `source/Octopus.Tentacle.Contracts/ScriptServiceV2/IScriptServiceV2.cs` (add `AbandonScript` method) +- Modify: `source/Octopus.Tentacle.Contracts/ClientServices/IAsyncClientScriptServiceV2.cs` (add `AbandonScriptAsync`) +- Modify: `source/Octopus.Tentacle.Core/Util/EnvironmentVariables.cs` (add `TentacleDebugDisableProcessKill`) + +- [ ] **Step 1: Copy each file from safety tag** + +```bash +git checkout claude-safety-2026-05-25-pre-split -- \ + source/Octopus.Tentacle.Contracts/ScriptServiceV2/AbandonScriptCommandV2.cs \ + source/Octopus.Tentacle.Contracts/ScriptExitCodes.cs \ + source/Octopus.Tentacle.Contracts/ScriptServiceV2/IScriptServiceV2.cs \ + source/Octopus.Tentacle.Contracts/ClientServices/IAsyncClientScriptServiceV2.cs \ + source/Octopus.Tentacle.Core/Util/EnvironmentVariables.cs +``` + +- [ ] **Step 2: Commit** + +```bash +git commit -am "Add abandon contracts and TentacleDebugDisableProcessKill env var + +Co-Authored-By: Claude Opus 4.7 (1M context) " +``` + +--- + +### Task 2.3: Apply the abandon delta — SilentProcessRunner abandon token + Close removal + long-form comments + +**Files:** +- Modify: `source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs` + +This step: +- Adds the `abandon` parameter to `ExecuteCommandAsync` +- Switches internal await from `WaitForExitAsync(cancel)` to `WaitForExitAsync(abandon)` +- Adds the `OperationCanceledException when (abandon.IsCancellationRequested && !process.HasExited)` catch returning `ScriptExitCodes.AbandonedExitCode` +- Removes `process.Close()` from `DoOurBestToCleanUp` +- Adds long-form documentation comments to 
`DoOurBestToCleanUp`, `SafelyWaitForAllOutput`, and the `WaitForExitAsync` call site +- Adds the Hitman env-var test-affordance check + +- [ ] **Step 1: Take the safety-tag version of SilentProcessRunner** + +```bash +git checkout claude-safety-2026-05-25-pre-split -- source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs +``` + +- [ ] **Step 2: Verify the stray `process.Close()` bug fix is included** + +```bash +grep -n "process.Close" source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs +``` + +Expected output: only references in comments (no actual `process.Close();` call). If a `process.Close();` call appears not in a comment, remove it manually — same fix as in commit `583eb46c`. + +- [ ] **Step 3: Commit** + +```bash +git commit -am "Add abandon token to SilentProcessRunner and remove process.Close() race + +- Adds CancellationToken abandon parameter to ExecuteCommandAsync +- Switches the await from WaitForExitAsync(cancel) to WaitForExitAsync(abandon) +- Returns ScriptExitCodes.AbandonedExitCode when abandon fires before process exits +- Removes process.Close() from DoOurBestToCleanUp (race with WaitForExitAsync's + TCS via the Exited event — Close tore down the wait state, hung cancel) +- Adds long-form documentation comments explaining the race, the grandchild-pipe + scenario, and worst-case cancel latency +- Adds TentacleDebugDisableProcessKill test affordance to Hitman + +Co-Authored-By: Claude Opus 4.7 (1M context) " +``` + +--- + +### Task 2.4: Apply the abandon delta — interface + caller updates for abandon parameter + +**Files:** +- Modify: `source/Octopus.Tentacle/Util/ISilentProcessRunner.cs` +- Modify: `source/Octopus.Tentacle/Util/CommandLineRunner.cs` +- Modify: `source/Octopus.Tentacle.Core/Util/CommandLine/CommandLineInvocation.cs` (if it has an Execute method) + +Add the `abandon` parameter to the interface and the helper class. 
+ +- [ ] **Step 1: Take the safety-tag versions** + +```bash +git checkout claude-safety-2026-05-25-pre-split -- \ + source/Octopus.Tentacle/Util/ISilentProcessRunner.cs \ + source/Octopus.Tentacle/Util/CommandLineRunner.cs +git checkout claude-safety-2026-05-25-pre-split -- source/Octopus.Tentacle.Core/Util/CommandLine/CommandLineInvocation.cs 2>/dev/null +``` + +- [ ] **Step 2: Commit** + +```bash +git commit -am "Plumb abandon token through ISilentProcessRunner and CommandLineRunner + +Co-Authored-By: Claude Opus 4.7 (1M context) " +``` + +--- + +### Task 2.5: Apply the abandon delta — RunningScript abandon-token plumbing + +**Files:** +- Modify: `source/Octopus.Tentacle.Core/Services/Scripts/RunningScript.cs` + +Adds the `abandonToken` constructor parameter and passes it to `ExecuteCommandAsync`. + +- [ ] **Step 1: Take the safety-tag version** + +```bash +git checkout claude-safety-2026-05-25-pre-split -- source/Octopus.Tentacle.Core/Services/Scripts/RunningScript.cs +``` + +- [ ] **Step 2: Commit** + +```bash +git commit -am "Plumb abandon token through RunningScript + +Co-Authored-By: Claude Opus 4.7 (1M context) " +``` + +--- + +### Task 2.6: Apply the abandon delta — ScriptServiceV2.AbandonScriptAsync + workspace cleanup + +**Files:** +- Modify: `source/Octopus.Tentacle.Core/Services/Scripts/ScriptServiceV2.cs` + +Adds: +- `RunningScriptWrapper.AbandonTokenSource` and `Abandon()` +- Public `AbandonScriptAsync` method on the service +- Best-effort `workspace.Delete` gated on `AbandonedExitCode` + +- [ ] **Step 1: Take the safety-tag version** + +```bash +git checkout claude-safety-2026-05-25-pre-split -- source/Octopus.Tentacle.Core/Services/Scripts/ScriptServiceV2.cs +``` + +- [ ] **Step 2: Commit** + +```bash +git commit -am "Implement ScriptServiceV2.AbandonScriptAsync and abandon-gated workspace cleanup + +Co-Authored-By: Claude Opus 4.7 (1M context) " +``` + +--- + +### Task 2.7: Apply the abandon delta — advertise AbandonScriptV2 capability + 
+**Files:** +- Modify: `source/Octopus.Tentacle/Services/Capabilities/CapabilitiesServiceV2.cs` +- Modify: `source/Octopus.Tentacle.Tests.Integration/CapabilitiesServiceV2Test.cs` + +Adds `nameof(IAsyncClientScriptServiceV2.AbandonScriptAsync)` to the capabilities list. Updates the integration test to expect it for Latest tentacles. + +- [ ] **Step 1: Take the safety-tag versions** + +```bash +git checkout claude-safety-2026-05-25-pre-split -- \ + source/Octopus.Tentacle/Services/Capabilities/CapabilitiesServiceV2.cs \ + source/Octopus.Tentacle.Tests.Integration/CapabilitiesServiceV2Test.cs +``` + +- [ ] **Step 2: Commit** + +```bash +git commit -am "Advertise AbandonScriptV2 capability + +Co-Authored-By: Claude Opus 4.7 (1M context) " +``` + +--- + +### Task 2.8: Apply the abandon delta — ScriptBuilder.AppendRaw, tests, and grandchild test comments + +**Files:** +- Modify: `source/Octopus.Tentacle.CommonTestUtils/Builders/ScriptBuilder.cs` +- Modify: `source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs` +- Create or modify: `source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs` +- Modify: `source/Octopus.Tentacle.Tests/Integration/ScriptServiceV2Fixture.cs` +- Modify: `source/Octopus.Tentacle.Tests.Integration.Common/Builders/Decorators/ScriptServiceV2DecoratorBuilder.cs` +- Modify: `source/Octopus.Tentacle.Tests/Kubernetes/KubernetesDirectoryInformationProviderFixture.cs` (if it has abandon-specific test changes) + +The abandon-specific tests and test helpers. Includes the rewritten grandchild test comments in `SilentProcessRunnerFixture`. 
+ +- [ ] **Step 1: Take the safety-tag versions** + +```bash +git checkout claude-safety-2026-05-25-pre-split -- \ + source/Octopus.Tentacle.CommonTestUtils/Builders/ScriptBuilder.cs \ + source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs \ + source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs \ + source/Octopus.Tentacle.Tests/Integration/ScriptServiceV2Fixture.cs \ + source/Octopus.Tentacle.Tests.Integration.Common/Builders/Decorators/ScriptServiceV2DecoratorBuilder.cs +``` + +- [ ] **Step 2: Commit** + +```bash +git commit -am "Add abandon-specific tests and rewrite grandchild test comments for async behavior + +Co-Authored-By: Claude Opus 4.7 (1M context) " +``` + +--- + +### Task 2.9: Apply remaining files — spec doc, plan doc, anything else in diff + +**Files:** +- Spec/plan files from `docs/superpowers/` +- Any remaining file in the `git diff main..claude-safety-2026-05-25-pre-split` that's not already covered + +- [ ] **Step 1: List files still differing** + +```bash +git diff HEAD claude-safety-2026-05-25-pre-split --name-only +``` + +- [ ] **Step 2: Inspect any unhandled files and bring them over** + +For each remaining file: +- If the change is abandon-specific: `git checkout claude-safety-2026-05-25-pre-split -- <file>` +- If unrelated: skip and ask user + +- [ ] **Step 3: Verify the diff is complete** + +```bash +git diff jimpelletier/eft-3295-async-migration-base..HEAD --stat +``` + +This should now contain the FULL abandon-feature delta. + +- [ ] **Step 4: Verify end state matches the safety tag** + +```bash +# Compare against the working tree: files brought over in Step 2 are staged but not committed until Step 5 +git diff claude-safety-2026-05-25-pre-split +``` + +Expected: zero output. The rebuilt branch should produce the EXACT same end state as `583eb46c`. + +If there are differences, investigate and resolve them before continuing. 
+ +- [ ] **Step 5: Commit any final additions** + +```bash +git status +git add -A +git commit -m "Bring in remaining abandon-feature files + +Co-Authored-By: Claude Opus 4.7 (1M context) " || echo "no changes" +``` + +--- + +### Task 2.10: Build verification — abandon PR must compile and all tests must pass + +**Files:** none (verification only) + +- [ ] **Step 1: Full build** + +```bash +dotnet build source/Octopus.Tentacle.sln 2>&1 | tail -50 +``` + +Expected: 0 errors. + +- [ ] **Step 2: Run abandon-specific tests** + +```bash +dotnet test source/Octopus.Tentacle.Tests/Octopus.Tentacle.Tests.csproj --filter "Name~Abandon" +dotnet test source/Octopus.Tentacle.Tests.Integration --filter "Name~Abandon" +``` + +Expected: green. + +--- + +### Task 2.11: Force-push abandon branch and update PR #1226's base + +**Files:** none (git + gh) + +- [ ] **Step 1: Force-push** + +```bash +git push --force-with-lease origin jimpelletier/eft-3295-tentacle-script-abandonment-to-release-the-mutex +``` + +- [ ] **Step 2: Change PR #1226's base to the new async-migration-base branch** + +```bash +gh pr edit 1226 --base jimpelletier/eft-3295-async-migration-base +``` + +- [ ] **Step 3: Add a comment to #1226 explaining the rebase** + +```bash +gh pr comment 1226 --body "$(cat <<'EOF' +Rebased on top of the new foundational PR (the async migration of `SilentProcessRunner`). The diff is now focused on the abandon feature itself — the async-migration plumbing has moved to the base PR. + +Previous head: `583eb46c` (preserved as tag `claude-safety-2026-05-25-pre-split`). 
+EOF +)" +``` + +--- + +## Phase 3 — Rebase PR #1235 + +### Task 3.1: Rebase #1235 on top of the new #1226 + +**Files:** none (git) + +- [ ] **Step 1: Switch to #1235's branch** + +```bash +cd /Users/jim/code/OctopusTentacle +git checkout jimpelletier/eft-3295-async-signature-propagation +``` + +- [ ] **Step 2: Rebase onto the new #1226 tip** + +```bash +git rebase jimpelletier/eft-3295-tentacle-script-abandonment-to-release-the-mutex +``` + +If conflicts arise: resolve each one. The most likely conflict file is `SilentProcessRunner.cs` (because #1235 had the stray `process.Close()` fix that's now in #1226). Other conflicts are mechanical — resolve in favor of the #1235 version since those are the push-higher changes. + +- [ ] **Step 3: Verify build** + +```bash +dotnet build source/Octopus.Tentacle.sln 2>&1 | tail -20 +``` + +- [ ] **Step 4: Force-push #1235** + +```bash +git push --force-with-lease origin jimpelletier/eft-3295-async-signature-propagation +``` + +- [ ] **Step 5: Sanity check #1235's PR diff** + +```bash +gh pr view 1235 --json url +``` + +Visit the URL and confirm the diff contains only the push-higher commits (no abandon-feature content leaked). + +--- + +## Phase 4 — Final verification + +### Task 4.1: End-to-end stack check + +**Files:** none + +- [ ] **Step 1: Verify branch graph** + +```bash +git log --oneline --graph --all -30 +``` + +Expected: `main` → base branch → abandon branch → push-higher branch. 
+ +- [ ] **Step 2: Verify each PR's base** + +```bash +gh pr list --head jimpelletier/eft-3295-async-migration-base --json number,baseRefName +gh pr list --head jimpelletier/eft-3295-tentacle-script-abandonment-to-release-the-mutex --json number,baseRefName +gh pr list --head jimpelletier/eft-3295-async-signature-propagation --json number,baseRefName +``` + +Expected: +- New base PR → base: `main` +- #1226 → base: `jimpelletier/eft-3295-async-migration-base` +- #1235 → base: `jimpelletier/eft-3295-tentacle-script-abandonment-to-release-the-mutex` + +- [ ] **Step 3: Verify end-state equivalence** + +```bash +# When all three PRs are squash-merged, the result on main should equal the safety tag's file states +git diff claude-safety-2026-05-25-pre-split jimpelletier/eft-3295-tentacle-script-abandonment-to-release-the-mutex +``` + +Expected: zero output (the rebased #1226 ends at the same end-state as the original tip). + +```bash +git diff claude-safety-2026-05-25-pre-split-1235 jimpelletier/eft-3295-async-signature-propagation +``` + +Expected: zero output (the rebased #1235 ends at the same end-state as before). + +- [ ] **Step 4: Report success** + +Report each PR's URL and confirm the inversion is complete. + +--- + +## Open notes for the implementer + +- If `git checkout claude-safety-2026-05-25-pre-split -- <file>` brings over content that includes abandon-specific changes when a file is supposed to be "async-migration only," check whether the file at safety-tag has BOTH concerns mixed. If so, you'll need to manually edit out the abandon parts. This is most likely for: `SilentProcessRunner.cs`, `RunningScript.cs`, `ScriptServiceV2.cs`, `ISilentProcessRunner.cs`, `CommandLineRunner.cs`. +- If a build error during Phase 1 says a method has the wrong signature, it likely means the abandon-token parameter leaked into the base-PR version of an interface. Search for `abandon` in the file and remove it. +- The `.worktrees/` directory is gitignored on the abandon branch but NOT on `main`. 
If `git status` shows it as untracked on the base branch, that's expected — the gitignore was added in the abandon branch only. The base PR should NOT include this gitignore change (it's not async-migration related). diff --git a/docs/superpowers/specs/2026-05-21-tentacle-script-abandon-design.md b/docs/superpowers/specs/2026-05-21-tentacle-script-abandon-design.md new file mode 100644 index 000000000..694e63254 --- /dev/null +++ b/docs/superpowers/specs/2026-05-21-tentacle-script-abandon-design.md @@ -0,0 +1,400 @@ +# Tentacle script abandon — design + +**Status:** Draft, ready for implementation planning. Contract aligned with the parallel server-side session. +**Ticket:** [EFT-3295](https://linear.app/octopus/issue/EFT-3295/tentacle-script-abandonment-to-release-the-mutex) +**ADR:** [ADR-042 — Defer server-task Abandoned state](https://github.com/OctopusDeploy/adr/pull/226) +**Parallel work:** Server-side (ProcessExecution layer) is being designed in a separate session and will consume the contract proposed here. + +--- + +## Problem + +When a Tentacle script is hung in a way that resists `Process.Kill` (Philips' case: PowerShell stuck inside CrowdStrike + Rapid7 fighting over the same process; kernel-level uninterruptible wait), today's flow ends with: + +- `ScriptIsolationMutex` stays held → subsequent deployments to that Tentacle queue forever. +- The .NET threadpool thread inside `RunningScript.Execute()` stays parked on `process.WaitForExit()` (synchronous). +- The customer's only recovery is RDP-in-and-kill or reboot. Not acceptable for Philips. + +Server-side will detect that cancellation hasn't propagated within its own timeout and will tell Tentacle to **abandon** the script. Tentacle releases the mutex, logs honestly, accepts new work. The runaway OS process is **not** killed — explicitly out of scope per the ticket. + +## Scope + +In scope: +- `IScriptServiceV2` only (Listening + Polling Tentacles). 
+- New Halibut RPC verb `AbandonScript`, new exit code `AbandonedExitCode = -48`. +- Gated by server-side feature flag (`AbandonTentacleScriptOnCancellationTimeoutFeatureToggle`) for the first release. No Tentacle-side flag — capability advertisement is binary on build version. + +Out of scope: +- SSH targets (different lock model; ticket explicitly defers). +- Kubernetes agent (`IKubernetesScriptServiceV1`): different mechanism, separate stuck-pod work already in flight (`KubernetesPendingPodWatchDog`). Server's capability negotiation handles "don't try abandon on Kubernetes targets" cleanly. +- Old `IScriptService` (V1): no signal that any active Tentacle still negotiates V1. +- Killing the runaway OS process. +- Server-task Abandoned UI state — deferred by ADR-042; task continues to surface as Cancelled. + +## Section 1 — Contract surface + +Add a method to existing `IScriptServiceV2`. Do NOT introduce V3 — the convention here is method-addition + capability negotiation. + +```csharp +// source/Octopus.Tentacle.Contracts/ScriptServiceV2/IScriptServiceV2.cs +public interface IScriptServiceV2 +{ + ScriptStatusResponseV2 StartScript(StartScriptCommandV2 command); + ScriptStatusResponseV2 GetStatus(ScriptStatusRequestV2 request); + ScriptStatusResponseV2 CancelScript(CancelScriptCommandV2 command); + ScriptStatusResponseV2 AbandonScript(AbandonScriptCommandV2 command); // NEW + void CompleteScript(CompleteScriptCommandV2 command); +} + +// NEW: source/Octopus.Tentacle.Contracts/ScriptServiceV2/AbandonScriptCommandV2.cs +public class AbandonScriptCommandV2 +{ + public AbandonScriptCommandV2(ScriptTicket ticket, long lastLogSequence) { /* … */ } + public ScriptTicket Ticket { get; } + public long LastLogSequence { get; } +} +``` + +**Capability advertisement.** Tentacle's `CapabilitiesServiceV2` advertises `AbandonScriptV2` once the build supports it. Binary on build version, no Tentacle-side toggle. 
Server's existing `BackwardsCompatibleAsyncCapabilitiesV2Decorator` handles "Tentacle doesn't advertise it → don't call it" for older Tentacles. Server-side `AbandonTentacleScriptOnCancellationTimeoutFeatureToggle` is the only feature-flag off-switch. + +**Why a new verb (not a "force" flag on Cancel).** Different semantics: Cancel = "try to stop the OS process gracefully". Abandon = "give up tracking; release the mutex; the OS process may still be running". Two verbs map cleanly to ProcessExecution's two-step escalation (cancel first, abandon if cancel doesn't propagate). + +## Section 2 — Mutex release mechanics + +**The core constraint.** `RunningScript.Execute()` acquires `ScriptIsolationMutex` inside a `using` block that wraps a synchronous call to `SilentProcessRunner.ExecuteCommand`. `ExecuteCommand` blocks on `process.WaitForExit()` (line 143). When `WaitForExit` never returns: +1. The mutex is welded shut (the `using`'s Dispose never runs). +2. The threadpool thread inside `Task.Run(() => Execute())` is parked forever. + +Both problems need to be solved. The mutex problem is the ticket's primary deliverable; the parked-thread problem is required so Tentacle doesn't accumulate thread leaks each time the abandon path fires. + +**Rejected alternatives** (documented for the reviewer's benefit): + +- **Orphan the Task + release mutex via external Dispose.** Releases mutex but leaks a threadpool worker per abandon. Tentacle eventually starves the threadpool. +- **Manual `Thread` instead of `Task`.** Same leak problem, just trades threadpool for kernel thread handles + stack memory. +- **`Thread.Abort` / `Thread.Interrupt` / `TerminateThread` P/Invoke.** No safe managed mechanism to release a thread parked in unmanaged code. `TerminateThread` doesn't unwind stack or release locks; can corrupt Tentacle's own state. +- **Out-of-process script worker.** Cleanly isolates the stuck-process problem from Tentacle, but is a massive refactor far outside EFT-3295's scope. 
Worth a separate proposal someday. +- **Sync cancellable wait via `ManualResetEventSlim.Wait()`.** Replaces only the blocking primitive inside `SilentProcessRunner`, leaves everything else synchronous. Smaller diff, but preserves a parked thread per running script in the normal case (same cost as today) and doesn't move the codebase toward async. Rejected in favour of the async approach below. Tentacle's existing test coverage gives us confidence the wider async migration is safe to ship, so the smaller-diff defensiveness isn't compelling. + +### The chosen approach: async cancellable wait + +Replace the sync `process.WaitForExit()` with `await process.WaitForExitAsync(abandon)`. **Replace `ExecuteCommand` outright; do NOT ship an additive overload.** Every caller migrates to await. + +**Verified behaviour** (.NET source, `Process.cs:1523-1594`): `WaitForExitAsync` uses a `TaskCompletionSource` driven by either the process's `Exited` event or `cancellationToken.UnsafeRegister(... TrySetCanceled ...)`. When the token fires, the awaiter completes with `OperationCanceledException` independently of whether the OS process has exited. The `WaitUntilOutputEOF` follow-up is bypassed on cancellation. **No thread is parked during the wait.** + +**Two tokens, one passed to the wait.** `cancel` keeps its existing job (`cancel.Register` fires `DoOurBestToCleanUp` → `Hitman.Kill`). `abandon` is the new signal whose only job is "stop waiting, do not touch the process". Only `abandon` is passed into `WaitForExitAsync`; do NOT link `cancel` in. When `cancel` fires and the kill works, the process exits and the wait returns naturally via the `Exited` event. When `cancel` fires and the kill DOESN'T work (Philips), the wait keeps going until `abandon` fires from the server's 2-minute escalation. Linking `cancel` into the wait token would race the kill against the wait-cancellation and lose the natural-exit code on the happy path. 
+ +```csharp +using (cancel.Register(() => DoOurBestToCleanUp(process, error))) +{ + process.BeginOutputReadLine(); + process.BeginErrorReadLine(); + + try + { + await process.WaitForExitAsync(abandon).ConfigureAwait(false); + } + catch (OperationCanceledException) when (abandon.IsCancellationRequested && !process.HasExited) + { + info("Tentacle has abandoned this script. The underlying script process may still be running on this host."); + SafelyCancelRead(process.CancelErrorRead, debug); + SafelyCancelRead(process.CancelOutputRead, debug); + return ScriptExitCodes.AbandonedExitCode; + } + + // process exited (naturally or via cancel-triggered kill) — existing cleanup path + SafelyCancelRead(process.CancelErrorRead, debug); + SafelyCancelRead(process.CancelOutputRead, debug); + return SafelyGetExitCode(process); +} +``` + +**Diff shape — `ExecuteCommand` becomes `ExecuteCommandAsync`, all callers migrate.** Search across the repo found ~20 call sites. Every one updates. + +Production code: +- `source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs` — the method itself. Rename, return `Task<int>`, swap `WaitForExit()` for `await WaitForExitAsync(abandon)`. Two-token signature. +- `source/Octopus.Tentacle/Util/ISilentProcessRunner.cs` — interface and the in-process wrapper become async. +- `source/Octopus.Tentacle/Util/CommandLineRunner.cs` — caller migration. +- `source/Octopus.Tentacle.Core/Services/Scripts/RunningScript.cs` — `RunScript` → `RunScriptAsync`; ctor takes `abandonToken` alongside `runningScriptToken`; `Execute()` awaits the new path. +- `source/Octopus.Tentacle.Core/Services/Scripts/ScriptServiceV2.cs` — `LaunchShell` passes `abandonToken` from the wrapper. `RunningScriptWrapper` gains `abandonTokenSource`. New `AbandonScriptAsync` method. +- `source/Octopus.Tentacle.Contracts/ScriptServiceV2/` — new `AbandonScriptCommandV2.cs`, interface method on `IScriptServiceV2.cs` (per Section 1).
+- `source/Octopus.Tentacle.Contracts/ScriptExitCodes.cs` — add `AbandonedExitCode = -48`. +- Capabilities advertisement (`AbandonScriptV2`). + +Kubernetes integration test scaffolding (all caller-migration, no logic change): +- `source/Octopus.Tentacle.Kubernetes.Tests.Integration/Tooling/KubeCtlTool.cs` +- `source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/DockerImageLoader.cs` (2 call sites) +- `source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/KubernetesAgentInstaller.cs` (3 call sites) +- `source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/KubernetesClusterInstaller.cs` (4 call sites) +- `source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/Tooling/HelmDownloader.cs` +- `source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/Tooling/ToolDownloader.cs` + +Tentacle integration test scaffolding (caller migration): +- `source/Octopus.Tentacle.Tests.Integration/PowerShellStartupDetectionTests.cs` (3 call sites) +- `source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs` +- `source/Octopus.Tentacle.Tests.Integration/Support/TentacleFetchers/LinuxTentacleFetcher.cs` + +**What happens to stdout/stderr after abandon.** Returning `AbandonedExitCode` unwinds the method. The outer `using (var process = new Process())` disposes the Process, which closes our end of the redirected pipes. The OS process may get EPIPE on its next stdout/stderr write. This is consistent with the ticket: we're closing our own handles, not killing the runaway process. The script's runtime keeps doing whatever it's doing; many scripts ignore broken-pipe errors, and scripts that fail on them already had nowhere to log anyway. The alternative — leaving the Process and its pipes pinned in memory indefinitely — is the resource-accumulation problem we already rejected. + +**Async correctness watch-outs for the implementation plan:** +- Every new async method gets `.ConfigureAwait(false)`. 
+- No `.Result` / `.Wait()` calls on the new path; if a caller can't easily be made async, surface it for separate handling rather than block-on-async. +- Verify no deadlock under the Tentacle's synchronisation context (none, but worth confirming). + +## Section 3 — State, exit code, log wording + +- **Exit code:** `ScriptExitCodes.AbandonedExitCode = -48`. Distinct from `CanceledExitCode (-43)`. Server-side telemetry can tell abandoned from cancelled even though task UI surfaces both as "Cancelled" per ADR-042. +- **State on GetStatus after abandon:** `(ProcessState.Complete, AbandonedExitCode, latestLogs)`. Same shape as Cancel returns today. +- **Honest log line:** `"Tentacle has abandoned this script. The underlying script process may still be running on this host."` Written once, into the workspace script log, near the end of the abandon path. +- **Workspace cleanup on subsequent `CompleteScript`:** targeted best-effort. `CompleteScript` reads the stateStore and checks the persisted exit code. If `AbandonedExitCode`, wrap `workspace.Delete` in try/catch, log a `Warn` to systemLog naming the leaked directory, return success. For any other exit code, `workspace.Delete` is called as today and exceptions propagate. This way the relaxed-deletion policy applies only to the rare abandon case; bugs that leak handles on normal-completion paths can't hide under a blanket try/catch. No janitor — the ticket already says OS-level state on the host is the customer's problem. +- **Idempotency — actual-status return (NOT silent no-op):** + - Abandon called twice on the same already-abandoned ticket → returns the cached `(Complete, AbandonedExitCode, logs)` response. + - Abandon called on a ticket that completed naturally before the abandon arrived (race case the server-side session flagged) → returns `(Complete, realExitCode, logs)` with the **real exit code**, distinct from `AbandonedExitCode`. 
The server uses this distinction to log *"Script had already completed before abandon was needed"* instead of *"Tentacle abandoned the script"*. Silent no-op would hide this signal. + - Abandon called on an unknown ticket (never started, or already cleaned up via `CompleteScript`) → returns `(Complete, UnknownScriptExitCode, [])`, matching Cancel's behaviour for the same case. +- **Race with natural completion:** the wrapper's existing `StartScriptMutex` (or a new dedicated lock) serialises abandon entry. If state is already Complete, abandon returns the cached status per the rules above. + +## Section 4 — Automated test strategy + +### 4.1 `SilentProcessRunner` unit tests + +Style: matches existing `SilentProcessRunnerFixture.cs`. Use short-lived helper scripts/exes as process subjects. + +| Test | Trigger | Verify | +|---|---|---| +| Normal exit | Run a process that exits 0 | Returns 0; no abandon log line captured by the `info` callback spy. | +| Cancel kills process | Long-running process; fire cancel token | Within 1s: process is killed (`process.HasExited == true`), return value is the kill-induced exit code (Linux: 137; Windows: process-defined). No abandon log line. | +| Abandon while running | Long-running process; fire abandon token | Within ~100ms: returns `AbandonedExitCode`, `info` callback received exactly one call containing "Tentacle has abandoned this script". Then assert `process.HasExited == false` and clean up by killing externally. | +| Abandon AFTER natural exit (race) | Process that exits in ~50ms; fire abandon token at the moment exit fires | Return value is the process's real exit code, not `AbandonedExitCode`. No abandon log line. Verifies the `when (abandon.IsCancellationRequested && !process.HasExited)` exception-filter guard on the `catch (OperationCanceledException)` clause. | +| Both tokens fire | Long-running process; fire cancel; while cancel.Register is mocked to no-op, fire abandon | `info` callback gets abandon log line; return value is `AbandonedExitCode`.
Verifies the unkillable-cancel + abandon escalation path that the integration tests then exercise end-to-end. | + +**Async-specific timing assertion:** `WaitForExitAsync(token)` returns within ~50ms of cancellation. **Test verification:** wrap the await in `Stopwatch.StartNew()`; assert elapsed < 100ms. Proves async wait is independent of process exit. + +**Thread-leak regression test:** start 50 stuck processes via `ExecuteCommandAsync` (all `await`ed in parallel), fire abandon on all; capture `Process.GetCurrentProcess().Threads.Count` before and 1s after; assert delta ≤ 5 (allow for threadpool jitter). The async path should produce zero parked threads at steady state. + +### 4.2 `ScriptServiceV2` service-layer tests + +Style: matches existing service-layer fixtures using in-memory script shells and stub workspace factories. + +| Test | Trigger | Verify | +|---|---|---| +| **Mutex release (load-bearing)** | Start `FullIsolation` script; abandon it; immediately start second `FullIsolation` script | Second `StartScript` returns with `State == Running` within 1s. Reading `ScriptIsolationMutex.TaskLock.Report()` between abandon and second-start shows the lock free in that window. | +| Abandon before StartScript | Call AbandonScript with a ticket never seen | Returns `(Complete, UnknownScriptExitCode)`. Matches existing Cancel behaviour for unknown ticket. | +| Abandon after CompleteScript | Start → Complete → Abandon | Returns `(Complete, UnknownScriptExitCode)` (wrapper already removed; stateStore gone). | +| Abandon then Cancel | Abandon, then Cancel same ticket | Cancel returns the cached abandoned response unchanged. Asserts via response equality. | +| **Cancel then Abandon (real flow)** | Long-running script; cancel; cancel.Register no-op'd to simulate unkillable; abandon | Final GetStatus returns `(Complete, AbandonedExitCode, logs)`. Log content includes the honest line. Subsequent same-ticket StartScript returns the cached state. 
| +| Abandon during StartScript launch | Concurrent: StartScript holding `StartScriptMutex`, AbandonScript called | Abandon serialises behind StartScript via the existing wrapper mutex. Final state is consistent (no half-abandoned wrapper). | +| Capability advertisement | Tentacle build with the abandon feature; query `CapabilitiesServiceV2.GetCapabilities()` | Response includes `AbandonScriptV2`. Tentacle builds without the feature do not advertise it. | + +### 4.3 Integration tests (real shells, real processes) + +Style: matches `Octopus.Tentacle.Tests.Integration/ClientScriptExecutionIsolationMutex.cs` (the closest existing analogue — real Tentacle, real script, mutex semantics under test). + +**Timing flakiness: use the existing builders, not raw shell + `Thread.Sleep`.** The integration test suite has stable patterns for this exact class of test: + +- `ScriptBuilder` (`Octopus.Tentacle.CommonTestUtils/Builders/ScriptBuilder.cs`) composes cross-platform script bodies. Use `.CreateFile(path)` to signal "script reached this line" and `.WaitForFileToExist(path)` to block the script on an event, not a sleep race. This is how `ClientScriptExecutionIsolationMutex` reliably exercises long-running scripts without `Thread.Sleep` timing assumptions. +- `TestExecuteShellScriptCommandBuilder` (`Octopus.Tentacle.Tests.Integration/Util/Builders/`) composes the script command: `.SetScriptBody(ScriptBuilder)`, `.WithIsolationLevel(...)`, `.WithIsolationMutexName(...)`, `.Build()`. +- `TentacleConfigurationTestCase.CreateBuilder()` and `ClientAndTentacleBuilder` set up real Tentacle + Halibut for the test. Same as existing tests. +- `TentacleServiceDecoratorBuilder.RecordMethodUsages(...)` decorates the script service so the test can assert how many times each method was called. Use this to verify capability negotiation and call counts for the new `AbandonScript` verb. +- `Wait.For(condition, timeout, onFail, ct)` is the event-driven polling helper. 
Always preferred over `Task.Delay` in test bodies. + +**Pattern to follow:** mirror `ClientScriptExecutionIsolationMutex.cs`. Stuck-script tests should use `ScriptBuilder.WaitForFileToExist(...)` as the "kernel-blocked" simulant rather than `sleep 600`. The file-wait is event-driven and the test can release it on demand by creating the file. For the unkillable variant, combine the file-wait pattern with the `Tentacle.Debug.DisableProcessKill` flag described in the manual test setup so `Hitman` becomes a no-op for the test's duration. + +| Test | Trigger | Verify | +|---|---|---| +| PowerShell + abandon (kill works) | Real PowerShell, `Start-Sleep -Seconds 600`, fire Cancel, normal kill path | Final response is `(Complete, CanceledExitCode)` via the existing path. **Negative check:** abandon log line is NOT present. Confirms we haven't regressed Cancel by accidentally hitting the abandon path. | +| PowerShell + abandon (kill mocked off) | Real PowerShell, sleep; `Hitman` mocked to no-op; fire Cancel; wait; fire AbandonScript | Within 2s of abandon: response is `(Complete, AbandonedExitCode, [...honest log line...])`; mutex is free (verified by starting a second `FullIsolation` script that Acquires within 1s); the real PowerShell process is still alive on the test host (verified via `Process.GetProcessById` outside the test). Test cleanup: kill the leftover PowerShell. | +| **Multi-level-deep hang (ticket-mandated)** | bootstrap → Calamari-shim → user script, with `Hitman` no-op flag set | All verifications from the previous row pass end-to-end through the multi-level launch chain. Confirms abandon works when the stuck process is not the immediate child of Tentacle. | +| Windows workspace cleanup with open handles | Run the abandon path; leave the simulated zombie holding the workspace log file open; call CompleteScript | CompleteScript returns without exception. Tentacle systemLog contains a `Warn` naming the leaked workspace directory. 
Workspace dir on disk still exists (assert via `Directory.Exists`). No exception bubbles up to the calling test (which simulates Server). | +| Polling Tentacle variant | Configure test fixture as Polling | All verifications from the kill-mocked-off row pass against a Polling Tentacle. | + +**End-to-end async thread audit.** Capture `Process.GetCurrentProcess().Threads.Count` 5s into a stuck-script scenario; assert no thread parked attributable to the script pipeline (use named threads or stack-walk via ETW if precise attribution needed). Most reliable proxy: total thread count not higher than baseline + epsilon. + +**Normal-path timing regression check.** Run a 100-iteration benchmark of normal short-script execution (`Write-Host "x"`); compare median wall-clock time vs. a baseline build without the changes. **Verify:** median delta within margin of error. The async swap should not measurably slow normal script execution. + +## Section 5 — Manual testing plan + +Manual scenarios on a real test Tentacle. All scenarios assume the parallel server-side build is deployed. + +### Setup + +- Test Octopus Server with EFT-3295 server-side build. +- Windows Tentacle (primary) + Linux Tentacle (smoke). +- Debug Tentacle build with `Tentacle.Debug.DisableProcessKill=true` making `Hitman.TryKillProcessAndChildrenRecursively` a no-op — simulant for "kill doesn't work" without engineering real kernel-level waits. +- Server-side feature flag `AbandonTentacleScriptOnCancellationTimeoutFeatureToggle` (default ON, configured on the test Octopus Server). + +### Where to find things (reference for verification steps below) + +- **Tentacle systemLog (Windows):** `C:\Octopus\Logs\OctopusTentacle.txt` (or whatever the test instance is configured with — confirm via `Tentacle show-configuration`). +- **Tentacle systemLog (Linux):** `/etc/octopus/<instance>/Logs/OctopusTentacle.txt`. +- **Tentacle workspace root:** `<TentacleHome>/Work/`. Each script gets a subdirectory named after its `ScriptTicket`.
Inside: `bootstrapRunner.log`, `Output.log`, `script.ps1`/`Bootstrap.sh`, the state store file. +- **Script log in UI:** Octopus Server → the task → expand the deployment step. The script log is what the customer sees and is what gets the honest abandon line. +- **Thread count (Windows):** PowerShell `(Get-Process Tentacle).Threads.Count`, or use Process Explorer's Threads tab. Capture before each scenario for a baseline. +- **Thread count (Linux):** `ps -o nlwp= -p $(pgrep -f Tentacle)` returns the LWP (thread) count for the Tentacle process. +- **Capability advertisement:** Tentacle systemLog at startup contains `Negotiated capabilities: [...]` lines and per-connection capability exchanges. Or: temporarily enable Halibut verbose tracing on the server side and inspect the `CapabilitiesResponseV2` payload from this Tentacle. +- **Mutex state in Tentacle log:** grep for `acquiring isolation mutex` / `Lock acquired` / `Releasing lock` lines with the relevant task ID. + +### M1 — Regression smoke (flag ON, normal script) + +Deploy `Write-Host "hello"; Start-Sleep 5; Write-Host "done"`. + +**Verify (all must pass):** +1. Octopus UI task status → **Success** (green tick). +2. Script log in UI shows `hello` and `done`; no abandon line. +3. Tentacle systemLog: `grep "abandon" OctopusTentacle.txt` → zero matches for this task ID. +4. Tentacle systemLog shows the normal acquire/release pair: `grep "<task-id>" OctopusTentacle.txt | grep -E "Lock acquired|Releasing lock"` → both lines present in order. +5. Thread count (sampled 5s after task completes) → within ±2 of pre-test baseline. + +### M2 — Cancel still works (flag ON, killable script) + +`DisableProcessKill=false`. Deploy `Start-Sleep -Seconds 300`. Wait ~10s. Click **Cancel** in Octopus UI. + +**Verify:** +1. UI task status transitions to **Cancelled** within 30s. +2. Tentacle systemLog: `grep "Hitman\|Releasing lock" OctopusTentacle.txt | tail -20` shows the kill attempt followed by mutex release for this task ID. +3.
PowerShell process is gone: `Get-Process powershell -ErrorAction SilentlyContinue` returns nothing for the powershell instance that was running the script. (Match by PID captured from Tentacle log at script start.) +4. `grep "abandon" OctopusTentacle.txt` → zero matches for this task ID. Cancel path was used, not abandon. +5. Deploy a second project to the same Tentacle → starts immediately (mutex was released by the normal Cancel path). + +### M3 — The Philips scenario (flag ON, unkillable script) + +`Tentacle.Debug.DisableProcessKill=true`. Restart Tentacle. Capture thread-count baseline. Deploy `Start-Sleep -Seconds 600`. Note the script's PowerShell PID from the Tentacle log (`grep "Starting powershell" OctopusTentacle.txt | tail -1`). Click **Cancel** after ~10s. Wait for server-side abandon timeout (1–5 min per parallel session config). + +**Verify (all must pass; this is the load-bearing scenario):** + +1. **Server side called Abandon.** Server log (`OctopusServer.txt`) shows an `AbandonScript` call for this task's ticket, timestamped after the Cancel attempt + the server's abandon timeout. If the parallel session hasn't named the call yet, grep for "abandon" in server log. +2. **Honest log line in the customer-visible task log.** Open the task in Octopus UI → expand the deployment step → confirm the line `Tentacle has abandoned this script. The underlying script process may still be running on this host.` is present in the script log section. +3. **Tentacle systemLog records the abandon path.** `grep -A2 "abandon" OctopusTentacle.txt | tail -30` shows: AbandonScript invocation received, abandon token cancelled, mutex released for this task ID, wrapper removed. +4. **Mutex released — load-bearing check.** Immediately deploy a second project (any trivial script, `Write-Host "ok"`) to the same Tentacle. **Pass:** second deployment starts within 5s. **Fail:** queues indefinitely with "Waiting for the script in task..." message. +5. 
**Task UI status = Cancelled** (not a new "Abandoned" state — per ADR-042). +6. **Thread count returned to baseline.** Sample 10s after the abandon. **Pass:** within ±2 of baseline. **Fail:** count grew by 1 or more and stays grown. +7. **The PowerShell process is still alive on the host.** `Get-Process -Id <PID>` returns the process. This is the ticket's "we do not kill the runaway" — verify we didn't accidentally start killing it. Kill it manually at end of test for cleanup. +8. **Exit code in the task log = -48 (AbandonedExitCode)** (or whatever surfaces in the Server-side detail view). Distinguishes from `-43` (CanceledExitCode). + +### M4 — Repeated abandon (thread-leak check under repetition) + +Capture baseline thread count and Tentacle process working-set memory. Run M3 ten times back-to-back (script the loop so each iteration: deploy → cancel → wait for abandon → next). + +**Verify:** +1. Sample thread count after each iteration. **Pass:** count stays within ±5 of baseline across all ten runs. **Fail:** monotonic growth — indicates the chosen option's thread-release mechanism is broken. +2. Sample Tentacle working-set memory after each iteration. **Pass:** stays within ~50MB of baseline (some growth from log buffers etc. is expected). **Fail:** grows by more than ~10MB per iteration — indicates Process objects or zombie tasks are being retained. +3. After all ten runs, deploy a normal project. **Pass:** runs normally, no perf degradation. +4. Kill all leftover `powershell.exe` / `sleep` processes manually at end of test. + +Async should produce zero thread cost per abandon; any growth across runs means the implementation diverged from the design. + +### M5 — Server-side flag off (Tentacle behaves as today) + +Set the server-side `AbandonTentacleScriptOnCancellationTimeoutFeatureToggle` to OFF in the test Octopus Server. Restart Server. Leave Tentacle untouched. + +**Verify:** +1. **Server doesn't dispatch Abandon.** Repeat the M3 setup.
Wait past the would-be 2-minute escalation point. Server log: `grep "AbandonScript" OctopusServer.txt` → zero matches for this task ID. +2. **Tentacle still advertises the capability.** Optional sanity check via Halibut verbose tracing: `CapabilitiesResponseV2` from this Tentacle still contains `AbandonScriptV2`. The flag lives on the Server, not on Tentacle. +3. **Tentacle stays wedged.** Subsequent deployment to this Tentacle queues with "Waiting for the script in task...". Confirms today's behaviour is preserved when Server has the feature off. +4. Recovery: restart Tentacle (the existing workaround). Verify subsequent deployments work again. + +### M6 — Workspace cleanup with open handles (Windows-specific) + +Run M3 to completion. Note the script's `ScriptTicket` from the Tentacle log. + +**Verify:** +1. **Workspace dir still exists.** `dir <TentacleHome>\Work\<ScriptTicket>` returns a directory listing with log files present. The zombie process (or our retained Process object, depending on option chosen) holds open file handles preventing deletion. +2. **systemLog records the failure.** `grep -i "workspace\|delete" OctopusTentacle.txt | grep <ScriptTicket>` shows a `Warn`-level entry naming the directory that could not be deleted, with the underlying I/O exception message. +3. **No propagated exception to Server.** `CompleteScript` returns normally; Server log shows successful completion of the task. **Pass:** no error response from Tentacle, no retry storm in server log. +4. **Tentacle continues to function.** Deploy a third project (not to the wedged workspace). **Pass:** runs normally. +5. **Manual cleanup of leaked workspace works after the zombie process is killed.** Kill the PowerShell process manually; `rmdir /s /q <workspace-dir>` should now succeed. Confirms the leak is bounded (would be reclaimed if we ever added a janitor). + +### M7 — Polling Tentacle variant + +Register a Polling Tentacle against the test server. Repeat M3 setup and execution. + +**Verify:** +1.
All M3 verification points pass with no Polling-specific differences. The Halibut RPC path is the same — only connection initiation direction differs. +2. **Polling-specific check:** during the abandon, Tentacle's polling loop continues. `grep "Polling" OctopusTentacle.txt | tail -20` shows polling activity through the abandon and after. **Pass:** polling not blocked by the abandon flow. +3. After abandon, the Polling Tentacle picks up the next deployment from the server. **Pass:** new deployment dispatched and runs (mutex released). + +### M8 — Linux smoke + +On a Linux Tentacle, deploy a Bash script: `sleep 600`. Repeat M2 (kill works) and M3 (kill mocked off). + +**Verify:** +1. **M2 on Linux:** `ps -p <PID>` shows the bash/sleep process gone after Cancel. Tentacle systemLog shows `Hitman` kill path used. Same outcomes as Windows M2. +2. **M3 on Linux:** all M3 verification points pass. Thread count via `ps -o nlwp= -p $(pgrep -f Tentacle)`. Workspace location: `/etc/octopus/<instance>/Work/<ScriptTicket>/`. +3. **Linux file-handle behaviour differs:** unlike Windows, Linux generally allows deletion of files held open by other processes (the inode survives until the last handle closes). For M6's workspace-cleanup analogue on Linux, the workspace deletion is more likely to succeed even with the zombie process running. Note in test result. +4. Confirms the implementation isn't accidentally Windows-only and behaves sensibly on Linux's different file-handle semantics. + +### M9 — Server escalation ordering + +Server escalation is hardcoded at **2 minutes** post-Cancel for the first release (`AbandonTentacleScriptOnCancellationTimeoutFeatureToggle`'s timeout constant). Not configurable in production; ask the server-side session for a debug-build override constant if you want to run this faster in your test environment. + +**Verify the killable case (no escalation expected):** +1. Run M2 (killable script + cancel). Wait at least 3 minutes. +2.
Server log: `grep "AbandonScript" OctopusServer.txt | grep <task-id>` → **zero matches.** Cancel succeeded inside the 2-minute window; server correctly did not escalate. +3. Tentacle log: zero abandon entries for this task ID. + +**Verify the unkillable case (escalation expected):** +4. Run M3 (kill mocked off + cancel). Wait through the 2-minute timeout (use a stopwatch). +5. Server log: `grep "AbandonScript" OctopusServer.txt | grep <task-id>` → **exactly one match,** timestamped approximately 2 minutes after the Cancel. +6. Tentacle log: one abandon entry for this task ID. + +**Verify the actual-status race case** (server-side session's idempotency concern): +7. Set up M3, but let the script complete naturally just before the 2-minute timer fires (use a script that runs ~110 seconds). +8. Server fires AbandonScript anyway because the completion event hasn't reached it yet. +9. Tentacle returns `(Complete, realExitCode, logs)` — NOT `AbandonedExitCode`. +10. Server task log entry: *Script had already completed before abandon was needed.* Confirms the "abandon was unnecessary" signal works end-to-end. + +**Bug indicators to flag back to the server session:** +- Server calls AbandonScript on every Cancel (even killable cases) → server's escalation predicate is wrong. +- Server retries AbandonScript multiple times for the same ticket → idempotency on the server side broken. +- Server calls AbandonScript before the 2-minute window → timer is wrong. +- Server calls AbandonScript even with the Tentacle capability missing → capability gating broken; should not have scheduled. + +### Sign-off criteria + +To turn the feature flag on by default in a future release: M1–M5 pass on Windows; M3 + M4 pass on Linux; M7 passes on Polling; M9 confirms server escalation policy; M6 confirms workspace leak is bounded and logged. + +## Risks and rollout + +- **Feature flag off by default** for the first release. Customer-by-customer opt-in.
+- **Sequence:** after EFT V1 cleanup closes (target end May 2026), before Task Cap 320, targeting Philips' July self-host release. +- **Telemetry:** count of AbandonScript calls per Tentacle per day. Spike = signal that either Cancel is broken or this feature is masking a different bug. +- **Soak test pre-release:** 1000 normal scripts with the server-side flag ON, verify no resource leak vs. flag OFF baseline. + +## Open questions for external reviewer + +(None remaining. Workspace cleanup policy resolved 2026-05-21 — targeted best-effort gated on `AbandonedExitCode` in the stateStore. No janitor; OS-level state on the host is the customer's responsibility per the ticket.) + +## Coordination — locked with the server-side session (2026-05-21) + +Aligned via Linear thread on EFT-3295 (commenter Jim, both sessions). Items below are locked unless explicitly noted. + +**Contract (final shape):** + +- `ScriptStatusResponseV2 AbandonScript(AbandonScriptCommandV2 command)` on `IScriptServiceV2`. +- `AbandonScriptCommandV2 { ScriptTicket Ticket; long LastLogSequence; }` — same shape as `CancelScriptCommandV2`. Server-side dropped its initial `ServerTaskId` and "cancellation correlation id" proposal; `ScriptTicket` is sufficient. +- Capability name: `AbandonScriptV2`. + +**Idempotency (final):** Tentacle returns actual current status. Already-completed script returns `(Complete, realExitCode, logs)` — distinct from `AbandonedExitCode`, so the server's task log entry can record that the abandon was unnecessary. Unknown/already-cleaned-up ticket returns `(Complete, UnknownScriptExitCode, [])`, matching Cancel's existing shape. + +**Capability check is the primary gate.** Server uses `BackwardsCompatibleAsyncCapabilitiesV2Decorator` to query `AbandonScriptV2` once per session. Capability absent → server does not schedule the abandon dispatch at all. The RPC-fail-then-log path stays as a defensive fallback for capability-cache staleness, not the primary path. 
+ +**One off-switch, server-side:** `AbandonTentacleScriptOnCancellationTimeoutFeatureToggle` (default ON). Governs whether server escalates to AbandonScript at all. No Tentacle-side flag — Tentacle's capability advertisement is binary on build version. (Earlier draft had a Tentacle-side flag too; dropped after PR review surfaced that it can't be cleanly toggled at runtime without versioning the service contract.) + +**Escalation timing (locked for first release):** 2 minutes. Both V1 and V2 execution pipelines escalate to AbandonScript on their next status-poll once cancellation has been pending that long. Hardcoded on the server toggle class, not configurable. Server-side updated 2026-05-21: trigger switched from a delayed NSB message to a polling-loop check; no new timers on the server side. The Tentacle-side contract is unchanged either way. + +**Execution-pipeline scope (server-side, 2026-05-21):** V1 *and* V2 server-side execution pipelines call AbandonScript via the same contract. Philips is V1 self-host so V1 is actually the urgent path. Doesn't change anything Tentacle is building. + +**Post-abandon flow:** + +1. Server calls `AbandonScript` → gets `ScriptStatusResponseV2`. +2. Server publishes `TentacleScriptAbandonedEvent`. +3. Existing post-cancel path proceeds (eventually calls `CompleteScript` downstream). + +Server-side will verify the exact GetStatus-poll-vs-read-from-response detail during their implementation plan. + +**Task log wording:** + +- Tentacle script log (this doc's Section 3): *Tentacle has abandoned this script. The underlying script process may still be running on this host.* +- Server task log (server session's surface). Server session's working proposal: + - On dispatch: *Cancellation hasn't taken effect on Tentacle after 2 minutes. 
Abandoning the script to release the script-isolation mutex.* + - On Tentacle returning `AbandonedExitCode`: *Tentacle abandoned the script.* + - On Tentacle returning a real exit code (abandon unnecessary): *Script had already completed before abandon was needed.* +- I pushed back on the dispatch wording — "script-isolation mutex" exposes internal terminology to the customer. Suggested rewrite: *Cancellation hasn't taken effect on Tentacle after 2 minutes. Abandoning the script so this target can accept new deployments.* Which wording to ship is the server session's call. diff --git a/docs/superpowers/specs/2026-05-25-split-async-migration-from-abandon-feature-design.md b/docs/superpowers/specs/2026-05-25-split-async-migration-from-abandon-feature-design.md new file mode 100644 index 000000000..6db82ed45 --- /dev/null +++ b/docs/superpowers/specs/2026-05-25-split-async-migration-from-abandon-feature-design.md @@ -0,0 +1,131 @@ +# Split async migration into its own PR beneath the abandon feature + +## Context + +Current state: + +- **PR #1226 (`jimpelletier/eft-3295-tentacle-script-abandonment-to-release-the-mutex`)** at commit `583eb46c` contains both: + - The async migration of `SilentProcessRunner.ExecuteCommand` → `ExecuteCommandAsync` (and all its callers). + - The script-abandonment feature (abandon token, `AbandonScriptCommandV2`, `AbandonedExitCode`, RPC method, capability, tests). +- **PR #1235 (`jimpelletier/eft-3295-async-signature-propagation`)** stacks on top of #1226 and pushes the async signature higher into CLI host, Kubernetes paths, `IServiceConfigurator`, `ICommandLineRunner`, etc.
+ +The stack is currently: + +``` +main ← #1226 (abandon + async migration) ← #1235 (push higher) +``` + +## Goal + +Invert the lower half of the stack so the async migration sits beneath the abandon feature: + +``` +main ← [NEW BASE PR] async migration ← #1226 (rebased: abandon feature only) ← #1235 (push higher, unchanged in shape) +``` + +The new base PR is a clean refactoring change that is reviewable and mergeable independently of the abandon feature. #1226 becomes a focused feature PR that adds script abandonment on top of the foundation. + +## Non-goals + +- This spec does NOT cover restructuring #1235. That PR's content remains as it is and continues to stack on top of #1226. +- This spec does NOT widen the scope of the abandon feature. The abandon feature's existing content is preserved, just rebased. +- This spec does NOT attempt to surgically split the existing #1226 commits via cherry-pick or interactive rebase. The commits intermix concerns and would conflict heavily. + +## Approach + +End-state rebuild rather than commit surgery. + +Take the file states at `583eb46c` and split them into two clean sets of changes built from `main`. Each PR is constructed as a small number of logical commits that produce the same final state when stacked. + +### Base PR — "Migrate SilentProcessRunner to async" + +Branch name: `jimpelletier/eft-3295-async-migration-base`. + +Scope: the minimum change required to make `SilentProcessRunner.ExecuteCommand` async, with documented sync↔async boundaries at every immediate caller. + +Contents: + +1. **`SilentProcessRunner`** — `ExecuteCommand` → `ExecuteCommandAsync`. Internal change: `process.WaitForExit()` → `await process.WaitForExitAsync(cancel)`. The `cancel` token is passed directly to `WaitForExitAsync` so the existing cancel semantics are preserved (when cancel fires, the await throws `OperationCanceledException`; the existing `cancel.Register(() => DoOurBestToCleanUp(...))` still fires Kill+Close on a separate thread). 
**No other SilentProcessRunner changes.** `DoOurBestToCleanUp` remains unchanged including the `process.Close()` call. `SafelyWaitForAllOutput` remains unchanged. +2. **NET Framework polyfill** for `WaitForExitAsync` (not available on net48): a `WaitForExitAsyncNetFramework` helper using `Process.Exited` event + `TaskCompletionSource`. +3. **`ISilentProcessRunner`, `CommandLineRunner`, `CommandLineInvocation`** — interface and helper class signatures migrated to async. +4. **Immediate sync callers** — six sites updated with `.GetAwaiter().GetResult()`: + - `PowerShellPrerequisite.Check()` (WPF installer prerequisite) + - `KubernetesDirectoryInformationProvider.GetDriveBytesUsingDu()` (called from `IMemoryCache.GetOrCreate` factory) + - `SystemCtlHelper.RunServiceCommand()` (2 call sites) + - `LinuxServiceConfigurator.WriteUnitFile`, `IsSystemdInstalled`, `HaveSudoPrivileges` (3 call sites) + - `WindowsServiceConfigurator.Sc()` + - `CommandLineRunner.Execute(CommandLineInvocation, ...)` +5. **Sync-boundary comments** — every one of the six sites gets the same comment pattern: "We're in X. Y must be sync because Z. We block with `.GetAwaiter().GetResult()`. This is safe because we're on a plain thread-pool worker — when the async work finishes it can resume on any free thread, so the block resolves normally." +6. **`CapabilitiesServiceV2` nameof change** — replace the `"AbandonScriptV2"` string literal with `nameof(...)`. Small refactor that fits the cleanup theme. + +What this PR does NOT include: + +- No `abandon` parameter on `ExecuteCommandAsync`. +- No removal of `process.Close()` from `DoOurBestToCleanUp`. +- No long-form documentation comments on `DoOurBestToCleanUp`, `SafelyWaitForAllOutput`, or the `WaitForExitAsync` call site (those describe race-related semantics that only matter once the abandon flow is added). +- No grandchild test comment improvements (those describe the async-cancel race the abandon PR fixes). 
+- No abandon-specific contracts, RPC methods, capabilities, env vars, or tests. + +### Stacked PR — "Add Script abandonment feature" (rebased #1226) + +Branch: `jimpelletier/eft-3295-tentacle-script-abandonment-to-release-the-mutex` (force-pushed). + +Scope: the script abandonment feature, building on the async foundation. + +Contents: + +1. **Add `abandon` parameter to `ExecuteCommandAsync`** — second `CancellationToken` parameter. Switch internal await from `WaitForExitAsync(cancel)` to `WaitForExitAsync(abandon)`. Cancel continues to flow through `cancel.Register`. +2. **Remove `process.Close()` from `DoOurBestToCleanUp`** — cancel-path race fix. Now needed because the abandon flow relies on `Exited`-event delivery via `WaitForExitAsync(abandon)`'s TCS, and `Close()` tears down the wait state. +3. **Long-form documentation comments** on `DoOurBestToCleanUp`, `SafelyWaitForAllOutput`, and the `WaitForExitAsync` call site explaining the race, the grandchild-pipe scenario, and the worst-case cancel latency. +4. **Contracts** — `AbandonedExitCode = -48`, `AbandonScriptCommandV2`, `IScriptServiceV2.AbandonScript`, `IAsyncClientScriptServiceV2.AbandonScriptAsync`. +5. **`ScriptServiceV2.AbandonScriptAsync` implementation** — abandon-token wrapper, fires abandon CTS, returns response. +6. **Abandon-token plumbing through `RunningScript`** — constructor accepts abandon token, passes through to `ExecuteCommandAsync`. +7. **`TentacleDebugDisableProcessKill` env var** — test affordance for the stuck-script scenario. +8. **`AbandonScriptV2` capability** — advertised in `CapabilitiesServiceV2`. +9. **Best-effort `workspace.Delete`** gated on `AbandonedExitCode` in `CompleteScriptAsync`. +10. **Abandon-specific tests** — service-layer (`ScriptServiceV2Fixture`) and integration (`ClientScriptExecutionAbandon`, `SilentProcessRunnerFixture.AbandonToken_*`). +11. **Improved grandchild test comments** — rewritten to describe the async behavior being guarded. 
+ +### PR #1235 unchanged + +`jimpelletier/eft-3295-async-signature-propagation` continues to stack on top of #1226 with its 7 push-higher commits. Once #1226 is force-pushed, #1235 may need a rebase to stay clean but its content is the same. + +## Mechanics + +Done in this order to keep each step reversible: + +1. **Capture safety reference**: tag the current state of both branches before mutating anything (one tag per branch tip, since `git tag` accepts a single commit: `git tag claude-safety-2026-05-25-pre-split-1226 <#1226-tip>` and `git tag claude-safety-2026-05-25-pre-split-1235 <#1235-tip>`). The existing `claude-safety-before-rollback` tag stays. +2. **Build the base branch from `main`**: + - Branch `jimpelletier/eft-3295-async-migration-base` from `main`. + - Apply file-level changes for the base PR scope, committing as a small number of logical commits (e.g., "Migrate SilentProcessRunner to async", "Migrate ISilentProcessRunner and CommandLineRunner", "Document sync↔async boundaries", "Use nameof for capability"). + - Push the branch and open the new PR with `main` as base. +3. **Rebuild #1226 on top of the base branch**: + - Hard-reset `jimpelletier/eft-3295-tentacle-script-abandonment-to-release-the-mutex` to the new base branch's tip. + - Apply file-level changes for the abandon feature scope (the delta from base PR's end state to `583eb46c`'s state). + - Commit as a small number of logical commits. + - Force-push #1226 and retarget the PR's base branch to `jimpelletier/eft-3295-async-migration-base`. GitHub will then update the PR diff to show only the abandon-feature delta. +4. **Rebase #1235**: change #1235's branch to be a rebase of its 7 commits on top of the updated #1226. Force-push. + +## Risks + +- **Force-push to #1226** will disrupt any in-flight reviews. PR comments referencing specific commit SHAs will become stale. Mitigated by: tagging the pre-split state for reference; explicitly noting in PR #1226 that history has been rewritten and pointing to the new base PR. +- **CI must pass on the base PR alone**. The base PR's `WaitForExitAsync(cancel)` wiring is behaviourally equivalent to the sync version, so existing cancel tests (e.g.
`ShouldCancelPing`) should pass. To be verified by running the build before opening the PR. +- **Compatibility with #1235**. The 7 push-higher commits build on file states that exist in #1226 today. After the rebase, those file states may shift slightly (e.g., the abandon PR no longer carries the same intermediate commit boundaries). Conflicts during the #1235 rebase are likely but should be mechanical to resolve. + +## Verification + +After the split: + +- `git diff main..base-branch` produces a small focused diff matching the base PR scope above. +- `git diff base-branch..#1226` produces a diff matching the abandon-feature scope above. +- `git diff #1226..#1235` produces the existing 7-commit push-higher diff. +- Build the base branch standalone — must compile and CI must pass. +- Build #1226 stacked on base — must produce the same end-state as `583eb46c` does today. +- Build #1235 stacked on #1226 — must produce the same end-state as it does today. + +## Success criteria + +- New base PR exists at `jimpelletier/eft-3295-async-migration-base` with a clean, focused diff against `main`. +- PR #1226 is rebased to target the new base branch and its diff shows only the abandon feature. +- PR #1235 still works as a stacked PR on top of #1226. +- All three PRs build and pass CI. From 3cbefc25e015540da7e0e047e5dfeedde0fe2b66 Mon Sep 17 00:00:00 2001 From: Jim Pelletier Date: Tue, 26 May 2026 13:07:27 +1000 Subject: [PATCH 22/52] Pass abandon: CancellationToken.None at test/setup callers and extract helper Test infrastructure (Kubernetes setup, tentacle fetcher, PowerShell tests) and the IAsyncClientScriptServiceV2 test decorator now thread the new abandon parameter through. Also extracts SafelyCancelOutputAndErrorRead in SilentProcessRunner so the normal-completion path and the abandon-catch path stay in sync. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../Util/CommandLine/SilentProcessRunner.cs | 16 ++++++++++++---- .../Setup/DockerImageLoader.cs | 6 ++++-- .../Setup/KubernetesAgentInstaller.cs | 9 ++++++--- .../Setup/KubernetesClusterInstaller.cs | 12 ++++++++---- .../Setup/Tooling/HelmDownloader.cs | 3 ++- .../Setup/Tooling/ToolDownloader.cs | 3 ++- .../Tooling/KubeCtlTool.cs | 3 ++- .../PowerShellStartupDetectionTests.cs | 9 ++++++--- .../TentacleFetchers/LinuxTentacleFetcher.cs | 3 ++- .../TestDecoratorsAreCalledInTheCorrectOrder.cs | 5 +++++ 10 files changed, 49 insertions(+), 20 deletions(-) diff --git a/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs b/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs index 7b70ad001..0440b83d9 100644 --- a/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs +++ b/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs @@ -169,14 +169,12 @@ void WriteData(Action action, ManualResetEventSlim resetEvent, DataRecei catch (OperationCanceledException) when (abandon.IsCancellationRequested && !process.HasExited) { info("Tentacle has abandoned this script. The underlying script process may still be running on this host."); - SafelyCancelRead(process.CancelErrorRead, debug); - SafelyCancelRead(process.CancelOutputRead, debug); + SafelyCancelOutputAndErrorRead(process, debug); running = false; return ScriptExitCodes.AbandonedExitCode; } - SafelyCancelRead(process.CancelErrorRead, debug); - SafelyCancelRead(process.CancelOutputRead, debug); + SafelyCancelOutputAndErrorRead(process, debug); SafelyWaitForAllOutput(outputResetEvent, cancel, debug); SafelyWaitForAllOutput(errorResetEvent, cancel, debug); @@ -235,6 +233,16 @@ static void SafelyWaitForAllOutput(ManualResetEventSlim outputResetEvent, } } + static void SafelyCancelOutputAndErrorRead(Process process, Action debug) + { + // Stops the OutputDataReceived / ErrorDataReceived handlers from firing further. 
+ // Called in both the normal completion path and the abandon path; extracted here + // so the two callers stay consistent (a missed CancelXxxRead leaves the async + // readers firing during dispose, which can throw against the workspace log writer). + SafelyCancelRead(process.CancelErrorRead, debug); + SafelyCancelRead(process.CancelOutputRead, debug); + } + static void SafelyCancelRead(Action action, Action debug) { try diff --git a/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/DockerImageLoader.cs b/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/DockerImageLoader.cs index 97972eba8..36401bede 100644 --- a/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/DockerImageLoader.cs +++ b/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/DockerImageLoader.cs @@ -47,7 +47,8 @@ public DockerImageLoader(TemporaryDirectory temporaryDirectory, ILogger logger, tags.Add(line); }, sprLogger.Error, - cancel: CancellationToken.None + cancel: CancellationToken.None, + abandon: CancellationToken.None ); if (exitCode != 0) @@ -76,7 +77,8 @@ async Task LoadImageIntoKind(string mostRecentTag, string clusterName) sprLogger.Debug, sprLogger.Information, sprLogger.Error, - cancel: CancellationToken.None + cancel: CancellationToken.None, + abandon: CancellationToken.None ); if (exitCode != 0) diff --git a/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/KubernetesAgentInstaller.cs b/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/KubernetesAgentInstaller.cs index 72f23d2a0..6790a7f3e 100644 --- a/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/KubernetesAgentInstaller.cs +++ b/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/KubernetesAgentInstaller.cs @@ -63,7 +63,8 @@ public async Task InstallAgent(int listeningPort, string? 
tentacleImageA sprLogger.Debug, sprLogger.Information, sprLogger.Error, - cancel: CancellationToken.None); + cancel: CancellationToken.None, + abandon: CancellationToken.None); sw.Stop(); @@ -181,7 +182,8 @@ async Task GetAgentThumbprint() thumbprint = x; }, sprLogger.Error, - cancel: CancellationToken.None); + cancel: CancellationToken.None, + abandon: CancellationToken.None); if (exitCode != 0) { @@ -234,7 +236,8 @@ public void Dispose() logger.Debug, logger.Information, logger.Error, - cancel: CancellationToken.None).GetAwaiter().GetResult(); + cancel: CancellationToken.None, + abandon: CancellationToken.None).GetAwaiter().GetResult(); if (exitCode != 0) { diff --git a/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/KubernetesClusterInstaller.cs b/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/KubernetesClusterInstaller.cs index 94008d36e..3dd40d969 100644 --- a/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/KubernetesClusterInstaller.cs +++ b/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/KubernetesClusterInstaller.cs @@ -61,7 +61,8 @@ async Task InstallCluster(ClusterVersion clusterVersion) logger.Debug, logger.Information, logger.Error, - cancel: CancellationToken.None); + cancel: CancellationToken.None, + abandon: CancellationToken.None); sw.Stop(); @@ -100,7 +101,8 @@ async Task SetLocalhostRouting() sprLogger.Debug, sprLogger.Information, sprLogger.Error, - cancel: CancellationToken.None); + cancel: CancellationToken.None, + abandon: CancellationToken.None); if (exitCode != 0) { @@ -150,7 +152,8 @@ async Task InstallNfsCsiDriver() sprLogger.Debug, sprLogger.Information, sprLogger.Error, - cancel: CancellationToken.None); + cancel: CancellationToken.None, + abandon: CancellationToken.None); if (exitCode != 0) { @@ -187,7 +190,8 @@ public void Dispose() logger.Debug, logger.Information, logger.Error, - cancel: CancellationToken.None).GetAwaiter().GetResult(); + cancel: CancellationToken.None, + abandon: 
CancellationToken.None).GetAwaiter().GetResult(); if (exitCode != 0) { diff --git a/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/Tooling/HelmDownloader.cs b/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/Tooling/HelmDownloader.cs index 6ea1d2736..0aa73e6a7 100644 --- a/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/Tooling/HelmDownloader.cs +++ b/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/Tooling/HelmDownloader.cs @@ -86,7 +86,8 @@ async Task ExtractTarGzip(string gzArchiveName, string destFolder) Logger.Debug, Logger.Information, Logger.Error, - cancel: CancellationToken.None); + cancel: CancellationToken.None, + abandon: CancellationToken.None); if (exitCode != 0) { diff --git a/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/Tooling/ToolDownloader.cs b/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/Tooling/ToolDownloader.cs index 6e586a355..8778b9267 100644 --- a/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/Tooling/ToolDownloader.cs +++ b/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/Tooling/ToolDownloader.cs @@ -49,7 +49,8 @@ public async Task Download(string targetDirectory, CancellationToken can Logger.Debug, Logger.Information, Logger.Error, - cancel: CancellationToken.None); + cancel: CancellationToken.None, + abandon: CancellationToken.None); if (exitCode != 0) { diff --git a/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Tooling/KubeCtlTool.cs b/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Tooling/KubeCtlTool.cs index 0b3354159..72a67b8a7 100644 --- a/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Tooling/KubeCtlTool.cs +++ b/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Tooling/KubeCtlTool.cs @@ -55,7 +55,8 @@ async Task ExecuteCommand(string command, CancellationToke sprLogger.Error(y); stdErr.Add(y); }, - cancel: cancellationToken); + cancel: cancellationToken, + abandon: CancellationToken.None); return new (exitCode, 
stdOut, stdErr); } diff --git a/source/Octopus.Tentacle.Tests.Integration/PowerShellStartupDetectionTests.cs b/source/Octopus.Tentacle.Tests.Integration/PowerShellStartupDetectionTests.cs index 4d11a567a..fb08a988c 100644 --- a/source/Octopus.Tentacle.Tests.Integration/PowerShellStartupDetectionTests.cs +++ b/source/Octopus.Tentacle.Tests.Integration/PowerShellStartupDetectionTests.cs @@ -278,7 +278,8 @@ public async Task WhenPowerShellNeverStarts_WeShouldDetectTheScriptDidNotStart_A _ => { }, line => directOutput.Add(line), line => directOutput.Add(line), - cancel: CancellationToken.None); + cancel: CancellationToken.None, + abandon: CancellationToken.None); var directOutputText = string.Join("\n", directOutput); Logger.Information("Direct invocation output:\n{Output}", directOutputText); @@ -345,7 +346,8 @@ public async Task WhenPowerShellNeverStarts_AndWorkspaceIsDeletedBeforeScriptRun _ => { }, line => directOutput.Add(line), line => directOutput.Add(line), - cancel: CancellationToken.None); + cancel: CancellationToken.None, + abandon: CancellationToken.None); var directOutputText = string.Join("\n", directOutput); Logger.Information("Direct invocation output:\n{Output}", directOutputText); @@ -378,7 +380,8 @@ static IShell GetShellForCurrentPlatform() _ => { }, _ => { }, customEnvironmentVariables: new Dictionary(), - cancel: CancellationToken.None) + cancel: CancellationToken.None, + abandon: CancellationToken.None) // Safe: static helper, no synchronisation context. 
.GetAwaiter().GetResult(); diff --git a/source/Octopus.Tentacle.Tests.Integration/Support/TentacleFetchers/LinuxTentacleFetcher.cs b/source/Octopus.Tentacle.Tests.Integration/Support/TentacleFetchers/LinuxTentacleFetcher.cs index 298bcc4c6..e81b214f3 100644 --- a/source/Octopus.Tentacle.Tests.Integration/Support/TentacleFetchers/LinuxTentacleFetcher.cs +++ b/source/Octopus.Tentacle.Tests.Integration/Support/TentacleFetchers/LinuxTentacleFetcher.cs @@ -69,7 +69,8 @@ public static async Task ExtractTarGzipAsync(string gzArchiveName, string destFo log, log, log, - cancel: CancellationToken.None); + cancel: CancellationToken.None, + abandon: CancellationToken.None); if (exitCode != 0) { diff --git a/source/Octopus.Tentacle.Tests.Integration/Support/TestDecoratorsAreCalledInTheCorrectOrder.cs b/source/Octopus.Tentacle.Tests.Integration/Support/TestDecoratorsAreCalledInTheCorrectOrder.cs index 09cbc58fb..fc3fd090b 100644 --- a/source/Octopus.Tentacle.Tests.Integration/Support/TestDecoratorsAreCalledInTheCorrectOrder.cs +++ b/source/Octopus.Tentacle.Tests.Integration/Support/TestDecoratorsAreCalledInTheCorrectOrder.cs @@ -105,6 +105,11 @@ public Task CancelScriptAsync(CancelScriptCommandV2 comm throw new NotImplementedException(); } + public Task AbandonScriptAsync(AbandonScriptCommandV2 command, HalibutProxyRequestOptions proxyRequestOptions) + { + throw new NotImplementedException(); + } + public async Task CompleteScriptAsync(CompleteScriptCommandV2 command, HalibutProxyRequestOptions proxyRequestOptions) { await Task.CompletedTask; From a811ad42b07f6088c7321db6458f839f1d08beec Mon Sep 17 00:00:00 2001 From: Jim Pelletier Date: Wed, 27 May 2026 11:08:09 +1000 Subject: [PATCH 23/52] Fix advertised AbandonScriptV2 capability string CapabilitiesServiceV2 was advertising nameof(IAsyncClientScriptServiceV2 .AbandonScriptAsync) which evaluates to the method name "AbandonScriptAsync", not the intended capability name. 
Both the unit test fixture (CapabilitiesServiceV2Fixture) and the EFT-3295 commit subject expect "AbandonScriptV2"; the integration test in CapabilitiesServiceV2Test had the same nameof() mistake. Replace with the literal "AbandonScriptV2" in production and the integration test, drop the now-unused IAsyncClientScriptServiceV2 import. Verify against the server's expected capability name before merging. --- .../CapabilitiesServiceV2Test.cs | 9 ++++----- .../Services/Capabilities/CapabilitiesServiceV2.cs | 3 +-- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/source/Octopus.Tentacle.Tests.Integration/CapabilitiesServiceV2Test.cs b/source/Octopus.Tentacle.Tests.Integration/CapabilitiesServiceV2Test.cs index 5107b2306..96a8e6d30 100644 --- a/source/Octopus.Tentacle.Tests.Integration/CapabilitiesServiceV2Test.cs +++ b/source/Octopus.Tentacle.Tests.Integration/CapabilitiesServiceV2Test.cs @@ -7,7 +7,6 @@ using NUnit.Framework; using Octopus.Tentacle.Contracts; using Octopus.Tentacle.Contracts.Capabilities; -using Octopus.Tentacle.Contracts.ClientServices; using Octopus.Tentacle.Contracts.KubernetesScriptServiceV1; using Octopus.Tentacle.Contracts.ScriptServiceV2; using Octopus.Tentacle.Tests.Integration.Common.Builders.Decorators; @@ -43,11 +42,11 @@ public async Task CapabilitiesFromAnOlderTentacleWhichHasNoCapabilitiesService_W // tentacleConfigurationTestCase.Version == null indicates the "latest" build under // test (the code in this branch). Test cases with a concrete Version exercise older // released tentacles fetched from S3 to verify backwards compatibility. Older builds - // pre-date EFT-3295 and don't advertise the AbandonScriptAsync capability, so we only + // pre-date EFT-3295 and don't advertise the AbandonScriptV2 capability, so we only // assert it for the latest build. 
if (version == null) { - capabilities.Should().Contain(nameof(IAsyncClientScriptServiceV2.AbandonScriptAsync)); + capabilities.Should().Contain("AbandonScriptV2"); expectedCapabilitiesCount++; } @@ -78,11 +77,11 @@ public async Task CapabilitiesServiceDoesNotReturnKubernetesScriptServiceForNonK // tentacleConfigurationTestCase.Version == null indicates the "latest" build under // test (the code in this branch). Test cases with a concrete Version exercise older // released tentacles fetched from S3 to verify backwards compatibility. Older builds - // pre-date EFT-3295 and don't advertise the AbandonScriptAsync capability, so we only + // pre-date EFT-3295 and don't advertise the AbandonScriptV2 capability, so we only // assert it for the latest build. if (version == null) { - capabilities.Should().Contain(nameof(IAsyncClientScriptServiceV2.AbandonScriptAsync)); + capabilities.Should().Contain("AbandonScriptV2"); expectedCapabilitiesCount++; } diff --git a/source/Octopus.Tentacle/Services/Capabilities/CapabilitiesServiceV2.cs b/source/Octopus.Tentacle/Services/Capabilities/CapabilitiesServiceV2.cs index 545f55c73..87136c37e 100644 --- a/source/Octopus.Tentacle/Services/Capabilities/CapabilitiesServiceV2.cs +++ b/source/Octopus.Tentacle/Services/Capabilities/CapabilitiesServiceV2.cs @@ -3,7 +3,6 @@ using System.Threading.Tasks; using Octopus.Tentacle.Contracts; using Octopus.Tentacle.Contracts.Capabilities; -using Octopus.Tentacle.Contracts.ClientServices; using Octopus.Tentacle.Contracts.KubernetesScriptServiceV1; using Octopus.Tentacle.Contracts.ScriptServiceV2; using Octopus.Tentacle.Core.Services; @@ -25,7 +24,7 @@ public async Task GetCapabilitiesAsync(CancellationToken } //non-kubernetes agent tentacles only support the standard script services - return new CapabilitiesResponseV2(new List { nameof(IScriptService), nameof(IFileTransferService), nameof(IScriptServiceV2), nameof(IAsyncClientScriptServiceV2.AbandonScriptAsync) }); + return new 
CapabilitiesResponseV2(new List { nameof(IScriptService), nameof(IFileTransferService), nameof(IScriptServiceV2), "AbandonScriptV2" }); } } } \ No newline at end of file From e37f98b377d4c1d921a3f18c7067cfc7d452bc85 Mon Sep 17 00:00:00 2001 From: Jim Pelletier Date: Thu, 28 May 2026 12:45:42 +1000 Subject: [PATCH 24/52] Rename WaitForGrandchild* to reflect that it waits for an arbitrary PID Address PR review on #1226: the method waits for whatever PID you pass it, not specifically a grandchild process. Renamed to WaitForPidFileAsync and updated call sites. Also updated the timeout-exception message inside the method to drop grandchild-specific wording. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../Util/SilentProcessRunnerFixture.cs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs b/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs index 270ad301f..ff6a4c38f 100644 --- a/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs +++ b/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs @@ -213,7 +213,7 @@ public async Task CancellationToken_WhenGrandchildHoldsRedirectedPipes_ShouldNot cts.Token)); // Wait for the grandchild to actually be spawned before cancelling - await WaitForGrandchildSpawnAsync(grandchildPidFile, TimeSpan.FromSeconds(60)); + await WaitForPidFileAsync(grandchildPidFile, TimeSpan.FromSeconds(60)); var sw = Stopwatch.StartNew(); cts.Cancel(); @@ -277,7 +277,7 @@ public async Task CancellationToken_WhenUnixGrandchildHoldsRedirectedPipes_Shoul out _, cts.Token)); - await WaitForGrandchildSpawnAsync(grandchildPidFile, TimeSpan.FromSeconds(30)); + await WaitForPidFileAsync(grandchildPidFile, TimeSpan.FromSeconds(30)); var sw = Stopwatch.StartNew(); cts.Cancel(); @@ -333,7 +333,7 @@ public async Task AbandonToken_ShouldReturnAbandonedExitCodeWithoutKillingProces abandon: 
abandonCts.Token)); // Wait deterministically for the process to write its PID before we abandon - await WaitForGrandchildSpawnAsync(pidFile, TimeSpan.FromSeconds(30)); + await WaitForPidFileAsync(pidFile, TimeSpan.FromSeconds(30)); abandonCts.Cancel(); try @@ -361,7 +361,7 @@ public async Task AbandonToken_ShouldReturnAbandonedExitCodeWithoutKillingProces } } - static async Task WaitForGrandchildSpawnAsync(string pidFile, TimeSpan timeout) + static async Task WaitForPidFileAsync(string pidFile, TimeSpan timeout) { var deadline = DateTime.UtcNow + timeout; while (DateTime.UtcNow < deadline) @@ -371,8 +371,8 @@ static async Task WaitForGrandchildSpawnAsync(string pidFile, TimeSpan timeout) await Task.Delay(50); } throw new TimeoutException( - $"Test setup failed: the grandchild PID was never written to '{pidFile}'. " + - $"The grandchild-pipe scenario is not being exercised."); + $"Test setup failed: a valid PID was never written to '{pidFile}'. " + + $"The scenario under test is not being exercised."); } static string SafelyReadAllText(string path) From 2bcd40cf5a37b876c69589a629ab06d6101f3094 Mon Sep 17 00:00:00 2001 From: Jim Pelletier Date: Thu, 28 May 2026 12:50:46 +1000 Subject: [PATCH 25/52] Apply ScriptServiceV2 and RunningScript review comments PR review on #1226: - ScriptServiceV2.StartScriptAsync: tighter comment on the double-call guard at the script-state check (per Jim's suggestion). - ScriptServiceV2.AbandonScriptAsync: rewrite the abandon-effect comment to spell out the WaitForExitAsync return path and the server's follow-up GetStatus pattern. - ScriptServiceV2.CompleteScriptAsync: remove a stale comment on the running-script bookkeeping disposal, expand the abandon-aware workspace.Delete comment, and link to the customer docs and AbandonScriptCommandV2 contract. - ScriptServiceV2.WasAbandoned: extracted helper covering the state-store-load + AbandonedExitCode check so the conditional in CompleteScriptAsync reads cleanly. 
- RunningScript.Execute: condensed the three catch-clause comments into one-liners explaining why each fires (abandon vs cancel vs mutex timeout) rather than restating the behaviour. --- .../Services/Scripts/RunningScript.cs | 8 ++-- .../Services/Scripts/ScriptServiceV2.cs | 43 ++++++++++--------- 2 files changed, 25 insertions(+), 26 deletions(-) diff --git a/source/Octopus.Tentacle.Core/Services/Scripts/RunningScript.cs b/source/Octopus.Tentacle.Core/Services/Scripts/RunningScript.cs index 0ed327ba1..f1369a11f 100644 --- a/source/Octopus.Tentacle.Core/Services/Scripts/RunningScript.cs +++ b/source/Octopus.Tentacle.Core/Services/Scripts/RunningScript.cs @@ -102,21 +102,19 @@ public async Task Execute() : await RunScriptAsync(shellPath, writer, runningScriptToken, abandonToken); } } + // Fires when the caller abandoned the script: leave the OS process running and signal the distinct AbandonedExitCode so the server can tell it apart from a cancel. catch (OperationCanceledException) when (abandonToken.IsCancellationRequested) { - // Distinguish the abandon path from cancel: when the abandon token fires, - // we don't try to kill the underlying script process. Logging it as - // "abandoned" rather than "canceled" makes the deployment log honest about - // what happened, and surfacing AbandonedExitCode (-48) lets the caller - // (the Octopus Server) treat it differently from a normal cancel exit. writer.WriteOutput(ProcessOutputSource.StdOut, "Script execution abandoned."); exitCode = ScriptExitCodes.AbandonedExitCode; } + // Fires when the caller cancelled the script and the underlying process honored the cancellation token. catch (OperationCanceledException) { writer.WriteOutput(ProcessOutputSource.StdOut, "Script execution canceled."); exitCode = ScriptExitCodes.CanceledExitCode; } + // Fires when acquiring the isolation mutex timed out before the script could start. 
catch (TimeoutException) { writer.WriteOutput(ProcessOutputSource.StdOut, "Script execution timed out."); diff --git a/source/Octopus.Tentacle.Core/Services/Scripts/ScriptServiceV2.cs b/source/Octopus.Tentacle.Core/Services/Scripts/ScriptServiceV2.cs index 6774874d1..3d6a7e9db 100644 --- a/source/Octopus.Tentacle.Core/Services/Scripts/ScriptServiceV2.cs +++ b/source/Octopus.Tentacle.Core/Services/Scripts/ScriptServiceV2.cs @@ -72,8 +72,7 @@ public async Task StartScriptAsync(StartScriptCommandV2 { IScriptWorkspace workspace; - // If the state already exists then this runningScript is already running/has already run and we should not run it again. - // StartScript may be called multiple times for the same ticket (e.g. server retries), so we guard against double-launching. + // StartScript may be called multiple times for the same ticket (e.g. if server retries the tentacle command), so we must guard against actually starting the script twice. if (runningScript.ScriptStateStore.Exists()) { var state = runningScript.ScriptStateStore.Load(); @@ -144,11 +143,10 @@ public async Task CancelScriptAsync(CancelScriptCommandV public Task AbandonScriptAsync(AbandonScriptCommandV2 command, CancellationToken cancellationToken) { - // Fires the abandon token (so Execute will return AbandonedExitCode on its next - // unwind) and returns the current status snapshot immediately. The caller (the - // Octopus Server) polls GetStatus to observe the eventual Complete + AbandonedExitCode, - // same as for the cancel flow, so there's no need to block the RPC handler waiting - // for the running script to reach Complete state. + // Triggers the abandon token so `process.WaitForExitAsync` will return in + // SilentProcessRunner.ExecuteCommandAsync which means the call to GetResponse() + // below may have the final exit code for the script.
Otherwise the sender of + // the command (Octopus Server) will get the result on a subsequent call to `GetStatus` if (runningScripts.TryGetValue(command.Ticket, out var runningScript)) { runningScript.Abandon(); @@ -159,9 +157,6 @@ public Task AbandonScriptAsync(AbandonScriptCommandV2 co public async Task CompleteScriptAsync(CompleteScriptCommandV2 command, CancellationToken cancellationToken) { - // Stop tracking and dispose the running-script bookkeeping. The underlying - // OS process may or may not still be running depending on whether this - // script completed normally, was cancelled, or was abandoned. if (runningScripts.TryRemove(command.Ticket, out var runningScript)) { runningScript.Dispose(); @@ -169,18 +164,17 @@ public async Task CompleteScriptAsync(CompleteScriptCommandV2 command, Cancellat var workspace = workspaceFactory.GetWorkspace(command.Ticket, WorkspaceReadinessCheck.Skip); - // For abandoned scripts the underlying OS process is, by design, still alive - // and may still hold open file handles inside the workspace (logs being written - // to, working files, etc.). workspace.Delete() will fail in that case on - // Windows (sharing violations) and may partially delete on Linux. Tolerate - // the failure: the workspace will be left on disk and reaped by another - // mechanism (manual cleanup, instance restart). For all other completion paths + // For abandoned scripts (see AbandonScriptCommandV2 and + // https://octopus.com/docs/infrastructure/deployment-targets/tentacle/tentacle-script-abandonment) + // the underlying OS process is, by design, still alive + // and unable to be killed by Tentacle. It may still hold open file handles inside + // the workspace (logs being written to, working files, etc.). workspace.Delete() + // will fail in that case on Windows due to sharing violations and may partially + // delete on Linux. 
We need to tolerate the failure, which will leave the workspace + // on disk to hopefully be cleaned up by another mechanism (manual cleanup, + // instance restart) etc. This is the best we can do. For all other completion paths // the process has exited and Delete should succeed; surface any failure there. - var stateStore = scriptStateStoreFactory.Create(workspace); - var wasAbandoned = stateStore.Exists() - && stateStore.Load().ExitCode == ScriptExitCodes.AbandonedExitCode; - - if (wasAbandoned) + if (WasAbandoned(workspace)) { try { @@ -197,6 +191,13 @@ public async Task CompleteScriptAsync(CompleteScriptCommandV2 command, Cancellat } } + bool WasAbandoned(IScriptWorkspace workspace) + { + var stateStore = scriptStateStoreFactory.Create(workspace); + return stateStore.Exists() + && stateStore.Load().ExitCode == ScriptExitCodes.AbandonedExitCode; + } + RunningScript LaunchShell(ScriptTicket ticket, string serverTaskId, IScriptWorkspace workspace, IScriptStateStore stateStore, CancellationToken cancellationToken, CancellationToken abandonToken) { var runningScript = new RunningScript(shell, workspace, stateStore, workspace.CreateLog(), serverTaskId, scriptIsolationMutex, cancellationToken, abandonToken, environmentVariables, powerShellStartupTimeout, log); From 63c10fb156096e8b49b014a29571a61c5677a6c5 Mon Sep 17 00:00:00 2001 From: Jim Pelletier Date: Thu, 28 May 2026 12:58:28 +1000 Subject: [PATCH 26/52] Use nameof for the advertised AbandonScript capability; restructure tests Address PR review on #1226: - Production: replace the "AbandonScriptV2" literal with nameof(ScriptServiceV2.AbandonScriptAsync) on the non-Kubernetes capabilities path, matching the nameof pattern used for every other capability string in the codebase. Wire string is now "AbandonScriptAsync". - Unit fixture: same nameof in both AbandonScript-related assertions. - Drop the redundant .Count.Should().Be(N) assertions; BeEquivalentTo already checks element count. 
- Integration test: revert the AbandonScript assertions that were added inside CapabilitiesFromAnOlderTentacleWhichHasNoCapabilitiesService_... and CapabilitiesServiceDoesNotReturnKubernetesScriptServiceForNonKubernetesTentacle. Each test's scope is its own concern; AbandonScript advertisement gets a dedicated test (LatestTentacle_AdvertisesAbandonScriptCapability), modeled on the K8s-specific test pattern in the same file. --- .../CapabilitiesServiceV2Test.cs | 34 +++++++------------ .../CapabilitiesServiceV2Fixture.cs | 9 +++-- .../Capabilities/CapabilitiesServiceV2.cs | 3 +- 3 files changed, 18 insertions(+), 28 deletions(-) diff --git a/source/Octopus.Tentacle.Tests.Integration/CapabilitiesServiceV2Test.cs b/source/Octopus.Tentacle.Tests.Integration/CapabilitiesServiceV2Test.cs index 96a8e6d30..abe0cbbea 100644 --- a/source/Octopus.Tentacle.Tests.Integration/CapabilitiesServiceV2Test.cs +++ b/source/Octopus.Tentacle.Tests.Integration/CapabilitiesServiceV2Test.cs @@ -9,6 +9,7 @@ using Octopus.Tentacle.Contracts.Capabilities; using Octopus.Tentacle.Contracts.KubernetesScriptServiceV1; using Octopus.Tentacle.Contracts.ScriptServiceV2; +using Octopus.Tentacle.Core.Services.Scripts; using Octopus.Tentacle.Tests.Integration.Common.Builders.Decorators; using Octopus.Tentacle.Tests.Integration.Support; using Octopus.Tentacle.Tests.Integration.Util.Builders; @@ -39,17 +40,6 @@ public async Task CapabilitiesFromAnOlderTentacleWhichHasNoCapabilitiesService_W expectedCapabilitiesCount++; } - // tentacleConfigurationTestCase.Version == null indicates the "latest" build under - // test (the code in this branch). Test cases with a concrete Version exercise older - // released tentacles fetched from S3 to verify backwards compatibility. Older builds - // pre-date EFT-3295 and don't advertise the AbandonScriptV2 capability, so we only - // assert it for the latest build. 
- if (version == null) - { - capabilities.Should().Contain("AbandonScriptV2"); - expectedCapabilitiesCount++; - } - capabilities.Count.Should().Be(expectedCapabilitiesCount); } @@ -74,22 +64,22 @@ public async Task CapabilitiesServiceDoesNotReturnKubernetesScriptServiceForNonK expectedCapabilitiesCount++; } - // tentacleConfigurationTestCase.Version == null indicates the "latest" build under - // test (the code in this branch). Test cases with a concrete Version exercise older - // released tentacles fetched from S3 to verify backwards compatibility. Older builds - // pre-date EFT-3295 and don't advertise the AbandonScriptV2 capability, so we only - // assert it for the latest build. - if (version == null) - { - capabilities.Should().Contain("AbandonScriptV2"); - expectedCapabilitiesCount++; - } - capabilities.Should().NotContain(nameof(IKubernetesScriptServiceV1)); capabilities.Count.Should().Be(expectedCapabilitiesCount); } + [Test] + [TentacleConfigurations] + public async Task LatestTentacle_AdvertisesAbandonScriptCapability(TentacleConfigurationTestCase tentacleConfigurationTestCase) + { + await using var clientAndTentacle = await tentacleConfigurationTestCase.CreateLegacyBuilder().Build(CancellationToken); + + var capabilities = (await clientAndTentacle.TentacleClient.CapabilitiesServiceV2.GetCapabilitiesAsync(new(CancellationToken))).SupportedCapabilities; + + capabilities.Should().Contain(nameof(ScriptServiceV2.AbandonScriptAsync)); + } + [Test] [TentacleConfigurations(testCapabilitiesServiceVersions: true)] public async Task CapabilitiesResponseShouldBeCached(TentacleConfigurationTestCase tentacleConfigurationTestCase) diff --git a/source/Octopus.Tentacle.Tests/Capabilities/CapabilitiesServiceV2Fixture.cs b/source/Octopus.Tentacle.Tests/Capabilities/CapabilitiesServiceV2Fixture.cs index ab34074c6..e219353cd 100644 --- a/source/Octopus.Tentacle.Tests/Capabilities/CapabilitiesServiceV2Fixture.cs +++ 
b/source/Octopus.Tentacle.Tests/Capabilities/CapabilitiesServiceV2Fixture.cs @@ -6,6 +6,7 @@ using Octopus.Tentacle.Contracts; using Octopus.Tentacle.Contracts.KubernetesScriptServiceV1; using Octopus.Tentacle.Contracts.ScriptServiceV2; +using Octopus.Tentacle.Core.Services.Scripts; using Octopus.Tentacle.Kubernetes; using Octopus.Tentacle.Services.Capabilities; @@ -20,8 +21,7 @@ public async Task CapabilitiesAreReturned() .GetCapabilitiesAsync(CancellationToken.None)) .SupportedCapabilities; - capabilities.Should().BeEquivalentTo(nameof(IScriptService), nameof(IFileTransferService), nameof(IScriptServiceV2), "AbandonScriptV2"); - capabilities.Count.Should().Be(4); + capabilities.Should().BeEquivalentTo(nameof(IScriptService), nameof(IFileTransferService), nameof(IScriptServiceV2), nameof(ScriptServiceV2.AbandonScriptAsync)); capabilities.Should().NotContainMatch("IKubernetesScriptService*"); } @@ -36,7 +36,6 @@ public async Task OnlyKubernetesScriptServicesAreReturnedWhenRunningAsKubernetes .SupportedCapabilities; capabilities.Should().BeEquivalentTo(nameof(IFileTransferService), nameof(IKubernetesScriptServiceV1)); - capabilities.Count.Should().Be(2); capabilities.Should().NotContainMatch("IScriptService*"); @@ -48,7 +47,7 @@ public async Task GetCapabilities_OnNonKubernetesTentacle_AdvertisesAbandonScrip { var service = new CapabilitiesServiceV2(); var response = await service.GetCapabilitiesAsync(CancellationToken.None); - response.SupportedCapabilities.Should().Contain("AbandonScriptV2"); + response.SupportedCapabilities.Should().Contain(nameof(ScriptServiceV2.AbandonScriptAsync)); } [Test] @@ -58,7 +57,7 @@ public async Task GetCapabilities_OnKubernetesTentacle_DoesNotAdvertiseAbandonSc var service = new CapabilitiesServiceV2(); var response = await service.GetCapabilitiesAsync(CancellationToken.None); - response.SupportedCapabilities.Should().NotContain("AbandonScriptV2"); + 
response.SupportedCapabilities.Should().NotContain(nameof(ScriptServiceV2.AbandonScriptAsync)); Environment.SetEnvironmentVariable(KubernetesConfig.NamespaceVariableName, null); } diff --git a/source/Octopus.Tentacle/Services/Capabilities/CapabilitiesServiceV2.cs b/source/Octopus.Tentacle/Services/Capabilities/CapabilitiesServiceV2.cs index 87136c37e..a4537f26a 100644 --- a/source/Octopus.Tentacle/Services/Capabilities/CapabilitiesServiceV2.cs +++ b/source/Octopus.Tentacle/Services/Capabilities/CapabilitiesServiceV2.cs @@ -6,6 +6,7 @@ using Octopus.Tentacle.Contracts.KubernetesScriptServiceV1; using Octopus.Tentacle.Contracts.ScriptServiceV2; using Octopus.Tentacle.Core.Services; +using Octopus.Tentacle.Core.Services.Scripts; using Octopus.Tentacle.Util; namespace Octopus.Tentacle.Services.Capabilities @@ -24,7 +25,7 @@ public async Task GetCapabilitiesAsync(CancellationToken } //non-kubernetes agent tentacles only support the standard script services - return new CapabilitiesResponseV2(new List { nameof(IScriptService), nameof(IFileTransferService), nameof(IScriptServiceV2), "AbandonScriptV2" }); + return new CapabilitiesResponseV2(new List { nameof(IScriptService), nameof(IFileTransferService), nameof(IScriptServiceV2), nameof(ScriptServiceV2.AbandonScriptAsync) }); } } } \ No newline at end of file From 1f0ac09a53c9b46cd3d577754ddb19f9cea7084d Mon Sep 17 00:00:00 2001 From: Jim Pelletier Date: Thu, 28 May 2026 13:07:31 +1000 Subject: [PATCH 27/52] Address SilentProcessRunner comment + env var review feedback PR review on #1226: - Rewrote the outputResetEvent.Wait comment per Jim's suggestion (explains the EOF wait semantics, the no-pipe-close behaviour, and the grandchild case). - Clarified the 'workspace log writer' comment by naming the failure mode explicitly (late OutputDataReceived after the workspace was disposed would throw ObjectDisposedException). 
- Expanded the process.Close() removal comment with the full old-sync / new-async explanation Jim wrote, including a link to the Microsoft docs on Process.WaitForExitAsync output-processing behaviour. - Renamed TentacleDebugDisableProcessKill to TentacleDebugDisableProcessKill_UNSAFE_FOR_PRODUCTION so the variable's danger is obvious to anyone reading it; updated the declaration in EnvironmentVariables and the references in SilentProcessRunner's Hitman helper and the ClientScriptExecutionAbandon integration tests. --- .../Util/CommandLine/SilentProcessRunner.cs | 56 ++++++++++--------- .../Util/EnvironmentVariables.cs | 2 +- .../ClientScriptExecutionAbandon.cs | 18 +++--- 3 files changed, 40 insertions(+), 36 deletions(-) diff --git a/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs b/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs index 0440b83d9..eacf79a2a 100644 --- a/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs +++ b/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs @@ -211,10 +211,10 @@ static void SafelyWaitForAllOutput(ManualResetEventSlim outputResetEvent, CancellationToken cancel, Action debug) { - // Waits for the OutputDataReceived/ErrorDataReceived handler to signal EOF on the - // stream (it sets the reset event when it receives a null DataReceivedEventArgs.Data, - // which is .NET's EOF marker). This does NOT close the pipe — it just gives the OS - // up to 5 seconds to deliver the EOF. + // outputResetEvent.Wait is waiting for the OutputDataReceived/ErrorDataReceived + // handlers to signal EOF on the stream (when it receives a null + // DataReceivedEventArgs.Data, .NET's EOF marker). This does NOT close the pipe, + // it just gives the OS up to 5 seconds to deliver the EOF. // // If a re-parented grandchild is holding the pipe open, EOF never arrives, the wait // times out, and we proceed without the final flush of buffered output. 
The pipe is @@ -235,10 +235,11 @@ static void SafelyWaitForAllOutput(ManualResetEventSlim outputResetEvent, static void SafelyCancelOutputAndErrorRead(Process process, Action debug) { - // Stops the OutputDataReceived / ErrorDataReceived handlers from firing further. - // Called in both the normal completion path and the abandon path; extracted here - // so the two callers stay consistent (a missed CancelXxxRead leaves the async - // readers firing during dispose, which can throw against the workspace log writer). + // Cancel the output/error readers so a late OutputDataReceived/ErrorDataReceived + // callback doesn't try to write to a workspace log that's already been disposed by + // the using-block above; that write would throw ObjectDisposedException. Called in + // both the normal completion path and the abandon path; extracted here so the two + // callers stay consistent. SafelyCancelRead(process.CancelErrorRead, debug); SafelyCancelRead(process.CancelOutputRead, debug); } @@ -273,38 +274,41 @@ static void DoOurBestToCleanUp(Process process, Action error) error($"Failed to kill the launched process: {killProcessException}"); } } - // Do NOT add process.Close() here. The pre-async version of this code did, and adding - // it back will cause cancel to hang forever. Here's the full picture: + // We have removed process.Close() here. The pre-async version of this code did this, and adding + // it back will cause cancel to hang forever. Here's why: // - // OLD SYNC CODE: the calling thread blocked inside process.WaitForExit() (no-timeout - // overload), which waits for BOTH the process to exit AND the redirected stream + // OLD SYNC CODE: the calling thread blocked inside SilentProcessRunner.Execute() on + // process.WaitForExit() (specifically the overload which does not specify a timeout), + // which waits for BOTH the process to exit AND the redirected stream // readers to reach EOF. 
If a re-parented grandchild held our stdout/stderr open, the // stream readers never reached EOF, so WaitForExit() blocked forever. Calling // process.Close() during cancel-cleanup forced the Process object to release its // handles to the redirected pipes, which made the readers see EOF, which let // WaitForExit() return. That's why Close() was here. // - // NEW ASYNC CODE: the calling thread awaits a TaskCompletionSource that completes + // NEW ASYNC CODE: the calling thread in SilentProcessRunner.Execute + // at process.WaitForExitAsync awaits a TaskCompletionSource that completes // when the Process.Exited event fires. WaitForExitAsync does NOT wait on the // redirected streams (Microsoft confirms in the docs: "output processing will not - // have completed when this method returns"). So a grandchild holding pipes open - // can't hang the await. The original reason for Close() is gone. + // have completed when this method returns" — see + // https://learn.microsoft.com/en-us/dotnet/api/system.diagnostics.process.waitforexitasync). + // So a grandchild holding pipes open can't hang the await. The original reason for + // Close() is gone. // - // WHY ADDING Close() BACK IS WORSE THAN USELESS: process.Close() detaches the Process - // object from the underlying OS process, which tears down the wait state that - // produces the Exited event. If Close() runs before the kernel has signalled the - // exit to .NET (which is asynchronous — Hitman.Kill returns immediately, the OS - // delivers the exit notification some time later), the Exited event never fires, - // our TCS never completes, and the await hangs forever. Every cancel races. + // Why adding Close() back is harmful and maybe why this is a code comment not a + // PR comment: process.Close() detaches the Process object from the underlying + // OS process, which tears down the wait state that produces the Exited event. 
If + // Close() runs before the kernel has signalled the exit to .NET (which is + // asynchronous: Hitman.Kill returns immediately and the OS delivers the exit + // notification some time later), the Exited event never fires, our TaskCompletionSource + // never completes, and the await at process.WaitForExitAsync hangs forever. // - // HOW PIPES ACTUALLY GET RELEASED NOW: + // How pipes get released now: // 1. After WaitForExitAsync returns, SafelyWaitForAllOutput waits up to 5 seconds // per stream for EOF. If a grandchild holds the pipes, this times out and we // proceed (it bounds cancel latency; it does NOT close anything). // 2. The outer `using (var process = new Process())` block calls Process.Dispose - // at end of method, which calls Close internally. Because we're no longer - // awaiting WaitForExitAsync at this point, the Close-vs-Exited race can't - // happen — the wait state is already torn down by our code, not by Close. + // at end of method, which calls Close internally. // // Worst case cancel latency with grandchild holding pipes: ~10s (5s × 2 streams). // Covered by tests in SilentProcessRunnerFixture: @@ -360,7 +364,7 @@ class Hitman { public static void TryKillProcessAndChildrenRecursively(Process process) { - if (!string.IsNullOrEmpty(Environment.GetEnvironmentVariable(EnvironmentVariables.TentacleDebugDisableProcessKill))) + if (!string.IsNullOrEmpty(Environment.GetEnvironmentVariable(EnvironmentVariables.TentacleDebugDisableProcessKill_UNSAFE_FOR_PRODUCTION))) { // Test-only no-op: simulate "kill was attempted but didn't terminate the process". // Only activated when the test harness sets this env var on the Tentacle process.
diff --git a/source/Octopus.Tentacle.Core/Util/EnvironmentVariables.cs b/source/Octopus.Tentacle.Core/Util/EnvironmentVariables.cs index 4293edee7..faf01eb00 100644 --- a/source/Octopus.Tentacle.Core/Util/EnvironmentVariables.cs +++ b/source/Octopus.Tentacle.Core/Util/EnvironmentVariables.cs @@ -29,7 +29,7 @@ public static class EnvironmentVariables public const string TentacleMachineConfigurationHomeDirectory = "TentacleMachineConfigurationHomeDirectory"; public const string TentaclePollingConnectionCount = "TentaclePollingConnectionCount"; public const string TentaclePowerShellStartupTimeout = "TentaclePowerShellStartupTimeout"; - public const string TentacleDebugDisableProcessKill = "TentacleDebugDisableProcessKill"; + public const string TentacleDebugDisableProcessKill_UNSAFE_FOR_PRODUCTION = "TentacleDebugDisableProcessKill_UNSAFE_FOR_PRODUCTION"; public const string NfsWatchdogDirectory = "watchdog_directory"; public static string TentacleUseTcpNoDelay = "TentacleUseTcpNoDelay"; public static string TentacleUseAsyncListener = "TentacleUseAsyncListener"; diff --git a/source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs b/source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs index 8e801f885..87c883060 100644 --- a/source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs +++ b/source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs @@ -21,12 +21,12 @@ public class ClientScriptExecutionAbandon : IntegrationTest [TentacleConfigurations(scriptServiceToTest: ScriptServiceVersionToTest.Version2)] public async Task AbandonScript_WhenCancelFailsToKillProcess_ReturnsAbandonedExitCode(TentacleConfigurationTestCase tentacleConfigurationTestCase) { - // TentacleDebugDisableProcessKill=1 makes Hitman a no-op, so CancelScript cannot - // actually terminate the underlying script process. The script becomes genuinely - // "stuck" from Tentacle's perspective. 
AbandonScript should then return promptly - // with AbandonedExitCode without waiting for the process to exit. + // TentacleDebugDisableProcessKill_UNSAFE_FOR_PRODUCTION=1 makes Hitman a no-op, so + // CancelScript cannot actually terminate the underlying script process. The script + // becomes genuinely "stuck" from Tentacle's perspective. AbandonScript should then + // return promptly with AbandonedExitCode without waiting for the process to exit. await using var clientTentacle = await tentacleConfigurationTestCase.CreateBuilder() - .WithTentacle(x => x.WithRunTentacleEnvironmentVariable(EnvironmentVariables.TentacleDebugDisableProcessKill, "1")) + .WithTentacle(x => x.WithRunTentacleEnvironmentVariable(EnvironmentVariables.TentacleDebugDisableProcessKill_UNSAFE_FOR_PRODUCTION, "1")) .Build(CancellationToken); var startFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "start"); @@ -85,11 +85,11 @@ public async Task AbandonScript_ReleasesIsolationMutexEvenWhileProcessIsStillRun { // The whole reason Tentacle needs an abandon RPC is to release the isolation mutex // when CancelScript can't unstick the script. This test proves that contract: a - // FullIsolation script gets stuck (because TentacleDebugDisableProcessKill makes - // cancel a no-op), abandon is called, and a second FullIsolation script with the - // same mutex name must then be able to acquire the mutex and run. + // FullIsolation script gets stuck (because TentacleDebugDisableProcessKill_UNSAFE_FOR_PRODUCTION + // makes cancel a no-op), abandon is called, and a second FullIsolation script with + // the same mutex name must then be able to acquire the mutex and run. 
await using var clientTentacle = await tentacleConfigurationTestCase.CreateBuilder() - .WithTentacle(x => x.WithRunTentacleEnvironmentVariable(EnvironmentVariables.TentacleDebugDisableProcessKill, "1")) + .WithTentacle(x => x.WithRunTentacleEnvironmentVariable(EnvironmentVariables.TentacleDebugDisableProcessKill_UNSAFE_FOR_PRODUCTION, "1")) .Build(CancellationToken); var startFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "start"); From bef90e961963f984335712b97f8f3d02ab8888c0 Mon Sep 17 00:00:00 2001 From: Jim Pelletier Date: Thu, 28 May 2026 13:27:23 +1000 Subject: [PATCH 28/52] Drop explicit CancellationToken.None passes to ExecuteCommandAsync PR review on #1226: the call sites that didn't have a token chain to forward were passing `cancel: CancellationToken.None, abandon: CancellationToken.None` explicitly to ExecuteCommandAsync. The signature already defaults both, so the explicit None is noise. Drop it at every site that doesn't have a real token to pass. Leaves the defaults in place so CA2016 can be enabled later to catch the 'I have a token in scope and forgot to forward it' case at the analyzer level. 
--- .../PreReq/PowerShellPrerequisite.cs | 4 +--- .../Setup/DockerImageLoader.cs | 8 ++------ .../Setup/KubernetesAgentInstaller.cs | 12 +++--------- .../Setup/KubernetesClusterInstaller.cs | 16 ++++------------ .../Setup/Tooling/HelmDownloader.cs | 4 +--- .../Setup/Tooling/ToolDownloader.cs | 4 +--- .../Tooling/KubeCtlTool.cs | 3 +-- .../PowerShellStartupDetectionTests.cs | 12 +++--------- .../TentacleFetchers/LinuxTentacleFetcher.cs | 4 +--- .../Util/SilentProcessRunnerFixture.cs | 3 +-- .../KubernetesDirectoryInformationProvider.cs | 2 +- .../Startup/WindowsServiceConfigurator.cs | 4 +--- .../Octopus.Tentacle/Util/CommandLineRunner.cs | 4 +--- 13 files changed, 21 insertions(+), 59 deletions(-) diff --git a/source/Octopus.Manager.Tentacle/PreReq/PowerShellPrerequisite.cs b/source/Octopus.Manager.Tentacle/PreReq/PowerShellPrerequisite.cs index 94ea0751f..4f13e91b4 100644 --- a/source/Octopus.Manager.Tentacle/PreReq/PowerShellPrerequisite.cs +++ b/source/Octopus.Manager.Tentacle/PreReq/PowerShellPrerequisite.cs @@ -55,9 +55,7 @@ await SilentProcessRunnerExtended.ExecuteCommandAsync( arguments, ".", stdOut.WriteLine, - s => stdErr.WriteLine($"ERR: {s}"), - cancel: CancellationToken.None, - abandon: CancellationToken.None); + s => stdErr.WriteLine($"ERR: {s}")); var outputText = stdOut.ToString(); new SystemLog().Verbose("PowerShell prerequisite check output: " + outputText); diff --git a/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/DockerImageLoader.cs b/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/DockerImageLoader.cs index 36401bede..9c7fa6412 100644 --- a/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/DockerImageLoader.cs +++ b/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/DockerImageLoader.cs @@ -46,9 +46,7 @@ public DockerImageLoader(TemporaryDirectory temporaryDirectory, ILogger logger, sprLogger.Information(line); tags.Add(line); }, - sprLogger.Error, - cancel: CancellationToken.None, - abandon: 
CancellationToken.None + sprLogger.Error ); if (exitCode != 0) @@ -76,9 +74,7 @@ async Task LoadImageIntoKind(string mostRecentTag, string clusterName) temporaryDirectory.DirectoryPath, sprLogger.Debug, sprLogger.Information, - sprLogger.Error, - cancel: CancellationToken.None, - abandon: CancellationToken.None + sprLogger.Error ); if (exitCode != 0) diff --git a/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/KubernetesAgentInstaller.cs b/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/KubernetesAgentInstaller.cs index 6790a7f3e..9828a8581 100644 --- a/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/KubernetesAgentInstaller.cs +++ b/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/KubernetesAgentInstaller.cs @@ -62,9 +62,7 @@ public async Task InstallAgent(int listeningPort, string? tentacleImageA temporaryDirectory.DirectoryPath, sprLogger.Debug, sprLogger.Information, - sprLogger.Error, - cancel: CancellationToken.None, - abandon: CancellationToken.None); + sprLogger.Error); sw.Stop(); @@ -181,9 +179,7 @@ async Task GetAgentThumbprint() sprLogger.Information(x); thumbprint = x; }, - sprLogger.Error, - cancel: CancellationToken.None, - abandon: CancellationToken.None); + sprLogger.Error); if (exitCode != 0) { @@ -235,9 +231,7 @@ public void Dispose() temporaryDirectory.DirectoryPath, logger.Debug, logger.Information, - logger.Error, - cancel: CancellationToken.None, - abandon: CancellationToken.None).GetAwaiter().GetResult(); + logger.Error).GetAwaiter().GetResult(); if (exitCode != 0) { diff --git a/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/KubernetesClusterInstaller.cs b/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/KubernetesClusterInstaller.cs index 3dd40d969..d231d0ff4 100644 --- a/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/KubernetesClusterInstaller.cs +++ b/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/KubernetesClusterInstaller.cs @@ -60,9 +60,7 @@ async 
Task InstallCluster(ClusterVersion clusterVersion) tempDir.DirectoryPath, logger.Debug, logger.Information, - logger.Error, - cancel: CancellationToken.None, - abandon: CancellationToken.None); + logger.Error); sw.Stop(); @@ -100,9 +98,7 @@ async Task SetLocalhostRouting() tempDir.DirectoryPath, sprLogger.Debug, sprLogger.Information, - sprLogger.Error, - cancel: CancellationToken.None, - abandon: CancellationToken.None); + sprLogger.Error); if (exitCode != 0) { @@ -151,9 +147,7 @@ async Task InstallNfsCsiDriver() tempDir.DirectoryPath, sprLogger.Debug, sprLogger.Information, - sprLogger.Error, - cancel: CancellationToken.None, - abandon: CancellationToken.None); + sprLogger.Error); if (exitCode != 0) { @@ -189,9 +183,7 @@ public void Dispose() tempDir.DirectoryPath, logger.Debug, logger.Information, - logger.Error, - cancel: CancellationToken.None, - abandon: CancellationToken.None).GetAwaiter().GetResult(); + logger.Error).GetAwaiter().GetResult(); if (exitCode != 0) { diff --git a/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/Tooling/HelmDownloader.cs b/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/Tooling/HelmDownloader.cs index 0aa73e6a7..8c8126f96 100644 --- a/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/Tooling/HelmDownloader.cs +++ b/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/Tooling/HelmDownloader.cs @@ -85,9 +85,7 @@ async Task ExtractTarGzip(string gzArchiveName, string destFolder) tmp.DirectoryPath, Logger.Debug, Logger.Information, - Logger.Error, - cancel: CancellationToken.None, - abandon: CancellationToken.None); + Logger.Error); if (exitCode != 0) { diff --git a/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/Tooling/ToolDownloader.cs b/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/Tooling/ToolDownloader.cs index 8778b9267..b91c4409b 100644 --- a/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/Tooling/ToolDownloader.cs +++ 
b/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/Tooling/ToolDownloader.cs @@ -48,9 +48,7 @@ public async Task Download(string targetDirectory, CancellationToken can targetDirectory, Logger.Debug, Logger.Information, - Logger.Error, - cancel: CancellationToken.None, - abandon: CancellationToken.None); + Logger.Error); if (exitCode != 0) { diff --git a/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Tooling/KubeCtlTool.cs b/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Tooling/KubeCtlTool.cs index 72a67b8a7..0b3354159 100644 --- a/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Tooling/KubeCtlTool.cs +++ b/source/Octopus.Tentacle.Kubernetes.Tests.Integration/Tooling/KubeCtlTool.cs @@ -55,8 +55,7 @@ async Task ExecuteCommand(string command, CancellationToke sprLogger.Error(y); stdErr.Add(y); }, - cancel: cancellationToken, - abandon: CancellationToken.None); + cancel: cancellationToken); return new (exitCode, stdOut, stdErr); } diff --git a/source/Octopus.Tentacle.Tests.Integration/PowerShellStartupDetectionTests.cs b/source/Octopus.Tentacle.Tests.Integration/PowerShellStartupDetectionTests.cs index fb08a988c..05fce368e 100644 --- a/source/Octopus.Tentacle.Tests.Integration/PowerShellStartupDetectionTests.cs +++ b/source/Octopus.Tentacle.Tests.Integration/PowerShellStartupDetectionTests.cs @@ -277,9 +277,7 @@ public async Task WhenPowerShellNeverStarts_WeShouldDetectTheScriptDidNotStart_A workspace.WorkingDirectory, _ => { }, line => directOutput.Add(line), - line => directOutput.Add(line), - cancel: CancellationToken.None, - abandon: CancellationToken.None); + line => directOutput.Add(line)); var directOutputText = string.Join("\n", directOutput); Logger.Information("Direct invocation output:\n{Output}", directOutputText); @@ -345,9 +343,7 @@ public async Task WhenPowerShellNeverStarts_AndWorkspaceIsDeletedBeforeScriptRun workspace.WorkingDirectory, _ => { }, line => directOutput.Add(line), - line => directOutput.Add(line), - cancel: 
CancellationToken.None, - abandon: CancellationToken.None); + line => directOutput.Add(line)); var directOutputText = string.Join("\n", directOutput); Logger.Information("Direct invocation output:\n{Output}", directOutputText); @@ -379,9 +375,7 @@ static IShell GetShellForCurrentPlatform() _ => { }, _ => { }, _ => { }, - customEnvironmentVariables: new Dictionary(), - cancel: CancellationToken.None, - abandon: CancellationToken.None) + customEnvironmentVariables: new Dictionary()) // Safe: static helper, no synchronisation context. .GetAwaiter().GetResult(); diff --git a/source/Octopus.Tentacle.Tests.Integration/Support/TentacleFetchers/LinuxTentacleFetcher.cs b/source/Octopus.Tentacle.Tests.Integration/Support/TentacleFetchers/LinuxTentacleFetcher.cs index e81b214f3..2bf480ebd 100644 --- a/source/Octopus.Tentacle.Tests.Integration/Support/TentacleFetchers/LinuxTentacleFetcher.cs +++ b/source/Octopus.Tentacle.Tests.Integration/Support/TentacleFetchers/LinuxTentacleFetcher.cs @@ -68,9 +68,7 @@ public static async Task ExtractTarGzipAsync(string gzArchiveName, string destFo tmp.DirectoryPath, log, log, - log, - cancel: CancellationToken.None, - abandon: CancellationToken.None); + log); if (exitCode != 0) { diff --git a/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs b/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs index ff6a4c38f..830a39527 100644 --- a/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs +++ b/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs @@ -534,8 +534,7 @@ static int Execute( Console.WriteLine($"{DateTime.UtcNow} ERR: {x}"); error.Append(x); }, - cancel: cancel, - abandon: CancellationToken.None).GetAwaiter().GetResult(); + cancel: cancel).GetAwaiter().GetResult(); debugMessages = debug; infoMessages = info; diff --git a/source/Octopus.Tentacle/Kubernetes/KubernetesDirectoryInformationProvider.cs 
b/source/Octopus.Tentacle/Kubernetes/KubernetesDirectoryInformationProvider.cs index dc6a990cf..e71c512d7 100644 --- a/source/Octopus.Tentacle/Kubernetes/KubernetesDirectoryInformationProvider.cs +++ b/source/Octopus.Tentacle/Kubernetes/KubernetesDirectoryInformationProvider.cs @@ -71,7 +71,7 @@ public KubernetesDirectoryInformationProvider(ISystemLog log, ISilentProcessRunn { var stdOut = new List(); var stdErr = new List(); - var exitCode = await silentProcessRunner.ExecuteCommandAsync("du", $"-s -B 1 {directoryPath}", "/", stdOut.Add, stdErr.Add, abandon: CancellationToken.None); + var exitCode = await silentProcessRunner.ExecuteCommandAsync("du", $"-s -B 1 {directoryPath}", "/", stdOut.Add, stdErr.Add); if (exitCode != 0) { diff --git a/source/Octopus.Tentacle/Startup/WindowsServiceConfigurator.cs b/source/Octopus.Tentacle/Startup/WindowsServiceConfigurator.cs index f7ab12992..6a3f0358b 100644 --- a/source/Octopus.Tentacle/Startup/WindowsServiceConfigurator.cs +++ b/source/Octopus.Tentacle/Startup/WindowsServiceConfigurator.cs @@ -368,9 +368,7 @@ async Task ScAsync(string arguments) arguments, Environment.CurrentDirectory, output => outputBuilder.AppendLine(output), - error => outputBuilder.AppendLine("Error: " + error), - cancel: CancellationToken.None, - abandon: CancellationToken.None); + error => outputBuilder.AppendLine("Error: " + error)); if (exitCode == 0) logFileOnlyLogger.Info(outputBuilder.ToString()); else diff --git a/source/Octopus.Tentacle/Util/CommandLineRunner.cs b/source/Octopus.Tentacle/Util/CommandLineRunner.cs index 69fbae7a5..b19903eff 100644 --- a/source/Octopus.Tentacle/Util/CommandLineRunner.cs +++ b/source/Octopus.Tentacle/Util/CommandLineRunner.cs @@ -90,9 +90,7 @@ public async Task ExecuteAsync(CommandLineInvocation invocation, Environment.CurrentDirectory, debug, info, - error, - cancel: CancellationToken.None, - abandon: CancellationToken.None); + error); if (exitCode != 0) { From 46882eb53eeeeb580e99dea69c081937019da12b Mon Sep 17 
00:00:00 2001 From: Jim Pelletier Date: Thu, 28 May 2026 13:32:40 +1000 Subject: [PATCH 29/52] Hide WaitForExit pragma inside WaitForProcessExitAsync wrapper Address PR review on #1226: the inline #if NETFRAMEWORK / #else block at the WaitForExit call site, plus the separate WaitForExitAsyncNetFramework helper, are collapsed into a single WaitForProcessExitAsync method whose body contains the pragma. The call site reads as one ordinary await; the comments above it refer to WaitForProcessExitAsync as the named place where we block, matching the long process.Close() removal comment. --- .../Util/CommandLine/SilentProcessRunner.cs | 27 ++++++++++--------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs b/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs index eacf79a2a..e5d4eeda5 100644 --- a/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs +++ b/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs @@ -160,11 +160,7 @@ void WriteData(Action action, ManualResetEventSlim resetEvent, DataRecei // resulting Exited event is what unblocks this await on cancel. // `abandon` is a separate token used by EFT-3295 to stop waiting // WITHOUT killing the process — see the catch block below. -#if NETFRAMEWORK - await WaitForExitAsyncNetFramework(process, abandon).ConfigureAwait(false); -#else - await process.WaitForExitAsync(abandon).ConfigureAwait(false); -#endif + await WaitForProcessExitAsync(process, abandon).ConfigureAwait(false); } catch (OperationCanceledException) when (abandon.IsCancellationRequested && !process.HasExited) { @@ -287,8 +283,8 @@ static void DoOurBestToCleanUp(Process process, Action error) // WaitForExit() return. That's why Close() was here. 
// // NEW ASYNC CODE: the calling thread in SilentProcessRunner.Execute - // at process.WaitForExitAsync awaits a TaskCompletionSource that completes - // when the Process.Exited event fires. WaitForExitAsync does NOT wait on the + // at WaitForProcessExitAsync awaits a TaskCompletionSource that completes + // when the Process.Exited event fires. WaitForProcessExitAsync does NOT wait on the // redirected streams (Microsoft confirms in the docs: "output processing will not // have completed when this method returns" — see // https://learn.microsoft.com/en-us/dotnet/api/system.diagnostics.process.waitforexitasync). @@ -301,10 +297,10 @@ static void DoOurBestToCleanUp(Process process, Action error) // Close() runs before the kernel has signalled the exit to .NET (which is // asynchronous: when Hitman.Kill returns immediately, the OS delivers the exit // notification some time later), the Exited event never fires, our TaskCompletionSource - // never completes, and the await at process.WaitForExitAsync hangs forever. + // never completes, and the await at WaitForProcessExitAsync hangs forever. // // How pipes get released now: - // 1. After WaitForExitAsync returns, SafelyWaitForAllOutput waits up to 5 seconds + // 1. After WaitForProcessExitAsync returns, SafelyWaitForAllOutput waits up to 5 seconds // per stream for EOF. If a grandchild holds the pipes, this times out and we // proceed (it bounds cancel latency; it does NOT close anything). // 2. The outer `using (var process = new Process())` block calls Process.Dispose @@ -317,10 +313,13 @@ static void DoOurBestToCleanUp(Process process, Action error) // Both assert cancel returns within 30s in this scenario. } -#if NETFRAMEWORK - // WaitForExitAsync is not available on .NET Framework 4.x; polyfill using Process.Exited event + TaskCompletionSource. - static Task WaitForExitAsyncNetFramework(Process process, CancellationToken cancellationToken) + // Single place we block waiting for the spawned process to exit. 
+ // On .NET Framework we use a TaskCompletionSource polyfill because + // Process.WaitForExitAsync doesn't exist there; on .NET 8+ we use the + // framework method directly. + static Task WaitForProcessExitAsync(Process process, CancellationToken cancellationToken) { +#if NETFRAMEWORK var tcs = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); CancellationTokenRegistration registration = default; @@ -348,8 +347,10 @@ void OnExited(object? sender, EventArgs e) } return tcs.Task; - } +#else + return process.WaitForExitAsync(cancellationToken); #endif + } [DllImport("kernel32.dll", SetLastError = true)] #pragma warning disable PC003 // Native API not available in UWP From 67a8c2b0c0f307168e657e551030284d06715788 Mon Sep 17 00:00:00 2001 From: Jim Pelletier Date: Thu, 28 May 2026 13:40:12 +1000 Subject: [PATCH 30/52] Expose AbandonScript on TentacleClient Address PR review on #1226: ClientAndTentacle previously exposed the HalibutRuntime so the abandon integration test could construct its own IAsyncClientScriptServiceV2 proxy and call AbandonScriptAsync directly. TentacleClient now surfaces AbandonScript as a first-class method, the test uses it, and the direct-Halibut exposure is rolled back. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../ITentacleClient.cs | 12 +++++++ .../Octopus.Tentacle.Client/TentacleClient.cs | 31 +++++++++++++++++++ .../ClientScriptExecutionAbandon.cs | 8 ++--- .../Support/ClientAndTentacle.cs | 12 +++---- .../TentacleClientExtensionMethods.cs | 12 +++++++ 5 files changed, 63 insertions(+), 12 deletions(-) diff --git a/source/Octopus.Tentacle.Client/ITentacleClient.cs b/source/Octopus.Tentacle.Client/ITentacleClient.cs index c79a285fc..5c6915642 100644 --- a/source/Octopus.Tentacle.Client/ITentacleClient.cs +++ b/source/Octopus.Tentacle.Client/ITentacleClient.cs @@ -8,6 +8,7 @@ using Octopus.Tentacle.Client.Scripts.Models; using Octopus.Tentacle.Contracts; using Octopus.Tentacle.Contracts.Logging; +using Octopus.Tentacle.Contracts.ScriptServiceV2; namespace Octopus.Tentacle.Client { @@ -59,6 +60,17 @@ Task StartScript(ExecuteScriptCommand command, /// The result, which includes the CommandContext for the next command Task CancelScript(CommandContext commandContext, ITentacleClientTaskLog logger); + /// + /// Abandon a running script. Signals Tentacle to release the script's isolation mutex + /// and clean up its workspace without waiting for the underlying process to exit. + /// Used as an escape hatch when CancelScript cannot terminate a stuck process. + /// + /// The ticket of the script to abandon + /// Used to output user orientated log messages + /// Cancels the RPC call + /// The current status snapshot of the script at the time abandon was processed + Task AbandonScript(ScriptTicket scriptTicket, ITentacleClientTaskLog logger, CancellationToken cancellationToken); + /// /// Complete the script. 
/// diff --git a/source/Octopus.Tentacle.Client/TentacleClient.cs b/source/Octopus.Tentacle.Client/TentacleClient.cs index 8176cccc0..cf3d60ef0 100644 --- a/source/Octopus.Tentacle.Client/TentacleClient.cs +++ b/source/Octopus.Tentacle.Client/TentacleClient.cs @@ -16,6 +16,7 @@ using Octopus.Tentacle.Contracts.Capabilities; using Octopus.Tentacle.Contracts.Logging; using Octopus.Tentacle.Contracts.Observability; +using Octopus.Tentacle.Contracts.ScriptServiceV2; using ITentacleClientObserver = Octopus.Tentacle.Contracts.Observability.ITentacleClientObserver; namespace Octopus.Tentacle.Client @@ -260,6 +261,36 @@ public async Task CancelScript(CommandContext co return await scriptExecutor.CancelScript(commandContext); } + public async Task AbandonScript(ScriptTicket scriptTicket, ITentacleClientTaskLog logger, CancellationToken cancellationToken) + { + using var activity = ActivitySource.StartActivity($"{nameof(TentacleClient)}.{nameof(AbandonScript)}"); + activity?.AddTag("octopus.tentacle.script.ticket", scriptTicket.TaskId); + + var operationMetricsBuilder = ClientOperationMetricsBuilder.Start(); + + async Task AbandonScriptAction(CancellationToken ct) + { + var request = new AbandonScriptCommandV2(scriptTicket, lastLogSequence: 0); + return await allClients.ScriptServiceV2.AbandonScriptAsync(request, new HalibutProxyRequestOptions(ct)); + } + + try + { + return await rpcCallExecutor.Execute( + retriesEnabled: clientOptions.RpcRetrySettings.RetriesEnabled, + RpcCall.Create(nameof(IScriptServiceV2.AbandonScript)), + AbandonScriptAction, + logger, + operationMetricsBuilder, + cancellationToken).ConfigureAwait(false); + } + catch (Exception e) + { + operationMetricsBuilder.Failure(e, cancellationToken); + throw; + } + } + public async Task CompleteScript(CommandContext commandContext, ITentacleClientTaskLog logger, CancellationToken scriptExecutionCancellationToken) { using var activity = 
ActivitySource.StartActivity($"{nameof(TentacleClient)}.{nameof(CompleteScript)}"); diff --git a/source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs b/source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs index 87c883060..3592c56cd 100644 --- a/source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs +++ b/source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs @@ -57,9 +57,7 @@ await scriptServiceV2.CancelScriptAsync( // Abandon: fires the abandon token. The RPC returns the current status snapshot // immediately, so we poll GetStatus until the script reaches Complete state. - await scriptServiceV2.AbandonScriptAsync( - new AbandonScriptCommandV2(firstCommand.ScriptTicket, 0), - new HalibutProxyRequestOptions(CancellationToken)); + await tentacleClient.AbandonScript(firstCommand.ScriptTicket, CancellationToken); ScriptStatusResponseV2 abandonResponse = null!; await Wait.For(async () => @@ -120,9 +118,7 @@ await scriptServiceV2.CancelScriptAsync( new HalibutProxyRequestOptions(CancellationToken)); await Task.Delay(TimeSpan.FromSeconds(1)); - await scriptServiceV2.AbandonScriptAsync( - new AbandonScriptCommandV2(firstCommand.ScriptTicket, 0), - new HalibutProxyRequestOptions(CancellationToken)); + await tentacleClient.AbandonScript(firstCommand.ScriptTicket, CancellationToken); // Second FullIsolation script with the SAME mutex name. If the abandon released // the mutex, this script can acquire it and run to completion. 
Otherwise it would diff --git a/source/Octopus.Tentacle.Tests.Integration/Support/ClientAndTentacle.cs b/source/Octopus.Tentacle.Tests.Integration/Support/ClientAndTentacle.cs index 789522815..f73ef2ee1 100644 --- a/source/Octopus.Tentacle.Tests.Integration/Support/ClientAndTentacle.cs +++ b/source/Octopus.Tentacle.Tests.Integration/Support/ClientAndTentacle.cs @@ -33,12 +33,12 @@ public LegacyTentacleClientBuilder LegacyTentacleClientBuilder() return new LegacyTentacleClientBuilder(halibutRuntime, ServiceEndPoint); } - // The integration test for AbandonScript needs to call AbandonScriptAsync directly - // over the wire to assert on the RPC response shape and to drive the cancel→abandon - // sequence without going through TentacleClient's higher-level ExecuteScript orchestrator. - // TentacleClient deliberately doesn't expose AbandonScript at all today; the server is - // the only production consumer, and it talks to the Halibut client directly too. - // Exposing a direct client here keeps the test focused on the RPC behavior. + // Some integration tests need to invoke ScriptServiceV2 RPCs (CancelScript, GetStatus) + // directly over the wire, without going through TentacleClient's higher-level + // ExecuteScript orchestrator. TentacleClient's CancelScript/GetStatus require a + // CommandContext from a prior orchestrated call, which isn't available when the test + // is interleaving raw RPCs alongside an in-flight ExecuteScript task. Exposing a direct + // client here keeps those tests focused on the RPC behavior they care about. 
public IAsyncClientScriptServiceV2 CreateScriptServiceV2Client() { return halibutRuntime.CreateAsyncClient(ServiceEndPoint); diff --git a/source/Octopus.Tentacle.Tests.Integration/Util/Builders/TentacleClientExtensionMethods.cs b/source/Octopus.Tentacle.Tests.Integration/Util/Builders/TentacleClientExtensionMethods.cs index c0975c788..839ca3c82 100644 --- a/source/Octopus.Tentacle.Tests.Integration/Util/Builders/TentacleClientExtensionMethods.cs +++ b/source/Octopus.Tentacle.Tests.Integration/Util/Builders/TentacleClientExtensionMethods.cs @@ -9,6 +9,7 @@ using Octopus.Tentacle.Client.Scripts.Models; using Octopus.Tentacle.Contracts; using Octopus.Tentacle.Contracts.Logging; +using Octopus.Tentacle.Contracts.ScriptServiceV2; using Octopus.Tentacle.Tests.Integration.Support; using Octopus.Tentacle.Tests.Integration.Support.ExtensionMethods; @@ -56,6 +57,17 @@ public static async Task UploadFile( return result; } + public static async Task AbandonScript( + this TentacleClient tentacleClient, + ScriptTicket scriptTicket, + CancellationToken token, + ITentacleClientTaskLog? log = null) + { + return await tentacleClient.AbandonScript(scriptTicket, + new SerilogLoggerBuilder().Build().ForContext().ToITentacleTaskLog().Chain(log), + token).ConfigureAwait(false); + } + public static async Task DownloadFile( this TentacleClient tentacleClient, string remotePath, From f92799f2bf39cf37ac6800daa9d2ef93ce209970 Mon Sep 17 00:00:00 2001 From: Jim Pelletier Date: Thu, 28 May 2026 13:45:41 +1000 Subject: [PATCH 31/52] RunningScript: symmetric Create / CreateAbandonable factories MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address PR review on #1226: the existing two-constructor shape didn't signal at the call site which path supports abandon. Replace both constructors with private ones and expose two static factories: - Create(...) — V1 ScriptService path, no abandon token. 
- CreateAbandonable(..., abandonToken) — V2 ScriptService path. Call sites now read the intent off the factory name. --- .../Services/Scripts/RunningScript.cs | 28 +++++++++++++++++-- .../Services/Scripts/ScriptServiceV2.cs | 2 +- .../Util/RunningScriptFixture.cs | 6 ++-- .../Services/Scripts/ScriptService.cs | 2 +- 4 files changed, 31 insertions(+), 7 deletions(-) diff --git a/source/Octopus.Tentacle.Core/Services/Scripts/RunningScript.cs b/source/Octopus.Tentacle.Core/Services/Scripts/RunningScript.cs index f1369a11f..7109d0ff1 100644 --- a/source/Octopus.Tentacle.Core/Services/Scripts/RunningScript.cs +++ b/source/Octopus.Tentacle.Core/Services/Scripts/RunningScript.cs @@ -28,7 +28,7 @@ public class RunningScript: IRunningScript readonly ScriptIsolationMutex scriptIsolationMutex; readonly TimeSpan powerShellStartupTimeout; - public RunningScript(IShell shell, + RunningScript(IShell shell, IScriptWorkspace workspace, IScriptStateStore? stateStore, IScriptLog scriptLog, @@ -55,7 +55,7 @@ ILog log this.powerShellStartupTimeout = powerShellStartupTimeout; } - public RunningScript(IShell shell, + RunningScript(IShell shell, IScriptWorkspace workspace, IScriptLog scriptLog, string taskId, @@ -67,6 +67,30 @@ public RunningScript(IShell shell, { } + public static RunningScript Create(IShell shell, + IScriptWorkspace workspace, + IScriptLog scriptLog, + string taskId, + ScriptIsolationMutex scriptIsolationMutex, + CancellationToken runningScriptToken, + IReadOnlyDictionary environmentVariables, + TimeSpan powerShellStartupTimeout, + ILog log) + => new RunningScript(shell, workspace, null, scriptLog, taskId, scriptIsolationMutex, runningScriptToken, CancellationToken.None, environmentVariables, powerShellStartupTimeout, log); + + public static RunningScript CreateAbandonable(IShell shell, + IScriptWorkspace workspace, + IScriptStateStore? 
stateStore, + IScriptLog scriptLog, + string taskId, + ScriptIsolationMutex scriptIsolationMutex, + CancellationToken runningScriptToken, + CancellationToken abandonToken, + IReadOnlyDictionary environmentVariables, + TimeSpan powerShellStartupTimeout, + ILog log) + => new RunningScript(shell, workspace, stateStore, scriptLog, taskId, scriptIsolationMutex, runningScriptToken, abandonToken, environmentVariables, powerShellStartupTimeout, log); + public ProcessState State { get; private set; } public int ExitCode { get; private set; } diff --git a/source/Octopus.Tentacle.Core/Services/Scripts/ScriptServiceV2.cs b/source/Octopus.Tentacle.Core/Services/Scripts/ScriptServiceV2.cs index 3d6a7e9db..5f7c0a262 100644 --- a/source/Octopus.Tentacle.Core/Services/Scripts/ScriptServiceV2.cs +++ b/source/Octopus.Tentacle.Core/Services/Scripts/ScriptServiceV2.cs @@ -200,7 +200,7 @@ bool WasAbandoned(IScriptWorkspace workspace) RunningScript LaunchShell(ScriptTicket ticket, string serverTaskId, IScriptWorkspace workspace, IScriptStateStore stateStore, CancellationToken cancellationToken, CancellationToken abandonToken) { - var runningScript = new RunningScript(shell, workspace, stateStore, workspace.CreateLog(), serverTaskId, scriptIsolationMutex, cancellationToken, abandonToken, environmentVariables, powerShellStartupTimeout, log); + var runningScript = RunningScript.CreateAbandonable(shell, workspace, stateStore, workspace.CreateLog(), serverTaskId, scriptIsolationMutex, cancellationToken, abandonToken, environmentVariables, powerShellStartupTimeout, log); _ = Task.Run(async () => await runningScript.Execute()); return runningScript; } diff --git a/source/Octopus.Tentacle.Tests.Integration/Util/RunningScriptFixture.cs b/source/Octopus.Tentacle.Tests.Integration/Util/RunningScriptFixture.cs index 5847c589b..62a81ba5d 100644 --- a/source/Octopus.Tentacle.Tests.Integration/Util/RunningScriptFixture.cs +++ b/source/Octopus.Tentacle.Tests.Integration/Util/RunningScriptFixture.cs @@ 
-67,7 +67,7 @@ public void SetUpLocal() scriptLog = new TestScriptLog(); cancellationTokenSource = new CancellationTokenSource(TimeSpan.FromSeconds(10)); scriptIsolationMutex = new ScriptIsolationMutex(); - runningScript = new RunningScript(shell, + runningScript = RunningScript.Create(shell, workspace, scriptLog, taskId, @@ -169,7 +169,7 @@ public async Task CancellationToken_ShouldKillTheProcess() ? (new PowerShell(), "Start-Sleep -seconds") : (new Bash() as IShell, "sleep"); - var script = new RunningScript(shell, + var script = RunningScript.Create(shell, workspace, scriptLog, taskId, @@ -204,7 +204,7 @@ public async Task Execute_WhenAbandonTokenFires_ReturnsAbandonedExitCode() using var runningCts = new CancellationTokenSource(TimeSpan.FromSeconds(30)); using var abandonCts = new CancellationTokenSource(); - var script = new RunningScript( + var script = RunningScript.CreateAbandonable( shell, workspace, stateStore: null, diff --git a/source/Octopus.Tentacle/Services/Scripts/ScriptService.cs b/source/Octopus.Tentacle/Services/Scripts/ScriptService.cs index e1b952202..82b482d6f 100644 --- a/source/Octopus.Tentacle/Services/Scripts/ScriptService.cs +++ b/source/Octopus.Tentacle/Services/Scripts/ScriptService.cs @@ -93,7 +93,7 @@ public async Task CompleteScriptAsync(CompleteScriptComman RunningScript LaunchShell(ScriptTicket ticket, string serverTaskId, IScriptWorkspace workspace, CancellationTokenSource cancel) { - var runningScript = new RunningScript(shell, workspace, workspace.CreateLog(), serverTaskId, scriptIsolationMutex, cancel.Token, new Dictionary(), PowerShellStartupDetection.PowerShellStartupTimeout, log); + var runningScript = RunningScript.Create(shell, workspace, workspace.CreateLog(), serverTaskId, scriptIsolationMutex, cancel.Token, new Dictionary(), PowerShellStartupDetection.PowerShellStartupTimeout, log); _ = Task.Run(async () => await runningScript.Execute()); return runningScript; } From ff0e28453660f80fea7003b7dab1013727cf8df3 Mon Sep 17 
00:00:00 2001 From: Jim Pelletier Date: Thu, 28 May 2026 13:52:34 +1000 Subject: [PATCH 32/52] Drop elapsed-time assertion on abandon test; rely on exit code Address PR review on #1226: the elapsed-time check was belt-and-braces. AbandonedExitCode is only returned from the abandon catch block, so the exit code already proves the abandon path was taken. Dropped the Stopwatch and the < 2 second assertion; comment updated to explain why the exit code is the actual abandon contract. --- .../Util/SilentProcessRunnerFixture.cs | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs b/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs index 830a39527..72931d25a 100644 --- a/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs +++ b/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs @@ -319,8 +319,6 @@ public async Task AbandonToken_ShouldReturnAbandonedExitCodeWithoutKillingProces var infoMessages = new StringBuilder(); - var sw = Stopwatch.StartNew(); - var task = Task.Run(async () => await SilentProcessRunner.ExecuteCommandAsync( abandonCommand, arguments, @@ -339,15 +337,11 @@ public async Task AbandonToken_ShouldReturnAbandonedExitCodeWithoutKillingProces try { var exitCode = await task; - sw.Stop(); - - // The whole point of abandon is "return promptly without waiting for the script - // process to exit". The script we just started runs for 5 minutes (sleep 300). - // Without an elapsed-time assertion this test would pass even if abandon - // accidentally waited the full 5 minutes, which would silently lose the entire - // contract. 2 seconds is a generous near-instant bound: the abandon path on a - // local machine returns in tens of milliseconds; CI has been measured under 500ms. 
- sw.Elapsed.Should().BeLessThan(TimeSpan.FromSeconds(2), "abandon should return promptly without waiting for the underlying process"); + + // AbandonedExitCode is only returned from the abandon catch block, which + // requires the abandon token to fire. If we'd accidentally waited for the + // process to exit naturally, exitCode would be the script's own exit code, + // not this sentinel. The exit code is the abandon contract. exitCode.Should().Be(ScriptExitCodes.AbandonedExitCode); infoMessages.ToString().Should().Contain("Tentacle has abandoned this script"); } From 521997231d1261fe001ad5f1a475d82c0485b742 Mon Sep 17 00:00:00 2001 From: Jim Pelletier Date: Thu, 28 May 2026 14:00:27 +1000 Subject: [PATCH 33/52] Fix ScriptServiceV2 CompleteScript tests to use one service instance Address PR review on #1226: both 'when workspace.Delete fails' tests were constructing a second ScriptServiceV2 with a mock workspace factory only for CompleteScriptAsync, while StartScriptAsync / AbandonScriptAsync had run on the fixture-level service. Because running-script state is per-instance, the new service didn't know the script existed, and the test never actually reached the abandon-aware Delete tolerance code. Refactored both tests to run on a single test-specific ScriptServiceV2 instance built with a DeleteThrowingScriptWorkspaceFactory decorator that forwards every member through to a real workspace except Delete, which throws. The decorator replaces the NSubstitute-based helper so any workspace member StartScript or RunningScript may call passes through to the real implementation. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../Integration/ScriptServiceV2Fixture.cs | 166 +++++++++++++----- 1 file changed, 125 insertions(+), 41 deletions(-) diff --git a/source/Octopus.Tentacle.Tests/Integration/ScriptServiceV2Fixture.cs b/source/Octopus.Tentacle.Tests/Integration/ScriptServiceV2Fixture.cs index e6c469a4b..ed205a9b1 100644 --- a/source/Octopus.Tentacle.Tests/Integration/ScriptServiceV2Fixture.cs +++ b/source/Octopus.Tentacle.Tests/Integration/ScriptServiceV2Fixture.cs @@ -16,6 +16,7 @@ using Octopus.Tentacle.Core.Diagnostics; using Octopus.Tentacle.Core.Services.Scripts; using Octopus.Tentacle.Core.Services.Scripts.Locking; +using Octopus.Tentacle.Core.Services.Scripts.Logging; using Octopus.Tentacle.Core.Services.Scripts.Security.Masking; using Octopus.Tentacle.Core.Services.Scripts.Shell; using Octopus.Tentacle.Core.Services.Scripts.StateStore; @@ -561,48 +562,47 @@ public async Task AbandonScript_OnAlreadyCompletedScript_ReturnsRealExitCode() [Test] public async Task CompleteScript_AfterAbandon_WhenWorkspaceDeleteFails_LogsWarnAndReturnsNormally() { + var deleteException = new IOException("file in use"); + var (throwingFactory, mockLog) = BuildFactoryWithThrowingDelete(deleteException); + var serviceUnderTest = new ScriptServiceV2( + PlatformDetection.IsRunningOnWindows ? 
(IShell)new PowerShell() : new Bash(), + throwingFactory, + stateStoreFactory, + new ScriptIsolationMutex(), + mockLog); + var startCommand = new StartScriptCommandV2Builder() .WithScriptBodyForCurrentOs("Start-Sleep -Seconds 60", "sleep 60") .WithIsolation(ScriptIsolationLevel.NoIsolation) .WithDurationStartScriptCanWaitForScriptToFinish(null) .Build(); - await service.StartScriptAsync(startCommand, CancellationToken.None); + await serviceUnderTest.StartScriptAsync(startCommand, CancellationToken.None); // Wait for Running ScriptStatusResponseV2 status; var runningDeadline = DateTime.UtcNow.AddSeconds(30); do { - status = await service.GetStatusAsync(new ScriptStatusRequestV2(startCommand.ScriptTicket, 0), CancellationToken.None); + status = await serviceUnderTest.GetStatusAsync(new ScriptStatusRequestV2(startCommand.ScriptTicket, 0), CancellationToken.None); if (status.State == ProcessState.Running) break; await Task.Delay(50); } while (DateTime.UtcNow < runningDeadline); status.State.Should().Be(ProcessState.Running, "script should have reached Running state within 30 seconds"); - await service.AbandonScriptAsync(new AbandonScriptCommandV2(startCommand.ScriptTicket, 0), CancellationToken.None); + await serviceUnderTest.AbandonScriptAsync(new AbandonScriptCommandV2(startCommand.ScriptTicket, 0), CancellationToken.None); // Poll until Complete var completeDeadline = DateTime.UtcNow.AddSeconds(30); do { - status = await service.GetStatusAsync(new ScriptStatusRequestV2(startCommand.ScriptTicket, 0), CancellationToken.None); + status = await serviceUnderTest.GetStatusAsync(new ScriptStatusRequestV2(startCommand.ScriptTicket, 0), CancellationToken.None); if (status.State == ProcessState.Complete) break; await Task.Delay(50); } while (DateTime.UtcNow < completeDeadline); status.State.Should().Be(ProcessState.Complete); status.ExitCode.Should().Be(ScriptExitCodes.AbandonedExitCode); - // Build a service whose workspace.Delete throws - var deleteException = new 
IOException("file in use"); - var (mockFactory, mockLog) = BuildFactoryWithThrowingDelete(startCommand.ScriptTicket, deleteException); - var serviceUnderTest = new ScriptServiceV2( - PlatformDetection.IsRunningOnWindows ? (IShell)new PowerShell() : new Bash(), - mockFactory, - stateStoreFactory, - new ScriptIsolationMutex(), - mockLog); - Func complete = async () => await serviceUnderTest.CompleteScriptAsync(new CompleteScriptCommandV2(startCommand.ScriptTicket), CancellationToken.None); await complete.Should().NotThrowAsync(); @@ -612,62 +612,146 @@ public async Task CompleteScript_AfterAbandon_WhenWorkspaceDeleteFails_LogsWarnA [Test] public async Task CompleteScript_AfterNormalCompletion_WhenWorkspaceDeleteFails_PropagatesException() { + var deleteException = new IOException("file in use"); + var (throwingFactory, mockLog) = BuildFactoryWithThrowingDelete(deleteException); + var serviceUnderTest = new ScriptServiceV2( + PlatformDetection.IsRunningOnWindows ? (IShell)new PowerShell() : new Bash(), + throwingFactory, + stateStoreFactory, + new ScriptIsolationMutex(), + mockLog); + var startCommand = new StartScriptCommandV2Builder() .WithScriptBody("echo \"finished\"") .WithIsolation(ScriptIsolationLevel.NoIsolation) .WithDurationStartScriptCanWaitForScriptToFinish(null) .Build(); - await service.StartScriptAsync(startCommand, CancellationToken.None); + await serviceUnderTest.StartScriptAsync(startCommand, CancellationToken.None); // Poll until natural completion ScriptStatusResponseV2 status; var deadline = DateTime.UtcNow.AddSeconds(30); do { - status = await service.GetStatusAsync(new ScriptStatusRequestV2(startCommand.ScriptTicket, 0), CancellationToken.None); + status = await serviceUnderTest.GetStatusAsync(new ScriptStatusRequestV2(startCommand.ScriptTicket, 0), CancellationToken.None); if (status.State == ProcessState.Complete) break; await Task.Delay(50); } while (DateTime.UtcNow < deadline); status.State.Should().Be(ProcessState.Complete); 
status.ExitCode.Should().Be(0, "the script exited cleanly, not via abandon"); - var deleteException = new IOException("file in use"); - var (mockFactory, mockLog) = BuildFactoryWithThrowingDelete(startCommand.ScriptTicket, deleteException); - var serviceUnderTest = new ScriptServiceV2( - PlatformDetection.IsRunningOnWindows ? (IShell)new PowerShell() : new Bash(), - mockFactory, - stateStoreFactory, - new ScriptIsolationMutex(), - mockLog); - Func complete = async () => await serviceUnderTest.CompleteScriptAsync(new CompleteScriptCommandV2(startCommand.ScriptTicket), CancellationToken.None); await complete.Should().ThrowAsync(); } /// - /// Builds a mock IScriptWorkspaceFactory that delegates all calls to the real workspaceFactory except - /// workspace.Delete, which throws the supplied exception. Also returns a mock ISystemLog for assertion. + /// Builds an IScriptWorkspaceFactory decorator over the real workspaceFactory whose returned + /// workspaces forward every member except Delete(CancellationToken), which throws the supplied + /// exception. Also returns a mock ISystemLog for assertion. 
/// - (IScriptWorkspaceFactory factory, ISystemLog log) BuildFactoryWithThrowingDelete(ScriptTicket ticket, Exception deleteException) + (IScriptWorkspaceFactory factory, ISystemLog log) BuildFactoryWithThrowingDelete(Exception deleteException) { - var realWorkspace = workspaceFactory.GetWorkspace(ticket, WorkspaceReadinessCheck.Skip); + var throwingFactory = new DeleteThrowingScriptWorkspaceFactory(workspaceFactory, deleteException); + var fakeLog = Substitute.For(); + return (throwingFactory, fakeLog); + } - var fakeWorkspace = Substitute.For(); - fakeWorkspace.ScriptTicket.Returns(realWorkspace.ScriptTicket); - fakeWorkspace.WorkingDirectory.Returns(realWorkspace.WorkingDirectory); - fakeWorkspace.BootstrapScriptFilePath.Returns(realWorkspace.BootstrapScriptFilePath); - fakeWorkspace.LogFilePath.Returns(realWorkspace.LogFilePath); - fakeWorkspace.ResolvePath(Arg.Any()).Returns(ci => realWorkspace.ResolvePath(ci.Arg())); - fakeWorkspace.CreateLog().Returns(_ => realWorkspace.CreateLog()); - fakeWorkspace.Delete(Arg.Any()).Returns(_ => throw deleteException); + /// + /// IScriptWorkspaceFactory decorator that wraps every workspace it returns in a + /// DeleteThrowingScriptWorkspace so that Delete throws the configured exception while all + /// other members forward to the real workspace. 
+ /// + class DeleteThrowingScriptWorkspaceFactory : IScriptWorkspaceFactory + { + readonly IScriptWorkspaceFactory inner; + readonly Exception deleteException; - var fakeFactory = Substitute.For(); - fakeFactory.GetWorkspace(Arg.Any(), Arg.Any()).Returns(fakeWorkspace); + public DeleteThrowingScriptWorkspaceFactory(IScriptWorkspaceFactory inner, Exception deleteException) + { + this.inner = inner; + this.deleteException = deleteException; + } - var fakeLog = Substitute.For(); - return (fakeFactory, fakeLog); + public IScriptWorkspace GetWorkspace(ScriptTicket ticket, WorkspaceReadinessCheck readinessCheck) + => new DeleteThrowingScriptWorkspace(inner.GetWorkspace(ticket, readinessCheck), deleteException); + + public async Task PrepareWorkspace( + ScriptTicket ticket, + string scriptBody, + Dictionary scripts, + ScriptIsolationLevel isolationLevel, + TimeSpan scriptMutexAcquireTimeout, + string? scriptMutexName, + string[]? scriptArguments, + List files, + CancellationToken cancellationToken) + { + var workspace = await inner.PrepareWorkspace(ticket, scriptBody, scripts, isolationLevel, scriptMutexAcquireTimeout, scriptMutexName, scriptArguments, files, cancellationToken); + return new DeleteThrowingScriptWorkspace(workspace, deleteException); + } + + public List GetUncompletedWorkspaces() + => inner.GetUncompletedWorkspaces().Select(w => (IScriptWorkspace)new DeleteThrowingScriptWorkspace(w, deleteException)).ToList(); + } + + /// + /// IScriptWorkspace decorator that forwards every member to an inner real workspace, except + /// Delete(CancellationToken), which throws the configured exception. Used to exercise the + /// CompleteScript abandon-aware tolerance of Delete failures without disturbing anything else + /// StartScript / RunningScript may touch on the workspace. 
+ /// + class DeleteThrowingScriptWorkspace : IScriptWorkspace + { + readonly IScriptWorkspace inner; + readonly Exception deleteException; + + public DeleteThrowingScriptWorkspace(IScriptWorkspace inner, Exception deleteException) + { + this.inner = inner; + this.deleteException = deleteException; + } + + public ScriptTicket ScriptTicket => inner.ScriptTicket; + public string WorkingDirectory => inner.WorkingDirectory; + public string BootstrapScriptFilePath => inner.BootstrapScriptFilePath; + public string LogFilePath => inner.LogFilePath; + + public string[]? ScriptArguments + { + get => inner.ScriptArguments; + set => inner.ScriptArguments = value; + } + + public ScriptIsolationLevel IsolationLevel + { + get => inner.IsolationLevel; + set => inner.IsolationLevel = value; + } + + public TimeSpan ScriptMutexAcquireTimeout + { + get => inner.ScriptMutexAcquireTimeout; + set => inner.ScriptMutexAcquireTimeout = value; + } + + public string? ScriptMutexName + { + get => inner.ScriptMutexName; + set => inner.ScriptMutexName = value; + } + + public bool ShouldMonitorPowerShellStartup() => inner.ShouldMonitorPowerShellStartup(); + public void BootstrapScript(string scriptBody) => inner.BootstrapScript(scriptBody); + public string ResolvePath(string fileName) => inner.ResolvePath(fileName); + public IScriptLog CreateLog() => inner.CreateLog(); + public void WriteFile(string filename, string contents) => inner.WriteFile(filename, contents); + public void CopyFile(string sourceFilePath, string destFileName, bool overwrite) => inner.CopyFile(sourceFilePath, destFileName, overwrite); + public void CheckReadiness() => inner.CheckReadiness(); + public string? TryReadFile(string filename) => inner.TryReadFile(filename); + + public Task Delete(CancellationToken cancellationToken) => throw deleteException; } // TODO - Test the stateStore is updated. 
From f864101a27a9c3cd9aa172d9535920bcd19ea028 Mon Sep 17 00:00:00 2001 From: Jim Pelletier Date: Thu, 28 May 2026 14:33:40 +1000 Subject: [PATCH 34/52] Address SilentProcessRunner review feedback PR review on #1226: - Move process.EnableRaisingEvents = true inside #if NETFRAMEWORK so it only fires on the netframework target where the polyfill needs it. Microsoft docs confirm Process.WaitForExitAsync sets the flag itself on .NET 5+, so the unconditional assignment was only doing real work on the framework target. - Drop EFT-3295 ticket reference from the abandon-token comment; just say "the abandon feature". - Extract the netframework polyfill body from WaitForProcessExitAsync into a separate WaitForProcessExitAsyncNetFrameworkPolyfill method so both pragma branches are one-liners. - Shrink the long process.Close() removal comment to a 3-line pointer; the full history lives in a PR comment on the line. --- .../Util/CommandLine/SilentProcessRunner.cs | 65 ++++++------------- 1 file changed, 20 insertions(+), 45 deletions(-) diff --git a/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs b/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs index e5d4eeda5..7d46857f6 100644 --- a/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs +++ b/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs @@ -114,7 +114,13 @@ void WriteData(Action action, ManualResetEventSlim resetEvent, DataRecei process.StartInfo.CreateNoWindow = true; process.StartInfo.RedirectStandardOutput = true; process.StartInfo.RedirectStandardError = true; +#if NETFRAMEWORK + // The netframework polyfill of WaitForProcessExitAsync subscribes to + // process.Exited and needs this flag to receive the event. On .NET 8+ + // Process.WaitForExitAsync sets EnableRaisingEvents = true itself + // (see https://learn.microsoft.com/en-us/dotnet/api/system.diagnostics.process.waitforexitasync). 
process.EnableRaisingEvents = true; +#endif if (PlatformDetection.IsRunningOnWindows) { process.StartInfo.StandardOutputEncoding = encoding; @@ -158,7 +164,7 @@ void WriteData(Action action, ManualResetEventSlim resetEvent, DataRecei // We pass `abandon` (not `cancel`) because cancel is handled via the // cancel.Register callback above which kills the process tree; the // resulting Exited event is what unblocks this await on cancel. - // `abandon` is a separate token used by EFT-3295 to stop waiting + // `abandon` is a separate token used by the abandon feature to stop waiting // WITHOUT killing the process — see the catch block below. await WaitForProcessExitAsync(process, abandon).ConfigureAwait(false); } @@ -270,47 +276,9 @@ static void DoOurBestToCleanUp(Process process, Action error) error($"Failed to kill the launched process: {killProcessException}"); } } - // We have removed process.Close() here. The pre-async version of this code did this, and adding - // it back will cause cancel to hang forever. Here's why: - // - // OLD SYNC CODE: the calling thread blocked inside SilentProcessRunner.Execute() on - // process.WaitForExit() (specifically the overload which does not specify a timeout), - // which waits for BOTH the process to exit AND the redirected stream - // readers to reach EOF. If a re-parented grandchild held our stdout/stderr open, the - // stream readers never reached EOF, so WaitForExit() blocked forever. Calling - // process.Close() during cancel-cleanup forced the Process object to release its - // handles to the redirected pipes, which made the readers see EOF, which let - // WaitForExit() return. That's why Close() was here. - // - // NEW ASYNC CODE: the calling thread in SilentProcessRunner.Execute - // at WaitForProcessExitAsync awaits a TaskCompletionSource that completes - // when the Process.Exited event fires. 
WaitForProcessExitAsync does NOT wait on the - // redirected streams (Microsoft confirms in the docs: "output processing will not - // have completed when this method returns" — see - // https://learn.microsoft.com/en-us/dotnet/api/system.diagnostics.process.waitforexitasync). - // So a grandchild holding pipes open can't hang the await. The original reason for - // Close() is gone. - // - // Why adding Close() back is harmful and maybe why this is a code comment not a - // PR comment: process.Close() detaches the Process object from the underlying - // OS process, which tears down the wait state that produces the Exited event. If - // Close() runs before the kernel has signalled the exit to .NET (which is - // asynchronous: when Hitman.Kill returns immediately, the OS delivers the exit - // notification some time later), the Exited event never fires, our TaskCompletionSource - // never completes, and the await at WaitForProcessExitAsync hangs forever. - // - // How pipes get released now: - // 1. After WaitForProcessExitAsync returns, SafelyWaitForAllOutput waits up to 5 seconds - // per stream for EOF. If a grandchild holds the pipes, this times out and we - // proceed (it bounds cancel latency; it does NOT close anything). - // 2. The outer `using (var process = new Process())` block calls Process.Dispose - // at end of method, which calls Close internally. - // - // Worst case cancel latency with grandchild holding pipes: ~10s (5s × 2 streams). - // Covered by tests in SilentProcessRunnerFixture: - // - CancellationToken_WhenGrandchildHoldsRedirectedPipes_ShouldNotHang (Windows) - // - CancellationToken_WhenUnixGrandchildHoldsRedirectedPipes_ShouldNotHang (Unix) - // Both assert cancel returns within 30s in this scenario. + // process.Close() was deliberately removed from this cancel-cleanup path. + // Adding it back will cause cancel to hang forever. See PR #1226 review + // thread for the full old-sync vs new-async explanation. 
} // Single place we block waiting for the spawned process to exit. @@ -320,6 +288,15 @@ static void DoOurBestToCleanUp(Process process, Action error) static Task WaitForProcessExitAsync(Process process, CancellationToken cancellationToken) { #if NETFRAMEWORK + return WaitForProcessExitAsyncNetFrameworkPolyfill(process, cancellationToken); +#else + return process.WaitForExitAsync(cancellationToken); +#endif + } + +#if NETFRAMEWORK + static Task WaitForProcessExitAsyncNetFrameworkPolyfill(Process process, CancellationToken cancellationToken) + { var tcs = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); CancellationTokenRegistration registration = default; @@ -347,10 +324,8 @@ void OnExited(object? sender, EventArgs e) } return tcs.Task; -#else - return process.WaitForExitAsync(cancellationToken); -#endif } +#endif [DllImport("kernel32.dll", SetLastError = true)] #pragma warning disable PC003 // Native API not available in UWP From cd8efc7bb387fa1b47b45fed2a238b62c686bcf9 Mon Sep 17 00:00:00 2001 From: Jim Pelletier Date: Thu, 28 May 2026 14:36:02 +1000 Subject: [PATCH 35/52] Remove ClientAndTentacle.CreateScriptServiceV2Client direct-Halibut exposure PR review on #1226: abandon no longer needs the direct Halibut bypass now that TentacleClient.AbandonScript exists. Refactored the two ClientScriptExecutionAbandon test sites to call TentacleClient.CancelScript and TentacleClient.GetStatus through new ScriptTicket-based extension helpers in TentacleClientExtensionMethods (which synthesize a CommandContext for ScriptServiceVersion2 internally, since the tests are interleaving RPCs alongside an in-flight ExecuteScript task and don't carry a CommandContext from a prior orchestrated call). Removed CreateScriptServiceV2Client from ClientAndTentacle along with the now-unused Halibut and ScriptServiceV2 contract usings. The halibutRuntime field stays because LegacyTentacleClientBuilder still uses it. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../ClientScriptExecutionAbandon.cs | 18 +++-------- .../Support/ClientAndTentacle.cs | 14 --------- .../TentacleClientExtensionMethods.cs | 31 +++++++++++++++++++ 3 files changed, 35 insertions(+), 28 deletions(-) diff --git a/source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs b/source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs index 3592c56cd..715500ebf 100644 --- a/source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs +++ b/source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs @@ -3,10 +3,8 @@ using System.Linq; using System.Threading.Tasks; using FluentAssertions; -using Halibut.ServiceModel; using NUnit.Framework; using Octopus.Tentacle.Contracts; -using Octopus.Tentacle.Contracts.ScriptServiceV2; using Octopus.Tentacle.Core.Util; using Octopus.Tentacle.Tests.Integration.Support; using Octopus.Tentacle.Tests.Integration.Util; @@ -40,7 +38,6 @@ public async Task AbandonScript_WhenCancelFailsToKillProcess_ReturnsAbandonedExi .Build(); var tentacleClient = clientTentacle.TentacleClient; - var scriptServiceV2 = clientTentacle.CreateScriptServiceV2Client(); var scriptExecution = Task.Run(async () => await tentacleClient.ExecuteScript(firstCommand, CancellationToken)); @@ -50,21 +47,17 @@ await Wait.For(() => File.Exists(startFile), CancellationToken); // Cancel: Hitman is a no-op so the process keeps running. - await scriptServiceV2.CancelScriptAsync( - new CancelScriptCommandV2(firstCommand.ScriptTicket, 0), - new HalibutProxyRequestOptions(CancellationToken)); + await tentacleClient.CancelScript(firstCommand.ScriptTicket); await Task.Delay(TimeSpan.FromSeconds(1)); // Abandon: fires the abandon token. The RPC returns the current status snapshot // immediately, so we poll GetStatus until the script reaches Complete state. 
await tentacleClient.AbandonScript(firstCommand.ScriptTicket, CancellationToken); - ScriptStatusResponseV2 abandonResponse = null!; + ScriptStatus abandonResponse = null!; await Wait.For(async () => { - abandonResponse = await scriptServiceV2.GetStatusAsync( - new ScriptStatusRequestV2(firstCommand.ScriptTicket, 0), - new HalibutProxyRequestOptions(CancellationToken)); + abandonResponse = await tentacleClient.GetStatus(firstCommand.ScriptTicket, CancellationToken); return abandonResponse.State == ProcessState.Complete; }, TimeSpan.FromSeconds(30), @@ -104,7 +97,6 @@ public async Task AbandonScript_ReleasesIsolationMutexEvenWhileProcessIsStillRun .Build(); var tentacleClient = clientTentacle.TentacleClient; - var scriptServiceV2 = clientTentacle.CreateScriptServiceV2Client(); var firstScriptExecution = Task.Run(async () => await tentacleClient.ExecuteScript(firstCommand, CancellationToken)); @@ -113,9 +105,7 @@ await Wait.For(() => File.Exists(startFile), () => throw new Exception("First script did not start"), CancellationToken); - await scriptServiceV2.CancelScriptAsync( - new CancelScriptCommandV2(firstCommand.ScriptTicket, 0), - new HalibutProxyRequestOptions(CancellationToken)); + await tentacleClient.CancelScript(firstCommand.ScriptTicket); await Task.Delay(TimeSpan.FromSeconds(1)); await tentacleClient.AbandonScript(firstCommand.ScriptTicket, CancellationToken); diff --git a/source/Octopus.Tentacle.Tests.Integration/Support/ClientAndTentacle.cs b/source/Octopus.Tentacle.Tests.Integration/Support/ClientAndTentacle.cs index f73ef2ee1..9d4e9a523 100644 --- a/source/Octopus.Tentacle.Tests.Integration/Support/ClientAndTentacle.cs +++ b/source/Octopus.Tentacle.Tests.Integration/Support/ClientAndTentacle.cs @@ -6,9 +6,6 @@ using Octopus.Tentacle.Client; using Octopus.Tentacle.Client.Retries; using Octopus.Tentacle.CommonTestUtils; -using Octopus.Tentacle.Contracts.Capabilities; -using Octopus.Tentacle.Contracts.ClientServices; -using 
Octopus.Tentacle.Contracts.ScriptServiceV2; using Octopus.Tentacle.Tests.Integration.Support.Legacy; using Octopus.TestPortForwarder; using Serilog; @@ -33,17 +30,6 @@ public LegacyTentacleClientBuilder LegacyTentacleClientBuilder() return new LegacyTentacleClientBuilder(halibutRuntime, ServiceEndPoint); } - // Some integration tests need to invoke ScriptServiceV2 RPCs (CancelScript, GetStatus) - // directly over the wire, without going through TentacleClient's higher-level - // ExecuteScript orchestrator. TentacleClient's CancelScript/GetStatus require a - // CommandContext from a prior orchestrated call, which isn't available when the test - // is interleaving raw RPCs alongside an in-flight ExecuteScript task. Exposing a direct - // client here keeps those tests focused on the RPC behavior they care about. - public IAsyncClientScriptServiceV2 CreateScriptServiceV2Client() - { - return halibutRuntime.CreateAsyncClient(ServiceEndPoint); - } - public ClientAndTentacle(IHalibutRuntime halibutRuntime, ServiceEndPoint serviceEndPoint, Server server, diff --git a/source/Octopus.Tentacle.Tests.Integration/Util/Builders/TentacleClientExtensionMethods.cs b/source/Octopus.Tentacle.Tests.Integration/Util/Builders/TentacleClientExtensionMethods.cs index 839ca3c82..21a8c82bc 100644 --- a/source/Octopus.Tentacle.Tests.Integration/Util/Builders/TentacleClientExtensionMethods.cs +++ b/source/Octopus.Tentacle.Tests.Integration/Util/Builders/TentacleClientExtensionMethods.cs @@ -5,6 +5,7 @@ using System.Threading.Tasks; using Halibut; using Octopus.Tentacle.Client; +using Octopus.Tentacle.Client.EventDriven; using Octopus.Tentacle.Client.Scripts; using Octopus.Tentacle.Client.Scripts.Models; using Octopus.Tentacle.Contracts; @@ -68,6 +69,36 @@ public static async Task AbandonScript( token).ConfigureAwait(false); } + // Some integration tests need to invoke CancelScript / GetStatus directly against an + // already-running ScriptServiceV2 script without going through ExecuteScript. 
They have + // a ScriptTicket but not a CommandContext (which TentacleClient's high-level methods + // expect). These helpers synthesize a CommandContext from the ticket so tests can call + // through TentacleClient instead of bypassing it with raw Halibut calls. + public static async Task CancelScript( + this TentacleClient tentacleClient, + ScriptTicket scriptTicket, + ITentacleClientTaskLog? log = null) + { + var commandContext = new CommandContext(scriptTicket, 0, ScriptServiceVersion.ScriptServiceVersion2); + var result = await tentacleClient.CancelScript(commandContext, + new SerilogLoggerBuilder().Build().ForContext().ToITentacleTaskLog().Chain(log)) + .ConfigureAwait(false); + return result.ScriptStatus; + } + + public static async Task GetStatus( + this TentacleClient tentacleClient, + ScriptTicket scriptTicket, + CancellationToken token, + ITentacleClientTaskLog? log = null) + { + var commandContext = new CommandContext(scriptTicket, 0, ScriptServiceVersion.ScriptServiceVersion2); + var result = await tentacleClient.GetStatus(commandContext, + new SerilogLoggerBuilder().Build().ForContext().ToITentacleTaskLog().Chain(log), + token).ConfigureAwait(false); + return result.ScriptStatus; + } + public static async Task DownloadFile( this TentacleClient tentacleClient, string remotePath, From e83e00a4a785245d6e34195f651fbc8fa26ad80f Mon Sep 17 00:00:00 2001 From: Jim Pelletier Date: Thu, 28 May 2026 14:38:46 +1000 Subject: [PATCH 36/52] Use builder pattern for ScriptServiceV2Fixture SUT construction PR review on #1226: replaced the [SetUp]-method approach for constructing the SUT with a ScriptServiceV2Builder. Defaults match what SetUp did; tests opt into mock overrides via .WithX(...) chained calls. The CompleteScript_*WhenWorkspaceDeleteFails* tests now build their SUT through the builder with the throwing-Delete factory and mock log injected via the new With methods. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../Integration/ScriptServiceV2Fixture.cs | 227 +++++++++++++----- 1 file changed, 165 insertions(+), 62 deletions(-) diff --git a/source/Octopus.Tentacle.Tests/Integration/ScriptServiceV2Fixture.cs b/source/Octopus.Tentacle.Tests/Integration/ScriptServiceV2Fixture.cs index ed205a9b1..9317a874e 100644 --- a/source/Octopus.Tentacle.Tests/Integration/ScriptServiceV2Fixture.cs +++ b/source/Octopus.Tentacle.Tests/Integration/ScriptServiceV2Fixture.cs @@ -1,4 +1,4 @@ -using System; +using System; using System.Collections.Concurrent; using System.Collections.Generic; using System.Diagnostics; @@ -30,30 +30,12 @@ namespace Octopus.Tentacle.Tests.Integration [TestFixture] public class ScriptServiceV2Fixture { - ScriptServiceV2 service = null!; - ScriptWorkspaceFactory workspaceFactory = null!; - ScriptStateStoreFactory stateStoreFactory = null!; - - [SetUp] - public void SetUp() - { - var homeConfiguration = Substitute.For(); - homeConfiguration.HomeDirectory.Returns(Environment.CurrentDirectory); - - var octopusPhysicalFileSystem = new OctopusPhysicalFileSystem(Substitute.For()); - workspaceFactory = new ScriptWorkspaceFactory(octopusPhysicalFileSystem, homeConfiguration, new SensitiveValueMasker()); - stateStoreFactory = new ScriptStateStoreFactory(octopusPhysicalFileSystem); - service = new ScriptServiceV2( - PlatformDetection.IsRunningOnWindows ? 
(IShell)new PowerShell() : new Bash(), - workspaceFactory, - stateStoreFactory, - new ScriptIsolationMutex(), - Substitute.For()); - } - [Test] public async Task ShouldExecuteAScriptSuccessfully() { + var builder = new ScriptServiceV2Builder(); + var service = builder.Build(); + var windowsScript = "& ping.exe localhost -n 1"; var bashScript = "ping localhost -c 1"; @@ -64,7 +46,7 @@ public async Task ShouldExecuteAScriptSuccessfully() .Build(); var startScriptResponse = await service.StartScriptAsync(startScriptCommand, CancellationToken.None); - var (logs, finalResponse) = await RunUntilScriptCompletes(startScriptCommand, startScriptResponse); + var (logs, finalResponse) = await RunUntilScriptCompletes(service, startScriptCommand, startScriptResponse); finalResponse.State.Should().Be(ProcessState.Complete); finalResponse.ExitCode.Should().Be(0); @@ -74,6 +56,9 @@ public async Task ShouldExecuteAScriptSuccessfully() [Test] public async Task ShouldReturnANonZeroExitCodeForAFailingScript() { + var builder = new ScriptServiceV2Builder(); + var service = builder.Build(); + var windowsScript = "& ping.exe nope -n 1"; var bashScript = "ping nope -c 1"; @@ -84,7 +69,7 @@ public async Task ShouldReturnANonZeroExitCodeForAFailingScript() .Build(); var startScriptResponse = await service.StartScriptAsync(startScriptCommand, CancellationToken.None); - var (logs, finalResponse) = await RunUntilScriptCompletes(startScriptCommand, startScriptResponse); + var (logs, finalResponse) = await RunUntilScriptCompletes(service, startScriptCommand, startScriptResponse); finalResponse.State.Should().Be(ProcessState.Complete); finalResponse.ExitCode.Should().NotBe(0); @@ -94,6 +79,9 @@ public async Task ShouldReturnANonZeroExitCodeForAFailingScript() [Test] public async Task ShouldExecuteMultipleScriptsConcurrently() { + var builder = new ScriptServiceV2Builder(); + var service = builder.Build(); + var bashScript = "sleep 10"; var windowsScript = "Start-Sleep -Seconds 10"; @@ -115,7 
+103,7 @@ public async Task ShouldExecuteMultipleScriptsConcurrently() }); await Task.WhenAll(tasks); - + var startDuration = started.Elapsed; startDuration.Should().BeLessThan(TimeSpan.FromSeconds(5)); @@ -123,7 +111,7 @@ public async Task ShouldExecuteMultipleScriptsConcurrently() foreach (var script in scripts) { - var (logs, finalResponse) = await RunUntilScriptCompletes(script.Command, script.Response); + var (logs, finalResponse) = await RunUntilScriptCompletes(service, script.Command, script.Response); finalResponse.State.Should().Be(ProcessState.Complete); finalResponse.ExitCode.Should().Be(0); @@ -137,6 +125,9 @@ public async Task ShouldExecuteMultipleScriptsConcurrently() [Test] public async Task ShouldStartExecuteAScriptQuickly() { + var builder = new ScriptServiceV2Builder(); + var service = builder.Build(); + var script = "echo \"finished\""; var startScriptCommand = new StartScriptCommandV2Builder() @@ -171,6 +162,9 @@ public async Task ShouldStartExecuteAScriptQuickly() [Test] public async Task ShouldExecuteALongRunningScriptSuccessfully() { + var builder = new ScriptServiceV2Builder(); + var service = builder.Build(); + var bashScript = "sleep 10"; var windowsScript = "Start-Sleep -Seconds 10"; @@ -181,7 +175,7 @@ public async Task ShouldExecuteALongRunningScriptSuccessfully() .Build(); var startScriptResponse = await service.StartScriptAsync(startScriptCommand, CancellationToken.None); - var (_, finalResponse) = await RunUntilScriptCompletes(startScriptCommand, startScriptResponse); + var (_, finalResponse) = await RunUntilScriptCompletes(service, startScriptCommand, startScriptResponse); finalResponse.State.Should().Be(ProcessState.Complete); finalResponse.ExitCode.Should().Be(0); @@ -190,6 +184,9 @@ public async Task ShouldExecuteALongRunningScriptSuccessfully() [Test] public async Task StartScriptShouldWaitForAShortScriptToFinish() { + var builder = new ScriptServiceV2Builder(); + var service = builder.Build(); + var startScriptCommand = new 
StartScriptCommandV2Builder() .WithScriptBody("echo \"finished\"") .WithDurationStartScriptCanWaitForScriptToFinish(TimeSpan.FromSeconds(5)) @@ -208,6 +205,9 @@ public async Task StartScriptShouldWaitForAShortScriptToFinish() [Test] public async Task StartScriptShouldNotWaitForALongScriptToFinish() { + var builder = new ScriptServiceV2Builder(); + var service = builder.Build(); + var bashScript = "sleep 10"; var windowsScript = "Start-Sleep -Seconds 10"; @@ -218,7 +218,7 @@ public async Task StartScriptShouldNotWaitForALongScriptToFinish() .Build(); var startScriptResponse = await service.StartScriptAsync(startScriptCommand, CancellationToken.None); - await RunUntilScriptCompletes(startScriptCommand, startScriptResponse); + await RunUntilScriptCompletes(service, startScriptCommand, startScriptResponse); startScriptResponse.State.Should().Be(ProcessState.Running); startScriptResponse.ExitCode.Should().Be(0); @@ -228,6 +228,9 @@ public async Task StartScriptShouldNotWaitForALongScriptToFinish() [Test] public async Task StartScriptShouldNotStartTheScriptForTheSameScriptTicketMoreThanOnce_SequentialRequests() { + var builder = new ScriptServiceV2Builder(); + var service = builder.Build(); + // Arrange var scriptTicket = new ScriptTicket(Guid.NewGuid().ToString()); var script1 = GetStartScriptCommandForScriptThatCreatesAFile(scriptTicket); @@ -237,7 +240,7 @@ public async Task StartScriptShouldNotStartTheScriptForTheSameScriptTicketMoreTh await service.StartScriptAsync(script1.StartScriptCommand, CancellationToken.None); var startScriptResponse = await service.StartScriptAsync(script2.StartScriptCommand, CancellationToken.None); - var (_, finalResponse) = await RunUntilScriptCompletes(script2.StartScriptCommand, startScriptResponse); + var (_, finalResponse) = await RunUntilScriptCompletes(service, script2.StartScriptCommand, startScriptResponse); // Assert finalResponse.State.Should().Be(ProcessState.Complete); @@ -250,6 +253,9 @@ public async Task 
StartScriptShouldNotStartTheScriptForTheSameScriptTicketMoreTh [Test] public async Task StartScriptShouldNotStartTheScriptForTheSameScriptTicketMoreThanOnce_ConcurrentRequests() { + var builder = new ScriptServiceV2Builder(); + var service = builder.Build(); + // Arrange var scriptTicket = new ScriptTicket(Guid.NewGuid().ToString()); @@ -272,6 +278,7 @@ public async Task StartScriptShouldNotStartTheScriptForTheSameScriptTicketMoreTh await Task.WhenAll(tasks); var (_, finalResponse) = await RunUntilScriptCompletes( + service, scripts[0].StartScriptCommand, new ScriptStatusResponseV2(scripts[0].StartScriptCommand.ScriptTicket, ProcessState.Pending, 0, new List(), 0)); @@ -293,6 +300,9 @@ public async Task StartScriptShouldNotStartTheScriptForTheSameScriptTicketMoreTh [Test] public async Task CancelScriptShouldCancelAnExecutingScript() { + var builder = new ScriptServiceV2Builder(); + var service = builder.Build(); + var bashScript = "sleep 60"; var windowsScript = "Start-Sleep -Seconds 60"; @@ -338,6 +348,10 @@ public async Task CancelScriptShouldCancelAnExecutingScript() [Test] public async Task CompleteScriptShouldCleanupTheWorkspace() { + var builder = new ScriptServiceV2Builder(); + var service = builder.Build(); + var workspaceFactory = (ScriptWorkspaceFactory)builder.WorkspaceFactory; + var script = "echo \"finished\""; var startScriptCommand = new StartScriptCommandV2Builder() @@ -352,13 +366,14 @@ public async Task CompleteScriptShouldCleanupTheWorkspace() var startScriptResponse = await service.StartScriptAsync(startScriptCommand, CancellationToken.None); Directory.Exists(workspaceDirectory).Should().BeTrue(); - await RunUntilScriptCompletes(startScriptCommand, startScriptResponse); + await RunUntilScriptCompletes(service, startScriptCommand, startScriptResponse); Directory.Exists(workspaceDirectory).Should().BeFalse(); } [Test] public async Task GetStatusShouldReturnAnExitCodeOf45ForAnUnknownScriptTicket() { + var service = new 
ScriptServiceV2Builder().Build(); var response = await service.GetStatusAsync(new ScriptStatusRequestV2(new ScriptTicket("nope"), 0), CancellationToken.None); response.ExitCode.Should().Be(-45); @@ -367,6 +382,7 @@ public async Task GetStatusShouldReturnAnExitCodeOf45ForAnUnknownScriptTicket() [Test] public async Task CancelScriptShouldReturnAnExitCodeOf45ForAnUnknownScriptTicket() { + var service = new ScriptServiceV2Builder().Build(); var response = await service.CancelScriptAsync(new CancelScriptCommandV2(new ScriptTicket("nope"), 0), CancellationToken.None); response.ExitCode.Should().Be(-45); @@ -375,43 +391,53 @@ public async Task CancelScriptShouldReturnAnExitCodeOf45ForAnUnknownScriptTicket [Test] public async Task CompleteScriptShouldNotErrorForAnUnknownScriptTicket() { + var service = new ScriptServiceV2Builder().Build(); await service.CompleteScriptAsync(new CompleteScriptCommandV2(new ScriptTicket("nope")), CancellationToken.None); } [Test] public async Task GetStatusShouldReturnAnExitCodeOf46ForAScriptWithAnUnknownResult() { + var builder = new ScriptServiceV2Builder(); + var service = builder.Build(); + var request = new ScriptStatusRequestV2(new ScriptTicket($"did-not-finish-{Guid.NewGuid()}"), 0); var ticket = request.Ticket; - SetupScriptState(ticket); + SetupScriptState(builder.WorkspaceFactory, builder.StateStoreFactory, ticket); var response = await service.GetStatusAsync(request, CancellationToken.None); response.ExitCode.Should().Be(-46); - await CleanupWorkspace(ticket, CancellationToken.None); + await CleanupWorkspace(builder.WorkspaceFactory, ticket, CancellationToken.None); } [Test] public async Task CancelScriptShouldReturnAnExitCodeOf46ForAScriptWithAnUnknownResult() { + var builder = new ScriptServiceV2Builder(); + var service = builder.Build(); + var request = new CancelScriptCommandV2(new ScriptTicket($"did-not-finish-{Guid.NewGuid()}"), 0); var ticket = request.Ticket; - SetupScriptState(ticket); + 
SetupScriptState(builder.WorkspaceFactory, builder.StateStoreFactory, ticket); var response = await service.CancelScriptAsync(request, CancellationToken.None); response.ExitCode.Should().Be(-46); - await CleanupWorkspace(ticket, CancellationToken.None); + await CleanupWorkspace(builder.WorkspaceFactory, ticket, CancellationToken.None); } [Test] public async Task CompleteScriptShouldNotErrorForAScriptWithAnUnknownResult() { + var builder = new ScriptServiceV2Builder(); + var service = builder.Build(); + var request = new CompleteScriptCommandV2(new ScriptTicket($"did-not-finish-{Guid.NewGuid()}")); var ticket = request.Ticket; - SetupScriptState(ticket); + SetupScriptState(builder.WorkspaceFactory, builder.StateStoreFactory, ticket); await service.CompleteScriptAsync(request, CancellationToken.None); } @@ -419,6 +445,9 @@ public async Task CompleteScriptShouldNotErrorForAScriptWithAnUnknownResult() [Test] public async Task ShouldStoreTheStateOfTheScriptInTheScriptStateStore() { + var builder = new ScriptServiceV2Builder(); + var service = builder.Build(); + var testStarted = DateTimeOffset.UtcNow; var bashScript = "sleep 10"; @@ -430,13 +459,13 @@ public async Task ShouldStoreTheStateOfTheScriptInTheScriptStateStore() .WithDurationStartScriptCanWaitForScriptToFinish(null) .Build(); - var scriptStateStore = SetupScriptStateStore(startScriptCommand.ScriptTicket); + var scriptStateStore = SetupScriptStateStore(builder.WorkspaceFactory, builder.StateStoreFactory, startScriptCommand.ScriptTicket); var startScriptResponse = await service.StartScriptAsync(startScriptCommand, CancellationToken.None); var runningScriptState = scriptStateStore.Load(); - var (logs, finalResponse) = await RunUntilScriptFinishes(startScriptCommand, startScriptResponse); + var (logs, finalResponse) = await RunUntilScriptFinishes(service, startScriptCommand, startScriptResponse); var testFinished = DateTimeOffset.UtcNow; var finishedScriptState = scriptStateStore.Load(); @@ -462,6 +491,8 @@ public 
async Task ShouldStoreTheStateOfTheScriptInTheScriptStateStore() [Test] public async Task ScriptTicketCasingShouldNotAffectCommands() { + var service = new ScriptServiceV2Builder().Build(); + // Arrange var startScriptCommand = new StartScriptCommandV2Builder() .WithScriptBody("echo \"finished\"") @@ -487,6 +518,8 @@ public async Task ScriptTicketCasingShouldNotAffectCommands() [Test] public async Task AbandonScript_OnUnknownTicket_ReturnsCompleteWithUnknownScriptExitCode() { + var service = new ScriptServiceV2Builder().Build(); + var ticket = new ScriptTicket("unknown-ticket-" + Guid.NewGuid().ToString("N")); var response = await service.AbandonScriptAsync(new AbandonScriptCommandV2(ticket, 0), CancellationToken.None); @@ -497,6 +530,8 @@ public async Task AbandonScript_OnUnknownTicket_ReturnsCompleteWithUnknownScript [Test] public async Task AbandonScript_OnRunningScript_FiresAbandonToken_ReturnsAbandonedExitCode() { + var service = new ScriptServiceV2Builder().Build(); + var startCommand = new StartScriptCommandV2Builder() .WithScriptBodyForCurrentOs("Start-Sleep -Seconds 60", "sleep 60") .WithIsolation(ScriptIsolationLevel.NoIsolation) @@ -536,6 +571,8 @@ public async Task AbandonScript_OnRunningScript_FiresAbandonToken_ReturnsAbandon [Test] public async Task AbandonScript_OnAlreadyCompletedScript_ReturnsRealExitCode() { + var service = new ScriptServiceV2Builder().Build(); + var startCommand = new StartScriptCommandV2Builder() .WithScriptBody("echo \"finished\"") .WithIsolation(ScriptIsolationLevel.NoIsolation) @@ -563,13 +600,12 @@ public async Task AbandonScript_OnAlreadyCompletedScript_ReturnsRealExitCode() public async Task CompleteScript_AfterAbandon_WhenWorkspaceDeleteFails_LogsWarnAndReturnsNormally() { var deleteException = new IOException("file in use"); - var (throwingFactory, mockLog) = BuildFactoryWithThrowingDelete(deleteException); - var serviceUnderTest = new ScriptServiceV2( - PlatformDetection.IsRunningOnWindows ? 
(IShell)new PowerShell() : new Bash(), - throwingFactory, - stateStoreFactory, - new ScriptIsolationMutex(), - mockLog); + var builder = new ScriptServiceV2Builder(); + var (throwingFactory, mockLog) = BuildFactoryWithThrowingDelete(builder.WorkspaceFactory, deleteException); + var serviceUnderTest = builder + .WithWorkspaceFactory(throwingFactory) + .WithLog(mockLog) + .Build(); var startCommand = new StartScriptCommandV2Builder() .WithScriptBodyForCurrentOs("Start-Sleep -Seconds 60", "sleep 60") @@ -613,13 +649,12 @@ public async Task CompleteScript_AfterAbandon_WhenWorkspaceDeleteFails_LogsWarnA public async Task CompleteScript_AfterNormalCompletion_WhenWorkspaceDeleteFails_PropagatesException() { var deleteException = new IOException("file in use"); - var (throwingFactory, mockLog) = BuildFactoryWithThrowingDelete(deleteException); - var serviceUnderTest = new ScriptServiceV2( - PlatformDetection.IsRunningOnWindows ? (IShell)new PowerShell() : new Bash(), - throwingFactory, - stateStoreFactory, - new ScriptIsolationMutex(), - mockLog); + var builder = new ScriptServiceV2Builder(); + var (throwingFactory, mockLog) = BuildFactoryWithThrowingDelete(builder.WorkspaceFactory, deleteException); + var serviceUnderTest = builder + .WithWorkspaceFactory(throwingFactory) + .WithLog(mockLog) + .Build(); var startCommand = new StartScriptCommandV2Builder() .WithScriptBody("echo \"finished\"") @@ -647,17 +682,85 @@ public async Task CompleteScript_AfterNormalCompletion_WhenWorkspaceDeleteFails_ } /// - /// Builds an IScriptWorkspaceFactory decorator over the real workspaceFactory whose returned + /// Builds an IScriptWorkspaceFactory decorator over the supplied workspaceFactory whose returned /// workspaces forward every member except Delete(CancellationToken), which throws the supplied /// exception. Also returns a mock ISystemLog for assertion. 
/// - (IScriptWorkspaceFactory factory, ISystemLog log) BuildFactoryWithThrowingDelete(Exception deleteException) + static (IScriptWorkspaceFactory factory, ISystemLog log) BuildFactoryWithThrowingDelete(IScriptWorkspaceFactory workspaceFactory, Exception deleteException) { var throwingFactory = new DeleteThrowingScriptWorkspaceFactory(workspaceFactory, deleteException); var fakeLog = Substitute.For(); return (throwingFactory, fakeLog); } + /// + /// Builder for ScriptServiceV2 SUT construction. Defaults match what the previous [SetUp] + /// produced; tests opt into mock overrides via the chainable With* methods. The + /// WorkspaceFactory and StateStoreFactory properties materialize on first access so tests + /// can grab them before Build() (e.g. to wrap the default factory in a decorator). + /// + class ScriptServiceV2Builder + { + IScriptWorkspaceFactory? workspaceFactory; + ScriptStateStoreFactory? stateStoreFactory; + ISystemLog? log; + IShell? shell; + ScriptIsolationMutex? mutex; + OctopusPhysicalFileSystem? 
cachedFileSystem; + + public IScriptWorkspaceFactory WorkspaceFactory => workspaceFactory ??= BuildDefaultWorkspaceFactory(); + public ScriptStateStoreFactory StateStoreFactory => stateStoreFactory ??= new ScriptStateStoreFactory(FileSystem); + + OctopusPhysicalFileSystem FileSystem => cachedFileSystem ??= new OctopusPhysicalFileSystem(Substitute.For()); + + ScriptWorkspaceFactory BuildDefaultWorkspaceFactory() + { + var homeConfiguration = Substitute.For(); + homeConfiguration.HomeDirectory.Returns(Environment.CurrentDirectory); + return new ScriptWorkspaceFactory(FileSystem, homeConfiguration, new SensitiveValueMasker()); + } + + public ScriptServiceV2Builder WithWorkspaceFactory(IScriptWorkspaceFactory factory) + { + workspaceFactory = factory; + return this; + } + + public ScriptServiceV2Builder WithStateStoreFactory(ScriptStateStoreFactory factory) + { + stateStoreFactory = factory; + return this; + } + + public ScriptServiceV2Builder WithLog(ISystemLog log) + { + this.log = log; + return this; + } + + public ScriptServiceV2Builder WithShell(IShell shell) + { + this.shell = shell; + return this; + } + + public ScriptServiceV2Builder WithMutex(ScriptIsolationMutex mutex) + { + this.mutex = mutex; + return this; + } + + public ScriptServiceV2 Build() + { + return new ScriptServiceV2( + shell ?? (PlatformDetection.IsRunningOnWindows ? (IShell)new PowerShell() : new Bash()), + WorkspaceFactory, + StateStoreFactory, + mutex ?? new ScriptIsolationMutex(), + log ?? Substitute.For()); + } + } + /// /// IScriptWorkspaceFactory decorator that wraps every workspace it returns in a /// DeleteThrowingScriptWorkspace so that Delete throws the configured exception while all @@ -756,20 +859,20 @@ public string? ScriptMutexName // TODO - Test the stateStore is updated. 
- private void SetupScriptState(ScriptTicket ticket) + static void SetupScriptState(IScriptWorkspaceFactory workspaceFactory, ScriptStateStoreFactory stateStoreFactory, ScriptTicket ticket) { - var stateWorkspace = SetupScriptStateStore(ticket); + var stateWorkspace = SetupScriptStateStore(workspaceFactory, stateStoreFactory, ticket); stateWorkspace.Create(); } - private ScriptStateStore SetupScriptStateStore(ScriptTicket ticket) + static ScriptStateStore SetupScriptStateStore(IScriptWorkspaceFactory workspaceFactory, ScriptStateStoreFactory stateStoreFactory, ScriptTicket ticket) { var workspace = workspaceFactory.GetWorkspace(ticket, WorkspaceReadinessCheck.Perform); var stateWorkspace = stateStoreFactory.Create(workspace); return stateWorkspace; } - private async Task CleanupWorkspace(ScriptTicket ticket, CancellationToken cancellationToken) + static async Task CleanupWorkspace(IScriptWorkspaceFactory workspaceFactory, ScriptTicket ticket, CancellationToken cancellationToken) { var workspace = workspaceFactory.GetWorkspace(ticket, WorkspaceReadinessCheck.Skip); await workspace.Delete(cancellationToken); @@ -798,9 +901,9 @@ private async Task CleanupWorkspace(ScriptTicket ticket, CancellationToken cance return (startScriptCommand, new FileInfo(filePath)); } - async Task<(List, ScriptStatusResponseV2)> RunUntilScriptCompletes(StartScriptCommandV2 startScriptCommand, ScriptStatusResponseV2 response) + static async Task<(List, ScriptStatusResponseV2)> RunUntilScriptCompletes(ScriptServiceV2 service, StartScriptCommandV2 startScriptCommand, ScriptStatusResponseV2 response) { - var (logs, lastResponse) = await RunUntilScriptFinishes(startScriptCommand, response); + var (logs, lastResponse) = await RunUntilScriptFinishes(service, startScriptCommand, response); await service.CompleteScriptAsync(new CompleteScriptCommandV2(startScriptCommand.ScriptTicket), CancellationToken.None); @@ -809,7 +912,7 @@ private async Task CleanupWorkspace(ScriptTicket ticket, 
CancellationToken cance return (logs, lastResponse); } - async Task<(List logs, ScriptStatusResponseV2 response)> RunUntilScriptFinishes(StartScriptCommandV2 startScriptCommand, ScriptStatusResponseV2 response) + static async Task<(List logs, ScriptStatusResponseV2 response)> RunUntilScriptFinishes(ScriptServiceV2 service, StartScriptCommandV2 startScriptCommand, ScriptStatusResponseV2 response) { var logs = new List(response.Logs); @@ -828,7 +931,7 @@ private async Task CleanupWorkspace(ScriptTicket ticket, CancellationToken cance return (logs, response); } - void WriteLogsToConsole(List logs) + static void WriteLogsToConsole(List logs) { foreach (var log in logs) { @@ -847,4 +950,4 @@ public StartScriptCommandAndResponse(StartScriptCommandV2 command) public ScriptStatusResponseV2? Response { get; set; } } } -} \ No newline at end of file +} From 7c966c761c70e0a56af77b533a83d5b3045b3a19 Mon Sep 17 00:00:00 2001 From: Jim Pelletier Date: Thu, 28 May 2026 14:56:24 +1000 Subject: [PATCH 37/52] Tidy two follow-up review comments on #1226 - Drop the explanatory comment block above the TentacleClient CancelScript/GetStatus extension overloads per the latest review note. - Move process.EnableRaisingEvents = true off the process configuration block and into WaitForProcessExitAsyncNetFrameworkPolyfill itself, so the #if NETFRAMEWORK block stops polluting ExecuteCommandAsync's body. The polyfill only runs on netframework, so EnableRaisingEvents is only set there; on .NET 8+ Process.WaitForExitAsync sets the flag itself.
--- .../Util/CommandLine/SilentProcessRunner.cs | 13 ++++++------- .../Util/Builders/TentacleClientExtensionMethods.cs | 5 ----- 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs b/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs index 7d46857f6..1cc20c06a 100644 --- a/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs +++ b/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs @@ -114,13 +114,6 @@ void WriteData(Action action, ManualResetEventSlim resetEvent, DataRecei process.StartInfo.CreateNoWindow = true; process.StartInfo.RedirectStandardOutput = true; process.StartInfo.RedirectStandardError = true; -#if NETFRAMEWORK - // The netframework polyfill of WaitForProcessExitAsync subscribes to - // process.Exited and needs this flag to receive the event. On .NET 8+ - // Process.WaitForExitAsync sets EnableRaisingEvents = true itself - // (see https://learn.microsoft.com/en-us/dotnet/api/system.diagnostics.process.waitforexitasync). - process.EnableRaisingEvents = true; -#endif if (PlatformDetection.IsRunningOnWindows) { process.StartInfo.StandardOutputEncoding = encoding; @@ -297,6 +290,12 @@ static Task WaitForProcessExitAsync(Process process, CancellationToken cancellat #if NETFRAMEWORK static Task WaitForProcessExitAsyncNetFrameworkPolyfill(Process process, CancellationToken cancellationToken) { + // EnableRaisingEvents must be true for the process.Exited handler below to fire. + // On .NET 8+ Process.WaitForExitAsync sets this itself; here on netframework we + // have to set it ourselves before subscribing. 
+ // https://learn.microsoft.com/en-us/dotnet/api/system.diagnostics.process.waitforexitasync + process.EnableRaisingEvents = true; + var tcs = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); CancellationTokenRegistration registration = default; diff --git a/source/Octopus.Tentacle.Tests.Integration/Util/Builders/TentacleClientExtensionMethods.cs b/source/Octopus.Tentacle.Tests.Integration/Util/Builders/TentacleClientExtensionMethods.cs index 21a8c82bc..bf6ba7343 100644 --- a/source/Octopus.Tentacle.Tests.Integration/Util/Builders/TentacleClientExtensionMethods.cs +++ b/source/Octopus.Tentacle.Tests.Integration/Util/Builders/TentacleClientExtensionMethods.cs @@ -69,11 +69,6 @@ public static async Task AbandonScript( token).ConfigureAwait(false); } - // Some integration tests need to invoke CancelScript / GetStatus directly against an - // already-running ScriptServiceV2 script without going through ExecuteScript. They have - // a ScriptTicket but not a CommandContext (which TentacleClient's high-level methods - // expect). These helpers synthesize a CommandContext from the ticket so tests can call - // through TentacleClient instead of bypassing it with raw Halibut calls. public static async Task CancelScript( this TentacleClient tentacleClient, ScriptTicket scriptTicket, From d5f6b83a41cb57b06ff17023a405396ff143cef0 Mon Sep 17 00:00:00 2001 From: Jim Pelletier Date: Thu, 28 May 2026 15:04:56 +1000 Subject: [PATCH 38/52] Drop process.Close() removal comment from source Address PR review on #1226: source is cleaner without the pointer comment. The full historical reasoning lives in a top-level PR comment on #1226 (tagged for Luke's attention) so anyone reconsidering this in the future has to find it via the PR history rather than the source. 
--- .../Util/CommandLine/SilentProcessRunner.cs | 3 --- 1 file changed, 3 deletions(-) diff --git a/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs b/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs index 1cc20c06a..4fe392f8f 100644 --- a/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs +++ b/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs @@ -269,9 +269,6 @@ static void DoOurBestToCleanUp(Process process, Action error) error($"Failed to kill the launched process: {killProcessException}"); } } - // process.Close() was deliberately removed from this cancel-cleanup path. - // Adding it back will cause cancel to hang forever. See PR #1226 review - // thread for the full old-sync vs new-async explanation. } // Single place we block waiting for the spawned process to exit. From 830317f713be7acdc38bd822a48d2f7c77f42e04 Mon Sep 17 00:00:00 2001 From: Jim Pelletier Date: Thu, 28 May 2026 17:09:24 +1000 Subject: [PATCH 39/52] Address review feedback on #1226 - Delete the two plan files under docs/superpowers/plans/. Per Luke and Jim: plans are merge artefacts, specs stay (Jim is asking Luke to experiment with keeping specs for intent-discovery). - Drop the unused `using System.Threading;` from KubernetesDirectoryInformationProvider.cs (leftover from the earlier cancel-token-then-removed pass). - Assert that the sleep process is still running in the abandon test before the finally-block force-kills it. We don't care in production whether the script keeps running after abandon; the assertion is test-fixture confidence that we exercised the abandon path rather than accidentally cancelling. 
--- ...2026-05-21-tentacle-script-abandon-plan.md | 1374 ----------------- ...ync-migration-from-abandon-feature-plan.md | 1012 ------------ .../Util/SilentProcessRunnerFixture.cs | 10 + .../KubernetesDirectoryInformationProvider.cs | 1 - 4 files changed, 10 insertions(+), 2387 deletions(-) delete mode 100644 docs/superpowers/plans/2026-05-21-tentacle-script-abandon-plan.md delete mode 100644 docs/superpowers/plans/2026-05-25-split-async-migration-from-abandon-feature-plan.md diff --git a/docs/superpowers/plans/2026-05-21-tentacle-script-abandon-plan.md b/docs/superpowers/plans/2026-05-21-tentacle-script-abandon-plan.md deleted file mode 100644 index 107d40c62..000000000 --- a/docs/superpowers/plans/2026-05-21-tentacle-script-abandon-plan.md +++ /dev/null @@ -1,1374 +0,0 @@ -# Tentacle script abandon — implementation plan - -> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. - -**Spec:** `docs/superpowers/specs/2026-05-21-tentacle-script-abandon-design.md` -**Ticket:** [EFT-3295](https://linear.app/octopus/issue/EFT-3295/tentacle-script-abandonment-to-release-the-mutex) - -**Goal:** Add an `AbandonScript` verb to `IScriptServiceV2` so Octopus Server can tell Tentacle to release the `ScriptIsolationMutex` and accept new work even when `Process.Kill` failed to stop a stuck script. - -**Architecture:** Async migration of `SilentProcessRunner.ExecuteCommand` to `ExecuteCommandAsync`, replacing `process.WaitForExit()` with `await process.WaitForExitAsync(abandon)`. Two-token model on the call chain: existing `cancel` (drives kill via `cancel.Register`) and new `abandon` (drives the wait's early return). New RPC method on `IScriptServiceV2` fires the abandon token. Tentacle does NOT kill the OS process; the runaway is the customer's host-level problem per the ticket. 
- -**Tech stack:** .NET (multi-target), Halibut RPC, NUnit + FluentAssertions, NSubstitute for mocks. PowerShell on Windows, Bash on Linux. - -**Working branch:** `jimpelletier/eft-3295-tentacle-script-abandonment-to-release-the-mutex` (PR #1226). - ---- - -## File structure - -### New files -- `source/Octopus.Tentacle.Contracts/ScriptServiceV2/AbandonScriptCommandV2.cs` — new command DTO. - -### Modified — contracts -- `source/Octopus.Tentacle.Contracts/ScriptExitCodes.cs` — add `AbandonedExitCode = -48`. -- `source/Octopus.Tentacle.Contracts/ScriptServiceV2/IScriptServiceV2.cs` — add `AbandonScript` method signature. - -### Modified — production code -- `source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs` — `ExecuteCommand` → `ExecuteCommandAsync`; add `abandon` token; swap `WaitForExit()` for `await WaitForExitAsync(abandon)`; abandon catch returns `AbandonedExitCode` after `SafelyCancelRead`. -- `source/Octopus.Tentacle/Util/ISilentProcessRunner.cs` — interface and wrapper become async, add `abandon` parameter. -- `source/Octopus.Tentacle/Util/CommandLineRunner.cs` — caller migration to await. -- `source/Octopus.Tentacle.Core/Services/Scripts/RunningScript.cs` — `RunScript` → `RunScriptAsync`; constructor accepts `abandonToken`; plumb through. -- `source/Octopus.Tentacle.Core/Services/Scripts/ScriptServiceV2.cs` — `LaunchShell` passes `abandonToken`; `RunningScriptWrapper` gains `abandonTokenSource`; new `AbandonScriptAsync`; targeted best-effort `workspace.Delete` in `CompleteScriptAsync`. -- `source/Octopus.Tentacle/Services/Capabilities/CapabilitiesServiceV2.cs` — add `"AbandonScriptV2"` to the non-Kubernetes capability list. -- `source/Octopus.Tentacle.Core/Util/EnvironmentVariables.cs` — add `TentacleDebugDisableProcessKill = "TentacleDebugDisableProcessKill"`. 
- -### Modified — Kubernetes integration test scaffolding (caller migration only) -- `source/Octopus.Tentacle.Kubernetes.Tests.Integration/Tooling/KubeCtlTool.cs` (1 site) -- `source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/DockerImageLoader.cs` (2 sites) -- `source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/KubernetesAgentInstaller.cs` (3 sites) -- `source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/KubernetesClusterInstaller.cs` (4 sites) -- `source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/Tooling/HelmDownloader.cs` (1 site) -- `source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/Tooling/ToolDownloader.cs` (1 site) - -### Modified — Tentacle integration test scaffolding (caller migration only) -- `source/Octopus.Tentacle.Tests.Integration/PowerShellStartupDetectionTests.cs` (3 sites) -- `source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs` (existing tests need await; abandon tests added) -- `source/Octopus.Tentacle.Tests.Integration/Support/TentacleFetchers/LinuxTentacleFetcher.cs` (1 site) - -### New tests -- Additions inside `source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs` — abandon-token behaviour, async timing, thread-leak. -- Additions inside `source/Octopus.Tentacle.Tests/Util/RunningScriptFixture.cs` — abandon plumbing. -- Additions inside `source/Octopus.Tentacle.Tests/Integration/ScriptServiceV2Fixture.cs` — service-layer abandon paths. -- New file `source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs` — end-to-end mutex-release-on-abandon (mirrors `ClientScriptExecutionIsolationMutex.cs`). - ---- - -## Task ordering rationale - -Contracts first (no behaviour change, just shapes). Test affordance next (needed by later integration tests). Async migration is the biggest single change — done in one bottom-up pass with all callers migrated together so the build stays green. 
RunningScript / ScriptServiceV2 abandon wiring after the async machinery exists. Capability advertisement last (it's a one-line addition gating the whole feature). Tests interleaved with the behaviour they cover. - ---- - -### Task 1: Add `AbandonedExitCode = -48` - -**Files:** -- Modify: `source/Octopus.Tentacle.Contracts/ScriptExitCodes.cs` - -- [ ] **Step 1: Add the constant** - -Open `source/Octopus.Tentacle.Contracts/ScriptExitCodes.cs`. Add a new line right after `PowerShellNeverStartedExitCode = -47;`: - -```csharp -public const int AbandonedExitCode = -48; -``` - -The full block should read: - -```csharp -public const int PowerShellNeverStartedExitCode = -47; -public const int AbandonedExitCode = -48; - -//Kubernetes Agent -public const int KubernetesScriptPodNotFound = -81; -``` - -- [ ] **Step 2: Build** - -```bash -dotnet build source/Octopus.Tentacle.Contracts/Octopus.Tentacle.Contracts.csproj -``` - -Expected: build succeeds. - -- [ ] **Step 3: Commit** - -```bash -git add source/Octopus.Tentacle.Contracts/ScriptExitCodes.cs -git commit -m "Add AbandonedExitCode = -48 to ScriptExitCodes" -``` - ---- - -### Task 2: Add `AbandonScriptCommandV2` DTO - -**Files:** -- Create: `source/Octopus.Tentacle.Contracts/ScriptServiceV2/AbandonScriptCommandV2.cs` - -- [ ] **Step 1: Create the file** - -Use the same shape as `CancelScriptCommandV2.cs` (which lives in the same folder): - -```csharp -using System; - -namespace Octopus.Tentacle.Contracts.ScriptServiceV2 -{ - public class AbandonScriptCommandV2 - { - public AbandonScriptCommandV2(ScriptTicket ticket, long lastLogSequence) - { - Ticket = ticket; - LastLogSequence = lastLogSequence; - } - - public ScriptTicket Ticket { get; } - - public long LastLogSequence { get; } - } -} -``` - -- [ ] **Step 2: Build** - -```bash -dotnet build source/Octopus.Tentacle.Contracts/Octopus.Tentacle.Contracts.csproj -``` - -Expected: build succeeds. 
- -- [ ] **Step 3: Commit** - -```bash -git add source/Octopus.Tentacle.Contracts/ScriptServiceV2/AbandonScriptCommandV2.cs -git commit -m "Add AbandonScriptCommandV2 contract" -``` - ---- - -### Task 3: Add `AbandonScript` method to `IScriptServiceV2` - -**Files:** -- Modify: `source/Octopus.Tentacle.Contracts/ScriptServiceV2/IScriptServiceV2.cs` - -- [ ] **Step 1: Update the interface** - -Add `AbandonScript` between `CancelScript` and `CompleteScript`: - -```csharp -using System; - -namespace Octopus.Tentacle.Contracts.ScriptServiceV2 -{ - public interface IScriptServiceV2 - { - ScriptStatusResponseV2 StartScript(StartScriptCommandV2 command); - ScriptStatusResponseV2 GetStatus(ScriptStatusRequestV2 request); - ScriptStatusResponseV2 CancelScript(CancelScriptCommandV2 command); - ScriptStatusResponseV2 AbandonScript(AbandonScriptCommandV2 command); - void CompleteScript(CompleteScriptCommandV2 command); - } -} -``` - -- [ ] **Step 2: Build the whole solution** - -```bash -dotnet build source/Tentacle.sln -``` - -Expected: **build fails.** The async implementer (`ScriptServiceV2` in `Octopus.Tentacle.Core`) doesn't implement the new method yet. That's intentional — we'll fix it in Task 11. For now, capture the compile errors and confirm they're the expected "missing implementation" errors and nothing else. - -- [ ] **Step 3: Stash the stub on Halibut decorators** - -Tentacle wraps services with async decorators (look for `IAsyncScriptServiceV2`, `BackwardsCompatibleAsyncCapabilitiesV2Decorator`, etc). For the build to stay green between Task 3 and Task 11, add a **temporary** `NotImplementedException`-throwing stub to `ScriptServiceV2.cs`: - -Open `source/Octopus.Tentacle.Core/Services/Scripts/ScriptServiceV2.cs`. 
Add this method right after `CancelScriptAsync`: - -```csharp -public async Task AbandonScriptAsync(AbandonScriptCommandV2 command, CancellationToken cancellationToken) -{ - await Task.CompletedTask; - throw new NotImplementedException("Implemented in Task 11"); -} -``` - -- [ ] **Step 4: Build again, confirm green** - -```bash -dotnet build source/Tentacle.sln -``` - -Expected: build succeeds. - -- [ ] **Step 5: Commit** - -```bash -git add source/Octopus.Tentacle.Contracts/ScriptServiceV2/IScriptServiceV2.cs source/Octopus.Tentacle.Core/Services/Scripts/ScriptServiceV2.cs -git commit -m "Add AbandonScript to IScriptServiceV2 interface (stub)" -``` - ---- - -### Task 4: Add `TentacleDebugDisableProcessKill` env-var constant - -**Files:** -- Modify: `source/Octopus.Tentacle.Core/Util/EnvironmentVariables.cs` - -- [ ] **Step 1: Add the constant** - -Open the file. Add a new line in the `EnvironmentVariables` static class, grouped near the other `Tentacle*` constants: - -```csharp -public const string TentacleDebugDisableProcessKill = "TentacleDebugDisableProcessKill"; -``` - -- [ ] **Step 2: Build** - -```bash -dotnet build source/Octopus.Tentacle.Core/Octopus.Tentacle.Core.csproj -``` - -Expected: build succeeds. - -- [ ] **Step 3: Commit** - -```bash -git add source/Octopus.Tentacle.Core/Util/EnvironmentVariables.cs -git commit -m "Add TentacleDebugDisableProcessKill env var constant" -``` - -(The Hitman wiring happens in Task 6 alongside the async migration so the test affordance is in place before any new tests need it.) - ---- - -### Task 5: Make `SilentProcessRunner.ExecuteCommand` async — failing test first - -**Files:** -- Modify: `source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs` - -This is TDD's red step for the async migration. We're not going to migrate the whole call chain yet — we just write the new test that targets the future async method so it fails to compile, proving we need the new signature. 
- -- [ ] **Step 1: Add the failing test** - -Open `source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs`. Add this new test near the existing `CancellationToken_*` tests: - -```csharp -[Test] -public async Task AbandonToken_ShouldReturnAbandonedExitCodeWithoutKillingProcess() -{ - var command = PlatformDetection.IsRunningOnWindows ? "powershell.exe" : "/bin/bash"; - var arguments = PlatformDetection.IsRunningOnWindows - ? "-NoProfile -NonInteractive -Command \"Start-Sleep -Seconds 300\"" - : "-c \"sleep 300\""; - - using var cancelCts = new CancellationTokenSource(); - using var abandonCts = new CancellationTokenSource(); - - var infoMessages = new StringBuilder(); - - var sw = Stopwatch.StartNew(); - - var task = Task.Run(async () => await SilentProcessRunner.ExecuteCommandAsync( - command, - arguments, - Environment.CurrentDirectory, - debug: _ => { }, - info: msg => { lock (infoMessages) infoMessages.AppendLine(msg); }, - error: _ => { }, - customEnvironmentVariables: null, - cancel: cancelCts.Token, - abandon: abandonCts.Token)); - - // Give the process ~500ms to actually start before we abandon - await Task.Delay(500); - abandonCts.Cancel(); - - var exitCode = await task; - sw.Stop(); - - sw.Elapsed.Should().BeLessThan(TimeSpan.FromSeconds(2), "abandon should return promptly"); - exitCode.Should().Be(ScriptExitCodes.AbandonedExitCode); - infoMessages.ToString().Should().Contain("Tentacle has abandoned this script"); -} -``` - -Add the corresponding `using`s at the top if missing: - -```csharp -using System.Diagnostics; -using System.Threading.Tasks; -using Octopus.Tentacle.Contracts; -``` - -- [ ] **Step 2: Confirm it fails to compile** - -```bash -dotnet build source/Octopus.Tentacle.Tests.Integration/Octopus.Tentacle.Tests.Integration.csproj -``` - -Expected: compile error referencing `ExecuteCommandAsync` not existing on `SilentProcessRunner`. That's the red. 
- -- [ ] **Step 3: Commit (red phase)** - -```bash -git add source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs -git commit -m "Add failing test for AbandonToken behaviour in SilentProcessRunner" -``` - -We commit red because the next task migrates the production method; both will pass together once the migration completes. - ---- - -### Task 6: Migrate `SilentProcessRunner.ExecuteCommand` to async + add `abandon` token + plumb `TentacleDebugDisableProcessKill` - -**Files:** -- Modify: `source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs` - -This is the load-bearing implementation task. We rename the method, change the return to `Task`, add the `abandon` parameter, swap `process.WaitForExit()` for `await process.WaitForExitAsync(abandon)`, add the abandon catch with `SafelyCancelRead` + honest log line + `AbandonedExitCode`, and wire the env var into `Hitman.TryKillProcessAndChildrenRecursively`. - -- [ ] **Step 1: Update `ExecuteCommand` signature and body** - -Find the current `public static int ExecuteCommand(...)` overload at the top (around line 17). Update both overloads to be `async Task` and add the `abandon` parameter. The simpler overload should delegate to the richer one: - -```csharp -public static Task ExecuteCommandAsync( - string executable, - string arguments, - string workingDirectory, - Action debug, - Action info, - Action error, - CancellationToken cancel, - CancellationToken abandon) -{ - return ExecuteCommandAsync(executable, arguments, workingDirectory, debug, info, error, customEnvironmentVariables: null, cancel: cancel, abandon: abandon); -} - -public static async Task ExecuteCommandAsync( - string executable, - string arguments, - string workingDirectory, - Action debug, - Action info, - Action error, - IReadOnlyDictionary? customEnvironmentVariables = null, - CancellationToken cancel = default, - CancellationToken abandon = default) -{ - // ... existing argument-null checks ... - // ... 
existing process.StartInfo setup ... - process.Start(); - - var running = true; - - using (cancel.Register(() => - { - if (running) DoOurBestToCleanUp(process, error); - })) - { - if (cancel.IsCancellationRequested) - DoOurBestToCleanUp(process, error); - - process.BeginOutputReadLine(); - process.BeginErrorReadLine(); - - try - { - await process.WaitForExitAsync(abandon).ConfigureAwait(false); - } - catch (OperationCanceledException) when (abandon.IsCancellationRequested && !process.HasExited) - { - info("Tentacle has abandoned this script. The underlying script process may still be running on this host."); - SafelyCancelRead(process.CancelErrorRead, debug); - SafelyCancelRead(process.CancelOutputRead, debug); - running = false; - return ScriptExitCodes.AbandonedExitCode; - } - - SafelyCancelRead(process.CancelErrorRead, debug); - SafelyCancelRead(process.CancelOutputRead, debug); - - SafelyWaitForAllOutput(outputResetEvent, cancel, debug); - SafelyWaitForAllOutput(errorResetEvent, cancel, debug); - - var exitCode = SafelyGetExitCode(process); - debug($"Process {exeFileNameOrFullPath} in {workingDirectory} exited with code {exitCode}"); - - running = false; - return exitCode; - } -} -``` - -Notes: -- The old synchronous `ExecuteCommand` overloads are deleted. Every caller migrates in Tasks 7–9. -- `running = false` set inside the abandon catch as well — `cancel.Register`'s callback checks `running` to decide whether to call `DoOurBestToCleanUp`. After abandon we don't want it firing. - -- [ ] **Step 2: Wire `TentacleDebugDisableProcessKill` into `Hitman`** - -In the same file, find the `Hitman.TryKillProcessAndChildrenRecursively` method (around line 250). 
Add the env-var check at the top: - -```csharp -public static void TryKillProcessAndChildrenRecursively(Process process) -{ - if (!string.IsNullOrEmpty(Environment.GetEnvironmentVariable(EnvironmentVariables.TentacleDebugDisableProcessKill))) - { - // Test-only no-op: simulate "kill was attempted but didn't terminate the process". - // Only activated when the test harness sets this env var on the Tentacle process. - return; - } - -#if NETFRAMEWORK - TryKillWindowsProcessAndChildrenRecursively(process.Id); -#endif -#if !NETFRAMEWORK - process.Kill(true); -#endif -} -``` - -Add the `using` at the top if not already present: - -```csharp -using Octopus.Tentacle.Core.Util; -``` - -- [ ] **Step 3: Build (expect cascade failures from removed sync method)** - -```bash -dotnet build source/Tentacle.sln -``` - -Expected: many compile errors at every caller of the removed `ExecuteCommand`. That's the next several tasks. - -- [ ] **Step 4: Commit** - -```bash -git add source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs -git commit -m "Migrate SilentProcessRunner to async; add abandon token; debug kill-disable flag" -``` - ---- - -### Task 7: Migrate production callers to await - -**Files:** -- Modify: `source/Octopus.Tentacle/Util/ISilentProcessRunner.cs` -- Modify: `source/Octopus.Tentacle/Util/CommandLineRunner.cs` - -- [ ] **Step 1: Update `ISilentProcessRunner` interface and wrapper** - -Open `source/Octopus.Tentacle/Util/ISilentProcessRunner.cs`. 
Make the interface and wrapper async, add `abandon`: - -```csharp -public interface ISilentProcessRunner -{ - Task ExecuteCommandAsync( - string executable, - string arguments, - string workingDirectory, - Action info, - Action error, - CancellationToken cancel = default, - CancellationToken abandon = default); - - Task ExecuteCommandAsync( - string executable, - string arguments, - string workingDirectory, - Action debug, - Action info, - Action error, - CancellationToken cancel = default, - CancellationToken abandon = default); -} - -public class SilentProcessRunnerWrapper : ISilentProcessRunner -{ - public Task ExecuteCommandAsync(string executable, string arguments, string workingDirectory, Action info, Action error, CancellationToken cancel = default, CancellationToken abandon = default) - { - return SilentProcessRunnerExtended.ExecuteCommandAsync(executable, arguments, workingDirectory, info, error, cancel, abandon); - } - - public Task ExecuteCommandAsync(string executable, string arguments, string workingDirectory, Action debug, Action info, Action error, CancellationToken cancel = default, CancellationToken abandon = default) - { - return SilentProcessRunner.ExecuteCommandAsync(executable, arguments, workingDirectory, debug, info, error, cancel: cancel, abandon: abandon); - } -} -``` - -Update the `SilentProcessRunnerExtended` static helpers in the same file. The extension methods on `CommandLineInvocation` will need to become async too: - -```csharp -public static async Task ExecuteCommandAsync(this CommandLineInvocation invocation) - => await ExecuteCommandAsync(invocation, Environment.CurrentDirectory); - -public static async Task ExecuteCommandAsync(this CommandLineInvocation invocation, string workingDirectory) -{ - if (workingDirectory == null) - throw new ArgumentNullException(nameof(workingDirectory)); - - var arguments = $"{invocation.Arguments} {invocation.SystemArguments ?? 
string.Empty}"; - var infos = new List(); - var errors = new List(); - - var exitCode = await ExecuteCommandAsync( - invocation.Executable, - arguments, - workingDirectory, - infos.Add, - errors.Add - ); - - return new CmdResult(exitCode, infos, errors); -} - -public static Task ExecuteCommandAsync( - string executable, - string arguments, - string workingDirectory, - Action info, - Action error, - CancellationToken cancel = default, - CancellationToken abandon = default) - => SilentProcessRunner.ExecuteCommandAsync(executable, - arguments, - workingDirectory, - LogFileOnlyLogger.Current.Info, - info, - error, - customEnvironmentVariables: null, - cancel: cancel, - abandon: abandon); -``` - -- [ ] **Step 2: Migrate `CommandLineRunner`** - -Open `source/Octopus.Tentacle/Util/CommandLineRunner.cs`. Find the call to `SilentProcessRunner.ExecuteCommand` and convert. The whole method becomes async — propagate the change up the chain until you reach a natural async boundary or `Task.Run` / `.GetAwaiter().GetResult()` glue is needed. - -Pattern for each call site: - -```csharp -// Before: -var exitCode = SilentProcessRunner.ExecuteCommand(invocation.Executable, ...); -// After: -var exitCode = await SilentProcessRunner.ExecuteCommandAsync(invocation.Executable, ..., abandon: CancellationToken.None); -``` - -For `CommandLineRunner.Execute`, the method becomes `ExecuteAsync` returning `Task`. Any caller that hits a sync boundary uses `.GetAwaiter().GetResult()` *as a last resort, with a comment explaining why*. - -- [ ] **Step 3: Build** - -```bash -dotnet build source/Octopus.Tentacle/Octopus.Tentacle.csproj -``` - -Expected: build succeeds (or surfaces the next layer of callers; resolve them with the same pattern). 
- -- [ ] **Step 4: Commit** - -```bash -git add source/Octopus.Tentacle/Util/ISilentProcessRunner.cs source/Octopus.Tentacle/Util/CommandLineRunner.cs -git commit -m "Migrate ISilentProcessRunner and CommandLineRunner to async" -``` - ---- - -### Task 8: Migrate Kubernetes integration test scaffolding to await - -**Files:** -- Modify (caller migration only): - - `source/Octopus.Tentacle.Kubernetes.Tests.Integration/Tooling/KubeCtlTool.cs` - - `source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/DockerImageLoader.cs` - - `source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/KubernetesAgentInstaller.cs` - - `source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/KubernetesClusterInstaller.cs` - - `source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/Tooling/HelmDownloader.cs` - - `source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/Tooling/ToolDownloader.cs` - -- [ ] **Step 1: Apply the same caller pattern to every call site** - -Pattern at each `SilentProcessRunner.ExecuteCommand(...)`: - -```csharp -// Before (synchronous): -var exitCode = SilentProcessRunner.ExecuteCommand(executable, args, workingDir, debug, info, error, cancel: ct); -// After (async, abandon-token = None because these are setup tools, not Tentacle script execution): -var exitCode = await SilentProcessRunner.ExecuteCommandAsync(executable, args, workingDir, debug, info, error, cancel: ct, abandon: CancellationToken.None); -``` - -Make the containing method `async Task<int>` if it returns the exit code, or plain `async Task` if it doesn't. Propagate `async` up the call chain in this file. Most of these scaffolding methods are already called from `async` test setup, so the propagation is usually one or two layers. - -For commented-out lines (e.g. `KubernetesClusterInstaller.cs:129`), leave them commented. 
- -- [ ] **Step 2: Build the K8s integration test project** - -```bash -dotnet build source/Octopus.Tentacle.Kubernetes.Tests.Integration/Octopus.Tentacle.Kubernetes.Tests.Integration.csproj -``` - -Expected: build succeeds. - -- [ ] **Step 3: Commit** - -```bash -git add source/Octopus.Tentacle.Kubernetes.Tests.Integration/ -git commit -m "Migrate Kubernetes integration test scaffolding to async ExecuteCommandAsync" -``` - ---- - -### Task 9: Migrate Tentacle integration test scaffolding to await - -**Files:** -- Modify: - - `source/Octopus.Tentacle.Tests.Integration/PowerShellStartupDetectionTests.cs` (3 sites) - - `source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs` (existing sync tests — but also fix the helper there) - - `source/Octopus.Tentacle.Tests.Integration/Support/TentacleFetchers/LinuxTentacleFetcher.cs` - -- [ ] **Step 1: Migrate each caller** - -Same pattern as Task 8. `SilentProcessRunner.ExecuteCommand(...)` → `await SilentProcessRunner.ExecuteCommandAsync(..., abandon: CancellationToken.None)`. Containing methods become `async Task<...>`. - -In `SilentProcessRunnerFixture.cs`, there's a private helper near the top that wraps `ExecuteCommand` for the existing tests (`Execute(...)`). Migrate it. The helper is deliberately not marked `async`: C# does not allow `out` parameters on `async` methods (compiler error CS1988), so it assigns the `StringBuilder`s up front and returns the `Task<int>` from `ExecuteCommandAsync` directly: - -```csharp -static Task<int> ExecuteAsync(string command, string arguments, string workingDirectory, out StringBuilder debugMessages, out StringBuilder infoMessages, out StringBuilder errorMessages, CancellationToken cancel = default, CancellationToken abandon = default) -``` - -Each existing test that calls `Execute(...)` now calls `await ExecuteAsync(...)`. Tests become `async Task` returning methods. NUnit handles that. - -- [ ] **Step 2: Build the Tentacle integration test project** - -```bash -dotnet build source/Octopus.Tentacle.Tests.Integration/Octopus.Tentacle.Tests.Integration.csproj -``` - -Expected: build succeeds. 
- -- [ ] **Step 3: Run the existing SilentProcessRunner tests on Linux + Windows** - -```bash -dotnet test source/Octopus.Tentacle.Tests.Integration --filter "FullyQualifiedName~SilentProcessRunnerFixture" -``` - -Expected: all existing tests pass. The new `AbandonToken_ShouldReturnAbandonedExitCodeWithoutKillingProcess` test from Task 5 ALSO passes now that the production method exists. Green. - -- [ ] **Step 4: Commit** - -```bash -git add source/Octopus.Tentacle.Tests.Integration/ -git commit -m "Migrate Tentacle integration test scaffolding to async; AbandonToken test now passes" -``` - ---- - -### Task 10: Add abandon support to `RunningScript` - -**Files:** -- Modify: `source/Octopus.Tentacle.Core/Services/Scripts/RunningScript.cs` -- Modify: `source/Octopus.Tentacle.Tests/Util/RunningScriptFixture.cs` (or wherever the existing fixture lives — adjust path if it's in `Octopus.Tentacle.Tests.Integration`) - -- [ ] **Step 1: Write the failing test** - -Open the existing `RunningScriptFixture.cs`. 
Add a test that exercises the abandon path: - -```csharp -[Test] -public async Task Execute_WhenAbandonTokenFires_ReturnsAbandonedExitCode() -{ - // arrange: a workspace + shell that runs a long-sleeping script - var workspace = CreateWorkspace(bashScript: "sleep 300", powershellScript: "Start-Sleep -Seconds 300"); - var shell = new Bash(); // or appropriate cross-platform helper - using var runningCts = new CancellationTokenSource(); - using var abandonCts = new CancellationTokenSource(); - - var runningScript = new RunningScript( - shell, - workspace, - stateStore: null, - scriptLog: workspace.CreateLog(), - taskId: "ServerTask-1", - scriptIsolationMutex: new ScriptIsolationMutex(), - runningScriptToken: runningCts.Token, - abandonToken: abandonCts.Token, - environmentVariables: new Dictionary(), - powerShellStartupTimeout: TimeSpan.FromMinutes(1), - log: Substitute.For()); - - var executeTask = runningScript.Execute(); - await Task.Delay(500); // let the process start - abandonCts.Cancel(); - - await executeTask; - runningScript.State.Should().Be(ProcessState.Complete); - runningScript.ExitCode.Should().Be(ScriptExitCodes.AbandonedExitCode); -} -``` - -Run it; expect compile failure ("RunningScript constructor doesn't accept abandonToken"). - -- [ ] **Step 2: Add `abandonToken` to `RunningScript`** - -In `RunningScript.cs`, add a field and constructor parameter: - -```csharp -readonly CancellationToken runningScriptToken; -readonly CancellationToken abandonToken; // NEW - -public RunningScript(IShell shell, - IScriptWorkspace workspace, - IScriptStateStore? stateStore, - IScriptLog scriptLog, - string taskId, - ScriptIsolationMutex scriptIsolationMutex, - CancellationToken runningScriptToken, - CancellationToken abandonToken, // NEW - IReadOnlyDictionary environmentVariables, - TimeSpan powerShellStartupTimeout, - ILog log) -{ - // ... existing assignments ... - this.abandonToken = abandonToken; - // ... 
-} -``` - -Update the secondary constructor that omits `stateStore` to pass `abandonToken` through as well. - -- [ ] **Step 3: Replace `RunScript` with async, plumb `abandonToken`** - -Replace the existing `int RunScript(string shellPath, IScriptLogWriter writer, CancellationToken cancellationToken)` with: - -```csharp -async Task RunScriptAsync(string shellPath, IScriptLogWriter writer, CancellationToken cancellationToken, CancellationToken abandon) -{ - try - { - var exitCode = await SilentProcessRunner.ExecuteCommandAsync( - shellPath, - shell.FormatCommandArguments(workspace.BootstrapScriptFilePath, workspace.ScriptArguments, false), - workspace.WorkingDirectory, - LogScriptOutputTo(writer, ProcessOutputSource.Debug), - LogScriptOutputTo(writer, ProcessOutputSource.StdOut), - LogScriptOutputTo(writer, ProcessOutputSource.StdErr), - environmentVariables, - cancellationToken, - abandon); - - return exitCode; - } - catch (Exception ex) - { - writer.WriteOutput(ProcessOutputSource.StdErr, "An exception was thrown when invoking " + shellPath + ": " + ex.Message); - writer.WriteOutput(ProcessOutputSource.StdErr, ex.ToString()); - return ScriptExitCodes.PowershellInvocationErrorExitCode; - } -} -``` - -- [ ] **Step 4: Update `Execute` to await the async `RunScriptAsync`** - -Inside `Execute()`, change the call: - -```csharp -exitCode = workspace.ShouldMonitorPowerShellStartup() - ? await RunPowershellScriptWithMonitoring(shellPath, writer, runningScriptToken) - : await RunScriptAsync(shellPath, writer, runningScriptToken, abandonToken); -``` - -Inside `RunPowershellScriptWithMonitoring`, find the `Task.Run(() => RunScript(...))` line and change to `Task.Run(async () => await RunScriptAsync(shellPath, writer, scriptTaskCts.Token, abandonToken), scriptTaskCts.Token)`. 
- -- [ ] **Step 5: Build and run the new test** - -```bash -dotnet build source/Tentacle.sln -dotnet test source/Octopus.Tentacle.Tests.Integration --filter "Execute_WhenAbandonTokenFires" -``` - -Expected: build succeeds; the new test passes. Build of the broader solution will surface that `ScriptServiceV2.cs` doesn't pass `abandonToken` yet — that's Task 11. - -- [ ] **Step 6: Commit** - -```bash -git add source/Octopus.Tentacle.Core/Services/Scripts/RunningScript.cs source/Octopus.Tentacle.Tests/Util/RunningScriptFixture.cs -git commit -m "Plumb abandon token through RunningScript; covered by new test" -``` - ---- - -### Task 11: Implement `ScriptServiceV2.AbandonScriptAsync` and add `abandonTokenSource` to wrapper - -**Files:** -- Modify: `source/Octopus.Tentacle.Core/Services/Scripts/ScriptServiceV2.cs` -- Modify: `source/Octopus.Tentacle.Tests/Integration/ScriptServiceV2Fixture.cs` (existing fixture) - -- [ ] **Step 1: Write failing service-layer tests** - -In `ScriptServiceV2Fixture.cs`, add these tests: - -```csharp -[Test] -public async Task AbandonScript_OnUnknownTicket_ReturnsCompleteWithUnknownScriptExitCode() -{ - var service = CreateService(); - var ticket = new ScriptTicket("unknown"); - var response = await service.AbandonScriptAsync(new AbandonScriptCommandV2(ticket, 0), CancellationToken.None); - - response.State.Should().Be(ProcessState.Complete); - response.ExitCode.Should().Be(ScriptExitCodes.UnknownScriptExitCode); -} - -[Test] -public async Task AbandonScript_OnRunningScript_FiresAbandonToken_ReleasesMutex_ReturnsAbandonedExitCode() -{ - var service = CreateService(); - - // start a script that will block on a file-wait, so it stays Running until we release it - var startCommand = BuildLongRunningCommand(); // uses TestExecuteShellScriptCommandBuilder - await service.StartScriptAsync(startCommand, CancellationToken.None); - - var response = await service.AbandonScriptAsync( - new AbandonScriptCommandV2(startCommand.ScriptTicket, 0), - 
CancellationToken.None); - - response.State.Should().Be(ProcessState.Complete); - response.ExitCode.Should().Be(ScriptExitCodes.AbandonedExitCode); - - // mutex should be free: a new FullIsolation script should start - var second = BuildLongRunningCommand(); - var secondResponse = await service.StartScriptAsync(second, CancellationToken.None); - secondResponse.State.Should().NotBe(ProcessState.Pending); // i.e. wasn't blocked on the mutex -} - -[Test] -public async Task AbandonScript_OnAlreadyCompletedScript_ReturnsRealExitCodeNotAbandoned() -{ - var service = CreateService(); - var startCommand = BuildShortRunningCommand(exitCode: 0); // completes quickly - - await service.StartScriptAsync(startCommand, CancellationToken.None); - - // wait for completion - ScriptStatusResponseV2 status; - do { status = await service.GetStatusAsync(new ScriptStatusRequestV2(startCommand.ScriptTicket, 0), CancellationToken.None); } - while (status.State != ProcessState.Complete); - - var abandonResponse = await service.AbandonScriptAsync(new AbandonScriptCommandV2(startCommand.ScriptTicket, 0), CancellationToken.None); - abandonResponse.ExitCode.Should().Be(0, "real exit code should be returned, not AbandonedExitCode"); -} -``` - -Run: expect the tests to fail at runtime, not at compile time (the stub from Task 3 keeps the build green but throws `NotImplementedException`, so the assertions are never satisfied). - -- [ ] **Step 2: Implement `AbandonScriptAsync`** - -Replace the Task 3 stub with the real implementation. Also add `abandonTokenSource` to `RunningScriptWrapper`: - -```csharp -class RunningScriptWrapper : IDisposable -{ - readonly CancellationTokenSource cancellationTokenSource = new(); - readonly CancellationTokenSource abandonTokenSource = new(); - - public RunningScriptWrapper(ScriptStateStore scriptStateStore) - { - ScriptStateStore = scriptStateStore; - CancellationToken = cancellationTokenSource.Token; - AbandonToken = abandonTokenSource.Token; - } - - public RunningScript? 
Process { get; set; } - public ScriptStateStore ScriptStateStore { get; } - public SemaphoreSlim StartScriptMutex { get; } = new(1, 1); - - public CancellationToken CancellationToken { get; } - public CancellationToken AbandonToken { get; } - - public void Cancel() => cancellationTokenSource.Cancel(); - public void Abandon() => abandonTokenSource.Cancel(); - - public void Dispose() - { - cancellationTokenSource.Dispose(); - abandonTokenSource.Dispose(); - } -} -``` - -Replace the stub `AbandonScriptAsync`: - -```csharp -public async Task AbandonScriptAsync(AbandonScriptCommandV2 command, CancellationToken cancellationToken) -{ - await Task.CompletedTask; - - if (runningScripts.TryGetValue(command.Ticket, out var runningScript)) - { - runningScript.Abandon(); - } - - return GetResponse(command.Ticket, command.LastLogSequence, runningScript?.Process); -} -``` - -In `LaunchShell`, pass `abandonToken` through: - -```csharp -RunningScript LaunchShell(ScriptTicket ticket, string serverTaskId, IScriptWorkspace workspace, IScriptStateStore stateStore, CancellationToken cancellationToken, CancellationToken abandonToken) -{ - var runningScript = new RunningScript(shell, workspace, stateStore, workspace.CreateLog(), serverTaskId, scriptIsolationMutex, cancellationToken, abandonToken, environmentVariables, powerShellStartupTimeout, log); - _ = Task.Run(async () => await runningScript.Execute()); - return runningScript; -} -``` - -Update the call site of `LaunchShell` in `StartScriptAsync` to pass `runningScript.AbandonToken`. - -- [ ] **Step 3: Run the new tests** - -```bash -dotnet test source/Octopus.Tentacle.Tests --filter "FullyQualifiedName~ScriptServiceV2Fixture.AbandonScript" -``` - -Expected: all three new tests pass. 
- -- [ ] **Step 4: Commit** - -```bash -git add source/Octopus.Tentacle.Core/Services/Scripts/ScriptServiceV2.cs source/Octopus.Tentacle.Tests/Integration/ScriptServiceV2Fixture.cs -git commit -m "Implement ScriptServiceV2.AbandonScriptAsync with abandon-token wrapper" -``` - ---- - -### Task 12: Targeted best-effort `CompleteScript` - -**Files:** -- Modify: `source/Octopus.Tentacle.Core/Services/Scripts/ScriptServiceV2.cs` -- Modify: `source/Octopus.Tentacle.Tests/Integration/ScriptServiceV2Fixture.cs` - -- [ ] **Step 1: Write a failing test** - -Add to `ScriptServiceV2Fixture.cs`: - -```csharp -[Test] -public async Task CompleteScript_AfterAbandon_WhenWorkspaceDeleteFails_LogsWarnAndReturnsNormally() -{ - var service = CreateService(); // factory should let us inject a workspace whose Delete throws IOException - var startCommand = BuildLongRunningCommand(); - await service.StartScriptAsync(startCommand, CancellationToken.None); - await service.AbandonScriptAsync(new AbandonScriptCommandV2(startCommand.ScriptTicket, 0), CancellationToken.None); - - // arrange the workspace.Delete to fail - ArrangeWorkspaceDeleteToThrow(startCommand.ScriptTicket, new IOException("file in use")); - - var complete = async () => await service.CompleteScriptAsync(new CompleteScriptCommandV2(startCommand.ScriptTicket, 0), CancellationToken.None); - - await complete.Should().NotThrowAsync(); - // assert: systemLog received a Warn entry mentioning the leaked directory - fakeSystemLog.WarnMessages.Should().Contain(m => m.Contains("Could not delete") && m.Contains(startCommand.ScriptTicket.TaskId)); -} - -[Test] -public async Task CompleteScript_AfterNormalCompletion_WhenWorkspaceDeleteFails_PropagatesException() -{ - var service = CreateService(); - var startCommand = BuildShortRunningCommand(exitCode: 0); - await service.StartScriptAsync(startCommand, CancellationToken.None); - - // poll until natural completion - ScriptStatusResponseV2 status; - var deadline = 
DateTime.UtcNow.AddSeconds(30); - do - { - status = await service.GetStatusAsync(new ScriptStatusRequestV2(startCommand.ScriptTicket, 0), CancellationToken.None); - if (status.State == ProcessState.Complete) break; - await Task.Delay(50); - } while (DateTime.UtcNow < deadline); - status.State.Should().Be(ProcessState.Complete); - status.ExitCode.Should().Be(0, "the script exited cleanly, not via abandon"); - - ArrangeWorkspaceDeleteToThrow(startCommand.ScriptTicket, new IOException("file in use")); - - var complete = async () => await service.CompleteScriptAsync(new CompleteScriptCommandV2(startCommand.ScriptTicket, 0), CancellationToken.None); - - await complete.Should().ThrowAsync(); -} -``` - -Run: expect both to fail (current code propagates the exception unconditionally). - -- [ ] **Step 2: Update `CompleteScriptAsync`** - -Replace the existing implementation: - -```csharp -public async Task CompleteScriptAsync(CompleteScriptCommandV2 command, CancellationToken cancellationToken) -{ - if (runningScripts.TryRemove(command.Ticket, out var runningScript)) - { - runningScript.Dispose(); - } - - var workspace = workspaceFactory.GetWorkspace(command.Ticket, WorkspaceReadinessCheck.Skip); - - var stateStore = scriptStateStoreFactory.Create(workspace); - var wasAbandoned = stateStore.Exists() - && stateStore.Load().ExitCode == ScriptExitCodes.AbandonedExitCode; - - if (wasAbandoned) - { - try - { - await workspace.Delete(cancellationToken); - } - catch (Exception ex) - { - log.Warn(ex, $"Could not delete abandoned workspace at {workspace.WorkingDirectory}. Leaving on disk; the underlying script process may still hold open file handles."); - } - } - else - { - await workspace.Delete(cancellationToken); - } -} -``` - -- [ ] **Step 3: Run the new tests** - -```bash -dotnet test source/Octopus.Tentacle.Tests --filter "FullyQualifiedName~ScriptServiceV2Fixture.CompleteScript" -``` - -Expected: both new tests pass; existing CompleteScript tests still pass. 
- -- [ ] **Step 4: Commit** - -```bash -git add source/Octopus.Tentacle.Core/Services/Scripts/ScriptServiceV2.cs source/Octopus.Tentacle.Tests/Integration/ScriptServiceV2Fixture.cs -git commit -m "Best-effort workspace.Delete gated on AbandonedExitCode" -``` - ---- - -### Task 13: Advertise `AbandonScriptV2` capability - -**Files:** -- Modify: `source/Octopus.Tentacle/Services/Capabilities/CapabilitiesServiceV2.cs` -- Modify: `source/Octopus.Tentacle.Tests/Capabilities/CapabilitiesServiceV2Fixture.cs` (existing fixture) - -- [ ] **Step 1: Write the failing test** - -In `CapabilitiesServiceV2Fixture.cs`: - -```csharp -[Test] -public async Task GetCapabilities_OnNonKubernetesTentacle_AdvertisesAbandonScriptV2() -{ - var service = new CapabilitiesServiceV2(); - var response = await service.GetCapabilitiesAsync(CancellationToken.None); - response.SupportedCapabilities.Should().Contain("AbandonScriptV2"); -} - -[Test] -public async Task GetCapabilities_OnKubernetesTentacle_DoesNotAdvertiseAbandonScriptV2() -{ - // arrange KubernetesSupportDetection.IsRunningAsKubernetesAgent = true (test-only override; mirror existing pattern in the fixture) - var service = new CapabilitiesServiceV2(); - var response = await service.GetCapabilitiesAsync(CancellationToken.None); - response.SupportedCapabilities.Should().NotContain("AbandonScriptV2"); -} -``` - -Run: expect both to fail. - -- [ ] **Step 2: Add the capability string** - -In `CapabilitiesServiceV2.cs`: - -```csharp -return new CapabilitiesResponseV2(new List -{ - nameof(IScriptService), - nameof(IFileTransferService), - nameof(IScriptServiceV2), - "AbandonScriptV2" -}); -``` - -- [ ] **Step 3: Run the tests** - -```bash -dotnet test source/Octopus.Tentacle.Tests --filter "FullyQualifiedName~CapabilitiesServiceV2Fixture.GetCapabilities" -``` - -Expected: both new tests pass; existing capability tests still pass. 
- -- [ ] **Step 4: Commit** - -```bash -git add source/Octopus.Tentacle/Services/Capabilities/CapabilitiesServiceV2.cs source/Octopus.Tentacle.Tests/Capabilities/CapabilitiesServiceV2Fixture.cs -git commit -m "Advertise AbandonScriptV2 capability" -``` - ---- - -### Task 14: Integration test — mutex release on abandon - -**Files:** -- Create: `source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs` - -This is the load-bearing end-to-end test. Mirrors `ClientScriptExecutionIsolationMutex.cs`. Uses the existing builders (`TestExecuteShellScriptCommandBuilder`, `ScriptBuilder`, `Wait.For`, `TentacleServiceDecoratorBuilder`) — do NOT use raw shell + `Thread.Sleep`. - -- [ ] **Step 1: Create the file** - -```csharp -using System; -using System.Collections; -using System.IO; -using System.Threading.Tasks; -using FluentAssertions; -using NUnit.Framework; -using Octopus.Tentacle.Contracts; -using Octopus.Tentacle.Contracts.ScriptServiceV2; -using Octopus.Tentacle.Tests.Integration.Support; -using Octopus.Tentacle.Tests.Integration.Util; -using Octopus.Tentacle.Tests.Integration.Util.Builders; - -namespace Octopus.Tentacle.Tests.Integration -{ - [IntegrationTestTimeout] - public class ClientScriptExecutionAbandon : IntegrationTest - { - [Test] - [TentacleConfigurations(scriptServiceToTest: ScriptServiceVersionToTest.V2)] - public async Task AbandonScript_WhileScriptIsRunning_ReleasesMutexAndReturnsAbandonedExitCode(TentacleConfigurationTestCase tcc) - { - await using var clientTentacle = await tcc.CreateBuilder() - .WithTentacleEnvironmentVariable("TentacleDebugDisableProcessKill", "1") // make Hitman a no-op - .Build(CancellationToken); - - var startFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "start"); - var releaseFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "release"); - - // first script: signals "started" then blocks until release file appears - var firstCommand = new 
TestExecuteShellScriptCommandBuilder() - .SetScriptBody(new ScriptBuilder() - .CreateFile(startFile) - .WaitForFileToExist(releaseFile)) - .WithIsolationLevel(ScriptIsolationLevel.FullIsolation) - .WithIsolationMutexName("abandon-test-mutex") - .Build(); - - var tentacleClient = clientTentacle.TentacleClient; - - var firstScriptExecution = Task.Run(async () => await tentacleClient.ExecuteScript(firstCommand, CancellationToken)); - - // wait until the first script is actually running - await Wait.For(() => File.Exists(startFile), - TimeSpan.FromSeconds(30), - () => throw new Exception("first script did not start"), - CancellationToken); - - // cancel first (kill is mocked off, so the script keeps running) - await tentacleClient.ScriptServiceV2.CancelScriptAsync(new CancelScriptCommandV2(firstCommand.ScriptTicket, 0), CancellationToken); - - // give cancel a moment to be attempted; then abandon - await Task.Delay(TimeSpan.FromSeconds(1)); - - var abandonResponse = await tentacleClient.ScriptServiceV2.AbandonScriptAsync(new AbandonScriptCommandV2(firstCommand.ScriptTicket, 0), CancellationToken); - abandonResponse.State.Should().Be(ProcessState.Complete); - abandonResponse.ExitCode.Should().Be(ScriptExitCodes.AbandonedExitCode); - - // load-bearing: second FullIsolation script should now start, proving the mutex was released - var secondStartFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "second-start"); - var secondCommand = new TestExecuteShellScriptCommandBuilder() - .SetScriptBody(new ScriptBuilder().CreateFile(secondStartFile)) - .WithIsolationLevel(ScriptIsolationLevel.FullIsolation) - .WithIsolationMutexName("abandon-test-mutex") - .Build(); - - var secondResult = await tentacleClient.ExecuteScript(secondCommand, CancellationToken); - secondResult.response.ExitCode.Should().Be(0); - File.Exists(secondStartFile).Should().BeTrue(); - - // release the first script so the test process doesn't leak forever - File.WriteAllText(releaseFile, ""); 
- } - } -} -``` - -If `WithTentacleEnvironmentVariable` doesn't exist on the builder, add it as a small helper in `ClientAndTentacleBuilder` and propagate to the Tentacle process startup environment. - -- [ ] **Step 2: Run the new test on Linux** - -```bash -dotnet test source/Octopus.Tentacle.Tests.Integration --filter "ClientScriptExecutionAbandon" -``` - -Expected: passes. - -- [ ] **Step 3: Run on Windows CI** - -Push to the branch and verify the Windows CI job passes. - -- [ ] **Step 4: Commit** - -```bash -git add source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs -git commit -m "Integration test: AbandonScript releases mutex when kill mocked off" -``` - ---- - -### Task 15: Integration test — multi-level-deep hang variant - -**Files:** -- Modify: `source/Octopus.Tentacle.CommonTestUtils/Builders/ScriptBuilder.cs` (add `AppendRaw` helper) -- Modify: `source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs` (add second test) - -- [ ] **Step 0: Add `AppendRaw` to `ScriptBuilder`** - -The existing `ScriptBuilder` doesn't have a way to inject shell-specific raw command lines. Add this helper near `Print` / `Sleep`: - -```csharp -public ScriptBuilder AppendRaw(string bash, string windows) -{ - bashScript.AppendLine(bash); - windowsScript.AppendLine(windows); - return this; -} -``` - -Commit this separately so the helper is available before the multi-level test depends on it: - -```bash -git add source/Octopus.Tentacle.CommonTestUtils/Builders/ScriptBuilder.cs -git commit -m "Add ScriptBuilder.AppendRaw for shell-specific command injection" -``` - -The ticket explicitly asks for a "multi-level-deep hang (bootstrap → Calamari → script → AV)" test. 
- -- [ ] **Step 1: Add the test** - -```csharp -[Test] -[TentacleConfigurations(scriptServiceToTest: ScriptServiceVersionToTest.V2)] -public async Task AbandonScript_MultiLevelDeepHang_StillReleasesMutex(TentacleConfigurationTestCase tcc) -{ - await using var clientTentacle = await tcc.CreateBuilder() - .WithTentacleEnvironmentVariable("TentacleDebugDisableProcessKill", "1") - .Build(CancellationToken); - - var startFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "start"); - var releaseFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "release"); - - // Multi-level chain: Tentacle runs the outer shell (bootstrap), which launches a child shell - // which itself launches a grandchild that polls for the release file. Mirrors - // bootstrap → Calamari → user script. - var script = new ScriptBuilder() - .CreateFile(startFile) - .AppendRaw( - bash: $"bash -c \"bash -c 'while [ ! -f {releaseFile.Replace("\\", "/")} ]; do sleep 0.5; done'\"", - windows: $"powershell -NoProfile -Command \"powershell -NoProfile -Command 'while (-not (Test-Path \\\"{releaseFile}\\\")) {{ Start-Sleep -Milliseconds 500 }}'\""); - - var command = new TestExecuteShellScriptCommandBuilder() - .SetScriptBody(script) - .WithIsolationLevel(ScriptIsolationLevel.FullIsolation) - .WithIsolationMutexName("abandon-multilevel-mutex") - .Build(); - - var tentacleClient = clientTentacle.TentacleClient; - var firstExecution = Task.Run(async () => await tentacleClient.ExecuteScript(command, CancellationToken)); - await Wait.For(() => File.Exists(startFile), - TimeSpan.FromSeconds(30), - () => throw new Exception("multi-level script did not start"), - CancellationToken); - - await tentacleClient.ScriptServiceV2.CancelScriptAsync(new CancelScriptCommandV2(command.ScriptTicket, 0), CancellationToken); - await Task.Delay(TimeSpan.FromSeconds(1)); - - var abandonResponse = await tentacleClient.ScriptServiceV2.AbandonScriptAsync(new AbandonScriptCommandV2(command.ScriptTicket, 
0), CancellationToken); - abandonResponse.ExitCode.Should().Be(ScriptExitCodes.AbandonedExitCode); - - // mutex released check (same as Task 14) - var secondCommand = new TestExecuteShellScriptCommandBuilder() - .SetScriptBody(new ScriptBuilder().CreateFile(Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "second"))) - .WithIsolationLevel(ScriptIsolationLevel.FullIsolation) - .WithIsolationMutexName("abandon-multilevel-mutex") - .Build(); - var secondResult = await tentacleClient.ExecuteScript(secondCommand, CancellationToken); - secondResult.response.ExitCode.Should().Be(0); - - File.WriteAllText(releaseFile, ""); -} -``` - -- [ ] **Step 2: Run** - -```bash -dotnet test source/Octopus.Tentacle.Tests.Integration --filter "AbandonScript_MultiLevelDeepHang" -``` - -Expected: passes. - -- [ ] **Step 3: Commit** - -```bash -git add source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs -git commit -m "Integration test: multi-level-deep hang abandons cleanly" -``` - ---- - -### Task 16: Full test suite + push for CI - -- [ ] **Step 1: Run the entire test suite locally** - -```bash -dotnet test source/Tentacle.sln -``` - -Expected: all tests pass on Linux. (Windows-only tests will skip locally if not on Windows.) - -- [ ] **Step 2: Push for CI** - -```bash -git push -``` - -Wait for the GitHub Actions check on PR #1226. All matrices (Linux, Windows, both target frameworks) must pass. - -- [ ] **Step 3: Address any platform-specific failures** - -Most likely areas: -- Workspace-cleanup test on Linux: Linux generally allows deletion of open files (the inode survives until handles close). The "delete fails" test may need a Windows-only attribute. -- Thread-count assertion timing: bump the delta tolerance if CI jitter is higher than dev box. 
- -- [ ] **Step 4: Final commit (if any fixes needed)** - -```bash -git add <files-changed-by-the-fixes> -git commit -m "Address CI platform-specific test failures" -git push -``` - ---- - -## Self-review checklist (run after writing the plan, before handing off) - -- [ ] Spec coverage: every section of `docs/superpowers/specs/2026-05-21-tentacle-script-abandon-design.md` maps to at least one task above. -- [ ] No `TODO`, `TBD`, `implement later`, or "add appropriate error handling" placeholders. -- [ ] Type/method names consistent across tasks (`ExecuteCommandAsync`, `AbandonScriptCommandV2`, `AbandonedExitCode`, `abandonToken`, `AbandonScriptAsync`). -- [ ] Every code step shows the actual code, not a description. -- [ ] Every command step shows the exact command and the expected outcome. - -## Notes for execution - -- **Frequent commits.** Each task above is one commit. Don't bundle. -- **Build green between tasks.** Task 3 introduces a `NotImplementedException` stub precisely so the build stays green between contracts and the implementation in Task 11. -- **Test cleanup.** Several integration tests leave a running PowerShell / bash sleep process behind (because `TentacleDebugDisableProcessKill` is set). The tests must release them via the release-file pattern. Forgetting cleanup will leak processes on the CI box. -- **Coordination with server-side.** Server-side session is on a parallel branch in `OctopusDeploy/OctopusDeploy`. Once both PRs are mergeable, coordinate the contract package version bump so Server picks up the new contract in lockstep. 
diff --git a/docs/superpowers/plans/2026-05-25-split-async-migration-from-abandon-feature-plan.md b/docs/superpowers/plans/2026-05-25-split-async-migration-from-abandon-feature-plan.md deleted file mode 100644 index 70f5db598..000000000 --- a/docs/superpowers/plans/2026-05-25-split-async-migration-from-abandon-feature-plan.md +++ /dev/null @@ -1,1012 +0,0 @@ -# Split Async Migration from Abandon Feature — Implementation Plan - -> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. - -**Goal:** Restructure the existing PR stack so the async migration of `SilentProcessRunner` sits in its own foundational PR, and the abandon feature (PR #1226) stacks on top of it. - -**Architecture:** End-state rebuild. Create a fresh branch from `main` containing only the async migration + sync-boundary comments. Then force-push #1226 with the abandon delta on top. PR #1235 rebases on the new #1226. - -**Tech Stack:** C# (.NET 8 + net48 polyfill), Autofac DI, NUnit tests, git worktree workflow. - -**Spec:** `docs/superpowers/specs/2026-05-25-split-async-migration-from-abandon-feature-design.md` - -**Reference state:** The current tip of `jimpelletier/eft-3295-tentacle-script-abandonment-to-release-the-mutex` is `583eb46c` (now: `46f09e7e` after the spec commit). The diff from `main` to that commit contains BOTH PRs' content combined. 
- ---- - -## Phase 0 — Preparation - -### Task 0.1: Tag the current state as a safety reference - -**Files:** none (git only) - -- [ ] **Step 1: Tag the current abandon branch tip** - -```bash -cd /Users/jim/code/OctopusTentacle -git tag claude-safety-2026-05-25-pre-split jimpelletier/eft-3295-tentacle-script-abandonment-to-release-the-mutex -git tag claude-safety-2026-05-25-pre-split-1235 jimpelletier/eft-3295-async-signature-propagation -``` - -- [ ] **Step 2: Verify tags** - -```bash -git tag -l "claude-safety-*" -``` - -Expected: at least these two tags listed. - ---- - -## Phase 1 — Build the base PR - -### Task 1.1: Create new base branch from main - -**Files:** none (git only) - -- [ ] **Step 1: Fetch main** - -```bash -cd /Users/jim/code/OctopusTentacle -git fetch origin main -``` - -- [ ] **Step 2: Create the new branch from origin/main** - -```bash -git checkout -b jimpelletier/eft-3295-async-migration-base origin/main -``` - -- [ ] **Step 3: Verify** - -```bash -git log --oneline -1 -``` - -Expected: latest commit on `main`. - ---- - -### Task 1.2: Migrate `SilentProcessRunner.ExecuteCommand` to async - -**Files:** -- Modify: `source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs` - -The goal: change the method from sync to async with the MINIMUM internal changes. The `cancel` token is passed to `WaitForExitAsync(cancel)` so cancel still throws OCE and unwinds. `DoOurBestToCleanUp` remains unchanged — including the `process.Close()` call. `SafelyWaitForAllOutput` remains unchanged. 
- -Use the `claude-safety-2026-05-25-pre-split` tag to see what the final state in `583eb46c` looks like, but ONLY take: -- The method signature change to `async Task ExecuteCommandAsync(...)` (without `abandon` parameter) -- The internal `process.WaitForExit()` → `await process.WaitForExitAsync(cancel)` change -- The net48 polyfill `WaitForExitAsyncNetFramework` -- `process.EnableRaisingEvents = true` if it's needed for the polyfill - -DO NOT take: -- The `abandon` parameter on the method -- Any changes to `DoOurBestToCleanUp` (keep `process.Close()` as it was on main) -- Any changes to `SafelyWaitForAllOutput` comments -- Any `OperationCanceledException when (abandon.IsCancellationRequested && !process.HasExited)` catch block - -- [ ] **Step 1: Read the file on main** - -```bash -git show origin/main:source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs > /tmp/srp_main.cs -git show claude-safety-2026-05-25-pre-split:source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs > /tmp/srp_target.cs -``` - -- [ ] **Step 2: Construct the base-PR version manually** - -Use the main version as a starting point. 
Apply the minimum needed for async: -- Change `public static int ExecuteCommand(` → `public static async Task<int> ExecuteCommandAsync(` -- Add `using System.Threading.Tasks;` -- Inside the method, find `process.WaitForExit();` and change to: - ```csharp - #if NETFRAMEWORK - await WaitForExitAsyncNetFramework(process, cancel).ConfigureAwait(false); - #else - await process.WaitForExitAsync(cancel).ConfigureAwait(false); - #endif - ``` -- Set `process.EnableRaisingEvents = true;` before `process.Start();` (needed so the polyfill's `Process.Exited` event fires) -- Add the `WaitForExitAsyncNetFramework` polyfill at the end of the class, inside an `#if NETFRAMEWORK` block: - ```csharp - #if NETFRAMEWORK - static Task WaitForExitAsyncNetFramework(Process process, CancellationToken cancellationToken) - { - var tcs = new TaskCompletionSource<object?>(TaskCreationOptions.RunContinuationsAsynchronously); - CancellationTokenRegistration registration = default; - - void OnExited(object? sender, EventArgs e) - { - registration.Dispose(); - tcs.TrySetResult(null); - } - - process.Exited += OnExited; - if (process.HasExited) - { - tcs.TrySetResult(null); - } - if (cancellationToken.CanBeCanceled) - { - registration = cancellationToken.Register(() => - { - process.Exited -= OnExited; - tcs.TrySetCanceled(cancellationToken); - }); - } - return tcs.Task; - } - #endif - ``` - -- [ ] **Step 3: Verify the file compiles standalone** - -```bash -dotnet build source/Octopus.Tentacle.Core/Octopus.Tentacle.Core.csproj -``` - -Expected: build succeeds. Errors will likely be in callers, not this file. - -- [ ] **Step 4: Commit** - -```bash -git add source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs -git commit -m "$(cat <<'COMMIT' -Migrate SilentProcessRunner.ExecuteCommand to async - -Replaces the sync WaitForExit() with await WaitForExitAsync(cancel). 
-The cancel token is passed directly so the existing cancel semantics -are preserved: cancel firing throws OCE from the await and unwinds. -DoOurBestToCleanUp continues to fire on cancel via cancel.Register -exactly as it did in the sync version. - -Adds a net48 polyfill for WaitForExitAsync using Process.Exited + -TaskCompletionSource. - -Co-Authored-By: Claude Opus 4.7 (1M context) -COMMIT -)" -``` - ---- - -### Task 1.3: Migrate `ISilentProcessRunner` to async - -**Files:** -- Modify: `source/Octopus.Tentacle/Util/ISilentProcessRunner.cs` - -The interface defines the contract for `SilentProcessRunner.ExecuteCommand`. It will need an `ExecuteCommandAsync` method matching the new signature on the static `SilentProcessRunner` class. - -- [ ] **Step 1: Read main version** - -```bash -git show origin/main:source/Octopus.Tentacle/Util/ISilentProcessRunner.cs -``` - -- [ ] **Step 2: Read target version for reference** - -```bash -git show claude-safety-2026-05-25-pre-split:source/Octopus.Tentacle/Util/ISilentProcessRunner.cs -``` - -- [ ] **Step 3: Construct the base-PR version** - -Take the target version. Remove ANY `abandon` parameter. Change return type of methods from `int` to `Task<int>`. Add `CancellationToken cancel = default` if not already present. - -Replace the `SilentProcessRunnerExtended` (or similar wrapper) implementation so it calls `SilentProcessRunner.ExecuteCommandAsync(...)` and awaits/returns the Task — NO `.GetAwaiter().GetResult()` inside. 
- -- [ ] **Step 4: Commit** - -```bash -git add source/Octopus.Tentacle/Util/ISilentProcessRunner.cs -git commit -m "$(cat <<'COMMIT' -Migrate ISilentProcessRunner to async - -Co-Authored-By: Claude Opus 4.7 (1M context) -COMMIT -)" -``` - ---- - -### Task 1.4: Migrate `CommandLineRunner` and `CommandLineInvocation` to async - -**Files:** -- Modify: `source/Octopus.Tentacle/Util/CommandLineRunner.cs` -- Modify: `source/Octopus.Tentacle.Core/Util/CommandLine/CommandLineInvocation.cs` (only if it has an Execute method) - -`CommandLineRunner` wraps `SilentProcessRunner` and is consumed by Kubernetes integration tests and CLI helpers. Its `Execute` method becomes `ExecuteAsync`. - -`CommandLineInvocation.ExecuteCommandAsync()` is referenced from `SystemCtlHelper`, `LinuxServiceConfigurator`, `WindowsServiceConfigurator`. If this method exists on `CommandLineInvocation`, migrate it to async (no `abandon` param). - -- [ ] **Step 1: Check whether CommandLineInvocation has an Execute method** - -```bash -grep -n "Execute" source/Octopus.Tentacle.Core/Util/CommandLine/CommandLineInvocation.cs 2>/dev/null || echo "no Execute method in CommandLineInvocation" -``` - -- [ ] **Step 2: Read both versions** - -```bash -git show origin/main:source/Octopus.Tentacle/Util/CommandLineRunner.cs -git show claude-safety-2026-05-25-pre-split:source/Octopus.Tentacle/Util/CommandLineRunner.cs -``` - -- [ ] **Step 3: Construct the base-PR version** - -Do NOT rename `CommandLineRunner.Execute` — keep `Execute` as it is (the existing public method is sync and consumed by the WPF installer, which must remain sync). Inside `Execute`, where it calls the underlying process runner, it currently blocks on the async call via `.GetAwaiter().GetResult()`. KEEP that. The improved comment goes on the GetAwaiter line: - -```csharp -// We're in CommandLineRunner.Execute, called from the WPF installer (Octopus.Manager.Tentacle) -// running on a thread-pool worker after the installer hands off to our process-runner helper. 
-// CommandLineRunner.Execute itself must return synchronously because the installer's UI flow -// is sync. We block on the async call with .GetAwaiter().GetResult(). -// This is safe because we're on a plain thread-pool worker. The risk with blocking on async -// is a deadlock: if the async work needs to resume on the same thread that's blocked waiting -// for it, neither can make progress. Thread-pool workers don't have that constraint — the -// async work can pick up on any free thread when it finishes, so the block resolves normally. -var exitCode = SilentProcessRunner.ExecuteCommandAsync(/* args */).GetAwaiter().GetResult(); -``` - -- [ ] **Step 4: Commit** - -```bash -git add source/Octopus.Tentacle/Util/CommandLineRunner.cs source/Octopus.Tentacle.Core/Util/CommandLine/CommandLineInvocation.cs 2>/dev/null -git commit -m "Migrate CommandLineRunner and CommandLineInvocation to async - -Co-Authored-By: Claude Opus 4.7 (1M context) " -``` - ---- - -### Task 1.5: Migrate `RunningScript` to async (no abandon token) - -**Files:** -- Modify: `source/Octopus.Tentacle.Core/Services/Scripts/RunningScript.cs` - -`RunningScript.RunScript()` calls `SilentProcessRunner.ExecuteCommand`. Now that ExecuteCommand is async, RunScript must also become async. RunningScript stays WITHOUT the abandon token in the base PR. 
- -- [ ] **Step 1: Read both versions for reference** - -```bash -git show origin/main:source/Octopus.Tentacle.Core/Services/Scripts/RunningScript.cs | head -100 -git show claude-safety-2026-05-25-pre-split:source/Octopus.Tentacle.Core/Services/Scripts/RunningScript.cs | head -100 -``` - -- [ ] **Step 2: Build the base version** - -Take the target version and remove: -- `CancellationToken abandonToken` constructor parameter -- `abandonToken` field -- `abandon: abandonToken` argument when calling `ExecuteCommandAsync` -- Any `OperationCanceledException when (abandonToken.IsCancellationRequested)` catch branches -- Any references to `AbandonedExitCode` (those aren't in `ScriptExitCodes` yet) - -Make the public method async: `RunScript` → `RunScriptAsync` returning `Task`. - -- [ ] **Step 3: Commit** - -```bash -git add source/Octopus.Tentacle.Core/Services/Scripts/RunningScript.cs -git commit -m "Migrate RunningScript to async - -Co-Authored-By: Claude Opus 4.7 (1M context) " -``` - ---- - -### Task 1.6: Migrate `ScriptServiceV2` callsite to async (no abandon) - -**Files:** -- Modify: `source/Octopus.Tentacle.Core/Services/Scripts/ScriptServiceV2.cs` - -`ScriptServiceV2.StartScriptAsync` calls into `RunningScript.RunScript`. Update to await `RunScriptAsync`. Do NOT add `AbandonScriptAsync` here yet. - -- [ ] **Step 1: Read both versions** - -```bash -git show origin/main:source/Octopus.Tentacle.Core/Services/Scripts/ScriptServiceV2.cs > /tmp/scs_main.cs -git show claude-safety-2026-05-25-pre-split:source/Octopus.Tentacle.Core/Services/Scripts/ScriptServiceV2.cs > /tmp/scs_target.cs -``` - -- [ ] **Step 2: Build the base version** - -Take the main version and apply ONLY the minimal changes needed to await the new async `RunScriptAsync` from `RunningScript`. 
Remove all abandon-specific additions in the target version: -- No `AbandonScriptAsync` method -- No `RunningScriptWrapper.AbandonTokenSource` / `Abandon()` method -- No `AbandonedExitCode` references -- No abandon-specific workspace deletion logic - -- [ ] **Step 3: Commit** - -```bash -git add source/Octopus.Tentacle.Core/Services/Scripts/ScriptServiceV2.cs -git commit -m "Update ScriptServiceV2 to await async RunScriptAsync - -Co-Authored-By: Claude Opus 4.7 (1M context) " -``` - ---- - -### Task 1.7: Update the six sync↔async boundary sites with improved comments - -**Files:** -- Modify: `source/Octopus.Manager.Tentacle/PreReq/PowerShellPrerequisite.cs` -- Modify: `source/Octopus.Tentacle/Kubernetes/KubernetesDirectoryInformationProvider.cs` -- Modify: `source/Octopus.Tentacle/Util/SystemCtlHelper.cs` -- Modify: `source/Octopus.Tentacle/Startup/LinuxServiceConfigurator.cs` -- Modify: `source/Octopus.Tentacle/Startup/WindowsServiceConfigurator.cs` - -Each of these sites was the immediate consumer of `ExecuteCommand` in main. After Task 1.2 they need to call `ExecuteCommandAsync` and either await it (if they can become async) or block with `.GetAwaiter().GetResult()` (if they cannot). - -ALL of these sites CANNOT become async in this PR — they implement sync interfaces (IPrerequisite, IMemoryCache factory, IServiceConfigurator) or are called from sync framework code (Topshelf). They all use `.GetAwaiter().GetResult()` with an explanatory comment. - -Comment template (adapt the specifics per site): - -``` -// We're in [SHORT DESCRIPTION OF SITE]. [WHY IT MUST BE SYNC — interface -// constraint, framework callback, etc.]. We block on the async call with -// .GetAwaiter().GetResult(). -// This is safe because we're on a plain thread-pool worker. The risk with -// blocking on async is a deadlock: if the async work needs to resume on -// the same thread that's blocked waiting for it, neither can make progress. 
-// Thread-pool workers don't have that constraint — the async work can -// pick up on any free thread when it finishes, so the block resolves normally. -``` - -- [ ] **Step 1: Update PowerShellPrerequisite** - -Site: `source/Octopus.Manager.Tentacle/PreReq/PowerShellPrerequisite.cs`. The `Check()` method calls `SilentProcessRunner.ExecuteCommandAsync(...).GetAwaiter().GetResult()`. Comment specifics: "We're in the WPF installer prerequisite check. IPrerequisite.Check() must return synchronously — there's no async version of the interface — so we block..." - -Reference state: `git show claude-safety-2026-05-25-pre-split:source/Octopus.Manager.Tentacle/PreReq/PowerShellPrerequisite.cs` - -Copy that file's content directly — it has the right comment already. - -- [ ] **Step 2: Update KubernetesDirectoryInformationProvider** - -Site: `source/Octopus.Tentacle/Kubernetes/KubernetesDirectoryInformationProvider.cs`. Method `GetDriveBytesUsingDu` is called from inside an `IMemoryCache.GetOrCreate` factory (a `Func` — sync). Comment specifics: "We're in the IMemoryCache.GetOrCreate factory that populates the disk-space cache entry. The cache factory delegate is synchronous (`Func`) so we block on the async call with `.GetAwaiter().GetResult()`..." - -Take this content from the safety tag, BUT verify it does not include any async chain propagation (it shouldn't — we never propagated this in the abandon PR). It should be `GetPathUsedBytes` (sync) with GetAwaiter on the du call. - -```bash -git show claude-safety-2026-05-25-pre-split:source/Octopus.Tentacle/Kubernetes/KubernetesDirectoryInformationProvider.cs -``` - -If the file at safety tag has `GetPathUsedBytesAsync` or other async-chain content, that came from PR #1235 work that was rolled back. Use the file with the sync `GetPathUsedBytes` + GetAwaiter pattern. - -- [ ] **Step 3: Update SystemCtlHelper** - -Site: `source/Octopus.Tentacle/Util/SystemCtlHelper.cs`. 
Two GetAwaiter calls inside `RunServiceCommand` (one for systemctl, one for sudo retry). Comment specifics: "We're in SystemCtlHelper running a systemctl command. All callers (StartService, RestartService, etc.) are sync — they're part of the Tentacle service-management CLI flow, which bottoms out in ServiceCommand.Start() (sync `void` override) with no async path..." - -Second GetAwaiter call (sudo retry) gets a short pointer comment: "Same sync boundary — sudo retry on the same thread-pool worker." - -- [ ] **Step 4: Update LinuxServiceConfigurator** - -Site: `source/Octopus.Tentacle/Startup/LinuxServiceConfigurator.cs`. Three GetAwaiter calls: `WriteUnitFile`, `IsSystemdInstalled`, `HaveSudoPrivileges`. Each gets the comment template, adapted: - -For `WriteUnitFile`: "WriteUnitFile is called from `IServiceConfigurator.ConfigureService` implementations, which are themselves called from the Tentacle service-management CLI on a thread-pool worker..." - -For `IsSystemdInstalled` and `HaveSudoPrivileges`: "Same sync boundary as WriteUnitFile." - -- [ ] **Step 5: Update WindowsServiceConfigurator** - -Site: `source/Octopus.Tentacle/Startup/WindowsServiceConfigurator.cs`. One GetAwaiter call inside `Sc()`. Comment specifics: "Sc() is called from `IServiceConfigurator.ConfigureService` implementations on Windows, on a thread-pool worker..." 
- -- [ ] **Step 6: Commit** - -```bash -git add source/Octopus.Manager.Tentacle/PreReq/PowerShellPrerequisite.cs \ - source/Octopus.Tentacle/Kubernetes/KubernetesDirectoryInformationProvider.cs \ - source/Octopus.Tentacle/Util/SystemCtlHelper.cs \ - source/Octopus.Tentacle/Startup/LinuxServiceConfigurator.cs \ - source/Octopus.Tentacle/Startup/WindowsServiceConfigurator.cs -git commit -m "$(cat <<'COMMIT' -Document the six sync↔async boundary sites with improved comments - -Each immediate sync caller of ExecuteCommandAsync now blocks via -.GetAwaiter().GetResult() with a comment that explains where it sits -in the call graph, why the surrounding code must be synchronous, and -why blocking on async is deadlock-safe from a plain thread-pool worker. - -Sites: -- PowerShellPrerequisite.Check (WPF installer prerequisite) -- KubernetesDirectoryInformationProvider.GetDriveBytesUsingDu (IMemoryCache factory) -- SystemCtlHelper.RunServiceCommand (×2 — systemctl + sudo retry) -- LinuxServiceConfigurator: WriteUnitFile, IsSystemdInstalled, HaveSudoPrivileges -- WindowsServiceConfigurator.Sc - -Co-Authored-By: Claude Opus 4.7 (1M context) -COMMIT -)" -``` - ---- - -### Task 1.8: Update other test scaffolding files - -**Files:** -- The Kubernetes integration test files listed in the diff (TestUtils, Setup, Tooling, etc.) likely need to be migrated to async because they consume `CommandLineRunner` or `SilentProcessRunner`. - -The diff from `main` to `583eb46c` lists these: -- `source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/*.cs` -- `source/Octopus.Tentacle.Kubernetes.Tests.Integration/Tooling/*.cs` -- `source/Octopus.Tentacle.Tests.Integration/Support/*.cs` -- `source/Octopus.Tentacle.Tests.Integration/Util/*.cs` -- `source/Octopus.Tentacle.Tests/Util/LinuxTestUserPrincipal.cs` - -Most of these only changed because they had to switch from sync `Execute` to async `ExecuteAsync`. 
Copy the safety-tag versions BUT verify each one only contains async-migration changes (no abandon-related changes). If a file contains abandon test fixtures, take only the async portions. - -- [ ] **Step 1: For each file, compare main vs safety tag** - -```bash -for f in \ - source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/SetupHelpers.cs \ - source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/KubernetesClusterInstaller.cs \ - source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/KubernetesAgentInstaller.cs \ - source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/DockerImageLoader.cs \ - source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/Tooling/HelmDownloader.cs \ - source/Octopus.Tentacle.Kubernetes.Tests.Integration/Setup/Tooling/ToolDownloader.cs \ - source/Octopus.Tentacle.Kubernetes.Tests.Integration/Tooling/KubeCtlTool.cs \ - source/Octopus.Tentacle.Kubernetes.Tests.Integration/KubernetesAgent/KubernetesClusterOneTimeSetUp.cs \ - source/Octopus.Tentacle.Kubernetes.Tests.Integration/KubernetesClientCompatibilityTests.cs \ - source/Octopus.Tentacle.Tests.Integration/Support/ClientAndTentacle.cs \ - source/Octopus.Tentacle.Tests.Integration/Support/TentacleFetchers/LinuxTentacleFetcher.cs \ - source/Octopus.Tentacle.Tests.Integration/Util/LinuxTestUserPrincipal.cs \ - source/Octopus.Tentacle.Tests.Integration/Util/RunningScriptFixture.cs \ - source/Octopus.Tentacle.Tests/Util/LinuxTestUserPrincipal.cs ; do - echo "=== $f ===" - git diff origin/main..claude-safety-2026-05-25-pre-split -- "$f" | head -20 - echo -done -``` - -- [ ] **Step 2: For each file, take the safety-tag version IF its changes are purely async-migration** - -Use `git checkout claude-safety-2026-05-25-pre-split -- <file>` for each. - -If a file contains abandon-specific additions (e.g., references to `AbandonScript` or `AbandonedExitCode`), manually edit out those parts after checkout. 
- -- [ ] **Step 3: Build** - -```bash -dotnet build source/Octopus.Tentacle.sln -``` - -Expected: build succeeds. Errors here will reveal additional files that need attention. - -- [ ] **Step 4: Commit** - -```bash -git add -A -git commit -m "$(cat <<'COMMIT' -Migrate test scaffolding to async ExecuteCommandAsync - -Updates Kubernetes integration test setup and support helpers to await -the new ExecuteCommandAsync signature. No abandon-feature content is -included. - -Co-Authored-By: Claude Opus 4.7 (1M context) -COMMIT -)" -``` - ---- - -### Task 1.9: Build verification — base PR must compile and tests must pass - -**Files:** none (verification only) - -- [ ] **Step 1: Full build** - -```bash -cd /Users/jim/code/OctopusTentacle/source -dotnet build Octopus.Tentacle.sln 2>&1 | tail -50 -``` - -Expected: 0 errors. Any errors must be resolved before proceeding — they indicate missing files in the migration. - -- [ ] **Step 2: Run the unit tests** - -```bash -dotnet test source/Octopus.Tentacle.Tests/Octopus.Tentacle.Tests.csproj -``` - -Expected: all green. - -- [ ] **Step 3: Run the SilentProcessRunner integration test for ShouldCancelPing** - -```bash -dotnet test source/Octopus.Tentacle.Tests.Integration --filter "Name~ShouldCancelPing" -``` - -Expected: green. This verifies cancel works with our `WaitForExitAsync(cancel)` wiring. - ---- - -### Task 1.10: Push the base branch and open the new PR - -**Files:** none (git + gh) - -- [ ] **Step 1: Push** - -```bash -git push -u origin jimpelletier/eft-3295-async-migration-base -``` - -- [ ] **Step 2: Create the PR with base = main** - -```bash -gh pr create \ - --base main \ - --head jimpelletier/eft-3295-async-migration-base \ - --title "Migrate SilentProcessRunner to async" \ - --body "$(cat <<'EOF' -## Summary - -Makes `SilentProcessRunner.ExecuteCommand` async. 
Required foundation for the EFT-3295 script-abandonment feature (PR #1226, which stacks on top of this) but valuable on its own as a refactor: enables awaiting process runs from already-async callers rather than blocking a thread. - -### What this PR does -- `SilentProcessRunner.ExecuteCommand` → `ExecuteCommandAsync` (and the matching interfaces and helpers) -- Internal: `process.WaitForExit()` → `await process.WaitForExitAsync(cancel)` -- Adds a net48 polyfill for `WaitForExitAsync` (using `Process.Exited` + `TaskCompletionSource`) -- The six immediate sync callers (PowerShellPrerequisite, KubernetesDirectoryInformationProvider, SystemCtlHelper×2, LinuxServiceConfigurator×3, WindowsServiceConfigurator) block via `.GetAwaiter().GetResult()` with a comment explaining the call context and why blocking on a thread-pool worker is deadlock-safe - -### What this PR explicitly does NOT include -- The `abandon` parameter on `ExecuteCommandAsync` (added in #1226) -- Removal of `process.Close()` from `DoOurBestToCleanUp` (added in #1226) -- Any abandon-specific contracts, RPC methods, capabilities, env vars, or tests (#1226) - -## Test plan -- [ ] CI build green -- [ ] `ShouldCancelPing` integration test still passes (cancel semantics preserved) - -🤖 Generated with [Claude Code](https://claude.ai/claude-code) -EOF -)" -``` - -- [ ] **Step 3: Capture the new PR number for use in subsequent tasks** - -```bash -gh pr view jimpelletier/eft-3295-async-migration-base --json number,url -``` - ---- - -## Phase 2 — Rebuild #1226 on top of the base PR - -### Task 2.1: Reset the abandon branch to the base PR tip - -**Files:** none (git only) - -- [ ] **Step 1: Switch to the abandon branch** - -```bash -cd /Users/jim/code/OctopusTentacle -git checkout jimpelletier/eft-3295-tentacle-script-abandonment-to-release-the-mutex -``` - -- [ ] **Step 2: Hard-reset to the new base branch tip** - -```bash -git reset --hard jimpelletier/eft-3295-async-migration-base -``` - -- [ ] **Step 3: 
Verify** - -```bash -git log --oneline -1 -``` - -Expected: tip of the base branch. - ---- - -### Task 2.2: Apply the abandon delta — contracts, env var - -**Files:** -- Create: `source/Octopus.Tentacle.Contracts/ScriptServiceV2/AbandonScriptCommandV2.cs` -- Modify: `source/Octopus.Tentacle.Contracts/ScriptExitCodes.cs` (add `AbandonedExitCode = -48`) -- Modify: `source/Octopus.Tentacle.Contracts/ScriptServiceV2/IScriptServiceV2.cs` (add `AbandonScript` method) -- Modify: `source/Octopus.Tentacle.Contracts/ClientServices/IAsyncClientScriptServiceV2.cs` (add `AbandonScriptAsync`) -- Modify: `source/Octopus.Tentacle.Core/Util/EnvironmentVariables.cs` (add `TentacleDebugDisableProcessKill`) - -- [ ] **Step 1: Copy each file from safety tag** - -```bash -git checkout claude-safety-2026-05-25-pre-split -- \ - source/Octopus.Tentacle.Contracts/ScriptServiceV2/AbandonScriptCommandV2.cs \ - source/Octopus.Tentacle.Contracts/ScriptExitCodes.cs \ - source/Octopus.Tentacle.Contracts/ScriptServiceV2/IScriptServiceV2.cs \ - source/Octopus.Tentacle.Contracts/ClientServices/IAsyncClientScriptServiceV2.cs \ - source/Octopus.Tentacle.Core/Util/EnvironmentVariables.cs -``` - -- [ ] **Step 2: Commit** - -```bash -git commit -am "Add abandon contracts and TentacleDebugDisableProcessKill env var - -Co-Authored-By: Claude Opus 4.7 (1M context) " -``` - ---- - -### Task 2.3: Apply the abandon delta — SilentProcessRunner abandon token + Close removal + long-form comments - -**Files:** -- Modify: `source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs` - -This step: -- Adds the `abandon` parameter to `ExecuteCommandAsync` -- Switches internal await from `WaitForExitAsync(cancel)` to `WaitForExitAsync(abandon)` -- Adds the `OperationCanceledException when (abandon.IsCancellationRequested && !process.HasExited)` catch returning `ScriptExitCodes.AbandonedExitCode` -- Removes `process.Close()` from `DoOurBestToCleanUp` -- Adds long-form documentation comments to 
`DoOurBestToCleanUp`, `SafelyWaitForAllOutput`, and the `WaitForExitAsync` call site -- Adds the Hitman env-var test-affordance check - -- [ ] **Step 1: Take the safety-tag version of SilentProcessRunner** - -```bash -git checkout claude-safety-2026-05-25-pre-split -- source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs -``` - -- [ ] **Step 2: Verify the stray `process.Close()` bug fix is included** - -```bash -grep -n "process.Close" source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs -``` - -Expected output: only references in comments (no actual `process.Close();` call). If a `process.Close();` call appears not in a comment, remove it manually — same fix as in commit `583eb46c`. - -- [ ] **Step 3: Commit** - -```bash -git commit -am "Add abandon token to SilentProcessRunner and remove process.Close() race - -- Adds CancellationToken abandon parameter to ExecuteCommandAsync -- Switches the await from WaitForExitAsync(cancel) to WaitForExitAsync(abandon) -- Returns ScriptExitCodes.AbandonedExitCode when abandon fires before process exits -- Removes process.Close() from DoOurBestToCleanUp (race with WaitForExitAsync's - TCS via the Exited event — Close tore down the wait state, hung cancel) -- Adds long-form documentation comments explaining the race, the grandchild-pipe - scenario, and worst-case cancel latency -- Adds TentacleDebugDisableProcessKill test affordance to Hitman - -Co-Authored-By: Claude Opus 4.7 (1M context) " -``` - ---- - -### Task 2.4: Apply the abandon delta — interface + caller updates for abandon parameter - -**Files:** -- Modify: `source/Octopus.Tentacle/Util/ISilentProcessRunner.cs` -- Modify: `source/Octopus.Tentacle/Util/CommandLineRunner.cs` -- Modify: `source/Octopus.Tentacle.Core/Util/CommandLine/CommandLineInvocation.cs` (if it has an Execute method) - -Add the `abandon` parameter to the interface and the helper class. 
- -- [ ] **Step 1: Take the safety-tag versions** - -```bash -git checkout claude-safety-2026-05-25-pre-split -- \ - source/Octopus.Tentacle/Util/ISilentProcessRunner.cs \ - source/Octopus.Tentacle/Util/CommandLineRunner.cs -git checkout claude-safety-2026-05-25-pre-split -- source/Octopus.Tentacle.Core/Util/CommandLine/CommandLineInvocation.cs 2>/dev/null -``` - -- [ ] **Step 2: Commit** - -```bash -git commit -am "Plumb abandon token through ISilentProcessRunner and CommandLineRunner - -Co-Authored-By: Claude Opus 4.7 (1M context) " -``` - ---- - -### Task 2.5: Apply the abandon delta — RunningScript abandon-token plumbing - -**Files:** -- Modify: `source/Octopus.Tentacle.Core/Services/Scripts/RunningScript.cs` - -Adds the `abandonToken` constructor parameter and passes it to `ExecuteCommandAsync`. - -- [ ] **Step 1: Take the safety-tag version** - -```bash -git checkout claude-safety-2026-05-25-pre-split -- source/Octopus.Tentacle.Core/Services/Scripts/RunningScript.cs -``` - -- [ ] **Step 2: Commit** - -```bash -git commit -am "Plumb abandon token through RunningScript - -Co-Authored-By: Claude Opus 4.7 (1M context) " -``` - ---- - -### Task 2.6: Apply the abandon delta — ScriptServiceV2.AbandonScriptAsync + workspace cleanup - -**Files:** -- Modify: `source/Octopus.Tentacle.Core/Services/Scripts/ScriptServiceV2.cs` - -Adds: -- `RunningScriptWrapper.AbandonTokenSource` and `Abandon()` -- Public `AbandonScriptAsync` method on the service -- Best-effort `workspace.Delete` gated on `AbandonedExitCode` - -- [ ] **Step 1: Take the safety-tag version** - -```bash -git checkout claude-safety-2026-05-25-pre-split -- source/Octopus.Tentacle.Core/Services/Scripts/ScriptServiceV2.cs -``` - -- [ ] **Step 2: Commit** - -```bash -git commit -am "Implement ScriptServiceV2.AbandonScriptAsync and abandon-gated workspace cleanup - -Co-Authored-By: Claude Opus 4.7 (1M context) " -``` - ---- - -### Task 2.7: Apply the abandon delta — advertise AbandonScriptV2 capability - 
-**Files:** -- Modify: `source/Octopus.Tentacle/Services/Capabilities/CapabilitiesServiceV2.cs` -- Modify: `source/Octopus.Tentacle.Tests.Integration/CapabilitiesServiceV2Test.cs` - -Adds `nameof(IAsyncClientScriptServiceV2.AbandonScriptAsync)` to the capabilities list. Updates the integration test to expect it for Latest tentacles. - -- [ ] **Step 1: Take the safety-tag versions** - -```bash -git checkout claude-safety-2026-05-25-pre-split -- \ - source/Octopus.Tentacle/Services/Capabilities/CapabilitiesServiceV2.cs \ - source/Octopus.Tentacle.Tests.Integration/CapabilitiesServiceV2Test.cs -``` - -- [ ] **Step 2: Commit** - -```bash -git commit -am "Advertise AbandonScriptV2 capability - -Co-Authored-By: Claude Opus 4.7 (1M context) " -``` - ---- - -### Task 2.8: Apply the abandon delta — ScriptBuilder.AppendRaw, tests, and grandchild test comments - -**Files:** -- Modify: `source/Octopus.Tentacle.CommonTestUtils/Builders/ScriptBuilder.cs` -- Modify: `source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs` -- Create or modify: `source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs` -- Modify: `source/Octopus.Tentacle.Tests/Integration/ScriptServiceV2Fixture.cs` -- Modify: `source/Octopus.Tentacle.Tests.Integration.Common/Builders/Decorators/ScriptServiceV2DecoratorBuilder.cs` -- Modify: `source/Octopus.Tentacle.Tests/Kubernetes/KubernetesDirectoryInformationProviderFixture.cs` (if it has abandon-specific test changes) - -The abandon-specific tests and test helpers. Includes the rewritten grandchild test comments in `SilentProcessRunnerFixture`. 
- -- [ ] **Step 1: Take the safety-tag versions** - -```bash -git checkout claude-safety-2026-05-25-pre-split -- \ - source/Octopus.Tentacle.CommonTestUtils/Builders/ScriptBuilder.cs \ - source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs \ - source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs \ - source/Octopus.Tentacle.Tests/Integration/ScriptServiceV2Fixture.cs \ - source/Octopus.Tentacle.Tests.Integration.Common/Builders/Decorators/ScriptServiceV2DecoratorBuilder.cs -``` - -- [ ] **Step 2: Commit** - -```bash -git commit -am "Add abandon-specific tests and rewrite grandchild test comments for async behavior - -Co-Authored-By: Claude Opus 4.7 (1M context) " -``` - ---- - -### Task 2.9: Apply remaining files — spec doc, plan doc, anything else in diff - -**Files:** -- Spec/plan files from `docs/superpowers/` -- Any remaining file in the `git diff main..claude-safety-2026-05-25-pre-split` that's not already covered - -- [ ] **Step 1: List files still differing** - -```bash -git diff jimpelletier/eft-3295-async-migration-base..claude-safety-2026-05-25-pre-split --name-only -``` - -- [ ] **Step 2: Inspect any unhandled files and bring them over** - -For each remaining file: -- If the change is abandon-specific: `git checkout claude-safety-2026-05-25-pre-split -- ` -- If unrelated: skip and ask user - -- [ ] **Step 3: Verify the diff is complete** - -```bash -git diff jimpelletier/eft-3295-async-migration-base..HEAD --stat -``` - -This should now contain the FULL abandon-feature delta. - -- [ ] **Step 4: Verify end state matches the safety tag** - -```bash -git diff claude-safety-2026-05-25-pre-split HEAD -``` - -Expected: zero output. The rebuilt branch should produce the EXACT same end state as `583eb46c`. - -If there are differences, investigate and resolve them before continuing. 
- -- [ ] **Step 5: Commit any final additions** - -```bash -git status -git add -A -git commit -m "Bring in remaining abandon-feature files - -Co-Authored-By: Claude Opus 4.7 (1M context) " || echo "no changes" -``` - ---- - -### Task 2.10: Build verification — abandon PR must compile and all tests must pass - -**Files:** none (verification only) - -- [ ] **Step 1: Full build** - -```bash -dotnet build source/Octopus.Tentacle.sln 2>&1 | tail -50 -``` - -Expected: 0 errors. - -- [ ] **Step 2: Run abandon-specific tests** - -```bash -dotnet test source/Octopus.Tentacle.Tests/Octopus.Tentacle.Tests.csproj --filter "Name~Abandon" -dotnet test source/Octopus.Tentacle.Tests.Integration --filter "Name~Abandon" -``` - -Expected: green. - ---- - -### Task 2.11: Force-push abandon branch and update PR #1226's base - -**Files:** none (git + gh) - -- [ ] **Step 1: Force-push** - -```bash -git push --force-with-lease origin jimpelletier/eft-3295-tentacle-script-abandonment-to-release-the-mutex -``` - -- [ ] **Step 2: Change PR #1226's base to the new async-migration-base branch** - -```bash -gh pr edit 1226 --base jimpelletier/eft-3295-async-migration-base -``` - -- [ ] **Step 3: Add a comment to #1226 explaining the rebase** - -```bash -gh pr comment 1226 --body "$(cat <<'EOF' -Rebased on top of the new foundational PR (the async migration of \`SilentProcessRunner\`). The diff is now focused on the abandon feature itself — the async-migration plumbing has moved to the base PR. - -Previous head: \`583eb46c\` (preserved as tag \`claude-safety-2026-05-25-pre-split\`). 
-EOF -)" -``` - ---- - -## Phase 3 — Rebase PR #1235 - -### Task 3.1: Rebase #1235 on top of the new #1226 - -**Files:** none (git) - -- [ ] **Step 1: Switch to #1235's branch** - -```bash -cd /Users/jim/code/OctopusTentacle -git checkout jimpelletier/eft-3295-async-signature-propagation -``` - -- [ ] **Step 2: Rebase onto the new #1226 tip** - -```bash -git rebase jimpelletier/eft-3295-tentacle-script-abandonment-to-release-the-mutex -``` - -If conflicts arise: resolve each one. The most likely conflict file is `SilentProcessRunner.cs` (because #1235 had the stray `process.Close()` fix that's now in #1226). Other conflicts are mechanical — resolve in favor of the #1235 version since those are the push-higher changes. - -- [ ] **Step 3: Verify build** - -```bash -dotnet build source/Octopus.Tentacle.sln 2>&1 | tail -20 -``` - -- [ ] **Step 4: Force-push #1235** - -```bash -git push --force-with-lease origin jimpelletier/eft-3295-async-signature-propagation -``` - -- [ ] **Step 5: Sanity check #1235's PR diff** - -```bash -gh pr view 1235 --json url -``` - -Visit the URL and confirm the diff contains only the push-higher commits (no abandon-feature content leaked). - ---- - -## Phase 4 — Final verification - -### Task 4.1: End-to-end stack check - -**Files:** none - -- [ ] **Step 1: Verify branch graph** - -```bash -git log --oneline --graph --all -30 -``` - -Expected: `main` → base branch → abandon branch → push-higher branch. 
- -- [ ] **Step 2: Verify each PR's base** - -```bash -gh pr list --head jimpelletier/eft-3295-async-migration-base -gh pr list --head jimpelletier/eft-3295-tentacle-script-abandonment-to-release-the-mutex -gh pr list --head jimpelletier/eft-3295-async-signature-propagation -``` - -Expected: -- New base PR → base: `main` -- #1226 → base: `jimpelletier/eft-3295-async-migration-base` -- #1235 → base: `jimpelletier/eft-3295-tentacle-script-abandonment-to-release-the-mutex` - -- [ ] **Step 3: Verify end-state equivalence** - -```bash -# When all three PRs are squash-merged, the result on main should equal the safety tag's file states -git diff claude-safety-2026-05-25-pre-split jimpelletier/eft-3295-tentacle-script-abandonment-to-release-the-mutex -``` - -Expected: zero output (the rebased #1226 ends at the same end-state as the original tip). - -```bash -git diff claude-safety-2026-05-25-pre-split-1235 jimpelletier/eft-3295-async-signature-propagation -``` - -Expected: zero output (the rebased #1235 ends at the same end-state as before). - -- [ ] **Step 4: Report success** - -Report each PR's URL and confirm the inversion is complete. - ---- - -## Open notes for the implementer - -- If `git checkout claude-safety-2026-05-25-pre-split -- ` brings over content that includes abandon-specific changes when a file is supposed to be "async-migration only," check whether the file at safety-tag has BOTH concerns mixed. If so, you'll need to manually edit out the abandon parts. This is most likely for: `SilentProcessRunner.cs`, `RunningScript.cs`, `ScriptServiceV2.cs`, `ISilentProcessRunner.cs`, `CommandLineRunner.cs`. -- If a build error during Phase 1 says a method has the wrong signature, it likely means the abandon-token parameter leaked into the base-PR version of an interface. Search for `abandon` in the file and remove. -- The `.worktrees/` directory is gitignored from the abandon branch but NOT from `main`. 
If `git status` shows it as untracked on the base branch, that's expected — the gitignore was added in the abandon branch only. The base PR should NOT include this gitignore change (it's not async-migration related). diff --git a/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs b/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs index 72931d25a..a5d352d69 100644 --- a/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs +++ b/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs @@ -344,6 +344,16 @@ public async Task AbandonToken_ShouldReturnAbandonedExitCodeWithoutKillingProces // not this sentinel. The exit code is the abandon contract. exitCode.Should().Be(ScriptExitCodes.AbandonedExitCode); infoMessages.ToString().Should().Contain("Tentacle has abandoned this script"); + + // In production we don't care whether the script keeps running after abandon. + // Here we assert it does, as a sanity check that the test fixture genuinely + // exercised the abandon path. If we'd accidentally cancelled the process + // instead, this assertion would fail (and Process.GetProcessById would throw + // ArgumentException if the PID is already gone). The abandon contract is + // "stop waiting, leave the OS process alone" — this is our confidence that + // distinction held in the test. 
+ var sleepPid = int.Parse(SafelyReadAllText(pidFile).Trim()); + Process.GetProcessById(sleepPid).HasExited.Should().BeFalse("abandon should leave the underlying script process running"); } finally { diff --git a/source/Octopus.Tentacle/Kubernetes/KubernetesDirectoryInformationProvider.cs b/source/Octopus.Tentacle/Kubernetes/KubernetesDirectoryInformationProvider.cs index e71c512d7..7609ff106 100644 --- a/source/Octopus.Tentacle/Kubernetes/KubernetesDirectoryInformationProvider.cs +++ b/source/Octopus.Tentacle/Kubernetes/KubernetesDirectoryInformationProvider.cs @@ -1,7 +1,6 @@ using System; using System.Collections.Generic; using System.Linq; -using System.Threading; using System.Threading.Tasks; using Octopus.Client.Extensions; using Microsoft.Extensions.Caching.Memory; From 013dfe7e7c2806bbd8148c0af62385c1ae2b045f Mon Sep 17 00:00:00 2001 From: Jim Pelletier Date: Thu, 28 May 2026 17:18:32 +1000 Subject: [PATCH 40/52] Delete middle-layer abandon test in RunningScriptFixture Address PR review on #1226: Luke flagged the middle-layer abandon test as not pulling its weight given coverage at the top (TentacleClient integration) and bottom (SilentProcessRunner unit) layers. Deleted the test plus the WaitForPidFileAsync / SafelyReadPidFile helpers that only it used. 
--- .../Util/RunningScriptFixture.cs | 72 ------------------- 1 file changed, 72 deletions(-) diff --git a/source/Octopus.Tentacle.Tests.Integration/Util/RunningScriptFixture.cs b/source/Octopus.Tentacle.Tests.Integration/Util/RunningScriptFixture.cs index 62a81ba5d..2c6b9ceec 100644 --- a/source/Octopus.Tentacle.Tests.Integration/Util/RunningScriptFixture.cs +++ b/source/Octopus.Tentacle.Tests.Integration/Util/RunningScriptFixture.cs @@ -188,78 +188,6 @@ public async Task CancellationToken_ShouldKillTheProcess() } } - [Test] - public async Task Execute_WhenAbandonTokenFires_ReturnsAbandonedExitCode() - { - using var tempDir = new TemporaryDirectory(); - var pidFile = Path.Combine(tempDir.DirectoryPath, "process.pid"); - - // Write a long-sleeping script that first records its PID, then sleeps. - var scriptBody = PlatformDetection.IsRunningOnWindows - ? $"$PID | Out-File -FilePath '{pidFile}' -Encoding ASCII; Start-Sleep -Seconds 300" - : $"echo $$ > '{pidFile}' && sleep 300"; - workspace.BootstrapScript(scriptBody); - - var shell = PlatformDetection.IsRunningOnWindows ? (IShell)new PowerShell() : new Bash(); - using var runningCts = new CancellationTokenSource(TimeSpan.FromSeconds(30)); - using var abandonCts = new CancellationTokenSource(); - - var script = RunningScript.CreateAbandonable( - shell, - workspace, - stateStore: null, - scriptLog, - taskId, - scriptIsolationMutex, - runningCts.Token, - abandonCts.Token, - new Dictionary(), - PowerShellStartupDetection.PowerShellStartupTimeout, - new InMemoryLog()); - - var executeTask = script.Execute(); - - // Wait deterministically for the process to write its PID before we abandon. 
- await WaitForPidFileAsync(pidFile, TimeSpan.FromSeconds(30)); - abandonCts.Cancel(); - - await executeTask; - - try - { - script.State.Should().Be(ProcessState.Complete); - script.ExitCode.Should().Be(ScriptExitCodes.AbandonedExitCode); - } - finally - { - if (File.Exists(pidFile) - && int.TryParse(SafelyReadPidFile(pidFile).Trim(), out var pid) - && pid > 0) - { - try { System.Diagnostics.Process.GetProcessById(pid).Kill(); } - catch { /* process already exited */ } - } - } - } - - static async Task WaitForPidFileAsync(string pidFile, TimeSpan timeout) - { - var deadline = DateTime.UtcNow + timeout; - while (DateTime.UtcNow < deadline) - { - if (File.Exists(pidFile) && int.TryParse(SafelyReadPidFile(pidFile).Trim(), out var pid) && pid > 0) - return; - await Task.Delay(100); - } - Assert.Fail($"PID file '{pidFile}' was not written within {timeout.TotalSeconds}s — script process did not start."); - } - - static string SafelyReadPidFile(string path) - { - try { return File.ReadAllText(path); } - catch { return string.Empty; } - } - static string EchoEnvironmentVariable(string varName) { if (PlatformDetection.IsRunningOnWindows) From bad0a319f25c44321a6d699bd164a681c0f184cf Mon Sep 17 00:00:00 2001 From: Jim Pelletier Date: Thu, 28 May 2026 17:25:00 +1000 Subject: [PATCH 41/52] Tighten the sleep-still-running assert comment Address PR review on #1226: 7-line explanation shrunk to a 2-line intent statement. 
--- .../Util/SilentProcessRunnerFixture.cs | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs b/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs index a5d352d69..8a30589c1 100644 --- a/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs +++ b/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs @@ -345,13 +345,8 @@ public async Task AbandonToken_ShouldReturnAbandonedExitCodeWithoutKillingProces exitCode.Should().Be(ScriptExitCodes.AbandonedExitCode); infoMessages.ToString().Should().Contain("Tentacle has abandoned this script"); - // In production we don't care whether the script keeps running after abandon. - // Here we assert it does, as a sanity check that the test fixture genuinely - // exercised the abandon path. If we'd accidentally cancelled the process - // instead, this assertion would fail (and Process.GetProcessById would throw - // ArgumentException if the PID is already gone). The abandon contract is - // "stop waiting, leave the OS process alone" — this is our confidence that - // distinction held in the test. + // Test-fixture confidence: confirms we genuinely abandoned rather than + // accidentally cancelling. Production doesn't care if the script runs on. 
var sleepPid = int.Parse(SafelyReadAllText(pidFile).Trim()); Process.GetProcessById(sleepPid).HasExited.Should().BeFalse("abandon should leave the underlying script process running"); } From 82508ac2886041641344539994de975bd60920eb Mon Sep 17 00:00:00 2001 From: Jim Pelletier Date: Thu, 28 May 2026 17:28:13 +1000 Subject: [PATCH 42/52] Rewrite sleep-still-running assert comment in plain prose --- .../Util/SilentProcessRunnerFixture.cs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs b/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs index 8a30589c1..b2ab79fc1 100644 --- a/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs +++ b/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs @@ -345,8 +345,7 @@ public async Task AbandonToken_ShouldReturnAbandonedExitCodeWithoutKillingProces exitCode.Should().Be(ScriptExitCodes.AbandonedExitCode); infoMessages.ToString().Should().Contain("Tentacle has abandoned this script"); - // Test-fixture confidence: confirms we genuinely abandoned rather than - // accidentally cancelling. Production doesn't care if the script runs on. + // Make sure the fixture didn't accidentally kill the script. The exit code matches anyway. 
var sleepPid = int.Parse(SafelyReadAllText(pidFile).Trim()); Process.GetProcessById(sleepPid).HasExited.Should().BeFalse("abandon should leave the underlying script process running"); } From c4468c5cb87b211896e025e16977e04cf2262da3 Mon Sep 17 00:00:00 2001 From: Jim Pelletier Date: Thu, 28 May 2026 17:36:27 +1000 Subject: [PATCH 43/52] Rewrite assert comment to include the prod-doesn't-care framing --- .../Util/SilentProcessRunnerFixture.cs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs b/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs index b2ab79fc1..e4540224d 100644 --- a/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs +++ b/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs @@ -345,7 +345,8 @@ public async Task AbandonToken_ShouldReturnAbandonedExitCodeWithoutKillingProces exitCode.Should().Be(ScriptExitCodes.AbandonedExitCode); infoMessages.ToString().Should().Contain("Tentacle has abandoned this script"); - // Make sure the fixture didn't accidentally kill the script. The exit code matches anyway. + // Whether the script keeps running doesn't matter in prod. We check it here so we + // know our fixture didn't accidentally kill it (the exit code matches either way). 
var sleepPid = int.Parse(SafelyReadAllText(pidFile).Trim()); Process.GetProcessById(sleepPid).HasExited.Should().BeFalse("abandon should leave the underlying script process running"); } From f6ee579b464ce1675b77c5d261f198f50824277b Mon Sep 17 00:00:00 2001 From: Jim Pelletier Date: Thu, 28 May 2026 17:40:38 +1000 Subject: [PATCH 44/52] Reword assert comment per review --- .../Util/SilentProcessRunnerFixture.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs b/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs index e4540224d..0149ced6d 100644 --- a/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs +++ b/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs @@ -346,7 +346,7 @@ public async Task AbandonToken_ShouldReturnAbandonedExitCodeWithoutKillingProces infoMessages.ToString().Should().Contain("Tentacle has abandoned this script"); // Whether the script keeps running doesn't matter in prod. We check it here so we - // know our fixture didn't accidentally kill it (the exit code matches either way). + // know our fixture successfully prevented it from being killed (the exit code matches either way). var sleepPid = int.Parse(SafelyReadAllText(pidFile).Trim()); Process.GetProcessById(sleepPid).HasExited.Should().BeFalse("abandon should leave the underlying script process running"); } From fe9aa11dbbf4c6a3d2237db3edd3dce749c715fc Mon Sep 17 00:00:00 2001 From: Jim Pelletier Date: Thu, 28 May 2026 17:55:53 +1000 Subject: [PATCH 45/52] Retrigger CI after base branch change to main From a76914b7819c618fb9191f5a812dc5c30a7c7d4c Mon Sep 17 00:00:00 2001 From: Jim Pelletier Date: Thu, 28 May 2026 19:57:58 +1000 Subject: [PATCH 46/52] Fix two CI test failures: cancel-path hang and capabilities count CapabilitiesFromAnOlderTentacle... 
and CapabilitiesServiceDoesNotReturnKubernetesScriptServiceForNonKubernetesTentacle: were asserting capabilities.Count.Should().Be(expectedCapabilitiesCount) without accounting for the new AbandonScript advertisement on Latest. The Count check was belt-and-braces; the per-capability Contain/NotContain assertions still carry the test's purpose, and the new dedicated LatestTentacle_AdvertisesAbandonScriptCapability covers the abandon advertisement on its own. Drop the Count assertion. CancellationToken_WhenUnixGrandchildHoldsRedirectedPipes_ShouldNotHang: the 30s task.Wait bound was hitting on slow Linux CI agents. Worst case expected is ~10s; a real regression hangs indefinitely. Bump to 60s, which preserves the regression-detection property without flaking on slow agents. --- .../CapabilitiesServiceV2Test.cs | 10 ---------- .../Util/SilentProcessRunnerFixture.cs | 11 +++++++---- 2 files changed, 7 insertions(+), 14 deletions(-) diff --git a/source/Octopus.Tentacle.Tests.Integration/CapabilitiesServiceV2Test.cs b/source/Octopus.Tentacle.Tests.Integration/CapabilitiesServiceV2Test.cs index abe0cbbea..f52ef1b59 100644 --- a/source/Octopus.Tentacle.Tests.Integration/CapabilitiesServiceV2Test.cs +++ b/source/Octopus.Tentacle.Tests.Integration/CapabilitiesServiceV2Test.cs @@ -32,15 +32,10 @@ public async Task CapabilitiesFromAnOlderTentacleWhichHasNoCapabilitiesService_W capabilities.Should().Contain(nameof(IScriptService)); capabilities.Should().Contain(nameof(IFileTransferService)); - //all versions have ScriptServiceV1 & IFileTransferService - var expectedCapabilitiesCount = 2; if (version.HasScriptServiceV2()) { capabilities.Should().Contain(nameof(IScriptServiceV2)); - expectedCapabilitiesCount++; } - - capabilities.Count.Should().Be(expectedCapabilitiesCount); } [Test] @@ -56,17 +51,12 @@ public async Task CapabilitiesServiceDoesNotReturnKubernetesScriptServiceForNonK capabilities.Should().Contain(nameof(IScriptService)); 
capabilities.Should().Contain(nameof(IFileTransferService)); - //all versions have ScriptServiceV1 & IFileTransferService - var expectedCapabilitiesCount = 2; if (version.HasScriptServiceV2()) { capabilities.Should().Contain(nameof(IScriptServiceV2)); - expectedCapabilitiesCount++; } capabilities.Should().NotContain(nameof(IKubernetesScriptServiceV1)); - - capabilities.Count.Should().Be(expectedCapabilitiesCount); } [Test] diff --git a/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs b/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs index 0149ced6d..299d0d8e1 100644 --- a/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs +++ b/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs @@ -283,15 +283,18 @@ public async Task CancellationToken_WhenUnixGrandchildHoldsRedirectedPipes_Shoul cts.Cancel(); // Cancel should return within ~10s worst case (5s SafelyWaitForAllOutput timeout - // per stream). If we hit the 30s test timeout, something is hanging — most likely - // process.Close() got re-added to DoOurBestToCleanUp (see that method for why). - var completed = task.Wait(TimeSpan.FromSeconds(30)); + // per stream). The 60s test bound is generous for slow Linux CI agents while + // still flagging a real regression — if process.Close() got re-added to + // DoOurBestToCleanUp (see that method for why) or SafelyWaitForAllOutput's + // per-stream timeout was removed, the wait hangs indefinitely and we trip the + // bound by miles. + var completed = task.Wait(TimeSpan.FromSeconds(60)); sw.Stop(); completed.Should().BeTrue( $"ExecuteCommandAsync should return promptly after cancellation even when a Unix " + $"grandchild (reparented to init/launchd) holds the redirected pipes. Worst case " + - $"is ~10s. If we hit the 30s test timeout, either process.Close() was re-introduced " + + $"is ~10s. 
If we hit the 60s test timeout, either process.Close() was re-introduced " + $"in DoOurBestToCleanUp (which races with the Exited event WaitForExitAsync depends " + $"on) or SafelyWaitForAllOutput's per-stream timeout has been removed. Elapsed " + $"since cancel: {sw.Elapsed.TotalSeconds:F1}s"); From a9f8ddf6399d38501a2c89890a36ba3dfa2ce98f Mon Sep 17 00:00:00 2001 From: Jim Pelletier Date: Thu, 28 May 2026 21:01:33 +1000 Subject: [PATCH 47/52] Bump grandchild-pipes test bounds to 60s on both Windows and Linux MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previous push only bumped the Unix task.Wait. CI showed 10 builds still failing on grandchild-pipes tests — most likely the asymmetric 30s bounds (Windows task.Wait, Unix WaitForPidFile) were still being hit on slow CI agents. Make both grandchild tests symmetric at 60s on the PID-file wait and the cancel-completion wait. Regression-detection property preserved because a real regression hangs indefinitely. --- .../Util/SilentProcessRunnerFixture.cs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs b/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs index 299d0d8e1..e15802bec 100644 --- a/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs +++ b/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs @@ -218,13 +218,13 @@ public async Task CancellationToken_WhenGrandchildHoldsRedirectedPipes_ShouldNot var sw = Stopwatch.StartNew(); cts.Cancel(); - var completed = task.Wait(TimeSpan.FromSeconds(30)); + var completed = task.Wait(TimeSpan.FromSeconds(60)); sw.Stop(); completed.Should().BeTrue( $"ExecuteCommandAsync should return promptly after cancellation even when a " + $"grandchild holds the redirected pipes. Worst case is ~10s (5s timeout × 2 streams " + - $"in SafelyWaitForAllOutput). 
If we hit the 30s test timeout, either someone " + + $"in SafelyWaitForAllOutput). If we hit the 60s test timeout, either someone " + $"re-introduced process.Close() in DoOurBestToCleanUp (which races with the Exited " + $"event WaitForExitAsync depends on) or SafelyWaitForAllOutput's per-stream timeout " + $"has been removed. Elapsed since cancel: {sw.Elapsed.TotalSeconds:F1}s"); @@ -277,7 +277,7 @@ public async Task CancellationToken_WhenUnixGrandchildHoldsRedirectedPipes_Shoul out _, cts.Token)); - await WaitForPidFileAsync(grandchildPidFile, TimeSpan.FromSeconds(30)); + await WaitForPidFileAsync(grandchildPidFile, TimeSpan.FromSeconds(60)); var sw = Stopwatch.StartNew(); cts.Cancel(); From f1cfcc1c8b7a8e6477bb443e26a63203ccc42e1b Mon Sep 17 00:00:00 2001 From: Jim Pelletier Date: Fri, 29 May 2026 10:43:19 +1000 Subject: [PATCH 48/52] Restore process.Close() in cancel cleanup to release pipe handles WaitForExitAsync on net8.0 waits for stream EOF after the Exited event fires. A re-parented grandchild holding our redirected stdout/stderr open prevents EOF, so the await hangs indefinitely on the cancel path. Close() releases the pipe handles immediately, matching what main does for the sync version. The cancel.Register callback runs Hitman.Kill + Close, the readers see EOF, and the await returns. Local Unix grandchild test goes from 60s+ hang to 80ms pass. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../Util/CommandLine/SilentProcessRunner.cs | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs b/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs index 4fe392f8f..0c8a4bed0 100644 --- a/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs +++ b/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs @@ -269,6 +269,21 @@ static void DoOurBestToCleanUp(Process process, Action error) error($"Failed to kill the launched process: {killProcessException}"); } } + finally + { + try + { + // Force the pipe handles closed. Process.WaitForExitAsync waits for stream EOF + // after the Exited event, and a re-parented grandchild holding our redirected + // stdout/stderr open will prevent EOF — so the await hangs without this. Close + // releases the handles, the readers see EOF, the await returns. + process.Close(); + } + catch (Exception ex) + { + error($"Failed to close process resources: {ex.Message}"); + } + } } // Single place we block waiting for the spawned process to exit. From 3071172c42f0bd87c0d3096af181aa777e310493 Mon Sep 17 00:00:00 2001 From: Jim Pelletier Date: Fri, 29 May 2026 12:23:03 +1000 Subject: [PATCH 49/52] Revert "Restore process.Close() in cancel cleanup to release pipe handles" This reverts commit f1cfcc1c8b7a8e6477bb443e26a63203ccc42e1b. 
--- .../Util/CommandLine/SilentProcessRunner.cs | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs b/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs index 0c8a4bed0..4fe392f8f 100644 --- a/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs +++ b/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs @@ -269,21 +269,6 @@ static void DoOurBestToCleanUp(Process process, Action error) error($"Failed to kill the launched process: {killProcessException}"); } } - finally - { - try - { - // Force the pipe handles closed. Process.WaitForExitAsync waits for stream EOF - // after the Exited event, and a re-parented grandchild holding our redirected - // stdout/stderr open will prevent EOF — so the await hangs without this. Close - // releases the handles, the readers see EOF, the await returns. - process.Close(); - } - catch (Exception ex) - { - error($"Failed to close process resources: {ex.Message}"); - } - } } // Single place we block waiting for the spawned process to exit. From 5e62dbfd2cb4dd48bb873f3c269cf8cf6ce9ba34 Mon Sep 17 00:00:00 2001 From: Jim Pelletier Date: Fri, 29 May 2026 13:03:02 +1000 Subject: [PATCH 50/52] Cancel WaitForExitAsync via linked token instead of process.Close() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Process.WaitForExitAsync on net8.0 waits for stream EOF after the Exited event fires (per the dotnet/runtime source). A re-parented grandchild that inherits our redirected stdout/stderr holds the pipes open, EOF never arrives, and the await hangs. Adding process.Close() to cancel cleanup (matching main's sync version) broke things differently: Close clears the Process object's stream fields AND aborts the in-flight WaitForExitAsync, which broke ShouldCancelPing and the AbandonScript_WhenCancelFailsToKillProcess tests. 
Instead, pass a linked cancel∪abandon token to WaitForExitAsync. When cancel fires, Hitman.Kill runs synchronously via cancel.Register, the linked token cancels the await, and we fall through to the existing bounded stream-drain cleanup. The bounded WaitForExit(5000) gives the kill up to 5 seconds to take effect, so the process has normally terminated before SafelyGetExitCode reads process.ExitCode. Local Unix grandchild test: 62ms (was 60s+ hang). Local ShouldCancelPing: still passes (2s). Co-Authored-By: Claude Opus 4.7 (1M context) --- .../Util/CommandLine/SilentProcessRunner.cs | 33 +++++++++++-------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs b/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs index 4fe392f8f..af1ba719a 100644 --- a/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs +++ b/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs @@ -145,21 +145,18 @@ void WriteData(Action action, ManualResetEventSlim resetEvent, DataRecei process.BeginOutputReadLine(); process.BeginErrorReadLine(); + // Process.WaitForExitAsync waits for stream EOF after the Exited event + // fires. A re-parented grandchild that inherits our redirected stdout/stderr + // pipes will hold them open and prevent EOF, hanging the await forever. We + // can't add process.Close() to cancel cleanup (it clears the Process object's + // stream fields and aborts the in-flight await, breaking other tests). So + // instead we pass a linked token combining `abandon` and `cancel`: when + // either fires, the await throws OCE and we route to the matching catch. + using var stopWaiting = CancellationTokenSource.CreateLinkedTokenSource(abandon, cancel); + try { - // WaitForExitAsync completes when the Process.Exited event fires (or - // when `abandon` cancels). 
Unlike the sync WaitForExit() no-timeout - // overload, it does NOT wait for the redirected stdout/stderr streams - // to reach EOF — so a re-parented grandchild holding our pipes open - // cannot hang us here. Stream draining is handled separately below by - // SafelyWaitForAllOutput (with a 5s timeout per stream). - // - // We pass `abandon` (not `cancel`) because cancel is handled via the - // cancel.Register callback above which kills the process tree; the - // resulting Exited event is what unblocks this await on cancel. - // `abandon` is a separate token used by the abandon feature to stop waiting - // WITHOUT killing the process — see the catch block below. - await WaitForProcessExitAsync(process, abandon).ConfigureAwait(false); + await WaitForProcessExitAsync(process, stopWaiting.Token).ConfigureAwait(false); } catch (OperationCanceledException) when (abandon.IsCancellationRequested && !process.HasExited) { @@ -168,6 +165,16 @@ void WriteData(Action action, ManualResetEventSlim resetEvent, DataRecei running = false; return ScriptExitCodes.AbandonedExitCode; } + catch (OperationCanceledException) when (cancel.IsCancellationRequested) + { + // Cancel fired. Hitman.Kill ran synchronously via cancel.Register, sending + // the kill signal. The OS may take a moment to actually terminate the + // process; brief sync wait so SafelyGetExitCode below can read ExitCode + // without throwing. WaitForExit(timeout) does NOT wait for stream EOF + // (only the no-timeout overload does), so a grandchild holding pipes + // can't extend this. 
+ try { process.WaitForExit(5000); } catch { /* best effort */ } + } SafelyCancelOutputAndErrorRead(process, debug); From 0bc660b199067e6c6c0cde336f6f21fa125c9d29 Mon Sep 17 00:00:00 2001 From: Jim Pelletier Date: Fri, 29 May 2026 14:12:28 +1000 Subject: [PATCH 51/52] Split cancel/abandon OCE handling into two catches with asymmetric guarantees MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous single-catch on the linked-token await hijacked the abandon path: cancel firing first caused the cancel-OCE catch to win, even when the user-intended sequence was cancel-then-abandon. The AbandonScript integration tests broke as a result. Split into two catches with different exit-code guarantees: * Abandon catch: returns AbandonedExitCode unconditionally. Abandon is inherently a race against natural script exit — if the script happened to finish at the same moment we got the abandon signal, reporting "abandoned" is still semantically right. * Cancel catch: briefly WaitForExit(5000) so the kill takes effect, then falls through to SafelyGetExitCode for the real exit code (137 on Linux SIGKILL, 1 on Windows TerminateProcess). Only falls back to AbandonedExitCode if the kill genuinely didn't land within 5s (e.g., Hitman disabled by test, or process is stuck) — the only honest signal available when there's no real exit code yet. Local Unix grandchild test: 115ms (was 60s+ hang). Local ShouldCancelPing: 2s with real exit code 137 preserved. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../Util/CommandLine/SilentProcessRunner.cs | 30 ++++++++++++++----- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs b/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs index af1ba719a..daad969cc 100644 --- a/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs +++ b/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs @@ -158,8 +158,12 @@ void WriteData(Action action, ManualResetEventSlim resetEvent, DataRecei { await WaitForProcessExitAsync(process, stopWaiting.Token).ConfigureAwait(false); } - catch (OperationCanceledException) when (abandon.IsCancellationRequested && !process.HasExited) + catch (OperationCanceledException) when (abandon.IsCancellationRequested) { + // Abandon path. From the user's perspective abandon is a race against + // natural script exit, so returning AbandonedExitCode is acceptable even + // if the process happened to finish at the same moment — that's why we + // don't check process.HasExited here. info("Tentacle has abandoned this script. The underlying script process may still be running on this host."); SafelyCancelOutputAndErrorRead(process, debug); running = false; @@ -167,13 +171,25 @@ void WriteData(Action action, ManualResetEventSlim resetEvent, DataRecei } catch (OperationCanceledException) when (cancel.IsCancellationRequested) { - // Cancel fired. Hitman.Kill ran synchronously via cancel.Register, sending - // the kill signal. The OS may take a moment to actually terminate the - // process; brief sync wait so SafelyGetExitCode below can read ExitCode - // without throwing. WaitForExit(timeout) does NOT wait for stream EOF - // (only the no-timeout overload does), so a grandchild holding pipes - // can't extend this. + // Cancel path. 
Hitman.Kill ran synchronously via cancel.Register but the + // actual OS termination is async — brief sync wait so the fall-through to + // SafelyGetExitCode below reads the real exit code. WaitForExit(timeout) + // does NOT wait for stream EOF (only the no-timeout overload does), so a + // grandchild holding pipes can't extend this. try { process.WaitForExit(5000); } catch { /* best effort */ } + if (!process.HasExited) + { + // Kill didn't take effect within 5s (e.g., kill disabled in test, or + // process is genuinely stuck). We can't safely read ExitCode from a + // running process, so fall back to AbandonedExitCode — the user has + // committed to stopping and this is the only honest signal we have. + info("Tentacle stopped waiting for the cancelled script. The underlying script process may still be running on this host."); + SafelyCancelOutputAndErrorRead(process, debug); + running = false; + return ScriptExitCodes.AbandonedExitCode; + } + // Process exited cleanly within the bounded wait — fall through to read + // the real exit code via SafelyGetExitCode below. } SafelyCancelOutputAndErrorRead(process, debug); From f11754dc84765fd5e5ed57775656d8c1ec867217 Mon Sep 17 00:00:00 2001 From: Jim Pelletier Date: Fri, 29 May 2026 16:00:18 +1000 Subject: [PATCH 52/52] Make cancel-catch legible: name kill grace period, drop dead try Behavior-preserving cleanup of the cancel OperationCanceledException catch in SilentProcessRunner. No change to returned exit codes for any path. - Extract the 5s magic number into CancelKillGraceMilliseconds. - Remove the nested try/catch around WaitForExit(int): on a started, non-detached Process it is not expected to throw, so the empty catch was effectively dead defensiveness (and a nested try inside a catch); if it ever does throw, the exception now propagates instead of being swallowed. - Use WaitForExit(int)'s own bool result instead of a second HasExited read. - Rewrite comments to cite, per decision, the test that requires it. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../Util/CommandLine/SilentProcessRunner.cs | 40 +++++++++++++------ 1 file changed, 27 insertions(+), 13 deletions(-) diff --git a/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs b/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs index daad969cc..493e33e85 100644 --- a/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs +++ b/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs @@ -17,6 +17,10 @@ namespace Octopus.Tentacle.Util { public static class SilentProcessRunner { + // How long we wait for an issued kill to actually reap the process before we + // give up and report the script as abandoned. + const int CancelKillGraceMilliseconds = 5000; + public static Task ExecuteCommandAsync( string executable, string arguments, @@ -171,25 +175,35 @@ void WriteData(Action action, ManualResetEventSlim resetEvent, DataRecei } catch (OperationCanceledException) when (cancel.IsCancellationRequested) { - // Cancel path. Hitman.Kill ran synchronously via cancel.Register but the - // actual OS termination is async — brief sync wait so the fall-through to - // SafelyGetExitCode below reads the real exit code. WaitForExit(timeout) - // does NOT wait for stream EOF (only the no-timeout overload does), so a - // grandchild holding pipes can't extend this. - try { process.WaitForExit(5000); } catch { /* best effort */ } - if (!process.HasExited) + // Cancel means "kill it". DoOurBestToCleanUp already issued the kill via + // cancel.Register, but Kill() only *requests* termination — the OS reaps the + // process asynchronously, so it is usually still alive at this point. We wait a + // bounded grace period for the reap so the fall-through below can read the real + // exit code (e.g. 137 for SIGKILL, 143 for SIGTERM). + // Required by CancellationToken_ShouldForceKillTheProcess. 
+ // + // The finite-timeout overload is deliberate: WaitForExit(int) waits only for + // termination, never for redirected-stream EOF (the EOF drain is guarded by + // `milliseconds == Timeout.Infinite`). The no-arg overload WOULD drain EOF and + // hang when a re-parented grandchild holds our pipes open. + // Required by CancellationToken_WhenGrandchildHoldsRedirectedPipes_ShouldNotHang. + var exitedWithinGracePeriod = process.WaitForExit(CancelKillGraceMilliseconds); + + if (!exitedWithinGracePeriod) { - // Kill didn't take effect within 5s (e.g., kill disabled in test, or - // process is genuinely stuck). We can't safely read ExitCode from a - // running process, so fall back to AbandonedExitCode — the user has - // committed to stopping and this is the only honest signal we have. + // The kill did not land within the grace period: the process is genuinely + // stuck (or kill was disabled in a test). There is no real exit code to read + // — reading process.ExitCode here would throw "Process must exit before + // requested information can be determined." So we report it as abandoned; + // the process may still be running on this host. + // Required by AbandonScript_WhenCancelFailsToKillProcess_ReturnsAbandonedExitCode. info("Tentacle stopped waiting for the cancelled script. The underlying script process may still be running on this host."); SafelyCancelOutputAndErrorRead(process, debug); running = false; return ScriptExitCodes.AbandonedExitCode; } - // Process exited cleanly within the bounded wait — fall through to read - // the real exit code via SafelyGetExitCode below. + + // Exited within the grace period — fall through to read the real exit code below. } SafelyCancelOutputAndErrorRead(process, debug);