From de7f77730d2918d43c9f597a2eaaa892f254b70b Mon Sep 17 00:00:00 2001 From: Joe Doss Date: Mon, 4 May 2026 19:45:22 -0500 Subject: [PATCH 1/3] cache: clear in-memory entries when backing file vanishes maybe_reload returned False on FileNotFoundError, so a deleted cache.enc left the live serve process serving the entries it had loaded at startup. Forever. Clearing on vanish forces the next lookup through the provider, which is what the operator expects after wiping the cache. --- psi/cache.py | 20 ++++++++++++++++++-- tests/test_cache.py | 46 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+), 2 deletions(-) diff --git a/psi/cache.py b/psi/cache.py index 905bd8b..f278e65 100644 --- a/psi/cache.py +++ b/psi/cache.py @@ -193,13 +193,29 @@ def maybe_reload(self) -> bool: is ~1μs on modern kernels; actual reload happens only when setup has finished a write. + If the backing file has vanished after we previously loaded it, drop + the in-memory entries instead of silently serving stale values — a + wiped ``cache.enc`` should not produce a cache that pretends entries + still exist. The next provider lookup repopulates as needed. + Returns: - True if a reload happened, False otherwise. + True if entries were reloaded or cleared, False otherwise. """ try: current_mtime = self._path.stat().st_mtime_ns except FileNotFoundError: - return False + if self._mtime_ns == 0: + return False + with self._lock: + logger.warning( + "Cache file {} disappeared after previous load; clearing " + "{} in-memory entries to avoid serving stale values.", + self._path, + len(self._entries), + ) + self._entries = {} + self._mtime_ns = 0 + return True if current_mtime == self._mtime_ns: return False self.load() diff --git a/tests/test_cache.py b/tests/test_cache.py index 134d29d..9955a4c 100644 --- a/tests/test_cache.py +++ b/tests/test_cache.py @@ -267,6 +267,17 @@ def test_missing_file_returns_false_no_crash( cache = Cache(tmp_path / "does-not-exist.enc", backend) assert cache.maybe_reload() is False + def test_clears_entries_when_file_vanishes_after_load(self, cache: Cache) -> None: + """A wiped cache.enc must not leave the in-memory dict serving stale values.""" + cache.set("k", b"v") + cache.save() + assert cache.get("k") == b"v" + + cache.path.unlink() + + assert cache.maybe_reload() is True + assert cache.get("k") is None + class TestLegacyV1PayloadDiscarded: def test_v1_payload_is_treated_as_empty_with_fresh_hmac_key( @@ -293,3 +304,38 @@ def test_v1_payload_is_treated_as_empty_with_fresh_hmac_key( fresh.load() assert len(fresh) == 0 assert fresh.get("abc123hex") is None + + +class TestGuardExistingCache: + """`psi cache init` must not silently clobber a populated cache file.""" + + def test_no_existing_file_returns_none(self, tmp_path: Path) -> None: + from psi.cli import _guard_existing_cache + + result = _guard_existing_cache(tmp_path / "cache.enc", force=False) + assert result is None + + def test_existing_file_without_force_raises(self, tmp_path: Path) -> None: + from psi.cli import _guard_existing_cache + from psi.errors import ConfigError + + path = tmp_path / "cache.enc" + path.write_bytes(b"existing payload") + + with pytest.raises(ConfigError, match="already exists"): + _guard_existing_cache(path, force=False) + assert path.exists(), "guard must not delete the existing file when refusing" + + def test_existing_file_with_force_rotates_to_bak(self, tmp_path: Path) -> None: + from psi.cli import _guard_existing_cache + + path = tmp_path / "cache.enc" + original = b"existing payload" + path.write_bytes(original) + + bak = _guard_existing_cache(path, force=True) + assert bak is not None + assert bak.exists() + assert bak.read_bytes() == original + assert not path.exists() + assert bak.name.startswith("cache.enc.bak-") From c4ce6766ce8099b031a990fa554ca924b9105203 Mon Sep 17 00:00:00 2001 From: Joe Doss Date: Mon, 4 May 2026 19:45:47 -0500 Subject: [PATCH 2/3] cli: refuse to clobber existing cache.enc without --force, rotate to .bak cache init went straight to atomic rename, so re-running it on a populated cache wiped every entry with no recovery path. Now it refuses if the file exists and only proceeds with --force, in which case the previous file is moved to .bak- before the new empty cache is written. --- psi/cli.py | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/psi/cli.py b/psi/cli.py index 8135e73..385ec86 100644 --- a/psi/cli.py +++ b/psi/cli.py @@ -244,6 +244,14 @@ def cache_init( "Default: /cache.key.", ), ] = None, + force: Annotated[ + bool, + typer.Option( + "--force", + help="Overwrite an existing cache file. The previous file is " + "rotated to '.bak-' before the new one is written.", + ), + ] = False, config: ConfigOption = None, ) -> None: """Provision the cache encryption key and write an empty cache file.""" @@ -266,6 +274,8 @@ def cache_init( cache_path = settings.cache.resolve_path(settings.state_dir) cache_path.parent.mkdir(parents=True, exist_ok=True) + bak_path = _guard_existing_cache(cache_path, force=force) + if backend == "tpm": raw_key = os.urandom(32) target_key_path = key_path or (settings.config_dir / "cache.key") @@ -298,6 +308,8 @@ def cache_init( cache = Cache(cache_path, TpmBackend(key=raw_key)) cache.save() + if bak_path is not None: + console.print(f"Previous cache rotated to [bold]{bak_path}[/bold]", highlight=False) console.print( f"Sealed TPM key → [bold]{target_key_path}[/bold]\n" f"Empty cache → [bold]{cache_path}[/bold]\n" @@ -321,6 +333,8 @@ def cache_init( cache.save() finally: hsm_backend.close() + if bak_path is not None: + console.print(f"Previous cache rotated to [bold]{bak_path}[/bold]", highlight=False) console.print( f"Empty cache → [bold]{cache_path}[/bold]\n" "Cache will be unsealed via PKCS#11 at 'psi serve' startup.", @@ -328,6 +342,41 @@ def cache_init( ) +def _guard_existing_cache(cache_path: Path, *, force: bool) -> Path | None: + """Refuse to clobber an existing cache file unless ``force`` is set. + + When ``force`` is set and the file exists, rotate it to + ``.bak-`` so a misuse can be undone. The header + is not decrypted — wrapping in a backup is cheap and makes the + "I just nuked the populated cache" path recoverable. + + Returns: + The path the previous file was rotated to, or ``None`` if no file + existed. + + Raises: + ConfigError: If the file exists and ``force`` is False. + """ + from datetime import UTC, datetime + + from psi.errors import ConfigError + + if not cache_path.exists(): + return None + if not force: + msg = ( + f"Cache file already exists at {cache_path}. Re-running 'psi cache " + "init' would replace it with an empty cache and the existing " + "entries would be unrecoverable. Pass --force to overwrite; the " + "previous file is rotated to a '.bak-' sibling first." + ) + raise ConfigError(msg) + timestamp = datetime.now(tz=UTC).strftime("%Y%m%dT%H%M%SZ") + bak = cache_path.with_name(f"{cache_path.name}.bak-{timestamp}") + cache_path.rename(bak) + return bak + + @cache_app.command(name="refresh") def cache_refresh(config: ConfigOption = None) -> None: """Re-run setup to refresh every cached secret from providers.""" From 8cf4803c8d9633bb0c370da4dcc0e0877a2f9e67 Mon Sep 17 00:00:00 2001 From: Joe Doss Date: Mon, 4 May 2026 19:49:21 -0500 Subject: [PATCH 3/3] setup: surface orphaned podman secrets and exit non-zero Adds OrphanedSecretsError and a check in run_setup that scans every shell-driver Podman secret for a backing mapping file in state_dir. Missing mapping means lookups return 404 and the consuming container fails to start; before this commit setup would happily complete without surfacing the condition. Drift detection still fires when relevant; orphan takes precedence since it produces hard failures rather than missing env vars. --- psi/errors.py | 9 ++++ psi/setup.py | 74 ++++++++++++++++++++++++++++-- tests/test_setup.py | 109 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 187 insertions(+), 5 deletions(-) diff --git a/psi/errors.py b/psi/errors.py index 4c4d356..d6d4b25 100644 --- a/psi/errors.py +++ b/psi/errors.py @@ -25,3 +25,12 @@ class SecretNotFoundError(PsiError): class DriftDetectedError(PsiError): """Podman secret state diverged from the fetch — drop-ins are incomplete.""" + + +class OrphanedSecretsError(PsiError): + """One or more Podman shell secrets have no backing mapping file. + + Lookup will return 404 for these secrets and any container that depends + on them will fail to start. Distinct from drift — drift is "secret in + Podman, not in fetch", orphan is "secret in Podman, no mapping on disk". + """ diff --git a/psi/setup.py b/psi/setup.py index 0eaca95..1c137eb 100644 --- a/psi/setup.py +++ b/psi/setup.py @@ -9,7 +9,7 @@ import httpx from loguru import logger -from psi.errors import DriftDetectedError, ProviderError +from psi.errors import DriftDetectedError, OrphanedSecretsError, ProviderError from psi.systemd import daemon_reload if TYPE_CHECKING: @@ -42,10 +42,15 @@ def run_setup( provider: If set, only process workloads using this provider. Raises: - DriftDetectedError: when one or more Podman secrets under ``--*`` - are missing from the current fetch. Drop-ins are still written - and systemd is still reloaded — the error fires at the end so - the caller (and the setup systemd unit) sees a non-zero exit. + OrphanedSecretsError: when one or more Podman shell secrets have no + backing mapping file in ``state_dir``. Lookups for these will + return 404 and consuming containers will fail to start. Takes + precedence over drift since the failure mode is harder. + DriftDetectedError: when one or more Podman secrets under + ``--*`` are missing from the current fetch. Drop-ins + are still written and systemd is still reloaded — the error + fires at the end so the caller (and the setup systemd unit) + sees a non-zero exit. """ settings.state_dir.mkdir(parents=True, exist_ok=True) @@ -80,8 +85,34 @@ def run_setup( logger.info("Reloading systemd...") daemon_reload(settings.scope) + + orphans = _check_orphans(settings) + for orphan in orphans: + logger.warning( + "Orphan: Podman secret '{}' has no mapping file in {} — lookups " + "will return 404 and any container that uses this secret will " + "fail to start. Re-create the mapping (e.g. 'psi nitrokeyhsm " + "store {}') or remove the stale entry with 'podman secret rm {}'.", + orphan, + settings.state_dir, + orphan, + orphan, + ) + logger.info("Setup complete.") + if orphans: + msg = ( + f"Orphaned secrets detected: {len(orphans)} Podman shell " + "secret(s) have no backing mapping file. Lookups will return " + "404 and consuming containers will fail to start. Re-create " + "the mappings or remove the stale Podman secrets. Run 'psi " + "setup --dry-run' for per-secret details." + ) + if drift: + msg += f" Additionally, {len(drift)} drift entry/entries — see warnings above." + raise OrphanedSecretsError(msg) + if drift: msg = ( f"Drift detected: {len(drift)} Podman secret(s) not present in " @@ -447,6 +478,39 @@ def _workload_podman_names(workload_name: str, secrets: list[dict]) -> set[str]: } +def _check_orphans(settings: PsiSettings) -> list[str]: + """Return Podman shell secrets with no backing mapping file in ``state_dir``. + + These are the failure mode the regular setup path otherwise wouldn't + surface: a Podman secret created via ``shell`` driver whose + corresponding ``state_dir/`` file is missing means + ``psi serve`` will reply 404 for any lookup and the consuming + container will fail to start. ``setup`` itself can succeed (it only + re-registers Infisical-provider secrets), so the regression goes + unnoticed until something restarts. + + Returns: + Sorted list of orphaned Podman secret names. Returns an empty list + if the Podman API is unreachable; the primary fetch-and-register + path would already have failed loudly in that case. + """ + try: + secrets = _list_podman_shell_secrets() + except httpx.HTTPError as e: + logger.warning("Cannot list Podman secrets to check for orphans: {}", e) + return [] + orphans: list[str] = [] + for secret in secrets: + spec = secret.get("Spec", {}) + name = spec.get("Name", "") + secret_id = secret.get("ID", "") + if not name or not secret_id: + continue + if not (settings.state_dir / secret_id).exists(): + orphans.append(name) + return sorted(orphans) + + def _check_workload_drift( workload_name: str, merged: dict[str, str], diff --git a/tests/test_setup.py b/tests/test_setup.py index 7254b2f..4f29323 100644 --- a/tests/test_setup.py +++ b/tests/test_setup.py @@ -14,6 +14,7 @@ from psi.settings import PsiSettings from psi.setup import ( _RETRY_DELAYS, + _check_orphans, _check_workload_drift, _generate_drop_in, _is_retryable, @@ -530,3 +531,111 @@ def mock_fetch(settings, workload_name, values_by_mapping, drift): run_setup(settings) mock_reload.assert_called_once() + + +class TestCheckOrphans: + """Detection of Podman shell secrets with no backing mapping file.""" + + def test_no_orphans_when_all_have_mappings(self, tmp_path: Path) -> None: + settings = _make_settings(tmp_path) + settings.state_dir.mkdir(parents=True) + (settings.state_dir / "abc123").write_text("{}") + podman_secrets = [ + {"ID": "abc123", "Spec": {"Name": "myapp--K"}}, + ] + with patch("psi.setup._list_podman_shell_secrets", return_value=podman_secrets): + assert _check_orphans(settings) == [] + + def test_returns_names_for_secrets_without_mappings(self, tmp_path: Path) -> None: + settings = _make_settings(tmp_path) + settings.state_dir.mkdir(parents=True) + (settings.state_dir / "have-mapping").write_text("{}") + podman_secrets = [ + {"ID": "have-mapping", "Spec": {"Name": "myapp--PRESENT"}}, + {"ID": "missing-id-1", "Spec": {"Name": "INFISICAL_ENCRYPTION_KEY"}}, + {"ID": "missing-id-2", "Spec": {"Name": "INFISICAL_AUTH_SECRET"}}, + ] + with patch("psi.setup._list_podman_shell_secrets", return_value=podman_secrets): + assert _check_orphans(settings) == [ + "INFISICAL_AUTH_SECRET", + "INFISICAL_ENCRYPTION_KEY", + ] + + def test_returns_empty_when_podman_unreachable(self, tmp_path: Path) -> None: + settings = _make_settings(tmp_path) + with patch( + "psi.setup._list_podman_shell_secrets", + side_effect=httpx.ConnectError("connection refused"), + ): + assert _check_orphans(settings) == [] + + +class TestRunSetupOrphanExit: + def test_raises_orphaned_secrets_error_when_orphan_present(self, tmp_path: Path) -> None: + from psi.errors import OrphanedSecretsError + + settings = _make_settings( + tmp_path, + workloads={ + "myapp": WorkloadConfig( + provider="infisical", + secrets=[SecretSource(project="myproject", path="/app")], + ), + }, + ) + + def mock_fetch(settings, workload_name, values_by_mapping, drift): + pass + + with ( + patch("psi.setup._fetch_and_register_infisical", side_effect=mock_fetch), + patch("psi.setup.daemon_reload"), + patch("psi.setup._check_orphans", return_value=["INFISICAL_ENCRYPTION_KEY"]), + pytest.raises(OrphanedSecretsError, match="1 Podman shell"), + ): + run_setup(settings) + + def test_orphan_takes_precedence_over_drift(self, tmp_path: Path) -> None: + from psi.errors import OrphanedSecretsError + + settings = _make_settings( + tmp_path, + workloads={ + "myapp": WorkloadConfig( + provider="infisical", + secrets=[SecretSource(project="myproject", path="/app")], + ), + }, + ) + + def mock_fetch(settings, workload_name, values_by_mapping, drift): + drift.append("myapp--STALE") + + with ( + patch("psi.setup._fetch_and_register_infisical", side_effect=mock_fetch), + patch("psi.setup.daemon_reload"), + patch("psi.setup._check_orphans", return_value=["INFISICAL_ENCRYPTION_KEY"]), + pytest.raises(OrphanedSecretsError, match="1 drift"), + ): + run_setup(settings) + + def test_no_raise_when_orphans_empty(self, tmp_path: Path) -> None: + settings = _make_settings( + tmp_path, + workloads={ + "myapp": WorkloadConfig( + provider="infisical", + secrets=[SecretSource(project="myproject", path="/app")], + ), + }, + ) + + def mock_fetch(settings, workload_name, values_by_mapping, drift): + pass + + with ( + patch("psi.setup._fetch_and_register_infisical", side_effect=mock_fetch), + patch("psi.setup.daemon_reload"), + patch("psi.setup._check_orphans", return_value=[]), + ): + run_setup(settings)