diff --git a/CLAUDE.md b/CLAUDE.md index 2964169..e7d1d27 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -201,6 +201,8 @@ psi cache init --backend {tpm,hsm} Provision cache encryption key psi cache status [--verify] Show cache status (fast) or decrypt and count (slow) psi cache refresh Re-run setup to repopulate the cache psi cache invalidate Drop an entry and persist +# A psi-{provider}-setup.timer is auto-generated on `psi systemd install` +# when cache.backend is set; cadence via cache.refresh_interval (default 1h). # Infisical provider psi infisical login Test authentication diff --git a/README.md b/README.md index a774f5a..40939b9 100644 --- a/README.md +++ b/README.md @@ -177,13 +177,19 @@ memory — upstream provider outages no longer stop containers from starting. ```yaml cache: enabled: true - backend: hsm # 'tpm' or 'hsm'. Required for the cache to populate. + backend: hsm # 'tpm' or 'hsm'. Required for the cache to populate. + refresh_interval: 1h # how often the scheduled timer re-pulls secrets + refresh_randomized_delay: 5m # spread refreshes across a fleet ``` The TPM backend uses a 32-byte AES-256 key sealed by `systemd-creds` to the host TPM2. The HSM backend reuses the existing Nitrokey hybrid envelope (RSA-OAEP + AES-256-GCM), unwrapping the AES key via PKCS#11 at `psi serve` startup. +With the cache enabled, `psi systemd install` also generates a periodic refresh timer +(`psi-infisical-setup.timer`) that runs the setup unit on `refresh_interval`, so a +secret rotated upstream makes its way into PSI without manual intervention. + ```bash # One-time provisioning (host) sudo psi cache init --backend tpm # or --backend hsm @@ -499,6 +505,10 @@ psi cache refresh Re-run setup to repopulate the cache from provi psi cache invalidate Drop a single entry and persist the change ``` +The cache is also refreshed automatically by `psi-infisical-setup.timer` on the +`cache.refresh_interval` cadence (default `1h`). 
`psi cache refresh` is only needed +for out-of-band rotations that cannot wait for the next scheduled run. + See the [secret cache reference](docs/secret-cache.md) for full documentation. ### Infisical provider @@ -592,6 +602,7 @@ the exact invocation. Generates per-provider setup units based on configured providers: - `psi-secrets.container` — long-running lookup service - `psi-{provider}-setup.container` — oneshot per provider (e.g. `psi-infisical-setup`, `psi-nitrokeyhsm-setup`) +- `psi-infisical-setup.timer` — periodic cache refresh (only when the secret cache is enabled) - `psi-tls-renew.timer` + service — daily TLS renewal (if configured) When the [secret cache](docs/secret-cache.md) is configured, the generator automatically diff --git a/docs/secret-cache.md b/docs/secret-cache.md index 4609453..c90cae8 100644 --- a/docs/secret-cache.md +++ b/docs/secret-cache.md @@ -85,15 +85,23 @@ Add a `cache:` block at the top level of `/etc/psi/config.yaml`: ```yaml cache: - enabled: true # default: true - backend: hsm # 'tpm' or 'hsm'. Required for the cache to populate. - # path: /var/lib/psi/cache.enc # default: state_dir / cache.enc + enabled: true # default: true + backend: hsm # 'tpm' or 'hsm'. Required for the cache to populate. + # path: /var/lib/psi/cache.enc # default: state_dir / cache.enc + refresh_interval: 1h # default: 1h — systemd time string + refresh_randomized_delay: 5m # default: 5m — spreads refreshes across a fleet ``` If `cache.backend` is unset, `psi serve` logs a warning at startup and falls back to today's live-lookup behavior. Existing installs that upgrade but do not set a backend continue to work exactly as before. +`refresh_interval` and `refresh_randomized_delay` are passed to +`OnUnitActiveSec` and `RandomizedDelaySec` in the generated +`psi-{provider}-setup.timer`. See the [rotation and periodic refresh +section](#rotation-and-periodic-refresh) below for how the timer triggers +and how to tune it on a deployed host. 
+ ## Backends ### TPM2 via `systemd-creds` @@ -362,7 +370,57 @@ Options: This is a trade-off of the push-at-rotation-time design — cold-boot resilience depends on the provider being up during initial provisioning. -## Rotation +## Rotation and periodic refresh + +PSI refreshes cached secrets from three triggers: a systemd timer on a +configurable interval, an explicit CLI command, and an automatic refill +on cache miss at lookup time. + +### Scheduled refresh (timer) + +`psi systemd install` generates a `psi-{provider}-setup.timer` next to +each refreshable provider's setup service — today that means +`psi-infisical-setup.timer`. The timer triggers the same setup unit that +ran at boot, which re-fetches every configured secret value and +atomically replaces `cache.enc`. Tune the cadence in config: + +```yaml +cache: + enabled: true + backend: hsm + refresh_interval: 1h # systemd time string: 30m, 2h, 1d, etc. + refresh_randomized_delay: 5m # spreads refreshes across a fleet +``` + +The generated timer uses `OnBootSec` and `OnUnitActiveSec` (both set to +`refresh_interval`) plus `Persistent=true`, so: + +- The first scheduled refresh runs `refresh_interval` after boot +- Subsequent refreshes run `refresh_interval` after the setup unit was + last activated (by the timer, at boot, or manually) +- Runs missed while powered off are not replayed (`Persistent=` needs `OnCalendar=`) + +The timer is only generated when `cache.enabled` is true and +`cache.backend` is set. If the cache is off, there is nothing to refresh +and no timer is written. The nitrokeyhsm provider does not get a timer — +its secrets are local-only and do not need periodic re-fetching. + +To override the interval without editing config, drop a systemd override: + +```bash +sudo systemctl edit psi-infisical-setup.timer +``` + +```ini +[Timer] +OnUnitActiveSec= +OnUnitActiveSec=15m +``` + +(Assigning an empty value resets every trigger in the generated timer — +`OnBootSec` included — so only the override's `OnUnitActiveSec` remains.) 
+ +### Manual refresh ```bash # Rotate a secret in Infisical, then refresh the entire cache: @@ -370,13 +428,25 @@ sudo psi cache refresh # Or invalidate a single entry and let the next lookup pull it fresh: sudo psi cache invalidate myapp--DATABASE_URL + +# Or just kick the timer-managed setup unit directly: +sudo systemctl start psi-infisical-setup.service ``` -`psi cache refresh` re-runs the same code path as -`psi-infisical-setup.service`, pulls every configured workload secret, -encrypts the bundle, and atomically replaces `cache.enc`. Running -`refresh` during an Infisical outage is safe — it will retry via the -existing setup backoff, and on failure it leaves the old cache untouched. +`psi cache refresh` and the timer both run the same code path as +`psi-infisical-setup.service`: pull every configured workload secret, +encrypt the bundle, and atomically replace `cache.enc`. Running either +during an Infisical outage is safe — `psi/setup.py` retries via the +existing backoff and leaves the old cache untouched on failure. + +### On-miss refill + +If `psi serve` gets a lookup for a secret that is not in the in-memory +dict (first lookup after a `psi cache invalidate`, or a secret added +between refreshes), it calls the provider, returns the value, and +`cache.set() + cache.save()` persists it. This only helps when the +provider is reachable at lookup time — the durable path remains the +boot-time populate plus the periodic timer. 
## Troubleshooting diff --git a/psi/installer.py b/psi/installer.py index a11026c..7a74504 100644 --- a/psi/installer.py +++ b/psi/installer.py @@ -20,7 +20,9 @@ generate_native_provider_setup_service, generate_native_serve_service, generate_native_tls_renew_service, + generate_provider_setup_timer, generate_tls_renew_timer, + provider_supports_refresh, ) if TYPE_CHECKING: @@ -96,6 +98,8 @@ def _install_native(settings: PsiSettings, enable: bool) -> None: unit_dir, ) + refresh_timers = _write_refresh_timers(settings, unit_dir) + if _has_tls(settings): _write_unit( unit_dir / "psi-tls-renew.service", @@ -113,6 +117,7 @@ def _install_native(settings: PsiSettings, enable: bool) -> None: ["psi-secrets.service", *setup_units], _has_tls(settings), scope, + refresh_timers=refresh_timers, ) @@ -133,6 +138,10 @@ def _install_container(settings: PsiSettings, image: str, enable: bool) -> None: quadlet_dir, ) + # Timers are plain systemd units — they live in the unit_dir regardless of + # whether the setup unit itself comes from a quadlet or a native service. + refresh_timers = _write_refresh_timers(settings, _systemd_unit_dir(scope)) + if _has_tls(settings): _write_unit( quadlet_dir / "psi-tls-renew.container", @@ -150,6 +159,7 @@ def _install_container(settings: PsiSettings, image: str, enable: bool) -> None: ["psi-secrets.service", *setup_units], _has_tls(settings), scope, + refresh_timers=refresh_timers, ) @@ -195,6 +205,33 @@ def _write_provider_setup_units_container( return units +def _write_refresh_timers(settings: PsiSettings, unit_dir: Path) -> list[str]: + """Write periodic cache-refresh timers for providers that support them. + + Returns the list of timer unit names written. Emits nothing and returns + an empty list when the cache is disabled or no backend is configured — + there is nothing to refresh if PSI is not caching values. 
+ """ + if not settings.cache.enabled or settings.cache.backend is None: + return [] + + timers: list[str] = [] + for provider_name in settings.providers: + if not provider_supports_refresh(provider_name): + continue + timer_name = f"psi-{provider_name}-setup.timer" + _write_unit( + unit_dir / timer_name, + generate_provider_setup_timer( + provider_name, + settings.cache.refresh_interval, + settings.cache.refresh_randomized_delay, + ), + ) + timers.append(timer_name) + return timers + + def _write_unit(path: Path, content: str) -> None: """Write a unit file and log it.""" path.parent.mkdir(parents=True, exist_ok=True) @@ -219,6 +256,7 @@ def _enable_units( base_units: list[str], has_tls: object, scope: SystemdScope, + refresh_timers: list[str] | None = None, ) -> None: """Enable and start units.""" cmd_prefix = ["systemctl"] @@ -229,6 +267,10 @@ def _enable_units( subprocess.run([*cmd_prefix, "enable", "--now", unit], check=True) logger.info("Enabled {}", unit) + for timer in refresh_timers or []: + subprocess.run([*cmd_prefix, "enable", "--now", timer], check=True) + logger.info("Enabled {}", timer) + if has_tls: subprocess.run( [*cmd_prefix, "enable", "--now", "psi-tls-renew.timer"], diff --git a/psi/settings.py b/psi/settings.py index 3a9ae2d..259002b 100644 --- a/psi/settings.py +++ b/psi/settings.py @@ -29,6 +29,8 @@ class CacheConfig(BaseModel): enabled: bool = True backend: Literal["tpm", "hsm"] | None = None path: Path | None = None + refresh_interval: str = "1h" + refresh_randomized_delay: str = "5m" def resolve_path(self, state_dir: Path) -> Path: """Return the cache file path, defaulting to ``state_dir/cache.enc``.""" diff --git a/psi/unitgen.py b/psi/unitgen.py index 896ca45..7c410b2 100644 --- a/psi/unitgen.py +++ b/psi/unitgen.py @@ -138,6 +138,51 @@ def generate_tls_renew_timer() -> str: ) +# Providers whose setup path talks to a remote and therefore benefits from +# periodic refresh. The HSM provider is local-only and does not need a timer. 
+_REFRESHABLE_PROVIDERS = frozenset({"infisical"}) + + +def provider_supports_refresh(provider: str) -> bool: + """Return True when a periodic refresh timer makes sense for ``provider``.""" + return provider in _REFRESHABLE_PROVIDERS + + +def generate_provider_setup_timer( + provider: str, + interval: str, + randomized_delay: str, +) -> str: + """Generate psi-{provider}-setup.timer for periodic secret cache refresh. + + The timer triggers the matching ``psi-{provider}-setup.service`` unit on + a relative interval (``OnUnitActiveSec``) so the cache picks up secrets + rotated upstream between reboots. ``Persistent=true`` is emitted too, but + systemd only honors it for ``OnCalendar=`` timers, so it is a no-op here. + + Args: + provider: Provider name (currently only ``infisical`` is supported). + interval: systemd time string for ``OnBootSec`` and ``OnUnitActiveSec`` + (e.g. ``"1h"``, ``"30m"``, ``"2h"``). + randomized_delay: systemd time string for ``RandomizedDelaySec`` to + spread refresh events across a fleet. + """ + return ( + "[Unit]\n" + f"Description=Periodic PSI {provider} secret cache refresh\n" + "\n" + "[Timer]\n" + f"Unit=psi-{provider}-setup.service\n" + f"OnBootSec={interval}\n" + f"OnUnitActiveSec={interval}\n" + f"RandomizedDelaySec={randomized_delay}\n" + "Persistent=true\n" + "\n" + "[Install]\n" + "WantedBy=timers.target\n" + ) + + def generate_container_provider_setup_quadlet( image: str, settings: PsiSettings, diff --git a/tests/test_installer.py b/tests/test_installer.py index b975a0f..e515078 100644 --- a/tests/test_installer.py +++ b/tests/test_installer.py @@ -13,6 +13,7 @@ _systemd_unit_dir, _write_provider_setup_units_container, _write_provider_setup_units_native, + _write_refresh_timers, install_driver_conf, ) from psi.models import SystemdScope @@ -22,6 +23,10 @@ def _mock_settings( tmp_path: Path, scope: SystemdScope = SystemdScope.SYSTEM, providers: dict | None = None, + cache_backend: str | None = None, + cache_enabled: bool = True, + refresh_interval: str = 
"1h", + refresh_randomized_delay: str = "5m", ) -> MagicMock: settings = MagicMock() settings.state_dir = tmp_path / "state" @@ -35,6 +40,10 @@ def _mock_settings( settings.config_dir = Path.home() / ".config/psi" else: settings.config_dir = Path("/etc/psi") + settings.cache.enabled = cache_enabled + settings.cache.backend = cache_backend + settings.cache.refresh_interval = refresh_interval + settings.cache.refresh_randomized_delay = refresh_randomized_delay return settings @@ -175,3 +184,75 @@ def test_container_infisical_content(self, tmp_path: Path) -> None: content = (quadlet_dir / "psi-infisical-setup.container").read_text() assert "Exec=setup --provider infisical" in content assert "network-online.target" in content + + +class TestWriteRefreshTimers: + def test_infisical_timer_written_when_cache_enabled_with_backend( + self, + tmp_path: Path, + ) -> None: + settings = _mock_settings( + tmp_path, + providers={"infisical": {}, "nitrokeyhsm": {}}, + cache_backend="hsm", + ) + unit_dir = tmp_path / "units" + unit_dir.mkdir() + timers = _write_refresh_timers(settings, unit_dir) + assert timers == ["psi-infisical-setup.timer"] + assert (unit_dir / "psi-infisical-setup.timer").exists() + content = (unit_dir / "psi-infisical-setup.timer").read_text() + assert "Unit=psi-infisical-setup.service" in content + assert "OnUnitActiveSec=1h" in content + + def test_no_timer_when_cache_disabled(self, tmp_path: Path) -> None: + settings = _mock_settings( + tmp_path, + providers={"infisical": {}}, + cache_backend="hsm", + cache_enabled=False, + ) + unit_dir = tmp_path / "units" + unit_dir.mkdir() + timers = _write_refresh_timers(settings, unit_dir) + assert timers == [] + assert not (unit_dir / "psi-infisical-setup.timer").exists() + + def test_no_timer_when_no_backend(self, tmp_path: Path) -> None: + settings = _mock_settings( + tmp_path, + providers={"infisical": {}}, + cache_backend=None, + ) + unit_dir = tmp_path / "units" + unit_dir.mkdir() + timers = 
_write_refresh_timers(settings, unit_dir) + assert timers == [] + + def test_no_timer_for_nitrokeyhsm_provider(self, tmp_path: Path) -> None: + """HSM is local-only — nothing to periodically re-fetch.""" + settings = _mock_settings( + tmp_path, + providers={"nitrokeyhsm": {}}, + cache_backend="hsm", + ) + unit_dir = tmp_path / "units" + unit_dir.mkdir() + timers = _write_refresh_timers(settings, unit_dir) + assert timers == [] + assert not (unit_dir / "psi-nitrokeyhsm-setup.timer").exists() + + def test_custom_interval_is_honored(self, tmp_path: Path) -> None: + settings = _mock_settings( + tmp_path, + providers={"infisical": {}}, + cache_backend="tpm", + refresh_interval="15m", + refresh_randomized_delay="1m", + ) + unit_dir = tmp_path / "units" + unit_dir.mkdir() + _write_refresh_timers(settings, unit_dir) + content = (unit_dir / "psi-infisical-setup.timer").read_text() + assert "OnUnitActiveSec=15m" in content + assert "RandomizedDelaySec=1m" in content diff --git a/tests/test_unitgen.py b/tests/test_unitgen.py index e1df13f..c6b0c55 100644 --- a/tests/test_unitgen.py +++ b/tests/test_unitgen.py @@ -16,7 +16,9 @@ generate_native_provider_setup_service, generate_native_serve_service, generate_native_tls_renew_service, + generate_provider_setup_timer, generate_tls_renew_timer, + provider_supports_refresh, ) @@ -440,3 +442,40 @@ def test_serve_quadlet_has_notify_healthy(self, tmp_path: Path) -> None: assert "HealthCmd=curl -sf --unix-socket " in content assert "http://localhost/healthz" in content assert "HealthStartPeriod=60s" in content + + +class TestProviderRefreshSupport: + def test_infisical_supports_refresh(self) -> None: + assert provider_supports_refresh("infisical") is True + + def test_nitrokeyhsm_does_not_support_refresh(self) -> None: + assert provider_supports_refresh("nitrokeyhsm") is False + + def test_unknown_provider_does_not_support_refresh(self) -> None: + assert provider_supports_refresh("random-name") is False + + +class 
TestProviderSetupTimer: + def test_targets_matching_setup_unit(self) -> None: + content = generate_provider_setup_timer("infisical", "1h", "5m") + assert "Unit=psi-infisical-setup.service" in content + + def test_interval_and_randomized_delay_are_passed_through(self) -> None: + content = generate_provider_setup_timer("infisical", "30m", "2m") + assert "OnUnitActiveSec=30m" in content + assert "OnBootSec=30m" in content + assert "RandomizedDelaySec=2m" in content + + def test_is_persistent_so_missed_refreshes_run_on_next_boot(self) -> None: + content = generate_provider_setup_timer("infisical", "1h", "5m") + assert "Persistent=true" in content + + def test_install_section_hooks_into_timers_target(self) -> None: + content = generate_provider_setup_timer("infisical", "1h", "5m") + assert "[Install]" in content + assert "WantedBy=timers.target" in content + + def test_description_mentions_cache_refresh(self) -> None: + content = generate_provider_setup_timer("infisical", "1h", "5m") + assert "Description=" in content + assert "cache refresh" in content.lower()