# backend/app/dependencies.py (index b58b143b..8a3e8c3d, hunk @@ -1,9 +1,35 @@)
# This hunk also adds `import os` and widens the fastapi import to
# `from fastapi import HTTPException, Query, Request`.
def require_admin(request: Request) -> str:
    """Reject requests that lack a valid `X-Admin-Token` header.

    Used as a FastAPI dependency on every `/api/admin/*` route. The
    token comes from the `ADMIN_TOKEN` env var (sourced from
    1Password). If `ADMIN_TOKEN` isn't set, the entire admin surface
    fails closed (503) — safer than silently allowing through.

    Defense in depth: in production the admin endpoints are also
    gated by Cloudflare Access OAuth at the edge (see
    `docker-compose.admin.yml` + `playbooks/admin-install.yml`).
    Token check is the second layer in case CF Access is ever
    bypassed or misconfigured.

    Args:
        request: Incoming request; only its `x-admin-token` header is read.

    Returns:
        The constant string "ok", so the function can be used with
        `Depends(require_admin)`.

    Raises:
        HTTPException: 503 when `ADMIN_TOKEN` is unset or blank,
            401 when the presented header does not match it.
    """
    # Function-scope import: the module's import block is not editable from
    # this view, and `hmac` is only needed here.
    import hmac

    expected = os.environ.get("ADMIN_TOKEN", "").strip()
    if not expected:
        raise HTTPException(status_code=503, detail="Admin disabled")
    presented = request.headers.get("x-admin-token", "")
    # Constant-time comparison so response timing does not leak how many
    # leading characters of a guess were correct. Compare bytes, not str:
    # `hmac.compare_digest` raises TypeError on non-ASCII str input.
    if not hmac.compare_digest(
        presented.encode("utf-8"), expected.encode("utf-8")
    ):
        raise HTTPException(status_code=401, detail="Bad admin token")
    return "ok"
# backend/app/routers/admin_api_keys.py
# (new file, mode 100644, index 00000000..b16f0305, hunk @@ -0,0 +1,83 @@)
"""API key management — issue, list, rotate, revoke.

Backed by `services/api_keys_store.py`. The store handles hashing
and Mongo persistence; this router is the admin surface.

## Endpoints

    GET    /api/admin/api-keys              — list (without plaintext)
    POST   /api/admin/api-keys              — create — returns plaintext ONCE
    POST   /api/admin/api-keys/{id}/rotate  — issue new plaintext, invalidate old
    DELETE /api/admin/api-keys/{id}         — soft revoke (audit trail preserved)

## Single-show semantics

Creation and rotation are the only times the operator sees the
plaintext key. The response body contains it once with explicit
"this won't be shown again" language; the UI surfaces it as a
copy-button + warning banner. Hash is what's stored.

## What keys unlock

Once this PR lands AND the consumer-side wiring lands:

- Service accounts (e.g. Spire Compendium / Overwolf desktop app)
  authenticate as themselves rather than per-user-IP. Higher rate
  limits, revocable identity, accurate attribution.
- Power users opt into a "claimed" identity that survives across
  IP changes (so a user with VPN-rotation behaviour can still get
  consistent rate limits + run attribution).
- Third-party widget embedders get analytics on who's using the
  public API; we can revoke a single abuser without IP-banning.

## Future: scope strings

`scopes: ["runs:submit", "runs:read", "guides:read", "admin:*"]` —
intentionally not enforced in this sketch. The first PR ships with
all keys having full read+submit access; scope checking lands as a
follow-up once a real use case for narrower access shows up.
"""

from __future__ import annotations

from fastapi import APIRouter, Depends, HTTPException, Request

from ..dependencies import require_admin

router = APIRouter(
    prefix="/api/admin/api-keys",
    tags=["Admin"],
    dependencies=[Depends(require_admin)],
)


def _not_implemented() -> HTTPException:
    """Build the 501 placeholder error shared by every stub route here."""
    return HTTPException(status_code=501, detail="Not implemented yet")


@router.get("")
async def list_keys(request: Request):
    """Planned: list every key as hash + metadata, never plaintext.

    Query: `?include_revoked=false` (default).
    Stub for now — always answers 501.
    """
    raise _not_implemented()


@router.post("")
async def create_key(request: Request):
    """Planned: create a key and return its plaintext exactly once.

    Body: `{"owner": "spire-compendium", "owner_kind": "service",
    "scopes": ["runs:submit"], "rate_limit_override": null}`.

    The response is meant to carry the plaintext key — show it to the
    operator immediately; it's not recoverable after this response is
    closed. Stub for now — always answers 501.
    """
    raise _not_implemented()


@router.post("/{key_id}/rotate")
async def rotate_key(key_id: str, request: Request):
    """Planned: issue new plaintext for an existing `key_id`.

    The old plaintext is meant to stop working immediately; response
    shape same as create. Stub for now — always answers 501.
    """
    raise _not_implemented()


@router.delete("/{key_id}")
async def revoke_key(key_id: str, request: Request):
    """Planned: soft revoke — set `revoked=true` + `revoked_at` + `revoked_by`.

    The key doc is meant to stay for the audit trail, with subsequent
    presentations rejected by the lookup hot path. Stub for now —
    always answers 501.
    """
    raise _not_implemented()
# backend/app/routers/admin_audit.py
# (new file, mode 100644, index 00000000..c8ca5ed5, hunk @@ -0,0 +1,50 @@)
"""Audit log — read-only view of every admin action.

Records are written by `services/audit_log.py::record()` from each
admin write endpoint. This router is read-only — it never writes,
and there's deliberately no DELETE endpoint. An append-only log
that can be edited isn't an audit log.

## Endpoints

    GET /api/admin/audit
        Last 100 entries, newest first.
        Query: ?limit=100, ?since=ISO, ?actor=..., ?action=...

    GET /api/admin/audit/by-target/{target}
        Every entry referencing a specific run hash, slug, etc. — useful
        for "show me everything that's ever been done to run X."

## Retention

Append-only, no TTL on the application side. Mongo collection has a
date-based partial index for fast `?since=...` queries. For
long-term retention, run `playbooks/backup.yml` (existing) which
already snapshots Mongo dumps to B2.
"""

from __future__ import annotations

from fastapi import APIRouter, Depends, HTTPException, Request

from ..dependencies import require_admin

router = APIRouter(
    prefix="/api/admin/audit",
    tags=["Admin"],
    dependencies=[Depends(require_admin)],
)


def _not_implemented() -> HTTPException:
    """Build the 501 placeholder error shared by every stub route here."""
    return HTTPException(status_code=501, detail="Not implemented yet")


@router.get("")
async def list_audit(request: Request):
    """Planned: most recent audit entries, newest first.

    Query: ?limit=100, ?since=2026-05-20T00:00:00, ?actor=..., ?action=...
    Stub for now — always answers 501.
    """
    raise _not_implemented()


@router.get("/by-target/{target}")
async def by_target(target: str, request: Request):
    """Planned: all audit entries referencing one target.

    A target is a run hash, guide slug, username, rate-limit slug, etc.
    Stub for now — always answers 501.
    """
    raise _not_implemented()
# backend/app/routers/admin_bulk.py
# (new file, mode 100644, index 00000000..fccf5a10, hunk @@ -0,0 +1,101 @@)
"""Bulk operations — the "I'd rather click than re-write a script"
endpoints.

These are operator power tools: ops you'd otherwise do via a
one-shot Python script in tools/. Each is a long-running job that
returns a job id immediately and lets you poll status separately.

## Job pattern

Every bulk op writes a `bulk_jobs` doc:

    {
      "_id": "job_<id>",
      "kind": "rehash_runs",            # operation slug
      "params": {...},                  # original request body
      "status": "queued|running|done|failed",
      "started_at": ..., "finished_at": ...,
      "processed": 0, "total": null,    # progress counters
      "error": null,
      "actor": "peter",
    }

Endpoint kicks off the job in a background thread (FastAPI's
BackgroundTasks for the simple case, or a separate worker if jobs
ever get bigger). GET /jobs/{id} returns current status.

## Initial set of bulk ops

- `POST /api/admin/bulk/rehash-runs` — recompute run_hash for runs
  matching a filter (after a hash-formula change)
- `POST /api/admin/bulk/dedupe-runs` — find duplicates by
  (seed, character, start, run_time) and hide all but the oldest
- `POST /api/admin/bulk/reattach-files` — for runs missing JSON
  files on disk, attempt sibling-copy or synthesize from Mongo doc
- `POST /api/admin/bulk/import-beta-version` — parse a fresh beta
  PCK + DLL extraction into a versioned data-beta/ dir
- `POST /api/admin/bulk/recompute-scores` — full rebuild of
  spire_codex_entity_scores (Codex Score) — same as /ops/refresh-
  entity-scores but as a tracked job for visibility
"""

from __future__ import annotations

from fastapi import APIRouter, Depends, HTTPException, Request

from ..dependencies import require_admin

router = APIRouter(
    prefix="/api/admin/bulk",
    tags=["Admin"],
    dependencies=[Depends(require_admin)],
)


def _not_implemented() -> HTTPException:
    """Build the 501 placeholder error shared by every stub route here."""
    return HTTPException(status_code=501, detail="Not implemented yet")


@router.get("/jobs")
async def list_jobs(request: Request):
    """Planned: last N jobs across all kinds.

    Query: `?limit=50&kind=...&status=...`.
    Stub for now — always answers 501.
    """
    raise _not_implemented()


@router.get("/jobs/{job_id}")
async def get_job(job_id: str, request: Request):
    """Planned: poll one job's status + progress.

    Stub for now — always answers 501.
    """
    raise _not_implemented()


@router.post("/rehash-runs")
async def rehash_runs(request: Request):
    """Planned: recompute run_hash on the docs matching a filter.

    Body: `{"filter": {...}}`.
    Stub for now — always answers 501.
    """
    raise _not_implemented()


@router.post("/dedupe-runs")
async def dedupe_runs(request: Request):
    """Planned: hide duplicate runs, keeping the oldest of each group.

    Duplicates are keyed by (seed, character, start_time, run_time).
    Meant to return a job id, with results in the audit log.
    Stub for now — always answers 501.
    """
    raise _not_implemented()


@router.post("/reattach-files")
async def reattach_files(request: Request):
    """Planned: restore JSON files for runs missing them on disk.

    Try sibling-copy first, fall back to synthesizing a minimal blob
    from Mongo. Same logic as `/api/runs/shared/{hash}` does
    on-demand, batched. Stub for now — always answers 501.
    """
    raise _not_implemented()


@router.post("/import-beta-version")
async def import_beta_version(request: Request):
    """Planned: parse a fresh beta extraction into `data-beta/<version>/`.

    Body: `{"version": "v0.106.0"}`. Assumes extraction/beta/raw +
    decompiled are populated. Stub for now — always answers 501.
    """
    raise _not_implemented()


@router.post("/recompute-scores")
async def recompute_scores(request: Request):
    """Planned: full Codex Score rebuild as a tracked job.

    Same logic as the startup pre-warm and
    `/ops/refresh-entity-scores`, but progress is visible in
    /bulk/jobs. Stub for now — always answers 501.
    """
    raise _not_implemented()
# backend/app/routers/admin_integrations.py
# (new file, mode 100644, index 00000000..2b26308b, hunk @@ -0,0 +1,121 @@)
"""Outbound integration health + test-fire endpoints.

Every external dependency this app calls — Discord webhooks, Resend,
Sentry, GitHub App, Cloudflare API, IndexNow — can fail silently
when a token rotates or an upstream API changes shape. The admin
dashboard surfaces last-success + last-error timestamps + a button
to fire a test request, so credential rotation breakage is found in
seconds, not days.

## Health endpoint shape

    GET /api/admin/integrations
    →
    {
      "discord_feedback": {"last_ok": ISO, "last_error": ISO|null, "last_error_msg": "..."},
      "discord_guide":    {...},
      "resend":           {...},
      "sentry":           {...},
      "github_app":       {"last_ok": ISO, "token_expires_at": ISO|null, ...},
      "cloudflare":       {...},
      "indexnow":         {...},
    }

Status comes from a `integration_health` Mongo collection that each
existing outbound call writes to on success/failure (single
`upsert_one` per call, fire-and-forget).

## Test endpoints

    POST /api/admin/integrations/discord-feedback/test  → fires a
         "[test from admin dashboard at <timestamp>]" message
    POST /api/admin/integrations/resend/test            → sends a test
         email to UNINSTALL_FORWARD_TO
    POST /api/admin/integrations/github-app/test        → calls GitHub
         API /repos/<owner>/<repo> to verify the JWT signs valid
    POST /api/admin/integrations/cloudflare/test        → calls
         /zones/<zone_id> to verify the API token still authenticates
    POST /api/admin/integrations/indexnow/test          → pings one
         URL via IndexNow

All test endpoints write the result to the same integration_health
collection so the GET endpoint reflects the test outcome too.

## Why this matters

The breakage modes for these aren't loud — Discord webhooks return
404 silently, Resend's quota errors are 429s buried in logs, GitHub
App tokens expire after a year. The "things broke and nobody told
me" half-life is usually weeks, sometimes months. A one-click test
+ a one-glance panel collapses that to minutes.
"""

from __future__ import annotations

from fastapi import APIRouter, Depends, HTTPException, Request

from ..dependencies import require_admin

router = APIRouter(
    prefix="/api/admin/integrations",
    tags=["Admin"],
    dependencies=[Depends(require_admin)],
)


def _not_implemented() -> HTTPException:
    """Build the 501 placeholder error shared by every stub route here."""
    return HTTPException(status_code=501, detail="Not implemented yet")


@router.get("")
async def integration_health(request: Request):
    """Planned: latest success + failure timestamps per external dep.

    Includes token expiry where applicable. Intended to be one Mongo
    find, sub-ms. Stub for now — always answers 501.
    """
    raise _not_implemented()


@router.post("/discord-feedback/test")
async def test_discord_feedback(request: Request):
    """Planned: fire a test message at the FEEDBACK_WEBHOOK_URL.

    Body: `{"message": "..."}` (optional override — default is a
    timestamped probe). Stub for now — always answers 501.
    """
    raise _not_implemented()


@router.post("/discord-guide/test")
async def test_discord_guide(request: Request):
    """Planned: same as the feedback test, for the GUIDE_WEBHOOK_URL.

    Stub for now — always answers 501.
    """
    raise _not_implemented()


@router.post("/resend/test")
async def test_resend(request: Request):
    """Planned: send a test email via Resend to UNINSTALL_FORWARD_TO.

    Confirms the API key + the from-address are still valid.
    Stub for now — always answers 501.
    """
    raise _not_implemented()


@router.post("/sentry/test")
async def test_sentry(request: Request):
    """Planned: capture a synthetic exception via the Sentry SDK.

    Verifies SENTRY_DSN is still valid and events flow.
    Stub for now — always answers 501.
    """
    raise _not_implemented()


@router.post("/github-app/test")
async def test_github_app(request: Request):
    """Planned: call GitHub API /repos/<owner>/<repo> with the App JWT.

    Confirms knowledge-demon.private-key.pem is still installed,
    valid, and authorized on the configured repo.
    Stub for now — always answers 501.
    """
    raise _not_implemented()


@router.post("/cloudflare/test")
async def test_cloudflare(request: Request):
    """Planned: call CF /zones/{zone_id} with the stored API token.

    Verifies the token still authenticates and still has scope on the
    zone (rotation breaks this silently otherwise).
    Stub for now — always answers 501.
    """
    raise _not_implemented()


@router.post("/indexnow/test")
async def test_indexnow(request: Request):
    """Planned: ping IndexNow for one URL (e.g. the home page).

    Confirms api.indexnow.org is accepting our key.
    Stub for now — always answers 501.
    """
    raise _not_implemented()
# backend/app/routers/admin_moderation.py
# (new file, mode 100644, index 00000000..b3fa7934, hunk @@ -0,0 +1,93 @@)
"""Content moderation — hide individual runs, guides, usernames.

## Why "hide" instead of "delete"

Soft-delete pattern: every doc gets a `hidden_at`, `hidden_by`,
`hidden_reason` field set by the admin endpoint. Read paths add
`{"hidden_at": None}` (or `{$exists: false}`) to their match clauses.
Reasons:

- Undo is one update — restoring is `unset hidden_at`. Hard-delete
  has no undo.
- Audit log entry remains valid (it references the doc by id).
- Stats summary refresher already supports filtering by arbitrary
  match clause — adding `"hidden_at": None` is a one-line change.

## Categories

- `/api/admin/moderation/runs/{hash}/hide` — flag a single run
- `/api/admin/moderation/runs/{hash}/unhide` — restore
- `/api/admin/moderation/guides/{slug}/hide` — same for guides
- `/api/admin/moderation/usernames/{name}/clear` — strip a username
  from all runs that claimed it (e.g. impersonation report)
- `/api/admin/moderation/usernames/{name}/reassign` — reassign claimed
  hashes to a different name (dispute resolution)

## Read-path filtering (follow-up PR scope)

Every endpoint that lists runs needs a `"hidden_at": None` clause.
Doing that retroactively without breaking caches:

  1. Add the filter to runs_db_mongo.list_runs / leaderboard /
     get_stats / etc.
  2. Force-refresh stats_summary so the materialized doc reflects the
     filtered population.
  3. Purge CF cache for the affected list pages.
"""

from __future__ import annotations

from fastapi import APIRouter, Depends, HTTPException, Request

from ..dependencies import require_admin

router = APIRouter(
    prefix="/api/admin/moderation",
    tags=["Admin"],
    dependencies=[Depends(require_admin)],
)


def _not_implemented() -> HTTPException:
    """Build the 501 placeholder error shared by every stub route here."""
    return HTTPException(status_code=501, detail="Not implemented yet")


@router.post("/runs/{run_hash}/hide")
async def hide_run(run_hash: str, request: Request):
    """Planned: soft-hide a single run.

    Body: `{"reason": "impossible-time / sub-1-minute-win"}`.
    Meant to set `hidden_at`, `hidden_by`, `hidden_reason` on the
    Mongo doc.
    TODO: + audit log + stats_summary refresh trigger.
    Stub for now — always answers 501.
    """
    raise _not_implemented()


@router.post("/runs/{run_hash}/unhide")
async def unhide_run(run_hash: str, request: Request):
    """Planned: unset the hidden_* fields on a run.

    The run is meant to re-appear in leaderboards on the next stats
    refresh. Stub for now — always answers 501.
    """
    raise _not_implemented()


@router.post("/guides/{slug}/hide")
async def hide_guide(slug: str, request: Request):
    """Planned: soft-delete a community guide (same shape as hide_run).

    Body: `{"reason": "..."}`.
    Stub for now — always answers 501.
    """
    raise _not_implemented()


@router.post("/guides/{slug}/unhide")
async def unhide_guide(slug: str, request: Request):
    """Planned: restore a hidden guide.

    Stub for now — always answers 501.
    """
    raise _not_implemented()


@router.post("/usernames/{name}/clear")
async def clear_username(name: str, request: Request):
    """Planned: strip `name` from every run that claimed it.

    Impersonation response. The runs themselves stay; they just
    become anonymous again. Stub for now — always answers 501.
    """
    raise _not_implemented()


@router.post("/usernames/{name}/reassign")
async def reassign_username(name: str, request: Request):
    """Planned: bulk rewrite runs claimed under `name` to `new_name`.

    Body: `{"new_name": "..."}`. Used for dispute resolution when two
    players claim the same handle. Stub for now — always answers 501.
    """
    raise _not_implemented()
# backend/app/routers/admin_observability.py
# (new file, mode 100644, index 00000000..69948875, hunk @@ -0,0 +1,85 @@)
"""Recent activity feeds — errors, rate-limit hits, search.

These are read-only views over Prometheus + Mongo. Cuts
mean-time-to-discovery for a broken endpoint from "next time you
glance at Grafana" to "next time you open the admin dashboard."

## Sources

- **Recent errors**: re-uses the `spire_codex_api_errors_total`
  counter we already emit. The /metrics endpoint exposes counts per
  (method, path, status_code) — admin endpoint just sorts + paginates.
  For per-request *detail* we'd need to wire a separate ring buffer
  in `RequestLoggingMiddleware`; sketch a TODO for that.
- **Rate-limit hits**: same metric, filtered to `status_code="429"`.
  Per-IP detail requires a separate writer (the limiter doesn't log
  who got rejected). Sketch a small Mongo collection for this.
- **Search**: run hash / IP / seed / username free-text. The runs
  collection already has indexes on character / submitted_at /
  build_id; we'd add a sparse index on `client_ip` (new field —
  written by submit_run going forward) for IP search.
"""

from __future__ import annotations

from fastapi import APIRouter, Depends, HTTPException, Request

from ..dependencies import require_admin

router = APIRouter(
    prefix="/api/admin",
    tags=["Admin"],
    dependencies=[Depends(require_admin)],
)


def _not_implemented() -> HTTPException:
    """Build the 501 placeholder error shared by every stub route here."""
    return HTTPException(status_code=501, detail="Not implemented yet")


# ── Recent errors ────────────────────────────────────────────
@router.get("/errors")
async def list_errors(request: Request):
    """Planned: 4xx + 5xx counts in the last N minutes.

    Grouped by (method, path, status_code).
    Query: `?since=15m` (default), `?status=4xx|5xx|all`.
    Stub for now — always answers 501.
    """
    raise _not_implemented()


@router.get("/errors/recent")
async def recent_error_requests(request: Request):
    """Planned: last-N individual error requests with full request line.

    Requires the ring-buffer writer in RequestLoggingMiddleware to
    land first — without it, this endpoint can only report counts
    (see `/errors`) not specific requests.

    TODO: add a sized-deque writer in middleware that captures
    {ts, method, path, status, ip, user_agent} for the last ~5000
    error requests in process memory (no persistence — cheap, OK to
    lose on restart).

    Stub for now — always answers 501.
    """
    raise _not_implemented()


# ── Rate-limit hits ──────────────────────────────────────────
@router.get("/rate-limit-hits")
async def recent_rate_limit_hits(request: Request):
    """Planned: last N rate-limit rejections by (endpoint, client_ip).

    Tells you whether to raise (legit user hitting cap) or lower
    (scraper).

    Requires the limiter to record rejections somewhere. Cheapest:
    a small Mongo collection `rate_limit_events` with a TTL index
    expiring after 7d. Writer hooks `RateLimitExceeded` exception
    handler (already registered in main.py).

    Stub for now — always answers 501.
    """
    raise _not_implemented()


# ── Search ───────────────────────────────────────────────────
@router.get("/search/runs")
async def search_runs(request: Request):
    """Planned: free-text search across runs by hash / username / seed / IP.

    Query: `?q=<text>&kind=hash|username|seed|ip`. Meant to return
    matching run docs (capped at 100).
    Stub for now — always answers 501.
    """
    raise _not_implemented()


@router.get("/search/guides")
async def search_guides(request: Request):
    """Planned: title / author / body free-text search across guides.

    Stub for now — always answers 501.
    """
    raise _not_implemented()
# backend/app/routers/admin_ops.py
# (new file, mode 100644, index 00000000..c6270e49, hunk @@ -0,0 +1,160 @@)
"""Operational toggles — feature flags, cache purge, data refresh,
maintenance-mode banner.

## Feature flags

Backend already supports env-var kill switches (`DISABLE_RUN_SUBMISSIONS`
is the existing example). The flag table generalizes that: a Mongo
doc `app_config.flags` mapping `slug -> bool`, with the same 5s TTL
in-process cache pattern as `rate_limits_store`. Endpoints opt in by
calling `get_flag("submit_run_disabled", default=False)`.

Initial flag set:
- `submit_run_disabled`     — kill switch on POST /api/runs
- `claim_runs_disabled`     — kill switch on POST /api/runs/claim
- `read_only_mode`          — disable every write surface at once
- `maintenance_banner`      — site-wide banner text (separate from
                              flag bool — see below)

## Cache purge

You have `playbooks/purge-cache.yml`. This wraps the same CF API
call as a one-click admin endpoint — for the "I just re-rendered
QA cards, drop /qa from CF" or "the news article had a typo, purge
/news/<id>" cases.

Three variants:
- /api/admin/ops/purge       — POST body: {"urls": [...]}
- /api/admin/ops/purge-tag   — POST body: {"tag": "..."} (Enterprise feature)
- /api/admin/ops/purge-all   — DANGER, requires confirmation header

## Data refresh

Wraps existing service-layer functions:
- /api/admin/ops/refresh-stats          → calls refresh_stats_summary()
- /api/admin/ops/refresh-entity-scores  → calls run_entity_stats._build_cache()
- /api/admin/ops/refresh-news           → invokes news_parser inline (or via webhook)

## Site-wide banner — covers announcements + maintenance + incidents

One Mongo doc `app_config.banner` with `{message, level,
expires_at, link?}`. Frontend layout reads this at SSR time and
renders a top banner if present + unexpired. Set via this endpoint,
vacated by deletion or TTL expiry.

The `level` field carries the editorial intent:
- `level: "info"`  — patch / feature announcement
                     ("Major Update #2 just dropped — see changelog →")
- `level: "warn"`  — degraded state
                     ("Run submissions paused for ~5min during DB migration")
- `level: "error"` — incident in progress
                     ("Stats are stale — investigating")

Same endpoint, same data model, three visual treatments on the
frontend. Operator picks the level; `expires_at` enforces a
self-vacating window so an announcement doesn't sit on the page
forever if you forget to clear it.
"""

from __future__ import annotations

from fastapi import APIRouter, Depends, HTTPException, Request

from ..dependencies import require_admin

router = APIRouter(
    prefix="/api/admin/ops",
    tags=["Admin"],
    dependencies=[Depends(require_admin)],
)


def _not_implemented() -> HTTPException:
    """Build the 501 placeholder error shared by every stub route here."""
    return HTTPException(status_code=501, detail="Not implemented yet")


# ── Feature flags ────────────────────────────────────────────
@router.get("/flags")
async def list_flags(request: Request):
    """Planned: list every registered flag + current value + default.

    Stub for now — always answers 501.
    """
    raise _not_implemented()


@router.put("/flags/{slug}")
async def set_flag(slug: str, request: Request):
    """Planned: set a flag, e.g. to activate a kill switch.

    Body: `{"value": true}`.
    Stub for now — always answers 501.
    """
    raise _not_implemented()


@router.delete("/flags/{slug}")
async def clear_flag(slug: str, request: Request):
    """Planned: revert the flag to its hardcoded default.

    Stub for now — always answers 501.
    """
    raise _not_implemented()


# ── Cache purge (wraps Cloudflare API) ───────────────────────
@router.post("/purge")
async def purge_urls(request: Request):
    """Planned: purge specific URLs from the Cloudflare cache.

    Body: `{"urls": ["https://spire-codex.com/news/123", ...]}`.
    POST to CF /zones/.../purge_cache with `files=[...]`.
    Stub for now — always answers 501.
    """
    raise _not_implemented()


@router.post("/purge-all")
async def purge_everything(request: Request):
    """Planned: DANGER — wipe the entire zone's edge cache.

    Requires `X-Confirm: purge-all` header on top of the admin token.
    Uses one of the 5 daily purge-everything quotas on the Free plan.
    Stub for now — always answers 501.
    """
    raise _not_implemented()


# ── Data refresh ─────────────────────────────────────────────
@router.post("/refresh-stats")
async def refresh_stats(request: Request):
    """Planned: force the stats_summary materialization to re-run NOW.

    Instead of waiting for the 60s refresher tick. Useful after bulk
    moderation or back-fill imports.
    Stub for now — always answers 501.
    """
    raise _not_implemented()


@router.post("/refresh-entity-scores")
async def refresh_entity_scores(request: Request):
    """Planned: rebuild the Codex Score cache.

    Slow (~5-10s); blocks on completion. Same as the startup pre-warm.
    Stub for now — always answers 501.
    """
    raise _not_implemented()


@router.post("/refresh-news")
async def refresh_news(request: Request):
    """Planned: run news_parser inline.

    Normally a cron does this every 6 hours — manual trigger is for
    when a hot Steam announcement drops and you don't want to wait.
    Stub for now — always answers 501.
    """
    raise _not_implemented()


# ── Site-wide banner (announcements + maintenance + incidents) ──
@router.get("/banner")
async def get_banner(request: Request):
    """Planned: current banner (if any), plus internal metadata.

    Frontend layout reads the public-side of this via /api/banner
    (separate, unauthenticated) — admin GET returns the same shape
    plus internal metadata (created_by, etc.).
    Stub for now — always answers 501.
    """
    raise _not_implemented()


@router.put("/banner")
async def set_banner(request: Request):
    """Planned: set the site-wide banner.

    Body shape:
        {
          "message": "Major Update #2 just dropped — see changelog →",
          "level": "info" | "warn" | "error",
          "expires_at": "2026-05-21T00:00:00Z",
          "link": "/changelog#1.0.7"   // optional, clickable
        }
    Banner auto-vanishes after `expires_at` so an announcement
    doesn't outlive its relevance if you forget to clear it. Use
    `level: "info"` for patch announcements, `warn` for degraded
    state, `error` for active incidents — frontend renders three
    visual treatments.

    Stub for now — always answers 501.
    """
    raise _not_implemented()


@router.delete("/banner")
async def clear_banner(request: Request):
    """Planned: drop the banner immediately.

    Stub for now — always answers 501.
    """
    raise _not_implemented()
# backend/app/routers/admin_query.py
# (new file, mode 100644, index 00000000..9412c414, hunk @@ -0,0 +1,132 @@)
"""Locked-down Mongo read-only query console.

Lets the operator answer one-off questions ("how many wins by
Necrobinder since the last patch", "every run with seed starting
'21_04_2026'") without dropping into mongosh on the box or writing
a throwaway Python script.

## Threat model

The admin token gate + CF Access OAuth already establish operator
identity. The remaining concern is *operator footgun* — accidentally
running `{$out: "runs"}` and wiping a collection, or `count` on an
unbounded set that locks the worker. Defense:

1. **Whitelist-only operations**. The endpoint only accepts:
     find, find_one, count_documents, aggregate, distinct
   Everything else (insert*, update*, delete*, drop*, etc.) is
   rejected before touching pymongo.

2. **Whitelist-only collections**. `runs`, `stats_summary`,
   `bulk_jobs`, `admin_audit` — and that's it for now. New
   collections need an explicit allow.

3. **Result cap**. Hard limit of 100 documents in the response, no
   matter what limit the operator passed. Counts are exact, finds
   are sampled.

4. **Aggregation pipeline filtering**. `$out`, `$merge`,
   `$function`, `$accumulator`, `$where` (server-side JS), `$lookup`
   targeting non-whitelisted collections — all rejected by a static
   pre-pass before sending to Mongo.

5. **Query timeout**. `maxTimeMS: 5000` on every call so a runaway
   aggregation can't pin a worker.

6. **Audit log entry per query**. Every query (filter + pipeline +
   result count) is written to `admin_audit` so future-you can see
   what you ran last Tuesday at 3am.

## Endpoints

    POST /api/admin/query/find
        Body: {"collection": "runs", "filter": {...}, "projection": {...}, "limit": 100}

    POST /api/admin/query/count
        Body: {"collection": "runs", "filter": {...}}

    POST /api/admin/query/aggregate
        Body: {"collection": "runs", "pipeline": [...], "limit": 100}

    POST /api/admin/query/distinct
        Body: {"collection": "runs", "field": "character", "filter": {...}}

    GET    /api/admin/query/saved
    POST   /api/admin/query/saved
    DELETE /api/admin/query/saved/{name}
        Lets you bookmark a query you'll re-run (e.g. "weekly
        cheater-candidate scan: deck_size > 200 sorted by submitted_at desc").
        Stored in `saved_queries` collection.
"""

from __future__ import annotations

from fastapi import APIRouter, Depends, HTTPException, Request

from ..dependencies import require_admin

router = APIRouter(
    prefix="/api/admin/query",
    tags=["Admin"],
    dependencies=[Depends(require_admin)],
)


# Whitelists enforced at request time. Keeping them at module scope so
# they show up in repo grep — easier to audit than runtime config.
ALLOWED_COLLECTIONS = {"runs", "stats_summary", "bulk_jobs", "admin_audit"}
ALLOWED_OPS = {"find", "find_one", "count_documents", "aggregate", "distinct"}
# NOTE(review): `$lookup` is deliberately absent here — the module docstring
# says it is allowed when its `from` collection is whitelisted. `$unionWith`
# and `$graphLookup` also name a collection, so whoever implements the
# pre-pass should give them the same allow-list check — confirm.
FORBIDDEN_PIPELINE_STAGES = {
    "$out",
    "$merge",
    "$function",
    "$accumulator",
    "$where",
}
MAX_RESULT_DOCS = 100
QUERY_TIMEOUT_MS = 5000


def _not_implemented() -> HTTPException:
    """Build the 501 placeholder error shared by every stub route here."""
    return HTTPException(status_code=501, detail="Not implemented yet")


@router.post("/find")
async def query_find(request: Request):
    """Planned: read a few documents matching the filter. Capped at 100 hard.

    Stub for now — always answers 501.
    """
    raise _not_implemented()


@router.post("/count")
async def query_count(request: Request):
    """Planned: exact count_documents for the filter. Cheap; no result cap.

    Stub for now — always answers 501.
    """
    raise _not_implemented()


@router.post("/aggregate")
async def query_aggregate(request: Request):
    """Planned: run an aggregation pipeline.

    Pipeline pre-validated against FORBIDDEN_PIPELINE_STAGES; $lookup
    `from` collection must be in ALLOWED_COLLECTIONS too. Results
    capped, 5s timeout. Stub for now — always answers 501.
    """
    raise _not_implemented()


@router.post("/distinct")
async def query_distinct(request: Request):
    """Planned: distinct field values for the matched docs.

    Useful for the "what build_ids have we seen this month" class of
    question. Stub for now — always answers 501.
    """
    raise _not_implemented()


@router.get("/saved")
async def list_saved(request: Request):
    """Planned: list the operator's bookmarked queries.

    Stub for now — always answers 501.
    """
    raise _not_implemented()


@router.post("/saved")
async def save_query(request: Request):
    """Planned: bookmark a query for re-use.

    Body: `{"name": "...", "op": "find", "collection": "runs",
    "filter": {...}}`. Stub for now — always answers 501.
    """
    raise _not_implemented()


@router.delete("/saved/{name}")
async def delete_saved(name: str, request: Request):
    """Planned: drop a saved query.

    Stub for now — always answers 501.
    """
    raise _not_implemented()


# ---------------------------------------------------------------------------
# Next file in this diff: backend/app/routers/admin_rate_limits.py
# (new file, mode 100644, index 00000000..6b43500e, hunk @@ -0,0 +1,101 @@).
# Only the opening of its module docstring is visible in this chunk; that
# text is preserved below and continues past the end of this view.
#
#   Admin endpoint for runtime rate-limit overrides.
#
#   ## Defense in depth
#
#   Two layers gate this surface:
#
#   1. **Cloudflare Access** at the edge. The admin subdomain is added
#      to a CF Access application that requires OAuth login (Google,
#      GitHub, etc.). CF only forwards the request to the origin once
#      the user is authenticated. Unauthenticated traffic never reaches
#      us — see `infrastructure/ansible/playbooks/admin-install.yml`.
#
#   2. (continues beyond this chunk)
# ---------------------------------------------------------------------------
**X-Admin-Token header** verified here. Defense in depth: if CF + Access is ever bypassed (misconfig, leaked Tunnel cert), the + token check still rejects unauthenticated callers. Token comes + from `ADMIN_TOKEN` env var, sourced from 1Password. + +The combination lets us reuse the existing token-checked admin +pattern AND give ourselves a graphical UI behind OAuth without +inventing session management. + +## Endpoints (sketched, not yet implemented) + + GET /api/admin/rate-limits → list every (slug, current, default) + PUT /api/admin/rate-limits/{slug} → body: {"limit": "3000/hour"} + DEL /api/admin/rate-limits/{slug} → revert to hardcoded default + +## Routing + +`/api/admin/*` is served by the same FastAPI backend, not a separate +container. The "admin container" the deploy playbook talks about is +just a tiny static UI that consumes these endpoints — separating UI +from API keeps the API on the existing scale-tested path while the +UI can be iterated on without backend redeploys. + +TODO before merging the follow-up: +- [ ] Implement GET — return slugs from REGISTRY + overrides from store +- [ ] Implement PUT — validate limit string via slowapi's parser +- [ ] Implement DEL — clear from store +- [ ] Wire `submit_run` first as the smoke test target +- [ ] Add a `recent_changes` collection so the UI can show audit log +""" + +from __future__ import annotations + +from fastapi import APIRouter, Depends, HTTPException, Request + +from ..dependencies import require_admin + +router = APIRouter( + prefix="/api/admin/rate-limits", + tags=["Admin"], + dependencies=[Depends(require_admin)], +) + + +# Slugs each decorated endpoint will register itself under. The +# registry lets the GET endpoint list every overridable limit even +# if no override is set yet. Populated at module-import time once +# the rewrite of the actual decorators lands. 
+REGISTRY: dict[str, str] = { + # "submit_run": "3000/hour", + # "claim_runs": "10/minute", + # "list_runs": "120/minute", + # "shared_run": "60/minute", + # "feedback": "5/minute", + # "guide_submit": "3/minute", + # ... +} + + +@router.get("") +def list_limits(request: Request): + """Return every registered limit + any active override. + + Response shape: + { + "limits": [ + {"slug": "submit_run", "default": "3000/hour", "override": null}, + {"slug": "feedback", "default": "5/minute", "override": "10/minute"}, + ... + ] + } + """ + # TODO: implement once REGISTRY is populated. + raise HTTPException(status_code=501, detail="Not implemented yet") + + +@router.put("/{slug}") +async def set_limit(slug: str, request: Request): + """Set a runtime override. Body: `{"limit": "3000/hour"}`.""" + # TODO: validate slug ∈ REGISTRY, validate body.limit parses, + # call rate_limits_store.set_override. + raise HTTPException(status_code=501, detail="Not implemented yet") + + +@router.delete("/{slug}") +def clear_limit(slug: str, request: Request): + """Revert a slug to its hardcoded default.""" + # TODO: call rate_limits_store.clear_override. + raise HTTPException(status_code=501, detail="Not implemented yet") diff --git a/backend/app/routers/admin_schedules.py b/backend/app/routers/admin_schedules.py new file mode 100644 index 00000000..88ebf6ef --- /dev/null +++ b/backend/app/routers/admin_schedules.py @@ -0,0 +1,89 @@ +"""Scheduled task viewer — when did each cron / GH Actions / systemd +timer / backend background job last run, with what result. + +## What we schedule today + +1. **GitHub Actions cron workflows** (in `.github/workflows/`): + - `news-refresh.yml` — every 6h, parses Steam news into data/news/ + - `runs-db-backup.yml` — nightly snapshot of the runs DB + +2. 
**Backend background daemons** (in-process threads): + - `stats_summary` refresher — every 60s, materializes + /api/runs/stats output (see `routers/runs.py::start_stats_refresher`) + - `run-entity-stats` warm-up — once at startup, then opportunistic + refresh + +3. **Systemd / OS-level timers on the DO box**: + - `logrotate.timer` — nightly + - (nothing else app-specific currently) + +The dashboard surface unifies these into one panel: "everything +that's supposed to run on a schedule, and when it last ran." + +## Sources + +- **GH Actions**: `gh api /repos/{owner}/{repo}/actions/workflows` + + `/runs?per_page=1&status=completed` per workflow. ~5 cron + workflows × 1 API call each → ~250ms total, cached for 60s. +- **Backend daemons**: each writes its last-success timestamp to a + `schedule_health` Mongo doc on every tick. Read here. +- **Systemd timers** on the host: harder to query from inside the + container without exposing the host. Punt for v1 — admin can + `systemctl list-timers` on the box if they need it. + +## Endpoints + + GET /api/admin/schedules + → + { + "github_actions": [ + {"name": "news-refresh", "last_run_at": ISO, "conclusion": "success", + "next_estimated_at": ISO}, + ... + ], + "backend_daemons": [ + {"name": "stats_summary_refresher", "last_tick_at": ISO, + "interval_seconds": 60, "lag_seconds": 12.4}, + ... + ] + } + + POST /api/admin/schedules/{name}/trigger-now + Where supported (GH workflows have a `workflow_dispatch` event) + — kicks off the workflow immediately. Returns the new run id. + +## What this catches + +The half-life on "a scheduled job stopped running but nobody +noticed" is usually as long as it takes for someone to notice the +*output* missing. For news: a week. For backups: until you need a +restore. This panel makes the silence audible. 
+""" + +from __future__ import annotations + +from fastapi import APIRouter, Depends, HTTPException, Request + +from ..dependencies import require_admin + +router = APIRouter( + prefix="/api/admin/schedules", + tags=["Admin"], + dependencies=[Depends(require_admin)], +) + + +@router.get("") +async def list_schedules(request: Request): + """Combined view of GH Actions cron workflows + backend daemons. + See module docstring for the response shape.""" + raise HTTPException(status_code=501, detail="Not implemented yet") + + +@router.post("/{name}/trigger-now") +async def trigger_now(name: str, request: Request): + """Force an immediate run of a scheduled job. Only works for + things that support manual triggers (GitHub Actions via + workflow_dispatch; backend daemons via an explicit wake-up + method on the daemon thread).""" + raise HTTPException(status_code=501, detail="Not implemented yet") diff --git a/backend/app/routers/admin_umami.py b/backend/app/routers/admin_umami.py new file mode 100644 index 00000000..641ab5bd --- /dev/null +++ b/backend/app/routers/admin_umami.py @@ -0,0 +1,78 @@ +"""Umami analytics passthrough for the admin dashboard. + +Wraps `services/umami_client.py`. Backend holds Umami credentials +(env-injected from 1P) so the admin dashboard never sees them — it +just calls our backend with the admin token, we call Umami on its +behalf. + +## Why two layers of caching + +- `umami_client.py` caches each Umami response (5-60s TTL per endpoint) + to keep dashboard polling cheap. +- These admin endpoints sit behind the existing CF cache rules but + we explicitly set `Cache-Control: no-store` since stats need to + feel live to the operator. + +## Response shapes (all flat, ready to render) + + GET /api/admin/umami/active → {"count": 12} + GET /api/admin/umami/summary?period=24h + → {pageviews, visitors, visits, ...} + GET /api/admin/umami/top-pages → [{path, count}, ...] + GET /api/admin/umami/top-referrers → [{referrer, count}, ...] 
+ GET /api/admin/umami/countries → [{code, count}, ...] + GET /api/admin/umami/browsers → {chrome: N, firefox: N, ...} + +The dashboard wires these directly into the panels — no +re-shaping needed. +""" + +from __future__ import annotations + +from fastapi import APIRouter, Depends, HTTPException, Request + +from ..dependencies import require_admin + +router = APIRouter( + prefix="/api/admin/umami", + tags=["Admin"], + dependencies=[Depends(require_admin)], +) + + +@router.get("/active") +async def active_visitors(request: Request): + """Concurrent visitors right now. Dashboard polls every ~10s.""" + raise HTTPException(status_code=501, detail="Not implemented yet") + + +@router.get("/summary") +async def summary(request: Request): + """Headline stats for the period. + Query: `?period=1h|24h|7d|30d` (default 24h).""" + raise HTTPException(status_code=501, detail="Not implemented yet") + + +@router.get("/top-pages") +async def top_pages(request: Request): + """Most-viewed URLs in the period. Query: `?period=24h&limit=20`.""" + raise HTTPException(status_code=501, detail="Not implemented yet") + + +@router.get("/top-referrers") +async def top_referrers(request: Request): + """Inbound traffic sources. Query: `?period=24h&limit=20`.""" + raise HTTPException(status_code=501, detail="Not implemented yet") + + +@router.get("/countries") +async def countries(request: Request): + """Visitor distribution by country. Query: `?period=24h&limit=20`.""" + raise HTTPException(status_code=501, detail="Not implemented yet") + + +@router.get("/browsers") +async def browsers(request: Request): + """Browser breakdown — useful for spotting bot waves (sudden + headless-chrome spike). 
Query: `?period=24h`.""" + raise HTTPException(status_code=501, detail="Not implemented yet") diff --git a/backend/app/routers/hall_of_shame.py b/backend/app/routers/hall_of_shame.py new file mode 100644 index 00000000..85d8e16e --- /dev/null +++ b/backend/app/routers/hall_of_shame.py @@ -0,0 +1,61 @@ +"""Public-facing inverse leaderboard for moderated runs. + +The complement to `admin_moderation.py`: that one writes the +`hidden_at` field, this one exposes the population of hidden runs to +the public. Two surfaces share the same data with mirrored filter +clauses: + + Regular leaderboard: match {hidden_at: None} + Hall of Shame: match {hidden_at: {$ne: None}} + +Why public-readable: the entertainment value depends on visibility, +and the transparency cuts both ways — anyone curious why a specific +run was hidden can see the moderator's stated reason next to it. + +## False-positive policy + +Strictly admin-curated. No auto-flagging fed into this surface +without a human reviewing. Automated detection (impossibly short +times, oversized decks for floor count) can populate a *separate* +admin-only review queue (sketch TODO), but the public Hall of Shame +only shows entries an admin manually confirmed. + +## Reason field is required + visible + +Every entry shows the `hidden_reason` string verbatim. A vague +"looks suspicious" is mean-spirited; "deck size: 200, max possible: +~80" is fair game. The moderation endpoint validates that +`hidden_reason` is non-empty before letting a run land here. + +## Endpoint shape + + GET /api/runs/hall-of-shame + ?category=fastest|highest_ascension (mirror normal leaderboard) + ?players=single|multi + ?game_mode=standard|daily|custom + ?character=... + ?page=N&limit=20 + + Response is the standard leaderboard shape plus + `hidden_reason` + `hidden_at` per entry. 
+""" + +from __future__ import annotations + +from fastapi import APIRouter, HTTPException, Request + +router = APIRouter(prefix="/api/runs/hall-of-shame", tags=["Runs"]) + + +@router.get("") +async def list_hall_of_shame(request: Request): + """Inverse of /api/runs/leaderboard — returns runs where + `hidden_at` is set. Same query params, same response shape, plus + `hidden_reason` + `hidden_at` per entry. + + TODO: implement in the follow-up that wires `admin_moderation` + to actually set `hidden_at`. Until that lands, this returns + 501 rather than an empty list — the `/leaderboards/hall-of-shame` + page is just hidden from users entirely. + """ + raise HTTPException(status_code=501, detail="Not implemented yet") diff --git a/backend/app/routers/runs.py b/backend/app/routers/runs.py index ac98b009..b224b132 100644 --- a/backend/app/routers/runs.py +++ b/backend/app/routers/runs.py @@ -49,18 +49,22 @@ def _load_run_blob(run_hash: str) -> str | None: @router.post("", tags=["Runs"]) -@limiter.limit("600/hour") +@limiter.limit("3000/hour") async def submit_run_endpoint(request: Request, username: str | None = None): """Submit a run for community stats. Paste the .run file JSON content. Optional ?username= param. - Rate limit: 600/hour (~10/min sustained, but allows bursts). Sized - for the Overwolf-launch scenario where a desktop-app user has a - backlog of hundreds of saved runs and wants to upload them all - after first install. The previous default of 60/min would have - forced a power user with 200 runs to wait ~4 minutes mid-upload. - Duplicate detection via run_hash UNIQUE constraint short-circuits - re-submission attempts, so the practical write load is bounded by - actual distinct runs per uploader. + Rate limit: 3000/hour (~50/min sustained, with room for burst).
The + earlier 600/hour ceiling was sized for "a few hundred backlog runs + on first install" but silently capped users with larger histories + — a Discord report of someone with 1000+ saved runs would have + dropped 400 of them at 600/hour. Each submission is ~10ms backend + work (Mongo insert + JSON file write + metrics bump) and duplicate + detection short-circuits at ~3ms via the run_hash UNIQUE + constraint, so actual write load is bounded by *distinct* runs per + uploader, not raw submission count. Easy to lower again if scraper + abuse shows up in + `spire_codex_api_errors_total{status_code="429"}` against this + endpoint. """ if os.environ.get("DISABLE_RUN_SUBMISSIONS"): run_errors.labels(reason="disabled").inc() diff --git a/backend/app/services/api_keys_store.py b/backend/app/services/api_keys_store.py new file mode 100644 index 00000000..d4eb382a --- /dev/null +++ b/backend/app/services/api_keys_store.py @@ -0,0 +1,153 @@ +"""API key issuance, lookup, and revocation. + +Lets the platform offer per-key identity instead of per-IP. Unlocks: +- Service-account submissions from the Overwolf desktop app +- Per-key rate-limit overrides (heavy-uploader users don't share the + IP bucket with NAT'd housemates) +- Targeted revocation (kill one abuser without IP-banning their ISP) +- Attribution for third-party widget consumers + +## Document shape (`api_keys` collection) + + { + "_id": "k_", # public key id (safe to log) + "key_hash": "", # plain text NEVER stored + "owner": "spire-compendium", # operator-visible label + "owner_kind": "service|user", + "scopes": ["runs:submit", "guides:read"], + "rate_limit_override": null, # optional per-key override, e.g. 
"10000/hour" + "created_at": ISODate(...), + "created_by": "peter", # admin who issued + "last_used_at": ISODate(...), + "revoked": false, + "revoked_at": null, + "revoked_by": null, + } + +## Format presented to clients + + sk_codex_ ─ prefix-tagged so leaks are + scannable in logs and via GitHub + secret-scanning + +Cloudflare offers a free secret-scanning service that catches your +prefix in public repos before it gets indexed. Worth registering +`sk_codex_` once the format is final. + +## Storage + +Plain key text is shown ONCE at creation time, never persisted. +We store sha256(plain_text). Hash, not bcrypt/argon2 — keys are +high-entropy random (256 bits) so a slow hash buys nothing; sha256 +keeps lookup O(1). + +## Lookup hot path + +For every request that presents a key: + 1. sha256 the presented key + 2. coll.find_one({"key_hash": h, "revoked": False}) + 3. Bump last_used_at (fire-and-forget, no await) + 4. Return owner + scopes + rate_limit_override + +Indexed on (key_hash) so lookup is single-digit ms. Cache the +mapping in-process for ~30s to make hot keys ~free. + +## Integration points (future PRs) + +- `app/dependencies.py::client_ip` → extend with a sibling + `bearer_subject(request) -> tuple[kind, id, override]` that returns + the API key's owner + per-key override if present. Routes that + authenticate use that as the rate-limit bucket key + identity. +- `app/services/rate_limits_store.py::get_limit` → check for a + per-key override before the slug-based override. +""" + +from __future__ import annotations + +import hashlib +from typing import Optional + + +PREFIX = "sk_codex_" +KEY_BYTES = 32 # 256 bits of entropy, base32 ≈ 52 chars + + +def generate() -> tuple[str, str]: + """Return (public_id, plaintext_key). Persist hashed form via + `create()`; never store plaintext. Show plaintext to the operator + once at creation, then drop it. + + TODO: implement. 
Sketch: + plain = PREFIX + secrets.token_urlsafe(KEY_BYTES) + key_id = "k_" + secrets.token_urlsafe(16) + return key_id, plain + """ + raise NotImplementedError + + +def hash_key(plain: str) -> str: + """sha256 of the presented key. High-entropy random key → slow + hash buys nothing; speed buys cheap per-request lookup.""" + return hashlib.sha256(plain.encode()).hexdigest() + + +def create( + owner: str, + owner_kind: str, + scopes: list[str], + created_by: str, + rate_limit_override: Optional[str] = None, +) -> tuple[str, str]: + """Issue a new key. Returns (public_id, plaintext_key) — caller + MUST present plaintext to the operator immediately; it can't be + recovered later. + + TODO: implement. Sketch: + key_id, plain = generate() + coll = _get_db().api_keys + coll.insert_one({ + "_id": key_id, "key_hash": hash_key(plain), + "owner": owner, "owner_kind": owner_kind, + "scopes": scopes, "rate_limit_override": rate_limit_override, + "created_at": datetime.utcnow(), "created_by": created_by, + "last_used_at": None, "revoked": False, + }) + return key_id, plain + """ + raise NotImplementedError + + +def lookup_by_plain(plain: str) -> Optional[dict]: + """Hot path. Returns the key doc (minus key_hash) or None. + + TODO: implement with 30s in-process cache keyed on hash.""" + raise NotImplementedError + + +def revoke(key_id: str, by: str) -> None: + """Soft-revoke. We don't delete so the audit log + last_used_at + timestamps survive.""" + raise NotImplementedError + + +def list_keys(include_revoked: bool = False) -> list[dict]: + """Admin UI listing. Returns docs without `key_hash`.""" + raise NotImplementedError + + +def rotate(key_id: str, by: str) -> tuple[str, str]: + """Issue a new plaintext for the same key_id, invalidating the + old. Returns (key_id, new_plaintext). Useful when a key was + leaked or its owner needs a fresh secret without re-issuing. + + TODO: implement. 
Sketch: + new_plain = PREFIX + secrets.token_urlsafe(KEY_BYTES) + coll.update_one( + {"_id": key_id, "revoked": False}, + {"$set": {"key_hash": hash_key(new_plain), + "rotated_at": datetime.utcnow(), + "rotated_by": by}} + ) + return key_id, new_plain + """ + raise NotImplementedError diff --git a/backend/app/services/audit_log.py b/backend/app/services/audit_log.py new file mode 100644 index 00000000..b068a4da --- /dev/null +++ b/backend/app/services/audit_log.py @@ -0,0 +1,78 @@ +"""Append-only audit log for every admin action. + +Single Mongo collection (`admin_audit`) with one document per +operator action. Append-only by convention — the write path here is +the only thing that writes, and only the GET endpoint in +`admin_audit.py` reads. + +## Document shape + + { + "_id": ObjectId(...), # natural ordering by insertion time + "ts": ISODate(...), + "actor": "peter@ptrlrd.com", # from CF Access JWT once wired; today + # we tag with "admin" since + # X-Admin-Token doesn't carry identity + "action": "rate_limits.set", # dotted slug: . + "target": "submit_run", # human-readable, varies per action + "before": "600/hour", # nullable + "after": "3000/hour", # nullable + "request_id": "...", # request_id middleware tag (TBD) + "ip": "1.2.3.4", # caller IP via client_ip() + } + +## Why a separate collection + +- Independent retention. Run docs are forever; audit entries can age + out after 1y to keep the working set lean. +- Independent indexes. Audit reads are always recent-N or by-actor, + not by run-hash, so the index shape diverges from the runs + collection. +- One write per admin action — cheap. 
+ +## Read patterns + +- "Show me the last 100 admin actions" +- "Show me what `peter` did in the last 24h" +- "Show me every change to rate-limit `submit_run`" +""" + +from __future__ import annotations + +from typing import Any + + +def record( + actor: str, + action: str, + target: str | None = None, + before: Any = None, + after: Any = None, + ip: str | None = None, +) -> None: + """Append one entry to the audit log. Best-effort — if Mongo is + unreachable we log the failure but DON'T fail the calling admin + action. Better to apply a config change and lose the audit + record than block the operator from acting during an outage. + + TODO: implement. Sketch: + coll = _get_db().admin_audit + coll.insert_one({ + "ts": datetime.utcnow(), + "actor": actor, "action": action, "target": target, + "before": before, "after": after, "ip": ip, + }) + """ + raise NotImplementedError("Audit write path lands with the first admin endpoint.") + + +def list_recent(limit: int = 100, since_iso: str | None = None) -> list[dict]: + """Return the most recent `limit` entries, newest first. + + TODO: implement once the writer exists. Sketch: + coll = _get_db().admin_audit + q = {} + if since_iso: q["ts"] = {"$gte": datetime.fromisoformat(since_iso)} + return list(coll.find(q).sort("ts", -1).limit(limit)) + """ + raise NotImplementedError diff --git a/backend/app/services/rate_limits_store.py b/backend/app/services/rate_limits_store.py new file mode 100644 index 00000000..17af91af --- /dev/null +++ b/backend/app/services/rate_limits_store.py @@ -0,0 +1,136 @@ +"""Mongo-backed runtime rate limit config with in-process TTL cache. + +The default rate limits on `/api/runs`, `/api/feedback`, etc. live as +literal strings in `@limiter.limit("3000/hour")` decorators. Changing +them requires a code commit + CI build + deploy (~15 min). For one +or two cases (Discord report of a 1000-run backlog) that's fine; for +ongoing tuning it's friction. 
+ +This store lets an admin override any registered limit at runtime by +writing to a single Mongo doc, and lets the per-request limit lookup +be ~free (TTL cache in front of the doc read). + +## Document shape + + // collection: rate_limits, _id: "config" + { + "_id": "config", + "updated_at": ISODate(...), + "updated_by": "peter", // operator identifier (audit only) + "limits": { + "submit_run": "3000/hour", + "claim_runs": "10/minute", + "list_runs": "120/minute", + "shared_run": "60/minute", + ... + } + } + +Keys are operation slugs (NOT route paths). Each slowapi-decorated +endpoint registers under a slug; this module is the source of truth +for current limit strings keyed by slug. + +## Cache + +Every limit lookup hits this module per request. Going to Mongo on +every request would add 1-3ms per hop, so the doc is cached for +TTL_SECONDS (default 5s). 5s is a deliberate trade-off: +- Long enough to absorb 100 req/s of in-flight traffic without + re-reading +- Short enough that an admin's "lower this NOW" change goes live + within one cache tick + +## Integration + +Endpoints opt in by replacing + @limiter.limit("3000/hour") +with + @limiter.limit(lambda: get_limit("submit_run", default="3000/hour")) + +The default string is always required as a fallback for when Mongo +is unreachable or the slug hasn't been overridden — same shape as +the hardcoded value so nothing regresses if this whole module +crashes silently. + +TODO: +- [ ] Implement `_fetch_from_mongo` once admin router lands and we + know the actual doc shape we want to persist (sections, audit + log structure, etc.) +- [ ] Wire `submit_run` as the first endpoint to use this — only + after the admin write path is tested +""" + +from __future__ import annotations + +import os +import time +from typing import Any + +# In-process cache. Shared across uvicorn workers? No — each worker +# has its own copy. 
That's fine; the 5s TTL means at worst each +# worker is briefly out of sync with the others (irrelevant for rate +# limiting since limits are per-IP-per-worker anyway). +_cache: dict[str, Any] = {"limits": {}, "fetched_at": 0.0} +TTL_SECONDS = 5.0 + + +def get_limit(slug: str, default: str) -> str: + """Return the current limit string for `slug`, or `default`. + + Called from the slowapi limit callable on every decorated request, + so this must be cheap. The TTL cache means we hit Mongo at most + once per TTL_SECONDS per worker (so ~12 reads/min per worker, ~48 + across 4 workers, at the current 5s TTL). + """ + now = time.monotonic() + if now - _cache["fetched_at"] > TTL_SECONDS: + _refresh_cache() + return _cache["limits"].get(slug, default) + + +def _refresh_cache() -> None: + """Pull the latest config doc from Mongo. Failures fall through to + the previous cache + an updated fetched_at, so a Mongo outage + doesn't trigger constant refresh attempts on every request.""" + # TODO: implement once admin router defines the doc shape. + # Sketch: + # from pymongo import MongoClient + # coll = _get_db().rate_limits + # doc = coll.find_one({"_id": "config"}) + # _cache["limits"] = (doc or {}).get("limits", {}) + _cache["fetched_at"] = time.monotonic() + + +def list_overrides() -> dict[str, str]: + """Return all current overrides (for the admin GET endpoint).""" + if time.monotonic() - _cache["fetched_at"] > TTL_SECONDS: + _refresh_cache() + return dict(_cache["limits"]) + + +def set_override(slug: str, limit_string: str, actor: str) -> None: + """Persist `slug -> limit_string` to Mongo and invalidate the + in-process cache. `actor` is an audit-trail identifier (logged, + not enforced). + + Limit string format: slowapi's standard — `"<count>/<period>"` where + period is `second`, `minute`, `hour`, or `day`. e.g. `"50/minute"`, + `"3000/hour"`. + """ + # TODO: implement.
Validate the limit string is parseable by + # slowapi before writing, otherwise we ship a broken config that + # gets rejected at decorator-eval time and breaks the route. + raise NotImplementedError("Admin write path lands next.") + + +def clear_override(slug: str, actor: str) -> None: + """Remove `slug` from the override doc — endpoint falls back to + its hardcoded default string.""" + raise NotImplementedError("Admin write path lands next.") + + +def _admin_token_from_env() -> str | None: + """Resolves the admin gating token. Set ADMIN_TOKEN in + docker-compose env from 1Password (`op://Spire Codex/Admin Token`). + """ + return os.environ.get("ADMIN_TOKEN") or None diff --git a/backend/app/services/umami_client.py b/backend/app/services/umami_client.py new file mode 100644 index 00000000..92350e61 --- /dev/null +++ b/backend/app/services/umami_client.py @@ -0,0 +1,124 @@ +"""Umami HTTP API client — pulls real-time + historical stats into the +admin dashboard. + +Why proxy instead of iframe-embedding the Umami UI: +- Cross-origin headaches with cookies + the admin's CF Access JWT. +- We can compose a *summary* view (active visitors + today's top + pages + last-24h trend, in one panel) instead of bouncing between + two UIs. The Umami UI is still there for deep dives — this is + just the 80%-case glance. + +## Auth + +Umami's API requires a session token (POST /api/auth/login with +admin/password). We hold credentials in env (sourced from 1P), log +in once on first call, cache the token until it 401s, then refresh. + +Env: +- UMAMI_API_URL — e.g. 
https://analytics.spire-codex.com +- UMAMI_ADMIN_USERNAME — usually `admin` +- UMAMI_ADMIN_PASSWORD — the rotated admin password we set on first + login (stored at 1P → Umami → admin_password) +- UMAMI_WEBSITE_ID — same UUID hardcoded in app/layout.tsx + +## Caching + +The admin dashboard hits these endpoints on every page load — a 30s +in-process cache per endpoint keeps that ~free and absorbs the +hot-reload patterns where an operator alt-tabs back and forth. +Stats that need to be truly real-time (active visitors) get a 5s +TTL. + +## Endpoints we expose to the admin dashboard + +These wrap the Umami API and pre-shape the response so the frontend +doesn't have to understand Umami's data model: + +- get_active() → integer (concurrent visitors) +- get_summary(period="24h") → {pageviews, visitors, visits, bounce_rate, avg_visit_duration} +- get_top_pages(period, limit) → [{path, count}] +- get_top_referrers(period, limit) +- get_countries(period, limit) +- get_browsers(period) + +All return plain dicts/lists ready to JSON-serialize. +""" + +from __future__ import annotations + +import os +import time +from typing import Any + + +_token_cache: dict[str, Any] = {"token": None, "expires_at": 0.0} +_data_cache: dict[str, tuple[float, Any]] = {} + + +def _api_base() -> str: + return os.environ.get("UMAMI_API_URL", "").rstrip("/") + + +def _website_id() -> str: + return os.environ.get("UMAMI_WEBSITE_ID", "") + + +def _login() -> str | None: + """POST /api/auth/login. Cached until expiry.""" + # TODO: implement. Sketch: + # r = httpx.post(f"{_api_base()}/api/auth/login", json={ + # "username": os.environ["UMAMI_ADMIN_USERNAME"], + # "password": os.environ["UMAMI_ADMIN_PASSWORD"], + # }) + # _token_cache["token"] = r.json()["token"] + # _token_cache["expires_at"] = time.monotonic() + 23*3600 # Umami tokens last ~24h + raise NotImplementedError + + +def _cached(key: str, ttl: float, fetch): + """Generic in-process TTL cache wrapper. 
Per-endpoint TTL because + 'active visitors' needs to be near real-time (5s) but '24h top + pages' can absorb 60s of staleness without anyone noticing.""" + now = time.monotonic() + entry = _data_cache.get(key) + if entry and now - entry[0] < ttl: + return entry[1] + value = fetch() + _data_cache[key] = (now, value) + return value + + +def get_active() -> int: + """Current concurrent visitors. 5s cache — the dashboard polls + this every ~10s so cache cost ≈ 1 Umami hit per 10s per worker.""" + # TODO: GET /api/websites/{id}/active + raise NotImplementedError + + +def get_summary(period: str = "24h") -> dict: + """Headline stats for the period (pageviews, visitors, visits, + bounce_rate, avg_visit_duration). 60s cache.""" + # TODO: GET /api/websites/{id}/stats?startAt=...&endAt=... + raise NotImplementedError + + +def get_top_pages(period: str = "24h", limit: int = 20) -> list[dict]: + """Top URL paths by pageview count. 60s cache.""" + # TODO: GET /api/websites/{id}/metrics?type=url&startAt=... + raise NotImplementedError + + +def get_top_referrers(period: str = "24h", limit: int = 20) -> list[dict]: + """Top inbound referrer domains. 60s cache.""" + raise NotImplementedError + + +def get_countries(period: str = "24h", limit: int = 20) -> list[dict]: + """Top countries by visitor count. 60s cache.""" + raise NotImplementedError + + +def get_browsers(period: str = "24h") -> dict: + """Browser breakdown — useful for spotting bot waves (sudden + spike in headless-chrome user agents). 60s cache.""" + raise NotImplementedError diff --git a/docker-compose.admin.yml b/docker-compose.admin.yml new file mode 100644 index 00000000..791509cd --- /dev/null +++ b/docker-compose.admin.yml @@ -0,0 +1,95 @@ +name: spire-codex-admin + +# Admin dashboard container + Cloudflare Tunnel sidecar. +# +# This is SKETCH ONLY — committed alongside the backend skeleton in +# `app/routers/admin_rate_limits.py` to lock in the deployment shape +# before we write any real UI code. 
+# +# Architecture: +# +# You (browser) +# │ OAuth (Google/GitHub via CF Access) +# ▼ +# Cloudflare edge — terminates TLS, checks Access policy +# │ +# ▼ +# cloudflared (sidecar) — outbound HTTPS to CF, no public port +# │ +# ▼ +# admin-dashboard — tiny static UI +# │ X-Admin-Token header +# ▼ +# spire-codex-backend /api/admin/rate-limits/* (on nginx_web-network) +# +# Why a separate container instead of mounting on the main frontend: +# - The main site uses static prerender + edge cache. The admin UI +# is dynamic-only and shouldn't share the static pipeline. +# - Easier to gate at the network layer — only this container talks +# to /api/admin, so the rest of the stack stays clean. +# - Disposable: we can rewrite the admin UI without touching the +# site. +# +# Cloudflare Tunnel pulls double duty here: +# - The admin container has no port published to the host (nothing +# on the public internet can reach it). cloudflared makes an +# outbound connection to Cloudflare and forwards traffic back. +# - Cloudflare Access enforces OAuth at the edge; combined with the +# X-Admin-Token header check in the backend, we get two +# independent gates. + +services: + admin-dashboard: + # TODO: pick a concrete image once we write the UI. Options: + # - nginx serving static HTML+vanilla JS (lightest) + # - a thin Next.js app (consistent with main site) + # - HTMX + minimal Python (zero JS build step) + # First pass: probably nginx + static. + image: ptrlrd/spire-codex-admin:latest + container_name: spire-codex-admin + restart: unless-stopped + environment: + # The static UI calls /api/admin/* against this URL. Same + # docker DNS as the main frontend uses for backend lookups. + - API_INTERNAL_URL=http://spire-codex-backend:8000 + # NO ports published — only reachable via cloudflared below or + # via internal nginx if we ever want a fallback path. This is + # the whole point of using Tunnel. 
+ networks: + - nginx_web-network + logging: + driver: json-file + options: + max-size: "10m" + max-file: "3" + + cloudflared: + image: cloudflare/cloudflared:latest + container_name: spire-codex-cloudflared + restart: unless-stopped + # The tunnel-run subcommand reads $TUNNEL_TOKEN from env (set below + # from the host's $CLOUDFLARED_TOKEN) and establishes the + # persistent outbound connection. The token is + # bound to a specific tunnel + ingress config defined in the CF + # dashboard (Zero Trust → Networks → Tunnels). See + # `infrastructure/ansible/playbooks/admin-install.yml` for the + # one-time tunnel-create steps. + command: tunnel --no-autoupdate run + environment: + - TUNNEL_TOKEN=${CLOUDFLARED_TOKEN:-} + networks: + - nginx_web-network + depends_on: + - admin-dashboard + logging: + driver: json-file + options: + max-size: "10m" + max-file: "3" + +networks: + # Reuse the existing network so cloudflared can dial the + # admin-dashboard container by service name DNS, and so the + # admin-dashboard container can dial spire-codex-backend the + # same way. + nginx_web-network: + external: true diff --git a/frontend/app/leaderboards/hall-of-shame/page.tsx b/frontend/app/leaderboards/hall-of-shame/page.tsx new file mode 100644 index 00000000..d8c6010f --- /dev/null +++ b/frontend/app/leaderboards/hall-of-shame/page.tsx @@ -0,0 +1,82 @@ +// Hall of Shame — public leaderboard inverse. +// +// Surfaces every run an admin has explicitly hidden via +// `/api/admin/moderation/runs/{hash}/hide`. Reads from the public +// `/api/runs/hall-of-shame` endpoint (see +// `backend/app/routers/hall_of_shame.py`). +// +// This is intentionally SEPARATE from /leaderboards rather than a +// tab on it — different visual register (mock-shame red theme, skull +// glyphs, explicit "moderator's reason" column), and keeps the main +// leaderboard component free of an "is this row a cheater" branch. 
+// +// Editorial policy summarized at the top of the page so visitors +// understand what they're looking at: +// - Only admin-curated entries. No auto-flagging. +// - Every row has a stated reason from the moderator. +// - Hiding is reversible — if it turns out to be wrong, the run +// comes off this page on the next stats refresh. +// +// Sketch only — replace with a real component when the moderation +// write path lands and we have actual data to render. For now it's a +// placeholder that explains the page's existence and links back to +// the main leaderboard. + +import Link from "next/link"; + +export const metadata = { + title: "Hall of Shame - Slay the Spire 2 (sts2) | Spire Codex", + description: + "Community-submitted runs that didn't pass moderation review — impossible times, oversized decks, and other obvious anomalies, kept here for posterity.", + // robots: noindex so search engines don't surface flagged-user names + // (the audit-style transparency is for site visitors, not the open web). + robots: { index: false, follow: false }, +}; + +export default function HallOfShamePage() { + return ( +
+

+ Hall of Shame +

+

+ Runs that didn't make the cut after moderator review — impossible + times, oversized decks, modded clients, the works. Each entry shows + the reason a moderator gave for hiding it from the main leaderboard. +

+ + {/* Editorial note — explains the curation policy in-page so we don't + need a separate FAQ */} +
+

+ Curated, not automated.{" "} + Every entry here is a manual call by a moderator. We don't + auto-flag anything onto this page. +

+

+ Reversible.{" "} + If a hide turns out to be wrong, the run goes back on the main + leaderboard at the next stats refresh. +

+

+ Looking for legit runs?{" "} + + Main leaderboards + {" "} + have everything that passed review. +

+
+ + {/* TODO: replace with the actual table once + /api/runs/hall-of-shame returns data. Shape mirrors + LeaderboardBrowseClient.tsx, plus an extra "reason" column + and a hidden_at date column. */} +
+ Nothing here yet. (Sketch page — moderation pipeline pending.) +
+
+ ); +} diff --git a/infrastructure/ansible/playbooks/admin-install.yml b/infrastructure/ansible/playbooks/admin-install.yml new file mode 100644 index 00000000..608cdeea --- /dev/null +++ b/infrastructure/ansible/playbooks/admin-install.yml @@ -0,0 +1,78 @@ +--- +# Admin dashboard install — Cloudflare Tunnel + CF Access OAuth gate. +# +# SKETCH PLAYBOOK — sequence of operations is fully documented but +# tasks are still placeholders. Replace this header with real tasks +# once the admin container image exists and the CF Tunnel + Access +# config is created. +# +# ─── One-time manual prerequisites ─────────────────────────────── +# +# 1. Cloudflare Zero Trust → Networks → Tunnels → Create a tunnel. +# Name: "spire-codex-admin". CF gives you a connector token (long +# string starting with "ey...") — save to 1Password under: +# op://Spire Codex/Cloudflare/admin_tunnel_token +# +# 2. In the tunnel's Public Hostname tab: +# Subdomain: admin +# Domain: spire-codex.com +# Service: http://admin-dashboard:80 +# CF will auto-create the CNAME for admin.spire-codex.com pointing +# at the tunnel. +# +# 3. Cloudflare Zero Trust → Access → Applications → Add an application: +# Application type: Self-hosted +# Application domain: admin.spire-codex.com +# Policy: Allow → emails matching im@ptrlrd.com (or a Google / +# GitHub identity provider — configurable per your taste). +# From this point on, anyone visiting https://admin.spire-codex.com +# has to OAuth in before CF will forward the request to the tunnel. +# +# 4. 
Generate a fresh ADMIN_TOKEN (32+ random chars), save to: +# op://Spire Codex/Admin Token/value +# Add to `infrastructure/ansible/files/.env.tpl` as +# ADMIN_TOKEN=op://Spire Codex/Admin Token/value +# +# ─── Deploy steps (run from controller) ────────────────────────── +# +# ./bin/do-ansible playbooks/sync-secrets-do.yml --limit do_origin +# ./bin/do-ansible playbooks/admin-install.yml --limit do_origin +# +# ─── What the playbook does (when implemented) ─────────────────── +# +# 1. Pull the admin-dashboard image (after one ships). +# 2. Inject CLOUDFLARED_TOKEN into the host .env (via sync-secrets). +# 3. Bring up docker-compose.admin.yml: admin-dashboard + +# cloudflared sidecar. +# 4. Verify the cloudflared container reports "Registered tunnel +# connection" within ~10s (`docker logs spire-codex-cloudflared`). +# 5. Smoke-test from your laptop: +# curl https://admin.spire-codex.com → should 302 to CF Access +# (after OAuth) → 200 from admin-dashboard. +# +# ─── Threat model ──────────────────────────────────────────────── +# +# Layer 1: CF Access OAuth — only operators in the policy can reach +# admin.spire-codex.com at all. Unauthenticated public traffic +# stops at CF's edge. +# Layer 2: X-Admin-Token header — even if CF Access is bypassed +# (tunnel cert leak, misconfigured policy), the backend +# requires a token on every /api/admin/* call. +# Layer 3: No published port — admin-dashboard has no host port +# mapping. The DO box's firewall doesn't need a new rule. +# Only path in is via cloudflared's outbound connection. + +- name: Install admin dashboard (sketch) + hosts: digital_ocean + gather_facts: false + become: true + tasks: + - name: Placeholder — playbook body lands with the real admin image + debug: + msg: | + SKETCH only. See header for the architecture; real tasks + arrive when: + 1. The admin-dashboard image is built + pushed. + 2. The CF Tunnel is created in the dashboard and its + token is saved to 1P. + 3. 
/api/admin/rate-limits/* endpoints have bodies.