From ca1cbd6ab1c3179232107bd11678b14628de96fc Mon Sep 17 00:00:00 2001
From: Peter Lord <im@ptrlrd.com>
Date: Thu, 14 May 2026 20:00:38 -0700
Subject: [PATCH] Capture search query text in searches.db for top +
 zero-result analytics

---
 backend/app/main.py                   |  19 ++-
 backend/app/routers/admin_searches.py |  71 +++++++++
 backend/app/services/searches_db.py   | 218 ++++++++++++++++++++++++++
 docker-compose.beta.yml               |   1 +
 docker-compose.prod.yml               |   4 +
 5 files changed, 312 insertions(+), 1 deletion(-)
 create mode 100644 backend/app/routers/admin_searches.py
 create mode 100644 backend/app/services/searches_db.py

diff --git a/backend/app/main.py b/backend/app/main.py
index 24b1eef8..d1f9c4a3 100644
--- a/backend/app/main.py
+++ b/backend/app/main.py
@@ -54,8 +54,10 @@
     auth_steam,
     uninstall,
     qa_feedback,
+    admin_searches,
 )
 from .services.data_service import get_stats, load_translation_maps, current_version
+from .services import searches_db
 from .dependencies import get_lang, VALID_LANGUAGES, LANGUAGE_NAMES
 from prometheus_fastapi_instrumentator import Instrumentator
 
@@ -342,8 +344,22 @@ async def dispatch(self, request: Request, call_next):
                 elif len(parts) == 2:
                     # List view: /api/cards
                     entity_list_views.labels(entity_type=etype).inc()
-                    if request.query_params.get("search"):
+                    search_q = request.query_params.get("search")
+                    if search_q:
                         search_queries.labels(entity_type=etype).inc()
+                        # Persist the query text for analytics. Prometheus
+                        # can only carry the count by entity_type (label
+                        # cardinality), so the actual strings live in
+                        # searches.db where they can be aggregated for
+                        # "what do people search for that we don't have".
+                        searches_db.log_search(
+                            query=search_q,
+                            entity_type=etype,
+                            lang=request.query_params.get("lang") or "eng",
+                            ip_hash=searches_db.hash_ip(
+                                request.client.host if request.client else None
+                            ),
+                        )
 
             # Compare views
             if len(parts) == 3 and parts[1] == "compare":
@@ -430,6 +446,7 @@ async def dispatch(self, request: Request, call_next):
 app.include_router(feedback.router)
 app.include_router(uninstall.router)
 app.include_router(qa_feedback.router)
+app.include_router(admin_searches.router)
 app.include_router(acts.router)
 app.include_router(ascensions.router)
 app.include_router(names.router)
diff --git a/backend/app/routers/admin_searches.py b/backend/app/routers/admin_searches.py
new file mode 100644
index 00000000..90d63097
--- /dev/null
+++ b/backend/app/routers/admin_searches.py
@@ -0,0 +1,71 @@
+"""Admin-gated search analytics endpoints.
+
+Reads searches.db and exposes top / recent / volume rollups. Gated by a
+shared-secret header (`X-Admin-Token`) sourced from the `ADMIN_TOKEN`
+env var. If the env var is missing the endpoints return 503 — explicit
+"not configured" is easier to triage than silent 401s.
+
+Query-string auth would have been simpler but tokens leak through
+browser history and proxy access logs; a header keeps them out of the
+URL surface even if someone copies the curl into Slack.
+"""
+
+import os
+
+from fastapi import APIRouter, Header, HTTPException, Query
+
+from ..services import searches_db
+
+router = APIRouter(prefix="/api/admin/searches", tags=["Admin"])
+
+
+def _require_admin(token: str | None) -> None:
+    expected = os.environ.get("ADMIN_TOKEN")
+    if not expected:
+        # Deliberate over silent 401 — operator hasn't configured the
+        # secret yet, the endpoint is just disabled, not authenticated.
+        raise HTTPException(status_code=503, detail="Admin endpoints not configured")
+    if token != expected:
+        raise HTTPException(status_code=401, detail="Invalid admin token")
+
+
+@router.get("/top")
+def top(
+    x_admin_token: str | None = Header(default=None),
+    days: int = Query(default=7, ge=1, le=365),
+    limit: int = Query(default=100, ge=1, le=1000),
+    entity_type: str | None = None,
+):
+    """Most-searched queries in the last N days, with unique-user counts."""
+    _require_admin(x_admin_token)
+    return {
+        "days": days,
+        "entity_type": entity_type,
+        "results": searches_db.top_searches(
+            days=days, limit=limit, entity_type=entity_type
+        ),
+    }
+
+
+@router.get("/recent")
+def recent(
+    x_admin_token: str | None = Header(default=None),
+    limit: int = Query(default=200, ge=1, le=2000),
+    entity_type: str | None = None,
+):
+    """Most-recent searches, newest first. Useful for spot-checking live traffic."""
+    _require_admin(x_admin_token)
+    return {
+        "entity_type": entity_type,
+        "results": searches_db.recent_searches(limit=limit, entity_type=entity_type),
+    }
+
+
+@router.get("/volume")
+def volume(
+    x_admin_token: str | None = Header(default=None),
+    days: int = Query(default=30, ge=1, le=365),
+):
+    """Per-day search volume + unique-user counts. Spot trends and traffic spikes."""
+    _require_admin(x_admin_token)
+    return {"days": days, "results": searches_db.search_volume(days=days)}
diff --git a/backend/app/services/searches_db.py b/backend/app/services/searches_db.py
new file mode 100644
index 00000000..561419cf
--- /dev/null
+++ b/backend/app/services/searches_db.py
@@ -0,0 +1,218 @@
+"""SQLite-backed search analytics.
+
+Writes are batched on a background thread so the API request never blocks
+on disk I/O — search logging is best-effort and a stalled writer drops
+queued rows rather than backpressuring real traffic. The DB lives next to
+runs.db inside DATA_DIR so the same Docker volume + Litestream replication
+pipeline covers both.
+
+The point of capturing query text (which would explode Prometheus
+cardinality) is twofold:
+  - top searches → what people actually look for, used to prioritize
+    product work
+  - zero-result searches → what people look for AND don't find, the
+    most actionable bucket (missing entities, synonym gaps,
+    locale-only spellings)
+
+`ip_hash` is salted with the current UTC date so the same client on the
+same day collapses to one identity (de-duping the "top" list) but cannot
+be tracked across days. Hash is truncated to 16 hex chars — enough for
+within-day uniqueness, short enough not to bloat the row.
+"""
+
+import hashlib
+import logging
+import os
+import queue
+import sqlite3
+import threading
+import time
+from contextlib import contextmanager
+from datetime import datetime, timezone
+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+
+_data_dir = Path(
+    os.environ.get("DATA_DIR", Path(__file__).resolve().parents[3] / "data")
+)
+DB_PATH = _data_dir / "searches.db"
+
+# Bounded — drop on overflow rather than back-pressuring HTTP traffic.
+# ~2 writes/sec sustained at current load; 10k entries = ~80 min of buffer
+# even with the writer thread completely stalled. Plenty for any plausible
+# disk hiccup.
+_WRITE_QUEUE: queue.Queue = queue.Queue(maxsize=10_000)
+_BATCH_SIZE = 100
+_FLUSH_INTERVAL_S = 5.0
+_MAX_QUERY_LEN = 200
+
+
+def get_db_path() -> Path:
+    DB_PATH.parent.mkdir(parents=True, exist_ok=True)
+    return DB_PATH
+
+
+@contextmanager
+def get_conn():
+    conn = sqlite3.connect(str(get_db_path()), timeout=10)
+    conn.row_factory = sqlite3.Row
+    conn.execute("PRAGMA journal_mode=WAL")
+    try:
+        yield conn
+        conn.commit()
+    except Exception:
+        conn.rollback()
+        raise
+    finally:
+        conn.close()
+
+
+def init_db():
+    with get_conn() as conn:
+        conn.executescript("""
+            CREATE TABLE IF NOT EXISTS searches (
+                id INTEGER PRIMARY KEY AUTOINCREMENT,
+                ts INTEGER NOT NULL,
+                query TEXT NOT NULL,
+                entity_type TEXT,
+                lang TEXT,
+                ip_hash TEXT
+            );
+            CREATE INDEX IF NOT EXISTS idx_searches_ts ON searches(ts);
+            CREATE INDEX IF NOT EXISTS idx_searches_query ON searches(query);
+            CREATE INDEX IF NOT EXISTS idx_searches_entity ON searches(entity_type);
+        """)
+
+
+def hash_ip(client_ip: str | None) -> str:
+    """Stable-within-day, untraceable-across-days hash of the client IP."""
+    if not client_ip:
+        return ""
+    today = datetime.now(timezone.utc).date().isoformat()
+    return hashlib.sha256(f"{client_ip}:{today}".encode()).hexdigest()[:16]
+
+
+def log_search(
+    query: str,
+    entity_type: str | None,
+    lang: str | None,
+    ip_hash: str,
+) -> None:
+    """Enqueue a search for the background writer. Non-blocking."""
+    if not query:
+        return
+    try:
+        _WRITE_QUEUE.put_nowait(
+            (
+                int(time.time()),
+                query[:_MAX_QUERY_LEN],
+                entity_type,
+                lang,
+                ip_hash,
+            )
+        )
+    except queue.Full:
+        # Best-effort logging — never block the request path.
+        pass
+
+
+def _writer_loop() -> None:
+    while True:
+        batch: list[tuple] = []
+        try:
+            batch.append(_WRITE_QUEUE.get(timeout=_FLUSH_INTERVAL_S))
+        except queue.Empty:
+            continue
+        while len(batch) < _BATCH_SIZE:
+            try:
+                batch.append(_WRITE_QUEUE.get_nowait())
+            except queue.Empty:
+                break
+        try:
+            with get_conn() as conn:
+                conn.executemany(
+                    "INSERT INTO searches (ts, query, entity_type, lang, ip_hash) "
+                    "VALUES (?, ?, ?, ?, ?)",
+                    batch,
+                )
+        except Exception:
+            logger.exception("search batch write failed (%d rows dropped)", len(batch))
+
+
+def top_searches(
+    days: int = 7,
+    limit: int = 100,
+    entity_type: str | None = None,
+) -> list[dict]:
+    cutoff = int(time.time()) - days * 86400
+    where = ["ts >= ?"]
+    params: list = [cutoff]
+    if entity_type:
+        where.append("entity_type = ?")
+        params.append(entity_type)
+    where_clause = " WHERE " + " AND ".join(where)
+    with get_conn() as conn:
+        rows = conn.execute(
+            f"""
+            SELECT query,
+                   entity_type,
+                   COUNT(*) as hits,
+                   COUNT(DISTINCT ip_hash) as unique_users
+            FROM searches{where_clause}
+            GROUP BY query, entity_type
+            ORDER BY hits DESC
+            LIMIT ?
+            """,
+            [*params, limit],
+        ).fetchall()
+    return [dict(r) for r in rows]
+
+
+def recent_searches(
+    limit: int = 200,
+    entity_type: str | None = None,
+) -> list[dict]:
+    where: list[str] = []
+    params: list = []
+    if entity_type:
+        where.append("entity_type = ?")
+        params.append(entity_type)
+    where_clause = (" WHERE " + " AND ".join(where)) if where else ""
+    with get_conn() as conn:
+        rows = conn.execute(
+            f"""
+            SELECT ts, query, entity_type, lang
+            FROM searches{where_clause}
+            ORDER BY ts DESC
+            LIMIT ?
+            """,
+            [*params, limit],
+        ).fetchall()
+    return [dict(r) for r in rows]
+
+
+def search_volume(days: int = 30) -> list[dict]:
+    """Per-day hit counts, useful for spotting trends."""
+    cutoff = int(time.time()) - days * 86400
+    with get_conn() as conn:
+        rows = conn.execute(
+            """
+            SELECT
+                date(ts, 'unixepoch') as day,
+                COUNT(*) as hits,
+                COUNT(DISTINCT ip_hash) as unique_users
+            FROM searches
+            WHERE ts >= ?
+            GROUP BY day
+            ORDER BY day DESC
+            """,
+            (cutoff,),
+        ).fetchall()
+    return [dict(r) for r in rows]
+
+
+# Schema + background writer come up at import time so they're ready
+# before the first request lands.
+init_db()
+threading.Thread(target=_writer_loop, daemon=True, name="searches-writer").start()
diff --git a/docker-compose.beta.yml b/docker-compose.beta.yml
index 9e9146e6..54f5651e 100644
--- a/docker-compose.beta.yml
+++ b/docker-compose.beta.yml
@@ -24,6 +24,7 @@ services:
       - GITHUB_APP_INSTALLATION_ID=${GITHUB_APP_INSTALLATION_ID:-}
       - GITHUB_APP_REPO=${GITHUB_APP_REPO:-}
       - GITHUB_APP_PRIVATE_KEY_PATH=${GITHUB_APP_PRIVATE_KEY_PATH:-/secrets/knowledge-demon.private-key.pem}
+      - ADMIN_TOKEN=${ADMIN_TOKEN:-}
     networks:
       - nginx_web-network
     logging:
diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml
index 12f8baee..97a6abd3 100644
--- a/docker-compose.prod.yml
+++ b/docker-compose.prod.yml
@@ -36,6 +36,10 @@ services:
       # so the flip is explicit per host:
       #   TURSO_LOCAL_REPLICA=/data/runs-replica.db
       - TURSO_LOCAL_REPLICA=${TURSO_LOCAL_REPLICA:-}
+      # Shared-secret gate for /api/admin/* endpoints (searches analytics,
+      # destructive ops later). Empty → endpoints return 503 "not
+      # configured" rather than silent 401s.
+      - ADMIN_TOKEN=${ADMIN_TOKEN:-}
     networks:
       - nginx_web-network
     logging: