From ca1cbd6ab1c3179232107bd11678b14628de96fc Mon Sep 17 00:00:00 2001 From: Peter Lord Date: Thu, 14 May 2026 20:00:38 -0700 Subject: [PATCH] Capture search query text in searches.db for top + zero-result analytics --- backend/app/main.py | 19 ++- backend/app/routers/admin_searches.py | 71 +++++++++ backend/app/services/searches_db.py | 218 ++++++++++++++++++++++++++ docker-compose.beta.yml | 1 + docker-compose.prod.yml | 4 + 5 files changed, 312 insertions(+), 1 deletion(-) create mode 100644 backend/app/routers/admin_searches.py create mode 100644 backend/app/services/searches_db.py diff --git a/backend/app/main.py b/backend/app/main.py index 24b1eef8..d1f9c4a3 100644 --- a/backend/app/main.py +++ b/backend/app/main.py @@ -54,8 +54,10 @@ auth_steam, uninstall, qa_feedback, + admin_searches, ) from .services.data_service import get_stats, load_translation_maps, current_version +from .services import searches_db from .dependencies import get_lang, VALID_LANGUAGES, LANGUAGE_NAMES from prometheus_fastapi_instrumentator import Instrumentator @@ -342,8 +344,22 @@ async def dispatch(self, request: Request, call_next): elif len(parts) == 2: # List view: /api/cards entity_list_views.labels(entity_type=etype).inc() - if request.query_params.get("search"): + search_q = request.query_params.get("search") + if search_q: search_queries.labels(entity_type=etype).inc() + # Persist the query text for analytics. Prometheus + # can only carry the count by entity_type (label + # cardinality), so the actual strings live in + # searches.db where they can be aggregated for + # "what do people search for that we don't have". + searches_db.log_search( + query=search_q, + entity_type=etype, + lang=request.query_params.get("lang") or "eng", + ip_hash=searches_db.hash_ip( + request.client.host if request.client else None + ), + ) # Compare views if len(parts) == 3 and parts[1] == "compare": @@ -430,6 +446,7 @@ async def dispatch(self, request: Request, call_next): app.include_router(feedback.router) app.include_router(uninstall.router) app.include_router(qa_feedback.router) +app.include_router(admin_searches.router) app.include_router(acts.router) app.include_router(ascensions.router) app.include_router(names.router) diff --git a/backend/app/routers/admin_searches.py b/backend/app/routers/admin_searches.py new file mode 100644 index 00000000..90d63097 --- /dev/null +++ b/backend/app/routers/admin_searches.py @@ -0,0 +1,71 @@ +"""Admin-gated search analytics endpoints. + +Reads searches.db and exposes top / recent / volume rollups. Gated by a +shared-secret header (`X-Admin-Token`) sourced from the `ADMIN_TOKEN` +env var. If the env var is missing the endpoints return 503 — explicit +"not configured" is easier to triage than silent 401s. + +Query-string auth would have been simpler but tokens leak through +browser history and proxy access logs; a header keeps them out of the +URL surface even if someone copies the curl into Slack. +""" + +import os + +from fastapi import APIRouter, Header, HTTPException, Query + +from ..services import searches_db + +router = APIRouter(prefix="/api/admin/searches", tags=["Admin"]) + + +def _require_admin(token: str | None) -> None: + expected = os.environ.get("ADMIN_TOKEN") + if not expected: + # Deliberate over silent 401 — operator hasn't configured the + # secret yet, the endpoint is just disabled, not authenticated. + raise HTTPException(status_code=503, detail="Admin endpoints not configured") + if token != expected: + raise HTTPException(status_code=401, detail="Invalid admin token") + + +@router.get("/top") +def top( + x_admin_token: str | None = Header(default=None), + days: int = Query(default=7, ge=1, le=365), + limit: int = Query(default=100, ge=1, le=1000), + entity_type: str | None = None, +): + """Most-searched queries in the last N days, with unique-user counts.""" + _require_admin(x_admin_token) + return { + "days": days, + "entity_type": entity_type, + "results": searches_db.top_searches( + days=days, limit=limit, entity_type=entity_type + ), + } + + +@router.get("/recent") +def recent( + x_admin_token: str | None = Header(default=None), + limit: int = Query(default=200, ge=1, le=2000), + entity_type: str | None = None, +): + """Most-recent searches, newest first. Useful for spot-checking live traffic.""" + _require_admin(x_admin_token) + return { + "entity_type": entity_type, + "results": searches_db.recent_searches(limit=limit, entity_type=entity_type), + } + + +@router.get("/volume") +def volume( + x_admin_token: str | None = Header(default=None), + days: int = Query(default=30, ge=1, le=365), +): + """Per-day search volume + unique-user counts. Spot trends and traffic spikes.""" + _require_admin(x_admin_token) + return {"days": days, "results": searches_db.search_volume(days=days)} diff --git a/backend/app/services/searches_db.py b/backend/app/services/searches_db.py new file mode 100644 index 00000000..561419cf --- /dev/null +++ b/backend/app/services/searches_db.py @@ -0,0 +1,218 @@ +"""SQLite-backed search analytics. + +Writes are batched on a background thread so the API request never blocks +on disk I/O — search logging is best-effort and a stalled writer drops +queued rows rather than backpressuring real traffic. The DB lives next to +runs.db inside DATA_DIR so the same Docker volume + Litestream replication +pipeline covers both. + +The point of capturing query text (which would explode Prometheus +cardinality) is twofold: + - top searches → what people actually look for, used to prioritize + product work + - zero-result searches → what people look for AND don't find, the + most actionable bucket (missing entities, synonym gaps, + locale-only spellings) + +`ip_hash` is salted with the current UTC date so the same client on the +same day collapses to one identity (de-duping the "top" list) but cannot +be tracked across days. Hash is truncated to 16 hex chars — enough for +within-day uniqueness, short enough not to bloat the row. +""" + +import hashlib +import logging +import os +import queue +import sqlite3 +import threading +import time +from contextlib import contextmanager +from datetime import datetime, timezone +from pathlib import Path + +logger = logging.getLogger(__name__) + +_data_dir = Path( + os.environ.get("DATA_DIR", Path(__file__).resolve().parents[3] / "data") +) +DB_PATH = _data_dir / "searches.db" + +# Bounded — drop on overflow rather than back-pressuring HTTP traffic. +# ~2 writes/sec sustained at current load; 10k entries = ~80 min of buffer +# even with the writer thread completely stalled. Plenty for any plausible +# disk hiccup. +_WRITE_QUEUE: queue.Queue = queue.Queue(maxsize=10_000) +_BATCH_SIZE = 100 +_FLUSH_INTERVAL_S = 5.0 +_MAX_QUERY_LEN = 200 + + +def get_db_path() -> Path: + DB_PATH.parent.mkdir(parents=True, exist_ok=True) + return DB_PATH + + +@contextmanager +def get_conn(): + conn = sqlite3.connect(str(get_db_path()), timeout=10) + conn.row_factory = sqlite3.Row + conn.execute("PRAGMA journal_mode=WAL") + try: + yield conn + conn.commit() + except Exception: + conn.rollback() + raise + finally: + conn.close() + + +def init_db(): + with get_conn() as conn: + conn.executescript(""" + CREATE TABLE IF NOT EXISTS searches ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + ts INTEGER NOT NULL, + query TEXT NOT NULL, + entity_type TEXT, + lang TEXT, + ip_hash TEXT + ); + CREATE INDEX IF NOT EXISTS idx_searches_ts ON searches(ts); + CREATE INDEX IF NOT EXISTS idx_searches_query ON searches(query); + CREATE INDEX IF NOT EXISTS idx_searches_entity ON searches(entity_type); + """) + + +def hash_ip(client_ip: str | None) -> str: + """Stable-within-day, untraceable-across-days hash of the client IP.""" + if not client_ip: + return "" + today = datetime.now(timezone.utc).date().isoformat() + return hashlib.sha256(f"{client_ip}:{today}".encode()).hexdigest()[:16] + + +def log_search( + query: str, + entity_type: str | None, + lang: str | None, + ip_hash: str, +) -> None: + """Enqueue a search for the background writer. Non-blocking.""" + if not query: + return + try: + _WRITE_QUEUE.put_nowait( + ( + int(time.time()), + query[:_MAX_QUERY_LEN], + entity_type, + lang, + ip_hash, + ) + ) + except queue.Full: + # Best-effort logging — never block the request path. + pass + + +def _writer_loop() -> None: + while True: + batch: list[tuple] = [] + try: + batch.append(_WRITE_QUEUE.get(timeout=_FLUSH_INTERVAL_S)) + except queue.Empty: + continue + while len(batch) < _BATCH_SIZE: + try: + batch.append(_WRITE_QUEUE.get_nowait()) + except queue.Empty: + break + try: + with get_conn() as conn: + conn.executemany( + "INSERT INTO searches (ts, query, entity_type, lang, ip_hash) " + "VALUES (?, ?, ?, ?, ?)", + batch, + ) + except Exception: + logger.exception("search batch write failed (%d rows dropped)", len(batch)) + + +def top_searches( + days: int = 7, + limit: int = 100, + entity_type: str | None = None, +) -> list[dict]: + cutoff = int(time.time()) - days * 86400 + where = ["ts >= ?"] + params: list = [cutoff] + if entity_type: + where.append("entity_type = ?") + params.append(entity_type) + where_clause = " WHERE " + " AND ".join(where) + with get_conn() as conn: + rows = conn.execute( + f""" + SELECT query, + entity_type, + COUNT(*) as hits, + COUNT(DISTINCT ip_hash) as unique_users + FROM searches{where_clause} + GROUP BY query, entity_type + ORDER BY hits DESC + LIMIT ? + """, + [*params, limit], + ).fetchall() + return [dict(r) for r in rows] + + +def recent_searches( + limit: int = 200, + entity_type: str | None = None, +) -> list[dict]: + where: list[str] = [] + params: list = [] + if entity_type: + where.append("entity_type = ?") + params.append(entity_type) + where_clause = (" WHERE " + " AND ".join(where)) if where else "" + with get_conn() as conn: + rows = conn.execute( + f""" + SELECT ts, query, entity_type, lang + FROM searches{where_clause} + ORDER BY ts DESC + LIMIT ? + """, + [*params, limit], + ).fetchall() + return [dict(r) for r in rows] + + +def search_volume(days: int = 30) -> list[dict]: + """Per-day hit counts, useful for spotting trends.""" + cutoff = int(time.time()) - days * 86400 + with get_conn() as conn: + rows = conn.execute( + """ + SELECT + date(ts, 'unixepoch') as day, + COUNT(*) as hits, + COUNT(DISTINCT ip_hash) as unique_users + FROM searches + WHERE ts >= ? + GROUP BY day + ORDER BY day DESC + """, + (cutoff,), + ).fetchall() + return [dict(r) for r in rows] + + +# Schema + background writer come up at import time so they're ready +# before the first request lands. +init_db() +threading.Thread(target=_writer_loop, daemon=True, name="searches-writer").start() diff --git a/docker-compose.beta.yml b/docker-compose.beta.yml index 9e9146e6..54f5651e 100644 --- a/docker-compose.beta.yml +++ b/docker-compose.beta.yml @@ -24,6 +24,7 @@ services: - GITHUB_APP_INSTALLATION_ID=${GITHUB_APP_INSTALLATION_ID:-} - GITHUB_APP_REPO=${GITHUB_APP_REPO:-} - GITHUB_APP_PRIVATE_KEY_PATH=${GITHUB_APP_PRIVATE_KEY_PATH:-/secrets/knowledge-demon.private-key.pem} + - ADMIN_TOKEN=${ADMIN_TOKEN:-} networks: - nginx_web-network logging: diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml index 12f8baee..97a6abd3 100644 --- a/docker-compose.prod.yml +++ b/docker-compose.prod.yml @@ -36,6 +36,10 @@ services: # so the flip is explicit per host: # TURSO_LOCAL_REPLICA=/data/runs-replica.db - TURSO_LOCAL_REPLICA=${TURSO_LOCAL_REPLICA:-} + # Shared-secret gate for /api/admin/* endpoints (searches analytics, + # destructive ops later). Empty → endpoints return 503 "not + # configured" rather than silent 401s. + - ADMIN_TOKEN=${ADMIN_TOKEN:-} networks: - nginx_web-network logging: