Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 18 additions & 1 deletion backend/app/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,10 @@
auth_steam,
uninstall,
qa_feedback,
admin_searches,
)
from .services.data_service import get_stats, load_translation_maps, current_version
from .services import searches_db
from .dependencies import get_lang, VALID_LANGUAGES, LANGUAGE_NAMES
from prometheus_fastapi_instrumentator import Instrumentator

Expand Down Expand Up @@ -342,8 +344,22 @@ async def dispatch(self, request: Request, call_next):
elif len(parts) == 2:
# List view: /api/cards
entity_list_views.labels(entity_type=etype).inc()
if request.query_params.get("search"):
search_q = request.query_params.get("search")
if search_q:
search_queries.labels(entity_type=etype).inc()
# Persist the query text for analytics. Prometheus
# can only carry the count by entity_type (label
# cardinality), so the actual strings live in
# searches.db where they can be aggregated for
# "what do people search for that we don't have".
searches_db.log_search(
query=search_q,
entity_type=etype,
lang=request.query_params.get("lang") or "eng",
ip_hash=searches_db.hash_ip(
request.client.host if request.client else None
),
)

# Compare views
if len(parts) == 3 and parts[1] == "compare":
Expand Down Expand Up @@ -430,6 +446,7 @@ async def dispatch(self, request: Request, call_next):
app.include_router(feedback.router)
app.include_router(uninstall.router)
app.include_router(qa_feedback.router)
app.include_router(admin_searches.router)
app.include_router(acts.router)
app.include_router(ascensions.router)
app.include_router(names.router)
Expand Down
71 changes: 71 additions & 0 deletions backend/app/routers/admin_searches.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
"""Admin-gated search analytics endpoints.

Reads searches.db and exposes top / recent / volume rollups. Gated by a
shared-secret header (`X-Admin-Token`) sourced from the `ADMIN_TOKEN`
env var. If the env var is missing the endpoints return 503 — explicit
"not configured" is easier to triage than silent 401s.

Query-string auth would have been simpler but tokens leak through
browser history and proxy access logs; a header keeps them out of the
URL surface even if someone copies the curl into Slack.
"""

import os

from fastapi import APIRouter, Header, HTTPException, Query

from ..services import searches_db

router = APIRouter(prefix="/api/admin/searches", tags=["Admin"])


def _require_admin(token: str | None) -> None:
expected = os.environ.get("ADMIN_TOKEN")
if not expected:
# Deliberate over silent 401 — operator hasn't configured the
# secret yet, the endpoint is just disabled, not authenticated.
raise HTTPException(status_code=503, detail="Admin endpoints not configured")
if token != expected:
raise HTTPException(status_code=401, detail="Invalid admin token")


@router.get("/top")
def top(
x_admin_token: str | None = Header(default=None),
days: int = Query(default=7, ge=1, le=365),
limit: int = Query(default=100, ge=1, le=1000),
entity_type: str | None = None,
):
"""Most-searched queries in the last N days, with unique-user counts."""
_require_admin(x_admin_token)
return {
"days": days,
"entity_type": entity_type,
"results": searches_db.top_searches(
days=days, limit=limit, entity_type=entity_type
),
}


@router.get("/recent")
def recent(
x_admin_token: str | None = Header(default=None),
limit: int = Query(default=200, ge=1, le=2000),
entity_type: str | None = None,
):
"""Most-recent searches, newest first. Useful for spot-checking live traffic."""
_require_admin(x_admin_token)
return {
"entity_type": entity_type,
"results": searches_db.recent_searches(limit=limit, entity_type=entity_type),
}


@router.get("/volume")
def volume(
x_admin_token: str | None = Header(default=None),
days: int = Query(default=30, ge=1, le=365),
):
"""Per-day search volume + unique-user counts. Spot trends and traffic spikes."""
_require_admin(x_admin_token)
return {"days": days, "results": searches_db.search_volume(days=days)}
218 changes: 218 additions & 0 deletions backend/app/services/searches_db.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,218 @@
"""SQLite-backed search analytics.

Writes are batched on a background thread so the API request never blocks
on disk I/O — search logging is best-effort and a stalled writer drops
queued rows rather than backpressuring real traffic. The DB lives next to
runs.db inside DATA_DIR so the same Docker volume + Litestream replication
pipeline covers both.

The point of capturing query text (which would explode Prometheus
cardinality) is twofold:
- top searches → what people actually look for, used to prioritize
product work
- zero-result searches → what people look for AND don't find, the
most actionable bucket (missing entities, synonym gaps,
locale-only spellings)

`ip_hash` is salted with the current UTC date so the same client on the
same day collapses to one identity (de-duping the "top" list) but cannot
be tracked across days. Hash is truncated to 16 hex chars — enough for
within-day uniqueness, short enough not to bloat the row.
"""

import hashlib
import logging
import os
import queue
import sqlite3
import threading
import time
from contextlib import contextmanager
from datetime import datetime, timezone
from pathlib import Path

logger = logging.getLogger(__name__)

_data_dir = Path(
os.environ.get("DATA_DIR", Path(__file__).resolve().parents[3] / "data")
)
DB_PATH = _data_dir / "searches.db"

# Bounded — drop on overflow rather than back-pressuring HTTP traffic.
# ~2 writes/sec sustained at current load; 10k entries = ~80 min of buffer
# even with the writer thread completely stalled. Plenty for any plausible
# disk hiccup.
_WRITE_QUEUE: queue.Queue = queue.Queue(maxsize=10_000)
_BATCH_SIZE = 100
_FLUSH_INTERVAL_S = 5.0
_MAX_QUERY_LEN = 200


def get_db_path() -> Path:
DB_PATH.parent.mkdir(parents=True, exist_ok=True)
return DB_PATH


@contextmanager
def get_conn():
conn = sqlite3.connect(str(get_db_path()), timeout=10)
conn.row_factory = sqlite3.Row
conn.execute("PRAGMA journal_mode=WAL")
try:
yield conn
conn.commit()
except Exception:
conn.rollback()
raise
finally:
conn.close()


def init_db():
with get_conn() as conn:
conn.executescript("""
CREATE TABLE IF NOT EXISTS searches (
id INTEGER PRIMARY KEY AUTOINCREMENT,
ts INTEGER NOT NULL,
query TEXT NOT NULL,
entity_type TEXT,
lang TEXT,
ip_hash TEXT
);
CREATE INDEX IF NOT EXISTS idx_searches_ts ON searches(ts);
CREATE INDEX IF NOT EXISTS idx_searches_query ON searches(query);
CREATE INDEX IF NOT EXISTS idx_searches_entity ON searches(entity_type);
""")


def hash_ip(client_ip: str | None) -> str:
"""Stable-within-day, untraceable-across-days hash of the client IP."""
if not client_ip:
return ""
today = datetime.now(timezone.utc).date().isoformat()
return hashlib.sha256(f"{client_ip}:{today}".encode()).hexdigest()[:16]


def log_search(
query: str,
entity_type: str | None,
lang: str | None,
ip_hash: str,
) -> None:
"""Enqueue a search for the background writer. Non-blocking."""
if not query:
return
try:
_WRITE_QUEUE.put_nowait(
(
int(time.time()),
query[:_MAX_QUERY_LEN],
entity_type,
lang,
ip_hash,
)
)
except queue.Full:
# Best-effort logging — never block the request path.
pass


def _writer_loop() -> None:
while True:
batch: list[tuple] = []
try:
batch.append(_WRITE_QUEUE.get(timeout=_FLUSH_INTERVAL_S))
except queue.Empty:
continue
while len(batch) < _BATCH_SIZE:
try:
batch.append(_WRITE_QUEUE.get_nowait())
except queue.Empty:
break
try:
with get_conn() as conn:
conn.executemany(
"INSERT INTO searches (ts, query, entity_type, lang, ip_hash) "
"VALUES (?, ?, ?, ?, ?)",
batch,
)
except Exception:
logger.exception("search batch write failed (%d rows dropped)", len(batch))


def top_searches(
days: int = 7,
limit: int = 100,
entity_type: str | None = None,
) -> list[dict]:
cutoff = int(time.time()) - days * 86400
where = ["ts >= ?"]
params: list = [cutoff]
if entity_type:
where.append("entity_type = ?")
params.append(entity_type)
where_clause = " WHERE " + " AND ".join(where)
with get_conn() as conn:
rows = conn.execute(
f"""
SELECT query,
entity_type,
COUNT(*) as hits,
COUNT(DISTINCT ip_hash) as unique_users
FROM searches{where_clause}
GROUP BY query, entity_type
ORDER BY hits DESC
LIMIT ?
""",
[*params, limit],
).fetchall()
return [dict(r) for r in rows]


def recent_searches(
limit: int = 200,
entity_type: str | None = None,
) -> list[dict]:
where: list[str] = []
params: list = []
if entity_type:
where.append("entity_type = ?")
params.append(entity_type)
where_clause = (" WHERE " + " AND ".join(where)) if where else ""
with get_conn() as conn:
rows = conn.execute(
f"""
SELECT ts, query, entity_type, lang
FROM searches{where_clause}
ORDER BY ts DESC
LIMIT ?
""",
[*params, limit],
).fetchall()
return [dict(r) for r in rows]


def search_volume(days: int = 30) -> list[dict]:
"""Per-day hit counts, useful for spotting trends."""
cutoff = int(time.time()) - days * 86400
with get_conn() as conn:
rows = conn.execute(
"""
SELECT
date(ts, 'unixepoch') as day,
COUNT(*) as hits,
COUNT(DISTINCT ip_hash) as unique_users
FROM searches
WHERE ts >= ?
GROUP BY day
ORDER BY day DESC
""",
(cutoff,),
).fetchall()
return [dict(r) for r in rows]


# Schema + background writer come up at import time so they're ready
# before the first request lands.
init_db()
threading.Thread(target=_writer_loop, daemon=True, name="searches-writer").start()
1 change: 1 addition & 0 deletions docker-compose.beta.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ services:
- GITHUB_APP_INSTALLATION_ID=${GITHUB_APP_INSTALLATION_ID:-}
- GITHUB_APP_REPO=${GITHUB_APP_REPO:-}
- GITHUB_APP_PRIVATE_KEY_PATH=${GITHUB_APP_PRIVATE_KEY_PATH:-/secrets/knowledge-demon.private-key.pem}
- ADMIN_TOKEN=${ADMIN_TOKEN:-}
networks:
- nginx_web-network
logging:
Expand Down
4 changes: 4 additions & 0 deletions docker-compose.prod.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,10 @@ services:
# so the flip is explicit per host:
# TURSO_LOCAL_REPLICA=/data/runs-replica.db
- TURSO_LOCAL_REPLICA=${TURSO_LOCAL_REPLICA:-}
# Shared-secret gate for /api/admin/* endpoints (searches analytics,
# destructive ops later). Empty → endpoints return 503 "not
# configured" rather than silent 401s.
- ADMIN_TOKEN=${ADMIN_TOKEN:-}
networks:
- nginx_web-network
logging:
Expand Down
Loading