diff --git a/backend/secuscan/database.py b/backend/secuscan/database.py index 8ff8775e..58a18c46 100644 --- a/backend/secuscan/database.py +++ b/backend/secuscan/database.py @@ -171,10 +171,34 @@ async def _create_schema(self): last_run_at TIMESTAMP ); + -- Tasks indexes (existing) CREATE INDEX IF NOT EXISTS idx_tasks_created ON tasks(created_at); CREATE INDEX IF NOT EXISTS idx_tasks_target ON tasks(target); CREATE INDEX IF NOT EXISTS idx_tasks_status ON tasks(status); CREATE INDEX IF NOT EXISTS idx_tasks_plugin ON tasks(plugin_id); + -- Composite index for dashboard running tasks query + CREATE INDEX IF NOT EXISTS idx_tasks_status_created ON tasks(status, created_at DESC); + + -- Findings indexes (new) + CREATE INDEX IF NOT EXISTS idx_findings_severity ON findings(severity); + CREATE INDEX IF NOT EXISTS idx_findings_task_id ON findings(task_id); + CREATE INDEX IF NOT EXISTS idx_findings_discovered_at ON findings(discovered_at DESC); + CREATE INDEX IF NOT EXISTS idx_findings_plugin_id ON findings(plugin_id); + CREATE INDEX IF NOT EXISTS idx_findings_target ON findings(target); + -- Composite index for severity counting by task + CREATE INDEX IF NOT EXISTS idx_findings_task_severity ON findings(task_id, severity); + + -- Reports indexes (new) + CREATE INDEX IF NOT EXISTS idx_reports_task_id ON reports(task_id); + CREATE INDEX IF NOT EXISTS idx_reports_generated_at ON reports(generated_at DESC); + CREATE INDEX IF NOT EXISTS idx_reports_status ON reports(status); + + -- Audit log indexes (new) + CREATE INDEX IF NOT EXISTS idx_audit_timestamp ON audit_log(timestamp DESC); + CREATE INDEX IF NOT EXISTS idx_audit_event_type ON audit_log(event_type); + CREATE INDEX IF NOT EXISTS idx_audit_task_id ON audit_log(task_id); + + -- Workflows index (existing) CREATE INDEX IF NOT EXISTS idx_workflows_enabled ON workflows(enabled); """ ) diff --git a/backend/secuscan/migrations/001_add_performance_indexes.sql b/backend/secuscan/migrations/001_add_performance_indexes.sql new file 
mode 100644
index 00000000..b0de0d11
--- /dev/null
+++ b/backend/secuscan/migrations/001_add_performance_indexes.sql
@@ -0,0 +1,31 @@
+-- Migration: 001_add_performance_indexes
+-- Adds indexes to findings, reports, and audit_log, plus a composite index on
+-- tasks. Each statement uses IF NOT EXISTS, so re-running the file is a no-op.
+--
+-- Query plans improved:
+--   - Dashboard severity counts: full table scan → indexed GROUP BY on findings.severity
+--   - Dashboard running tasks: filter and ORDER BY both served by idx_tasks_status_created (idx_tasks_status alone cannot supply the ordering)
+--   - Findings list: unindexed ORDER BY → idx_findings_discovered_at
+--   - Reports list: unindexed ORDER BY → idx_reports_generated_at
+--   - Audit log lookups: unindexed → idx_audit_timestamp, idx_audit_event_type
+
+-- Tasks: one index for `WHERE status = ? ORDER BY created_at DESC`
+CREATE INDEX IF NOT EXISTS idx_tasks_status_created ON tasks(status, created_at DESC);
+
+-- Findings: single-column lookups plus a (task_id, severity) composite for per-task severity counts
+CREATE INDEX IF NOT EXISTS idx_findings_severity ON findings(severity);
+CREATE INDEX IF NOT EXISTS idx_findings_task_id ON findings(task_id);
+CREATE INDEX IF NOT EXISTS idx_findings_discovered_at ON findings(discovered_at DESC);
+CREATE INDEX IF NOT EXISTS idx_findings_plugin_id ON findings(plugin_id);
+CREATE INDEX IF NOT EXISTS idx_findings_target ON findings(target);
+CREATE INDEX IF NOT EXISTS idx_findings_task_severity ON findings(task_id, severity);
+
+-- Reports: lookup by task, newest-first listing, and status filter
+CREATE INDEX IF NOT EXISTS idx_reports_task_id ON reports(task_id);
+CREATE INDEX IF NOT EXISTS idx_reports_generated_at ON reports(generated_at DESC);
+CREATE INDEX IF NOT EXISTS idx_reports_status ON reports(status);
+
+-- Audit log: newest-first listing, event-type filter, and lookup by task
+CREATE INDEX IF NOT EXISTS idx_audit_timestamp ON audit_log(timestamp DESC);
+CREATE INDEX IF NOT EXISTS idx_audit_event_type ON audit_log(event_type);
+CREATE INDEX IF NOT EXISTS idx_audit_task_id ON audit_log(task_id);
\ No newline at end of file
diff --git a/backend/secuscan/routes.py b/backend/secuscan/routes.py
index f1d53063..81175507 100644
--- a/backend/secuscan/routes.py
+++ b/backend/secuscan/routes.py
@@ -592,8 +592,15 @@ async def
build():
         db = await get_db()

         # Get data
-        raw_findings = await db.fetchall("SELECT * FROM findings ORDER BY discovered_at DESC")
-        findings = parse_json_fields(raw_findings, ["metadata_json"])
+        # Push severity aggregation to DB — avoids full table scan in Python
+        severity_rows = await db.fetchall(
+            """
+            SELECT severity, COUNT(*) AS cnt
+            FROM findings
+            GROUP BY severity
+            """
+        )
+        severity_counts = {row["severity"]: row["cnt"] for row in severity_rows}

         task_stats = await db.fetchone(
             """
@@ -605,27 +612,37 @@ async def build():
             """
         )

-        critical_findings: int = sum(bool(item.get("severity") == "critical")
-                                     for item in findings)
-        high_findings: int = sum(bool(item.get("severity") == "high")
-                                 for item in findings)
-        medium_findings: int = sum(bool(item.get("severity") == "medium")
-                                   for item in findings)
-        low_findings: int = sum(bool(item.get("severity") == "low")
-                                for item in findings)
-        info_findings: int = sum(bool(item.get("severity") == "info")
-                                 for item in findings)
+        total_findings_row = await db.fetchone("SELECT COUNT(*) AS total FROM findings")
+        total_findings = total_findings_row["total"] if total_findings_row else 0
+
+        critical_findings: int = severity_counts.get("critical", 0)
+        high_findings: int = severity_counts.get("high", 0)
+        medium_findings: int = severity_counts.get("medium", 0)
+        low_findings: int = severity_counts.get("low", 0)
+        info_findings: int = severity_counts.get("info", 0)

-        recent_findings: List[Dict] = findings[:5]
+        # Fetch only the 5 most recent findings — not the entire table.
+        # task_id/plugin_id stay in the column list: the SELECT * this replaces
+        # exposed them in recent_findings.
+        recent_rows = await db.fetchall(
+            """
+            SELECT id, task_id, plugin_id, title, category, severity, target,
+                   description, remediation, proof, cvss, cve, discovered_at, metadata_json
+            FROM findings
+            ORDER BY discovered_at DESC
+            LIMIT 5
+            """
+        )
+        recent_findings: List[Dict] = parse_json_fields(recent_rows, ["metadata_json"])

         return {
-            "total_findings": len(findings),
+            "total_findings": total_findings,
             "critical_findings": critical_findings,
             "high_findings": high_findings,
             "medium_findings": medium_findings,
             "low_findings": low_findings,
             "info_findings": info_findings,
-            "last_scan_time": findings[0].get("discovered_at") if findings else None,
+            "last_scan_time": recent_findings[0].get("discovered_at") if recent_findings else None,
             "recent_findings": recent_findings,
             "scan_activity": {
                 "total": int(task_stats["total"]) if task_stats and task_stats.get("total") is not None else 0,
diff --git a/scripts/benchmark_db.py b/scripts/benchmark_db.py
new file mode 100644
index 00000000..3ae1be63
--- /dev/null
+++ b/scripts/benchmark_db.py
@@ -0,0 +1,189 @@
+#!/usr/bin/env python3
+"""
+Benchmark: dashboard query performance against the indexed schema.
+
+Usage:
+    python scripts/benchmark_db.py
+
+Seeds a temporary SQLite database with 10,000 findings and 1,000 tasks,
+then measures query execution time for the dashboard hot paths.
+
+Prints the EXPLAIN QUERY PLAN rows and avg/min/max timings for each query.
+"""
+
+import asyncio
+import json
+import sqlite3
+import sys
+import tempfile
+import time
+import uuid
+from contextlib import closing
+from datetime import datetime, timedelta, timezone
+from pathlib import Path
+
+# Add repo root to path
+repo_root = Path(__file__).resolve().parents[1]
+sys.path.insert(0, str(repo_root))
+
+
+SEVERITIES = ["critical", "high", "medium", "low", "info"]
+STATUSES = ["queued", "running", "completed", "failed"]
+
+
+def seed_database(db_path: str, findings_count: int = 10_000, tasks_count: int = 1_000) -> None:
+    """Seed the database with realistic load.
+
+    Row ``i`` is stamped ``i`` seconds before one shared reference time, so
+    timestamps are distinct and row 0 is the newest.
+    """
+    print(f"Seeding {findings_count} findings and {tasks_count} tasks...")
+    now = datetime.now(timezone.utc)
+
+    # closing() releases the connection even when an INSERT raises.
+    with closing(sqlite3.connect(db_path)) as conn:
+        # Seed tasks
+        for i in range(tasks_count):
+            conn.execute(
+                """
+                INSERT INTO tasks
+                (id, plugin_id, tool_name, target, status, created_at, inputs_json)
+                VALUES (?, ?, ?, ?, ?, ?, ?)
+                """,
+                (
+                    str(uuid.uuid4()),
+                    "http_inspector",
+                    "http_inspector",
+                    f"192.168.1.{i % 255}",
+                    STATUSES[i % len(STATUSES)],
+                    (now - timedelta(seconds=i)).isoformat(),
+                    json.dumps({"target": f"192.168.1.{i % 255}"}),
+                ),
+            )
+
+        # Seed findings
+        for i in range(findings_count):
+            conn.execute(
+                """
+                INSERT INTO findings
+                (id, task_id, plugin_id, title, category, severity,
+                 target, description, remediation, discovered_at, metadata_json)
+                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+                """,
+                (
+                    str(uuid.uuid4()),
+                    str(uuid.uuid4()),
+                    "http_inspector",
+                    f"Finding {i}",
+                    "web",
+                    SEVERITIES[i % len(SEVERITIES)],
+                    f"192.168.1.{i % 255}",
+                    f"Description {i}",
+                    "Apply patch",
+                    (now - timedelta(seconds=i)).isoformat(),
+                    json.dumps({}),
+                ),
+            )
+
+        conn.commit()
+    print("Seeding complete.\n")
+
+
+def benchmark_query(label: str, db_path: str, query: str, params: tuple = (), runs: int = 10) -> float:
+    """Run a query N times and report average execution time.
+
+    Prints avg/min/max in milliseconds and returns the average in milliseconds.
+    Raises ValueError when ``runs`` is below 1 (no samples to average).
+    """
+    if runs < 1:
+        raise ValueError("runs must be at least 1")
+    times = []
+    with closing(sqlite3.connect(db_path)) as conn:
+        for _ in range(runs):
+            start = time.perf_counter()
+            conn.execute(query, params).fetchall()
+            times.append(time.perf_counter() - start)
+    avg_ms = (sum(times) / len(times)) * 1000
+    min_ms = min(times) * 1000
+    max_ms = max(times) * 1000
+    print(f" {label}")
+    print(f" avg={avg_ms:.2f}ms min={min_ms:.2f}ms max={max_ms:.2f}ms")
+    return avg_ms
+
+
+def explain_query(label: str, db_path: str, query: str) -> None:
+    """Print SQLite EXPLAIN QUERY PLAN output for a query."""
+    with closing(sqlite3.connect(db_path)) as conn:
+        plan = conn.execute(f"EXPLAIN QUERY PLAN {query}").fetchall()
+    print(f"\n EXPLAIN QUERY PLAN — {label}")
+    for row in plan:
+        print(f" {row}")
+
+
+def main() -> None:
+    """Create a throwaway database, seed it, then print query plans and timings."""
+    with tempfile.TemporaryDirectory() as tmp:
+        db_path = str(Path(tmp) / "benchmark.db")
+
+        # Initialize schema (with indexes)
+        from backend.secuscan.database import Database
+        asyncio.run(Database(db_path).connect())
+
+        seed_database(db_path, findings_count=10_000, tasks_count=1_000)
+
+        print("=" * 60)
+        print("QUERY PLAN ANALYSIS (SQLite EXPLAIN QUERY PLAN)")
+        print("=" * 60)
+
+        explain_query(
+            "Severity GROUP BY (optimized dashboard)",
+            db_path,
+            "SELECT severity, COUNT(*) AS cnt FROM findings GROUP BY severity",
+        )
+        explain_query(
+            "Recent findings LIMIT 5",
+            db_path,
+            "SELECT id, title, severity, discovered_at FROM findings ORDER BY discovered_at DESC LIMIT 5",
+        )
+        explain_query(
+            "Running tasks (composite index)",
+            db_path,
+            "SELECT id, tool_name, target FROM tasks WHERE status = 'running' ORDER BY created_at DESC LIMIT 5",
+        )
+
+        print("\n")
+        print("=" * 60)
+        print("BENCHMARK RESULTS (10,000 findings, 1,000 tasks, 10 runs)")
+        print("=" * 60)
+
+        benchmark_query(
+            "Severity GROUP BY (optimized — DB aggregation)",
+            db_path,
+            "SELECT severity, COUNT(*) AS cnt FROM findings GROUP BY severity",
+        )
+        benchmark_query(
+            "Recent findings LIMIT 5",
+            db_path,
+            "SELECT id, title, severity, discovered_at FROM findings ORDER BY discovered_at DESC LIMIT 5",
+        )
+        benchmark_query(
+            "Running tasks with composite index",
+            db_path,
+            "SELECT id, tool_name, target, status, created_at FROM tasks WHERE status = 'running' ORDER BY created_at DESC LIMIT 5",
+        )
+        benchmark_query(
+            "Total findings COUNT(*)",
+            db_path,
+            "SELECT COUNT(*) FROM findings",
+        )
+        benchmark_query(
+            "Task stats GROUP BY status",
+            db_path,
+            "SELECT status, COUNT(*) FROM tasks GROUP BY status",
+        )
+
+        print("\nBenchmark complete.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/testing/backend/integration/test_database_indexes.py b/testing/backend/integration/test_database_indexes.py
new file mode 100644
index 00000000..42ffdde6
--- /dev/null
+++ b/testing/backend/integration/test_database_indexes.py
@@ -0,0 +1,164 @@
+"""
+Tests for database performance indexes and optimized dashboard query.
+
+Verifies:
+- All expected indexes exist on findings, reports, audit_log, and tasks tables
+- Dashboard severity counts and totals match a seeded dataset
+- Dashboard returns exactly the 5 most recent findings, newest first
+- Dashboard handles an empty findings table
+"""
+
+import asyncio
+import json
+import sqlite3
+import uuid
+from contextlib import closing
+from datetime import datetime, timedelta, timezone
+
+import pytest
+
+from backend.secuscan.config import settings
+from backend.secuscan.database import init_db
+
+
+SEVERITIES = ("critical", "high", "medium", "low", "info")
+
+# Every index created by the performance-index change; each gets its own test case.
+EXPECTED_INDEXES = (
+    "idx_tasks_status_created",
+    "idx_findings_severity",
+    "idx_findings_task_id",
+    "idx_findings_discovered_at",
+    "idx_findings_plugin_id",
+    "idx_findings_target",
+    "idx_findings_task_severity",
+    "idx_reports_task_id",
+    "idx_reports_generated_at",
+    "idx_reports_status",
+    "idx_audit_timestamp",
+    "idx_audit_event_type",
+    "idx_audit_task_id",
+)
+
+
+# ── Helpers ───────────────────────────────────────────────────────────────────
+
+def seed_findings(db_path: str, count: int = 100, severities=SEVERITIES) -> None:
+    """Insert `count` findings into the test DB, cycling through `severities`.
+
+    Row ``i`` is stamped ``i`` seconds before a shared reference time, so row 0
+    is the newest finding.
+    """
+    now = datetime.now(timezone.utc)
+    rows = [
+        (
+            str(uuid.uuid4()),
+            str(uuid.uuid4()),
+            "test_plugin",
+            f"Finding {i}",
+            "test",
+            severities[i % len(severities)],
+            "192.168.1.1",
+            f"Description {i}",
+            "Fix it",
+            (now - timedelta(seconds=i)).isoformat(),
+            json.dumps({}),
+        )
+        for i in range(count)
+    ]
+    # closing() releases the connection even when the INSERT raises.
+    with closing(sqlite3.connect(db_path)) as conn:
+        conn.executemany(
+            """
+            INSERT INTO findings
+            (id, task_id, plugin_id, title, category, severity,
+             target, description, remediation, discovered_at, metadata_json)
+            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+            """,
+            rows,
+        )
+        conn.commit()
+
+
+def get_index_names(db_path: str) -> set:
+    """Return all index names present in the SQLite database."""
+    with closing(sqlite3.connect(db_path)) as conn:
+        cursor = conn.execute("SELECT name FROM sqlite_master WHERE type='index'")
+        return {row[0] for row in cursor.fetchall()}
+
+
+# ── Index existence tests ─────────────────────────────────────────────────────
+
+class TestDatabaseIndexes:
+
+    @pytest.mark.parametrize("index_name", EXPECTED_INDEXES)
+    def test_index_exists(self, setup_test_environment, index_name):
+        """Every performance index must exist once the schema is initialized."""
+        asyncio.run(init_db(settings.database_path))
+        indexes = get_index_names(settings.database_path)
+        assert index_name in indexes, f"Missing {index_name} after init_db()"
+
+
+# ── Dashboard query correctness tests ─────────────────────────────────────────
+
+class TestDashboardQueryCorrectness:
+
+    def test_dashboard_severity_counts_correct(self, test_client, setup_test_environment):
+        """Dashboard must return correct severity counts from seeded findings."""
+        seed_findings(settings.database_path, count=50)
+
+        r = test_client.get("/api/v1/dashboard/summary")
+        assert r.status_code == 200
+        data = r.json()
+
+        # 50 findings, 5 severities, 10 each
+        assert data["total_findings"] == 50
+        assert data["critical_findings"] == 10
+        assert data["high_findings"] == 10
+        assert data["medium_findings"] == 10
+        assert data["low_findings"] == 10
+        assert data["info_findings"] == 10
+
+    def test_dashboard_recent_findings_limit(self, test_client, setup_test_environment):
+        """Dashboard must return exactly the 5 newest findings, newest first."""
+        seed_findings(settings.database_path, count=200)
+
+        r = test_client.get("/api/v1/dashboard/summary")
+        assert r.status_code == 200
+        data = r.json()
+
+        recent = data["recent_findings"]
+        assert len(recent) == 5, (
+            "Dashboard must return exactly 5 recent findings when more exist"
+        )
+        stamps = [item["discovered_at"] for item in recent]
+        assert stamps == sorted(stamps, reverse=True), "recent_findings must be newest first"
+        assert data["last_scan_time"] == stamps[0]
+
+    def test_dashboard_empty_findings(self, test_client, setup_test_environment):
+        """Dashboard must handle zero findings without errors."""
+        r = test_client.get("/api/v1/dashboard/summary")
+        assert r.status_code == 200
+        data = r.json()
+
+        assert data["total_findings"] == 0
+        assert data["critical_findings"] == 0
+        assert data["recent_findings"] == []
+        assert data["last_scan_time"] is None
+
+    def test_dashboard_severity_counts_with_single_severity(
+        self, test_client, setup_test_environment
+    ):
+        """Dashboard must correctly count when all findings share one severity."""
+        seed_findings(settings.database_path, count=15, severities=("critical",))
+
+        r = test_client.get("/api/v1/dashboard/summary")
+        assert r.status_code == 200
+        data = r.json()
+
+        assert data["total_findings"] == 15
+        assert data["critical_findings"] == 15
+        assert data["high_findings"] == 0
+        assert data["medium_findings"] == 0
+        assert data["low_findings"] == 0
+        assert data["info_findings"] == 0