utksh1 · divyansha12 · May 24, 2026
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -38,7 +38,35 @@ jobs:
           python -m pip install --upgrade pip
           pip install -r backend/requirements.txt -r backend/requirements-dev.txt
       - name: Run backend tests
-        run: pytest testing/backend -q
+        run: pytest testing/backend -q -m "not benchmark"
+
+  # Performance benchmarks run in their own job; the regular backend test step
+  # deselects them with -m "not benchmark".
+  benchmark:
+    runs-on: ubuntu-latest
+    needs: [backend-lint]
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+      - name: Install backend system dependencies
+        run: sudo apt-get update && sudo apt-get install -y libcairo2-dev pkg-config
+      - name: Install backend dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r backend/requirements.txt -r backend/requirements-dev.txt
+      - name: Run benchmarks
+        id: run_benchmarks
+        run: python3 scripts/run_benchmarks.py
+        # Non-blocking: a non-zero exit from the runner does not fail the job;
+        # it is surfaced by the warning annotation step at the end instead.
+        continue-on-error: true
+      - name: Upload benchmark results artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: benchmark-results
+          # Written to the repository root by the benchmark run.
+          path: benchmark_results.json
+      - name: Add warning annotation on failure
+        # `outcome` is the step's result before continue-on-error is applied,
+        # so it still reads 'failure' when the benchmark script exited non-zero.
+        if: steps.run_benchmarks.outcome == 'failure'
+        run: |
+          echo "::warning::Performance benchmark thresholds exceeded or benchmarks failed to run. Check the job logs for details."
 
   frontend-checks:
     runs-on: ubuntu-latest

diff --git a/.gitignore b/.gitignore
@@ -35,13 +35,13 @@ venv_tests/
 *.swo
 *~
 .DS_Store
-
 # Testing
 .pytest_cache/
 .coverage
 htmlcov/
 *.cover
 .hypothesis/
+benchmark_results.json
 
 # Database
 *.db

diff --git a/pyproject.toml b/pyproject.toml
@@ -32,3 +32,10 @@ build-backend = "setuptools.build_meta"
 [tool.setuptools.packages.find]
 where = ["."]
 include = ["backend*"]
+
+# Pytest settings used by both the regular backend tests and the benchmarks.
+[tool.pytest.ini_options]
+# Registers the custom marker applied with @pytest.mark.benchmark.
+markers = [
+    "benchmark: performance benchmark tests (deselect with '-m not benchmark')",
+]
+# pytest-asyncio option: in strict mode async tests are expected to carry an
+# explicit @pytest.mark.asyncio marker.
+asyncio_mode = "strict"
+# Collect bench_*.py modules in addition to the usual test_*.py files.
+python_files = ["test_*.py", "bench_*.py"]
diff --git a/scripts/run_benchmarks.py b/scripts/run_benchmarks.py
@@ -0,0 +1,115 @@
+#!/usr/bin/env python3
+"""
+Benchmark Runner Script for SecuScan.
+Runs the performance benchmarks, compares results against thresholds,
+and exits non-zero if any regressions are detected.
+"""
+
+import json
+import os
+import subprocess
+import sys
+from pathlib import Path
+
+# ANSI color codes
+GREEN = "\033[92m"
+RED = "\033[91m"
+BOLD = "\033[1m"
+RESET = "\033[0m"
+
+
+def main():
+    """Run the benchmark suite and check recorded metrics against thresholds.
+
+    Loads ``testing/backend/benchmarks/thresholds.json``, runs pytest on the
+    benchmark directory with ``-m benchmark``, then reads
+    ``benchmark_results.json`` from the repository root and prints a
+    per-metric report. Metrics whose name contains ``"throughput"`` must be
+    at or above their threshold; every other metric must be at or below it.
+
+    Always terminates via ``sys.exit``: status 1 if the thresholds or results
+    file is missing, a threshold metric is absent or out of bounds, or the
+    pytest run itself exited non-zero; status 0 otherwise.
+    """
+    root_dir = Path(__file__).resolve().parents[1]
+    thresholds_path = (
+        root_dir / "testing" / "backend" / "benchmarks" / "thresholds.json"
+    )
+    results_path = root_dir / "benchmark_results.json"
+
+    # 1. Load thresholds
+    if not thresholds_path.exists():
+        print(f"{RED}Error: Thresholds file not found at {thresholds_path}{RESET}")
+        sys.exit(1)
+
+    with open(thresholds_path, encoding="utf-8") as f:
+        thresholds = json.load(f)
+
+    # Remove stale results from a previous run so old numbers are never
+    # reported as fresh. Removal stays best-effort, but a failure is reported
+    # instead of being silently ignored.
+    try:
+        results_path.unlink()
+    except FileNotFoundError:
+        pass
+    except OSError as exc:
+        print(
+            f"{RED}Warning: could not remove stale results file {results_path}: {exc}{RESET}"
+        )
+
+    # 2. Run pytest benchmarks
+    print(f"{BOLD}Running SecuScan Performance Benchmarks...{RESET}\n")
+    cmd = [
+        sys.executable,
+        "-m",
+        "pytest",
+        str(root_dir / "testing" / "backend" / "benchmarks"),
+        "-m",
+        "benchmark",
+        "-v",
+        "-s",
+    ]
+
+    # Output is not captured: pytest writes straight to this process's
+    # stdout/stderr. The exit code is checked after the report is printed.
+    result = subprocess.run(cmd, cwd=str(root_dir))
+
+    # 3. Read results
+    if not results_path.exists():
+        print(f"\n{RED}Error: Benchmark run did not produce {results_path}{RESET}")
+        sys.exit(1)
+
+    with open(results_path, encoding="utf-8") as f:
+        results = json.load(f)
+
+    # 4. Compare results against thresholds
+    print(f"\n{BOLD}=== Performance Benchmark Report ==={RESET}\n")
+    print(
+        f"{'Benchmark Metric':<45} | {'Measured':<12} | {'Threshold':<12} | {'Status':<6}"
+    )
+    print("-" * 82)
+
+    has_regression = False
+    for metric, threshold in thresholds.items():
+        if metric not in results:
+            print(f"{metric:<45} | {'N/A':<12} | {threshold:<12} | {RED}MISSING{RESET}")
+            has_regression = True
+            continue
+
+        value = results[metric]
+
+        # Throughput metrics are higher-is-better; everything else is treated
+        # as a latency in milliseconds (lower-is-better).
+        higher_is_better = "throughput" in metric
+        passed = value >= threshold if higher_is_better else value <= threshold
+        unit = "calls/s" if higher_is_better else "ms"
+        status_str = f"{GREEN}PASS{RESET}" if passed else f"{RED}FAIL{RESET}"
+
+        val_fmt = f"{value:.2f} {unit}"
+        thresh_fmt = f"{threshold:.2f} {unit}"
+
+        if not passed:
+            has_regression = True
+
+        print(f"{metric:<45} | {val_fmt:<12} | {thresh_fmt:<12} | {status_str:<6}")
+
+    print("\n" + "=" * 82 + "\n")
+
+    if has_regression:
+        print(
+            f"{RED}{BOLD}Performance regression detected! One or more metrics exceeded thresholds.{RESET}"
+        )
+        sys.exit(1)
+
+    # A benchmark can fail or error after its metric was recorded; do not
+    # report success when pytest itself reported a failure.
+    if result.returncode != 0:
+        print(
+            f"{RED}{BOLD}Benchmark test run failed (pytest exit code {result.returncode}) although all thresholds were met.{RESET}"
+        )
+        sys.exit(1)
+
+    print(f"{GREEN}{BOLD}All performance benchmarks passed!{RESET}")
+    sys.exit(0)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/testing/backend/benchmarks/__init__.py b/testing/backend/benchmarks/__init__.py
@@ -0,0 +1 @@
+# Benchmark suite package
diff --git a/testing/backend/benchmarks/bench_concurrent_task_start.py b/testing/backend/benchmarks/bench_concurrent_task_start.py
@@ -0,0 +1,111 @@
+import asyncio
+import statistics
+import time
+import pytest
+from testing.backend.benchmarks.conftest import load_threshold
+
+
+@pytest.mark.benchmark
+@pytest.mark.asyncio
+async def test_10_concurrent_task_creates(bench_env, record_benchmark):
+    """Benchmark ten ``create_task`` calls issued concurrently.
+
+    The wall-clock time of the whole batch is recorded as
+    ``concurrent_task_creates_10_total_ms`` and must stay below the threshold
+    loaded for that metric. Per-call mean, P50 and P95 latencies are printed
+    for information only.
+    """
+    task_executor = bench_env["executor"]
+    plugin_id = "icmp_ping"
+    inputs = {"target": "127.0.0.1"}
+
+    latencies = []
+
+    async def _timed_create():
+        # Time one create_task call and append its latency in milliseconds.
+        began = time.perf_counter()
+        task_id = await task_executor.create_task(plugin_id, inputs)
+        latencies.append((time.perf_counter() - began) * 1000.0)
+        return task_id
+
+    batch_began = time.perf_counter()
+    await asyncio.gather(*(_timed_create() for _ in range(10)))
+    total_time_ms = (time.perf_counter() - batch_began) * 1000.0
+
+    mean_lat = statistics.mean(latencies)
+    p50_lat = statistics.median(latencies)
+
+    # Index-based P95 over the sorted samples.
+    ordered = sorted(latencies)
+    if len(ordered) >= 2:
+        p95_lat = ordered[int(len(ordered) * 0.95)]
+    else:
+        p95_lat = ordered[-1]
+
+    # Only the batch total is recorded and asserted on.
+    record_benchmark("concurrent_task_creates_10_total_ms", total_time_ms)
+    threshold_total = load_threshold("concurrent_task_creates_10_total_ms")
+
+    print(
+        f"\n[bench_10_concurrent_task_creates] Total time: {total_time_ms:.2f}ms (threshold: {threshold_total}ms)"
+    )
+    print(f"Mean: {mean_lat:.2f}ms, P50: {p50_lat:.2f}ms, P95: {p95_lat:.2f}ms")
+
+    assert total_time_ms < threshold_total, (
+        f"10 concurrent task creates took {total_time_ms:.2f}ms, threshold: {threshold_total}ms"
+    )
+
+
+@pytest.mark.benchmark
+@pytest.mark.asyncio
+async def test_20_sequential_task_creates(bench_env, record_benchmark):
+    """Benchmark twenty ``create_task`` calls awaited one after another.
+
+    The mean per-call latency is recorded as
+    ``sequential_task_creates_mean_ms`` and must stay below the threshold
+    loaded for that metric.
+    """
+    task_executor = bench_env["executor"]
+    plugin_id = "icmp_ping"
+    inputs = {"target": "127.0.0.1"}
+
+    async def _timed_create():
+        # Return the latency of a single create_task call in milliseconds.
+        began = time.perf_counter()
+        await task_executor.create_task(plugin_id, inputs)
+        return (time.perf_counter() - began) * 1000.0
+
+    # Each call is awaited before the next one starts.
+    latencies = [await _timed_create() for _ in range(20)]
+    mean_lat = statistics.mean(latencies)
+
+    record_benchmark("sequential_task_creates_mean_ms", mean_lat)
+    threshold_mean = load_threshold("sequential_task_creates_mean_ms")
+
+    print(
+        f"\n[bench_20_sequential_task_creates] Mean latency: {mean_lat:.2f}ms (threshold: {threshold_mean}ms)"
+    )
+
+    assert mean_lat < threshold_mean, (
+        f"Mean sequential task create took {mean_lat:.2f}ms, threshold: {threshold_mean}ms"
+    )
+
+
+@pytest.mark.benchmark
+@pytest.mark.asyncio
+async def test_concurrent_slot_saturation(bench_env, record_benchmark):
+    """Benchmark how quickly a saturated concurrency limiter rejects a task.
+
+    Acquires three slots on the module-level ``concurrent_limiter``, then
+    times a fourth ``acquire`` call, which must be rejected. The rejection
+    latency is recorded as ``slot_rejection_ms`` and must stay below the
+    threshold loaded for that metric (guards against a spin-wait regression).
+
+    NOTE(review): assumes the limiter is configured for exactly three
+    concurrent tasks - confirm against the limiter's configuration.
+    """
+    from backend.secuscan.ratelimit import concurrent_limiter
+
+    async def _reset_limiter():
+        # Drop every tracked task while holding the limiter's own lock.
+        async with concurrent_limiter.lock:
+            concurrent_limiter.running_tasks.clear()
+
+    await _reset_limiter()
+    try:
+        # Fill slots
+        for task_id in ("task-1", "task-2", "task-3"):
+            assert (await concurrent_limiter.acquire(task_id)) == (True, "")
+
+        # Try acquiring 4th slot, measure time
+        start = time.perf_counter()
+        acquired, _ = await concurrent_limiter.acquire("task-4")
+        elapsed_ms = (time.perf_counter() - start) * 1000.0
+
+        # Check the rejection first so a latency is never recorded for an
+        # acquire that was wrongly granted.
+        assert not acquired, "Should not be able to acquire 4th slot"
+
+        # Record metric
+        record_benchmark("slot_rejection_ms", elapsed_ms)
+
+        threshold_rejection = load_threshold("slot_rejection_ms")
+
+        print(
+            f"\n[bench_concurrent_slot_saturation] Slot rejection elapsed: {elapsed_ms:.4f}ms (threshold: {threshold_rejection}ms)"
+        )
+
+        assert elapsed_ms < threshold_rejection, (
+            f"Slot rejection took {elapsed_ms:.2f}ms, threshold: {threshold_rejection}ms"
+        )
+    finally:
+        # The limiter is shared module state: release the slots taken here,
+        # even on failure, so later tests do not start saturated.
+        await _reset_limiter()