Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 76 additions & 0 deletions generative/eval_dashboard.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,15 @@
def _ver_sort_key(v: str) -> tuple:
return tuple(int(n) for n in re.findall(r"\d+", v))

def is_foss_version(version) -> bool:
    """Return True for versions of the non-generative pipeline.

    The prefix in real use is ``extractive-`` (the original note cites
    ``EXTRACTIVE_VERSION = "extractive-v0.2.0"`` in extractive/orchestrator.py);
    ``foss-`` is also accepted as a forward-compatible alias. Those pipelines
    are a different architecture than the generative one, so their versions do
    not belong in the same version trend (#36). Matching ``foss-`` alone hit
    nothing real, which made the separation a no-op (cross-model review
    2026-06-23).

    Falsy input (``None``, ``""``) is never a foss version; any other value is
    converted with ``str()`` before the prefix check.
    """
    if not version:
        return False
    label = str(version)
    return label.startswith("extractive-") or label.startswith("foss-")

def _latest_version(ver_map: dict) -> str:
    """Return the key of ``ver_map`` that sorts last under ``_ver_sort_key``.

    Raises ``IndexError`` when ``ver_map`` is empty.
    """
    candidates = list(ver_map.keys())
    candidates.sort(key=_ver_sort_key)
    return candidates[-1]

Expand Down Expand Up @@ -436,6 +445,31 @@ def _chart_longitudinal(log_data: dict) -> dict:
return {"versions": versions, "datasets": datasets}


_DELTA_MIN_N = 20  # below N=20 no better/worse verdict is given (guards against apophenia)


def version_delta(kpi_trend: dict, metric: str, min_n: int = _DELTA_MIN_N) -> dict:
    """Compute the newest-vs-previous-version delta for one KPI metric.

    The metric array ``kpi_trend[metric]`` and the sample-size array
    ``kpi_trend["n"]`` are read from the end: the last entry is treated as the
    newest version and the one before it as the previous version. Callers are
    therefore expected to pass arrays ordered ascending and parallel to
    ``kpi_trend["versions"]``.

    Args:
        kpi_trend: Mapping of metric names to per-version value lists; a
            missing or empty list is treated as "no data".
        metric: Key of the metric list to compare.
        min_n: Minimum sample size both versions need for the delta to count
            as reliable. Defaults to ``_DELTA_MIN_N``.

    Returns:
        A dict with ``latest`` and ``prev`` (the raw values, or ``None`` when
        absent), ``delta`` (``latest - prev`` rounded to 4 decimals, or ``None``
        when either value is missing) and ``reliable`` (``True`` only when a
        delta exists and both sample sizes are at least ``min_n``; otherwise
        the delta should be regarded as noise).
    """
    values = kpi_trend.get(metric) or []
    ns = kpi_trend.get("n") or []

    latest = values[-1] if values else None
    prev = values[-2] if len(values) >= 2 else None
    n_latest = ns[-1] if ns else None
    n_prev = ns[-2] if len(ns) >= 2 else None

    if latest is None or prev is None:
        delta = None
    else:
        delta = round(latest - prev, 4)

    # A missing sample size counts as 0 and therefore never passes the guard.
    reliable = (
        delta is not None
        and (n_latest or 0) >= min_n
        and (n_prev or 0) >= min_n
    )
    return {"latest": latest, "prev": prev, "delta": delta, "reliable": reliable}


def _chart_tokens(runs: list[dict]) -> dict:
return {
"labels": [r["date"] for r in runs],
Expand All @@ -447,6 +481,47 @@ def _chart_tokens(runs: list[dict]) -> dict:
}


_SCALING_RECENT_KEEP = 10  # the newest N versions stay undimmed (#36 P2)


def mark_scaling_recency(points: list[dict], keep: int = _SCALING_RECENT_KEEP) -> list[dict]:
    """Return a new point list with a ``recent`` flag on every point.

    Points whose ``ver`` is among the newest ``keep`` distinct versions (ordered
    by ``_ver_sort_key``) get ``recent=True``; all others, including points
    without a version, get ``recent=False``. This lets the client dim broken
    early-version eras instead of mixing them unfiltered into the PDF-length
    scaling chart (#36 P2). The input dicts are copied, not mutated.
    """
    distinct = {pt["ver"] for pt in points if pt.get("ver")}
    ordered = sorted(distinct, key=_ver_sort_key)

    # A non-positive ``keep`` marks nothing as recent (``ordered[-0:]`` would
    # otherwise select every version).
    if keep > 0:
        recent_set = set(ordered[-keep:])
    else:
        recent_set = set()

    marked = []
    for pt in points:
        flagged = dict(pt)
        flagged["recent"] = pt.get("ver") in recent_set
        marked.append(flagged)
    return marked


def _chart_tokens_by_version(runs: list[dict]) -> dict:
    """Aggregate token composition and median duration per pipeline version.

    Token counts are summed and ``duration_min`` values are reduced to their
    median (rounded to one decimal) for each version. Versions are returned in
    ascending order (newest on the right). Runs without a version and runs
    whose version is a foss/extractive one are left out. This replaces the
    chronological per-run axis, which was unreadable with many runs and did
    not support comparison (#36, E6).

    Returns:
        A dict of parallel lists keyed ``labels``, ``tokens_in``,
        ``tokens_out``, ``tokens_cache`` and ``duration_min``; a version with
        no recorded duration gets ``None`` in ``duration_min``.
    """
    token_fields = (("in", "tokens_in"), ("out", "tokens_out"), ("cache", "tokens_cache"))
    buckets: dict = {}

    for run in runs:
        version = run.get("ver") or run.get("pipeline_version")
        if not version:
            continue
        if is_foss_version(version):
            continue
        if version not in buckets:
            buckets[version] = {"in": 0, "out": 0, "cache": 0, "dur": []}
        bucket = buckets[version]
        for slot, field in token_fields:
            # Missing and None token counts both contribute 0.
            bucket[slot] += run.get(field, 0) or 0
        minutes = run.get("duration_min")
        if minutes is not None:
            bucket["dur"].append(minutes)

    ordered = sorted(buckets, key=_ver_sort_key)

    medians = []
    for version in ordered:
        samples = buckets[version]["dur"]
        medians.append(round(_median(samples), 1) if samples else None)

    return {
        "labels": ordered,
        "tokens_in": [buckets[version]["in"] for version in ordered],
        "tokens_out": [buckets[version]["out"] for version in ordered],
        "tokens_cache": [buckets[version]["cache"] for version in ordered],
        "duration_min": medians,
    }


def _chart_scaling(all_log_runs: list[dict]) -> dict:
points = [
{
Expand All @@ -462,6 +537,7 @@ def _chart_scaling(all_log_runs: list[dict]) -> dict:
for r in all_log_runs
if r["words"] is not None
]
points = mark_scaling_recency(points)
keys = sorted({p["key"] for p in points})
return {"points": points, "keys": keys}

Expand Down
23 changes: 20 additions & 3 deletions generative/eval_dashboard_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -409,8 +409,10 @@ def build_data(eval_version: str | None = None,

# ── all_log_runs Dropdown-Optionen VOR all_log_runs-Filtern ──────
_all_pdfs_opts = sorted({r["label"] for r in all_log_runs if r.get("label")})
_all_pvers_opts = sorted({r["ver"] for r in all_log_runs if r.get("ver")},
key=lambda v: [int(x) for x in __import__("re").findall(r"\d+", v)])
_all_pvers_opts = sorted({r["ver"] for r in all_log_runs
if r.get("ver") and not D.is_foss_version(r["ver"])},
key=lambda v: [int(x) for x in __import__("re").findall(r"\d+", v)],
reverse=True) # neueste generative Version oben (foss raus, #36)

# PDF + Language + Version + Model-Filter auf all_log_runs (nach DB-Fallback)
if model:
Expand All @@ -432,6 +434,16 @@ def build_data(eval_version: str | None = None,
or (r.get("key","")).lower() in _lang_pdfs]
log_data = D._build_log_data(all_log_runs)

# foss-Pipeline (gliner/extractive) nicht mit generativer mischen:
# im ungefilterten Default-View foss ausschliessen — ueber Modell-/Versions-
# Filter bleibt foss einsehbar (#36, User-Wunsch 2026-06-19)
if not (model or pipeline_version):
all_log_runs = [r for r in all_log_runs if not D.is_foss_version(r.get("ver"))]
log_data = D._build_log_data(all_log_runs)
quality_rows = [r for r in quality_rows
if not D.is_foss_version(r.get("version") or r.get("pipeline_version"))]
token_runs = [tr for tr in token_runs if not D.is_foss_version(tr.get("ver"))]

# Log-Runs nach Version gruppiert
runs_by_version: dict = {}
for r in all_log_runs:
Expand Down Expand Up @@ -526,6 +538,11 @@ def _pooled_accept(ver: str) -> float | None:
"tokens": [round(sum(tok_by_ver.get(v,[])) / 1000, 1) if tok_by_ver.get(v) else None for v in sorted_pipeline_versions], # in M-Tokens
"cost": [round(sum(cost_by_ver.get(v, [])), 4) if cost_by_ver.get(v) else None for v in sorted_pipeline_versions],
}
# Delta neueste-vs-Vorversion pro KPI (mit N-Guard, #36 P4)
kpi_trend["deltas"] = {
m: D.version_delta(kpi_trend, m)
for m in ("hall", "cov", "n", "accept", "dur", "tokens", "cost")
}

return {
"generated_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
Expand All @@ -536,7 +553,7 @@ def _pooled_accept(ver: str) -> float | None:
"accept": D._chart_acceptance(log_data),
"scatter": _chart_scatter_versioned(quality_rows),
"long": D._chart_longitudinal(log_data),
"tokens": D._chart_tokens(token_runs),
"tokens": D._chart_tokens_by_version(token_runs),
"scaling": D._chart_scaling(all_log_runs),
"quality_by_version": quality_by_version,
"runs_by_version": runs_by_version,
Expand Down
156 changes: 156 additions & 0 deletions generative/tests/test_dashboard_followups.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
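"""Tests for the dashboard follow-ups (atomic-notes issue #36, 2026-06-19).

- P4: version_delta — newest-vs-previous-version delta per KPI, with N-guard.
- P2: scaling recency flag — mark old version eras (dim instead of mixing).
- foss/generative separation: is_foss_version recognises the `extractive-` and
  `foss-` prefixes.
- Token composition and median duration per pipeline version
  (_chart_tokens_by_version), replacing the chronological per-run axis.

NOTE: P1 (_chart_longitudinal median-over-PDFs series, anti-spaghetti) belongs
to the same issue but has no test in this module.
"""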
from __future__ import annotations

import pytest

from generative.eval_dashboard import (
version_delta, mark_scaling_recency, is_foss_version, _chart_tokens_by_version,
)


# ---------------------------------------------------------------- P4 fixtures
def _kpi_trend(**over):
"""sorted_pipeline_versions aufsteigend → neueste = letzte Position."""
base = {
"versions": ["v0.3.134", "v0.3.135"],
"hall": [12.0, 9.7],
"cov": [30.0, 35.0],
"n": [25, 22],
"accept": [50.0, 60.0],
}
base.update(over)
return base


# ------------------------------------------------------------------- P4 tests
def test_version_delta_latest_prev_and_signed_delta():
    """The newest and previous values are reported together with a signed delta."""
    result = version_delta(_kpi_trend(), "hall")
    assert (result["latest"], result["prev"]) == (9.7, 12.0)
    # The hallucination rate dropped, so the delta is negative.
    assert result["delta"] == pytest.approx(-2.3)


def test_version_delta_positive_sign_for_rising_coverage():
    """Rising coverage yields a positive delta."""
    delta = version_delta(_kpi_trend(), "cov")["delta"]
    assert delta == pytest.approx(5.0)


def test_version_delta_reliable_only_when_both_n_at_least_20():
    """``reliable`` requires n >= 20 for both the newest and the previous version."""

    def reliable(n):
        return version_delta(_kpi_trend(n=n), "hall")["reliable"]

    assert reliable([25, 22]) is True
    assert reliable([25, 5]) is False  # newest sample too small
    assert reliable([5, 25]) is False  # previous sample too small

    # Pin the threshold itself: exactly 20 passes and 19 does not, so an
    # off-by-one change to the guard (``>`` instead of ``>=``) is caught.
    assert reliable([20, 20]) is True
    assert reliable([19, 20]) is False
    assert reliable([20, 19]) is False


def test_version_delta_no_previous_version_yields_none_delta():
    """With a single version there is no predecessor, hence no delta and no verdict."""
    single_version = {"versions": ["v0.3.135"], "hall": [9.7], "n": [22]}
    outcome = version_delta(single_version, "hall")
    assert outcome["latest"] == 9.7
    for field in ("prev", "delta"):
        assert outcome[field] is None
    assert outcome["reliable"] is False


def test_version_delta_none_metric_value_yields_none_delta():
    """A ``None`` value for the newest version suppresses the delta and its verdict."""
    trend = _kpi_trend(hall=[12.0, None])
    outcome = version_delta(trend, "hall")
    for field in ("latest", "delta"):
        assert outcome[field] is None
    assert outcome["reliable"] is False


def test_version_delta_empty_trend_yields_none_delta():
    """An empty trend produces only ``None`` values and an unreliable verdict."""
    empty_trend = {"versions": [], "hall": [], "n": []}
    outcome = version_delta(empty_trend, "hall")
    for field in ("latest", "prev", "delta"):
        assert outcome[field] is None
    assert outcome["reliable"] is False


# ------------------------------------------------------------------- P2 tests
def _pt(ver, key="a", x=1000, y=4):
return {"x": x, "y": y, "key": key, "label": key, "ver": ver}


def test_mark_scaling_recency_flags_only_youngest_keep_versions():
    """Of 12 versions with keep=10, only the two oldest are flagged not recent."""
    # 12 versions; ordering must be numeric (v0.3.12 is newer than v0.3.3).
    points = [_pt(f"v0.3.{i}") for i in range(1, 13)]
    flags = {}
    for marked in mark_scaling_recency(points, keep=10):
        flags[marked["ver"]] = marked["recent"]
    # The two oldest versions are dimmed.
    assert flags["v0.3.1"] is False
    assert flags["v0.3.2"] is False
    assert flags["v0.3.3"] is True
    assert flags["v0.3.12"] is True


def test_mark_scaling_recency_all_recent_when_fewer_than_keep():
    """With fewer versions than ``keep``, every point is recent."""
    marked = mark_scaling_recency([_pt("v0.1.0"), _pt("v0.2.0")], keep=10)
    for point in marked:
        assert point["recent"]


def test_mark_scaling_recency_multiple_points_per_version():
    """Recency is decided per version, not per point."""
    points = [
        _pt("v0.1.0", key="a"),
        _pt("v0.1.0", key="b"),
        _pt("v0.9.0", key="a"),
    ]
    marked = mark_scaling_recency(points, keep=1)
    flags = [point["recent"] for point in marked]
    # Only the newest version counts as recent.
    assert flags == [False, False, True]


def test_mark_scaling_recency_missing_version_not_recent():
    """A point without a version is never flagged as recent."""
    marked = mark_scaling_recency([_pt(None)], keep=10)
    first = marked[0]
    assert first["recent"] is False


def test_mark_scaling_recency_does_not_mutate_input():
    """The input points must not gain a ``recent`` key."""
    points = [_pt("v0.1.0")]
    _ = mark_scaling_recency(points, keep=10)
    assert not ("recent" in points[0])


# ----------------------------------------------------- foss/generative-Trennung
def test_is_foss_version_detects_foss_prefix():
    """Both the ``foss-`` alias and the real ``extractive-`` prefix are detected."""
    for tag in ("foss-v0.1.1", "foss-v0.2.0"):
        assert is_foss_version(tag) is True
    # The real prefix of the non-generative pipeline is `extractive-`
    # (extractive/orchestrator.py: EXTRACTIVE_VERSION = "extractive-v0.2.0"),
    # NOT `foss-` (which is not used as a tag anywhere). It has to be detected
    # as well, otherwise the whole separation is a no-op
    # (cross-model review, Codex, 2026-06-23).
    for tag in ("extractive-v0.2.0", "extractive-1.0"):
        assert is_foss_version(tag) is True


def test_is_foss_version_false_for_generative_and_edge_cases():
    """Generative versions and blank input are not foss versions."""
    not_foss = (
        "v0.3.139",
        "v0.1.0",  # generative, despite the small number
        "",
        None,
    )
    for candidate in not_foss:
        assert is_foss_version(candidate) is False


# ------------------------------------------ Token/Duration pro Version (statt chronologisch)
def test_chart_tokens_by_version_sums_and_medians_excluding_foss():
    """Tokens are summed and durations reduced to the median per version, without foss."""

    def run(ver, tokens_in, tokens_out, tokens_cache, duration_min):
        return {
            "ver": ver,
            "tokens_in": tokens_in,
            "tokens_out": tokens_out,
            "tokens_cache": tokens_cache,
            "duration_min": duration_min,
        }

    runs = [
        run("v0.3.1", 100, 50, 10, 5.0),
        run("v0.3.1", 200, 50, 10, 7.0),
        run("v0.3.2", 300, 60, 20, 10.0),
        run("foss-v0.1.0", 999, 999, 999, 99.0),
    ]
    out = _chart_tokens_by_version(runs)

    expected = {
        "labels": ["v0.3.1", "v0.3.2"],  # foss dropped, ascending (newest on the right)
        "tokens_in": [300, 300],  # v0.3.1: 100 + 200
        "tokens_out": [100, 60],
        "tokens_cache": [20, 20],
        "duration_min": [6.0, 10.0],  # median([5, 7]) == 6
    }
    for field, want in expected.items():
        assert out[field] == want


def test_chart_tokens_by_version_empty():
    """No runs produce empty label and token lists."""
    out = _chart_tokens_by_version([])
    assert out["labels"] == []
    assert out["tokens_in"] == []


def test_chart_tokens_by_version_skips_versionless_runs():
    """Runs with a blank or missing version do not create a bucket of their own."""
    versioned = {"ver": "v0.3.1", "tokens_in": 100, "tokens_out": 10, "tokens_cache": 0, "duration_min": 5.0}
    blank_version = {"ver": "", "tokens_in": 999, "tokens_out": 999, "tokens_cache": 0, "duration_min": 9.0}
    no_version_key = {"tokens_in": 888, "tokens_out": 0, "tokens_cache": 0, "duration_min": 3.0}

    out = _chart_tokens_by_version([versioned, blank_version, no_version_key])

    # Version-less runs are dropped; there is no "?" bucket.
    assert out["labels"] == ["v0.3.1"]
Loading
Loading