def is_foss_version(version) -> bool:
    """Return True for version strings of the non-generative pipeline.

    The prefix actually emitted is ``extractive-`` (the original note cites
    ``EXTRACTIVE_VERSION = "extractive-v0.2.0"`` in extractive/orchestrator.py);
    ``foss-`` is accepted as a forward-compatible alias. These pipelines are a
    different architecture from the generative one, so their versions do not
    belong in the same version trend (#36). Matching ``foss-`` alone matched
    nothing real, which made the separation a no-op.

    ``None`` and the empty string are treated as "not foss".
    """
    return str(version or "").startswith(("extractive-", "foss-"))


def _latest_version(ver_map: dict) -> str:
    """Return the key of ``ver_map`` that sorts last under ``_ver_sort_key``.

    Raises ``IndexError`` when ``ver_map`` is empty.
    """
    return sorted(ver_map.keys(), key=_ver_sort_key)[-1]


# Below this per-version sample size no better/worse verdict is given, so that
# noise is not read as a trend.
_DELTA_MIN_N = 20


def version_delta(kpi_trend: dict, metric: str, min_n: int = _DELTA_MIN_N) -> dict:
    """Compute the newest-vs-previous-version delta for one KPI metric.

    ``kpi_trend[metric]`` is expected to be ordered ascending by version
    (newest value last), with ``kpi_trend["n"]`` running parallel to it.

    Args:
        kpi_trend: Mapping of metric name to a per-version list of values.
        metric: Key of the series to compare.
        min_n: Minimum sample size both versions need for the delta to count
            as reliable.

    Returns:
        A dict with ``latest``, ``prev``, ``delta`` (``latest - prev`` rounded
        to 4 places, or ``None`` if either value is missing) and ``reliable``.
        ``reliable`` is True only when a delta exists and both versions have a
        known sample size of at least ``min_n``.
    """
    values = kpi_trend.get(metric) or []
    ns = kpi_trend.get("n") or []

    latest = values[-1] if values else None
    prev = values[-2] if len(values) >= 2 else None
    delta = None if (latest is None or prev is None) else round(latest - prev, 4)

    # Only trust the sample sizes when the "n" series is parallel to the metric
    # series; otherwise the tail of "n" would describe different versions.
    if len(ns) == len(values) and len(ns) >= 2:
        n_latest, n_prev = ns[-1], ns[-2]
    else:
        n_latest = n_prev = None

    reliable = (
        delta is not None
        and n_latest is not None and n_latest >= min_n
        and n_prev is not None and n_prev >= min_n
    )
    return {"latest": latest, "prev": prev, "delta": delta, "reliable": reliable}


# Number of most recent versions whose points stay undimmed (#36 P2).
_SCALING_RECENT_KEEP = 10


def mark_scaling_recency(points: list[dict], keep: int = _SCALING_RECENT_KEEP) -> list[dict]:
    """Return a new point list with a boolean ``recent`` flag on every point.

    Points whose ``ver`` is among the ``keep`` highest-sorting versions get
    ``recent=True``; all others, including points without a version, get
    ``recent=False``. This lets the client dim old version eras instead of
    mixing them unfiltered into the PDF-length scaling chart (#36 P2).
    The input list and its dicts are not mutated.
    """
    # The version string is the tie-breaker: two versions with the same numeric
    # key (e.g. "v0.2.0" and "extractive-v0.2.0") would otherwise keep the
    # set's iteration order, which varies with string hash randomisation.
    versions = sorted(
        {p["ver"] for p in points if p.get("ver")},
        key=lambda v: (_ver_sort_key(v), v),
    )
    recent_set = set(versions[-keep:]) if keep > 0 else set()
    return [{**p, "recent": p.get("ver") in recent_set} for p in points]


def _chart_tokens_by_version(runs: list[dict]) -> dict:
    """Aggregate token composition and median duration per pipeline version.

    Token counts are summed per version and ``duration_min`` is reduced to its
    median, rounded to one decimal (``None`` when a version has no duration).
    Versions are sorted ascending (newest on the right). Runs without a
    version and runs of the non-generative pipeline are dropped. This replaces
    the chronological per-run axis, which was unreadable with many runs and
    supported no comparison (#36, E6).
    """
    by_ver: dict = {}
    for r in runs:
        ver = r.get("ver") or r.get("pipeline_version")
        if not ver or is_foss_version(ver):
            continue
        b = by_ver.setdefault(ver, {"in": 0, "out": 0, "cache": 0, "dur": []})
        # "or 0" also covers keys that are present but explicitly None.
        b["in"] += r.get("tokens_in", 0) or 0
        b["out"] += r.get("tokens_out", 0) or 0
        b["cache"] += r.get("tokens_cache", 0) or 0
        if r.get("duration_min") is not None:
            b["dur"].append(r["duration_min"])
    versions = sorted(by_ver, key=_ver_sort_key)
    return {
        "labels": versions,
        "tokens_in": [by_ver[v]["in"] for v in versions],
        "tokens_out": [by_ver[v]["out"] for v in versions],
        "tokens_cache": [by_ver[v]["cache"] for v in versions],
        "duration_min": [round(_median(by_ver[v]["dur"]), 1) if by_ver[v]["dur"] else None
                         for v in versions],
    }
oben (foss raus, #36) # PDF + Language + Version + Model-Filter auf all_log_runs (nach DB-Fallback) if model: @@ -432,6 +434,16 @@ def build_data(eval_version: str | None = None, or (r.get("key","")).lower() in _lang_pdfs] log_data = D._build_log_data(all_log_runs) + # foss-Pipeline (gliner/extractive) nicht mit generativer mischen: + # im ungefilterten Default-View foss ausschliessen — ueber Modell-/Versions- + # Filter bleibt foss einsehbar (#36, User-Wunsch 2026-06-19) + if not (model or pipeline_version): + all_log_runs = [r for r in all_log_runs if not D.is_foss_version(r.get("ver"))] + log_data = D._build_log_data(all_log_runs) + quality_rows = [r for r in quality_rows + if not D.is_foss_version(r.get("version") or r.get("pipeline_version"))] + token_runs = [tr for tr in token_runs if not D.is_foss_version(tr.get("ver"))] + # Log-Runs nach Version gruppiert runs_by_version: dict = {} for r in all_log_runs: @@ -526,6 +538,11 @@ def _pooled_accept(ver: str) -> float | None: "tokens": [round(sum(tok_by_ver.get(v,[])) / 1000, 1) if tok_by_ver.get(v) else None for v in sorted_pipeline_versions], # in M-Tokens "cost": [round(sum(cost_by_ver.get(v, [])), 4) if cost_by_ver.get(v) else None for v in sorted_pipeline_versions], } + # Delta neueste-vs-Vorversion pro KPI (mit N-Guard, #36 P4) + kpi_trend["deltas"] = { + m: D.version_delta(kpi_trend, m) + for m in ("hall", "cov", "n", "accept", "dur", "tokens", "cost") + } return { "generated_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), @@ -536,7 +553,7 @@ def _pooled_accept(ver: str) -> float | None: "accept": D._chart_acceptance(log_data), "scatter": _chart_scatter_versioned(quality_rows), "long": D._chart_longitudinal(log_data), - "tokens": D._chart_tokens(token_runs), + "tokens": D._chart_tokens_by_version(token_runs), "scaling": D._chart_scaling(all_log_runs), "quality_by_version": quality_by_version, "runs_by_version": runs_by_version, diff --git a/generative/tests/test_dashboard_followups.py 
"""Tests for the dashboard follow-ups (atomic-notes issue #36, 2026-06-19).

- P4: version_delta — newest-vs-previous-version delta per KPI with N-guard.
- P2: scaling recency flag — mark old version eras (dim instead of mixing).
- foss/generative separation: is_foss_version prefix detection.
- Token/duration chart per version: _chart_tokens_by_version.

NOTE(review): the original header also listed "P1: _chart_longitudinal
median-over-PDFs series (anti-spaghetti)", but this module contains no test
for it — confirm whether those tests live elsewhere.
"""
from __future__ import annotations

import pytest

from generative.eval_dashboard import (
    version_delta, mark_scaling_recency, is_foss_version, _chart_tokens_by_version,
)


# ---------------------------------------------------------------- P4 fixtures
def _kpi_trend(**over):
    """Build a two-version KPI trend; versions ascending, newest = last position.

    Keyword arguments override individual series of the base fixture.
    """
    base = {
        "versions": ["v0.3.134", "v0.3.135"],
        "hall": [12.0, 9.7],
        "cov": [30.0, 35.0],
        "n": [25, 22],
        "accept": [50.0, 60.0],
    }
    base.update(over)
    return base


# ------------------------------------------------------------------- P4 tests
def test_version_delta_latest_prev_and_signed_delta():
    d = version_delta(_kpi_trend(), "hall")
    assert d["latest"] == 9.7
    assert d["prev"] == 12.0
    assert d["delta"] == pytest.approx(-2.3)  # hallucination value dropped → negative


def test_version_delta_positive_sign_for_rising_coverage():
    d = version_delta(_kpi_trend(), "cov")
    assert d["delta"] == pytest.approx(5.0)  # coverage rose → positive


def test_version_delta_reliable_only_when_both_n_at_least_20():
    assert version_delta(_kpi_trend(n=[25, 22]), "hall")["reliable"] is True
    assert version_delta(_kpi_trend(n=[25, 5]), "hall")["reliable"] is False  # latest too small
    assert version_delta(_kpi_trend(n=[5, 25]), "hall")["reliable"] is False  # prev too small


def test_version_delta_no_previous_version_yields_none_delta():
    d = version_delta(
        {"versions": ["v0.3.135"], "hall": [9.7], "n": [22]}, "hall"
    )
    assert d["latest"] == 9.7
    assert d["prev"] is None
    assert d["delta"] is None
    assert d["reliable"] is False


def test_version_delta_none_metric_value_yields_none_delta():
    d = version_delta(_kpi_trend(hall=[12.0, None]), "hall")
    assert d["latest"] is None
    assert d["delta"] is None
    assert d["reliable"] is False


def test_version_delta_empty_trend_yields_none_delta():
    d = version_delta({"versions": [], "hall": [], "n": []}, "hall")
    assert d["latest"] is None
    assert d["prev"] is None
    assert d["delta"] is None
    assert d["reliable"] is False


# ------------------------------------------------------------------- P2 tests
def _pt(ver, key="a", x=1000, y=4):
    """Build one scaling-chart point for version ``ver``."""
    return {"x": x, "y": y, "key": key, "label": key, "ver": ver}


def test_mark_scaling_recency_flags_only_youngest_keep_versions():
    pts = [_pt(f"v0.3.{i}") for i in range(1, 13)]  # 12 versions, numerically sorted
    recent = {p["ver"]: p["recent"] for p in mark_scaling_recency(pts, keep=10)}
    assert recent["v0.3.1"] is False   # the two oldest versions are dimmed
    assert recent["v0.3.2"] is False
    assert recent["v0.3.3"] is True
    assert recent["v0.3.12"] is True


def test_mark_scaling_recency_all_recent_when_fewer_than_keep():
    out = mark_scaling_recency([_pt("v0.1.0"), _pt("v0.2.0")], keep=10)
    assert all(p["recent"] for p in out)


def test_mark_scaling_recency_multiple_points_per_version():
    pts = [_pt("v0.1.0", key="a"), _pt("v0.1.0", key="b"), _pt("v0.9.0", key="a")]
    out = mark_scaling_recency(pts, keep=1)
    assert [p["recent"] for p in out] == [False, False, True]  # only the youngest version is recent


def test_mark_scaling_recency_missing_version_not_recent():
    assert mark_scaling_recency([_pt(None)], keep=10)[0]["recent"] is False


def test_mark_scaling_recency_does_not_mutate_input():
    pts = [_pt("v0.1.0")]
    mark_scaling_recency(pts, keep=10)
    assert "recent" not in pts[0]


# ----------------------------------------------------- foss/generative separation
def test_is_foss_version_detects_foss_prefix():
    assert is_foss_version("foss-v0.1.1") is True
    assert is_foss_version("foss-v0.2.0") is True
    # The real prefix of the non-generative pipeline is `extractive-`
    # (extractive/orchestrator.py: EXTRACTIVE_VERSION = "extractive-v0.2.0"),
    # NOT `foss-` (which tags nothing real). It must be detected as well,
    # otherwise the whole separation is a no-op (cross-model review Codex 2026-06-23).
    assert is_foss_version("extractive-v0.2.0") is True
    assert is_foss_version("extractive-1.0") is True


def test_is_foss_version_false_for_generative_and_edge_cases():
    assert is_foss_version("v0.3.139") is False
    assert is_foss_version("v0.1.0") is False  # generative, despite the low number
    assert is_foss_version("") is False
    assert is_foss_version(None) is False


# ------------------------------------------ token/duration per version (instead of chronological)
def test_chart_tokens_by_version_sums_and_medians_excluding_foss():
    runs = [
        {"ver": "v0.3.1", "tokens_in": 100, "tokens_out": 50, "tokens_cache": 10, "duration_min": 5.0},
        {"ver": "v0.3.1", "tokens_in": 200, "tokens_out": 50, "tokens_cache": 10, "duration_min": 7.0},
        {"ver": "v0.3.2", "tokens_in": 300, "tokens_out": 60, "tokens_cache": 20, "duration_min": 10.0},
        {"ver": "foss-v0.1.0", "tokens_in": 999, "tokens_out": 999, "tokens_cache": 999, "duration_min": 99.0},
    ]
    out = _chart_tokens_by_version(runs)
    assert out["labels"] == ["v0.3.1", "v0.3.2"]  # foss dropped, ascending (newest on the right)
    assert out["tokens_in"] == [300, 300]  # v0.3.1: 100+200
    assert out["tokens_out"] == [100, 60]
    assert out["tokens_cache"] == [20, 20]
    assert out["duration_min"] == [6.0, 10.0]  # median([5,7])=6


def test_chart_tokens_by_version_empty():
    out = _chart_tokens_by_version([])
    assert out["labels"] == [] and out["tokens_in"] == []


def test_chart_tokens_by_version_skips_versionless_runs():
    runs = [
        {"ver": "v0.3.1", "tokens_in": 100, "tokens_out": 10, "tokens_cache": 0, "duration_min": 5.0},
        {"ver": "", "tokens_in": 999, "tokens_out": 999, "tokens_cache": 0, "duration_min": 9.0},
        {"tokens_in": 888, "tokens_out": 0, "tokens_cache": 0, "duration_min": 3.0},
    ]
    out = _chart_tokens_by_version(runs)
    assert out["labels"] == ["v0.3.1"]  # versionless runs dropped, no "?" bucket