From aff21f21ade46d31fccac712bfe418fbb3ec111a Mon Sep 17 00:00:00 2001 From: DerTill123 Date: Fri, 19 Jun 2026 17:27:02 +0200 Subject: [PATCH 1/4] =?UTF-8?q?fix(dashboard):=20#36=20P2/P4=20=E2=80=94?= =?UTF-8?q?=20KPI-Delta=20vs.=20Vorversion=20+=20Scaling-Recency-Dimmen?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Live-Pfad ist internal/dashboard/eval_dashboard.html (SPA via /data.json), NICHT der deprecatete Legacy-Pfad eval_dashboard.py:_build_html. P4 — KPI-Delta neueste-vs-Vorversion mit N-Guard (statt neueste-vs-erste): - version_delta(kpi_trend, metric) pure Funktion (TDD), reliable nur wenn beide Versionen n>=20 (Apophenie-Schutz) - Server: kpi_trend["deltas"] pro KPI; Template-JS nutzt es, ohne reliable kein Besser/Schlechter-Farbcode (neutral + "(n<20)") P2 — Scaling-Chart (ch3) dimmt alte Versions-Aeren: - mark_scaling_recency(points, keep=10) pure Funktion (TDD), recent-Flag - _chart_scaling haengt es an; ch3-JS dimmt recent===false (fail-open) P1 (#36 Punkt 1, Median-Linie) war im Live-Pfad bereits durch PR #38 erledigt — der frueher vermutete Regressionsverdacht war ein Blick auf den Legacy-Pfad. Codex-Review: 0 HIGH/MED, 1 LOW (cost-Delta-Anzeige toFixed(1), vorbestehend). Suite 4608 passed. Server-/data.json-End-to-End + JS-Syntax verifiziert. Bekannter offener Punkt: visuelle Firefox-Augenschein-Pruefung. --- generative/eval_dashboard.py | 40 +++++++ generative/eval_dashboard_server.py | 5 + generative/tests/test_dashboard_followups.py | 104 +++++++++++++++++++ internal/dashboard/eval_dashboard.html | 23 ++-- 4 files changed, 165 insertions(+), 7 deletions(-) create mode 100644 generative/tests/test_dashboard_followups.py diff --git a/generative/eval_dashboard.py b/generative/eval_dashboard.py index 533dc04..da0564b 100644 --- a/generative/eval_dashboard.py +++ b/generative/eval_dashboard.py @@ -436,6 +436,31 @@ def _chart_longitudinal(log_data: dict) -> dict: return {"versions": versions, "datasets": datasets} +_DELTA_MIN_N = 20 # unter N=20 kein Besser/Schlechter-Urteil (Apophenie-Schutz) + + +def version_delta(kpi_trend: dict, metric: str) -> dict: + """Delta neueste-vs-Vorversion fuer eine KPI-Metrik. + + `kpi_trend["versions"]` ist aufsteigend sortiert (neueste = letzte Position), + die Metrik-Arrays laufen parallel dazu. `reliable` ist nur True, wenn beide + beteiligten Versionen n>=20 haben — sonst ist das Delta Rauschen (N-Guard). + """ + values = kpi_trend.get(metric) or [] + ns = kpi_trend.get("n") or [] + latest = values[-1] if values else None + prev = values[-2] if len(values) >= 2 else None + n_latest = ns[-1] if ns else None + n_prev = ns[-2] if len(ns) >= 2 else None + delta = None if (latest is None or prev is None) else round(latest - prev, 4) + reliable = ( + delta is not None + and (n_latest or 0) >= _DELTA_MIN_N + and (n_prev or 0) >= _DELTA_MIN_N + ) + return {"latest": latest, "prev": prev, "delta": delta, "reliable": reliable} + + def _chart_tokens(runs: list[dict]) -> dict: return { "labels": [r["date"] for r in runs], @@ -447,6 +472,20 @@ def _chart_tokens(runs: list[dict]) -> dict: } +_SCALING_RECENT_KEEP = 10 # juengste N Versionen ungedimmt (#36 P2) + + +def mark_scaling_recency(points: list[dict], keep: int = _SCALING_RECENT_KEEP) -> list[dict]: + """Gibt eine neue Punktliste zurueck, in der die Punkte der juengsten `keep` + Versionen `recent=True` tragen, aeltere `recent=False`. So kann der Client + kaputte Frueh-Versions-Aeren dimmen, statt sie ungefiltert in die + PDF-Laengen-Skalierung zu mischen (#36 P2). Mutiert die Eingabe nicht. + """ + versions = sorted({p["ver"] for p in points if p.get("ver")}, key=_ver_sort_key) + recent_set = set(versions[-keep:]) if keep > 0 else set() + return [{**p, "recent": p.get("ver") in recent_set} for p in points] + + def _chart_scaling(all_log_runs: list[dict]) -> dict: points = [ { @@ -462,6 +501,7 @@ def _chart_scaling(all_log_runs: list[dict]) -> dict: for r in all_log_runs if r["words"] is not None ] + points = mark_scaling_recency(points) keys = sorted({p["key"] for p in points}) return {"points": points, "keys": keys} diff --git a/generative/eval_dashboard_server.py b/generative/eval_dashboard_server.py index 1663305..1e76a5f 100644 --- a/generative/eval_dashboard_server.py +++ b/generative/eval_dashboard_server.py @@ -526,6 +526,11 @@ def _pooled_accept(ver: str) -> float | None: "tokens": [round(sum(tok_by_ver.get(v,[])) / 1000, 1) if tok_by_ver.get(v) else None for v in sorted_pipeline_versions], # in M-Tokens "cost": [round(sum(cost_by_ver.get(v, [])), 4) if cost_by_ver.get(v) else None for v in sorted_pipeline_versions], } + # Delta neueste-vs-Vorversion pro KPI (mit N-Guard, #36 P4) + kpi_trend["deltas"] = { + m: D.version_delta(kpi_trend, m) + for m in ("hall", "cov", "n", "accept", "dur", "tokens", "cost") + } return { "generated_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), diff --git a/generative/tests/test_dashboard_followups.py b/generative/tests/test_dashboard_followups.py new file mode 100644 index 0000000..9ed746d --- /dev/null +++ b/generative/tests/test_dashboard_followups.py @@ -0,0 +1,104 @@ +"""Tests für die Dashboard-Follow-ups (atomic-notes Issue #36, 2026-06-19). + +- P4: version_delta — Delta neueste-vs-Vorversion pro KPI mit N-Guard. +- P2: scaling-Recency-Flag — alte Versions-Ären markieren (dimmen statt mischen). +- P1: _chart_longitudinal Median-über-PDFs-Serie (Anti-Spaghetti). +""" +from __future__ import annotations + +import pytest + +from generative.eval_dashboard import version_delta, mark_scaling_recency + + +# ---------------------------------------------------------------- P4 fixtures +def _kpi_trend(**over): + """sorted_pipeline_versions aufsteigend → neueste = letzte Position.""" + base = { + "versions": ["v0.3.134", "v0.3.135"], + "hall": [12.0, 9.7], + "cov": [30.0, 35.0], + "n": [25, 22], + "accept": [50.0, 60.0], + } + base.update(over) + return base + + +# ------------------------------------------------------------------- P4 tests +def test_version_delta_latest_prev_and_signed_delta(): + d = version_delta(_kpi_trend(), "hall") + assert d["latest"] == 9.7 + assert d["prev"] == 12.0 + assert d["delta"] == pytest.approx(-2.3) # Halluzination gesunken → negativ + + +def test_version_delta_positive_sign_for_rising_coverage(): + d = version_delta(_kpi_trend(), "cov") + assert d["delta"] == pytest.approx(5.0) # Coverage gestiegen → positiv + + +def test_version_delta_reliable_only_when_both_n_at_least_20(): + assert version_delta(_kpi_trend(n=[25, 22]), "hall")["reliable"] is True + assert version_delta(_kpi_trend(n=[25, 5]), "hall")["reliable"] is False # latest zu klein + assert version_delta(_kpi_trend(n=[5, 25]), "hall")["reliable"] is False # prev zu klein + + +def test_version_delta_no_previous_version_yields_none_delta(): + d = version_delta( + {"versions": ["v0.3.135"], "hall": [9.7], "n": [22]}, "hall" + ) + assert d["latest"] == 9.7 + assert d["prev"] is None + assert d["delta"] is None + assert d["reliable"] is False + + +def test_version_delta_none_metric_value_yields_none_delta(): + d = version_delta(_kpi_trend(hall=[12.0, None]), "hall") + assert d["latest"] is None + assert d["delta"] is None + assert d["reliable"] is False + + +def test_version_delta_empty_trend_yields_none_delta(): + d = version_delta({"versions": [], "hall": [], "n": []}, "hall") + assert d["latest"] is None + assert d["prev"] is None + assert d["delta"] is None + assert d["reliable"] is False + + +# ------------------------------------------------------------------- P2 tests +def _pt(ver, key="a", x=1000, y=4): + return {"x": x, "y": y, "key": key, "label": key, "ver": ver} + + +def test_mark_scaling_recency_flags_only_youngest_keep_versions(): + pts = [_pt(f"v0.3.{i}") for i in range(1, 13)] # 12 Versionen, numerisch sortiert + recent = {p["ver"]: p["recent"] for p in mark_scaling_recency(pts, keep=10)} + assert recent["v0.3.1"] is False # die zwei ältesten Versionen gedimmt + assert recent["v0.3.2"] is False + assert recent["v0.3.3"] is True + assert recent["v0.3.12"] is True + + +def test_mark_scaling_recency_all_recent_when_fewer_than_keep(): + out = mark_scaling_recency([_pt("v0.1.0"), _pt("v0.2.0")], keep=10) + assert all(p["recent"] for p in out) + + +def test_mark_scaling_recency_multiple_points_per_version(): + pts = [_pt("v0.1.0", key="a"), _pt("v0.1.0", key="b"), _pt("v0.9.0", key="a")] + out = mark_scaling_recency(pts, keep=1) + assert [p["recent"] for p in out] == [False, False, True] # nur jüngste Version recent + + +def test_mark_scaling_recency_missing_version_not_recent(): + assert mark_scaling_recency([_pt(None)], keep=10)[0]["recent"] is False + + +def test_mark_scaling_recency_does_not_mutate_input(): + pts = [_pt("v0.1.0")] + mark_scaling_recency(pts, keep=10) + assert "recent" not in pts[0] diff --git a/internal/dashboard/eval_dashboard.html b/internal/dashboard/eval_dashboard.html index c8d47cf..1f5cce1 100644 --- a/internal/dashboard/eval_dashboard.html +++ b/internal/dashboard/eval_dashboard.html @@ -717,7 +717,7 @@
-
Skaliert die Pipeline mit der PDF-Länge?
Lange Papers erzeugen nicht linear mehr Notes.
+
Skaliert die Pipeline mit der PDF-Länge?
Lange Papers erzeugen nicht linear mehr Notes. Ältere Versionen gedimmt.
Notes vs. Wörter
@@ -1134,15 +1134,22 @@ const vers = kpiTrend?.versions || []; const sparkColor = toneColor(kd.tone); const sparkSvg = _trendChart(vals, vers, sparkColor, kd.u); - const validVals = vals.filter(v=>v!=null); - const delta = validVals.length >= 2 ? (validVals[validVals.length-1] - validVals[0]).toFixed(1) : null; - const dClass = delta!=null ? (parseFloat(delta)>0 ? (kd.key==='hall'?'dneg':'dpos') : (kd.key==='hall'?'dpos':'dneg')) : ''; - const sign = delta!=null && parseFloat(delta)>0 ? '+' : ''; + // #36 P4: Delta neueste-vs-Vorversion (server-berechnet) mit N-Guard statt + // neueste-vs-erste. Ohne reliable (n<20 in einer der beiden Versionen) kein + // Besser/Schlechter-Farbcode — nur neutrale Anzeige (Rausch-Schutz). + const vd = kpiTrend?.deltas?.[kd.key] || null; + const dval = (vd && vd.delta != null) ? vd.delta.toFixed(1) : null; + const prevVer = vers.length >= 2 ? vers[vers.length-2] : null; + const dClass = (dval!=null && vd.reliable) ? (parseFloat(dval)>0 ? (kd.key==='hall'?'dneg':'dpos') : (kd.key==='hall'?'dpos':'dneg')) : ''; + const sign = dval!=null && parseFloat(dval)>0 ? '+' : ''; + const deltaHtml = dval!=null + ? `Δ vs. ${prevVer}: ${sign}${dval} ${kd.u}${vd.reliable ? '' : ' (n<20)'}` + : `${vers.length} Pipeline-Versionen`; const inner = document.getElementById('spark-inner'); if (inner) inner.innerHTML = `
${kd.label} — Versions-Trend -
${delta!=null ? `Δ ${sign}${delta} ${kd.u} · ${vers.length} Versionen` : `${vers.length} Pipeline-Versionen`}
+
${deltaHtml}
${sparkSvg}
`; } @@ -1410,7 +1417,9 @@ const col=t.enc[keys.indexOf(k)%t.enc.length]; return{label:(d.scaling.points.find(p=>p.key===k)||{}).label||k, data:d.scaling.points.filter(p=>p.key===k).map(p=>({x:p.x,y:p.y,_p:p})), - backgroundColor:col+'b0',borderColor:col,borderWidth:1,pointRadius:6,pointHoverRadius:9}; + backgroundColor:ctx=>ctx.raw?._p?.recent===false?col+'33':col+'b0', + borderColor:ctx=>ctx.raw?._p?.recent===false?col+'55':col, + borderWidth:1,pointRadius:ctx=>ctx.raw?._p?.recent===false?3:6,pointHoverRadius:9}; })}, options:{maintainAspectRatio:false,responsive:true,animation:{duration:500}, plugins:{legend:legendCfg(),tooltip:{callbacks:{label:ctx=>{const p=ctx.raw._p||{};return [`${p.label} ${p.ver}`,`${(p.x||0).toLocaleString('de-DE')} Wörter · ${p.y} Notes`];}}}}, From 4d9a402ac15b0292dd2c98a52089a522e65a384f Mon Sep 17 00:00:00 2001 From: DerTill123 Date: Fri, 19 Jun 2026 18:28:19 +0200 Subject: [PATCH 2/4] =?UTF-8?q?fix(dashboard):=20#36=20Live-Pfad-UX=20?= =?UTF-8?q?=E2=80=94=20Sortierung,=20foss-Trennung,=20%-Deckel,=20Token-pr?= =?UTF-8?q?o-Version?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sechs aus dem Firefox-Augenschein gemeldete Punkte im Live-Render-Pfad internal/dashboard/eval_dashboard.html + build_data: A) Versions-Dropdown: neueste Version oben (reverse), foss-frei. B) Eval-Version-Filter komplett entfernt (UI + URL-State + Sidebar-Pill); Server nutzt default = neueste eval_version. C) KPI-Sparkline-X-Achse: nur erste+letzte Version statt ~50 ueberlappende Labels. D) foss/generative-Trennung: is_foss_version() (TDD); im ungefilterten Default-View foss aus all_log_runs + quality_rows + token_runs ausgeschlossen, Dropdown immer foss-frei. Ueber Modell-/Versions-Filter bleibt foss sichtbar. E) Sparkline-Y-Achse bei %-Metriken auf 100 gedeckelt (keine 120%-Achse mehr). F) Token/Duration-Charts pro Pipeline-Version aggregiert (Summe/Median) statt chronologisch pro Run — _chart_tokens_by_version (TDD), foss-frei. Cross-Model-Review (Codex + Qwen): 0 HIGH/MED nach Fixes. Codex MED gefixt (eval_version-URL-State, token_runs-foss-Ausschluss, foss--Praefix); Qwen LOW gefixt (versions[n-1]-Guard, duration_min is not None). Suite 4613 passed, +5 Tests (16 in test_dashboard_followups.py). --- generative/eval_dashboard.py | 33 ++++++++++++++ generative/eval_dashboard_server.py | 18 ++++++-- generative/tests/test_dashboard_followups.py | 48 +++++++++++++++++++- internal/dashboard/eval_dashboard.html | 32 ++++++------- 4 files changed, 109 insertions(+), 22 deletions(-) diff --git a/generative/eval_dashboard.py b/generative/eval_dashboard.py index da0564b..8b3b299 100644 --- a/generative/eval_dashboard.py +++ b/generative/eval_dashboard.py @@ -106,6 +106,12 @@ def _ver_sort_key(v: str) -> tuple: return tuple(int(n) for n in re.findall(r"\d+", v)) +def is_foss_version(version) -> bool: + """True fuer foss/extractive-Pipeline-Versionen (Praefix 'foss-'). Die foss- + Pipeline ist eine andere Architektur als die generative — ihre Versionen + gehoeren nicht in denselben Versions-Trend (#36).""" + return str(version or "").startswith("foss-") + def _latest_version(ver_map: dict) -> str: return sorted(ver_map.keys(), key=_ver_sort_key)[-1] @@ -486,6 +492,33 @@ def mark_scaling_recency(points: list[dict], keep: int = _SCALING_RECENT_KEEP) - return [{**p, "recent": p.get("ver") in recent_set} for p in points] +def _chart_tokens_by_version(runs: list[dict]) -> dict: + """Token-Komposition (Summe) + Median-Duration pro Pipeline-Version, + aufsteigend sortiert (neueste rechts), foss-frei. Ersetzt die chronologische + Pro-Run-Achse, die bei vielen Runs unlesbar war und keinen Vergleich trug + (#36, E6).""" + by_ver: dict = {} + for r in runs: + ver = r.get("ver") or r.get("pipeline_version") + if not ver or is_foss_version(ver): + continue + b = by_ver.setdefault(ver, {"in": 0, "out": 0, "cache": 0, "dur": []}) + b["in"] += r.get("tokens_in", 0) or 0 + b["out"] += r.get("tokens_out", 0) or 0 + b["cache"] += r.get("tokens_cache", 0) or 0 + if r.get("duration_min") is not None: + b["dur"].append(r["duration_min"]) + versions = sorted(by_ver, key=_ver_sort_key) + return { + "labels": versions, + "tokens_in": [by_ver[v]["in"] for v in versions], + "tokens_out": [by_ver[v]["out"] for v in versions], + "tokens_cache": [by_ver[v]["cache"] for v in versions], + "duration_min": [round(_median(by_ver[v]["dur"]), 1) if by_ver[v]["dur"] else None + for v in versions], + } + + def _chart_scaling(all_log_runs: list[dict]) -> dict: points = [ { diff --git a/generative/eval_dashboard_server.py b/generative/eval_dashboard_server.py index 1e76a5f..4274b56 100644 --- a/generative/eval_dashboard_server.py +++ b/generative/eval_dashboard_server.py @@ -409,8 +409,10 @@ def build_data(eval_version: str | None = None, # ── all_log_runs Dropdown-Optionen VOR all_log_runs-Filtern ────── _all_pdfs_opts = sorted({r["label"] for r in all_log_runs if r.get("label")}) - _all_pvers_opts = sorted({r["ver"] for r in all_log_runs if r.get("ver")}, - key=lambda v: [int(x) for x in __import__("re").findall(r"\d+", v)]) + _all_pvers_opts = sorted({r["ver"] for r in all_log_runs + if r.get("ver") and not D.is_foss_version(r["ver"])}, + key=lambda v: [int(x) for x in __import__("re").findall(r"\d+", v)], + reverse=True) # neueste generative Version oben (foss raus, #36) # PDF + Language + Version + Model-Filter auf all_log_runs (nach DB-Fallback) if model: @@ -432,6 +434,16 @@ def build_data(eval_version: str | None = None, or (r.get("key","")).lower() in _lang_pdfs] log_data = D._build_log_data(all_log_runs) + # foss-Pipeline (gliner/extractive) nicht mit generativer mischen: + # im ungefilterten Default-View foss ausschliessen — ueber Modell-/Versions- + # Filter bleibt foss einsehbar (#36, User-Wunsch 2026-06-19) + if not (model or pipeline_version): + all_log_runs = [r for r in all_log_runs if not D.is_foss_version(r.get("ver"))] + log_data = D._build_log_data(all_log_runs) + quality_rows = [r for r in quality_rows + if not D.is_foss_version(r.get("version") or r.get("pipeline_version"))] + token_runs = [tr for tr in token_runs if not D.is_foss_version(tr.get("ver"))] + # Log-Runs nach Version gruppiert runs_by_version: dict = {} for r in all_log_runs: @@ -541,7 +553,7 @@ def _pooled_accept(ver: str) -> float | None: "accept": D._chart_acceptance(log_data), "scatter": _chart_scatter_versioned(quality_rows), "long": D._chart_longitudinal(log_data), - "tokens": D._chart_tokens(token_runs), + "tokens": D._chart_tokens_by_version(token_runs), "scaling": D._chart_scaling(all_log_runs), "quality_by_version": quality_by_version, "runs_by_version": runs_by_version, diff --git a/generative/tests/test_dashboard_followups.py b/generative/tests/test_dashboard_followups.py index 9ed746d..5b3f3b4 100644 --- a/generative/tests/test_dashboard_followups.py +++ b/generative/tests/test_dashboard_followups.py @@ -8,7 +8,9 @@ import pytest -from generative.eval_dashboard import version_delta, mark_scaling_recency +from generative.eval_dashboard import ( + version_delta, mark_scaling_recency, is_foss_version, _chart_tokens_by_version, +) # ---------------------------------------------------------------- P4 fixtures @@ -102,3 +104,47 @@ def test_mark_scaling_recency_does_not_mutate_input(): pts = [_pt("v0.1.0")] mark_scaling_recency(pts, keep=10) assert "recent" not in pts[0] + + +# ----------------------------------------------------- foss/generative-Trennung +def test_is_foss_version_detects_foss_prefix(): + assert is_foss_version("foss-v0.1.1") is True + assert is_foss_version("foss-v0.2.0") is True + + +def test_is_foss_version_false_for_generative_and_edge_cases(): + assert is_foss_version("v0.3.139") is False + assert is_foss_version("v0.1.0") is False # generativ, trotz kleiner Zahl + assert is_foss_version("") is False + assert is_foss_version(None) is False + + +# ------------------------------------------ Token/Duration pro Version (statt chronologisch) +def test_chart_tokens_by_version_sums_and_medians_excluding_foss(): + runs = [ + {"ver": "v0.3.1", "tokens_in": 100, "tokens_out": 50, "tokens_cache": 10, "duration_min": 5.0}, + {"ver": "v0.3.1", "tokens_in": 200, "tokens_out": 50, "tokens_cache": 10, "duration_min": 7.0}, + {"ver": "v0.3.2", "tokens_in": 300, "tokens_out": 60, "tokens_cache": 20, "duration_min": 10.0}, + {"ver": "foss-v0.1.0", "tokens_in": 999, "tokens_out": 999, "tokens_cache": 999, "duration_min": 99.0}, + ] + out = _chart_tokens_by_version(runs) + assert out["labels"] == ["v0.3.1", "v0.3.2"] # foss raus, aufsteigend (neueste rechts) + assert out["tokens_in"] == [300, 300] # v0.3.1: 100+200 + assert out["tokens_out"] == [100, 60] + assert out["tokens_cache"] == [20, 20] + assert out["duration_min"] == [6.0, 10.0] # median([5,7])=6 + + +def test_chart_tokens_by_version_empty(): + out = _chart_tokens_by_version([]) + assert out["labels"] == [] and out["tokens_in"] == [] + + +def test_chart_tokens_by_version_skips_versionless_runs(): + runs = [ + {"ver": "v0.3.1", "tokens_in": 100, "tokens_out": 10, "tokens_cache": 0, "duration_min": 5.0}, + {"ver": "", "tokens_in": 999, "tokens_out": 999, "tokens_cache": 0, "duration_min": 9.0}, + {"tokens_in": 888, "tokens_out": 0, "tokens_cache": 0, "duration_min": 3.0}, + ] + out = _chart_tokens_by_version(runs) + assert out["labels"] == ["v0.3.1"] # versionslose raus, kein "?"-Bucket diff --git a/internal/dashboard/eval_dashboard.html b/internal/dashboard/eval_dashboard.html index 1f5cce1..073d90d 100644 --- a/internal/dashboard/eval_dashboard.html +++ b/internal/dashboard/eval_dashboard.html @@ -548,7 +548,6 @@
-
Eval
auto-refresh 15 s
-
- Eval - -
- Skalen nicht vergleichbar +
lädt…
@@ -1179,7 +1174,9 @@ const au = _autoUnit(valid, unit); // dynamische Einheit + Formatter const vmax = Math.max(...valid), vmin = Math.min(...valid); const pad = (vmax - vmin) * 0.2 || (vmax * 0.1) || 1; - const max = vmax + pad, min = Math.max(0, vmin - pad); + // %-Metriken bei 100 deckeln — eine Rate kann nicht ueber 100 % (User-Feedback) + const max = (unit === '%') ? Math.min(100, vmax + pad) : vmax + pad; + const min = Math.max(0, vmin - pad); const span = (max - min) || 1; const n = points.length; const xs = points.map((_,i) => padL + (i/(n-1))*iw); @@ -1229,10 +1226,11 @@ }).join(''); // Version-Labels auf X-Achse - const verLabels = (versions||[]).map((v,i) => - `${v}` - ).join(''); + // Nur Endpunkte labeln — bei vielen Versionen ueberlappen Einzel-Labels (User-Feedback) + const verLabels = (versions && versions.length >= 2) + ? `${versions[0] ?? ''}` + + `${versions[n-1] ?? ''}` + : ''; // 3 Hilfslinien mit Wert-Beschriftung — mehr Kontext const hairLines = [0, 0.5, 1].map(t => { @@ -1483,14 +1481,13 @@ // ch5 — Tokens (Komposition: Teal=Input, Amber=Output, Grau=Cache) if (d.tokens?.labels?.length) { - const ax5=document.getElementById('ax5'); if(ax5) ax5.textContent=`${d.tokens.labels.length} Runs · chronologisch`; - const _tokPdf = d.tokens.pdf_labels||[]; + const ax5=document.getElementById('ax5'); if(ax5) ax5.textContent=`${d.tokens.labels.length} Versionen · Summe`; getOrCreate('ch5',{type:'bar',data:{labels:d.tokens.labels,datasets:[ {label:'Input', data:d.tokens.tokens_in, backgroundColor:t.teal+'aa', stack:'t'}, {label:'Output',data:d.tokens.tokens_out,backgroundColor:t.amber+'aa', stack:'t'}, {label:'Cache', data:d.tokens.tokens_cache,backgroundColor:t.ink4+'66',stack:'t'}, ]},options:{maintainAspectRatio:false,responsive:true,animation:{duration:500}, - plugins:{legend:legendCfg(),tooltip:{callbacks:{title:items=>{const i=items[0]?.dataIndex;const pdf=_tokPdf[i];return pdf?`${d.tokens.labels[i]} · ${pdf}`:d.tokens.labels[i];}}}}, + plugins:{legend:legendCfg()}, scales:{x:axisCfg({grid:{display:false},ticks:{color:t.ink3,maxRotation:45,font:{size:9}}}), y:{...axisCfg(),stacked:true,title:{display:true,text:'Tokens',color:t.ink4,font:{size:11}}}}}}); const l5=document.getElementById('leg5'); @@ -1498,7 +1495,7 @@ // ch6 — Duration getOrCreate('ch6',{type:'line',data:{labels:d.tokens.labels,datasets:[{data:d.tokens.duration_min||[],borderColor:t.coral,backgroundColor:t.coral+'18',borderWidth:2,pointRadius:4,pointHoverRadius:7,pointBackgroundColor:t.card,pointBorderColor:t.coral,pointBorderWidth:2,tension:.3,fill:true}]}, options:{maintainAspectRatio:false,responsive:true,animation:{duration:500}, - plugins:{legend:{display:false},tooltip:{callbacks:{title:items=>{const i=items[0]?.dataIndex;const pdf=_tokPdf[i];return pdf?`${d.tokens.labels[i]} · ${pdf}`:d.tokens.labels[i];}}}}, + plugins:{legend:{display:false}}, scales:{x:axisCfg({grid:{display:false},ticks:{color:t.ink3,maxRotation:45,font:{size:9}}}),y:axisCfg({beginAtZero:true,ticks:{callback:v=>v+' min'}})}}}); } else { setChartEmpty('ch5'); setChartEmpty('ch6'); @@ -1805,7 +1802,7 @@ function _renderWithData(d) { if(!d) return; window._lastData=d; const t = C(); - _initEvalVerDropdown(d.available_eval_versions||[], d.eval_version||''); + // Eval-Version-Dropdown entfernt — Server nutzt default (neueste eval_version) // Dropdowns: immer mit ALLEN Optionen befüllen (vor globalem Filter) if (d.all_pvers?.length) _initGlobalPverFilter(d.all_pvers); if (d.all_pdfs?.length) _initGlobalPdfFilter(d.all_pdfs); @@ -1856,9 +1853,8 @@ async function loadAndRender() { const seq = ++_loadSeq; try { - const urlVer=_getEvalVerFromUrl(), verParam=_currentEvalVersion||urlVer||''; + // Eval-Version-Filter entfernt: nie aus URL/State setzen → Server nutzt default (neueste) const p = new URLSearchParams(); - if (verParam) p.set('eval_version', verParam); if (_globalFilters.lang && _globalFilters.lang !== '__all__') p.set('language', _globalFilters.lang); if (_globalFilters.model && _globalFilters.model !== '__all__') p.set('model', _globalFilters.model); if (_globalFilters.pdf && _globalFilters.pdf !== '__all__') p.set('pdf', _globalFilters.pdf); From 09ddfd251bcf21693768e0f602f9631d3539cd0a Mon Sep 17 00:00:00 2001 From: TillQuandel Date: Tue, 23 Jun 2026 16:59:38 +0200 Subject: [PATCH 3/4] =?UTF-8?q?fix(dashboard):=20Cross-Model-Review-Funde?= =?UTF-8?q?=20(#59)=20=E2=80=94=20ins-cost-Label=20+=20Null-Delta=20neutra?= =?UTF-8?q?l?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex-Cross-Review auf #59: - ins-cost-Insight zeigte "N Runs ... Median-Laufzeit pro Run", aber d.tokens ist seit _chart_tokens_by_version pro VERSION aggregiert → Label auf Versionen/pro Version korrigiert (Z1751-58). Sonst still falsche Stat-Karte. - Null-Delta (0.0) bekam pos/neg-Farbklasse → jetzt neutral (Z1138). Reine Client-JS (kein JS-Test-Harness im Repo) — durch Code-Lesung verifiziert. Offen (braucht Konventions-Input): is_foss_version matcht nur "foss-", Server-Z73 nutzt "extractive-" → foss-Trennung evtl. wirkungslos; + foss-Drop im Token-Chart bei explizitem Filter. Beide an die Prefix-Konvention gekoppelt, separat zu klaeren. --- internal/dashboard/eval_dashboard.html | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/internal/dashboard/eval_dashboard.html b/internal/dashboard/eval_dashboard.html index 073d90d..966e747 100644 --- a/internal/dashboard/eval_dashboard.html +++ b/internal/dashboard/eval_dashboard.html @@ -1135,7 +1135,7 @@ const vd = kpiTrend?.deltas?.[kd.key] || null; const dval = (vd && vd.delta != null) ? vd.delta.toFixed(1) : null; const prevVer = vers.length >= 2 ? vers[vers.length-2] : null; - const dClass = (dval!=null && vd.reliable) ? (parseFloat(dval)>0 ? (kd.key==='hall'?'dneg':'dpos') : (kd.key==='hall'?'dpos':'dneg')) : ''; + const dClass = (dval!=null && vd.reliable && parseFloat(dval)!==0) ? (parseFloat(dval)>0 ? (kd.key==='hall'?'dneg':'dpos') : (kd.key==='hall'?'dpos':'dneg')) : ''; // 0.0 = neutral, keine Farbe const sign = dval!=null && parseFloat(dval)>0 ? '+' : ''; const deltaHtml = dval!=null ? `Δ vs. ${prevVer}: ${sign}${dval} ${kd.u}${vd.reliable ? '' : ' (n<20)'}` @@ -1748,14 +1748,15 @@ } else { _setInsight('ins-quality', ''); } } else { _setInsight('ins-quality', ''); } - // Kosten: Gesamt-Tokens + Laufzeit der gefilterten Runs + // Kosten: Gesamt-Tokens + Laufzeit der gefilterten Versionen (d.tokens ist seit + // _chart_tokens_by_version pro Version aggregiert, nicht mehr pro Run — Label angepasst) const tok = d.tokens || {}; if ((tok.labels||[]).length) { const totalTok = (tok.tokens_in||[]).reduce((a,b)=>a+(b||0),0) + (tok.tokens_out||[]).reduce((a,b)=>a+(b||0),0); const medDur = _median((tok.duration_min||[]).filter(v=>v>0).sort((a,b)=>a-b)); _setInsight('ins-cost', - `${tok.labels.length} Runs verbrauchten ${_fmtDE(totalTok/1e6, 2)} M Tokens` + - (medDur != null ? `, Median-Laufzeit ${_fmtDE(medDur, 0)} min pro Run.` : '.')); + `${tok.labels.length} Versionen verbrauchten ${_fmtDE(totalTok/1e6, 2)} M Tokens` + + (medDur != null ? `, Median-Laufzeit ${_fmtDE(medDur, 0)} min pro Version.` : '.')); } else { _setInsight('ins-cost', ''); } // Agenten: größter Output-Token-Verursacher From 3c32c1c18deb07287784e100494cdb300766e43e Mon Sep 17 00:00:00 2001 From: TillQuandel Date: Tue, 23 Jun 2026 17:07:49 +0200 Subject: [PATCH 4/4] =?UTF-8?q?fix(dashboard):=20is=5Ffoss=5Fversion=20mat?= =?UTF-8?q?cht=20extractive-=20(realer=20Prefix)=20=E2=80=94=20Trennung=20?= =?UTF-8?q?war=20No-op?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cross-Model-Review (Codex) HIGH: die nicht-generative Pipeline taggt Runs mit "extractive-" (extractive/orchestrator.py: EXTRACTIVE_VERSION="extractive-v0.2.0"), aber is_foss_version prüfte nur "foss-" (taggt nirgends real) → die ganze #36-foss- Trennung filterte NICHTS (No-op). Fix: startswith(("extractive-","foss-")). +Test. --- generative/eval_dashboard.py | 11 +++++++---- generative/tests/test_dashboard_followups.py | 6 ++++++ 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/generative/eval_dashboard.py b/generative/eval_dashboard.py index 8b3b299..f8b80a6 100644 --- a/generative/eval_dashboard.py +++ b/generative/eval_dashboard.py @@ -107,10 +107,13 @@ def _ver_sort_key(v: str) -> tuple: return tuple(int(n) for n in re.findall(r"\d+", v)) def is_foss_version(version) -> bool: - """True fuer foss/extractive-Pipeline-Versionen (Praefix 'foss-'). Die foss- - Pipeline ist eine andere Architektur als die generative — ihre Versionen - gehoeren nicht in denselben Versions-Trend (#36).""" - return str(version or "").startswith("foss-") + """True fuer nicht-generative Pipeline-Versionen. Realer Prefix ist `extractive-` + (extractive/orchestrator.py: EXTRACTIVE_VERSION = "extractive-v0.2.0"); `foss-` + wird als Forward-Compat-Alias mit erkannt. Diese Pipelines sind eine andere + Architektur als die generative — ihre Versionen gehoeren nicht in denselben + Versions-Trend (#36). NB: `foss-` allein matchte nichts Reales → Trennung war + ein No-op (Cross-Model-Review 2026-06-23).""" + return str(version or "").startswith(("extractive-", "foss-")) def _latest_version(ver_map: dict) -> str: return sorted(ver_map.keys(), key=_ver_sort_key)[-1] diff --git a/generative/tests/test_dashboard_followups.py b/generative/tests/test_dashboard_followups.py index 5b3f3b4..5859543 100644 --- a/generative/tests/test_dashboard_followups.py +++ b/generative/tests/test_dashboard_followups.py @@ -110,6 +110,12 @@ def test_mark_scaling_recency_does_not_mutate_input(): def test_is_foss_version_detects_foss_prefix(): assert is_foss_version("foss-v0.1.1") is True assert is_foss_version("foss-v0.2.0") is True + # Realer Prefix der nicht-generativen Pipeline ist `extractive-` + # (extractive/orchestrator.py: EXTRACTIVE_VERSION = "extractive-v0.2.0"), + # NICHT `foss-` (das taggt nirgends real). Muss ebenfalls erkannt werden, + # sonst ist die ganze Trennung ein No-op (Cross-Model-Review Codex 2026-06-23). + assert is_foss_version("extractive-v0.2.0") is True + assert is_foss_version("extractive-1.0") is True def test_is_foss_version_false_for_generative_and_edge_cases():