diff --git a/generative/eval_dashboard.py b/generative/eval_dashboard.py index 533dc04..f8b80a6 100644 --- a/generative/eval_dashboard.py +++ b/generative/eval_dashboard.py @@ -106,6 +106,15 @@ def _ver_sort_key(v: str) -> tuple: return tuple(int(n) for n in re.findall(r"\d+", v)) +def is_foss_version(version) -> bool: + """True fuer nicht-generative Pipeline-Versionen. Realer Prefix ist `extractive-` + (extractive/orchestrator.py: EXTRACTIVE_VERSION = "extractive-v0.2.0"); `foss-` + wird als Forward-Compat-Alias mit erkannt. Diese Pipelines sind eine andere + Architektur als die generative — ihre Versionen gehoeren nicht in denselben + Versions-Trend (#36). NB: `foss-` allein matchte nichts Reales → Trennung war + ein No-op (Cross-Model-Review 2026-06-23).""" + return str(version or "").startswith(("extractive-", "foss-")) + def _latest_version(ver_map: dict) -> str: return sorted(ver_map.keys(), key=_ver_sort_key)[-1] @@ -436,6 +445,31 @@ def _chart_longitudinal(log_data: dict) -> dict: return {"versions": versions, "datasets": datasets} +_DELTA_MIN_N = 20 # unter N=20 kein Besser/Schlechter-Urteil (Apophenie-Schutz) + + +def version_delta(kpi_trend: dict, metric: str) -> dict: + """Delta neueste-vs-Vorversion fuer eine KPI-Metrik. + + `kpi_trend["versions"]` ist aufsteigend sortiert (neueste = letzte Position), + die Metrik-Arrays laufen parallel dazu. `reliable` ist nur True, wenn beide + beteiligten Versionen n>=20 haben — sonst ist das Delta Rauschen (N-Guard). + """ + values = kpi_trend.get(metric) or [] + ns = kpi_trend.get("n") or [] + latest = values[-1] if values else None + prev = values[-2] if len(values) >= 2 else None + n_latest = ns[-1] if ns else None + n_prev = ns[-2] if len(ns) >= 2 else None + delta = None if (latest is None or prev is None) else round(latest - prev, 4) + reliable = ( + delta is not None + and (n_latest or 0) >= _DELTA_MIN_N + and (n_prev or 0) >= _DELTA_MIN_N + ) + return {"latest": latest, "prev": prev, "delta": delta, "reliable": reliable} + + def _chart_tokens(runs: list[dict]) -> dict: return { "labels": [r["date"] for r in runs], @@ -447,6 +481,47 @@ def _chart_tokens(runs: list[dict]) -> dict: } +_SCALING_RECENT_KEEP = 10 # juengste N Versionen ungedimmt (#36 P2) + + +def mark_scaling_recency(points: list[dict], keep: int = _SCALING_RECENT_KEEP) -> list[dict]: + """Gibt eine neue Punktliste zurueck, in der die Punkte der juengsten `keep` + Versionen `recent=True` tragen, aeltere `recent=False`. So kann der Client + kaputte Frueh-Versions-Aeren dimmen, statt sie ungefiltert in die + PDF-Laengen-Skalierung zu mischen (#36 P2). Mutiert die Eingabe nicht. + """ + versions = sorted({p["ver"] for p in points if p.get("ver")}, key=_ver_sort_key) + recent_set = set(versions[-keep:]) if keep > 0 else set() + return [{**p, "recent": p.get("ver") in recent_set} for p in points] + + +def _chart_tokens_by_version(runs: list[dict]) -> dict: + """Token-Komposition (Summe) + Median-Duration pro Pipeline-Version, + aufsteigend sortiert (neueste rechts), foss-frei. Ersetzt die chronologische + Pro-Run-Achse, die bei vielen Runs unlesbar war und keinen Vergleich trug + (#36, E6).""" + by_ver: dict = {} + for r in runs: + ver = r.get("ver") or r.get("pipeline_version") + if not ver or is_foss_version(ver): + continue + b = by_ver.setdefault(ver, {"in": 0, "out": 0, "cache": 0, "dur": []}) + b["in"] += r.get("tokens_in", 0) or 0 + b["out"] += r.get("tokens_out", 0) or 0 + b["cache"] += r.get("tokens_cache", 0) or 0 + if r.get("duration_min") is not None: + b["dur"].append(r["duration_min"]) + versions = sorted(by_ver, key=_ver_sort_key) + return { + "labels": versions, + "tokens_in": [by_ver[v]["in"] for v in versions], + "tokens_out": [by_ver[v]["out"] for v in versions], + "tokens_cache": [by_ver[v]["cache"] for v in versions], + "duration_min": [round(_median(by_ver[v]["dur"]), 1) if by_ver[v]["dur"] else None + for v in versions], + } + + def _chart_scaling(all_log_runs: list[dict]) -> dict: points = [ { @@ -462,6 +537,7 @@ def _chart_scaling(all_log_runs: list[dict]) -> dict: for r in all_log_runs if r["words"] is not None ] + points = mark_scaling_recency(points) keys = sorted({p["key"] for p in points}) return {"points": points, "keys": keys} diff --git a/generative/eval_dashboard_server.py b/generative/eval_dashboard_server.py index 1663305..4274b56 100644 --- a/generative/eval_dashboard_server.py +++ b/generative/eval_dashboard_server.py @@ -409,8 +409,10 @@ def build_data(eval_version: str | None = None, # ── all_log_runs Dropdown-Optionen VOR all_log_runs-Filtern ────── _all_pdfs_opts = sorted({r["label"] for r in all_log_runs if r.get("label")}) - _all_pvers_opts = sorted({r["ver"] for r in all_log_runs if r.get("ver")}, - key=lambda v: [int(x) for x in __import__("re").findall(r"\d+", v)]) + _all_pvers_opts = sorted({r["ver"] for r in all_log_runs + if r.get("ver") and not D.is_foss_version(r["ver"])}, + key=lambda v: [int(x) for x in __import__("re").findall(r"\d+", v)], + reverse=True) # neueste generative Version oben (foss raus, #36) # PDF + Language + Version + Model-Filter auf all_log_runs (nach DB-Fallback) if model: @@ -432,6 +434,16 @@ def build_data(eval_version: str | None = None, or (r.get("key","")).lower() in _lang_pdfs] log_data = D._build_log_data(all_log_runs) + # foss-Pipeline (gliner/extractive) nicht mit generativer mischen: + # im ungefilterten Default-View foss ausschliessen — ueber Modell-/Versions- + # Filter bleibt foss einsehbar (#36, User-Wunsch 2026-06-19) + if not (model or pipeline_version): + all_log_runs = [r for r in all_log_runs if not D.is_foss_version(r.get("ver"))] + log_data = D._build_log_data(all_log_runs) + quality_rows = [r for r in quality_rows + if not D.is_foss_version(r.get("version") or r.get("pipeline_version"))] + token_runs = [tr for tr in token_runs if not D.is_foss_version(tr.get("ver"))] + # Log-Runs nach Version gruppiert runs_by_version: dict = {} for r in all_log_runs: @@ -526,6 +538,11 @@ def _pooled_accept(ver: str) -> float | None: "tokens": [round(sum(tok_by_ver.get(v,[])) / 1000, 1) if tok_by_ver.get(v) else None for v in sorted_pipeline_versions], # in M-Tokens "cost": [round(sum(cost_by_ver.get(v, [])), 4) if cost_by_ver.get(v) else None for v in sorted_pipeline_versions], } + # Delta neueste-vs-Vorversion pro KPI (mit N-Guard, #36 P4) + kpi_trend["deltas"] = { + m: D.version_delta(kpi_trend, m) + for m in ("hall", "cov", "n", "accept", "dur", "tokens", "cost") + } return { "generated_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), @@ -536,7 +553,7 @@ def _pooled_accept(ver: str) -> float | None: "accept": D._chart_acceptance(log_data), "scatter": _chart_scatter_versioned(quality_rows), "long": D._chart_longitudinal(log_data), - "tokens": D._chart_tokens(token_runs), + "tokens": D._chart_tokens_by_version(token_runs), "scaling": D._chart_scaling(all_log_runs), "quality_by_version": quality_by_version, "runs_by_version": runs_by_version, diff --git a/generative/tests/test_dashboard_followups.py b/generative/tests/test_dashboard_followups.py new file mode 100644 index 0000000..5859543 --- /dev/null +++ b/generative/tests/test_dashboard_followups.py @@ -0,0 +1,156 @@ +"""Tests für die Dashboard-Follow-ups (atomic-notes Issue #36, 2026-06-19). + +- P4: version_delta — Delta neueste-vs-Vorversion pro KPI mit N-Guard. +- P2: scaling-Recency-Flag — alte Versions-Ären markieren (dimmen statt mischen). +- P1: _chart_longitudinal Median-über-PDFs-Serie (Anti-Spaghetti). +""" +from __future__ import annotations + +import pytest + +from generative.eval_dashboard import ( + version_delta, mark_scaling_recency, is_foss_version, _chart_tokens_by_version, +) + + +# ---------------------------------------------------------------- P4 fixtures +def _kpi_trend(**over): + """sorted_pipeline_versions aufsteigend → neueste = letzte Position.""" + base = { + "versions": ["v0.3.134", "v0.3.135"], + "hall": [12.0, 9.7], + "cov": [30.0, 35.0], + "n": [25, 22], + "accept": [50.0, 60.0], + } + base.update(over) + return base + + +# ------------------------------------------------------------------- P4 tests +def test_version_delta_latest_prev_and_signed_delta(): + d = version_delta(_kpi_trend(), "hall") + assert d["latest"] == 9.7 + assert d["prev"] == 12.0 + assert d["delta"] == pytest.approx(-2.3) # Halluzination gesunken → negativ + + +def test_version_delta_positive_sign_for_rising_coverage(): + d = version_delta(_kpi_trend(), "cov") + assert d["delta"] == pytest.approx(5.0) # Coverage gestiegen → positiv + + +def test_version_delta_reliable_only_when_both_n_at_least_20(): + assert version_delta(_kpi_trend(n=[25, 22]), "hall")["reliable"] is True + assert version_delta(_kpi_trend(n=[25, 5]), "hall")["reliable"] is False # latest zu klein + assert version_delta(_kpi_trend(n=[5, 25]), "hall")["reliable"] is False # prev zu klein + + +def test_version_delta_no_previous_version_yields_none_delta(): + d = version_delta( + {"versions": ["v0.3.135"], "hall": [9.7], "n": [22]}, "hall" + ) + assert d["latest"] == 9.7 + assert d["prev"] is None + assert d["delta"] is None + assert d["reliable"] is False + + +def test_version_delta_none_metric_value_yields_none_delta(): + d = version_delta(_kpi_trend(hall=[12.0, None]), "hall") + assert d["latest"] is None + assert d["delta"] is None + assert d["reliable"] is False + + +def test_version_delta_empty_trend_yields_none_delta(): + d = version_delta({"versions": [], "hall": [], "n": []}, "hall") + assert d["latest"] is None + assert d["prev"] is None + assert d["delta"] is None + assert d["reliable"] is False + + +# ------------------------------------------------------------------- P2 tests +def _pt(ver, key="a", x=1000, y=4): + return {"x": x, "y": y, "key": key, "label": key, "ver": ver} + + +def test_mark_scaling_recency_flags_only_youngest_keep_versions(): + pts = [_pt(f"v0.3.{i}") for i in range(1, 13)] # 12 Versionen, numerisch sortiert + recent = {p["ver"]: p["recent"] for p in mark_scaling_recency(pts, keep=10)} + assert recent["v0.3.1"] is False # die zwei ältesten Versionen gedimmt + assert recent["v0.3.2"] is False + assert recent["v0.3.3"] is True + assert recent["v0.3.12"] is True + + +def test_mark_scaling_recency_all_recent_when_fewer_than_keep(): + out = mark_scaling_recency([_pt("v0.1.0"), _pt("v0.2.0")], keep=10) + assert all(p["recent"] for p in out) + + +def test_mark_scaling_recency_multiple_points_per_version(): + pts = [_pt("v0.1.0", key="a"), _pt("v0.1.0", key="b"), _pt("v0.9.0", key="a")] + out = mark_scaling_recency(pts, keep=1) + assert [p["recent"] for p in out] == [False, False, True] # nur jüngste Version recent + + +def test_mark_scaling_recency_missing_version_not_recent(): + assert mark_scaling_recency([_pt(None)], keep=10)[0]["recent"] is False + + +def test_mark_scaling_recency_does_not_mutate_input(): + pts = [_pt("v0.1.0")] + mark_scaling_recency(pts, keep=10) + assert "recent" not in pts[0] + + +# ----------------------------------------------------- foss/generative-Trennung +def test_is_foss_version_detects_foss_prefix(): + assert is_foss_version("foss-v0.1.1") is True + assert is_foss_version("foss-v0.2.0") is True + # Realer Prefix der nicht-generativen Pipeline ist `extractive-` + # (extractive/orchestrator.py: EXTRACTIVE_VERSION = "extractive-v0.2.0"), + # NICHT `foss-` (das taggt nirgends real). Muss ebenfalls erkannt werden, + # sonst ist die ganze Trennung ein No-op (Cross-Model-Review Codex 2026-06-23). + assert is_foss_version("extractive-v0.2.0") is True + assert is_foss_version("extractive-1.0") is True + + +def test_is_foss_version_false_for_generative_and_edge_cases(): + assert is_foss_version("v0.3.139") is False + assert is_foss_version("v0.1.0") is False # generativ, trotz kleiner Zahl + assert is_foss_version("") is False + assert is_foss_version(None) is False + + +# ------------------------------------------ Token/Duration pro Version (statt chronologisch) +def test_chart_tokens_by_version_sums_and_medians_excluding_foss(): + runs = [ + {"ver": "v0.3.1", "tokens_in": 100, "tokens_out": 50, "tokens_cache": 10, "duration_min": 5.0}, + {"ver": "v0.3.1", "tokens_in": 200, "tokens_out": 50, "tokens_cache": 10, "duration_min": 7.0}, + {"ver": "v0.3.2", "tokens_in": 300, "tokens_out": 60, "tokens_cache": 20, "duration_min": 10.0}, + {"ver": "foss-v0.1.0", "tokens_in": 999, "tokens_out": 999, "tokens_cache": 999, "duration_min": 99.0}, + ] + out = _chart_tokens_by_version(runs) + assert out["labels"] == ["v0.3.1", "v0.3.2"] # foss raus, aufsteigend (neueste rechts) + assert out["tokens_in"] == [300, 300] # v0.3.1: 100+200 + assert out["tokens_out"] == [100, 60] + assert out["tokens_cache"] == [20, 20] + assert out["duration_min"] == [6.0, 10.0] # median([5,7])=6 + + +def test_chart_tokens_by_version_empty(): + out = _chart_tokens_by_version([]) + assert out["labels"] == [] and out["tokens_in"] == [] + + +def test_chart_tokens_by_version_skips_versionless_runs(): + runs = [ + {"ver": "v0.3.1", "tokens_in": 100, "tokens_out": 10, "tokens_cache": 0, "duration_min": 5.0}, + {"ver": "", "tokens_in": 999, "tokens_out": 999, "tokens_cache": 0, "duration_min": 9.0}, + {"tokens_in": 888, "tokens_out": 0, "tokens_cache": 0, "duration_min": 3.0}, + ] + out = _chart_tokens_by_version(runs) + assert out["labels"] == ["v0.3.1"] # versionslose raus, kein "?"-Bucket diff --git a/internal/dashboard/eval_dashboard.html b/internal/dashboard/eval_dashboard.html index c8d47cf..966e747 100644 --- a/internal/dashboard/eval_dashboard.html +++ b/internal/dashboard/eval_dashboard.html @@ -548,7 +548,6 @@
-
Eval
auto-refresh 15 s
-
- Eval - -
- Skalen nicht vergleichbar +
lädt…
@@ -717,7 +712,7 @@
-
Skaliert die Pipeline mit der PDF-Länge?
Lange Papers erzeugen nicht linear mehr Notes.
+
Skaliert die Pipeline mit der PDF-Länge?
Lange Papers erzeugen nicht linear mehr Notes. Ältere Versionen gedimmt.
Notes vs. Wörter
@@ -1134,15 +1129,22 @@ const vers = kpiTrend?.versions || []; const sparkColor = toneColor(kd.tone); const sparkSvg = _trendChart(vals, vers, sparkColor, kd.u); - const validVals = vals.filter(v=>v!=null); - const delta = validVals.length >= 2 ? (validVals[validVals.length-1] - validVals[0]).toFixed(1) : null; - const dClass = delta!=null ? (parseFloat(delta)>0 ? (kd.key==='hall'?'dneg':'dpos') : (kd.key==='hall'?'dpos':'dneg')) : ''; - const sign = delta!=null && parseFloat(delta)>0 ? '+' : ''; + // #36 P4: Delta neueste-vs-Vorversion (server-berechnet) mit N-Guard statt + // neueste-vs-erste. Ohne reliable (n<20 in einer der beiden Versionen) kein + // Besser/Schlechter-Farbcode — nur neutrale Anzeige (Rausch-Schutz). + const vd = kpiTrend?.deltas?.[kd.key] || null; + const dval = (vd && vd.delta != null) ? vd.delta.toFixed(1) : null; + const prevVer = vers.length >= 2 ? vers[vers.length-2] : null; + const dClass = (dval!=null && vd.reliable && parseFloat(dval)!==0) ? (parseFloat(dval)>0 ? (kd.key==='hall'?'dneg':'dpos') : (kd.key==='hall'?'dpos':'dneg')) : ''; // 0.0 = neutral, keine Farbe + const sign = dval!=null && parseFloat(dval)>0 ? '+' : ''; + const deltaHtml = dval!=null + ? `Δ vs. ${prevVer}: ${sign}${dval} ${kd.u}${vd.reliable ? '' : ' (n<20)'}` + : `${vers.length} Pipeline-Versionen`; const inner = document.getElementById('spark-inner'); if (inner) inner.innerHTML = `
${kd.label} — Versions-Trend -
${delta!=null ? `Δ ${sign}${delta} ${kd.u} · ${vers.length} Versionen` : `${vers.length} Pipeline-Versionen`}
+
${deltaHtml}
${sparkSvg}
`; } @@ -1172,7 +1174,9 @@ const au = _autoUnit(valid, unit); // dynamische Einheit + Formatter const vmax = Math.max(...valid), vmin = Math.min(...valid); const pad = (vmax - vmin) * 0.2 || (vmax * 0.1) || 1; - const max = vmax + pad, min = Math.max(0, vmin - pad); + // %-Metriken bei 100 deckeln — eine Rate kann nicht ueber 100 % (User-Feedback) + const max = (unit === '%') ? Math.min(100, vmax + pad) : vmax + pad; + const min = Math.max(0, vmin - pad); const span = (max - min) || 1; const n = points.length; const xs = points.map((_,i) => padL + (i/(n-1))*iw); @@ -1222,10 +1226,11 @@ }).join(''); // Version-Labels auf X-Achse - const verLabels = (versions||[]).map((v,i) => - `${v}` - ).join(''); + // Nur Endpunkte labeln — bei vielen Versionen ueberlappen Einzel-Labels (User-Feedback) + const verLabels = (versions && versions.length >= 2) + ? `${versions[0] ?? ''}` + + `${versions[n-1] ?? ''}` + : ''; // 3 Hilfslinien mit Wert-Beschriftung — mehr Kontext const hairLines = [0, 0.5, 1].map(t => { @@ -1410,7 +1415,9 @@ const col=t.enc[keys.indexOf(k)%t.enc.length]; return{label:(d.scaling.points.find(p=>p.key===k)||{}).label||k, data:d.scaling.points.filter(p=>p.key===k).map(p=>({x:p.x,y:p.y,_p:p})), - backgroundColor:col+'b0',borderColor:col,borderWidth:1,pointRadius:6,pointHoverRadius:9}; + backgroundColor:ctx=>ctx.raw?._p?.recent===false?col+'33':col+'b0', + borderColor:ctx=>ctx.raw?._p?.recent===false?col+'55':col, + borderWidth:1,pointRadius:ctx=>ctx.raw?._p?.recent===false?3:6,pointHoverRadius:9}; })}, options:{maintainAspectRatio:false,responsive:true,animation:{duration:500}, plugins:{legend:legendCfg(),tooltip:{callbacks:{label:ctx=>{const p=ctx.raw._p||{};return [`${p.label} ${p.ver}`,`${(p.x||0).toLocaleString('de-DE')} Wörter · ${p.y} Notes`];}}}}, @@ -1474,14 +1481,13 @@ // ch5 — Tokens (Komposition: Teal=Input, Amber=Output, Grau=Cache) if (d.tokens?.labels?.length) { - const ax5=document.getElementById('ax5'); if(ax5) ax5.textContent=`${d.tokens.labels.length} Runs · chronologisch`; - const _tokPdf = d.tokens.pdf_labels||[]; + const ax5=document.getElementById('ax5'); if(ax5) ax5.textContent=`${d.tokens.labels.length} Versionen · Summe`; getOrCreate('ch5',{type:'bar',data:{labels:d.tokens.labels,datasets:[ {label:'Input', data:d.tokens.tokens_in, backgroundColor:t.teal+'aa', stack:'t'}, {label:'Output',data:d.tokens.tokens_out,backgroundColor:t.amber+'aa', stack:'t'}, {label:'Cache', data:d.tokens.tokens_cache,backgroundColor:t.ink4+'66',stack:'t'}, ]},options:{maintainAspectRatio:false,responsive:true,animation:{duration:500}, - plugins:{legend:legendCfg(),tooltip:{callbacks:{title:items=>{const i=items[0]?.dataIndex;const pdf=_tokPdf[i];return pdf?`${d.tokens.labels[i]} · ${pdf}`:d.tokens.labels[i];}}}}, + plugins:{legend:legendCfg()}, scales:{x:axisCfg({grid:{display:false},ticks:{color:t.ink3,maxRotation:45,font:{size:9}}}), y:{...axisCfg(),stacked:true,title:{display:true,text:'Tokens',color:t.ink4,font:{size:11}}}}}}); const l5=document.getElementById('leg5'); @@ -1489,7 +1495,7 @@ // ch6 — Duration getOrCreate('ch6',{type:'line',data:{labels:d.tokens.labels,datasets:[{data:d.tokens.duration_min||[],borderColor:t.coral,backgroundColor:t.coral+'18',borderWidth:2,pointRadius:4,pointHoverRadius:7,pointBackgroundColor:t.card,pointBorderColor:t.coral,pointBorderWidth:2,tension:.3,fill:true}]}, options:{maintainAspectRatio:false,responsive:true,animation:{duration:500}, - plugins:{legend:{display:false},tooltip:{callbacks:{title:items=>{const i=items[0]?.dataIndex;const pdf=_tokPdf[i];return pdf?`${d.tokens.labels[i]} · ${pdf}`:d.tokens.labels[i];}}}}, + plugins:{legend:{display:false}}, scales:{x:axisCfg({grid:{display:false},ticks:{color:t.ink3,maxRotation:45,font:{size:9}}}),y:axisCfg({beginAtZero:true,ticks:{callback:v=>v+' min'}})}}}); } else { setChartEmpty('ch5'); setChartEmpty('ch6'); @@ -1742,14 +1748,15 @@ } else { _setInsight('ins-quality', ''); } } else { _setInsight('ins-quality', ''); } - // Kosten: Gesamt-Tokens + Laufzeit der gefilterten Runs + // Kosten: Gesamt-Tokens + Laufzeit der gefilterten Versionen (d.tokens ist seit + // _chart_tokens_by_version pro Version aggregiert, nicht mehr pro Run — Label angepasst) const tok = d.tokens || {}; if ((tok.labels||[]).length) { const totalTok = (tok.tokens_in||[]).reduce((a,b)=>a+(b||0),0) + (tok.tokens_out||[]).reduce((a,b)=>a+(b||0),0); const medDur = _median((tok.duration_min||[]).filter(v=>v>0).sort((a,b)=>a-b)); _setInsight('ins-cost', - `${tok.labels.length} Runs verbrauchten ${_fmtDE(totalTok/1e6, 2)} M Tokens` + - (medDur != null ? `, Median-Laufzeit ${_fmtDE(medDur, 0)} min pro Run.` : '.')); + `${tok.labels.length} Versionen verbrauchten ${_fmtDE(totalTok/1e6, 2)} M Tokens` + + (medDur != null ? `, Median-Laufzeit ${_fmtDE(medDur, 0)} min pro Version.` : '.')); } else { _setInsight('ins-cost', ''); } // Agenten: größter Output-Token-Verursacher @@ -1796,7 +1803,7 @@ function _renderWithData(d) { if(!d) return; window._lastData=d; const t = C(); - _initEvalVerDropdown(d.available_eval_versions||[], d.eval_version||''); + // Eval-Version-Dropdown entfernt — Server nutzt default (neueste eval_version) // Dropdowns: immer mit ALLEN Optionen befüllen (vor globalem Filter) if (d.all_pvers?.length) _initGlobalPverFilter(d.all_pvers); if (d.all_pdfs?.length) _initGlobalPdfFilter(d.all_pdfs); @@ -1847,9 +1854,8 @@ async function loadAndRender() { const seq = ++_loadSeq; try { - const urlVer=_getEvalVerFromUrl(), verParam=_currentEvalVersion||urlVer||''; + // Eval-Version-Filter entfernt: nie aus URL/State setzen → Server nutzt default (neueste) const p = new URLSearchParams(); - if (verParam) p.set('eval_version', verParam); if (_globalFilters.lang && _globalFilters.lang !== '__all__') p.set('language', _globalFilters.lang); if (_globalFilters.model && _globalFilters.model !== '__all__') p.set('model', _globalFilters.model); if (_globalFilters.pdf && _globalFilters.pdf !== '__all__') p.set('pdf', _globalFilters.pdf);