TillQuandel · TillQuandel · Jun 25, 2026 · Jun 26, 2026
diff --git a/generative/agents/extractor.py b/generative/agents/extractor.py
@@ -7,6 +7,7 @@
 from __future__ import annotations
 import json
 import re
+from pathlib import Path
 
 from generative.agents.base import call_claude_async
 from generative.agents.structured_output import parse_extractor_output
@@ -258,12 +259,31 @@ def _format_tag_whitelist(tags: list[str] | None,
     return "\n".join(f"- {t}" for t in tags)
 
 
+def _clean_source_file_display(source_file: str) -> str:
+    """Return the file name for the prompt's `Datei:` line with a cleaned author.
+    The raw Zotero file name ('Mahmood und University of the Punjab - 2016 - …')
+    otherwise leaks the affiliation co-author into LLM secondary citations
+    ('zit. n. Mahmood & Punjab') even though the author field is already clean —
+    the ' und ' reads as a two-author separator. Third sibling channel of the
+    issue-41/PR-71 class. Names the fallback parser cannot parse stay unchanged."""
+    from generative.pipeline.vault_writer import _parse_filename_fallback
+    fb = _parse_filename_fallback(source_file)
+    author = fb.get("Author")
+    if not author:
+        return source_file  # no author parsed: hand the name back untouched
+    ext = Path(source_file).suffix  # re-attach the original extension at the end
+    title = fb.get("Title", "")
+    year = fb.get("Year")
+    core = f"{author} - {year} - {title}" if year else f"{author} - {title}"
+    return f"{core}{ext}"  # NOTE(review): parseable names are always re-rendered, even if nothing was stripped
+
+
 def _format_source_meta(meta: dict[str, str], source_file: str) -> str:
     parts = []
     if meta.get("Author"): parts.append(f"Autor: {meta['Author']}")
     if meta.get("Title"):  parts.append(f"Titel: {meta['Title']}")
     if meta.get("Year"):   parts.append(f"Jahr: {meta['Year']}")
-    parts.append(f"Datei: {source_file}")
+    parts.append(f"Datei: {_clean_source_file_display(source_file)}")  # cleaned display name, not the raw file name
     return "\n".join(f"- {p}" for p in parts)
 
 

diff --git a/generative/pipeline/vault_writer.py b/generative/pipeline/vault_writer.py
@@ -10,6 +10,7 @@
 
 from generative.config import VAULT, WISSEN, INBOX, LITERATURE_DIR, CRITIC_AUTO_THRESHOLD
 from generative.schemas.atomic_note import AtomicNoteDraft
+from shared.author_norm import drop_institutional_coauthors
 
 
 # Schema-MoC Naming: `MoC-<Thema>.md` — Spaces erlaubt, nur FS-unsichere Zeichen ersetzen.
@@ -58,14 +59,14 @@ def _parse_filename_fallback(source_file: str) -> dict[str, str]:
     m = _FILENAME_PATTERN_FULL.match(stem)
     if m:
         return {
-            "Author": m.group("author").strip(),
+            "Author": drop_institutional_coauthors(m.group("author").strip()),
             "Year": m.group("year"),
             "Title": m.group("title").strip(),
         }
     m = _FILENAME_PATTERN_NOYEAR.match(stem)
     if m:
         return {
-            "Author": m.group("author").strip(),
+            "Author": drop_institutional_coauthors(m.group("author").strip()),
             "Title": m.group("title").strip(),
         }
     return {}

diff --git a/generative/tests/test_author_norm.py b/generative/tests/test_author_norm.py
@@ -0,0 +1,106 @@
+"""Tests für shared.author_norm.drop_institutional_coauthors + Integration in die
+beiden Zotero-Dateiname-Parser und die `Datei:`-Zeile des Extractor-Prompts.
+
+Bug-Klasse (Mahmood-Lauf 2026-06-25): Zotero exportiert die Affiliation als
+zweiten "Autor" (`Mahmood und University of the Punjab`). Die Pipeline behandelte
+"University of the Punjab" als Koautor → Body-Zitate "Mahmood & Punjab",
+`_extract_primary_authors` verlor den echten Autor ganz (-> ['Punjab']),
+`_short_label` erzeugte falsches "et al.". Wurzel: ungereinigter Autor-String aus
+ZWEI Dateiname-Parsern (pdf_enrich._parse_filename_dynamic Kanal 1 + vault_writer.
+_parse_filename_fallback Kanal 2).
+"""
+from pathlib import Path
+
+from shared.author_norm import drop_institutional_coauthors
+from generative.tools.pdf_enrich import _parse_filename_dynamic
+from generative.pipeline.vault_writer import _parse_filename_fallback
+
+
+# --- Core helper: person + institution mixed -> drop the institution ---
+
+def test_german_und_affiliation_dropped():  # German ' und ' separator
+    assert drop_institutional_coauthors("Mahmood und University of the Punjab") == "Mahmood"
+
+
+def test_english_and_affiliation_dropped():  # English ' and ' separator
+    assert drop_institutional_coauthors("Smith and University of California") == "Smith"
+
+
+def test_semicolon_institute_dropped():  # ';' separator, German marker 'Institut'
+    assert drop_institutional_coauthors("Müller; Institut für Bildungsforschung") == "Müller"
+
+
+# --- Regression guard: legitimate cases must NOT be altered ---
+
+def test_two_persons_unchanged():
+    # CRITICAL: a legitimate two-author string stays unchanged.
+    assert drop_institutional_coauthors("Schlebbe und Greifeneder") == "Schlebbe und Greifeneder"
+
+
+def test_sole_corporate_author_preserved():
+    # A purely corporate author (every segment institutional) is preserved.
+    assert drop_institutional_coauthors("World Health Organization") == "World Health Organization"
+
+
+def test_single_person_unchanged():  # no separator present -> input returned as-is
+    assert drop_institutional_coauthors("Mahmood") == "Mahmood"
+
+
+def test_empty_unchanged():  # empty string hits the early return
+    assert drop_institutional_coauthors("") == ""
+
+
+def test_three_persons_unchanged():  # no institutional segment -> original string returned
+    assert drop_institutional_coauthors("Gross and Latham and Folk") == "Gross and Latham and Folk"
+
+
+# --- Review hardening (Qwen/Codex 2026-06-25) ---
+
+def test_uppercase_separator_still_strips():
+    # HIGH 1 (Qwen): 'UND'/'AND' from a manual rename must be recognised too.
+    assert drop_institutional_coauthors("Mahmood UND University of the Punjab") == "Mahmood"
+    assert drop_institutional_coauthors("Smith AND University of California") == "Smith"
+
+
+def test_single_token_surname_collision_preserved():
+    # HIGH 2 (Qwen) / LOW (Codex): a one-word surname that happens to be a marker
+    # word must NOT be stripped as an institution (≥2-token guard).
+    assert drop_institutional_coauthors("Smith und Hospital") == "Smith und Hospital"
+    assert drop_institutional_coauthors("Bureau und Center") == "Bureau und Center"
+
+
+def test_markerless_affiliation_passes_through():
+    # Accepted residual (Codex MED): a shortened affiliation without a marker
+    # cannot be detected — this documents the deliberate limit.
+    assert drop_institutional_coauthors("Mahmood und Punjab") == "Mahmood und Punjab"
+
+
+# --- Integration: both file-name parsers return the cleaned author ---
+
+_MAHMOOD = ("Mahmood und University of the Punjab - 2016 - "
+            "Do People Overestimate Their Information Literacy Skills.pdf")
+
+
+def test_parse_filename_dynamic_cleans_affiliation():  # channel 1: pdf_enrich parser
+    meta = _parse_filename_dynamic(Path(_MAHMOOD))
+    assert meta is not None
+    assert meta["author"] == "Mahmood"
+
+
+def test_parse_filename_fallback_cleans_affiliation():  # channel 2: vault_writer parser
+    fb = _parse_filename_fallback(_MAHMOOD)
+    assert fb["Author"] == "Mahmood"
+
+
+def test_extractor_source_meta_datei_line_drops_affiliation():
+    """Third channel: the 'Datei:' line in the extractor prompt showed the raw
+    Zotero file name → the affiliation co-author leaked into LLM secondary
+    citations ('zit. n. Mahmood & Punjab') despite the cleaned author field."""
+    from generative.agents.extractor import _format_source_meta
+    out = _format_source_meta(
+        {"Author": "Mahmood", "Year": "2016",
+         "Title": "Do People Overestimate Their Information Literacy Skills"},
+        _MAHMOOD,
+    )
+    assert "University of the Punjab" not in out
+    assert "Mahmood" in out  # NOTE(review): already satisfied by the 'Autor:' line alone
diff --git a/generative/tools/pdf_enrich.py b/generative/tools/pdf_enrich.py
@@ -24,6 +24,8 @@
 import urllib.request
 from pathlib import Path
 
+from shared.author_norm import drop_institutional_coauthors
+
 try:
     from pypdf import PdfReader
 except ImportError:
@@ -529,6 +531,7 @@ def _parse_filename_dynamic(pdf_path: Path) -> dict | None:
     if m:
         author_raw, year, title = m.group(1).strip(), int(m.group(2)), m.group(3).strip()
         author = re.sub(r'\s+et al\.?$', '', author_raw, flags=re.IGNORECASE).strip()
+        author = drop_institutional_coauthors(author)
         return {"title": title, "author": author, "year": year, "doi": "", "type": ""}
 
     # Jahr-Anker mit automatischer Separator-Erkennung (Unterstrich, Bindestrich)
@@ -547,6 +550,7 @@ def _parse_filename_dynamic(pdf_path: Path) -> dict | None:
             author_raw = re.sub(r'[_]', ' ', before).strip()
             title = re.sub(r'[_]', ' ', after).strip()
             author = re.sub(r'\s+et al\.?$', '', author_raw, flags=re.IGNORECASE).strip()
+            author = drop_institutional_coauthors(author)
             return {"title": title, "author": author.split()[-1] if author else "",
                     "year": year, "doi": "", "type": ""}
 
@@ -555,6 +559,7 @@ def _parse_filename_dynamic(pdf_path: Path) -> dict | None:
     if m2:
         author_raw, title = m2.group(1).strip(), m2.group(2).strip()
         author = re.sub(r'\s+et al\.?$', '', author_raw, flags=re.IGNORECASE).strip()
+        author = drop_institutional_coauthors(author)
         return {"title": title, "author": author, "year": None, "doi": "", "type": ""}
 
     return None

diff --git a/shared/author_norm.py b/shared/author_norm.py
@@ -0,0 +1,80 @@
+"""Autor-Normalisierung — geteilt zwischen Dateiname-Parsern (pdf_enrich +
+vault_writer). Reine Funktion, keine internen Deps (unterste Schicht).
+
+Bug-Klasse (Mahmood-Lauf 2026-06-25): Zotero (oft deutsche Locale) hängt die
+Affiliation als zweiten "Autor" an — `Mahmood und University of the Punjab`.
+Die Affiliation ist kein Koautor; sie als solchen zu behandeln verfälscht jede
+Inline-Zitation ("Mahmood & Punjab"), die Planner-origin-Klassifikation und das
+Footnote-Label ("et al.").
+"""
+from __future__ import annotations
+
+import re
+
+# Clearly institutional markers (word boundary, case-insensitive). Deliberately
+# NO acronyms (MIT, ETH) and no bare "school"/"college" — those are ambiguous
+# as surnames. The "≥1 person remains" guarantee below protects additionally:
+# a pure list of persons is never touched, a purely corporate author is kept.
+_INSTITUTION_RE = re.compile(
+    r"\b("
+    r"universi(?:ty|t[äa]t|dad|t[ée]|t[àa])"
+    r"|institut(?:e|o|ion)?"
+    r"|department|fakult[äa]t|faculty"
+    r"|hochschule|polytechnic"
+    r"|academy|akademie"
+    r"|laborator(?:y|ies)|laboratoire"
+    r"|hospital|klinik|clinic"
+    r"|minist(?:ry|erium|ère)"
+    r"|foundation|stiftung"
+    r"|society|gesellschaft|associat(?:ion|ed)|verband"
+    r"|council|committee|kommission|commission"
+    r"|corporation|incorporated|gmbh|inc|ltd|llc|plc"
+    r"|centre|center|zentrum"
+    r"|bureau|agency|agentur"
+    r"|organi[sz]ation"
+    r")\b",
+    re.IGNORECASE,
+)
+
+# Author separators: ';', ' und ', ' and ', '&' (the latter with optional
+# spaces). NOT the comma — it would wrongly split 'Lastname, Firstname'.
+# IGNORECASE: Zotero/manual renames also yield 'UND'/'AND' (Qwen review HIGH 1).
+_AUTHOR_SEP_RE = re.compile(r"\s*;\s*|\s+und\s+|\s+and\s+|\s*&\s*", re.IGNORECASE)
+
+
+def _looks_institutional(segment: str) -> bool:
+    """A segment counts as institutional only if it has ≥2 whitespace tokens AND
+    contains an institution marker. The token guard keeps one-word surnames that
+    happen to be marker words (Hospital, Bureau, Center, Foundation as a person's
+    name) from being stripped by mistake (Qwen/Codex review)."""
+    if len(segment.split()) < 2:
+        return False
+    return bool(_INSTITUTION_RE.search(segment))
+
+
+def drop_institutional_coauthors(author: str) -> str:
+    """Drop institutional affiliation segments from an author string, but only
+    when person and institutional segments are mixed; otherwise return it as-is.
+
+    - ``"Mahmood und University of the Punjab"`` → ``"Mahmood"``
+    - ``"Schlebbe und Greifeneder"`` → unchanged (both persons)
+    - ``"World Health Organization"`` → unchanged (purely corporate author)
+    The first-matched separator re-joins the survivors, so downstream parsers
+    (`_short_author`, `_short_label`) keep working. NOTE(review): a separator inside
+    an institution name splits it too ("X and Department of A and B" → "X and B").
+    """
+    if not author or not author.strip():
+        return author
+    sep_match = _AUTHOR_SEP_RE.search(author)
+    if not sep_match:
+        return author  # a single segment — nothing to split
+    parts = [p.strip() for p in _AUTHOR_SEP_RE.split(author) if p.strip()]
+    if len(parts) < 2:
+        return author
+    persons = [p for p in parts if not _looks_institutional(p)]
+    institutional = [p for p in parts if _looks_institutional(p)]
+    # Only intervene when persons AND institutions are mixed.
+    if not persons or not institutional:
+        return author
+    sep = sep_match.group()
+    return sep.join(persons)