Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 21 additions & 1 deletion generative/agents/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from __future__ import annotations
import json
import re
from pathlib import Path

from generative.agents.base import call_claude_async
from generative.agents.structured_output import parse_extractor_output
Expand Down Expand Up @@ -258,12 +259,31 @@ def _format_tag_whitelist(tags: list[str] | None,
return "\n".join(f"- {t}" for t in tags)


def _clean_source_file_display(source_file: str) -> str:
    """Return the file name shown on the prompt's `Datei:` line with a cleaned author.

    The raw Zotero file name ('Mahmood und University of the Punjab - 2016 - …')
    would otherwise leak the affiliation "co-author" into LLM secondary
    citations ('zit. n. Mahmood & Punjab') even though the author field itself
    is already cleaned, because ' und ' reads like a two-author separator.
    This is the third sibling channel of the issue-41 / PR-71 bug class.

    Args:
        source_file: File name as handed to the extractor.

    Returns:
        ``"<author> - <year> - <title><ext>"`` (or without the year part when
        the parser yields none), rebuilt from the parsed fields. File names
        for which the parser yields no author are returned unchanged.
    """
    # Function-scope import kept from the original (presumably avoids an
    # import cycle between the agents and pipeline packages; TODO confirm).
    from generative.pipeline.vault_writer import _parse_filename_fallback

    parsed = _parse_filename_fallback(source_file)
    cleaned_author = parsed.get("Author")
    if not cleaned_author:
        return source_file

    pub_year = parsed.get("Year")
    pub_title = parsed.get("Title", "")
    if pub_year:
        display_core = f"{cleaned_author} - {pub_year} - {pub_title}"
    else:
        display_core = f"{cleaned_author} - {pub_title}"
    # Re-attach the original extension so the line still looks like a file name.
    return display_core + Path(source_file).suffix


def _format_source_meta(meta: dict[str, str], source_file: str) -> str:
    """Render the source metadata as a ``- ``-prefixed bullet list.

    Args:
        meta: Metadata mapping; the keys ``"Author"``, ``"Title"`` and
            ``"Year"`` are rendered when present and truthy.
        source_file: File name of the source; always rendered as the last
            bullet.

    Returns:
        One bullet per line, joined with newlines.
    """
    parts = []
    for key, label in (("Author", "Autor"), ("Title", "Titel"), ("Year", "Jahr")):
        if meta.get(key):
            parts.append(f"{label}: {meta[key]}")
    # Exactly one `Datei:` bullet, and only the cleaned display name: emitting
    # the raw file name as well would re-introduce the affiliation "co-author"
    # that the cleaned author field is meant to hide.
    parts.append(f"Datei: {_clean_source_file_display(source_file)}")
    return "\n".join(f"- {p}" for p in parts)


Expand Down
5 changes: 3 additions & 2 deletions generative/pipeline/vault_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

from generative.config import VAULT, WISSEN, INBOX, LITERATURE_DIR, CRITIC_AUTO_THRESHOLD
from generative.schemas.atomic_note import AtomicNoteDraft
from shared.author_norm import drop_institutional_coauthors


# Schema-MoC Naming: `MoC-<Thema>.md` — Spaces erlaubt, nur FS-unsichere Zeichen ersetzen.
Expand Down Expand Up @@ -58,14 +59,14 @@ def _parse_filename_fallback(source_file: str) -> dict[str, str]:
m = _FILENAME_PATTERN_FULL.match(stem)
if m:
return {
"Author": m.group("author").strip(),
"Author": drop_institutional_coauthors(m.group("author").strip()),
"Year": m.group("year"),
"Title": m.group("title").strip(),
}
m = _FILENAME_PATTERN_NOYEAR.match(stem)
if m:
return {
"Author": m.group("author").strip(),
"Author": drop_institutional_coauthors(m.group("author").strip()),
"Title": m.group("title").strip(),
}
return {}
Expand Down
106 changes: 106 additions & 0 deletions generative/tests/test_author_norm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
"""Tests für shared.author_norm.drop_institutional_coauthors + Integration in die
beiden Zotero-Dateiname-Parser.

Bug-Klasse (Mahmood-Lauf 2026-06-25): Zotero exportiert die Affiliation als
zweiten "Autor" (`Mahmood und University of the Punjab`). Die Pipeline behandelte
"University of the Punjab" als Koautor → Body-Zitate "Mahmood & Punjab",
`_extract_primary_authors` verlor den echten Autor ganz (-> ['Punjab']),
`_short_label` erzeugte falsches "et al.". Wurzel: ungereinigter Autor-String aus
ZWEI Dateiname-Parsern (pdf_enrich._parse_filename_dynamic Kanal 1 + vault_writer.
_parse_filename_fallback Kanal 2).
"""
from pathlib import Path

from shared.author_norm import drop_institutional_coauthors
from generative.tools.pdf_enrich import _parse_filename_dynamic
from generative.pipeline.vault_writer import _parse_filename_fallback


# --- Kern-Helper: Person + Institution gemischt -> Institution droppen ---

def test_german_und_affiliation_dropped():
    """German ' und ' separator: the affiliation segment is removed."""
    cleaned = drop_institutional_coauthors("Mahmood und University of the Punjab")
    assert cleaned == "Mahmood"


def test_english_and_affiliation_dropped():
    """English ' and ' separator: the affiliation segment is removed."""
    cleaned = drop_institutional_coauthors("Smith and University of California")
    assert cleaned == "Smith"


def test_semicolon_institute_dropped():
    """Semicolon separator with a German institute name: the institute is removed."""
    cleaned = drop_institutional_coauthors("Müller; Institut für Bildungsforschung")
    assert cleaned == "Müller"


# --- Regressions-Schutz: legitime Fälle dürfen NICHT verändert werden ---

def test_two_persons_unchanged():
    """CRITICAL: a legitimate two-author string must come back untouched."""
    two_authors = "Schlebbe und Greifeneder"
    assert drop_institutional_coauthors(two_authors) == two_authors


def test_sole_corporate_author_preserved():
    """A purely corporate author (every segment institutional) is preserved."""
    corporate = "World Health Organization"
    assert drop_institutional_coauthors(corporate) == corporate


def test_single_person_unchanged():
    """A single surname without any separator is returned as-is."""
    lone_author = "Mahmood"
    assert drop_institutional_coauthors(lone_author) == lone_author


def test_empty_unchanged():
    """The empty string is returned as the empty string."""
    result = drop_institutional_coauthors("")
    assert result == ""


def test_three_persons_unchanged():
    """Three person segments without any institution stay untouched."""
    three_authors = "Gross and Latham and Folk"
    assert drop_institutional_coauthors(three_authors) == three_authors


# --- Review-Härtung (Qwen/Codex 2026-06-25) ---

def test_uppercase_separator_still_strips():
    """HIGH 1 (Qwen review): 'UND'/'AND' from a manual rename must split too."""
    cases = (
        ("Mahmood UND University of the Punjab", "Mahmood"),
        ("Smith AND University of California", "Smith"),
    )
    for raw_author, expected in cases:
        assert drop_institutional_coauthors(raw_author) == expected


def test_single_token_surname_collision_preserved():
    """HIGH 2 (Qwen) / LOW (Codex): one-word surnames that happen to be marker
    words must NOT be stripped as institutions (>=2-token guard)."""
    for untouched in ("Smith und Hospital", "Bureau und Center"):
        assert drop_institutional_coauthors(untouched) == untouched


def test_markerless_affiliation_passes_through():
    """Accepted residual (Codex MED): a shortened affiliation without a marker
    word cannot be recognised; this test documents that deliberate limit."""
    shortened = "Mahmood und Punjab"
    assert drop_institutional_coauthors(shortened) == shortened


# --- Integration: beide Dateiname-Parser liefern den gereinigten Autor ---

# Zotero-style file name whose author part carries the affiliation as a second
# "author"; shared by the integration tests below.
_MAHMOOD = (
    "Mahmood und University of the Punjab"
    " - 2016 - "
    "Do People Overestimate Their Information Literacy Skills.pdf"
)


def test_parse_filename_dynamic_cleans_affiliation():
    """Channel 1: the pdf_enrich file-name parser returns the cleaned author."""
    parsed = _parse_filename_dynamic(Path(_MAHMOOD))
    assert parsed is not None
    assert parsed["author"] == "Mahmood"


def test_parse_filename_fallback_cleans_affiliation():
    """Channel 2: the vault_writer fallback parser returns the cleaned author."""
    parsed = _parse_filename_fallback(_MAHMOOD)
    assert parsed["Author"] == "Mahmood"


def test_extractor_source_meta_datei_line_drops_affiliation():
    """Third channel: the 'Datei:' line of the extractor prompt used to show the
    raw Zotero file name, so the affiliation "co-author" leaked into LLM
    secondary citations ('zit. n. Mahmood & Punjab') despite the cleaned
    author field."""
    from generative.agents.extractor import _format_source_meta

    source_meta = {
        "Author": "Mahmood",
        "Year": "2016",
        "Title": "Do People Overestimate Their Information Literacy Skills",
    }
    rendered = _format_source_meta(source_meta, _MAHMOOD)
    assert "University of the Punjab" not in rendered
    assert "Mahmood" in rendered
5 changes: 5 additions & 0 deletions generative/tools/pdf_enrich.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@
import urllib.request
from pathlib import Path

from shared.author_norm import drop_institutional_coauthors

try:
from pypdf import PdfReader
except ImportError:
Expand Down Expand Up @@ -529,6 +531,7 @@ def _parse_filename_dynamic(pdf_path: Path) -> dict | None:
if m:
author_raw, year, title = m.group(1).strip(), int(m.group(2)), m.group(3).strip()
author = re.sub(r'\s+et al\.?$', '', author_raw, flags=re.IGNORECASE).strip()
author = drop_institutional_coauthors(author)
return {"title": title, "author": author, "year": year, "doi": "", "type": ""}

# Jahr-Anker mit automatischer Separator-Erkennung (Unterstrich, Bindestrich)
Expand All @@ -547,6 +550,7 @@ def _parse_filename_dynamic(pdf_path: Path) -> dict | None:
author_raw = re.sub(r'[_]', ' ', before).strip()
title = re.sub(r'[_]', ' ', after).strip()
author = re.sub(r'\s+et al\.?$', '', author_raw, flags=re.IGNORECASE).strip()
author = drop_institutional_coauthors(author)
return {"title": title, "author": author.split()[-1] if author else "",
"year": year, "doi": "", "type": ""}

Expand All @@ -555,6 +559,7 @@ def _parse_filename_dynamic(pdf_path: Path) -> dict | None:
if m2:
author_raw, title = m2.group(1).strip(), m2.group(2).strip()
author = re.sub(r'\s+et al\.?$', '', author_raw, flags=re.IGNORECASE).strip()
author = drop_institutional_coauthors(author)
return {"title": title, "author": author, "year": None, "doi": "", "type": ""}

return None
Expand Down
80 changes: 80 additions & 0 deletions shared/author_norm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
"""Autor-Normalisierung — geteilt zwischen Dateiname-Parsern (pdf_enrich +
vault_writer). Reine Funktion, keine internen Deps (unterste Schicht).

Bug-Klasse (Mahmood-Lauf 2026-06-25): Zotero (oft deutsche Locale) hängt die
Affiliation als zweiten "Autor" an — `Mahmood und University of the Punjab`.
Die Affiliation ist kein Koautor; sie als solchen zu behandeln verfälscht jede
Inline-Zitation ("Mahmood & Punjab"), die Planner-origin-Klassifikation und das
Footnote-Label ("et al.").
"""
from __future__ import annotations

import re

# Clearly institutional marker words (word-bounded, case-insensitive).
# Deliberately NO acronyms (MIT, ETH) and no bare "school"/"college": those are
# ambiguous with surnames. The "at least one person must remain" guarantee in
# drop_institutional_coauthors protects further: a pure list of persons is
# never touched and a purely corporate author is kept.
_INSTITUTION_RE = re.compile(
    r"\b("
    + "|".join((
        r"universi(?:ty|t[äa]t|dad|t[ée]|t[àa])",
        r"institut(?:e|o|ion)?",
        r"department|fakult[äa]t|faculty",
        r"hochschule|polytechnic",
        r"academy|akademie",
        r"laborator(?:y|ies)|laboratoire",
        r"hospital|klinik|clinic",
        r"minist(?:ry|erium|ère)",
        r"foundation|stiftung",
        r"society|gesellschaft|associat(?:ion|ed)|verband",
        r"council|committee|kommission|commission",
        r"corporation|incorporated|gmbh|inc|ltd|llc|plc",
        r"centre|center|zentrum",
        r"bureau|agency|agentur",
        r"organi[sz]ation",
    ))
    + r")\b",
    flags=re.IGNORECASE,
)

# Author separators: ';', ' und ', ' and ', ' & '. The comma is deliberately
# NOT a separator, since it would wrongly split 'Lastname, Firstname'.
# Case-insensitive because Zotero exports and manual renames also produce
# 'UND'/'AND' (Qwen review, HIGH 1).
_AUTHOR_SEP_RE = re.compile(
    r"\s*;\s*|\s+und\s+|\s+and\s+|\s*&\s*",
    flags=re.IGNORECASE,
)


def _looks_institutional(segment: str) -> bool:
    """Tell whether one author segment looks like an institution.

    A segment counts as institutional only when it has at least two
    whitespace-separated tokens AND contains an institution marker word. The
    token guard keeps one-word surnames that happen to be marker words
    (Hospital, Bureau, Center, Foundation as a person's name) from being
    stripped by mistake (Qwen/Codex review).

    Args:
        segment: One already-split author segment.

    Returns:
        True when the segment should be treated as an institution.
    """
    return (
        len(segment.split()) >= 2
        and _INSTITUTION_RE.search(segment) is not None
    )


def drop_institutional_coauthors(author: str) -> str:
    """Remove institutional affiliation segments from an author string.

    Segments are only dropped when persons AND institutions are mixed, so at
    least one person segment always remains:

    - ``"Mahmood und University of the Punjab"`` -> ``"Mahmood"``
    - ``"Schlebbe und Greifeneder"`` -> unchanged (both are persons)
    - ``"World Health Organization"`` -> unchanged (purely corporate author)

    Every surviving segment keeps the separator that directly preceded it in
    the original string, so mixed-separator input such as
    ``"University of X; Smith & Jones"`` yields ``"Smith & Jones"`` instead of
    having the first separator of the string forced onto every join.
    Downstream parsers (`_short_author`, `_short_label`) therefore see the
    original separators.

    NOTE: segments are split on every separator match, including ones inside
    an institution's own name (e.g. the ``&`` in ``"Texas A&M University"``);
    such fragments are classified individually.

    Args:
        author: Raw author string, e.g. taken from a Zotero file name.

    Returns:
        The author string without institutional segments, or ``author``
        itself (unmodified) when it is empty/blank, has fewer than two
        segments, or does not mix persons with institutions.
    """
    if not author or not author.strip():
        return author

    # Single pass over the separators: remember, for each non-empty segment,
    # the separator text that stood directly in front of it.
    segments: list[tuple[str, str]] = []
    preceding_sep = ""
    cursor = 0
    for sep_match in _AUTHOR_SEP_RE.finditer(author):
        text = author[cursor:sep_match.start()].strip()
        if text:
            segments.append((preceding_sep, text))
        preceding_sep = sep_match.group()
        cursor = sep_match.end()
    tail = author[cursor:].strip()
    if tail:
        segments.append((preceding_sep, tail))

    if len(segments) < 2:
        return author  # nothing to separate

    # Classify each segment exactly once.
    kept = [(sep, text) for sep, text in segments if not _looks_institutional(text)]
    # Only intervene when persons AND institutions are mixed.
    if not kept or len(kept) == len(segments):
        return author

    first_person = kept[0][1]
    return first_person + "".join(sep + text for sep, text in kept[1:])
Loading