Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 56 additions & 15 deletions generative/agents/cross_reference.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from generative.agents.base import call_claude
from generative.agents.structured_output import parse_cross_reference_output
from generative import config as _config
from generative.config import VAULT, MODEL_CROSS_REF, ENABLE_NLI_VALIDATION, NLI_MODEL_NAME, NLI_CONTRADICTION_THRESHOLD
from generative.config import VAULT, MODEL_CROSS_REF, ENABLE_NLI_VALIDATION, NLI_MODEL_NAME, NLI_CONTRADICTION_THRESHOLD, SIBLING_SEMANTIC_COSINE_THRESHOLD
from generative.schemas.atomic_note import AtomicNoteDraft

# Mindest-Anzahl `related`-Wikilinks für eine Schema-konforme Note (siehe Schema-Konzept §5)
Expand Down Expand Up @@ -282,6 +282,47 @@ def _excerpt_from_body(body: str, max_words: int = 150) -> str:
return " ".join(body.split()[:max_words])


def _rank_sibling_candidates(draft: AtomicNoteDraft,
                             siblings: dict[str, AtomicNoteDraft] | None,
                             query_tokens: set,
                             sib_cosine_fn,
                             threshold: float | None = None,
                             *,
                             top_k: int = 5,
                             ) -> list[tuple[str, AtomicNoteDraft]]:
    """Rank pipeline siblings as ``related`` candidates using two additive signals.

    Signal 1 (lexical): a sibling whose title/alias tokens share at least one
    token with ``query_tokens`` is always a candidate. Its score is
    ``1.0 + overlap`` (so at least 2.0).
    Signal 2 (semantic): a sibling with zero shared tokens is a candidate only
    if ``sib_cosine_fn(sib_draft)`` is at least ``threshold``. Its score is that
    value itself.

    ``sib_cosine_fn`` is called only for siblings without any token overlap, so
    callers pay for embeddings only where the lexical gate fails.

    Args:
        draft: The note being cross-referenced; a sibling with the same title
            is treated as the draft itself and skipped.
        siblings: Mapping of sibling title to sibling draft; ``None`` or empty
            yields no candidates.
        query_tokens: Token set of the draft, compared against each sibling's
            title and alias tokens.
        sib_cosine_fn: Callable taking a sibling draft and returning a float
            similarity (assumed to be a cosine, i.e. at most 1.0, which keeps
            semantic hits below every lexical hit — confirm against the caller).
        threshold: Minimum similarity for token-disjoint siblings; defaults to
            ``SIBLING_SEMANTIC_COSINE_THRESHOLD`` when ``None``.
        top_k: Maximum number of candidates to return (default 5).

    Returns:
        Up to ``top_k`` ``(title, draft)`` pairs, highest score first; ties keep
        the iteration order of ``siblings``.
    """
    if not siblings:
        return []
    if threshold is None:
        threshold = SIBLING_SEMANTIC_COSINE_THRESHOLD

    scored: list[tuple[float, str, AtomicNoteDraft]] = []
    for sib_title, sib_draft in siblings.items():
        if sib_title == draft.title:
            continue  # never propose the draft as its own sibling
        # Copy into a fresh set: extending the object returned by _tokens in
        # place would corrupt it if that helper ever shares or caches results.
        sib_keys = set(_tokens(sib_title))
        for alias in sib_draft.aliases or ():  # tolerate a missing alias list
            sib_keys.update(_tokens(alias))
        overlap = len(query_tokens & sib_keys)
        if overlap >= 1:
            # Lexical hit: the 1.0 offset lifts it above any semantic-only score.
            scored.append((1.0 + overlap, sib_title, sib_draft))
            continue
        cos = sib_cosine_fn(sib_draft)
        if cos >= threshold:
            scored.append((cos, sib_title, sib_draft))

    # Stable descending sort: equal scores keep their insertion order.
    scored.sort(key=lambda item: item[0], reverse=True)
    return [(title, sib) for _, title, sib in scored[:max(top_k, 0)]]


def run(draft: AtomicNoteDraft, existing_concepts: dict[str, str],
siblings: dict[str, AtomicNoteDraft] | None = None) -> AtomicNoteDraft:
# Relevante existierende Notes finden via Content-Token-Overlap.
Expand All @@ -304,22 +345,22 @@ def run(draft: AtomicNoteDraft, existing_concepts: dict[str, str],

# Stage B (F5): Pipeline-Sibling-Drafts als zusätzliche Kandidaten.
# Drafts vom selben PDF-Lauf kennen sich nicht — Cross-Reference sah bisher nur
# Vault. Bei Kuhlthau-ISP-Phasen führt das zu 0-1 related → Hard-Gate-Fail.
# Sibling-Drafts werden mit gleicher Token-Overlap-Heuristik geranked, Top 5.
# Vault. Token-Overlap-Gate verfehlte lexikalisch disjunkte, aber semantisch nahe
# Geschwister (0 Tokens → 0 Kandidaten → leeres related). Jetzt additiv:
# Token-Overlap ODER Body-Embedding-Cosine ≥ Schwelle (_rank_sibling_candidates).
sibling_candidates: list[tuple[str, AtomicNoteDraft]] = []
if siblings:
scored_sib: list[tuple[int, str, AtomicNoteDraft]] = []
for sib_title, sib_draft in siblings.items():
if sib_title == draft.title:
continue # self
sib_keys = _tokens(sib_title)
for alias in sib_draft.aliases:
sib_keys |= _tokens(alias)
overlap = len(query_tokens & sib_keys)
if overlap >= 1:
scored_sib.append((overlap, sib_title, sib_draft))
scored_sib.sort(key=lambda t: -t[0])
sibling_candidates = [(t, d) for _, t, d in scored_sib[:5]]
from generative.pipeline import embeddings as _emb
_draft_emb: dict[str, object] = {}

def _sib_cos(sib_draft: AtomicNoteDraft) -> float:
    """Return the body-embedding cosine between the current draft and ``sib_draft``."""
    # The draft's own embedding is computed lazily and cached under key "e";
    # only token-disjoint siblings are expected to reach this function.
    if "e" not in _draft_emb:
        _draft_emb["e"] = _emb.embed_body(draft.body or "")
    own_vector = _draft_emb["e"]
    sibling_vector = _emb.embed_body(sib_draft.body or "")
    return _emb.cosine(own_vector, sibling_vector)

sibling_candidates = _rank_sibling_candidates(
draft, siblings, query_tokens, _sib_cos)

total_candidates = len(vault_candidates) + len(sibling_candidates)
if total_candidates == 0:
Expand Down
11 changes: 11 additions & 0 deletions generative/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,17 @@
REDUNDANT_SIBLING_COSINE_THRESHOLD = float(
os.getenv("ATOMIC_AGENT_REDUNDANT_SIBLING_COSINE", "0.90"))

# Stage-B sibling linking: at or above this body cosine, a pipeline sibling WITHOUT
# any title/alias token overlap is still admitted as a related candidate (additive
# to the token gate, which stays unchanged). Catches siblings that are semantically
# close but titled with lexically disjoint words (German compounds, alias drift) —
# e.g. "Wissensorganisation" ↔ "Semantisches Retrieval mit Assoziationsnetz" (cos 0.97).
# Empirically calibrated 2026-06-27: related siblings 0.97–0.985, unrelated pairs
# 0.73–0.76 → 0.85 separates them with margin. Candidacy is looser than the
# redundancy flag (0.90); the LLM remains the final arbiter of the link.
# Overridable via the ATOMIC_AGENT_SIBLING_SEMANTIC_COSINE environment variable.
SIBLING_SEMANTIC_COSINE_THRESHOLD = float(
    os.getenv("ATOMIC_AGENT_SIBLING_SEMANTIC_COSINE", "0.85"))

# Fallback chunk size (in words)
CHUNK_WORDS = 3000

Expand Down
88 changes: 88 additions & 0 deletions generative/tests/test_sibling_semantic_linking.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
"""Tests für semantisches Sibling-Candidate-Ranking (Stage B).

Wurzel: Stage B gated Pipeline-Geschwister bisher über rohen Titel-/Alias-Token-
Overlap (`overlap >= 1`). Zwei semantisch fast identische Notes EINES Laufs mit
lexikalisch disjunkten Titeln ("Wissensorganisation" ↔ "Semantisches Retrieval mit
Assoziationsnetz", Body-cos 0,97) teilten 0 Tokens → Geschwister verworfen → beide
`related: []`. Fix: zusätzlich Body-Embedding-Cosine als Kandidaten-Signal (additiv —
Token-Treffer bleiben unverändert, kein Regressionsrisiko). Schwelle 0,85 empirisch
kalibriert (verwandt 0,97–0,99, fremd 0,73–0,76).
"""
from __future__ import annotations

from generative.agents.cross_reference import _rank_sibling_candidates, _tokens


def _d(title, body="b", aliases=None):
    """Build a minimal low-confidence ``AtomicNoteDraft`` for the ranking tests."""
    from generative.schemas.atomic_note import AtomicNoteDraft

    fields = dict(
        title=title,
        body=body,
        source_anchors=[],
        related=[],
        tags=[],
        synthesis_confidence="low",
        aliases=aliases or [],
    )
    return AtomicNoteDraft(**fields)


def _q(draft):
    """Return the query token set built from the draft's title and its aliases."""
    collected = _tokens(draft.title)
    # map() is lazy, so aliases are tokenised one by one in their listed order.
    for alias_tokens in map(_tokens, draft.aliases):
        collected |= alias_tokens
    return collected


def test_token_overlap_sibling_included_without_embedding():
    """A sibling sharing a title token is a candidate and triggers no cosine call."""
    current = _d("ISP Stage Collection")
    pool = {"ISP Stage Exploration": _d("ISP Stage Exploration")}
    seen = []

    def recording_cosine(sibling):
        # Records every invocation so the test can prove none happened.
        seen.append(sibling)
        return 0.0

    ranked = _rank_sibling_candidates(
        current, pool, _q(current), recording_cosine, threshold=0.85)
    titles = [title for title, _ in ranked]
    assert titles == ["ISP Stage Exploration"]
    assert seen == []  # lexical hit → embedding function never consulted


def test_lexically_disjoint_but_semantic_included():
    """No shared title tokens, but a cosine of 0.97 clears the 0.85 threshold."""
    sibling_title = "Semantisches Retrieval mit Assoziationsnetz"
    current = _d("Wissensorganisation")
    pool = {sibling_title: _d(sibling_title)}

    def high_cosine(_sibling):
        return 0.97

    ranked = _rank_sibling_candidates(
        current, pool, _q(current), high_cosine, threshold=0.85)
    assert [title for title, _ in ranked] == [
        "Semantisches Retrieval mit Assoziationsnetz"]


def test_lexically_disjoint_below_threshold_excluded():
    """No shared title tokens and a cosine of 0.76 (below 0.85) yields no candidate."""
    current = _d("Wissensorganisation")
    pool = {"ADKAR Ability": _d("ADKAR Ability")}

    def low_cosine(_sibling):
        return 0.76

    ranked = _rank_sibling_candidates(
        current, pool, _q(current), low_cosine, threshold=0.85)
    assert ranked == []


def test_self_excluded():
    """A sibling entry carrying the draft's own title is never returned."""
    current = _d("Wissensorganisation")
    pool = {"Wissensorganisation": current}

    def near_perfect_cosine(_sibling):
        return 0.99

    ranked = _rank_sibling_candidates(
        current, pool, _q(current), near_perfect_cosine, threshold=0.85)
    assert ranked == []


def test_lexical_ranks_above_semantic():
    """A token-overlap hit must be ranked ahead of an embedding-only hit."""
    current = _d("ISP Stage Collection")
    pool = {}
    # Shares title words with the draft → lexical candidate.
    pool["ISP Stage Exploration"] = _d("ISP Stage Exploration")
    # Shares no title words → can only qualify through the cosine signal.
    pool["Affektives Paradigma der Suche"] = _d("Affektives Paradigma der Suche")

    ranked = _rank_sibling_candidates(
        current, pool, _q(current), lambda s: 0.95, threshold=0.85)
    ranked_titles = [title for title, _ in ranked]
    assert ranked[0][0] == "ISP Stage Exploration"
    assert "Affektives Paradigma der Suche" in ranked_titles


def test_empty_siblings_returns_empty():
    """Both ``None`` and an empty mapping produce an empty candidate list."""
    current = _d("X")
    for no_siblings in (None, {}):
        ranked = _rank_sibling_candidates(
            current, no_siblings, _q(current), lambda s: 0.99)
        assert ranked == []


def test_caps_at_five():
    """Eight qualifying siblings are truncated to the top five."""
    current = _d("Topic")
    pool = {}
    for i in range(8):
        # Every sibling is scored by the constant cosine below (0.9 ≥ 0.85).
        pool[f"Sib {i}"] = _d(f"Sib {i}")

    ranked = _rank_sibling_candidates(
        current, pool, _q(current), lambda s: 0.9, threshold=0.85)
    assert len(ranked) == 5
Loading