Neal006 · Neal006 · May 22, 2026 · May 22, 2026
diff --git a/.env.example b/.env.example
@@ -1 +1,31 @@
-GROQ_API_KEY=your_groq_api_key_here
+# ── MemoryLens environment variables ─────────────────────────────────────────
+# Copy this file to .env and fill in the keys you want to use.
+# A provider is only required for --llm / LLM eval mode; one is enough.
+# Without any provider the benchmark still runs in content-only mode
+# (fast, free, no API needed).
+
+# ── Provider API keys (pick any one or more) ─────────────────────────────────
+
+# Groq  — free tier, very fast (recommended for quick experiments)
+# https://console.groq.com/keys
+GROQ_API_KEY=
+
+# OpenAI  — gpt-4o-mini by default
+# https://platform.openai.com/api-keys
+OPENAI_API_KEY=
+
+# Anthropic  — claude-haiku-4-5 by default
+# https://console.anthropic.com/settings/keys
+ANTHROPIC_API_KEY=
+
+# OpenRouter  — 200+ models via one key, has a free tier
+# https://openrouter.ai/settings/keys
+OPENROUTER_API_KEY=
+
+# Ollama  — local LLMs, no key needed, just start the server
+# Default: http://localhost:11434. To override, uncomment and edit the line below:
+# OLLAMA_HOST=http://localhost:11434
+
+# ── Auto-detect override ──────────────────────────────────────────────────────
+# Force a specific provider instead of auto-detecting:
+# MEMORYLENS_PROVIDER=groq
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,7 +7,30 @@ Format follows [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
 ## [Unreleased]
 
-### Added
+### Added — Multi-Provider Real LLM Evaluation (`feat/multi-provider-llm-eval`)
+- `utils/providers.py` — unified LLM abstraction layer supporting **5 providers**:
+  - **Groq** (`GROQ_API_KEY`) — free tier, llama-3.1-8b-instant
+  - **OpenAI** (`OPENAI_API_KEY`) — gpt-4o-mini
+  - **Anthropic** (`ANTHROPIC_API_KEY`) — claude-haiku-4-5
+  - **OpenRouter** (`OPENROUTER_API_KEY`) — 200+ models via one key
+  - **Ollama** (local, no key) — any locally running model
+  - Auto-detection priority: Groq → OpenAI → Anthropic → OpenRouter → Ollama
+- **Two-stage LLM evaluation pipeline** in `evaluation/metrics.py`:
+  - `llm_recall_at_t()` — LLM answers the query; a judge LLM call verifies correctness
+  - `llm_temporal_drift()` — checks if LLM returns old vs new value after a fact update
+- **CLI flags** in `main.py`:
+  - `--llm` — enable real LLM evaluation pass
+  - `--provider <name>` — force a specific provider
+  - `--list-providers` — print availability of all providers and exit
+- **Three-table CLI output**: Content Recall, LLM Recall, and Gap (Content − LLM)
+- **Dashboard updates**:
+  - Provider selector in sidebar (auto-detects available providers)
+  - Tabbed recall chart: Content Recall / LLM Recall / Gap
+  - KPI cards show LLM Recall with gap delta when available
+  - `summary` backend added to backend multiselect
+- Updated `.env.example` with all five provider keys and inline documentation
+
+### Added — SummaryMemory Backend
 - `memory/summary.py`: SummaryMemory backend — rolling compression memory with dual-mode support:
   - **LLM mode** (when `GROQ_API_KEY` is set): Groq-powered abstractive summarisation
   - **Extractive fallback** (zero API cost): regex-based fact-pattern extraction

diff --git a/dashboard.py b/dashboard.py
@@ -24,10 +24,40 @@
 </style>
 """, unsafe_allow_html=True)
 
-COLORS = {"naive": "#f38ba8", "rag": "#89b4fa", "cascading": "#a6e3a1"}
+COLORS = {
+    "naive":     "#f38ba8",
+    "rag":       "#89b4fa",
+    "cascading": "#a6e3a1",
+    "summary":   "#fab387",
+}
 MONTHLY_QUERIES = 100_000
 COST_PER_TOKEN_INR = 83 / 1_000_000  # ~$1 per 1M tokens * 83 INR/USD
 
+_PROVIDER_KEYS = {
+    "groq":       "GROQ_API_KEY",
+    "openai":     "OPENAI_API_KEY",
+    "anthropic":  "ANTHROPIC_API_KEY",
+    "openrouter": "OPENROUTER_API_KEY",
+    "ollama":     None,  # no key needed — just a running server
+}
+
+
+def _detect_available_providers() -> List[str]:
+    """Return providers whose API key is set, plus Ollama if its server answers a ping."""
+    import urllib.request
+    # Honour the documented OLLAMA_HOST override; fall back to the local default.
+    host = (os.getenv("OLLAMA_HOST") or "http://localhost:11434").rstrip("/")
+    available = []
+    for name, env_var in _PROVIDER_KEYS.items():
+        if env_var is None:
+            try:  # key-less provider (Ollama): best-effort ping, failure means unavailable
+                with urllib.request.urlopen(f"{host}/api/tags", timeout=1):
+                    available.append(name)  # `with` closes the response socket
+            except Exception: pass
+        elif os.getenv(env_var):
+            available.append(name)
+    return available
+
 
 # ─── Sidebar ────────────────────────────────────────────────────────────────
 with st.sidebar:
@@ -46,11 +76,37 @@
     )
     backends = st.multiselect(
         "Memory backends",
-        ["naive", "rag", "cascading"],
+        ["naive", "rag", "cascading", "summary"],
         default=["naive", "rag", "cascading"],
     )
     st.divider()
 
+    # ── LLM Provider ──────────────────────────────────────────────────────
+    st.subheader("LLM Evaluation (optional)")
+    available_providers = _detect_available_providers()
+    provider_options = ["None (content-only)"] + available_providers
+    selected_provider_label = st.selectbox(
+        "Provider",
+        provider_options,
+        help=(
+            "Run a real answer+judge pass on top of content-based metrics. "
+            "Set the matching API key in your .env file to unlock a provider."
+        ),
+    )
+    selected_provider = (
+        None if selected_provider_label == "None (content-only)"
+        else selected_provider_label
+    )
+
+    if not available_providers:
+        st.caption(
+            "No provider detected. Add an API key to `.env`:\n"
+            "`GROQ_API_KEY`, `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, "
+            "`OPENROUTER_API_KEY`, or start Ollama."
+        )
+
+    st.divider()
+
     col_run, col_demo = st.columns(2)
     run_btn  = col_run.button("▶ Run Live",  type="primary",   use_container_width=True)
     demo_btn = col_demo.button("📊 Demo",    use_container_width=True)
@@ -71,19 +127,33 @@ def load_demo() -> Dict:
 
 def render_results(data: Dict, is_demo: bool = False) -> None:
     cps: List[int] = data["checkpoints"]
-    present = [b for b in ["naive", "rag", "cascading"] if b in data]
+    present = [b for b in ["naive", "rag", "cascading", "summary"] if b in data]
+    has_llm  = data.get("has_llm_eval", False)
 
     if is_demo:
-        st.info("📊 Showing pre-computed demo results. Set GROQ_API_KEY and click ▶ Run Live for real evaluation.", icon="ℹ️")
+        st.info(
+            "📊 Showing pre-computed demo results. "
+            "Set an API key in `.env` and click **▶ Run Live** for real LLM evaluation.",
+            icon="ℹ️",
+        )
 
     # ── KPI cards ──────────────────────────────────────────────────────────
     st.subheader("Summary at Final Checkpoint")
     cols = st.columns(len(present))
     for col, name in zip(cols, present):
         d = data[name]
+        llm_val = d.get("llm_recall", [None])[-1]
         with col:
             st.markdown(f"#### {name.capitalize()}")
             st.metric("Recall@Final",    f"{d['recall'][-1]*100:.1f}%")
+            if llm_val is not None:
+                gap = (d["recall"][-1] - llm_val) * 100
+                st.metric(
+                    "LLM Recall@Final",
+                    f"{llm_val*100:.1f}%",
+                    delta=f"{gap:+.1f}pp gap",
+                    delta_color="inverse",
+                )
             st.metric("Avg Tokens",      f"{d['tokens'][-1]:,}")
             st.metric("Temporal Drift",  f"{d['drift'][-1]*100:.1f}%")
             st.metric("Precision@K",     f"{d['precision'][-1]*100:.1f}%")
@@ -92,19 +162,96 @@ def render_results(data: Dict, is_demo: bool = False) -> None:
 
     # ── Recall decay ───────────────────────────────────────────────────────
     st.subheader("Memory Recall Decay Over Time")
-    fig = go.Figure()
-    for name in present:
-        fig.add_trace(go.Scatter(
-            x=cps, y=[v * 100 for v in data[name]["recall"]],
-            name=name.capitalize(), mode="lines+markers",
-            line=dict(color=COLORS[name], width=3), marker=dict(size=9),
-        ))
-    fig.update_layout(
-        xaxis_title="Conversation Turn", yaxis_title="Recall (%)",
-        yaxis=dict(range=[0, 105]), template="plotly_dark",
-        height=360, legend=dict(orientation="h", y=1.12),
-    )
-    st.plotly_chart(fig, use_container_width=True)
+
+    if has_llm:
+        tab_content, tab_llm, tab_gap = st.tabs(
+            ["Content Recall", "LLM Recall", "Gap (Content − LLM)"]
+        )
+    else:
+        (tab_content,) = st.tabs(["Content Recall"])
+        tab_llm = tab_gap = None
+
+    with tab_content:
+        fig = go.Figure()
+        for name in present:
+            color = COLORS.get(name, "#cdd6f4")
+            fig.add_trace(go.Scatter(
+                x=cps, y=[v * 100 for v in data[name]["recall"]],
+                name=name.capitalize(), mode="lines+markers",
+                line=dict(color=color, width=3), marker=dict(size=9),
+            ))
+        fig.update_layout(
+            xaxis_title="Conversation Turn", yaxis_title="Recall (%)",
+            yaxis=dict(range=[0, 105]), template="plotly_dark",
+            height=360, legend=dict(orientation="h", y=1.12),
+        )
+        st.plotly_chart(fig, use_container_width=True)
+        st.caption(
+            "Content Recall: substring match on retrieved context chunks — "
+            "fast, reproducible, zero API cost."
+        )
+
+    if has_llm and tab_llm is not None:
+        with tab_llm:
+            fig_llm = go.Figure()
+            for name in present:
+                color = COLORS.get(name, "#cdd6f4")
+                llm_vals = data[name].get("llm_recall", [])
+                if any(v is not None for v in llm_vals):
+                    fig_llm.add_trace(go.Scatter(
+                        x=cps,
+                        y=[v * 100 if v is not None else None for v in llm_vals],
+                        name=name.capitalize(), mode="lines+markers",
+                        line=dict(color=color, width=3, dash="dash"),
+                        marker=dict(size=9, symbol="diamond"),
+                    ))
+            fig_llm.update_layout(
+                xaxis_title="Conversation Turn", yaxis_title="LLM Recall (%)",
+                yaxis=dict(range=[0, 105]), template="plotly_dark",
+                height=360, legend=dict(orientation="h", y=1.12),
+            )
+            st.plotly_chart(fig_llm, use_container_width=True)
+            provider_used = next(
+                (data[b].get("provider") for b in present if data[b].get("provider")),
+                "unknown",
+            )
+            st.caption(
+                f"LLM Recall: two-stage answer+judge pipeline using **{provider_used}**. "
+                "The LLM actually answers each question; a judge call verifies correctness."
+            )
+
+    if has_llm and tab_gap is not None:
+        with tab_gap:
+            fig_gap = go.Figure()
+            for name in present:
+                color = COLORS.get(name, "#cdd6f4")
+                content_vals = data[name]["recall"]
+                llm_vals = data[name].get("llm_recall", [None] * len(content_vals))
+                gaps = [
+                    (c - l) * 100 if l is not None else None
+                    for c, l in zip(content_vals, llm_vals)
+                ]
+                if any(g is not None for g in gaps):
+                    fig_gap.add_trace(go.Bar(
+                        x=[f"T={c}" for c in cps],
+                        y=gaps,
+                        name=name.capitalize(),
+                        marker_color=color,
+                    ))
+            fig_gap.add_hline(y=0, line_dash="dot", line_color="#cdd6f4")
+            fig_gap.update_layout(
+                xaxis_title="Checkpoint",
+                yaxis_title="Content Recall − LLM Recall (pp)",
+                template="plotly_dark", height=360,
+                barmode="group",
+                legend=dict(orientation="h", y=1.12),
+            )
+            st.plotly_chart(fig_gap, use_container_width=True)
+            st.caption(
+                "Positive gap means content recall *overestimates* true answer quality. "
+                "A large gap signals the backend retrieves the right text but the LLM "
+                "still fails to extract the correct answer."
+            )
 
     # ── Drift + Noise ───────────────────────────────────────────────────────
     c1, c2 = st.columns(2)
@@ -254,9 +401,7 @@ def _latex_table(data: Dict, checkpoints: List[int], present: List[str]) -> str:
     st.rerun()
 
 if run_btn:
-    if not os.getenv("GROQ_API_KEY"):
-        st.error("GROQ_API_KEY not found. Add it to a `.env` file in the project root.")
-    elif not checkpoints:
+    if not checkpoints:
         st.warning("Select at least one checkpoint.")
     else:
         log_area = st.empty()
@@ -269,17 +414,34 @@ def push_log(msg: str) -> None:
         with st.spinner("Running benchmark…"):
             from evaluation.benchmark import run_benchmark, results_to_display_dict
             from evaluation.logger import log_run
+
+            # Resolve provider (None = content-only)
+            provider_obj = None
+            if selected_provider:
+                try:
+                    from utils.providers import get_provider
+                    provider_obj = get_provider(selected_provider)
+                    push_log(f"LLM provider: {provider_obj.name}")
+                except Exception as e:
+                    st.error(f"Provider error: {e}")
+                    st.stop()
+
             raw = run_benchmark(
                 total_turns=total_turns,
                 eval_checkpoints=sorted(checkpoints),
                 backends=backends,
+                provider=provider_obj,
                 progress=push_log,
             )
             display = results_to_display_dict(raw)
             st.session_state.results = display
             st.session_state.is_demo = False
-            saved = log_run(display, {"total_turns": total_turns, "backends": backends})
-            push_log(f"Results saved → {saved}")
+            saved = log_run(display, {
+                "total_turns": total_turns,
+                "backends":    backends,
+                "provider":    provider_obj.name if provider_obj else None,
+            })
+            push_log(f"Results saved -> {saved}")
 
         log_area.empty()
         st.rerun()
@@ -297,11 +459,12 @@ def push_log(msg: str) -> None:
 | Layer | What It Does |
 |-------|--------------|
 | **Memory Injection** | Injects personal facts at T=0 and queries them at T=10, 25, 50, 100 |
-| **3 Backends** | Naive (full history), RAG (vector retrieval), Cascading Temporal (tiered decay) |
+| **4 Backends** | Naive · RAG · Cascading Temporal · SummaryMemory |
 | **5 Metrics** | Recall@T · Precision@K · Temporal Drift · Memory Noise Ratio · Token Cost |
-| **Dashboard** | Decay curves, cost impact, LaTeX-ready research tables |
+| **LLM Eval** | Two-stage answer+judge pipeline — 5 providers (Groq, OpenAI, Anthropic, OpenRouter, Ollama) |
+| **Dashboard** | Decay curves, content vs LLM recall gap, cost impact, LaTeX export |
 
-**Click 📊 Demo** in the sidebar for instant results, or set `GROQ_API_KEY` and click **▶ Run Live**.
+**Click 📊 Demo** in the sidebar for instant results, or configure a provider and click **▶ Run Live**.
 """)
 
     st.markdown("---")