Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 31 additions & 1 deletion .env.example
Original file line number Diff line number Diff line change
@@ -1 +1,31 @@
GROQ_API_KEY=your_groq_api_key_here
# ── MemoryLens environment variables ─────────────────────────────────────────
# Copy this file to .env and fill in the keys you want to use.
# At least ONE provider is needed for --llm / LLM eval mode.
# All providers are optional — without any key the benchmark runs in
# content-only mode (fast, free, no API needed).

# ── Provider API keys (pick any one or more) ─────────────────────────────────

# Groq — free tier, very fast (recommended for quick experiments)
# https://console.groq.com/keys
GROQ_API_KEY=

# OpenAI — gpt-4o-mini by default
# https://platform.openai.com/api-keys
OPENAI_API_KEY=

# Anthropic — claude-haiku-4-5 by default
# https://console.anthropic.com/settings/keys
ANTHROPIC_API_KEY=

# OpenRouter — 200+ models via one key, has a free tier
# https://openrouter.ai/settings/keys
OPENROUTER_API_KEY=

# Ollama — local LLMs, no key needed, just start the server
# Default: http://localhost:11434. To override, uncomment and edit the line below:
# OLLAMA_HOST=http://localhost:11434

# ── Auto-detect override ──────────────────────────────────────────────────────
# Force a specific provider instead of auto-detecting:
# MEMORYLENS_PROVIDER=groq
25 changes: 24 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,30 @@ Format follows [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

## [Unreleased]

### Added
### Added — Multi-Provider Real LLM Evaluation (`feat/multi-provider-llm-eval`)
- `utils/providers.py` — unified LLM abstraction layer supporting **5 providers**:
- **Groq** (`GROQ_API_KEY`) — free tier, llama-3.1-8b-instant
- **OpenAI** (`OPENAI_API_KEY`) — gpt-4o-mini
- **Anthropic** (`ANTHROPIC_API_KEY`) — claude-haiku-4-5
- **OpenRouter** (`OPENROUTER_API_KEY`) — 200+ models via one key
- **Ollama** (local, no key) — any locally running model
- Auto-detection priority: Groq → OpenAI → Anthropic → OpenRouter → Ollama
- **Two-stage LLM evaluation pipeline** in `evaluation/metrics.py`:
- `llm_recall_at_t()` — LLM answers the query; a judge LLM call verifies correctness
- `llm_temporal_drift()` — checks if LLM returns old vs new value after a fact update
- **CLI flags** in `main.py`:
- `--llm` — enable real LLM evaluation pass
- `--provider <name>` — force a specific provider
- `--list-providers` — print availability of all providers and exit
- **Three-table CLI output**: Content Recall, LLM Recall, and Gap (Content − LLM)
- **Dashboard updates**:
- Provider selector in sidebar (auto-detects available providers)
- Tabbed recall chart: Content Recall / LLM Recall / Gap
- KPI cards show LLM Recall with gap delta when available
- `summary` backend added to backend multiselect
- Updated `.env.example` with all five provider keys and inline documentation

### Added — SummaryMemory Backend
- `memory/summary.py`: SummaryMemory backend — rolling compression memory with dual-mode support:
- **LLM mode** (when `GROQ_API_KEY` is set): Groq-powered abstractive summarisation
- **Extractive fallback** (zero API cost): regex-based fact-pattern extraction
Expand Down
213 changes: 188 additions & 25 deletions dashboard.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,40 @@
</style>
""", unsafe_allow_html=True)

COLORS = {"naive": "#f38ba8", "rag": "#89b4fa", "cascading": "#a6e3a1"}
COLORS = {
"naive": "#f38ba8",
"rag": "#89b4fa",
"cascading": "#a6e3a1",
"summary": "#fab387",
}
MONTHLY_QUERIES = 100_000
COST_PER_TOKEN_INR = 83 / 1_000_000 # ~$1 per 1M tokens * 83 INR/USD

# Maps each supported provider name to the environment variable that holds
# its API key. A value of None marks a provider that needs no key.
_PROVIDER_KEYS = {
    "groq": "GROQ_API_KEY",
    "openai": "OPENAI_API_KEY",
    "anthropic": "ANTHROPIC_API_KEY",
    "openrouter": "OPENROUTER_API_KEY",
    "ollama": None,  # no key needed — just a running server
}

# Address probed for Ollama when OLLAMA_HOST is unset or blank.
_DEFAULT_OLLAMA_HOST = "http://localhost:11434"


def _ollama_is_reachable(timeout: float = 1.0) -> bool:
    """Return True if an HTTP GET of ``<host>/api/tags`` succeeds.

    The host is read from the ``OLLAMA_HOST`` environment variable (the
    override documented in ``.env.example``) and falls back to
    ``http://localhost:11434``. A value without a scheme, such as
    ``localhost:11434``, is treated as plain HTTP.

    Args:
        timeout: Seconds to wait for the server before giving up. Kept short
            because this runs while the sidebar is being built.

    Returns:
        True when the request completes without error, False on any
        connection, protocol, or URL problem. Never raises for those cases.
    """
    # Imported locally so the probe's dependencies stay with the probe.
    import http.client
    import urllib.request

    host = (os.getenv("OLLAMA_HOST") or "").strip() or _DEFAULT_OLLAMA_HOST
    if "://" not in host:
        host = f"http://{host}"
    url = f"{host.rstrip('/')}/api/tags"

    try:
        # The context manager closes the response so no socket is leaked.
        with urllib.request.urlopen(url, timeout=timeout):
            return True
    except (OSError, ValueError, http.client.HTTPException):
        # OSError covers URLError/HTTPError, refused connections and timeouts;
        # ValueError covers a malformed URL; HTTPException covers a
        # non-HTTP service answering on the port. All mean "not available".
        return False


def _detect_available_providers() -> List[str]:
    """Return provider names whose credentials are currently present.

    A key-based provider counts as available when its environment variable
    is set to a non-empty value. The key-less provider (Ollama) counts as
    available when its local server answers a quick ping.

    Returns:
        Provider names in the iteration order of ``_PROVIDER_KEYS``.
    """
    available = []
    for name, env_var in _PROVIDER_KEYS.items():
        if env_var is None:
            # Ollama: no key to check, so probe the server instead.
            if _ollama_is_reachable():
                available.append(name)
        elif os.getenv(env_var):
            available.append(name)
    return available


# ─── Sidebar ────────────────────────────────────────────────────────────────
with st.sidebar:
Expand All @@ -46,11 +76,37 @@
)
backends = st.multiselect(
"Memory backends",
["naive", "rag", "cascading"],
["naive", "rag", "cascading", "summary"],
default=["naive", "rag", "cascading"],
)
st.divider()

# ── LLM Provider ──────────────────────────────────────────────────────
st.subheader("LLM Evaluation (optional)")
available_providers = _detect_available_providers()
provider_options = ["None (content-only)"] + available_providers
selected_provider_label = st.selectbox(
"Provider",
provider_options,
help=(
"Run a real answer+judge pass on top of content-based metrics. "
"Set the matching API key in your .env file to unlock a provider."
),
)
selected_provider = (
None if selected_provider_label == "None (content-only)"
else selected_provider_label
)

if not available_providers:
st.caption(
"No provider detected. Add an API key to `.env`:\n"
"`GROQ_API_KEY`, `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, "
"`OPENROUTER_API_KEY`, or start Ollama."
)

st.divider()

col_run, col_demo = st.columns(2)
run_btn = col_run.button("▶ Run Live", type="primary", use_container_width=True)
demo_btn = col_demo.button("📊 Demo", use_container_width=True)
Expand All @@ -71,19 +127,33 @@ def load_demo() -> Dict:

def render_results(data: Dict, is_demo: bool = False) -> None:
cps: List[int] = data["checkpoints"]
present = [b for b in ["naive", "rag", "cascading"] if b in data]
present = [b for b in ["naive", "rag", "cascading", "summary"] if b in data]
has_llm = data.get("has_llm_eval", False)

if is_demo:
st.info("📊 Showing pre-computed demo results. Set GROQ_API_KEY and click ▶ Run Live for real evaluation.", icon="ℹ️")
st.info(
"📊 Showing pre-computed demo results. "
"Set an API key in `.env` and click **▶ Run Live** for real LLM evaluation.",
icon="ℹ️",
)

# ── KPI cards ──────────────────────────────────────────────────────────
st.subheader("Summary at Final Checkpoint")
cols = st.columns(len(present))
for col, name in zip(cols, present):
d = data[name]
llm_val = d.get("llm_recall", [None])[-1]
with col:
st.markdown(f"#### {name.capitalize()}")
st.metric("Recall@Final", f"{d['recall'][-1]*100:.1f}%")
if llm_val is not None:
gap = (d["recall"][-1] - llm_val) * 100
st.metric(
"LLM Recall@Final",
f"{llm_val*100:.1f}%",
delta=f"{gap:+.1f}pp gap",
delta_color="inverse",
)
st.metric("Avg Tokens", f"{d['tokens'][-1]:,}")
st.metric("Temporal Drift", f"{d['drift'][-1]*100:.1f}%")
st.metric("Precision@K", f"{d['precision'][-1]*100:.1f}%")
Expand All @@ -92,19 +162,96 @@ def render_results(data: Dict, is_demo: bool = False) -> None:

# ── Recall decay ───────────────────────────────────────────────────────
st.subheader("Memory Recall Decay Over Time")
fig = go.Figure()
for name in present:
fig.add_trace(go.Scatter(
x=cps, y=[v * 100 for v in data[name]["recall"]],
name=name.capitalize(), mode="lines+markers",
line=dict(color=COLORS[name], width=3), marker=dict(size=9),
))
fig.update_layout(
xaxis_title="Conversation Turn", yaxis_title="Recall (%)",
yaxis=dict(range=[0, 105]), template="plotly_dark",
height=360, legend=dict(orientation="h", y=1.12),
)
st.plotly_chart(fig, use_container_width=True)

if has_llm:
tab_content, tab_llm, tab_gap = st.tabs(
["Content Recall", "LLM Recall", "Gap (Content − LLM)"]
)
else:
(tab_content,) = st.tabs(["Content Recall"])
tab_llm = tab_gap = None

with tab_content:
fig = go.Figure()
for name in present:
color = COLORS.get(name, "#cdd6f4")
fig.add_trace(go.Scatter(
x=cps, y=[v * 100 for v in data[name]["recall"]],
name=name.capitalize(), mode="lines+markers",
line=dict(color=color, width=3), marker=dict(size=9),
))
fig.update_layout(
xaxis_title="Conversation Turn", yaxis_title="Recall (%)",
yaxis=dict(range=[0, 105]), template="plotly_dark",
height=360, legend=dict(orientation="h", y=1.12),
)
st.plotly_chart(fig, use_container_width=True)
st.caption(
"Content Recall: substring match on retrieved context chunks — "
"fast, reproducible, zero API cost."
)

if has_llm and tab_llm is not None:
with tab_llm:
fig_llm = go.Figure()
for name in present:
color = COLORS.get(name, "#cdd6f4")
llm_vals = data[name].get("llm_recall", [])
if any(v is not None for v in llm_vals):
fig_llm.add_trace(go.Scatter(
x=cps,
y=[v * 100 if v is not None else None for v in llm_vals],
name=name.capitalize(), mode="lines+markers",
line=dict(color=color, width=3, dash="dash"),
marker=dict(size=9, symbol="diamond"),
))
fig_llm.update_layout(
xaxis_title="Conversation Turn", yaxis_title="LLM Recall (%)",
yaxis=dict(range=[0, 105]), template="plotly_dark",
height=360, legend=dict(orientation="h", y=1.12),
)
st.plotly_chart(fig_llm, use_container_width=True)
provider_used = next(
(data[b].get("provider") for b in present if data[b].get("provider")),
"unknown",
)
st.caption(
f"LLM Recall: two-stage answer+judge pipeline using **{provider_used}**. "
"The LLM actually answers each question; a judge call verifies correctness."
)

if has_llm and tab_gap is not None:
with tab_gap:
fig_gap = go.Figure()
for name in present:
color = COLORS.get(name, "#cdd6f4")
content_vals = data[name]["recall"]
llm_vals = data[name].get("llm_recall", [None] * len(content_vals))
gaps = [
(c - l) * 100 if l is not None else None
for c, l in zip(content_vals, llm_vals)
]
if any(g is not None for g in gaps):
fig_gap.add_trace(go.Bar(
x=[f"T={c}" for c in cps],
y=gaps,
name=name.capitalize(),
marker_color=color,
))
fig_gap.add_hline(y=0, line_dash="dot", line_color="#cdd6f4")
fig_gap.update_layout(
xaxis_title="Checkpoint",
yaxis_title="Content Recall − LLM Recall (pp)",
template="plotly_dark", height=360,
barmode="group",
legend=dict(orientation="h", y=1.12),
)
st.plotly_chart(fig_gap, use_container_width=True)
st.caption(
"Positive gap means content recall *overestimates* true answer quality. "
"A large gap signals the backend retrieves the right text but the LLM "
"still fails to extract the correct answer."
)

# ── Drift + Noise ───────────────────────────────────────────────────────
c1, c2 = st.columns(2)
Expand Down Expand Up @@ -254,9 +401,7 @@ def _latex_table(data: Dict, checkpoints: List[int], present: List[str]) -> str:
st.rerun()

if run_btn:
if not os.getenv("GROQ_API_KEY"):
st.error("GROQ_API_KEY not found. Add it to a `.env` file in the project root.")
elif not checkpoints:
if not checkpoints:
st.warning("Select at least one checkpoint.")
else:
log_area = st.empty()
Expand All @@ -269,17 +414,34 @@ def push_log(msg: str) -> None:
with st.spinner("Running benchmark…"):
from evaluation.benchmark import run_benchmark, results_to_display_dict
from evaluation.logger import log_run

# Resolve provider (None = content-only)
provider_obj = None
if selected_provider:
try:
from utils.providers import get_provider
provider_obj = get_provider(selected_provider)
push_log(f"LLM provider: {provider_obj.name}")
except Exception as e:
st.error(f"Provider error: {e}")
st.stop()

raw = run_benchmark(
total_turns=total_turns,
eval_checkpoints=sorted(checkpoints),
backends=backends,
provider=provider_obj,
progress=push_log,
)
display = results_to_display_dict(raw)
st.session_state.results = display
st.session_state.is_demo = False
saved = log_run(display, {"total_turns": total_turns, "backends": backends})
push_log(f"Results saved → {saved}")
saved = log_run(display, {
"total_turns": total_turns,
"backends": backends,
"provider": provider_obj.name if provider_obj else None,
})
push_log(f"Results saved -> {saved}")

log_area.empty()
st.rerun()
Expand All @@ -297,11 +459,12 @@ def push_log(msg: str) -> None:
| Layer | What It Does |
|-------|--------------|
| **Memory Injection** | Injects personal facts at T=0 and queries them at T=10, 25, 50, 100 |
| **3 Backends** | Naive (full history), RAG (vector retrieval), Cascading Temporal (tiered decay) |
| **4 Backends** | Naive · RAG · Cascading Temporal · SummaryMemory |
| **5 Metrics** | Recall@T · Precision@K · Temporal Drift · Memory Noise Ratio · Token Cost |
| **Dashboard** | Decay curves, cost impact, LaTeX-ready research tables |
| **LLM Eval** | Two-stage answer+judge pipeline — 5 providers (Groq, OpenAI, Anthropic, OpenRouter, Ollama) |
| **Dashboard** | Decay curves, content vs LLM recall gap, cost impact, LaTeX export |

**Click 📊 Demo** in the sidebar for instant results, or set `GROQ_API_KEY` and click **▶ Run Live**.
**Click 📊 Demo** in the sidebar for instant results, or configure a provider and click **▶ Run Live**.
""")

st.markdown("---")
Expand Down
Loading
Loading