diff --git a/.DS_Store b/.DS_Store deleted file mode 100644 index b92e425..0000000 Binary files a/.DS_Store and /dev/null differ diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..7bc8663 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,20 @@ +# https://editorconfig.org +root = true + +[*] +charset = utf-8 +end_of_line = lf +insert_final_newline = true +trim_trailing_whitespace = true +indent_style = space +indent_size = 4 + +[*.{md,yml,yaml,json,toml}] +indent_size = 2 + +[*.ipynb] +trim_trailing_whitespace = false +insert_final_newline = false + +[Makefile] +indent_style = tab diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..fff872b --- /dev/null +++ b/.env.example @@ -0,0 +1,16 @@ +# Copy to .env and fill in. Never commit .env (it is in .gitignore). +# On Google Colab, set these via the Secrets panel (key icon) instead. + +# ── ArangoDB (GraphRAG only) ────────────────────────────────────────────────── +# Local (default): use the bundled docker-compose — `docker compose up -d`, +# then ARANGO_HOST=http://localhost:8529 and ARANGO_PASS=devpassword. +# Cloud (ArangoDB Oasis): point ARANGO_HOST at your deployment endpoint, e.g. +# https://<your-deployment-id>.arangodb.cloud:8529 +ARANGO_HOST=http://localhost:8529 +ARANGO_USER=root +ARANGO_PASS= +ARANGO_DB=pubmed_graph + +# ── Ollama (LLM) ────────────────────────────────────────────────────────────── +OLLAMA_API=http://localhost:11434/api/chat +LLM_MODEL=deepseek-r1:8b diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..04e2728 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,18 @@ +# Normalize line endings: LF in the repository and on checkout, everywhere. +* text=auto eol=lf + +# Must be LF to run on Unix (Makefile is also tab-sensitive). +Makefile text eol=lf +*.sh text eol=lf + +# Binary assets — no EOL conversion, no diff noise. 
+*.png binary +*.jpg binary +*.pdf binary +*.pptx binary +*.pkl binary +*.bin binary + +# Thin Colab wrappers are documentation, not core source — keep them out of the +# language breakdown so the repo reads as the Python project it is. +*.ipynb linguist-documentation diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 0000000..daa70c3 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,34 @@ +--- +name: Bug report +about: Report something that isn't working as expected +title: "[Bug] " +labels: bug +assignees: "" +--- + +**Describe the bug** +A clear and concise description of what the bug is. + +**To reproduce** +Steps or the exact command, e.g.: +```bash +python scripts/run_benchmark.py --arm graph --n 200 +``` + +**Expected behavior** +What you expected to happen. + +**Logs / traceback** +``` +paste the error here +``` + +**Environment** +- OS: +- Python version: +- Running where: [local / Colab] +- GPU (if any): +- Arango reachable / Ollama running: [yes/no] + +**Additional context** +Anything else that might help. diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 0000000..a78cf63 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1,5 @@ +blank_issues_enabled: false +contact_links: + - name: Question / discussion + url: https://github.com/vardhjain/Knowledge_Graph_Question_Answering/discussions + about: Ask a question or discuss the methodology, results, or design. diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 0000000..7392076 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,24 @@ +--- +name: Feature request +about: Suggest an idea or improvement +title: "[Feature] " +labels: enhancement +assignees: "" +--- + +**What problem does this solve?** +A clear description of the motivation or gap. 
+ +**Proposed solution** +What you'd like to happen. + +**Fairness check (for retrieval/eval changes)** +This project is a *fair* ablation. If your idea touches retrieval or evaluation, +note how it keeps the arms comparable (shared corpus/embedder/reranker/prompt/ +LLM/top-k) and avoids leaking the answer into context. + +**Alternatives considered** +Any other approaches you weighed. + +**Additional context** +Links, papers, or examples. diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 0000000..286fc76 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,29 @@ +## Summary + + + +## Type of change + +- [ ] Bug fix +- [ ] New feature +- [ ] Refactor / cleanup +- [ ] Docs +- [ ] Benchmark / results + +## Checklist + +- [ ] `make test` passes +- [ ] `make lint` passes +- [ ] `CHANGELOG.md` updated under "Unreleased" +- [ ] Docs/README updated if behavior changed + +## Fairness (retrieval/evaluation changes only) + +- [ ] Confounders (embedder, reranker, prompt, LLM, top-k, seed, n) stay in + `config.py` and identical across arms +- [ ] No benchmark question/answer can leak into a retrieved context + (the leakage regression test still passes) + +## Notes + + diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..410f499 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,42 @@ +name: CI + +on: + push: + branches: [main, revamp] + pull_request: + +jobs: + test: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.10", "3.11"] + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + cache: pip + + - name: Install test dependencies + # The heavy ML libraries (torch, sentence-transformers, faiss, arango, + # datasets) are imported lazily, so unit tests need only this light set. 
+ run: | + python -m pip install --upgrade pip + python -m pip install numpy scikit-learn scipy requests pytest pytest-cov ruff + + - name: Lint (ruff) + run: ruff check src scripts tests app + + - name: Test (pytest) + run: pytest --cov=kgqa --cov-report=xml --cov-report=term-missing + + - name: Upload coverage to Codecov + if: matrix.python-version == '3.11' + uses: codecov/codecov-action@v5 + with: + token: ${{ secrets.CODECOV_TOKEN }} + files: ./coverage.xml + fail_ci_if_error: false diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..fd4d01a --- /dev/null +++ b/.gitignore @@ -0,0 +1,34 @@ +# ── OS ──────────────────────────────────────────────────────────────────────── +.DS_Store +Thumbs.db + +# ── Python ──────────────────────────────────────────────────────────────────── +__pycache__/ +*.py[cod] +*.egg-info/ +.eggs/ +build/ +dist/ +.venv/ +venv/ +env/ +.ipynb_checkpoints/ + +# ── Secrets ─────────────────────────────────────────────────────────────────── +.env + +# ── Caches & artifacts (regenerated; never committed) ───────────────────────── +pubmed_vectors_cache.pkl +Plain_RAG/pubmed_rag_index.bin +Plain_RAG/pubmed_rag_data.pkl +*.bin +*.pkl + +# ── Results (figures are committed; keep raw JSON if you want — see README) ──── +# results/ is committed intentionally so the README can reference real numbers. + +# ── Tooling ─────────────────────────────────────────────────────────────────── +.pytest_cache/ +.ruff_cache/ +.coverage +htmlcov/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..e10fe55 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,21 @@ +# Run automatically on `git commit` after `pre-commit install`. 
+# See https://pre-commit.com +repos: + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.6.9 + hooks: + - id: ruff + args: [--fix] + - id: ruff-format + + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v5.0.0 + hooks: + - id: end-of-file-fixer + - id: trailing-whitespace + - id: check-yaml + - id: check-toml + - id: check-added-large-files + args: [--maxkb=1024] + - id: check-merge-conflict + - id: detect-private-key diff --git a/.streamlit/config.toml b/.streamlit/config.toml new file mode 100644 index 0000000..3fabf7a --- /dev/null +++ b/.streamlit/config.toml @@ -0,0 +1,11 @@ +# Theme for the Streamlit dashboard (app/dashboard.py). Read by `streamlit run` +# locally and by Streamlit Community Cloud. Only long-stable keys are used so it +# renders correctly on any recent Streamlit version. Palette matches the +# matplotlib figure in results/ablation.png (blue primary). +[theme] +base = "light" +primaryColor = "#2196F3" +backgroundColor = "#FFFFFF" +secondaryBackgroundColor = "#F5F7FA" +textColor = "#1A2027" +font = "sans serif" diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..009261f --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,57 @@ +# Changelog + +All notable changes to this project are documented here. The format follows +[Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project +adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [Unreleased] + +### Added +- **Interactive UIs** in `app/`: a Gradio chat demo (`chat_app.py`) over the + winning `graph` arm that cites source PubMed IDs, and a Streamlit results + dashboard (`dashboard.py`) that visualizes the ablation, McNemar tests, and + per-class breakdown. `requirements-app.txt`, `make chat` / `make dashboard`. +- `BaseRetriever.chat()` — conversational answer plus the retrieved source pubids. +- `scripts/compare.py` now also writes `results/summary.json` (structured metrics + + contrasts) for the dashboard. 
+- One-click **Streamlit Community Cloud** deploy for the dashboard: a light + `app/requirements.txt` (picked up before the heavy root file), a themed + `.streamlit/config.toml`, a richer page config, and a README live-demo badge. + +## [1.0.0] — 2026-06-12 + +The "fair comparison" revamp: turned a confounded notebook demo into a +controlled, reproducible 4-arm ablation with an industry-standard repo layout. + +### Added +- Importable `src/kgqa/` package: `config`, `prompts`, `llm`, `data`, + `evaluation`, `models`, and a `retrieval/` sub-package (`base`, `plain`, `graph`). +- Four retrieval arms isolating each component: + `plain → plain_rr → graph → graph_concepts`. +- A shared `ChunkStore` so every arm searches an identical corpus. +- MeSH concept-hop expansion (`graph_concepts`) — the previously unused + `Concepts`/`MENTIONS` graph is now exercised. +- Seeded random sampling and a paired **McNemar** significance test. +- `scripts/`: `ingest.py` (leakage-free graph build), `run_benchmark.py` + (`--arm`, retry + Ollama auto-restart + checkpointing), `compare.py`. +- Test suite (CPU-only via fakes), GitHub Actions CI, `ruff` + `pre-commit`. +- Docs and meta: README with results, `CONTRIBUTING`, `CODE_OF_CONDUCT`, + `SECURITY`, `CITATION.cff`, issue/PR templates, `Makefile`, architecture diagram. +- Benchmark results (n=200) and ablation figure under `results/`. + +### Fixed +- **Label leakage:** ingestion no longer stores a question-derived `title` or + `final_decision`; graph contexts use generic `=== STUDY n ===` labels, so the + benchmark question/answer can never appear in a retrieved context. +- **Confounded comparison:** the cross-encoder reranker is now its own arm + instead of a hidden advantage for GraphRAG. +- **Inconsistent corpus/chunking** across arms — now identical. +- `NameError` in the graph-expansion fallback path. 
+ +### Changed +- Generation is bounded (`num_predict`) and the model kept resident + (`keep_alive`); `LLM_NUM_CTX` / `LLM_NUM_PREDICT` are environment-tunable. +- Removed the dead `faiss` dependency (PlainRAG uses the shared numpy-cosine store). + +[Unreleased]: https://github.com/vardhjain/Knowledge_Graph_Question_Answering/compare/v1.0.0...HEAD +[1.0.0]: https://github.com/vardhjain/Knowledge_Graph_Question_Answering/releases/tag/v1.0.0 diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 0000000..b5c28ea --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,25 @@ +cff-version: 1.2.0 +title: "Knowledge Graph Question Answering: a fair GraphRAG vs PlainRAG comparison" +message: "If you use this software or its findings, please cite it as below." +type: software +authors: + - given-names: Vardh + family-names: Jain + email: vardhjain20@gmail.com +repository-code: "https://github.com/vardhjain/Knowledge_Graph_Question_Answering" +abstract: >- + A controlled 4-arm ablation (plain, plain_rr, graph, graph_concepts) on + PubMedQA that isolates what a knowledge graph contributes to retrieval-augmented + question answering, holding corpus, chunking, embedder, reranker, prompt, LLM, + and top-k constant. Includes a paired McNemar significance test and a + leakage-free ArangoDB graph schema. 
+keywords: + - graphrag + - retrieval-augmented-generation + - knowledge-graph + - pubmedqa + - arangodb + - ablation-study +license: MIT +version: 1.0.0 +date-released: "2026-06-12" diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..4535721 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,57 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +We as members, contributors, and leaders pledge to make participation in our +community a harassment-free experience for everyone, regardless of age, body +size, visible or invisible disability, ethnicity, sex characteristics, gender +identity and expression, level of experience, education, socio-economic status, +nationality, personal appearance, race, religion, or sexual identity and +orientation. + +We pledge to act and interact in ways that contribute to an open, welcoming, +diverse, inclusive, and healthy community. + +## Our Standards + +Examples of behavior that contributes to a positive environment: + +- Demonstrating empathy and kindness toward other people +- Being respectful of differing opinions, viewpoints, and experiences +- Giving and gracefully accepting constructive feedback +- Accepting responsibility and apologizing to those affected by our mistakes +- Focusing on what is best for the overall community + +Examples of unacceptable behavior: + +- The use of sexualized language or imagery, and sexual attention or advances +- Trolling, insulting or derogatory comments, and personal or political attacks +- Public or private harassment +- Publishing others' private information without explicit permission +- Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Enforcement Responsibilities + +Community leaders are responsible for clarifying and enforcing our standards and +will take appropriate and fair corrective action in response to any behavior +they deem inappropriate, threatening, offensive, or harmful. 
+ +## Scope + +This Code of Conduct applies within all community spaces, and also applies when +an individual is officially representing the community in public spaces. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported to the community leaders responsible for enforcement via GitHub. All +complaints will be reviewed and investigated promptly and fairly. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], +version 2.1, available at +https://www.contributor-covenant.org/version/2/1/code_of_conduct.html. + +[homepage]: https://www.contributor-covenant.org diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..edf5a5c --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,64 @@ +# Contributing + +Thanks for your interest in this project! It's a research codebase for a *fair* +GraphRAG vs PlainRAG comparison on PubMedQA, so contributions that improve +rigor, reproducibility, or clarity are especially welcome. + +## Development setup + +```bash +git clone https://github.com/vardhjain/Knowledge_Graph_Question_Answering.git +cd Knowledge_Graph_Question_Answering +python -m venv .venv && source .venv/bin/activate # Windows: .venv\Scripts\activate +make install-dev # or: pip install -r requirements-dev.txt +pre-commit install # optional: run ruff automatically on commit +``` + +The unit tests inject fakes for the encoder, reranker, and ArangoDB, so you can +run the whole suite on CPU with **no GPU, Ollama, or database** required: + +```bash +make test # pytest +make lint # ruff +``` + +See the [Makefile](Makefile) (`make help`) for all shortcuts. 
+ +## Where things live + +| Path | What | +| --- | --- | +| `src/kgqa/` | the importable package (single source of truth) | +| `src/kgqa/config.py` | **all** shared constants + env overrides | +| `src/kgqa/retrieval/` | modules implementing the four retrieval arms (`base`, `plain`, `graph`) | +| `scripts/` | `ingest.py`, `run_benchmark.py`, `compare.py` | +| `notebooks/` | thin Colab wrappers (kept output-free) | +| `tests/` | pytest suite (CPU-only via fakes) | + +> **Why no `configs/` directory?** Configuration is centralized in +> `src/kgqa/config.py` as a typed dataclass with environment-variable overrides +> (and an `.env.example` template). For this project that's safer and less +> error-prone than scattering YAML/JSON config files; please keep new knobs there. + +## Ground rules for changes + +This repo's whole point is a **fair** comparison. Before changing retrieval or +evaluation, please make sure: + +- Anything that could confound the arms (embedder, reranker, prompt, LLM, top-k, + seed, sample size) stays in `config.py` and identical across arms. +- No benchmark answer or question text can leak into a retrieved context + (there's a regression test for this — keep it green). +- New behavior has a test; `make test` and `make lint` both pass. + +## Pull requests + +1. Branch from `main`, make focused commits. +2. Run `make test && make lint`. +3. Open a PR using the template; describe what changed and why, and update + `CHANGELOG.md` under "Unreleased". + +## Commit messages + +Short imperative subject line, a blank line, then a body explaining the *why* +when it isn't obvious. 
diff --git a/Data_Ingestion_KG.ipynb b/Data_Ingestion_KG.ipynb deleted file mode 100644 index 5e0c0be..0000000 --- a/Data_Ingestion_KG.ipynb +++ /dev/null @@ -1,468 +0,0 @@ -{ - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "provenance": [] - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "language_info": { - "name": "python" - } - }, - "cells": [ - { - "cell_type": "code", - "source": [ - "!pip install python-arango sentence-transformers datasets tqdm" - ], - "metadata": { - "id": "rUGLgHO2I-_Q" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "import time\n", - "from arango import ArangoClient\n", - "from sentence_transformers import SentenceTransformer\n", - "from datasets import load_dataset\n", - "from tqdm import tqdm" - ], - "metadata": { - "id": "Nyu1zWSUJKbO" - }, - "execution_count": 2, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "ARANGO_CONFIG = {\n", - " \"hosts\": \"https://bfc25a0e3c74.arangodb.cloud:8529\",\n", - " \"username\": \"root\",\n", - " \"password\": \"VnicTWKeXaDasFNfmCfU\",\n", - " \"db_name\": \"pubmed_graph\",\n", - " \"chunk_col\": \"Chunks\",\n", - " \"context_edge\": \"HAS_CONTEXT\",\n", - " \"mention_edge\": \"MENTIONS\"\n", - "}" - ], - "metadata": { - "id": "KAdtBe5TG5E5" - }, - "execution_count": 3, - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "dW9J5OorEnsz" - }, - "outputs": [], - "source": [ - "# @title 🚀 GraphRAG Builder (Fixed & Complete)\n", - "# This script installs dependencies, connects to ArangoDB, sets up the schema,\n", - "# and ingests the PubMedQA dataset into the graph.\n", - "\n", - "# --- MANUAL CONFIGURATION ---\n", - "# Paste your details directly here to avoid input errors:\n", - "\n", - "# 1. The URL must start with https:// and usually ends with :8529\n", - "ARANGO_URL = \"https://bfc25a0e3c74.arangodb.cloud:8529\"\n", - "\n", - "# 2. 
The Username is almost always 'root'\n", - "ARANGO_USER = \"root\"\n", - "\n", - "# 3. Paste the password you copied from the 'Users' tab\n", - "ARANGO_PASS = \"VnicTWKeXaDasFNfmCfU\"\n", - "\n", - "# Database Name\n", - "DB_NAME = \"pubmed_graph\"\n", - "\n", - "# --- CONNECT ---\n", - "print(f\"Connecting to {ARANGO_URL}...\")\n", - "client = ArangoClient(hosts=ARANGO_URL)\n", - "sys_db = client.db(\"_system\", username=ARANGO_USER, password=ARANGO_PASS)\n", - "\n", - "# Create/Connect to specific database\n", - "if not sys_db.has_database(DB_NAME):\n", - " sys_db.create_database(DB_NAME)\n", - " print(f\"Created database: {DB_NAME}\")\n", - "else:\n", - " print(f\"Using existing database: {DB_NAME}\")\n", - "\n", - "db = client.db(DB_NAME, username=ARANGO_USER, password=ARANGO_PASS)\n", - "print(\"✅ Connected Successfully!\")\n", - "\n", - "# --- SCHEMA SETUP ---\n", - "print(\"\\nCreating Graph Schema...\")\n", - "\n", - "# 1. Define Node Collections\n", - "node_collections = [\"Papers\", \"Chunks\", \"Concepts\"]\n", - "for col in node_collections:\n", - " if not db.has_collection(col):\n", - " db.create_collection(col)\n", - " print(f\" - Created Node Collection: {col}\")\n", - "\n", - "# 2. Define Edge Collections\n", - "edge_collections = [\"HAS_CONTEXT\", \"MENTIONS\"]\n", - "for col in edge_collections:\n", - " if not db.has_collection(col):\n", - " db.create_collection(col, edge=True)\n", - " print(f\" - Created Edge Collection: {col}\")\n", - "\n", - "# 3. 
Create ArangoSearch View (Fallback for Vector Search)\n", - "# FIXED: The 'vector' index type is experimental in your version.\n", - "# We use an ArangoSearch View instead, which is robust and works on all versions.\n", - "view_name = \"pubmed_view\"\n", - "\n", - "# FIXED: Use db.views() list comprehension to check existence instead of .has_view()\n", - "existing_views = [v[\"name\"] for v in db.views()]\n", - "\n", - "if view_name not in existing_views:\n", - " # FIXED: Use dedicated method 'create_arangosearch_view' to avoid TypeError on 'type' arg\n", - " db.create_arangosearch_view(\n", - " name=view_name,\n", - " properties={\n", - " \"links\": {\n", - " \"Chunks\": {\n", - " \"fields\": {\n", - " \"embedding\": {\n", - " \"analyzers\": [\"identity\"] # Needed for vector operations\n", - " },\n", - " \"text\": {\n", - " \"analyzers\": [\"text_en\"] # Useful for keyword search\n", - " }\n", - " }\n", - " }\n", - " }\n", - " }\n", - " )\n", - " print(f\" - Created ArangoSearch View: {view_name}\")\n", - "else:\n", - " print(f\" - ArangoSearch View '{view_name}' already exists.\")\n", - "\n", - "print(\"\\n✅ Database Configured Successfully!\")\n", - "\n", - "# --- LOAD DATA & MODEL ---\n", - "print(\"\\nLoading Embedding Model & Dataset...\")\n", - "\n", - "# Load Model (Runs on GPU if available in Colab)\n", - "# We use all-MiniLM-L6-v2 for speed and good performance\n", - "model = SentenceTransformer('all-MiniLM-L6-v2')\n", - "\n", - "# Load Dataset (Standard download to avoid 429 Rate Limits)\n", - "# REMOVED: streaming=True to prevent \"Too Many Requests\" error\n", - "ds = load_dataset(\"qiaojin/PubMedQA\", \"pqa_unlabeled\", split=\"train\")\n", - "\n", - "print(\"✅ Model and Data ready.\")\n", - "\n", - "# --- PROCESSING LOOP ---\n", - "# This loop processes papers, chunks them, embeds them, and inserts into ArangoDB.\n", - "\n", - "BATCH_SIZE = 50 # Number of papers to process before sending to DB (smaller batch for safety)\n", - "LIMIT_PAPERS = None # 
Limit for this run to ensure it finishes quickly (Set to None for full dataset)\n", - "\n", - "papers_batch = []\n", - "chunks_batch = []\n", - "concepts_batch = []\n", - "edges_batch = []\n", - "\n", - "print(f\"\\n🚀 Starting Ingestion (Limit: {LIMIT_PAPERS} papers)...\")\n", - "start_time = time.time()\n", - "\n", - "count = 0\n", - "\n", - "for row in tqdm(ds, total=LIMIT_PAPERS):\n", - " if LIMIT_PAPERS and count >= LIMIT_PAPERS:\n", - " break\n", - "\n", - " pubid = row['pubid']\n", - " question = row['question']\n", - " long_answer = row['long_answer']\n", - "\n", - " # 1. Prepare Paper Node\n", - " paper_key = str(pubid)\n", - " papers_batch.append({\n", - " \"_key\": paper_key,\n", - " \"title\": question,\n", - " \"answer\": long_answer\n", - " })\n", - "\n", - " # 2. Process Concepts (MeSH Terms)\n", - " mesh_terms = row.get('context', {}).get('meshes', [])\n", - " for mesh in mesh_terms:\n", - " # Sanitize key (Arango keys cannot contain spaces/special chars easily, so we hash or simplify)\n", - " # Here we just remove non-alphanumeric for simplicity\n", - " mesh_key = \"\".join(x for x in mesh if x.isalnum())\n", - " if not mesh_key: continue\n", - "\n", - " # Add Concept Node\n", - " concepts_batch.append({\n", - " \"_key\": mesh_key,\n", - " \"name\": mesh\n", - " })\n", - "\n", - " # Link Paper -> Concept\n", - " edges_batch.append({\n", - " \"_collection\": \"MENTIONS\",\n", - " \"_from\": f\"Papers/{paper_key}\",\n", - " \"_to\": f\"Concepts/{mesh_key}\"\n", - " })\n", - "\n", - " # 3. 
Process Contexts (Chunks)\n", - " contexts = row.get('context', {}).get('contexts', [])\n", - " labels = row.get('context', {}).get('labels', [])\n", - "\n", - " if contexts:\n", - " # Embed all chunks for this paper at once\n", - " embeddings = model.encode(contexts)\n", - "\n", - " for idx, (text, emb) in enumerate(zip(contexts, embeddings)):\n", - " chunk_key = f\"{paper_key}_{idx}\"\n", - "\n", - " # Add Chunk Node\n", - " chunks_batch.append({\n", - " \"_key\": chunk_key,\n", - " \"text\": text,\n", - " \"label\": labels[idx] if idx < len(labels) else \"context\",\n", - " \"embedding\": emb.tolist() # Convert numpy array to list for JSON\n", - " })\n", - "\n", - " # Link Paper -> Chunk\n", - " edges_batch.append({\n", - " \"_collection\": \"HAS_CONTEXT\",\n", - " \"_from\": f\"Papers/{paper_key}\",\n", - " \"_to\": f\"Chunks/{chunk_key}\"\n", - " })\n", - "\n", - " count += 1\n", - "\n", - " # --- BATCH INSERTION ---\n", - " if count % BATCH_SIZE == 0:\n", - " # Insert Papers\n", - " if papers_batch:\n", - " db.collection(\"Papers\").import_bulk(papers_batch, on_duplicate=\"ignore\")\n", - " # Insert Concepts\n", - " if concepts_batch:\n", - " db.collection(\"Concepts\").import_bulk(concepts_batch, on_duplicate=\"ignore\")\n", - " # Insert Chunks\n", - " if chunks_batch:\n", - " db.collection(\"Chunks\").import_bulk(chunks_batch, on_duplicate=\"ignore\")\n", - "\n", - " # Insert Edges (Must split by collection type for import_bulk)\n", - " mentions = [e for e in edges_batch if e[\"_collection\"] == \"MENTIONS\"]\n", - " contexts = [e for e in edges_batch if e[\"_collection\"] == \"HAS_CONTEXT\"]\n", - "\n", - " if mentions:\n", - " db.collection(\"MENTIONS\").import_bulk(mentions, on_duplicate=\"ignore\")\n", - " if contexts:\n", - " db.collection(\"HAS_CONTEXT\").import_bulk(contexts, on_duplicate=\"ignore\")\n", - "\n", - " # Reset batches\n", - " papers_batch = []\n", - " chunks_batch = []\n", - " concepts_batch = []\n", - " edges_batch = []\n", - "\n", - 
"# Final flush for remaining data\n", - "if papers_batch: db.collection(\"Papers\").import_bulk(papers_batch, on_duplicate=\"ignore\")\n", - "if concepts_batch: db.collection(\"Concepts\").import_bulk(concepts_batch, on_duplicate=\"ignore\")\n", - "if chunks_batch: db.collection(\"Chunks\").import_bulk(chunks_batch, on_duplicate=\"ignore\")\n", - "\n", - "mentions = [e for e in edges_batch if e[\"_collection\"] == \"MENTIONS\"]\n", - "contexts = [e for e in edges_batch if e[\"_collection\"] == \"HAS_CONTEXT\"]\n", - "if mentions: db.collection(\"MENTIONS\").import_bulk(mentions, on_duplicate=\"ignore\")\n", - "if contexts: db.collection(\"HAS_CONTEXT\").import_bulk(contexts, on_duplicate=\"ignore\")\n", - "\n", - "end_time = time.time()\n", - "print(f\"\\n🎉 Finished! Processed {count} papers in {end_time - start_time:.2f} seconds.\")\n", - "print(f\"Go to your ArangoDB Dashboard to see the 'pubmed_graph' database.\")\n", - "print(f\"IMPORTANT: Use 'FOR doc IN pubmed_view' in your AQL queries!\")" - ] - }, - { - "cell_type": "code", - "source": [ - "# @title ➕ Add PubMedQA \"Labeled\" Subset\n", - "# This script adds the 1,000 labeled papers to your existing graph.\n", - "\n", - "\n", - "# --- MANUAL CONFIGURATION ---\n", - "# 1. The URL must start with https:// and usually ends with :8529\n", - "ARANGO_URL = \"https://bfc25a0e3c74.arangodb.cloud:8529\"\n", - "# 2. The Username\n", - "ARANGO_USER = \"root\"\n", - "# 3. 
Paste the password you copied from the 'Users' tab\n", - "ARANGO_PASS = \"VnicTWKeXaDasFNfmCfU\"\n", - "# Database Name\n", - "DB_NAME = \"pubmed_graph\"\n", - "\n", - "# --- CONNECT ---\n", - "print(f\"Connecting to {ARANGO_URL}...\")\n", - "client = ArangoClient(hosts=ARANGO_URL)\n", - "db = client.db(DB_NAME, username=ARANGO_USER, password=ARANGO_PASS)\n", - "print(\"✅ Connected to 'pubmed_graph'!\")\n", - "\n", - "# --- LOAD DATA & MODEL ---\n", - "print(\"\\nLoading 'pqa_labeled' dataset...\")\n", - "\n", - "# Load the LABELED subset this time\n", - "ds_labeled = load_dataset(\"qiaojin/PubMedQA\", \"pqa_labeled\", split=\"train\")\n", - "model = SentenceTransformer('all-MiniLM-L6-v2')\n", - "\n", - "print(f\"✅ Loaded {len(ds_labeled)} labeled papers.\")\n", - "\n", - "# --- PROCESSING LOOP ---\n", - "BATCH_SIZE = 50\n", - "papers_batch = []\n", - "chunks_batch = []\n", - "concepts_batch = []\n", - "edges_batch = []\n", - "\n", - "print(\"\\n🚀 Starting Ingestion of Labeled Data...\")\n", - "start_time = time.time()\n", - "count = 0\n", - "\n", - "for row in tqdm(ds_labeled):\n", - " pubid = row['pubid']\n", - " question = row['question']\n", - " long_answer = row['long_answer']\n", - " final_decision = row.get('final_decision', None) # Unique to labeled set\n", - "\n", - " # 1. Prepare Paper Node (With extra 'final_decision' field)\n", - " paper_key = str(pubid)\n", - " papers_batch.append({\n", - " \"_key\": paper_key,\n", - " \"title\": question,\n", - " \"answer\": long_answer,\n", - " \"decision\": final_decision, # Store 'yes', 'no', or 'maybe'\n", - " \"dataset\": \"labeled\" # Tag it so we know source\n", - " })\n", - "\n", - " # 2. 
Process Concepts (MeSH Terms)\n", - " mesh_terms = row.get('context', {}).get('meshes', [])\n", - " for mesh in mesh_terms:\n", - " mesh_key = \"\".join(x for x in mesh if x.isalnum())\n", - " if not mesh_key: continue\n", - "\n", - " concepts_batch.append({\n", - " \"_key\": mesh_key,\n", - " \"name\": mesh\n", - " })\n", - " edges_batch.append({\n", - " \"_collection\": \"MENTIONS\",\n", - " \"_from\": f\"Papers/{paper_key}\",\n", - " \"_to\": f\"Concepts/{mesh_key}\"\n", - " })\n", - "\n", - " # 3. Process Contexts (Chunks)\n", - " contexts = row.get('context', {}).get('contexts', [])\n", - " labels = row.get('context', {}).get('labels', [])\n", - "\n", - " if contexts:\n", - " embeddings = model.encode(contexts)\n", - " for idx, (text, emb) in enumerate(zip(contexts, embeddings)):\n", - " chunk_key = f\"{paper_key}_{idx}\"\n", - " chunks_batch.append({\n", - " \"_key\": chunk_key,\n", - " \"text\": text,\n", - " \"label\": labels[idx] if idx < len(labels) else \"context\",\n", - " \"embedding\": emb.tolist()\n", - " })\n", - " edges_batch.append({\n", - " \"_collection\": \"HAS_CONTEXT\",\n", - " \"_from\": f\"Papers/{paper_key}\",\n", - " \"_to\": f\"Chunks/{chunk_key}\"\n", - " })\n", - "\n", - " count += 1\n", - "\n", - " # --- BATCH INSERTION ---\n", - " if count % BATCH_SIZE == 0:\n", - " if papers_batch: db.collection(\"Papers\").import_bulk(papers_batch, on_duplicate=\"update\") # Update if exists\n", - " if concepts_batch: db.collection(\"Concepts\").import_bulk(concepts_batch, on_duplicate=\"ignore\")\n", - " if chunks_batch: db.collection(\"Chunks\").import_bulk(chunks_batch, on_duplicate=\"ignore\")\n", - "\n", - " mentions = [e for e in edges_batch if e[\"_collection\"] == \"MENTIONS\"]\n", - " contexts = [e for e in edges_batch if e[\"_collection\"] == \"HAS_CONTEXT\"]\n", - " if mentions: db.collection(\"MENTIONS\").import_bulk(mentions, on_duplicate=\"ignore\")\n", - " if contexts: db.collection(\"HAS_CONTEXT\").import_bulk(contexts, 
on_duplicate=\"ignore\")\n", - "\n", - " papers_batch = []\n", - " chunks_batch = []\n", - " concepts_batch = []\n", - " edges_batch = []\n", - "\n", - "# Final Flush\n", - "if papers_batch: db.collection(\"Papers\").import_bulk(papers_batch, on_duplicate=\"update\")\n", - "if concepts_batch: db.collection(\"Concepts\").import_bulk(concepts_batch, on_duplicate=\"ignore\")\n", - "if chunks_batch: db.collection(\"Chunks\").import_bulk(chunks_batch, on_duplicate=\"ignore\")\n", - "mentions = [e for e in edges_batch if e[\"_collection\"] == \"MENTIONS\"]\n", - "contexts = [e for e in edges_batch if e[\"_collection\"] == \"HAS_CONTEXT\"]\n", - "if mentions: db.collection(\"MENTIONS\").import_bulk(mentions, on_duplicate=\"ignore\")\n", - "if contexts: db.collection(\"HAS_CONTEXT\").import_bulk(contexts, on_duplicate=\"ignore\")\n", - "\n", - "end_time = time.time()\n", - "print(f\"\\n🎉 Added {count} labeled papers in {end_time - start_time:.2f} seconds.\")" - ], - "metadata": { - "id": "q_dQy8L2Ew8r" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# @title 🧹 Remove \"decision\" and \"dataset\" columns\n", - "# This script iterates through Papers and deletes the specific attributes.\n", - "\n", - "# 1. Define the AQL Query\n", - "# We filter for papers that actually have these fields to save processing time.\n", - "# Setting them to 'null' with 'keepNull: false' deletes the attribute entirely.\n", - "aql_clean_columns = \"\"\"\n", - "FOR p IN Papers\n", - " FILTER HAS(p, \"decision\") OR HAS(p, \"dataset\")\n", - "\n", - " UPDATE p WITH {\n", - " decision: null,\n", - " dataset: null\n", - " } IN Papers\n", - " OPTIONS { keepNull: false }\n", - "\"\"\"\n", - "\n", - "# 2. Execute\n", - "print(\"Removing 'decision' and 'dataset' columns...\")\n", - "cursor = db.aql.execute(aql_clean_columns)\n", - "\n", - "# 3. 
Verify\n", - "# Let's count if any remain\n", - "verification_query = \"\"\"\n", - "FOR p IN Papers\n", - " FILTER HAS(p, \"decision\")\n", - " COLLECT WITH COUNT INTO count\n", - " RETURN count\n", - "\"\"\"\n", - "count = list(db.aql.execute(verification_query))[0]\n", - "\n", - "if count == 0:\n", - " print(\"✅ Success! Columns removed. All papers now have a uniform schema.\")\n", - "else:\n", - " print(f\"⚠️ Something went wrong. {count} papers still have the decision column.\")" - ], - "metadata": { - "id": "olhs2y-2Eyww" - }, - "execution_count": null, - "outputs": [] - } - ] -} \ No newline at end of file diff --git a/GraphRAG.ipynb b/GraphRAG.ipynb deleted file mode 100644 index a520090..0000000 --- a/GraphRAG.ipynb +++ /dev/null @@ -1,691 +0,0 @@ -{ - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "provenance": [] - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "language_info": { - "name": "python" - } - }, - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "8v2LQf5_MZW7" - }, - "outputs": [], - "source": [ - "# @title 🚀 1. 
Install Dependencies & Setup\n", - "# This cell installs the necessary libraries to talk to ArangoDB and process the data.\n", - "# Run this cell first!\n", - "\n", - "!pip install python-arango datasets ollama gradio sentence-transformers -q\n", - "\n", - "import time\n", - "from getpass import getpass\n", - "from datasets import load_dataset\n", - "from sentence_transformers import SentenceTransformer\n", - "import subprocess\n", - "import requests\n", - "import sys\n", - "import re\n", - "import numpy as np\n", - "import warnings\n", - "from typing import List, Dict\n", - "from arango.exceptions import ServerConnectionError, ArangoServerError\n", - "from sklearn.metrics.pairwise import cosine_similarity\n", - "from tqdm import tqdm\n", - "import os\n", - "import pickle\n", - "from sentence_transformers import CrossEncoder\n", - "import pandas as pd\n", - "import matplotlib.pyplot as plt\n", - "import seaborn as sns\n", - "from sklearn.metrics import accuracy_score, classification_report, confusion_matrix\n", - "import gradio as gr\n", - "\n", - "!curl -fsSL https://ollama.com/install.sh | sh\n", - "\n", - "\n", - "print(\"✅ Libraries installed.\")" - ] - }, - { - "cell_type": "code", - "source": [ - "def check_and_pull_model(model_name=\"deepseek-r1:8b\"):\n", - " \"\"\"\n", - " Checks if the model exists in Ollama. If not, pulls it automatically.\n", - " \"\"\"\n", - " print(f\"🕵️ [Ollama] Checking for model: {model_name}...\")\n", - "\n", - " # 1. Check list of models\n", - " try:\n", - " result = subprocess.run([\"ollama\", \"list\"], capture_output=True, text=True)\n", - " if model_name in result.stdout:\n", - " print(f\"✅ [Ollama] Model '{model_name}' is ready.\")\n", - " return\n", - " except Exception as e:\n", - " print(f\"⚠️ [Ollama] Could not check model list: {e}\")\n", - "\n", - " # 2. If missing, pull it\n", - " print(f\"⬇️ [Ollama] Model not found. 
Pulling {model_name} (This takes 2-5 mins)...\")\n", - " try:\n", - " # We use Popen to stream the output so you don't think it hung\n", - " process = subprocess.Popen(\n", - " [\"ollama\", \"pull\", model_name],\n", - " stdout=subprocess.PIPE,\n", - " stderr=subprocess.PIPE\n", - " )\n", - " while True:\n", - " output = process.stderr.readline()\n", - " if output == b'' and process.poll() is not None:\n", - " break\n", - " if output:\n", - " # Print progress to console\n", - " print(output.decode().strip())\n", - "\n", - " print(f\"✅ [Ollama] Successfully pulled {model_name}!\")\n", - "\n", - " except Exception as e:\n", - " print(f\"❌ [Ollama] Failed to pull model: {e}\")\n", - " sys.exit(1) # Stop script if model fails\n", - "\n", - "MODEL_NAME = \"deepseek-r1:8b\"\n", - "OLLAMA_API = \"http://localhost:11434/api/chat\"\n", - "\n", - "\n", - "check_and_pull_model()" - ], - "metadata": { - "id": "4zktFyVCj_vW" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# @title\n", - "# --- 3. ROBUST UTILITIES ---\n", - "\n", - "class FuzzyEvaluator:\n", - " \"\"\"Evaluates answers with logic to handle verbosity and synonyms.\"\"\"\n", - "\n", - " def extract_answer(self, text: str) -> str:\n", - " # Strip DeepSeek \"Thinking\" blocks\n", - " clean_text = re.sub(r'.*?', '', text, flags=re.DOTALL).lower()\n", - " # Look for the last explicit declaration\n", - " match = re.search(r'(?:final answer|answer):?\\s*(yes|no|maybe)', clean_text)\n", - " if match: return match.group(1)\n", - " # Fallback: look for isolated words at end of text\n", - " matches = re.findall(r'\\b(yes|no|maybe)\\b', clean_text)\n", - " if matches: return matches[-1]\n", - " return \"maybe\" # Default safety\n", - "\n", - " def is_correct(self, gt: str, pred: str) -> bool:\n", - " gt, pred = gt.lower().strip(), pred.lower().strip()\n", - "\n", - " # 1. Exact Match\n", - " if gt == pred: return True\n", - "\n", - " # 2. Starts With (e.g. 
\"yes, because...\")\n", - " if pred.startswith(gt + \" \") or pred.startswith(gt + \",\"): return True\n", - "\n", - " # 3. Synonyms\n", - " positive = [\"definitely yes\", \"likely\", \"probable\", \"certainly\"]\n", - " negative = [\"unlikely\", \"doubtful\", \"never\"]\n", - "\n", - " if gt == \"yes\" and any(x in pred for x in positive): return True\n", - " if gt == \"no\" and any(x in pred for x in negative): return True\n", - "\n", - " return False\n", - "\n", - "class ArangoConnectionManager:\n", - " \"\"\"Handles the 503 Service Unavailable errors by retrying.\"\"\"\n", - "\n", - " def __init__(self, config):\n", - " self.config = config\n", - " self.client = ArangoClient(hosts=config[\"hosts\"])\n", - " self.db = self._connect_with_retry()\n", - "\n", - " def _connect_with_retry(self, max_retries=5):\n", - " for attempt in range(max_retries):\n", - " try:\n", - " # verify connection\n", - " sys_db = self.client.db(\"_system\", username=self.config[\"username\"], password=self.config[\"password\"])\n", - " sys_db.version() # Ping\n", - "\n", - " # Connect to actual DB\n", - " db = self.client.db(self.config[\"db_name\"], username=self.config[\"username\"], password=self.config[\"password\"])\n", - " print(f\"✅ [ArangoDB] Connected successfully.\")\n", - " return db\n", - " except (ServerConnectionError, ArangoServerError) as e:\n", - " wait = (attempt + 1) * 5\n", - " print(f\"⚠️ [ArangoDB] Connection failed ({e}). Retrying in {wait}s...\")\n", - " time.sleep(wait)\n", - "\n", - " raise ConnectionError(\"Could not connect to ArangoDB after retries.\")" - ], - "metadata": { - "id": "_iTxmLlfNGNB" - }, - "execution_count": 4, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# ==========================================\n", - "# 1. 
THE CACHING FUNCTION (Defined locally)\n", - "# ==========================================\n", - "def load_vectors_smartly(db, collection_name, cache_file=\"pubmed_vectors_cache.pkl\"):\n", - " \"\"\"\n", - " Handles the logic: Check Disk -> If Missing, Download -> Save to Disk.\n", - " \"\"\"\n", - " # A. Check Disk\n", - " if os.path.exists(cache_file):\n", - " print(f\"💾 [Cache] Found local file: {cache_file}\")\n", - " try:\n", - " with open(cache_file, 'rb') as f:\n", - " data = pickle.load(f)\n", - " ids = data.get('ids', [])\n", - " texts = data.get('texts', [])\n", - " embeddings = data.get('embeddings', [])\n", - "\n", - " if len(embeddings) > 0:\n", - " print(f\"✅ [Cache] Loaded {len(embeddings)} vectors from disk instantly.\")\n", - " return ids, texts, embeddings\n", - " except Exception as e:\n", - " print(f\"⚠️ [Cache] File corrupted ({e}). Re-downloading...\")\n", - "\n", - " # B. Download from Cloud (Only if A failed)\n", - " print(f\"☁️ [Index] Cache missing. Downloading from ArangoDB (This happens only once)...\")\n", - "\n", - " ids, texts, embeddings = [], [], []\n", - "\n", - " # Get Count\n", - " try:\n", - " count = db.aql.execute(f\"RETURN LENGTH({collection_name})\").next()\n", - " except:\n", - " count = 200000\n", - "\n", - " # Paged Download\n", - " BATCH_SIZE = 5000\n", - " offset = 0\n", - "\n", - " with tqdm(total=count, desc=\"Downloading Index\", unit=\"vec\") as pbar:\n", - " while True:\n", - " aql = f\"\"\"\n", - " FOR c IN {collection_name}\n", - " FILTER c.embedding != null\n", - " LIMIT {offset}, {BATCH_SIZE}\n", - " RETURN {{ \"id\": c._id, \"text\": c.text, \"emb\": c.embedding }}\n", - " \"\"\"\n", - " try:\n", - " cursor = db.aql.execute(aql, ttl=3600)\n", - " batch_count = 0\n", - " for doc in cursor:\n", - " ids.append(doc[\"id\"])\n", - " texts.append(doc[\"text\"])\n", - " embeddings.append(doc[\"emb\"])\n", - " batch_count += 1\n", - "\n", - " pbar.update(batch_count)\n", - " offset += batch_count\n", - " if 
batch_count < BATCH_SIZE: break\n", - " time.sleep(0.1) # Be gentle on the server\n", - " except Exception as e:\n", - " print(f\"⚠️ Error on batch: {e}\")\n", - " if \"503\" in str(e): time.sleep(5)\n", - " else: break\n", - "\n", - " # C. Save to Disk\n", - " embeddings_np = np.array(embeddings)\n", - " if len(ids) > 0:\n", - " print(f\"💾 [Cache] Saving {len(ids)} vectors to {cache_file}...\")\n", - " with open(cache_file, 'wb') as f:\n", - " pickle.dump({'ids': ids, 'texts': texts, 'embeddings': embeddings_np}, f)\n", - " print(\"✅ [Cache] Saved.\")\n", - "\n", - " return ids, texts, embeddings_np\n", - "\n", - "class RobustGraphRAG:\n", - " def __init__(self, config):\n", - " self.config = config\n", - " self.client = ArangoClient(hosts=config[\"hosts\"])\n", - " self.db = self.client.db(config[\"db_name\"], username=config[\"username\"], password=config[\"password\"])\n", - "\n", - " print(\"⏳ [Model] Loading Encoders...\")\n", - " self.encoder = SentenceTransformer(\"all-MiniLM-L6-v2\")\n", - " self.reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')\n", - "\n", - " self.chunk_ids, self.chunk_texts, self.chunk_embeddings = load_vectors_smartly(\n", - " self.db,\n", - " self.config['chunk_col']\n", - " )\n", - "\n", - " def retrieve(self, query: str, top_k=3):\n", - " if len(self.chunk_embeddings) == 0: return \"No context.\"\n", - "\n", - " # 1. Wider Vector Search (75 candidates)\n", - " # We widen this to ensure we catch \"Conclusion\" chunks that might use different wording\n", - " query_emb = self.encoder.encode([query])\n", - " sims = cosine_similarity(query_emb, self.chunk_embeddings)[0]\n", - " top_n_indices = np.argsort(sims)[-75:][::-1]\n", - "\n", - " candidate_pairs = []\n", - " for idx in top_n_indices:\n", - " candidate_pairs.append((self.chunk_texts[idx], self.chunk_ids[idx]))\n", - "\n", - " # 2. 
Re-Ranking\n", - " cross_inputs = [[query, text] for text, _ in candidate_pairs]\n", - " scores = self.reranker.predict(cross_inputs)\n", - " ranked_indices = np.argsort(scores)[::-1]\n", - "\n", - " best_chunk_ids = []\n", - " for i in range(top_k):\n", - " idx = ranked_indices[i]\n", - " _, cid = candidate_pairs[idx]\n", - " best_chunk_ids.append(cid)\n", - "\n", - " # 3. Graph Expansion (Parent Abstract Reconstruction)\n", - " aql = \"\"\"\n", - " WITH Papers, Chunks\n", - " FOR start_chunk_id IN @ids\n", - " LET start_doc = DOCUMENT(start_chunk_id)\n", - "\n", - " // Find Parent Paper\n", - " FOR paper IN 1..1 INBOUND start_doc HAS_CONTEXT\n", - "\n", - " // Get ALL chunks (Introduction + Results + Conclusion)\n", - " LET full_text_chunks = (\n", - " FOR c IN 1..1 OUTBOUND paper HAS_CONTEXT\n", - " RETURN c.text\n", - " )\n", - "\n", - " // Concatenate into a clean abstract\n", - " LET full_abstract = CONCAT_SEPARATOR(\" \", full_text_chunks)\n", - "\n", - " RETURN {\n", - " \"title\": paper.title,\n", - " \"abstract\": full_abstract\n", - " }\n", - " \"\"\"\n", - "\n", - " try:\n", - " cursor = self.db.aql.execute(aql, bind_vars={\"ids\": best_chunk_ids})\n", - " context_parts = []\n", - " seen_titles = set()\n", - "\n", - " for res in cursor:\n", - " title = res.get('title', 'Unknown')\n", - " if title in seen_titles: continue\n", - " seen_titles.add(title)\n", - "\n", - " # Add \"Study X\" header to help LLM distinguish separate papers\n", - " entry = (\n", - " f\"=== STUDY: {title} ===\\n\"\n", - " f\"ABSTRACT: {res.get('abstract')}\\n\"\n", - " )\n", - " context_parts.append(entry)\n", - "\n", - " return \"\\n\".join(context_parts)\n", - "\n", - " except Exception as e:\n", - " print(f\"⚠️ Graph Error ({e}).\")\n", - " fallback_texts = []\n", - " for i in range(top_k):\n", - " idx = ranked_indices[i]\n", - " t, _ = candidate_pairs[idx]\n", - " fallback_texts.append(f\"Excerpt: {t}\")\n", - " return \"\\n\".join(fallback_texts)\n", - "\n", - " def 
_heuristic_override(self, response_text):\n", - " \"\"\"\n", - " Python Safety Net: Catches 'Maybe' and flips it if strong keywords exist.\n", - " \"\"\"\n", - " clean_text = response_text.lower()\n", - "\n", - " # 1. Extract the explicit answer\n", - " match = re.search(r'(?:final answer|answer):?\\s*(yes|no|maybe)', clean_text)\n", - " pred = match.group(1) if match else \"maybe\"\n", - "\n", - " # 2. If prediction is YES or NO, trust the model.\n", - " if pred in [\"yes\", \"no\"]:\n", - " return pred\n", - "\n", - " # 3. If prediction is MAYBE, check the REASONING for \"Soft Signals\"\n", - " # Positive Signals\n", - " soft_yes = [\"suggests\", \"indicates\", \"significant\", \"associated with\", \"effective\", \"improved\"]\n", - " for word in soft_yes:\n", - " if word in clean_text:\n", - " return \"yes\"\n", - "\n", - " # Negative Signals\n", - " soft_no = [\"no significant\", \"did not\", \"unrelated\", \"ineffective\", \"no difference\"]\n", - " for word in soft_no:\n", - " if word in clean_text:\n", - " return \"no\"\n", - "\n", - " return \"maybe\"\n", - "\n", - " def query_ollama(self, prompt: str):\n", - " # The \"Calibration\" Prompt\n", - " # We align the model with PubMedQA's specific annotation style.\n", - "\n", - " system_msg = \"\"\"\n", - " You are a PubMedQA annotator.\n", - " Your task is to classify the answer as 'yes', 'no', or 'maybe' based on the Study Abstract.\n", - "\n", - " ANNOTATION GUIDELINES (CRITICAL):\n", - " 1. If the study suggests a positive outcome, even if \"further study is needed\", the answer is YES.\n", - " 2. If the study finds a correlation or association, the answer is YES.\n", - " 3. If the study finds \"no significant difference\", the answer is NO.\n", - " 4. 
ONLY use MAYBE if the abstract explicitly states \"results were inconclusive\" or provides zero data.\n", - "\n", - " Format:\n", - " Final Answer: [yes/no/maybe]\n", - " \"\"\"\n", - "\n", - " full_prompt = f\"{system_msg}\\n\\nContext:\\n{prompt}\"\n", - "\n", - " url = \"http://localhost:11434/api/chat\"\n", - " payload = {\n", - " \"model\": \"deepseek-r1:8b\",\n", - " \"messages\": [{\"role\": \"user\", \"content\": full_prompt}],\n", - " \"stream\": False,\n", - " \"options\": {\n", - " \"temperature\": 0.0,\n", - " \"num_ctx\": 4096\n", - " }\n", - " }\n", - " try:\n", - " res = requests.post(url, json=payload, timeout=300)\n", - " if res.status_code == 200:\n", - " raw_response = res.json()['message']['content']\n", - "\n", - " # --- APPLY THE PYTHON SAFETY NET ---\n", - " final_decision = self._heuristic_override(raw_response)\n", - "\n", - " # Return a format that your evaluator can parse\n", - " return f\"{raw_response}\\n\\n[Heuristic Override Result]: Final Answer: {final_decision}\"\n", - "\n", - " return f\"Error {res.status_code}\"\n", - " except Exception as e:\n", - " return f\"Exception: {e}\"\n", - "\n", - "\n", - "\n", - " def generate_chat_response(self, message, context):\n", - " \"\"\"\n", - " A specific prompt for the Chat UI (Conversational, not Yes/No).\n", - " \"\"\"\n", - " system_msg = \"\"\"\n", - " You are a Helpful Medical AI Assistant.\n", - " Use the provided Research Abstracts to answer the user's question accurately.\n", - "\n", - " Guidelines:\n", - " 1. Base your answer ONLY on the context provided.\n", - " 2. Cite the specific study titles when making claims (e.g., \"According to the study on X...\").\n", - " 3. If the studies are conflicting, explain the conflict.\n", - " 4. 
If the answer is not in the context, admit you don't have evidence but give your opinion.\n", - " \"\"\"\n", - "\n", - " full_prompt = f\"{system_msg}\\n\\nContext:\\n{context}\\n\\nUser Question: {message}\"\n", - "\n", - " url = \"http://localhost:11434/api/chat\"\n", - " payload = {\n", - " \"model\": \"deepseek-r1:8b\",\n", - " \"messages\": [{\"role\": \"user\", \"content\": full_prompt}],\n", - " \"stream\": False,\n", - " \"options\": {\"temperature\": 0.3, \"num_ctx\": 4096} # Slight creativity allowed\n", - " }\n", - " try:\n", - " res = requests.post(url, json=payload, timeout=300)\n", - " if res.status_code == 200:\n", - " return res.json()['message']['content']\n", - " return \"Error: Could not communicate with model.\"\n", - " except Exception as e:\n", - " return f\"Error: {e}\"\n", - "\n", - " # --- THE UI LAUNCHER ---\n", - " def launch_gradio_ui(self):\n", - " print(\"\\n🚀 Launching Gradio UI...\")\n", - "\n", - " def chat_logic(message, history):\n", - " # 1. Retrieve Context\n", - " print(f\"🔎 Retrieving for: {message}...\")\n", - " retrieved_context = self.retrieve(message)\n", - "\n", - " # 2. Generate Answer\n", - " print(f\"🤖 Generating Answer...\")\n", - " response = self.generate_chat_response(message, retrieved_context)\n", - "\n", - " # 3. Optional: Append Sources to the bottom of the answer\n", - " final_output = f\"{response}\\n\\n___\\n**Sources Retrieved:**\\n\"\n", - "\n", - " # Simple regex to extract titles for display\n", - " titles = re.findall(r\"=== STUDY: (.*?) ===\", retrieved_context)\n", - " for t in titles:\n", - " final_output += f\"- *{t}*\\n\"\n", - "\n", - " return final_output\n", - "\n", - " # Create the Interface\n", - " demo = gr.ChatInterface(\n", - " fn=chat_logic,\n", - " title=\"🧬 PubMed GraphRAG Assistant\",\n", - " description=\"Ask detailed medical questions. 
I will retrieve full abstracts from the Knowledge Graph to answer you.\",\n", - " examples=[\n", - " \"Do preoperative statins reduce atrial fibrillation?\",\n", - " \"Is obesity a risk factor for cirrhosis-related death or hospitalization?\",\n", - " \"Does high-dose aspirin prevent cardiovascular events?\"\n", - " ],\n", - " theme=\"soft\"\n", - " )\n", - "\n", - " demo.launch(share=True, debug=True)" - ], - "metadata": { - "id": "djEmPjhjNLej" - }, - "execution_count": 5, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "class AdvancedEvaluator:\n", - " def __init__(self):\n", - " self.y_true = []\n", - " self.y_pred = []\n", - " self.start_time = None\n", - " self.end_time = None\n", - "\n", - " def start(self):\n", - " \"\"\"Starts the stopwatch.\"\"\"\n", - " self.start_time = time.time()\n", - " print(\"⏱️ Evaluation Timer Started...\")\n", - "\n", - " def stop(self):\n", - " \"\"\"Stops the stopwatch.\"\"\"\n", - " self.end_time = time.time()\n", - "\n", - " def record(self, gt, pred):\n", - " \"\"\"Records a single prediction pair.\"\"\"\n", - " # Normalize to ensure clean metrics\n", - " clean_gt = gt.lower().strip()\n", - " clean_pred = pred.lower().strip()\n", - "\n", - " # Safety: If model output garbage, classify as 'maybe'\n", - " if clean_pred not in ['yes', 'no', 'maybe']:\n", - " clean_pred = 'maybe'\n", - "\n", - " self.y_true.append(clean_gt)\n", - " self.y_pred.append(clean_pred)\n", - "\n", - " def generate_report(self):\n", - " \"\"\"Calculates and visualizes all requested metrics.\"\"\"\n", - " if not self.y_true:\n", - " print(\"⚠️ No data to report.\")\n", - " return\n", - "\n", - " # 1. Total Time\n", - " total_seconds = self.end_time - self.start_time\n", - " avg_per_sample = total_seconds / len(self.y_true)\n", - "\n", - " # 2. 
Accuracy\n", - " acc = accuracy_score(self.y_true, self.y_pred) * 100\n", - "\n", - " print(\"\\n\" + \"=\"*40)\n", - " print(f\"📊 FINAL EVALUATION REPORT\")\n", - " print(\"=\"*40)\n", - " print(f\"⏱️ Total Time: {total_seconds:.2f} seconds\")\n", - " print(f\"⚡ Avg Latency: {avg_per_sample:.2f} seconds/query\")\n", - " print(f\"🎯 Final Accuracy: {acc:.2f}%\")\n", - " print(\"-\" * 40)\n", - "\n", - " # 3. Prediction Summary (Counts)\n", - " df = pd.DataFrame({'Ground Truth': self.y_true, 'Prediction': self.y_pred})\n", - " print(\"\\n📋 Prediction Distribution:\")\n", - " print(df['Prediction'].value_counts())\n", - "\n", - " # 4. Classification Report\n", - " print(\"\\n📈 Detailed Classification Report:\")\n", - " # We specify labels to ensure all classes show up even if count is 0\n", - " labels = ['yes', 'no', 'maybe']\n", - " print(classification_report(self.y_true, self.y_pred, labels=labels, zero_division=0))\n", - "\n", - " # 5. Confusion Matrix Visualization\n", - " cm = confusion_matrix(self.y_true, self.y_pred, labels=labels)\n", - "\n", - " plt.figure(figsize=(8, 6))\n", - " sns.set(font_scale=1.2)\n", - " sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',\n", - " xticklabels=labels, yticklabels=labels)\n", - " plt.xlabel('Predicted Label')\n", - " plt.ylabel('True Label')\n", - " plt.title('Confusion Matrix: PubMedQA Evaluation')\n", - " plt.show()" - ], - "metadata": { - "id": "fIPqChKGR_uN" - }, - "execution_count": 6, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# @title\n", - "# --- 5. MAIN EXECUTION (MERGED) ---\n", - "if __name__ == \"__main__\":\n", - "\n", - " # 1. Start Server (Background)\n", - " print(\"🚀 [Ollama] Ensuring server is running...\")\n", - " subprocess.Popen([\"ollama\", \"serve\"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)\n", - " time.sleep(3) # Give it a moment to spin up\n", - "\n", - " # 2. 
Auto-Pull Model\n", - " check_and_pull_model(\"deepseek-r1:8b\")\n", - " rag = RobustGraphRAG(ARANGO_CONFIG)\n", - " metrics = AdvancedEvaluator()\n", - "\n", - " # 3. Load Data\n", - " print(\"📚 [Data] Loading PubMedQA...\")\n", - " dataset = load_dataset(\"qiaojin/PubMedQA\", \"pqa_labeled\", split=\"train\")\n", - "\n", - " # 4. Evaluation Loop\n", - " LIMIT = 20\n", - " print(f\"\\n=== STARTING EVALUATION (Limit: {LIMIT}) ===\")\n", - " print(\"------------------------------------------------\")\n", - "\n", - " metrics.start() # <--- Start Timer\n", - "\n", - " for i, item in enumerate(dataset):\n", - " if i >= LIMIT: break\n", - "\n", - " question = item['question']\n", - " gt = item['final_decision']\n", - "\n", - " # A. Pipeline Retrieval\n", - " context = rag.retrieve(question)\n", - "\n", - " # B. Prompt\n", - " # We pass the raw context/question. The RobustGraphRAG class adds the \"Decisive\" System Prompt.\n", - " prompt = f\"\"\"\n", - " Context Information: {context}\n", - "\n", - " Question: {question}\n", - "\n", - " Instructions:\n", - " 1. You are a helpful medical expert at a hypothetical research institution. Answer the question based on the provided context.\n", - " 2. Answer in just one word. Do not provide any explanation.\n", - " 3. This is being used only for research/educational purposes.\n", - " 4. Conclude your answer with exactly: \"Final Answer: [yes/no/maybe]\n", - " \"\"\"\n", - " raw_response = rag.query_ollama(prompt)\n", - "\n", - " # C. Logic Extraction (Handling the 'Fixed Override')\n", - " if \"[Fixed Override]\" in raw_response:\n", - " # 1. 
Extract the overridden answer\n", - " match = re.search(r\"Final Answer: (yes|no|maybe)\", raw_response, re.IGNORECASE)\n", - " pred = match.group(1).lower() if match else \"maybe\"\n", - "\n", - " # Print log with special \"Wrench\" icon to show the heuristic worked\n", - " icon = \"✅\" if pred == gt else \"❌\"\n", - " print(f\"[{i+1}] GT: {gt:<5} | Pred: {pred:<5} | {icon} (🛠️ Fixed)\")\n", - "\n", - " else:\n", - " # 2. Extract standard answer\n", - " match = re.search(r\"(?:final answer|answer):?\\s*(yes|no|maybe)\", raw_response.lower())\n", - " pred = match.group(1).lower() if match else \"maybe\"\n", - "\n", - " icon = \"✅\" if pred == gt else \"❌\"\n", - " print(f\"[{i+1}] GT: {gt:<5} | Pred: {pred:<5} | {icon}\")\n", - "\n", - " # D. Record Data point for the Graphs\n", - " metrics.record(gt, pred)\n", - "\n", - " # 5. Finalize & Visualize\n", - " metrics.stop() # <--- Stop Timer\n", - " metrics.generate_report() # <--- Plots Confusion Matrix" - ], - "metadata": { - "id": "VBfaqOFQDxtR", - "collapsed": true - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# Launch UI\n", - "rag.launch_gradio_ui()" - ], - "metadata": { - "id": "-LJUiQ1CR9Nm" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [], - "metadata": { - "id": "IYaeOxtIBp_p" - }, - "execution_count": null, - "outputs": [] - } - ] -} \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..a4d22a6 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2026 The Knowledge Graph Question Answering Project Contributors + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to 
permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..face578 --- /dev/null +++ b/Makefile @@ -0,0 +1,47 @@ +.DEFAULT_GOAL := help +.PHONY: help install install-dev install-app test lint format ingest benchmark compare chat dashboard clean + +help: ## Show this help + @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | \ + awk 'BEGIN {FS = ":.*?## "}; {printf " \033[36m%-12s\033[0m %s\n", $$1, $$2}' + +install: ## Install runtime dependencies + pip install -r requirements.txt + +install-dev: ## Install dev dependencies (tests + lint) + pip install -r requirements-dev.txt + +install-app: ## Install UI dependencies (gradio + streamlit) + pip install -r requirements-app.txt + +test: ## Run the test suite + pytest + +lint: ## Lint with ruff + ruff check src scripts tests app + +format: ## Auto-fix lint issues with ruff + ruff check --fix src scripts tests app + +ingest: ## Build the ArangoDB knowledge graph (needs ARANGO_PASS) + python scripts/ingest.py + +benchmark: ## Run all four arms (needs ARANGO_PASS + Ollama) + @for arm in plain plain_rr graph graph_concepts; do \ + echo "===== $$arm ====="; \ + python scripts/run_benchmark.py --arm $$arm --n 200; \ + done + +compare: ## Aggregate results into table, McNemar tests, and figure + python scripts/compare.py + 
+chat: ## Launch the Gradio chat demo (needs ArangoDB + Ollama) + python app/chat_app.py + +dashboard: ## Launch the Streamlit results dashboard + streamlit run app/dashboard.py + +clean: ## Remove caches and generated vector cache + rm -rf .pytest_cache .ruff_cache *.egg-info src/*.egg-info \ + pubmed_vectors_cache.pkl + find . -type d -name __pycache__ -exec rm -rf {} + diff --git a/Plain_RAG/Plain_RAG.ipynb b/Plain_RAG/Plain_RAG.ipynb deleted file mode 100644 index d35c42c..0000000 --- a/Plain_RAG/Plain_RAG.ipynb +++ /dev/null @@ -1,8831 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "SwCv3__NIaTo", - "outputId": "ad7bf0ed-b938-45a8-a266-97cd05508255" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m59.4/59.4 MB\u001b[0m \u001b[31m33.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h" - ] - } - ], - "source": [ - "!pip install -q sentence-transformers datasets transformers accelerate bitsandbytes" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "e24TcF9NZ2gO", - "outputId": "071b2bd1-ed2d-403b-b0d6-a110f624fd0e" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: matplotlib in /usr/local/lib/python3.12/dist-packages (3.10.0)\n", - "Requirement already satisfied: seaborn in /usr/local/lib/python3.12/dist-packages (0.13.2)\n", - "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (1.3.3)\n", - "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (0.12.1)\n", - "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.12/dist-packages 
(from matplotlib) (4.60.1)\n", - "Requirement already satisfied: kiwisolver>=1.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (1.4.9)\n", - "Requirement already satisfied: numpy>=1.23 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (2.0.2)\n", - "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (25.0)\n", - "Requirement already satisfied: pillow>=8 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (11.3.0)\n", - "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (3.2.5)\n", - "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (2.9.0.post0)\n", - "Requirement already satisfied: pandas>=1.2 in /usr/local/lib/python3.12/dist-packages (from seaborn) (2.2.2)\n", - "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.12/dist-packages (from pandas>=1.2->seaborn) (2025.2)\n", - "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.12/dist-packages (from pandas>=1.2->seaborn) (2025.2)\n", - "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.12/dist-packages (from python-dateutil>=2.7->matplotlib) (1.17.0)\n" - ] - } - ], - "source": [ - "!pip install matplotlib seaborn" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "RORhfl85J00q", - "outputId": "1b0c3ab1-d493-4d0d-a58a-9d165875c2d7" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Collecting faiss-gpu-cu12\n", - " Downloading faiss_gpu_cu12-1.13.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)\n", - "Requirement already satisfied: numpy<3,>=2 in /usr/local/lib/python3.12/dist-packages (from faiss-gpu-cu12) (2.0.2)\n", - "Requirement already satisfied: packaging in 
/usr/local/lib/python3.12/dist-packages (from faiss-gpu-cu12) (25.0)\n", - "Requirement already satisfied: nvidia-cuda-runtime-cu12>=12.1.105 in /usr/local/lib/python3.12/dist-packages (from faiss-gpu-cu12) (12.6.77)\n", - "Requirement already satisfied: nvidia-cublas-cu12>=12.1.3.1 in /usr/local/lib/python3.12/dist-packages (from faiss-gpu-cu12) (12.6.4.1)\n", - "Downloading faiss_gpu_cu12-1.13.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (48.3 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m48.3/48.3 MB\u001b[0m \u001b[31m30.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hInstalling collected packages: faiss-gpu-cu12\n", - "Successfully installed faiss-gpu-cu12-1.13.0\n" - ] - } - ], - "source": [ - "!pip install faiss-gpu-cu12" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "pVq9qIBETknV" - }, - "outputs": [], - "source": [ - "import os\n", - "import torch\n", - "import faiss\n", - "import numpy as np\n", - "import time\n", - "import pickle\n", - "import gradio as gr\n", - "from tqdm.notebook import tqdm\n", - "from datasets import load_dataset\n", - "from sentence_transformers import SentenceTransformer\n", - "from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig\n", - "from sklearn.metrics import accuracy_score, classification_report\n", - "import matplotlib.pyplot as plt\n", - "import seaborn as sns\n", - "from sklearn.metrics import confusion_matrix\n", - "import pandas as pd" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "X_tuVvKdTohM" - }, - "outputs": [], - "source": [ - "EMBEDDING_MODEL_NAME = \"sentence-transformers/all-mpnet-base-v2\"\n", - "LLM_MODEL_NAME = \"deepseek-ai/DeepSeek-R1-Distill-Llama-8B\"\n", - "INDEX_TYPE = \"IndexFlatIP\" # Inner Product (Cosine Similarity)\n", - "BATCH_SIZE = 128\n", - "TOP_K_RETRIEVAL = 3\n", - "INDEX_FILE = \"pubmed_rag_index.bin\"\n", 
- "DATA_FILE = \"pubmed_rag_data.pkl\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "nOAJa-drTsrp" - }, - "outputs": [], - "source": [ - "class PubMedRAG:\n", - " def __init__(self):\n", - " self.device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", - " print(f\"Initializing RAG Pipeline on {self.device}\")\n", - "\n", - " # 1. Load Embedding Model\n", - " print(f\"Loading Embedding Model: {EMBEDDING_MODEL_NAME}\")\n", - " self.embedder = SentenceTransformer(EMBEDDING_MODEL_NAME, device=self.device)\n", - " self.embedding_dim = self.embedder.get_sentence_embedding_dimension()\n", - "\n", - " # 2. Load LLM (4-bit quantized)\n", - " print(f\"Loading LLM: {LLM_MODEL_NAME}\")\n", - " bnb_config = BitsAndBytesConfig(\n", - " load_in_4bit=True,\n", - " bnb_4bit_compute_dtype=torch.float16,\n", - " bnb_4bit_quant_type=\"nf4\",\n", - " )\n", - " self.tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME)\n", - " self.llm = AutoModelForCausalLM.from_pretrained(\n", - " LLM_MODEL_NAME,\n", - " quantization_config=bnb_config,\n", - " device_map=\"auto\"\n", - " )\n", - "\n", - " # 3. Initialize placeholders\n", - " self.index = None\n", - " self.documents = []\n", - " self.labeled_data = []\n", - "\n", - " def load_and_index_data(self):\n", - " \"\"\"Loads data. 
Tries to load from disk first; otherwise builds from scratch.\"\"\"\n", - "\n", - " # --- OPTION A: LOAD FROM DISK ---\n", - " if os.path.exists(INDEX_FILE) and os.path.exists(DATA_FILE):\n", - " print(f\"\\nFound saved index and data on disk!\")\n", - " print(f\" - Loading Index from {INDEX_FILE}\")\n", - " self.index = faiss.read_index(INDEX_FILE)\n", - "\n", - " # Move to GPU if possible\n", - " if self.device == \"cuda\" and hasattr(faiss, \"StandardGpuResources\"):\n", - " try:\n", - " res = faiss.StandardGpuResources()\n", - " self.index = faiss.index_cpu_to_gpu(res, 0, self.index)\n", - " print(\" Index moved to GPU.\")\n", - " except Exception as e:\n", - " print(f\" GPU move failed ({e}), keeping on CPU.\")\n", - "\n", - " print(f\" - Loading Data from {DATA_FILE}...\")\n", - " with open(DATA_FILE, \"rb\") as f:\n", - " saved_data = pickle.load(f)\n", - " self.documents = saved_data[\"documents\"]\n", - " self.labeled_data = saved_data[\"labeled_data\"]\n", - " print(\"State restored.\")\n", - " return\n", - "\n", - " # --- OPTION B: BUILD FROM SCRATCH ---\n", - " print(\"\\nLoading Datasets from Hugging Face\")\n", - " ds_labeled = load_dataset(\"qiaojin/PubmedQA\", \"pqa_labeled\", split=\"train\")\n", - " ds_unlabeled = load_dataset(\"qiaojin/PubmedQA\", \"pqa_unlabeled\", split=\"train\")\n", - " ds_artificial = load_dataset(\"qiaojin/PubmedQA\", \"pqa_artificial\", split=\"train\")\n", - "\n", - " print(f\" - Labeled: {len(ds_labeled)}\")\n", - " print(f\" - Unlabeled: {len(ds_unlabeled)}\")\n", - " print(f\" - Artificial: {len(ds_artificial)}\")\n", - "\n", - " def process_split(dataset, split_name):\n", - " docs = []\n", - " for item in tqdm(dataset, desc=f\"Processing {split_name}\"):\n", - " full_text = \" \".join(item['context']['contexts'])\n", - " question_text = item.get('question', \"\")\n", - " if not question_text and split_name == \"labeled\":\n", - " question_text = item.get('question', \"No Question Found\")\n", - "\n", - " 
docs.append({\n", - " \"text\": full_text,\n", - " \"pubid\": item['pubid'],\n", - " \"question\": question_text,\n", - " \"final_decision\": item.get('final_decision', None)\n", - " })\n", - " return docs\n", - "\n", - " self.labeled_data = process_split(ds_labeled, \"labeled\")\n", - " all_docs = []\n", - " all_docs.extend(self.labeled_data)\n", - " all_docs.extend(process_split(ds_unlabeled, \"unlabeled\"))\n", - " all_docs.extend(process_split(ds_artificial, \"artificial\"))\n", - "\n", - " self.documents = all_docs\n", - " print(f\"Total Documents: {len(self.documents)}\")\n", - "\n", - " print(\"\\nGenerating Embeddings\")\n", - " texts = [d['text'] for d in self.documents]\n", - " embeddings = self.embedder.encode(\n", - " texts,\n", - " batch_size=BATCH_SIZE,\n", - " show_progress_bar=True,\n", - " convert_to_numpy=True,\n", - " normalize_embeddings=True\n", - " )\n", - "\n", - " print(f\"\\nBuilding FAISS {INDEX_TYPE} Index\")\n", - " index_flat = faiss.IndexFlatIP(self.embedding_dim)\n", - " index_flat.add(embeddings)\n", - "\n", - " # Save to disk\n", - " print(\"Saving to disk for future runs\")\n", - " faiss.write_index(index_flat, INDEX_FILE)\n", - " with open(DATA_FILE, \"wb\") as f:\n", - " pickle.dump({\"documents\": self.documents, \"labeled_data\": self.labeled_data}, f)\n", - "\n", - " # Enable GPU\n", - " if self.device == \"cuda\" and hasattr(faiss, \"StandardGpuResources\"):\n", - " try:\n", - " res = faiss.StandardGpuResources()\n", - " self.index = faiss.index_cpu_to_gpu(res, 0, index_flat)\n", - " except:\n", - " self.index = index_flat\n", - " else:\n", - " self.index = index_flat\n", - "\n", - " def retrieve(self, query, k=TOP_K_RETRIEVAL):\n", - " query_vec = self.embedder.encode([query], convert_to_numpy=True, normalize_embeddings=True)\n", - " distances, indices = self.index.search(query_vec, k)\n", - " results = []\n", - " for i, idx in enumerate(indices[0]):\n", - " if idx != -1:\n", - " results.append(self.documents[idx])\n", - " 
return results\n", - "\n", - " def generate_response(self, query, retrieved_docs, mode=\"detailed\"):\n", - " context_text = \"\\n\\n\".join([f\"Abstract {i+1}: {doc['text']}\" for i, doc in enumerate(retrieved_docs)])\n", - "\n", - " # 1. Define System & User Prompts\n", - " if mode == \"benchmark\":\n", - " sys_msg = (\n", - " \"Answer in just one word based on the given context.Do not provide any explanation. You final answer should be one of 3 words: yes, no, maybe\"\n", - " )\n", - " temp = 0.6\n", - " max_tokens = 2048\n", - " rep_penalty = 1.1\n", - "\n", - " else:\n", - " sys_msg = (\n", - " \"You are a helpful medical assistant. Answer the user's question based on the provided medical abstracts. \"\n", - " \"Cite the abstracts by number if necessary. Be concise.\"\n", - " )\n", - " temp = 0.6\n", - " max_tokens = 1024\n", - " rep_penalty = 1.1\n", - "\n", - " # 2. Create Chat Structure (Standard for Llama/DeepSeek)\n", - " messages = [\n", - " {\"role\": \"system\", \"content\": sys_msg},\n", - " {\"role\": \"user\", \"content\": f\"Contexts:\\n{context_text}\\n\\nQuestion: {query}\"}\n", - " ]\n", - "\n", - " # 3. Apply Chat Template (Handles special tokens like <|begin_of_text|>)\n", - " inputs = self.tokenizer.apply_chat_template(\n", - " messages,\n", - " tokenize=True,\n", - " add_generation_prompt=True,\n", - " return_tensors=\"pt\"\n", - " ).to(self.device)\n", - "\n", - " # 4. Generate\n", - " outputs = self.llm.generate(\n", - " inputs,\n", - " max_new_tokens=max_tokens,\n", - " temperature=temp,\n", - " top_p=1.0,\n", - " do_sample=False if mode==\"benchmark\" else True,\n", - " repetition_penalty=rep_penalty,\n", - ")\n", - "\n", - " # 5. Decode ONLY the new tokens (Slice off the prompt)\n", - " # This removes the need to manually split \"Question:...\" from the output\n", - " generated_tokens = outputs[0][len(inputs[0]):]\n", - " response = self.tokenizer.decode(generated_tokens, skip_special_tokens=False)\n", - "\n", - " # 6. 
Clean up DeepSeek tags\n", - " if \"\" in response:\n", - " response = response.split(\"\")[-1].strip()\n", - " #else:\n", - " # Fallback: specific regex if tags are still missing but reasoning is evident\n", - " #response = re.sub(r'.*?', '', response, flags=re.DOTALL).strip()\n", - "\n", - " # 7. Clean up \"Answer:\" prefix if present\n", - " if response.startswith(\"Answer:\"):\n", - " response = response[7:].strip()\n", - "\n", - " # 8. Final clean of EOS tokens\n", - " response = response.replace(\"\", \"\").replace(\"<|end_of_text|>\", \"\").replace(\"<|end_of_sentence|>\", \"\").replace(\"<|end▁of▁sentence|>\", \"\").strip()\n", - "\n", - " return response\n", - "\n", - "\n", - " def run_benchmark(self, sample_size=50):\n", - "\n", - " print(f\"\\nSTARTING BENCHMARK (Sample Size: {sample_size})...\")\n", - "\n", - " # Slice the test set\n", - " test_set = self.labeled_data[:sample_size]\n", - " y_true = []\n", - " y_pred = []\n", - "\n", - " start_time = time.time()\n", - "\n", - " for item in tqdm(test_set, desc=\"Benchmarking\"):\n", - " # Safety check for data integrity\n", - " question = item.get('question')\n", - " ground_truth = item.get('final_decision')\n", - "\n", - " if not question or not ground_truth:\n", - " continue\n", - "\n", - " # Retrieve and Generate\n", - " retrieved = self.retrieve(question, k=TOP_K_RETRIEVAL)\n", - " response = self.generate_response(question, retrieved, mode=\"benchmark\")\n", - "\n", - " # Normalize prediction\n", - " pred_lower = response.lower()\n", - " prediction = \"maybe\"\n", - " if \"yes\" in pred_lower:\n", - " prediction = \"yes\"\n", - " elif \"no\" in pred_lower:\n", - " prediction = \"no\"\n", - "\n", - " y_true.append(ground_truth)\n", - " y_pred.append(prediction)\n", - "\n", - " duration = time.time() - start_time\n", - "\n", - " print(\"\\n\" + \"=\"*50)\n", - " print(\"BENCHMARK RESULTS\")\n", - " print(\"=\"*50)\n", - " print(f\"Time taken: {duration:.2f}s\")\n", - " print(f\"Accuracy: 
{accuracy_score(y_true, y_pred):.2%}\")\n", - "\n", - " # 1. Classification Report (Existing)\n", - " labels = [\"yes\", \"no\", \"maybe\"]\n", - " print(\"\\n--- Classification Report ---\")\n", - " print(classification_report(y_true, y_pred, labels=labels, zero_division=0))\n", - "\n", - " # 2. Prediction Counts (New)\n", - " print(\"\\n--- Prediction Summary ---\")\n", - " pred_counts = pd.Series(y_pred).value_counts().reindex(labels, fill_value=0)\n", - " print(pred_counts)\n", - "\n", - " # 3. Confusion Matrix (New)\n", - " cm = confusion_matrix(y_true, y_pred, labels=labels)\n", - "\n", - " # Plotting the Matrix\n", - " plt.figure(figsize=(8, 6))\n", - " sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',\n", - " xticklabels=labels, yticklabels=labels)\n", - " plt.xlabel('Predicted')\n", - " plt.ylabel('Actual')\n", - " plt.title('Confusion Matrix')\n", - " plt.show()\n", - "\n", - " def launch_gradio_ui(self):\n", - " \"\"\"Launches the Gradio Chat Interface.\"\"\"\n", - " print(\"\\nLaunching Gradio UI\")\n", - "\n", - " def chat_logic(message, history):\n", - " # We ignore history for single-turn RAG to keep context clean and fast\n", - " retrieved = self.retrieve(message)\n", - " response = self.generate_response(message, retrieved, mode=\"interactive\")\n", - " return response\n", - "\n", - " demo = gr.ChatInterface(\n", - " fn=chat_logic,\n", - " title=\"PubMed Medical AI Assistant\",\n", - " description=\"Ask detailed medical questions. 
The AI retrieves relevant abstracts from the PubMedQA dataset to generate answers.\",\n", - " examples=[\n", - " \"Do preoperative statins reduce atrial fibrillation?\",\n", - " \"Is Hirschsprung disease a mendelian or a multifactorial disorder?\",\n", - " \"Does high-dose aspirin prevent cardiovascular events?\"\n", - " ],\n", - " theme=\"soft\"\n", - " )\n", - "\n", - " # share=True creates a public link\n", - " demo.launch(share=True, debug=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "rITTxoHN7cfw", - "outputId": "14a279a4-8c76-4bfd-ef9c-432d89212b79" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "GPU Memory Cleared. Current allocated: 0.00 MB\n" - ] - } - ], - "source": [ - "import gc\n", - "if 'rag_system' in globals():\n", - " del rag_system\n", - " print(\"Deleted rag_system object.\")\n", - "\n", - "if 'app' in globals():\n", - " del app\n", - " print(\"Deleted app object.\")\n", - "\n", - "# 2. Run Garbage Collector\n", - "gc.collect()\n", - "if torch.cuda.is_available():\n", - " torch.cuda.empty_cache()\n", - " torch.cuda.ipc_collect() # Clear IPC memory if using multiprocessing\n", - " print(f\"GPU Memory Cleared. 
Current allocated: {torch.cuda.memory_allocated() / 1024**2:.2f} MB\")\n", - "else:\n", - " print(\"No GPU detected.\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 833, - "referenced_widgets": [ - "4a0ea4434e584a36bdafbd77fdf915d4", - "9ffe20e0f2124c14b7c12ce8daf7c4b5", - "ea06f0dd061c4e3a96240433969ed793", - "c527b5e31da3403fb57d9fd4b2b9b191", - "76f4bb7b9b104810985282bd16842650", - "afa27f83547045aca2c4e8e92fe49878", - "871272ea241147bb87feeaaf793e1d9e", - "482e5a82919f4a1ab8168b0006b2d979", - "8ac89f580fb8427697cac9fad7ee1693", - "5dd09f0c273647bd89678aa2fe32f6df", - "e4ea1f9ea6a343ef995c2eface0efd13", - "c2c50ced649b428097bffe04f97a137f", - "38b4097af230467a81ee65454f907eb5", - "0b6e7094f94f4cb481e772bd5443a025", - "1d4b2d25edcd4207bce2972e824e815d", - "60574c59d6da4bfa92b4b93884d2b5d8", - "ab64037914e04ee7adb2877784271790", - "feeb9cc484ee467b851fb576af521adf", - "a4ba3264f68348eb9650d38fd1668a26", - "421cda2009074fcd9c5d4a81109b67ce", - "e2a80bf2c52d4a0c98731e3de6a24f22", - "b06f97d725d14843a55ff2442af9a97e", - "c2536ef6eb4341e3b54cf473ae498ca9", - "92a456546b7b493d8321c24a62a6b551", - "ea161cf843184232bb844545176cffb0", - "9a80ee6fd91a401dac1652acd15d44f5", - "3bd2d7fb9da04416b2bcca81d4d15bd5", - "c9d5bcb10e674784a570b625a59f12cf", - "e88678951fc2460ea8241edba800a2c0", - "bdf14d4a82c2425d8ec0c9a2acc63055", - "603d475c14724766b576206c2ec53a45", - "7f5af4598bb24c0b992424624f6dfb6d", - "74af2c01c93847bdb011d0368acf44f7", - "b316a24c01d341ca878ebc93bb6db8ad", - "77232511fed041feaeefc974aceee07f", - "1d67cff04e32401da2213ad82228b20d", - "774add6aa42e4884809cdf435ce39068", - "0f56e7ca761d45c5ab1345df35e81f9e", - "d0c8f23c7434465ab284741201956266", - "13a434a2a4624e6192ca51ae7e394667", - "fac8b597f40c43a29f69ce1e6f3a77f9", - "aad85d3f773545799449c1a6080e7302", - "a0d0c70f3f11437f9c99619436e7f077", - "f6821378164c4c2eb9458d1458ca58e1", - 
"e34578ea4acd4623ba900470212faa90", - "9eddcbf6d4cc4f7ca5ba92e7d2dfd6db", - "2b77339b62684a4785226247f3e5e85d", - "29fe6407ea524db591c8f3579a4f8410", - "e07c41df4bf447acae4a58ae3b0c0d73", - "61ff82cf14374a28a23b941d965027eb", - "fa92d691af6946568131ab68afffbf00", - "efde71ea0b7440acbb9f45fdb5678264", - "196df619acff47b4a6ec9f9ce1710dd1", - "41f98a688b284ab4b6ffa93c1703abf4", - "94471425ae534a29a6d3d86a79312039", - "1d7ed0749dae4c81bad40d13c43d7629", - "439dc3fac7864363a295684ae40dbb35", - "5fad1f69c31243bebf49515b52540989", - "d031a5cdcf8d4a3aa030c22f9bbb423b", - "4867ed9d8d294c239f589708056ed572", - "17ea216363d94349b98f267554d7aeae", - "035286955a1b4a83957eb326f6bb0bee", - "3dd72d49bb3548fca32d19eded946c2d", - "7d3ad593a3314885a4f213e4c89da0c4", - "7254e9e3c0a24d0eaa7908b8018e3f83", - "28a9db2641bf474b84eb193e3a5143be", - "6ac6e1708632488b9af0bbefb6542a02", - "e847e23b3dc64c3684d713e36a2d43e5", - "bb112f11bd27452fadf6e17f6f5f5022", - "81d8ade37ad944c2bf20d5d2bec8e94c", - "b4d7798f0f9e4d68bb10104c227c8236", - "45041ac463164980bb6437335bcf7edf", - "4696a263556c417489660ea5c6240551", - "f41affd88fb944ed8ac6cc632d2d7edc", - "a423c1bf1c4f47dcb3ca275f27f4b23b", - "dfdac9cdd0c546429604490fdecb22a7", - "db2458bb9a0a49d8b087a2b1ebd0864d", - "38dcc7d3f5c142c68e7d5c5f6b86087d", - "f5be3e18dfc24499ae25a2ddb496ae02", - "cd046088208841c0be5b5c78203b00b8", - "7d874d8f7e744ce191f0cdae0798e707", - "eb6ee46fe93a46489385ae7ee409665d", - "a10211fa0818443f89043174a09bee1a", - "c0765fdc3eb143d6880ff4e00141ce14", - "d5f205776c3f42ccaa9542f715b85748", - "b19cf7b7c453462ea0afb26e12f404f2", - "89f29ab2c0f84bdf812d34265d0319d2", - "889fb0ea3dc3479ba47569301abf1095", - "1ef175665a774f44896a1f938d318c3d", - "7d74b694cba0407f992bce853ce926a3", - "a15d8fd61eba4842b6be4c015cca167d", - "0e048e0c3d8045309f54e9da505dc91b", - "bffaec4d3d254433b55a82dec359bf2c", - "d5bc725f098447a6920aee7ff3e00a97", - "6583e5aadfca452c93f8f87ec2f97b32", - "94a70ea4f9f94d7dbfbda2f820bf7016", - 
"66d531e4e851439e9b66b7a9b2d285b1", - "49b1777da63c42e69be269a810493345", - "aa365a31e1964c5bb4298a227b23d66c", - "6c52dc5d7772478eac56b01ea54c8e19", - "6a88eb28c414476c86ca250e5df03db6", - "54b693f1fd55413e825bcf420fa43e6a", - "1feef1a0e35c475e964d9e598cf1b4c5", - "e5a8f5ebb26843cf92150b2f817b2dad", - "2fba292dcd994df7bfeb0e2bdf672b5d", - "b54500bcbde249ffa53ccd68c2105b7a", - "0206f0d4ca17415f8691d3125d380aff", - "0b57c423eedc41a287fefe85e1f63d7c", - "927fff0357d7428480d2bd936b21ae6f", - "8e48b73fd5c648eb9609a8aa9830d12f", - "1405d35083dc4c11a9ad396085c97a86", - "f73b129ccce84d9f8b09c85ea7e13a5d", - "0b2f7c9999d84d49afa93b78e67ef2fa", - "9951e4c31cf445049d9f14280c1b063e", - "937d0c8d494d40c98927ea97d39933a9", - "599727a146204539892e35143c16557b", - "18b23622d00442ca9e977b587b0c7399", - "3633835bfdc24858bf09a23ddb01a75c", - "a917cc0882cf4354bd0d0ca37b0f5e62", - "a6be0858867147f6b1c64b626d583a2c", - "890f5d59c2944577921d3f31529aeaf9", - "bb412ac42ad64c30934b9594bfad392b", - "cac61cb1e1d7406e9dc97aa868525dfe", - "b38d78293de747829603e05788a7482c", - "538567e28b9942808b2d7197c156fdb1", - "9a02b190859e49efaeb6d51605eee3b2", - "428e6cbcb8bc40bcb1b414667bb76b2a", - "3ea8fd43ac404f9bb6981c2a90ff1e89", - "a366ef49fb9e4e068c7206fa5de9a4d3", - "6824c16467c644af923b0f11ac97ddb7", - "638a3ecec14d45f399461d6b38ee81fd", - "c8a3b8286e3b4027b67758fdf25c2e35", - "9aee096b2c81404bb99d96fe337cee8f", - "248e6c55a0284f4c857aad39c860ec75", - "efba2394d02944e5927907b4bfc3ccee", - "0462deedee3648ae8aaa3ca3f2f1ef3a", - "8c4ba041dafb4b429a4f23a77efb4746", - "c0a319544cd34d56bfd4006624a97e4d", - "caea2c391cab4442a9ea8361cc7f8155", - "3a0263be005f445fbfcd5ad0c9519d91", - "0d437a74f6644ef3a5fe13a4e9d4b9e8", - "c4b1153f8e0e4f3aa43d38bd0fce633a", - "833dd5db9c8641fb8297fa3761d8aedf", - "6dfc2bf4e9424660a65a5eed380280f3", - "3fe726c210d24f67a6d07d4446a852e6", - "9efa8add1b4e4b79a542b40d4d24e613", - "dd9daff9f9f94764aeb9234e72ea6249", - "2c01b612ca0f4e259dcf092f49186446", - 
"db060cbdb85a4aa480a98c3784b95029", - "b62fe168ee7f4be4958f377409753e2a", - "2239b2fa9dc64255ac7dec5ce5228856", - "c564890b26554bd89876286934cc622e", - "4a8d56c5081342a1a5ea977f0e5e4903", - "1368e371c0e445e69601acb25f90e5a9", - "966b2b6f86dc49e7a579b500bef5b031", - "6925e5ccc9be44f6bc8477a974e75478", - "f8d10d68c78048e5ab8fc3caf3df6643", - "34a668472a424843bf5793ddcb449ec1", - "e2a296c76c4845fbba8070a639deddbd", - "405dd53da4354822a2a2c504a4def91d", - "cdd057a3ca3b4a18a9eabd1e267f4924", - "ccec33fd75964b30a96ce2e2a1e9815f", - "07cd5bdb960f4508b89a8367222ac494", - "a7150866ea154d5b856961cb6057b36a", - "257493a319dc4612b0f9fed2793913a9", - "146f8df9d3854b5b887412a8f61d66e0", - "986e6f7bc5264864ac8835058e949ad0", - "e30cfabacedb4a17a1f3953535d1712f", - "01c0eaf975ad4a8aa970f967e255981f", - "5e66b50859b04182b3cf604f09444c68", - "40e78669fb1c4c7dbd18601a3f2a3543", - "b8540d0b29c34a9ab6045b7276ed0ba4", - "3efb64c7f88a4db080cdb2255c8469fc", - "9e68a7867b4941109d73f86659f5335a", - "1a8107dc0d53406ba59789b98aff7f9a", - "9c79d2d8bd534bd982681128ff66155a", - "4e7174b5381546b98c0483631316f9e5", - "2f1d706566f146febfb49aa02a4fc7de", - "f5ebdd8a776542d3b7f9b476c0aa5512", - "2ae85a21b46a469cac1f9f7d482de268", - "3f59d1d5e6174d6894661efecdd33ab8", - "5e5995ebed594d9cad5621faa35f77a2", - "89cc23e9cf2847788ec3edf1ba2db8e3", - "665ef92b2901437b87fec93d13b2d4ac", - "b0035682525c443884fc1212f968b320", - "5992d99cf8bc4146983af1d8cd7b265d", - "62519dce485943e1b82fa59e251c1356", - "f6a54fc4dc2847659f547739c35a21b0", - "73e01096faf44ad9b74735371430b492", - "5acbc6e8cb7c4bb486e3db5b55aff134", - "0cc723ccf64a4b03bc743c1a564894b0", - "6fde41413a1a4839973593da2f78ced3", - "6831116bb8884a9480d7f3bfe9973d9a", - "a7e3a11dd3f94e4fb54037ae40aba206", - "3ff040d751764df1bd75e25f24d9942e", - "fb0f1fb808504ea294a729f05b1abcc1", - "0b8590fba60f40f6af3835446752cd37", - "9d32b2d2e5c64bc49f72f2226483cfa9", - "f8a4062f155d4750a30bfbdc5292ddea", - "f40afdca622848cfb153e6dc1c3f8d64", - 
"6d4efff530d54625a97583d6bed2520c", - "22ef9732885147e3983c606a466e0615", - "8a46241fb1a34bd1ac48677447aa2942", - "c6b666ea73f74788af455ec3eb739918", - "25c0858689be446d8403e09f4a61dbbc", - "5dc0cbd7081645b6bee0ac67da226f7f", - "182592a6f90940c488cfb93b88d11de8", - "7116032437674606b9f3966eea6457bd", - "ab3fe113b61548c0bc356114fc0c054e", - "411d3970835f4a40a916329318af5be4", - "79fd48757aac4ad0a24fefd6ea1d305b", - "ae1cb2179d3b4fcb92b168699d9bdabf", - "fad638c32e4a407b97e6b464c8b2070b", - "5503d335b4d649bea4fcb23e41279b0a", - "1c27ab84b04f4473b264c20a8429eba1", - "236f08564a1c434e9d265b9cbd601f0c", - "5c2a29cd2e5d4b1e9ace1b517f80247c", - "3be95f3a95854ba6b8f68ff4b63b6649", - "5dbf531a776c48728899c0c00ab9b8f1", - "c401a44341474936876919f32aa32ca4" - ] - }, - "id": "3teRQj_RZr2l", - "outputId": "19d120d2-89f6-49a6-8c72-52e3ca326485" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Initializing RAG Pipeline on cuda\n", - "Loading Embedding Model: sentence-transformers/all-mpnet-base-v2\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning: \n", - "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", - "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", - "You will be able to reuse this secret in all of your notebooks.\n", - "Please note that authentication is recommended but still optional to access public models or datasets.\n", - " warnings.warn(\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "4a0ea4434e584a36bdafbd77fdf915d4", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "modules.json: 0%| | 0.00/349 [00:00" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# 3. 
Run Benchmark\n", - "rag_system.run_benchmark(sample_size=200)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 854 - }, - "id": "WyXrV0W_Zym1", - "outputId": "fe936e62-a0bb-4b42-abf5-e0cc24ecab29" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Launching Gradio UI\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.12/dist-packages/gradio/chat_interface.py:347: UserWarning: The 'tuples' format for chatbot messages is deprecated and will be removed in a future version of Gradio. Please set type='messages' instead, which uses openai-style 'role' and 'content' keys.\n", - " self.chatbot = Chatbot(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().\n", - "* Running on public URL: https://c9f3b4c1ac2f5412a9.gradio.live\n", - "\n", - "This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)\n" - ] - }, - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", - "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n", - "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", - "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n", - "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", - "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n", - "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", - "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Keyboard interruption in main thread... closing server.\n", - "Killing tunnel 127.0.0.1:7860 <> https://c9f3b4c1ac2f5412a9.gradio.live\n" - ] - } - ], - "source": [ - "# 4. 
Launch UI\n", - "rag_system.launch_gradio_ui()" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "gpuType": "A100", - "machine_shape": "hm", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - }, - "language_info": { - "name": "python" - }, - "widgets": { - "application/vnd.jupyter.widget-state+json": { - "01c0eaf975ad4a8aa970f967e255981f": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_1a8107dc0d53406ba59789b98aff7f9a", - "placeholder": "​", - "style": "IPY_MODEL_9c79d2d8bd534bd982681128ff66155a", - "value": " 2/2 [00:36<00:00, 36.46s/it]" - } - }, - "0206f0d4ca17415f8691d3125d380aff": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - 
"max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "035286955a1b4a83957eb326f6bb0bee": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "0407b8dd4aeb47b5b764bbff04602228": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - 
}, - "0462deedee3648ae8aaa3ca3f2f1ef3a": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_c4b1153f8e0e4f3aa43d38bd0fce633a", - "placeholder": "​", - "style": "IPY_MODEL_833dd5db9c8641fb8297fa3761d8aedf", - "value": " 9.08M/? [00:00<00:00, 151MB/s]" - } - }, - "07cd5bdb960f4508b89a8367222ac494": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "0b2f7c9999d84d49afa93b78e67ef2fa": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_3633835bfdc24858bf09a23ddb01a75c", - "max": 190, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_a917cc0882cf4354bd0d0ca37b0f5e62", - "value": 190 - } - }, - "0b57c423eedc41a287fefe85e1f63d7c": { - "model_module": "@jupyter-widgets/controls", - 
"model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "0b6e7094f94f4cb481e772bd5443a025": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_a4ba3264f68348eb9650d38fd1668a26", - "max": 116, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_421cda2009074fcd9c5d4a81109b67ce", - "value": 116 - } - }, - "0b8590fba60f40f6af3835446752cd37": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - 
"justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "0cc723ccf64a4b03bc743c1a564894b0": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_0b8590fba60f40f6af3835446752cd37", - "placeholder": "​", - "style": "IPY_MODEL_9d32b2d2e5c64bc49f72f2226483cfa9", - "value": " 8.67G/8.67G [00:36<00:00, 518MB/s]" - } - }, - "0d437a74f6644ef3a5fe13a4e9d4b9e8": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "0e048e0c3d8045309f54e9da505dc91b": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - 
"description_tooltip": null, - "layout": "IPY_MODEL_49b1777da63c42e69be269a810493345", - "placeholder": "​", - "style": "IPY_MODEL_aa365a31e1964c5bb4298a227b23d66c", - "value": " 466k/? [00:00<00:00, 40.0MB/s]" - } - }, - "0f56e7ca761d45c5ab1345df35e81f9e": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "1081d141162244228d6d499183cf8739": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_58b03b1be4494a2ab597eee6bcf7ae54", - 
"IPY_MODEL_ce3396be9e2a410699c283ce39bf43d0", - "IPY_MODEL_35fdd3442b744bb18d190720cd3ee675" - ], - "layout": "IPY_MODEL_ed3439b4f7614a2e895aa6b8c68f0dd7" - } - }, - "1368e371c0e445e69601acb25f90e5a9": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "13a434a2a4624e6192ca51ae7e394667": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "1405d35083dc4c11a9ad396085c97a86": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_f73b129ccce84d9f8b09c85ea7e13a5d", - "IPY_MODEL_0b2f7c9999d84d49afa93b78e67ef2fa", - "IPY_MODEL_9951e4c31cf445049d9f14280c1b063e" - ], - "layout": "IPY_MODEL_937d0c8d494d40c98927ea97d39933a9" - } - }, - "146f8df9d3854b5b887412a8f61d66e0": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": 
"@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_986e6f7bc5264864ac8835058e949ad0", - "IPY_MODEL_e30cfabacedb4a17a1f3953535d1712f", - "IPY_MODEL_01c0eaf975ad4a8aa970f967e255981f" - ], - "layout": "IPY_MODEL_5e66b50859b04182b3cf604f09444c68" - } - }, - "17ea216363d94349b98f267554d7aeae": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "182592a6f90940c488cfb93b88d11de8": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": 
"ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "18b23622d00442ca9e977b587b0c7399": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "196df619acff47b4a6ec9f9ce1710dd1": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "1a8107dc0d53406ba59789b98aff7f9a": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": 
null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "1c27ab84b04f4473b264c20a8429eba1": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "1d4b2d25edcd4207bce2972e824e815d": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": 
"@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_e2a80bf2c52d4a0c98731e3de6a24f22", - "placeholder": "​", - "style": "IPY_MODEL_b06f97d725d14843a55ff2442af9a97e", - "value": " 116/116 [00:00<00:00, 15.0kB/s]" - } - }, - "1d67cff04e32401da2213ad82228b20d": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_fac8b597f40c43a29f69ce1e6f3a77f9", - "max": 53, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_aad85d3f773545799449c1a6080e7302", - "value": 53 - } - }, - "1d7ed0749dae4c81bad40d13c43d7629": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_439dc3fac7864363a295684ae40dbb35", - "IPY_MODEL_5fad1f69c31243bebf49515b52540989", - "IPY_MODEL_d031a5cdcf8d4a3aa030c22f9bbb423b" - ], - "layout": "IPY_MODEL_4867ed9d8d294c239f589708056ed572" - } - }, - "1ef175665a774f44896a1f938d318c3d": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - 
"_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_7d74b694cba0407f992bce853ce926a3", - "IPY_MODEL_a15d8fd61eba4842b6be4c015cca167d", - "IPY_MODEL_0e048e0c3d8045309f54e9da505dc91b" - ], - "layout": "IPY_MODEL_bffaec4d3d254433b55a82dec359bf2c" - } - }, - "1feef1a0e35c475e964d9e598cf1b4c5": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_927fff0357d7428480d2bd936b21ae6f", - "placeholder": "​", - "style": "IPY_MODEL_8e48b73fd5c648eb9609a8aa9830d12f", - "value": " 239/239 [00:00<00:00, 26.4kB/s]" - } - }, - "2239b2fa9dc64255ac7dec5ce5228856": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - 
"justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "22ef9732885147e3983c606a466e0615": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_7116032437674606b9f3966eea6457bd", - "placeholder": "​", - "style": "IPY_MODEL_ab3fe113b61548c0bc356114fc0c054e", - "value": " 2/2 [00:17<00:00,  8.55s/it]" - } - }, - "236f08564a1c434e9d265b9cbd601f0c": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "248e6c55a0284f4c857aad39c860ec75": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - 
"layout": "IPY_MODEL_c0a319544cd34d56bfd4006624a97e4d", - "placeholder": "​", - "style": "IPY_MODEL_caea2c391cab4442a9ea8361cc7f8155", - "value": "tokenizer.json: " - } - }, - "257493a319dc4612b0f9fed2793913a9": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "25c0858689be446d8403e09f4a61dbbc": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "28a9db2641bf474b84eb193e3a5143be": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "29fe6407ea524db591c8f3579a4f8410": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": 
"HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_41f98a688b284ab4b6ffa93c1703abf4", - "placeholder": "​", - "style": "IPY_MODEL_94471425ae534a29a6d3d86a79312039", - "value": " 571/571 [00:00<00:00, 54.4kB/s]" - } - }, - "2ae85a21b46a469cac1f9f7d482de268": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_5992d99cf8bc4146983af1d8cd7b265d", - "placeholder": "​", - "style": "IPY_MODEL_62519dce485943e1b82fa59e251c1356", - "value": " 7.39G/7.39G [00:32<00:00, 57.2MB/s]" - } - }, - "2b77339b62684a4785226247f3e5e85d": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_efde71ea0b7440acbb9f45fdb5678264", - "max": 571, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_196df619acff47b4a6ec9f9ce1710dd1", - "value": 571 - } - }, - "2c01b612ca0f4e259dcf092f49186446": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": 
"@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "2f1d706566f146febfb49aa02a4fc7de": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_5e5995ebed594d9cad5621faa35f77a2", - "placeholder": "​", - "style": "IPY_MODEL_89cc23e9cf2847788ec3edf1ba2db8e3", - "value": "model-00002-of-000002.safetensors: 100%" - } - }, - "2fba292dcd994df7bfeb0e2bdf672b5d": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - 
"_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "31a0d00d72494794826148e1f442b848": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "34a668472a424843bf5793ddcb449ec1": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_a7150866ea154d5b856961cb6057b36a", - "placeholder": "​", - "style": "IPY_MODEL_257493a319dc4612b0f9fed2793913a9", - "value": " 24.2k/? 
[00:00<00:00, 2.65MB/s]" - } - }, - "35fdd3442b744bb18d190720cd3ee675": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_8442e31b4f95484c84e74d7f85bf09ef", - "placeholder": "​", - "style": "IPY_MODEL_31a0d00d72494794826148e1f442b848", - "value": " 200/200 [1:15:56<00:00, 18.67s/it]" - } - }, - "3633835bfdc24858bf09a23ddb01a75c": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "38b4097af230467a81ee65454f907eb5": { - 
"model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_ab64037914e04ee7adb2877784271790", - "placeholder": "​", - "style": "IPY_MODEL_feeb9cc484ee467b851fb576af521adf", - "value": "config_sentence_transformers.json: 100%" - } - }, - "38dcc7d3f5c142c68e7d5c5f6b86087d": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_f5be3e18dfc24499ae25a2ddb496ae02", - "IPY_MODEL_cd046088208841c0be5b5c78203b00b8", - "IPY_MODEL_7d874d8f7e744ce191f0cdae0798e707" - ], - "layout": "IPY_MODEL_eb6ee46fe93a46489385ae7ee409665d" - } - }, - "3a0263be005f445fbfcd5ad0c9519d91": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - 
"grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": "20px" - } - }, - "3bd2d7fb9da04416b2bcca81d4d15bd5": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "3be95f3a95854ba6b8f68ff4b63b6649": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - 
"_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "3dd72d49bb3548fca32d19eded946c2d": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "3ea8fd43ac404f9bb6981c2a90ff1e89": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - 
"description_width": "" - } - }, - "3efb64c7f88a4db080cdb2255c8469fc": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "3f59d1d5e6174d6894661efecdd33ab8": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - 
"grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "3fe726c210d24f67a6d07d4446a852e6": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_db060cbdb85a4aa480a98c3784b95029", - "placeholder": "​", - "style": "IPY_MODEL_b62fe168ee7f4be4958f377409753e2a", - "value": "config.json: 100%" - } - }, - "3ff040d751764df1bd75e25f24d9942e": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - 
"grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "405dd53da4354822a2a2c504a4def91d": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "40e78669fb1c4c7dbd18601a3f2a3543": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - 
"_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "411d3970835f4a40a916329318af5be4": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_79fd48757aac4ad0a24fefd6ea1d305b", - "IPY_MODEL_ae1cb2179d3b4fcb92b168699d9bdabf", - "IPY_MODEL_fad638c32e4a407b97e6b464c8b2070b" - ], - "layout": "IPY_MODEL_5503d335b4d649bea4fcb23e41279b0a" - } - }, - "41f98a688b284ab4b6ffa93c1703abf4": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": 
"1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "421cda2009074fcd9c5d4a81109b67ce": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "428e6cbcb8bc40bcb1b414667bb76b2a": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": 
null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "439dc3fac7864363a295684ae40dbb35": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_17ea216363d94349b98f267554d7aeae", - "placeholder": "​", - "style": "IPY_MODEL_035286955a1b4a83957eb326f6bb0bee", - "value": "model.safetensors: 100%" - } - }, - "45041ac463164980bb6437335bcf7edf": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - 
"grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "4696a263556c417489660ea5c6240551": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "482e5a82919f4a1ab8168b0006b2d979": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - 
"min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "4867ed9d8d294c239f589708056ed572": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "49b1777da63c42e69be269a810493345": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": 
null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "4a0ea4434e584a36bdafbd77fdf915d4": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_9ffe20e0f2124c14b7c12ce8daf7c4b5", - "IPY_MODEL_ea06f0dd061c4e3a96240433969ed793", - "IPY_MODEL_c527b5e31da3403fb57d9fd4b2b9b191" - ], - "layout": "IPY_MODEL_76f4bb7b9b104810985282bd16842650" - } - }, - "4a8d56c5081342a1a5ea977f0e5e4903": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - 
"grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "4e7174b5381546b98c0483631316f9e5": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_2f1d706566f146febfb49aa02a4fc7de", - "IPY_MODEL_f5ebdd8a776542d3b7f9b476c0aa5512", - "IPY_MODEL_2ae85a21b46a469cac1f9f7d482de268" - ], - "layout": "IPY_MODEL_3f59d1d5e6174d6894661efecdd33ab8" - } - }, - "538567e28b9942808b2d7197c156fdb1": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_638a3ecec14d45f399461d6b38ee81fd", - "placeholder": "​", - "style": "IPY_MODEL_c8a3b8286e3b4027b67758fdf25c2e35", - "value": " 3.07k/? 
[00:00<00:00, 408kB/s]" - } - }, - "54b693f1fd55413e825bcf420fa43e6a": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_0206f0d4ca17415f8691d3125d380aff", - "max": 239, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_0b57c423eedc41a287fefe85e1f63d7c", - "value": 239 - } - }, - "5503d335b4d649bea4fcb23e41279b0a": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - 
} - }, - "58b03b1be4494a2ab597eee6bcf7ae54": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_0407b8dd4aeb47b5b764bbff04602228", - "placeholder": "​", - "style": "IPY_MODEL_f1387e7b89c543beb5a20cd2cc13b4cc", - "value": "Benchmarking: 100%" - } - }, - "5992d99cf8bc4146983af1d8cd7b265d": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "599727a146204539892e35143c16557b": { - "model_module": "@jupyter-widgets/base", - 
"model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "5acbc6e8cb7c4bb486e3db5b55aff134": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_3ff040d751764df1bd75e25f24d9942e", - "max": 8667826246, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_fb0f1fb808504ea294a729f05b1abcc1", - "value": 8667826246 - } - }, - "5c2a29cd2e5d4b1e9ace1b517f80247c": { - "model_module": "@jupyter-widgets/base", - 
"model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "5dbf531a776c48728899c0c00ab9b8f1": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": 
null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "5dc0cbd7081645b6bee0ac67da226f7f": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "5dd09f0c273647bd89678aa2fe32f6df": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - 
"_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "5e5995ebed594d9cad5621faa35f77a2": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": 
null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "5e66b50859b04182b3cf604f09444c68": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "5fad1f69c31243bebf49515b52540989": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_3dd72d49bb3548fca32d19eded946c2d", - "max": 
437971872, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_7d3ad593a3314885a4f213e4c89da0c4", - "value": 437971872 - } - }, - "603d475c14724766b576206c2ec53a45": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "60574c59d6da4bfa92b4b93884d2b5d8": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "61ff82cf14374a28a23b941d965027eb": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - 
"model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "62519dce485943e1b82fa59e251c1356": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "638a3ecec14d45f399461d6b38ee81fd": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - 
"align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "6583e5aadfca452c93f8f87ec2f97b32": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "665ef92b2901437b87fec93d13b2d4ac": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - 
"grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "66d531e4e851439e9b66b7a9b2d285b1": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "6824c16467c644af923b0f11ac97ddb7": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "6831116bb8884a9480d7f3bfe9973d9a": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - 
"border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "6925e5ccc9be44f6bc8477a974e75478": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_405dd53da4354822a2a2c504a4def91d", - "placeholder": "​", - "style": "IPY_MODEL_cdd057a3ca3b4a18a9eabd1e267f4924", - "value": "model.safetensors.index.json: " - } - }, - "6a88eb28c414476c86ca250e5df03db6": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_2fba292dcd994df7bfeb0e2bdf672b5d", - 
"placeholder": "​", - "style": "IPY_MODEL_b54500bcbde249ffa53ccd68c2105b7a", - "value": "special_tokens_map.json: 100%" - } - }, - "6ac6e1708632488b9af0bbefb6542a02": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_e847e23b3dc64c3684d713e36a2d43e5", - "IPY_MODEL_bb112f11bd27452fadf6e17f6f5f5022", - "IPY_MODEL_81d8ade37ad944c2bf20d5d2bec8e94c" - ], - "layout": "IPY_MODEL_b4d7798f0f9e4d68bb10104c227c8236" - } - }, - "6c52dc5d7772478eac56b01ea54c8e19": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_6a88eb28c414476c86ca250e5df03db6", - "IPY_MODEL_54b693f1fd55413e825bcf420fa43e6a", - "IPY_MODEL_1feef1a0e35c475e964d9e598cf1b4c5" - ], - "layout": "IPY_MODEL_e5a8f5ebb26843cf92150b2f817b2dad" - } - }, - "6d4efff530d54625a97583d6bed2520c": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - 
"description_tooltip": null, - "layout": "IPY_MODEL_5dc0cbd7081645b6bee0ac67da226f7f", - "max": 2, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_182592a6f90940c488cfb93b88d11de8", - "value": 2 - } - }, - "6dfc2bf4e9424660a65a5eed380280f3": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_3fe726c210d24f67a6d07d4446a852e6", - "IPY_MODEL_9efa8add1b4e4b79a542b40d4d24e613", - "IPY_MODEL_dd9daff9f9f94764aeb9234e72ea6249" - ], - "layout": "IPY_MODEL_2c01b612ca0f4e259dcf092f49186446" - } - }, - "6fde41413a1a4839973593da2f78ced3": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": 
null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "7116032437674606b9f3966eea6457bd": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "7254e9e3c0a24d0eaa7908b8018e3f83": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - 
"grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "73e01096faf44ad9b74735371430b492": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_6831116bb8884a9480d7f3bfe9973d9a", - "placeholder": "​", - "style": "IPY_MODEL_a7e3a11dd3f94e4fb54037ae40aba206", - "value": "model-00001-of-000002.safetensors: 100%" - } - }, - "74af2c01c93847bdb011d0368acf44f7": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "76f4bb7b9b104810985282bd16842650": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": 
"@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "77232511fed041feaeefc974aceee07f": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_d0c8f23c7434465ab284741201956266", - "placeholder": "​", - "style": "IPY_MODEL_13a434a2a4624e6192ca51ae7e394667", - "value": "sentence_bert_config.json: 100%" - } - }, - "774add6aa42e4884809cdf435ce39068": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": 
"1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_a0d0c70f3f11437f9c99619436e7f077", - "placeholder": "​", - "style": "IPY_MODEL_f6821378164c4c2eb9458d1458ca58e1", - "value": " 53.0/53.0 [00:00<00:00, 7.44kB/s]" - } - }, - "79fd48757aac4ad0a24fefd6ea1d305b": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_1c27ab84b04f4473b264c20a8429eba1", - "placeholder": "​", - "style": "IPY_MODEL_236f08564a1c434e9d265b9cbd601f0c", - "value": "generation_config.json: 100%" - } - }, - "7d3ad593a3314885a4f213e4c89da0c4": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "7d74b694cba0407f992bce853ce926a3": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": 
"HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_d5bc725f098447a6920aee7ff3e00a97", - "placeholder": "​", - "style": "IPY_MODEL_6583e5aadfca452c93f8f87ec2f97b32", - "value": "tokenizer.json: " - } - }, - "7d874d8f7e744ce191f0cdae0798e707": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_89f29ab2c0f84bdf812d34265d0319d2", - "placeholder": "​", - "style": "IPY_MODEL_889fb0ea3dc3479ba47569301abf1095", - "value": " 232k/? [00:00<00:00, 16.1MB/s]" - } - }, - "7f5af4598bb24c0b992424624f6dfb6d": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - 
"overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "81d8ade37ad944c2bf20d5d2bec8e94c": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_dfdac9cdd0c546429604490fdecb22a7", - "placeholder": "​", - "style": "IPY_MODEL_db2458bb9a0a49d8b087a2b1ebd0864d", - "value": " 363/363 [00:00<00:00, 45.8kB/s]" - } - }, - "833dd5db9c8641fb8297fa3761d8aedf": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "8442e31b4f95484c84e74d7f85bf09ef": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - 
"grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "871272ea241147bb87feeaaf793e1d9e": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "889fb0ea3dc3479ba47569301abf1095": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "890f5d59c2944577921d3f31529aeaf9": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "89cc23e9cf2847788ec3edf1ba2db8e3": { - 
"model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "89f29ab2c0f84bdf812d34265d0319d2": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "8a46241fb1a34bd1ac48677447aa2942": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": 
"@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "8ac89f580fb8427697cac9fad7ee1693": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "8c4ba041dafb4b429a4f23a77efb4746": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, 
- "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "8e48b73fd5c648eb9609a8aa9830d12f": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "927fff0357d7428480d2bd936b21ae6f": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - 
"justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "92a456546b7b493d8321c24a62a6b551": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_c9d5bcb10e674784a570b625a59f12cf", - "placeholder": "​", - "style": "IPY_MODEL_e88678951fc2460ea8241edba800a2c0", - "value": "README.md: " - } - }, - "937d0c8d494d40c98927ea97d39933a9": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, 
- "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "94471425ae534a29a6d3d86a79312039": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "94a70ea4f9f94d7dbfbda2f820bf7016": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": "20px" - } - }, - 
"966b2b6f86dc49e7a579b500bef5b031": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_6925e5ccc9be44f6bc8477a974e75478", - "IPY_MODEL_f8d10d68c78048e5ab8fc3caf3df6643", - "IPY_MODEL_34a668472a424843bf5793ddcb449ec1" - ], - "layout": "IPY_MODEL_e2a296c76c4845fbba8070a639deddbd" - } - }, - "986e6f7bc5264864ac8835058e949ad0": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_40e78669fb1c4c7dbd18601a3f2a3543", - "placeholder": "​", - "style": "IPY_MODEL_b8540d0b29c34a9ab6045b7276ed0ba4", - "value": "Fetching 2 files: 100%" - } - }, - "9951e4c31cf445049d9f14280c1b063e": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_a6be0858867147f6b1c64b626d583a2c", - "placeholder": "​", - "style": "IPY_MODEL_890f5d59c2944577921d3f31529aeaf9", - "value": " 190/190 
[00:00<00:00, 24.8kB/s]" - } - }, - "9a02b190859e49efaeb6d51605eee3b2": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "9a80ee6fd91a401dac1652acd15d44f5": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_7f5af4598bb24c0b992424624f6dfb6d", - "placeholder": "​", - "style": "IPY_MODEL_74af2c01c93847bdb011d0368acf44f7", - "value": " 11.6k/? 
[00:00<00:00, 1.38MB/s]" - } - }, - "9aee096b2c81404bb99d96fe337cee8f": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_248e6c55a0284f4c857aad39c860ec75", - "IPY_MODEL_efba2394d02944e5927907b4bfc3ccee", - "IPY_MODEL_0462deedee3648ae8aaa3ca3f2f1ef3a" - ], - "layout": "IPY_MODEL_8c4ba041dafb4b429a4f23a77efb4746" - } - }, - "9c79d2d8bd534bd982681128ff66155a": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "9d32b2d2e5c64bc49f72f2226483cfa9": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "9e68a7867b4941109d73f86659f5335a": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": 
"@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "9eddcbf6d4cc4f7ca5ba92e7d2dfd6db": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_61ff82cf14374a28a23b941d965027eb", - "placeholder": "​", - "style": "IPY_MODEL_fa92d691af6946568131ab68afffbf00", - "value": "config.json: 100%" - } - }, - "9efa8add1b4e4b79a542b40d4d24e613": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_2239b2fa9dc64255ac7dec5ce5228856", - "max": 826, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_c564890b26554bd89876286934cc622e", - "value": 826 - } - }, - "9ffe20e0f2124c14b7c12ce8daf7c4b5": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - 
"description_tooltip": null, - "layout": "IPY_MODEL_afa27f83547045aca2c4e8e92fe49878", - "placeholder": "​", - "style": "IPY_MODEL_871272ea241147bb87feeaaf793e1d9e", - "value": "modules.json: 100%" - } - }, - "a0d0c70f3f11437f9c99619436e7f077": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "a10211fa0818443f89043174a09bee1a": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - 
"flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "a15d8fd61eba4842b6be4c015cca167d": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_94a70ea4f9f94d7dbfbda2f820bf7016", - "max": 1, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_66d531e4e851439e9b66b7a9b2d285b1", - "value": 1 - } - }, - "a366ef49fb9e4e068c7206fa5de9a4d3": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - 
"grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": "20px" - } - }, - "a423c1bf1c4f47dcb3ca275f27f4b23b": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "a4ba3264f68348eb9650d38fd1668a26": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": 
null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "a6be0858867147f6b1c64b626d583a2c": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "a7150866ea154d5b856961cb6057b36a": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - 
"_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "a7e3a11dd3f94e4fb54037ae40aba206": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "a917cc0882cf4354bd0d0ca37b0f5e62": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "a9a353632bc6456da4c6a6210cf2e7ae": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": 
"ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "aa365a31e1964c5bb4298a227b23d66c": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "aad85d3f773545799449c1a6080e7302": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "ab3fe113b61548c0bc356114fc0c054e": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "ab64037914e04ee7adb2877784271790": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": 
"1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "ae1cb2179d3b4fcb92b168699d9bdabf": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_5c2a29cd2e5d4b1e9ace1b517f80247c", - "max": 181, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_3be95f3a95854ba6b8f68ff4b63b6649", - "value": 181 - } - }, - "afa27f83547045aca2c4e8e92fe49878": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": 
"LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "b0035682525c443884fc1212f968b320": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "b06f97d725d14843a55ff2442af9a97e": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "b19cf7b7c453462ea0afb26e12f404f2": { - "model_module": 
"@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "b316a24c01d341ca878ebc93bb6db8ad": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_77232511fed041feaeefc974aceee07f", - "IPY_MODEL_1d67cff04e32401da2213ad82228b20d", - "IPY_MODEL_774add6aa42e4884809cdf435ce39068" - ], - "layout": "IPY_MODEL_0f56e7ca761d45c5ab1345df35e81f9e" - } - }, - "b38d78293de747829603e05788a7482c": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_a366ef49fb9e4e068c7206fa5de9a4d3", - "max": 1, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_6824c16467c644af923b0f11ac97ddb7", - "value": 1 - } - }, - "b4d7798f0f9e4d68bb10104c227c8236": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - 
"_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "b54500bcbde249ffa53ccd68c2105b7a": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "b62fe168ee7f4be4958f377409753e2a": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - 
"description_width": "" - } - }, - "b8540d0b29c34a9ab6045b7276ed0ba4": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "bb112f11bd27452fadf6e17f6f5f5022": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_f41affd88fb944ed8ac6cc632d2d7edc", - "max": 363, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_a423c1bf1c4f47dcb3ca275f27f4b23b", - "value": 363 - } - }, - "bb412ac42ad64c30934b9594bfad392b": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_cac61cb1e1d7406e9dc97aa868525dfe", - "IPY_MODEL_b38d78293de747829603e05788a7482c", - "IPY_MODEL_538567e28b9942808b2d7197c156fdb1" - ], - "layout": "IPY_MODEL_9a02b190859e49efaeb6d51605eee3b2" - } - }, - "bdf14d4a82c2425d8ec0c9a2acc63055": { - "model_module": "@jupyter-widgets/base", - 
"model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": "20px" - } - }, - "bffaec4d3d254433b55a82dec359bf2c": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": 
null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "c0765fdc3eb143d6880ff4e00141ce14": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "c0a319544cd34d56bfd4006624a97e4d": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - 
"overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "c2536ef6eb4341e3b54cf473ae498ca9": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_92a456546b7b493d8321c24a62a6b551", - "IPY_MODEL_ea161cf843184232bb844545176cffb0", - "IPY_MODEL_9a80ee6fd91a401dac1652acd15d44f5" - ], - "layout": "IPY_MODEL_3bd2d7fb9da04416b2bcca81d4d15bd5" - } - }, - "c2c50ced649b428097bffe04f97a137f": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_38b4097af230467a81ee65454f907eb5", - "IPY_MODEL_0b6e7094f94f4cb481e772bd5443a025", - "IPY_MODEL_1d4b2d25edcd4207bce2972e824e815d" - ], - "layout": "IPY_MODEL_60574c59d6da4bfa92b4b93884d2b5d8" - } - }, - "c401a44341474936876919f32aa32ca4": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "c4b1153f8e0e4f3aa43d38bd0fce633a": { - 
"model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "c527b5e31da3403fb57d9fd4b2b9b191": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_5dd09f0c273647bd89678aa2fe32f6df", - "placeholder": "​", - "style": "IPY_MODEL_e4ea1f9ea6a343ef995c2eface0efd13", - "value": " 349/349 [00:00<00:00, 31.2kB/s]" - } - }, - "c564890b26554bd89876286934cc622e": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": 
"1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "c6b666ea73f74788af455ec3eb739918": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "c8a3b8286e3b4027b67758fdf25c2e35": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - 
"_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "c9d5bcb10e674784a570b625a59f12cf": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "cac61cb1e1d7406e9dc97aa868525dfe": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_428e6cbcb8bc40bcb1b414667bb76b2a", - "placeholder": "​", - "style": "IPY_MODEL_3ea8fd43ac404f9bb6981c2a90ff1e89", - "value": "tokenizer_config.json: " 
- } - }, - "caea2c391cab4442a9ea8361cc7f8155": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "ccec33fd75964b30a96ce2e2a1e9815f": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": "20px" - } - }, - "cd046088208841c0be5b5c78203b00b8": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": 
"1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_d5f205776c3f42ccaa9542f715b85748", - "max": 1, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_b19cf7b7c453462ea0afb26e12f404f2", - "value": 1 - } - }, - "cdd057a3ca3b4a18a9eabd1e267f4924": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "ce3396be9e2a410699c283ce39bf43d0": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_fbe77566bd3448a7b83f76591c6de21f", - "max": 200, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_a9a353632bc6456da4c6a6210cf2e7ae", - "value": 200 - } - }, - "d031a5cdcf8d4a3aa030c22f9bbb423b": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": 
"@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_7254e9e3c0a24d0eaa7908b8018e3f83", - "placeholder": "​", - "style": "IPY_MODEL_28a9db2641bf474b84eb193e3a5143be", - "value": " 438M/438M [00:02<00:00, 415MB/s]" - } - }, - "d0c8f23c7434465ab284741201956266": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "d5bc725f098447a6920aee7ff3e00a97": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - 
"align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "d5f205776c3f42ccaa9542f715b85748": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": 
null, - "right": null, - "top": null, - "visibility": null, - "width": "20px" - } - }, - "db060cbdb85a4aa480a98c3784b95029": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "db2458bb9a0a49d8b087a2b1ebd0864d": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "dd9daff9f9f94764aeb9234e72ea6249": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - 
"_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_4a8d56c5081342a1a5ea977f0e5e4903", - "placeholder": "​", - "style": "IPY_MODEL_1368e371c0e445e69601acb25f90e5a9", - "value": " 826/826 [00:00<00:00, 78.4kB/s]" - } - }, - "dfdac9cdd0c546429604490fdecb22a7": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "e07c41df4bf447acae4a58ae3b0c0d73": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": 
"LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "e2a296c76c4845fbba8070a639deddbd": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": 
null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "e2a80bf2c52d4a0c98731e3de6a24f22": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "e30cfabacedb4a17a1f3953535d1712f": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - 
"description_tooltip": null, - "layout": "IPY_MODEL_3efb64c7f88a4db080cdb2255c8469fc", - "max": 2, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_9e68a7867b4941109d73f86659f5335a", - "value": 2 - } - }, - "e34578ea4acd4623ba900470212faa90": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_9eddcbf6d4cc4f7ca5ba92e7d2dfd6db", - "IPY_MODEL_2b77339b62684a4785226247f3e5e85d", - "IPY_MODEL_29fe6407ea524db591c8f3579a4f8410" - ], - "layout": "IPY_MODEL_e07c41df4bf447acae4a58ae3b0c0d73" - } - }, - "e4ea1f9ea6a343ef995c2eface0efd13": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "e5a8f5ebb26843cf92150b2f817b2dad": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - 
"grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "e847e23b3dc64c3684d713e36a2d43e5": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_45041ac463164980bb6437335bcf7edf", - "placeholder": "​", - "style": "IPY_MODEL_4696a263556c417489660ea5c6240551", - "value": "tokenizer_config.json: 100%" - } - }, - "e88678951fc2460ea8241edba800a2c0": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "ea06f0dd061c4e3a96240433969ed793": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": 
"@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_482e5a82919f4a1ab8168b0006b2d979", - "max": 349, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_8ac89f580fb8427697cac9fad7ee1693", - "value": 349 - } - }, - "ea161cf843184232bb844545176cffb0": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_bdf14d4a82c2425d8ec0c9a2acc63055", - "max": 1, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_603d475c14724766b576206c2ec53a45", - "value": 1 - } - }, - "eb6ee46fe93a46489385ae7ee409665d": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": 
null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "ed3439b4f7614a2e895aa6b8c68f0dd7": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "efba2394d02944e5927907b4bfc3ccee": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - 
"_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_3a0263be005f445fbfcd5ad0c9519d91", - "max": 1, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_0d437a74f6644ef3a5fe13a4e9d4b9e8", - "value": 1 - } - }, - "efde71ea0b7440acbb9f45fdb5678264": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "f1387e7b89c543beb5a20cd2cc13b4cc": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - 
"_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "f40afdca622848cfb153e6dc1c3f8d64": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_c6b666ea73f74788af455ec3eb739918", - "placeholder": "​", - "style": "IPY_MODEL_25c0858689be446d8403e09f4a61dbbc", - "value": "Loading checkpoint shards: 100%" - } - }, - "f41affd88fb944ed8ac6cc632d2d7edc": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": 
null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "f5be3e18dfc24499ae25a2ddb496ae02": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_a10211fa0818443f89043174a09bee1a", - "placeholder": "​", - "style": "IPY_MODEL_c0765fdc3eb143d6880ff4e00141ce14", - "value": "vocab.txt: " - } - }, - "f5ebdd8a776542d3b7f9b476c0aa5512": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_665ef92b2901437b87fec93d13b2d4ac", - "max": 7392730108, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_b0035682525c443884fc1212f968b320", - "value": 7392730108 - } - }, - "f6821378164c4c2eb9458d1458ca58e1": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - 
"f6a54fc4dc2847659f547739c35a21b0": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_73e01096faf44ad9b74735371430b492", - "IPY_MODEL_5acbc6e8cb7c4bb486e3db5b55aff134", - "IPY_MODEL_0cc723ccf64a4b03bc743c1a564894b0" - ], - "layout": "IPY_MODEL_6fde41413a1a4839973593da2f78ced3" - } - }, - "f73b129ccce84d9f8b09c85ea7e13a5d": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_599727a146204539892e35143c16557b", - "placeholder": "​", - "style": "IPY_MODEL_18b23622d00442ca9e977b587b0c7399", - "value": "config.json: 100%" - } - }, - "f8a4062f155d4750a30bfbdc5292ddea": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_f40afdca622848cfb153e6dc1c3f8d64", - "IPY_MODEL_6d4efff530d54625a97583d6bed2520c", - "IPY_MODEL_22ef9732885147e3983c606a466e0615" - ], - "layout": 
"IPY_MODEL_8a46241fb1a34bd1ac48677447aa2942" - } - }, - "f8d10d68c78048e5ab8fc3caf3df6643": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_ccec33fd75964b30a96ce2e2a1e9815f", - "max": 1, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_07cd5bdb960f4508b89a8367222ac494", - "value": 1 - } - }, - "fa92d691af6946568131ab68afffbf00": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "fac8b597f40c43a29f69ce1e6f3a77f9": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - 
"grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "fad638c32e4a407b97e6b464c8b2070b": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_5dbf531a776c48728899c0c00ab9b8f1", - "placeholder": "​", - "style": "IPY_MODEL_c401a44341474936876919f32aa32ca4", - "value": " 181/181 [00:00<00:00, 22.2kB/s]" - } - }, - "fb0f1fb808504ea294a729f05b1abcc1": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "fbe77566bd3448a7b83f76591c6de21f": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": 
"@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "feeb9cc484ee467b851fb576af521adf": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - } - } - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/README.md b/README.md new file mode 100644 index 0000000..a5c8973 --- /dev/null +++ b/README.md @@ -0,0 +1,260 @@ +
+ +# 🧬 Knowledge Graph Question Answering + +### GraphRAG vs PlainRAG on PubMedQA — a fair, leakage-free, statistically-tested ablation + +[![CI](https://github.com/vardhjain/Knowledge_Graph_Question_Answering/actions/workflows/ci.yml/badge.svg)](https://github.com/vardhjain/Knowledge_Graph_Question_Answering/actions/workflows/ci.yml) +[![codecov](https://codecov.io/gh/vardhjain/Knowledge_Graph_Question_Answering/graph/badge.svg)](https://codecov.io/gh/vardhjain/Knowledge_Graph_Question_Answering) +[![Python 3.10+](https://img.shields.io/badge/python-3.10%2B-blue.svg)](https://www.python.org/) +[![License: MIT](https://img.shields.io/badge/License-MIT-green.svg)](LICENSE) +[![Lint: ruff](https://img.shields.io/badge/lint-ruff-261230.svg)](https://github.com/astral-sh/ruff) +[![Live Demo](https://img.shields.io/badge/Streamlit-Live%20Demo-FF4B4B?logo=streamlit&logoColor=white)](https://vardhjain-knowledge-graph-question-answerin-appdashboard-hkwi57.streamlit.app) +[![Docs](https://img.shields.io/badge/docs-online-1f6feb)](https://vardhjain.github.io/Knowledge_Graph_Question_Answering/) + +[**▶ Live demo**](https://vardhjain-knowledge-graph-question-answerin-appdashboard-hkwi57.streamlit.app)  ·  [**Results**](#results)  ·  [**Why it's fair**](#why-the-original-comparison-was-unfair-and-what-changed)  ·  [**Setup**](#setup) + +
+ +A controlled study of **what a knowledge graph actually contributes** to +retrieval-augmented question answering on biomedical literature +([PubMedQA](https://pubmedqa.github.io/)). + +Most "GraphRAG beats RAG" demos are confounded: the graph pipeline quietly also +gets a reranker, a different corpus, or — worst of all — leaks the answer into +the prompt. This repo throws those out and runs a **4-arm ablation** where every +layer is held constant and the *only* thing that changes is how much graph +structure the retriever uses. + +``` +plain ─► plain_rr ─► graph ─► graph_concepts + (RAG) (+rerank) (+parent (+MeSH concept + expansion) hop) +``` + +Same corpus, same chunking, same embedder, same reranker, same prompt, same LLM, +same seeded sample, same top-k. The accuracy delta between adjacent arms is +attributable to exactly one component, and we report a **paired McNemar test** so +you can tell a real effect from noise. + +![Architecture and 4-arm ablation](assets/architecture.svg) + +--- + +## Why the original comparison was unfair (and what changed) + +This started from a working but confounded notebook comparison. The audit found +six issues; all are fixed in this revamp: + +| # | Flaw (before) | Fix (now) | +| --- | --- | --- | +| 1 | GraphRAG had a cross-encoder reranker; PlainRAG was raw FAISS top-3 | The reranker is its **own arm** (`plain_rr`). 
The graph arms build *on top of* `plain_rr`, so the rerank is controlled for, not a hidden advantage | +| 2 | The two pipelines indexed **different corpora** | All arms search one shared `ChunkStore` (labeled + unlabeled, identical chunks) | +| 3 | Different granularity (whole abstracts vs per-section chunks) | Identical per-section chunking for every arm | +| 4 | **Label leakage**: papers stored `title = question` and `final_decision`, injected into the prompt as `=== STUDY: {title} ===` | Ingestion stores **no** question-derived title and **no** `final_decision`; graph context uses generic `=== STUDY n ===` labels with abstracts only. A unit test asserts the question never appears in the context | +| 5 | `Concepts` (MeSH) and `MENTIONS` edges were built but **never used** | The `graph_concepts` arm hops across shared MeSH concepts to pull in related papers | +| 6 | `NameError` in the graph fallback; first-100 samples, no seed, no significance test | Fixed fallback; seeded random sample (default n=200); paired McNemar test | + +**What we expected vs. what we found.** Going in, we expected concept-hop +expansion to be where the graph shines and a plain parent-expansion gain to be +modest. The data said the opposite: the decisive, statistically significant win +came from **parent-document expansion**, while concept-hop did not help on this +single-abstract dataset. We report that honestly rather than bury it — see +[Results](#results). 
+ +--- + +## 🗂️ Repository layout + +``` +src/kgqa/ importable package — single source of truth + config.py all shared constants (models, top-k, seed, n) + prompts.py benchmark/chat prompts (identical across arms) + llm.py Ollama client + data.py seeded sampling + canonical chunking + evaluation.py answer extraction, metrics, McNemar test + models.py encoder / reranker / ArangoDB loaders + retrieval/ + base.py ChunkStore + BaseRetriever (encode→rerank→select) + plain.py plain, plain_rr arms + graph.py graph, graph_concepts arms +scripts/ + ingest.py build the leakage-free graph in ArangoDB (run once) + run_benchmark.py run one arm: --arm {plain,plain_rr,graph,graph_concepts} + compare.py summary table + McNemar + ablation figure +notebooks/ + 01_ingest.ipynb thin Colab wrapper for ingestion + 02_benchmark.ipynb thin Colab wrapper for all arms + comparison +tests/ pytest suite (runs on CPU, no Ollama/ArangoDB needed) +docs/ project report (PDF) and slides (PPTX) +``` + +## 🧰 Stack + +- **Dataset:** PubMedQA (`pqa_labeled` for evaluation, `pqa_unlabeled` for corpus) +- **Embeddings:** `all-MiniLM-L6-v2` (384-dim) +- **Reranker:** `cross-encoder/ms-marco-MiniLM-L-6-v2` +- **Graph DB:** ArangoDB — any instance (local Docker or [ArangoDB Oasis](https://cloud.arangodb.com)); schema: Papers / Chunks / Concepts; HAS_CONTEXT / MENTIONS +- **LLM:** `deepseek-r1:8b` via [Ollama](https://ollama.com) + +--- + +## Setup + +```bash +pip install -r requirements.txt # add -r requirements-dev.txt for tests +cp .env.example .env # then set ARANGO_PASS (and ARANGO_HOST if remote) +``` + +All connection settings are read from the environment (or a local `.env`, or +Colab Secrets) — `ARANGO_HOST`, `ARANGO_USER`, `ARANGO_PASS`, `ARANGO_DB`. +**Nothing is hardcoded**; the default host is `http://localhost:8529`. + +You need two services: an **ArangoDB** instance and a running **Ollama**. 
+ +```bash +# ArangoDB — option A: local, via the bundled compose file +docker compose up -d # ArangoDB at localhost:8529 (root / devpassword) +export ARANGO_PASS=devpassword # PowerShell: $env:ARANGO_PASS="devpassword" + +# ArangoDB — option B: a cloud deployment (e.g. ArangoDB Oasis free tier) +# export ARANGO_HOST=https://<your-deployment>.arangodb.cloud:8529 +# export ARANGO_PASS=<your-password> + +# Ollama (LLM) +ollama serve & ollama pull deepseek-r1:8b +``` + +## ⚙️ Running the benchmark + +```bash +python scripts/ingest.py # build the graph once +make benchmark # all four arms (n=200) +# or run arms individually: +# python scripts/run_benchmark.py --arm plain --n 200 (plain_rr / graph / graph_concepts) +python scripts/compare.py # table + McNemar + figure -> results/ +``` + +The benchmark is LLM-bound and benefits from a GPU. If you don't have one, +**Google Colab** works well: run [`notebooks/01_ingest.ipynb`](notebooks/01_ingest.ipynb) +once, then [`notebooks/02_benchmark.ipynb`](notebooks/02_benchmark.ipynb) (set +`ARANGO_HOST` / `ARANGO_PASS` in Colab Secrets). + +--- + +## ▶ Live demo + +**[▶ Open the results dashboard](https://vardhjain-knowledge-graph-question-answerin-appdashboard-hkwi57.streamlit.app)** — an +interactive Streamlit dashboard of the 4-arm ablation: headline accuracy, the +paired McNemar significance tests, latency, and (when raw results are present) +per-class confusion matrices. No setup, no login — it reads the committed +`results/` artifacts, so it needs no LLM, database, or GPU. + +[![Results dashboard](assets/dashboard.png)](https://vardhjain-knowledge-graph-question-answerin-appdashboard-hkwi57.streamlit.app) + +Run the dashboard locally: + +```bash +pip install -r app/requirements.txt +make dashboard # or: streamlit run app/dashboard.py +``` + +**Chat demo** — a Gradio assistant that answers from the graph and cites PubMed +IDs (the winning `graph` arm). 
It's a *live* pipeline that needs a reachable +ArangoDB + Ollama, so run it yourself (best on a GPU Colab): + +```bash +pip install -r requirements-app.txt +python app/chat_app.py --share # public Gradio link +``` + +![GraphRAG chat interface](assets/chat.png) + +A hosted always-on chat isn't provided on purpose — it would need a paid GPU and +a persistent ArangoDB. See [app/README.md](app/README.md) for details. + +--- + +## Results + +Seeded random sample of **n = 200** PubMedQA `pqa_labeled` questions (seed 42, +identical across arms), `deepseek-r1:8b` via Ollama on an A100. Regenerate with +`scripts/compare.py` (writes `results/summary.md` and `results/ablation.png`). + +| Arm | Accuracy | Macro F1 | Avg latency | Adds | +| --- | --- | --- | --- | --- | +| `plain` | 30.0% | 29.7% | 6.4 s | baseline chunk RAG | +| `plain_rr` | 37.0% | 35.2% | 6.6 s | + cross-encoder reranker | +| **`graph`** | **59.5%** | **50.5%** | 7.5 s | + parent-paper expansion | +| `graph_concepts` | 57.5% | 50.0% | 40.8 s | + MeSH concept hop | + +**Paired McNemar tests** — each contrast isolates one component on the same 200 questions: + +| Contrast | Δ accuracy | gains / losses | p | significant? | +| --- | --- | --- | --- | --- | +| `plain → plain_rr` (reranker) | +7.0 pp | 35 / 21 | 0.081 | no | +| `plain_rr → graph` (parent expansion) | **+22.5 pp** | 71 / 26 | **<0.0001** | **yes** | +| `graph → graph_concepts` (concept hop) | −2.0 pp | 26 / 30 | 0.69 | no | + +![4-arm ablation on PubMedQA](results/ablation.png) + +### What the ablation shows + +1. **The graph's decisive win is parent-document expansion** (+22.5 pp, + p < 0.0001). Retrieving at the fine-grained chunk level but feeding the LLM the + *full reconstructed abstract* (chunk → paper → all sections, via `HAS_CONTEXT`) + is what moves the needle — for only ~1 s over `plain_rr`. With the label + leakage fixed, this is a clean, legitimate graph advantage. +2. 
**Single-fragment retrieval is not enough for PubMedQA.** `plain` and + `plain_rr` land *below* the majority-class baseline (PubMedQA is ≈55% "yes"); a + lone ~250-character section rarely contains enough to judge the question. + Context sufficiency — which the graph supplies — is the dominant factor, and + `graph` is the only arm that clears the trivial baseline. +3. **The reranker helps modestly but not significantly** at this sample size + (+7 pp, p = 0.08). +4. **Concept-hop expansion does not help here** (−2 pp, p = 0.69) and costs ~5× + the latency. An honest — and expected — negative result: on single-abstract QA, + papers pulled in via shared MeSH terms act mostly as distractors. The graph + helps by *deepening* context (the full document), not by *broadening* it + (related documents). + +The macro-F1 / accuracy gap on the graph arms reflects weak recall on the rare +`maybe` class (~11% of the data) — a dataset property, not a retrieval one. + +--- + +## 🧪 Development + +```bash +make install-dev # deps for tests + lint +make test # pytest — 17 tests, all CPU, no external services +make lint # ruff +make help # all shortcuts (ingest, benchmark, compare, ...) +``` + +CI runs ruff + pytest on every push/PR (Python 3.10 and 3.11). Unit tests inject +fakes for the encoder, reranker, and ArangoDB, so the heavy ML dependencies are +never needed just to verify the logic. Optionally `pre-commit install` to run +ruff automatically on each commit. 
+ +## 📖 Documentation + +- **[Project site](https://vardhjain.github.io/Knowledge_Graph_Question_Answering/)** — the story and results at a glance (GitHub Pages) +- **[Project report (PDF)](docs/Project_Report.pdf)** and **[slides](docs/Graph_RAG_PPT.pptx)** +- **[Architecture diagram](assets/architecture.svg)** · **[CHANGELOG](CHANGELOG.md)** · **[CONTRIBUTING](CONTRIBUTING.md)** + +## 🤝 Contributing + +Contributions are welcome — see [CONTRIBUTING.md](CONTRIBUTING.md) for setup, the +project layout, and the fairness ground rules. Changes are tracked in +[CHANGELOG.md](CHANGELOG.md); please be kind and follow the +[Code of Conduct](CODE_OF_CONDUCT.md). + +## 📚 Citing + +If this project or its findings are useful in your work, please cite it — see +[CITATION.cff](CITATION.cff) (GitHub renders a "Cite this repository" button). + +## 📄 License + +[MIT](LICENSE). diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000..714ea7d --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,23 @@ +# Security Policy + +This is a research project, but a few things are worth handling carefully. + +## Reporting a vulnerability + +If you find a security issue (for example, an accidental credential commit or a +dependency vulnerability), please **do not open a public issue**. Instead, use +GitHub's [private vulnerability reporting](https://github.com/vardhjain/Knowledge_Graph_Question_Answering/security/advisories/new) +or email the maintainer. You can expect an acknowledgement within a few days. + +## Secrets + +- Never commit real credentials. ArangoDB and LLM settings are read from the + environment (or a local `.env`, which is git-ignored). Use `.env.example` as a + template, and Colab **Secrets** for notebook runs. +- If a secret is ever committed, rotate it immediately — removing it from the + latest commit is not enough, as it remains in git history. + +## Supported versions + +The latest release on `main` is supported. 
This project pins minimum dependency +versions in `requirements.txt`; run `pip list --outdated` periodically. diff --git a/app/README.md b/app/README.md new file mode 100644 index 0000000..a70c7d7 --- /dev/null +++ b/app/README.md @@ -0,0 +1,52 @@ +# Apps + +Two optional front-ends. Install their deps with `pip install -r requirements-app.txt`. + +## `chat_app.py` — live GraphRAG chat (Gradio) + +An interactive assistant over the winning `graph` arm: it retrieves from the +knowledge graph, answers with `deepseek-r1:8b`, and cites the source PubMed IDs. + +```bash +python app/chat_app.py # http://localhost:7860 +python app/chat_app.py --share # public share link (handy on Colab) +python app/chat_app.py --concepts # use the graph_concepts arm +``` + +This is a **live** demo, so it needs the backend running: a reachable ArangoDB +(`ARANGO_HOST` / `ARANGO_PASS`) and Ollama with `deepseek-r1:8b` pulled. To host +it on **Hugging Face Spaces**, set the Space SDK to Gradio and `app_file: +app/chat_app.py`, and point `ARANGO_HOST`/`ARANGO_PASS` at a hosted database via +Space secrets. + +## `dashboard.py` — results dashboard (Streamlit) + +Visualizes the saved benchmark: per-arm accuracy/F1, the paired McNemar tests, +the ablation figure, and (if the per-sample `results/*_results.json` are present) +confusion matrices and per-class F1. No LLM or database required — it only reads +`results/`, so it's light and deploys anywhere. + +```bash +pip install -r app/requirements.txt # light: streamlit + pandas + scikit-learn +streamlit run app/dashboard.py +``` + +### Deploy to Streamlit Community Cloud (free, always-on) + +The dashboard is the project's hosted demo. `app/requirements.txt` sits next to +the entrypoint so Streamlit Cloud installs only the light deps (it searches the +entrypoint's directory before the heavy root `requirements.txt`). + +1. Push these to `main`: `app/dashboard.py`, `app/requirements.txt`, + `.streamlit/config.toml`, and the `results/` artifacts. +2. 
Go to <https://share.streamlit.io>, sign in with GitHub, authorize the repo. +3. **Create app → Deploy a public app from GitHub.** +4. Repository `vardhjain/Knowledge_Graph_Question_Answering`, Branch `main`, + **Main file path `app/dashboard.py`**. +5. (Optional) Advanced settings → Python 3.11. Set a custom subdomain (e.g. + `kgqa-ablation`) for a clean URL, or accept the auto-generated one. +6. **Deploy.** Copy the final `*.streamlit.app` URL and point the badge + + "Live demo" link in the root README at it. + +> Tip: commit the per-sample `results/{arm}_results.json` files too (if you still +> have them from the benchmark run) to light up the confusion-matrix section. diff --git a/app/chat_app.py b/app/chat_app.py new file mode 100644 index 0000000..5c4f235 --- /dev/null +++ b/app/chat_app.py @@ -0,0 +1,83 @@ +"""Gradio chat demo over the GraphRAG (`graph`) arm — the ablation's winner. + + python app/chat_app.py # local: http://localhost:7860 + python app/chat_app.py --share # public share link (Colab / remote) + python app/chat_app.py --concepts # use the graph_concepts arm instead + +Requirements: `pip install gradio` (see requirements-app.txt), a reachable +ArangoDB (set ARANGO_HOST / ARANGO_PASS), and a running Ollama with the model +pulled. This is a *live* demo — it retrieves from the graph and calls the LLM. +""" + +from __future__ import annotations + +import argparse +import os +import re +import sys + +ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +sys.path.insert(0, os.path.join(ROOT, "src")) + +EXAMPLES = [ + "Do preoperative statins reduce postoperative atrial fibrillation?", + "Is vitamin D deficiency associated with increased mortality?", + "Does laparoscopic surgery reduce hospital stay versus open surgery?", +] + + +def _strip_think(text: str) -> str: + """Drop the reasoning model's <think>...</think> 
block for a clean answer.""" + return re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip() + + +def build_retriever(use_concepts: bool): + from kgqa.config import ArangoConfig + from kgqa.models import connect_arango, load_encoder, load_reranker + from kgqa.retrieval import ChunkStore, GraphRetriever + + db = connect_arango(ArangoConfig()) + cache = os.path.join(ROOT, "pubmed_vectors_cache.pkl") + store = ChunkStore.from_arango(db, cache_file=cache) + print(f"[demo] {len(store):,} chunks loaded") + return GraphRetriever(store, load_encoder(), db, + reranker=load_reranker(), use_concepts=use_concepts) + + +def main(): + parser = argparse.ArgumentParser(description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument("--share", action="store_true", help="create a public share link") + parser.add_argument("--concepts", action="store_true", help="use the graph_concepts arm") + parser.add_argument("--port", type=int, default=7860) + args = parser.parse_args() + + import gradio as gr + + rag = build_retriever(args.concepts) + + def respond(message, history): + result = rag.chat(message) + answer = _strip_think(result["answer"]) or "_No answer produced._" + sources = result.get("sources", []) + if sources: + links = "\n".join( + f"- [PMID {pid}](https://pubmed.ncbi.nlm.nih.gov/{pid}/)" for pid in sources + ) + answer += f"\n\n**Sources**\n{links}" + return answer + + gr.ChatInterface( + fn=respond, + title="PubMed GraphRAG assistant", + description=( + "Graph-augmented retrieval over PubMedQA: matched chunks are expanded " + "to full abstracts via the knowledge graph, then answered by " + "deepseek-r1:8b. Answers cite the source PubMed IDs." 
+ ), + examples=EXAMPLES, + ).launch(share=args.share, server_port=args.port) + + +if __name__ == "__main__": + main() diff --git a/app/dashboard.py b/app/dashboard.py new file mode 100644 index 0000000..07b5f65 --- /dev/null +++ b/app/dashboard.py @@ -0,0 +1,147 @@ +"""Streamlit dashboard for the GraphRAG vs PlainRAG ablation results. + + pip install streamlit # see requirements-app.txt + streamlit run app/dashboard.py + +Reads results/summary.json (always) for the headline metrics and significance +tests, and results/{arm}_results.json (if present) for confusion matrices and +per-class F1. No LLM or database needed — it just visualizes the saved results, +so it deploys cleanly to Streamlit Cloud. +""" + +from __future__ import annotations + +import json +import os +import sys + +import pandas as pd +import streamlit as st + +ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +sys.path.insert(0, os.path.join(ROOT, "src")) +RESULTS_DIR = os.path.join(ROOT, "results") +ARM_ORDER = ["plain", "plain_rr", "graph", "graph_concepts"] +LABELS = ["yes", "no", "maybe"] + + +@st.cache_data +def load_summary(): + with open(os.path.join(RESULTS_DIR, "summary.json")) as f: + return json.load(f) + + +@st.cache_data +def load_raw(): + raw = {} + for arm in ARM_ORDER: + path = os.path.join(RESULTS_DIR, f"{arm}_results.json") + if os.path.exists(path): + with open(path) as f: + raw[arm] = json.load(f) + return raw + + +def main(): + repo = "https://github.com/vardhjain/Knowledge_Graph_Question_Answering" + st.set_page_config( + page_title="GraphRAG vs PlainRAG — PubMedQA Ablation", + page_icon="🧬", + layout="wide", + initial_sidebar_state="collapsed", + menu_items={ + "Get Help": repo, + "Report a bug": f"{repo}/issues", + "About": ( + "### GraphRAG vs PlainRAG — a fair 4-arm ablation on PubMedQA\n" + "Every layer held constant; only the retrieval strategy changes.\n\n" + f"Source: [{repo}]({repo})" + ), + }, + ) + st.title("GraphRAG vs PlainRAG — a fair 4-arm 
ablation on PubMedQA") + + try: + summary = load_summary() + except FileNotFoundError: + st.error("results/summary.json not found. Run `python scripts/compare.py` first.") + st.stop() + + st.caption( + f"n = {summary['n']} questions · seed {summary['seed']} · " + f"{summary['model']} · {summary['dataset']}. " + "Every layer held constant; only the retrieval strategy changes." + ) + + arms = summary["arms"] + best = max(arms, key=lambda a: a["accuracy"]) + + # ── headline metrics ────────────────────────────────────────────────────── + cols = st.columns(len(arms)) + for col, arm in zip(cols, arms, strict=False): + delta = f"{arm['accuracy'] - arms[0]['accuracy']:+.1f} pp vs plain" \ + if arm["arm"] != "plain" else None + col.metric(arm["arm"], f"{arm['accuracy']:.1f}%", delta) + + st.success( + f"**Winner: `{best['arm']}` at {best['accuracy']:.1f}%.** The decisive, " + "statistically significant gain comes from parent-document expansion " + "(`plain_rr → graph`: +22.5 pp, McNemar p < 0.0001). The reranker helps " + "but isn't significant; the concept hop doesn't help and costs ~5× latency." + ) + + with st.expander("How this is measured (fairness)"): + st.markdown( + "All four arms share the same corpus, chunking, embedder, reranker, " + "prompt, LLM, seed, and top-k — **only the retrieval strategy changes**, " + "so each adjacent contrast isolates one component. Significance is a " + "paired **McNemar** test on the same questions. The graph context is " + "leakage-free: no question-derived titles or gold labels ever reach the " + "prompt." 
+ ) + + left, right = st.columns([3, 2]) + + with left: + st.subheader("Accuracy & macro-F1 by arm") + df = pd.DataFrame(arms).set_index("arm") + st.bar_chart(df[["accuracy", "macro_f1"]], stack=False, color=["#2196F3", "#FF9800"]) + st.dataframe( + df[["adds", "accuracy", "macro_f1", "avg_latency", "samples"]], + use_container_width=True, + ) + + with right: + st.subheader("Significance (paired McNemar)") + cdf = pd.DataFrame(summary["contrasts"]) + cdf["contrast"] = cdf["from"] + " → " + cdf["to"] + " (" + cdf["effect"] + ")" + cdf["significant"] = cdf["significant"].map({True: "yes", False: "no"}) + st.dataframe( + cdf[["contrast", "delta_acc", "gains", "losses", "p_value", "significant"]], + use_container_width=True, hide_index=True, + ) + st.caption("Latency by arm (seconds / query)") + st.bar_chart(df["avg_latency"], color="#26A69A", horizontal=True) + + # ── optional: per-class detail from raw per-sample results ──────────────── + raw = load_raw() + if raw: + st.subheader("Per-class detail") + from sklearn.metrics import confusion_matrix, f1_score + tabs = st.tabs([a for a in ARM_ORDER if a in raw]) + for tab, arm in zip(tabs, [a for a in ARM_ORDER if a in raw], strict=False): + with tab: + r = raw[arm] + cm = confusion_matrix(r["y_true"], r["y_pred"], labels=LABELS) + st.write("Confusion matrix (rows = actual, cols = predicted)") + st.dataframe(pd.DataFrame(cm, index=LABELS, columns=LABELS)) + f1s = f1_score(r["y_true"], r["y_pred"], labels=LABELS, + average=None, zero_division=0) + st.write("Per-class F1") + st.bar_chart(pd.Series(f1s, index=LABELS)) + + st.caption(f"Source: {repo}") + + +if __name__ == "__main__": + main() diff --git a/app/requirements.txt b/app/requirements.txt new file mode 100644 index 0000000..40d73da --- /dev/null +++ b/app/requirements.txt @@ -0,0 +1,14 @@ +# Streamlit Community Cloud deploy dependencies for app/dashboard.py ONLY. 
+# +# This file lives next to the entrypoint on purpose: Community Cloud searches the +# entrypoint's directory FIRST, so this light file is used and the heavy root +# requirements.txt (torch, sentence-transformers, datasets, python-arango) is +# never installed for the hosted dashboard. Keep the deploy's "Main file path" +# set to app/dashboard.py. +# +# Also handy locally — `pip install -r app/requirements.txt` runs just the +# dashboard. It needs streamlit + pandas (+ scikit-learn, used lazily for the +# per-class confusion matrices when results/{arm}_results.json files are present). +streamlit>=1.39 +pandas>=2.0 +scikit-learn>=1.3 diff --git a/assets/architecture.svg b/assets/architecture.svg new file mode 100644 index 0000000..6424cd3 --- /dev/null +++ b/assets/architecture.svg @@ -0,0 +1,81 @@ + + + + + + + + + + Knowledge Graph QA — fair 4-arm ablation + Every layer is held constant; only the retrieval strategy changes. + + + + + PubMedQA question + + + Encode · all-MiniLM-L6-v2 + + + Vector search · ChunkStore + 206,613 chunks · ArangoDB + + + Cross-encoder rerank + rerank arms only + + + Context assembly + ← the only thing that differs + + + LLM · deepseek-r1:8b (Ollama) + + + Extract yes/no/maybe → McNemar + + + + + + + + + + + + + + + + + + The four arms (accuracy, n=200) + + + + + + plain + raw top-k chunks + 30.0% + + plain_rr + + cross-encoder reranker + 37.0% + + graph ★ + + parent abstracts (HAS_CONTEXT) + 59.5% + + graph_concepts + + concept hop (MENTIONS) + 57.5% + + + + Parent-document expansion: +22.5 pp over plain_rr + paired McNemar p < 0.0001 · concept hop did not help (−2 pp, n.s.) 
+ diff --git a/assets/chat.png b/assets/chat.png new file mode 100644 index 0000000..daef3cc Binary files /dev/null and b/assets/chat.png differ diff --git a/assets/dashboard.png b/assets/dashboard.png new file mode 100644 index 0000000..2dea607 Binary files /dev/null and b/assets/dashboard.png differ diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..2cf966a --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,22 @@ +# Local ArangoDB for development and running the benchmark without a cloud account. +# +# docker compose up -d +# export ARANGO_PASS=devpassword # PowerShell: $env:ARANGO_PASS="devpassword" +# python scripts/ingest.py # then run_benchmark.py / compare.py +# +# Web UI: http://localhost:8529 (user: root, password: devpassword) +# Change the password below (and ARANGO_PASS) before exposing this anywhere. + +services: + arangodb: + image: arangodb:3.11 + container_name: kgqa-arangodb + environment: + ARANGO_ROOT_PASSWORD: devpassword + ports: + - "8529:8529" + volumes: + - arango_data:/var/lib/arangodb3 + +volumes: + arango_data: diff --git a/Graph_RAG_PPT.pptx b/docs/Graph_RAG_PPT.pptx similarity index 100% rename from Graph_RAG_PPT.pptx rename to docs/Graph_RAG_PPT.pptx diff --git a/Project Report.pdf b/docs/Project_Report.pdf similarity index 100% rename from Project Report.pdf rename to docs/Project_Report.pdf diff --git a/docs/_config.yml b/docs/_config.yml new file mode 100644 index 0000000..2faa444 --- /dev/null +++ b/docs/_config.yml @@ -0,0 +1,10 @@ +# GitHub Pages site (Settings → Pages → Source: Deploy from a branch → main → /docs) +title: Knowledge Graph Question Answering +description: GraphRAG vs PlainRAG on PubMedQA — a fair, leakage-free, statistically-tested ablation +theme: jekyll-theme-cayman +show_downloads: false + +# Keep the repo's data/binaries out of the built site. 
+exclude: + - "*.pdf" + - "*.pptx" diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 0000000..d01bc0a --- /dev/null +++ b/docs/index.md @@ -0,0 +1,39 @@ +--- +--- + +[**▶ Live demo**](https://vardhjain-knowledge-graph-question-answerin-appdashboard-hkwi57.streamlit.app)  ·  [**GitHub repo**](https://github.com/vardhjain/Knowledge_Graph_Question_Answering)  ·  [**Project report (PDF)**](https://github.com/vardhjain/Knowledge_Graph_Question_Answering/blob/main/docs/Project_Report.pdf)  ·  [**Slides**](https://github.com/vardhjain/Knowledge_Graph_Question_Answering/blob/main/docs/Graph_RAG_PPT.pptx) + +## What this is + +Most "GraphRAG beats RAG" demos are confounded — the graph pipeline quietly also +gets a reranker, a different corpus, or even leaks the answer into the prompt. +This project runs a **4-arm ablation** on [PubMedQA](https://pubmedqa.github.io/) +where every layer (corpus, chunking, embedder, reranker, prompt, LLM, top-k, seed) +is held constant, so the accuracy change between adjacent arms is attributable to +exactly one component — verified with a paired **McNemar** test. + +![Architecture and 4-arm ablation](https://raw.githubusercontent.com/vardhjain/Knowledge_Graph_Question_Answering/main/assets/architecture.svg) + +## Results (n = 200, seed 42) + +| Arm | Accuracy | Macro F1 | Adds | +| --- | --- | --- | --- | +| `plain` | 30.0% | 29.7% | baseline chunk RAG | +| `plain_rr` | 37.0% | 35.2% | + cross-encoder reranker | +| **`graph`** | **59.5%** | **50.5%** | + parent-paper expansion | +| `graph_concepts` | 57.5% | 50.0% | + MeSH concept hop | + +![4-arm ablation](https://raw.githubusercontent.com/vardhjain/Knowledge_Graph_Question_Answering/main/results/ablation.png) + +**The honest finding:** the graph's decisive, statistically significant win comes +from **parent-document expansion** (`plain_rr → graph`: **+22.5 pp**, McNemar +**p < 0.0001**). 
The reranker helps but isn't significant (+7 pp, p = 0.08), and +MeSH concept-hop expansion does **not** help on this single-abstract dataset +(−2 pp, p = 0.69) while costing ~5× the latency. The graph helps by *deepening* +context, not by *broadening* it. + +## Explore + +- **[Live results dashboard](https://vardhjain-knowledge-graph-question-answerin-appdashboard-hkwi57.streamlit.app)** — interactive bars, significance tests, per-class breakdown +- **[Source code & README](https://github.com/vardhjain/Knowledge_Graph_Question_Answering)** — package, scripts, tests, CI +- **[Project report (PDF)](https://github.com/vardhjain/Knowledge_Graph_Question_Answering/blob/main/docs/Project_Report.pdf)** and **[slides](https://github.com/vardhjain/Knowledge_Graph_Question_Answering/blob/main/docs/Graph_RAG_PPT.pptx)** diff --git a/notebooks/01_ingest.ipynb b/notebooks/01_ingest.ipynb new file mode 100644 index 0000000..8bd0838 --- /dev/null +++ b/notebooks/01_ingest.ipynb @@ -0,0 +1,89 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 01 — Ingest PubMedQA into ArangoDB (run once)\n", + "\n", + "Thin Colab wrapper around `scripts/ingest.py`. Builds the **leakage-free** knowledge graph\n", + "(Papers / Chunks / Concepts + HAS_CONTEXT / MENTIONS edges).\n", + "\n", + "**Before running**, add these to the Colab **Secrets** panel (key icon, left sidebar):\n", + "- `ARANGO_PASS` — your ArangoDB password (required)\n", + "- `ARANGO_HOST` — your endpoint, e.g. `https://<your-deployment>.arangodb.cloud:8529`\n", + " (ArangoDB Oasis offers a free tier; or run any reachable ArangoDB)\n", + "\n", + "A **GPU** runtime (+ High-RAM) speeds the embedding pass. Run this notebook **once**,\n", + "then use `02_benchmark.ipynb`. Ingesting labeled + unlabeled (~62k papers) is mostly\n", + "network-bound on the inserts." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Reset-safe clone: always starts from /content and removes any prior copy,\n", + "# so re-running this cell can never nest a second checkout.\n", + "%cd /content\n", + "!rm -rf Knowledge_Graph_Question_Answering\n", + "!git clone -b main https://github.com/vardhjain/Knowledge_Graph_Question_Answering.git -q\n", + "%cd Knowledge_Graph_Question_Answering\n", + "!pip install -q -r requirements.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from google.colab import userdata\n", + "\n", + "# Pull connection settings from Colab Secrets (nothing is hardcoded).\n", + "for key in ['ARANGO_PASS', 'ARANGO_HOST', 'ARANGO_DB']:\n", + " try:\n", + " val = userdata.get(key)\n", + " if val:\n", + " os.environ[key] = val\n", + " except Exception:\n", + " pass\n", + "\n", + "assert os.environ.get('ARANGO_PASS'), 'Add ARANGO_PASS in the Secrets panel.'\n", + "print('ARANGO_HOST:', os.environ.get('ARANGO_HOST', '(default http://localhost:8529)'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Quick smoke test first (labeled split only, ~1k papers) to confirm the\n", + "# connection + schema before the full run:\n", + "!python scripts/ingest.py --no-unlabeled" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Full ingestion (labeled + unlabeled). 
Safe to re-run: papers/chunks upsert by key.\n", + "!python scripts/ingest.py" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": {"provenance": [], "gpuType": "A100", "machine_shape": "hm"}, + "kernelspec": {"display_name": "Python 3", "name": "python3"}, + "language_info": {"name": "python"} + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notebooks/02_benchmark.ipynb b/notebooks/02_benchmark.ipynb new file mode 100644 index 0000000..4ad293a --- /dev/null +++ b/notebooks/02_benchmark.ipynb @@ -0,0 +1,141 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 02 — Benchmark: 4-arm GraphRAG vs PlainRAG ablation\n", + "\n", + "Thin Colab wrapper around `scripts/run_benchmark.py` and `scripts/compare.py`.\n", + "\n", + "**Use a GPU runtime** (a faster GPU mainly cuts wall-clock since this is\n", + "LLM-inference-bound). Add `ARANGO_PASS` and `ARANGO_HOST` in the Colab **Secrets**\n", + "panel. Run `01_ingest.ipynb` first.\n", + "\n", + "Arms (each isolates one component):\n", + "\n", + "| arm | adds |\n", + "| --- | --- |\n", + "| `plain` | vector top-k chunks (baseline) |\n", + "| `plain_rr` | + cross-encoder reranker |\n", + "| `graph` | + parent-paper expansion (full abstracts) |\n", + "| `graph_concepts` | + MeSH concept-hop expansion |\n", + "\n", + "The runner retries failed questions and auto-restarts Ollama if it crashes, and it\n", + "checkpoints every 25 questions — so a transient error can't abort an arm." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Confirm the GPU (optional).\n", + "!nvidia-smi --query-gpu=name,memory.total --format=csv" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Reset-safe clone: always starts from /content and removes any prior copy,\n", + "# so re-running this cell can never nest a second checkout.\n", + "%cd /content\n", + "!rm -rf Knowledge_Graph_Question_Answering\n", + "!git clone -b main https://github.com/vardhjain/Knowledge_Graph_Question_Answering.git -q\n", + "%cd Knowledge_Graph_Question_Answering\n", + "!pip install -q -r requirements.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Install Ollama and pull the LLM (once). The benchmark script manages the\n", + "# server from here on (health-check + auto-restart).\n", + "!which ollama || (apt-get install -y zstd -q && curl -fsSL https://ollama.com/install.sh | sh)\n", + "import subprocess, time\n", + "subprocess.Popen(['ollama', 'serve'])\n", + "time.sleep(5)\n", + "!ollama pull deepseek-r1:8b" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from google.colab import userdata\n", + "\n", + "# Connection settings from Colab Secrets (nothing hardcoded).\n", + "for key in ['ARANGO_PASS', 'ARANGO_HOST', 'ARANGO_DB']:\n", + " try:\n", + " val = userdata.get(key)\n", + " if val:\n", + " os.environ[key] = val\n", + " except Exception:\n", + " pass\n", + "assert os.environ.get('ARANGO_PASS'), 'Add ARANGO_PASS in the Secrets panel.'\n", + "\n", + "# Generation knobs (identical across arms, so the comparison is unaffected).\n", + "# Raise NUM_CTX on a large-VRAM GPU; lower it on a small one if you hit OOM.\n", + "os.environ['LLM_NUM_CTX'] = '8192'\n", + "os.environ['LLM_NUM_PREDICT'] = '1024'\n", + 
"print('ARANGO_HOST:', os.environ.get('ARANGO_HOST', '(default http://localhost:8529)'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Run all four arms. The chunk corpus is downloaded once and cached, then\n", + "# reused by every arm (identical corpus -> fair comparison). Each arm saves its\n", + "# own results JSON, so if one dies you can re-run just that arm.\n", + "for arm in ['plain', 'plain_rr', 'graph', 'graph_concepts']:\n", + " print(f'\\n===== {arm} =====')\n", + " !python scripts/run_benchmark.py --arm {arm} --n 200" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Summary table, paired McNemar tests, and the ablation figure.\n", + "!python scripts/compare.py\n", + "from IPython.display import Image, display, Markdown\n", + "display(Markdown(open('results/summary.md').read()))\n", + "display(Image('results/ablation.png'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Optional: commit results back to GitHub (set a PAT first).\n", + "# !git config user.email you@example.com && git config user.name you\n", + "# !git add results/ && git commit -m 'Add benchmark results' && git push" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": {"provenance": [], "gpuType": "A100", "machine_shape": "hm"}, + "kernelspec": {"display_name": "Python 3", "name": "python3"}, + "language_info": {"name": "python"} + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..4fed7f2 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,56 @@ +[build-system] +requires = ["setuptools>=68"] +build-backend = "setuptools.build_meta" + +[project] +name = "kgqa" +version = "1.0.0" +description = "Fair GraphRAG vs PlainRAG comparison on PubMedQA" +readme = "README.md" +requires-python = ">=3.10" +license = 
{ text = "MIT" } +authors = [{ name = "Vardh Jain", email = "vardhjain20@gmail.com" }] +keywords = ["graphrag", "rag", "knowledge-graph", "pubmedqa", "arangodb", "llm", "ablation"] +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: MIT License", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Topic :: Scientific/Engineering :: Artificial Intelligence", +] + +[project.urls] +Homepage = "https://github.com/vardhjain/Knowledge_Graph_Question_Answering" +Repository = "https://github.com/vardhjain/Knowledge_Graph_Question_Answering" +Issues = "https://github.com/vardhjain/Knowledge_Graph_Question_Answering/issues" + +[project.optional-dependencies] +dev = ["pytest>=8.0", "ruff>=0.4.0", "pre-commit>=3.5"] + +[tool.setuptools.packages.find] +where = ["src"] + +[tool.pytest.ini_options] +pythonpath = ["src", "."] +testpaths = ["tests"] +addopts = "-q" + +[tool.coverage.run] +source = ["kgqa"] + +[tool.coverage.report] +show_missing = true + +[tool.ruff] +line-length = 100 +src = ["src", "scripts", "tests", "app"] +target-version = "py310" + +[tool.ruff.lint] +select = ["E", "F", "I", "W", "UP", "B"] +ignore = ["E501"] # line length handled by formatter; long AQL strings are fine + +[tool.ruff.lint.per-file-ignores] +"scripts/*" = ["E402"] # sys.path insert before imports is intentional +"app/*" = ["E402"] # same: sys.path setup precedes imports diff --git a/requirements-app.txt b/requirements-app.txt new file mode 100644 index 0000000..ea3ec7b --- /dev/null +++ b/requirements-app.txt @@ -0,0 +1,5 @@ +-r requirements.txt + +# Interactive UIs (app/) +gradio>=4.0 # app/chat_app.py — live GraphRAG chat demo +streamlit>=1.30 # app/dashboard.py — benchmark results dashboard diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..23b34e5 --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,6 @@ +-r requirements.txt + +# Testing 
& linting (CI) +pytest>=8.0 +pytest-cov>=5.0 +ruff>=0.4.0 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..40a5ff9 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,23 @@ +# Core ML / retrieval +sentence-transformers>=2.7.0 +datasets>=2.18.0 +numpy>=1.24 +scikit-learn>=1.3 +scipy>=1.10 + +# Knowledge graph +python-arango>=7.9.0 + +# LLM client +requests>=2.31 + +# Plotting / reporting +matplotlib>=3.7 +seaborn>=0.13 +pandas>=2.0 + +# Notebooks / UI (optional at runtime, used by notebooks) +tqdm>=4.66 + +# Config +python-dotenv>=1.0 diff --git a/results/ablation.png b/results/ablation.png new file mode 100644 index 0000000..a09a649 Binary files /dev/null and b/results/ablation.png differ diff --git a/results/summary.json b/results/summary.json new file mode 100644 index 0000000..afae7fb --- /dev/null +++ b/results/summary.json @@ -0,0 +1,17 @@ +{ + "n": 200, + "seed": 42, + "model": "deepseek-r1:8b", + "dataset": "PubMedQA (pqa_labeled)", + "arms": [ + {"arm": "plain", "accuracy": 30.0, "macro_f1": 29.69, "avg_latency": 6.4, "samples": 200, "adds": "baseline chunk RAG"}, + {"arm": "plain_rr", "accuracy": 37.0, "macro_f1": 35.21, "avg_latency": 6.6, "samples": 200, "adds": "+ cross-encoder reranker"}, + {"arm": "graph", "accuracy": 59.5, "macro_f1": 50.51, "avg_latency": 7.5, "samples": 200, "adds": "+ parent-paper expansion"}, + {"arm": "graph_concepts", "accuracy": 57.5, "macro_f1": 49.97, "avg_latency": 40.8, "samples": 200, "adds": "+ MeSH concept hop"} + ], + "contrasts": [ + {"from": "plain", "to": "plain_rr", "effect": "reranker", "delta_acc": 7.0, "gains": 35, "losses": 21, "p_value": 0.0814, "significant": false}, + {"from": "plain_rr", "to": "graph", "effect": "parent expansion", "delta_acc": 22.5, "gains": 71, "losses": 26, "p_value": 0.0000, "significant": true}, + {"from": "graph", "to": "graph_concepts", "effect": "concept hop", "delta_acc": -2.0, "gains": 26, "losses": 30, "p_value": 0.6889, "significant": false} + ] 
+} diff --git a/results/summary.md b/results/summary.md new file mode 100644 index 0000000..8f41522 --- /dev/null +++ b/results/summary.md @@ -0,0 +1,14 @@ +| Arm | Accuracy | Macro F1 | Avg latency (s) | n | +| --- | --- | --- | --- | --- | +| plain | 30.00% | 29.69% | 6.4 | 200 | +| plain_rr | 37.00% | 35.21% | 6.6 | 200 | +| graph | 59.50% | 50.51% | 7.5 | 200 | +| graph_concepts | 57.50% | 49.97% | 40.8 | 200 | + +### Significance (paired McNemar) + +| Contrast | Δacc (pp) | gains | losses | p | sig? | +| --- | --- | --- | --- | --- | --- | +| plain → plain_rr (reranker effect) | +7.00 | 35 | 21 | 0.0814 | no | +| plain_rr → graph (parent-expansion effect) | +22.50 | 71 | 26 | 0.0000 | yes | +| graph → graph_concepts (concept-hop effect) | -2.00 | 26 | 30 | 0.6889 | no | diff --git a/scripts/compare.py b/scripts/compare.py new file mode 100644 index 0000000..5f9e4b0 --- /dev/null +++ b/scripts/compare.py @@ -0,0 +1,168 @@ +"""Aggregate arm results: summary table, McNemar tests, and figures. + + python scripts/compare.py + +Reads results/{arm}_results.json for whichever arms are present and writes +figures + a markdown snippet to results/. The McNemar tests are paired on pubid, +so they only run for arms evaluated on the same seeded sample. +""" + +from __future__ import annotations + +import json +import os +import sys + +ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +sys.path.insert(0, os.path.join(ROOT, "src")) + +from kgqa.config import DATASET_NAME, LLM_MODEL, RANDOM_SEED # noqa: E402 +from kgqa.evaluation import mcnemar_test # noqa: E402 + +RESULTS_DIR = os.path.join(ROOT, "results") +ARM_ORDER = ["plain", "plain_rr", "graph", "graph_concepts"] +ARM_ADDS = { + "plain": "baseline chunk RAG", + "plain_rr": "+ cross-encoder reranker", + "graph": "+ parent-paper expansion", + "graph_concepts": "+ MeSH concept hop", +} +# Adjacent-arm contrasts that isolate each component's contribution. 
+CONTRASTS = [ + ("plain", "plain_rr", "reranker"), + ("plain_rr", "graph", "parent expansion"), + ("graph", "graph_concepts", "concept hop"), +] + + +def load_results(): + out = {} + for arm in ARM_ORDER: + path = os.path.join(RESULTS_DIR, f"{arm}_results.json") + if os.path.exists(path): + with open(path) as f: + out[arm] = json.load(f) + return out + + +def aligned(a, b): + """Align two arms' predictions on shared pubids (same seed -> same order).""" + ids_a = a.get("ids") or list(range(len(a["y_pred"]))) + ids_b = b.get("ids") or list(range(len(b["y_pred"]))) + idx_b = {sid: i for i, sid in enumerate(ids_b)} + gt, pa, pb = [], [], [] + for i, sid in enumerate(ids_a): + j = idx_b.get(sid) + if j is None: + continue + gt.append(a["y_true"][i]) + pa.append(a["y_pred"][i]) + pb.append(b["y_pred"][j]) + return gt, pa, pb + + +def main(): + results = load_results() + if not results: + print(f"No results found in {RESULTS_DIR}. Run scripts/run_benchmark.py first.") + sys.exit(1) + + lines = ["| Arm | Accuracy | Macro F1 | Avg latency (s) | n |", + "| --- | --- | --- | --- | --- |"] + arms_json, contrasts_json, max_n = [], [], 0 + print("\n" + "=" * 64) + print(" RESULTS SUMMARY") + print("=" * 64) + for arm in ARM_ORDER: + if arm not in results: + continue + r = results[arm] + acc, f1 = r["accuracy"] * 100, r.get("macro_f1", 0) * 100 + lat, n = r["avg_latency"], r["samples"] + max_n = max(max_n, n) + print(f" {arm:<16} acc={acc:6.2f}% f1={f1:6.2f}% lat={lat:5.1f}s n={n}") + lines.append(f"| {arm} | {acc:.2f}% | {f1:.2f}% | {lat:.1f} | {n} |") + arms_json.append({"arm": arm, "accuracy": round(acc, 2), "macro_f1": round(f1, 2), + "avg_latency": round(lat, 1), "samples": n, + "adds": ARM_ADDS.get(arm, "")}) + + print("\n" + "=" * 64) + print(" PAIRED McNEMAR TESTS (adjacent ablation contrasts)") + print("=" * 64) + lines += ["", "### Significance (paired McNemar)", "", + "| Contrast | Δacc (pp) | gains | losses | p | sig? 
def _plot(results):
    """Render the per-arm accuracy / macro-F1 bar chart to ``ablation.png``.

    ``results`` maps arm name to that arm's loaded results dict. Only arms
    listed in ``ARM_ORDER`` and present in ``results`` are drawn. If matplotlib
    cannot be imported, the figure is skipped with a printed notice instead of
    failing the caller.
    """
    try:
        import matplotlib

        matplotlib.use("Agg")  # choose the backend before pyplot is imported
        import matplotlib.pyplot as plt
    except Exception as exc:  # pragma: no cover
        print(f"(skipping figures: {exc})")
        return

    present = [arm for arm in ARM_ORDER if arm in results]
    accuracy_pct = [results[arm]["accuracy"] * 100 for arm in present]
    macro_f1_pct = [results[arm].get("macro_f1", 0) * 100 for arm in present]

    figure, axes = plt.subplots(figsize=(9, 5))
    import numpy as np

    positions = np.arange(len(present))
    bar_width = 0.38
    half = bar_width / 2

    # Two bars per arm, sitting side by side around the arm's tick.
    axes.bar(positions - half, accuracy_pct, bar_width, label="Accuracy", color="#2196F3")
    axes.bar(positions + half, macro_f1_pct, bar_width, label="Macro F1", color="#FF9800")
    axes.set_xticks(positions)
    axes.set_xticklabels(present, rotation=15)
    axes.set_ylabel("%")
    axes.set_ylim(0, 100)
    axes.set_title("4-arm ablation — PubMedQA")
    axes.legend()

    # Write each bar's value just above it.
    for slot, pair in enumerate(zip(accuracy_pct, macro_f1_pct, strict=False)):
        for offset, value in zip((-half, half), pair, strict=False):
            axes.text(slot + offset, value + 1, f"{value:.1f}", ha="center", fontsize=8)

    figure.tight_layout()
    target = os.path.join(RESULTS_DIR, "ablation.png")
    figure.savefig(target, dpi=150, bbox_inches="tight")
    print(f"Wrote {target}")
def ingest_split(db, dataset, model, on_duplicate_paper="ignore", batch_size=50):
    """Ingest one PubMedQA split into the graph and return the paper count.

    For every row this writes a ``Papers`` node (key only, so no question text
    or ``final_decision`` is stored), one ``Concepts`` node plus a ``MENTIONS``
    edge per MeSH term, and one ``Chunks`` node plus a ``HAS_CONTEXT`` edge per
    non-blank abstract section.

    Args:
        db: ArangoDB database handle exposing ``collection(name).import_bulk``.
        dataset: iterable of PubMedQA rows (``pubid`` and ``context`` fields).
        model: embedder; ``model.encode(list_of_texts)`` must return one vector
            per text, each with a ``tolist()`` method.
        on_duplicate_paper: ``on_duplicate`` policy for the ``Papers`` import.
        batch_size: number of papers buffered between bulk imports.

    Returns:
        Number of rows (papers) processed.
    """
    # Insertion order is the flush order: nodes first, then the edges that
    # reference them.
    buffers = {
        "Papers": [],
        "Concepts": [],
        "Chunks": [],
        "HAS_CONTEXT": [],
        "MENTIONS": [],
    }

    def flush():
        for name, docs in buffers.items():
            if docs:
                policy = on_duplicate_paper if name == "Papers" else "ignore"
                db.collection(name).import_bulk(docs, on_duplicate=policy)
            docs.clear()

    count = 0
    for row in tqdm(dataset):
        paper_key = str(row["pubid"])
        context = row.get("context") or {}

        # Leakage-free Paper node: no title, no final_decision.
        buffers["Papers"].append({"_key": paper_key})

        for mesh in context.get("meshes", []):
            mesh_key = "".join(c for c in mesh if c.isalnum())
            if not mesh_key:
                continue
            buffers["Concepts"].append({"_key": mesh_key, "name": mesh})
            # Deterministic edge key: without one every import creates a new
            # edge, so on_duplicate="ignore" could never deduplicate a repeated
            # term or a second ingestion run.
            buffers["MENTIONS"].append({
                "_key": f"{paper_key}_{mesh_key}",
                "_from": f"Papers/{paper_key}",
                "_to": f"Concepts/{mesh_key}",
            })

        ctx_texts = context.get("contexts", [])
        ctx_labels = context.get("labels", [])
        # Skip blank sections, as kgqa.data.iter_chunks does, so the graph and
        # the locally built corpus hold the same chunks. The original index is
        # kept so chunk keys do not shift.
        sections = [(idx, text) for idx, text in enumerate(ctx_texts)
                    if text and text.strip()]
        if sections:
            embeddings = model.encode([text for _, text in sections])
            for (idx, text), emb in zip(sections, embeddings, strict=False):
                chunk_key = f"{paper_key}_{idx}"
                buffers["Chunks"].append({
                    "_key": chunk_key,
                    "paper_key": paper_key,
                    "text": text,
                    "label": ctx_labels[idx] if idx < len(ctx_labels) else "context",
                    "embedding": emb.tolist(),
                })
                buffers["HAS_CONTEXT"].append({
                    "_key": chunk_key,  # one edge per chunk, see MENTIONS above
                    "_from": f"Papers/{paper_key}",
                    "_to": f"Chunks/{chunk_key}",
                })

        count += 1
        if count % batch_size == 0:
            flush()
    flush()  # whatever is left from the final partial batch
    return count
= time.time() + n = ingest_split(db, ds, model, on_duplicate_paper="update") + print(f" {n:,} papers in {time.time() - t0:.1f}s") + + print("\nCollection counts:") + for col in (*NODE_COLLECTIONS, *EDGE_COLLECTIONS): + print(f" {col:<15}: {db.collection(col).count():>8,}") + + +if __name__ == "__main__": + main() diff --git a/scripts/run_benchmark.py b/scripts/run_benchmark.py new file mode 100644 index 0000000..fa0f787 --- /dev/null +++ b/scripts/run_benchmark.py @@ -0,0 +1,175 @@ +"""Run one arm of the GraphRAG vs PlainRAG ablation on PubMedQA. + + python scripts/run_benchmark.py --arm plain_rr --n 200 + +Arms: + plain vector top-k chunks (baseline) + plain_rr + cross-encoder rerank + graph + parent-paper expansion (full abstracts) + graph_concepts + MeSH concept-hop expansion + +All arms share one ArangoDB-backed chunk corpus (cached locally), the same +encoder, reranker, prompt, LLM, seed and sample — so results are comparable and +the only moving part is the retrieval strategy named by --arm. + +Resilience: each question is retried, and a wedged/crashed Ollama is restarted +between attempts, so a single 500/timeout cannot abort the whole arm. Partial +results are checkpointed every 25 questions. 
def _ollama_base(api_url: str) -> str:
    """Return the part of an Ollama endpoint URL that precedes ``/api/``."""
    root, _sep, _tail = api_url.partition("/api/")
    return root


def _ollama_healthy(api_url: str, timeout: int = 5) -> bool:
    """Return True when ``GET <base>/api/tags`` answers with an OK status.

    Any failure while building or sending the request counts as "not healthy".
    """
    try:
        response = requests.get(_ollama_base(api_url) + "/api/tags", timeout=timeout)
        return response.ok
    except Exception:
        return False


def ensure_ollama(api_url: str, model: str, restart: bool = False, wait: int = 90) -> bool:
    """Make sure a healthy Ollama is serving; (re)start it if not.

    Args:
        api_url: any Ollama endpoint URL containing ``/api/``.
        model: model name sent in the warm-up request.
        restart: when True, kill running ``ollama`` processes first and always
            start a new server; when False, return immediately if the server
            already answers the health check.
        wait: seconds to keep polling for the server to become healthy.

    Returns:
        True once the health check passes, False if ``wait`` elapses first.
    """
    import shutil

    if restart:
        try:
            subprocess.run(["pkill", "-f", "ollama"], capture_output=True)
            time.sleep(3)
        except FileNotFoundError:
            pass  # no pkill (e.g. Windows) — fall through and try to start
    elif _ollama_healthy(api_url):
        return True

    binary = shutil.which("ollama") or "/usr/local/bin/ollama"
    try:
        subprocess.Popen([binary, "serve"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    except FileNotFoundError:
        print("[Ollama] binary not found; assuming a server is reachable elsewhere.")

    give_up_at = time.time() + wait
    while time.time() < give_up_at:
        if not _ollama_healthy(api_url):
            time.sleep(2)
            continue
        # Warm the model so the next real call isn't a cold load. This is
        # best-effort: a failed warm-up does not change the result.
        try:
            requests.post(
                _ollama_base(api_url) + "/api/generate",
                json={"model": model, "prompt": "ok", "stream": False,
                      "keep_alive": "30m", "options": {"num_predict": 1}},
                timeout=180,
            )
        except Exception:
            pass
        return True

    print("[Ollama] WARNING: server did not become healthy in time.")
    return False
def main():
    """CLI entry point: run one ablation arm over the benchmark sample.

    Parses the arguments, optionally makes sure Ollama is serving, loads the
    chunk store, encoder and (for every arm except ``plain``) the reranker,
    then answers each sampled question with up to ``MAX_TRIES`` attempts.
    Results are checkpointed every ``CHECKPOINT_EVERY`` questions and saved in
    full at the end.

    Exits through ``parser.error`` when ``--n`` is not a positive integer.
    """
    parser = argparse.ArgumentParser(description=__doc__,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument("--arm", required=True, choices=ARMS)
    parser.add_argument("--n", type=int, default=None, help="sample size (default: config BENCHMARK_N)")
    parser.add_argument("--seed", type=int, default=None, help="random seed (default: config RANDOM_SEED)")
    parser.add_argument("--output", default=None, help="results JSON path")
    parser.add_argument("--no-ollama-start", action="store_true",
                        help="don't auto-start/health-check the Ollama server")
    args = parser.parse_args()

    # Reject a meaningless sample size up front. `args.n or BENCHMARK_N` used
    # to turn `--n 0` silently into the full default run.
    if args.n is not None and args.n <= 0:
        parser.error(f"--n must be a positive integer (got {args.n})")

    from kgqa.config import BENCHMARK_N, LLM_MODEL, OLLAMA_API, RANDOM_SEED, ArangoConfig
    from kgqa.data import load_benchmark_samples
    from kgqa.evaluation import Evaluator, FuzzyEvaluator
    from kgqa.models import connect_arango, load_encoder, load_reranker
    from kgqa.retrieval import ChunkStore

    if not args.no_ollama_start:
        print("[Ollama] Ensuring server is healthy...")
        ensure_ollama(OLLAMA_API, LLM_MODEL)

    n = args.n if args.n is not None else BENCHMARK_N
    seed = args.seed if args.seed is not None else RANDOM_SEED
    results_dir = os.path.join(ROOT, "results")
    os.makedirs(results_dir, exist_ok=True)
    out_path = args.output or os.path.join(results_dir, f"{args.arm}_results.json")
    cache_file = os.path.join(ROOT, "pubmed_vectors_cache.pkl")

    db = connect_arango(ArangoConfig())
    print("[Corpus] Loading chunk store from ArangoDB (cached after first run)...")
    store = ChunkStore.from_arango(db, cache_file=cache_file)
    print(f"[Corpus] {len(store):,} chunks loaded.")

    encoder = load_encoder()
    # The plain arm never reranks, so skip loading the cross-encoder for it.
    reranker = load_reranker() if args.arm != "plain" else None

    retriever = build_retriever(args.arm, store, encoder, reranker, db)
    samples = load_benchmark_samples(n=n, seed=seed)

    fuzzy = FuzzyEvaluator()
    evaluator = Evaluator(args.arm)
    print(f"\n=== Benchmark: {args.arm} (n={len(samples)}, seed={seed}) ===")
    for i, s in enumerate(samples):
        # NOTE: the recorded latency spans every attempt, including any Ollama
        # restart between attempts, not just the successful call.
        t0 = time.time()
        raw = None
        for attempt in range(1, MAX_TRIES + 1):
            try:
                raw = retriever.answer_benchmark(s.question)
                break
            except Exception as exc:
                print(f" [warn] q{i + 1} attempt {attempt}/{MAX_TRIES} failed: "
                      f"{type(exc).__name__}: {exc}")
                if attempt < MAX_TRIES and not args.no_ollama_start:
                    ensure_ollama(OLLAMA_API, LLM_MODEL, restart=True)
        latency = time.time() - t0

        if raw is None:
            pred = "maybe"  # last resort so one bad call doesn't abort the arm
            print(f"[{i + 1:3d}] GT={s.final_decision:<5} Pred={pred:<5} ! "
                  f"(skipped after {MAX_TRIES} tries)")
        else:
            pred = fuzzy.extract_answer(raw)
            icon = "v" if pred == s.final_decision.lower().strip() else "x"
            print(f"[{i + 1:3d}] GT={s.final_decision:<5} Pred={pred:<5} {icon} ({latency:.1f}s)")

        evaluator.record(s.final_decision, pred, latency, sample_id=s.pubid)
        if (i + 1) % CHECKPOINT_EVERY == 0:
            evaluator.save(out_path)  # checkpoint partial progress

    evaluator.report()
    evaluator.save(out_path)
+ +Arms: + plain vector search -> top-k chunks (baseline) + plain_rr vector search -> cross-encoder rerank -> top-k chunks + graph plain_rr -> parent-paper expansion (full abstracts) + graph_concepts graph -> MeSH concept-hop expansion (related papers) +""" + +__version__ = "1.0.0" diff --git a/src/kgqa/config.py b/src/kgqa/config.py new file mode 100644 index 0000000..fa14d3f --- /dev/null +++ b/src/kgqa/config.py @@ -0,0 +1,75 @@ +"""Central configuration — the single source of truth for every constant. + +Every arm of the comparison reads from here, so the *only* differences between +PlainRAG and GraphRAG are the retrieval strategy and context assembly. Anything +that could confound the comparison (embedder, reranker, prompt, LLM, top-k, +sample size, seed) lives in this file and nowhere else. +""" + +from __future__ import annotations + +import os +from dataclasses import dataclass, field + +try: # optional: load a local .env if python-dotenv is installed + from dotenv import load_dotenv + + load_dotenv() +except Exception: # pragma: no cover - dotenv is optional + pass + + +# ── Shared models (identical across all arms) ───────────────────────────────── +EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2" # 384-dim +CROSS_ENCODER = "cross-encoder/ms-marco-MiniLM-L-6-v2" +LLM_MODEL = os.environ.get("LLM_MODEL", "deepseek-r1:8b") + +# ── Retrieval hyper-parameters (identical across all arms) ──────────────────── +TOP_K_FINAL = 3 # documents handed to the LLM +TOP_K_CANDIDATES = 75 # wide pool fed to the reranker (rerank arms only) +CONCEPT_HOP_PAPERS = 3 # extra related papers pulled in by the concept arm + +# ── Benchmark protocol (identical across all arms) ──────────────────────────── +BENCHMARK_N = int(os.environ.get("BENCHMARK_N", "200")) +RANDOM_SEED = int(os.environ.get("RANDOM_SEED", "42")) +DATASET_NAME = "qiaojin/PubMedQA" +LABELED_CONFIG = "pqa_labeled" +UNLABELED_CONFIG = "pqa_unlabeled" + +# ── LLM serving 
def _env_or(name: str, fallback: str):
    """Build a dataclass default factory that reads ``name`` from the environment.

    The variable is looked up each time the factory runs (at instance
    creation), not when the class is defined.
    """
    def read() -> str:
        return os.environ.get(name, fallback)

    return read


@dataclass
class ArangoConfig:
    """Connection settings for ArangoDB, each defaulting to an environment variable."""

    # ARANGO_HOST, falling back to a local server.
    host: str = field(default_factory=_env_or("ARANGO_HOST", "http://localhost:8529"))
    # ARANGO_USER, falling back to "root".
    user: str = field(default_factory=_env_or("ARANGO_USER", "root"))
    # ARANGO_PASS; empty when unset, which require_password() rejects.
    password: str = field(default_factory=_env_or("ARANGO_PASS", ""))
    # ARANGO_DB, falling back to "pubmed_graph".
    db_name: str = field(default_factory=_env_or("ARANGO_DB", "pubmed_graph"))

    def require_password(self) -> None:
        """Raise ``OSError`` with setup instructions when no password is configured."""
        if self.password:
            return
        raise OSError(
            "ARANGO_PASS is not set. Set it before connecting:\n"
            ' PowerShell : $env:ARANGO_PASS = "your_password"\n'
            " bash : export ARANGO_PASS=your_password\n"
            " Colab : add ARANGO_PASS in the Secrets panel"
        )
def load_benchmark_samples(n: int = BENCHMARK_N, seed: int = RANDOM_SEED) -> list[BenchmarkSample]:
    """Return a deterministic random sample of labeled PubMedQA questions.

    The labeled split's row indices are shuffled with ``random.Random(seed)``
    and walked in that order, so the same ``seed`` always yields the same
    questions in the same order, and a smaller ``n`` yields a prefix of a
    larger one. Rows without a question or a ``final_decision`` are skipped.

    Args:
        n: maximum number of samples to return; ``n <= 0`` returns ``[]``.
        seed: seed for the shuffle.

    Returns:
        Up to ``n`` ``BenchmarkSample`` objects, fewer if the split runs out of
        valid rows.
    """
    # The loop below appends before it checks the limit, so without this guard
    # a non-positive n would still return one sample.
    if n <= 0:
        return []

    from datasets import load_dataset

    ds = load_dataset(DATASET_NAME, LABELED_CONFIG, split="train")
    indices = list(range(len(ds)))
    random.Random(seed).shuffle(indices)

    samples: list[BenchmarkSample] = []
    for idx in indices:
        item = ds[idx]
        decision = item.get("final_decision")
        if not item.get("question") or not decision:
            continue
        samples.append(BenchmarkSample(
            pubid=str(item["pubid"]),
            question=item["question"],
            final_decision=decision,
        ))
        if len(samples) >= n:
            break
    return samples
class FuzzyEvaluator:
    """Extracts a normalised yes/no/maybe from verbose model output."""

    # A closed reasoning block. The previous pattern, r".*?", only ever matched
    # empty strings, so nothing was stripped and a "final answer" phrase inside
    # the reasoning could win over the real one.
    _THINK_BLOCK = re.compile(r"<think>.*?</think>", re.DOTALL | re.IGNORECASE)
    # The explicit verdict line the benchmark prompt asks for.
    _FINAL_ANSWER = re.compile(r"final answer\s*:\s*(yes|no|maybe)")
    # Fallback: any standalone label word.
    _ANY_LABEL = re.compile(r"\b(yes|no|maybe)\b")

    def extract_answer(self, text: str) -> str:
        """Return ``"yes"``, ``"no"`` or ``"maybe"`` for a raw model response.

        Closed ``<think>...</think>`` blocks are removed first. The first
        ``Final Answer: <label>`` in what remains wins. Otherwise the last
        standalone label word is used, and ``"maybe"`` is returned when the
        text contains no label at all. An unclosed ``<think>`` (truncated
        generation) is left in place so the fallback can still read it.
        """
        clean = self._THINK_BLOCK.sub("", text or "").lower()
        match = self._FINAL_ANSWER.search(clean)
        if match:
            return match.group(1)
        matches = self._ANY_LABEL.findall(clean)
        return matches[-1] if matches else "maybe"
+ """ + + model_name: str + y_true: list = field(default_factory=list) + y_pred: list = field(default_factory=list) + latencies: list = field(default_factory=list) + ids: list = field(default_factory=list) + + def record(self, ground_truth: str, prediction: str, + latency: float = 0.0, sample_id: str | None = None) -> None: + pred = prediction.lower().strip() + if pred not in LABELS: + pred = "maybe" + self.y_true.append(ground_truth.lower().strip()) + self.y_pred.append(pred) + self.latencies.append(latency) + self.ids.append(sample_id) + + # ── metrics ─────────────────────────────────────────────────────────────── + def accuracy(self) -> float: + return accuracy_score(self.y_true, self.y_pred) if self.y_true else 0.0 + + def macro_f1(self) -> float: + if not self.y_true: + return 0.0 + return f1_score(self.y_true, self.y_pred, labels=list(LABELS), + average="macro", zero_division=0) + + def avg_latency(self) -> float: + return sum(self.latencies) / len(self.latencies) if self.latencies else 0.0 + + def summary(self) -> dict: + return { + "model": self.model_name, + "accuracy": self.accuracy(), + "macro_f1": self.macro_f1(), + "samples": len(self.y_true), + "total_time": sum(self.latencies), + "avg_latency": self.avg_latency(), + "y_true": self.y_true, + "y_pred": self.y_pred, + "ids": self.ids, + } + + def report(self) -> dict: + if not self.y_true: + print("No data recorded.") + return {} + print(f"\n{'=' * 52}") + print(f" {self.model_name} — Evaluation Report") + print(f"{'=' * 52}") + print(f" Samples : {len(self.y_true)}") + print(f" Accuracy : {self.accuracy():.2%}") + print(f" Macro F1 : {self.macro_f1():.2%}") + print(f" Avg/query : {self.avg_latency():.1f}s") + print(f"{'-' * 52}") + print(classification_report(self.y_true, self.y_pred, + labels=list(LABELS), zero_division=0)) + return self.summary() + + def save(self, path: str) -> None: + with open(path, "w") as f: + json.dump(self.summary(), f, indent=2) + print(f"Results saved to {path}") + + +def 
mcnemar_test(y_true: list, pred_a: list, pred_b: list) -> dict: + """Paired McNemar test: is arm B's accuracy change over arm A significant? + + Compares the two arms only on the samples where exactly one is correct + (the discordant pairs). Uses the exact binomial test, which is valid for + the small discordant counts typical of n~200 benchmarks. + """ + from scipy.stats import binomtest + + if not (len(y_true) == len(pred_a) == len(pred_b)): + raise ValueError("y_true, pred_a, pred_b must be the same length") + + # b: A wrong, B right (B's gains). c: A right, B wrong (B's losses). + b = c = 0 + for gt, a, bb in zip(y_true, pred_a, pred_b, strict=False): + a_ok, b_ok = (a == gt), (bb == gt) + if a_ok and not b_ok: + c += 1 + elif b_ok and not a_ok: + b += 1 + + n = b + c + p_value = float(binomtest(b, n, 0.5).pvalue) if n > 0 else 1.0 + return { + "b_gains": b, # B right, A wrong + "c_losses": c, # A right, B wrong + "discordant": n, + "p_value": p_value, + "significant_at_0.05": bool(p_value < 0.05), + } diff --git a/src/kgqa/llm.py b/src/kgqa/llm.py new file mode 100644 index 0000000..94b956c --- /dev/null +++ b/src/kgqa/llm.py @@ -0,0 +1,44 @@ +"""Thin Ollama client — the single LLM entry point shared by all arms.""" + +from __future__ import annotations + +import requests + +from .config import ( + LLM_KEEP_ALIVE, + LLM_MODEL, + LLM_NUM_CTX, + LLM_NUM_PREDICT, + LLM_TEMPERATURE, + LLM_TIMEOUT, + OLLAMA_API, +) + + +def call_ollama( + prompt: str, + system: str = "", + temperature: float = LLM_TEMPERATURE, + model: str = LLM_MODEL, + api_url: str = OLLAMA_API, +) -> str: + """Single synchronous chat completion against a local Ollama server.""" + messages = [] + if system: + messages.append({"role": "system", "content": system}) + messages.append({"role": "user", "content": prompt}) + + payload = { + "model": model, + "messages": messages, + "stream": False, + "keep_alive": LLM_KEEP_ALIVE, # keep the model resident across the run + "options": { + "temperature": 
def connect_arango(cfg, max_retries: int = 5):
    """Connect to ArangoDB with retries and return the target database handle.

    Each attempt verifies the server by calling ``version()`` on ``_system``
    before opening ``cfg.db_name``. Failed attempts back off linearly (5s, 10s,
    ...); there is no wait after the final attempt.

    Args:
        cfg: an ``ArangoConfig`` (needs ``host``, ``user``, ``password``,
            ``db_name`` and ``require_password()``).
        max_retries: maximum number of connection attempts.

    Returns:
        The database handle for ``cfg.db_name``.

    Raises:
        OSError: from ``cfg.require_password()`` when no password is set.
        ConnectionError: when every attempt failed; the last driver error is
            attached as ``__cause__``.
    """
    import time

    from arango import ArangoClient
    from arango.exceptions import ArangoServerError, ServerConnectionError

    cfg.require_password()
    client = ArangoClient(hosts=cfg.host)
    last_error = None
    for attempt in range(1, max_retries + 1):
        try:
            sys_db = client.db("_system", username=cfg.user, password=cfg.password)
            sys_db.version()  # cheap round trip that proves the server answers
            db = client.db(cfg.db_name, username=cfg.user, password=cfg.password)
            print("[ArangoDB] Connected.")
            return db
        except (ServerConnectionError, ArangoServerError) as exc:
            last_error = exc
            if attempt == max_retries:
                # Nothing left to retry: don't announce or sleep through a
                # retry that will never happen.
                print(f"[ArangoDB] Attempt {attempt} failed.")
                break
            wait = attempt * 5
            print(f"[ArangoDB] Attempt {attempt} failed. Retrying in {wait}s...")
            time.sleep(wait)
    raise ConnectionError("Could not connect to ArangoDB.") from last_error
def build_prompt(context: str, question: str) -> str:
    """Return the user-turn prompt: the context block, a blank line, then the question.

    Every retrieval arm uses this same layout, so only ``context`` varies.
    """
    template = "Context:\n{}\n\nQuestion: {}"
    return template.format(context, question)
"12345" + text: str + score: float = 0.0 + + +def _normalize(matrix: np.ndarray) -> np.ndarray: + norms = np.linalg.norm(matrix, axis=1, keepdims=True) + norms[norms == 0] = 1.0 + return matrix / norms + + +class ChunkStore: + """In-memory, L2-normalised chunk embeddings with cosine search.""" + + def __init__(self, ids: list[str], paper_keys: list[str], + texts: list[str], embeddings: np.ndarray): + self.ids = ids + self.paper_keys = paper_keys + self.texts = texts + self.embeddings = _normalize(np.asarray(embeddings, dtype=np.float32)) \ + if len(embeddings) else np.zeros((0, 0), dtype=np.float32) + + def __len__(self) -> int: + return len(self.ids) + + def search(self, query_emb: np.ndarray, k: int) -> list[int]: + """Return indices of the top-k chunks by cosine similarity.""" + if len(self) == 0: + return [] + q = _normalize(np.atleast_2d(np.asarray(query_emb, dtype=np.float32))) + sims = (self.embeddings @ q[0]) + k = min(k, len(self)) + top = np.argpartition(sims, -k)[-k:] + return list(top[np.argsort(sims[top])[::-1]]) + + def candidate(self, idx: int, score: float = 0.0) -> Candidate: + return Candidate(self.ids[idx], self.paper_keys[idx], self.texts[idx], score) + + # ── builders ─────────────────────────────────────────────────────────────── + @classmethod + def from_dataset(cls, encoder, include_unlabeled: bool = True, + batch_size: int = 128) -> ChunkStore: + """Build the corpus locally from PubMedQA (no ArangoDB needed).""" + from ..data import iter_chunks + + ids, paper_keys, texts = [], [], [] + for paper_key, chunk_idx, text in iter_chunks(include_unlabeled): + ids.append(f"Chunks/{paper_key}_{chunk_idx}") + paper_keys.append(paper_key) + texts.append(text) + embeddings = encoder.encode( + texts, batch_size=batch_size, convert_to_numpy=True, + normalize_embeddings=True, show_progress_bar=True, + ) + return cls(ids, paper_keys, texts, embeddings) + + @classmethod + def from_arango(cls, db, collection: str = "Chunks", batch: int = 5000, + cache_file: 
class BaseRetriever(ABC):
    """encode -> (optional) rerank -> select -> build context -> answer.

    Subclasses only implement :meth:`_build_context`; encoding, vector search
    and the optional rerank step are shared so every arm selects chunks the
    same way.
    """

    name: str = "base"

    def __init__(self, store: ChunkStore, encoder, reranker=None,
                 top_k_final: int = TOP_K_FINAL,
                 top_k_candidates: int = TOP_K_CANDIDATES):
        """Store the retrieval components.

        Args:
            store: Chunk index; used via ``search(query_emb, k)`` and
                ``candidate(i)``.
            encoder: Object exposing ``encode(texts, normalize_embeddings=...)``.
            reranker: Optional object exposing ``predict(pairs)``; ``None``
                disables reranking.
            top_k_final: Number of chunks passed on to the context builder.
            top_k_candidates: Size of the pool fetched before reranking.
        """
        self.store = store
        self.encoder = encoder
        self.reranker = reranker
        self.top_k_final = top_k_final
        self.top_k_candidates = top_k_candidates

    def _select(self, query: str) -> list[Candidate]:
        """Return the top-k chunks, reranked from a wider pool when possible.

        Without a reranker the vector-search order is returned as is. With a
        reranker, the pool is re-scored and the best ``top_k_final`` entries
        are returned as new ``Candidate`` objects carrying the rerank score.
        """
        # Explicit None check: an object's truthiness says nothing about
        # whether a reranker was supplied (and subclasses name their arm with
        # the same ``is not None`` test).
        use_reranker = self.reranker is not None
        query_emb = self.encoder.encode([query], normalize_embeddings=True)
        # The rerank pool must never be smaller than the final cut, otherwise
        # fewer than ``top_k_final`` chunks would be returned.
        if use_reranker:
            pool_k = max(self.top_k_candidates, self.top_k_final)
        else:
            pool_k = self.top_k_final
        idxs = self.store.search(query_emb, pool_k)
        candidates = [self.store.candidate(i) for i in idxs]

        if not (use_reranker and candidates):
            return candidates[:self.top_k_final]

        scores = self.reranker.predict([[query, c.text] for c in candidates])
        # argsort is ascending; reverse it to get best-first, then cut.
        order = np.argsort(scores)[::-1][:self.top_k_final]
        return [
            Candidate(candidates[i].chunk_id, candidates[i].paper_key,
                      candidates[i].text, float(scores[i]))
            for i in order
        ]

    @abstractmethod
    def _build_context(self, query: str, candidates: list[Candidate]) -> str:
        """Turn selected chunks into the LLM context string."""

    def retrieve(self, query: str) -> str:
        """Select chunks for ``query`` and return the built context string."""
        return self._build_context(query, self._select(query))

    def answer_benchmark(self, question: str) -> str:
        """Answer ``question`` with the benchmark system prompt."""
        context = self.retrieve(question)
        return call_ollama(build_prompt(context, question),
                           system=BENCHMARK_SYSTEM_PROMPT)

    def chat(self, question: str, temperature: float = 0.3) -> dict:
        """Conversational answer plus the source paper pubids it retrieved.

        Runs retrieval once and returns the cited papers (their PubMedQA pubids,
        which are real PubMed IDs) so a UI can link back to the sources.

        Returns:
            A dict with keys ``answer``, ``sources`` (paper keys of the
            selected chunks, de-duplicated in first-seen order) and
            ``context``.
        """
        candidates = self._select(question)
        context = self._build_context(question, candidates)
        answer = call_ollama(build_prompt(context, question),
                             system=CHAT_SYSTEM_PROMPT, temperature=temperature)
        # dict.fromkeys de-duplicates while keeping retrieval order.
        sources = list(dict.fromkeys(c.paper_key for c in candidates))
        return {"answer": answer, "sources": sources, "context": context}
# Reconstruct the full abstract of each selected chunk's parent paper.
# Chunk keys follow "<paper_key>_<section index>", so sections are ordered by
# the numeric suffix: a plain string sort would put "_10" before "_2" and
# scramble abstracts with ten or more sections. The raw key is kept as a
# tie-breaker so the order stays deterministic for unexpected key shapes.
_PARENT_AQL = """
    WITH Papers, Chunks
    FOR cid IN @ids
        LET chunk = DOCUMENT(cid)
        FOR paper IN 1..1 INBOUND chunk @@has_context
            LET sections = (
                FOR c IN 1..1 OUTBOUND paper @@has_context
                    SORT TO_NUMBER(LAST(SPLIT(c._key, "_"))), c._key
                    RETURN c.text
            )
            RETURN DISTINCT {
                paper: paper._key,
                abstract: CONCAT_SEPARATOR(" ", sections)
            }
"""
# From the seed papers, hop across shared MeSH concepts to related papers.
# Two-stage: rank neighbours by shared-concept count, then rebuild abstracts
# only for the top-N. Ties are broken by key so the selection is reproducible,
# and sections are ordered by their numeric key suffix ("_10" after "_2").
_CONCEPT_AQL = """
    WITH Papers, Chunks, Concepts
    LET seeds = @paper_keys
    LET ranked = (
        FOR pkey IN seeds
            LET paper = DOCUMENT(CONCAT("Papers/", pkey))
            FILTER paper != null
            FOR concept IN 1..1 OUTBOUND paper @@mentions
                FOR neighbour IN 1..1 INBOUND concept @@mentions
                    FILTER neighbour._key NOT IN seeds
                    COLLECT nkey = neighbour._key WITH COUNT INTO shared
                    SORT shared DESC, nkey
                    LIMIT @limit
                    RETURN { nkey: nkey, shared: shared }
    )
    FOR n IN ranked
        LET sections = (
            FOR c IN 1..1 OUTBOUND DOCUMENT(CONCAT("Papers/", n.nkey)) @@has_context
                SORT TO_NUMBER(LAST(SPLIT(c._key, "_"))), c._key
                RETURN c.text
        )
        RETURN { paper: n.nkey, abstract: CONCAT_SEPARATOR(" ", sections), shared: n.shared }
"""


class GraphRetriever(BaseRetriever):
    """Retriever that expands selected chunks through the graph.

    Selected chunks are replaced by their parent papers' full abstracts; with
    ``use_concepts=True`` a few concept-sharing neighbour papers are appended.
    """

    name = "graph"

    def __init__(self, store, encoder, db, reranker=None,
                 use_concepts: bool = False,
                 concept_hop_papers: int = CONCEPT_HOP_PAPERS, **kwargs):
        """Configure the graph arm.

        Args:
            store, encoder, reranker, **kwargs: Forwarded to ``BaseRetriever``.
            db: Database handle; only ``db.aql.execute(query, bind_vars=...)``
                is used.
            use_concepts: Enable the concept hop (arm name ``graph_concepts``).
            concept_hop_papers: Maximum number of neighbour papers requested.
        """
        super().__init__(store, encoder, reranker=reranker, **kwargs)
        self.db = db
        self.use_concepts = use_concepts
        self.concept_hop_papers = concept_hop_papers
        if use_concepts:
            self.name = "graph_concepts"

    def _parent_abstracts(self, chunk_ids: list[str]) -> list[tuple[str, str]]:
        """Return ``(paper_key, abstract)`` per distinct parent paper, in row order."""
        rows = self.db.aql.execute(
            _PARENT_AQL,
            bind_vars={"ids": chunk_ids, "@has_context": HAS_CONTEXT},
        )
        out, seen = [], set()
        for row in rows:
            key = row["paper"]
            if key in seen:
                continue
            seen.add(key)
            # Normalise a missing/null abstract to "" to honour the str type.
            out.append((key, row.get("abstract") or ""))
        return out

    def _concept_neighbours(self, paper_keys: list[str]) -> list[tuple[str, str]]:
        """Return ``(paper_key, abstract)`` for neighbours of the seed papers."""
        rows = self.db.aql.execute(
            _CONCEPT_AQL,
            bind_vars={
                "paper_keys": paper_keys,
                "@mentions": MENTIONS,
                "@has_context": HAS_CONTEXT,
                "limit": self.concept_hop_papers,
            },
        )
        return [(row["paper"], row.get("abstract") or "") for row in rows]

    def _build_context(self, query: str, candidates: list[Candidate]) -> str:
        """Build the generically-labelled study context for the prompt.

        Falls back to the raw chunk texts if any graph query raises. Returns
        ``"No context found."`` when nothing non-empty is available.
        """
        chunk_ids = [c.chunk_id for c in candidates]
        try:
            studies = self._parent_abstracts(chunk_ids)
            seed_keys = [k for k, _ in studies]
            if self.use_concepts and seed_keys:
                seen = set(seed_keys)  # O(1) membership instead of list scans
                for key, abstract in self._concept_neighbours(seed_keys):
                    if key not in seen and abstract:
                        seen.add(key)
                        studies.append((key, abstract))
        except Exception as exc:  # graph unreachable -> degrade to raw chunks
            print(f"[GraphRAG] Graph expansion failed ({exc}). Using raw chunks.")
            studies = [(c.paper_key, c.text) for c in candidates]

        # Filter first, then number: labels stay contiguous (1, 2, 3, ...)
        # even when a study has an empty abstract.
        abstracts = [abstract for _key, abstract in studies if abstract]
        parts = [
            f"=== STUDY {i} ===\n{abstract}"
            for i, abstract in enumerate(abstracts, start=1)
        ]
        return "\n\n".join(parts) if parts else "No context found."
class FakeEncoder:
    """Deterministic hashing encoder — stable vectors without downloading a model.

    Each whitespace token is counted in one of ``dim`` buckets. The bucket is
    derived from CRC32 rather than the builtin ``hash()``, because string
    hashes are salted per interpreter process and would make vectors (and any
    bucket collisions) differ from one test run to the next.
    """

    dim = 16

    def encode(self, texts, normalize_embeddings=False, convert_to_numpy=True,
               batch_size=32, show_progress_bar=False):
        """Encode a string or an iterable of strings into bag-of-token vectors.

        Args:
            texts: One string (returns a 1-D vector) or an iterable of strings
                (returns a 2-D array with one row per item).
            normalize_embeddings: L2-normalise each row; all-zero rows are
                left as zeros.
            convert_to_numpy, batch_size, show_progress_bar: Accepted for
                call-site compatibility and ignored.
        """
        import zlib  # local import keeps this fake self-contained

        single = isinstance(texts, str)
        items = [texts] if single else list(texts)
        vecs = np.zeros((len(items), self.dim), dtype=np.float32)
        for i, t in enumerate(items):
            for token in str(t).lower().split():
                bucket = zlib.crc32(token.encode("utf-8")) % self.dim
                vecs[i, bucket] += 1.0
        if normalize_embeddings:
            norms = np.linalg.norm(vecs, axis=1, keepdims=True)
            norms[norms == 0] = 1.0  # avoid dividing empty rows by zero
            vecs = vecs / norms
        return vecs[0] if single else vecs
class FakeDB:
    """In-memory substitute for the database handle used by graph expansion.

    Attributes:
        abstracts: Mapping of paper key to its full abstract.
        neighbours: List of ``(paper_key, abstract)`` pairs offered as
            concept-hop results.
        aql: Query facade bound to this instance.
    """

    def __init__(self, abstracts, neighbours=()):
        # Bind the query facade to this instance; it only stores the reference.
        self.aql = FakeAQL(self)
        # Copy the neighbours so later mutation of the argument has no effect.
        self.neighbours = [*neighbours]
        self.abstracts = abstracts
def _fake_datasets(monkeypatch, rows):
    """Register a stub ``datasets`` module whose ``load_dataset`` returns ``rows``.

    The stub is installed into ``sys.modules`` through ``monkeypatch.setitem``;
    ``load_dataset`` ignores every argument it receives.
    """

    def load_dataset(*_args, **_kwargs):
        return rows

    stub = types.ModuleType("datasets")
    stub.load_dataset = load_dataset
    monkeypatch.setitem(sys.modules, "datasets", stub)
def test_extract_strips_think_block():
    """Labels mentioned inside a ``<think>`` block must not affect the answer."""
    fz = FuzzyEvaluator()
    # The reasoning block deliberately names every label; only the text after
    # it should decide the extracted answer.
    text = "<think>maybe yes no</think> The study shows ... Final Answer: maybe"
    assert fz.extract_answer(text) == "maybe"
def test_call_ollama_builds_payload_and_returns_content(monkeypatch):
    """``call_ollama`` posts a chat payload and returns the message content."""
    import kgqa.llm as llm

    seen = {}

    class _StubResponse:
        """Just enough of a response object: status check plus JSON body."""

        def raise_for_status(self):
            return None

        def json(self):
            return {"message": {"content": "the answer"}}

    def _post(url, json=None, timeout=None):
        # Record what the client sent so it can be inspected below.
        seen["url"] = url
        seen["payload"] = json
        return _StubResponse()

    monkeypatch.setattr(llm.requests, "post", _post)

    reply = llm.call_ollama("my prompt", system="be helpful", temperature=0.0)
    assert reply == "the answer"

    body = seen["payload"]
    messages = body["messages"]
    assert messages[0] == {"role": "system", "content": "be helpful"}
    assert messages[-1] == {"role": "user", "content": "my prompt"}
    assert body["stream"] is False
    assert "num_predict" in body["options"]  # generation cap is applied
    assert "keep_alive" in body  # model kept resident
def test_graph_concept_hop_adds_neighbour(fake_encoder, fake_reranker):
    """With ``use_concepts`` the context gains one neighbour study."""
    paper_abstracts = {"1": "FULL ABSTRACT 1: aspirin", "2": "x", "3": "y"}
    concept_neighbours = [("99", "NEIGHBOUR ABSTRACT via shared MeSH concept")]
    db = FakeDB(abstracts=paper_abstracts, neighbours=concept_neighbours)

    retriever = GraphRetriever(
        make_store(fake_encoder), fake_encoder, db,
        reranker=fake_reranker, use_concepts=True, top_k_final=1,
    )
    assert retriever.name == "graph_concepts"

    context = retriever.retrieve("aspirin heart attack")
    assert "NEIGHBOUR ABSTRACT" in context
    # One seed study plus one concept neighbour.
    assert context.count("=== STUDY") == 2
def test_chunkstore_from_dataset_builds_corpus(monkeypatch, fake_encoder):
    """``ChunkStore.from_dataset`` indexes every chunk yielded by ``iter_chunks``."""
    import kgqa.data as data

    # ChunkStore comes from the module-level import; re-importing it here only
    # shadowed the same name.
    monkeypatch.setattr(data, "iter_chunks",
                        lambda include_unlabeled=True: iter([("1", 0, "alpha"), ("2", 0, "beta")]))
    store = ChunkStore.from_dataset(fake_encoder, include_unlabeled=False)
    assert len(store) == 2
    assert store.paper_keys == ["1", "2"]
    assert store.ids == ["Chunks/1_0", "Chunks/2_0"]