From 4be55f79f9481880c28bd4dd86ac0f33fa1662d5 Mon Sep 17 00:00:00 2001
From: Guohao Zhang <akidezhang@outlook.com>
Date: Tue, 23 Jun 2026 21:27:41 +0800
Subject: [PATCH 1/4] feat(pdf): add shared extraction contract

---
 openkb/config.py            |   1 +
 openkb/pdf_extractor.py     | 144 ++++++++++++++++++++++++++++++++++++
 tests/test_config.py        |   2 +
 tests/test_pdf_extractor.py |  96 ++++++++++++++++++++++++
 4 files changed, 243 insertions(+)
 create mode 100644 openkb/pdf_extractor.py
 create mode 100644 tests/test_pdf_extractor.py

diff --git a/openkb/config.py b/openkb/config.py
index 9d0d6cd4..80e10fc2 100644
--- a/openkb/config.py
+++ b/openkb/config.py
@@ -16,6 +16,7 @@
     "model": "gpt-5.4-mini",
     "language": "en",
     "pageindex_threshold": 20,
+    "pdf_parser": "local",
 }
 
 # Default entity-type vocabulary. Overridable per-KB via the optional
diff --git a/openkb/pdf_extractor.py b/openkb/pdf_extractor.py
new file mode 100644
index 00000000..376a092f
--- /dev/null
+++ b/openkb/pdf_extractor.py
@@ -0,0 +1,144 @@
+"""Shared PDF page extraction for OpenKB."""
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Protocol
+
+
+@dataclass(frozen=True)
+class PageContent:
+    """Extracted content for one PDF page."""
+
+    page: int
+    content: str
+    images: list[dict[str, Any]]
+
+
+class PdfExtractor(Protocol):
+    """Protocol implemented by PDF page extraction backends."""
+
+    def parse_pages(
+        self,
+        pdf_path: Path,
+        doc_name: str,
+        images_dir: Path,
+    ) -> list[PageContent]:
+        """Extract ordered page content from *pdf_path*."""
+
+
+class LocalPdfExtractor:
+    """Local PyMuPDF-backed PDF extractor preserving existing behavior."""
+
+    provider = "local"
+
+    def parse_pages(
+        self,
+        pdf_path: Path,
+        doc_name: str,
+        images_dir: Path,
+    ) -> list[PageContent]:
+        from openkb.images import convert_pdf_to_pages
+
+        return normalize_page_content(convert_pdf_to_pages(pdf_path, doc_name, images_dir))
+
+
+def normalize_page_content(raw_pages: Any) -> list[PageContent]:
+    """Normalize raw page dictionaries into :class:`PageContent` objects."""
+    if not isinstance(raw_pages, list):
+        return []
+
+    pages: list[PageContent] = []
+    for index, item in enumerate(raw_pages, start=1):
+        if isinstance(item, str):
+            content = item.strip()
+            if content:
+                pages.append(PageContent(page=index, content=content, images=[]))
+            continue
+
+        if not isinstance(item, dict):
+            continue
+
+        raw_page = item.get("page", item.get("page_number", item.get("page_num", index)))
+        try:
+            page_number = int(raw_page)
+        except (TypeError, ValueError):
+            page_number = index
+        if page_number < 1:
+            page_number = index
+
+        content = item.get("content", item.get("markdown", item.get("text", "")))
+        if content is None:
+            content = ""
+        content = str(content).strip()
+
+        images = item.get("images", [])
+        if not isinstance(images, list):
+            images = []
+        normalized_images = [
+            image for image in images
+            if isinstance(image, dict) and isinstance(image.get("path"), str)
+        ]
+
+        if content or normalized_images:
+            pages.append(PageContent(
+                page=page_number,
+                content=content,
+                images=normalized_images,
+            ))
+
+    return pages
+
+
+def pages_to_markdown(pages: list[PageContent]) -> str:
+    """Render extracted pages as the Markdown source used for short PDFs."""
+    rendered_pages: list[str] = []
+    for page in pages:
+        parts: list[str] = []
+        if page.content:
+            parts.append(page.content)
+
+        for image in page.images:
+            path = image.get("path")
+            if not isinstance(path, str) or not path:
+                continue
+            image_markdown = f"![image]({path})"
+            if image_markdown not in page.content and path not in page.content:
+                parts.append(image_markdown)
+
+        page_markdown = "\n\n".join(parts).strip()
+        if page_markdown:
+            rendered_pages.append(page_markdown)
+
+    return "\n\n".join(rendered_pages).strip()
+
+
+def pages_to_json(pages: list[PageContent]) -> list[dict[str, Any]]:
+    """Render extracted pages as OpenKB's long-PDF source JSON shape."""
+    return [
+        {
+            "page": page.page,
+            "content": page.content,
+            "images": page.images,
+        }
+        for page in pages
+    ]
+
+
+def _provider_from_config(config: dict[str, Any]) -> str:
+    raw = config.get("pdf_parser", "local")
+    if isinstance(raw, str):
+        provider = raw
+    elif isinstance(raw, dict):
+        provider = raw.get("provider", "local")
+    else:
+        provider = "local"
+    return str(provider).strip().lower() or "local"
+
+
+def resolve_pdf_extractor(config: dict[str, Any]) -> PdfExtractor:
+    """Resolve the configured PDF extraction backend."""
+    provider = _provider_from_config(config)
+    if provider == "local":
+        return LocalPdfExtractor()
+    raise ValueError(f"Unsupported pdf_parser provider: {provider}")
diff --git a/tests/test_config.py b/tests/test_config.py
index 22b720f7..faea26d6 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -12,12 +12,14 @@ def test_default_config_keys():
     assert "model" in DEFAULT_CONFIG
     assert "language" in DEFAULT_CONFIG
     assert "pageindex_threshold" in DEFAULT_CONFIG
+    assert "pdf_parser" in DEFAULT_CONFIG
 
 
 def test_default_config_values():
     assert DEFAULT_CONFIG["model"] == "gpt-5.4-mini"
     assert DEFAULT_CONFIG["language"] == "en"
     assert DEFAULT_CONFIG["pageindex_threshold"] == 20
+    assert DEFAULT_CONFIG["pdf_parser"] == "local"
 
 
 def test_load_missing_file_returns_defaults(tmp_path):
diff --git a/tests/test_pdf_extractor.py b/tests/test_pdf_extractor.py
new file mode 100644
index 00000000..6c8227bc
--- /dev/null
+++ b/tests/test_pdf_extractor.py
@@ -0,0 +1,96 @@
+"""Tests for shared PDF page extraction."""
+from __future__ import annotations
+
+from unittest.mock import patch
+
+import pytest
+
+from openkb.pdf_extractor import (
+    LocalPdfExtractor,
+    PageContent,
+    normalize_page_content,
+    pages_to_json,
+    pages_to_markdown,
+    resolve_pdf_extractor,
+)
+
+
+def test_normalize_page_content_accepts_common_shapes():
+    pages = normalize_page_content([
+        {"page_number": "2", "markdown": "  Page two  ", "images": [{"path": "sources/images/doc/a.png"}]},
+        {"page_num": 3, "text": "Page three", "images": "bad"},
+        " page four ",
+    ])
+
+    assert pages == [
+        PageContent(page=2, content="Page two", images=[{"path": "sources/images/doc/a.png"}]),
+        PageContent(page=3, content="Page three", images=[]),
+        PageContent(page=3, content="page four", images=[]),
+    ]
+
+
+def test_pages_to_markdown_joins_page_content():
+    pages = [
+        PageContent(page=1, content="Page one", images=[]),
+        PageContent(page=2, content="Page two", images=[]),
+    ]
+
+    assert pages_to_markdown(pages) == "Page one\n\nPage two"
+
+
+def test_pages_to_markdown_renders_image_metadata_when_not_in_content():
+    pages = [
+        PageContent(
+            page=1,
+            content="Page one",
+            images=[{"path": "sources/images/doc/p1.png"}],
+        ),
+        PageContent(
+            page=2,
+            content="![image](sources/images/doc/p2.png)",
+            images=[{"path": "sources/images/doc/p2.png"}],
+        ),
+    ]
+
+    assert pages_to_markdown(pages) == (
+        "Page one\n\n![image](sources/images/doc/p1.png)\n\n"
+        "![image](sources/images/doc/p2.png)"
+    )
+
+
+def test_pages_to_json_preserves_openkb_shape():
+    pages = [
+        PageContent(page=1, content="Page one", images=[{"path": "sources/images/doc/p1.png"}]),
+    ]
+
+    assert pages_to_json(pages) == [
+        {"page": 1, "content": "Page one", "images": [{"path": "sources/images/doc/p1.png"}]},
+    ]
+
+
+def test_local_extractor_uses_existing_pymupdf_page_converter(tmp_path):
+    pdf = tmp_path / "paper.pdf"
+    pdf.write_bytes(b"%PDF-1.4 fake")
+    images_dir = tmp_path / "images"
+
+    with patch("openkb.images.convert_pdf_to_pages", return_value=[
+        {"page": 1, "content": "Page one", "images": []},
+    ]) as convert:
+        pages = LocalPdfExtractor().parse_pages(pdf, "paper", images_dir)
+
+    convert.assert_called_once_with(pdf, "paper", images_dir)
+    assert pages == [PageContent(page=1, content="Page one", images=[])]
+
+
+def test_resolve_pdf_extractor_defaults_to_local():
+    assert isinstance(resolve_pdf_extractor({}), LocalPdfExtractor)
+
+
+def test_resolve_pdf_extractor_accepts_string_and_object_local_config():
+    assert isinstance(resolve_pdf_extractor({"pdf_parser": "local"}), LocalPdfExtractor)
+    assert isinstance(resolve_pdf_extractor({"pdf_parser": {"provider": "local"}}), LocalPdfExtractor)
+
+
+def test_resolve_pdf_extractor_rejects_unknown_provider():
+    with pytest.raises(ValueError, match="Unsupported pdf_parser provider: mineru"):
+        resolve_pdf_extractor({"pdf_parser": "mineru"})

From b472a41a5b7bd8ba1aca92206a9ab11cf70d8a7a Mon Sep 17 00:00:00 2001
From: Guohao Zhang <akidezhang@outlook.com>
Date: Tue, 23 Jun 2026 21:28:23 +0800
Subject: [PATCH 2/4] feat(pdf): route short PDFs through shared extractor

---
 openkb/converter.py     |  9 ++++++---
 tests/test_converter.py | 26 ++++++++++++++++++++++----
 2 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/openkb/converter.py b/openkb/converter.py
index 9ebac762..c76ea012 100644
--- a/openkb/converter.py
+++ b/openkb/converter.py
@@ -13,7 +13,8 @@
 from markitdown import MarkItDown
 
 from openkb.config import load_config
-from openkb.images import copy_relative_images, extract_base64_images, convert_pdf_with_images
+from openkb.images import copy_relative_images, extract_base64_images
+from openkb.pdf_extractor import pages_to_markdown, resolve_pdf_extractor
 from openkb.state import HashRegistry
 
 logger = logging.getLogger(__name__)
@@ -187,8 +188,10 @@ def convert_document(src: Path, kb_dir: Path) -> ConvertResult:
         markdown = src.read_text(encoding="utf-8")
         markdown = copy_relative_images(markdown, src.parent, doc_name, images_dir)
     elif src.suffix.lower() == ".pdf":
-        # Use pymupdf dict-mode for PDFs: text + images inline at correct positions
-        markdown = convert_pdf_with_images(src, doc_name, images_dir)
+        pdf_extractor = resolve_pdf_extractor(config)
+        markdown = pages_to_markdown(
+            pdf_extractor.parse_pages(src, doc_name, images_dir)
+        )
     else:
         # Non-PDF, non-MD: use markitdown (docx, pptx, html, etc.)
         mid = MarkItDown()
diff --git a/tests/test_converter.py b/tests/test_converter.py
index 5ee2f5df..0c5e7cc8 100644
--- a/tests/test_converter.py
+++ b/tests/test_converter.py
@@ -5,6 +5,7 @@
 
 
 from openkb.converter import convert_document, get_pdf_page_count
+from openkb.pdf_extractor import PageContent
 
 
 # ---------------------------------------------------------------------------
@@ -78,14 +79,23 @@ def test_md_raw_file_copied(self, kb_dir):
 
 
 class TestConvertDocumentPdfShort:
-    def test_short_pdf_converted_via_pymupdf(self, kb_dir, tmp_path):
-        """PDF under threshold is converted with pymupdf (convert_pdf_with_images)."""
+    def test_short_pdf_converted_via_shared_extractor(self, kb_dir, tmp_path):
+        """PDF under threshold is converted through the configured extractor."""
         src = tmp_path / "short.pdf"
         src.write_bytes(b"%PDF-1.4 fake content")
 
+        extractor = MagicMock()
+        extractor.parse_pages.return_value = [
+            PageContent(
+                page=1,
+                content="# Short PDF\n\n![image](sources/images/short/p1_img1.png)",
+                images=[{"path": "sources/images/short/p1_img1.png"}],
+            )
+        ]
+
         with (
             patch("openkb.converter.pymupdf.open") as mock_mu,
-            patch("openkb.converter.convert_pdf_with_images", return_value="# Short PDF\n\nConverted.") as mock_cpwi,
+            patch("openkb.converter.resolve_pdf_extractor", return_value=extractor) as resolve,
         ):
             fake_doc = MagicMock()
             fake_doc.page_count = 5  # below default threshold of 20
@@ -95,11 +105,19 @@ def test_short_pdf_converted_via_pymupdf(self, kb_dir, tmp_path):
 
             result = convert_document(src, kb_dir)
 
-        mock_cpwi.assert_called_once()
+        resolve.assert_called_once()
+        extractor.parse_pages.assert_called_once_with(
+            src,
+            "short",
+            kb_dir / "wiki" / "sources" / "images" / "short",
+        )
         assert result.skipped is False
         assert result.is_long_doc is False
         assert result.source_path is not None
         assert result.source_path.exists()
+        assert result.source_path.read_text(encoding="utf-8") == (
+            "# Short PDF\n\n![image](sources/images/short/p1_img1.png)"
+        )
 
 
 # ---------------------------------------------------------------------------

From 9ab571c8d1c0353b2c853676d66676ddb8b22777 Mon Sep 17 00:00:00 2001
From: Guohao Zhang <akidezhang@outlook.com>
Date: Tue, 23 Jun 2026 21:28:49 +0800
Subject: [PATCH 3/4] feat(pdf): route long PDF page JSON through shared
 extractor

---
 openkb/indexer.py     | 58 ++++---------------------------------
 tests/test_indexer.py | 67 +++++++++++++++++++++++++++----------------
 2 files changed, 48 insertions(+), 77 deletions(-)

diff --git a/openkb/indexer.py b/openkb/indexer.py
index 348c7e32..fdfae88d 100644
--- a/openkb/indexer.py
+++ b/openkb/indexer.py
@@ -13,6 +13,7 @@
 from pageindex import IndexConfig, PageIndexClient
 
 from openkb.config import load_config
+from openkb.pdf_extractor import normalize_page_content, pages_to_json, resolve_pdf_extractor
 from openkb.tree_renderer import render_summary_md
 
 logger = logging.getLogger(__name__)
@@ -29,49 +30,7 @@ class IndexResult:
 
 def _normalize_page_content(raw_pages: Any) -> list[dict[str, Any]]:
     """Normalize PageIndex/local PDF page content into OpenKB's JSON shape."""
-    if not isinstance(raw_pages, list):
-        return []
-
-    pages: list[dict[str, Any]] = []
-    for index, item in enumerate(raw_pages, start=1):
-        if isinstance(item, str):
-            content = item.strip()
-            if content:
-                pages.append({"page": index, "content": content, "images": []})
-            continue
-
-        if not isinstance(item, dict):
-            continue
-
-        raw_page = item.get("page", item.get("page_number", item.get("page_num", index)))
-        try:
-            page_number = int(raw_page)
-        except (TypeError, ValueError):
-            page_number = index
-        if page_number < 1:
-            page_number = index
-
-        content = item.get("content", item.get("markdown", item.get("text", "")))
-        if content is None:
-            content = ""
-        content = str(content).strip()
-
-        images = item.get("images", [])
-        if not isinstance(images, list):
-            images = []
-        normalized_images = [
-            image for image in images
-            if isinstance(image, dict) and isinstance(image.get("path"), str)
-        ]
-
-        if content or normalized_images:
-            pages.append({
-                "page": page_number,
-                "content": content,
-                "images": normalized_images,
-            })
-
-    return pages
+    return pages_to_json(normalize_page_content(raw_pages))
 
 
 def _get_pdf_page_count(pdf_path: Path) -> int:
@@ -80,12 +39,6 @@ def _get_pdf_page_count(pdf_path: Path) -> int:
     return get_pdf_page_count(pdf_path)
 
 
-def _convert_pdf_to_pages(pdf_path: Path, doc_name: str, images_dir: Path) -> list[dict[str, Any]]:
-    from openkb.images import convert_pdf_to_pages
-
-    return convert_pdf_to_pages(pdf_path, doc_name, images_dir)
-
-
 def index_long_document(
     pdf_path: Path, kb_dir: Path, doc_name: str | None = None
 ) -> IndexResult:
@@ -151,8 +104,6 @@ def index_long_document(
 
     all_pages: list[dict[str, Any]] = []
     if pageindex_api_key:
-        # Cloud mode: fetch OCR'd markdown from PageIndex. get_page_content
-        # requires a page range, so pass "1-N".
         page_count = _get_pdf_page_count(pdf_path)
         try:
             all_pages = _normalize_page_content(col.get_page_content(doc_id, f"1-{page_count}"))
@@ -161,8 +112,9 @@ def index_long_document(
 
     if not all_pages:
         if pageindex_api_key:
-            logger.warning("Cloud returned no pages for %s; falling back to local pymupdf", pdf_path.name)
-        all_pages = _normalize_page_content(_convert_pdf_to_pages(pdf_path, source_name, images_dir))
+            logger.warning("Cloud returned no pages for %s; falling back to configured PDF extractor", pdf_path.name)
+        pdf_extractor = resolve_pdf_extractor(config)
+        all_pages = pages_to_json(pdf_extractor.parse_pages(pdf_path, source_name, images_dir))
 
     if not all_pages:
         raise RuntimeError(f"No page content extracted for {pdf_path.name}")
diff --git a/tests/test_indexer.py b/tests/test_indexer.py
index 843f6614..5a31b1a6 100644
--- a/tests/test_indexer.py
+++ b/tests/test_indexer.py
@@ -5,6 +5,7 @@
 
 
 from openkb.indexer import IndexResult, _normalize_page_content, index_long_document
+from openkb.pdf_extractor import PageContent
 
 
 class TestNormalizePageContent:
@@ -57,8 +58,8 @@ def _make_fake_collection(self, doc_id: str, sample_tree: dict):
 
     def _fake_pages(self):
         return [
-            {"page": 1, "content": "Page one text.", "images": []},
-            {"page": 2, "content": "Page two text.", "images": []},
+            PageContent(page=1, content="Page one text.", images=[]),
+            PageContent(page=2, content="Page two text.", images=[]),
         ]
 
     def test_returns_index_result(self, kb_dir, sample_tree, tmp_path):
@@ -71,8 +72,11 @@ def test_returns_index_result(self, kb_dir, sample_tree, tmp_path):
         pdf_path = tmp_path / "sample.pdf"
         pdf_path.write_bytes(b"%PDF-1.4 fake")
 
+        extractor = MagicMock()
+        extractor.parse_pages.return_value = self._fake_pages()
+
         with patch("openkb.indexer.PageIndexClient", return_value=fake_client), \
-             patch("openkb.images.convert_pdf_to_pages", return_value=self._fake_pages()):
+             patch("openkb.indexer.resolve_pdf_extractor", return_value=extractor):
             result = index_long_document(pdf_path, kb_dir)
 
         assert isinstance(result, IndexResult)
@@ -88,17 +92,14 @@ def test_source_page_written_as_json(self, kb_dir, sample_tree, tmp_path):
 
         fake_client = MagicMock()
         fake_client.collection.return_value = fake_col
-        # Mock get_page_content to return page data
-        fake_col.get_page_content.return_value = [
-            {"page": 1, "content": "Page one text."},
-            {"page": 2, "content": "Page two text."},
-        ]
-
         pdf_path = tmp_path / "sample.pdf"
         pdf_path.write_bytes(b"%PDF-1.4 fake")
 
+        extractor = MagicMock()
+        extractor.parse_pages.return_value = self._fake_pages()
+
         with patch("openkb.indexer.PageIndexClient", return_value=fake_client), \
-             patch("openkb.images.convert_pdf_to_pages", return_value=self._fake_pages()):
+             patch("openkb.indexer.resolve_pdf_extractor", return_value=extractor):
             index_long_document(pdf_path, kb_dir)
 
         json_file = kb_dir / "wiki" / "sources" / "sample.json"
@@ -119,8 +120,11 @@ def test_summary_page_written(self, kb_dir, sample_tree, tmp_path):
         pdf_path = tmp_path / "sample.pdf"
         pdf_path.write_bytes(b"%PDF-1.4 fake")
 
+        extractor = MagicMock()
+        extractor.parse_pages.return_value = self._fake_pages()
+
         with patch("openkb.indexer.PageIndexClient", return_value=fake_client), \
-             patch("openkb.images.convert_pdf_to_pages", return_value=self._fake_pages()):
+             patch("openkb.indexer.resolve_pdf_extractor", return_value=extractor):
             index_long_document(pdf_path, kb_dir)
 
         summary_file = kb_dir / "wiki" / "summaries" / "sample.md"
@@ -140,8 +144,11 @@ def test_localclient_called_with_index_config(self, kb_dir, sample_tree, tmp_pat
         pdf_path = tmp_path / "report.pdf"
         pdf_path.write_bytes(b"%PDF-1.4 fake")
 
+        extractor = MagicMock()
+        extractor.parse_pages.return_value = self._fake_pages()
+
         with patch("openkb.indexer.PageIndexClient", return_value=fake_client) as mock_cls, \
-             patch("openkb.images.convert_pdf_to_pages", return_value=self._fake_pages()):
+             patch("openkb.indexer.resolve_pdf_extractor", return_value=extractor):
             index_long_document(pdf_path, kb_dir)
 
         # Verify PageIndexClient was instantiated with correct IndexConfig
@@ -153,7 +160,7 @@ def test_localclient_called_with_index_config(self, kb_dir, sample_tree, tmp_pat
         assert ic.if_add_node_summary is True
         assert ic.if_add_doc_description is True
 
-    def test_cloud_page_content_is_normalized(self, kb_dir, sample_tree, tmp_path, monkeypatch):
+    def test_cloud_page_content_is_normalized_before_shared_extractor_fallback(self, kb_dir, sample_tree, tmp_path, monkeypatch):
         doc_id = "cloud-123"
         fake_col = self._make_fake_collection(doc_id, sample_tree)
         fake_col.get_page_content.return_value = [
@@ -168,17 +175,21 @@ def test_cloud_page_content_is_normalized(self, kb_dir, sample_tree, tmp_path, m
         pdf_path.write_bytes(b"%PDF-1.4 fake")
         monkeypatch.setenv("PAGEINDEX_API_KEY", "test-key")
 
+        extractor = MagicMock()
+        extractor.parse_pages.return_value = self._fake_pages()
+
         with patch("openkb.indexer.PageIndexClient", return_value=fake_client), \
              patch("openkb.indexer._get_pdf_page_count", return_value=2), \
-             patch("openkb.indexer._convert_pdf_to_pages") as local_pages:
+             patch("openkb.indexer.resolve_pdf_extractor", return_value=extractor):
             index_long_document(pdf_path, kb_dir)
 
-        local_pages.assert_not_called()
+        fake_col.get_page_content.assert_called_once_with(doc_id, "1-2")
+        extractor.parse_pages.assert_not_called()
         json_file = kb_dir / "wiki" / "sources" / "sample.json"
         assert '"content": "Cloud page one."' in json_file.read_text(encoding="utf-8")
         assert '"content": "Cloud page two."' in json_file.read_text(encoding="utf-8")
 
-    def test_invalid_cloud_page_content_falls_back_to_local(self, kb_dir, sample_tree, tmp_path, monkeypatch):
+    def test_invalid_cloud_page_content_falls_back_to_shared_extractor(self, kb_dir, sample_tree, tmp_path, monkeypatch):
         doc_id = "cloud-456"
         fake_col = self._make_fake_collection(doc_id, sample_tree)
         fake_col.get_page_content.return_value = {"bad": "shape"}
@@ -190,16 +201,20 @@ def test_invalid_cloud_page_content_falls_back_to_local(self, kb_dir, sample_tre
         pdf_path.write_bytes(b"%PDF-1.4 fake")
         monkeypatch.setenv("PAGEINDEX_API_KEY", "test-key")
 
+        extractor = MagicMock()
+        extractor.parse_pages.return_value = self._fake_pages()
+
         with patch("openkb.indexer.PageIndexClient", return_value=fake_client), \
              patch("openkb.indexer._get_pdf_page_count", return_value=2), \
-             patch("openkb.indexer._convert_pdf_to_pages", return_value=self._fake_pages()) as local_pages:
+             patch("openkb.indexer.resolve_pdf_extractor", return_value=extractor):
             index_long_document(pdf_path, kb_dir)
 
-        local_pages.assert_called_once()
+        fake_col.get_page_content.assert_called_once_with(doc_id, "1-2")
+        extractor.parse_pages.assert_called_once()
         json_file = kb_dir / "wiki" / "sources" / "sample.json"
         assert "Page one text." in json_file.read_text(encoding="utf-8")
 
-    def test_empty_cloud_and_local_pages_fail(self, kb_dir, sample_tree, tmp_path, monkeypatch):
+    def test_empty_shared_extractor_pages_fail(self, kb_dir, sample_tree, tmp_path, monkeypatch):
         doc_id = "empty-123"
         fake_col = self._make_fake_collection(doc_id, sample_tree)
 
@@ -210,9 +225,12 @@ def test_empty_cloud_and_local_pages_fail(self, kb_dir, sample_tree, tmp_path, m
         pdf_path.write_bytes(b"%PDF-1.4 fake")
         monkeypatch.setenv("PAGEINDEX_API_KEY", "test-key")
 
+        extractor = MagicMock()
+        extractor.parse_pages.return_value = []
+
         with patch("openkb.indexer.PageIndexClient", return_value=fake_client), \
              patch("openkb.indexer._get_pdf_page_count", return_value=2), \
-             patch("openkb.indexer._convert_pdf_to_pages", return_value=[]):
+             patch("openkb.indexer.resolve_pdf_extractor", return_value=extractor):
             try:
                 index_long_document(pdf_path, kb_dir)
             except RuntimeError as exc:
@@ -237,10 +255,11 @@ def test_index_long_document_uses_explicit_doc_name(kb_dir, monkeypatch):
     pdf = kb_dir / "raw" / "original.pdf"
     pdf.write_bytes(b"%PDF-1.4 fake")
 
+    extractor = MagicMock()
+    extractor.parse_pages.return_value = [PageContent(page=1, content="p1", images=[])]
+
     with patch("openkb.indexer.PageIndexClient", return_value=fake_client), \
-         patch("openkb.indexer._get_pdf_page_count", return_value=30), \
-         patch("openkb.indexer._convert_pdf_to_pages",
-               return_value=[{"page": 1, "text": "p1"}]) as mock_convert:
+         patch("openkb.indexer.resolve_pdf_extractor", return_value=extractor):
         result = index_long_document(pdf, kb_dir, doc_name="original-abc12345")
 
     assert result.doc_id == "doc-123"
@@ -251,7 +270,7 @@ def test_index_long_document_uses_explicit_doc_name(kb_dir, monkeypatch):
     assert not (kb_dir / "wiki" / "summaries" / "original.md").exists()
     # the page extractor receives the explicit doc_name and its images dir
     expected_images = kb_dir / "wiki" / "sources" / "images" / "original-abc12345"
-    mock_convert.assert_called_once_with(pdf, "original-abc12345", expected_images)
+    extractor.parse_pages.assert_called_once_with(pdf, "original-abc12345", expected_images)
     # summary frontmatter points full_text at the doc_name artifact
     summary_text = (kb_dir / "wiki" / "summaries" / "original-abc12345.md").read_text(encoding="utf-8")
     assert "original-abc12345" in summary_text

From 8930a8850dff3223691662d5f78910784b685483 Mon Sep 17 00:00:00 2001
From: Guohao Zhang <akidezhang@outlook.com>
Date: Tue, 23 Jun 2026 21:29:09 +0800
Subject: [PATCH 4/4] test(cli): close mocked async compile coroutines

---
 tests/conftest.py         | 23 +++++++++++++++++++++++
 tests/test_add_command.py | 12 ++++++------
 tests/test_remove.py      |  8 ++++----
 tests/test_url_ingest.py  | 18 ++++++++++++------
 4 files changed, 45 insertions(+), 16 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index 3c556551..617a86f6 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,3 +1,4 @@
+import inspect
 import json
 import pytest
 
@@ -10,6 +11,28 @@ def _reset_extra_headers():
     set_extra_headers({})
 
 
+@pytest.fixture
+def close_coroutine_run():
+    """Test double for asyncio.run that closes unawaited coroutines."""
+    def _run(awaitable):
+        if inspect.iscoroutine(awaitable):
+            awaitable.close()
+
+    return _run
+
+
+@pytest.fixture
+def raise_after_closing_coroutine(close_coroutine_run):
+    def _factory(exc):
+        def _run(awaitable):
+            close_coroutine_run(awaitable)
+            raise exc
+
+        return _run
+
+    return _factory
+
+
 @pytest.fixture
 def kb_dir(tmp_path):
     """Create a minimal knowledge base directory structure for testing."""
diff --git a/tests/test_add_command.py b/tests/test_add_command.py
index 0199c9e2..9567b9c1 100644
--- a/tests/test_add_command.py
+++ b/tests/test_add_command.py
@@ -109,7 +109,7 @@ def test_add_nonexistent_path(self, tmp_path):
             result = runner.invoke(cli, ["add", str(tmp_path / "nonexistent.pdf")])
             assert "does not exist" in result.output
 
-    def test_add_skipped_file(self, tmp_path):
+    def test_add_skipped_file(self, tmp_path, close_coroutine_run):
         kb_dir = self._setup_kb(tmp_path)
         doc = tmp_path / "test.md"
         doc.write_text("# Hello")
@@ -120,12 +120,12 @@ def test_add_skipped_file(self, tmp_path):
         runner = CliRunner()
         with patch("openkb.cli._find_kb_dir", return_value=kb_dir), \
              patch("openkb.cli.convert_document", return_value=mock_result), \
-             patch("openkb.cli.asyncio.run") as mock_arun:
+             patch("openkb.cli.asyncio.run", side_effect=close_coroutine_run) as mock_arun:
             result = runner.invoke(cli, ["add", str(doc)])
             assert "SKIP" in result.output
             mock_arun.assert_not_called()
 
-    def test_add_short_doc_runs_compiler(self, tmp_path):
+    def test_add_short_doc_runs_compiler(self, tmp_path, close_coroutine_run):
         kb_dir = self._setup_kb(tmp_path)
         doc = tmp_path / "test.md"
         doc.write_text("# Hello")
@@ -152,7 +152,7 @@ def test_add_short_doc_runs_compiler(self, tmp_path):
         runner = CliRunner()
         with patch("openkb.cli._find_kb_dir", return_value=kb_dir), \
              patch("openkb.cli.convert_document", return_value=mock_result), \
-             patch("openkb.cli.asyncio.run") as mock_arun:
+             patch("openkb.cli.asyncio.run", side_effect=close_coroutine_run) as mock_arun:
             result = runner.invoke(cli, ["add", str(doc)])
             mock_arun.assert_called_once()
             assert "OK" in result.output
@@ -168,7 +168,7 @@ def test_add_short_doc_runs_compiler(self, tmp_path):
         assert "path" in meta
         assert "stale-old-hash" not in hashes
 
-    def test_add_oldest_legacy_entry_converges_to_single_entry(self, tmp_path):
+    def test_add_oldest_legacy_entry_converges_to_single_entry(self, tmp_path, close_coroutine_run):
         """Editing a pre-doc_name-era document must not fork the registry.
 
         convert_document backfills doc_name/path onto the legacy entry on
@@ -192,7 +192,7 @@ def test_add_oldest_legacy_entry_converges_to_single_entry(self, tmp_path):
         # the legacy backfill actually happens on disk mid-pipeline.
         runner = CliRunner()
         with patch("openkb.cli._find_kb_dir", return_value=kb_dir), \
-             patch("openkb.cli.asyncio.run"):
+             patch("openkb.cli.asyncio.run", side_effect=close_coroutine_run):
             result = runner.invoke(cli, ["add", str(doc)])
             assert "OK" in result.output
 
diff --git a/tests/test_remove.py b/tests/test_remove.py
index 75daadf0..d8174813 100644
--- a/tests/test_remove.py
+++ b/tests/test_remove.py
@@ -696,7 +696,7 @@ def test_cli_remove_preserves_ghosts_in_unrelated_pages(kb_dir):
 # ---------------------------------------------------------------------------
 
 
-def test_add_persists_doc_name_for_later_remove(tmp_path):
+def test_add_persists_doc_name_for_later_remove(tmp_path, close_coroutine_run):
     """End-to-end: `openkb add` writes a registry entry with `doc_name`,
     and a subsequent `openkb remove` actually prunes that entry.
     """
@@ -743,7 +743,7 @@ def test_add_persists_doc_name_for_later_remove(tmp_path):
     # Mock convert_document + asyncio.run to skip the LLM-driven compile.
     with patch("openkb.cli._find_kb_dir", return_value=tmp_path), \
          patch("openkb.cli.convert_document", return_value=mock_result), \
-         patch("openkb.cli.asyncio.run"):
+         patch("openkb.cli.asyncio.run", side_effect=close_coroutine_run):
         add_res = runner.invoke(cli, ["add", str(doc)])
     assert add_res.exit_code == 0, add_res.output
 
@@ -944,7 +944,7 @@ def test_cli_remove_dry_run_does_not_touch_images(kb_dir):
 # ---------------------------------------------------------------------------
 
 
-def test_add_long_pdf_persists_doc_id_to_registry(tmp_path):
+def test_add_long_pdf_persists_doc_id_to_registry(tmp_path, close_coroutine_run):
     """Long-doc ingest must record `doc_id` in the registry. Without it,
     the remove path has no handle to feed `Collection.delete_document`.
     """
@@ -987,7 +987,7 @@ def test_add_long_pdf_persists_doc_id_to_registry(tmp_path):
     with patch("openkb.cli._find_kb_dir", return_value=tmp_path), \
          patch("openkb.cli.convert_document", return_value=convert_mock), \
          patch("openkb.indexer.index_long_document", return_value=index_mock), \
-         patch("openkb.cli.asyncio.run"):
+         patch("openkb.cli.asyncio.run", side_effect=close_coroutine_run):
         result = runner.invoke(cli, ["add", str(pdf)])
 
     assert result.exit_code == 0, result.output
diff --git a/tests/test_url_ingest.py b/tests/test_url_ingest.py
index 1b8548ee..579f5e03 100644
--- a/tests/test_url_ingest.py
+++ b/tests/test_url_ingest.py
@@ -461,7 +461,7 @@ def test_fetch_pdf_uses_post_redirect_url_for_filename(tmp_path):
     assert result.name == "great-paper.pdf"
 
 
-def test_add_single_file_returns_added_on_success(tmp_path):
+def test_add_single_file_returns_added_on_success(tmp_path, close_coroutine_run):
     """Tri-state return contract: ``"added"`` when the file was newly
     indexed. URL-ingest uses this to decide whether to keep / unlink
     the just-downloaded file."""
@@ -489,7 +489,7 @@ def test_add_single_file_returns_added_on_success(tmp_path):
     )
 
     with patch("openkb.cli.convert_document", return_value=mock_result), \
-         patch("openkb.cli.asyncio.run"):
+         patch("openkb.cli.asyncio.run", side_effect=close_coroutine_run):
         outcome = add_single_file(doc, tmp_path)
 
     assert outcome == "added"
@@ -513,7 +513,7 @@ def test_add_single_file_returns_skipped_on_dedup(tmp_path):
     assert outcome == "skipped"
 
 
-def test_add_single_file_returns_failed_on_pipeline_error(tmp_path):
+def test_add_single_file_returns_failed_on_pipeline_error(tmp_path, raise_after_closing_coroutine):
     """A pipeline failure (e.g. transient LLM error during compilation)
     must be distinguishable from dedup-skip, so URL-ingest can preserve
     the raw file for retry instead of deleting it."""
@@ -540,7 +540,10 @@ def test_add_single_file_returns_failed_on_pipeline_error(tmp_path):
 
     # Make both compile attempts raise to drive the failure path.
     with patch("openkb.cli.convert_document", return_value=mock_result), \
-         patch("openkb.cli.asyncio.run", side_effect=RuntimeError("LLM 503")), \
+         patch(
+             "openkb.cli.asyncio.run",
+             side_effect=raise_after_closing_coroutine(RuntimeError("LLM 503")),
+         ), \
          patch("openkb.cli.time.sleep"):
         outcome = add_single_file(doc, tmp_path)
 
@@ -580,7 +583,7 @@ def test_url_ingest_cleans_up_orphan_on_dedup_skip(tmp_path, monkeypatch):
     assert not fetched_path.exists()
 
 
-def test_url_ingest_keeps_raw_file_on_pipeline_failure(tmp_path):
+def test_url_ingest_keeps_raw_file_on_pipeline_failure(tmp_path, raise_after_closing_coroutine):
     """The point of the tri-state return: a pipeline failure (e.g. LLM
     timeout during compilation) must NOT delete the downloaded file —
     the user can retry without re-downloading, and we don't lose data
@@ -611,7 +614,10 @@ def test_url_ingest_keeps_raw_file_on_pipeline_failure(tmp_path):
     with patch("openkb.cli._find_kb_dir", return_value=tmp_path), \
          patch("openkb.url_ingest.fetch_url_to_raw", return_value=fetched_path), \
          patch("openkb.cli.convert_document", return_value=mock_result), \
-         patch("openkb.cli.asyncio.run", side_effect=RuntimeError("LLM 503")), \
+         patch(
+             "openkb.cli.asyncio.run",
+             side_effect=raise_after_closing_coroutine(RuntimeError("LLM 503")),
+         ), \
          patch("openkb.cli.time.sleep"):
         result = runner.invoke(cli, ["add", "https://example.com/paper.pdf"])