From 4be55f79f9481880c28bd4dd86ac0f33fa1662d5 Mon Sep 17 00:00:00 2001 From: Guohao Zhang Date: Tue, 23 Jun 2026 21:27:41 +0800 Subject: [PATCH 1/4] feat(pdf): add shared extraction contract --- openkb/config.py | 1 + openkb/pdf_extractor.py | 144 ++++++++++++++++++++++++++++++++++++ tests/test_config.py | 2 + tests/test_pdf_extractor.py | 96 ++++++++++++++++++++++++ 4 files changed, 243 insertions(+) create mode 100644 openkb/pdf_extractor.py create mode 100644 tests/test_pdf_extractor.py diff --git a/openkb/config.py b/openkb/config.py index 9d0d6cd4..80e10fc2 100644 --- a/openkb/config.py +++ b/openkb/config.py @@ -16,6 +16,7 @@ "model": "gpt-5.4-mini", "language": "en", "pageindex_threshold": 20, + "pdf_parser": "local", } # Default entity-type vocabulary. Overridable per-KB via the optional diff --git a/openkb/pdf_extractor.py b/openkb/pdf_extractor.py new file mode 100644 index 00000000..376a092f --- /dev/null +++ b/openkb/pdf_extractor.py @@ -0,0 +1,144 @@ +"""Shared PDF page extraction for OpenKB.""" +from __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Protocol + + +@dataclass(frozen=True) +class PageContent: + """Extracted content for one PDF page.""" + + page: int + content: str + images: list[dict[str, Any]] + + +class PdfExtractor(Protocol): + """Protocol implemented by PDF page extraction backends.""" + + def parse_pages( + self, + pdf_path: Path, + doc_name: str, + images_dir: Path, + ) -> list[PageContent]: + """Extract ordered page content from *pdf_path*.""" + + +class LocalPdfExtractor: + """Local PyMuPDF-backed PDF extractor preserving existing behavior.""" + + provider = "local" + + def parse_pages( + self, + pdf_path: Path, + doc_name: str, + images_dir: Path, + ) -> list[PageContent]: + from openkb.images import convert_pdf_to_pages + + return normalize_page_content(convert_pdf_to_pages(pdf_path, doc_name, images_dir)) + + +def normalize_page_content(raw_pages: Any) -> list[PageContent]: + """Normalize raw page dictionaries into :class:`PageContent` objects.""" + if not isinstance(raw_pages, list): + return [] + + pages: list[PageContent] = [] + for index, item in enumerate(raw_pages, start=1): + if isinstance(item, str): + content = item.strip() + if content: + pages.append(PageContent(page=index, content=content, images=[])) + continue + + if not isinstance(item, dict): + continue + + raw_page = item.get("page", item.get("page_number", item.get("page_num", index))) + try: + page_number = int(raw_page) + except (TypeError, ValueError): + page_number = index + if page_number < 1: + page_number = index + + content = item.get("content", item.get("markdown", item.get("text", ""))) + if content is None: + content = "" + content = str(content).strip() + + images = item.get("images", []) + if not isinstance(images, list): + images = [] + normalized_images = [ + image for image in images + if isinstance(image, dict) and isinstance(image.get("path"), str) + ] + + if content or normalized_images: + pages.append(PageContent( + page=page_number, + content=content, + images=normalized_images, + )) + + return pages + + +def pages_to_markdown(pages: list[PageContent]) -> str: + """Render extracted pages as the Markdown source used for short PDFs.""" + rendered_pages: list[str] = [] + for page in pages: + parts: list[str] = [] + if page.content: + parts.append(page.content) + + for image in page.images: + path = image.get("path") + if not isinstance(path, str) or not path: + continue + image_markdown = f"![image]({path})" + if image_markdown not in page.content and path not in page.content: + parts.append(image_markdown) + + page_markdown = "\n\n".join(parts).strip() + if page_markdown: + rendered_pages.append(page_markdown) + + return "\n\n".join(rendered_pages).strip() + + +def pages_to_json(pages: list[PageContent]) -> list[dict[str, Any]]: + """Render extracted pages as OpenKB's long-PDF source JSON shape.""" + return [ + { + "page": page.page, + "content": page.content, + "images": page.images, + } + for page in pages + ] + + +def _provider_from_config(config: dict[str, Any]) -> str: + raw = config.get("pdf_parser", "local") + if isinstance(raw, str): + provider = raw + elif isinstance(raw, dict): + provider = raw.get("provider", "local") + else: + provider = "local" + return str(provider).strip().lower() or "local" + + +def resolve_pdf_extractor(config: dict[str, Any]) -> PdfExtractor: + """Resolve the configured PDF extraction backend.""" + provider = _provider_from_config(config) + if provider == "local": + return LocalPdfExtractor() + raise ValueError(f"Unsupported pdf_parser provider: {provider}") diff --git a/tests/test_config.py b/tests/test_config.py index 22b720f7..faea26d6 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -12,12 +12,14 @@ def test_default_config_keys(): assert "model" in DEFAULT_CONFIG assert "language" in DEFAULT_CONFIG assert "pageindex_threshold" in DEFAULT_CONFIG + assert "pdf_parser" in DEFAULT_CONFIG def test_default_config_values(): assert DEFAULT_CONFIG["model"] == "gpt-5.4-mini" assert DEFAULT_CONFIG["language"] == "en" assert DEFAULT_CONFIG["pageindex_threshold"] == 20 + assert DEFAULT_CONFIG["pdf_parser"] == "local" def test_load_missing_file_returns_defaults(tmp_path): diff --git a/tests/test_pdf_extractor.py b/tests/test_pdf_extractor.py new file mode 100644 index 00000000..6c8227bc --- /dev/null +++ b/tests/test_pdf_extractor.py @@ -0,0 +1,96 @@ +"""Tests for shared PDF page extraction.""" +from __future__ import annotations + +from unittest.mock import patch + +import pytest + +from openkb.pdf_extractor import ( + LocalPdfExtractor, + PageContent, + normalize_page_content, + pages_to_json, + pages_to_markdown, + resolve_pdf_extractor, +) + + +def test_normalize_page_content_accepts_common_shapes(): + pages = normalize_page_content([ + {"page_number": "2", "markdown": " Page two ", "images": [{"path": "sources/images/doc/a.png"}]}, + {"page_num": 3, "text": "Page three", "images": "bad"}, + " page four ", + ]) + + assert pages == [ + PageContent(page=2, content="Page two", images=[{"path": "sources/images/doc/a.png"}]), + PageContent(page=3, content="Page three", images=[]), + PageContent(page=3, content="page four", images=[]), + ] + + +def test_pages_to_markdown_joins_page_content(): + pages = [ + PageContent(page=1, content="Page one", images=[]), + PageContent(page=2, content="Page two", images=[]), + ] + + assert pages_to_markdown(pages) == "Page one\n\nPage two" + + +def test_pages_to_markdown_renders_image_metadata_when_not_in_content(): + pages = [ + PageContent( + page=1, + content="Page one", + images=[{"path": "sources/images/doc/p1.png"}], + ), + PageContent( + page=2, + content="![image](sources/images/doc/p2.png)", + images=[{"path": "sources/images/doc/p2.png"}], + ), + ] + + assert pages_to_markdown(pages) == ( + "Page one\n\n![image](sources/images/doc/p1.png)\n\n" + "![image](sources/images/doc/p2.png)" + ) + + +def test_pages_to_json_preserves_openkb_shape(): + pages = [ + PageContent(page=1, content="Page one", images=[{"path": "sources/images/doc/p1.png"}]), + ] + + assert pages_to_json(pages) == [ + {"page": 1, "content": "Page one", "images": [{"path": "sources/images/doc/p1.png"}]}, + ] + + +def test_local_extractor_uses_existing_pymupdf_page_converter(tmp_path): + pdf = tmp_path / "paper.pdf" + pdf.write_bytes(b"%PDF-1.4 fake") + images_dir = tmp_path / "images" + + with patch("openkb.images.convert_pdf_to_pages", return_value=[ + {"page": 1, "content": "Page one", "images": []}, + ]) as convert: + pages = LocalPdfExtractor().parse_pages(pdf, "paper", images_dir) + + convert.assert_called_once_with(pdf, "paper", images_dir) + assert pages == [PageContent(page=1, content="Page one", images=[])] + + +def test_resolve_pdf_extractor_defaults_to_local(): + assert isinstance(resolve_pdf_extractor({}), LocalPdfExtractor) + + +def test_resolve_pdf_extractor_accepts_string_and_object_local_config(): + assert isinstance(resolve_pdf_extractor({"pdf_parser": "local"}), LocalPdfExtractor) + assert isinstance(resolve_pdf_extractor({"pdf_parser": {"provider": "local"}}), LocalPdfExtractor) + + +def test_resolve_pdf_extractor_rejects_unknown_provider(): + with pytest.raises(ValueError, match="Unsupported pdf_parser provider: mineru"): + resolve_pdf_extractor({"pdf_parser": "mineru"}) From b472a41a5b7bd8ba1aca92206a9ab11cf70d8a7a Mon Sep 17 00:00:00 2001 From: Guohao Zhang Date: Tue, 23 Jun 2026 21:28:23 +0800 Subject: [PATCH 2/4] feat(pdf): route short PDFs through shared extractor --- openkb/converter.py | 9 ++++++--- tests/test_converter.py | 26 ++++++++++++++++++++++---- 2 files changed, 28 insertions(+), 7 deletions(-) diff --git a/openkb/converter.py b/openkb/converter.py index 9ebac762..c76ea012 100644 --- a/openkb/converter.py +++ b/openkb/converter.py @@ -13,7 +13,8 @@ from markitdown import MarkItDown from openkb.config import load_config -from openkb.images import copy_relative_images, extract_base64_images, convert_pdf_with_images +from openkb.images import copy_relative_images, extract_base64_images +from openkb.pdf_extractor import pages_to_markdown, resolve_pdf_extractor from openkb.state import HashRegistry logger = logging.getLogger(__name__) @@ -187,8 +188,10 @@ def convert_document(src: Path, kb_dir: Path) -> ConvertResult: markdown = src.read_text(encoding="utf-8") markdown = copy_relative_images(markdown, src.parent, doc_name, images_dir) elif src.suffix.lower() == ".pdf": - # Use pymupdf dict-mode for PDFs: text + images inline at correct positions - markdown = convert_pdf_with_images(src, doc_name, images_dir) + pdf_extractor = resolve_pdf_extractor(config) + markdown = pages_to_markdown( + pdf_extractor.parse_pages(src, doc_name, images_dir) + ) else: # Non-PDF, non-MD: use markitdown (docx, pptx, html, etc.) mid = MarkItDown() diff --git a/tests/test_converter.py b/tests/test_converter.py index 5ee2f5df..0c5e7cc8 100644 --- a/tests/test_converter.py +++ b/tests/test_converter.py @@ -5,6 +5,7 @@ from openkb.converter import convert_document, get_pdf_page_count +from openkb.pdf_extractor import PageContent # --------------------------------------------------------------------------- @@ -78,14 +79,23 @@ def test_md_raw_file_copied(self, kb_dir): class TestConvertDocumentPdfShort: - def test_short_pdf_converted_via_pymupdf(self, kb_dir, tmp_path): - """PDF under threshold is converted with pymupdf (convert_pdf_with_images).""" + def test_short_pdf_converted_via_shared_extractor(self, kb_dir, tmp_path): + """PDF under threshold is converted through the configured extractor.""" src = tmp_path / "short.pdf" src.write_bytes(b"%PDF-1.4 fake content") + extractor = MagicMock() + extractor.parse_pages.return_value = [ + PageContent( + page=1, + content="# Short PDF\n\n![image](sources/images/short/p1_img1.png)", + images=[{"path": "sources/images/short/p1_img1.png"}], + ) + ] + with ( patch("openkb.converter.pymupdf.open") as mock_mu, - patch("openkb.converter.convert_pdf_with_images", return_value="# Short PDF\n\nConverted.") as mock_cpwi, + patch("openkb.converter.resolve_pdf_extractor", return_value=extractor) as resolve, ): fake_doc = MagicMock() fake_doc.page_count = 5 # below default threshold of 20 @@ -95,11 +105,19 @@ def test_short_pdf_converted_via_pymupdf(self, kb_dir, tmp_path): result = convert_document(src, kb_dir) - mock_cpwi.assert_called_once() + resolve.assert_called_once() + extractor.parse_pages.assert_called_once_with( + src, + "short", + kb_dir / "wiki" / "sources" / "images" / "short", + ) assert result.skipped is False assert result.is_long_doc is False assert result.source_path is not None assert result.source_path.exists() + assert result.source_path.read_text(encoding="utf-8") == ( + "# Short PDF\n\n![image](sources/images/short/p1_img1.png)" + ) # --------------------------------------------------------------------------- From 9ab571c8d1c0353b2c853676d66676ddb8b22777 Mon Sep 17 00:00:00 2001 From: Guohao Zhang Date: Tue, 23 Jun 2026 21:28:49 +0800 Subject: [PATCH 3/4] feat(pdf): route long PDF page JSON through shared extractor --- openkb/indexer.py | 58 ++++--------------------------------- tests/test_indexer.py | 67 +++++++++++++++++++++++++++---------------- 2 files changed, 48 insertions(+), 77 deletions(-) diff --git a/openkb/indexer.py b/openkb/indexer.py index 348c7e32..fdfae88d 100644 --- a/openkb/indexer.py +++ b/openkb/indexer.py @@ -13,6 +13,7 @@ from pageindex import IndexConfig, PageIndexClient from openkb.config import load_config +from openkb.pdf_extractor import normalize_page_content, pages_to_json, resolve_pdf_extractor from openkb.tree_renderer import render_summary_md logger = logging.getLogger(__name__) @@ -29,49 +30,7 @@ class IndexResult: def _normalize_page_content(raw_pages: Any) -> list[dict[str, Any]]: """Normalize PageIndex/local PDF page content into OpenKB's JSON shape.""" - if not isinstance(raw_pages, list): - return [] - - pages: list[dict[str, Any]] = [] - for index, item in enumerate(raw_pages, start=1): - if isinstance(item, str): - content = item.strip() - if content: - pages.append({"page": index, "content": content, "images": []}) - continue - - if not isinstance(item, dict): - continue - - raw_page = item.get("page", item.get("page_number", item.get("page_num", index))) - try: - page_number = int(raw_page) - except (TypeError, ValueError): - page_number = index - if page_number < 1: - page_number = index - - content = item.get("content", item.get("markdown", item.get("text", ""))) - if content is None: - content = "" - content = str(content).strip() - - images = item.get("images", []) - if not isinstance(images, list): - images = [] - normalized_images = [ - image for image in images - if isinstance(image, dict) and isinstance(image.get("path"), str) - ] - - if content or normalized_images: - pages.append({ - "page": page_number, - "content": content, - "images": normalized_images, - }) - - return pages + return pages_to_json(normalize_page_content(raw_pages)) def _get_pdf_page_count(pdf_path: Path) -> int: @@ -80,12 +39,6 @@ def _get_pdf_page_count(pdf_path: Path) -> int: return get_pdf_page_count(pdf_path) -def _convert_pdf_to_pages(pdf_path: Path, doc_name: str, images_dir: Path) -> list[dict[str, Any]]: - from openkb.images import convert_pdf_to_pages - - return convert_pdf_to_pages(pdf_path, doc_name, images_dir) - - def index_long_document( pdf_path: Path, kb_dir: Path, doc_name: str | None = None ) -> IndexResult: @@ -151,8 +104,6 @@ def index_long_document( all_pages: list[dict[str, Any]] = [] if pageindex_api_key: - # Cloud mode: fetch OCR'd markdown from PageIndex. get_page_content - # requires a page range, so pass "1-N". page_count = _get_pdf_page_count(pdf_path) try: all_pages = _normalize_page_content(col.get_page_content(doc_id, f"1-{page_count}")) @@ -161,8 +112,9 @@ def index_long_document( if not all_pages: if pageindex_api_key: - logger.warning("Cloud returned no pages for %s; falling back to local pymupdf", pdf_path.name) - all_pages = _normalize_page_content(_convert_pdf_to_pages(pdf_path, source_name, images_dir)) + logger.warning("Cloud returned no pages for %s; falling back to configured PDF extractor", pdf_path.name) + pdf_extractor = resolve_pdf_extractor(config) + all_pages = pages_to_json(pdf_extractor.parse_pages(pdf_path, source_name, images_dir)) if not all_pages: raise RuntimeError(f"No page content extracted for {pdf_path.name}") diff --git a/tests/test_indexer.py b/tests/test_indexer.py index 843f6614..5a31b1a6 100644 --- a/tests/test_indexer.py +++ b/tests/test_indexer.py @@ -5,6 +5,7 @@ from openkb.indexer import IndexResult, _normalize_page_content, index_long_document +from openkb.pdf_extractor import PageContent class TestNormalizePageContent: @@ -57,8 +58,8 @@ def _make_fake_collection(self, doc_id: str, sample_tree: dict): def _fake_pages(self): return [ - {"page": 1, "content": "Page one text.", "images": []}, - {"page": 2, "content": "Page two text.", "images": []}, + PageContent(page=1, content="Page one text.", images=[]), + PageContent(page=2, content="Page two text.", images=[]), ] def test_returns_index_result(self, kb_dir, sample_tree, tmp_path): @@ -71,8 +72,11 @@ def test_returns_index_result(self, kb_dir, sample_tree, tmp_path): pdf_path = tmp_path / "sample.pdf" pdf_path.write_bytes(b"%PDF-1.4 fake") + extractor = MagicMock() + extractor.parse_pages.return_value = self._fake_pages() + with patch("openkb.indexer.PageIndexClient", return_value=fake_client), \ - patch("openkb.images.convert_pdf_to_pages", return_value=self._fake_pages()): + patch("openkb.indexer.resolve_pdf_extractor", return_value=extractor): result = index_long_document(pdf_path, kb_dir) assert isinstance(result, IndexResult) @@ -88,17 +92,14 @@ def test_source_page_written_as_json(self, kb_dir, sample_tree, tmp_path): fake_client = MagicMock() fake_client.collection.return_value = fake_col - # Mock get_page_content to return page data - fake_col.get_page_content.return_value = [ - {"page": 1, "content": "Page one text."}, - {"page": 2, "content": "Page two text."}, - ] - pdf_path = tmp_path / "sample.pdf" pdf_path.write_bytes(b"%PDF-1.4 fake") + extractor = MagicMock() + extractor.parse_pages.return_value = self._fake_pages() + with patch("openkb.indexer.PageIndexClient", return_value=fake_client), \ - patch("openkb.images.convert_pdf_to_pages", return_value=self._fake_pages()): + patch("openkb.indexer.resolve_pdf_extractor", return_value=extractor): index_long_document(pdf_path, kb_dir) json_file = kb_dir / "wiki" / "sources" / "sample.json" @@ -119,8 +120,11 @@ def test_summary_page_written(self, kb_dir, sample_tree, tmp_path): pdf_path = tmp_path / "sample.pdf" pdf_path.write_bytes(b"%PDF-1.4 fake") + extractor = MagicMock() + extractor.parse_pages.return_value = self._fake_pages() + with patch("openkb.indexer.PageIndexClient", return_value=fake_client), \ - patch("openkb.images.convert_pdf_to_pages", return_value=self._fake_pages()): + patch("openkb.indexer.resolve_pdf_extractor", return_value=extractor): index_long_document(pdf_path, kb_dir) summary_file = kb_dir / "wiki" / "summaries" / "sample.md" @@ -140,8 +144,11 @@ def test_localclient_called_with_index_config(self, kb_dir, sample_tree, tmp_pat pdf_path = tmp_path / "report.pdf" pdf_path.write_bytes(b"%PDF-1.4 fake") + extractor = MagicMock() + extractor.parse_pages.return_value = self._fake_pages() + with patch("openkb.indexer.PageIndexClient", return_value=fake_client) as mock_cls, \ - patch("openkb.images.convert_pdf_to_pages", return_value=self._fake_pages()): + patch("openkb.indexer.resolve_pdf_extractor", return_value=extractor): index_long_document(pdf_path, kb_dir) # Verify PageIndexClient was instantiated with correct IndexConfig @@ -153,7 +160,7 @@ def test_localclient_called_with_index_config(self, kb_dir, sample_tree, tmp_pat assert ic.if_add_node_summary is True assert ic.if_add_doc_description is True - def test_cloud_page_content_is_normalized(self, kb_dir, sample_tree, tmp_path, monkeypatch): + def test_cloud_page_content_is_normalized_before_shared_extractor_fallback(self, kb_dir, sample_tree, tmp_path, monkeypatch): doc_id = "cloud-123" fake_col = self._make_fake_collection(doc_id, sample_tree) fake_col.get_page_content.return_value = [ @@ -168,17 +175,21 @@ def test_cloud_page_content_is_normalized(self, kb_dir, sample_tree, tmp_path, m pdf_path.write_bytes(b"%PDF-1.4 fake") monkeypatch.setenv("PAGEINDEX_API_KEY", "test-key") + extractor = MagicMock() + extractor.parse_pages.return_value = self._fake_pages() + with patch("openkb.indexer.PageIndexClient", return_value=fake_client), \ patch("openkb.indexer._get_pdf_page_count", return_value=2), \ - patch("openkb.indexer._convert_pdf_to_pages") as local_pages: + patch("openkb.indexer.resolve_pdf_extractor", return_value=extractor): index_long_document(pdf_path, kb_dir) - local_pages.assert_not_called() + fake_col.get_page_content.assert_called_once_with(doc_id, "1-2") + extractor.parse_pages.assert_not_called() json_file = kb_dir / "wiki" / "sources" / "sample.json" assert '"content": "Cloud page one."' in json_file.read_text(encoding="utf-8") assert '"content": "Cloud page two."' in json_file.read_text(encoding="utf-8") - def test_invalid_cloud_page_content_falls_back_to_local(self, kb_dir, sample_tree, tmp_path, monkeypatch): + def test_invalid_cloud_page_content_falls_back_to_shared_extractor(self, kb_dir, sample_tree, tmp_path, monkeypatch): doc_id = "cloud-456" fake_col = self._make_fake_collection(doc_id, sample_tree) fake_col.get_page_content.return_value = {"bad": "shape"} @@ -190,16 +201,20 @@ def test_invalid_cloud_page_content_falls_back_to_local(self, kb_dir, sample_tre pdf_path.write_bytes(b"%PDF-1.4 fake") monkeypatch.setenv("PAGEINDEX_API_KEY", "test-key") + extractor = MagicMock() + extractor.parse_pages.return_value = self._fake_pages() + with patch("openkb.indexer.PageIndexClient", return_value=fake_client), \ patch("openkb.indexer._get_pdf_page_count", return_value=2), \ - patch("openkb.indexer._convert_pdf_to_pages", return_value=self._fake_pages()) as local_pages: + patch("openkb.indexer.resolve_pdf_extractor", return_value=extractor): index_long_document(pdf_path, kb_dir) - local_pages.assert_called_once() + fake_col.get_page_content.assert_called_once_with(doc_id, "1-2") + extractor.parse_pages.assert_called_once() json_file = kb_dir / "wiki" / "sources" / "sample.json" assert "Page one text." in json_file.read_text(encoding="utf-8") - def test_empty_cloud_and_local_pages_fail(self, kb_dir, sample_tree, tmp_path, monkeypatch): + def test_empty_shared_extractor_pages_fail(self, kb_dir, sample_tree, tmp_path, monkeypatch): doc_id = "empty-123" fake_col = self._make_fake_collection(doc_id, sample_tree) @@ -210,9 +225,12 @@ def test_empty_cloud_and_local_pages_fail(self, kb_dir, sample_tree, tmp_path, m pdf_path.write_bytes(b"%PDF-1.4 fake") monkeypatch.setenv("PAGEINDEX_API_KEY", "test-key") + extractor = MagicMock() + extractor.parse_pages.return_value = [] + with patch("openkb.indexer.PageIndexClient", return_value=fake_client), \ patch("openkb.indexer._get_pdf_page_count", return_value=2), \ - patch("openkb.indexer._convert_pdf_to_pages", return_value=[]): + patch("openkb.indexer.resolve_pdf_extractor", return_value=extractor): try: index_long_document(pdf_path, kb_dir) except RuntimeError as exc: @@ -237,10 +255,11 @@ def test_index_long_document_uses_explicit_doc_name(kb_dir, monkeypatch): pdf = kb_dir / "raw" / "original.pdf" pdf.write_bytes(b"%PDF-1.4 fake") + extractor = MagicMock() + extractor.parse_pages.return_value = [PageContent(page=1, content="p1", images=[])] + with patch("openkb.indexer.PageIndexClient", return_value=fake_client), \ - patch("openkb.indexer._get_pdf_page_count", return_value=30), \ - patch("openkb.indexer._convert_pdf_to_pages", - return_value=[{"page": 1, "text": "p1"}]) as mock_convert: + patch("openkb.indexer.resolve_pdf_extractor", return_value=extractor): result = index_long_document(pdf, kb_dir, doc_name="original-abc12345") assert result.doc_id == "doc-123" @@ -251,7 +270,7 @@ def test_index_long_document_uses_explicit_doc_name(kb_dir, monkeypatch): assert not (kb_dir / "wiki" / "summaries" / "original.md").exists() # the page extractor receives the explicit doc_name and its images dir expected_images = kb_dir / "wiki" / "sources" / "images" / "original-abc12345" - mock_convert.assert_called_once_with(pdf, "original-abc12345", expected_images) + extractor.parse_pages.assert_called_once_with(pdf, "original-abc12345", expected_images) # summary frontmatter points full_text at the doc_name artifact summary_text = (kb_dir / "wiki" / "summaries" / "original-abc12345.md").read_text(encoding="utf-8") assert "original-abc12345" in summary_text From 8930a8850dff3223691662d5f78910784b685483 Mon Sep 17 00:00:00 2001 From: Guohao Zhang Date: Tue, 23 Jun 2026 21:29:09 +0800 Subject: [PATCH 4/4] test(cli): close mocked async compile coroutines --- tests/conftest.py | 23 +++++++++++++++++++++++ tests/test_add_command.py | 12 ++++++------ tests/test_remove.py | 8 ++++---- tests/test_url_ingest.py | 18 ++++++++++++------ 4 files changed, 45 insertions(+), 16 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 3c556551..617a86f6 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,3 +1,4 @@ +import inspect import json import pytest @@ -10,6 +11,28 @@ def _reset_extra_headers(): set_extra_headers({}) +@pytest.fixture +def close_coroutine_run(): + """Test double for asyncio.run that closes unawaited coroutines.""" + def _run(awaitable): + if inspect.iscoroutine(awaitable): + awaitable.close() + + return _run + + +@pytest.fixture +def raise_after_closing_coroutine(close_coroutine_run): + def _factory(exc): + def _run(awaitable): + close_coroutine_run(awaitable) + raise exc + + return _run + + return _factory + + @pytest.fixture def kb_dir(tmp_path): """Create a minimal knowledge base directory structure for testing.""" diff --git a/tests/test_add_command.py b/tests/test_add_command.py index 0199c9e2..9567b9c1 100644 --- a/tests/test_add_command.py +++ b/tests/test_add_command.py @@ -109,7 +109,7 @@ def test_add_nonexistent_path(self, tmp_path): result = runner.invoke(cli, ["add", str(tmp_path / "nonexistent.pdf")]) assert "does not exist" in result.output - def test_add_skipped_file(self, tmp_path): + def test_add_skipped_file(self, tmp_path, close_coroutine_run): kb_dir = self._setup_kb(tmp_path) doc = tmp_path / "test.md" doc.write_text("# Hello") @@ -120,12 +120,12 @@ def test_add_skipped_file(self, tmp_path): runner = CliRunner() with patch("openkb.cli._find_kb_dir", return_value=kb_dir), \ patch("openkb.cli.convert_document", return_value=mock_result), \ - patch("openkb.cli.asyncio.run") as mock_arun: + patch("openkb.cli.asyncio.run", side_effect=close_coroutine_run) as mock_arun: result = runner.invoke(cli, ["add", str(doc)]) assert "SKIP" in result.output mock_arun.assert_not_called() - def test_add_short_doc_runs_compiler(self, tmp_path): + def test_add_short_doc_runs_compiler(self, tmp_path, close_coroutine_run): kb_dir = self._setup_kb(tmp_path) doc = tmp_path / "test.md" doc.write_text("# Hello") @@ -152,7 +152,7 @@ def test_add_short_doc_runs_compiler(self, tmp_path): runner = CliRunner() with patch("openkb.cli._find_kb_dir", return_value=kb_dir), \ patch("openkb.cli.convert_document", return_value=mock_result), \ - patch("openkb.cli.asyncio.run") as mock_arun: + patch("openkb.cli.asyncio.run", side_effect=close_coroutine_run) as mock_arun: result = runner.invoke(cli, ["add", str(doc)]) mock_arun.assert_called_once() assert "OK" in result.output @@ -168,7 +168,7 @@ def test_add_short_doc_runs_compiler(self, tmp_path): assert "path" in meta assert "stale-old-hash" not in hashes - def test_add_oldest_legacy_entry_converges_to_single_entry(self, tmp_path): + def test_add_oldest_legacy_entry_converges_to_single_entry(self, tmp_path, close_coroutine_run): """Editing a pre-doc_name-era document must not fork the registry. convert_document backfills doc_name/path onto the legacy entry on @@ -192,7 +192,7 @@ def test_add_oldest_legacy_entry_converges_to_single_entry(self, tmp_path): # the legacy backfill actually happens on disk mid-pipeline. runner = CliRunner() with patch("openkb.cli._find_kb_dir", return_value=kb_dir), \ - patch("openkb.cli.asyncio.run"): + patch("openkb.cli.asyncio.run", side_effect=close_coroutine_run): result = runner.invoke(cli, ["add", str(doc)]) assert "OK" in result.output diff --git a/tests/test_remove.py b/tests/test_remove.py index 75daadf0..d8174813 100644 --- a/tests/test_remove.py +++ b/tests/test_remove.py @@ -696,7 +696,7 @@ def test_cli_remove_preserves_ghosts_in_unrelated_pages(kb_dir): # --------------------------------------------------------------------------- -def test_add_persists_doc_name_for_later_remove(tmp_path): +def test_add_persists_doc_name_for_later_remove(tmp_path, close_coroutine_run): """End-to-end: `openkb add` writes a registry entry with `doc_name`, and a subsequent `openkb remove` actually prunes that entry. """ @@ -743,7 +743,7 @@ def test_add_persists_doc_name_for_later_remove(tmp_path): # Mock convert_document + asyncio.run to skip the LLM-driven compile. with patch("openkb.cli._find_kb_dir", return_value=tmp_path), \ patch("openkb.cli.convert_document", return_value=mock_result), \ - patch("openkb.cli.asyncio.run"): + patch("openkb.cli.asyncio.run", side_effect=close_coroutine_run): add_res = runner.invoke(cli, ["add", str(doc)]) assert add_res.exit_code == 0, add_res.output @@ -944,7 +944,7 @@ def test_cli_remove_dry_run_does_not_touch_images(kb_dir): # --------------------------------------------------------------------------- -def test_add_long_pdf_persists_doc_id_to_registry(tmp_path): +def test_add_long_pdf_persists_doc_id_to_registry(tmp_path, close_coroutine_run): """Long-doc ingest must record `doc_id` in the registry. Without it, the remove path has no handle to feed `Collection.delete_document`. """ @@ -987,7 +987,7 @@ def test_add_long_pdf_persists_doc_id_to_registry(tmp_path): with patch("openkb.cli._find_kb_dir", return_value=tmp_path), \ patch("openkb.cli.convert_document", return_value=convert_mock), \ patch("openkb.indexer.index_long_document", return_value=index_mock), \ - patch("openkb.cli.asyncio.run"): + patch("openkb.cli.asyncio.run", side_effect=close_coroutine_run): result = runner.invoke(cli, ["add", str(pdf)]) assert result.exit_code == 0, result.output diff --git a/tests/test_url_ingest.py b/tests/test_url_ingest.py index 1b8548ee..579f5e03 100644 --- a/tests/test_url_ingest.py +++ b/tests/test_url_ingest.py @@ -461,7 +461,7 @@ def test_fetch_pdf_uses_post_redirect_url_for_filename(tmp_path): assert result.name == "great-paper.pdf" -def test_add_single_file_returns_added_on_success(tmp_path): +def test_add_single_file_returns_added_on_success(tmp_path, close_coroutine_run): """Tri-state return contract: ``"added"`` when the file was newly indexed. URL-ingest uses this to decide whether to keep / unlink the just-downloaded file.""" @@ -489,7 +489,7 @@ def test_add_single_file_returns_added_on_success(tmp_path): ) with patch("openkb.cli.convert_document", return_value=mock_result), \ - patch("openkb.cli.asyncio.run"): + patch("openkb.cli.asyncio.run", side_effect=close_coroutine_run): outcome = add_single_file(doc, tmp_path) assert outcome == "added" @@ -513,7 +513,7 @@ def test_add_single_file_returns_skipped_on_dedup(tmp_path): assert outcome == "skipped" -def test_add_single_file_returns_failed_on_pipeline_error(tmp_path): +def test_add_single_file_returns_failed_on_pipeline_error(tmp_path, raise_after_closing_coroutine): """A pipeline failure (e.g. transient LLM error during compilation) must be distinguishable from dedup-skip, so URL-ingest can preserve the raw file for retry instead of deleting it.""" @@ -540,7 +540,10 @@ def test_add_single_file_returns_failed_on_pipeline_error(tmp_path): # Make both compile attempts raise to drive the failure path. with patch("openkb.cli.convert_document", return_value=mock_result), \ - patch("openkb.cli.asyncio.run", side_effect=RuntimeError("LLM 503")), \ + patch( + "openkb.cli.asyncio.run", + side_effect=raise_after_closing_coroutine(RuntimeError("LLM 503")), + ), \ patch("openkb.cli.time.sleep"): outcome = add_single_file(doc, tmp_path) @@ -580,7 +583,7 @@ def test_url_ingest_cleans_up_orphan_on_dedup_skip(tmp_path, monkeypatch): assert not fetched_path.exists() -def test_url_ingest_keeps_raw_file_on_pipeline_failure(tmp_path): +def test_url_ingest_keeps_raw_file_on_pipeline_failure(tmp_path, raise_after_closing_coroutine): """The point of the tri-state return: a pipeline failure (e.g. LLM timeout during compilation) must NOT delete the downloaded file — the user can retry without re-downloading, and we don't lose data @@ -611,7 +614,10 @@ def test_url_ingest_keeps_raw_file_on_pipeline_failure(tmp_path): with patch("openkb.cli._find_kb_dir", return_value=tmp_path), \ patch("openkb.url_ingest.fetch_url_to_raw", return_value=fetched_path), \ patch("openkb.cli.convert_document", return_value=mock_result), \ - patch("openkb.cli.asyncio.run", side_effect=RuntimeError("LLM 503")), \ + patch( + "openkb.cli.asyncio.run", + side_effect=raise_after_closing_coroutine(RuntimeError("LLM 503")), + ), \ patch("openkb.cli.time.sleep"): result = runner.invoke(cli, ["add", "https://example.com/paper.pdf"])