Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions openkb/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
"model": "gpt-5.4-mini",
"language": "en",
"pageindex_threshold": 20,
"pdf_parser": "local",
}

# Default entity-type vocabulary. Overridable per-KB via the optional
Expand Down
9 changes: 6 additions & 3 deletions openkb/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@
from markitdown import MarkItDown

from openkb.config import load_config
from openkb.images import copy_relative_images, extract_base64_images, convert_pdf_with_images
from openkb.images import copy_relative_images, extract_base64_images
from openkb.pdf_extractor import pages_to_markdown, resolve_pdf_extractor
from openkb.state import HashRegistry

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -187,8 +188,10 @@ def convert_document(src: Path, kb_dir: Path) -> ConvertResult:
markdown = src.read_text(encoding="utf-8")
markdown = copy_relative_images(markdown, src.parent, doc_name, images_dir)
elif src.suffix.lower() == ".pdf":
# Use pymupdf dict-mode for PDFs: text + images inline at correct positions
markdown = convert_pdf_with_images(src, doc_name, images_dir)
pdf_extractor = resolve_pdf_extractor(config)
markdown = pages_to_markdown(
pdf_extractor.parse_pages(src, doc_name, images_dir)
)
else:
# Non-PDF, non-MD: use markitdown (docx, pptx, html, etc.)
mid = MarkItDown()
Expand Down
58 changes: 5 additions & 53 deletions openkb/indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from pageindex import IndexConfig, PageIndexClient

from openkb.config import load_config
from openkb.pdf_extractor import normalize_page_content, pages_to_json, resolve_pdf_extractor
from openkb.tree_renderer import render_summary_md

logger = logging.getLogger(__name__)
Expand All @@ -29,49 +30,7 @@ class IndexResult:

def _normalize_page_content(raw_pages: Any) -> list[dict[str, Any]]:
"""Normalize PageIndex/local PDF page content into OpenKB's JSON shape."""
if not isinstance(raw_pages, list):
return []

pages: list[dict[str, Any]] = []
for index, item in enumerate(raw_pages, start=1):
if isinstance(item, str):
content = item.strip()
if content:
pages.append({"page": index, "content": content, "images": []})
continue

if not isinstance(item, dict):
continue

raw_page = item.get("page", item.get("page_number", item.get("page_num", index)))
try:
page_number = int(raw_page)
except (TypeError, ValueError):
page_number = index
if page_number < 1:
page_number = index

content = item.get("content", item.get("markdown", item.get("text", "")))
if content is None:
content = ""
content = str(content).strip()

images = item.get("images", [])
if not isinstance(images, list):
images = []
normalized_images = [
image for image in images
if isinstance(image, dict) and isinstance(image.get("path"), str)
]

if content or normalized_images:
pages.append({
"page": page_number,
"content": content,
"images": normalized_images,
})

return pages
return pages_to_json(normalize_page_content(raw_pages))


def _get_pdf_page_count(pdf_path: Path) -> int:
Expand All @@ -80,12 +39,6 @@ def _get_pdf_page_count(pdf_path: Path) -> int:
return get_pdf_page_count(pdf_path)


def _convert_pdf_to_pages(pdf_path: Path, doc_name: str, images_dir: Path) -> list[dict[str, Any]]:
from openkb.images import convert_pdf_to_pages

return convert_pdf_to_pages(pdf_path, doc_name, images_dir)


def index_long_document(
pdf_path: Path, kb_dir: Path, doc_name: str | None = None
) -> IndexResult:
Expand Down Expand Up @@ -151,8 +104,6 @@ def index_long_document(

all_pages: list[dict[str, Any]] = []
if pageindex_api_key:
# Cloud mode: fetch OCR'd markdown from PageIndex. get_page_content
# requires a page range, so pass "1-N".
page_count = _get_pdf_page_count(pdf_path)
try:
all_pages = _normalize_page_content(col.get_page_content(doc_id, f"1-{page_count}"))
Expand All @@ -161,8 +112,9 @@ def index_long_document(

if not all_pages:
if pageindex_api_key:
logger.warning("Cloud returned no pages for %s; falling back to local pymupdf", pdf_path.name)
all_pages = _normalize_page_content(_convert_pdf_to_pages(pdf_path, source_name, images_dir))
logger.warning("Cloud returned no pages for %s; falling back to configured PDF extractor", pdf_path.name)
pdf_extractor = resolve_pdf_extractor(config)
all_pages = pages_to_json(pdf_extractor.parse_pages(pdf_path, source_name, images_dir))

if not all_pages:
raise RuntimeError(f"No page content extracted for {pdf_path.name}")
Expand Down
144 changes: 144 additions & 0 deletions openkb/pdf_extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
"""Shared PDF page extraction for OpenKB."""
from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path
from typing import Any, Protocol


@dataclass(frozen=True)
class PageContent:
"""Extracted content for one PDF page."""

page: int
content: str
images: list[dict[str, Any]]


class PdfExtractor(Protocol):
"""Protocol implemented by PDF page extraction backends."""

def parse_pages(
self,
pdf_path: Path,
doc_name: str,
images_dir: Path,
) -> list[PageContent]:
"""Extract ordered page content from *pdf_path*."""


class LocalPdfExtractor:
"""Local PyMuPDF-backed PDF extractor preserving existing behavior."""

provider = "local"

def parse_pages(
self,
pdf_path: Path,
doc_name: str,
images_dir: Path,
) -> list[PageContent]:
from openkb.images import convert_pdf_to_pages

return normalize_page_content(convert_pdf_to_pages(pdf_path, doc_name, images_dir))


def normalize_page_content(raw_pages: Any) -> list[PageContent]:
"""Normalize raw page dictionaries into :class:`PageContent` objects."""
if not isinstance(raw_pages, list):
return []

pages: list[PageContent] = []
for index, item in enumerate(raw_pages, start=1):
if isinstance(item, str):
content = item.strip()
if content:
pages.append(PageContent(page=index, content=content, images=[]))
continue

if not isinstance(item, dict):
continue

raw_page = item.get("page", item.get("page_number", item.get("page_num", index)))
try:
page_number = int(raw_page)
except (TypeError, ValueError):
page_number = index
if page_number < 1:
page_number = index

content = item.get("content", item.get("markdown", item.get("text", "")))
if content is None:
content = ""
content = str(content).strip()

images = item.get("images", [])
if not isinstance(images, list):
images = []
normalized_images = [
image for image in images
if isinstance(image, dict) and isinstance(image.get("path"), str)
]

if content or normalized_images:
pages.append(PageContent(
page=page_number,
content=content,
images=normalized_images,
))

return pages


def pages_to_markdown(pages: list[PageContent]) -> str:
"""Render extracted pages as the Markdown source used for short PDFs."""
rendered_pages: list[str] = []
for page in pages:
parts: list[str] = []
if page.content:
parts.append(page.content)

for image in page.images:
path = image.get("path")
if not isinstance(path, str) or not path:
continue
image_markdown = f"![image]({path})"
if image_markdown not in page.content and path not in page.content:
parts.append(image_markdown)

page_markdown = "\n\n".join(parts).strip()
if page_markdown:
rendered_pages.append(page_markdown)

return "\n\n".join(rendered_pages).strip()


def pages_to_json(pages: list[PageContent]) -> list[dict[str, Any]]:
"""Render extracted pages as OpenKB's long-PDF source JSON shape."""
return [
{
"page": page.page,
"content": page.content,
"images": page.images,
}
for page in pages
]


def _provider_from_config(config: dict[str, Any]) -> str:
raw = config.get("pdf_parser", "local")
if isinstance(raw, str):
provider = raw
elif isinstance(raw, dict):
provider = raw.get("provider", "local")
else:
provider = "local"
return str(provider).strip().lower() or "local"


def resolve_pdf_extractor(config: dict[str, Any]) -> PdfExtractor:
"""Resolve the configured PDF extraction backend."""
provider = _provider_from_config(config)
if provider == "local":
return LocalPdfExtractor()
raise ValueError(f"Unsupported pdf_parser provider: {provider}")
23 changes: 23 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import inspect
import json
import pytest

Expand All @@ -11,6 +12,28 @@ def _reset_extra_headers():
set_timeout(None)


@pytest.fixture
def close_coroutine_run():
"""Test double for asyncio.run that closes unawaited coroutines."""
def _run(awaitable):
if inspect.iscoroutine(awaitable):
awaitable.close()

return _run


@pytest.fixture
def raise_after_closing_coroutine(close_coroutine_run):
def _factory(exc):
def _run(awaitable):
close_coroutine_run(awaitable)
raise exc

return _run

return _factory


@pytest.fixture
def kb_dir(tmp_path):
"""Create a minimal knowledge base directory structure for testing."""
Expand Down
12 changes: 6 additions & 6 deletions tests/test_add_command.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ def test_add_nonexistent_path(self, tmp_path):
result = runner.invoke(cli, ["add", str(tmp_path / "nonexistent.pdf")])
assert "does not exist" in result.output

def test_add_skipped_file(self, tmp_path):
def test_add_skipped_file(self, tmp_path, close_coroutine_run):
kb_dir = self._setup_kb(tmp_path)
doc = tmp_path / "test.md"
doc.write_text("# Hello")
Expand All @@ -120,12 +120,12 @@ def test_add_skipped_file(self, tmp_path):
runner = CliRunner()
with patch("openkb.cli._find_kb_dir", return_value=kb_dir), \
patch("openkb.cli.convert_document", return_value=mock_result), \
patch("openkb.cli.asyncio.run") as mock_arun:
patch("openkb.cli.asyncio.run", side_effect=close_coroutine_run) as mock_arun:
result = runner.invoke(cli, ["add", str(doc)])
assert "SKIP" in result.output
mock_arun.assert_not_called()

def test_add_short_doc_runs_compiler(self, tmp_path):
def test_add_short_doc_runs_compiler(self, tmp_path, close_coroutine_run):
kb_dir = self._setup_kb(tmp_path)
doc = tmp_path / "test.md"
doc.write_text("# Hello")
Expand All @@ -152,7 +152,7 @@ def test_add_short_doc_runs_compiler(self, tmp_path):
runner = CliRunner()
with patch("openkb.cli._find_kb_dir", return_value=kb_dir), \
patch("openkb.cli.convert_document", return_value=mock_result), \
patch("openkb.cli.asyncio.run") as mock_arun:
patch("openkb.cli.asyncio.run", side_effect=close_coroutine_run) as mock_arun:
result = runner.invoke(cli, ["add", str(doc)])
mock_arun.assert_called_once()
assert "OK" in result.output
Expand All @@ -168,7 +168,7 @@ def test_add_short_doc_runs_compiler(self, tmp_path):
assert "path" in meta
assert "stale-old-hash" not in hashes

def test_add_oldest_legacy_entry_converges_to_single_entry(self, tmp_path):
def test_add_oldest_legacy_entry_converges_to_single_entry(self, tmp_path, close_coroutine_run):
"""Editing a pre-doc_name-era document must not fork the registry.

convert_document backfills doc_name/path onto the legacy entry on
Expand All @@ -192,7 +192,7 @@ def test_add_oldest_legacy_entry_converges_to_single_entry(self, tmp_path):
# the legacy backfill actually happens on disk mid-pipeline.
runner = CliRunner()
with patch("openkb.cli._find_kb_dir", return_value=kb_dir), \
patch("openkb.cli.asyncio.run"):
patch("openkb.cli.asyncio.run", side_effect=close_coroutine_run):
result = runner.invoke(cli, ["add", str(doc)])
assert "OK" in result.output

Expand Down
2 changes: 2 additions & 0 deletions tests/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,14 @@ def test_default_config_keys():
assert "model" in DEFAULT_CONFIG
assert "language" in DEFAULT_CONFIG
assert "pageindex_threshold" in DEFAULT_CONFIG
assert "pdf_parser" in DEFAULT_CONFIG


def test_default_config_values():
assert DEFAULT_CONFIG["model"] == "gpt-5.4-mini"
assert DEFAULT_CONFIG["language"] == "en"
assert DEFAULT_CONFIG["pageindex_threshold"] == 20
assert DEFAULT_CONFIG["pdf_parser"] == "local"


def test_load_missing_file_returns_defaults(tmp_path):
Expand Down
Loading