diff --git a/akd_ext/tools/__init__.py b/akd_ext/tools/__init__.py
index fb1328e..6e6d56c 100644
--- a/akd_ext/tools/__init__.py
+++ b/akd_ext/tools/__init__.py
@@ -20,6 +20,12 @@
     RepositorySearchToolOutputSchema,
     RepositorySearchToolConfig,
 )
+from .pdf_parser import (
+    PDFParserTool,
+    PDFParserToolConfig,
+    PDFParserToolInputSchema,
+    PDFParserToolOutputSchema,
+)
 
 __all__ = [
     "DummyTool",
@@ -38,4 +44,8 @@
     "RepositorySearchToolInputSchema",
     "RepositorySearchToolOutputSchema",
     "RepositorySearchToolConfig",
+    "PDFParserTool",
+    "PDFParserToolConfig",
+    "PDFParserToolInputSchema",
+    "PDFParserToolOutputSchema",
 ]
diff --git a/akd_ext/tools/pdf_parser.py b/akd_ext/tools/pdf_parser.py
new file mode 100644
index 0000000..5935ba7
--- /dev/null
+++ b/akd_ext/tools/pdf_parser.py
@@ -0,0 +1,174 @@
+"""PDF parser tool using AKD core backends."""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+from typing import Any, Literal
+
+from akd._base import InputSchema, OutputSchema
+from akd.tools import BaseTool, BaseToolConfig
+from akd.tools.scrapers import (
+    DoclingScraper,
+    DoclingScraperConfig,
+    ScraperToolOutputSchema,
+    SimplePDFScraper,
+)
+from pydantic import Field
+
+from akd_ext.mcp import mcp_tool
+
+# Parsing modes accepted by the tool.
+Mode = Literal["fast", "accurate", "ocr"]
+# Backend names that can be requested explicitly through ``backend_hint``.
+BackendHint = Literal[
+    "akd_simple",
+    "akd_docling",
+]
+
+
+class PDFParserToolInputSchema(InputSchema):
+    """Input schema for PDF parsing."""
+
+    url_or_path: str = Field(..., description="HTTP(S) URL or local filesystem path to a PDF")
+    mode: Mode = Field(default="accurate", description="Parsing mode: fast, accurate, or ocr")
+    backend_hint: BackendHint | None = Field(
+        default=None,
+        description="Optional backend override (akd_simple or akd_docling)",
+    )
+    return_format: Literal["markdown", "html", "json"] = Field(
+        default="markdown",
+        description="Preferred output format hint for backend parsing",
+    )
+
+
+class PDFParserToolOutputSchema(OutputSchema):
+    """Output schema for parsed PDF content."""
+
+    content: str = Field(..., description="Parsed text content")
+    metadata: dict[str, Any] = Field(default_factory=dict, description="Parser and document metadata")
+
+
+class PDFParserToolConfig(BaseToolConfig):
+    """Configuration for PDF parser backend scrapers."""
+
+    akd_simple_config: dict[str, Any] | None = Field(
+        default=None,
+        description="Optional configuration forwarded to SimplePDFScraper.",
+    )
+    akd_docling_config: DoclingScraperConfig | None = Field(
+        default=None,
+        description="Optional configuration forwarded to DoclingScraper.",
+    )
+
+
+def _normalize_url_or_path(url_or_path: str) -> str:
+    """Return ``url_or_path`` in the form passed to the backend scrapers.
+
+    Values that already start with ``http://``, ``https://`` or ``file://``
+    (matched case-insensitively) are returned unchanged. Anything else is
+    treated as a local path: it is user-expanded and resolved, then returned
+    as a plain path string on Windows and as a ``file://`` URI elsewhere.
+    """
+    if url_or_path.lower().startswith(("http://", "https://", "file://")):
+        return url_or_path
+
+    resolved = Path(url_or_path).expanduser().resolve()
+    # Build only the representation that is returned for this platform.
+    if sys.platform.startswith("win"):
+        return str(resolved)
+    return resolved.as_uri()
+
+
+async def _run_akd_simple(
+    url_or_path: str, config: dict[str, Any] | None = None
+) -> ScraperToolOutputSchema:
+    """Run ``SimplePDFScraper`` on the normalized ``url_or_path``."""
+    scraper = SimplePDFScraper(config=config)
+    params = scraper.input_schema(url=_normalize_url_or_path(url_or_path))
+    return await scraper.arun(params)
+
+
+async def _run_akd_docling(
+    url_or_path: str, mode: Mode, config: DoclingScraperConfig | None = None
+) -> ScraperToolOutputSchema:
+    """Run ``DoclingScraper`` on the normalized ``url_or_path``.
+
+    An explicit ``config`` is used as given and ``mode`` is then not consulted;
+    otherwise a default configuration is derived from ``mode``.
+    """
+    if config is None:
+        # Derive the default only when needed instead of building and discarding it.
+        if mode == "fast":
+            config = DoclingScraperConfig(pdf_mode="fast", do_table_structure=False, use_ocr=False)
+        elif mode == "accurate":
+            config = DoclingScraperConfig(pdf_mode="accurate", do_table_structure=True, use_ocr=False)
+        else:
+            config = DoclingScraperConfig(pdf_mode="accurate", do_table_structure=True, use_ocr=True)
+
+    scraper = DoclingScraper(config=config)
+    params = scraper.input_schema(url=_normalize_url_or_path(url_or_path))
+    return await scraper.arun(params)
+
+
+def _scraper_to_result(out: ScraperToolOutputSchema) -> dict[str, Any]:
+    """Convert a scraper output into a plain ``content``/``metadata`` dict."""
+    return {"content": out.content, "metadata": out.metadata.model_dump()}
+
+
+@mcp_tool
+class PDFParserTool(BaseTool[PDFParserToolInputSchema, PDFParserToolOutputSchema]):
+    """Parse PDFs into LLM-ready content using AKD core backends."""
+
+    input_schema = PDFParserToolInputSchema
+    output_schema = PDFParserToolOutputSchema
+    config_schema = PDFParserToolConfig
+
+    async def _arun(self, params: PDFParserToolInputSchema) -> PDFParserToolOutputSchema:
+        """Parse the PDF with the selected backend and tag the metadata.
+
+        Without a ``backend_hint``, ``fast`` mode uses ``akd_simple`` and every
+        other mode uses ``akd_docling``. The chosen backend and the requested
+        ``return_format`` are recorded in the returned metadata.
+
+        Raises:
+            ValueError: If the backend is neither ``akd_simple`` nor ``akd_docling``.
+        """
+        backend = params.backend_hint
+        if backend is None:
+            backend = "akd_simple" if params.mode == "fast" else "akd_docling"
+
+        tool_config = self.config
+
+        if backend == "akd_simple":
+            result = _scraper_to_result(
+                await _run_akd_simple(
+                    params.url_or_path,
+                    config=tool_config.akd_simple_config,
+                )
+            )
+        elif backend == "akd_docling":
+            result = _scraper_to_result(
+                await _run_akd_docling(
+                    params.url_or_path,
+                    params.mode,
+                    config=tool_config.akd_docling_config,
+                )
+            )
+        else:
+            raise ValueError(f"Unsupported backend: {backend!r}")
+
+        metadata = result.get("metadata", {})
+        if not isinstance(metadata, dict):
+            metadata = {"raw_metadata": metadata}
+        # Copy so the dict handed back by the backend helper is not mutated.
+        metadata = {
+            **metadata,
+            "backend": backend,
+            "return_format": params.return_format,
+        }
+
+        return PDFParserToolOutputSchema(
+            content=str(result.get("content", "") or ""),
+            metadata=metadata,
+        )
diff --git a/tests/tools/test_pdf_parser.py b/tests/tools/test_pdf_parser.py
new file mode 100644
index 0000000..8f1cc30
--- /dev/null
+++ b/tests/tools/test_pdf_parser.py
@@ -0,0 +1,112 @@
+"""Tests for PDF parser tool routing and errors."""
+
+import sys
+
+import pytest
+
+from akd_ext.mcp.registry import MCPToolRegistry
+from akd_ext.tools.pdf_parser import (
+    DoclingScraperConfig,
+    PDFParserTool,
+    PDFParserToolConfig,
+    PDFParserToolInputSchema,
+    _normalize_url_or_path,
+)
+
+
+@pytest.mark.asyncio
+async def test_pdf_parser_defaults_fast_to_akd_simple(monkeypatch):
+    tool = PDFParserTool(config=PDFParserToolConfig(akd_simple_config={"foo": "bar"}))
+
+    async def fake_simple(url_or_path, config=None):
+        return {"content": "simple", "metadata": {"source": url_or_path, "config": config}}
+
+    def fake_scraper_to_result(out):
+        return out
+
+    monkeypatch.setattr("akd_ext.tools.pdf_parser._run_akd_simple", fake_simple)
+    monkeypatch.setattr("akd_ext.tools.pdf_parser._scraper_to_result", fake_scraper_to_result)
+
+    result = await tool.arun(
+        PDFParserToolInputSchema(
+            url_or_path="https://example.com/test.pdf",
+            mode="fast",
+        )
+    )
+
+    assert result.content == "simple"
+    assert result.metadata["backend"] == "akd_simple"
+    assert result.metadata["config"] == {"foo": "bar"}
+
+
+@pytest.mark.asyncio
+async def test_pdf_parser_defaults_non_fast_to_akd_docling(monkeypatch):
+    tool = PDFParserTool(
+        config=PDFParserToolConfig(
+            akd_docling_config=DoclingScraperConfig(pdf_mode="accurate", do_table_structure=False, use_ocr=True)
+        )
+    )
+
+    async def fake_docling(url_or_path, mode, config=None):
+        return {"content": f"docling-{mode}", "metadata": {"source": url_or_path, "config": config}}
+
+    def fake_scraper_to_result(out):
+        return out
+
+    monkeypatch.setattr("akd_ext.tools.pdf_parser._run_akd_docling", fake_docling)
+    monkeypatch.setattr("akd_ext.tools.pdf_parser._scraper_to_result", fake_scraper_to_result)
+
+    result = await tool.arun(
+        PDFParserToolInputSchema(
+            url_or_path="https://example.com/test.pdf",
+            mode="accurate",
+        )
+    )
+
+    assert result.content == "docling-accurate"
+    assert result.metadata["backend"] == "akd_docling"
+    assert isinstance(result.metadata["config"], DoclingScraperConfig)
+    assert result.metadata["config"].use_ocr is True
+
+
+@pytest.mark.asyncio
+async def test_pdf_parser_unsupported_backend(monkeypatch):
+    tool = PDFParserTool()
+
+    # Bypass schema validation intentionally to test runtime fallback branch.
+    params = PDFParserToolInputSchema.model_construct(
+        url_or_path="https://example.com/test.pdf",
+        mode="fast",
+        backend_hint="unknown_backend",
+        return_format="markdown",
+    )
+
+    with pytest.raises(ValueError, match="Unsupported backend"):
+        await tool._arun(params)
+
+
+def test_pdf_parser_registered_in_mcp_registry():
+    import akd_ext.tools  # noqa: F401
+
+    tool_names = {tool.__name__ for tool in MCPToolRegistry().get_tools()}
+    assert "PDFParserTool" in tool_names
+
+
+def test_normalize_keeps_existing_scheme():
+    url = "HTTPS://example.com/test.pdf"
+    assert _normalize_url_or_path(url) == url
+
+
+def test_normalize_local_windows_path_keeps_path(monkeypatch):
+    # Force the Windows branch so the test exercises it on every platform.
+    monkeypatch.setattr(sys, "platform", "win32")
+    normalized = _normalize_url_or_path("C:/temp/file.pdf")
+    assert not normalized.startswith("file://")
+    assert normalized.lower().endswith("file.pdf")
+
+
+def test_normalize_local_posix_path_returns_file_uri(monkeypatch):
+    monkeypatch.setattr(sys, "platform", "linux")
+    normalized = _normalize_url_or_path("/tmp/file.pdf")
+    assert normalized.startswith("file://")
+    assert normalized.endswith("file.pdf")