-
Notifications
You must be signed in to change notification settings - Fork 8
Add PDFParserTool #51
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: develop
Are you sure you want to change the base?
Changes from all commits
23969df
5174f23
0e9f505
eb328f0
976045d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,144 @@ | ||
| """PDF parser tool using AKD core backends.""" | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| import sys | ||
| from pathlib import Path | ||
| from typing import Any, Literal | ||
| from akd._base import InputSchema, OutputSchema | ||
| from akd.tools import BaseTool, BaseToolConfig | ||
| from akd.tools.scrapers import ( | ||
| DoclingScraper, | ||
| DoclingScraperConfig, | ||
| ScraperToolOutputSchema, | ||
| SimplePDFScraper, | ||
| ) | ||
| from pydantic import Field | ||
|
|
||
| from akd_ext.mcp import mcp_tool | ||
|
|
||
# Parsing modes accepted by the tool. With no backend override, "fast" is
# routed to the akd_simple backend and every other mode to akd_docling.
Mode = Literal["fast", "accurate", "ocr"]

# Backend identifiers a caller may pass to override the mode-based routing.
BackendHint = Literal[
    "akd_simple",
    "akd_docling",
]
|
|
||
|
|
||
class PDFParserToolInputSchema(InputSchema):
    """Input schema for PDF parsing.

    ``url_or_path`` is resolved inside the process that runs the tool, so a
    plain filesystem path or ``file://`` URI only works when that process can
    read it. Callers talking to a remote server should pass an HTTP(S) URL.
    """

    # Location of the PDF. Non-URL values are resolved against the filesystem
    # of the process running the tool, not the caller's machine.
    url_or_path: str = Field(
        ...,
        description=(
            "HTTP(S) URL, file:// URI, or filesystem path to a PDF. Paths and "
            "file:// URIs must be readable by the server process; use an "
            "HTTP(S) URL when the server runs on a different machine."
        ),
    )
    # Parsing mode; also drives the default backend choice when no
    # backend_hint is given.
    mode: Mode = Field(default="accurate", description="Parsing mode: fast, accurate, or ocr")
    # Explicit backend selection; None lets the tool pick one from `mode`.
    backend_hint: BackendHint | None = Field(
        default=None,
        description="Optional backend override (akd_simple or akd_docling)",
    )
    # NOTE: within this module the value is only echoed into the output
    # metadata; it is not forwarded to either backend scraper.
    return_format: Literal["markdown", "html", "json"] = Field(
        default="markdown",
        description="Preferred output format hint for backend parsing",
    )
|
|
||
|
|
||
class PDFParserToolOutputSchema(OutputSchema):
    """Result of a PDF parse: extracted text plus descriptive metadata."""

    # Text extracted from the document by the selected backend.
    content: str = Field(
        ...,
        description="Parsed text content",
    )
    # Backend-reported metadata; the tool also records the "backend" used and
    # the requested "return_format" here.
    metadata: dict[str, Any] = Field(
        default_factory=dict,
        description="Parser and document metadata",
    )
|
|
||
|
|
||
class PDFParserToolConfig(BaseToolConfig):
    """Per-backend settings for the scrapers that PDFParserTool delegates to."""

    # Passed as the `config` argument when constructing SimplePDFScraper.
    akd_simple_config: dict[str, Any] | None = Field(
        description="Optional configuration forwarded to SimplePDFScraper.",
        default=None,
    )
    # Passed to DoclingScraper; when set it is used instead of the preset
    # that would otherwise be derived from the requested parsing mode.
    akd_docling_config: DoclingScraperConfig | None = Field(
        description="Optional configuration forwarded to DoclingScraper.",
        default=None,
    )
|
|
||
|
|
||
def _normalize_url_or_path(url_or_path: str) -> str:
    """Return a location string suitable for the backend scrapers.

    Values that already carry an ``http://``, ``https://`` or ``file://``
    scheme (matched case-insensitively) are returned as given, minus any
    surrounding whitespace. Anything else is treated as a filesystem path:
    ``~`` is expanded and the path is made absolute.

    Args:
        url_or_path: URL, ``file://`` URI, or filesystem path.

    Returns:
        The stripped URL/URI, or for filesystem paths the absolute path
        string on Windows and a ``file://`` URI on every other platform.
    """
    # Surrounding whitespace/newlines would otherwise hide the scheme prefix
    # and cause a URL to be resolved as a (nonsensical) local path.
    candidate = url_or_path.strip()
    if candidate.lower().startswith(("http://", "https://", "file://")):
        return candidate

    resolved = Path(candidate).expanduser().resolve()

    # Windows callers get the plain path rather than a file URI.
    # NOTE(review): the reason for the platform split is not shown here.
    if sys.platform.startswith("win"):
        return str(resolved)
    return resolved.as_uri()
|
|
||
async def _run_akd_simple(
    url_or_path: str,
    config: dict[str, Any] | None = None,
) -> ScraperToolOutputSchema:
    """Parse a PDF with ``SimplePDFScraper`` and return the scraper's output.

    Args:
        url_or_path: URL or filesystem path; normalized before use.
        config: Optional configuration handed to ``SimplePDFScraper``.
    """
    simple_scraper = SimplePDFScraper(config=config)
    target = _normalize_url_or_path(url_or_path)
    request = simple_scraper.input_schema(url=target)
    return await simple_scraper.arun(request)
|
|
||
|
|
||
async def _run_akd_docling(
    url_or_path: str,
    mode: Mode,
    config: DoclingScraperConfig | None = None,
) -> ScraperToolOutputSchema:
    """Parse a PDF with ``DoclingScraper`` and return the scraper's output.

    Args:
        url_or_path: URL or filesystem path; normalized before use.
        mode: Parsing mode used to pick a preset when ``config`` is None.
        config: Optional explicit Docling configuration. When given it is
            used as-is and ``mode`` has no effect.
    """
    if config is None:
        # Build the mode preset only when no override was supplied, so an
        # unused configuration object is never constructed.
        if mode == "fast":
            config = DoclingScraperConfig(pdf_mode="fast", do_table_structure=False, use_ocr=False)
        elif mode == "accurate":
            config = DoclingScraperConfig(pdf_mode="accurate", do_table_structure=True, use_ocr=False)
        else:
            # Remaining mode ("ocr"): accurate settings with OCR enabled.
            config = DoclingScraperConfig(pdf_mode="accurate", do_table_structure=True, use_ocr=True)

    scraper = DoclingScraper(config=config)
    params = scraper.input_schema(url=_normalize_url_or_path(url_or_path))
    return await scraper.arun(params)
|
|
||
|
|
||
def _scraper_to_result(out: ScraperToolOutputSchema) -> dict[str, Any]:
    """Flatten a scraper output into a plain ``content``/``metadata`` dict."""
    flattened: dict[str, Any] = {}
    flattened["content"] = out.content
    # The metadata model is dumped to a plain dict.
    flattened["metadata"] = out.metadata.model_dump()
    return flattened
|
|
||
|
|
||
@mcp_tool
class PDFParserTool(BaseTool[PDFParserToolInputSchema, PDFParserToolOutputSchema]):
    """Parse PDFs into LLM-ready content using AKD core backends."""

    input_schema = PDFParserToolInputSchema
    output_schema = PDFParserToolOutputSchema
    config_schema = PDFParserToolConfig

    async def _arun(self, params: PDFParserToolInputSchema) -> PDFParserToolOutputSchema:
        """Route the request to a backend and wrap its output.

        The backend is ``params.backend_hint`` when given; otherwise mode
        ``fast`` selects ``akd_simple`` and every other mode ``akd_docling``.

        Args:
            params: Validated parse request.

        Returns:
            Parsed content plus the backend metadata, extended with the
            ``backend`` used and the requested ``return_format``.

        Raises:
            ValueError: If the backend name is not supported. With a
                validated schema this should not happen; it guards callers
                that bypass validation.
        """
        backend = params.backend_hint
        if backend is None:
            backend = "akd_simple" if params.mode == "fast" else "akd_docling"

        tool_config = self.config

        if backend == "akd_simple":
            scraped = await _run_akd_simple(
                params.url_or_path,
                config=tool_config.akd_simple_config,
            )
        elif backend == "akd_docling":
            scraped = await _run_akd_docling(
                params.url_or_path,
                params.mode,
                config=tool_config.akd_docling_config,
            )
        else:
            raise ValueError(f"Unsupported backend: {backend!r}")

        result = _scraper_to_result(scraped)

        raw_metadata = result.get("metadata", {})
        if isinstance(raw_metadata, dict):
            # Copy so the dict handed back by the backend is not mutated when
            # the routing keys are added below.
            metadata = dict(raw_metadata)
        else:
            metadata = {"raw_metadata": raw_metadata}
        metadata["backend"] = backend
        metadata["return_format"] = params.return_format

        return PDFParserToolOutputSchema(
            # Missing or None content is normalized to an empty string.
            content=str(result.get("content", "") or ""),
            metadata=metadata,
        )
|
Comment on lines
+103
to
+144
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Should we also consider adding a general scraper tool as well? AKD core already has a composite scraper to which we can pass scraper objects. Maybe like |
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,95 @@ | ||
| """Tests for PDF parser tool routing and errors.""" | ||
|
|
||
| import pytest | ||
|
|
||
| from akd_ext.mcp.registry import MCPToolRegistry | ||
| from akd_ext.tools.pdf_parser import ( | ||
| DoclingScraperConfig, | ||
| PDFParserTool, | ||
| PDFParserToolConfig, | ||
| PDFParserToolInputSchema, | ||
| _normalize_url_or_path, | ||
| ) | ||
|
|
||
|
|
||
@pytest.mark.asyncio
async def test_pdf_parser_defaults_fast_to_akd_simple(monkeypatch):
    """Mode ``fast`` without a backend hint is routed to the akd_simple backend."""
    parser_config = PDFParserToolConfig(akd_simple_config={"foo": "bar"})
    tool = PDFParserTool(config=parser_config)

    async def fake_simple(url_or_path, config=None):
        # Echo the arguments back so the assertions can inspect the routing.
        echoed = {"source": url_or_path, "config": config}
        return {"content": "simple", "metadata": echoed}

    def fake_scraper_to_result(out):
        # The fake backend already yields a plain dict; pass it through.
        return out

    monkeypatch.setattr("akd_ext.tools.pdf_parser._run_akd_simple", fake_simple)
    monkeypatch.setattr("akd_ext.tools.pdf_parser._scraper_to_result", fake_scraper_to_result)

    request = PDFParserToolInputSchema(
        url_or_path="https://example.com/test.pdf",
        mode="fast",
    )
    result = await tool.arun(request)

    assert result.content == "simple"
    assert result.metadata["backend"] == "akd_simple"
    assert result.metadata["config"] == {"foo": "bar"}
|
|
||
|
|
||
@pytest.mark.asyncio
async def test_pdf_parser_defaults_non_fast_to_akd_docling(monkeypatch):
    """A non-fast mode without a backend hint is routed to the akd_docling backend."""
    docling_config = DoclingScraperConfig(pdf_mode="accurate", do_table_structure=False, use_ocr=True)
    tool = PDFParserTool(config=PDFParserToolConfig(akd_docling_config=docling_config))

    async def fake_docling(url_or_path, mode, config=None):
        # Echo the arguments back so the assertions can inspect the routing.
        echoed = {"source": url_or_path, "config": config}
        return {"content": f"docling-{mode}", "metadata": echoed}

    def fake_scraper_to_result(out):
        # The fake backend already yields a plain dict; pass it through.
        return out

    monkeypatch.setattr("akd_ext.tools.pdf_parser._run_akd_docling", fake_docling)
    monkeypatch.setattr("akd_ext.tools.pdf_parser._scraper_to_result", fake_scraper_to_result)

    request = PDFParserToolInputSchema(
        url_or_path="https://example.com/test.pdf",
        mode="accurate",
    )
    result = await tool.arun(request)

    assert result.content == "docling-accurate"
    assert result.metadata["backend"] == "akd_docling"
    forwarded = result.metadata["config"]
    assert isinstance(forwarded, DoclingScraperConfig)
    assert forwarded.use_ocr is True
|
|
||
|
|
||
@pytest.mark.asyncio
async def test_pdf_parser_unsupported_backend(monkeypatch):
    """An unknown backend name reaching ``_arun`` raises ``ValueError``."""
    tool = PDFParserTool()

    # model_construct skips validation, which is the only way to get an
    # invalid backend_hint into the runtime fallback branch.
    raw_fields = {
        "url_or_path": "https://example.com/test.pdf",
        "mode": "fast",
        "backend_hint": "unknown_backend",
        "return_format": "markdown",
    }
    params = PDFParserToolInputSchema.model_construct(**raw_fields)

    with pytest.raises(ValueError, match="Unsupported backend"):
        await tool._arun(params)
|
|
||
|
|
||
def test_pdf_parser_registered_in_mcp_registry():
    """Importing ``akd_ext.tools`` makes PDFParserTool visible in the MCP registry."""
    import akd_ext.tools  # noqa: F401

    registered_names = set()
    for tool in MCPToolRegistry().get_tools():
        registered_names.add(tool.__name__)

    assert "PDFParserTool" in registered_names
|
|
||
|
|
||
def test_normalize_local_windows_path_keeps_path():
    """A Windows-style drive path still ends with its file name after normalization."""
    lowered = _normalize_url_or_path("C:/temp/file.pdf").lower()
    assert lowered.endswith("file.pdf")
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please clarify what you mean by "local filesystem path".
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
My initial impression was that the server runs on the user's own machine; is that incorrect?
If it is, we can remove the local file path option, as it is just a holdover from my benchmarking process.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
By local filesystem path, I mean a path that is resolvable by the AKD server process at runtime (e.g., /data/docs/a.pdf or C:\docs\a.pdf when server runs there). If the server is remote, a path on the client machine is not accessible; in that case HTTP(S) URL or accessible file:// URI should be used.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Agreed! Since the MCP server will be kept away from local paths, it would be good to reflect that.