Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions akd_ext/tools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,12 @@
RepositorySearchToolOutputSchema,
RepositorySearchToolConfig,
)
from .pdf_parser import (
PDFParserTool,
PDFParserToolConfig,
PDFParserToolInputSchema,
PDFParserToolOutputSchema,
)

__all__ = [
"DummyTool",
Expand All @@ -38,4 +44,8 @@
"RepositorySearchToolInputSchema",
"RepositorySearchToolOutputSchema",
"RepositorySearchToolConfig",
"PDFParserTool",
"PDFParserToolConfig",
"PDFParserToolInputSchema",
"PDFParserToolOutputSchema",
]
144 changes: 144 additions & 0 deletions akd_ext/tools/pdf_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
"""PDF parser tool using AKD core backends."""

from __future__ import annotations

import sys
from pathlib import Path
from typing import Any, Literal
from akd._base import InputSchema, OutputSchema
from akd.tools import BaseTool, BaseToolConfig
from akd.tools.scrapers import (
DoclingScraper,
DoclingScraperConfig,
ScraperToolOutputSchema,
SimplePDFScraper,
)
from pydantic import Field

from akd_ext.mcp import mcp_tool

# Parsing modes accepted by the ``mode`` field of the input schema.
Mode = Literal["fast", "accurate", "ocr"]
# Backend identifiers accepted by the ``backend_hint`` field of the input schema.
BackendHint = Literal["akd_simple", "akd_docling"]


class PDFParserToolInputSchema(InputSchema):
"""Input schema for PDF parsing."""

url_or_path: str = Field(..., description="HTTP(S) URL or local filesystem path to a PDF")
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please clarify what you mean by "local filesystem path".

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was initially under the impression that the server runs on the user's own machine. Is that incorrect?

If incorrect, we can remove the local file path option, as it is just a holdover from my benchmarking process.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

By "local filesystem path", I mean a path that is resolvable by the AKD server process at runtime (e.g., /data/docs/a.pdf, or C:\docs\a.pdf when the server runs there). If the server is remote, a path on the client machine is not accessible; in that case, an HTTP(S) URL or an accessible file:// URI should be used.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed! Since the MCP server will stay away from local paths, it would be good to reflect that in the field description.

mode: Mode = Field(default="accurate", description="Parsing mode: fast, accurate, or ocr")
backend_hint: BackendHint | None = Field(
default=None,
description="Optional backend override (akd_simple or akd_docling)",
)
return_format: Literal["markdown", "html", "json"] = Field(
default="markdown",
description="Preferred output format hint for backend parsing",
)


class PDFParserToolOutputSchema(OutputSchema):
    """Result produced by the PDF parser tool.

    Attributes:
        content: Text extracted from the PDF (required).
        metadata: Mapping holding parser and document metadata; a fresh
            empty dict is used when none is supplied.
    """

    content: str = Field(
        ...,
        description="Parsed text content",
    )
    metadata: dict[str, Any] = Field(
        description="Parser and document metadata",
        default_factory=dict,
    )


class PDFParserToolConfig(BaseToolConfig):
    """Per-backend settings for the PDF parser tool.

    Attributes:
        akd_simple_config: Optional dict passed as ``config`` to
            ``SimplePDFScraper``; ``None`` by default.
        akd_docling_config: Optional ``DoclingScraperConfig`` passed as
            ``config`` to ``DoclingScraper``; ``None`` by default.
    """

    akd_simple_config: dict[str, Any] | None = Field(
        description="Optional configuration forwarded to SimplePDFScraper.",
        default=None,
    )
    akd_docling_config: DoclingScraperConfig | None = Field(
        description="Optional configuration forwarded to DoclingScraper.",
        default=None,
    )


def _normalize_url_or_path(url_or_path: str) -> str:
    """Return *url_or_path* in the form passed to the scrapers as ``url``.

    Values that start with ``http://``, ``https://`` or ``file://``
    (case-insensitive) are treated as URLs and returned with surrounding
    whitespace removed. Anything else is treated as a filesystem path: it is
    user-expanded and resolved to an absolute path by the current process,
    then returned as a plain path string on Windows and as a ``file://`` URI
    on every other platform.

    Args:
        url_or_path: URL or filesystem path pointing at a PDF.

    Returns:
        The normalized URL, absolute path string, or ``file://`` URI.
    """
    # Scheme detection runs on the stripped value so that a URL carrying
    # stray whitespace (e.g. a trailing newline) is not mistaken for a
    # relative filesystem path.
    candidate = url_or_path.strip()
    if candidate.lower().startswith(("http://", "https://", "file://")):
        return candidate

    # Local paths are resolved from the original value, unchanged.
    resolved = Path(url_or_path).expanduser().resolve()
    if sys.platform.startswith("win"):
        return str(resolved)
    # The URI form is only built on the platforms that actually return it.
    return resolved.as_uri()

async def _run_akd_simple(
    url_or_path: str,
    config: dict[str, Any] | None = None,
) -> ScraperToolOutputSchema:
    """Parse a PDF with ``SimplePDFScraper``.

    Args:
        url_or_path: URL or filesystem path of the PDF; normalized with
            ``_normalize_url_or_path`` before being handed to the scraper.
        config: Optional configuration passed to ``SimplePDFScraper``.

    Returns:
        The value returned by the scraper's ``arun`` call.
    """
    pdf_scraper = SimplePDFScraper(config=config)
    target = _normalize_url_or_path(url_or_path)
    request = pdf_scraper.input_schema(url=target)
    scraped = await pdf_scraper.arun(request)
    return scraped


async def _run_akd_docling(
    url_or_path: str,
    mode: Mode,
    config: DoclingScraperConfig | None = None,
) -> ScraperToolOutputSchema:
    """Parse a PDF with ``DoclingScraper``.

    A mode-specific default configuration is always built:

    * ``"fast"``: fast PDF mode, no table structure, no OCR.
    * ``"accurate"``: accurate PDF mode with table structure, no OCR.
    * any other mode: accurate PDF mode with table structure and OCR.

    That default is only used when ``config`` is falsy; a supplied config
    takes precedence and ``mode`` then has no effect on the scraper settings.

    Args:
        url_or_path: URL or filesystem path of the PDF; normalized with
            ``_normalize_url_or_path`` before being handed to the scraper.
        mode: Requested parsing mode.
        config: Optional explicit scraper configuration.

    Returns:
        The value returned by the scraper's ``arun`` call.
    """
    if mode == "fast":
        preset = {"pdf_mode": "fast", "do_table_structure": False, "use_ocr": False}
    else:
        # "accurate" and the OCR fallback differ only in the OCR switch.
        preset = {
            "pdf_mode": "accurate",
            "do_table_structure": True,
            "use_ocr": mode != "accurate",
        }
    default_cfg = DoclingScraperConfig(**preset)

    docling = DoclingScraper(config=config or default_cfg)
    target = _normalize_url_or_path(url_or_path)
    request = docling.input_schema(url=target)
    scraped = await docling.arun(request)
    return scraped


def _scraper_to_result(out: ScraperToolOutputSchema) -> dict[str, Any]:
    """Convert scraper output into a plain dict.

    Args:
        out: Scraper output exposing ``content`` and a ``metadata`` object
            with a ``model_dump()`` method.

    Returns:
        A dict with ``"content"`` (taken as is) and ``"metadata"`` (the
        result of ``out.metadata.model_dump()``).
    """
    content = out.content
    dumped_metadata = out.metadata.model_dump()
    return {"content": content, "metadata": dumped_metadata}


@mcp_tool
class PDFParserTool(BaseTool[PDFParserToolInputSchema, PDFParserToolOutputSchema]):
    """Tool that parses a PDF into text through one of the AKD scraper backends.

    The backend is taken from ``backend_hint`` when given; otherwise ``"fast"``
    mode selects ``akd_simple`` and every other mode selects ``akd_docling``.
    ``return_format`` is recorded in the output metadata but is not passed to
    either backend by this class.
    """

    input_schema = PDFParserToolInputSchema
    output_schema = PDFParserToolOutputSchema
    config_schema = PDFParserToolConfig

    async def _arun(self, params: PDFParserToolInputSchema) -> PDFParserToolOutputSchema:
        """Run the selected backend and wrap its result in the output schema.

        Args:
            params: Validated tool input.

        Returns:
            Parsed content plus metadata extended with the ``backend`` used
            and the requested ``return_format``.

        Raises:
            ValueError: If the resolved backend is neither ``akd_simple`` nor
                ``akd_docling`` (only reachable when schema validation was
                bypassed).
        """
        if params.backend_hint is not None:
            backend = params.backend_hint
        elif params.mode == "fast":
            backend = "akd_simple"
        else:
            backend = "akd_docling"

        tool_config = self.config

        if backend == "akd_simple":
            scraped = await _run_akd_simple(
                params.url_or_path,
                config=tool_config.akd_simple_config,
            )
        elif backend == "akd_docling":
            scraped = await _run_akd_docling(
                params.url_or_path,
                params.mode,
                config=tool_config.akd_docling_config,
            )
        else:
            raise ValueError(f"Unsupported backend: {backend!r}")
        result = _scraper_to_result(scraped)

        metadata = result.get("metadata", {})
        if not isinstance(metadata, dict):
            # Keep non-dict metadata instead of dropping it.
            metadata = {"raw_metadata": metadata}
        metadata.update(backend=backend, return_format=params.return_format)

        # Falsy content (None, empty) collapses to an empty string.
        text = result.get("content", "") or ""
        return PDFParserToolOutputSchema(content=str(text), metadata=metadata)
Comment on lines +103 to +144
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we also consider adding a general scraper tool? AKD core already has a composite scraper to which we can pass scraper objects.

Maybe something like GeneralScraperTool would be a good addition as well; the composite scraper would then handle web page URLs, PDFs, or anything else directly.

95 changes: 95 additions & 0 deletions tests/tools/test_pdf_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
"""Tests for PDF parser tool routing and errors."""

import pytest

from akd_ext.mcp.registry import MCPToolRegistry
from akd_ext.tools.pdf_parser import (
DoclingScraperConfig,
PDFParserTool,
PDFParserToolConfig,
PDFParserToolInputSchema,
_normalize_url_or_path,
)


@pytest.mark.asyncio
async def test_pdf_parser_defaults_fast_to_akd_simple(monkeypatch):
    """Fast mode without a backend hint is routed to the akd_simple backend."""
    tool = PDFParserTool(config=PDFParserToolConfig(akd_simple_config={"foo": "bar"}))

    async def fake_simple(url_or_path, config=None):
        # Echo the arguments so the assertions can check what was forwarded.
        echoed = {"source": url_or_path, "config": config}
        return {"content": "simple", "metadata": echoed}

    def passthrough(out):
        # The fake already returns a plain dict, so no conversion is needed.
        return out

    module_path = "akd_ext.tools.pdf_parser"
    monkeypatch.setattr(f"{module_path}._run_akd_simple", fake_simple)
    monkeypatch.setattr(f"{module_path}._scraper_to_result", passthrough)

    request = PDFParserToolInputSchema(
        url_or_path="https://example.com/test.pdf",
        mode="fast",
    )
    result = await tool.arun(request)

    assert result.content == "simple"
    assert result.metadata["backend"] == "akd_simple"
    assert result.metadata["config"] == {"foo": "bar"}


@pytest.mark.asyncio
async def test_pdf_parser_defaults_non_fast_to_akd_docling(monkeypatch):
    """Non-fast mode without a backend hint is routed to the akd_docling backend."""
    docling_config = DoclingScraperConfig(pdf_mode="accurate", do_table_structure=False, use_ocr=True)
    tool = PDFParserTool(config=PDFParserToolConfig(akd_docling_config=docling_config))

    async def fake_docling(url_or_path, mode, config=None):
        # Echo the arguments so the assertions can check what was forwarded.
        echoed = {"source": url_or_path, "config": config}
        return {"content": f"docling-{mode}", "metadata": echoed}

    def passthrough(out):
        # The fake already returns a plain dict, so no conversion is needed.
        return out

    module_path = "akd_ext.tools.pdf_parser"
    monkeypatch.setattr(f"{module_path}._run_akd_docling", fake_docling)
    monkeypatch.setattr(f"{module_path}._scraper_to_result", passthrough)

    request = PDFParserToolInputSchema(
        url_or_path="https://example.com/test.pdf",
        mode="accurate",
    )
    result = await tool.arun(request)

    assert result.content == "docling-accurate"
    assert result.metadata["backend"] == "akd_docling"
    forwarded = result.metadata["config"]
    assert isinstance(forwarded, DoclingScraperConfig)
    assert forwarded.use_ocr is True


@pytest.mark.asyncio
async def test_pdf_parser_unsupported_backend(monkeypatch):
    """An unknown backend value makes ``_arun`` raise ``ValueError``."""
    tool = PDFParserTool()

    # model_construct skips validation, which is the only way to get an
    # out-of-schema backend_hint into the runtime fallback branch.
    raw_fields = {
        "url_or_path": "https://example.com/test.pdf",
        "mode": "fast",
        "backend_hint": "unknown_backend",
        "return_format": "markdown",
    }
    params = PDFParserToolInputSchema.model_construct(**raw_fields)

    with pytest.raises(ValueError, match="Unsupported backend"):
        await tool._arun(params)


def test_pdf_parser_registered_in_mcp_registry():
    """Importing ``akd_ext.tools`` makes PDFParserTool visible in the MCP registry."""
    import akd_ext.tools  # noqa: F401

    registered_names = [tool.__name__ for tool in MCPToolRegistry().get_tools()]
    assert "PDFParserTool" in registered_names


def test_normalize_local_windows_path_keeps_path():
    """A local path keeps its file name and gets the platform-specific form.

    ``_normalize_url_or_path`` returns a plain path string on Windows and a
    ``file://`` URI elsewhere, so the expected form depends on the platform
    running the test.
    """
    import sys

    normalized = _normalize_url_or_path("C:/temp/file.pdf")

    assert normalized.lower().endswith("file.pdf")
    if sys.platform.startswith("win"):
        # On Windows the path must stay a path, not become a URI.
        assert not normalized.lower().startswith("file://")
    else:
        assert normalized.startswith("file://")