Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 43 additions & 35 deletions akd_ext/tools/__init__.py
Original file line number Diff line number Diff line change
@@ -1,41 +1,49 @@
"""Tools module for akd_ext."""

from .dummy import DummyInputSchema, DummyOutputSchema, DummyTool
from .sde_search import (
SDEDocument,
SDESearchTool,
SDESearchToolConfig,
SDESearchToolInputSchema,
SDESearchToolOutputSchema,
)
from .code_search.code_signals import (
CodeSignalsSearchInputSchema,
CodeSignalsSearchOutputSchema,
CodeSignalsSearchTool,
CodeSignalsSearchToolConfig,
)
from .code_search.repository_search import (
RepositorySearchTool,
RepositorySearchToolInputSchema,
RepositorySearchToolOutputSchema,
RepositorySearchToolConfig,
# from .dummy import DummyInputSchema, DummyOutputSchema, DummyTool
# from .sde_search import (
# SDEDocument,
# SDESearchTool,
# SDESearchToolConfig,
# SDESearchToolInputSchema,
# SDESearchToolOutputSchema,
# )
# from .code_search.code_signals import (
# CodeSignalsSearchInputSchema,
# CodeSignalsSearchOutputSchema,
# CodeSignalsSearchTool,
# CodeSignalsSearchToolConfig,
# )
# from .code_search.repository_search import (
# RepositorySearchTool,
# RepositorySearchToolInputSchema,
# RepositorySearchToolOutputSchema,
# RepositorySearchToolConfig,
# )
from .pdf_parser import (
PDFParserTool,
PDFParserToolInputSchema,
PDFParserToolOutputSchema,
)

__all__ = [
"DummyTool",
"DummyInputSchema",
"DummyOutputSchema",
"SDESearchTool",
"SDESearchToolInputSchema",
"SDESearchToolOutputSchema",
"SDESearchToolConfig",
"SDEDocument",
"CodeSignalsSearchInputSchema",
"CodeSignalsSearchOutputSchema",
"CodeSignalsSearchTool",
"CodeSignalsSearchToolConfig",
"RepositorySearchTool",
"RepositorySearchToolInputSchema",
"RepositorySearchToolOutputSchema",
"RepositorySearchToolConfig",
# "DummyTool",
# "DummyInputSchema",
# "DummyOutputSchema",
# "SDESearchTool",
# "SDESearchToolInputSchema",
# "SDESearchToolOutputSchema",
# "SDESearchToolConfig",
# "SDEDocument",
# "CodeSignalsSearchInputSchema",
# "CodeSignalsSearchOutputSchema",
# "CodeSignalsSearchTool",
# "CodeSignalsSearchToolConfig",
# "RepositorySearchTool",
# "RepositorySearchToolInputSchema",
# "RepositorySearchToolOutputSchema",
# "RepositorySearchToolConfig",
"PDFParserTool",
"PDFParserToolInputSchema",
"PDFParserToolOutputSchema",
]
92 changes: 92 additions & 0 deletions akd_ext/tools/pdf_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
"""PDF parser tool using AKD core backends."""

from __future__ import annotations

import sys
from pathlib import Path
from typing import Any, Literal

from akd._base import InputSchema, OutputSchema
from akd.tools import BaseTool
from akd.tools.scrapers.pdf_scrapers import (
ScraperToolOutputSchema,
SimplePDFScraper,
)
from pydantic import Field

from akd_ext.mcp import mcp_tool


class PDFParserToolInputSchema(InputSchema):
    """Input schema for PDF parsing."""

    # Required: where the PDF lives.
    url: str = Field(default=..., description="HTTP(S) URL to a PDF")
    # Optional: requested output format; falls back to markdown when omitted.
    return_format: Literal["markdown", "html", "json"] = Field(
        "markdown",
        description="Preferred output format hint for backend parsing",
    )


class PDFParserToolOutputSchema(OutputSchema):
    """Output schema for parsed PDF content."""

    # Required: the text extracted from the PDF.
    content: str = Field(default=..., description="Parsed text content")
    # Optional: starts as a fresh empty dict for every instance.
    metadata: dict[str, Any] = Field(
        description="Parser and document metadata",
        default_factory=dict,
    )


def _normalize_url_or_path(url: str) -> str:
    """Return *url* in the form that is passed on to the PDF scraper.

    Strings that already begin with ``http://``, ``https://`` or ``file://``
    (matched case-insensitively) are returned untouched. Anything else is
    treated as a filesystem path: ``~`` is expanded and the path is resolved
    to an absolute one. On Windows the resolved path is returned as a plain
    string; on every other platform it is returned as a ``file://`` URI.
    """
    recognised_prefixes = ("http://", "https://", "file://")
    if url.lower().startswith(recognised_prefixes):
        return url

    resolved = Path(url).expanduser().resolve()
    on_windows = sys.platform.startswith("win")
    return str(resolved) if on_windows else resolved.as_uri()


async def _run_akd_simple(url: str, config: dict[str, Any] | None = None) -> ScraperToolOutputSchema:
    """Run AKD's ``SimplePDFScraper`` on *url* and return its raw output.

    ``config`` is forwarded unchanged to the scraper constructor. *url* is
    passed through ``_normalize_url_or_path`` before the scraper's input
    schema is built from it.
    """
    backend = SimplePDFScraper(config=config)
    target = _normalize_url_or_path(url)
    request = backend.input_schema(url=target)
    scraped = await backend.arun(request)
    return scraped


def _scraper_to_result(out: ScraperToolOutputSchema) -> dict[str, Any]:
    """Flatten a scraper output into a plain ``content`` / ``metadata`` dict.

    The metadata object is serialized by calling its ``model_dump()``.
    """
    text = out.content
    meta = out.metadata.model_dump()
    return dict(content=text, metadata=meta)


@mcp_tool
class PDFParserTool(BaseTool[PDFParserToolInputSchema, PDFParserToolOutputSchema]):
    """Parse a PDF into LLM-ready text.

    Given an HTTP(S) URL to a PDF, returns the parsed text content plus a
    metadata dict (backend, return_format, plus parser/document metadata).

    ``return_format`` (``markdown`` by default, ``html`` or ``json``) is
    currently recorded in the returned metadata as a hint only; it is not
    forwarded to the backend, so the text is whatever the simple scraper
    produces.

    Uses a simple PDF scraper backend — best for digital-native PDFs. Does
    not perform OCR (scanned or image-only PDFs will yield poor or empty
    text), does not discover or search for PDFs (URL must be supplied),
    and does not summarize.
    """

    input_schema = PDFParserToolInputSchema
    output_schema = PDFParserToolOutputSchema

    async def _arun(self, params: PDFParserToolInputSchema) -> PDFParserToolOutputSchema:
        """Parse ``params.url`` with the simple backend and wrap the result.

        Args:
            params: Validated input; only ``url`` is passed to the backend,
                ``return_format`` is echoed into the metadata.

        Returns:
            The parsed text (empty string when the backend yields nothing)
            and the backend metadata extended with ``backend`` and
            ``return_format`` keys.
        """
        result = _scraper_to_result(await _run_akd_simple(params.url))

        raw_metadata = result.get("metadata", {})
        if isinstance(raw_metadata, dict):
            # Copy so the keys added below never leak into a dict that the
            # backend helpers (or a caller's stub) still hold a reference to.
            metadata = dict(raw_metadata)
        else:
            # Keep unexpected metadata shapes instead of dropping them.
            metadata = {"raw_metadata": raw_metadata}
        metadata["backend"] = "akd_simple"
        metadata["return_format"] = params.return_format

        return PDFParserToolOutputSchema(
            # ``or ""`` turns a None/empty content into an empty string.
            content=str(result.get("content", "") or ""),
            metadata=metadata,
        )
10 changes: 8 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,12 @@ classifiers = [
]
requires-python = ">=3.12"
dependencies = [
"akd @ git+https://github.com/NASA-IMPACT/akd-core.git@develop",
"akd @ git+https://${GITHUB_TOKEN}@github.com/NASA-IMPACT/akd-core.git@develop",
"docling>=2.93.0",
"fastmcp>=2.0.0,<3.2.4",
"griffe>=1.0.0,<2",
"openai-agents>=0.6.7",
"opencv-python-headless>=4.10",
"pydantic-ai>=1.81.0",
"PyGithub>=2.1.1",
]
Expand Down Expand Up @@ -73,7 +75,11 @@ markers = [
]

[tool.uv]
override-dependencies = ["wrapt>=1.14", "mistralai<2"]
override-dependencies = [
"wrapt>=1.14",
"mistralai<2",
"opencv-python ; sys_platform == 'never'",
]

[tool.ruff]
target-version = "py312"
Expand Down
44 changes: 44 additions & 0 deletions tests/tools/test_pdf_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
"""Tests for PDF parser tool routing and errors."""

import pytest

from akd_ext.mcp.registry import MCPToolRegistry
from akd_ext.tools.pdf_parser import (
PDFParserTool,
PDFParserToolInputSchema,
_normalize_url_or_path,
)


@pytest.mark.asyncio
async def test_pdf_parser_routes_to_akd_simple(monkeypatch):
    """The tool calls the simple backend and tags the result metadata."""

    async def stub_run(url):
        # Stands in for the real scraper call; echoes the URL into metadata.
        return {"content": "simple", "metadata": {"source": url}}

    # The stub already returns a plain dict, so the converter is an identity.
    monkeypatch.setattr("akd_ext.tools.pdf_parser._run_akd_simple", stub_run)
    monkeypatch.setattr("akd_ext.tools.pdf_parser._scraper_to_result", lambda out: out)

    request = PDFParserToolInputSchema(url="https://example.com/test.pdf")
    response = await PDFParserTool().arun(request)

    assert response.content == "simple"
    tags = response.metadata
    assert tags["backend"] == "akd_simple"
    assert tags["return_format"] == "markdown"


def test_pdf_parser_registered_in_mcp_registry():
    """Importing the tools package registers the PDF parser with MCP."""
    import akd_ext.tools  # noqa: F401

    registered = [tool.__name__ for tool in MCPToolRegistry().get_tools()]
    assert "PDFParserTool" in registered


def test_normalize_local_windows_path_keeps_path():
    """A drive-letter path stays a plain path on Windows, a file URI elsewhere.

    The suffix check alone passes for either return form, so the form that
    applies to the current platform is asserted explicitly as well.
    """
    import sys

    normalized = _normalize_url_or_path("C:/temp/file.pdf")
    assert normalized.lower().endswith("file.pdf")

    if sys.platform.startswith("win"):
        # Windows: the resolved filesystem path is returned, not a URI.
        assert not normalized.lower().startswith("file://")
    else:
        # Elsewhere the input is treated as a relative path and returned
        # as a file:// URI.
        assert normalized.startswith("file://")
Loading