NASA-IMPACT · sanzog03 · Mar 30, 2026 · Mar 31, 2026 · May 11, 2026 · May 11, 2026
diff --git a/akd_ext/tools/__init__.py b/akd_ext/tools/__init__.py
@@ -1,41 +1,49 @@
 """Tools module for akd_ext."""
 
 from .dummy import DummyInputSchema, DummyOutputSchema, DummyTool
 from .sde_search import (
     SDEDocument,
     SDESearchTool,
     SDESearchToolConfig,
     SDESearchToolInputSchema,
     SDESearchToolOutputSchema,
 )
 from .code_search.code_signals import (
     CodeSignalsSearchInputSchema,
     CodeSignalsSearchOutputSchema,
     CodeSignalsSearchTool,
     CodeSignalsSearchToolConfig,
 )
 from .code_search.repository_search import (
     RepositorySearchTool,
     RepositorySearchToolInputSchema,
     RepositorySearchToolOutputSchema,
     RepositorySearchToolConfig,
 )
+from .pdf_parser import (
+    PDFParserTool,
+    PDFParserToolInputSchema,
+    PDFParserToolOutputSchema,
+)
 
 __all__ = [
     "DummyTool",
     "DummyInputSchema",
     "DummyOutputSchema",
     "SDESearchTool",
     "SDESearchToolInputSchema",
     "SDESearchToolOutputSchema",
     "SDESearchToolConfig",
     "SDEDocument",
     "CodeSignalsSearchInputSchema",
     "CodeSignalsSearchOutputSchema",
     "CodeSignalsSearchTool",
     "CodeSignalsSearchToolConfig",
     "RepositorySearchTool",
     "RepositorySearchToolInputSchema",
     "RepositorySearchToolOutputSchema",
     "RepositorySearchToolConfig",
+    "PDFParserTool",
+    "PDFParserToolInputSchema",
+    "PDFParserToolOutputSchema",
 ]
diff --git a/akd_ext/tools/pdf_parser.py b/akd_ext/tools/pdf_parser.py
@@ -0,0 +1,92 @@
+"""PDF parser tool using AKD core backends."""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+from typing import Any, Literal
+
+from akd._base import InputSchema, OutputSchema
+from akd.tools import BaseTool
+from akd.tools.scrapers.pdf_scrapers import (
+    ScraperToolOutputSchema,
+    SimplePDFScraper,
+)
+from pydantic import Field
+
+from akd_ext.mcp import mcp_tool
+
+
+class PDFParserToolInputSchema(InputSchema):
+    """Request payload for the PDF parser: the document location and a format hint."""
+
+    # Required: where the PDF to parse lives.
+    url: str = Field(
+        ...,
+        description="HTTP(S) URL to a PDF",
+    )
+    # Optional: one of "markdown", "html" or "json"; "markdown" when omitted.
+    return_format: Literal["markdown", "html", "json"] = Field(
+        description="Preferred output format hint for backend parsing",
+        default="markdown",
+    )
+
+
+class PDFParserToolOutputSchema(OutputSchema):
+    """Result payload of the PDF parser: extracted text plus a metadata mapping."""
+
+    # Required: the text extracted from the PDF.
+    content: str = Field(
+        ...,
+        description="Parsed text content",
+    )
+    # Optional: a fresh empty dict is created when no metadata is supplied.
+    metadata: dict[str, Any] = Field(
+        description="Parser and document metadata",
+        default_factory=dict,
+    )
+
+
+def _normalize_url_or_path(url: str) -> str:
+    """Return *url* in the form that is handed to the scraper.
+
+    Strings starting with ``http://``, ``https://`` or ``file://`` (matched
+    case-insensitively) are returned untouched. Anything else is treated as a
+    local path: it is user-expanded and resolved to an absolute path, then
+    returned as a plain path string on Windows or as a ``file://`` URI on
+    every other platform.
+    """
+    if url.lower().startswith(("http://", "https://", "file://")):
+        return url
+
+    resolved = Path(url).expanduser().resolve()
+
+    # Only build the representation that is actually returned.
+    if sys.platform.startswith("win"):
+        return str(resolved)
+    return resolved.as_uri()
+
+
+async def _run_akd_simple(url: str, config: dict[str, Any] | None = None) -> ScraperToolOutputSchema:
+    """Run ``SimplePDFScraper`` against *url* and return its output unchanged.
+
+    ``config`` is passed straight to the scraper constructor. *url* goes
+    through ``_normalize_url_or_path`` before the scraper input is built.
+    """
+    pdf_scraper = SimplePDFScraper(config=config)
+    target = _normalize_url_or_path(url)
+    request = pdf_scraper.input_schema(url=target)
+    return await pdf_scraper.arun(request)
+
+
+def _scraper_to_result(out: ScraperToolOutputSchema) -> dict[str, Any]:
+    """Flatten scraper output into a dict with ``content`` and ``metadata`` keys.
+
+    The metadata object is converted with its ``model_dump()`` method.
+    """
+    text = out.content
+    dumped_metadata = out.metadata.model_dump()
+    return dict(content=text, metadata=dumped_metadata)
+
+
+@mcp_tool
+class PDFParserTool(BaseTool[PDFParserToolInputSchema, PDFParserToolOutputSchema]):
+    """Parse a PDF into LLM-ready text.
+
+    Given an HTTP(S) URL to a PDF, returns the parsed text content plus a
+    metadata dict (backend, return_format, plus parser/document metadata).
+
+    ``return_format`` (``markdown`` by default, ``html`` or ``json``) is a
+    hint only: it is echoed back in the metadata and is not forwarded to the
+    scraper, so the returned text is whatever the simple backend produces.
+
+    Uses a simple PDF scraper backend — best for digital-native PDFs. Does
+    not perform OCR (scanned or image-only PDFs will yield poor or empty
+    text), does not discover or search for PDFs (URL must be supplied),
+    and does not summarize.
+    """
+
+    input_schema = PDFParserToolInputSchema
+    output_schema = PDFParserToolOutputSchema
+
+    async def _arun(self, params: PDFParserToolInputSchema) -> PDFParserToolOutputSchema:
+        """Scrape ``params.url`` and wrap the result in the output schema.
+
+        Only the URL is handed to the backend; ``params.return_format`` is
+        recorded in the returned metadata next to the backend name.
+        """
+        result = _scraper_to_result(await _run_akd_simple(params.url))
+
+        raw_metadata = result.get("metadata", {})
+        if not isinstance(raw_metadata, dict):
+            # Preserve a non-dict value instead of dropping it.
+            raw_metadata = {"raw_metadata": raw_metadata}
+
+        # Build a fresh dict so the mapping owned by the helper's result is
+        # never mutated in place.
+        metadata = {
+            **raw_metadata,
+            "backend": "akd_simple",
+            "return_format": params.return_format,
+        }
+
+        return PDFParserToolOutputSchema(
+            content=str(result.get("content", "") or ""),
+            metadata=metadata,
+        )
diff --git a/pyproject.toml b/pyproject.toml
@@ -17,10 +17,12 @@ classifiers = [
 ]
 requires-python = ">=3.12"
 dependencies = [
     "akd @ git+https://github.com/NASA-IMPACT/akd-core.git@develop",
+    "docling>=2.93.0",
     "fastmcp>=2.0.0,<3.2.4",
     "griffe>=1.0.0,<2",
     "openai-agents>=0.6.7",
+    "opencv-python-headless>=4.10",
     "pydantic-ai>=1.81.0",
     "PyGithub>=2.1.1",
 ]
@@ -73,7 +75,11 @@ markers = [
 ]
 
 [tool.uv]
-override-dependencies = ["wrapt>=1.14", "mistralai<2"]
+override-dependencies = [
+    "wrapt>=1.14",
+    "mistralai<2",
+    "opencv-python ; sys_platform == 'never'",  # marker matches no platform, so this override keeps opencv-python out of the resolution; presumably to avoid clashing with opencv-python-headless — confirm
+]
 
 [tool.ruff]
 target-version = "py312"

diff --git a/tests/tools/test_pdf_parser.py b/tests/tools/test_pdf_parser.py
@@ -0,0 +1,44 @@
+"""Tests for PDF parser tool routing, MCP registration, and path normalization."""
+
+import pytest
+
+from akd_ext.mcp.registry import MCPToolRegistry
+from akd_ext.tools.pdf_parser import (
+    PDFParserTool,
+    PDFParserToolInputSchema,
+    _normalize_url_or_path,
+)
+
+
+@pytest.mark.asyncio
+async def test_pdf_parser_routes_to_akd_simple(monkeypatch):
+    """The tool returns the backend's content and stamps backend/format metadata."""
+    tool = PDFParserTool()
+
+    async def stub_run(url):
+        # Stand-in for the scraper call: echoes the URL inside the metadata.
+        return {"content": "simple", "metadata": {"source": url}}
+
+    # The stub already yields a plain dict, so the conversion step is a pass-through.
+    monkeypatch.setattr("akd_ext.tools.pdf_parser._run_akd_simple", stub_run)
+    monkeypatch.setattr("akd_ext.tools.pdf_parser._scraper_to_result", lambda out: out)
+
+    request = PDFParserToolInputSchema(url="https://example.com/test.pdf")
+    result = await tool.arun(request)
+
+    assert result.content == "simple"
+    assert result.metadata["backend"] == "akd_simple"
+    assert result.metadata["return_format"] == "markdown"
+
+
+def test_pdf_parser_registered_in_mcp_registry():
+    """After importing ``akd_ext.tools``, ``PDFParserTool`` is listed by the MCP registry."""
+    import akd_ext.tools  # noqa: F401
+
+    registered = [tool.__name__ for tool in MCPToolRegistry().get_tools()]
+    assert "PDFParserTool" in registered
+
+
+def test_normalize_local_windows_path_keeps_path():
+    """Local paths stay plain paths on Windows and become ``file://`` URIs elsewhere."""
+    import sys
+
+    normalized = _normalize_url_or_path("C:/temp/file.pdf")
+
+    assert normalized.lower().endswith("file.pdf")
+    if sys.platform.startswith("win"):
+        # The Windows branch returns the resolved filesystem path, not a URI.
+        assert not normalized.lower().startswith("file://")
+    else:
+        # Every other platform gets the ``Path.as_uri()`` form.
+        assert normalized.startswith("file://")