From 04ccb8bbb59e6f4f16af53b238dd60da408ac201 Mon Sep 17 00:00:00 2001
From: pushwitha
Date: Fri, 24 Apr 2026 15:10:02 -0500
Subject: [PATCH] Add ASCL tool for astrophysics code discovery
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

One MCP tool wrapping the Astrophysics Source Code Library search API
(https://ascl.net/api/search/) for code discovery.

ASCLSearchTool: search ASCL for codes by name or capability, or look up a
specific entry by its ASCL id. Returns ascl_id, title, credit
(semicolon-separated authors), abstract, site_list (all URLs), bibcode (the
ASCL record's own ADS bibcode), described_in (ADS URLs for description
papers), used_in (ADS URLs for papers that used the code), used_in_count,
and views.

The tool auto-detects ASCL id-shaped queries (e.g. '1303.002',
'ascl:1303.002') and rewrites them as ascl_id:"..." for exact lookup, so a
single tool covers both name search and id lookup.

Design:
- ASCL API quirks handled inside the tool: list-valued fields (site_list,
  described_in, used_in) come back as PHP-serialized strings, parsed via
  _parse_php_array. `views` is a string and is converted to int.
- Query must be quoted ("...") — the API returns 404 otherwise.
- Per-request httpx.AsyncClient, errors returned via output.error, no retry
  logic.
---
 akd_ext/tools/__init__.py |  12 +++
 akd_ext/tools/ascl.py     | 253 ++++++++++++++++++++++++++++++++++++++++++++++++++
 tests/tools/test_ascl.py  | 165 ++++++++++++++++++++++++++++++++
 3 files changed, 430 insertions(+)
 create mode 100644 akd_ext/tools/ascl.py
 create mode 100644 tests/tools/test_ascl.py

diff --git a/akd_ext/tools/__init__.py b/akd_ext/tools/__init__.py
index fb1328e..2d2ec26 100644
--- a/akd_ext/tools/__init__.py
+++ b/akd_ext/tools/__init__.py
@@ -20,6 +20,13 @@
     RepositorySearchToolOutputSchema,
     RepositorySearchToolConfig,
 )
+from .ascl import (
+    ASCLEntry,
+    ASCLSearchTool,
+    ASCLSearchToolConfig,
+    ASCLSearchToolInputSchema,
+    ASCLSearchToolOutputSchema,
+)
 
 __all__ = [
     "DummyTool",
@@ -38,4 +45,9 @@
     "RepositorySearchToolInputSchema",
     "RepositorySearchToolOutputSchema",
     "RepositorySearchToolConfig",
+    "ASCLEntry",
+    "ASCLSearchTool",
+    "ASCLSearchToolConfig",
+    "ASCLSearchToolInputSchema",
+    "ASCLSearchToolOutputSchema",
 ]
diff --git a/akd_ext/tools/ascl.py b/akd_ext/tools/ascl.py
new file mode 100644
index 0000000..4d7f583
--- /dev/null
+++ b/akd_ext/tools/ascl.py
@@ -0,0 +1,253 @@
+"""Astrophysics Source Code Library (ASCL) search tool.
+
+Wraps https://ascl.net/api/search/ so agents can discover astrophysics codes
+by name or capability. A single ASCL id lookup is also expressed through
+this tool — pass the id as the query (e.g. '1303.002' or 'ascl:1303.002').
+
+API docs: https://github.com/teuben/ascl-tools/tree/master/API
+Metadata: https://ascl.net/wordpress/about-ascl/metadata-schema/
+"""
+
+from __future__ import annotations
+
+import os
+import re
+
+import httpx
+from pydantic import Field
+
+from akd._base import InputSchema, OutputSchema
+from akd.tools import BaseTool, BaseToolConfig
+
+from akd_ext.mcp import mcp_tool
+
+
+# Matches any ASCL id formatted as YYMM.### or YYMM.####. We use this to
+# detect when the user's query is actually an id lookup so we can format the
+# query as `ascl_id:"..."` for an exact match.
+_ASCL_ID_RE = re.compile(r"\d{4}\.\d{3,4}")
+
+# Prefixes people commonly put in front of an ASCL id: the `ascl:` scheme or
+# the landing-page URL (http/https, optional `www.`), in any letter case.
+_ASCL_ID_PREFIX_RE = re.compile(r"(?:https?://(?:www\.)?ascl\.net/|ascl:)\s*", re.IGNORECASE)
+
+# PHP-serialized string entries look like: s:39:"https://example.com"
+# ASCL stores its list-valued fields (site_list, described_in, used_in,
+# keywords) as PHP-serialized strings, so we extract the inner string values
+# with this pattern. Verified against live API responses.
+_PHP_STRING_RE = re.compile(r's:\d+:"([^"]*)"')
+
+
+def _parse_php_array(php_str: str | list | None) -> list[str]:
+    """Pull string entries out of a PHP-serialized array.
+
+    ASCL returns list fields like::
+
+        a:1:{i:0;s:39:"https://emcee.readthedocs.io/en/v3.1.3/";}
+
+    For empty lists ASCL may return either "a:0:{}" or an empty string, so
+    both yield []. A JSON null yields [] too, and a value that is already a
+    JSON array is passed through (non-empty string items only), so a change
+    in the API's encoding degrades gracefully instead of raising TypeError.
+    """
+    if isinstance(php_str, list):
+        return [item for item in php_str if isinstance(item, str) and item]
+    if not isinstance(php_str, str) or not php_str or php_str == "a:0:{}":
+        return []
+    return _PHP_STRING_RE.findall(php_str)
+
+
+def _normalize_ascl_id(raw: str) -> str:
+    """Strip common prefixes ('ascl:', 'ASCL:', full URL) from an ASCL id.
+
+    Prefixes are matched case-insensitively and peeled off repeatedly, so
+    combined forms such as 'https://ascl.net/ascl:1303.002' reduce to the
+    bare id. A trailing slash left over from a URL is dropped as well.
+    """
+    clean = raw.strip()
+    while match := _ASCL_ID_PREFIX_RE.match(clean):
+        clean = clean[match.end():]
+    return clean.rstrip("/").strip()
+
+
+class ASCLSearchToolConfig(BaseToolConfig):
+    """Configuration for the ASCL search tool."""
+
+    base_url: str = Field(
+        default_factory=lambda: os.getenv("ASCL_API_URL", "https://ascl.net/api/search/"),
+        description="ASCL search endpoint URL.",
+    )
+    timeout: float = Field(
+        default=30.0,
+        description="HTTP request timeout in seconds.",
+    )
+
+
+class ASCLEntry(OutputSchema):
+    """A single code entry from the Astrophysics Source Code Library."""
+
+    ascl_id: str = Field(..., description="ASCL identifier (e.g. '1303.002').")
+    title: str = Field(default="", description="Code title.")
+    credit: str = Field(
+        default="",
+        description="Author list as a semicolon-separated string (ASCL's native format).",
+    )
+    abstract: str = Field(default="", description="Code description.")
+    site_list: list[str] = Field(
+        default_factory=list,
+        description="URLs associated with the code (GitHub, project site, docs, etc.).",
+    )
+    bibcode: str | None = Field(default=None, description="ADS bibcode for this ASCL record.")
+    described_in: list[str] = Field(
+        default_factory=list,
+        description="ADS URLs for papers that describe/introduce this code.",
+    )
+    used_in: list[str] = Field(
+        default_factory=list,
+        description="ADS URLs for papers that used this code.",
+    )
+    used_in_count: int = Field(default=0, description="Number of papers that used this code.")
+    views: int = Field(default=0, description="ASCL page view count.")
+
+
+class ASCLSearchToolInputSchema(InputSchema):
+    """Parameters the LLM can set per query."""
+
+    query: str = Field(
+        ...,
+        description=(
+            "Search terms (code name, capability) or a specific ASCL id "
+            "(e.g. '1303.002' or 'ascl:1303.002'). IDs are auto-detected "
+            "and converted to an exact-match query."
+        ),
+    )
+    rows: int = Field(
+        default=10,
+        ge=1,
+        le=50,
+        description=(
+            "Max entries to return. Enforced client-side — the ASCL API "
+            "itself does not accept a row limit."
+        ),
+    )
+
+
+class ASCLSearchToolOutputSchema(OutputSchema):
+    """What the tool returns to the caller."""
+
+    entries: list[ASCLEntry] = Field(default_factory=list, description="Matching ASCL entries.")
+    num_found: int = Field(default=0, description="Total matches from ASCL (pre row-cap).")
+    error: str | None = Field(
+        default=None,
+        description="Error message if the query failed; null on success.",
+    )
+
+
+@mcp_tool
+class ASCLSearchTool(BaseTool[ASCLSearchToolInputSchema, ASCLSearchToolOutputSchema]):
+    """Search the Astrophysics Source Code Library (ASCL) for codes.
+
+    ASCL is a curated registry of ~4000 astrophysics codes. Each entry has a
+    canonical code URL, the ADS bibcode for the code's ASCL record, and ADS
+    URLs for papers that describe and use the code.
+
+    Use the `query` field for either:
+    - a scientific task or capability keyword (e.g. 'radiative transfer'), or
+    - a specific ASCL id (e.g. '1303.002') — the tool will format it as an
+      exact-match query automatically.
+
+    On failure, `error` is populated and `entries` is empty.
+    """
+
+    input_schema = ASCLSearchToolInputSchema
+    output_schema = ASCLSearchToolOutputSchema
+    config_schema = ASCLSearchToolConfig
+    config: ASCLSearchToolConfig
+
+    async def _arun(self, params: ASCLSearchToolInputSchema) -> ASCLSearchToolOutputSchema:
+        """Run one ASCL search and map the response onto the output schema.
+
+        Request and decoding problems are not raised: they are reported
+        through `error` with an empty `entries` list.
+        """
+        # Decide between exact-id lookup and free-text search. The ASCL API
+        # requires the `q` parameter to be wrapped in quotes either way;
+        # omitting them returns a 404.
+        normalized = _normalize_ascl_id(params.query)
+        if _ASCL_ID_RE.fullmatch(normalized):
+            q = f'ascl_id:"{normalized}"'
+        else:
+            # A double quote inside the term would close the mandatory outer
+            # quotes early and corrupt the query, so quotes are replaced by
+            # spaces (this also unwraps a query the caller already quoted).
+            term = params.query.replace('"', " ").strip()
+            if not term:
+                return ASCLSearchToolOutputSchema(error="ASCL query is empty.")
+            q = f'"{term}"'
+
+        query_params = {"q": q}
+
+        try:
+            async with httpx.AsyncClient(timeout=self.config.timeout) as client:
+                response = await client.get(self.config.base_url, params=query_params)
+                response.raise_for_status()
+                data = response.json()
+        except Exception as e:
+            # Catching broadly is deliberate: the tool reports failures via
+            # `error` instead of raising. Some exceptions stringify to an
+            # empty message, so the type name is always included.
+            detail = f"{type(e).__name__}: {e}" if str(e) else type(e).__name__
+            return ASCLSearchToolOutputSchema(error=f"ASCL query failed: {detail}")
+
+        # The API returns a JSON array of entries. If we ever get something
+        # else back, surface it as an error rather than silently producing []
+        # entries.
+        if not isinstance(data, list):
+            return ASCLSearchToolOutputSchema(
+                error=f"Unexpected ASCL response type: {type(data).__name__}",
+            )
+
+        # Skip anything that is not a JSON object so one malformed element
+        # cannot crash the whole result.
+        docs = [doc for doc in data if isinstance(doc, dict)]
+        return ASCLSearchToolOutputSchema(
+            entries=[_parse_entry(doc) for doc in docs[: params.rows]],
+            num_found=len(docs),
+        )
+
+
+def _as_text(value: object) -> str:
+    """Return `value` as a string, mapping a missing/None value to ''."""
+    return "" if value is None else str(value)
+
+
+def _parse_entry(doc: dict) -> ASCLEntry:
+    """Turn one raw ASCL record into an ASCLEntry.
+
+    Handles three API quirks:
+    - list-valued fields (site_list, described_in, used_in) come as PHP-
+      serialized strings and need parsing.
+    - `views` comes back as a string and needs int conversion; invalid values
+      fall back to 0.
+    - a JSON null (or non-string scalar) in a text field is coerced to str so
+      schema validation does not reject an otherwise usable record.
+    """
+    try:
+        views = int(doc.get("views", 0))
+    except (ValueError, TypeError, OverflowError):
+        views = 0
+
+    used_in = _parse_php_array(doc.get("used_in", ""))
+
+    return ASCLEntry(
+        ascl_id=_as_text(doc.get("ascl_id")),
+        title=_as_text(doc.get("title")),
+        credit=_as_text(doc.get("credit")),
+        abstract=_as_text(doc.get("abstract")),
+        site_list=_parse_php_array(doc.get("site_list", "")),
+        bibcode=_as_text(doc.get("bibcode")) or None,
+        described_in=_parse_php_array(doc.get("described_in", "")),
+        used_in=used_in,
+        used_in_count=len(used_in),
+        views=views,
+    )
diff --git a/tests/tools/test_ascl.py b/tests/tools/test_ascl.py
new file mode 100644
index 0000000..a2f6695
--- /dev/null
+++ b/tests/tools/test_ascl.py
@@ -0,0 +1,165 @@
+"""Integration tests for the ASCL search tool.
+
+These tests hit the live ASCL API (no auth required) and use real codes as
+fixtures:
+- RADMC-3D (1202.015): 3D dust continuum radiative transfer
+- HEALPix (1107.018): Pixelization of the sphere for CMB analysis
+"""
+
+import pytest
+
+from akd_ext.tools import (
+    ASCLSearchTool,
+    ASCLSearchToolInputSchema,
+    ASCLSearchToolOutputSchema,
+)
+
+
+# ---------------------------------------------------------------------------
+# Free-text / capability searches
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.asyncio
+async def test_ascl_search_by_code_name():
+    """Searching for 'RADMC-3D' returns the expected entry (1202.015)."""
+    tool = ASCLSearchTool()
+    result = await tool.arun(ASCLSearchToolInputSchema(query="RADMC-3D", rows=5))
+
+    assert isinstance(result, ASCLSearchToolOutputSchema)
+    assert result.error is None
+    assert result.num_found > 0
+
+    ids = [e.ascl_id for e in result.entries]
+    assert "1202.015" in ids
+
+    radmc = next(e for e in result.entries if e.ascl_id == "1202.015")
+    assert "RADMC-3D" in radmc.title
+    assert radmc.bibcode  # every entry has an ASCL record bibcode
+
+
+@pytest.mark.asyncio
+async def test_ascl_search_by_capability():
+    """Searching by capability ('radiative transfer') returns related codes."""
+    tool = ASCLSearchTool()
+    result = await tool.arun(ASCLSearchToolInputSchema(query="radiative transfer", rows=10))
+
+    assert result.error is None
+    assert result.num_found > 0
+    # At least some entries should have URLs in site_list.
+    assert any(e.site_list for e in result.entries)
+
+
+@pytest.mark.asyncio
+async def test_ascl_search_empty_result():
+    """A nonsense query returns zero entries and no error."""
+    tool = ASCLSearchTool()
+    result = await tool.arun(ASCLSearchToolInputSchema(query="xyznonexistent123456", rows=5))
+
+    assert result.error is None
+    assert result.num_found == 0
+    assert result.entries == []
+
+
+# ---------------------------------------------------------------------------
+# PHP-serialized field parsing
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.asyncio
+async def test_ascl_site_list_is_parsed_to_urls():
+    """site_list should come back as clean HTTP URLs, not PHP-serialized text."""
+    tool = ASCLSearchTool()
+    result = await tool.arun(ASCLSearchToolInputSchema(query="RADMC-3D", rows=3))
+
+    for entry in result.entries:
+        for url in entry.site_list:
+            assert url.startswith("http"), f"Expected a URL, got: {url}"
+            assert "a:" not in url, f"URL still contains PHP serialization: {url}"
+
+
+@pytest.mark.asyncio
+async def test_ascl_described_in_returns_ads_urls():
+    """HEALPix has at least one describing paper; all should be ADS URLs."""
+    tool = ASCLSearchTool()
+    result = await tool.arun(ASCLSearchToolInputSchema(query="HEALPix", rows=5))
+
+    healpix = next((e for e in result.entries if e.ascl_id == "1107.018"), None)
+    assert healpix is not None
+    assert len(healpix.described_in) > 0
+    for url in healpix.described_in:
+        assert "adsabs.harvard.edu" in url
+
+
+@pytest.mark.asyncio
+async def test_ascl_used_in_count_matches_list_length():
+    """used_in_count should equal len(used_in)."""
+    tool = ASCLSearchTool()
+    result = await tool.arun(ASCLSearchToolInputSchema(query="RADMC-3D", rows=3))
+
+    for entry in result.entries:
+        assert entry.used_in_count == len(entry.used_in)
+
+
+@pytest.mark.asyncio
+async def test_ascl_views_is_int():
+    """`views` comes from the API as a string; the tool converts it to int."""
+    tool = ASCLSearchTool()
+    result = await tool.arun(ASCLSearchToolInputSchema(query="HEALPix", rows=3))
+
+    assert result.num_found > 0
+    for entry in result.entries:
+        assert isinstance(entry.views, int)
+
+
+# ---------------------------------------------------------------------------
+# ID-style queries
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.asyncio
+async def test_ascl_search_by_id():
+    """Passing an ASCL id as the query should return that exact entry."""
+    tool = ASCLSearchTool()
+    result = await tool.arun(ASCLSearchToolInputSchema(query="1202.015", rows=1))
+
+    assert result.error is None
+    assert len(result.entries) == 1
+    assert result.entries[0].ascl_id == "1202.015"
+    assert "RADMC-3D" in result.entries[0].title
+
+
+@pytest.mark.asyncio
+async def test_ascl_search_by_id_with_prefix():
+    """The 'ascl:' prefix on an id should be stripped and the exact match returned."""
+    tool = ASCLSearchTool()
+    result = await tool.arun(ASCLSearchToolInputSchema(query="ascl:1107.018", rows=1))
+
+    assert result.error is None
+    assert len(result.entries) == 1
+    assert result.entries[0].ascl_id == "1107.018"
+    assert "HEALPix" in result.entries[0].title
+
+
+@pytest.mark.asyncio
+async def test_ascl_search_by_full_url():
+    """A full ASCL page URL should be normalized to the id and matched exactly."""
+    tool = ASCLSearchTool()
+    result = await tool.arun(
+        ASCLSearchToolInputSchema(query="https://ascl.net/1202.015", rows=1)
+    )
+
+    assert result.error is None
+    assert len(result.entries) == 1
+    assert result.entries[0].ascl_id == "1202.015"
+    assert "RADMC-3D" in result.entries[0].title
+
+
+@pytest.mark.asyncio
+async def test_ascl_search_by_nonexistent_id():
+    """A well-formed but unused ASCL id returns no entries and no error."""
+    tool = ASCLSearchTool()
+    result = await tool.arun(ASCLSearchToolInputSchema(query="9999.999", rows=1))
+
+    assert result.error is None
+    assert result.entries == []