From eb7a806e502a2be8c5afb9729bb4517f54652870 Mon Sep 17 00:00:00 2001 From: pushwitha Date: Fri, 24 Apr 2026 00:19:32 -0500 Subject: [PATCH] Add ADS tools for astrophysics paper and link discovery MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two MCP tools wrapping the NASA Astrophysics Data System (ADS) API: - ADSSearchTool: search the literature for papers matching a query. Returns bibcode, title, authors, abstract, citation_count, DOI, publication, and names of linked data archives (HEASARC, MAST, Chandra, etc.). - ADSLinksResolverTool: given an ADS bibcode, return its "associated" bibcodes — the records ADS shows under "Described in" on the record page. For ASCL record bibcodes, the associated bibcodes are the code's canonical method papers, which are the authoritative source for canonical-paper recovery (ASCL's own described_in is often incomplete). Shared ADSToolConfig (base_url, api_token, timeout). Per-request httpx clients, errors returned via output.error, no retry logic — matches the pattern of sde_search and the eie tools. Requires ADS_API_TOKEN env var.
--- akd_ext/tools/__init__.py | 18 +++ akd_ext/tools/ads.py | 302 ++++++++++++++++++++++++++++++++++++++ tests/tools/test_ads.py | 191 ++++++++++++++++++++++++ 3 files changed, 511 insertions(+) create mode 100644 akd_ext/tools/ads.py create mode 100644 tests/tools/test_ads.py diff --git a/akd_ext/tools/__init__.py b/akd_ext/tools/__init__.py index fb1328e..b50c87b 100644 --- a/akd_ext/tools/__init__.py +++ b/akd_ext/tools/__init__.py @@ -20,6 +20,16 @@ RepositorySearchToolOutputSchema, RepositorySearchToolConfig, ) +from .ads import ( + ADSLinksResolverInputSchema, + ADSLinksResolverOutputSchema, + ADSLinksResolverTool, + ADSPaper, + ADSSearchTool, + ADSSearchToolInputSchema, + ADSSearchToolOutputSchema, + ADSToolConfig, +) __all__ = [ "DummyTool", @@ -38,4 +48,12 @@ "RepositorySearchToolInputSchema", "RepositorySearchToolOutputSchema", "RepositorySearchToolConfig", + "ADSSearchTool", + "ADSSearchToolInputSchema", + "ADSSearchToolOutputSchema", + "ADSToolConfig", + "ADSPaper", + "ADSLinksResolverTool", + "ADSLinksResolverInputSchema", + "ADSLinksResolverOutputSchema", ] diff --git a/akd_ext/tools/ads.py b/akd_ext/tools/ads.py new file mode 100644 index 0000000..cb33caa --- /dev/null +++ b/akd_ext/tools/ads.py @@ -0,0 +1,302 @@ +"""NASA Astrophysics Data System (ADS) tools. + +Two tools wrapping the ADS API: + +- ADSSearchTool: search the literature for papers matching a query. +- ADSLinksResolverTool: given a bibcode, list the 'associated' bibcodes + (ADS's "Described in" relationship). This is how + we recover the canonical method papers for an ASCL + record, which ASCL itself often under-reports. 
+ +API docs: https://ui.adsabs.harvard.edu/help/api/api-docs.html +Dev API: https://github.com/adsabs/adsabs-dev-api +""" + +from __future__ import annotations + +import os +from urllib.parse import quote + +import httpx +from pydantic import Field, model_validator + +from akd._base import InputSchema, OutputSchema +from akd.tools import BaseTool, BaseToolConfig + +from akd_ext.mcp import mcp_tool + + +# Fields requested from ADS. This fixed set covers the common needs for +# scientific code discovery: identification (bibcode, doi), metadata (title, +# authors, year, publication), relevance (abstract, citation_count), and +# linked resources (data archive names via `data`, access via `esources`, +# flags like REFEREED via `property`). +_ADS_FIELDS = ( + "bibcode,title,first_author,author,abstract,year,pubdate," + "citation_count,doi,pub,data,esources,property" +) + + +# --------------------------------------------------------------------------- +# Shared config +# --------------------------------------------------------------------------- + + +class ADSToolConfig(BaseToolConfig): + """Shared configuration for all ADS tools. + + Both the search tool and the links-resolver tool hit the same API and + need the same values (base URL, bearer token, timeout), so they share + this single config. + """ + + base_url: str = Field( + default="https://api.adsabs.harvard.edu/v1", + description="Base URL for the ADS API.", + ) + + api_token: str = Field( + default_factory=lambda: os.environ.get("ADS_API_TOKEN", ""), + description="ADS API bearer token. Defaults to the ADS_API_TOKEN env var.", + ) + + timeout: float = Field( + default=30.0, + description="HTTP request timeout in seconds.", + ) + + @model_validator(mode="after") + def _require_api_token(self) -> "ADSToolConfig": + # ADS requires authentication. Fail fast at config creation if no token + # is available, rather than surfacing a 401 at query time. 
+ if not self.api_token: + raise ValueError( + "ADS_API_TOKEN environment variable is not set. " + "Get a token from https://ui.adsabs.harvard.edu/user/settings/token" + ) + return self + + +# --------------------------------------------------------------------------- +# ADSSearchTool +# --------------------------------------------------------------------------- + + +class ADSPaper(OutputSchema): + """A single paper returned by ADS search.""" + + bibcode: str = Field(..., description="ADS bibcode (unique paper identifier).") + title: str = Field(default="", description="Paper title.") + first_author: str = Field(default="", description="First author name.") + authors: list[str] = Field(default_factory=list, description="All author names.") + abstract: str = Field(default="", description="Paper abstract.") + year: str | None = Field(default=None, description="Publication year.") + pubdate: str | None = Field(default=None, description="Publication date.") + citation_count: int = Field(default=0, description="Number of citations.") + doi: str | None = Field(default=None, description="DOI if available.") + pub: str | None = Field(default=None, description="Journal or publication name.") + data: list[str] = Field( + default_factory=list, + description="Names of linked data archives (e.g. 'HEASARC', 'MAST', 'Chandra').", + ) + esources: list[str] = Field( + default_factory=list, + description="Electronic source types (e.g. 'PUB_HTML', 'EPRINT_HTML').", + ) + property: list[str] = Field( + default_factory=list, + description="Paper properties (e.g. 'REFEREED', 'OPENACCESS').", + ) + + +class ADSSearchToolInputSchema(InputSchema): + """Parameters the LLM can set per query.""" + + query: str = Field( + ..., + description=( + "ADS search query. Supports free text and Solr field syntax — examples: " + "'dark matter', 'title:\"emcee\"', 'abs:\"ultra-fast outflow\"', " + "'bibcode:\"2013PASP..125..306F\"'." 
+ ), + ) + rows: int = Field( + default=10, + ge=1, + le=50, + description="Max number of papers to return.", + ) + fq: str | None = Field( + default=None, + description="Optional filter query, e.g. 'property:refereed'.", + ) + + +class ADSSearchToolOutputSchema(OutputSchema): + """What the tool returns to the caller.""" + + papers: list[ADSPaper] = Field(default_factory=list, description="Matching papers.") + num_found: int = Field( + default=0, + description="Total matches in ADS (may exceed rows).", + ) + error: str | None = Field( + default=None, + description="Error message when the query failed; null on success.", + ) + + +@mcp_tool +class ADSSearchTool(BaseTool[ADSSearchToolInputSchema, ADSSearchToolOutputSchema]): + """Search NASA's Astrophysics Data System (ADS) for scientific papers. + + Returns a list of papers with bibcode, title, authors, abstract, citation + count, DOI, and names of linked data archives. On failure, `error` is + populated and `papers` is empty. + """ + + input_schema = ADSSearchToolInputSchema + output_schema = ADSSearchToolOutputSchema + config_schema = ADSToolConfig + config: ADSToolConfig + + async def _arun(self, params: ADSSearchToolInputSchema) -> ADSSearchToolOutputSchema: + url = f"{self.config.base_url.rstrip('/')}/search/query" + headers = {"Authorization": f"Bearer {self.config.api_token}"} + query_params: dict[str, str] = { + "q": params.query, + "fl": _ADS_FIELDS, + "rows": str(params.rows), + } + if params.fq: + query_params["fq"] = params.fq + + try: + async with httpx.AsyncClient(timeout=self.config.timeout) as client: + response = await client.get(url, params=query_params, headers=headers) + response.raise_for_status() + data = response.json() + except Exception as e: + return ADSSearchToolOutputSchema(error=f"ADS query failed: {e}") + + response_data = data.get("response", {}) + docs: list[dict] = response_data.get("docs", []) + + return ADSSearchToolOutputSchema( + papers=[_parse_paper(doc) for doc in docs], + 
num_found=response_data.get("numFound", 0), + ) + + +def _parse_paper(doc: dict) -> ADSPaper: + """Turn one ADS response document into an ADSPaper. + + ADS returns `title` and `doi` as single-element lists, so we unwrap them. + Everything else maps directly. + """ + title_list = doc.get("title") or [] + doi_list = doc.get("doi") or [] + return ADSPaper( + bibcode=doc.get("bibcode", ""), + title=title_list[0] if title_list else "", + first_author=doc.get("first_author", ""), + authors=doc.get("author", []), + abstract=doc.get("abstract", ""), + year=doc.get("year"), + pubdate=doc.get("pubdate"), + citation_count=doc.get("citation_count", 0), + doi=doi_list[0] if doi_list else None, + pub=doc.get("pub"), + data=doc.get("data", []), + esources=doc.get("esources", []), + property=doc.get("property", []), + ) + + +# --------------------------------------------------------------------------- +# ADSLinksResolverTool +# --------------------------------------------------------------------------- +# +# ADS maintains an "associated" relationship between records that the search +# index does not expose. For an ASCL record bibcode (e.g. 2010ascl.soft10082F), +# the associated bibcodes are the code's canonical method/description papers. +# This is the only reliable way to recover the high-citation canonical paper +# when ASCL's own `described_in` field is incomplete — which it often is: the +# FLASH ASCL record lists only the 2005 update paper, and the canonical +# Fryxell+ 2000 paper is reachable only via this resolver. +# +# Endpoint: GET /v1/resolver/{bibcode}/associated +# curl -H "Authorization: Bearer API_KEY" \ +# "https://api.adsabs.harvard.edu/v1/resolver/2010ascl.soft10082F/associated" | python3 -m json.tool + + +class ADSLinksResolverInputSchema(InputSchema): + """Parameters the LLM can set per lookup.""" + + bibcode: str = Field( + ..., + description=( + "ADS bibcode to resolve associated works for. Typically an ASCL " + "record bibcode (e.g. 
'2010ascl.soft10082F'), for which the " + "associated bibcodes are the code's description papers." + ), + ) + + +class ADSLinksResolverOutputSchema(OutputSchema): + """What the resolver returns to the caller.""" + + bibcode: str = Field(..., description="The bibcode that was resolved.") + associated_bibcodes: list[str] = Field( + default_factory=list, + description="Bibcodes of papers ADS lists as associated with the input bibcode.", + ) + error: str | None = Field( + default=None, + description="Error message if the lookup failed; null on success.", + ) + + +@mcp_tool +class ADSLinksResolverTool(BaseTool[ADSLinksResolverInputSchema, ADSLinksResolverOutputSchema]): + """Resolve an ADS bibcode to its 'associated' bibcodes. + + For ASCL record bibcodes (format: YYYYascl.softNNNNNN), the associated + bibcodes are the code's canonical description papers — these are the + papers ADS shows under "Described in" on the record page, and they are + the authoritative source for `describing_bibcodes` in agent output. + + On failure, `error` is populated and `associated_bibcodes` is empty. + """ + + input_schema = ADSLinksResolverInputSchema + output_schema = ADSLinksResolverOutputSchema + config_schema = ADSToolConfig + config: ADSToolConfig + + async def _arun(self, params: ADSLinksResolverInputSchema) -> ADSLinksResolverOutputSchema: + # Bibcodes can contain characters that must be percent-encoded in + # the URL path (e.g. '&' in '2005Ap&SS.298..341W'). Pass `safe=""` + # to quote() so nothing is left unescaped. 
+ encoded_bibcode = quote(params.bibcode, safe="") + url = f"{self.config.base_url.rstrip('/')}/resolver/{encoded_bibcode}/associated" + headers = {"Authorization": f"Bearer {self.config.api_token}"} + + try: + async with httpx.AsyncClient(timeout=self.config.timeout) as client: + response = await client.get(url, headers=headers) + response.raise_for_status() + data = response.json() + except Exception as e: + return ADSLinksResolverOutputSchema( + bibcode=params.bibcode, + error=f"ADS resolver failed: {e}", + ) + + records = data.get("links", {}).get("records", []) + bibcodes = [r["bibcode"] for r in records if r.get("bibcode")] + + return ADSLinksResolverOutputSchema( + bibcode=params.bibcode, + associated_bibcodes=bibcodes, + ) diff --git a/tests/tools/test_ads.py b/tests/tools/test_ads.py new file mode 100644 index 0000000..ff3bd01 --- /dev/null +++ b/tests/tools/test_ads.py @@ -0,0 +1,191 @@ +"""Integration tests for ADS search and links-resolver tools. + +These tests hit the live ADS API and require ``ADS_API_TOKEN`` in the +environment. The whole module is skipped when the token is missing so the +rest of the test suite can still run. + +Reference bibcodes used as fixtures: +- 2010ascl.soft10082F — ASCL record for FLASH. +- 2000ApJS..131..273F — Fryxell+ 2000, the canonical FLASH paper (~2100 cites). +- 2005Ap&SS.298..341W — Weirs+ 2005, a FLASH update paper (used to test + URL encoding — the '&' must be percent-escaped). +- 2013PASP..125..306F — Foreman-Mackey emcee paper. 
+""" + +import os + +import pytest + +from akd_ext.tools import ( + ADSLinksResolverInputSchema, + ADSLinksResolverOutputSchema, + ADSLinksResolverTool, + ADSSearchTool, + ADSSearchToolInputSchema, + ADSSearchToolOutputSchema, + ADSToolConfig, +) + +pytestmark = pytest.mark.skipif( + not os.environ.get("ADS_API_TOKEN"), + reason="ADS_API_TOKEN not set", +) + + +@pytest.mark.asyncio +async def test_ads_search_by_title(): + """A title search for the "Source Code" paper should return it.""" + tool = ADSSearchTool() + result = await tool.arun( + ADSSearchToolInputSchema(query='title:"FLASH"', rows=5, fq='property:refereed') + ) + + assert isinstance(result, ADSSearchToolOutputSchema) + assert result.error is None + assert result.num_found > 0 + assert len(result.papers) <= 5 + + bibcodes = [p.bibcode for p in result.papers] + # 2000ApJS..131..273F is the canonical FLASH paper (Fryxell et al. 2000, ~2100 citations). + assert "2000ApJS..131..273F" in bibcodes + + +@pytest.mark.asyncio +async def test_ads_search_honors_rows(): + """When enough matches exist, we should get exactly `rows` papers.""" + tool = ADSSearchTool() + result = await tool.arun(ADSSearchToolInputSchema(query="dark energy", rows=15)) + + assert result.error is None + if result.num_found >= 15: + assert len(result.papers) == 15 + + +@pytest.mark.asyncio +async def test_ads_search_empty_result(): + """A nonsense query returns zero papers and no error.""" + tool = ADSSearchTool() + result = await tool.arun( + ADSSearchToolInputSchema(query='title:"xyznonexistent123456qweasd"', rows=5) + ) + + assert result.error is None + assert result.papers == [] + assert result.num_found == 0 + + +@pytest.mark.asyncio +async def test_ads_search_num_found_is_global_total(): + """`num_found` is the total match count in ADS, independent of `rows`.""" + tool = ADSSearchTool() + result = await tool.arun(ADSSearchToolInputSchema(query="dark matter", rows=1)) + + assert len(result.papers) == 1 + # "dark matter" easily has 
thousands of matches. + assert result.num_found > 1 + + +@pytest.mark.asyncio +async def test_ads_search_fq_filters_to_refereed(): + """The `fq` param filters results — refereed-only should still hit emcee.""" + tool = ADSSearchTool() + result = await tool.arun( + ADSSearchToolInputSchema( + query='title:"emcee: The MCMC Hammer"', + rows=5, + fq="property:refereed", + ) + ) + + assert result.error is None + assert result.num_found > 0 + for paper in result.papers: + assert "REFEREED" in paper.property + + +@pytest.mark.asyncio +async def test_ads_search_by_bibcode_exact_lookup(): + """Looking up a paper by its exact bibcode returns that one paper.""" + tool = ADSSearchTool() + result = await tool.arun( + ADSSearchToolInputSchema( + query='bibcode:"2000ApJS..131..273F"', + rows=1, + ) + ) + + assert result.error is None + assert result.num_found == 1 + assert len(result.papers) == 1 + + paper = result.papers[0] + assert paper.bibcode == "2000ApJS..131..273F" + assert "FLASH" in paper.title + assert paper.first_author.startswith("Fryxell") + assert paper.year == "2000" + assert paper.citation_count > 1000 # canonical paper with ~2100 citations + + + +def test_ads_config_rejects_empty_token(): + """The config validator refuses an empty api_token.""" + from pydantic import ValidationError + + with pytest.raises(ValidationError): + ADSToolConfig(api_token="") + + +# --------------------------------------------------------------------------- +# ADSLinksResolverTool tests +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_ads_resolver_for_flash_ascl_record(): + """FLASH's ASCL record (2010ascl.soft10082F) should resolve to both + describing papers: the canonical Fryxell+ 2000 paper AND the Weirs+ 2005 + update paper. This is exactly the case ASCL's own `described_in` field + misses. 
+ """ + tool = ADSLinksResolverTool() + result = await tool.arun( + ADSLinksResolverInputSchema(bibcode="2010ascl.soft10082F") + ) + + assert isinstance(result, ADSLinksResolverOutputSchema) + assert result.error is None + assert result.bibcode == "2010ascl.soft10082F" + # Canonical FLASH paper (Fryxell et al. 2000) — the whole reason we need + # this tool; ASCL's own described_in does not list it. + assert "2000ApJS..131..273F" in result.associated_bibcodes + # The update paper should also appear. + assert "2005Ap&SS.298..341W" in result.associated_bibcodes + + +@pytest.mark.asyncio +async def test_ads_resolver_handles_url_encoded_bibcode(): + """Bibcodes with special chars (e.g. '&' in 2005Ap&SS.298..341W) must be + URL-encoded in the path. Passing such a bibcode directly should work. + """ + tool = ADSLinksResolverTool() + result = await tool.arun( + ADSLinksResolverInputSchema(bibcode="2005Ap&SS.298..341W") + ) + + assert result.error is None + assert result.bibcode == "2005Ap&SS.298..341W" + + +@pytest.mark.asyncio +async def test_ads_resolver_unknown_bibcode_is_empty_not_error(): + """A non-existent bibcode should return an empty list, not raise.""" + tool = ADSLinksResolverTool() + result = await tool.arun( + ADSLinksResolverInputSchema(bibcode="9999invalid..000..000X") + ) + + # ADS may return an empty records list OR a 404 depending on internal + # behavior; either way the tool should surface it cleanly. + assert result.bibcode == "9999invalid..000..000X" + if result.error is None: + assert result.associated_bibcodes == []