diff --git a/akd_ext/tools/__init__.py b/akd_ext/tools/__init__.py
index fb1328e..b50c87b 100644
--- a/akd_ext/tools/__init__.py
+++ b/akd_ext/tools/__init__.py
@@ -20,6 +20,16 @@
     RepositorySearchToolOutputSchema,
     RepositorySearchToolConfig,
 )
+from .ads import (
+    ADSLinksResolverInputSchema,
+    ADSLinksResolverOutputSchema,
+    ADSLinksResolverTool,
+    ADSPaper,
+    ADSSearchTool,
+    ADSSearchToolInputSchema,
+    ADSSearchToolOutputSchema,
+    ADSToolConfig,
+)
 
 __all__ = [
     "DummyTool",
@@ -38,4 +48,12 @@
     "RepositorySearchToolInputSchema",
     "RepositorySearchToolOutputSchema",
     "RepositorySearchToolConfig",
+    "ADSSearchTool",
+    "ADSSearchToolInputSchema",
+    "ADSSearchToolOutputSchema",
+    "ADSToolConfig",
+    "ADSPaper",
+    "ADSLinksResolverTool",
+    "ADSLinksResolverInputSchema",
+    "ADSLinksResolverOutputSchema",
 ]
diff --git a/akd_ext/tools/ads.py b/akd_ext/tools/ads.py
new file mode 100644
index 0000000..cb33caa
--- /dev/null
+++ b/akd_ext/tools/ads.py
@@ -0,0 +1,347 @@
+"""NASA Astrophysics Data System (ADS) tools.
+
+Two tools wrapping the ADS API:
+
+- ADSSearchTool: search the literature for papers matching a query.
+- ADSLinksResolverTool: given a bibcode, list the 'associated' bibcodes
+                        (ADS's "Described in" relationship). This is how
+                        we recover the canonical method papers for an ASCL
+                        record, which ASCL itself often under-reports.
+
+API docs: https://ui.adsabs.harvard.edu/help/api/api-docs.html
+Dev API:  https://github.com/adsabs/adsabs-dev-api
+"""
+
+from __future__ import annotations
+
+import os
+from urllib.parse import quote
+
+import httpx
+from pydantic import Field, model_validator
+
+from akd._base import InputSchema, OutputSchema
+from akd.tools import BaseTool, BaseToolConfig
+
+from akd_ext.mcp import mcp_tool
+
+
+# Fields requested from ADS. This fixed set covers the common needs for
+# scientific code discovery: identification (bibcode, doi), metadata (title,
+# authors, year, publication), relevance (abstract, citation_count), and
+# linked resources (data archive names via `data`, access via `esources`,
+# flags like REFEREED via `property`).
+_ADS_FIELDS = (
+    "bibcode,title,first_author,author,abstract,year,pubdate,"
+    "citation_count,doi,pub,data,esources,property"
+)
+
+
+# ---------------------------------------------------------------------------
+# Shared config
+# ---------------------------------------------------------------------------
+
+
+class ADSToolConfig(BaseToolConfig):
+    """Shared configuration for all ADS tools.
+
+    Both the search tool and the links-resolver tool hit the same API and
+    need the same values (base URL, bearer token, timeout), so they share
+    this single config.
+    """
+
+    base_url: str = Field(
+        default="https://api.adsabs.harvard.edu/v1",
+        description="Base URL for the ADS API.",
+    )
+
+    api_token: str = Field(
+        default_factory=lambda: os.environ.get("ADS_API_TOKEN", ""),
+        description="ADS API bearer token. Defaults to the ADS_API_TOKEN env var.",
+    )
+
+    timeout: float = Field(
+        default=30.0,
+        description="HTTP request timeout in seconds.",
+    )
+
+    @model_validator(mode="after")
+    def _require_api_token(self) -> "ADSToolConfig":
+        # ADS requires authentication. Fail fast at config creation if no
+        # usable token is available, rather than surfacing a 401 at query
+        # time. Whitespace-only tokens are rejected too, and the message
+        # names both sources because an explicit api_token="" overrides the
+        # environment default.
+        if not self.api_token.strip():
+            raise ValueError(
+                "ADS API token is missing: pass `api_token` or set the "
+                "ADS_API_TOKEN environment variable. "
+                "Get a token from https://ui.adsabs.harvard.edu/user/settings/token"
+            )
+        return self
+
+
+# ---------------------------------------------------------------------------
+# Shared HTTP helper
+# ---------------------------------------------------------------------------
+
+
+async def _get_json(
+    config: ADSToolConfig,
+    path: str,
+    params: dict[str, str] | None = None,
+) -> object:
+    """GET `path` below the configured base URL and return the decoded JSON body.
+
+    Both tools send the same bearer header and timeout, so the request lives
+    here once. Nothing is caught: failures raised by httpx (including
+    `raise_for_status()`) or by `response.json()` propagate so each tool can
+    report them through its own `error` field.
+    """
+    url = f"{config.base_url.rstrip('/')}/{path}"
+    headers = {"Authorization": f"Bearer {config.api_token}"}
+    async with httpx.AsyncClient(timeout=config.timeout) as client:
+        response = await client.get(url, params=params, headers=headers)
+        response.raise_for_status()
+        return response.json()
+
+
+# ---------------------------------------------------------------------------
+# ADSSearchTool
+# ---------------------------------------------------------------------------
+
+
+class ADSPaper(OutputSchema):
+    """A single paper returned by ADS search."""
+
+    bibcode: str = Field(..., description="ADS bibcode (unique paper identifier).")
+    title: str = Field(default="", description="Paper title.")
+    first_author: str = Field(default="", description="First author name.")
+    authors: list[str] = Field(default_factory=list, description="All author names.")
+    abstract: str = Field(default="", description="Paper abstract.")
+    year: str | None = Field(default=None, description="Publication year.")
+    pubdate: str | None = Field(default=None, description="Publication date.")
+    citation_count: int = Field(default=0, description="Number of citations.")
+    doi: str | None = Field(default=None, description="DOI if available.")
+    pub: str | None = Field(default=None, description="Journal or publication name.")
+    data: list[str] = Field(
+        default_factory=list,
+        description="Names of linked data archives (e.g. 'HEASARC', 'MAST', 'Chandra').",
+    )
+    esources: list[str] = Field(
+        default_factory=list,
+        description="Electronic source types (e.g. 'PUB_HTML', 'EPRINT_HTML').",
+    )
+    property: list[str] = Field(
+        default_factory=list,
+        description="Paper properties (e.g. 'REFEREED', 'OPENACCESS').",
+    )
+
+
+class ADSSearchToolInputSchema(InputSchema):
+    """Parameters the LLM can set per query."""
+
+    query: str = Field(
+        ...,
+        description=(
+            "ADS search query. Supports free text and Solr field syntax — examples: "
+            "'dark matter', 'title:\"emcee\"', 'abs:\"ultra-fast outflow\"', "
+            "'bibcode:\"2013PASP..125..306F\"'."
+        ),
+    )
+    rows: int = Field(
+        default=10,
+        ge=1,
+        le=50,
+        description="Max number of papers to return.",
+    )
+    fq: str | None = Field(
+        default=None,
+        description="Optional filter query, e.g. 'property:refereed'.",
+    )
+
+
+class ADSSearchToolOutputSchema(OutputSchema):
+    """What the tool returns to the caller."""
+
+    papers: list[ADSPaper] = Field(default_factory=list, description="Matching papers.")
+    num_found: int = Field(
+        default=0,
+        description="Total matches in ADS (may exceed rows).",
+    )
+    error: str | None = Field(
+        default=None,
+        description="Error message when the query failed; null on success.",
+    )
+
+
+@mcp_tool
+class ADSSearchTool(BaseTool[ADSSearchToolInputSchema, ADSSearchToolOutputSchema]):
+    """Search NASA's Astrophysics Data System (ADS) for scientific papers.
+
+    Returns a list of papers with bibcode, title, authors, abstract, citation
+    count, DOI, and names of linked data archives. On failure, `error` is
+    populated and `papers` is empty.
+    """
+
+    input_schema = ADSSearchToolInputSchema
+    output_schema = ADSSearchToolOutputSchema
+    config_schema = ADSToolConfig
+    config: ADSToolConfig
+
+    async def _arun(self, params: ADSSearchToolInputSchema) -> ADSSearchToolOutputSchema:
+        query_params: dict[str, str] = {
+            "q": params.query,
+            "fl": _ADS_FIELDS,
+            "rows": str(params.rows),
+        }
+        if params.fq:
+            query_params["fq"] = params.fq
+
+        try:
+            data = await _get_json(self.config, "search/query", query_params)
+        except Exception as e:
+            return ADSSearchToolOutputSchema(error=f"ADS query failed: {e}")
+
+        # A body that decoded but does not have the expected shape is still a
+        # failure; report it through `error` instead of letting AttributeError
+        # or a validation error escape.
+        response_data = (data.get("response") or {}) if isinstance(data, dict) else None
+        if not isinstance(response_data, dict):
+            return ADSSearchToolOutputSchema(error="ADS query failed: unexpected response shape")
+
+        try:
+            return ADSSearchToolOutputSchema(
+                papers=[_parse_paper(doc) for doc in response_data.get("docs") or []],
+                num_found=response_data.get("numFound") or 0,
+            )
+        except (AttributeError, TypeError, ValueError) as e:
+            return ADSSearchToolOutputSchema(error=f"ADS query failed: malformed response: {e}")
+
+
+def _parse_paper(doc: dict) -> ADSPaper:
+    """Turn one ADS response document into an ADSPaper.
+
+    ADS returns `title` and `doi` as single-element lists, so we unwrap them.
+    Everything else maps directly. Keys that are absent or explicitly null
+    fall back to the schema defaults; `doc.get(key, default)` alone would pass
+    a null through and fail validation on the non-optional fields.
+    """
+    title_list = doc.get("title") or []
+    doi_list = doc.get("doi") or []
+    return ADSPaper(
+        bibcode=doc.get("bibcode") or "",
+        title=title_list[0] if title_list else "",
+        first_author=doc.get("first_author") or "",
+        authors=doc.get("author") or [],
+        abstract=doc.get("abstract") or "",
+        year=doc.get("year"),
+        pubdate=doc.get("pubdate"),
+        citation_count=doc.get("citation_count") or 0,
+        doi=doi_list[0] if doi_list else None,
+        pub=doc.get("pub"),
+        data=doc.get("data") or [],
+        esources=doc.get("esources") or [],
+        property=doc.get("property") or [],
+    )
+
+
+# ---------------------------------------------------------------------------
+# ADSLinksResolverTool
+# ---------------------------------------------------------------------------
+#
+# ADS maintains an "associated" relationship between records that the search
+# index does not expose. For an ASCL record bibcode (e.g. 2010ascl.soft10082F),
+# the associated bibcodes are the code's canonical method/description papers.
+# This is the only reliable way to recover the high-citation canonical paper
+# when ASCL's own `described_in` field is incomplete — which it often is: the
+# FLASH ASCL record lists only the 2005 update paper, and the canonical
+# Fryxell+ 2000 paper is reachable only via this resolver.
+#
+# Endpoint: GET /v1/resolver/{bibcode}/associated
+#   curl -H "Authorization: Bearer API_KEY" \
+#     "https://api.adsabs.harvard.edu/v1/resolver/2010ascl.soft10082F/associated" | python3 -m json.tool
+
+
+class ADSLinksResolverInputSchema(InputSchema):
+    """Parameters the LLM can set per lookup."""
+
+    bibcode: str = Field(
+        ...,
+        description=(
+            "ADS bibcode to resolve associated works for. Typically an ASCL "
+            "record bibcode (e.g. '2010ascl.soft10082F'), for which the "
+            "associated bibcodes are the code's description papers."
+        ),
+    )
+
+
+class ADSLinksResolverOutputSchema(OutputSchema):
+    """What the resolver returns to the caller."""
+
+    bibcode: str = Field(..., description="The bibcode that was resolved.")
+    associated_bibcodes: list[str] = Field(
+        default_factory=list,
+        description="Bibcodes of papers ADS lists as associated with the input bibcode.",
+    )
+    error: str | None = Field(
+        default=None,
+        description="Error message if the lookup failed; null on success.",
+    )
+
+
+@mcp_tool
+class ADSLinksResolverTool(BaseTool[ADSLinksResolverInputSchema, ADSLinksResolverOutputSchema]):
+    """Resolve an ADS bibcode to its 'associated' bibcodes.
+
+    For ASCL record bibcodes (format: YYYYascl.softNNNNNN), the associated
+    bibcodes are the code's canonical description papers — these are the
+    papers ADS shows under "Described in" on the record page, and they are
+    the authoritative source for `describing_bibcodes` in agent output.
+
+    On failure, `error` is populated and `associated_bibcodes` is empty.
+    """
+
+    input_schema = ADSLinksResolverInputSchema
+    output_schema = ADSLinksResolverOutputSchema
+    config_schema = ADSToolConfig
+    config: ADSToolConfig
+
+    async def _arun(self, params: ADSLinksResolverInputSchema) -> ADSLinksResolverOutputSchema:
+        # Bibcodes can contain characters that must be percent-encoded in
+        # the URL path (e.g. '&' in '2005Ap&SS.298..341W'). Pass `safe=""`
+        # to quote() so nothing is left unescaped.
+        encoded_bibcode = quote(params.bibcode, safe="")
+
+        try:
+            data = await _get_json(self.config, f"resolver/{encoded_bibcode}/associated")
+        except Exception as e:
+            return ADSLinksResolverOutputSchema(
+                bibcode=params.bibcode,
+                error=f"ADS resolver failed: {e}",
+            )
+
+        # A body that decoded but is not a JSON object is reported through
+        # `error`; chaining .get() on it would raise instead.
+        if not isinstance(data, dict):
+            return ADSLinksResolverOutputSchema(
+                bibcode=params.bibcode,
+                error="ADS resolver failed: unexpected response shape",
+            )
+
+        # Missing or null `links`/`records` means "nothing associated". Only
+        # string bibcodes are kept so the output schema always validates.
+        links = data.get("links")
+        records = links.get("records") if isinstance(links, dict) else None
+        if not isinstance(records, list):
+            records = []
+        bibcodes = [
+            r["bibcode"]
+            for r in records
+            if isinstance(r, dict) and isinstance(r.get("bibcode"), str) and r["bibcode"]
+        ]
+
+        return ADSLinksResolverOutputSchema(
+            bibcode=params.bibcode,
+            associated_bibcodes=bibcodes,
+        )
diff --git a/tests/tools/test_ads.py b/tests/tools/test_ads.py
new file mode 100644
index 0000000..ff3bd01
--- /dev/null
+++ b/tests/tools/test_ads.py
@@ -0,0 +1,191 @@
+"""Integration tests for ADS search and links-resolver tools.
+
+These tests hit the live ADS API and require ``ADS_API_TOKEN`` in the
+environment. The whole module is skipped when the token is missing so the
+rest of the test suite can still run.
+
+Reference bibcodes used as fixtures:
+- 2010ascl.soft10082F — ASCL record for FLASH.
+- 2000ApJS..131..273F — Fryxell+ 2000, the canonical FLASH paper (~2100 cites).
+- 2005Ap&SS.298..341W — Weirs+ 2005, a FLASH update paper (used to test
+  URL encoding — the '&' must be percent-escaped).
+- 2013PASP..125..306F — Foreman-Mackey emcee paper.
+"""
+
+import os
+
+import pytest
+
+from akd_ext.tools import (
+    ADSLinksResolverInputSchema,
+    ADSLinksResolverOutputSchema,
+    ADSLinksResolverTool,
+    ADSSearchTool,
+    ADSSearchToolInputSchema,
+    ADSSearchToolOutputSchema,
+    ADSToolConfig,
+)
+
+pytestmark = pytest.mark.skipif(
+    not os.environ.get("ADS_API_TOKEN"),
+    reason="ADS_API_TOKEN not set",
+)
+
+
+@pytest.mark.asyncio
+async def test_ads_search_by_title():
+    """A refereed-only title search for "FLASH" should return the canonical FLASH paper."""
+    tool = ADSSearchTool()
+    result = await tool.arun(
+        ADSSearchToolInputSchema(query='title:"FLASH"', rows=5, fq='property:refereed')
+    )
+
+    assert isinstance(result, ADSSearchToolOutputSchema)
+    assert result.error is None
+    assert result.num_found > 0
+    assert len(result.papers) <= 5
+
+    bibcodes = [p.bibcode for p in result.papers]
+    # 2000ApJS..131..273F is the canonical FLASH paper (Fryxell et al. 2000, ~2100 citations).
+    assert "2000ApJS..131..273F" in bibcodes
+
+
+@pytest.mark.asyncio
+async def test_ads_search_honors_rows():
+    """When enough matches exist, we should get exactly `rows` papers."""
+    tool = ADSSearchTool()
+    result = await tool.arun(ADSSearchToolInputSchema(query="dark energy", rows=15))
+
+    assert result.error is None
+    if result.num_found >= 15:
+        assert len(result.papers) == 15
+
+
+@pytest.mark.asyncio
+async def test_ads_search_empty_result():
+    """A nonsense query returns zero papers and no error."""
+    tool = ADSSearchTool()
+    result = await tool.arun(
+        ADSSearchToolInputSchema(query='title:"xyznonexistent123456qweasd"', rows=5)
+    )
+
+    assert result.error is None
+    assert result.papers == []
+    assert result.num_found == 0
+
+
+@pytest.mark.asyncio
+async def test_ads_search_num_found_is_global_total():
+    """`num_found` is the total match count in ADS, independent of `rows`."""
+    tool = ADSSearchTool()
+    result = await tool.arun(ADSSearchToolInputSchema(query="dark matter", rows=1))
+
+    assert len(result.papers) == 1
+    # "dark matter" easily has thousands of matches.
+    assert result.num_found > 1
+
+
+@pytest.mark.asyncio
+async def test_ads_search_fq_filters_to_refereed():
+    """The `fq` param filters results — refereed-only should still hit emcee."""
+    tool = ADSSearchTool()
+    result = await tool.arun(
+        ADSSearchToolInputSchema(
+            query='title:"emcee: The MCMC Hammer"',
+            rows=5,
+            fq="property:refereed",
+        )
+    )
+
+    assert result.error is None
+    assert result.num_found > 0
+    for paper in result.papers:
+        assert "REFEREED" in paper.property
+
+
+@pytest.mark.asyncio
+async def test_ads_search_by_bibcode_exact_lookup():
+    """Looking up a paper by its exact bibcode returns that one paper."""
+    tool = ADSSearchTool()
+    result = await tool.arun(
+        ADSSearchToolInputSchema(
+            query='bibcode:"2000ApJS..131..273F"',
+            rows=1,
+        )
+    )
+
+    assert result.error is None
+    assert result.num_found == 1
+    assert len(result.papers) == 1
+
+    paper = result.papers[0]
+    assert paper.bibcode == "2000ApJS..131..273F"
+    assert "FLASH" in paper.title
+    assert paper.first_author.startswith("Fryxell")
+    assert paper.year == "2000"
+    assert paper.citation_count > 1000  # canonical paper with ~2100 citations
+
+
+
+def test_ads_config_rejects_empty_token():
+    """The config validator refuses an empty api_token (the module-level skip applies here too)."""
+    from pydantic import ValidationError
+
+    with pytest.raises(ValidationError):
+        ADSToolConfig(api_token="")
+
+
+# ---------------------------------------------------------------------------
+# ADSLinksResolverTool tests
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.asyncio
+async def test_ads_resolver_for_flash_ascl_record():
+    """FLASH's ASCL record (2010ascl.soft10082F) should resolve to both
+    describing papers: the canonical Fryxell+ 2000 paper AND the Weirs+ 2005
+    update paper. This is exactly the case ASCL's own `described_in` field
+    misses.
+    """
+    tool = ADSLinksResolverTool()
+    result = await tool.arun(
+        ADSLinksResolverInputSchema(bibcode="2010ascl.soft10082F")
+    )
+
+    assert isinstance(result, ADSLinksResolverOutputSchema)
+    assert result.error is None
+    assert result.bibcode == "2010ascl.soft10082F"
+    # Canonical FLASH paper (Fryxell et al. 2000) — the whole reason we need
+    # this tool; ASCL's own described_in does not list it.
+    assert "2000ApJS..131..273F" in result.associated_bibcodes
+    # The update paper should also appear.
+    assert "2005Ap&SS.298..341W" in result.associated_bibcodes
+
+
+@pytest.mark.asyncio
+async def test_ads_resolver_handles_url_encoded_bibcode():
+    """Bibcodes with special chars (e.g. '&' in 2005Ap&SS.298..341W) must be
+    URL-encoded in the path. Passing such a bibcode directly should work.
+    """
+    tool = ADSLinksResolverTool()
+    result = await tool.arun(
+        ADSLinksResolverInputSchema(bibcode="2005Ap&SS.298..341W")
+    )
+
+    assert result.error is None
+    assert result.bibcode == "2005Ap&SS.298..341W"
+
+
+@pytest.mark.asyncio
+async def test_ads_resolver_unknown_bibcode_is_empty_not_error():
+    """A non-existent bibcode must not raise: the result is an empty list or a populated `error`."""
+    tool = ADSLinksResolverTool()
+    result = await tool.arun(
+        ADSLinksResolverInputSchema(bibcode="9999invalid..000..000X")
+    )
+
+    # ADS may return an empty records list OR a 404 depending on internal
+    # behavior; either way the tool should surface it cleanly.
+    assert result.bibcode == "9999invalid..000..000X"
+    if result.error is None:
+        assert result.associated_bibcodes == []