NCATSTranslator · gaurav · Nov 5, 2025 · Nov 5, 2025 · Nov 6, 2025 · Nov 6, 2025
diff --git a/api/resources/openapi.yml b/api/resources/openapi.yml
@@ -10,7 +10,8 @@ info:
     (CURIEs) from a vocabulary or ontology.  The lookup is not exact, but includes partial matches.<p/>
      Multiple results may be returned representing possible conceptual matches, but all of the identifiers
      have been correctly normalized using the
-     <a href="https://github.com/TranslatorSRI/NodeNormalization">Node Normalization</a> service.'
+     <a href="https://github.com/TranslatorSRI/NodeNormalization">Node Normalization</a> service. You can read more
+     about this API on the <a href="https://github.com/NCATSTranslator/NameResolution">NameResolution GitHub repository</a>.'
   license:
     name: MIT
     url: https://opensource.org/licenses/MIT

diff --git a/api/server.py b/api/server.py
@@ -1,16 +1,14 @@
-"""Biomedical entity name resolution service.
-
-1) split the input into fragments at spaces
-  * The order does not matter
-2) search for names including all fragments, case insensitive
-3) sort by length, ascending
-  * The curie with the shortest match is first, etc.
-  * Matching names are returned first, followed by non-matching names
+"""
+NameResolver (NameRes) API Endpoints
+
+Queries are mostly sent to the underlying NameRes Solr instance.
 """
 import json
-import logging, warnings
+import logging
+import warnings
 import os
 import re
+from enum import Enum
 from typing import Dict, List, Union, Annotated, Optional
 
 from fastapi import Body, FastAPI, Query
@@ -102,6 +100,14 @@ async def status() -> Dict:
 
 # ENDPOINT /reverse_lookup
 
+class DebugOptions(str, Enum):
+    # A list of possible Solr debug options from https://solr.apache.org/guide/solr/latest/query-guide/common-query-parameters.html#debug-parameter
+    none = "none"
+    query = "query"
+    timing = "timing"
+    results = "results"
+    all = "all"
+
 class Request(BaseModel):
     """Reverse-lookup request body."""
     curies: List[str]
@@ -210,6 +216,8 @@ class LookupResult(BaseModel):
     types: List[str]
     score: float
     clique_identifier_count: int
+    explain: Optional[str]    # Explanation for this specific result
+    debug: Optional[dict]     # The debug information for the entire query
 
 
 @app.get("/lookup",
@@ -263,17 +271,22 @@ async def lookup_curies_get(
                         "e.g. `NCBITaxon:9606|NCBITaxon:10090|NCBITaxon:10116|NCBITaxon:7955`.",
             # We can't use `example` here because otherwise it gets filled in when filling this in.
             # example="NCBITaxon:9606|NCBITaxon:10090|NCBITaxon:10116|NCBITaxon:7955"
-        )] = None
+        )] = None,
+        debug: Annotated[Union[DebugOptions, None], Query(
+            description="Provide debugging information on the Solr query at https://solr.apache.org/guide/solr/latest/query-guide/common-query-parameters.html#debug-parameter"
+        )] = 'none'
 ) -> List[LookupResult]:
     """
     Returns cliques with a name or synonym that contains a specified string.
     """
-    return await lookup(string, autocomplete, highlighting, offset, limit, biolink_type, only_prefixes, exclude_prefixes, only_taxa)
+    return await lookup(string, autocomplete, highlighting, offset, limit, biolink_type, only_prefixes, exclude_prefixes, only_taxa, debug)
 
 
 @app.post("/lookup",
     summary="Look up cliques for a fragment of a name or synonym.",
-    description="Returns cliques with a name or synonym that contains a specified string.",
+    description="Returns cliques with a name or synonym that contains a specified string. "
+                "You can find out more about this endpoint in the <a href="">NameRes documentation</a>."
+                "Note that the cliques we search through are conflated ",
     response_model=List[LookupResult],
     tags=["lookup"]
 )
@@ -322,12 +335,15 @@ async def lookup_curies_post(
                         "e.g. `NCBITaxon:9606|NCBITaxon:10090|NCBITaxon:10116|NCBITaxon:7955`.",
             # We can't use `example` here because otherwise it gets filled in when filling this in.
             # example="NCBITaxon:9606|NCBITaxon:10090|NCBITaxon:10116|NCBITaxon:7955"
-        )] = None
+        )] = None,
+        debug: Annotated[Union[DebugOptions, None], Query(
+            description="Provide debugging information on the Solr query at https://solr.apache.org/guide/solr/latest/query-guide/common-query-parameters.html#debug-parameter"
+        )] = 'none'
 ) -> List[LookupResult]:
     """
     Returns cliques with a name or synonym that contains a specified string.
     """
-    return await lookup(string, autocomplete, highlighting, offset, limit, biolink_type, only_prefixes, exclude_prefixes, only_taxa)
+    return await lookup(string, autocomplete, highlighting, offset, limit, biolink_type, only_prefixes, exclude_prefixes, only_taxa, debug)
 
 
 async def lookup(string: str,
@@ -338,7 +354,8 @@ async def lookup(string: str,
            biolink_types: List[str] = None,
            only_prefixes: str = "",
            exclude_prefixes: str = "",
-           only_taxa: str = ""
+           only_taxa: str = "",
+           debug: DebugOptions = 'none',
 ) -> List[LookupResult]:
     """
     Returns cliques with a name or synonym that contains a specified string.
@@ -433,6 +450,9 @@ async def lookup(string: str,
             # "hl.highlightMultiTerm": "true",
         })
 
+    if debug and debug != 'none':
+        inner_params['debug'] = debug
+
     params = {
         "query": {
             "edismax": {
@@ -459,7 +479,8 @@ async def lookup(string: str,
         "fields": "*, score",
         "params": inner_params,
     }
-    logging.debug(f"Query: {json.dumps(params, indent=2)}")
+
+    print(f"Query: {json.dumps(params, indent=2)}")
 
     query_url = f"http://{SOLR_HOST}:{SOLR_PORT}/solr/name_lookup/select"
     async with httpx.AsyncClient(timeout=None) as client:
@@ -468,7 +489,12 @@ async def lookup(string: str,
         LOGGER.error("Solr REST error: %s", response.text)
         response.raise_for_status()
     response = response.json()
-    logging.debug(f"Solr response: {json.dumps(response, indent=2)}")
+    print(f"Solr response: {json.dumps(response, indent=2)}")
+
+    # Do we have any debug.explain information?
+    explain_info = {}
+    if 'debug' in response and 'explain' in response['debug']:
+        explain_info = response['debug']['explain']
 
     # Associate highlighting information with search results.
     highlighting_response = response.get("highlighting", {})
@@ -499,6 +525,17 @@ async def lookup(string: str,
             # Solr sometimes returns duplicates or a blank string here?
             synonym_matches = list(filter(lambda s: s, set(synonym_matches)))
 
+        # Prepare debugging and explain information for this request.
+        debug_for_this_request = response.get('debug', None)
+        explain_for_this_doc = None
+        if debug == 'explain' or debug == 'all':
+            if doc['id'] in explain_info:
+                explain_for_this_doc = explain_info[doc['id']]
+
+                # If we have explain information, we don't need to also include it in the debugging information.
+                debug_for_this_request['explain'] = {"_comment": "Removed to avoid data duplication"}
+
+
         outputs.append(LookupResult(curie=doc.get("curie", ""),
                            label=doc.get("preferred_name", ""),
                            highlighting={
@@ -509,7 +546,9 @@ async def lookup(string: str,
                            score=doc.get("score", ""),
                            taxa=doc.get("taxa", []),
                            clique_identifier_count=doc.get("clique_identifier_count", 0),
-                           types=[f"biolink:{d}" for d in doc.get("types", [])]))
+                           types=[f"biolink:{d}" for d in doc.get("types", [])],
+                           explain=explain_for_this_doc,
+                           debug=debug_for_this_request))
 
     return outputs
 
@@ -570,6 +609,10 @@ class NameResQuery(BaseModel):
         # We can't use `example` here because otherwise it gets filled in when filling this in.
         # example="NCBITaxon:9606|NCBITaxon:10090|NCBITaxon:10116|NCBITaxon:7955"
     )
+    debug: Optional[DebugOptions] = Field(
+        'none',
+        description="Provide debugging information on the Solr query as per https://solr.apache.org/guide/solr/latest/query-guide/common-query-parameters.html#debug-parameter"
+    )
 
 
 @app.post("/bulk-lookup",
@@ -590,7 +633,8 @@ async def bulk_lookup(query: NameResQuery) -> Dict[str, List[LookupResult]]:
             query.biolink_types,
             query.only_prefixes,
             query.exclude_prefixes,
-            query.only_taxa)
+            query.only_taxa,
+            query.debug)
     return result
 
 

diff --git a/documentation/API.md b/documentation/API.md
@@ -0,0 +1,153 @@
+# Name Resolver API
+
+The Name Resolver API is intended to provide an [Apache Solr](https://solr.apache.org/)-based interface to the
+[Babel](https://github.com/NCATSTranslator/Babel) cliques of equivalent identifiers. Apache Solr is a document-based search engine:
+the documents in this case are descriptions of cliques as generated by the
+[Babel](https://github.com/NCATSTranslator/Babel) pipeline in its
+[Synonyms format](https://github.com/NCATSTranslator/Babel/blob/master/docs/DataFormats.md#synonym-files), including lists of all known synonyms.
+Here is an example document for [NCBIGene:1756](https://name-resolution-sri.renci.org/synonyms?preferred_curies=NCBIGene%3A1756)
+(compared with the same CURIE [on NodeNorm](https://nodenormalization-sri.renci.org/get_normalized_nodes?curie=NCBIGene:1756)):
+
+```json
+{
+    "curie": "NCBIGene:1756",
+    "preferred_name": "DMD",
+    "names": [
+      "BMD",
+      "DMD",
+      "MRX85",
+      "CMD3B",
+      "DXS164",
+      "DXS270",
+      "DXS142",
+      "DXS268",
+      "DXS206",
+      "DXS272",
+      "DXS269",
+      "DXS239",
+      "DXS230",
+      "DMD Gene",
+      "DMD gene",
+      "DYSTROPHIN",
+      "dystrophin",
+      "APO-DYSTROPHIN 1",
+      "mutant dystrophin",
+      "mental retardation, X-linked 85",
+      "muscular dystrophy, Duchenne and Becker types",
+      "Dystrophin (Muscular Dystrophy, Duchenne And Becker Types) Gene",
+      "dystrophin (muscular dystrophy, Duchenne and Becker types), includes DXS142, DXS164, DXS206, DXS230, DXS239, DXS268, DXS269, DXS270, DXS272",
+      "A0A087WV90_HUMAN Dystrophin (trembl)",
+      "A0A0S2Z3B5_HUMAN Dystrophin isoform 2 (trembl)",
+      "A0A0S2Z3J7_HUMAN Dystrophin isoform 1 (Fragment) (trembl)",
+      "A0A5H1ZRP9_HUMAN Dystrophin (trembl)",
+      "A0A5H1ZRQ1_HUMAN Dystrophin (trembl)",
+      "A0A5H1ZRQ8_HUMAN Dystrophin (trembl)",
+      "A0A5H1ZRR9_HUMAN Dystrophin (trembl)",
+      "A0A804HKY9_HUMAN Dystrophin (trembl)",
+      "A7E212_HUMAN Dystrophin (trembl)",
+      "hDMD",
+      "Dystrophin",
+      "DMD protein, human",
+      "dystrophin (human)",
+      "Dp116 protein, human",
+      "DMD_HUMAN Dystrophin (sprot)",
+      "dystrophin (muscular dystrophy, Duchenne and Becker types) protein, human",
+      "Q16484_HUMAN DMD protein (Fragment) (trembl)",
+      "Q4G0X0_HUMAN DMD protein (trembl)"
+    ],
+    "types": [
+      "Gene",
+      "GeneOrGeneProduct",
+      "GenomicEntity",
+      "ChemicalEntityOrGeneOrGeneProduct",
+      "PhysicalEssence",
+      "OntologyClass",
+      "BiologicalEntity",
+      "ThingWithTaxon",
+      "NamedThing",
+      "Entity",
+      "PhysicalEssenceOrOccurrent",
+      "MacromolecularMachineMixin",
+      "Protein",
+      "GeneProductMixin",
+      "Polypeptide",
+      "ChemicalEntityOrProteinOrPolypeptide"
+    ],
+    "shortest_name_length": 3,
+    "clique_identifier_count": 22,
+    "taxa": [
+      "NCBITaxon:9606"
+    ],
+    "curie_suffix": 1756,
+    "id": "fd3cbf13-1aa7-4538-9df4-11cb80493295",
+    "_version_": 1842436833304117200
+  }
+```
+
+The Name Resolver largely consists of two [search endpoints](#search-endpoints): `/lookup` (to search for normalized concepts)
+and `/bulk-lookup` (to search for multiple normalized concepts), and one [lookup endpoint](#lookup-endpoints):
+`/synonyms` (to look up the synonyms for a normalized CURIE).
+
+## Conflation
+
+Unlike the Node Normalizer, the Name Resolution Service does not currently support on-the-fly conflation. Instead,
+all the [Babel conflations](https://github.com/NCATSTranslator/Babel/blob/master/docs/Conflation.md) are turned on
+when the Solr database is built. This means that -- for example -- protein-encoding genes will include the synonyms found
+for the protein they encode, and that no separate entry will be available for those proteins.
+
+## Scoring
+
+Every `/lookup` or `/bulk-lookup` search result returns a search score. This score value is calculated by Apache Solr
+and does not have an upper bound. For every term in the query and every document in the result, Solr will calculate a
+[TF*IDF score](https://en.wikipedia.org/wiki/Tf%E2%80%93idf) by multiplying:
+* The term frequency: the relative frequency of the term in the document. Solr uses the equation `freq / (freq + k1 * (1 - b + b * dl / avgdl))`,
+  where freq = number of occurrences of the term within this document, k1 = term saturation parameter, b = length normalization parameter,
+  dl = length of field and avgdl = average length of field.
+* The inverse document frequency: a measure of how rare this term is among all documents. Solr uses the equation
+  `log(1 + (N - n + 0.5) / (n + 0.5))`, where N = total number of documents with this field, and n = number of documents
+  containing the term.
+
+If multiple terms are matched in the same document, the sum of the scores for each term will be used.
+
+The TF*IDF score will be multiplied by [several boosts](https://github.com/NCATSTranslator/NameResolution/blob/56e2151bb9e6fd120644cebdf4ff45b3bc47da05/api/server.py#L436-L461)
+that depend on four factors:
+* We index two fields: the "preferred name" of every clique and the "synonyms" of every clique. The [preferred name
+  is chosen by Babel](https://github.com/NCATSTranslator/Babel?tab=readme-ov-file#how-does-babel-choose-a-preferred-label-for-a-clique),
+  while the synonyms are collected from all the different Babel sources.
+* We set up two indexes: a [StandardTokenizer](https://solr.apache.org/guide/solr/latest/indexing-guide/tokenizers.html#standard-tokenizer)
+  that splits the field into tokens at whitespace and punctuation characters, and a
+  [KeywordTokenizer](https://solr.apache.org/guide/solr/latest/indexing-guide/tokenizers.html#keyword-tokenizer) that
+  treats the entire field as a single token.
+* We use the [Query Fields (qf)](https://solr.apache.org/guide/solr/latest/query-guide/dismax-query-parser.html#qf-query-fields-parameter)
+  field to search for the tokens in the index, but we also use the [Phrase Fields (pf)](https://solr.apache.org/guide/solr/latest/query-guide/edismax-query-parser.html#extended-dismax-parameters)
+  field to additionally boost search results where all the tokens are found in close proximity.
+  (NOTE: this might be removed soon.)
+* We use the number of identifiers in the clique as a measure of how widely used a clique is. Since some cliques
+  share the same preferred name or label, we can use this to promote the clique most likely to be useful.
+
+We combine these factors in the following way for standard query matches:
+
+|                          | Preferred name match | Synonym match |
+|--------------------------|----------------------|---------------|
+| Keyword Tokenizer index  | 250x                 | 100x          |
+| StandardTokenizer index  | 25x                  | 10x           |
+
+And provide additional boosts for phrase matches, boosting synonym matches more than preferred name matches:
+
+|                          | Preferred name match | Synonym match |
+|--------------------------|----------------------|---------------|
+| Keyword Tokenizer index  | 300x                 | 200x          |
+| StandardTokenizer index  | 30x                  | 20x           |
+
+Finally, we multiply the total score by the (base 10) logarithm of the number of identifiers in the clique plus one.
+This boost ranges from log(2) = 0.3 for a clique that only has a single identifier to over log(1000) = 3.
+
+## Search endpoints
+
+### `/lookup`
+
+### `/bulk-lookup`
+
+## Lookup endpoints
+
+### `/synonyms`