Skip to content
3 changes: 2 additions & 1 deletion api/resources/openapi.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@ info:
(CURIEs) from a vocabulary or ontology. The lookup is not exact, but includes partial matches.<p/>
Multiple results may be returned representing possible conceptual matches, but all of the identifiers
have been correctly normalized using the
<a href="https://github.com/TranslatorSRI/NodeNormalization">Node Normalization</a> service.'
<a href="https://github.com/TranslatorSRI/NodeNormalization">Node Normalization</a> service. You can read more
about this API on the <a href="https://github.com/NCATSTranslator/NameResolution">NameResolution GitHub repository</a>.'
license:
name: MIT
url: https://opensource.org/licenses/MIT
Expand Down
82 changes: 63 additions & 19 deletions api/server.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,14 @@
"""Biomedical entity name resolution service.

1) split the input into fragments at spaces
* The order does not matter
2) search for names including all fragments, case insensitive
3) sort by length, ascending
* The curie with the shortest match is first, etc.
* Matching names are returned first, followed by non-matching names
"""
NameResolver (NameRes) API Endpoints

Queries are mostly sent to the underlying NameRes Solr instance.
"""
import json
import logging, warnings
import logging
import warnings
import os
import re
from enum import Enum
from typing import Dict, List, Union, Annotated, Optional

from fastapi import Body, FastAPI, Query
Expand Down Expand Up @@ -102,6 +100,14 @@ async def status() -> Dict:

# ENDPOINT /reverse_lookup

class DebugOptions(str, Enum):
    """Values accepted by the `debug` parameter of the lookup endpoints.

    Apart from `none`, these mirror the options of Solr's `debug` common
    query parameter, documented at
    https://solr.apache.org/guide/solr/latest/query-guide/common-query-parameters.html#debug-parameter

    Because this enum also subclasses `str`, every member compares equal to
    its plain string value (e.g. `DebugOptions.all == "all"`).

    NOTE(review): `lookup()` tests for `debug == 'explain'`, but there is no
    `explain` member here; per the Solr documentation the per-document explain
    output is requested with `results` (or `all`) -- confirm and align.
    """

    # Not a Solr value: `lookup()` does not forward `debug` to Solr when it
    # is 'none', so no debugging information is requested.
    none = "none"
    # Debugging information about the query itself.
    query = "query"
    # Debugging information about how long the query took to process.
    timing = "timing"
    # Debugging information about how results were scored ("explain").
    results = "results"
    # All of the debugging information listed above.
    all = "all"

class Request(BaseModel):
"""Reverse-lookup request body."""
curies: List[str]
Expand Down Expand Up @@ -210,6 +216,8 @@ class LookupResult(BaseModel):
types: List[str]
score: float
clique_identifier_count: int
explain: Optional[str] # Explanation for this specific result
debug: Optional[dict] # The debug information for the entire query


@app.get("/lookup",
Expand Down Expand Up @@ -263,17 +271,22 @@ async def lookup_curies_get(
"e.g. `NCBITaxon:9606|NCBITaxon:10090|NCBITaxon:10116|NCBITaxon:7955`.",
# We can't use `example` here because otherwise it gets filled in when filling this in.
# example="NCBITaxon:9606|NCBITaxon:10090|NCBITaxon:10116|NCBITaxon:7955"
)] = None
)] = None,
debug: Annotated[Union[DebugOptions, None], Query(
description="Provide debugging information on the Solr query at https://solr.apache.org/guide/solr/latest/query-guide/common-query-parameters.html#debug-parameter"
)] = 'none'
) -> List[LookupResult]:
"""
Returns cliques with a name or synonym that contains a specified string.
"""
return await lookup(string, autocomplete, highlighting, offset, limit, biolink_type, only_prefixes, exclude_prefixes, only_taxa)
return await lookup(string, autocomplete, highlighting, offset, limit, biolink_type, only_prefixes, exclude_prefixes, only_taxa, debug)


@app.post("/lookup",
summary="Look up cliques for a fragment of a name or synonym.",
description="Returns cliques with a name or synonym that contains a specified string.",
description="Returns cliques with a name or synonym that contains a specified string. "
"You can find out more about this endpoint in the <a href="">NameRes documentation</a>."
"Note that the cliques we search through are conflated ",
response_model=List[LookupResult],
tags=["lookup"]
)
Expand Down Expand Up @@ -322,12 +335,15 @@ async def lookup_curies_post(
"e.g. `NCBITaxon:9606|NCBITaxon:10090|NCBITaxon:10116|NCBITaxon:7955`.",
# We can't use `example` here because otherwise it gets filled in when filling this in.
# example="NCBITaxon:9606|NCBITaxon:10090|NCBITaxon:10116|NCBITaxon:7955"
)] = None
)] = None,
debug: Annotated[Union[DebugOptions, None], Query(
description="Provide debugging information on the Solr query at https://solr.apache.org/guide/solr/latest/query-guide/common-query-parameters.html#debug-parameter"
)] = 'none'
) -> List[LookupResult]:
"""
Returns cliques with a name or synonym that contains a specified string.
"""
return await lookup(string, autocomplete, highlighting, offset, limit, biolink_type, only_prefixes, exclude_prefixes, only_taxa)
return await lookup(string, autocomplete, highlighting, offset, limit, biolink_type, only_prefixes, exclude_prefixes, only_taxa, debug)


async def lookup(string: str,
Expand All @@ -338,7 +354,8 @@ async def lookup(string: str,
biolink_types: List[str] = None,
only_prefixes: str = "",
exclude_prefixes: str = "",
only_taxa: str = ""
only_taxa: str = "",
debug: DebugOptions = 'none',
) -> List[LookupResult]:
"""
Returns cliques with a name or synonym that contains a specified string.
Expand Down Expand Up @@ -433,6 +450,9 @@ async def lookup(string: str,
# "hl.highlightMultiTerm": "true",
})

if debug and debug != 'none':
inner_params['debug'] = debug

params = {
"query": {
"edismax": {
Expand All @@ -459,7 +479,8 @@ async def lookup(string: str,
"fields": "*, score",
"params": inner_params,
}
logging.debug(f"Query: {json.dumps(params, indent=2)}")

print(f"Query: {json.dumps(params, indent=2)}")

query_url = f"http://{SOLR_HOST}:{SOLR_PORT}/solr/name_lookup/select"
async with httpx.AsyncClient(timeout=None) as client:
Expand All @@ -468,7 +489,12 @@ async def lookup(string: str,
LOGGER.error("Solr REST error: %s", response.text)
response.raise_for_status()
response = response.json()
logging.debug(f"Solr response: {json.dumps(response, indent=2)}")
print(f"Solr response: {json.dumps(response, indent=2)}")

# Do we have any debug.explain information?
explain_info = {}
if 'debug' in response and 'explain' in response['debug']:
explain_info = response['debug']['explain']

# Associate highlighting information with search results.
highlighting_response = response.get("highlighting", {})
Expand Down Expand Up @@ -499,6 +525,17 @@ async def lookup(string: str,
# Solr sometimes returns duplicates or a blank string here?
synonym_matches = list(filter(lambda s: s, set(synonym_matches)))

# Prepare debugging and explain information for this request.
debug_for_this_request = response.get('debug', None)
explain_for_this_doc = None
if debug == 'explain' or debug == 'all':
if doc['id'] in explain_info:
explain_for_this_doc = explain_info[doc['id']]

# If we have explain information, we don't need to also include it in the debugging information.
debug_for_this_request['explain'] = {"_comment": "Removed to avoid data duplication"}


outputs.append(LookupResult(curie=doc.get("curie", ""),
label=doc.get("preferred_name", ""),
highlighting={
Expand All @@ -509,7 +546,9 @@ async def lookup(string: str,
score=doc.get("score", ""),
taxa=doc.get("taxa", []),
clique_identifier_count=doc.get("clique_identifier_count", 0),
types=[f"biolink:{d}" for d in doc.get("types", [])]))
types=[f"biolink:{d}" for d in doc.get("types", [])],
explain=explain_for_this_doc,
debug=debug_for_this_request))

return outputs

Expand Down Expand Up @@ -570,6 +609,10 @@ class NameResQuery(BaseModel):
# We can't use `example` here because otherwise it gets filled in when filling this in.
# example="NCBITaxon:9606|NCBITaxon:10090|NCBITaxon:10116|NCBITaxon:7955"
)
debug: Optional[DebugOptions] = Field(
'none',
description="Provide debugging information on the Solr query as per https://solr.apache.org/guide/solr/latest/query-guide/common-query-parameters.html#debug-parameter"
)


@app.post("/bulk-lookup",
Expand All @@ -590,7 +633,8 @@ async def bulk_lookup(query: NameResQuery) -> Dict[str, List[LookupResult]]:
query.biolink_types,
query.only_prefixes,
query.exclude_prefixes,
query.only_taxa)
query.only_taxa,
query.debug)
return result


Expand Down
153 changes: 153 additions & 0 deletions documentation/API.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
# Name Resolver API

The Name Resolver API is intended to provide an [Apache Solr](https://solr.apache.org/)-based interface to the
[Babel](https://github.com/NCATSTranslator/Babel) cliques of equivalent identifiers. Apache Solr is a document-based search engine:
the documents in this case are descriptions of cliques as generated by the
[Babel](https://github.com/NCATSTranslator/Babel) pipeline in its
[Synonyms format](https://github.com/NCATSTranslator/Babel/blob/master/docs/DataFormats.md#synonym-files), including lists of all known synonyms.
Here is an example document for [NCBIGene:1756](https://name-resolution-sri.renci.org/synonyms?preferred_curies=NCBIGene%3A1756)
(compared with the same CURIE [on NodeNorm](https://nodenormalization-sri.renci.org/get_normalized_nodes?curie=NCBIGene:1756)):

```json
{
"curie": "NCBIGene:1756",
"preferred_name": "DMD",
"names": [
"BMD",
"DMD",
"MRX85",
"CMD3B",
"DXS164",
"DXS270",
"DXS142",
"DXS268",
"DXS206",
"DXS272",
"DXS269",
"DXS239",
"DXS230",
"DMD Gene",
"DMD gene",
"DYSTROPHIN",
"dystrophin",
"APO-DYSTROPHIN 1",
"mutant dystrophin",
"mental retardation, X-linked 85",
"muscular dystrophy, Duchenne and Becker types",
"Dystrophin (Muscular Dystrophy, Duchenne And Becker Types) Gene",
"dystrophin (muscular dystrophy, Duchenne and Becker types), includes DXS142, DXS164, DXS206, DXS230, DXS239, DXS268, DXS269, DXS270, DXS272",
"A0A087WV90_HUMAN Dystrophin (trembl)",
"A0A0S2Z3B5_HUMAN Dystrophin isoform 2 (trembl)",
"A0A0S2Z3J7_HUMAN Dystrophin isoform 1 (Fragment) (trembl)",
"A0A5H1ZRP9_HUMAN Dystrophin (trembl)",
"A0A5H1ZRQ1_HUMAN Dystrophin (trembl)",
"A0A5H1ZRQ8_HUMAN Dystrophin (trembl)",
"A0A5H1ZRR9_HUMAN Dystrophin (trembl)",
"A0A804HKY9_HUMAN Dystrophin (trembl)",
"A7E212_HUMAN Dystrophin (trembl)",
"hDMD",
"Dystrophin",
"DMD protein, human",
"dystrophin (human)",
"Dp116 protein, human",
"DMD_HUMAN Dystrophin (sprot)",
"dystrophin (muscular dystrophy, Duchenne and Becker types) protein, human",
"Q16484_HUMAN DMD protein (Fragment) (trembl)",
"Q4G0X0_HUMAN DMD protein (trembl)"
],
"types": [
"Gene",
"GeneOrGeneProduct",
"GenomicEntity",
"ChemicalEntityOrGeneOrGeneProduct",
"PhysicalEssence",
"OntologyClass",
"BiologicalEntity",
"ThingWithTaxon",
"NamedThing",
"Entity",
"PhysicalEssenceOrOccurrent",
"MacromolecularMachineMixin",
"Protein",
"GeneProductMixin",
"Polypeptide",
"ChemicalEntityOrProteinOrPolypeptide"
],
"shortest_name_length": 3,
"clique_identifier_count": 22,
"taxa": [
"NCBITaxon:9606"
],
"curie_suffix": 1756,
"id": "fd3cbf13-1aa7-4538-9df4-11cb80493295",
"_version_": 1842436833304117200
}
```

The Name Resolver largely consists of two [search endpoints](#search-endpoints): `/lookup` (to search for normalized concepts)
and `/bulk-lookup` (to search for multiple normalized concepts), and one [lookup endpoint](#lookup-endpoints):
`/synonyms` (to look up the synonyms for a normalized CURIE).

## Conflation

Unlike the Node Normalizer, the Name Resolution Service does not currently support on-the-fly conflation. Instead,
all the [Babel conflations](https://github.com/NCATSTranslator/Babel/blob/master/docs/Conflation.md) are turned on
when the Solr database is built. This means that -- for example -- protein-encoding genes will include the synonyms found
for the protein they encode, and that no separate entry will be available for those proteins.

## Scoring

Every `/lookup` or `/bulk-lookup` search result returns a search score. This score value is calculated by Apache Solr
and does not have an upper bound. For every term in the query and every document in the result, Solr will calculate a
[TF*IDF score](https://en.wikipedia.org/wiki/Tf%E2%80%93idf) by multiplying:
* The term frequency: the relative frequency of the term in the document. Solr uses the equation `freq / (freq + k1 * (1 - b + b * dl / avgdl))`,
where freq = number of occurrences of terms within this document, k1 = term saturation parameter, b = length normalization parameter,
dl = length of field and avgdl = average length of field.
* The inverse document frequency: a measure of how rare this term is among all documents. Solr uses the equation
`log(1 + (N - n + 0.5) / (n + 0.5))`, where N = total number of documents with this field, and n = number of documents
containing the term.

If multiple terms are matched in the same document, the sum of the scores for each term will be used.

The TF*IDF score will be multiplied by [several boosts](https://github.com/NCATSTranslator/NameResolution/blob/56e2151bb9e6fd120644cebdf4ff45b3bc47da05/api/server.py#L436-L461)
that depend on four factors:
* We index two fields: the "preferred name" of every clique and the "synonyms" of every clique. The [preferred name
is chosen by Babel](https://github.com/NCATSTranslator/Babel?tab=readme-ov-file#how-does-babel-choose-a-preferred-label-for-a-clique),
while the synonyms are collected from all the different Babel sources.
* We set up two indexes: a [StandardTokenizer](https://solr.apache.org/guide/solr/latest/indexing-guide/tokenizers.html#standard-tokenizer)
that splits the field into tokens at whitespace and punctuation characters, and a
[KeywordTokenizer](https://solr.apache.org/guide/solr/latest/indexing-guide/tokenizers.html#keyword-tokenizer) that
treats the entire field as a single token.
* We use the [Query Fields (qf)](https://solr.apache.org/guide/solr/latest/query-guide/dismax-query-parser.html#qf-query-fields-parameter)
field to search for the tokens in the index, but we also use the [Phrase Fields (pf)](https://solr.apache.org/guide/solr/latest/query-guide/edismax-query-parser.html#extended-dismax-parameters)
field to additionally boost search results where all the tokens are found in close proximity.
(NOTE: this might be removed soon.)
* We use the number of identifiers in the clique as a measure of how widely used a clique is. Since some cliques
share the same preferred name or label, we can use this to promote the clique most likely to be useful.

We combine these factors as follows for standard query matches:

| | Preferred name match | Synonym match |
|--------------------------|----------------------|---------------|
| KeywordTokenizer index   | 250x                 | 100x          |
| StandardTokenizer index | 25x | 10x |

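We also provide additional boosts for phrase matches. Compared with the standard boosts above, the phrase boost for synonym matches is proportionally larger (200x vs. 100x) than the one for preferred name matches (300x vs. 250x):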

| | Preferred name match | Synonym match |
|--------------------------|----------------------|---------------|
| KeywordTokenizer index   | 300x                 | 200x          |
| StandardTokenizer index | 30x | 20x |

Finally, we multiply the total score by the (base 10) logarithm of the number of identifiers in the clique plus one.
This boost ranges from log(2) = 0.3 for a clique that only has a single identifier to over log(1000) = 3.

## Search endpoints

### `/lookup`

### `/bulk-lookup`

## Lookup endpoints

### `/synonyms`
Loading