From 11ecef0681c635417d56b9a838d2fd59d79a17c6 Mon Sep 17 00:00:00 2001 From: Mocretion Date: Thu, 18 Jun 2026 19:21:16 +0200 Subject: [PATCH] Taxon resolver --- duui-taxon-resolver/.gitignore | 11 + duui-taxon-resolver/DOCKERFILE | 21 ++ duui-taxon-resolver/docker_build.sh | 29 ++ duui-taxon-resolver/pom.xml | 94 +++++ duui-taxon-resolver/requirements.txt | 14 + .../src/main/lua/communication_layer.lua | 339 ++++++++++++++++++ .../src/main/python/gbif_api.py | 111 ++++++ .../src/main/python/ncbi_api.py | 108 ++++++ .../src/main/python/shared_taxon.py | 60 ++++ .../src/main/python/taxon-resolver.py | 179 +++++++++ .../src/main/python/taxref_loader.py | 209 +++++++++++ .../src/main/resources/typesystem.xml | 282 +++++++++++++++ .../texttechnologylab/TaxonResolverTest.java | 276 ++++++++++++++ 13 files changed, 1733 insertions(+) create mode 100644 duui-taxon-resolver/.gitignore create mode 100644 duui-taxon-resolver/DOCKERFILE create mode 100644 duui-taxon-resolver/docker_build.sh create mode 100644 duui-taxon-resolver/pom.xml create mode 100644 duui-taxon-resolver/requirements.txt create mode 100644 duui-taxon-resolver/src/main/lua/communication_layer.lua create mode 100644 duui-taxon-resolver/src/main/python/gbif_api.py create mode 100644 duui-taxon-resolver/src/main/python/ncbi_api.py create mode 100644 duui-taxon-resolver/src/main/python/shared_taxon.py create mode 100644 duui-taxon-resolver/src/main/python/taxon-resolver.py create mode 100644 duui-taxon-resolver/src/main/python/taxref_loader.py create mode 100644 duui-taxon-resolver/src/main/resources/typesystem.xml create mode 100644 duui-taxon-resolver/src/test/java/org/texttechnologylab/TaxonResolverTest.java diff --git a/duui-taxon-resolver/.gitignore b/duui-taxon-resolver/.gitignore new file mode 100644 index 0000000..fc03531 --- /dev/null +++ b/duui-taxon-resolver/.gitignore @@ -0,0 +1,11 @@ +### IDE Files ### +.idea/ +.vscode/ + +### Java Environment ### +target/ + +### Python Environment ### 
+__pycache__/
+*.pyc
+.venv/
diff --git a/duui-taxon-resolver/DOCKERFILE b/duui-taxon-resolver/DOCKERFILE
new file mode 100644
index 0000000..08473f5
--- /dev/null
+++ b/duui-taxon-resolver/DOCKERFILE
@@ -0,0 +1,30 @@
+FROM python:3.12.3
+
+WORKDIR /app
+
+RUN pip install --upgrade pip
+
+COPY requirements.txt .
+RUN pip install -r requirements.txt
+
+COPY src/main/python/*.py ./
+COPY src/main/lua/communication_layer.lua ./
+COPY src/main/resources/typesystem.xml ./
+
+# Preload the backbone data to avoid doing it at runtime when the first request arrives
+RUN python -c "import taxref_loader; taxref_loader.preload_backbone()"
+
+ENV TAXON_RESOLVER_EXECUTION_MODE=production
+
+# Build-time overrides passed by docker_build.sh via --build-arg. They are declared
+# after the expensive layers so that changing them does not invalidate the cache.
+ARG TAXON_RESOLVER_ANNOTATOR_NAME=duui-taxon-resolver
+ARG TAXON_RESOLVER_ANNOTATOR_VERSION=1.0.0
+ARG TAXON_RESOLVER_LOG_LEVEL=INFO
+ENV TAXON_RESOLVER_ANNOTATOR_NAME=${TAXON_RESOLVER_ANNOTATOR_NAME} \
+    TAXON_RESOLVER_ANNOTATOR_VERSION=${TAXON_RESOLVER_ANNOTATOR_VERSION} \
+    TAXON_RESOLVER_LOG_LEVEL=${TAXON_RESOLVER_LOG_LEVEL}
+
+EXPOSE 9714
+ENTRYPOINT ["uvicorn", "taxon-resolver:app", "--host", "0.0.0.0", "--port", "9714"]
+CMD ["--workers", "1"]
diff --git a/duui-taxon-resolver/docker_build.sh b/duui-taxon-resolver/docker_build.sh
new file mode 100644
index 0000000..43495a7
--- /dev/null
+++ b/duui-taxon-resolver/docker_build.sh
@@ -0,0 +1,31 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# set default values for build args if not provided
+ANNOTATOR_NAME="${ANNOTATOR_NAME:-duui-taxon-resolver}"
+ANNOTATOR_VERSION="${ANNOTATOR_VERSION:-1.0.0}"
+LOG_LEVEL="${LOG_LEVEL:-INFO}"
+
+# Check if BUILD_TOOL is set, otherwise check for podman or docker
+if [ -n "${BUILD_TOOL:-}" ]; then
+    echo "⚙️ Using build tool: ${BUILD_TOOL}"
+# Test if docker is available and can be used
+elif (command -v docker > /dev/null 2>&1;) && (docker info > /dev/null 2>&1;) then
+    BUILD_TOOL="docker"
+    echo "⚙️ Using Docker as build tool"
+elif (command -v podman > /dev/null 2>&1;) && (podman info > /dev/null 2>&1;) then
+    BUILD_TOOL="podman"
+    echo "⚙️ Using Podman as build tool"
+else
+    echo "❌ Error: No build tool found or permissions missing. Please install Docker or Podman and ensure you have permission to run it."
+    exit 1
+fi
+
+# --build-arg works with both docker and podman (docker build has no --env); "." is the build context
+${BUILD_TOOL} build \
+    --build-arg TAXON_RESOLVER_ANNOTATOR_NAME="${ANNOTATOR_NAME}" \
+    --build-arg TAXON_RESOLVER_ANNOTATOR_VERSION="${ANNOTATOR_VERSION}" \
+    --build-arg TAXON_RESOLVER_LOG_LEVEL="${LOG_LEVEL}" \
+    -t "${ANNOTATOR_NAME}:${ANNOTATOR_VERSION}" \
+    -f DOCKERFILE \
+    .
diff --git a/duui-taxon-resolver/pom.xml b/duui-taxon-resolver/pom.xml
new file mode 100644
index 0000000..bbe6b9d
--- /dev/null
+++ b/duui-taxon-resolver/pom.xml
@@ -0,0 +1,94 @@
+ + 4.0.0 + + org.texttechnologylab + taxon-resolver + 1.0-SNAPSHOT + + + 21 + 21 + UTF-8 + UTF-8 + 1.5.5 + 3.0.14 + + + + + + org.apache.maven.plugins + maven-compiler-plugin + + true + + + + org.apache.maven.plugins + maven-surefire-plugin + 2.22.0 + + + --illegal-access=permit + --add-opens java.base/java.util=ALL-UNNAMED + + + + + org.apache.maven.plugins + maven-failsafe-plugin + 2.22.0 + + + --illegal-access=permit + --add-opens java.base/java.util=ALL-UNNAMED + + + + + + + + + jitpack.io + https://jitpack.io + + + + + com.github.texttechnologylab + DockerUnifiedUIMAInterface + ${ttlab.duui.version} + + + com.github.texttechnologylab + UIMATypeSystem + + + + + + com.github.texttechnologylab + UIMATypeSystem + ${ttlab.typesystem.version} + + + + org.junit.jupiter + junit-jupiter + 6.1.0-M1 + test + + + + org.dkpro.core + dkpro-core-io-xmi-asl + 2.4.0 + test + + + +
\ No newline at end of file
diff --git a/duui-taxon-resolver/requirements.txt b/duui-taxon-resolver/requirements.txt
new file mode 100644
index 0000000..bbb6961
--- /dev/null
+++ b/duui-taxon-resolver/requirements.txt
@@ -0,0 +1,14 @@
+annotated-types==0.7.0
+anyio==3.7.1
+attrs==25.4.0
+fastapi==0.104.1
+ipykernel==7.2.0
+pandas==3.0.3
+pydantic==2.13.4
+pydantic-settings==2.0.3
+python-dotenv==1.2.2
+requests==2.34.2
+# Taxoniq resolves NCBI taxonomic identifiers
+taxoniq==1.0.3
+urllib3==2.6.3
+uvicorn==0.46.0
diff --git a/duui-taxon-resolver/src/main/lua/communication_layer.lua b/duui-taxon-resolver/src/main/lua/communication_layer.lua
new
file mode 100644
index 0000000..ded8396
--- /dev/null
+++ b/duui-taxon-resolver/src/main/lua/communication_layer.lua
@@ -0,0 +1,157 @@
+-- Bind static classes from java
+StandardCharsets = luajava.bindClass("java.nio.charset.StandardCharsets")
+JCasUtil = luajava.bindClass("org.apache.uima.fit.util.JCasUtil")
+Class = luajava.bindClass("java.lang.Class")
+AnnotationCommentClass = Class:forName("org.texttechnologylab.annotation.AnnotationComment")
+TaxonClassBio = Class:forName("org.texttechnologylab.annotation.biofid.Taxon")
+TaxonClass = Class:forName("org.texttechnologylab.annotation.type.Taxon")
+
+-- Taxonomic ranks of a resolution. Each rank <r> arrives as the JSON properties
+-- "<r>_name" / "<r>_key" and is stored through set<R>Name / set<R>Id.
+local RANKS = {
+    "kingdom", "phylum", "class", "order", "superfamily", "family", "subfamily",
+    "tribe", "subtribe", "genus", "subgenus", "species", "parent",
+}
+
+-- Optional JSON properties that are stored through the setter named after them
+-- (e.g. "scientific_name" -> setScientificName) and skipped when absent.
+local OPTIONAL_PROPERTIES = {
+    "scientific_name", "canonical_name", "vernacular_name", "accepted_name_usage",
+    "authorship", "name_type", "rank", "origin", "taxonomic_status", "remarks",
+    "references", "published_in", "last_crawled", "last_interpreted",
+    "species_epithet", "infraspecific_epithet", "cultivar_epithet",
+    "url", "wikidata_id", "wikidata_url",
+}
+
+-- Convert a snake_case property name to PascalCase ("name_type" -> "NameType").
+local function toPascalCase(snake_case)
+    local camel_case = snake_case:gsub("_(%l)", string.upper)
+    return (camel_case:gsub("^%l", string.upper))
+end
+
+-- Call the Java setter named `setter` on `target` with `value`, falling back to
+-- `default` when `value` is nil; if both are nil the setter is not called at all.
+local function setFeature(target, setter, value, default)
+    if value == nil then
+        value = default
+    end
+    if value ~= nil then
+        target[setter](target, value)
+    end
+end
+
+-- Return the AnnotationComments of `view` whose reference is a taxon annotation.
+-- If there are none, one "linking" comment is created and indexed per biofid taxon.
+function selectAnnotationComments(view)
+    local annotation_comments = {}
+
+    local selection_iterator = JCasUtil:select(view, AnnotationCommentClass):iterator()
+    while selection_iterator:hasNext() do
+        local annotation_comment = selection_iterator:next()
+        local ref = annotation_comment:getReference()
+        if (TaxonClassBio:isInstance(ref) or TaxonClass:isInstance(ref)) then
+            table.insert(annotation_comments, annotation_comment)
+        end
+    end
+
+    -- Tools such as GNFinder do not add AnnotationComments, but only link the taxon to the recognized taxon.
+    -- In this case, we create an AnnotationComment for each taxon, so that the linking information can be stored in the value of the comment.
+    if #annotation_comments == 0 then
+        local biofid_taxon_iter = JCasUtil:select(view, TaxonClassBio):iterator()
+        while biofid_taxon_iter:hasNext() do
+            local biofid_taxon = biofid_taxon_iter:next()
+            local cID = luajava.newInstance("org.texttechnologylab.annotation.AnnotationComment", view)
+
+            cID:setKey("linking")
+            cID:setValue(biofid_taxon:getIdentifier())
+            cID:setReference(biofid_taxon)
+            cID:addToIndexes()
+
+            table.insert(annotation_comments, cID)
+        end
+    end
+
+    return annotation_comments
+end
+
+-- Write the request JSON: the document text plus one entry per distinct taxon
+-- span, carrying the values of all "linking" comments that reference that span.
+function serialize(inputCas, outputStream, parameters)
+    local document_text = inputCas:getDocumentText()
+
+    local taxa_by_span = {}
+    local recognized_taxa_list = {}
+    for _, annotation_comment in ipairs(selectAnnotationComments(inputCas)) do
+        local taxon = annotation_comment:getReference()
+        local begin = taxon:getBegin()
+        local end_ = taxon:getEnd()
+        -- Key by the full span: taxa that merely start at the same offset must stay separate
+        local span_key = begin .. ":" .. end_
+        local recognized_taxon = taxa_by_span[span_key]
+        if recognized_taxon == nil then
+            recognized_taxon = { linkings = {} }
+            recognized_taxon["begin"] = begin
+            recognized_taxon["end"] = end_
+            taxa_by_span[span_key] = recognized_taxon
+            -- Append on first sight so the output order is deterministic
+            table.insert(recognized_taxa_list, recognized_taxon)
+        end
+        if annotation_comment:getKey() == "linking" then
+            table.insert(recognized_taxon.linkings, annotation_comment:getValue())
+        end
+    end
+
+    outputStream:write(json.encode({
+        taxa = recognized_taxa_list,
+        document_text = document_text
+    }))
+end
+
+-- Copy the properties of one resolved linking onto a TaxonResolution annotation.
+-- Missing ids and num_descendants are stored as -1; other missing values are left unset.
+function populateTaxonResolution(taxon_resolution, properties)
+    taxon_resolution:setProvider(properties["provider"])
+    taxon_resolution:setTaxonId(properties["taxon_id"])
+
+    for _, rank in ipairs(RANKS) do
+        local setter_prefix = "set" .. toPascalCase(rank)
+        setFeature(taxon_resolution, setter_prefix .. "Name", properties[rank .. "_name"])
+        setFeature(taxon_resolution, setter_prefix .. "Id", properties[rank .. "_key"], -1)
+    end
+
+    for _, property in ipairs(OPTIONAL_PROPERTIES) do
+        setFeature(taxon_resolution, "set" .. toPascalCase(property), properties[property])
+    end
+    setFeature(taxon_resolution, "setNumDescendants", properties["num_descendants"], -1)
+end
+
+-- Read the response JSON and, for every returned taxon, add one RecognizedTaxon
+-- plus one TaxonResolution per resolved linking to the CAS.
+function deserialize(inputCas, inputStream)
+    local input_string = luajava.newInstance("java.lang.String", inputStream:readAllBytes(), StandardCharsets.UTF_8)
+    local results = json.decode(input_string)
+
+    for _, taxon in ipairs(results["taxa"] or {}) do
+        local begin = taxon["begin"]
+        local end_ = taxon["end"]
+        local recognized_taxon = luajava.newInstance("org.texttechnologylab.annotation.type.RecognizedTaxon", inputCas)
+        recognized_taxon:setBegin(begin)
+        recognized_taxon:setEnd(end_)
+        recognized_taxon:setText(taxon["text"])
+        -- A taxon without resolutions gets an empty array instead of aborting the whole document
+        local linkings = taxon["resolved_linkings"] or {}
+        local resolutions = luajava.newInstance("org.apache.uima.jcas.cas.FSArray", inputCas, #linkings)
+        recognized_taxon:setResolutions(resolutions)
+        recognized_taxon:addToIndexes()
+
+        for i, linking in ipairs(linkings) do
+            local taxon_resolution = luajava.newInstance("org.texttechnologylab.annotation.type.TaxonResolution", inputCas)
+            taxon_resolution:setBegin(begin)
+            taxon_resolution:setEnd(end_)
+            taxon_resolution:setRecognizedTaxon(recognized_taxon)
+            populateTaxonResolution(taxon_resolution, linking)
+            taxon_resolution:addToIndexes()
+            resolutions:set(i - 1, taxon_resolution)
+        end
+    end
+end
diff --git a/duui-taxon-resolver/src/main/python/gbif_api.py b/duui-taxon-resolver/src/main/python/gbif_api.py
new file
mode 100644
index 0000000..8a3352f
--- /dev/null
+++ b/duui-taxon-resolver/src/main/python/gbif_api.py
@@ -0,0 +1,126 @@
+from typing import override
+
+from pydantic import BaseModel, Field
+import requests
+from shared_taxon import SharedTaxon, TaxonBase
+
+base_api_url = "https://api.gbif.org/v1"
+
+
+class GbifTaxon(BaseModel, TaxonBase):
+    """The fields of a GBIF ``/species/{key}`` response that the resolver uses."""
+
+    key: int
+    taxon_id: str = Field(alias="taxonID")
+    kingdom: str | None = Field(default=None)
+    phylum: str | None = Field(default=None)
+    order: str | None = Field(default=None)
+    family: str | None = Field(default=None)
+    genus: str | None = Field(default=None)
+    species: str | None = Field(default=None)
+    kingdom_key: int | None = Field(alias="kingdomKey", default=None)
+    phylum_key: int | None = Field(alias="phylumKey", default=None)
+    order_key: int | None = Field(alias="orderKey", default=None)
+    family_key: int | None = Field(alias="familyKey", default=None)
+    genus_key: int | None = Field(alias="genusKey", default=None)
+    species_key: int | None = Field(alias="speciesKey", default=None)
+    parent_key: int | None = Field(alias="parentKey", default=None)
+    parent: str | None = Field(default=None)
+    scientific_name: str | None = Field(alias="scientificName", default=None)
+    canonical_name: str | None = Field(alias="canonicalName", default=None)
+    vernacular_name: str | None = Field(alias="vernacularName", default=None)
+    authorship: str | None = Field(default=None)
+    name_type: str | None = Field(alias="nameType", default=None)
+    rank: str
+    origin: str | None = Field(default=None)
+    taxonomic_status: str | None = Field(alias="taxonomicStatus", default=None)
+    remarks: str | None = Field(default=None)
+    published_in: str | None = Field(alias="publishedIn", default=None)
+    num_descendants: int | None = Field(alias="numDescendants", default=None)
+    last_crawled: str | None = Field(alias="lastCrawled", default=None)
+    last_interpreted: str | None = Field(alias="lastInterpreted", default=None)
+    issues: list[str] = Field(default_factory=list)
+    class_: str | None = Field(alias="class", default=None)
+    class_key: int | None = Field(alias="classKey", default=None)
+
+    @property
+    def raw_taxon_id(self) -> int:
+        """The integer after the last ``:`` of ``taxon_id``."""
+        return int(self.taxon_id.split(":")[-1])
+
+    @override
+    def as_shared(self) -> SharedTaxon:
+        """Map the GBIF field names onto the provider-independent ``SharedTaxon``."""
+        return SharedTaxon(
+            provider="gbif",
+            taxon_id=self.raw_taxon_id,
+            kingdom_name=self.kingdom,
+            kingdom_key=self.kingdom_key,
+            phylum_name=self.phylum,
+            phylum_key=self.phylum_key,
+            class_name=self.class_,
+            class_key=self.class_key,
+            order_name=self.order,
+            order_key=self.order_key,
+            family_name=self.family,
+            family_key=self.family_key,
+            genus_name=self.genus,
+            genus_key=self.genus_key,
+            species_name=self.species,
+            species_key=self.species_key,
+            parent_name=self.parent,
+            parent_key=self.parent_key,
+            scientific_name=self.scientific_name,
+            canonical_name=self.canonical_name,
+            vernacular_name=self.vernacular_name,
+            authorship=self.authorship,
+            name_type=self.name_type,
+            rank=self.rank,
+            origin=self.origin,
+            taxonomic_status=self.taxonomic_status,
+            remarks=self.remarks,
+            published_in=self.published_in,
+            num_descendants=self.num_descendants,
+            last_crawled=self.last_crawled,
+            last_interpreted=self.last_interpreted,
+            url=f"https://www.gbif.org/species/{self.key}",
+        )
+
+
+def get_taxon(taxon_id: int, timeout: float = 30.0) -> GbifTaxon:
+    """Fetch ``/species/{taxon_id}`` from the GBIF API and validate the body.
+
+    *timeout* is passed to ``requests.get``. Raises ``requests.HTTPError`` on an
+    error status and pydantic's ``ValidationError`` if the body does not match.
+    """
+    response = requests.get(f"{base_api_url}/species/{taxon_id}", timeout=timeout)
+    response.raise_for_status()
+    return GbifTaxon.model_validate_json(response.content)
+
+
+def main():
+    while True:
+        print("Enter taxon ID (or 'exit' to quit): ", end="")
+        user_input = input().strip()
+        if user_input.lower() == "exit":
+            break
+        try:
+            taxon_id = int(user_input)
+        except ValueError:
+            print(
+                f"Invalid input '{user_input}'. Please enter a valid integer taxon ID."
+            )
+            continue
+        try:
+            taxon = get_taxon(taxon_id)
+            print(taxon)
+        except ValueError as e:
+            print(f"Error parsing taxon data: {e}")
+        except requests.HTTPError as e:
+            print(f"HTTP error occurred: {e}")
+        except Exception as e:
+            print(f"An error occurred: {e}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/duui-taxon-resolver/src/main/python/ncbi_api.py b/duui-taxon-resolver/src/main/python/ncbi_api.py
new file mode 100644
index 0000000..c3c32a5
--- /dev/null
+++ b/duui-taxon-resolver/src/main/python/ncbi_api.py
@@ -0,0 +1,121 @@
+import taxoniq
+from typing import List, Self, override
+from shared_taxon import SharedTaxon, TaxonBase
+
+class NcbiTaxon(TaxonBase):
+    """Read-only view of a ``taxoniq.Taxon``; most accessors return None for missing values."""
+
+    handle: taxoniq.Taxon
+
+    def __init__(self, handle: taxoniq.Taxon):
+        self.handle = handle
+
+    @classmethod
+    def from_tax_id(cls, tax_id: int) -> Self:
+        """Wrap the ``taxoniq.Taxon`` constructed from *tax_id*."""
+        return cls(taxoniq.Taxon(tax_id))
+
+    @property
+    def taxon_id(self) -> int | None:
+        return self.handle.tax_id
+
+    @property
+    def rank(self) -> str | None:
+        """``name`` of the handle's rank, or None if there is no rank."""
+        try:
+            rank_member = self.handle.rank
+        except taxoniq.NoValue:
+            return None
+        return None if rank_member is None else rank_member.name
+
+    @property
+    def scientific_name(self) -> str | None:
+        try:
+            return self.handle.scientific_name
+        except taxoniq.NoValue:
+            return None
+
+    @property
+    def common_name(self) -> str | None:
+        try:
+            return self.handle.common_name
+        except taxoniq.NoValue:
+            return None
+
+    @property
+    def lineage(self) -> List[Self] | None:
+        try:
+            ancestors = self.handle.lineage
+            return [NcbiTaxon(ancestor) for ancestor in ancestors]
+        except taxoniq.NoValue:
+            return None
+
+    @property
+    def ranked_lineage(self) -> List[Self] | None:
+        try:
+            ancestors = self.handle.ranked_lineage
+            return [NcbiTaxon(ancestor) for ancestor in ancestors]
+        except taxoniq.NoValue:
+            return None
+
+    @property
+    def parent(self) -> Self | None:
+        try:
+            parent_handle = self.handle.parent
+            return NcbiTaxon(parent_handle)
+        except taxoniq.NoValue:
+            return None
+
+    @property
+    def description(self) -> str | None:
+        try:
+            return self.handle.description
+        except taxoniq.NoValue:
+            return None
+
+    @property
+    def url(self) -> str:
+        return self.handle.url
+
+    @property
+    def wikidata_id(self) -> str | None:
+        try:
+            return self.handle.wikidata_id
+        except KeyError:
+            return None
+
+    @property
+    def wikidata_url(self) -> str | None:
+        try:
+            return self.handle.wikidata_url
+        except KeyError:
+            return None
+
+    @override
+    def as_shared(self) -> SharedTaxon:
+        """Build the ``SharedTaxon`` for this taxon.
+
+        Raises:
+            ValueError: if the taxon has no id.
+        """
+        taxon_id = self.taxon_id
+        if taxon_id is None:
+            raise ValueError("Taxon ID is required to convert to SharedTaxon")
+        parent = self.parent
+        if parent is None:
+            parent_name, parent_key = None, None
+        else:
+            parent_name, parent_key = parent.scientific_name, parent.taxon_id
+        return SharedTaxon(
+            provider="ncbi",
+            taxon_id=taxon_id,
+            scientific_name=self.scientific_name,
+            vernacular_name=self.common_name,
+            parent_name=parent_name,
+            parent_key=parent_key,
+            rank=self.rank,
+            remarks=self.description,
+            url=self.url,
+            wikidata_id=self.wikidata_id,
+            wikidata_url=self.wikidata_url,
+        )
diff --git a/duui-taxon-resolver/src/main/python/shared_taxon.py b/duui-taxon-resolver/src/main/python/shared_taxon.py
new file mode 100644
index 0000000..250f7c2
--- /dev/null
+++ b/duui-taxon-resolver/src/main/python/shared_taxon.py
@@ -0,0 +1,64 @@
+from pydantic import BaseModel, Field
+from typing import Literal
+
+type TaxonProvider = Literal["gbif", "taxref", "ncbi"]
+
+class SharedTaxon(BaseModel):
+    """Provider-independent taxon record; only ``provider`` and ``taxon_id`` are required."""
+
+    provider: TaxonProvider
+    taxon_id: int
+    kingdom_name: str | None = None
+    kingdom_key: int | None = None
+    phylum_name: str | None = None
+    phylum_key: int | None = None
+    class_name: str | None = None
+    class_key: int | None = None
+    order_name: str | None = None
+    order_key: int | None = None
+    superfamily_name: str | None = None
+    superfamily_key: int | None = None
+    family_name: str | None = None
+    family_key: int | None = None
+    subfamily_name: str | None = None
+    subfamily_key: int | None = None
+    tribe_name: str | None = None
+    tribe_key: int | None = None
+    subtribe_name: str | None = None
+    subtribe_key: int | None = None
+    genus_name: str | None = None
+    genus_key: int | None = None
+    subgenus_name: str | None = None
+    subgenus_key: int | None = None
+    species_name: str | None = None
+    species_key: int | None = None
+    parent_name: str | None = None
+    parent_key: int | None = None
+    scientific_name: str | None = None
+    canonical_name: str | None = None
+    vernacular_name: str | None = None
+    accepted_name_usage: str | None = None
+    authorship: str | None = None
+    name_type: str | None = None
+    rank: str | None = None
+    origin: str | None = None
+    taxonomic_status: str | None = None
+    remarks: str | None = None
+    references: str | None = None
+    published_in: str | None = None
+    num_descendants: int | None = None
+    last_crawled: str | None = None
+    last_interpreted: str | None = None
+    species_epithet: str | None = None
+    infraspecific_epithet: str | None = None
+    cultivar_epithet: str | None = None
+    url: str | None = None
+    wikidata_id: str | None = None
+    wikidata_url: str | None = None
+
+class TaxonBase:
+    """Base for provider-specific taxon classes, which convert themselves via ``as_shared``."""
+
+    def as_shared(self) -> SharedTaxon:
+        raise NotImplementedError("Subclasses must implement as_shared method")
+
diff --git a/duui-taxon-resolver/src/main/python/taxon-resolver.py b/duui-taxon-resolver/src/main/python/taxon-resolver.py
new file
mode 100644 index 0000000..9877c7a --- /dev/null +++ b/duui-taxon-resolver/src/main/python/taxon-resolver.py @@ -0,0 +1,179 @@ +import logging +from typing import List, Literal, Self + +from fastapi import FastAPI, Request, Response +from fastapi.responses import PlainTextResponse +from pydantic import BaseModel, Field +from pydantic_settings import BaseSettings +from urllib3 import request + +import ncbi_api, gbif_api, taxref_loader +from shared_taxon import SharedTaxon, TaxonBase, TaxonProvider + +class Settings(BaseSettings): + annotator_name: str = Field("duui-taxon-resolver", env="ANNOTATOR_NAME") + annotator_version: str = Field("1.0", env="ANNOTATOR_VERSION") + log_level: str = Field("INFO", env="LOG_LEVEL") + execution_mode: Literal["development", "production"] = Field("development", env="EXECUTION_MODE") + + class Config: + env_prefix = "TAXON_RESOLVER_" + +settings = Settings() + +lua_communication_script_path: str +typesystem_path: str +if settings.execution_mode == "development": + lua_communication_script_path = "../lua/communication_layer.lua" + typesystem_path = "../resources/typesystem.xml" +elif settings.execution_mode == "production": + lua_communication_script_path = "/app/communication_layer.lua" + typesystem_path = "/app/typesystem.xml" +else: + raise ValueError(f"Unknown execution mode '{settings.execution_mode}'") + +# Init logger +logging.basicConfig( + format="%(asctime)s %(levelname)-8s %(message)s", + level=settings.log_level, + datefmt="%Y-%m-%d %H:%M:%S", +) +logger = logging.getLogger(__name__) +logger.info("TTLab Taxon Resolver started in %s mode", settings.execution_mode) +logger.info("Name: %s", settings.annotator_name) +logger.info("Version: %s", settings.annotator_version) + +logger.info("Loading backbone data for Taxref...") +taxref_loader.initialize_backbone() +logger.info("Taxref backbone data loaded successfully") + +def load_communication_script() -> str: + with open(lua_communication_script_path, "r") as f: + 
communication_script = f.read() + logger.info("Loaded Lua communication script from %s", lua_communication_script_path) + return communication_script + +def load_typesystem() -> str: + with open(typesystem_path, "r") as f: + typesystem = f.read() + logger.info("Loaded type system from %s", typesystem_path) + return typesystem + +lua_communication_script: str | None = load_communication_script() if settings.execution_mode == "production" else None +typesystem: str | None = load_typesystem() if settings.execution_mode == "production" else None + +# FastAPI app +app = FastAPI( + title=settings.annotator_name, + description="Annotator for resolving and normalizing taxons in documents", + version=settings.annotator_version, + terms_of_service="https://www.texttechnologylab.org/legal_notice/", + license_info={ + "name": "AGPL", + "url": "http://www.gnu.org/licenses/agpl-3.0.en.html", + }, +) + +# Return Lua communication script +@app.get("/v1/communication_layer", response_class=PlainTextResponse) +def get_communication_layer() -> str: + result = lua_communication_script + if result is None: + result = load_communication_script() + return result + + +# Return typesystem +@app.get("/v1/typesystem") +def get_typesystem() -> Response: + result = typesystem + if result is None: + result = load_typesystem() + return Response(content=result, media_type="application/xml") + +class RecognizedTaxonLinking(BaseModel): + provider: TaxonProvider + taxon_id: int + + @classmethod + def from_string(cls, linking_str: str) -> Self: + try: + provider_str, taxon_id_str = linking_str.split(":") + provider: TaxonProvider = provider_str.lower() + taxon_id = int(taxon_id_str) + return cls(provider=provider, taxon_id=taxon_id) + except Exception as e: + logger.error("Error parsing linking string '%s': %s", linking_str, e) + raise ValueError(f"Invalid linking string '{linking_str}': {e}") + +class RecognizedTaxon(BaseModel): + begin: int + end: int + text: str + linkings: 
List[RecognizedTaxonLinking] + +class RequestTaxon(BaseModel): + begin: int + end: int + linkings: List[str] + + def to_recognized_taxon(self, document_text: str) -> RecognizedTaxon: + text = document_text[self.begin:self.end] + return RecognizedTaxon( + begin=self.begin, + end=self.end, + text=text, + linkings=[RecognizedTaxonLinking.from_string(linking_str) for linking_str in self.linkings] + ) + +class DuuiRequest(BaseModel): + taxa: List[RequestTaxon] + document_text: str + + @property + def recognized_taxa(self) -> List[RecognizedTaxon]: + return [taxon.to_recognized_taxon(self.document_text) for taxon in self.taxa] + +class ExportedTaxon(BaseModel): + begin: int + end: int + text: str + resolved_linkings: List[SharedTaxon] + +class DuuiResponse(BaseModel): + taxa: List[ExportedTaxon] + +def resolve_taxon_linking(linking: RecognizedTaxonLinking) -> TaxonBase: + match linking.provider: + case "ncbi": + return ncbi_api.NcbiTaxon.from_tax_id(linking.taxon_id) + case "gbif": + return gbif_api.get_taxon(linking.taxon_id) + case "taxref": + return taxref_loader.taxon_from_id(linking.taxon_id) + case _: + raise ValueError(f"Unknown taxon provider '{linking.provider}'") + +def resolve_taxon_linkings(linkings: List[RecognizedTaxonLinking]) -> List[TaxonBase]: + return [resolve_taxon_linking(linking) for linking in linkings] + +def resolve_recognized_taxon(recognized_taxon: RecognizedTaxon) -> ExportedTaxon: + resolved_linkings = resolve_taxon_linkings(recognized_taxon.linkings) + return ExportedTaxon( + begin=recognized_taxon.begin, + end=recognized_taxon.end, + text=recognized_taxon.text, + resolved_linkings=[linking.as_shared() for linking in resolved_linkings] + ) + +def resolve_recognized_taxa(recognized_taxa: List[RecognizedTaxon]) -> List[ExportedTaxon]: + return [resolve_recognized_taxon(recognized_taxon) for recognized_taxon in recognized_taxa] + +@app.post("/v1/process") +async def post_process(request: DuuiRequest) -> DuuiResponse: + recognized_taxa = 
### SETUP ###


def download_backbone(
    output_path: str, url: str = "https://ipt.gbif.fr/archive.do?r=taxref"
) -> None:
    """Download the zip archive at *url* and extract it into *output_path*.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if connecting or a single read stalls too long.
        zipfile.BadZipFile: if the downloaded data is not a zip archive.
    """
    with tempfile.TemporaryFile(suffix=".zip") as archive:
        # Stream to disk instead of holding the whole archive in memory, and
        # bound the wait so a stalled server cannot hang the caller forever.
        with requests.get(url, stream=True, timeout=(10, 300)) as response:
            response.raise_for_status()
            for chunk in response.iter_content(chunk_size=1 << 20):
                archive.write(chunk)
        # Hand the open file object to ZipFile; reopening a temporary file by
        # name while it is still open does not work on every platform.
        archive.seek(0)
        with zipfile.ZipFile(archive) as zip_ref:
            zip_ref.extractall(output_path)


def load_backbone(dir_path: str) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Read the two tab-separated backbone tables from *dir_path*.

    Returns:
        ``(vernacular_names, taxonomy)`` read from ``vernacularname.txt`` and
        ``taxon.txt`` respectively.
    """
    vernacular_names_path = os.path.join(dir_path, "vernacularname.txt")
    taxonomy_path = os.path.join(dir_path, "taxon.txt")
    vernacular_names = pd.read_csv(vernacular_names_path, sep="\t", low_memory=False)
    taxonomy = pd.read_csv(taxonomy_path, sep="\t", low_memory=False)
    return vernacular_names, taxonomy


def load_backbone_from_url(
    url: str = "https://ipt.gbif.fr/archive.do?r=taxref",
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Download the archive into a throw-away directory and load it from there."""
    with tempfile.TemporaryDirectory() as tmpdir:
        download_backbone(tmpdir, url)
        # Load before the directory is removed on leaving the ``with`` block.
        return load_backbone(tmpdir)


def load_taxref() -> tuple[pd.DataFrame, pd.DataFrame]:
    """Load the backbone from ``./backbone`` if it exists, else download it."""
    local_path = "backbone"
    if os.path.exists(local_path):
        return load_backbone(local_path)
    return load_backbone_from_url()
### Backbone data ###

# Declared here, bound by initialize_backbone().  Until then the names do not
# exist at module level.
vernacular_names: pd.DataFrame
taxonomy: pd.DataFrame


def preload_backbone():
    """Download the backbone into ``backbone``, the directory load_taxref() checks."""
    download_backbone("backbone")


def initialize_backbone():
    """Load both backbone tables into the module-level frames."""
    global vernacular_names, taxonomy
    vernacular_names, taxonomy = load_taxref()


### Utility methods ###


def _ensure_backbone() -> None:
    """Load the backbone on first use if initialize_backbone() has not run yet."""
    if "taxonomy" not in globals() or "vernacular_names" not in globals():
        initialize_backbone()


def _first_label(frame: pd.DataFrame, column: str, value: int) -> int:
    """Return the index label of the first row where ``frame[column] == value``.

    Raises:
        IndexError: if no row matches.
    """
    labels = frame.index[frame[column] == value]
    if len(labels) == 0:
        raise IndexError(f"no row with {column} == {value!r} in the backbone table")
    return labels[0]


def taxon_index(taxon_id: int) -> int:
    """Return the index label of the taxonomy row whose ``taxonID`` is *taxon_id*.

    Raises:
        IndexError: if the ID is not present.
    """
    _ensure_backbone()
    return _first_label(taxonomy, "taxonID", taxon_id)


def vernacular_name_index(vernacular_name_id: int) -> int:
    """Return the index label of the vernacular-name row whose ``id`` matches.

    Raises:
        IndexError: if the ID is not present.
    """
    _ensure_backbone()
    return _first_label(vernacular_names, "id", vernacular_name_id)
### Wrapper classes ###


class TaxrefTaxon(BaseModel, TaxonBase):
    # One row of the taxonomy table; aliases are the table's column names.
    id_: int = Field(alias="id")
    taxon_id: int = Field(alias="taxonID")
    scientific_name_id: int | None = Field(alias="scientificNameID", default=None)
    accepted_name_usage_id: int | None = Field(
        alias="acceptedNameUsageID", default=None
    )
    parent_name_usage_id: int | None = Field(alias="parentNameUsageID", default=None)
    original_name_usage_id: int | None = Field(
        alias="originalNameUsageID", default=None
    )
    scientific_name: str | None = Field(alias="scientificName", default=None)
    accepted_name_usage: str | None = Field(alias="acceptedNameUsage", default=None)
    kingdom: str | None = Field(default=None)
    phylum: str | None = Field(default=None)
    class_: str | None = Field(alias="class", default=None)
    order: str | None = Field(default=None)
    superfamily: str | None = Field(default=None)
    family: str | None = Field(default=None)
    subfamily: str | None = Field(default=None)
    tribe: str | None = Field(default=None)
    subtribe: str | None = Field(default=None)
    genus: str | None = Field(default=None)
    subgenus: str | None = Field(default=None)
    specific_epithet: str | None = Field(alias="specificEpithet", default=None)
    infraspecific_epithet: str | None = Field(
        alias="infraspecificEpithet", default=None
    )
    cultivar_epithet: str | None = Field(alias="cultivarEpithet", default=None)
    taxon_rank: str | None = Field(alias="taxonRank", default=None)
    scientific_name_authorship: str | None = Field(
        alias="scientificNameAuthorship", default=None
    )
    vernacular_name: str | None = Field(alias="vernacularName", default=None)
    taxon_remarks: str | None = Field(alias="taxonRemarks", default=None)
    references: str | None = Field(default=None)

    @override
    def as_shared(self) -> SharedTaxon:
        """Map this row onto the provider-independent SharedTaxon."""
        # TODO: resolve parent name and key using parent_name_usage_id
        # NOTE(review): species_name receives the specific epithet only, while
        # the Java test in this change expects a full binomial for TAXREF;
        # confirm which value is intended.
        return SharedTaxon(
            provider="taxref",
            taxon_id=str(self.taxon_id),
            kingdom_name=self.kingdom,
            phylum_name=self.phylum,
            class_name=self.class_,
            order_name=self.order,
            superfamily_name=self.superfamily,
            family_name=self.family,
            subfamily_name=self.subfamily,
            tribe_name=self.tribe,
            subtribe_name=self.subtribe,
            genus_name=self.genus,
            subgenus_name=self.subgenus,
            species_name=self.specific_epithet,
            scientific_name=self.scientific_name,
            vernacular_name=self.vernacular_name,
            accepted_name_usage=self.accepted_name_usage,
            authorship=self.scientific_name_authorship,
            rank=self.taxon_rank,
            remarks=self.taxon_remarks,
            references=self.references,
        )


class VernacularName(BaseModel):
    # One row of the vernacular-name table; aliases are the column names.
    id_: int = Field(alias="id")
    vernacular_name: str = Field(alias="vernacularName")
    source: str | None = Field(default=None)
    language: str | None = Field(default=None)
    location_id: str | None = Field(alias="locationID", default=None)
    country_code: str | None = Field(alias="countryCode", default=None)


def _row_to_kwargs(row: pd.Series) -> dict:
    """Turn a table row into constructor kwargs, mapping missing cells to None.

    The conversion is done value by value so it does not depend on the dtype
    of the row Series.
    """
    return {key: (None if pd.isna(value) else value) for key, value in row.items()}


def taxon_from_id(taxon_id: int) -> TaxrefTaxon:
    """Build a TaxrefTaxon from the taxonomy row with the given ``taxonID``.

    Raises:
        IndexError: propagated from ``taxon_index`` when the ID is unknown.
    """
    row = taxonomy.loc[taxon_index(taxon_id)]
    return TaxrefTaxon(**_row_to_kwargs(row))


def vernacular_name_from_id(vernacular_name_id: int) -> VernacularName:
    """Build a VernacularName from the vernacular-name row with the given ``id``.

    Raises:
        IndexError: propagated from ``vernacular_name_index`` when the ID is
            unknown.
    """
    row = vernacular_names.loc[vernacular_name_index(vernacular_name_id)]
    return VernacularName(**_row_to_kwargs(row))
def main():
    """Interactive console loop for looking up taxa and vernacular names by ID.

    Accepts ``t <id>`` for a taxon, ``v <id>`` for a vernacular name and
    ``exit`` (or end of input) to quit.  Any other input is ignored and the
    prompt is shown again.
    """
    # The module-level tables are only bound once the backbone is loaded;
    # without this the first lookup would fail before any ID is checked.
    initialize_backbone()
    prompt = "Enter taxon ID 't {id}' or vernacular name ID 'v {id}' (or 'exit' to quit): "
    while True:
        try:
            user_input = input(prompt).strip()
        except EOFError:
            # stdin closed (Ctrl-D or piped input exhausted): leave quietly.
            break
        if user_input.lower() == "exit":
            break
        if user_input.startswith("t "):
            taxon_id_str = user_input[2:].strip()
            try:
                taxon_id = int(taxon_id_str)
                taxon = taxon_from_id(taxon_id)
                print(taxon)
            except ValueError:
                print(
                    f"Invalid taxon ID '{taxon_id_str}'. Please enter a valid integer taxon ID."
                )
            except IndexError:
                print("Taxon ID not found. Please enter a valid taxon ID.")
        elif user_input.startswith("v "):
            vernacular_name_id_str = user_input[2:].strip()
            try:
                vernacular_name_id = int(vernacular_name_id_str)
                vernacular_name = vernacular_name_from_id(vernacular_name_id)
                print(vernacular_name)
            except ValueError:
                print(
                    f"Invalid vernacular name ID '{vernacular_name_id_str}'. Please enter a valid integer vernacular name ID."
                )
            except IndexError:
                print(
                    "Vernacular name ID not found. Please enter a valid vernacular name ID."
                )


if __name__ == "__main__":
    main()
+ uima.tcas.Annotation + + + recognizedTaxon + Reference to the taxon in the document that was recognized. + org.texttechnologylab.annotation.type.RecognizedTaxon + + + provider + The provider of the taxon resolution information. + uima.cas.String + + + taxonId + The identifier of the taxon in the taxonomic database. + uima.cas.Integer + + + kingdomName + The name of the kingdom to which the taxon belongs, if available. + uima.cas.String + + + kingdomId + The identifier of the kingdom to which the taxon belongs, if available. + uima.cas.Integer + + + phylumName + The name of the phylum to which the taxon belongs, if available. + uima.cas.String + + + phylumId + The identifier of the phylum to which the taxon belongs, if available. + uima.cas.Integer + + + className + The name of the class to which the taxon belongs, if available. + uima.cas.String + + + classId + The identifier of the class to which the taxon belongs, if available. + uima.cas.Integer + + + orderName + The name of the order to which the taxon belongs, if available. + uima.cas.String + + + orderId + The identifier of the order to which the taxon belongs, if available. + uima.cas.Integer + + + superfamilyName + The name of the superfamily to which the taxon belongs, if available. + uima.cas.String + + + superfamilyId + The identifier of the superfamily to which the taxon belongs, if available. + uima.cas.Integer + + + familyName + The name of the family to which the taxon belongs, if available. + uima.cas.String + + + familyId + The identifier of the family to which the taxon belongs, if available. + uima.cas.Integer + + + subfamilyName + The name of the subfamily to which the taxon belongs, if available. + uima.cas.String + + + subfamilyId + The identifier of the subfamily to which the taxon belongs, if available. + uima.cas.Integer + + + tribeName + The name of the tribe to which the taxon belongs, if available. 
+ uima.cas.String + + + tribeId + The identifier of the tribe to which the taxon belongs, if available. + uima.cas.Integer + + + subtribeName + The name of the subtribe to which the taxon belongs, if available. + uima.cas.String + + + subtribeId + The identifier of the subtribe to which the taxon belongs, if available. + uima.cas.Integer + + + genusName + The name of the genus to which the taxon belongs, if available. + uima.cas.String + + + genusId + The identifier of the genus to which the taxon belongs, if available. + uima.cas.Integer + + + subgenusName + The name of the subgenus to which the taxon belongs, if available. + uima.cas.String + + + subgenusId + The identifier of the subgenus to which the taxon belongs, if available. + uima.cas.Integer + + + speciesName + The name of the species to which the taxon belongs, if available. + uima.cas.String + + + speciesId + The identifier of the species to which the taxon belongs, if available. + uima.cas.Integer + + + parentName + The name of the parent taxon, if available. + uima.cas.String + + + parentId + The identifier of the parent taxon, if available. + uima.cas.Integer + + + + scientificName + The scientific name of the taxon, if available. + uima.cas.String + + + canonicalName + The canonical name of the taxon, if available. + uima.cas.String + + + vernacularName + The vernacular name of the taxon, if available. + uima.cas.String + + + acceptedNameUsage + The accepted name usage of the taxon, if available. + uima.cas.String + + + authorship + The authorship information for the taxon, if available. + uima.cas.String + + + nameType + The type of the taxon name, if available. + uima.cas.String + + + rank + The taxonomic rank of the taxon, if available. + uima.cas.String + + + origin + The origin of the taxon, if available. + uima.cas.String + + + taxonomicStatus + The taxonomic status of the taxon, if available. + uima.cas.String + + + remarks + Any additional remarks about the taxon, if available. 
+ uima.cas.String + + + references + References for the taxon, if available. + uima.cas.String + + + publishedIn + The publication in which the taxon was published, if available. + uima.cas.String + + + numDescendants + The number of descendant taxa, if available. + uima.cas.Integer + + + lastCrawled + The date and time when the taxon was last crawled, if available. + uima.cas.String + + + lastInterpreted + The date and time when the taxon was last interpreted, if available. + uima.cas.String + + + speciesEpithet + The species epithet of the taxon, if available. + uima.cas.String + + + infraspecificEpithet + The infraspecific epithet of the taxon, if available. + uima.cas.String + + + cultivarEpithet + The cultivar epithet of the taxon, if available. + uima.cas.String + + + url + A web URL associated with the taxon, provided by the taxonomic database, if available. + uima.cas.String + + + wikidataId + The Wikidata ID of the taxon, if available. + uima.cas.String + + + wikidataUrl + The URL of the Wikidata page for the taxon, if available. 
+ uima.cas.String + + + + + diff --git a/duui-taxon-resolver/src/test/java/org/texttechnologylab/TaxonResolverTest.java b/duui-taxon-resolver/src/test/java/org/texttechnologylab/TaxonResolverTest.java new file mode 100644 index 0000000..e612320 --- /dev/null +++ b/duui-taxon-resolver/src/test/java/org/texttechnologylab/TaxonResolverTest.java @@ -0,0 +1,276 @@ +package org.texttechnologylab; + +import org.apache.uima.cas.impl.XmiCasSerializer; +import org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.texttechnologylab.DockerUnifiedUIMAInterface.DUUIComposer; +import org.texttechnologylab.DockerUnifiedUIMAInterface.driver.DUUIDockerDriver; +import org.texttechnologylab.DockerUnifiedUIMAInterface.driver.DUUIPodmanDriver; +import org.texttechnologylab.DockerUnifiedUIMAInterface.driver.DUUIRemoteDriver; +import org.texttechnologylab.DockerUnifiedUIMAInterface.lua.DUUILuaContext; +import org.texttechnologylab.annotation.AnnotationComment; +import org.texttechnologylab.annotation.type.RecognizedTaxon; +import org.texttechnologylab.annotation.type.Taxon; +import org.texttechnologylab.annotation.type.TaxonResolution; + +import java.io.IOException; +import java.io.OutputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Collection; + +import static org.junit.jupiter.api.Assertions.*; + +public class TaxonResolverTest { + + enum Strategy { + REMOTE, + DOCKER, + PODMAN + } + + static final String TAXON_RESOLVER_IMAGE = "localhost/duui-taxon-resolver:1.0.0"; + static final String TAXON_RESOLVER_REMOTE_HOST = "http://localhost:12345"; + static final Strategy STRATEGY = Strategy.REMOTE; + static final boolean STORE_OUTPUT = false; + + DUUIComposer composer; + + @BeforeEach + public void setup() throws Exception { + composer = new DUUIComposer() + .withSkipVerification(true) + 
.withLuaContext(new DUUILuaContext().withJsonLibrary()); + switch (STRATEGY) { + case REMOTE -> composer.addDriver(new DUUIRemoteDriver()); + case DOCKER -> composer.addDriver(new DUUIDockerDriver()); + case PODMAN -> composer.addDriver(new DUUIPodmanDriver()); + } + } + + private void addTaxonResolver(String annotationsView) throws Exception { + switch (STRATEGY) { + case REMOTE -> { + var component = new DUUIRemoteDriver.Component(TAXON_RESOLVER_REMOTE_HOST); + if (annotationsView != null) { + component.withParameter("annotations_view", annotationsView); + } + composer.add(component.build()); + } + case DOCKER -> { + var component = new DUUIDockerDriver.Component(TAXON_RESOLVER_IMAGE); + if (annotationsView != null) { + component.withParameter("annotations_view", annotationsView); + } + composer.add(component.build()); + } + case PODMAN -> { + var component = new DUUIPodmanDriver.Component(TAXON_RESOLVER_IMAGE); + if (annotationsView != null) { + component.withParameter("annotations_view", annotationsView); + } + composer.add(component.build()); + } + } + } + + @Test + public void testEmpty() throws Exception { + addTaxonResolver(null); + + JCas jcas = JCasFactory.createJCas(); + jcas.setDocumentLanguage("en"); + jcas.setDocumentText("This is a test."); + + composer.run(jcas); + + storeCas(jcas, "empty"); + + Collection taxa = JCasUtil.select(jcas, RecognizedTaxon.class); + assertTrue(taxa.isEmpty(), "Expected no taxa to be recognized"); + } + + @Test + public void testSingleAnnotationsSameView() throws Exception { + addTaxonResolver(null); + + JCas jcas = JCasFactory.createJCas(); + jcas.setDocumentLanguage("en"); + jcas.setDocumentText("Ammophila arenaria"); + + // manually insert a taxon annotation for "Ammophila arenaria" + Taxon taxon = new Taxon(jcas, 0, 18); + taxon.addToIndexes(); + AnnotationComment comment = new AnnotationComment(jcas); + comment.setReference(taxon); + comment.setKey("linking"); + comment.setValue("GBIF:1347914"); + 
comment.addToIndexes(); + + composer.run(jcas); + + storeCas(jcas, "single_same_view"); + + Collection taxa = JCasUtil.select(jcas, RecognizedTaxon.class); + assertEquals(1, taxa.size(), "Expected exactly one taxon to be recognized"); + RecognizedTaxon recognized = taxa.iterator().next(); + assertEquals("Ammophila arenaria", recognized.getText(), "Expected taxon text to match"); + assertEquals(1, recognized.getResolutions().size(), "Expected exactly one resolution"); + assertInstanceOf(TaxonResolution.class, recognized.getResolutions(0), "Expected resolution to be of type TaxonResolution"); + TaxonResolution resolution = (TaxonResolution) recognized.getResolutions(0); + assertEquals("gbif", resolution.getProvider(), "Expected provider to be gbif"); + assertEquals("SCIENTIFIC", resolution.getNameType(), "Expected name type to be SCIENTIFIC"); + assertEquals("SPECIES", resolution.getRank(), "Expected rank to be SPECIES"); + assertEquals("Animalia", resolution.getKingdomName(), "Expected kingdom name to be 'Animalia'"); + assertEquals(1, resolution.getKingdomId(), "Expected kingdom ID to be 1"); + assertEquals("Hymenoptera", resolution.getOrderName(), "Expected order name to be 'Hymenoptera'"); + assertEquals(1457, resolution.getOrderId(), "Expected order ID to be 1457"); + assertEquals("Sphecidae", resolution.getFamilyName(), "Expected family name to be 'Sphecidae'"); + assertEquals(4352, resolution.getFamilyId(), "Expected family ID to be 4352"); + assertEquals("Podalonia", resolution.getGenusName(), "Expected genus name to be 'Podalonia'"); + assertEquals(1347780, resolution.getGenusId(), "Expected genus ID to be 1347780"); + assertEquals("Podalonia hirsuta", resolution.getSpeciesName(), "Expected species name to be 'Podalonia hirsuta'"); + assertEquals(1347914, resolution.getSpeciesId(), "Expected species ID to be 1347914"); + assertEquals("Podalonia hirsuta (Scopoli, 1763)", resolution.getScientificName(), "Expected scientific name to be 'Podalonia hirsuta 
(Scopoli, 1763)'"); + } + + @Test + public void testSingleAnnotationsDifferentView() throws Exception { + String annotationsView = "taxons"; + addTaxonResolver(annotationsView); + + JCas jcas = JCasFactory.createJCas(); + jcas.setDocumentLanguage("en"); + jcas.setDocumentText("Ammophila arenaria"); + + // manually insert a taxon annotation for "Ammophila arenaria" in the specified view + JCas annotationsJCas = jcas.createView(annotationsView); + Taxon taxon = new Taxon(annotationsJCas, 0, 18); + taxon.addToIndexes(); + AnnotationComment comment = new AnnotationComment(annotationsJCas); + comment.setReference(taxon); + comment.setKey("linking"); + comment.setValue("GBIF:1347914"); + comment.addToIndexes(); + + composer.run(jcas); + + storeCas(jcas, "single_different_view"); + + Collection taxa = JCasUtil.select(jcas, RecognizedTaxon.class); + assertEquals(1, taxa.size(), "Expected exactly one taxon to be recognized"); + RecognizedTaxon recognized = taxa.iterator().next(); + assertEquals("Ammophila arenaria", recognized.getText(), "Expected taxon text to match"); + assertEquals(1, recognized.getResolutions().size(), "Expected exactly one resolution"); + assertInstanceOf(TaxonResolution.class, recognized.getResolutions(0), "Expected resolution to be of type TaxonResolution"); + TaxonResolution resolution = (TaxonResolution) recognized.getResolutions(0); + assertEquals("gbif", resolution.getProvider(), "Expected provider to be gbif"); + assertEquals(1347914, resolution.getSpeciesId(), "Expected species ID to be 1347914"); + } + + @Test + public void testMultipleAnnotationsDifferentProviders() throws Exception { + addTaxonResolver(null); + + JCas jcas = JCasFactory.createJCas(); + jcas.setDocumentLanguage("en"); + jcas.setDocumentText("Ammophila arenaria"); + + // manually insert taxon annotations for "Ammophila arenaria" and "Ulex europaeus" + Taxon taxon = new Taxon(jcas, 0, 18); + taxon.addToIndexes(); + + AnnotationComment comment1 = new AnnotationComment(jcas); + 
comment1.setReference(taxon); + comment1.setKey("linking"); + comment1.setValue("GBIF:1347914"); + comment1.addToIndexes(); + + AnnotationComment comment2 = new AnnotationComment(jcas); + comment2.setReference(taxon); + comment2.setKey("linking"); + comment2.setValue("NCBI:96047"); + comment2.addToIndexes(); + + AnnotationComment comment3 = new AnnotationComment(jcas); + comment3.setReference(taxon); + comment3.setKey("linking"); + comment3.setValue("TAXREF:82139"); + comment3.addToIndexes(); + + composer.run(jcas); + + storeCas(jcas, "multiple_different_providers"); + + Collection taxa = JCasUtil.select(jcas, RecognizedTaxon.class); + assertEquals(1, taxa.size(), "Expected exactly one taxon to be recognized"); + RecognizedTaxon recognized = taxa.iterator().next(); + assertEquals("Ammophila arenaria", recognized.getText(), "Expected taxon text to match"); + assertEquals(3, recognized.getResolutions().size(), "Expected exactly three resolutions"); + boolean hasGbif = false; + boolean hasNcbi = false; + boolean hasTaxref = false; + for (int i = 0; i < recognized.getResolutions().size(); i++) { + assertInstanceOf(TaxonResolution.class, recognized.getResolutions(i), "Expected resolution to be of type TaxonResolution"); + TaxonResolution resolution = (TaxonResolution) recognized.getResolutions(i); + switch (resolution.getProvider()) { + case "gbif" -> { + hasGbif = true; + assertEquals(1347914, resolution.getSpeciesId(), "Expected species ID to be 1347914 for GBIF"); + } + case "ncbi" -> { + hasNcbi = true; + assertEquals(96047, resolution.getTaxonId(), "Expected taxon ID to be 96047 for NCBI"); + assertEquals("Calamagrostis", resolution.getParentName(), "Expected parent name to be 'Calamagrostis' for NCBI"); + assertEquals(15376, resolution.getParentId(), "Expected parent ID to be 15376 for NCBI"); + assertEquals("Calamagrostis arenaria", resolution.getScientificName(), "Expected scientific name to be 'Calamagrostis arenaria' for NCBI"); + assertEquals("species", 
resolution.getRank(), "Expected rank to be 'species' for NCBI"); + } + case "taxref" -> { + hasTaxref = true; + assertEquals(82139, resolution.getTaxonId(), "Expected taxon ID to be 82139 for TAXREF"); + assertEquals("Plantae", resolution.getKingdomName(), "Expected kingdom name to be 'Plantae' for TAXREF"); + assertEquals("Equisetopsida", resolution.getClassName(), "Expected class name to be 'Equisetopsida' for TAXREF"); + assertEquals("Poales", resolution.getOrderName(), "Expected order name to be 'Poales' for TAXREF"); + assertEquals("Poaceae", resolution.getFamilyName(), "Expected family name to be 'Poaceae' for TAXREF"); + assertEquals("Pooideae", resolution.getSubfamilyName(), "Expected subfamily name to be 'Pooideae' for TAXREF"); + assertEquals("Poeae", resolution.getTribeName(), "Expected tribe name to be 'Poeae' for TAXREF"); + assertEquals("Ammophila arenaria", resolution.getSpeciesName(), "Expected species name to be 'Ammophila arenaria' for TAXREF"); + assertEquals("Ammophila arenaria", resolution.getScientificName(), "Expected scientific name to be 'Ammophila arenaria' for TAXREF"); + assertEquals("Oyat des sables, Ammophile des sables, Oyat, Chiendent marin, Roseau des sables, Gourbet", resolution.getVernacularName(), "Expected vernacular name to match for TAXREF"); + assertEquals("Ammophila arenaria (L.) Link, 1827", resolution.getAcceptedNameUsage(), "Expected accepted name usage to be 'Ammophila arenaria (L.) Link, 1827' for TAXREF"); + assertEquals("(L.) Link, 1827", resolution.getAuthorship(), "Expected authorship to be '(L.) 
Link, 1827' for TAXREF"); + assertEquals("species", resolution.getRank(), "Expected rank to be 'species' for TAXREF"); + assertEquals("https://taxref.mnhn.fr/taxref-web/taxa/82139", resolution.getReferences(), "Expeceted references to be 'https://taxref.mnhn.fr/taxref-web/taxa/82139' for TAXREF"); + } + default -> fail("Unexpected provider: " + resolution.getProvider()); + } + } + assertTrue(hasGbif, "Expected a resolution from GBIF"); + assertTrue(hasNcbi, "Expected a resolution from NCBI"); + assertTrue(hasTaxref, "Expected a resolution from TAXREF"); + } + + static void storeCas(JCas cas, String name) { + if (!STORE_OUTPUT) { + return; + } + Path folderPath = Path.of(".", "outputs"); + try { + Files.createDirectories(folderPath); + } catch (IOException e) { + throw new RuntimeException(e); + } + Path path = folderPath.resolve(name + ".xmi"); + try (OutputStream os = Files.newOutputStream(path)) { + XmiCasSerializer.serialize(cas.getCas(), os); + System.out.println("CAS stored at: " + path.toAbsolutePath()); + } catch (Exception e) { + System.err.println("Error serializing CAS: " + e.getMessage()); + } + } + +}