From c2b381e5db1f330a061d96fd1b0b8a594f424e07 Mon Sep 17 00:00:00 2001 From: Mevluet Bagci Date: Tue, 26 May 2026 17:10:02 +0200 Subject: [PATCH 01/19] Add two new DUUI topic models: multilingual-topic-classifier and classla/ParlaCAP-Topic-Classifier --- duui-transformers-topic/Readme.md | 23 +++++++++++-------- duui-transformers-topic/docker_build.sh | 22 +++++++++++++++--- duui-transformers-topic/requirements.txt | 2 +- .../src/main/docker/Dockerfile | 5 +++- .../uima/transformers/topic/TopicTest.java | 2 +- 5 files changed, 38 insertions(+), 16 deletions(-) diff --git a/duui-transformers-topic/Readme.md b/duui-transformers-topic/Readme.md index 90d7f2d9..5c01d400 100644 --- a/duui-transformers-topic/Readme.md +++ b/duui-transformers-topic/Readme.md @@ -8,16 +8,19 @@ DUUI implementation for selected Hugging-Face-based transformer [Topic tools](https://huggingface.co/models?sort=trending&search=topic) models. ## Included Models -| Name | | Revision | Languages | -|-------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------|--------------------------------|--------------| -| manifestoberta-xlm-roberta | https://huggingface.co/manifesto-project/manifestoberta-xlm-roberta-56policy-topics-context-2023-1-1 | 06c046795a3b7b9822755f0a73776f8fabec3977 | Multilingual | -| multilingual-iptc-media-topic-classifier | https://huggingface.co/classla/multilingual-IPTC-news-topic-classifier | ad2fac9ca58ad554021c0f244f15a9d556976229 | Multilingual | -| xlm-roberta-large-english-cap-v3 |https://huggingface.co/poltextlab/xlm-roberta-large-english-cap-v3| 580cb9cc334735b6cd09a8c2e050d19f5cebfeca | EN | -| xlm-roberta-large-party-cap-v3 |https://huggingface.co/poltextlab/xlm-roberta-large-party-cap-v3| 42804267cb8db2cc056e96f9a6ceee01a579e126 | Multingual | - | cardiffnlp-roberta-large-tweet-topic-single-all 
|https://huggingface.co/cardiffnlp/roberta-large-tweet-topic-single-all| b9286fabc508a553a4dad6cec8035044deff034a | EN | - | tweet-topic-large-multilingual |https://huggingface.co/cardiffnlp/tweet-topic-large-multilingual| e68d741bf72c67d78806cf49a1f8831ffebd63f8 | EN,ES,El,JA | -| topic-organize-web |https://huggingface.co/WebOrganizer/TopicClassifier 8d158c9d514cdc21a7c8e9bd94e5dc483d49e024 | EN| - +| Name | | Revision | Languages | +|-------------------------------------------------|-----------------------------------------------------------------------------------------------------------|------------------------------------------|--------------| +| manifestoberta-xlm-roberta | https://huggingface.co/manifesto-project/manifestoberta-xlm-roberta-56policy-topics-context-2023-1-1 | 06c046795a3b7b9822755f0a73776f8fabec3977 | Multilingual | +| multilingual-iptc-media-topic-classifier | https://huggingface.co/classla/multilingual-IPTC-news-topic-classifier | ad2fac9ca58ad554021c0f244f15a9d556976229 | Multilingual | +| xlm-roberta-large-english-cap-v3 | https://huggingface.co/poltextlab/xlm-roberta-large-english-cap-v3 | 580cb9cc334735b6cd09a8c2e050d19f5cebfeca | EN | +| xlm-roberta-large-party-cap-v3 | https://huggingface.co/poltextlab/xlm-roberta-large-party-cap-v3 | 42804267cb8db2cc056e96f9a6ceee01a579e126 | Multilingual | + | cardiffnlp-roberta-large-tweet-topic-single-all | https://huggingface.co/cardiffnlp/roberta-large-tweet-topic-single-all | b9286fabc508a553a4dad6cec8035044deff034a | EN | + | tweet-topic-large-multilingual | https://huggingface.co/cardiffnlp/tweet-topic-large-multilingual | e68d741bf72c67d78806cf49a1f8831ffebd63f8 | EN,ES,El,JA | +| topic-organize-web | https://huggingface.co/WebOrganizer/TopicClassifier | 8d158c9d514cdc21a7c8e9bd94e5dc483d49e024 | EN | +| multilingual-topic-classifier | https://huggingface.co/Keshav0308/multilingual-topic-classifier | 57358ac84a8aebda7a493bbd87205314cccfe8e1 | Multilingual | +| parlacap-topic-classifier | 
https://huggingface.co/classla/ParlaCAP-Topic-Classifier | 82a13c61ff63b3450638e35f7b1b2cb9e6694ad6 | Multilingual | + + # How To Use For using duui-transformers-topic as a DUUI image it is necessary to use the [Docker Unified UIMA Interface (DUUI)](https://github.com/texttechnologylab/DockerUnifiedUIMAInterface). diff --git a/duui-transformers-topic/docker_build.sh b/duui-transformers-topic/docker_build.sh index 85452333..579fd7ee 100644 --- a/duui-transformers-topic/docker_build.sh +++ b/duui-transformers-topic/docker_build.sh @@ -1,8 +1,8 @@ #!/usr/bin/env bash set -euo pipefail -#export ANNOTATOR_CUDA= -export ANNOTATOR_CUDA="-cuda" +export ANNOTATOR_CUDA= +#export ANNOTATOR_CUDA="-cuda" export ANNOTATOR_NAME=duui-transformers-topic export ANNOTATOR_VERSION=0.4.0 @@ -118,10 +118,26 @@ export DOCKER_REGISTRY="docker.texttechnologylab.org/" #export MODEL_LANG="EN" ###-------------------------------------------------------------------- +###--------------------------------------------------------------------- +#export MODEL_NAME="classla/ParlaCAP-Topic-Classifier" +#export MODEL_SPECNAME="parlacap-topic-classifier" +#export MODEL_VERSION="bf5c7145d4266b4851063f458eaa5ba5e28a2c43" +#export MODEL_SOURCE="https://huggingface.co/classla/ParlaCAP-Topic-Classifier" +#export MODEL_LANG="Multi" +###-------------------------------------------------------------------- + +###--------------------------------------------------------------------- +#export MODEL_NAME="Keshav0308/multilingual-topic-classifier" +#export MODEL_SPECNAME="multilingual-topic-classifier" +#export MODEL_VERSION="57358ac84a8aebda7a493bbd87205314cccfe8e1" +#export MODEL_SOURCE="https://huggingface.co/Keshav0308/multilingual-topic-classifier" +#export MODEL_LANG="Multi" +###-------------------------------------------------------------------- + ##--------------------------------------------------------------------- export MODEL_NAME="classla/ParlaCAP-Topic-Classifier" export 
MODEL_SPECNAME="parlacap-topic-classifier" -export MODEL_VERSION="bf5c7145d4266b4851063f458eaa5ba5e28a2c43" +export MODEL_VERSION="82a13c61ff63b3450638e35f7b1b2cb9e6694ad6" export MODEL_SOURCE="https://huggingface.co/classla/ParlaCAP-Topic-Classifier" export MODEL_LANG="Multi" ##-------------------------------------------------------------------- diff --git a/duui-transformers-topic/requirements.txt b/duui-transformers-topic/requirements.txt index 8e8bde39..f9473034 100644 --- a/duui-transformers-topic/requirements.txt +++ b/duui-transformers-topic/requirements.txt @@ -1,4 +1,4 @@ -torch==2.8.0 +torch==2.5.1 torchaudio==2.5.1 torchvision==0.20.1 scipy==1.13.1 diff --git a/duui-transformers-topic/src/main/docker/Dockerfile b/duui-transformers-topic/src/main/docker/Dockerfile index 2b2e90cb..bac2fe1d 100644 --- a/duui-transformers-topic/src/main/docker/Dockerfile +++ b/duui-transformers-topic/src/main/docker/Dockerfile @@ -29,7 +29,10 @@ RUN pip install -r requirements.txt #RUN python -c "from transformers import pipeline; pipeline('text-classification', model='cardiffnlp/tweet-topic-latest-single')" #RUN python -c "from transformers import pipeline; pipeline('text-classification', model='cardiffnlp/tweet-topic-large-multilingual')" #RUN python -c "from transformers import AutoModelForSequenceClassification, AutoTokenizer; AutoModelForSequenceClassification.from_pretrained('WebOrganizer/TopicClassifier', trust_remote_code=True, use_memory_efficient_attention=False); AutoTokenizer.from_pretrained('WebOrganizer/TopicClassifier')" -RUN python -c "from transformers import pipeline; pipeline('text-classification', model='nickmuchi/finbert-tone-finetuned-finance-topic-classification')" +#RUN #python -c "from transformers import pipeline; pipeline('text-classification', model='nickmuchi/finbert-tone-finetuned-finance-topic-classification')" +#RUN python -c "from transformers import pipeline; pipeline('text-classification', model='Keshav0308/multilingual-topic-classifier')" 
+RUN python -c "from transformers import pipeline; pipeline('text-classification', model='classla/ParlaCAP-Topic-Classifier')" + # log level ARG LOG_LEVEL="DEBUG" ENV LOG_LEVEL=$LOG_LEVEL diff --git a/duui-transformers-topic/src/test/java/org/hucompute/textimager/uima/transformers/topic/TopicTest.java b/duui-transformers-topic/src/test/java/org/hucompute/textimager/uima/transformers/topic/TopicTest.java index df330717..43b159c7 100644 --- a/duui-transformers-topic/src/test/java/org/hucompute/textimager/uima/transformers/topic/TopicTest.java +++ b/duui-transformers-topic/src/test/java/org/hucompute/textimager/uima/transformers/topic/TopicTest.java @@ -32,7 +32,7 @@ public class TopicTest { static DUUIComposer composer; static JCas cas; - static String url = "http://127.0.0.1:8000"; + static String url = "http://127.0.0.1:9714"; // static String url = "http://tweentopic.service.component.duui.texttechnologylab.org"; // static String model = "chkla/parlbert-topic-german"; From 45ce951450b3ee790e462f9aeafa6fcc7880f1d3 Mon Sep 17 00:00:00 2001 From: Mevluet Bagci Date: Tue, 26 May 2026 18:37:07 +0200 Subject: [PATCH 02/19] Add six new sentiment models: finance-sentiment-zh, finance-sentiment-zh-fast, finance-sentiment-fr,twitter-sentiment-pl-base, twitter-sentiment-pl-fast, distilroberta-base-climate-sentiment --- duui-transformers-sentiment-atomar/Readme.md | 26 +++++--- .../docker_build.sh | 62 ++++++++++++++++--- .../src/main/docker/Dockerfile | 10 ++- .../src/main/python/SentimentSpeech.py | 33 +++++++++- 4 files changed, 111 insertions(+), 20 deletions(-) diff --git a/duui-transformers-sentiment-atomar/Readme.md b/duui-transformers-sentiment-atomar/Readme.md index aa00640f..50f3738e 100644 --- a/duui-transformers-sentiment-atomar/Readme.md +++ b/duui-transformers-sentiment-atomar/Readme.md @@ -8,16 +8,22 @@ DUUI implementation for selected Hugging-Face-based transformer [Sentiment tools](https://huggingface.co/models?sort=trending&search=sentiment) models. 
## Included Models -| Name | | Revision | Languages | -|---------------------------------------------------------------|--------------------------------------------------------------------------------------------|------------------------------------|------| -| twitter-xlm-roberta-base-sentiment | https://huggingface.co/cardiffnlp/twitter-xlm-roberta-base-sentiment | f2f1202b1bdeb07342385c3f807f9c07cd8f5cf8 | Multilingual | -| citizenlab-twitter-xlm-roberta-base-sentiment-finetunned | https://huggingface.co/citizenlab/twitter-xlm-roberta-base-sentiment-finetunned | a9381f1d9e6f8aac74155964c2f6ea9a63a9e9a6 | Multilingual | -| distilbert-base-multilingual-cased-sentiments-student | https://huggingface.co/lxyuan/distilbert-base-multilingual-cased-sentiments-student | cf991100d706c13c0a080c097134c05b7f436c45 | Multilingual | -| philschmid-distilbert-base-multilingual-cased-sentiments-student | https://huggingface.co/philschmid/distilbert-base-multilingual-cased-sentiment | b45a713783e49ac09c94dfda4bff847f4ad771c5 | Multilingual | -| cardiffnlp-sentiment-en | https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment-latest | 4ba3d4463bd152c9e4abd892b50844f30c646708 | EN | -| roberta-based-en | https://huggingface.co/j-hartmann/sentiment-roberta-large-english-3-classes | 81cdc0fe3eee1bc18d95ffdfb56b2151a39c9007 | EN | -| finance-sentiment-de | https://huggingface.co/bardsai/finance-sentiment-de-base | 51b3d03f716eaa093dc42130f675839675a07b9a | DE | -| german-sentiment-bert | https://huggingface.co/oliverguhr/german-sentiment-bert | b1177ff59e305c966836ba2825d3dc2efc53f125 | DE | +| Name | | Revision | Languages | +|------------------------------------------------------------------|-------------------------------------------------------------------------------------|------------------------------------|--------------| +| twitter-xlm-roberta-base-sentiment | https://huggingface.co/cardiffnlp/twitter-xlm-roberta-base-sentiment | 
f2f1202b1bdeb07342385c3f807f9c07cd8f5cf8 | Multilingual | +| citizenlab-twitter-xlm-roberta-base-sentiment-finetunned | https://huggingface.co/citizenlab/twitter-xlm-roberta-base-sentiment-finetunned | a9381f1d9e6f8aac74155964c2f6ea9a63a9e9a6 | Multilingual | +| distilbert-base-multilingual-cased-sentiments-student | https://huggingface.co/lxyuan/distilbert-base-multilingual-cased-sentiments-student | cf991100d706c13c0a080c097134c05b7f436c45 | Multilingual | +| philschmid-distilbert-base-multilingual-cased-sentiments-student | https://huggingface.co/philschmid/distilbert-base-multilingual-cased-sentiment | b45a713783e49ac09c94dfda4bff847f4ad771c5 | Multilingual | +| cardiffnlp-sentiment-en | https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment-latest | 4ba3d4463bd152c9e4abd892b50844f30c646708 | EN | +| roberta-based-en | https://huggingface.co/j-hartmann/sentiment-roberta-large-english-3-classes | 81cdc0fe3eee1bc18d95ffdfb56b2151a39c9007 | EN | +| finance-sentiment-de | https://huggingface.co/bardsai/finance-sentiment-de-base | 51b3d03f716eaa093dc42130f675839675a07b9a | DE | +| german-sentiment-bert | https://huggingface.co/oliverguhr/german-sentiment-bert | b1177ff59e305c966836ba2825d3dc2efc53f125 | DE | +| distilroberta-base-climate-sentiment | https://huggingface.co/climatebert/distilroberta-base-climate-sentiment | e9f9a94ee4263f5ad5cfc97b8539a497fc88aa7d | EN | +| finance-sentiment-zh | https://huggingface.co/bardsai/finance-sentiment-zh-base | 33595d152578da080c6e5c94b60eba15a769107f | ZH | +| finance-sentiment-zh-fast | https://huggingface.co/bardsai/finance-sentiment-zh-fast | 4cf6d7f85579bc73ac402d1dc4ecbcf3de8b6b7a | ZH | +| finance-sentiment-fr | https://huggingface.co/bardsai/finance-sentiment-fr-base | 98f660ba2ca64140df78c1a29b91dc8b6beafb62 | FR | +| twitter-sentiment-pl-base | https://huggingface.co/bardsai/twitter-sentiment-pl-base | 612331865c33e03b87522600ca34b1425c400e90 | PL | +| twitter-sentiment-pl-fast | 
https://huggingface.co/bardsai/twitter-sentiment-pl-fast | 2adf843ad928baf1d631179b4d52930fc286eee9 | PL | # How To Use diff --git a/duui-transformers-sentiment-atomar/docker_build.sh b/duui-transformers-sentiment-atomar/docker_build.sh index 5427cda1..e0ccace9 100644 --- a/duui-transformers-sentiment-atomar/docker_build.sh +++ b/duui-transformers-sentiment-atomar/docker_build.sh @@ -3,13 +3,13 @@ export ANNOTATOR_VERSION=0.5.1 export LOG_LEVEL=INFO eport MODEL_CACHE_SIZE=3 -##--------------------------------------------------------------------- -export MODEL_NAME="cardiffnlp/twitter-xlm-roberta-base-sentiment" -export MODEL_SPECNAME="twitter-xlm-roberta-base-sentiment" -export MODEL_VERSION="f2f1202b1bdeb07342385c3f807f9c07cd8f5cf8" -export MODEL_SOURCE="https://huggingface.co/cardiffnlp/twitter-xlm-roberta-base-sentiment" -export MODEL_LANG="Multi" -##-------------------------------------------------------------------- +###--------------------------------------------------------------------- +#export MODEL_NAME="cardiffnlp/twitter-xlm-roberta-base-sentiment" +#export MODEL_SPECNAME="twitter-xlm-roberta-base-sentiment" +#export MODEL_VERSION="f2f1202b1bdeb07342385c3f807f9c07cd8f5cf8" +#export MODEL_SOURCE="https://huggingface.co/cardiffnlp/twitter-xlm-roberta-base-sentiment" +#export MODEL_LANG="Multi" +###-------------------------------------------------------------------- ###--------------------------------------------------------------------- #export MODEL_NAME="citizenlab/twitter-xlm-roberta-base-sentiment-finetunned" @@ -67,6 +67,54 @@ export MODEL_LANG="Multi" #export MODEL_LANG="DE" ####-------------------------------------------------------------------- +###--------------------------------------------------------------------- +#export MODEL_NAME="bardsai/finance-sentiment-zh-base" +#export MODEL_SPECNAME="finance-sentiment-zh" +#export MODEL_VERSION="33595d152578da080c6e5c94b60eba15a769107f" +#export 
MODEL_SOURCE="https://huggingface.co/bardsai/finance-sentiment-zh-base" +#export MODEL_LANG="ZH" +###-------------------------------------------------------------------- + +##--------------------------------------------------------------------- +#export MODEL_NAME="bardsai/finance-sentiment-zh-fast" +#export MODEL_SPECNAME="finance-sentiment-zh-fast" +#export MODEL_VERSION="4cf6d7f85579bc73ac402d1dc4ecbcf3de8b6b7a" +#export MODEL_SOURCE="https://huggingface.co/bardsai/finance-sentiment-zh-fast" +#export MODEL_LANG="ZH" +##-------------------------------------------------------------------- + +###--------------------------------------------------------------------- +#export MODEL_NAME="bardsai/finance-sentiment-fr-base" +#export MODEL_SPECNAME="finance-sentiment-fr" +#export MODEL_VERSION="98f660ba2ca64140df78c1a29b91dc8b6beafb62" +#export MODEL_SOURCE="https://huggingface.co/bardsai/finance-sentiment-fr-base" +#export MODEL_LANG="FR" +###-------------------------------------------------------------------- + +###--------------------------------------------------------------------- +#export MODEL_NAME="bardsai/twitter-sentiment-pl-base" +#export MODEL_SPECNAME="twitter-sentiment-pl-base" +#export MODEL_VERSION="612331865c33e03b87522600ca34b1425c400e90" +#export MODEL_SOURCE="https://huggingface.co/bardsai/twitter-sentiment-pl-base" +#export MODEL_LANG="PL" +###-------------------------------------------------------------------- + +###--------------------------------------------------------------------- +#export MODEL_NAME="bardsai/twitter-sentiment-pl-fast" +#export MODEL_SPECNAME="twitter-sentiment-pl-fast" +#export MODEL_VERSION="2adf843ad928baf1d631179b4d52930fc286eee9" +#export MODEL_SOURCE="https://huggingface.co/bardsai/twitter-sentiment-pl-fast" +#export MODEL_LANG="PL" +###-------------------------------------------------------------------- + +##--------------------------------------------------------------------- +export 
MODEL_NAME="climatebert/distilroberta-base-climate-sentiment" +export MODEL_SPECNAME="distilroberta-base-climate-sentiment" +export MODEL_VERSION="e9f9a94ee4263f5ad5cfc97b8539a497fc88aa7d" +export MODEL_SOURCE="https://huggingface.co/climatebert/distilroberta-base-climate-sentiment" +export MODEL_LANG="EN" +##-------------------------------------------------------------------- + export DOCKER_REGISTRY="docker.texttechnologylab.org/" export DUUI_CUDA= diff --git a/duui-transformers-sentiment-atomar/src/main/docker/Dockerfile b/duui-transformers-sentiment-atomar/src/main/docker/Dockerfile index 8945772d..49fbad6b 100644 --- a/duui-transformers-sentiment-atomar/src/main/docker/Dockerfile +++ b/duui-transformers-sentiment-atomar/src/main/docker/Dockerfile @@ -8,14 +8,20 @@ EXPOSE 9714 COPY ./requirements.txt ./requirements.txt RUN pip install -r requirements.txt -RUN python -c "from transformers import pipeline; pipeline('text-classification', model='cardiffnlp/twitter-xlm-roberta-base-sentiment')" +#RUN python -c "from transformers import pipeline; pipeline('text-classification', model='cardiffnlp/twitter-xlm-roberta-base-sentiment')" #RUN python -c "from transformers import pipeline; pipeline('text-classification', model='citizenlab/twitter-xlm-roberta-base-sentiment-finetunned')" #RUN python -c "from transformers import pipeline; pipeline('text-classification', model='lxyuan/distilbert-base-multilingual-cased-sentiments-student')" #RUN python -c "from transformers import pipeline; pipeline('text-classification', model='philschmid/distilbert-base-multilingual-cased-sentiment')" #RUN python -c "from transformers import pipeline; pipeline('text-classification', model='cardiffnlp/twitter-roberta-base-sentiment-latest')" #RUN python -c "from transformers import pipeline; pipeline('text-classification', model='j-hartmann/sentiment-roberta-large-english-3-classes')" #RUN python -c "from transformers import pipeline; pipeline('text-classification', 
model='bardsai/finance-sentiment-de-base')" -RUN #python -c "from germansentiment import SentimentModel; model = SentimentModel()" +#RUN python -c "from transformers import pipeline; pipeline('text-classification', model='bardsai/finance-sentiment-zh-base')" +#RUN python -c "from transformers import pipeline; pipeline('text-classification', model='bardsai/finance-sentiment-zh-fast')" +#RUN python -c "from transformers import pipeline; pipeline('text-classification', model='bardsai/finance-sentiment-fr-base')" +#RUN python -c "from transformers import pipeline; pipeline('text-classification', model='bardsai/twitter-sentiment-pl-base')" +#RUN python -c "from transformers import pipeline; pipeline('text-classification', model='bardsai/twitter-sentiment-pl-fast')" +RUN python -c "from transformers import pipeline; pipeline('text-classification', model='climatebert/distilroberta-base-climate-sentiment')" +#RUN python -c "from germansentiment import SentimentModel; model = SentimentModel()" # copy scripts COPY ./src/main/python/TypeSystemSentiment.xml ./TypeSystemSentiment.xml diff --git a/duui-transformers-sentiment-atomar/src/main/python/SentimentSpeech.py b/duui-transformers-sentiment-atomar/src/main/python/SentimentSpeech.py index a6e3bfab..61c8e02d 100644 --- a/duui-transformers-sentiment-atomar/src/main/python/SentimentSpeech.py +++ b/duui-transformers-sentiment-atomar/src/main/python/SentimentSpeech.py @@ -48,7 +48,38 @@ def sigmoid(x): 0: "positive", 1: "neutral", 2: "negative" - } + }, + "bardsai/finance-sentiment-zh-base":{ + 0: "positive", + 1: "neutral", + 2: "negative" + }, + "bardsai/finance-sentiment-zh-fast":{ + 0: "positive", + 1: "neutral", + 2: "negative" + }, + "bardsai/finance-sentiment-fr-base":{ + 0: "positive", + 1: "neutral", + 2: "negative" + }, + "bardsai/twitter-sentiment-pl-base":{ + 0: "positive", + 1: "neutral", + 2: "negative" + }, + "bardsai/twitter-sentiment-pl-fast":{ + 0: "positive", + 1: "neutral", + 2: "negative" + }, + 
"climatebert/distilroberta-base-climate-sentiment":{ + 0: "positive", + 1: "neutral", + 2: "negative" + }, + } From 08bc47700a3aafd1cde1f454c4bf86cead50aeff Mon Sep 17 00:00:00 2001 From: bagci Date: Wed, 27 May 2026 11:08:10 +0200 Subject: [PATCH 03/19] Add genre classification in DUUI. Models: turkunlp-genre-multi, turkunlp-genre-en, turkunlp-genre-finerweb, ssharoff-genre, x-genre-classifier --- duui-Genre/.dockerignore | 3 + duui-Genre/.gitignore | 3 + duui-Genre/Readme.md | 90 ++++++ duui-Genre/docker_build.sh | 70 +++++ duui-Genre/pom.xml | 155 ++++++++++ duui-Genre/requirements.txt | 14 + duui-Genre/service_start.sh | 5 + duui-Genre/src/main/docker/Dockerfile | 55 ++++ duui-Genre/src/main/docker/Dockerfile-cuda | 70 +++++ duui-Genre/src/main/python/GenreSpeech.py | 70 +++++ .../src/main/python/TypeSystemTopic.xml | 132 +++++++++ duui-Genre/src/main/python/duui_genre.lua | 131 +++++++++ duui-Genre/src/main/python/duui_genre.py | 278 ++++++++++++++++++ .../textimager/uima/genre/GenreTest.java | 180 ++++++++++++ 14 files changed, 1256 insertions(+) create mode 100644 duui-Genre/.dockerignore create mode 100644 duui-Genre/.gitignore create mode 100644 duui-Genre/Readme.md create mode 100644 duui-Genre/docker_build.sh create mode 100644 duui-Genre/pom.xml create mode 100644 duui-Genre/requirements.txt create mode 100644 duui-Genre/service_start.sh create mode 100644 duui-Genre/src/main/docker/Dockerfile create mode 100644 duui-Genre/src/main/docker/Dockerfile-cuda create mode 100644 duui-Genre/src/main/python/GenreSpeech.py create mode 100644 duui-Genre/src/main/python/TypeSystemTopic.xml create mode 100644 duui-Genre/src/main/python/duui_genre.lua create mode 100644 duui-Genre/src/main/python/duui_genre.py create mode 100644 duui-Genre/src/test/java/org/hucompute/textimager/uima/genre/GenreTest.java diff --git a/duui-Genre/.dockerignore b/duui-Genre/.dockerignore new file mode 100644 index 00000000..caab0c36 --- /dev/null +++ b/duui-Genre/.dockerignore @@ -0,0 
+1,3 @@ +.idea/ +target/ +venv/ \ No newline at end of file diff --git a/duui-Genre/.gitignore b/duui-Genre/.gitignore new file mode 100644 index 00000000..d2092691 --- /dev/null +++ b/duui-Genre/.gitignore @@ -0,0 +1,3 @@ +.idea/ +target/ +venv*/ \ No newline at end of file diff --git a/duui-Genre/Readme.md b/duui-Genre/Readme.md new file mode 100644 index 00000000..48df6bfe --- /dev/null +++ b/duui-Genre/Readme.md @@ -0,0 +1,90 @@ +[![Version](https://img.shields.io/static/v1?label=duui-genre&message=0.1.0&color=blue)](https://docker.texttechnologylab.org/v2/duui-transformers-topic/tags/list) +[![Version](https://img.shields.io/static/v1?label=Python&message=3.12&color=green)]() +[![Version](https://img.shields.io/static/v1?label=Transformers&message=5.9.0&color=yellow)]() +[![Version](https://img.shields.io/static/v1?label=Torch&message=2.11.0&color=red)]() + +# Transformers Genre + +DUUI implementation for selected Hugging-Face-based transformer [Genre tools](https://huggingface.co/models?sort=trending&search=genre) models. 
+## Included Models + +| Name | | Revision | Languages | +|-------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------|--------------------------------|--------------| +| turkunlp-genre-multi | https://huggingface.co/TurkuNLP/web-register-classification-multilingual | a22ad8b652f6825ec1505dab779979e0f255d7ae | Multilingual | +| turkunlp-genre-en | https://huggingface.co/TurkuNLP/web-register-classification-en | 93969151434144dc8505865d31823c79bd385167 | EN | +| turkunlp-genre-finerweb | https://huggingface.co/TurkuNLP/finerweb-quality-classifier | 93d1635105c974a675e3be8c636d7a5cac6f7b11 | EN | +| ssharoff-genre | https://huggingface.co/ssharoff/genres | 93d1635105c974a675e3be8c636d7a5cac6f7b11 | EN | +| x-genre-classifier | https://huggingface.co/classla/xlm-roberta-base-multilingual-text-genre-classifier | ebe54ca322f6fd4dc95700705b99f23e3437c8d0 | Multilingual | + +# How To Use + +For using duui-genre as a DUUI image it is necessary to use the [Docker Unified UIMA Interface (DUUI)](https://github.com/texttechnologylab/DockerUnifiedUIMAInterface). 
+ +## Start Docker container + +``` +docker run --rm -p 9714:9714 docker.texttechnologylab.org/duui-genre-[modelname]:latest + +``` + +Find all available image tags here: [https://docker.texttechnologylab.org/v2/duui-genre-[modelname]/tags/list](https://docker.texttechnologylab.org/v2/duui-genre-[modelname]/tags/list) + +## Run within DUUI + +``` +composer.add( + new DUUIDockerDriver.Component("docker.texttechnologylab.org/duui-genre-[modelname]:latest") + .withParameter("selection", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence") +); +``` + +### Parameters + +| Name | Description | +| ---- | ----------- | +| `selection` | Use `text` to process the full document text or any selectable UIMA type class name | + +# Cite + +If you want to use the DUUI image, please cite it as follows: + +Alexander Leonhardt, Giuseppe Abrami, Daniel Baumartz and Alexander Mehler. (2023). "Unlocking the Heterogeneous Landscape of Big Data NLP with DUUI." Findings of the Association for Computational Linguistics: EMNLP 2023, 385–399. [[LINK](https://aclanthology.org/2023.findings-emnlp.29)] [[PDF](https://aclanthology.org/2023.findings-emnlp.29.pdf)] + +## BibTeX + +``` +@inproceedings{Leonhardt:et:al:2023, + title = {Unlocking the Heterogeneous Landscape of Big Data {NLP} with {DUUI}}, + author = {Leonhardt, Alexander and Abrami, Giuseppe and Baumartz, Daniel and Mehler, Alexander}, + editor = {Bouamor, Houda and Pino, Juan and Bali, Kalika}, + booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2023}, + year = {2023}, + address = {Singapore}, + publisher = {Association for Computational Linguistics}, + url = {https://aclanthology.org/2023.findings-emnlp.29}, + pages = {385--399}, + pdf = {https://aclanthology.org/2023.findings-emnlp.29.pdf}, + abstract = {Automatic analysis of large corpora is a complex task, especially + in terms of time efficiency. 
This complexity is increased by the + fact that flexible, extensible text analysis requires the continuous + integration of ever new tools. Since there are no adequate frameworks + for these purposes in the field of NLP, and especially in the + context of UIMA, that are not outdated or unusable for security + reasons, we present a new approach to address the latter task: + Docker Unified UIMA Interface (DUUI), a scalable, flexible, lightweight, + and feature-rich framework for automatic distributed analysis + of text corpora that leverages Big Data experience and virtualization + with Docker. We evaluate DUUI{'}s communication approach against + a state-of-the-art approach and demonstrate its outstanding behavior + in terms of time efficiency, enabling the analysis of big text + data.} +} + +@misc{Bagci:2024, + author = {Bagci, Mevlüt}, + title = {Hugging-Face-based genre models as {DUUI} component}, + year = {2024}, + howpublished = {https://github.com/texttechnologylab/duui-uima/tree/main/duui-Genre} +} + +``` diff --git a/duui-Genre/docker_build.sh b/duui-Genre/docker_build.sh new file mode 100644 index 00000000..558615b0 --- /dev/null +++ b/duui-Genre/docker_build.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash +set -euo pipefail + +export ANNOTATOR_CUDA= +#export ANNOTATOR_CUDA="-cuda" + +export ANNOTATOR_NAME=duui-genre +export ANNOTATOR_VERSION=0.1.0 +export LOG_LEVEL=DEBUG +export MODEL_CACHE_SIZE=3 +export DOCKER_REGISTRY="docker.texttechnologylab.org/" + +###--------------------------------------------------------------------- +#export MODEL_NAME="TurkuNLP/web-register-classification-multilingual" +#export MODEL_SPECNAME="turkunlp-genre-multi" +#export MODEL_VERSION="a22ad8b652f6825ec1505dab779979e0f255d7ae" +#export MODEL_SOURCE="https://huggingface.co/TurkuNLP/web-register-classification-multilingual" +#export MODEL_LANG="Multi" +###-------------------------------------------------------------------- + 
+###--------------------------------------------------------------------- +#export MODEL_NAME="TurkuNLP/web-register-classification-en" +#export MODEL_SPECNAME="turkunlp-genre-en" +#export MODEL_VERSION="93969151434144dc8505865d31823c79bd385167" +#export MODEL_SOURCE="https://huggingface.co/TurkuNLP/web-register-classification-en" +#export MODEL_LANG="EN" +###-------------------------------------------------------------------- + +###--------------------------------------------------------------------- +#export MODEL_NAME="TurkuNLP/finerweb-quality-classifier" +#export MODEL_SPECNAME="turkunlp-genre-finerweb" +#export MODEL_VERSION="93d1635105c974a675e3be8c636d7a5cac6f7b11" +#export MODEL_SOURCE="https://huggingface.co/TurkuNLP/finerweb-quality-classifier" +#export MODEL_LANG="EN" +###-------------------------------------------------------------------- + +###--------------------------------------------------------------------- +#export MODEL_NAME="ssharoff/genres" +#export MODEL_SPECNAME="ssharoff-genre" +#export MODEL_VERSION="93d1635105c974a675e3be8c636d7a5cac6f7b11" +#export MODEL_SOURCE="https://huggingface.co/ssharoff/genres" +#export MODEL_LANG="EN" +###-------------------------------------------------------------------- + +##--------------------------------------------------------------------- +export MODEL_NAME="classla/xlm-roberta-base-multilingual-text-genre-classifier" +export MODEL_SPECNAME="x-genre-classifier" +export MODEL_VERSION="ebe54ca322f6fd4dc95700705b99f23e3437c8d0" +export MODEL_SOURCE="https://huggingface.co/classla/xlm-roberta-base-multilingual-text-genre-classifier" +export MODEL_LANG="Multi" +##-------------------------------------------------------------------- + + + +docker build \ + --build-arg ANNOTATOR_NAME \ + --build-arg ANNOTATOR_VERSION \ + --build-arg LOG_LEVEL \ + --build-arg MODEL_CACHE_SIZE \ + --build-arg MODEL_NAME \ + --build-arg MODEL_VERSION \ + --build-arg MODEL_SOURCE \ + --build-arg MODEL_LANG \ + -t 
${DOCKER_REGISTRY}${ANNOTATOR_NAME}"-"${MODEL_SPECNAME}:${ANNOTATOR_VERSION}${ANNOTATOR_CUDA} \ + -f src/main/docker/Dockerfile${ANNOTATOR_CUDA} \ + . + +docker tag \ + ${DOCKER_REGISTRY}${ANNOTATOR_NAME}"-"${MODEL_SPECNAME}:${ANNOTATOR_VERSION}${ANNOTATOR_CUDA} \ + ${DOCKER_REGISTRY}${ANNOTATOR_NAME}"-"${MODEL_SPECNAME}:latest${ANNOTATOR_CUDA} diff --git a/duui-Genre/pom.xml b/duui-Genre/pom.xml new file mode 100644 index 00000000..4cf8be75 --- /dev/null +++ b/duui-Genre/pom.xml @@ -0,0 +1,155 @@ + + + 4.0.0 + + org.texttechnologylab.duui + duui-genre + 0.2.0 + + + + AGPL-3.0-or-later + https://www.gnu.org/licenses/agpl.txt + repo + GNU Affero General Public License v3.0 or later + + + + + Texttechnology Lab + https://www.texttechnologylab.org + + + + mehler + Prof. Dr. Alexander Mehler + mehler@em.uni-frankfurt.de + https://www.texttechnologylab.org/team/alexander-abrami/ + Goethe University Frankfurt / Texttechnology Lab + https://www.texttechnologylab.org + + head of department + + + + bagci + Mevlüt Bagci + bagci@em.uni-frankfurt.de + https://www.texttechnologylab.org/team/mevl%c3%bct-bagci/ + Goethe University Frankfurt / Texttechnology Lab + https://www.texttechnologylab.org + + lead developer + + Europe/Berlin + + + + + + + org.apache.maven.plugins + maven-surefire-plugin + 2.22.0 + + + --illegal-access=permit + --add-opens java.base/java.util=ALL-UNNAMED + + + + + + + + + + 17 + 17 + 2.4.0 + + + + + + + jitpack.io + https://jitpack.io + + + + + + + org.dkpro.core + dkpro-core-asl + ${dkpro.core.version} + pom + import + + + + + + + + com.github.texttechnologylab + DockerUnifiedUIMAInterface + 7cef2433b5 + + + + + + + + + com.github.texttechnologylab + UIMATypeSystem + 3.0.14 + + + + + + + + + + + + + + + + org.junit.jupiter + junit-jupiter + 5.9.0 + test + + + + org.dkpro.core + dkpro-core-api-segmentation-asl + test + + + + org.dkpro.core + dkpro-core-io-xmi-asl + test + + + + org.dkpro.core + dkpro-core-api-resources-asl + test + + + \ No newline at end 
of file diff --git a/duui-Genre/requirements.txt b/duui-Genre/requirements.txt new file mode 100644 index 00000000..c8109fba --- /dev/null +++ b/duui-Genre/requirements.txt @@ -0,0 +1,14 @@ +torch==2.11.0 +torchaudio==2.11.0 +torchvision==0.26.0 +scipy==1.17.1 +transformers==5.9.0 +sentencepiece==0.2.1 +protobuf==4.25.3 +numpy==2.4.6 +scikit-learn==1.8.0 +fastapi==0.110.0 +dkpro-cassis==0.9.1 +uvicorn[standard]==0.27.1 +pydantic-settings==2.0.2 +torchmetrics==1.2.0 \ No newline at end of file diff --git a/duui-Genre/service_start.sh b/duui-Genre/service_start.sh new file mode 100644 index 00000000..34cc130e --- /dev/null +++ b/duui-Genre/service_start.sh @@ -0,0 +1,5 @@ +TEXTIMAGER_DUUI_TRANSFORMERS_TOPIC_ANNOTATOR_NAME="textimager-duui-transformers-topic" \ +TEXTIMAGER_DUUI_TRANSFORMERS_TOPIC_ANNOTATOR_VERSION="unset" \ +TEXTIMAGER_DUUI_TRANSFORMERS_TOPIC_LOG_LEVEL="DEBUG" \ +TEXTIMAGER_DUUI_TRANSFORMERS_TOPIC_MODEL_CACHE_SIZE="1" \ +uvicorn src.main.python.textimager_duui_transformers_topic:app --host 0.0.0.0 --port 9714 --workers 1 diff --git a/duui-Genre/src/main/docker/Dockerfile b/duui-Genre/src/main/docker/Dockerfile new file mode 100644 index 00000000..8b3bd8ec --- /dev/null +++ b/duui-Genre/src/main/docker/Dockerfile @@ -0,0 +1,55 @@ +FROM python:3.12 + +WORKDIR /usr/src/app + +EXPOSE 9714 + +# dependencies +COPY ./requirements.txt ./requirements.txt +RUN pip install -r requirements.txt + +# copy scripts +COPY ./src/main/python/TypeSystemTopic.xml ./TypeSystemTopic.xml +COPY ./src/main/python/duui_genre.py ./duui_genre.py +COPY ./src/main/python/duui_genre.lua ./duui_genre.lua +COPY ./src/main/python/GenreSpeech.py ./GenreSpeech.py + +#RUN python -c "from transformers import pipeline; pipeline('text-classification', model='TurkuNLP/web-register-classification-multilingual'); pipeline('fill-mask', model='FacebookAI/xlm-roberta-large')" +#RUN python -c "from transformers import pipeline; pipeline('text-classification', 
model='TurkuNLP/web-register-classification-en'); pipeline('fill-mask', model='FacebookAI/xlm-roberta-large')" +#RUN python -c "from transformers import pipeline; pipeline('text-classification', model='TurkuNLP/finerweb-quality-classifier')" +#RUN python -c "from transformers import pipeline; pipeline('text-classification', model='ssharoff/genres')" +RUN python -c "from transformers import pipeline; pipeline('text-classification', model='classla/xlm-roberta-base-multilingual-text-genre-classifier')" + +# log level +ARG LOG_LEVEL="DEBUG" +ENV LOG_LEVEL=$LOG_LEVEL + +# config +ARG MODEL_CACHE_SIZE=3 +ENV MODEL_CACHE_SIZE=$MODEL_CACHE_SIZE + +# meta data +ARG ANNOTATOR_NAME="duui-transformers-topic" +ENV ANNOTATOR_NAME=$ANNOTATOR_NAME +ARG ANNOTATOR_VERSION="unset" +ENV ANNOTATOR_VERSION=$ANNOTATOR_VERSION + +# Model Info +ARG MODEL_VERSION=0.1 +ENV MODEL_VERSION=$MODEL_VERSION +ARG MODEL_NAME="" +ENV MODEL_NAME=$MODEL_NAME +ARG MODEL_SOURCE="" +ENV MODEL_SOURCE=$MODEL_SOURCE +ARG MODEL_LANG="" +ENV MODEL_LANG=$MODEL_LANG + +# offline mode for huggingface +ARG TEXTIMAGER_DUUI_TRANSFORMERS_SENTIMENT_TRANSFORMERS_OFFLINE=1 +ENV TRANSFORMERS_OFFLINE=$TEXTIMAGER_DUUI_TRANSFORMERS_SENTIMENT_TRANSFORMERS_OFFLINE + + + + +ENTRYPOINT ["uvicorn", "duui_genre:app", "--host", "0.0.0.0", "--port" ,"9714"] +CMD ["--workers", "1"] diff --git a/duui-Genre/src/main/docker/Dockerfile-cuda b/duui-Genre/src/main/docker/Dockerfile-cuda new file mode 100644 index 00000000..e64d175e --- /dev/null +++ b/duui-Genre/src/main/docker/Dockerfile-cuda @@ -0,0 +1,70 @@ +FROM nvidia/cuda:11.8.0-base-ubuntu22.04 + +RUN apt update && \ + DEBIAN_FRONTEND=noninteractive \ + apt install --no-install-recommends -y build-essential software-properties-common && \ + add-apt-repository -y ppa:deadsnakes/ppa && \ + apt install --no-install-recommends -y python3.10 python3-pip python3-setuptools python3-distutils && \ + apt clean && rm -rf /var/lib/apt/lists/* + +RUN ln -s /usr/bin/python3 /usr/bin/python +RUN 
python -m pip install --upgrade pip + +WORKDIR /usr/src/app + +EXPOSE 9714 + +# dependencies +RUN pip install setuptools wheel +COPY ./requirements.txt ./requirements.txt +RUN apt remove -y python3-blinker || true +RUN pip install -r requirements.txt + + + +# dependencies +COPY ./requirements.txt ./requirements.txt +RUN pip install -r requirements.txt + +# copy scripts +COPY ./src/main/python/TypeSystemTopic.xml ./TypeSystemTopic.xml +COPY ./src/main/python/duui_genre.py ./duui_genre.py +COPY ./src/main/python/duui_genre.lua ./duui_genre.lua +COPY ./src/main/python/GenreSpeech.py ./GenreSpeech.py + +RUN python -c "from transformers import pipeline; pipeline('text-classification', model='TurkuNLP/web-register-classification-multilingual')" + + +# log level +ARG LOG_LEVEL="DEBUG" +ENV LOG_LEVEL=$LOG_LEVEL + +# config +ARG MODEL_CACHE_SIZE=3 +ENV MODEL_CACHE_SIZE=$MODEL_CACHE_SIZE + +# meta data +ARG ANNOTATOR_NAME="duui-transformers-topic" +ENV ANNOTATOR_NAME=$ANNOTATOR_NAME +ARG ANNOTATOR_VERSION="unset" +ENV ANNOTATOR_VERSION=$ANNOTATOR_VERSION + +# Model Info +ARG MODEL_VERSION=0.1 +ENV MODEL_VERSION=$MODEL_VERSION +ARG MODEL_NAME="" +ENV MODEL_NAME=$MODEL_NAME +ARG MODEL_SOURCE="" +ENV MODEL_SOURCE=$MODEL_SOURCE +ARG MODEL_LANG="" +ENV MODEL_LANG=$MODEL_LANG + +# offline mode for huggingface +ARG TEXTIMAGER_DUUI_TRANSFORMERS_SENTIMENT_TRANSFORMERS_OFFLINE=1 +ENV TRANSFORMERS_OFFLINE=$TEXTIMAGER_DUUI_TRANSFORMERS_SENTIMENT_TRANSFORMERS_OFFLINE + + + + +ENTRYPOINT ["uvicorn", "duui_genre:app", "--host", "0.0.0.0", "--port" ,"9714"] +CMD ["--workers", "1"] diff --git a/duui-Genre/src/main/python/GenreSpeech.py b/duui-Genre/src/main/python/GenreSpeech.py new file mode 100644 index 00000000..e6c112b1 --- /dev/null +++ b/duui-Genre/src/main/python/GenreSpeech.py @@ -0,0 +1,70 @@ +import torch +import math +from transformers import AutoModelForSequenceClassification, AutoTokenizer +from scipy.special import softmax +import numpy as np +from typing import List + 
# Class-id -> label table used for checkpoints whose name contains "ssharoff"
# (see GenreCheck.__init__); other checkpoints use the model config's id2label.
ssharoff_genres = {
    0: "argum",
    1: "fictive",
    2: "instruct",
    3: "reporting",
    4: "legal",
    5: "personal",
    6: "commercial",
    7: "academic",
    8: "info",
    9: "reviews",
}


def sigmoid(x):
    """Return the logistic function 1 / (1 + e^-x) of a real number.

    The computation branches on the sign of ``x`` so that ``math.exp`` is only
    ever called with a non-positive argument; the naive form raises
    ``OverflowError`` for large negative inputs.
    """
    if x >= 0:
        return 1 / (1 + math.exp(-x))
    z = math.exp(x)
    return z / (1 + z)


class GenreCheck:
    """Sequence-classification wrapper that scores texts against genre labels."""

    def __init__(self, model_name: str, device='cuda:0'):
        """Load tokenizer and model for ``model_name`` and move the model to ``device``.

        :param model_name: Hugging Face model identifier.
        :param device: torch device string the model and inputs are moved to.
        """
        self.device = device
        # The two TurkuNLP register checkpoints are tokenized with the
        # xlm-roberta-large tokenizer instead of their own repository.
        if model_name in (
            "TurkuNLP/web-register-classification-en",
            "TurkuNLP/web-register-classification-multilingual",
        ):
            self.tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-large")
        else:
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)
        if "ssharoff" in model_name:
            self.class_mapping = ssharoff_genres
        else:
            self.class_mapping = self.model.config.id2label
        # genre_prediction indexes this list by class id, so order it by id
        # rather than relying on the mapping's insertion order.
        self.labels = [
            label
            for _, label in sorted(self.class_mapping.items(), key=lambda item: int(item[0]))
        ]

    def genre_prediction(self, texts: List[str]):
        """Score every text against all labels.

        :param texts: batch of input strings; an empty batch yields ``[]``.
        :return: one dict per text mapping label -> softmax probability,
            inserted in descending probability order.
        """
        if not texts:
            # Nothing to tokenize; skip the model call entirely.
            return []

        with torch.no_grad():
            inputs = self.tokenizer(
                texts,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512
            ).to(self.device)

            outputs = self.model(**inputs)
            logits = outputs[0].float()  # cast to float32 before softmax
            probs = torch.softmax(logits, dim=-1)

        score_list = []
        for prob in probs.cpu():
            ranking = torch.argsort(prob, descending=True).tolist()
            score_list.append({self.labels[i]: float(prob[i]) for i in ranking})
        return score_list

# ---- patch framing that followed on the same physical line (kept verbatim) ----
# diff --git a/duui-Genre/src/main/python/TypeSystemTopic.xml b/duui-Genre/src/main/python/TypeSystemTopic.xml new file mode 100644 index 00000000..dc052a36 ---
/dev/null +++ b/duui-Genre/src/main/python/TypeSystemTopic.xml @@ -0,0 +1,132 @@ + + + + + org.texttechnologylab.annotation.AnnotatorMetaData + + uima.cas.AnnotationBase + + + reference + + uima.cas.TOP + + + name + + uima.cas.String + + + version + + uima.cas.String + + + modelName + + uima.cas.String + + + modelVersion + + uima.cas.String + + + + + uima.tcas.DocumentAnnotation + + uima.tcas.Annotation + + + language + + uima.cas.String + + + + + org.texttechnologylab.annotation.DocumentModification + + uima.cas.AnnotationBase + + + user + + uima.cas.String + + + timestamp + + uima.cas.Long + + + comment + + uima.cas.String + + + + + org.hucompute.textimager.uima.type.Sentiment + + uima.tcas.Annotation + + + sentiment + + uima.cas.Double + + + subjectivity + + uima.cas.Double + + + + + org.hucompute.textimager.uima.type.CategorizedSentiment + + org.hucompute.textimager.uima.type.Sentiment + + + pos + + uima.cas.Double + + + neu + + uima.cas.Double + + + neg + + uima.cas.Double + + + + + org.texttechnologylab.annotation.AnnotationComment + + uima.cas.AnnotationBase + + + reference + + uima.cas.TOP + + + value + + uima.cas.String + + + key + + uima.cas.String + + + + + diff --git a/duui-Genre/src/main/python/duui_genre.lua b/duui-Genre/src/main/python/duui_genre.lua new file mode 100644 index 00000000..7fc4ffc6 --- /dev/null +++ b/duui-Genre/src/main/python/duui_genre.lua @@ -0,0 +1,131 @@ +StandardCharsets = luajava.bindClass("java.nio.charset.StandardCharsets") +Class = luajava.bindClass("java.lang.Class") +JCasUtil = luajava.bindClass("org.apache.uima.fit.util.JCasUtil") +TopicUtils = luajava.bindClass("org.texttechnologylab.DockerUnifiedUIMAInterface.lua.DUUILuaUtils") + +function serialize(inputCas, outputStream, parameters) + local doc_lang = inputCas:getDocumentLanguage() + local doc_text = inputCas:getDocumentText() + local doc_len = TopicUtils:getDocumentTextLength(inputCas) + + local selection_types = parameters["selection"] + + local selections = {} + 
local selections_count = 1 + for selection_type in string.gmatch(selection_types, "([^,]+)") do + local sentences = {} + if selection_type == "text" then + local s = { + text = doc_text, + begin = 0, + ['end'] = doc_len + } + sentences[1] = s + else + local sentences_count = 1 + local clazz = Class:forName(selection_type); + local sentences_it = JCasUtil:select(inputCas, clazz):iterator() + while sentences_it:hasNext() do + local sentence = sentences_it:next() + local s = { + text = sentence:getCoveredText(), + begin = sentence:getBegin(), + ['end'] = sentence:getEnd() + } + sentences[sentences_count] = s + sentences_count = sentences_count + 1 + end + end + + local selection = { + sentences = sentences, + selection = selection_type + } + selections[selections_count] = selection + selections_count = selections_count + 1 + end + + outputStream:write(json.encode({ + selections = selections, + lang = doc_lang, + doc_len = doc_len + })) +end + +function deserialize(inputCas, inputStream) + local inputString = luajava.newInstance("java.lang.String", inputStream:readAllBytes(), StandardCharsets.UTF_8) + local results = json.decode(inputString) + if results["modification_meta"] ~= nil and results["meta"] ~= nil and results["results"] ~= nil then + -- print("GetInfo") + local source = results["model_source"] + local model_version = results["model_version"] + local model_name = results["model_name"] + local model_lang = results["model_lang"] + -- print("meta") + local modification_meta = results["modification_meta"] + local modification_anno = luajava.newInstance("org.texttechnologylab.annotation.DocumentModification", inputCas) + modification_anno:setUser(modification_meta["user"]) + modification_anno:setTimestamp(modification_meta["timestamp"]) + modification_anno:setComment(modification_meta["comment"]) + modification_anno:addToIndexes() + + -- print("setMetaData") + local model_meta = luajava.newInstance("org.texttechnologylab.annotation.model.MetaData", inputCas) + 
model_meta:setModelVersion(model_version) + -- print(model_version) + model_meta:setModelName(model_name) + -- print(model_name) + model_meta:setSource(source) + -- print(source) + model_meta:setLang(model_lang) + -- print(model_lang) + model_meta:addToIndexes() + + local meta = results["meta"] + -- print("meta") + local begin_genre = results["begin"] + -- print("begin_emo") + local end_genre = results["end"] + -- print("end_emo") + local res_out = results["results"] +-- print("results") + local res_len = results["len_results"] + -- print("Len_results") + local factors = results["factors"] +-- print(factors) + for index_i, res in ipairs(res_out) do + -- print(res) + local begin_genre_i = begin_genre[index_i] + -- print(begin_genre_i) + local end_genre_i = end_genre[index_i] + -- print(end_genre_i) + local len_i = res_len[index_i] + -- print(len_i) + -- print(type(len_i)) + local genre_i = luajava.newInstance("org.texttechnologylab.annotation.Genre", inputCas, begin_genre_i, end_genre_i) + -- print(genre_i) + local fsarray = luajava.newInstance("org.apache.uima.jcas.cas.FSArray", inputCas, len_i) + -- print(fsarray) + genre_i:setGenres(fsarray) + local counter = 0 + local factor_i = factors[index_i] + -- print(factor_i) + for index_j, genre_j in ipairs(res) do + -- print(genre_j) + local factor_j = factor_i[index_j] + -- print(factor_j) + genre_in_i = luajava.newInstance("org.texttechnologylab.annotation.AnnotationComment", inputCas) + genre_in_i:setReference(genre_i) + genre_in_i:setKey(genre_j) + genre_in_i:setValue(factor_j) + genre_in_i:addToIndexes() + genre_i:setGenres(counter, genre_in_i) + counter = counter + 1 + end + genre_i:setModel(model_meta) + genre_i:addToIndexes() + -- print("add") + end + end + -- print("end") + end diff --git a/duui-Genre/src/main/python/duui_genre.py b/duui-Genre/src/main/python/duui_genre.py new file mode 100644 index 00000000..ccb8763d --- /dev/null +++ b/duui-Genre/src/main/python/duui_genre.py @@ -0,0 +1,278 @@ +from pydantic 
import BaseModel +from pydantic_settings import BaseSettings +from typing import List, Optional, Dict, Union +import logging +from time import time +from fastapi import FastAPI, Response +from cassis import load_typesystem +import torch +from threading import Lock +from functools import lru_cache +from GenreSpeech import GenreCheck +# from sp_correction import SentenceBestPrediction + +# Settings +# These are automatically loaded from env variables +from starlette.responses import PlainTextResponse + +model_lock = Lock() + + +class UimaSentence(BaseModel): + text: str + begin: int + end: int + + +class UimaSentenceSelection(BaseModel): + selection: str + sentences: List[UimaSentence] + + +class Settings(BaseSettings): + # Name of this annotator + annotator_name: str + # Version of this annotator + annotator_version: str + # Log level + log_level: str + # model_name + model_name: str + # Name of this annotator + model_version: str + #cach_size + model_cache_size: int + # url of the model + model_source: str + # language of the model + model_lang: str + + +# Load settings from env vars +settings = Settings() +lru_cache_with_size = lru_cache(maxsize=settings.model_cache_size) +logging.basicConfig(level=settings.log_level) +logger = logging.getLogger(__name__) + +device = "cuda:0" if torch.cuda.is_available() else "cpu" +# device = "cpu" +logger.info(f'USING {device}') +# Load the predefined typesystem that is needed for this annotator to work +typesystem_filename = 'TypeSystemTopic.xml' +logger.debug("Loading typesystem from \"%s\"", typesystem_filename) +with open(typesystem_filename, 'rb') as f: + typesystem = load_typesystem(f) + logger.debug("Base typesystem:") + logger.debug(typesystem.to_xml()) + +# Load the Lua communication script +lua_communication_script_filename = "duui_genre.lua" +logger.debug("Loading Lua communication script from \"%s\"", lua_communication_script_filename) + + +# Request sent by DUUI +# Note, this is transformed by the Lua script +class 
DUUIRequest(BaseModel): + # The texts language + doc_len: int + # + lang: str + # + selections: List[UimaSentenceSelection] + # + + +# UIMA type: mark modification of the document +class DocumentModification(BaseModel): + user: str + timestamp: int + comment: str + + +# UIMA type: adds metadata to each annotation +class AnnotationMeta(BaseModel): + name: str + version: str + modelName: str + modelVersion: str + + +# Response sent by DUUI +# Note, this is transformed by the Lua script +class DUUIResponse(BaseModel): + # Symspelloutput + # List of Sentence with every token + # Every token is a dictionary with following Infos: + # Symspelloutput right if the token is correct, wrong if the token is incorrect, skipped if the token was skipped, unkownn if token can corrected with Symspell + # If token is unkown it will be predicted with BERT Three output pos: + # 1. Best Prediction with BERT MASKED + # 2. Best Cos-sim with Sentence-Bert and with perdicted words of BERT MASK + # 3. Option 1 and 2 together + meta: AnnotationMeta + # Modification meta, one per document + modification_meta: DocumentModification + begin: List[int] + end: List[int] + results: List + factors: List + len_results: List[int] + model_name: str + model_version: str + model_source: str + model_lang: str + + +app = FastAPI( + openapi_url="/openapi.json", + docs_url="/api", + redoc_url=None, + title=settings.annotator_name, + description="Factuality annotator", + version=settings.annotator_version, + terms_of_service="https://www.texttechnologylab.org/legal_notice/", + contact={ + "name": "TTLab Team", + "url": "https://texttechnologylab.org", + "email": "bagci@em.uni-frankfurt.de", + }, + license_info={ + "name": "AGPL", + "url": "http://www.gnu.org/licenses/agpl-3.0.en.html", + }, +) + +with open(lua_communication_script_filename, 'rb') as f: + lua_communication_script = f.read().decode("utf-8") +logger.debug("Lua communication script:") +logger.debug(lua_communication_script_filename) + + +# Get 
# Get typesystem of this annotator
@app.get("/v1/typesystem")
def get_typesystem() -> Response:
    """Return the typesystem loaded at import time, serialized as XML."""
    # TODO remove cassis dependency, as only needed for typesystem at the moment?
    xml_content = typesystem.to_xml().encode("utf-8")
    return Response(
        content=xml_content,
        media_type="application/xml"
    )


# Return Lua communication script
@app.get("/v1/communication_layer", response_class=PlainTextResponse)
def get_communication_layer() -> str:
    """Return the Lua communication script read at import time."""
    return lua_communication_script


# Return documentation info
@app.get("/v1/documentation")
def get_documentation():
    """Placeholder documentation endpoint."""
    return "Test"


@lru_cache_with_size
def load_model(model_name):
    """Build a GenreCheck for ``model_name`` on the selected device (LRU-cached)."""
    return GenreCheck(model_name, device)


def fix_unicode_problems(text):
    """Round-trip ``text`` through UTF-16 to repair surrogate pairs.

    Fixes emoji in Python strings and prevents a JSON error on response:
    UnicodeEncodeError: 'utf-8' codec can't encode characters ...: surrogates not allowed
    """
    clean_text = text.encode('utf-16', 'surrogatepass').decode('utf-16', 'surrogateescape')
    return clean_text


def process_selection(model_name, selection):
    """Classify every sentence of one selection.

    :param model_name: model identifier handed to ``load_model``.
    :param selection: object with a ``sentences`` list; each sentence has
        ``text``, ``begin`` and ``end``. Sentence texts are cleaned in place.
    :return: dict of parallel lists ``begin``, ``end``, ``len_results``,
        ``results`` (labels per sentence) and ``factors`` (scores per sentence).
    """
    begin = []
    end = []
    results_out = []
    factors = []
    len_results = []

    for s in selection.sentences:
        s.text = fix_unicode_problems(s.text)
    texts = [s.text for s in selection.sentences]
    logger.debug("Preprocessed texts:")
    logger.debug(texts)

    if texts:
        # Only model loading is serialized; inference runs outside the lock.
        with model_lock:
            classifier = load_model(model_name)
        results = classifier.genre_prediction(texts)
    else:
        # A selection type without any annotation: nothing to classify.
        results = []

    for sentence_i, res in zip(selection.sentences, results):
        begin.append(sentence_i.begin)
        end.append(sentence_i.end)
        len_results.append(len(res))
        # res maps label -> score; keep both in the same order.
        results_out.append(list(res.keys()))
        factors.append(list(res.values()))

    return {
        "begin": begin,
        "end": end,
        "len_results": len_results,
        "results": results_out,
        "factors": factors
    }


# Process request from DUUI
@app.post("/v1/process")
def post_process(request: DUUIRequest):
    """Classify all selections of a request and return a DUUIResponse.

    Metadata is built before the ``try`` block so that the response can always
    be constructed. A failure while processing selections is logged, and the
    results collected up to that point are returned.
    """
    # Save modification start time for later
    modification_timestamp_seconds = int(time())

    model_source = settings.model_source
    model_lang = settings.model_lang
    model_version = settings.model_version

    # set meta Informations
    meta = AnnotationMeta(
        name=settings.annotator_name,
        version=settings.annotator_version,
        modelName=settings.model_name,
        modelVersion=model_version,
    )
    # Add modification info
    modification_meta = DocumentModification(
        user=settings.annotator_name,
        timestamp=modification_timestamp_seconds,
        comment=f"{settings.annotator_name} ({settings.annotator_version})"
    )

    begin = []
    end = []
    len_results = []
    results = []
    factors = []
    try:
        for selection in request.selections:
            processed_sentences = process_selection(settings.model_name, selection)
            begin.extend(processed_sentences["begin"])
            end.extend(processed_sentences["end"])
            len_results.extend(processed_sentences["len_results"])
            results.extend(processed_sentences["results"])
            factors.extend(processed_sentences["factors"])
    except Exception as ex:
        logger.exception(ex)

    return DUUIResponse(meta=meta, modification_meta=modification_meta, begin=begin, end=end, results=results,
                        len_results=len_results, factors=factors, model_name=settings.model_name,
                        model_version=model_version, model_source=model_source, model_lang=model_lang)

# ---- patch framing that followed on the same physical line (kept verbatim) ----
# diff --git a/duui-Genre/src/test/java/org/hucompute/textimager/uima/genre/GenreTest.java b/duui-Genre/src/test/java/org/hucompute/textimager/uima/genre/GenreTest.java new file mode 100644 index 00000000..bc90db90 --- /dev/null +++ b/duui-Genre/src/test/java/org/hucompute/textimager/uima/genre/GenreTest.java @@ -0,0 +1,180 @@ +package
org.hucompute.textimager.uima.genre; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import org.apache.commons.compress.compressors.CompressorException; +import org.apache.uima.UIMAException; +import org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.cas.FSArray; +import org.apache.uima.util.XmlCasSerializer; +import org.junit.jupiter.api.*; +import org.texttechnologylab.DockerUnifiedUIMAInterface.DUUIComposer; +import org.texttechnologylab.DockerUnifiedUIMAInterface.driver.DUUIRemoteDriver; +import org.texttechnologylab.DockerUnifiedUIMAInterface.lua.DUUILuaContext; +import org.xml.sax.SAXException; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.net.URISyntaxException; +import java.net.UnknownHostException; +import java.nio.charset.StandardCharsets; +import java.util.*; + +import static org.junit.jupiter.api.Assertions.assertArrayEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; + +import org.texttechnologylab.annotation.Genre; +import org.texttechnologylab.annotation.AnnotationComment; + +public class GenreTest { + static DUUIComposer composer; + static JCas cas; + + static String url = "http://127.0.0.1:9714"; +// static String url = "http://tweentopic.service.component.duui.texttechnologylab.org"; +// static String model = "chkla/parlbert-topic-german"; + + @BeforeAll + static void beforeAll() throws URISyntaxException, IOException, UIMAException, SAXException, CompressorException { + composer = new DUUIComposer() + .withSkipVerification(true) + .withLuaContext(new DUUILuaContext().withJsonLibrary()); + + DUUIRemoteDriver remoteDriver = new DUUIRemoteDriver(); + composer.addDriver(remoteDriver); +// DUUIDockerDriver docker_driver = new DUUIDockerDriver(); +// composer.addDriver(docker_driver); + + + cas = JCasFactory.createJCas(); + } + + @AfterAll + static void afterAll() throws 
UnknownHostException { + composer.shutdown(); + } + + @AfterEach + public void afterEach() throws IOException, SAXException { + composer.resetPipeline(); + + ByteArrayOutputStream stream = new ByteArrayOutputStream(); + XmlCasSerializer.serialize(cas.getCas(), null, stream); + System.out.println(stream.toString(StandardCharsets.UTF_8)); + + cas.reset(); + } + + public void createCas(String language, List sentences) throws UIMAException { + cas.setDocumentLanguage(language); + + StringBuilder sb = new StringBuilder(); + for (String sentence : sentences) { + Sentence sentenceAnnotation = new Sentence(cas, sb.length(), sb.length()+sentence.length()); + sentenceAnnotation.addToIndexes(); + sb.append(sentence).append(" "); + } + + cas.setDocumentText(sb.toString()); + } + + @Test + public void DeTest() throws Exception { + HashMap> expected1 = new HashMap<>(); + ArrayList expected2 = new ArrayList<>(); + expected2.add("Domestic"); + expected2.add("Technology"); + expected1.put("test", expected2); + composer.add( + new DUUIRemoteDriver.Component(url) + .withParameter("selection", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence") + ); + + List sentences = Arrays.asList( + "Ich bin ein Profi-Fußballspieler und spiele bei FC Barcelona in Spanien.", + "Das sind die Aktuellen Neuigkeiten aus den USA. Joe Biden hat die Wahl gewonnen." 
+ ); + + createCas("de", sentences); + composer.run(cas); + + Collection all_topics = JCasUtil.select(cas, Genre.class); + ArrayList> expected = new ArrayList>(); + for (Genre topic: all_topics){ + System.out.println(topic.getCoveredText()); + Map topics = new HashMap(); + FSArray topics_all = topic.getGenres(); + for (AnnotationComment comment_i: topics_all){ + topics.put(comment_i.getKey(), Float.parseFloat(comment_i.getValue())); + System.out.println("key:"+comment_i.getKey()+"; Value:"+comment_i.getValue()); + } + expected.add(topics); + } + + for (Map topic: expected){ + // highest value + String key = Collections.max(topic.entrySet(), Map.Entry.comparingByValue()).getKey(); + Assertions.assertEquals(expected1.get("test").get(expected.indexOf(topic)), key); + } + } + + @Test + public void EnTest() throws Exception { + composer.add( + new DUUIRemoteDriver.Component(url) + .withParameter("selection", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence") + ); + + List sentences = Arrays.asList( + "I will guide through the Labyrinth. First you need to find the entrance. Then you need to find the exit.", + "These are the latest news from the USA. Joe Biden has won the election." 
+ ); + + createCas("de", sentences); + composer.run(cas); + + Collection all_topics = JCasUtil.select(cas, Genre.class); + ArrayList> expected = new ArrayList>(); + for (Genre genre: all_topics){ + System.out.println(genre.getCoveredText()); + Map topics = new HashMap(); + String model_name = genre.getModel().getModelName(); + FSArray topics_all = genre.getGenres(); + for (AnnotationComment comment_i: topics_all){ + topics.put(comment_i.getKey(), Float.parseFloat(comment_i.getValue())); + System.out.println("key:"+comment_i.getKey()+"; Value:"+comment_i.getValue()); + } + expected.add(topics); + } + +// HashMap> expected = new HashMap<>(); +// Collection topics = JCasUtil.select(cas, CategoryCoveredTagged.class); +//// System.out.println(topics.size()); +// for (CategoryCoveredTagged topic: topics){ +// int start = topic.getBegin(); +// int end = topic.getEnd(); +// String coveredText = topic.getCoveredText(); +// String value = topic.getValue(); +// double score = topic.getScore(); +// String key1 = start + "_" + end; +// HashMap value1 = new HashMap<>(); +// value1.put(value, score); +// if (expected.containsKey(key1)){ +// expected.get(key1).put(value, score); +// } else { +// expected.put(key1, value1); +// } +// } +// HashMap expected1 = new HashMap<>(); +// expected1.put("0_104", "Instruction"); +// expected1.put("105_176", "News"); +// for (Map.Entry> entry: expected.entrySet()){ +// String key = Collections.max(entry.getValue().entrySet(), Map.Entry.comparingByValue()).getKey(); +// // compare the expected with same index in the actual +// String expectedValue = expected1.get(entry.getKey()); +// assertEquals(expectedValue, key); +// } + + } +} From 190d97cf36e8d60427025ebbc06dcf193bf977c6 Mon Sep 17 00:00:00 2001 From: bagci Date: Thu, 28 May 2026 17:41:59 +0200 Subject: [PATCH 04/19] Add Climate classification in DUUI. 
Models: distilroberta-base-climate-sentiment, distilroberta-base-climate-tcfd, distilroberta-base-climate-commitment, distilroberta-base-climate-sentiment, distilroberta-base-climate-specificity --- duui-Climate/.dockerignore | 3 + duui-Climate/.gitignore | 3 + duui-Climate/Readme.md | 90 ++++++ duui-Climate/docker_build.sh | 70 +++++ duui-Climate/pom.xml | 157 ++++++++++ duui-Climate/requirements.txt | 14 + duui-Climate/src/main/docker/Dockerfile | 55 ++++ duui-Climate/src/main/docker/Dockerfile-cuda | 74 +++++ duui-Climate/src/main/python/Climate.py | 53 ++++ .../src/main/python/TypeSystemTopic.xml | 132 ++++++++ duui-Climate/src/main/python/duui_climate.lua | 133 ++++++++ duui-Climate/src/main/python/duui_climate.py | 286 ++++++++++++++++++ .../textimager/uima/climate/ClimateTest.java | 43 +-- 13 files changed, 1093 insertions(+), 20 deletions(-) create mode 100644 duui-Climate/.dockerignore create mode 100644 duui-Climate/.gitignore create mode 100644 duui-Climate/Readme.md create mode 100644 duui-Climate/docker_build.sh create mode 100644 duui-Climate/pom.xml create mode 100644 duui-Climate/requirements.txt create mode 100644 duui-Climate/src/main/docker/Dockerfile create mode 100644 duui-Climate/src/main/docker/Dockerfile-cuda create mode 100644 duui-Climate/src/main/python/Climate.py create mode 100644 duui-Climate/src/main/python/TypeSystemTopic.xml create mode 100644 duui-Climate/src/main/python/duui_climate.lua create mode 100644 duui-Climate/src/main/python/duui_climate.py rename duui-Genre/src/test/java/org/hucompute/textimager/uima/genre/GenreTest.java => duui-Climate/src/test/java/org/hucompute/textimager/uima/climate/ClimateTest.java (82%) diff --git a/duui-Climate/.dockerignore b/duui-Climate/.dockerignore new file mode 100644 index 00000000..caab0c36 --- /dev/null +++ b/duui-Climate/.dockerignore @@ -0,0 +1,3 @@ +.idea/ +target/ +venv/ \ No newline at end of file diff --git a/duui-Climate/.gitignore b/duui-Climate/.gitignore new file mode 100644 
index 00000000..d2092691 --- /dev/null +++ b/duui-Climate/.gitignore @@ -0,0 +1,3 @@ +.idea/ +target/ +venv*/ \ No newline at end of file diff --git a/duui-Climate/Readme.md b/duui-Climate/Readme.md new file mode 100644 index 00000000..52078b0b --- /dev/null +++ b/duui-Climate/Readme.md @@ -0,0 +1,90 @@ +[![Version](https://img.shields.io/static/v1?label=duui-climate&message=0.1.0&color=blue)](https://docker.texttechnologylab.org/v2/duui-transformers-topic/tags/list) +[![Version](https://img.shields.io/static/v1?label=Python&message=3.12&color=green)]() +[![Version](https://img.shields.io/static/v1?label=Transformers&message=5.9.0&color=yellow)]() +[![Version](https://img.shields.io/static/v1?label=Torch&message=2.11.0&color=red)]() + +# Transformers Climate + +DUUI implementation for selected Hugging-Face-based transformer [Climate tools](https://huggingface.co/models?sort=trending&search=climatebert) models. +## Included Models + +| Name | | Revision | Languages | +|-------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------|--------------------------------|----------| +| distilroberta-base-climate-sentiment | https://huggingface.co/climatebert/distilroberta-base-climate-sentiment | e9f9a94ee4263f5ad5cfc97b8539a497fc88aa7d | EN | +| distilroberta-base-climate-tcfd | https://huggingface.co/climatebert/distilroberta-base-climate-tcfd | 970630beedc21db81a84156448ad2e3ac860153d | EN | +| distilroberta-base-climate-commitment | https://huggingface.co/climatebert/distilroberta-base-climate-commitment | 17337c3292df16a8fe93b1505dfe4122d50a4c91 | EN | +| distilroberta-base-climate-sentiment | https://huggingface.co/climatebert/distilroberta-base-climate-sentiment | e9f9a94ee4263f5ad5cfc97b8539a497fc88aa7d | EN | +| distilroberta-base-climate-specificity | https://huggingface.co/climatebert/distilroberta-base-climate-specificity | 
4ada96ed4bf5c3a7a711282e41f1ab9b29f0ddea | EN | + +# How To Use + +To use duui-climate as a DUUI image, the [Docker Unified UIMA Interface (DUUI)](https://github.com/texttechnologylab/DockerUnifiedUIMAInterface) is required. + +## Start Docker container + +``` +docker run --rm -p 9714:9714 docker.texttechnologylab.org/duui-climate-[modelname]:latest + +``` + +Find all available image tags here: [https://docker.texttechnologylab.org/v2/duui-climate-[modelname]/tags/list](https://docker.texttechnologylab.org/v2/duui-climate-[modelname]/tags/list) + +## Run within DUUI + +``` +composer.add( + new DUUIDockerDriver.Component("docker.texttechnologylab.org/duui-climate-[modelname]:latest") + .withParameter("selection", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence") +); +``` + +### Parameters + +| Name | Description | +| ---- | ----------- | +| `selection` | Use `text` to process the full document text, or the class name of any selectable UIMA type; several values can be given as a comma-separated list | + +# Cite + +If you use this DUUI image, please cite it as follows: + +Alexander Leonhardt, Giuseppe Abrami, Daniel Baumartz and Alexander Mehler. (2023). "Unlocking the Heterogeneous Landscape of Big Data NLP with DUUI." Findings of the Association for Computational Linguistics: EMNLP 2023, 385–399.
[[LINK](https://aclanthology.org/2023.findings-emnlp.29)] [[PDF](https://aclanthology.org/2023.findings-emnlp.29.pdf)] + +## BibTeX + +``` +@inproceedings{Leonhardt:et:al:2023, + title = {Unlocking the Heterogeneous Landscape of Big Data {NLP} with {DUUI}}, + author = {Leonhardt, Alexander and Abrami, Giuseppe and Baumartz, Daniel and Mehler, Alexander}, + editor = {Bouamor, Houda and Pino, Juan and Bali, Kalika}, + booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2023}, + year = {2023}, + address = {Singapore}, + publisher = {Association for Computational Linguistics}, + url = {https://aclanthology.org/2023.findings-emnlp.29}, + pages = {385--399}, + pdf = {https://aclanthology.org/2023.findings-emnlp.29.pdf}, + abstract = {Automatic analysis of large corpora is a complex task, especially + in terms of time efficiency. This complexity is increased by the + fact that flexible, extensible text analysis requires the continuous + integration of ever new tools. Since there are no adequate frameworks + for these purposes in the field of NLP, and especially in the + context of UIMA, that are not outdated or unusable for security + reasons, we present a new approach to address the latter task: + Docker Unified UIMA Interface (DUUI), a scalable, flexible, lightweight, + and feature-rich framework for automatic distributed analysis + of text corpora that leverages Big Data experience and virtualization + with Docker. 
We evaluate DUUI{'}s communication approach against + a state-of-the-art approach and demonstrate its outstanding behavior + in terms of time efficiency, enabling the analysis of big text + data.} +} + +@misc{Bagci:2024, + author = {Bagci, Mevlüt}, + title = {Hugging-Face-based climate models as {DUUI} component}, + year = {2026}, + howpublished = {https://github.com/texttechnologylab/duui-uima/tree/main/duui-Climate} +} + +``` diff --git a/duui-Climate/docker_build.sh b/duui-Climate/docker_build.sh new file mode 100644 index 00000000..0abfc296 --- /dev/null +++ b/duui-Climate/docker_build.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash +set -euo pipefail + +export ANNOTATOR_CUDA= +#export ANNOTATOR_CUDA="-cuda" + +export ANNOTATOR_NAME=duui-climate +export ANNOTATOR_VERSION=0.1.0 +export LOG_LEVEL=DEBUG +export MODEL_CACHE_SIZE=3 +export DOCKER_REGISTRY="docker.texttechnologylab.org/" + +###--------------------------------------------------------------------- +#export MODEL_NAME="climatebert/distilroberta-base-climate-detector" +#export MODEL_SPECNAME="distilroberta-base-climate-detector" +#export MODEL_VERSION="2c3bc660d45a59e31b35f5d3e365ee4f59fdf76c" +#export MODEL_SOURCE="https://huggingface.co/climatebert/distilroberta-base-climate-detector" +#export MODEL_LANG="EN" +###-------------------------------------------------------------------- + +###--------------------------------------------------------------------- +#export MODEL_NAME="climatebert/distilroberta-base-climate-tcfd" +#export MODEL_SPECNAME="distilroberta-base-climate-tcfd" +#export MODEL_VERSION="970630beedc21db81a84156448ad2e3ac860153d" +#export MODEL_SOURCE="https://huggingface.co/climatebert/distilroberta-base-climate-tcfd" +#export MODEL_LANG="EN" +###-------------------------------------------------------------------- + +###--------------------------------------------------------------------- +#export MODEL_NAME="climatebert/distilroberta-base-climate-commitment" +#export 
MODEL_SPECNAME="distilroberta-base-climate-commitment" +#export MODEL_VERSION="17337c3292df16a8fe93b1505dfe4122d50a4c91" +#export MODEL_SOURCE="https://huggingface.co/climatebert/distilroberta-base-climate-commitment" +#export MODEL_LANG="EN" +###-------------------------------------------------------------------- + +###--------------------------------------------------------------------- +#export MODEL_NAME="climatebert/distilroberta-base-climate-sentiment" +#export MODEL_SPECNAME="distilroberta-base-climate-sentiment" +#export MODEL_VERSION="e9f9a94ee4263f5ad5cfc97b8539a497fc88aa7d" +#export MODEL_SOURCE="https://huggingface.co/climatebert/distilroberta-base-climate-sentiment" +#export MODEL_LANG="EN" +###-------------------------------------------------------------------- + +##--------------------------------------------------------------------- +export MODEL_NAME="climatebert/distilroberta-base-climate-specificity" +export MODEL_SPECNAME="distilroberta-base-climate-specificity" +export MODEL_VERSION="4ada96ed4bf5c3a7a711282e41f1ab9b29f0ddea" +export MODEL_SOURCE="https://huggingface.co/climatebert/distilroberta-base-climate-specificity" +export MODEL_LANG="EN" +##-------------------------------------------------------------------- + + + +docker build \ + --build-arg ANNOTATOR_NAME \ + --build-arg ANNOTATOR_VERSION \ + --build-arg LOG_LEVEL \ + --build-arg MODEL_CACHE_SIZE \ + --build-arg MODEL_NAME \ + --build-arg MODEL_VERSION \ + --build-arg MODEL_SOURCE \ + --build-arg MODEL_LANG \ + -t ${DOCKER_REGISTRY}${ANNOTATOR_NAME}"-"${MODEL_SPECNAME}:${ANNOTATOR_VERSION}${ANNOTATOR_CUDA} \ + -f src/main/docker/Dockerfile${ANNOTATOR_CUDA} \ + . 
+ +docker tag \ + ${DOCKER_REGISTRY}${ANNOTATOR_NAME}"-"${MODEL_SPECNAME}:${ANNOTATOR_VERSION}${ANNOTATOR_CUDA} \ + ${DOCKER_REGISTRY}${ANNOTATOR_NAME}"-"${MODEL_SPECNAME}:latest${ANNOTATOR_CUDA} diff --git a/duui-Climate/pom.xml b/duui-Climate/pom.xml new file mode 100644 index 00000000..23c49fe7 --- /dev/null +++ b/duui-Climate/pom.xml @@ -0,0 +1,157 @@ + + + 4.0.0 + + org.texttechnologylab.duui + duui-climate + 0.1.0 + + + + AGPL-3.0-or-later + https://www.gnu.org/licenses/agpl.txt + repo + GNU Affero General Public License v3.0 or later + + + + + Texttechnology Lab + https://www.texttechnologylab.org + + + + mehler + Prof. Dr. Alexander Mehler + mehler@em.uni-frankfurt.de + https://www.texttechnologylab.org/team/alexander-abrami/ + Goethe University Frankfurt / Texttechnology Lab + https://www.texttechnologylab.org + + head of department + + + + bagci + Mevlüt Bagci + bagci@em.uni-frankfurt.de + https://www.texttechnologylab.org/team/mevl%c3%bct-bagci/ + Goethe University Frankfurt / Texttechnology Lab + https://www.texttechnologylab.org + + lead developer + + Europe/Berlin + + + + + + + org.apache.maven.plugins + maven-surefire-plugin + 2.22.0 + + + --illegal-access=permit + --add-opens java.base/java.util=ALL-UNNAMED + + + + + + + + + + 17 + 17 + 2.4.0 + + + + + + + jitpack.io + https://jitpack.io + + + + + + + org.dkpro.core + dkpro-core-asl + ${dkpro.core.version} + pom + import + + + + + + + + com.github.mevbagci + DockerUnifiedUIMAInterface + + + ad501be374 + + + + + + + + + com.github.mevbagci + UIMATypeSystem + 3.0.23.1 + + + + + + + + + + + + + + + + org.junit.jupiter + junit-jupiter + 5.9.0 + test + + + + org.dkpro.core + dkpro-core-api-segmentation-asl + test + + + + org.dkpro.core + dkpro-core-io-xmi-asl + test + + + + org.dkpro.core + dkpro-core-api-resources-asl + test + + + \ No newline at end of file diff --git a/duui-Climate/requirements.txt b/duui-Climate/requirements.txt new file mode 100644 index 00000000..c8109fba --- /dev/null +++ 
b/duui-Climate/requirements.txt @@ -0,0 +1,14 @@ +torch==2.11.0 +torchaudio==2.11.0 +torchvision==0.26.0 +scipy==1.17.1 +transformers==5.9.0 +sentencepiece==0.2.1 +protobuf==4.25.3 +numpy==2.4.6 +scikit-learn==1.8.0 +fastapi==0.110.0 +dkpro-cassis==0.9.1 +uvicorn[standard]==0.27.1 +pydantic-settings==2.0.2 +torchmetrics==1.2.0 \ No newline at end of file diff --git a/duui-Climate/src/main/docker/Dockerfile b/duui-Climate/src/main/docker/Dockerfile new file mode 100644 index 00000000..69b89a12 --- /dev/null +++ b/duui-Climate/src/main/docker/Dockerfile @@ -0,0 +1,55 @@ +FROM python:3.12 + +WORKDIR /usr/src/app + +EXPOSE 9714 + +# dependencies +COPY ./requirements.txt ./requirements.txt +RUN pip install -r requirements.txt + +# copy scripts +COPY ./src/main/python/TypeSystemTopic.xml ./TypeSystemTopic.xml +COPY ./src/main/python/duui_climate.py ./duui_climate.py +COPY ./src/main/python/duui_climate.lua ./duui_climate.lua +COPY ./src/main/python/Climate.py ./Climate.py + +#RUN python -c "from transformers import AutoModelForSequenceClassification, AutoTokenizer; AutoTokenizer.from_pretrained('climatebert/distilroberta-base-climate-detector'); AutoModelForSequenceClassification.from_pretrained('climatebert/distilroberta-base-climate-detector')" +#RUN python -c "from transformers import AutoModelForSequenceClassification, AutoTokenizer; AutoTokenizer.from_pretrained('climatebert/distilroberta-base-climate-tcfd'); AutoModelForSequenceClassification.from_pretrained('climatebert/distilroberta-base-climate-tcfd')" +#RUN python -c "from transformers import AutoModelForSequenceClassification, AutoTokenizer; AutoTokenizer.from_pretrained('climatebert/distilroberta-base-climate-commitment'); AutoModelForSequenceClassification.from_pretrained('climatebert/distilroberta-base-climate-commitment')" +#RUN python -c "from transformers import AutoModelForSequenceClassification, AutoTokenizer; AutoTokenizer.from_pretrained('climatebert/distilroberta-base-climate-sentiment'); 
AutoModelForSequenceClassification.from_pretrained('climatebert/distilroberta-base-climate-sentiment')" +RUN python -c "from transformers import AutoModelForSequenceClassification, AutoTokenizer; AutoTokenizer.from_pretrained('climatebert/distilroberta-base-climate-specificity'); AutoModelForSequenceClassification.from_pretrained('climatebert/distilroberta-base-climate-specificity')" + +# log level +ARG LOG_LEVEL="DEBUG" +ENV LOG_LEVEL=$LOG_LEVEL + +# config +ARG MODEL_CACHE_SIZE=3 +ENV MODEL_CACHE_SIZE=$MODEL_CACHE_SIZE + +# meta data +ARG ANNOTATOR_NAME="duui-climate" +ENV ANNOTATOR_NAME=$ANNOTATOR_NAME +ARG ANNOTATOR_VERSION="unset" +ENV ANNOTATOR_VERSION=$ANNOTATOR_VERSION + +# Model Info +ARG MODEL_VERSION=0.1 +ENV MODEL_VERSION=$MODEL_VERSION +ARG MODEL_NAME="" +ENV MODEL_NAME=$MODEL_NAME +ARG MODEL_SOURCE="" +ENV MODEL_SOURCE=$MODEL_SOURCE +ARG MODEL_LANG="" +ENV MODEL_LANG=$MODEL_LANG + +# offline mode for huggingface +ARG TEXTIMAGER_DUUI_TRANSFORMERS_SENTIMENT_TRANSFORMERS_OFFLINE=1 +ENV TRANSFORMERS_OFFLINE=$TEXTIMAGER_DUUI_TRANSFORMERS_SENTIMENT_TRANSFORMERS_OFFLINE + + + + +ENTRYPOINT ["uvicorn", "duui_climate:app", "--host", "0.0.0.0", "--port" ,"9714"] +CMD ["--workers", "1"] diff --git a/duui-Climate/src/main/docker/Dockerfile-cuda b/duui-Climate/src/main/docker/Dockerfile-cuda new file mode 100644 index 00000000..8d902811 --- /dev/null +++ b/duui-Climate/src/main/docker/Dockerfile-cuda @@ -0,0 +1,74 @@ +FROM nvidia/cuda:11.8.0-base-ubuntu22.04 + +RUN apt update && \ + DEBIAN_FRONTEND=noninteractive \ + apt install --no-install-recommends -y build-essential software-properties-common && \ + add-apt-repository -y ppa:deadsnakes/ppa && \ + apt install --no-install-recommends -y python3.10 python3-pip python3-setuptools python3-distutils && \ + apt clean && rm -rf /var/lib/apt/lists/* + +RUN ln -s /usr/bin/python3 /usr/bin/python +RUN python -m pip install --upgrade pip + +WORKDIR /usr/src/app + +EXPOSE 9714 + +# dependencies +RUN pip install setuptools 
wheel +COPY ./requirements.txt ./requirements.txt +RUN apt remove -y python3-blinker || true +RUN pip install -r requirements.txt + + + +# dependencies +COPY ./requirements.txt ./requirements.txt +RUN pip install -r requirements.txt + +# copy scripts (must match the module started by the ENTRYPOINT: duui_climate:app) +COPY ./src/main/python/TypeSystemTopic.xml ./TypeSystemTopic.xml +COPY ./src/main/python/duui_climate.py ./duui_climate.py +COPY ./src/main/python/duui_climate.lua ./duui_climate.lua +COPY ./src/main/python/Climate.py ./Climate.py + +#RUN python -c "from transformers import AutoModelForSequenceClassification, AutoTokenizer; AutoTokenizer.from_pretrained('climatebert/distilroberta-base-climate-detector'); AutoModelForSequenceClassification.from_pretrained('climatebert/distilroberta-base-climate-detector')" +#RUN python -c "from transformers import AutoModelForSequenceClassification, AutoTokenizer; AutoTokenizer.from_pretrained('climatebert/distilroberta-base-climate-tcfd'); AutoModelForSequenceClassification.from_pretrained('climatebert/distilroberta-base-climate-tcfd')" +#RUN python -c "from transformers import AutoModelForSequenceClassification, AutoTokenizer; AutoTokenizer.from_pretrained('climatebert/distilroberta-base-climate-commitment'); AutoModelForSequenceClassification.from_pretrained('climatebert/distilroberta-base-climate-commitment')" +#RUN python -c "from transformers import AutoModelForSequenceClassification, AutoTokenizer; AutoTokenizer.from_pretrained('climatebert/distilroberta-base-climate-sentiment'); AutoModelForSequenceClassification.from_pretrained('climatebert/distilroberta-base-climate-sentiment')" +RUN python -c "from transformers import AutoModelForSequenceClassification, AutoTokenizer; AutoTokenizer.from_pretrained('climatebert/distilroberta-base-climate-specificity'); AutoModelForSequenceClassification.from_pretrained('climatebert/distilroberta-base-climate-specificity')" + +# log level +ARG LOG_LEVEL="DEBUG" +ENV LOG_LEVEL=$LOG_LEVEL + +# config +ARG MODEL_CACHE_SIZE=3 +ENV
MODEL_CACHE_SIZE=$MODEL_CACHE_SIZE + +# meta data +ARG ANNOTATOR_NAME="duui-climate" +ENV ANNOTATOR_NAME=$ANNOTATOR_NAME +ARG ANNOTATOR_VERSION="unset" +ENV ANNOTATOR_VERSION=$ANNOTATOR_VERSION + +# Model Info +ARG MODEL_VERSION=0.1 +ENV MODEL_VERSION=$MODEL_VERSION +ARG MODEL_NAME="" +ENV MODEL_NAME=$MODEL_NAME +ARG MODEL_SOURCE="" +ENV MODEL_SOURCE=$MODEL_SOURCE +ARG MODEL_LANG="" +ENV MODEL_LANG=$MODEL_LANG + +# offline mode for huggingface +ARG TEXTIMAGER_DUUI_TRANSFORMERS_SENTIMENT_TRANSFORMERS_OFFLINE=1 +ENV TRANSFORMERS_OFFLINE=$TEXTIMAGER_DUUI_TRANSFORMERS_SENTIMENT_TRANSFORMERS_OFFLINE + + + + +ENTRYPOINT ["uvicorn", "duui_climate:app", "--host", "0.0.0.0", "--port" ,"9714"] +CMD ["--workers", "1"] + diff --git a/duui-Climate/src/main/python/Climate.py b/duui-Climate/src/main/python/Climate.py new file mode 100644 index 00000000..b9ed6344 --- /dev/null +++ b/duui-Climate/src/main/python/Climate.py @@ -0,0 +1,53 @@ +import torch +import math +from transformers import AutoModelForSequenceClassification, AutoTokenizer +from scipy.special import softmax +import numpy as np +from typing import List + +model_name_map = { + "climatebert/distilroberta-base-climate-detector": "ClimateDetector", + "climatebert/distilroberta-base-climate-tcfd": "ClimateTCFD", + "climatebert/distilroberta-base-climate-commitment": "ClimateCommitment", + "climatebert/distilroberta-base-climate-sentiment": "ClimateSentiment", + "climatebert/distilroberta-base-climate-specificity": "ClimateSpecificity", +} + +def sigmoid(x): + return 1 / (1 + math.exp(-x)) + +class ClimateBert: + def __init__(self, model_name: str, device='cuda:0'): + self.device = device + self.tokenizer = AutoTokenizer.from_pretrained(model_name) + self.model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device) + self.class_mapping = self.model.config.id2label + self.labels = list(self.class_mapping.values()) + + def prediction(self, texts: List[str]): + with torch.no_grad(): + inputs = 
self.tokenizer( + texts, + return_tensors="pt", + padding=True, + truncation=True, + max_length=512 + ).to(self.device) + + outputs = self.model(**inputs) + logits = outputs[0].float() # convert bfloat16 -> float32 + probs = torch.softmax(logits, dim=-1) + + score_list = [] + + for prob in probs.cpu(): + ranking = torch.argsort(prob, descending=True) + + score_dict_i = { + self.labels[i]: float(prob[i]) + for i in ranking + } + + score_list.append(score_dict_i) + return score_list + diff --git a/duui-Climate/src/main/python/TypeSystemTopic.xml b/duui-Climate/src/main/python/TypeSystemTopic.xml new file mode 100644 index 00000000..dc052a36 --- /dev/null +++ b/duui-Climate/src/main/python/TypeSystemTopic.xml @@ -0,0 +1,132 @@ + + + + + org.texttechnologylab.annotation.AnnotatorMetaData + + uima.cas.AnnotationBase + + + reference + + uima.cas.TOP + + + name + + uima.cas.String + + + version + + uima.cas.String + + + modelName + + uima.cas.String + + + modelVersion + + uima.cas.String + + + + + uima.tcas.DocumentAnnotation + + uima.tcas.Annotation + + + language + + uima.cas.String + + + + + org.texttechnologylab.annotation.DocumentModification + + uima.cas.AnnotationBase + + + user + + uima.cas.String + + + timestamp + + uima.cas.Long + + + comment + + uima.cas.String + + + + + org.hucompute.textimager.uima.type.Sentiment + + uima.tcas.Annotation + + + sentiment + + uima.cas.Double + + + subjectivity + + uima.cas.Double + + + + + org.hucompute.textimager.uima.type.CategorizedSentiment + + org.hucompute.textimager.uima.type.Sentiment + + + pos + + uima.cas.Double + + + neu + + uima.cas.Double + + + neg + + uima.cas.Double + + + + + org.texttechnologylab.annotation.AnnotationComment + + uima.cas.AnnotationBase + + + reference + + uima.cas.TOP + + + value + + uima.cas.String + + + key + + uima.cas.String + + + + + diff --git a/duui-Climate/src/main/python/duui_climate.lua b/duui-Climate/src/main/python/duui_climate.lua new file mode 100644 index 00000000..fcd1740f --- 
/dev/null +++ b/duui-Climate/src/main/python/duui_climate.lua @@ -0,0 +1,133 @@ +StandardCharsets = luajava.bindClass("java.nio.charset.StandardCharsets") +Class = luajava.bindClass("java.lang.Class") +JCasUtil = luajava.bindClass("org.apache.uima.fit.util.JCasUtil") +TopicUtils = luajava.bindClass("org.texttechnologylab.DockerUnifiedUIMAInterface.lua.DUUILuaUtils") + +function serialize(inputCas, outputStream, parameters) + local doc_lang = inputCas:getDocumentLanguage() + local doc_text = inputCas:getDocumentText() + local doc_len = TopicUtils:getDocumentTextLength(inputCas) + + local selection_types = parameters["selection"] + + local selections = {} + local selections_count = 1 + for selection_type in string.gmatch(selection_types, "([^,]+)") do + local sentences = {} + if selection_type == "text" then + local s = { + text = doc_text, + begin = 0, + ['end'] = doc_len + } + sentences[1] = s + else + local sentences_count = 1 + local clazz = Class:forName(selection_type); + local sentences_it = JCasUtil:select(inputCas, clazz):iterator() + while sentences_it:hasNext() do + local sentence = sentences_it:next() + local s = { + text = sentence:getCoveredText(), + begin = sentence:getBegin(), + ['end'] = sentence:getEnd() + } + sentences[sentences_count] = s + sentences_count = sentences_count + 1 + end + end + + local selection = { + sentences = sentences, + selection = selection_type + } + selections[selections_count] = selection + selections_count = selections_count + 1 + end + + outputStream:write(json.encode({ + selections = selections, + lang = doc_lang, + doc_len = doc_len + })) +end + +function deserialize(inputCas, inputStream) + local inputString = luajava.newInstance("java.lang.String", inputStream:readAllBytes(), StandardCharsets.UTF_8) + local results = json.decode(inputString) + if results["modification_meta"] ~= nil and results["meta"] ~= nil and results["results"] ~= nil then + -- print("GetInfo") + local source = results["model_source"] + local 
model_version = results["model_version"] + local model_name = results["model_name"] + local model_lang = results["model_lang"] + -- print("meta") + local modification_meta = results["modification_meta"] + local modification_anno = luajava.newInstance("org.texttechnologylab.annotation.DocumentModification", inputCas) + modification_anno:setUser(modification_meta["user"]) + modification_anno:setTimestamp(modification_meta["timestamp"]) + modification_anno:setComment(modification_meta["comment"]) + modification_anno:addToIndexes() + + -- print("setMetaData") + local model_meta = luajava.newInstance("org.texttechnologylab.annotation.model.MetaData", inputCas) + model_meta:setModelVersion(model_version) + -- print(model_version) + model_meta:setModelName(model_name) + -- print(model_name) + model_meta:setSource(source) + -- print(source) + model_meta:setLang(model_lang) + -- print(model_lang) + model_meta:addToIndexes() + + local meta = results["meta"] + -- print("meta") + local begin_climate = results["begin"] + -- print("begin_emo") + local end_climate = results["end"] + -- print("end_emo") + local res_out = results["results"] +-- print("results") + local res_len = results["len_results"] + -- print("Len_results") + local factors = results["factors"] + local maptype = results["model_type"] +-- print(factors) + for index_i, res in ipairs(res_out) do + -- print(res) + local begin_climate_i = begin_climate[index_i] + -- print(begin_climate_i) + local end_climate_i = end_climate[index_i] + -- print(end_climate_i) + local len_i = res_len[index_i] + -- print(len_i) + -- print(type(len_i)) + local climate_i = luajava.newInstance("org.texttechnologylab.annotation.Climate", inputCas, begin_climate_i, end_climate_i) + -- print(climate_i) + local fsarray = luajava.newInstance("org.apache.uima.jcas.cas.FSArray", inputCas, len_i) + -- print(fsarray) + climate_i:setClimates(fsarray) + local counter = 0 + local factor_i = factors[index_i] + -- print(factor_i) + for index_j, climate_j 
in ipairs(res) do + -- print(climate_j) + local factor_j = factor_i[index_j] + -- print(factor_j) + -- declared local so the per-label comment does not leak into the global Lua environment + local climate_in_i = luajava.newInstance("org.texttechnologylab.annotation.AnnotationComment", inputCas) + climate_in_i:setReference(climate_i) + climate_in_i:setKey(climate_j) + climate_in_i:setValue(factor_j) + climate_in_i:addToIndexes() + climate_i:setClimates(counter, climate_in_i) + counter = counter + 1 + end + climate_i:setModel(model_meta) + climate_i:setClimateType(maptype) + climate_i:addToIndexes() + -- print("add") + end + end + -- print("end") + end diff --git a/duui-Climate/src/main/python/duui_climate.py b/duui-Climate/src/main/python/duui_climate.py new file mode 100644 index 00000000..1fd9390c --- /dev/null +++ b/duui-Climate/src/main/python/duui_climate.py @@ -0,0 +1,286 @@ +from pydantic import BaseModel +from pydantic_settings import BaseSettings +from typing import List, Optional, Dict, Union +import logging +from time import time +from fastapi import FastAPI, Response +from cassis import load_typesystem +import torch +from threading import Lock +from functools import lru_cache +from Climate import ClimateBert,model_name_map +# from sp_correction import SentenceBestPrediction + +# Settings +# These are automatically loaded from env variables +from starlette.responses import PlainTextResponse + +model_lock = Lock() + + +class UimaSentence(BaseModel): + text: str + begin: int + end: int + + +class UimaSentenceSelection(BaseModel): + selection: str + sentences: List[UimaSentence] + + +class Settings(BaseSettings): + # Name of this annotator + annotator_name: str + # Version of this annotator + annotator_version: str + # Log level + log_level: str + # Name of the model to load + model_name: str + # Version (revision) of the model + model_version: str + # Maximum number of models kept in the LRU cache + model_cache_size: int + # url of the model + model_source: str + # language of the model + model_lang: str + + +# Load settings from env vars +settings = Settings() +lru_cache_with_size =
lru_cache(maxsize=settings.model_cache_size) +logging.basicConfig(level=settings.log_level) +logger = logging.getLogger(__name__) + +device = "cuda:0" if torch.cuda.is_available() else "cpu" +# device = "cpu" +logger.info(f'USING {device}') +# Load the predefined typesystem that is needed for this annotator to work +typesystem_filename = 'TypeSystemTopic.xml' +logger.debug("Loading typesystem from \"%s\"", typesystem_filename) +with open(typesystem_filename, 'rb') as f: + typesystem = load_typesystem(f) + logger.debug("Base typesystem:") + logger.debug(typesystem.to_xml()) + +# Load the Lua communication script +lua_communication_script_filename = "duui_climate.lua" +logger.debug("Loading Lua communication script from \"%s\"", lua_communication_script_filename) + + +# Request sent by DUUI +# Note, this is transformed by the Lua script +class DUUIRequest(BaseModel): + # The texts language + doc_len: int + # + lang: str + # + selections: List[UimaSentenceSelection] + # + + +# UIMA type: mark modification of the document +class DocumentModification(BaseModel): + user: str + timestamp: int + comment: str + + +# UIMA type: adds metadata to each annotation +class AnnotationMeta(BaseModel): + name: str + version: str + modelName: str + modelVersion: str + + +# Response sent by DUUI +# Note, this is transformed by the Lua script +class DUUIResponse(BaseModel): + # Symspelloutput + # List of Sentence with every token + # Every token is a dictionary with following Infos: + # Symspelloutput right if the token is correct, wrong if the token is incorrect, skipped if the token was skipped, unkownn if token can corrected with Symspell + # If token is unkown it will be predicted with BERT Three output pos: + # 1. Best Prediction with BERT MASKED + # 2. Best Cos-sim with Sentence-Bert and with perdicted words of BERT MASK + # 3. 
Option 1 and 2 together + meta: AnnotationMeta + # Modification meta, one per document + modification_meta: DocumentModification + begin: List[int] + end: List[int] + results: List + factors: List + len_results: List[int] + model_name: str + model_version: str + model_source: str + model_lang: str + model_type: str + + +app = FastAPI( + openapi_url="/openapi.json", + docs_url="/api", + redoc_url=None, + title=settings.annotator_name, + description="Climate annotator", + version=settings.annotator_version, + terms_of_service="https://www.texttechnologylab.org/legal_notice/", + contact={ + "name": "TTLab Team", + "url": "https://texttechnologylab.org", + "email": "bagci@em.uni-frankfurt.de", + }, + license_info={ + "name": "AGPL", + "url": "http://www.gnu.org/licenses/agpl-3.0.en.html", + }, +) + +with open(lua_communication_script_filename, 'rb') as f: + lua_communication_script = f.read().decode("utf-8") +logger.debug("Lua communication script:") +# log the loaded script itself, not just its filename +logger.debug(lua_communication_script) + + +# Get typesystem of this annotator +@app.get("/v1/typesystem") +def get_typesystem() -> Response: + # TODO remove cassis dependency, as only needed for typesystem at the moment?
+ xml = typesystem.to_xml() + xml_content = xml.encode("utf-8") + + return Response( + content=xml_content, + media_type="application/xml" + ) + + +# Return Lua communication script +@app.get("/v1/communication_layer", response_class=PlainTextResponse) +def get_communication_layer() -> str: + return lua_communication_script + + +# Return documentation info +@app.get("/v1/documentation") +def get_documentation(): + return "Test" + + +@lru_cache_with_size +def load_model(model_name): + model_i = ClimateBert(model_name, device) + return model_i + + +def fix_unicode_problems(text): + # fix emoji in python string and prevent json error on response + # File "/usr/local/lib/python3.8/site-packages/starlette/responses.py", line 190, in render + # UnicodeEncodeError: 'utf-8' codec can't encode characters in position xx-yy: surrogates not allowed + clean_text = text.encode('utf-16', 'surrogatepass').decode('utf-16', 'surrogateescape') + return clean_text + + +def process_selection(model_name, selection): + begin = [] + end = [] + results_out = [] + factors = [] + len_results = [] + for s in selection.sentences: + s.text = fix_unicode_problems(s.text) + + texts = [ + s.text + for s in selection.sentences + ] + logger.debug("Preprocessed texts:") + logger.debug(texts) + model_map = "others" + + with model_lock: + if model_name in model_name_map: + model_map = model_name_map[model_name] + classifier = load_model(model_name) + + results = classifier.prediction(texts) + for c, res in enumerate(results): + res_i = [] + factor_i = [] + sentence_i = selection.sentences[c] + begin_i = sentence_i.begin + end_i = sentence_i.end + len_rel = len(res) + begin.append(begin_i) + end.append(end_i) + for i in res: + res_i.append(i) + factor_i.append(res[i]) + len_results.append(len_rel) + results_out.append(res_i) + factors.append(factor_i) + output = { + "begin": begin, + "end": end, + "len_results": len_results, + "results": results_out, + "factors": factors, + "model_type": model_map + } + 
+ return output + + +# Process request from DUUI +@app.post("/v1/process") +def post_process(request: DUUIRequest): + # Runs the configured model on every requested selection and returns the collected scores + # Return data + begin = [] + end = [] + len_results = [] + results = [] + factors = [] + # Save modification start time for later + modification_timestamp_seconds = int(time()) + # Metadata is built before the try block so the response below never references unbound names + model_source = settings.model_source + model_lang = settings.model_lang + model_version = settings.model_version + # Short type label of the model, "others" if the model is not in model_name_map + model_type = model_name_map.get(settings.model_name, "others") + # set meta Informations + meta = AnnotationMeta( + name=settings.annotator_name, + version=settings.annotator_version, + modelName=settings.model_name, + modelVersion=model_version, + ) + # Add modification info + modification_meta_comment = f"{settings.annotator_name} ({settings.annotator_version})" + modification_meta = DocumentModification( + user=settings.annotator_name, + timestamp=modification_timestamp_seconds, + comment=modification_meta_comment + ) + try: + for selection in request.selections: + processed_sentences = process_selection(settings.model_name, selection) + begin = begin + processed_sentences["begin"] + end = end + processed_sentences["end"] + len_results = len_results + processed_sentences["len_results"] + results = results + processed_sentences["results"] + factors = factors + processed_sentences["factors"] + except Exception as ex: + # Best effort: log the failure and return whatever was collected so far + logger.exception(ex) + return DUUIResponse(meta=meta, modification_meta=modification_meta, begin=begin, end=end, results=results, + len_results=len_results, factors=factors, model_name=settings.model_name, + model_version=model_version, model_source=model_source, model_lang=model_lang, model_type=model_type) diff --git a/duui-Genre/src/test/java/org/hucompute/textimager/uima/genre/GenreTest.java b/duui-Climate/src/test/java/org/hucompute/textimager/uima/climate/ClimateTest.java similarity index 82% rename from
duui-Genre/src/test/java/org/hucompute/textimager/uima/genre/GenreTest.java rename to duui-Climate/src/test/java/org/hucompute/textimager/uima/climate/ClimateTest.java index bc90db90..23db65e7 100644 --- a/duui-Genre/src/test/java/org/hucompute/textimager/uima/genre/GenreTest.java +++ b/duui-Climate/src/test/java/org/hucompute/textimager/uima/climate/ClimateTest.java @@ -1,4 +1,4 @@ -package org.hucompute.textimager.uima.genre; +package org.hucompute.textimager.uima.climate; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import org.apache.commons.compress.compressors.CompressorException; @@ -24,10 +24,10 @@ import static org.junit.jupiter.api.Assertions.assertArrayEquals; import static org.junit.jupiter.api.Assertions.assertEquals; -import org.texttechnologylab.annotation.Genre; +import org.texttechnologylab.annotation.Climate; import org.texttechnologylab.annotation.AnnotationComment; -public class GenreTest { +public class ClimateTest { static DUUIComposer composer; static JCas cas; @@ -99,17 +99,17 @@ public void DeTest() throws Exception { createCas("de", sentences); composer.run(cas); - Collection all_topics = JCasUtil.select(cas, Genre.class); + Collection all_climates = JCasUtil.select(cas, Climate.class); ArrayList> expected = new ArrayList>(); - for (Genre topic: all_topics){ - System.out.println(topic.getCoveredText()); - Map topics = new HashMap(); - FSArray topics_all = topic.getGenres(); - for (AnnotationComment comment_i: topics_all){ - topics.put(comment_i.getKey(), Float.parseFloat(comment_i.getValue())); + for (Climate climate: all_climates){ + System.out.println(climate.getCoveredText()); + Map climates = new HashMap(); + FSArray climates_all = climate.getClimates(); + for (AnnotationComment comment_i: climates_all){ + climates.put(comment_i.getKey(), Float.parseFloat(comment_i.getValue())); System.out.println("key:"+comment_i.getKey()+"; Value:"+comment_i.getValue()); } - expected.add(topics); + expected.add(climates); } for 
(Map topic: expected){ @@ -134,18 +134,21 @@ public void EnTest() throws Exception { createCas("de", sentences); composer.run(cas); - Collection all_topics = JCasUtil.select(cas, Genre.class); + Collection all_climates = JCasUtil.select(cas, Climate.class); ArrayList> expected = new ArrayList>(); - for (Genre genre: all_topics){ - System.out.println(genre.getCoveredText()); - Map topics = new HashMap(); - String model_name = genre.getModel().getModelName(); - FSArray topics_all = genre.getGenres(); - for (AnnotationComment comment_i: topics_all){ - topics.put(comment_i.getKey(), Float.parseFloat(comment_i.getValue())); + for (Climate climate: all_climates){ + System.out.println(climate.getCoveredText()); + Map climates = new HashMap(); + String model_name = climate.getModel().getModelName(); + String type_name = climate.getClimateType(); + System.out.println(model_name); + System.out.println(type_name); + FSArray climates_all = climate.getClimates(); + for (AnnotationComment comment_i: climates_all){ + climates.put(comment_i.getKey(), Float.parseFloat(comment_i.getValue())); System.out.println("key:"+comment_i.getKey()+"; Value:"+comment_i.getValue()); } - expected.add(topics); + expected.add(climates); } // HashMap> expected = new HashMap<>(); From 386da2b744f4d523467df55d330a9d2548782740 Mon Sep 17 00:00:00 2001 From: bagci Date: Fri, 29 May 2026 18:04:42 +0200 Subject: [PATCH 05/19] Add Coreference Tool --- duui-Coreference/.dockerignore | 3 + duui-Coreference/.gitignore | 4 + duui-Coreference/Readme.md | 1 + duui-Coreference/docker_build.sh | 45 ++ duui-Coreference/pom.xml | 155 +++++ duui-Coreference/reqiurements.txt | 9 + duui-Coreference/src/.dockerignore | 3 + duui-Coreference/src/main/docker/Dockerfile | 61 ++ .../src/main/docker/Dockerfile-cuda | 57 ++ .../src/main/python/Coreferee_resolver.py | 550 +++++++++++++++++ .../src/main/python/TypeSystemCoreference.xml | 568 ++++++++++++++++++ .../src/main/python/duui_coreference.lua | 144 +++++ 
.../src/main/python/duui_coreference.py | 242 ++++++++ .../uima/Coreference/CoreferenceTest.java | 242 ++++++++ 14 files changed, 2084 insertions(+) create mode 100644 duui-Coreference/.dockerignore create mode 100644 duui-Coreference/.gitignore create mode 100644 duui-Coreference/Readme.md create mode 100644 duui-Coreference/docker_build.sh create mode 100644 duui-Coreference/pom.xml create mode 100644 duui-Coreference/reqiurements.txt create mode 100644 duui-Coreference/src/.dockerignore create mode 100644 duui-Coreference/src/main/docker/Dockerfile create mode 100644 duui-Coreference/src/main/docker/Dockerfile-cuda create mode 100644 duui-Coreference/src/main/python/Coreferee_resolver.py create mode 100644 duui-Coreference/src/main/python/TypeSystemCoreference.xml create mode 100644 duui-Coreference/src/main/python/duui_coreference.lua create mode 100644 duui-Coreference/src/main/python/duui_coreference.py create mode 100644 duui-Coreference/src/test/java/org/hucompute/textimager/uima/Coreference/CoreferenceTest.java diff --git a/duui-Coreference/.dockerignore b/duui-Coreference/.dockerignore new file mode 100644 index 00000000..caab0c36 --- /dev/null +++ b/duui-Coreference/.dockerignore @@ -0,0 +1,3 @@ +.idea/ +target/ +venv/ \ No newline at end of file diff --git a/duui-Coreference/.gitignore b/duui-Coreference/.gitignore new file mode 100644 index 00000000..98adfc38 --- /dev/null +++ b/duui-Coreference/.gitignore @@ -0,0 +1,4 @@ +.idea +target +venv +models \ No newline at end of file diff --git a/duui-Coreference/Readme.md b/duui-Coreference/Readme.md new file mode 100644 index 00000000..90a1d60a --- /dev/null +++ b/duui-Coreference/Readme.md @@ -0,0 +1 @@ +... 
\ No newline at end of file diff --git a/duui-Coreference/docker_build.sh b/duui-Coreference/docker_build.sh new file mode 100644 index 00000000..e1c1cec8 --- /dev/null +++ b/duui-Coreference/docker_build.sh @@ -0,0 +1,45 @@ +export ANNOTATOR_NAME=duui-factchecking +export ANNOTATOR_VERSION=0.1.0 +export LOG_LEVEL=INFO +export MODEL_CACHE_SIZE=3 + +##--------------------------------------------------------------------- +export MODEL_NAME="coreferee" +export MODEL_SPECNAME="coreferee" +export MODEL_VERSION="3ee6f2781e54988d6c3593c6b8f37cc3bae8f982" +export MODEL_SOURCE="https://github.com/richardpaulhudson/coreferee" +export MODEL_LANG="EN,DE,FR,PL" +export MODEL_VARIANT="SM" +##-------------------------------------------------------------------- + +###--------------------------------------------------------------------- +#export MODEL_NAME="coreferee" +#export MODEL_SPECNAME="coreferee" +#export MODEL_VERSION="3ee6f2781e54988d6c3593c6b8f37cc3bae8f982" +#export MODEL_SOURCE="https://github.com/richardpaulhudson/coreferee" +#export MODEL_LANG="EN,DE,FR,PL" +#export MODEL_VARIANT="LG" +###-------------------------------------------------------------------- + + +export DOCKER_REGISTRY="docker.texttechnologylab.org/" +export DUUI_CUDA= +#export DUUI_CUDA="-cuda" + +docker build \ + --build-arg ANNOTATOR_NAME \ + --build-arg ANNOTATOR_VERSION \ + --build-arg LOG_LEVEL \ + --build-arg MODEL_CACHE_SIZE \ + --build-arg MODEL_NAME \ + --build-arg MODEL_VERSION \ + --build-arg MODEL_SOURCE \ + --build-arg MODEL_LANG \ + --build-arg MODEL_VARIANT \ + -t ${DOCKER_REGISTRY}${ANNOTATOR_NAME}"-"${MODEL_SPECNAME}"-"${MODEL_VARIANT}:${ANNOTATOR_VERSION}${DUUI_CUDA} \ + -f src/main/docker/Dockerfile${DUUI_CUDA} \ + . 
+ +docker tag \ + ${DOCKER_REGISTRY}${ANNOTATOR_NAME}"-"${MODEL_SPECNAME}"-"${MODEL_VARIANT}:${ANNOTATOR_VERSION}${DUUI_CUDA} \ + ${DOCKER_REGISTRY}${ANNOTATOR_NAME}"-"${MODEL_SPECNAME}"-"${MODEL_VARIANT}:latest${DUUI_CUDA} \ No newline at end of file diff --git a/duui-Coreference/pom.xml b/duui-Coreference/pom.xml new file mode 100644 index 00000000..837436f9 --- /dev/null +++ b/duui-Coreference/pom.xml @@ -0,0 +1,155 @@ + + + 4.0.0 + + org.texttechnologylab.duui + duui-Coreference + 0.1.0 + + + + AGPL-3.0-or-later + https://www.gnu.org/licenses/agpl.txt + repo + GNU Affero General Public License v3.0 or later + + + + + Texttechnology Lab + https://www.texttechnologylab.org + + + + mehler + Prof. Dr. Alexander Mehler + mehler@em.uni-frankfurt.de + https://www.texttechnologylab.org/team/alexander-abrami/ + Goethe University Frankfurt / Texttechnology Lab + https://www.texttechnologylab.org + + head of department + + + + bagci + Mevlüt Bagci + bagci@em.uni-frankfurt.de + https://www.texttechnologylab.org/team/mevl%c3%bct-bagci/ + Goethe University Frankfurt / Texttechnology Lab + https://www.texttechnologylab.org + + lead developer + + Europe/Berlin + + + + + + + org.apache.maven.plugins + maven-surefire-plugin + 2.22.0 + + + --illegal-access=permit + --add-opens java.base/java.util=ALL-UNNAMED + + + + + + + + + + 17 + 17 + 2.4.0 + + 2789ba29fa1f236b64b0402315ffe1cf5d81b654 + + + + + jitpack.io + https://jitpack.io + + + + + + + org.dkpro.core + dkpro-core-asl + ${dkpro.core.version} + pom + import + + + + + + + + com.github.texttechnologylab + DockerUnifiedUIMAInterface + 7cef2433b5 + + + + + + + + + com.github.texttechnologylab + UIMATypeSystem + 3.0.14 + + + + + + + + + + + + + + + + org.junit.jupiter + junit-jupiter + 5.9.0 + test + + + + org.dkpro.core + dkpro-core-api-segmentation-asl + test + + + + org.dkpro.core + dkpro-core-io-xmi-asl + test + + + + org.dkpro.core + dkpro-core-api-resources-asl + test + + + \ No newline at end of file diff --git 
a/duui-Coreference/reqiurements.txt b/duui-Coreference/reqiurements.txt new file mode 100644 index 00000000..f61317a7 --- /dev/null +++ b/duui-Coreference/reqiurements.txt @@ -0,0 +1,9 @@ +spacy==3.5.0 +coreferee==1.4.1 +numpy==1.26.4 +setuptools<70 +pydantic>=1.7.4,<1.11.0 +regex==2023.12.25 +fastapi==0.110.0 +uvicorn[standard]==0.27.1 +dkpro-cassis==0.9.1 \ No newline at end of file diff --git a/duui-Coreference/src/.dockerignore b/duui-Coreference/src/.dockerignore new file mode 100644 index 00000000..caab0c36 --- /dev/null +++ b/duui-Coreference/src/.dockerignore @@ -0,0 +1,3 @@ +.idea/ +target/ +venv/ \ No newline at end of file diff --git a/duui-Coreference/src/main/docker/Dockerfile b/duui-Coreference/src/main/docker/Dockerfile new file mode 100644 index 00000000..68ed716c --- /dev/null +++ b/duui-Coreference/src/main/docker/Dockerfile @@ -0,0 +1,61 @@ +FROM python:3.10 + +WORKDIR /usr/src/app + +EXPOSE 9714 + +# dependencies +COPY ./reqiurements.txt ./requirements.txt +RUN pip install -r requirements.txt + +RUN python3 -m coreferee install en +RUN python3 -m coreferee install de +RUN python3 -m coreferee install fr +RUN python3 -m coreferee install pl +RUN python -m spacy download en_core_web_sm +RUN python -m spacy download de_core_news_sm +RUN python -m spacy download fr_core_news_sm +RUN python -m spacy download pl_core_news_sm + +# copy scripts +COPY ./src/main/python/TypeSystemCoreference.xml ./TypeSystemCoreference.xml +COPY ./src/main/python/duui_coreference.py ./duui_coreference.py +COPY ./src/main/python/duui_coreference.lua ./duui_coreference.lua +COPY ./src/main/python/Coreferee_resolver.py ./Coreferee_resolver.py + +# log level +ARG LOG_LEVEL="DEBUG" +ENV LOG_LEVEL=$LOG_LEVEL + +# config +ARG MODEL_CACHE_SIZE=3 +ENV MODEL_CACHE_SIZE=$MODEL_CACHE_SIZE + +# meta data +ARG ANNOTATOR_NAME="duui-transformers-topic" +ENV ANNOTATOR_NAME=$ANNOTATOR_NAME +ARG ANNOTATOR_VERSION="unset" +ENV ANNOTATOR_VERSION=$ANNOTATOR_VERSION + +# Model Info +ARG MODEL_VERSION=0.1 
+ENV MODEL_VERSION=$MODEL_VERSION +ARG MODEL_NAME="" +ENV MODEL_NAME=$MODEL_NAME +ARG MODEL_SOURCE="" +ENV MODEL_SOURCE=$MODEL_SOURCE +ARG MODEL_LANG="" +ENV MODEL_LANG=$MODEL_LANG +ARG MODEL_VARIANT="" +ENV MODEL_VARIANT=$MODEL_VARIANT + + +# offline mode for huggingface +ARG TEXTIMAGER_DUUI_TRANSFORMERS_SENTIMENT_TRANSFORMERS_OFFLINE=1 +ENV TRANSFORMERS_OFFLINE=$TEXTIMAGER_DUUI_TRANSFORMERS_SENTIMENT_TRANSFORMERS_OFFLINE + + + + +ENTRYPOINT ["uvicorn", "duui_coreference:app", "--host", "0.0.0.0", "--port" ,"9714"] +CMD ["--workers", "1"] diff --git a/duui-Coreference/src/main/docker/Dockerfile-cuda b/duui-Coreference/src/main/docker/Dockerfile-cuda new file mode 100644 index 00000000..2ad8b60b --- /dev/null +++ b/duui-Coreference/src/main/docker/Dockerfile-cuda @@ -0,0 +1,57 @@ +FROM nvidia/cuda:11.0.3-base-ubuntu20.04 + +RUN apt update && \ + DEBIAN_FRONTEND=noninteractive \ + apt install --no-install-recommends -y build-essential software-properties-common && \ + add-apt-repository -y ppa:deadsnakes/ppa && \ + apt install --no-install-recommends -y python3.8 python3-pip python3-setuptools python3-distutils && \ + apt clean && rm -rf /var/lib/apt/lists/* + +RUN ln -s /usr/bin/python3 /usr/bin/python +RUN python -m pip install --upgrade pip + +WORKDIR /usr/src/app + +EXPOSE 9714 + + +# meta data +ARG FACT_ANNOTATOR_NAME="duui-Factchecking:app" +ENV FACT_ANNOTATOR_NAME=$FACT_ANNOTATOR_NAME +ARG FACT_ANNOTATOR_VERSION="unset" +ENV FACT_ANNOTATOR_VERSION=$FACT_ANNOTATOR_VERSION + +# log level +ARG FACT_LOG_LEVEL="DEBUG" +ENV FACT_LOG_LEVEL=$FACT_LOG_LEVEL + +# config +ARG FACT_MODEL_CACHE_SIZE=3 +ENV FACT_MODEL_CACHE_SIZE=$FACT_MODEL_CACHE_SIZE + +# Model Info +ARG FACT_MODEL_NAME="" +ENV FACT_MODEL_NAME=$FACT_MODEL_NAME +ARG FACT_MODEL_VERSION=0.1 +ENV FACT_MODEL_VERSION=$FACT_MODEL_VERSION + +# service script +COPY ./src/main/python/TypeSystemFactChecking.xml ./TypeSystemFactChecking.xml +COPY ./src/main/python/scorer.py ./scorer.py +COPY 
./src/main/python/evaluator.py ./evaluator.py +COPY ./src/main/python/utils.py ./utils.py +COPY ./src/main/python/factchecker.py ./factchecker.py +COPY ./src/main/python/duui_fact.lua ./duui_fact.lua +COPY ./src/main/python/duui_fact.py ./duui_fact.py +COPY ./reqiurements.txt ./reqiurements.txt + +RUN pip install torch==2.2.0 torchvision==0.17.0 torchaudio==2.2.0 --index-url https://download.pytorch.org/whl/cu118 +RUN pip install -r reqiurements.txt +RUN python -m nltk.downloader punkt +RUN python -c "from evaluator import get_evaluator; get_evaluator('fact', device='cpu')" +#RUN python -c "from nubia_score import Nubia; nubia = Nubia()" + + +ENTRYPOINT ["uvicorn", "duui_fact:app", "--host", "0.0.0.0", "--port" ,"9714"] +CMD ["--workers", "1"] + diff --git a/duui-Coreference/src/main/python/Coreferee_resolver.py b/duui-Coreference/src/main/python/Coreferee_resolver.py new file mode 100644 index 00000000..def8254e --- /dev/null +++ b/duui-Coreference/src/main/python/Coreferee_resolver.py @@ -0,0 +1,550 @@ +from __future__ import annotations + +from typing import Any, Optional, Union + +import coreferee # noqa: F401 # Registers the spaCy pipeline component "coreferee" +import spacy + +from spacy.tokens import Doc + + +EXTERNAL_OFFSETS_EXTENSION = "external_token_offsets" + +if not Doc.has_extension(EXTERNAL_OFFSETS_EXTENSION): + Doc.set_extension(EXTERNAL_OFFSETS_EXTENSION, default=None) + + +class CorefereeResolver: + """ + Coreferee wrapper with one fixed language per instance. + + Input: + - tokens: list[str] + - begins: list[int] + - ends: list[int] + + Output: + { + "begin": [...], + "end": [...], + "begin_resolve": [...], + "end_resolve": [...], + "token": [...], + "token_resolve": [...], + } + + The language is set once during initialization. + Runtime language switching is intentionally not supported. 
+ """ + + DEFAULT_MODELS = { + "sm": { + "en": "en_core_web_sm", + "de": "de_core_news_sm", + "fr": "fr_core_news_sm", + "pl": "pl_core_news_sm", + }, + "lg": { + "en": "en_core_web_lg", + "de": "de_core_news_lg", + "fr": "fr_core_news_lg", + "pl": "pl_core_news_lg", + }, + } + + LANG_ALIASES = { + "en": "en", + "english": "en", + "englisch": "en", + + "de": "de", + "german": "de", + "deutsch": "de", + + "fr": "fr", + "french": "fr", + "französisch": "fr", + "franzoesisch": "fr", + + "pl": "pl", + "polish": "pl", + "polnisch": "pl", + } + + def __init__( + self, + language: str, + variant: str, + model_overrides: Optional[dict[str, str]] = None, + ): + self._language = self._normalize_language(language) + self.variant = variant.strip().lower() + + self.models = {size: dict(names) for size, names in self.DEFAULT_MODELS.items()} + if model_overrides: + self.models.setdefault(self.variant, {}).update(model_overrides) + + self.nlp = self._load_pipeline() + + @property + def language(self) -> str: + return self._language + + def _normalize_language(self, language: str) -> str: + lang = language.strip().lower() + + if lang not in self.LANG_ALIASES: + supported = ", ".join(sorted(set(self.LANG_ALIASES.values()))) + raise ValueError( + f"Unsupported language: {language!r}. 
" + f"Supported languages are: {supported}" + ) + + return self.LANG_ALIASES[lang] + + def _load_pipeline(self): + model_name = self.models[self.variant][self.language] + + try: + nlp = spacy.load(model_name) + except OSError as exc: + raise RuntimeError( + f"spaCy model not found: {model_name!r}\n\n" + f"Install it with:\n" + f"python -m spacy download {model_name}\n\n" + f"Or override the model name, for example:\n" + f"CorefereeResolver('en', model_overrides={{'en': 'en_core_web_sm'}})" + ) from exc + + if "coreferee" not in nlp.pipe_names: + try: + nlp.add_pipe("coreferee") + except Exception as exc: + raise RuntimeError( + f"Could not load Coreferee for language {self.language!r}.\n\n" + f"Install the Coreferee language data with:\n" + f"python -m coreferee install {self.language}" + ) from exc + + return nlp + + def process_text(self, text: str) -> Doc: + """ + Process raw text. + + This is optional. If you already have tokens/begins/ends, + use process_tokens instead. + + For raw text, external offsets are taken from spaCy token offsets. + """ + if not text or not text.strip(): + raise ValueError("text must not be empty.") + + doc = self.nlp(text) + + doc._.external_token_offsets = [ + { + "begin": token.idx, + "end": token.idx + len(token.text), + } + for token in doc + ] + + return doc + + def process_tokens( + self, + tokens: list[str], + begins: list[int], + ends: list[int], + spaces: Optional[list[bool]] = None, + ) -> Doc: + """ + Process pre-tokenized input with external begin/end offsets. + + Args: + tokens: + Separate list of token strings. + + begins: + Separate list of begin offsets. + + ends: + Separate list of end offsets. + + spaces: + Optional whitespace information. + If None, spaces are inferred from begin/end offsets. + + Important: + This method does not call self.nlp(" ".join(tokens)), + because that would let spaCy tokenize the text again. 
+ """ + self._validate_token_offsets(tokens, begins, ends) + + if spaces is None: + spaces = self._infer_spaces_from_offsets(begins, ends) + + if len(tokens) != len(spaces): + raise ValueError( + "tokens and spaces must have the same length. " + f"tokens={len(tokens)}, spaces={len(spaces)}" + ) + + doc = Doc(self.nlp.vocab, words=tokens, spaces=spaces) + + doc._.external_token_offsets = [ + { + "begin": int(begin), + "end": int(end), + } + for begin, end in zip(begins, ends) + ] + + for _, component in self.nlp.pipeline: + doc = component(doc) + + return doc + + def process( + self, + input_data: Union[str, list[str]], + begins: Optional[list[int]] = None, + ends: Optional[list[int]] = None, + spaces: Optional[list[bool]] = None, + ) -> Doc: + """ + Generic input processor. + + Supported: + - str + - list[str] with begins and ends + + If input_data is list[str], begins and ends are required. + """ + if isinstance(input_data, str): + return self.process_text(input_data) + + if isinstance(input_data, list): + if begins is None or ends is None: + raise ValueError( + "begins and ends are required when input_data is a token list." + ) + + return self.process_tokens( + tokens=input_data, + begins=begins, + ends=ends, + spaces=spaces, + ) + + raise TypeError("input_data must be either a string or a list of tokens.") + + def get_coreference_dict( + self, + doc: Doc, + include_self: bool = False, + expand_noun_chunks: bool = True, + ) -> dict[str, list]: + """ + Return all detected coreferences as a dictionary with six lists. + + Output: + { + "begin": [...], + "end": [...], + "begin_resolve": [...], + "end_resolve": [...], + "token": [...], + "token_resolve": [...], + } + + Meaning: + begin[i], end[i], token[i] + The detected mention. + + begin_resolve[i], end_resolve[i], token_resolve[i] + The resolved mention of the same coreference chain. + + No pronoun list is required. 
+ """ + if doc._.external_token_offsets is None: + raise RuntimeError( + "The Doc has no external offsets. " + "Use process_tokens(tokens, begins, ends) or process_text(text)." + ) + + result: dict[str, list] = { + "begin": [], + "end": [], + "begin_resolve": [], + "end_resolve": [], + "token": [], + "token_resolve": [], + } + + seen: set[tuple[int, int, int, int]] = set() + + for chain in doc._.coref_chains: + mentions = self._get_chain_mentions(chain) + + if not mentions: + continue + + representative_index = getattr( + chain, + "most_specific_mention_index", + 0, + ) + + if representative_index is None: + representative_index = 0 + + if representative_index < 0 or representative_index >= len(mentions): + representative_index = 0 + + representative_mention = mentions[representative_index] + + resolved_span = self._mention_to_external_span( + doc=doc, + mention=representative_mention, + expand_noun_chunks=expand_noun_chunks, + ) + + for mention in mentions: + mention_span = self._mention_to_external_span( + doc=doc, + mention=mention, + expand_noun_chunks=expand_noun_chunks, + ) + + same_span = ( + mention_span["begin"] == resolved_span["begin"] + and mention_span["end"] == resolved_span["end"] + ) + + if same_span and not include_self: + continue + + key = ( + mention_span["begin"], + mention_span["end"], + resolved_span["begin"], + resolved_span["end"], + ) + + if key in seen: + continue + + seen.add(key) + + result["begin"].append(mention_span["begin"]) + result["end"].append(mention_span["end"]) + result["begin_resolve"].append(resolved_span["begin"]) + result["end_resolve"].append(resolved_span["end"]) + result["token"].append(mention_span["text"]) + result["token_resolve"].append(resolved_span["text"]) + + return result + + def _get_chain_mentions(self, chain) -> list: + """ + Return mentions from a Coreferee chain. + + Coreferee chains behave like lists, but some versions also expose + a .mentions attribute. This helper supports both variants. 
+ """ + if hasattr(chain, "mentions"): + return list(chain.mentions) + + return list(chain) + + def _mention_to_external_span( + self, + doc: Doc, + mention, + expand_noun_chunks: bool, + ) -> dict[str, Any]: + """ + Convert a Coreferee mention to external begin/end offsets. + + A Coreferee mention is usually a list of token indices, for example: + [14] + [16, 19] + """ + token_indices = self._mention_to_token_indices(mention) + + if not token_indices: + raise ValueError("Coreferee mention does not contain token indices.") + + if expand_noun_chunks: + token_indices = self._expand_indices_to_noun_chunks( + doc=doc, + token_indices=token_indices, + ) + + first_i = min(token_indices) + last_i = max(token_indices) + + offsets = doc._.external_token_offsets + + begin = offsets[first_i]["begin"] + end = offsets[last_i]["end"] + + contiguous_indices = list(range(first_i, last_i + 1)) + is_contiguous = token_indices == contiguous_indices + + if is_contiguous: + text = doc[first_i:last_i + 1].text + else: + text = " ".join(doc[i].text for i in token_indices) + + return { + "begin": begin, + "end": end, + "text": text, + "token_indices": token_indices, + } + + def _mention_to_token_indices(self, mention) -> list[int]: + """ + Normalize a Coreferee mention to a list of token indices. + """ + if mention is None: + return [] + + if hasattr(mention, "token_indexes"): + return [int(i) for i in mention.token_indexes] + + if hasattr(mention, "token_indices"): + return [int(i) for i in mention.token_indices] + + if isinstance(mention, int): + return [int(mention)] + + return [int(i) for i in mention] + + def _expand_indices_to_noun_chunks( + self, + doc: Doc, + token_indices: list[int], + ) -> list[int]: + """ + Expand token indices to their noun chunks if possible. 
+ + Examples: + token index for "cactus" -> indices for "a cactus" + token index for "vase" -> indices for "The vase" + """ + expanded_indices = set(token_indices) + + try: + noun_chunks = list(doc.noun_chunks) + except Exception: + return sorted(expanded_indices) + + for token_index in token_indices: + for chunk in noun_chunks: + if chunk.start <= token_index < chunk.end: + expanded_indices.update(range(chunk.start, chunk.end)) + break + + return sorted(expanded_indices) + + @staticmethod + def _infer_spaces_from_offsets( + begins: list[int], + ends: list[int], + ) -> list[bool]: + """ + Infer spaCy spaces from begin/end offsets. + + If the next token starts after the current token ends, + there is whitespace between them. + """ + spaces: list[bool] = [] + + for i in range(len(begins)): + if i == len(begins) - 1: + spaces.append(False) + else: + spaces.append(ends[i] < begins[i + 1]) + + return spaces + + @staticmethod + def _validate_token_offsets( + tokens: list[str], + begins: list[int], + ends: list[int], + ) -> None: + """ + Validate that tokens, begins and ends are aligned. + """ + if not tokens: + raise ValueError("tokens must not be empty.") + + if len(tokens) != len(begins) or len(tokens) != len(ends): + raise ValueError( + "tokens, begins and ends must have the same length. " + f"tokens={len(tokens)}, begins={len(begins)}, ends={len(ends)}" + ) + + for i, (token, begin, end) in enumerate(zip(tokens, begins, ends)): + if not token: + raise ValueError(f"token must not be empty at index {i}.") + + if begin < 0: + raise ValueError(f"begin must be >= 0 at index {i}.") + + if end < begin: + raise ValueError(f"end must be >= begin at index {i}.") + + if i > 0 and begin < ends[i - 1]: + raise ValueError( + f"Token offsets must not overlap. 
" + f"Problem at index {i}: begin={begin}, previous_end={ends[i - 1]}" + ) + + +if __name__ == "__main__": + resolver = CorefereeResolver("en", "sm") + + tokens = [ + "Anna", "bought", "a", "cactus", ".", + "The", "plant", "needed", "sunlight", ".", + "She", "put", "a", "vase", "on", "the", "table", ".", + "The", "vase", "was", "old", ",", "but", "it", "was", "beautiful", ".", + "The", "cactus", "grew", "quickly", "because", "it", "got", "enough", "light", ".", + ] + + begins = [ + 0, 5, 12, 14, 20, + 22, 26, 32, 39, 47, + 49, 53, 57, 59, 64, 67, 71, 76, + 78, 82, 87, 91, 94, 96, 100, 103, 107, 116, + 118, 122, 129, 134, 142, 150, 153, 157, 164, 169, + ] + + ends = [ + 4, 11, 13, 20, 21, + 25, 31, 38, 47, 48, + 52, 56, 58, 63, 66, 70, 76, 77, + 81, 86, 90, 94, 95, 99, 102, 106, 116, 117, + 121, 128, 133, 141, 149, 152, 156, 163, 169, 170, + ] + + doc = resolver.process_tokens( + tokens=tokens, + begins=begins, + ends=ends, + ) + + print("Coreference dictionary:") + result = resolver.get_coreference_dict( + doc, + include_self=False, + expand_noun_chunks=True, + ) + + print(result) \ No newline at end of file diff --git a/duui-Coreference/src/main/python/TypeSystemCoreference.xml b/duui-Coreference/src/main/python/TypeSystemCoreference.xml new file mode 100644 index 00000000..15d18277 --- /dev/null +++ b/duui-Coreference/src/main/python/TypeSystemCoreference.xml @@ -0,0 +1,568 @@ + + TypeSystemFactChecking + + 1.0 + + + + + + + + + + + + org.texttechnologylab.annotation.ModelAnnotation + + + + + + uima.tcas.Annotation + + + + + + + ModelReference + + Reference to the Model + + org.texttechnologylab.annotation.MetaData + + + + + + + + + + + + + + org.texttechnologylab.uima.type.Embedding + + + + + + org.texttechnologylab.annotation.ModelAnnotation + + + + + + + + + embedding + + + + + + uima.cas.FloatArray + + + + + + + + + + + + + + + org.texttechnologylab.uima.type.Classification + + + + + + org.texttechnologylab.annotation.ModelAnnotation + + + + + + + + + 
org.texttechnologylab.uima.type.Topic + + + + + + org.texttechnologylab.uima.type.Classification + + + + + + + + + topic + + + + + + uima.cas.String + + + + + + + + + score + + + + + + uima.cas.Double + + + + + + + + + + + + + + + org.texttechnologylab.uima.type.Sentiment + + + + + + org.texttechnologylab.uima.type.Classification + + + + + + + + + sentiment + + + + + + uima.cas.Double + + + + + + + + + subjectivity + + + + + + uima.cas.Double + + + + + + + + + + + + + + + org.texttechnologylab.uima.type.CategorizedSentiment + + + + + + org.texttechnologylab.uima.type.Sentiment + + + + + + + + + pos + + + + + + uima.cas.Double + + + + + + + + + neu + + + + + + uima.cas.Double + + + + + + + + + neg + + + + + + uima.cas.Double + + + + + + + + + + + + + + + org.texttechnologylab.uima.type.StarSentiment + + + + + + org.texttechnologylab.uima.type.Sentiment + + + + + + + + + OneStar + + + + + + uima.cas.Double + + + + + + + + + TwoStars + + + + + + uima.cas.Double + + + + + + + + + ThreeStars + + + + + + uima.cas.Double + + + + + + + + + FourStars + + + + + + uima.cas.Double + + + + + + + + + FiveStars + + + + + + uima.cas.Double + + + + + + + + + + + + org.texttechnologylab.annotation.MetaData + + + + uima.tcas.Annotation + + + + + + Lang + + Language of the method or the Model + + uima.cas.String + + + + + + Source + + Link of the used resource + + uima.cas.String + + + + + + + + + + org.texttechnologylab.annotation.model.MetaData + + + + org.texttechnologylab.annotation.MetaData + + + + + + ModelVersion + + Version of the Model + + uima.cas.String + + + + + + ModelName + + Name of the Model + + uima.cas.String + + + + + + + + + + org.texttechnologylab.annotation.model.SpacyMetaData + + + + org.texttechnologylab.annotation.model.MetaData + + + + + + SpacyVersion + + Spacy Libary Version + + uima.cas.String + + + + + + ModelSpacyGitVersion + + Explicit Spacy git version + + uima.cas.String + + + + + + + + + + org.texttechnologylab.annotation.model.HuggingfaceMetaData + 
+ + + org.texttechnologylab.annotation.model.MetaData + + + + + + HuggingfaceVersion + + Transformer Library Version + + uima.cas.String + + + + + + DependeciesVersion + + Dependency Library Version e.g. Pytorch... + + uima.cas.StringArray + + + + + + + + org.texttechnologylab.annotation.Claim + One Claim for different facts + uima.tcas.Annotation + + + value + Information of Claim + uima.cas.String + + + Facts + Set of Fact + uima.cas.FSArray + org.texttechnologylab.annotation.Fact + + + + + org.texttechnologylab.annotation.Fact + One Fact for different claims + uima.tcas.Annotation + + + value + Information for the fact + uima.cas.String + + + Claims + Set of Claims + uima.cas.FSArray + org.texttechnologylab.annotation.Claim + + + + + org.texttechnologylab.annotation.model.FactCheckingMetaData + + org.texttechnologylab.annotation.model.MetaData + + + DependeciesVersion + Dependency Library Version e.g. Pytorch... + uima.cas.StringArray + + + + + org.texttechnologylab.annotation.FactChecking + Does the assertion confirm the statement + uima.tcas.Annotation + + + Fact + + org.texttechnologylab.annotation.Fact + + + Claim + + org.texttechnologylab.annotation.Claim + + + consistency + + uima.cas.Double + + + model + + org.texttechnologylab.annotation.model.MetaData + + + + + \ No newline at end of file diff --git a/duui-Coreference/src/main/python/duui_coreference.lua b/duui-Coreference/src/main/python/duui_coreference.lua new file mode 100644 index 00000000..9c03455d --- /dev/null +++ b/duui-Coreference/src/main/python/duui_coreference.lua @@ -0,0 +1,144 @@ +-- Bind static classes from java +StandardCharsets = luajava.bindClass("java.nio.charset.StandardCharsets") +Class = luajava.bindClass("java.lang.Class") +JCasUtil = luajava.bindClass("org.apache.uima.fit.util.JCasUtil") +DUUIutils = luajava.bindClass("org.texttechnologylab.DockerUnifiedUIMAInterface.lua.DUUILuaUtils") +Token = luajava.bindClass("org.texttechnologylab.uima.type.spacy.SpacyToken") +Coreference = 
luajava.bindClass("org.texttechnologylab.annotation.Coreference") + +-- This "serialize" function is called to transform the CAS object into an stream that is sent to the annotator +-- Inputs: +-- - inputCas: The actual CAS object to serialize +-- - outputStream: Stream that is sent to the annotator, can be e.g. a string, JSON payload, ... +function serialize(inputCas, outputStream) + -- Get data from CAS + -- For spaCy, we need the documents text and its language + -- TODO add additional params? + print("start") + local doc_text = inputCas:getDocumentText() + print(doc_text) + local doc_lang = inputCas:getDocumentLanguage() + local tokens = {} + local begin_token = {} + local end_token = {} + local tokens_count = 1 + local tokens_it = luajava.newInstance("java.util.ArrayList", JCasUtil:select(inputCas, Token)):listIterator() + while tokens_it:hasNext() do + local token = tokens_it:next() + tokens[tokens_count] = token:getCoveredText() + begin_token[tokens_count] = token:getBegin() + end_token[tokens_count] = token:getEnd() + tokens_count = tokens_count + 1 + end +-- print("sentences") + print(tokens) + print(begin_token) + print(end_token) + outputStream:write(json.encode({ + tokens = tokens, + lang = doc_lang, + begin_token = begin_token, + end_token = end_token, + })) +-- -- print("sendToPython") +end + +-- This "deserialize" function is called on receiving the results from the annotator that have to be transformed into a CAS object +-- Inputs: +-- - inputCas: The actual CAS object to deserialize into +-- - inputStream: Stream that is received from to the annotator, can be e.g. a string, JSON payload, ... 
+function deserialize(inputCas, inputStream) + local inputString = luajava.newInstance("java.lang.String", inputStream:readAllBytes(), StandardCharsets.UTF_8) + local results = json.decode(inputString) +-- print("begin_deserialize") + + if results["modification_meta"] ~= nil and results["meta"] ~= nil and results["begin_resolve"] ~= nil then +-- print("GetInfo") + local source = results["model_source"] + local model_version = results["model_version"] + local model_name = results["model_name"] + local model_lang = results["model_lang"] +-- print("meta") + local modification_meta = results["modification_meta"] + local modification_anno = luajava.newInstance("org.texttechnologylab.annotation.DocumentModification", inputCas) + modification_anno:setUser(modification_meta["user"]) + modification_anno:setTimestamp(modification_meta["timestamp"]) + modification_anno:setComment(modification_meta["comment"]) + modification_anno:addToIndexes() + +-- print("setMetaData") + local model_meta = luajava.newInstance("org.texttechnologylab.annotation.model.MetaData", inputCas) + model_meta:setModelVersion(model_version) + print(model_version) + model_meta:setModelName(model_name) + print(model_name) + model_meta:setSource(source) + print(source) + model_meta:setLang(model_lang) + print(model_lang) + model_meta:addToIndexes() + + local meta = results["meta"] +-- print("meta") + local begin = results["begin"] + local end_token = results["end"] + local begin_resolve = results["begin_resolve"] + local end_resolve = results["end_resolve"] + for index_i, begin_i in ipairs(begin) do + local end_i = end_token[index_i] + local begin_resolve_i = begin_resolve[index_i] + local end_resolve_i = end_resolve[index_i] + local coref_resolve = JCasUtil:selectAt(inputCas, Coreference, begin_resolve_i, end_resolve_i) +-- print(coref_resolve) + if coref_resolve:size() == 0 then + coref_resolve = luajava.newInstance("org.texttechnologylab.annotation.Coreference", inputCas, begin_resolve_i, end_resolve_i); coref_resolve:addToIndexes() 
+ else + coref_resolve = coref_resolve:iterator():next() + end +-- print(coref_resolve) + local coref_anno = luajava.newInstance("org.texttechnologylab.annotation.Coreference", inputCas, begin_i, end_i) + coref_anno:setLink(coref_resolve) + coref_anno:addToIndexes() + end + +-- local meta = results["meta"] +-- -- print("meta") +-- local begin_claims = results["begin_claims"] +-- -- print("begin_claims") +-- local end_claims = results["end_claims"] +-- -- print("end_claims") +-- local begin_facts = results["begin_facts"] +-- -- print("begin_facts") +-- local end_facts = results["end_facts"] +-- -- print("end_facts") +-- local consistency = results["consistency"] +-- -- print("consistency") +-- for index_i, cons in ipairs(consistency) do +-- -- print(cons) +-- local begin_claim_i = begin_claims[index_i] +-- -- print(begin_claim_i) +-- local end_claim_i = end_claims[index_i] +-- -- print(end_claim_i) +-- local begin_fact_i = begin_facts[index_i] +-- -- print(begin_fact_i) +-- local end_fact_i = end_facts[index_i] +-- -- print(end_fact_i) +-- local claim_i = util:selectAt(inputCas, claims, begin_claim_i, end_claim_i):iterator():next() +-- -- print(claim_i) +-- local fact_i = util:selectAt(inputCas, facts, begin_fact_i, end_fact_i):iterator():next() +-- -- print(fact_i) +-- local factcheck_i = luajava.newInstance("org.texttechnologylab.annotation.FactChecking", inputCas) +-- -- print("FactCheck") +-- factcheck_i:setClaim(claim_i) +-- -- print("claim") +-- factcheck_i:setFact(fact_i) +-- -- print("fact") +-- factcheck_i:setConsistency(cons) +-- -- print("cons") +-- factcheck_i:setModel(model_meta) +-- -- print("setModel") +-- factcheck_i:addToIndexes() +-- -- print(factcheck_i) +-- end + end +end diff --git a/duui-Coreference/src/main/python/duui_coreference.py b/duui-Coreference/src/main/python/duui_coreference.py new file mode 100644 index 00000000..f86de5a9 --- /dev/null +++ b/duui-Coreference/src/main/python/duui_coreference.py @@ -0,0 +1,242 @@ +# from pydantic 
import BaseModel +# from pydantic_settings import BaseSettings +from pydantic import BaseModel, BaseSettings +from typing import List, Optional, Dict, Union +import logging +from time import time +from fastapi import FastAPI, Response +from cassis import load_typesystem +from threading import Lock +from functools import lru_cache + +from Coreferee_resolver import CorefereeResolver + +# from Climate import ClimateBert,model_name_map +# from sp_correction import SentenceBestPrediction + +# Settings +# These are automatically loaded from env variables +from starlette.responses import PlainTextResponse + +model_lock = Lock() + + +class UimaSentence(BaseModel): + text: str + begin: int + end: int + + +class UimaSentenceSelection(BaseModel): + selection: str + sentences: List[UimaSentence] + + +class Settings(BaseSettings): + # Name of this annotator + annotator_name: str + # Version of this annotator + annotator_version: str + # Log level + log_level: str + # model_name + model_name: str + # Version of the model + model_version: str + # Maximum number of models kept in the LRU cache + model_cache_size: int + # url of the model + model_source: str + # language of the model + model_lang: str + # sm or lg + model_variant: str + + +# Load settings from env vars +settings = Settings() +lru_cache_with_size = lru_cache(maxsize=settings.model_cache_size) +logging.basicConfig(level=settings.log_level) +logger = logging.getLogger(__name__) + + +# Load the predefined typesystem that is needed for this annotator to work +typesystem_filename = 'TypeSystemCoreference.xml' +logger.debug("Loading typesystem from \"%s\"", typesystem_filename) +with open(typesystem_filename, 'rb') as f: + typesystem = load_typesystem(f) + logger.debug("Base typesystem:") + logger.debug(typesystem.to_xml()) + +# Load the Lua communication script +lua_communication_script_filename = "duui_coreference.lua" +logger.debug("Loading Lua communication script from \"%s\"", lua_communication_script_filename) + + +# Request sent by DUUI +# Note, this
is transformed by the Lua script +class DUUIRequest(BaseModel): + # + tokens: List[str] + # + lang: str + # + begin_token: List[int] + # + end_token: List[int] + + + +# UIMA type: mark modification of the document +class DocumentModification(BaseModel): + user: str + timestamp: int + comment: str + + +# UIMA type: adds metadata to each annotation +class AnnotationMeta(BaseModel): + name: str + version: str + modelName: str + modelVersion: str + + +# Response sent by DUUI +# Note, this is transformed by the Lua script +class DUUIResponse(BaseModel): + # Coreference output (fields begin/end/begin_resolve/end_resolve below): + # parallel lists with one entry per detected coreference link
+ # begin/end: offsets of the referring mention + # begin_resolve/end_resolve: offsets of the expression the mention resolves to + meta: AnnotationMeta + # Modification meta, one per document + modification_meta: DocumentModification + begin: List[int] + end: List[int] + begin_resolve: List[int] + end_resolve: List[int] + model_name: str + model_version: str + model_source: str + model_lang: str + model_variant: str + + +app = FastAPI( + openapi_url="/openapi.json", + docs_url="/api", + redoc_url=None, + title=settings.annotator_name, + description="Factuality annotator", + version=settings.annotator_version, + terms_of_service="https://www.texttechnologylab.org/legal_notice/", + contact={ + "name": "TTLab Team", + "url": "https://texttechnologylab.org", + "email": "bagci@em.uni-frankfurt.de", + }, + license_info={ + "name": "AGPL", + "url": "http://www.gnu.org/licenses/agpl-3.0.en.html", + }, +) + +with open(lua_communication_script_filename, 'rb') as f: + lua_communication_script = f.read().decode("utf-8") +logger.debug("Lua communication script:") +logger.debug(lua_communication_script_filename) + + +# Get typesystem of this annotator +@app.get("/v1/typesystem") +def get_typesystem() -> Response: + # TODO remove cassis dependency, as only needed for typesystem at the moment?
+ xml = typesystem.to_xml() + xml_content = xml.encode("utf-8") + + return Response( + content=xml_content, + media_type="application/xml" + ) + + +# Return Lua communication script +@app.get("/v1/communication_layer", response_class=PlainTextResponse) +def get_communication_layer() -> str: + return lua_communication_script + + +# Return documentation info +@app.get("/v1/documentation") +def get_documentation(): + return "Test" + +@lru_cache_with_size +def load_model(language, variant): + model_i = CorefereeResolver(language, variant) + return model_i + + +# @lru_cache_with_size +# def load_model(model_name): +# model_i = ClimateBert(model_name, device) +# return model_i + + +def fix_unicode_problems(text): + # fix emoji in python string and prevent json error on response + # File "/usr/local/lib/python3.8/site-packages/starlette/responses.py", line 190, in render + # UnicodeEncodeError: 'utf-8' codec can't encode characters in position xx-yy: surrogates not allowed + clean_text = text.encode('utf-16', 'surrogatepass').decode('utf-16', 'surrogateescape') + return clean_text + + +# Process request from DUUI +@app.post("/v1/process") +def post_process(request: DUUIRequest): + # Return data + meta = None + begin = [] + end = [] + begin_resolve = [] + end_resolve = [] + # Save modification start time for later + modification_timestamp_seconds = int(time()) + try: + model_source = settings.model_source + model_lang = settings.model_lang + model_version = settings.model_version + # Set meta information + meta = AnnotationMeta( + name=settings.annotator_name, + version=settings.annotator_version, + modelName=settings.model_name, + modelVersion=model_version, + ) + # Add modification info + modification_meta_comment = f"{settings.annotator_name} ({settings.annotator_version}))" + modification_meta = DocumentModification( + user=settings.annotator_name, + timestamp=modification_timestamp_seconds, + comment=modification_meta_comment + ) + mv = "" + + with model_lock: +
coreference_resolver = load_model(request.lang, settings.model_variant.lower()) + doc = coreference_resolver.process_tokens(request.tokens, request.begin_token, request.end_token) + result = coreference_resolver.get_coreference_dict(doc, include_self=False, expand_noun_chunks=True) + begin = result["begin"] + end = result["end"] + begin_resolve = result["begin_resolve"] + end_resolve = result["end_resolve"] + + except Exception as ex: + logger.exception(ex) + return DUUIResponse(meta=meta, modification_meta=modification_meta, begin=begin, end=end, begin_resolve=begin_resolve, end_resolve=end_resolve, model_name=settings.model_name, + model_version=model_version, model_source=model_source, model_lang=model_lang, model_variant=settings.model_variant) diff --git a/duui-Coreference/src/test/java/org/hucompute/textimager/uima/Coreference/CoreferenceTest.java b/duui-Coreference/src/test/java/org/hucompute/textimager/uima/Coreference/CoreferenceTest.java new file mode 100644 index 00000000..64705932 --- /dev/null +++ b/duui-Coreference/src/test/java/org/hucompute/textimager/uima/Coreference/CoreferenceTest.java @@ -0,0 +1,242 @@ +package org.hucompute.textimager.uima.Coreference; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import org.apache.uima.fit.util.JCasUtil; +import org.texttechnologylab.uima.type.spacy.SpacyToken; +import org.texttechnologylab.annotation.Coreference; + +import org.apache.commons.compress.compressors.CompressorException; +import org.apache.uima.UIMAException; +import org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.jcas.JCas; +import org.apache.uima.util.XmlCasSerializer; + +import org.junit.jupiter.api.*; + +import org.texttechnologylab.DockerUnifiedUIMAInterface.DUUIComposer; +import org.texttechnologylab.DockerUnifiedUIMAInterface.driver.DUUIRemoteDriver; +import org.texttechnologylab.DockerUnifiedUIMAInterface.lua.DUUILuaContext; + +import org.xml.sax.SAXException; + +import 
java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.net.URISyntaxException; +import java.net.UnknownHostException; +import java.nio.charset.StandardCharsets; +import java.util.*; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; + +public class CoreferenceTest { + static DUUIComposer composer; + static JCas cas; + + static String url = "http://127.0.0.1:8000"; + + @BeforeAll + static void beforeAll() throws URISyntaxException, IOException, UIMAException, SAXException, CompressorException { + composer = new DUUIComposer() + .withSkipVerification(true) + .withLuaContext(new DUUILuaContext().withJsonLibrary()); + + DUUIRemoteDriver remoteDriver = new DUUIRemoteDriver(); + composer.addDriver(remoteDriver); + + cas = JCasFactory.createJCas(); + } + + @AfterAll + static void afterAll() throws UnknownHostException { + composer.shutdown(); + } + + @AfterEach + public void afterEach() throws IOException, SAXException { + composer.resetPipeline(); + + ByteArrayOutputStream stream = new ByteArrayOutputStream(); + XmlCasSerializer.serialize(cas.getCas(), null, stream); + System.out.println(stream.toString(StandardCharsets.UTF_8)); + + cas.reset(); + } + + public void createCas( + String language, + List tokens, + List begins, + List ends + ) throws UIMAException { + validateInput(tokens, begins, ends); + + cas.setDocumentLanguage(language); + + String documentText = buildDocumentText(tokens, begins, ends); + cas.setDocumentText(documentText); + + addTokens(tokens, begins, ends); + addSentencesFromPunctuation(tokens, begins, ends); + } + + private void addTokens( + List tokens, + List begins, + List ends + ) { + for (int i = 0; i < tokens.size(); i++) { + SpacyToken token = new SpacyToken(cas, begins.get(i), ends.get(i)); + token.addToIndexes(); + } + } + + private void addSentencesFromPunctuation( + List tokens, + List begins, + List ends + ) { + int sentenceBegin = begins.get(0); + 
+ for (int i = 0; i < tokens.size(); i++) { + String token = tokens.get(i); + + if (token.equals(".") || token.equals("!") || token.equals("?")) { + int sentenceEnd = ends.get(i); + + Sentence sentence = new Sentence(cas, sentenceBegin, sentenceEnd); + sentence.addToIndexes(); + + if (i + 1 < tokens.size()) { + sentenceBegin = begins.get(i + 1); + } + } + } + } + + private String buildDocumentText( + List tokens, + List begins, + List ends + ) { + int documentLength = ends.get(ends.size() - 1); + char[] chars = new char[documentLength]; + Arrays.fill(chars, ' '); + + for (int i = 0; i < tokens.size(); i++) { + String token = tokens.get(i); + int begin = begins.get(i); + int end = ends.get(i); + + for (int j = 0; j < token.length(); j++) { + chars[begin + j] = token.charAt(j); + } + } + + return new String(chars); + } + + private void validateInput( + List tokens, + List begins, + List ends + ) { + assertEquals(tokens.size(), begins.size()); + assertEquals(tokens.size(), ends.size()); + + for (int i = 0; i < tokens.size(); i++) { + String token = tokens.get(i); + int begin = begins.get(i); + int end = ends.get(i); + + if (token.length() != end - begin) { + throw new IllegalArgumentException( + "Token length does not match offsets at index " + i + + ": token='" + token + "'" + + ", begin=" + begin + + ", end=" + end + ); + } + + if (i > 0 && begin < ends.get(i - 1)) { + throw new IllegalArgumentException( + "Token offsets overlap at index " + i + ); + } + } + } + + @Test + public void EnTest() throws Exception { + composer.add( + new DUUIRemoteDriver.Component(url) + .withParameter( + "selection", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" + ) + ); + + List tokens = Arrays.asList( + "Anna", "bought", "a", "cactus", ".", + "The", "plant", "needed", "sunlight", ".", + "She", "put", "a", "vase", "on", "the", "table", ".", + "The", "vase", "was", "old", ",", "but", "it", "was", "beautiful", ".", + "The", "cactus", "grew", "quickly", "because", "it", 
"got", "enough", "light", "." + ); + + List begins = Arrays.asList( + 0, 5, 12, 14, 20, + 22, 26, 32, 39, 47, + 49, 53, 57, 59, 64, 67, 71, 76, + 78, 82, 87, 91, 94, 96, 100, 103, 107, 116, + 118, 122, 129, 134, 142, 150, 153, 157, 164, 169 + ); + + List ends = Arrays.asList( + 4, 11, 13, 20, 21, + 25, 31, 38, 47, 48, + 52, 56, 58, 63, 66, 70, 76, 77, + 81, 86, 90, 94, 95, 99, 102, 106, 116, 117, + 121, 128, 133, 141, 149, 152, 156, 163, 169, 170 + ); + + createCas("en", tokens, begins, ends); + + System.out.println("Input document:"); + System.out.println(cas.getDocumentText()); + SpacyToken h = JCasUtil.selectAt(cas, SpacyToken.class, 0, 4).iterator().next(); + composer.run(cas); + + Collection coreferences = JCasUtil.select(cas, Coreference.class); + Map> result = extractCoreferenceResult(); + for (Coreference coreference : coreferences) { + String token = coreference.getCoveredText(); + int begin = coreference.getBegin(); + int end = coreference.getEnd(); + + result.get("token").add(token); + result.get("begin").add(begin); + result.get("end").add(end); + + if (coreference.getLink() != null) { + String token_resolve = coreference.getLink().getCoveredText(); + int begin_resolve = coreference.getLink().getBegin(); + int end_resolve = coreference.getLink().getEnd(); + System.out.println("Coreference: '" + token + "' (begin=" + begin + ", end=" + end + ")" + " -> '" + token_resolve + "' (begin=" + begin_resolve + ", end=" + end_resolve + ")"); + } + + } + } + + private Map> extractCoreferenceResult() { + Map> result = new LinkedHashMap<>(); + + result.put("begin", new ArrayList<>()); + result.put("end", new ArrayList<>()); + result.put("begin_resolve", new ArrayList<>()); + result.put("end_resolve", new ArrayList<>()); + result.put("token", new ArrayList<>()); + result.put("token_resolve", new ArrayList<>()); + return result; + } +} \ No newline at end of file From 5eb8da10a8191fd2500cc5c3f2b259cda2dcf1a5 Mon Sep 17 00:00:00 2001 From: Mevluet Bagci Date: Tue, 2 
Jun 2026 19:23:50 +0200 Subject: [PATCH 06/19] Update Coreference Tool --- duui-Coreference/docker_build.sh | 72 ++++++++++++++++--- .../{reqiurements.txt => requirements.txt} | 2 +- duui-Coreference/src/main/docker/Dockerfile | 35 ++++++--- .../src/main/python/Coreferee_resolver.py | 2 +- .../src/main/python/duui_coreference.lua | 12 ++-- .../src/main/python/duui_coreference.py | 2 +- .../uima/Coreference/CoreferenceTest.java | 2 +- 7 files changed, 100 insertions(+), 27 deletions(-) rename duui-Coreference/{reqiurements.txt => requirements.txt} (91%) diff --git a/duui-Coreference/docker_build.sh b/duui-Coreference/docker_build.sh index e1c1cec8..5da1de30 100644 --- a/duui-Coreference/docker_build.sh +++ b/duui-Coreference/docker_build.sh @@ -1,24 +1,78 @@ -export ANNOTATOR_NAME=duui-factchecking -export ANNOTATOR_VERSION=0.1.0 +export ANNOTATOR_NAME=duui-coreference +export ANNOTATOR_VERSION=0.2.0 export LOG_LEVEL=INFO eport MODEL_CACHE_SIZE=3 +###--------------------------------------------------------------------- +#export MODEL_NAME="coreferee" +#export MODEL_SPECNAME="coreferee" +#export MODEL_VERSION="3ee6f2781e54988d6c3593c6b8f37cc3bae8f982" +#export MODEL_SOURCE="https://github.com/richardpaulhudson/coreferee" +#export MODEL_LANG="en" +#export MODEL_VARIANT="sm" +###-------------------------------------------------------------------- + ##--------------------------------------------------------------------- export MODEL_NAME="coreferee" export MODEL_SPECNAME="coreferee" export MODEL_VERSION="3ee6f2781e54988d6c3593c6b8f37cc3bae8f982" export MODEL_SOURCE="https://github.com/richardpaulhudson/coreferee" -export MODEL_LANG="EN,DE,FR,PL" -export MODEL_VARIANT="SM" +export MODEL_LANG="de" +export MODEL_VARIANT="sm" ##-------------------------------------------------------------------- +# +###--------------------------------------------------------------------- +#export MODEL_NAME="coreferee" +#export MODEL_SPECNAME="coreferee" +#export 
MODEL_VERSION="3ee6f2781e54988d6c3593c6b8f37cc3bae8f982" +#export MODEL_SOURCE="https://github.com/richardpaulhudson/coreferee" +#export MODEL_LANG="fr" +#export MODEL_VARIANT="sm" +###-------------------------------------------------------------------- +# +###--------------------------------------------------------------------- +#export MODEL_NAME="coreferee" +#export MODEL_SPECNAME="coreferee" +#export MODEL_VERSION="3ee6f2781e54988d6c3593c6b8f37cc3bae8f982" +#export MODEL_SOURCE="https://github.com/richardpaulhudson/coreferee" +#export MODEL_LANG="pl" +#export MODEL_VARIANT="sm" +###-------------------------------------------------------------------- + +###--------------------------------------------------------------------- +#export MODEL_NAME="coreferee" +#export MODEL_SPECNAME="coreferee" +#export MODEL_VERSION="3ee6f2781e54988d6c3593c6b8f37cc3bae8f982" +#export MODEL_SOURCE="https://github.com/richardpaulhudson/coreferee" +#export MODEL_LANG="en" +#export MODEL_VARIANT="lg" +###-------------------------------------------------------------------- + +###--------------------------------------------------------------------- +#export MODEL_NAME="coreferee" +#export MODEL_SPECNAME="coreferee" +#export MODEL_VERSION="3ee6f2781e54988d6c3593c6b8f37cc3bae8f982" +#export MODEL_SOURCE="https://github.com/richardpaulhudson/coreferee" +#export MODEL_LANG="de" +#export MODEL_VARIANT="lg" +###-------------------------------------------------------------------- + +###--------------------------------------------------------------------- +#export MODEL_NAME="coreferee" +#export MODEL_SPECNAME="coreferee" +#export MODEL_VERSION="3ee6f2781e54988d6c3593c6b8f37cc3bae8f982" +#export MODEL_SOURCE="https://github.com/richardpaulhudson/coreferee" +#export MODEL_LANG="fr" +#export MODEL_VARIANT="lg" +###-------------------------------------------------------------------- ###--------------------------------------------------------------------- #export MODEL_NAME="coreferee" #export 
MODEL_SPECNAME="coreferee" #export MODEL_VERSION="3ee6f2781e54988d6c3593c6b8f37cc3bae8f982" #export MODEL_SOURCE="https://github.com/richardpaulhudson/coreferee" -#export MODEL_LANG="EN,DE,FR,PL" -#export MODEL_VARIANT="LG" +#export MODEL_LANG="pl" +#export MODEL_VARIANT="lg" ###-------------------------------------------------------------------- @@ -36,10 +90,10 @@ docker build \ --build-arg MODEL_SOURCE \ --build-arg MODEL_LANG \ --build-arg MODEL_VARIANT \ - -t ${DOCKER_REGISTRY}${ANNOTATOR_NAME}"-"${MODEL_SPECNAME}"-"${MODEL_VARIANT}:${ANNOTATOR_VERSION}${DUUI_CUDA} \ + -t ${DOCKER_REGISTRY}${ANNOTATOR_NAME}"-"${MODEL_SPECNAME}"-"${MODEL_LANG}"-"${MODEL_VARIANT}:${ANNOTATOR_VERSION}${DUUI_CUDA} \ -f src/main/docker/Dockerfile${DUUI_CUDA} \ . docker tag \ - ${DOCKER_REGISTRY}${ANNOTATOR_NAME}"-"${MODEL_SPECNAME}"-"${MODEL_VARIANT}:${ANNOTATOR_VERSION}${DUUI_CUDA} \ - ${DOCKER_REGISTRY}${ANNOTATOR_NAME}"-"${MODEL_SPECNAME}"-"${MODEL_VARIANT}:latest${DUUI_CUDA} \ No newline at end of file + ${DOCKER_REGISTRY}${ANNOTATOR_NAME}"-"${MODEL_SPECNAME}"-"${MODEL_LANG}"-"${MODEL_VARIANT}:${ANNOTATOR_VERSION}${DUUI_CUDA} \ + ${DOCKER_REGISTRY}${ANNOTATOR_NAME}"-"${MODEL_SPECNAME}"-"${MODEL_LANG}"-"${MODEL_VARIANT}:latest${DUUI_CUDA} \ No newline at end of file diff --git a/duui-Coreference/reqiurements.txt b/duui-Coreference/requirements.txt similarity index 91% rename from duui-Coreference/reqiurements.txt rename to duui-Coreference/requirements.txt index f61317a7..bb516d09 100644 --- a/duui-Coreference/reqiurements.txt +++ b/duui-Coreference/requirements.txt @@ -1,4 +1,4 @@ -spacy==3.5.0 +spacy==3.2.0 coreferee==1.4.1 numpy==1.26.4 setuptools<70 diff --git a/duui-Coreference/src/main/docker/Dockerfile b/duui-Coreference/src/main/docker/Dockerfile index 68ed716c..f6dd18c9 100644 --- a/duui-Coreference/src/main/docker/Dockerfile +++ b/duui-Coreference/src/main/docker/Dockerfile @@ -8,18 +8,37 @@ EXPOSE 9714 COPY ./requirements.txt ./requirements.txt RUN pip install -r 
requirements.txt -RUN python3 -m coreferee install en -RUN python3 -m coreferee install de -RUN python3 -m coreferee install fr -RUN python3 -m coreferee install pl -RUN python -m spacy download en_core_web_sm -RUN python -m spacy download de_core_news_sm -RUN python -m spacy download fr_core_news_sm -RUN python -m spacy download pl_core_news_sm +RUN python -m pip install --no-cache-dir \ + "spacy==3.2.0" \ + "coreferee" \ + "en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl" \ + "de-core-news-sm @ https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.2.0/de_core_news_sm-3.2.0-py3-none-any.whl" \ + "fr-core-news-sm @ https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.2.0/fr_core_news_sm-3.2.0-py3-none-any.whl" \ + "pl-core-news-md @ https://github.com/explosion/spacy-models/releases/download/pl_core_news_md-3.2.0/pl_core_news_md-3.2.0-py3-none-any.whl" && \ + python -m spacy validate && \ + python -m coreferee install en && \ + python -m coreferee install de && \ + python -m coreferee install fr && \ + python -m coreferee install pl + + +#RUN python -m pip install --no-cache-dir \ +# "spacy==3.2.0" \ +# "coreferee" \ +# "en-core-web-lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.2.0/en_core_web_lg-3.2.0-py3-none-any.whl" \ +# "de-core-news-lg @ https://github.com/explosion/spacy-models/releases/download/de_core_news_lg-3.2.0/de_core_news_lg-3.2.0-py3-none-any.whl" \ +# "fr-core-news-lg @ https://github.com/explosion/spacy-models/releases/download/fr_core_news_lg-3.2.0/fr_core_news_lg-3.2.0-py3-none-any.whl" \ +# "pl-core-news-lg @ https://github.com/explosion/spacy-models/releases/download/pl_core_news_lg-3.2.0/pl_core_news_lg-3.2.0-py3-none-any.whl" && \ +# python -m spacy validate && \ +# python -m coreferee install en && \ +# python -m coreferee install de && \ +# python -m coreferee install 
fr && \ +# python -m coreferee install pl # copy scripts COPY ./src/main/python/TypeSystemCoreference.xml ./TypeSystemCoreference.xml COPY ./src/main/python/duui_coreference.py ./duui_coreference.py +COPY ./src/main/python/Coreferee_resolver.py ./Coreferee_resolver.py COPY ./src/main/python/duui_coreference.lua ./duui_coreference.lua #COPY ./src/main/python/GenreSpeech.py ./GenreSpeech.py diff --git a/duui-Coreference/src/main/python/Coreferee_resolver.py b/duui-Coreference/src/main/python/Coreferee_resolver.py index def8254e..703bce21 100644 --- a/duui-Coreference/src/main/python/Coreferee_resolver.py +++ b/duui-Coreference/src/main/python/Coreferee_resolver.py @@ -42,7 +42,7 @@ class CorefereeResolver: "en": "en_core_web_sm", "de": "de_core_news_sm", "fr": "fr_core_news_sm", - "pl": "pl_core_news_sm", + "pl": "pl_core_news_md", }, "lg": { "en": "en_core_web_lg", diff --git a/duui-Coreference/src/main/python/duui_coreference.lua b/duui-Coreference/src/main/python/duui_coreference.lua index 9c03455d..46c69c82 100644 --- a/duui-Coreference/src/main/python/duui_coreference.lua +++ b/duui-Coreference/src/main/python/duui_coreference.lua @@ -3,7 +3,7 @@ StandardCharsets = luajava.bindClass("java.nio.charset.StandardCharsets") Class = luajava.bindClass("java.lang.Class") JCasUtil = luajava.bindClass("org.apache.uima.fit.util.JCasUtil") DUUIutils = luajava.bindClass("org.texttechnologylab.DockerUnifiedUIMAInterface.lua.DUUILuaUtils") -Token = luajava.bindClass("org.texttechnologylab.uima.type.spacy.SpacyToken") +Token = luajava.bindClass("de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token") Coreference = luajava.bindClass("org.texttechnologylab.annotation.Coreference") -- This "serialize" function is called to transform the CAS object into an stream that is sent to the annotator @@ -14,9 +14,9 @@ function serialize(inputCas, outputStream) -- Get data from CAS -- For spaCy, we need the documents text and its language -- TODO add additional params? 
- print("start") +-- print("start") local doc_text = inputCas:getDocumentText() - print(doc_text) +-- print(doc_text) local doc_lang = inputCas:getDocumentLanguage() local tokens = {} local begin_token = {} @@ -31,9 +31,9 @@ function serialize(inputCas, outputStream) tokens_count = tokens_count + 1 end -- print("sentences") - print(tokens) - print(begin_token) - print(end_token) +-- print(tokens) +-- print(begin_token) +-- print(end_token) outputStream:write(json.encode({ tokens = tokens, lang = doc_lang, diff --git a/duui-Coreference/src/main/python/duui_coreference.py b/duui-Coreference/src/main/python/duui_coreference.py index f86de5a9..5e4b4ec5 100644 --- a/duui-Coreference/src/main/python/duui_coreference.py +++ b/duui-Coreference/src/main/python/duui_coreference.py @@ -228,7 +228,7 @@ def post_process(request: DUUIRequest): mv = "" with model_lock: - coreference_resolver = load_model(request.lang, settings.model_variant.lower()) + coreference_resolver = load_model(model_lang, settings.model_variant.lower()) doc = coreference_resolver.process_tokens(request.tokens, request.begin_token, request.end_token) result = coreference_resolver.get_coreference_dict(doc, include_self=False, expand_noun_chunks=True) begin = result["begin"] diff --git a/duui-Coreference/src/test/java/org/hucompute/textimager/uima/Coreference/CoreferenceTest.java b/duui-Coreference/src/test/java/org/hucompute/textimager/uima/Coreference/CoreferenceTest.java index 64705932..4fd42bd7 100644 --- a/duui-Coreference/src/test/java/org/hucompute/textimager/uima/Coreference/CoreferenceTest.java +++ b/duui-Coreference/src/test/java/org/hucompute/textimager/uima/Coreference/CoreferenceTest.java @@ -33,7 +33,7 @@ public class CoreferenceTest { static DUUIComposer composer; static JCas cas; - static String url = "http://127.0.0.1:8000"; + static String url = "http://127.0.0.1:9714"; @BeforeAll static void beforeAll() throws URISyntaxException, IOException, UIMAException, SAXException, 
CompressorException { From faf0fe2466b9ea23023cb7b01ad37434b84d242c Mon Sep 17 00:00:00 2001 From: Mevluet Bagci Date: Mon, 8 Jun 2026 16:37:48 +0200 Subject: [PATCH 07/19] Add DUUI-based transformer NER components (duui-NER) Implement Dockerized DUUI components for selected multilingual transformer-based Named Entity Recognition models, including GLiNER, GLiNER2, RoBERTa, WikiNEuRal, and XLM-R. Each Docker image now builds a single model-specific NER service with DUUI endpoints for type system, Lua communication, documentation, and processing. Add model metadata, runtime parameters, DKPro/TTLab annotation mapping, and usage documentation for DUUI integration. --- duui-NER/.dockerignore | 3 + duui-NER/.gitignore | 3 + duui-NER/Readme.md | 147 +++++ duui-NER/docker_build.sh | 89 +++ duui-NER/duui-NER.iml | 8 + duui-NER/pom.xml | 155 ++++++ duui-NER/requirements.txt | 14 + duui-NER/src/main/docker/Dockerfile | 60 ++ duui-NER/src/main/docker/Dockerfile-cuda | 70 +++ duui-NER/src/main/python/TypeSystemNER.xml | 132 +++++ duui-NER/src/main/python/duui_ner.lua | 261 +++++++++ duui-NER/src/main/python/duui_ner.py | 525 ++++++++++++++++++ .../main/python/ner_classification_backend.py | 396 +++++++++++++ .../textimager/uima/NER/NERTest.java | 167 ++++++ 14 files changed, 2030 insertions(+) create mode 100644 duui-NER/.dockerignore create mode 100644 duui-NER/.gitignore create mode 100644 duui-NER/Readme.md create mode 100644 duui-NER/docker_build.sh create mode 100644 duui-NER/duui-NER.iml create mode 100644 duui-NER/pom.xml create mode 100644 duui-NER/requirements.txt create mode 100644 duui-NER/src/main/docker/Dockerfile create mode 100644 duui-NER/src/main/docker/Dockerfile-cuda create mode 100644 duui-NER/src/main/python/TypeSystemNER.xml create mode 100644 duui-NER/src/main/python/duui_ner.lua create mode 100644 duui-NER/src/main/python/duui_ner.py create mode 100644 duui-NER/src/main/python/ner_classification_backend.py create mode 100644 
duui-NER/src/test/java/org/hucompute/textimager/uima/NER/NERTest.java diff --git a/duui-NER/.dockerignore b/duui-NER/.dockerignore new file mode 100644 index 00000000..caab0c36 --- /dev/null +++ b/duui-NER/.dockerignore @@ -0,0 +1,3 @@ +.idea/ +target/ +venv/ \ No newline at end of file diff --git a/duui-NER/.gitignore b/duui-NER/.gitignore new file mode 100644 index 00000000..d2092691 --- /dev/null +++ b/duui-NER/.gitignore @@ -0,0 +1,3 @@ +.idea/ +target/ +venv*/ \ No newline at end of file diff --git a/duui-NER/Readme.md b/duui-NER/Readme.md new file mode 100644 index 00000000..988fbbb5 --- /dev/null +++ b/duui-NER/Readme.md @@ -0,0 +1,147 @@ +[![Version](https://img.shields.io/static/v1?label=duui-ner&message=0.1.0&color=blue)](https://docker.texttechnologylab.org/v2/duui-ner/tags/list) +[![Version](https://img.shields.io/static/v1?label=Python&message=3.12&color=green)]() +[![Version](https://img.shields.io/static/v1?label=Transformers&message=5.1.0&color=yellow)]() +[![Version](https://img.shields.io/static/v1?label=Torch&message=2.11.0&color=red)]() +[![Version](https://img.shields.io/static/v1?label=GLiNER&message=0.2.26&color=orange)]() +[![Version](https://img.shields.io/static/v1?label=GLiNER2&message=1.3.1&color=orange)]() + +# Transformers NER + +DUUI implementation for selected transformer-based Named Entity Recognition (NER) models. The component is designed for use with the [Docker Unified UIMA Interface (DUUI)](https://github.com/texttechnologylab/DockerUnifiedUIMAInterface). + +The component supports one NER model per Docker image/container. Each image is built with a single `MODEL_NAME` and exposes the DUUI endpoints for type system, Lua communication layer, documentation, and processing. 
+ +## Included Models + +| Image suffix / `MODEL_SPECNAME` | `MODEL_NAME` | Model source | Model version | Languages | Backend | +| --- | --- | --- | --- | --- | --- | +| `gliner-multi-v2-1` | `gliner` | https://huggingface.co/urchade/gliner_multi-v2.1 | `443d26d654e0324125a96bebd8e796c14ff2efe6` | Multilingual | GLiNER | +| `gliner2-multi-v1` | `gliner2` | https://huggingface.co/fastino/gliner2-multi-v1 | `cc151f5b0ce4f7010c3ae8884527dd43dddf9d21` | Multilingual | GLiNER2 | +| `roberta-ner-multilingual` | `roberta-ner-multilingual` | https://huggingface.co/julian-schelb/roberta-ner-multilingual | `d0a19147f3bb0065c8091459e3d35405ce9d48da` | Multilingual | HuggingFace token-classification | +| `wikineural-multilingual-ner` | `wikineural-multilingual-ner` | https://huggingface.co/Babelscape/wikineural-multilingual-ner | `bed6ee7a45d2827b6c90a4fd7983f0241ae0a5c1` | Multilingual | HuggingFace token-classification | +| `xlm-r-ner-40-lang` | `xlm-r-ner-40-lang` | https://huggingface.co/nbroad/jplu-xlm-r-ner-40-lang | `7f7f0fe9bc946a9848611aff079f556387687216` | Multilingual / 40 languages | HuggingFace token-classification | + +## Annotation Types + +The component creates UIMA NER annotations from the model output. 
Standard NER labels are mapped to DKPro NER types where possible, for example: + +| Label | UIMA type | +| --- | --- | +| `PER`, `person` | `de.tudarmstadt.ukp.dkpro.core.api.ner.type.Person` | +| `ORG`, `organization` | `de.tudarmstadt.ukp.dkpro.core.api.ner.type.Organization` | +| `LOC`, `location` | `de.tudarmstadt.ukp.dkpro.core.api.ner.type.Location` | +| `taxon`, `taxa` | `org.texttechnologylab.annotation.type.Taxon` | +| other labels | `de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity` | + +The `taxon` label is mapped to the TTLab taxon type: + +```text +org.texttechnologylab.annotation.type.Taxon +``` + +The delivered type system must include this type if taxon annotations should be created as `Taxon` instead of falling back to a generic `NamedEntity`. + +## Requirements + +The container uses Python 3.12 and the following core Python dependencies: + +| Package | Version | +| --- | --- | +| `gliner` | `0.2.26` | +| `gliner2[local]` | `1.3.1` | +| `transformers` | `5.1.0` | +| `torch` | `2.11.0` | +| `fastapi` | `0.110.0` | +| `dkpro-cassis` | `0.9.1` | +| `uvicorn[standard]` | `0.27.1` | +| `pydantic-settings` | `2.0.2` | + +See `requirements.txt` for the full dependency list. 
+
+# How To Use
+
+## Start Docker container
+
+```bash
+docker run --rm -p 9714:9714 docker.texttechnologylab.org/duui-ner-[modelname]:latest
+```
+
+Example:
+
+```bash
+docker run --rm -p 9714:9714 docker.texttechnologylab.org/duui-ner-wikineural-multilingual-ner:latest
+```
+
+## Run within DUUI
+
+```java
+composer.add(
+    new DUUIDockerDriver.Component("docker.texttechnologylab.org/duui-ner-[modelname]:latest")
+        .withParameter(
+            "selection",
+            "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence"
+        )
+);
+```
+
+With optional runtime parameters:
+
+```java
+composer.add(
+    new DUUIDockerDriver.Component("docker.texttechnologylab.org/duui-ner-[modelname]:latest")
+        .withParameter(
+            "selection",
+            "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence"
+        )
+        .withParameter("threshold", "0.5")
+        .withParameter("batch_size", "8")
+        .withParameter("labels", "person,organization,location,date,event,product,taxon,other")
+);
+```
+
+### Parameters
+
+| Name | Default | Description |
+| --- | --- | --- |
+| `selection` | `de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence` | Use `text` to process the full document text or any selectable UIMA type class name, e.g. `de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence`. Several comma-separated values may be given; each one is processed as its own selection. |
+| `threshold` | `0.5` | Confidence threshold for GLiNER/GLiNER2, between `0.0` and `1.0`. HuggingFace token-classification models may ignore this value. |
+| `batch_size` | `8` | Batch size used during prediction; must be at least `1`. |
+| `labels` | `person,organization,location,date,event,product,taxon,other` | Candidate labels for GLiNER/GLiNER2. HuggingFace token-classification models use their trained label set. |
+
+## Runtime behavior
+
+- Each Docker image/container uses exactly one model.
+- `MODEL_NAME` selects the backend model alias used by the Python service.
+- `MODEL_VERSION` is used as model metadata in the DUUI response.
+- `MODEL_SOURCE` and `MODEL_LANG` are also returned as metadata.
+- Runtime parameters such as `threshold`, `batch_size`, and `labels` are passed via DUUI `.withParameter(...)`.
+
+# Cite
+
+If you use this DUUI image, please cite DUUI as follows:
+
+Alexander Leonhardt, Giuseppe Abrami, Daniel Baumartz and Alexander Mehler. (2023). "Unlocking the Heterogeneous Landscape of Big Data NLP with DUUI." Findings of the Association for Computational Linguistics: EMNLP 2023, 385–399. [[LINK](https://aclanthology.org/2023.findings-emnlp.29)] [[PDF](https://aclanthology.org/2023.findings-emnlp.29.pdf)]
+
+## BibTeX
+
+```bibtex
+@inproceedings{Leonhardt:et:al:2023,
+  title = {Unlocking the Heterogeneous Landscape of Big Data {NLP} with {DUUI}},
+  author = {Leonhardt, Alexander and Abrami, Giuseppe and Baumartz, Daniel and Mehler, Alexander},
+  editor = {Bouamor, Houda and Pino, Juan and Bali, Kalika},
+  booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2023},
+  year = {2023},
+  address = {Singapore},
+  publisher = {Association for Computational Linguistics},
+  url = {https://aclanthology.org/2023.findings-emnlp.29},
+  pages = {385--399},
+  pdf = {https://aclanthology.org/2023.findings-emnlp.29.pdf}
+}
+
+@misc{Bagci:2026,
+  author = {Bagci, Mevlüt},
+  title = {Transformer-based Named Entity Recognition models as {DUUI} component},
+  year = {2026},
+  howpublished = {https://github.com/texttechnologylab/duui-uima}
+}
+```
\ No newline at end of file
diff --git a/duui-NER/docker_build.sh b/duui-NER/docker_build.sh
new file mode 100644
index 00000000..7b5d3ecb
--- /dev/null
+++ b/duui-NER/docker_build.sh
@@ -0,0 +1,96 @@
+#!/usr/bin/env bash
+# Builds and tags the DUUI NER image for exactly one model configuration.
+set -euo pipefail
+
+# Image flavour: empty builds src/main/docker/Dockerfile, "-cuda" builds Dockerfile-cuda.
+export ANNOTATOR_CUDA=
+#export ANNOTATOR_CUDA="-cuda"
+
+export ANNOTATOR_NAME=duui-ner
+export ANNOTATOR_VERSION=0.1.0
+export LOG_LEVEL=DEBUG
+export MODEL_CACHE_SIZE=1
+export DOCKER_REGISTRY="docker.texttechnologylab.org/"
+
+# Optional GLiNER/GLiNER2 settings.
+# NOTE(review): these three values are not forwarded as --build-arg below, so
+# this script does not pass them into the image.
+export NER_LABELS="person,organization,location,date,event,product,taxon,other"
+export THRESHOLD=0.5
+export BATCH_SIZE=8
+
+###---------------------------------------------------------------------
+# GLiNER
+# Enable the matching line in the Dockerfile:
+# RUN python -c "from gliner import GLiNER; GLiNER.from_pretrained(model_id='urchade/gliner_multi-v2.1', map_location='cpu')"
+#export MODEL_NAME="gliner"
+#export MODEL_SPECNAME="gliner-multi-v2-1"
+#export MODEL_VERSION="443d26d654e0324125a96bebd8e796c14ff2efe6"
+#export MODEL_SOURCE="https://huggingface.co/urchade/gliner_multi-v2.1"
+#export MODEL_LANG="Multi"
+###---------------------------------------------------------------------
+
+###---------------------------------------------------------------------
+# GLiNER2
+# Enable the matching line in the Dockerfile:
+# RUN python -c "from gliner2 import GLiNER2; GLiNER2.from_pretrained('fastino/gliner2-multi-v1')"
+#export MODEL_NAME="gliner2"
+#export MODEL_SPECNAME="gliner2-multi-v1"
+#export MODEL_VERSION="cc151f5b0ce4f7010c3ae8884527dd43dddf9d21"
+#export MODEL_SOURCE="https://huggingface.co/fastino/gliner2-multi-v1"
+#export MODEL_LANG="Multi"
+###---------------------------------------------------------------------
+
+###---------------------------------------------------------------------
+# RoBERTa multilingual NER
+# Enable the matching line in the Dockerfile:
+# RUN python -c "from transformers import pipeline; pipeline('token-classification', model='julian-schelb/roberta-ner-multilingual', aggregation_strategy='simple')"
+#export MODEL_NAME="roberta-ner-multilingual"
+#export MODEL_SPECNAME="roberta-ner-multilingual"
+#export MODEL_VERSION="d0a19147f3bb0065c8091459e3d35405ce9d48da"
+#export MODEL_SOURCE="https://huggingface.co/julian-schelb/roberta-ner-multilingual"
+#export MODEL_LANG="Multi"
+###---------------------------------------------------------------------
+
+###---------------------------------------------------------------------
+# WikiNEuRal multilingual NER
+# Enable the matching line in the Dockerfile:
+# RUN python -c "from transformers import pipeline; pipeline('token-classification', model='Babelscape/wikineural-multilingual-ner', aggregation_strategy='simple')"
+#export MODEL_NAME="wikineural-multilingual-ner"
+#export MODEL_SPECNAME="wikineural-multilingual-ner"
+#export MODEL_VERSION="bed6ee7a45d2827b6c90a4fd7983f0241ae0a5c1"
+#export MODEL_SOURCE="https://huggingface.co/Babelscape/wikineural-multilingual-ner"
+#export MODEL_LANG="Multi"
+###---------------------------------------------------------------------
+
+###---------------------------------------------------------------------
+# XLM-R NER 40 languages
+# Enable the matching line in the Dockerfile:
+# RUN python -c "from transformers import pipeline; pipeline('token-classification', model='nbroad/jplu-xlm-r-ner-40-lang', aggregation_strategy='simple')"
+export MODEL_NAME="xlm-r-ner-40-lang"
+export MODEL_SPECNAME="xlm-r-ner-40-lang"
+export MODEL_VERSION="7f7f0fe9bc946a9848611aff079f556387687216"
+export MODEL_SOURCE="https://huggingface.co/nbroad/jplu-xlm-r-ner-40-lang"
+export MODEL_LANG="Multi"
+###---------------------------------------------------------------------
+
+# Quote every expansion so that a value containing whitespace or glob
+# characters cannot be word-split into additional docker arguments.
+IMAGE="${DOCKER_REGISTRY}${ANNOTATOR_NAME}-${MODEL_SPECNAME}"
+
+docker build \
+  --build-arg ANNOTATOR_NAME \
+  --build-arg ANNOTATOR_VERSION \
+  --build-arg LOG_LEVEL \
+  --build-arg MODEL_CACHE_SIZE \
+  --build-arg MODEL_NAME \
+  --build-arg MODEL_VERSION \
+  --build-arg MODEL_SOURCE \
+  --build-arg MODEL_LANG \
+  -t "${IMAGE}:${ANNOTATOR_VERSION}${ANNOTATOR_CUDA}" \
+  -f "src/main/docker/Dockerfile${ANNOTATOR_CUDA}" \
+  .
+
+docker tag \
+  "${IMAGE}:${ANNOTATOR_VERSION}${ANNOTATOR_CUDA}" \
+  "${IMAGE}:latest${ANNOTATOR_CUDA}"
\ No newline at end of file
diff --git a/duui-NER/duui-NER.iml b/duui-NER/duui-NER.iml
new file mode 100644
index 00000000..b58dac36
--- /dev/null
+++ b/duui-NER/duui-NER.iml
@@ -0,0 +1,8 @@
+ + + + + + + +
\ No newline at end of file
diff --git a/duui-NER/pom.xml b/duui-NER/pom.xml
new file mode 100644
index 00000000..28b4395d
--- /dev/null
+++ b/duui-NER/pom.xml
@@ -0,0 +1,155 @@
+ + + 4.0.0 + + org.texttechnologylab.duui + duui-ner + 0.1.0 + + + + AGPL-3.0-or-later + https://www.gnu.org/licenses/agpl.txt + repo + GNU Affero General Public License v3.0 or later + + + + + Texttechnology Lab + https://www.texttechnologylab.org + + + + mehler + Prof. Dr. Alexander Mehler + mehler@em.uni-frankfurt.de + https://www.texttechnologylab.org/team/alexander-abrami/ + Goethe University Frankfurt / Texttechnology Lab + https://www.texttechnologylab.org + + head of department + + + + bagci + Mevlüt Bagci + bagci@em.uni-frankfurt.de + https://www.texttechnologylab.org/team/mevl%c3%bct-bagci/ + Goethe University Frankfurt / Texttechnology Lab + https://www.texttechnologylab.org + + lead developer + + Europe/Berlin + + + + + + + org.apache.maven.plugins + maven-surefire-plugin + 2.22.0 + + + --illegal-access=permit + --add-opens java.base/java.util=ALL-UNNAMED + + + + + + + + + + 17 + 17 + 2.4.0 + + + + + + + jitpack.io + https://jitpack.io + + + + + + + org.dkpro.core + dkpro-core-asl + ${dkpro.core.version} + pom + import + + + + + + + + com.github.texttechnologylab + DockerUnifiedUIMAInterface + 7cef2433b5 + + + + + + + + + com.github.texttechnologylab + UIMATypeSystem + 3.0.14 + + + + + + + + + + + + + + + + org.junit.jupiter + junit-jupiter + 5.9.0 + test + + + + org.dkpro.core + dkpro-core-api-segmentation-asl + test + + + + org.dkpro.core + dkpro-core-io-xmi-asl + test + + +
org.dkpro.core + dkpro-core-api-resources-asl + test + + +
\ No newline at end of file
diff --git a/duui-NER/requirements.txt b/duui-NER/requirements.txt
new file mode 100644
index 00000000..dc3f5cb2
--- /dev/null
+++ b/duui-NER/requirements.txt
@@ -0,0 +1,14 @@
+gliner==0.2.26
+gliner2[local]==1.3.1
+transformers==5.1.0
+torch==2.11.0
+torchvision==0.26.0
+torchaudio==2.11.0
+nltk==3.9.4
+termcolor==3.3.0
+six==1.17.0
+fastapi==0.110.0
+dkpro-cassis==0.9.1
+uvicorn[standard]==0.27.1
+pydantic-settings==2.0.2
+torchmetrics==1.2.0
\ No newline at end of file
diff --git a/duui-NER/src/main/docker/Dockerfile b/duui-NER/src/main/docker/Dockerfile
new file mode 100644
index 00000000..47aa2c08
--- /dev/null
+++ b/duui-NER/src/main/docker/Dockerfile
@@ -0,0 +1,60 @@
+FROM python:3.12
+
+WORKDIR /usr/src/app
+
+EXPOSE 9714
+
+# dependencies
+COPY ./requirements.txt ./requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt
+
+# MODEL_NAME=gliner
+#RUN python -c "from gliner import GLiNER; GLiNER.from_pretrained(model_id='urchade/gliner_multi-v2.1', map_location='cpu')"
+# MODEL_NAME=gliner2
+#RUN python -c "from gliner2 import GLiNER2; GLiNER2.from_pretrained('fastino/gliner2-multi-v1')"
+# MODEL_NAME=roberta-ner-multilingual
+#RUN python -c "from transformers import pipeline; pipeline('token-classification', model='julian-schelb/roberta-ner-multilingual', aggregation_strategy='simple')"
+# MODEL_NAME=wikineural-multilingual-ner
+#RUN python -c "from transformers import pipeline; pipeline('token-classification', model='Babelscape/wikineural-multilingual-ner', aggregation_strategy='simple')"
+# MODEL_NAME=xlm-r-ner-40-lang
+RUN python -c "from transformers import pipeline; pipeline('token-classification', model='nbroad/jplu-xlm-r-ner-40-lang', aggregation_strategy='simple')"
+
+# copy DUUI NER scripts
+COPY ./src/main/python/TypeSystemNER.xml ./TypeSystemNER.xml
+COPY ./src/main/python/duui_ner.py ./duui_ner.py
+COPY ./src/main/python/ner_classification_backend.py ./ner_classification_backend.py
+COPY ./src/main/python/duui_ner.lua ./duui_ner.lua
+
+
+# log level
+ARG LOG_LEVEL="DEBUG"
+ENV LOG_LEVEL=$LOG_LEVEL
+
+# config: one model per container
+ARG MODEL_CACHE_SIZE=1
+ENV MODEL_CACHE_SIZE=$MODEL_CACHE_SIZE
+
+# meta data
+ARG ANNOTATOR_NAME="duui-ner"
+ENV ANNOTATOR_NAME=$ANNOTATOR_NAME
+ARG ANNOTATOR_VERSION="unset"
+ENV ANNOTATOR_VERSION=$ANNOTATOR_VERSION
+
+# Model info
+# Exactly ONE model per container. No comma-separated lists, no "all".
+ARG MODEL_NAME="wikineural-multilingual-ner"
+ENV MODEL_NAME=$MODEL_NAME
+ARG MODEL_VERSION="0.1"
+ENV MODEL_VERSION=$MODEL_VERSION
+ARG MODEL_SOURCE=""
+ENV MODEL_SOURCE=$MODEL_SOURCE
+ARG MODEL_LANG="multi"
+ENV MODEL_LANG=$MODEL_LANG
+
+# offline mode for HuggingFace/runtime
+ARG TRANSFORMERS_OFFLINE=1
+ENV TRANSFORMERS_OFFLINE=$TRANSFORMERS_OFFLINE
+ENV HF_HUB_OFFLINE=$TRANSFORMERS_OFFLINE
+
+ENTRYPOINT ["uvicorn", "duui_ner:app", "--host", "0.0.0.0", "--port", "9714"]
+CMD ["--workers", "1"]
diff --git a/duui-NER/src/main/docker/Dockerfile-cuda b/duui-NER/src/main/docker/Dockerfile-cuda
new file mode 100644
index 00000000..e64d175e
--- /dev/null
+++ b/duui-NER/src/main/docker/Dockerfile-cuda
@@ -0,0 +1,72 @@
+FROM nvidia/cuda:11.8.0-base-ubuntu22.04
+
+RUN apt update && \
+    DEBIAN_FRONTEND=noninteractive \
+    apt install --no-install-recommends -y build-essential software-properties-common && \
+    add-apt-repository -y ppa:deadsnakes/ppa && \
+    apt install --no-install-recommends -y python3.10 python3-pip python3-setuptools python3-distutils && \
+    apt clean && rm -rf /var/lib/apt/lists/*
+
+RUN ln -s /usr/bin/python3 /usr/bin/python
+RUN python -m pip install --upgrade pip
+
+WORKDIR /usr/src/app
+
+EXPOSE 9714
+
+# dependencies
+RUN pip install setuptools wheel
+COPY ./requirements.txt ./requirements.txt
+RUN apt remove -y python3-blinker || true
+RUN pip install -r requirements.txt
+
+# Preload exactly one model at build time (same choices as the CPU Dockerfile).
+# Enable the line that matches MODEL_NAME in docker_build.sh.
+# MODEL_NAME=gliner
+#RUN python -c "from gliner import GLiNER; GLiNER.from_pretrained(model_id='urchade/gliner_multi-v2.1', map_location='cpu')"
+# MODEL_NAME=gliner2
+#RUN python -c "from gliner2 import GLiNER2; GLiNER2.from_pretrained('fastino/gliner2-multi-v1')"
+# MODEL_NAME=roberta-ner-multilingual
+#RUN python -c "from transformers import pipeline; pipeline('token-classification', model='julian-schelb/roberta-ner-multilingual', aggregation_strategy='simple')"
+# MODEL_NAME=wikineural-multilingual-ner
+#RUN python -c "from transformers import pipeline; pipeline('token-classification', model='Babelscape/wikineural-multilingual-ner', aggregation_strategy='simple')"
+# MODEL_NAME=xlm-r-ner-40-lang
+RUN python -c "from transformers import pipeline; pipeline('token-classification', model='nbroad/jplu-xlm-r-ner-40-lang', aggregation_strategy='simple')"
+
+# copy DUUI NER scripts (the files this component actually ships)
+COPY ./src/main/python/TypeSystemNER.xml ./TypeSystemNER.xml
+COPY ./src/main/python/duui_ner.py ./duui_ner.py
+COPY ./src/main/python/ner_classification_backend.py ./ner_classification_backend.py
+COPY ./src/main/python/duui_ner.lua ./duui_ner.lua
+
+# log level
+ARG LOG_LEVEL="DEBUG"
+ENV LOG_LEVEL=$LOG_LEVEL
+
+# config: one model per container
+ARG MODEL_CACHE_SIZE=1
+ENV MODEL_CACHE_SIZE=$MODEL_CACHE_SIZE
+
+# meta data
+ARG ANNOTATOR_NAME="duui-ner"
+ENV ANNOTATOR_NAME=$ANNOTATOR_NAME
+ARG ANNOTATOR_VERSION="unset"
+ENV ANNOTATOR_VERSION=$ANNOTATOR_VERSION
+
+# Model info: exactly one model per container (no comma-separated lists, no "all")
+ARG MODEL_NAME="wikineural-multilingual-ner"
+ENV MODEL_NAME=$MODEL_NAME
+ARG MODEL_VERSION="0.1"
+ENV MODEL_VERSION=$MODEL_VERSION
+ARG MODEL_SOURCE=""
+ENV MODEL_SOURCE=$MODEL_SOURCE
+ARG MODEL_LANG="multi"
+ENV MODEL_LANG=$MODEL_LANG
+
+# offline mode for HuggingFace/runtime
+ARG TRANSFORMERS_OFFLINE=1
+ENV TRANSFORMERS_OFFLINE=$TRANSFORMERS_OFFLINE
+ENV HF_HUB_OFFLINE=$TRANSFORMERS_OFFLINE
+
+ENTRYPOINT ["uvicorn", "duui_ner:app", "--host", "0.0.0.0", "--port", "9714"]
+CMD ["--workers", "1"]
diff --git a/duui-NER/src/main/python/TypeSystemNER.xml b/duui-NER/src/main/python/TypeSystemNER.xml
new file mode 100644
index 00000000..dc052a36
--- /dev/null
+++ b/duui-NER/src/main/python/TypeSystemNER.xml
@@ -0,0 +1,132 @@
+ + + + + org.texttechnologylab.annotation.AnnotatorMetaData + + uima.cas.AnnotationBase + + + reference + + uima.cas.TOP + + + name + + uima.cas.String + + + version + + uima.cas.String + + + modelName + + uima.cas.String + + + modelVersion + + uima.cas.String + + + + + uima.tcas.DocumentAnnotation + + uima.tcas.Annotation + + + language + + uima.cas.String + + + + + org.texttechnologylab.annotation.DocumentModification + + uima.cas.AnnotationBase + + + user + +
uima.cas.String + + + timestamp + + uima.cas.Long + + + comment + + uima.cas.String + + + + + org.hucompute.textimager.uima.type.Sentiment + + uima.tcas.Annotation + + + sentiment + + uima.cas.Double + + + subjectivity + + uima.cas.Double + + + + + org.hucompute.textimager.uima.type.CategorizedSentiment + + org.hucompute.textimager.uima.type.Sentiment + + + pos + + uima.cas.Double + + + neu + + uima.cas.Double + + + neg + + uima.cas.Double + + + + + org.texttechnologylab.annotation.AnnotationComment + + uima.cas.AnnotationBase + + + reference + + uima.cas.TOP + + + value + + uima.cas.String + + + key + + uima.cas.String + + + + + diff --git a/duui-NER/src/main/python/duui_ner.lua b/duui-NER/src/main/python/duui_ner.lua new file mode 100644 index 00000000..f1c10dd5 --- /dev/null +++ b/duui-NER/src/main/python/duui_ner.lua @@ -0,0 +1,261 @@ +StandardCharsets = luajava.bindClass("java.nio.charset.StandardCharsets") +Class = luajava.bindClass("java.lang.Class") +JCasUtil = luajava.bindClass("org.apache.uima.fit.util.JCasUtil") +TopicUtils = luajava.bindClass("org.texttechnologylab.DockerUnifiedUIMAInterface.lua.DUUILuaUtils") + +DEFAULT_SELECTION = "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" + +-- Runtime defaults. These mirror the Python/ENV defaults and can be overwritten +-- with DUUI .withParameter(...). 
+DEFAULT_THRESHOLD = 0.5
+DEFAULT_BATCH_SIZE = 8
+DEFAULT_LABELS = "person,organization,location,date,event,product,taxon,other"
+
+-- Fully qualified UIMA type names used when creating annotations.
+DEFAULT_NER_TYPE = "de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity"
+TAXON_TYPE = "org.texttechnologylab.annotation.type.Taxon"
+
+-- Returns parameters[key] when the table and the key are both present,
+-- otherwise default_value.
+function get_parameter(parameters, key, default_value)
+    if parameters ~= nil and parameters[key] ~= nil then
+        return parameters[key]
+    end
+    return default_value
+end
+
+-- Converts value to a string; nil becomes the empty string.
+function safe_string(value)
+    if value == nil then
+        return ""
+    end
+    return tostring(value)
+end
+
+-- Builds the JSON request for the Python service and writes it to outputStream.
+-- The request contains one entry per selection (each with its text spans and
+-- offsets), the document language and length, and the validated runtime
+-- parameters threshold, batch_size and labels.
+-- Raises an error if threshold is not a number in [0.0, 1.0], if batch_size is
+-- not a number >= 1, or if labels is empty after trimming.
+function serialize(inputCas, outputStream, parameters)
+    local doc_lang = inputCas:getDocumentLanguage()
+    local doc_text = inputCas:getDocumentText()
+    -- NOTE: TopicUtils is bound to DUUILuaUtils at the top of the script; the
+    -- name is not topic-specific.
+    local doc_len = TopicUtils:getDocumentTextLength(inputCas)
+
+    local selection_types = get_parameter(parameters, "selection", DEFAULT_SELECTION)
+
+    -- Runtime parameters. Defaults are declared here and mirrored in Python.
+    -- DUUI parameters override these defaults.
+    local threshold_parameter = get_parameter(parameters, "threshold", DEFAULT_THRESHOLD)
+    local threshold = DEFAULT_THRESHOLD
+    if threshold_parameter ~= nil then
+        -- tonumber returns nil for values that cannot be parsed as a number.
+        threshold = tonumber(threshold_parameter)
+        if threshold == nil then
+            error("Parameter 'threshold' must be a number between 0.0 and 1.0", 2)
+        end
+        if threshold < 0.0 or threshold > 1.0 then
+            error("Parameter 'threshold' must be between 0.0 and 1.0", 2)
+        end
+    end
+
+    local batch_size_parameter = get_parameter(parameters, "batch_size", DEFAULT_BATCH_SIZE)
+    local batch_size = DEFAULT_BATCH_SIZE
+    if batch_size_parameter ~= nil then
+        batch_size = tonumber(batch_size_parameter)
+        if batch_size == nil then
+            error("Parameter 'batch_size' must be a positive integer", 2)
+        end
+        -- Fractional values are rounded down before the lower bound is checked.
+        batch_size = math.floor(batch_size)
+        if batch_size < 1 then
+            error("Parameter 'batch_size' must be greater than or equal to 1", 2)
+        end
+    end
+
+    local labels = get_parameter(parameters, "labels", DEFAULT_LABELS)
+    labels = safe_string(labels)
+    -- Trim leading and trailing whitespace.
+    labels = string.gsub(labels, "^%s*(.-)%s*$", "%1")
+    if labels == "" then
+        error("Parameter 'labels' must contain at least one label", 2)
+    end
+
+    local selections = {}
+    local selections_count = 1
+
+    -- "selection" may hold several comma-separated entries; each one becomes
+    -- its own element of the selections list.
+    for selection_type in string.gmatch(selection_types, "([^,]+)") do
+        selection_type = string.gsub(selection_type, "^%s*(.-)%s*$", "%1")
+
+        local sentences = {}
+        local sentences_count = 1
+
+        if selection_type == "text" then
+            -- Special value: send the whole document as a single span.
+            sentences[1] = {
+                text = doc_text,
+                begin = 0,
+                ['end'] = doc_len
+            }
+        else
+            -- Any other value is treated as a Java class name and selected from the CAS.
+            local clazz = Class:forName(selection_type)
+            local sentences_it = JCasUtil:select(inputCas, clazz):iterator()
+
+            while sentences_it:hasNext() do
+                local sentence = sentences_it:next()
+                sentences[sentences_count] = {
+                    text = sentence:getCoveredText(),
+                    begin = sentence:getBegin(),
+                    ['end'] = sentence:getEnd()
+                }
+                sentences_count = sentences_count + 1
+            end
+        end
+
+        selections[selections_count] = {
+            sentences = sentences,
+            selection = selection_type
+        }
+        selections_count = selections_count + 1
+    end
+
+    outputStream:write(json.encode({
+        selections = selections,
+        lang = doc_lang,
+        doc_len = doc_len,
+        threshold = threshold,
+        batch_size = batch_size,
+        labels = labels
+    }))
+end
+
+-- Adds a DocumentModification annotation built from results["modification_meta"].
+-- Does nothing when that field is missing. The creation runs inside pcall, so
+-- any error raised while creating or filling the annotation is discarded.
+function add_document_modification(inputCas, results)
+    if results["modification_meta"] == nil then
+        return
+    end
+
+    pcall(function()
+        local modification_meta = results["modification_meta"]
+        local modification_anno = luajava.newInstance("org.texttechnologylab.annotation.DocumentModification", inputCas)
+        modification_anno:setUser(safe_string(modification_meta["user"]))
+        modification_anno:setTimestamp(modification_meta["timestamp"])
+        modification_anno:setComment(safe_string(modification_meta["comment"]))
+        modification_anno:addToIndexes()
+    end)
+end
+
+-- Adds a model MetaData annotation filled from the model_* fields of results.
+-- Runs inside pcall, so errors are discarded. Returns the created instance, or
+-- nil if it could not be instantiated; if a later setter fails, the instance is
+-- returned without having been added to the indexes.
+function add_model_metadata(inputCas, results)
+    local model_meta = nil
+
+    pcall(function()
+        model_meta = luajava.newInstance("org.texttechnologylab.annotation.model.MetaData", inputCas)
+        model_meta:setModelVersion(safe_string(results["model_version"]))
+        model_meta:setModelName(safe_string(results["model_name"]))
+        model_meta:setSource(safe_string(results["model_source"]))
+        model_meta:setLang(safe_string(results["model_lang"]))
+        model_meta:addToIndexes()
+    end)
+
+    return model_meta
+end
+
+-- Attaches a key/value AnnotationComment to reference. Skipped when value is
+-- nil; errors during creation are discarded by pcall.
+function add_annotation_comment(inputCas, reference, key, value)
+    if value == nil then
+        return
+    end
+
+    pcall(function()
+        local comment = luajava.newInstance("org.texttechnologylab.annotation.AnnotationComment", inputCas)
+        comment:setReference(reference)
+        comment:setKey(safe_string(key))
+        comment:setValue(safe_string(value))
+        comment:addToIndexes()
+    end)
+end
+
+-- Creates one NER annotation from a tag table (fields begin, end, value and
+-- optionally ner_type, identifier), adds it to the indexes and returns it.
+-- The annotation type is tag.ner_type, overridden by TAXON_TYPE for the labels
+-- "taxon"/"taxa", with DEFAULT_NER_TYPE as fallback.
+function create_ner_annotation(inputCas, tag)
+    local tag_type = tag["ner_type"]
+    local value = string.lower(safe_string(tag["value"]))
+
+    -- If Python returns a taxon label, prefer the TTLab Taxon type.
+    -- This also covers older responses where ner_type was missing or still NamedEntity.
+    if value == "taxon" or value == "taxa" then
+        tag_type = TAXON_TYPE
+    elseif tag_type == nil or tag_type == "" then
+        tag_type = DEFAULT_NER_TYPE
+    end
+
+    local annotation = nil
+
+    -- Prefer the concrete NER subtype returned by Python, e.g. Person, Location, Organization, Taxon.
+    local ok = pcall(function()
+        annotation = luajava.newInstance(tag_type, inputCas)
+    end)
+
+    -- Fallback to generic NamedEntity if the subtype is not available in the active type system.
+    -- This second instantiation is not protected by pcall, so a failure here propagates.
+    if not ok or annotation == nil then
+        annotation = luajava.newInstance(DEFAULT_NER_TYPE, inputCas)
+    end
+
+    annotation:setBegin(tag["begin"])
+    annotation:setEnd(tag["end"])
+
+    -- DKPro NamedEntity and TTLab NamedEntity-like types usually provide setValue,
+    -- but keep this safe for custom types without this feature.
+    pcall(function()
+        annotation:setValue(safe_string(tag["value"]))
+    end)
+
+    if tag["identifier"] ~= nil then
+        pcall(function()
+            annotation:setIdentifier(safe_string(tag["identifier"]))
+        end)
+    end
+
+    annotation:addToIndexes()
+    return annotation
+end
+
+-- Returns the list of tag tables for a response. Uses results["tags"] when
+-- present; otherwise rebuilds the tags from the parallel lists begin, end,
+-- results, ner_type, covered_text, factors and model, indexed by position.
+function get_tags(results)
+    if results["tags"] ~= nil then
+        return results["tags"]
+    end
+
+    -- Fallback for the flattened response fields of duui_ner_single_model.py.
+    local tags = {}
+    local begins = results["begin"] or {}
+    local ends = results["end"] or {}
+    local labels = results["results"] or {}
+    local ner_types = results["ner_type"] or {}
+    local covered_texts = results["covered_text"] or {}
+    local factors = results["factors"] or {}
+    local models = results["model"] or {}
+
+    for i, label in ipairs(labels) do
+        tags[i] = {
+            begin = begins[i],
+            ['end'] = ends[i],
+            value = label,
+            ner_type = ner_types[i],
+            covered_text = covered_texts[i],
+            score = factors[i],
+            model_name = models[i]
+        }
+    end
+
+    return tags
+end
+
+-- Reads the JSON response of the Python service from inputStream (decoded as
+-- UTF-8) and writes its content into inputCas: document modification, model
+-- metadata, and one NER annotation per tag that has both begin and end.
+function deserialize(inputCas, inputStream)
+    local inputString = luajava.newInstance("java.lang.String", inputStream:readAllBytes(), StandardCharsets.UTF_8)
+    local results = json.decode(inputString)
+
+    if results == nil then
+        return
+    end
+
+    add_document_modification(inputCas, results)
+    -- NOTE: model_meta is assigned here but not read again in this function.
+    local model_meta = add_model_metadata(inputCas, results)
+
+    local tags = get_tags(results)
+
+    for i, tag in ipairs(tags) do
+        if tag["begin"] ~= nil and tag["end"] ~= nil then
+            local annotation = create_ner_annotation(inputCas, tag)
+
+            -- These comments are optional. They are only written if the TTLab AnnotationComment
+            -- type exists in the active type system.
+            add_annotation_comment(inputCas, annotation, "score", tag["score"])
+            add_annotation_comment(inputCas, annotation, "covered_text", tag["covered_text"])
+            add_annotation_comment(inputCas, annotation, "model_name", tag["model_name"])
+            add_annotation_comment(inputCas, annotation, "ner_type", tag["ner_type"])
+        end
+    end
+end
\ No newline at end of file
diff --git a/duui-NER/src/main/python/duui_ner.py b/duui-NER/src/main/python/duui_ner.py
new file mode 100644
index 00000000..2bad4fa0
--- /dev/null
+++ b/duui-NER/src/main/python/duui_ner.py
@@ -0,0 +1,525 @@
+from __future__ import annotations
+
+import logging
+from functools import lru_cache
+from threading import Lock
+from time import time
+from typing import Any, Dict, Final, Iterable, List, Optional, Tuple, Union
+
+import torch
+from fastapi import FastAPI, Response
+from fastapi.responses import JSONResponse, PlainTextResponse
+from pydantic import BaseModel
+
+try:
+    from pydantic_settings import BaseSettings
+except ImportError:  # pydantic v1 fallback
+    from pydantic import BaseSettings  # type: ignore
+
+from ner_classification_backend import MODEL_REGISTRY, create_ner_classifier, resolve_model_name
+
+
+# Module-level lock; the code that acquires it lies outside this part of the file.
+model_lock = Lock()
+
+
+# One text span sent by the Lua layer, with its character offsets in the document.
+class UimaSentence(BaseModel):
+    text: str
+    begin: int
+    end: int
+
+
+# All spans that belong to one "selection" value (a UIMA type name or "text").
+class UimaSentenceSelection(BaseModel):
+    selection: str
+    sentences: List[UimaSentence]
+
+
+# Service configuration. Values come from environment variables (no prefix,
+# case-insensitive names, see Config below) and fall back to the defaults here.
+class Settings(BaseSettings):
+    annotator_name: str = "DUUI NER"
+    annotator_version: str = "0.1.0"
+    log_level: str = "INFO"
+
+    # Exactly one model per container.
+    # Use one registry alias or one exact HuggingFace model id.
+    # Comma-separated lists and "all" are intentionally rejected.
+    model_name: str = "wikineural-multilingual-ner"
+    model_version: str = "latest"
+    # Optional HuggingFace revision/commit hash. Empty/default values are ignored.
+    # NOTE(review): the comment above appears to describe model_version, not
+    # model_cache_size - confirm.
+    model_cache_size: int = 1
+    model_source: str = ""
+    model_lang: str = "multi"
+
+    # Labels are used by GLiNER/GLiNER2. HF token-classification ignores them.
+    ner_labels: str = "person,organization,location,date,event,product,taxon,other"
+
+    threshold: float = 0.5
+    batch_size: int = 8
+
+    # Files read at import time and served by the /v1/typesystem and
+    # /v1/communication_layer endpoints.
+    typesystem_filename: str = "TypeSystemNER.xml"
+    lua_communication_script_filename: str = "duui_ner.lua"
+
+    class Config:
+        env_prefix = ""
+        case_sensitive = False
+
+
+settings = Settings()
+# Unknown log level names fall back to INFO.
+logging.basicConfig(level=getattr(logging, settings.log_level.upper(), logging.INFO))
+logger = logging.getLogger(__name__)
+
+# lru_cache decorator sized from model_cache_size, never smaller than one entry.
+lru_cache_with_size = lru_cache(maxsize=max(1, settings.model_cache_size))
+# Use the first CUDA device when torch reports one, otherwise the CPU.
+device = "cuda:0" if torch.cuda.is_available() else "cpu"
+logger.info("USING %s", device)
+
+
+# Request body as produced by the Lua serialize function.
+class DUUIRequest(BaseModel):
+    doc_len: Optional[int] = None
+    lang: Optional[str] = None
+    selections: List[UimaSentenceSelection]
+
+    # Runtime parameters passed through the Lua layer from .withParameter(...).
+    # If omitted, the defaults from Settings are used.
+    threshold: Optional[float] = None
+    batch_size: Optional[int] = None
+    labels: Optional[Union[str, List[str]]] = None
+
+
+# Provenance record; read by the Lua layer from "modification_meta".
+class DocumentModification(BaseModel):
+    user: str
+    timestamp: int
+    comment: str
+
+
+# Annotator and model identification returned with a response.
+class AnnotationMeta(BaseModel):
+    name: str
+    version: str
+    modelName: str
+    modelVersion: str
+
+
+# Short type name -> fully qualified UIMA type name (DKPro NER types plus the
+# TTLab Taxon type).
+ner_types: Final[Dict[str, str]] = {
+    "Animal": "de.tudarmstadt.ukp.dkpro.core.api.ner.type.Animal",
+    "Cardinal": "de.tudarmstadt.ukp.dkpro.core.api.ner.type.Cardinal",
+    "ContactInfo": "de.tudarmstadt.ukp.dkpro.core.api.ner.type.ContactInfo",
+    "Date": "de.tudarmstadt.ukp.dkpro.core.api.ner.type.Date",
+    "Disease": "de.tudarmstadt.ukp.dkpro.core.api.ner.type.Disease",
+    "Event": "de.tudarmstadt.ukp.dkpro.core.api.ner.type.Event",
+    "Fac": "de.tudarmstadt.ukp.dkpro.core.api.ner.type.Fac",
+    "FacDesc": "de.tudarmstadt.ukp.dkpro.core.api.ner.type.FacDesc",
+    "Game": "de.tudarmstadt.ukp.dkpro.core.api.ner.type.Game",
+    "Gpe": "de.tudarmstadt.ukp.dkpro.core.api.ner.type.Gpe",
+    "GpeDesc": "de.tudarmstadt.ukp.dkpro.core.api.ner.type.GpeDesc",
+    "Language": "de.tudarmstadt.ukp.dkpro.core.api.ner.type.Language",
+    "Law": "de.tudarmstadt.ukp.dkpro.core.api.ner.type.Law",
+    "Location": "de.tudarmstadt.ukp.dkpro.core.api.ner.type.Location",
+    "Money": "de.tudarmstadt.ukp.dkpro.core.api.ner.type.Money",
+    "NamedEntity": "de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity",
+    "Taxon": "org.texttechnologylab.annotation.type.Taxon",
+    "Nationality": "de.tudarmstadt.ukp.dkpro.core.api.ner.type.Nationality",
+    "Norp": "de.tudarmstadt.ukp.dkpro.core.api.ner.type.Norp",
+    "Ordinal": "de.tudarmstadt.ukp.dkpro.core.api.ner.type.Ordinal",
+    "OrgDesc": "de.tudarmstadt.ukp.dkpro.core.api.ner.type.OrgDesc",
+    "Organization": "de.tudarmstadt.ukp.dkpro.core.api.ner.type.Organization",
+    "PerDesc": "de.tudarmstadt.ukp.dkpro.core.api.ner.type.PerDesc",
+    "Percent": "de.tudarmstadt.ukp.dkpro.core.api.ner.type.Percent",
+    "Person": "de.tudarmstadt.ukp.dkpro.core.api.ner.type.Person",
+    "Plant": "de.tudarmstadt.ukp.dkpro.core.api.ner.type.Plant",
+    "Product": "de.tudarmstadt.ukp.dkpro.core.api.ner.type.Product",
+    "ProductDesc": "de.tudarmstadt.ukp.dkpro.core.api.ner.type.ProductDesc",
+    "Quantity": "de.tudarmstadt.ukp.dkpro.core.api.ner.type.Quantity",
+    "Substance": "de.tudarmstadt.ukp.dkpro.core.api.ner.type.Substance",
+    "Time": "de.tudarmstadt.ukp.dkpro.core.api.ner.type.Time",
+    "WorkOfArt": "de.tudarmstadt.ukp.dkpro.core.api.ner.type.WorkOfArt",
+}
+# Generic fallback type for labels without a dedicated subtype.
+ner_base_type: Final[str] = ner_types["NamedEntity"]
+
+# Model label -> key of ner_types. get_ner_type consults it first with the label
+# as given, then with the upper-cased label.
+ner_tag_map: Final[Dict[str, str]] = {
+    "PER": "Person",
+    "PERSON": "Person",
+    "person": "Person",
+    "ORG": "Organization",
+    "ORGANIZATION": "Organization",
+    "organization": "Organization",
+    "LOC": "Location",
+    "LOCATION": "Location",
+    "location": "Location",
+    "GPE": "Gpe",
+    "date": "Date",
+    "DATE": "Date",
+    "event": "Event",
+    "EVENT": "Event",
+    "product": "Product",
+    "PRODUCT": "Product",
+    "taxon": "Taxon",
+    "TAXON": "Taxon",
+    "Taxon": "Taxon",
+    "taxa": "Taxon",
+    "TAXA": "Taxon",
+    "other": "NamedEntity",
+    "MISC": "NamedEntity",
+}
+
+
+# One recognised entity; ner_type defaults to the generic NamedEntity type.
+class DkproNer(BaseModel):
+    begin: int
+    end: int
+    value: str
+    identifier: Optional[str] = None
+    ner_type: str = ner_base_type
+    covered_text: Optional[str] = None
+    score: Optional[float] = None
+    model_name: Optional[str] = None
+
+
+# Response body. It carries the entities twice: as parallel flattened lists
+# (begin, end, results, ...) and as the structured `tags` list; the Lua layer
+# prefers `tags` and only falls back to the flattened lists.
+class DUUIResponse(BaseModel):
+    meta: AnnotationMeta
+    modification_meta: DocumentModification
+    begin: List[int]
+    end: List[int]
+    results: List[str]
+    factors: List[float]
+    len_results: List[int]
+    ner_type: List[str]
+    covered_text: List[str]
+    model: List[str]
+    tags: List[DkproNer]
+    model_name: str
+    model_version: str
+    model_source: str
+    model_lang: str
+
+
+# Capability section of the documentation response.
+class TextImagerCapability(BaseModel):
+    supported_languages: List[str]
+    reproducible: bool
+
+
+# Payload of the /v1/documentation endpoint.
+class TextImagerDocumentation(BaseModel):
+    annotator_name: str
+    version: str
+    implementation_lang: str
+    meta: Dict[str, Any]
+    docker_container_id: Optional[str]
+    parameters: Dict[str, Any]
+    capability: TextImagerCapability
+    implementation_specific: Optional[str]
+
+
+def read_required_text_file(filename: str) -> str:
+    """Read a UTF-8 text file and return its content.
+
+    Raises:
+        RuntimeError: if the file is empty or contains only whitespace. The
+            message names the Lua communication script, the one file this
+            helper is used for in the visible code.
+        OSError: propagated from ``open`` if the file cannot be read.
+    """
+    with open(filename, "r", encoding="utf-8") as f:
+        content = f.read()
+    if not content.strip():
+        raise RuntimeError(f"Required Lua communication script is empty: {filename}")
+    return content
+
+
+def read_required_binary_file(filename: str) -> bytes:
+    """Read a file in binary mode and return its bytes.
+
+    Raises:
+        RuntimeError: if the file is empty or contains only whitespace bytes.
+            The message names the UIMA type system XML, the one file this
+            helper is used for in the visible code.
+        OSError: propagated from ``open`` if the file cannot be read.
+    """
+    with open(filename, "rb") as f:
+        content = f.read()
+    if not content.strip():
+        raise RuntimeError(f"Required UIMA type system XML is empty: {filename}")
+    return content
+
+
+# Both files are loaded once at import time, so a missing or empty file stops
+# the service at start-up rather than on the first request.
+lua_communication_script = read_required_text_file(settings.lua_communication_script_filename)
+type_system = read_required_binary_file(settings.typesystem_filename)
+
+
+# FastAPI application; the interactive API docs are served under /api.
+app = FastAPI(
+    openapi_url="/openapi.json",
+    docs_url="/api",
+    redoc_url=None,
+    title=settings.annotator_name,
+    description="DUUI NER annotator using the second code as backend",
+    version=settings.annotator_version,
+    terms_of_service="https://www.texttechnologylab.org/legal_notice/",
+    contact={
+        "name": "TTLab Team",
+        "url": "https://texttechnologylab.org",
+        "email": "bagci@em.uni-frankfurt.de",
+    },
+    license_info={
+        "name": "AGPL",
+        "url": "http://www.gnu.org/licenses/agpl-3.0.en.html",
+    },
+)
+
+
+# Serves the UIMA type system XML loaded at start-up.
+@app.get("/v1/typesystem")
+def get_typesystem() -> Response:
+    return Response(content=type_system, media_type="application/xml")
+
+
+# Serves the Lua communication script loaded at start-up as plain text.
+@app.get("/v1/communication_layer", response_class=PlainTextResponse)
+def get_communication_layer() -> str:
+    return lua_communication_script
+
+
+# Describes the annotator: the device in use, the model registry, the selected
+# model and the configured default parameters.
+@app.get("/v1/documentation")
+def get_documentation() -> TextImagerDocumentation:
+    selected_model = get_selected_model_name(settings.model_name)
+    _, selected_cfg = resolve_model_name(selected_model)
+    return TextImagerDocumentation(
+        annotator_name=settings.annotator_name,
+        version=settings.annotator_version,
+        implementation_lang="Python",
+        meta={
+            "device": device,
+            "available_models": MODEL_REGISTRY,
+            "selected_model": selected_model,
+            "selected_model_id": selected_cfg.get("model_id", selected_model),
+            "backend_module": "ner_classification_backend.py",
+            "ner_labels": get_ner_labels(),
+        },
+        docker_container_id=None,
+        parameters={
+            "model_name": "exactly one registry alias or one exact HuggingFace model id",
+            "threshold": settings.threshold,
+            "batch_size": settings.batch_size,
+            "labels": settings.ner_labels,
+        },
+        capability=TextImagerCapability(supported_languages=["multi"], reproducible=True),
+        implementation_specific=None,
+    )
+
+
+def parse_ner_labels(labels: Optional[Union[str, List[str]]] = None) -> List[str]:
+    """Normalise a label specification into a list of non-empty, stripped labels.
+
+    Args:
+        labels: a list of labels, a comma-separated string, or ``None`` to use
+            ``settings.ner_labels``.
+
+    Returns:
+        The labels in their original order; entries that are empty after
+        stripping are dropped.
+    """
+    raw_labels: Union[str, List[str]] = settings.ner_labels if labels is None else labels
+    if isinstance(raw_labels, list):
+        return [str(label).strip() for label in raw_labels if str(label).strip()]
+    return [label.strip() for label in str(raw_labels).split(",") if label.strip()]
+
+
+def get_ner_labels() -> List[str]:
+    """Return the configured default labels (``settings.ner_labels``) as a list."""
+    return parse_ner_labels(None)
+
+
+def get_selected_model_name(model_name: str) -> str:
+    """Return exactly one configured model for this container.
+
+    DUUI should start one container per model. Therefore values such as
+    "all" or "model_a,model_b" are rejected deliberately.
+
+    An empty or whitespace-only name falls back to
+    "wikineural-multilingual-ner". The returned value is the alias reported by
+    ``resolve_model_name``.
+
+    Raises:
+        ValueError: if ``model_name`` is "all" (any case) or contains a comma.
+    """
+    selected = (model_name or "").strip()
+    if not selected:
+        selected = "wikineural-multilingual-ner"
+
+    if selected.lower() == "all" or "," in selected:
+        supported = ", ".join(sorted(MODEL_REGISTRY.keys()))
+        raise ValueError(
+            "This DUUI container supports exactly one MODEL_NAME. "
+            "Start one container per model instead of using 'all' or comma-separated lists. "
+            f"Supported aliases: {supported}"
+        )
+
+    # Validate alias or exact HuggingFace model id.
+    # NOTE(review): how resolve_model_name reports an unknown name is defined in
+    # ner_classification_backend and not visible here.
+    alias, _ = resolve_model_name(selected)
+    return alias
+
+
+def get_ner_type(o_tag: str) -> str:
+    """Map a model label to a fully qualified UIMA type name.
+
+    Lookup order: the stripped label in ``ner_tag_map``; its upper-cased form in
+    ``ner_tag_map``; finally the label converted to CamelCase (split on "-" and
+    "_", each part title-cased) looked up directly in ``ner_types``. Falls back
+    to ``ner_base_type`` when nothing matches or the label is empty.
+    """
+    if not o_tag:
+        return ner_base_type
+
+    label = o_tag.strip()
+    if label in ner_tag_map:
+        return ner_types.get(ner_tag_map[label], ner_base_type)
+
+    upper_label = label.upper()
+    if upper_label in ner_tag_map:
+        return ner_types.get(ner_tag_map[upper_label], ner_base_type)
+
+    # e.g. "work_of_art" -> "WorkOfArt"
+    tag = "".join(map(str.title, label.replace("-", "_").split("_")))
+    return ner_types.get(tag, ner_base_type)
+
+
+@lru_cache_with_size
+def load_model(model_name: str):
+    # This is the key integration point: DUUI loads the second-code backend here.
+ return create_ner_classifier(model_name, device=device) + + +def fix_unicode_problems(text: str) -> str: + return text.encode("utf-16", "surrogatepass").decode("utf-16", "surrogateescape") + + +def iter_batches(items: List[UimaSentence], batch_size: int) -> Iterable[List[UimaSentence]]: + size = max(1, int(batch_size)) + for start in range(0, len(items), size): + yield items[start:start + size] + + +def process_selection( + model_name: str, + selection: UimaSentenceSelection, + labels: List[str], + threshold: float, + batch_size: int, + model_version: str = "", +) -> Dict[str, Any]: + begin: List[int] = [] + end: List[int] = [] + results_out: List[str] = [] + factors: List[float] = [] + len_results: List[int] = [] + ner_type_out: List[str] = [] + covered_text_out: List[str] = [] + model_out: List[str] = [] + tags: List[DkproNer] = [] + + for s in selection.sentences: + s.text = fix_unicode_problems(s.text) + + for batch in iter_batches(selection.sentences, batch_size): + texts = [s.text for s in batch] + + with model_lock: + classifier = load_model(model_name) + predictions = classifier.predict( + texts, + labels=labels, + threshold=threshold, + batch_size=batch_size, + ) + + for sentence, sentence_entities in zip(batch, predictions): + for ent in sentence_entities: + rel_start = int(ent["start"]) + rel_end = int(ent["end"]) + if rel_end <= rel_start: + continue + + abs_begin = sentence.begin + rel_start + abs_end = sentence.begin + rel_end + value = str(ent["label"]) + score = float(ent.get("score", 0.0)) + covered = str(ent.get("text", sentence.text[rel_start:rel_end])) + ner_type = get_ner_type(value) + entity_model_name = str(ent.get("model_name", model_name)) + + tag = DkproNer( + begin=abs_begin, + end=abs_end, + value=value, + identifier=None, + ner_type=ner_type, + covered_text=covered, + score=score, + model_name=entity_model_name, + ) + + begin.append(abs_begin) + end.append(abs_end) + results_out.append(value) + factors.append(score) + 
len_results.append(1) + ner_type_out.append(ner_type) + covered_text_out.append(covered) + model_out.append(entity_model_name) + tags.append(tag) + + return { + "begin": begin, + "end": end, + "results": results_out, + "factors": factors, + "len_results": len_results, + "ner_type": ner_type_out, + "covered_text": covered_text_out, + "model": model_out, + "tags": tags, + } + + +def model_meta_values(model_name: str) -> Tuple[str, str]: + _, cfg = resolve_model_name(model_name) + model_source = settings.model_source or cfg.get("model_source", "") + model_lang = settings.model_lang or cfg.get("model_lang", "multi") + return model_source, model_lang + + +@app.post("/v1/process", response_model=DUUIResponse) +def post_process(request: DUUIRequest): + if not request.selections: + return JSONResponse(status_code=400, content={"message": "The request must contain sentence selections."}) + + try: + model_name = get_selected_model_name(settings.model_name) + except Exception as ex: + return JSONResponse(status_code=400, content={"message": str(ex)}) + + effective_threshold = request.threshold if request.threshold is not None else settings.threshold + effective_batch_size = request.batch_size if request.batch_size is not None else settings.batch_size + effective_labels = parse_ner_labels(request.labels) + + if effective_threshold < 0.0 or effective_threshold > 1.0: + return JSONResponse(status_code=400, content={"message": "threshold must be between 0.0 and 1.0"}) + if effective_batch_size < 1: + return JSONResponse(status_code=400, content={"message": "batch_size must be >= 1"}) + if not effective_labels: + return JSONResponse(status_code=400, content={"message": "labels must not be empty"}) + + modification_timestamp_seconds = int(time()) + meta_model_name = model_name + meta = AnnotationMeta( + name=settings.annotator_name, + version=settings.annotator_version, + modelName=meta_model_name, + modelVersion=settings.model_version, + ) + modification_meta = 
DocumentModification( + user=settings.annotator_name, + timestamp=modification_timestamp_seconds, + comment=f"{settings.annotator_name} ({settings.annotator_version})", + ) + + begin: List[int] = [] + end: List[int] = [] + len_results: List[int] = [] + results: List[str] = [] + factors: List[float] = [] + ner_type_out: List[str] = [] + covered_text_out: List[str] = [] + model_out: List[str] = [] + tags: List[DkproNer] = [] + + try: + for selection in request.selections: + processed = process_selection( + model_name=model_name, + selection=selection, + labels=effective_labels, + threshold=effective_threshold, + batch_size=effective_batch_size, + model_version=settings.model_version, + ) + begin += processed["begin"] + end += processed["end"] + len_results += processed["len_results"] + results += processed["results"] + factors += processed["factors"] + ner_type_out += processed["ner_type"] + covered_text_out += processed["covered_text"] + model_out += processed["model"] + tags += processed["tags"] + + model_source, model_lang = model_meta_values(model_name) + return DUUIResponse( + meta=meta, + modification_meta=modification_meta, + begin=begin, + end=end, + results=results, + factors=factors, + len_results=len_results, + ner_type=ner_type_out, + covered_text=covered_text_out, + model=model_out, + tags=tags, + model_name=meta_model_name, + model_version=settings.model_version, + model_source=model_source, + model_lang=model_lang, + ) + except Exception as ex: + logger.exception("NER processing failed") + return JSONResponse(status_code=500, content={"message": str(ex)}) \ No newline at end of file diff --git a/duui-NER/src/main/python/ner_classification_backend.py b/duui-NER/src/main/python/ner_classification_backend.py new file mode 100644 index 00000000..9a36373b --- /dev/null +++ b/duui-NER/src/main/python/ner_classification_backend.py @@ -0,0 +1,396 @@ +from __future__ import annotations + +from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple + 
+import torch + + +MODEL_REGISTRY: Dict[str, Dict[str, str]] = { + "gliner": { + "backend": "gliner", + "model_id": "urchade/gliner_multi-v2.1", + "model_source": "https://huggingface.co/urchade/gliner_multi-v2.1", + "model_lang": "multi", + }, + "gliner2": { + "backend": "gliner2", + "model_id": "fastino/gliner2-multi-v1", + "model_source": "https://huggingface.co/fastino/gliner2-multi-v1", + "model_lang": "multi", + }, + "roberta-ner-multilingual": { + "backend": "hf_token_classification", + "model_id": "julian-schelb/roberta-ner-multilingual", + "model_source": "https://huggingface.co/julian-schelb/roberta-ner-multilingual", + "model_lang": "multi", + }, + "wikineural-multilingual-ner": { + "backend": "hf_token_classification", + "model_id": "Babelscape/wikineural-multilingual-ner", + "model_source": "https://huggingface.co/Babelscape/wikineural-multilingual-ner", + "model_lang": "multi", + }, + "xlm-r-ner-40-lang": { + "backend": "hf_token_classification", + "model_id": "nbroad/jplu-xlm-r-ner-40-lang", + "model_source": "https://huggingface.co/nbroad/jplu-xlm-r-ner-40-lang", + "model_lang": "multi", + }, +} + + +def resolve_model_name(model_name: str) -> Tuple[str, Dict[str, str]]: + """Resolve a short alias or exact HuggingFace model id to a registry entry.""" + model_name = (model_name or "").strip() + if model_name in MODEL_REGISTRY: + return model_name, MODEL_REGISTRY[model_name] + + for alias, cfg in MODEL_REGISTRY.items(): + if model_name == cfg["model_id"]: + return alias, cfg + + supported = sorted(list(MODEL_REGISTRY.keys()) + [cfg["model_id"] for cfg in MODEL_REGISTRY.values()]) + raise ValueError(f"Unsupported model_name '{model_name}'. 
Supported values: {', '.join(supported)}") + + +def _entity_label(ent: Dict[str, Any], label_hint: Optional[str]) -> str: + return str( + ent.get( + "label", + ent.get( + "entity_group", + ent.get("entity", ent.get("class", ent.get("type", label_hint or "NamedEntity"))), + ), + ) + ) + + +def _entity_score(ent: Dict[str, Any]) -> float: + try: + return float(ent.get("score", ent.get("confidence", ent.get("probability", 1.0))) or 0.0) + except Exception: + return 0.0 + + +def _entity_start_end(ent: Dict[str, Any]) -> Tuple[int, int]: + try: + start = int(ent.get("start", ent.get("start_pos", ent.get("span_start", 0))) or 0) + except Exception: + start = 0 + try: + end = int(ent.get("end", ent.get("end_pos", ent.get("span_end", 0))) or 0) + except Exception: + end = 0 + return start, end + + +def _flatten_raw_entities(raw: Any, label_hint: Optional[str] = None) -> Iterable[Dict[str, Any]]: + """ + Flatten all known NER output variants into raw entity dicts. + + Supported shapes: + - [{"text": ..., "start": ..., "end": ..., "label": ...}, ...] + - {"entities": [{...}, ...]} + - {"entities": {"taxon": [{...}], "location": [{...}]}} + - {"taxon": [{...}], "location": [{...}]} # defensive fallback + """ + if raw is None: + return + + if isinstance(raw, list): + for item in raw: + yield from _flatten_raw_entities(item, label_hint=label_hint) + return + + if not isinstance(raw, dict): + return + + # GLiNER2 format_results=True: {"entities": {label: [entities...]}} + entities_value = raw.get("entities") + if isinstance(entities_value, dict): + for label, entities in entities_value.items(): + yield from _flatten_raw_entities(entities, label_hint=str(label)) + return + + # Alternative wrapper: {"entities": [entities...]} + if isinstance(entities_value, list): + yield from _flatten_raw_entities(entities_value, label_hint=label_hint) + return + + # Normal flat entity dictionary. 
+ if any(key in raw for key in ("start", "end", "text", "word", "label", "entity_group", "entity")): + ent = dict(raw) + if label_hint and not any(k in ent for k in ("label", "entity_group", "entity", "class", "type")): + ent["label"] = label_hint + yield ent + return + + # Defensive fallback for a direct label -> entities dict. + for label, value in raw.items(): + if isinstance(value, list): + yield from _flatten_raw_entities(value, label_hint=str(label)) + + +def normalize_entity_output( + sentence: str, + entities: Any, + model_name: str, + model_id: str, +) -> List[Dict[str, Any]]: + """Normalize GLiNER, GLiNER2 and HF-like entity outputs to one common format.""" + normalized: List[Dict[str, Any]] = [] + + for ent in _flatten_raw_entities(entities): + if not isinstance(ent, dict): + continue + + start, end = _entity_start_end(ent) + label = _entity_label(ent, None) + score = _entity_score(ent) + covered = str(ent.get("text", ent.get("word", "")) or "") + + if not covered and 0 <= start < end <= len(sentence): + covered = sentence[start:end] + + if end <= start and covered: + # Fallback for outputs that contain text but no valid span. + idx = sentence.find(covered) + if idx >= 0: + start = idx + end = idx + len(covered) + + if end <= start: + # DUUI needs valid spans. + continue + + normalized.append( + { + "text": covered, + "label": label, + "score": score, + "start": start, + "end": end, + "model_name": model_name, + "model_id": model_id, + } + ) + + return normalized + + +def _normalize_sentence_outputs(outputs: Any, text_count: int) -> List[Any]: + """Return exactly one raw output object per input sentence where possible.""" + if outputs is None: + sentence_outputs: List[Any] = [] + elif isinstance(outputs, dict): + sentence_outputs = [outputs] + elif isinstance(outputs, list): + # If there is one input sentence and the model returns a flat entity list, + # keep that flat list as the single sentence output. 
+ if text_count == 1: + if outputs and all(isinstance(item, dict) for item in outputs): + if not (len(outputs) == 1 and "entities" in outputs[0]): + # Could be a flat list of entity dicts. + sentence_outputs = [outputs] + else: + sentence_outputs = outputs + else: + sentence_outputs = [outputs] + else: + sentence_outputs = outputs + else: + sentence_outputs = [] + + # Pad/truncate defensively so zip(text, sentence_outputs) cannot silently skip + # sentences or shift later output. + if len(sentence_outputs) < text_count: + sentence_outputs = sentence_outputs + [None] * (text_count - len(sentence_outputs)) + elif len(sentence_outputs) > text_count: + sentence_outputs = sentence_outputs[:text_count] + + return sentence_outputs + + +class NERClassificationGLiNER: + def __init__(self, model_name: str = "urchade/gliner_multi-v2.1", device: str = "cuda"): + from gliner import GLiNER + + self.model_name = "gliner" + self.model_id = model_name + self.device = device + self.model = GLiNER.from_pretrained(model_id=model_name, map_location=device) + + def predict( + self, + text: List[str], + labels: List[str], + threshold: float = 0.5, + batch_size: int = 8, + ) -> List[List[Dict[str, Any]]]: + results: List[List[Dict[str, Any]]] = [] + with torch.no_grad(): + for sentence in text: + try: + output = self.model.predict_entities( + sentence, + labels, + threshold=threshold, + multi_label=True, + return_class_probs=True, + ) + except TypeError: + output = self.model.predict_entities( + sentence, + labels, + multi_label=True, + return_class_probs=True, + ) + results.append(normalize_entity_output(sentence, output, self.model_name, self.model_id)) + return results + + +class NERClassificationGLiNER2: + def __init__(self, model_name: str = "fastino/gliner2-multi-v1", device: str = "cuda"): + from gliner2 import GLiNER2 + + self.model_name = "gliner2" + self.model_id = model_name + self.device = device + self.model = GLiNER2.from_pretrained(model_name) + if hasattr(self.model, 
"to"): + self.model.to(device) + + def predict( + self, + text: List[str], + labels: List[str], + threshold: float = 0.5, + batch_size: int = 8, + ) -> List[List[Dict[str, Any]]]: + with torch.no_grad(): + outputs = self.model.batch_extract_entities( + text, + labels, + batch_size, + threshold=threshold, + format_results=True, + include_confidence=True, + include_spans=True, + ) + + # GLiNER2 already returns one dict per input sentence when text is a list: + # [{"entities": {"taxon": [...], ...}}, ...] + # Never wrap this full list again. + sentence_outputs = _normalize_sentence_outputs(outputs, len(text)) + + return [ + normalize_entity_output(sentence, sentence_output, self.model_name, self.model_id) + for sentence, sentence_output in zip(text, sentence_outputs) + ] + + +class NERClassification: + def __init__(self, model_name: str, device: str = "cpu"): + from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline + + self.model_name = model_name + self.model_id = model_name + self.device = device + + self.model = AutoModelForTokenClassification.from_pretrained(model_name) + try: + self.tokenizer = AutoTokenizer.from_pretrained( + model_name, + use_fast=True, + add_prefix_space=True, + ) + except TypeError: + self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True) + + self.model.to(device) + self.model.eval() + + pipe_device = 0 if str(device).startswith("cuda") else -1 + self.ner_pipeline = pipeline( + "token-classification", + model=self.model, + tokenizer=self.tokenizer, + aggregation_strategy="simple", + device=pipe_device, + ) + + def predict( + self, + text: List[str], + labels: Optional[List[str]] = None, + threshold: float = 0.5, + batch_size: int = 8, + ) -> List[List[Dict[str, Any]]]: + with torch.no_grad(): + outputs = self.ner_pipeline(text, batch_size=batch_size) + + sentence_outputs = _normalize_sentence_outputs(outputs, len(text)) + return [ + normalize_entity_output(sentence, sentence_output, 
self.model_name, self.model_id) + for sentence, sentence_output in zip(text, sentence_outputs) + ] + + +def create_ner_classifier( + model_name: str, + device: str = "cpu", + model_version: Optional[str] = None, +): + """ + Factory used by DUUI. Returns the GLiNER, GLiNER2 or HF token-classification classifier chosen by the registry backend. + + model_version is accepted for API consistency with DUUI metadata. It is not + passed as a HuggingFace revision here, because MODEL_VERSION is metadata in + the Docker build setup (this function never reads model_version). + """ + alias, cfg = resolve_model_name(model_name) + backend = cfg["backend"] + model_id = cfg["model_id"] + + if backend == "gliner": + classifier = NERClassificationGLiNER(model_name=model_id, device=device) + classifier.model_name = alias + return classifier + + if backend == "gliner2": + classifier = NERClassificationGLiNER2(model_name=model_id, device=device) + classifier.model_name = alias + return classifier + + if backend == "hf_token_classification": + classifier = NERClassification(model_name=model_id, device=device) + classifier.model_name = alias + return classifier + + raise ValueError(f"Unsupported backend '{backend}' for model '{model_name}'") + + +def predict_ner( + model_name: str, + texts: List[str], + labels: Optional[List[str]] = None, + device: str = "cpu", + threshold: float = 0.5, + batch_size: int = 8, +) -> List[List[Dict[str, Any]]]: + """Convenience function for standalone use outside DUUI.""" + classifier = create_ner_classifier(model_name, device=device) + return classifier.predict(texts, labels or [], threshold=threshold, batch_size=batch_size) + + +if __name__ == "__main__": + textes = [ + "Dr. 
Anna Weber untersuchte für BioFID eine Streuobstwiese bei Frankfurt am Main und einen Buchenwald im Taunus.", + "Auf der Wiese fand sie Apis mellifera, Bombus terrestris, Papilio machaon und Vanessa atalanta.", + ] + labels = ["person", "organization", "location", "date", "event", "product", "taxon", "other"] + device_i = "cuda" if torch.cuda.is_available() else "cpu" + + for name in ["roberta-ner-multilingual", "wikineural-multilingual-ner"]: + print(name) + print(predict_ner(name, textes, labels=labels, device=device_i)) \ No newline at end of file diff --git a/duui-NER/src/test/java/org/hucompute/textimager/uima/NER/NERTest.java b/duui-NER/src/test/java/org/hucompute/textimager/uima/NER/NERTest.java new file mode 100644 index 00000000..fea607d6 --- /dev/null +++ b/duui-NER/src/test/java/org/hucompute/textimager/uima/NER/NERTest.java @@ -0,0 +1,167 @@ +package org.hucompute.textimager.uima.NER; + +import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import org.apache.commons.compress.compressors.CompressorException; +import org.apache.uima.UIMAException; +import org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.apache.uima.util.XmlCasSerializer; +import org.junit.jupiter.api.*; +import org.texttechnologylab.DockerUnifiedUIMAInterface.DUUIComposer; +import org.texttechnologylab.DockerUnifiedUIMAInterface.driver.DUUIRemoteDriver; +import org.texttechnologylab.DockerUnifiedUIMAInterface.lua.DUUILuaContext; +import org.xml.sax.SAXException; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.net.URISyntaxException; +import java.net.UnknownHostException; +import java.nio.charset.StandardCharsets; +import java.util.*; + +import static org.junit.jupiter.api.Assertions.*; + +public class NERTest { + static DUUIComposer composer; + static JCas cas; + + static String url = 
"http://127.0.0.1:8000"; + + @BeforeAll + static void beforeAll() throws URISyntaxException, IOException, UIMAException, SAXException, CompressorException { + composer = new DUUIComposer() + .withSkipVerification(true) + .withLuaContext(new DUUILuaContext().withJsonLibrary()); + + DUUIRemoteDriver remoteDriver = new DUUIRemoteDriver(); + composer.addDriver(remoteDriver); + + cas = JCasFactory.createJCas(); + } + + @AfterAll + static void afterAll() throws UnknownHostException { + composer.shutdown(); + } + + @AfterEach + public void afterEach() throws IOException, SAXException { + composer.resetPipeline(); + + ByteArrayOutputStream stream = new ByteArrayOutputStream(); + XmlCasSerializer.serialize(cas.getCas(), null, stream); + System.out.println(stream.toString(StandardCharsets.UTF_8)); + + cas.reset(); + } + + public void createCasFromSentences(String language, List<String> sentences) { + cas.setDocumentLanguage(language); + + StringBuilder documentText = new StringBuilder(); + int offset = 0; + + for (int i = 0; i < sentences.size(); i++) { + String sentenceText = sentences.get(i); + + if (i > 0) { + documentText.append("\n"); + offset += 1; + } + + int begin = offset; + int end = begin + sentenceText.length(); + + documentText.append(sentenceText); + + Sentence sentence = new Sentence(cas, begin, end); + sentence.addToIndexes(); + + offset = end; + } + + cas.setDocumentText(documentText.toString()); + } + + @Test + public void DeBioFidNERTest() throws Exception { + composer.add( + new DUUIRemoteDriver.Component(url) + .withParameter( + "selection", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" + ) + .withParameter("threshold", "0.5") + .withParameter("batch_size", "8") + .withParameter("labels", "person,organization,location,date,event,product,taxon,other") + ); + + List<String> textes = Arrays.asList( + "Dr. 
Anna Weber untersuchte für BioFID eine Streuobstwiese bei Frankfurt am Main und einen Buchenwald im Taunus.", + "Auf der Wiese fand sie Apis mellifera, Bombus terrestris, Papilio machaon und Vanessa atalanta.", + "Der Schwalbenschwanz flog über Daucus carota und setzte sich kurz auf eine Blüte.", + "Er wurde fotografiert und später im Protokoll erwähnt.", + "Im Wald standen Fagus sylvatica, Quercus robur und Acer pseudoplatanus.", + "Eine alte Buche trug Flechten wie Xanthoria parietina und Parmelia sulcata.", + "Sie war teilweise abgestorben, bot aber vielen Insekten Lebensraum.", + "Unter ihrer Rinde fanden die Forschenden Spuren von Lucanus cervus.", + "An einem Bach beobachtete das Team Salamandra salamandra und Rana temporaria.", + "Der Feuersalamander kroch langsam über den feuchten Weg.", + "Er verschwand unter einem Stein, nachdem Dr. Weber ihn fotografiert hatte.", + "Später verglich sie die Funde aus Frankfurt am Main, dem Taunus und dem Bodensee.", + "In ihrem Bericht wurden wissenschaftliche Namen, Ortsnamen und Koreferenzen markiert.", + "Die Pipeline sollte erkennen, dass Fagus sylvatica, Papilio machaon, Salamandra salamandra und Lucanus cervus Taxa sind, während Frankfurt am Main, Taunus und Bodensee geographische Namen sind." 
+ ); + + createCasFromSentences("de", textes); + + System.out.println("Input document:"); + System.out.println(cas.getDocumentText()); + + composer.run(cas); + + Collection<NamedEntity> namedEntities = JCasUtil.select(cas, NamedEntity.class); + Map<String, List<Object>> result = extractNERResult(); + + for (NamedEntity namedEntity : namedEntities) { + String coveredText = namedEntity.getCoveredText(); + int begin = namedEntity.getBegin(); + int end = namedEntity.getEnd(); + String value = namedEntity.getValue(); + String typeName = namedEntity.getType().getName(); + + result.get("token").add(coveredText); + result.get("begin").add(begin); + result.get("end").add(end); + result.get("value").add(value); + result.get("type").add(typeName); + + System.out.println( + "NER: '" + coveredText + "'" + + " (begin=" + begin + ", end=" + end + ")" + + " value='" + value + "'" + + " type='" + typeName + "'" + ); + + assertTrue(begin >= 0, "NER begin offset must be non-negative"); + assertTrue(end > begin, "NER end offset must be greater than begin offset"); + assertFalse(coveredText.isBlank(), "NER covered text must not be blank"); + } + + assertFalse(namedEntities.isEmpty(), "The DUUI NER component should create at least one NamedEntity annotation."); + } + + private Map<String, List<Object>> extractNERResult() { + Map<String, List<Object>> result = new LinkedHashMap<>(); + + result.put("begin", new ArrayList<>()); + result.put("end", new ArrayList<>()); + result.put("token", new ArrayList<>()); + result.put("value", new ArrayList<>()); + result.put("type", new ArrayList<>()); + + return result; + } +} \ No newline at end of file From 08fa306b6a511cf4b96ebdb4e7c2ac8d929c1c8c Mon Sep 17 00:00:00 2001 From: Mevluet Bagci Date: Tue, 9 Jun 2026 22:05:36 +0200 Subject: [PATCH 08/19] Add DUUI-based transformer Time components (duui-Time) --- duui-TimeDetection/.dockerignore | 3 + duui-TimeDetection/.gitignore | 3 + duui-TimeDetection/README.md | 0 duui-TimeDetection/Readme.md | 147 +++ duui-TimeDetection/docker_build.sh | 392 ++++++ 
duui-TimeDetection/duui-TimeDetection.iml | 8 + duui-TimeDetection/pom.xml | 155 +++ duui-TimeDetection/requirements.txt | 20 + duui-TimeDetection/src/main/docker/Dockerfile | 108 ++ .../src/main/docker/Dockerfile-cuda | 70 + .../src/main/python/TypeSystemTime.xml | 132 ++ .../src/main/python/duui_time.lua | 314 +++++ .../src/main/python/duui_time.py | 536 ++++++++ .../src/main/python/preload_model.py | 45 + .../main/python/time_recognition_backend.py | 1159 +++++++++++++++++ .../uima/TimeDetection/TimeTest.java | 384 ++++++ 16 files changed, 3476 insertions(+) create mode 100644 duui-TimeDetection/.dockerignore create mode 100644 duui-TimeDetection/.gitignore create mode 100644 duui-TimeDetection/README.md create mode 100644 duui-TimeDetection/Readme.md create mode 100644 duui-TimeDetection/docker_build.sh create mode 100644 duui-TimeDetection/duui-TimeDetection.iml create mode 100644 duui-TimeDetection/pom.xml create mode 100644 duui-TimeDetection/requirements.txt create mode 100644 duui-TimeDetection/src/main/docker/Dockerfile create mode 100644 duui-TimeDetection/src/main/docker/Dockerfile-cuda create mode 100644 duui-TimeDetection/src/main/python/TypeSystemTime.xml create mode 100644 duui-TimeDetection/src/main/python/duui_time.lua create mode 100644 duui-TimeDetection/src/main/python/duui_time.py create mode 100644 duui-TimeDetection/src/main/python/preload_model.py create mode 100644 duui-TimeDetection/src/main/python/time_recognition_backend.py create mode 100644 duui-TimeDetection/src/test/java/org/hucompute/textimager/uima/TimeDetection/TimeTest.java diff --git a/duui-TimeDetection/.dockerignore b/duui-TimeDetection/.dockerignore new file mode 100644 index 00000000..caab0c36 --- /dev/null +++ b/duui-TimeDetection/.dockerignore @@ -0,0 +1,3 @@ +.idea/ +target/ +venv/ \ No newline at end of file diff --git a/duui-TimeDetection/.gitignore b/duui-TimeDetection/.gitignore new file mode 100644 index 00000000..d2092691 --- /dev/null +++ 
b/duui-TimeDetection/.gitignore @@ -0,0 +1,3 @@ +.idea/ +target/ +venv*/ \ No newline at end of file diff --git a/duui-TimeDetection/README.md b/duui-TimeDetection/README.md new file mode 100644 index 00000000..e69de29b diff --git a/duui-TimeDetection/Readme.md b/duui-TimeDetection/Readme.md new file mode 100644 index 00000000..3c078e07 --- /dev/null +++ b/duui-TimeDetection/Readme.md @@ -0,0 +1,147 @@ +[![Version](https://img.shields.io/static/v1?label=duui-ner&message=0.1.0&color=blue)](https://docker.texttechnologylab.org/v2/duui-ner/tags/list) +[![Version](https://img.shields.io/static/v1?label=Python&message=3.12&color=green)]() +[![Version](https://img.shields.io/static/v1?label=Transformers&message=5.1.0&color=yellow)]() +[![Version](https://img.shields.io/static/v1?label=Torch&message=2.11.0&color=red)]() +[![Version](https://img.shields.io/static/v1?label=GLiNER&message=0.2.26&color=orange)]() +[![Version](https://img.shields.io/static/v1?label=GLiNER2&message=1.3.1&color=orange)]() + +# Transformers NER + +DUUI implementation for selected transformer-based Named Entity Recognition (NER) models. The component is designed for use with the [Docker Unified UIMA Interface (DUUI)](https://github.com/texttechnologylab/DockerUnifiedUIMAInterface). + +The component supports one NER model per Docker image/container. Each image is built with a single `MODEL_NAME` and exposes the DUUI endpoints for type system, Lua communication layer, documentation, and processing. 
+ +## Included Models + +| Image suffix / `MODEL_SPECNAME` | `MODEL_NAME` | Model source | Model version | Languages | Backend | +| --- | --- | --- | --- | --- | --- | +| `gliner-multi-v2-1` | `gliner` | https://huggingface.co/urchade/gliner_multi-v2.1 | `443d26d654e0324125a96bebd8e796c14ff2efe6` | Multilingual | GLiNER | +| `gliner2-multi-v1` | `gliner2` | https://huggingface.co/fastino/gliner2-multi-v1 | `cc151f5b0ce4f7010c3ae8884527dd43dddf9d21` | Multilingual | GLiNER2 | +| `roberta-ner-multilingual` | `roberta-ner-multilingual` | https://huggingface.co/julian-schelb/roberta-ner-multilingual | `d0a19147f3bb0065c8091459e3d35405ce9d48da` | Multilingual | HuggingFace token-classification | +| `wikineural-multilingual-ner` | `wikineural-multilingual-ner` | https://huggingface.co/Babelscape/wikineural-multilingual-ner | `bed6ee7a45d2827b6c90a4fd7983f0241ae0a5c1` | Multilingual | HuggingFace token-classification | +| `xlm-r-ner-40-lang` | `xlm-r-ner-40-lang` | https://huggingface.co/nbroad/jplu-xlm-r-ner-40-lang | `7f7f0fe9bc946a9848611aff079f556387687216` | Multilingual / 40 languages | HuggingFace token-classification | + +## Annotation Types + +The component creates UIMA NER annotations from the model output. 
Standard NER labels are mapped to DKPro NER types where possible, for example: + +| Label | UIMA type | +| --- | --- | +| `PER`, `person` | `de.tudarmstadt.ukp.dkpro.core.api.ner.type.Person` | +| `ORG`, `organization` | `de.tudarmstadt.ukp.dkpro.core.api.ner.type.Organization` | +| `LOC`, `location` | `de.tudarmstadt.ukp.dkpro.core.api.ner.type.Location` | +| `taxon`, `taxa` | `org.texttechnologylab.annotation.type.Taxon` | +| other labels | `de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity` | + +The `taxon` label is mapped to the TTLab taxon type: + +```text +org.texttechnologylab.annotation.type.Taxon +``` + +The delivered type system must include this type if taxon annotations should be created as `Taxon` instead of falling back to a generic `NamedEntity`. + +## Requirements + +The container uses Python 3.12 and the following core Python dependencies: + +| Package | Version | +| --- | --- | +| `gliner` | `0.2.26` | +| `gliner2[local]` | `1.3.1` | +| `transformers` | `5.1.0` | +| `torch` | `2.11.0` | +| `fastapi` | `0.110.0` | +| `dkpro-cassis` | `0.9.1` | +| `uvicorn[standard]` | `0.27.1` | +| `pydantic-settings` | `2.0.2` | + +See `requirements.txt` for the full dependency list. 
+ +# How To Use + +## Start Docker container + +```bash +docker run --rm -p 9714:9714 docker.texttechnologylab.org/duui-ner-[modelname]:latest +``` + +Example: + +```bash +docker run --rm -p 9714:9714 docker.texttechnologylab.org/duui-ner-wikineural-multilingual-ner:latest +``` + +## Run within DUUI + +```java +composer.add( + new DUUIDockerDriver.Component("docker.texttechnologylab.org/duui-ner-[modelname]:latest") + .withParameter( + "selection", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" + ) +); +``` + +With optional runtime parameters: + +```java +composer.add( + new DUUIDockerDriver.Component("docker.texttechnologylab.org/duui-ner-[modelname]:latest") + .withParameter( + "selection", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" + ) + .withParameter("threshold", "0.5") + .withParameter("batch_size", "8") + .withParameter("labels", "person,organization,location,date,event,product,taxon,other") +); +``` + +### Parameters + +| Name | Default | Description | +| --- | --- | --- | +| `selection` | required | Use `text` to process the full document text or any selectable UIMA type class name, e.g. `de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence`. | +| `threshold` | `0.5` | Confidence threshold for GLiNER/GLiNER2. HuggingFace token-classification models may ignore this value. | +| `batch_size` | `8` | Batch size used during prediction. | +| `labels` | `person,organization,location,date,event,product,taxon,other` | Candidate labels for GLiNER/GLiNER2. HuggingFace token-classification models use their trained label set. | + +## Runtime behavior + +- Each Docker image/container uses exactly one model. +- `MODEL_NAME` selects the backend model alias used by the Python service. +- `MODEL_VERSION` is used as model metadata in the DUUI response. +- `MODEL_SOURCE` and `MODEL_LANG` are also returned as metadata.
+- Runtime parameters such as `threshold`, `batch_size`, and `labels` are passed via DUUI `.withParameter(...)`. + +# Cite + +If you use this DUUI image, please cite DUUI as follows: + +Alexander Leonhardt, Giuseppe Abrami, Daniel Baumartz and Alexander Mehler. (2023). "Unlocking the Heterogeneous Landscape of Big Data NLP with DUUI." Findings of the Association for Computational Linguistics: EMNLP 2023, 385–399. [[LINK](https://aclanthology.org/2023.findings-emnlp.29)] [[PDF](https://aclanthology.org/2023.findings-emnlp.29.pdf)] + +## BibTeX + +```bibtex +@inproceedings{Leonhardt:et:al:2023, + title = {Unlocking the Heterogeneous Landscape of Big Data {NLP} with {DUUI}}, + author = {Leonhardt, Alexander and Abrami, Giuseppe and Baumartz, Daniel and Mehler, Alexander}, + editor = {Bouamor, Houda and Pino, Juan and Bali, Kalika}, + booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2023}, + year = {2023}, + address = {Singapore}, + publisher = {Association for Computational Linguistics}, + url = {https://aclanthology.org/2023.findings-emnlp.29}, + pages = {385--399}, + pdf = {https://aclanthology.org/2023.findings-emnlp.29.pdf} +} + +@misc{Bagci:2026, + author = {Bagci, Mevlüt}, + title = {Transformer-based Named Entity Recognition models as {DUUI} component}, + year = {2026}, + howpublished = {https://github.com/texttechnologylab/duui-uima} +} +``` \ No newline at end of file diff --git a/duui-TimeDetection/docker_build.sh b/duui-TimeDetection/docker_build.sh new file mode 100644 index 00000000..d52ac30a --- /dev/null +++ b/duui-TimeDetection/docker_build.sh @@ -0,0 +1,392 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Build DUUI TimeX3 Docker images. +# One image = one MODEL_NAME + one MODEL_LANG. +# The Dockerfile always uses requirements.txt. +# This script copies the complete project requirements.txt unchanged into the temporary build context. +# Model artifacts are still downloaded/cached per MODEL_NAME/MODEL_LANG in the Dockerfile. 
+# +# Examples: +# ./docker_build.sh microsoft de +# ./docker_build.sh tei2go de +# ./docker_build.sh tei2go all +# ./docker_build.sh timexy de +# ./docker_build.sh timexy all +# ./docker_build.sh german-gelectra +# ./docker_build.sh bert-got-a-date +# ./docker_build.sh duckling de +# ./docker_build.sh sutime de +# ./docker_build.sh all +# TEI2GO_LANGUAGES="de en es fr it pt" TIMEXY_LANGUAGES="de en fr" ./docker_build.sh all +# ./docker_build.sh hf-token-classification de satyaalmasian/temporal_tagger_German_GELECTRA + +export ANNOTATOR_CUDA="${ANNOTATOR_CUDA:-}" +# export ANNOTATOR_CUDA="-cuda" + +export ANNOTATOR_NAME="${ANNOTATOR_NAME:-duui-time}" +export ANNOTATOR_VERSION="${ANNOTATOR_VERSION:-0.1.0}" +export LOG_LEVEL="${LOG_LEVEL:-DEBUG}" +export MODEL_CACHE_SIZE="${MODEL_CACHE_SIZE:-1}" +export DOCKER_REGISTRY="${DOCKER_REGISTRY:-docker.texttechnologylab.org/}" +export TRANSFORMERS_OFFLINE="${TRANSFORMERS_OFFLINE:-1}" +export PYTHON_IMAGE="${PYTHON_IMAGE:-python:3.12}" + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_ROOT="${PROJECT_ROOT:-${SCRIPT_DIR}}" +PYTHON_SRC_DIR="${PYTHON_SRC_DIR:-${PROJECT_ROOT}/src/main/python}" +DOCKERFILE_SRC="${DOCKERFILE_SRC:-${PROJECT_ROOT}/src/main/docker/Dockerfile${ANNOTATOR_CUDA}}" +REQUIREMENTS_SRC="${REQUIREMENTS_SRC:-${PROJECT_ROOT}/requirements.txt}" + +# Language variants. +# TEI2GO currently has separate Hugging Face spaCy packages for these six languages. +# Timexy 0.1.3 ships rules for German, English and French. +export DEFAULT_LANGUAGES="${DEFAULT_LANGUAGES:-de}" +export TEI2GO_LANGUAGES="${TEI2GO_LANGUAGES:-de en es fr it pt}" +export TIMEXY_LANGUAGES="${TIMEXY_LANGUAGES:-de en fr}" + +# Used by `all`. 
+export TIME_MODELS="${TIME_MODELS:-microsoft duckling sutime tei2go timexy german-gelectra bert-got-a-date}" + +lower() { + printf '%s' "$1" | tr '[:upper:]' '[:lower:]' +} + +sanitize_tag_part() { + lower "$1" \ + | sed -E 's#[^a-z0-9_.-]+#-#g; s#-+#-#g; s#^-##; s#-$##' +} + +contains_word() { + local needle="$1" + shift + local item + for item in "$@"; do + if [[ "${item}" == "${needle}" ]]; then + return 0 + fi + done + return 1 +} + +languages_for_model() { + local model + model="$(lower "$1")" + + case "${model}" in + tei2go) + printf '%s\n' ${TEI2GO_LANGUAGES} + ;; + timexy) + printf '%s\n' ${TIMEXY_LANGUAGES} + ;; + bert-got-a-date) + printf '%s\n' en + ;; + german-gelectra) + printf '%s\n' de + ;; + *) + printf '%s\n' ${DEFAULT_LANGUAGES} + ;; + esac +} + +validate_model_language() { + local model="$1" + local lang="$2" + local supported + + case "${model}" in + tei2go) + read -r -a supported <<< "${TEI2GO_LANGUAGES}" + ;; + timexy) + read -r -a supported <<< "${TIMEXY_LANGUAGES}" + ;; + german-gelectra) + supported=(de) + ;; + bert-got-a-date) + supported=(en) + ;; + *) + return 0 + ;; + esac + + if ! contains_word "${lang}" "${supported[@]}"; then + echo "Unsupported language '${lang}' for model '${model}'. 
Supported: ${supported[*]}" >&2 + exit 1 + fi +} + +usage() { + cat <<USAGE +Usage: $0 <model|all> [language] [model_specname] + +Models: + microsoft + duckling + sutime + tei2go + timexy + german-gelectra + bert-got-a-date + hf-token-classification + +Examples: + $0 microsoft de + $0 tei2go de + $0 tei2go all + $0 timexy de + $0 timexy all + $0 german-gelectra + $0 bert-got-a-date + $0 duckling de + $0 sutime de + $0 hf-token-classification de satyaalmasian/temporal_tagger_German_GELECTRA + +For all default images: + $0 all + +For language-capable variants: + TEI2GO_LANGUAGES="de en es fr it pt" TIMEXY_LANGUAGES="de en fr" $0 all + +Variables: + DOCKER_REGISTRY=${DOCKER_REGISTRY} + ANNOTATOR_NAME=${ANNOTATOR_NAME} + ANNOTATOR_VERSION=${ANNOTATOR_VERSION} + PROJECT_ROOT=${PROJECT_ROOT} + PYTHON_SRC_DIR=${PYTHON_SRC_DIR} + DOCKERFILE_SRC=${DOCKERFILE_SRC} + REQUIREMENTS_SRC=${REQUIREMENTS_SRC} + PYTHON_IMAGE=${PYTHON_IMAGE} + DEFAULT_LANGUAGES=${DEFAULT_LANGUAGES} + TEI2GO_LANGUAGES=${TEI2GO_LANGUAGES} + TIMEXY_LANGUAGES=${TIMEXY_LANGUAGES} +USAGE +} + +set_model_metadata() { + local requested_model="$1" + local requested_lang="${2:-de}" + local requested_spec="${3:-}" + + MODEL_NAME="$(lower "${requested_model}")" + MODEL_LANG="$(lower "${requested_lang}")" + MODEL_SPECNAME="${requested_spec}" + MODEL_VERSION="${MODEL_VERSION_OVERRIDE:-}" + MODEL_SOURCE="${MODEL_SOURCE_OVERRIDE:-}" + + validate_model_language "${MODEL_NAME}" "${MODEL_LANG}" + + case "${MODEL_NAME}" in + microsoft) + MODEL_SPECNAME="${MODEL_SPECNAME:-recognizers-text-suite}" + MODEL_VERSION="${MODEL_VERSION:-1.0.2a2}" + MODEL_SOURCE="${MODEL_SOURCE:-https://github.com/microsoft/Recognizers-Text}" + ;; + duckling) + MODEL_SPECNAME="${MODEL_SPECNAME:-duckling}" + MODEL_VERSION="${MODEL_VERSION:-latest}" + MODEL_SOURCE="${MODEL_SOURCE:-https://github.com/facebook/duckling}" + ;; + sutime) + MODEL_SPECNAME="${MODEL_SPECNAME:-stanford-corenlp-sutime}" + MODEL_VERSION="${MODEL_VERSION:-latest}" +
MODEL_SOURCE="${MODEL_SOURCE:-https://stanfordnlp.github.io/CoreNLP/sutime.html}" + ;; + tei2go) + MODEL_SPECNAME="${MODEL_SPECNAME:-${MODEL_LANG}_tei2go}" + MODEL_VERSION="${MODEL_VERSION:-0.0.0}" + MODEL_SOURCE="${MODEL_SOURCE:-https://github.com/hmosousa/tei2go}" + ;; + timexy) + MODEL_SPECNAME="${MODEL_SPECNAME:-timexy-${MODEL_LANG}}" + MODEL_VERSION="${MODEL_VERSION:-0.1.3}" + MODEL_SOURCE="${MODEL_SOURCE:-https://pypi.org/project/timexy/}" + ;; + german-gelectra) + MODEL_LANG="de" + MODEL_SPECNAME="${MODEL_SPECNAME:-satyaalmasian/temporal_tagger_German_GELECTRA}" + MODEL_VERSION="${MODEL_VERSION:-a523f786c63a5c0542e04d22f4b42364f33ec935}" + MODEL_SOURCE="${MODEL_SOURCE:-https://huggingface.co/satyaalmasian/temporal_tagger_German_GELECTRA}" + ;; + bert-got-a-date) + MODEL_LANG="en" + MODEL_SPECNAME="${MODEL_SPECNAME:-satyaalmasian/temporal_tagger_BERT_tokenclassifier}" + MODEL_VERSION="${MODEL_VERSION:-3b4029b1ec47d4bdc9ef29f6652a44d69410b09f}" + MODEL_SOURCE="${MODEL_SOURCE:-https://huggingface.co/satyaalmasian/temporal_tagger_BERT_tokenclassifier}" + ;; + hf-token-classification) + if [[ -z "${MODEL_SPECNAME}" ]]; then + echo "MODEL_SPECNAME is required for hf-token-classification." >&2 + echo "Example: $0 hf-token-classification de satyaalmasian/temporal_tagger_German_GELECTRA" >&2 + exit 1 + fi + MODEL_VERSION="${MODEL_VERSION:-main}" + MODEL_SOURCE="${MODEL_SOURCE:-https://huggingface.co/${MODEL_SPECNAME}}" + ;; + *) + echo "Unsupported model: ${MODEL_NAME}" >&2 + usage >&2 + exit 1 + ;; + esac + + export MODEL_NAME MODEL_LANG MODEL_SPECNAME MODEL_VERSION MODEL_SOURCE +} + + +create_build_context() { + local build_context="$1" + + if [[ ! -f "${DOCKERFILE_SRC}" ]]; then + echo "Dockerfile not found: ${DOCKERFILE_SRC}" >&2 + exit 1 + fi + + if [[ ! 
-f "${REQUIREMENTS_SRC}" ]]; then + echo "requirements.txt not found: ${REQUIREMENTS_SRC}" >&2 + exit 1 + fi + + mkdir -p "${build_context}/src/main/docker" "${build_context}/src/main/python" + cp "${DOCKERFILE_SRC}" "${build_context}/src/main/docker/Dockerfile" + + if [[ -d "${PYTHON_SRC_DIR}" ]]; then + cp "${PYTHON_SRC_DIR}/TypeSystemTime.xml" "${build_context}/src/main/python/" + cp "${PYTHON_SRC_DIR}/duui_time.py" "${build_context}/src/main/python/" + cp "${PYTHON_SRC_DIR}/time_recognition_backend.py" "${build_context}/src/main/python/" + cp "${PYTHON_SRC_DIR}/duui_time.lua" "${build_context}/src/main/python/" + else + echo "Python source dir not found: ${PYTHON_SRC_DIR}" >&2 + exit 1 + fi + + cp "${REQUIREMENTS_SRC}" "${build_context}/requirements.txt" +} + +build_one() { + local requested_model="$1" + local requested_lang="${2:-de}" + local requested_spec="${3:-}" + + set_model_metadata "${requested_model}" "${requested_lang}" "${requested_spec}" + + local model_tag + local lang_tag + local image + local build_context + + model_tag="$(sanitize_tag_part "${MODEL_NAME}")" + lang_tag="$(sanitize_tag_part "${MODEL_LANG}")" + + if [[ "${MODEL_NAME}" == "hf-token-classification" ]]; then + model_tag="${model_tag}-$(sanitize_tag_part "${MODEL_SPECNAME}")" + fi + + image="${DOCKER_REGISTRY}${ANNOTATOR_NAME}-${model_tag}-${lang_tag}:${ANNOTATOR_VERSION}${ANNOTATOR_CUDA}" + build_context="$(mktemp -d -t duui-time-build-${model_tag}-${lang_tag}-XXXXXX)" + + create_build_context "${build_context}" + + echo "============================================================" + echo "Building ${image}" + echo " MODEL_NAME=${MODEL_NAME}" + echo " MODEL_SPECNAME=${MODEL_SPECNAME}" + echo " MODEL_VERSION=${MODEL_VERSION}" + echo " MODEL_SOURCE=${MODEL_SOURCE}" + echo " MODEL_LANG=${MODEL_LANG}" + echo " PYTHON_IMAGE=${PYTHON_IMAGE}" + echo " Dockerfile uses copied requirements: ${REQUIREMENTS_SRC}" + echo "============================================================" + echo 
"Copied requirements.txt:" + sed 's/^/ /' "${build_context}/requirements.txt" + echo "============================================================" + + docker build \ + --build-arg PYTHON_IMAGE="${PYTHON_IMAGE}" \ + --build-arg ANNOTATOR_NAME="${ANNOTATOR_NAME}" \ + --build-arg ANNOTATOR_VERSION="${ANNOTATOR_VERSION}" \ + --build-arg LOG_LEVEL="${LOG_LEVEL}" \ + --build-arg MODEL_CACHE_SIZE="${MODEL_CACHE_SIZE}" \ + --build-arg MODEL_NAME="${MODEL_NAME}" \ + --build-arg MODEL_SPECNAME="${MODEL_SPECNAME}" \ + --build-arg MODEL_VERSION="${MODEL_VERSION}" \ + --build-arg MODEL_SOURCE="${MODEL_SOURCE}" \ + --build-arg MODEL_LANG="${MODEL_LANG}" \ + --build-arg TRANSFORMERS_OFFLINE="${TRANSFORMERS_OFFLINE}" \ + -t "${image}" \ + -f "${build_context}/src/main/docker/Dockerfile" \ + "${build_context}" + + docker tag \ + "${image}" \ + "${DOCKER_REGISTRY}${ANNOTATOR_NAME}-${model_tag}-${lang_tag}:latest${ANNOTATOR_CUDA}" + + if [[ "${KEEP_BUILD_CONTEXT:-0}" == "1" ]]; then + echo "Keeping build context: ${build_context}" + else + rm -rf "${build_context}" + fi +} + +build_all() { + local model + local lang + + for model in ${TIME_MODELS}; do + case "$(lower "${model}")" in + hf-token-classification) + if [[ -n "${HF_MODEL_SPECNAME:-}" ]]; then + build_one "hf-token-classification" "${HF_MODEL_LANG:-de}" "${HF_MODEL_SPECNAME}" + else + echo "Skipping hf-token-classification in all mode because HF_MODEL_SPECNAME is not set." 
+ fi + ;; + *) + while IFS= read -r lang; do + [[ -z "${lang}" ]] && continue + build_one "${model}" "${lang}" + done < <(languages_for_model "${model}") + ;; + esac + done +} + +build_model_all_languages() { + local model="$1" + local spec="${2:-}" + local lang + + while IFS= read -r lang; do + [[ -z "${lang}" ]] && continue + build_one "${model}" "${lang}" "${spec}" + done < <(languages_for_model "${model}") +} + +# Entry point: dispatch on the first argument (help, "all", or a single model name). +main() { + local command="${1:-all}" lang="${2:-}" fallback="" + + case "${command}" in + -h|--help|help) usage ;; + all) build_all ;; + *) + if [[ "${lang}" == "all" ]]; then + build_model_all_languages "${command}" "${3:-}" + else + # No language given: default to the model's first supported language instead of a + # hard-coded "de", which validate_model_language rejects for the en-only bert-got-a-date. + read -r fallback < <(languages_for_model "${command}") || fallback="de" + build_one "${command}" "${lang:-${fallback:-de}}" "${3:-}" + fi + ;; + esac +} + +main "$@" \ No newline at end of file diff --git a/duui-TimeDetection/duui-TimeDetection.iml b/duui-TimeDetection/duui-TimeDetection.iml new file mode 100644 index 00000000..4004f39b --- /dev/null +++ b/duui-TimeDetection/duui-TimeDetection.iml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/duui-TimeDetection/pom.xml b/duui-TimeDetection/pom.xml new file mode 100644 index 00000000..4a84188c --- /dev/null +++ b/duui-TimeDetection/pom.xml @@ -0,0 +1,155 @@ + + + 4.0.0 + + org.texttechnologylab.duui + duui-TimeDetection + 0.1.0 + + + + AGPL-3.0-or-later + https://www.gnu.org/licenses/agpl.txt + repo + GNU Affero General Public License v3.0 or later + + + + + Texttechnology Lab + https://www.texttechnologylab.org + + + + mehler + Prof. Dr.
Alexander Mehler + mehler@em.uni-frankfurt.de + https://www.texttechnologylab.org/team/alexander-abrami/ + Goethe University Frankfurt / Texttechnology Lab + https://www.texttechnologylab.org + + head of department + + + + bagci + Mevlüt Bagci + bagci@em.uni-frankfurt.de + https://www.texttechnologylab.org/team/mevl%c3%bct-bagci/ + Goethe University Frankfurt / Texttechnology Lab + https://www.texttechnologylab.org + + lead developer + + Europe/Berlin + + + + + + + org.apache.maven.plugins + maven-surefire-plugin + 2.22.0 + + + --illegal-access=permit + --add-opens java.base/java.util=ALL-UNNAMED + + + + + + + + + + 17 + 17 + 2.4.0 + + + + + + + jitpack.io + https://jitpack.io + + + + + + + org.dkpro.core + dkpro-core-asl + ${dkpro.core.version} + pom + import + + + + + + + + com.github.texttechnologylab + DockerUnifiedUIMAInterface + 7cef2433b5 + + + + + + + + + com.github.texttechnologylab + UIMATypeSystem + 3.0.14 + + + + + + + + + + + + + + + + org.junit.jupiter + junit-jupiter + 5.9.0 + test + + + + org.dkpro.core + dkpro-core-api-segmentation-asl + test + + + + org.dkpro.core + dkpro-core-io-xmi-asl + test + + + + org.dkpro.core + dkpro-core-api-resources-asl + test + + + \ No newline at end of file diff --git a/duui-TimeDetection/requirements.txt b/duui-TimeDetection/requirements.txt new file mode 100644 index 00000000..46b456c7 --- /dev/null +++ b/duui-TimeDetection/requirements.txt @@ -0,0 +1,20 @@ +fastapi>=0.115,<1.0 +uvicorn[standard]>=0.30,<1.0 +pydantic>=2.7,<3.0 +pydantic-settings>=2.3,<3.0 +requests>=2.32,<3.0 + +# Required only for Hugging Face based temporal taggers. +torch>=2.3,<3.0 +transformers>=4.44,<5.0 +sentencepiece>=0.2,<0.3 + +# Microsoft Recognizers-Text backend. +# recognizers-text 1.0.2a2 expects emoji.UNICODE_EMOJI, removed in emoji>=2. +emoji==1.7.0 +recognizers-text-suite==1.0.2a2 + +# Optional spaCy-based backends: TEI2GO and Timexy. 
+# DUUI may already provide spaCy in a separate component; keep this only for this standalone container. +spacy>=3.7,<4.0 +timexy==0.1.3 diff --git a/duui-TimeDetection/src/main/docker/Dockerfile b/duui-TimeDetection/src/main/docker/Dockerfile new file mode 100644 index 00000000..80ce751d --- /dev/null +++ b/duui-TimeDetection/src/main/docker/Dockerfile @@ -0,0 +1,108 @@ +ARG PYTHON_IMAGE=python:3.12 +FROM ${PYTHON_IMAGE} + +SHELL ["/bin/bash", "-o", "pipefail", "-c"] + +WORKDIR /usr/src/app + +EXPOSE 9714 + +# System dependencies. curl is needed only for model artifact downloads such as TEI2GO, +# but keeping it here makes the model-specific RUN block simple and deterministic. +RUN apt-get update \ + && apt-get install -y --no-install-recommends curl ca-certificates \ + && rm -rf /var/lib/apt/lists/* + +# Python dependencies. +# NOTE: docker_build.sh copies the project's complete requirements.txt unchanged into the +# build context, so every image installs the dependencies of all backends. +COPY ./requirements.txt ./requirements.txt +RUN python -m pip install --upgrade pip \ + && python -m pip install --no-cache-dir -r requirements.txt + +# copy DUUI TimeX3 scripts +COPY ./src/main/python/TypeSystemTime.xml ./TypeSystemTime.xml +COPY ./src/main/python/duui_time.py ./duui_time.py +COPY ./src/main/python/time_recognition_backend.py ./time_recognition_backend.py +COPY ./src/main/python/duui_time.lua ./duui_time.lua + +# log level +ARG LOG_LEVEL="DEBUG" +ENV LOG_LEVEL=$LOG_LEVEL + +# config: one model per container +ARG MODEL_CACHE_SIZE=1 +ENV MODEL_CACHE_SIZE=$MODEL_CACHE_SIZE + +# meta data +ARG ANNOTATOR_NAME="duui-time" +ENV ANNOTATOR_NAME=$ANNOTATOR_NAME +ARG ANNOTATOR_VERSION="unset" +ENV ANNOTATOR_VERSION=$ANNOTATOR_VERSION + +# Model info: exactly one model and one language per image.
+ARG MODEL_NAME="microsoft" +ENV MODEL_NAME=$MODEL_NAME +ARG MODEL_SPECNAME="recognizers-text-suite" +ENV MODEL_SPECNAME=$MODEL_SPECNAME +ENV HF_TEMPORAL_MODEL_ID=$MODEL_SPECNAME +ARG MODEL_VERSION="0.1" +ENV MODEL_VERSION=$MODEL_VERSION +ARG MODEL_SOURCE="" +ENV MODEL_SOURCE=$MODEL_SOURCE +ARG MODEL_LANG="de" +ENV MODEL_LANG=$MODEL_LANG + +# Download/cache only the selected model artifact, never all model artifacts. +RUN set -eux; \ + model="$(echo "${MODEL_NAME}" | tr '[:upper:]' '[:lower:]')"; \ + lang="$(echo "${MODEL_LANG}" | tr '[:upper:]' '[:lower:]')"; \ + case "${model}" in \ + microsoft) \ + echo "Microsoft Recognizers-Text uses package data from requirements.txt; no extra model download."; \ + ;; \ + duckling|sutime) \ + echo "${MODEL_NAME} uses an external runtime service; no local model is downloaded."; \ + ;; \ + tei2go) \ + case "${lang}" in \ + de|en|es|fr|it|pt) tei2go_model="${lang}_tei2go" ;; \ + *) echo "Unsupported TEI2GO language: ${MODEL_LANG}" >&2; exit 1 ;; \ + esac; \ + curl -L \ + -o "/tmp/${tei2go_model}-0.0.0-py3-none-any.whl" \ + "https://huggingface.co/hugosousa/${tei2go_model}/resolve/main/${tei2go_model}-any-py3-none-any.whl"; \ + python -m pip install --no-deps "/tmp/${tei2go_model}-0.0.0-py3-none-any.whl"; \ + rm -f "/tmp/${tei2go_model}-0.0.0-py3-none-any.whl"; \ + ;; \ + timexy) \ + case "${lang}" in \ + de) spacy_model="de_core_news_sm" ;; \ + en) spacy_model="en_core_web_sm" ;; \ + fr) spacy_model="fr_core_news_sm" ;; \ + *) echo "Unsupported Timexy language: ${MODEL_LANG}. 
Supported: de en fr" >&2; exit 1 ;; \ + esac; \ + python -m spacy download "${spacy_model}"; \ + ;; \ + german-gelectra|bert-got-a-date|hf-token-classification) \ + hf_model="${MODEL_SPECNAME}"; \ + if [[ -z "${hf_model}" ]]; then \ + echo "MODEL_SPECNAME must contain a Hugging Face model id for ${MODEL_NAME}" >&2; \ + exit 1; \ + fi; \ + export HF_MODEL_ID="${hf_model}"; \ + python -c "import os; from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline; model_id=os.environ['HF_MODEL_ID']; print(f'Caching Hugging Face token-classification model: {model_id}'); AutoTokenizer.from_pretrained(model_id); AutoModelForTokenClassification.from_pretrained(model_id); pipeline('token-classification', model=model_id, aggregation_strategy='simple')"; \ + ;; \ + *) \ + echo "Unsupported MODEL_NAME=${MODEL_NAME}" >&2; \ + exit 1; \ + ;; \ + esac + +# offline mode for HuggingFace/runtime after build-time model caching +ARG TRANSFORMERS_OFFLINE=1 +ENV TRANSFORMERS_OFFLINE=$TRANSFORMERS_OFFLINE +ENV HF_HUB_OFFLINE=$TRANSFORMERS_OFFLINE + +ENTRYPOINT ["uvicorn", "duui_time:app", "--host", "0.0.0.0", "--port", "9714"] +CMD ["--workers", "1"] \ No newline at end of file diff --git a/duui-TimeDetection/src/main/docker/Dockerfile-cuda b/duui-TimeDetection/src/main/docker/Dockerfile-cuda new file mode 100644 index 00000000..e64d175e --- /dev/null +++ b/duui-TimeDetection/src/main/docker/Dockerfile-cuda @@ -0,0 +1,111 @@ +FROM nvidia/cuda:11.8.0-base-ubuntu22.04 + +# System dependencies. curl and ca-certificates are needed for model artifact downloads such as TEI2GO. +RUN apt update && \ + DEBIAN_FRONTEND=noninteractive \ + apt install --no-install-recommends -y build-essential software-properties-common curl ca-certificates && \ + add-apt-repository -y ppa:deadsnakes/ppa && \ + apt install --no-install-recommends -y python3.10 python3-pip python3-setuptools python3-distutils && \ + apt clean && rm -rf /var/lib/apt/lists/* + +RUN ln -s /usr/bin/python3 /usr/bin/python +RUN python -m pip install --upgrade pip + +WORKDIR /usr/src/app + +EXPOSE 9714 + +# Python dependencies: docker_build.sh copies the project requirements.txt into the build context. +RUN pip install setuptools wheel +COPY ./requirements.txt ./requirements.txt +RUN apt remove -y python3-blinker || true +RUN pip install -r requirements.txt + +# copy DUUI TimeX3 scripts (the files docker_build.sh places in the build context) +COPY ./src/main/python/TypeSystemTime.xml ./TypeSystemTime.xml +COPY ./src/main/python/duui_time.py ./duui_time.py +COPY ./src/main/python/time_recognition_backend.py ./time_recognition_backend.py +COPY ./src/main/python/duui_time.lua ./duui_time.lua + +# log level +ARG LOG_LEVEL="DEBUG" +ENV LOG_LEVEL=$LOG_LEVEL + +# config: one model per container +ARG MODEL_CACHE_SIZE=1 +ENV MODEL_CACHE_SIZE=$MODEL_CACHE_SIZE + +# meta data +ARG ANNOTATOR_NAME="duui-time" +ENV ANNOTATOR_NAME=$ANNOTATOR_NAME +ARG ANNOTATOR_VERSION="unset" +ENV ANNOTATOR_VERSION=$ANNOTATOR_VERSION + +# Model info: exactly one model and one language per image. +ARG MODEL_NAME="microsoft" +ENV MODEL_NAME=$MODEL_NAME +ARG MODEL_SPECNAME="recognizers-text-suite" +ENV MODEL_SPECNAME=$MODEL_SPECNAME +ENV HF_TEMPORAL_MODEL_ID=$MODEL_SPECNAME +ARG MODEL_VERSION="0.1" +ENV MODEL_VERSION=$MODEL_VERSION +ARG MODEL_SOURCE="" +ENV MODEL_SOURCE=$MODEL_SOURCE +ARG MODEL_LANG="de" +ENV MODEL_LANG=$MODEL_LANG +
+# Download/cache only the selected model artifact (same logic as the CPU Dockerfile). +# bash is required for the [[ ... ]] test below; Ubuntu's default /bin/sh (dash) does not support it. +SHELL ["/bin/bash", "-o", "pipefail", "-c"] +RUN set -eux; \ + model="$(echo "${MODEL_NAME}" | tr '[:upper:]' '[:lower:]')"; \ + lang="$(echo "${MODEL_LANG}" | tr '[:upper:]' '[:lower:]')"; \ + case "${model}" in \ + microsoft) \ + echo "Microsoft Recognizers-Text uses package data from requirements.txt; no extra model download."; \ + ;; \ + duckling|sutime) \ + echo "${MODEL_NAME} uses an external runtime service; no local model is downloaded."; \ + ;; \ + tei2go) \ + case "${lang}" in \ + de|en|es|fr|it|pt) tei2go_model="${lang}_tei2go" ;; \ + *) echo "Unsupported TEI2GO language: ${MODEL_LANG}" >&2; exit 1 ;; \ + esac; \ + curl -L \ + -o "/tmp/${tei2go_model}-0.0.0-py3-none-any.whl" \ + "https://huggingface.co/hugosousa/${tei2go_model}/resolve/main/${tei2go_model}-any-py3-none-any.whl"; \ + python -m pip install --no-deps "/tmp/${tei2go_model}-0.0.0-py3-none-any.whl"; \ + rm -f "/tmp/${tei2go_model}-0.0.0-py3-none-any.whl"; \ + ;; \ + timexy) \ + case "${lang}" in \ + de) spacy_model="de_core_news_sm" ;; \ + en) spacy_model="en_core_web_sm" ;; \ + fr) spacy_model="fr_core_news_sm" ;; \ + *) echo "Unsupported Timexy language: ${MODEL_LANG}. Supported: de en fr" >&2; exit 1 ;; \ + esac; \ + python -m spacy download "${spacy_model}"; \ + ;; \ + german-gelectra|bert-got-a-date|hf-token-classification) \ + hf_model="${MODEL_SPECNAME}"; \ + if [[ -z "${hf_model}" ]]; then \ + echo "MODEL_SPECNAME must contain a Hugging Face model id for ${MODEL_NAME}" >&2; \ + exit 1; \ + fi; \ + export HF_MODEL_ID="${hf_model}"; \ + python -c "import os; from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline; model_id=os.environ['HF_MODEL_ID']; print(f'Caching Hugging Face token-classification model: {model_id}'); AutoTokenizer.from_pretrained(model_id); AutoModelForTokenClassification.from_pretrained(model_id); pipeline('token-classification', model=model_id, aggregation_strategy='simple')"; \ + ;; \ + *) \ + echo "Unsupported MODEL_NAME=${MODEL_NAME}" >&2; \ + exit 1; \ + ;; \ + esac + +# offline mode for HuggingFace/runtime after build-time model caching +ARG TRANSFORMERS_OFFLINE=1 +ENV TRANSFORMERS_OFFLINE=$TRANSFORMERS_OFFLINE +ENV HF_HUB_OFFLINE=$TRANSFORMERS_OFFLINE + +ENTRYPOINT ["uvicorn", "duui_time:app", "--host", "0.0.0.0", "--port", "9714"] +CMD ["--workers", "1"] diff --git a/duui-TimeDetection/src/main/python/TypeSystemTime.xml b/duui-TimeDetection/src/main/python/TypeSystemTime.xml new file mode 100644 index 00000000..dc052a36 --- /dev/null +++ b/duui-TimeDetection/src/main/python/TypeSystemTime.xml @@ -0,0 +1,132 @@ + + + + + org.texttechnologylab.annotation.AnnotatorMetaData + + uima.cas.AnnotationBase + + + reference + + uima.cas.TOP + + + name + + uima.cas.String + + + version + + uima.cas.String + + + modelName + + uima.cas.String + + + modelVersion + + uima.cas.String + +
+ + + uima.tcas.DocumentAnnotation + + uima.tcas.Annotation + + + language + + uima.cas.String + + + + + org.texttechnologylab.annotation.DocumentModification + + uima.cas.AnnotationBase + + + user + + uima.cas.String + + + timestamp + + uima.cas.Long + + + comment + + uima.cas.String + + + + + org.hucompute.textimager.uima.type.Sentiment + + uima.tcas.Annotation + + + sentiment + + uima.cas.Double + + + subjectivity + + uima.cas.Double + + + + + org.hucompute.textimager.uima.type.CategorizedSentiment + + org.hucompute.textimager.uima.type.Sentiment + + + pos + + uima.cas.Double + + + neu + + uima.cas.Double + + + neg + + uima.cas.Double + + + + + org.texttechnologylab.annotation.AnnotationComment + + uima.cas.AnnotationBase + + + reference + + uima.cas.TOP + + + value + + uima.cas.String + + + key + + uima.cas.String + + + + + diff --git a/duui-TimeDetection/src/main/python/duui_time.lua b/duui-TimeDetection/src/main/python/duui_time.lua new file mode 100644 index 00000000..d37c4dc6 --- /dev/null +++ b/duui-TimeDetection/src/main/python/duui_time.lua @@ -0,0 +1,314 @@ +StandardCharsets = luajava.bindClass("java.nio.charset.StandardCharsets") +Class = luajava.bindClass("java.lang.Class") +JCasUtil = luajava.bindClass("org.apache.uima.fit.util.JCasUtil") +TopicUtils = luajava.bindClass("org.texttechnologylab.DockerUnifiedUIMAInterface.lua.DUUILuaUtils") + +DEFAULT_SELECTION = "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" + +-- Runtime defaults. These mirror the Python/ENV defaults and can be overwritten +-- with DUUI .withParameter(...). 
+DEFAULT_THRESHOLD = 0.0 +DEFAULT_BATCH_SIZE = 8 + +DEFAULT_TIMEX_TYPE = "org.texttechnologylab.annotation.semaf.isotimeml.TimeX3" +DATE_TYPE = "org.texttechnologylab.annotation.semaf.isotimeml.time.Date" +TIME_TYPE = "org.texttechnologylab.annotation.semaf.isotimeml.time.Time" +DURATION_TYPE = "org.texttechnologylab.annotation.semaf.isotimeml.time.Duration" +SET_TYPE = "org.texttechnologylab.annotation.semaf.isotimeml.time.Set" + +function get_parameter(parameters, key, default_value) + if parameters ~= nil and parameters[key] ~= nil then + return parameters[key] + end + return default_value +end + +function safe_string(value) + if value == nil then + return "" + end + return tostring(value) +end + +function parse_optional_number(value, default_value, parameter_name, min_value, max_value) + if value == nil then + return default_value + end + + local parsed = tonumber(value) + if parsed == nil then + error("Parameter '" .. parameter_name .. "' must be a number", 2) + end + + if min_value ~= nil and parsed < min_value then + error("Parameter '" .. parameter_name .. "' must be >= " .. tostring(min_value), 2) + end + + if max_value ~= nil and parsed > max_value then + error("Parameter '" .. parameter_name .. "' must be <= " .. 
tostring(max_value), 2) + end + + return parsed +end + +function trim(value) + return (string.gsub(safe_string(value), "^%s*(.-)%s*$", "%1")) -- parentheses drop gsub's second return value (the substitution count) +end + +function serialize(inputCas, outputStream, parameters) + local doc_lang = inputCas:getDocumentLanguage() + local doc_text = inputCas:getDocumentText() + local doc_len = TopicUtils:getDocumentTextLength(inputCas) + + local selection_types = get_parameter(parameters, "selection", DEFAULT_SELECTION) + + local threshold = parse_optional_number( + get_parameter(parameters, "threshold", DEFAULT_THRESHOLD), + DEFAULT_THRESHOLD, + "threshold", + 0.0, + 1.0 + ) + + local batch_size = parse_optional_number( + get_parameter(parameters, "batch_size", DEFAULT_BATCH_SIZE), + DEFAULT_BATCH_SIZE, + "batch_size", + 1, + nil + ) + batch_size = math.floor(batch_size) + + local document_creation_time = trim(get_parameter(parameters, "document_creation_time", "")) + if document_creation_time == "" then + document_creation_time = trim(get_parameter(parameters, "reference_time", "")) + end + + local duckling_url = trim(get_parameter(parameters, "duckling_url", "")) + local corenlp_url = trim(get_parameter(parameters, "corenlp_url", "")) + local duckling_timezone = trim(get_parameter(parameters, "duckling_timezone", "Europe/Berlin")) + + local selections = {} + local selections_count = 1 + + for selection_type in string.gmatch(selection_types, "([^,]+)") do + selection_type = trim(selection_type) + + local sentences = {} + local sentences_count = 1 + + if selection_type == "text" then + sentences[1] = { + text = doc_text, + begin = 0, + ['end'] = doc_len + } + else + local clazz = Class:forName(selection_type) + local sentences_it = JCasUtil:select(inputCas, clazz):iterator() + + while sentences_it:hasNext() do + local sentence = sentences_it:next() + sentences[sentences_count] = { + text = sentence:getCoveredText(), + begin = sentence:getBegin(), + ['end'] = sentence:getEnd() + } + sentences_count = sentences_count + 1 + end + end + +
selections[selections_count] = { + sentences = sentences, + selection = selection_type + } + selections_count = selections_count + 1 + end + + outputStream:write(json.encode({ + selections = selections, + lang = doc_lang, + doc_len = doc_len, + threshold = threshold, + batch_size = batch_size, + document_creation_time = document_creation_time, + duckling_url = duckling_url, + corenlp_url = corenlp_url, + duckling_timezone = duckling_timezone + })) +end + +function add_document_modification(inputCas, results) + if results["modification_meta"] == nil then + return + end + + pcall(function() + local modification_meta = results["modification_meta"] + local modification_anno = luajava.newInstance("org.texttechnologylab.annotation.DocumentModification", inputCas) + modification_anno:setUser(safe_string(modification_meta["user"])) + modification_anno:setTimestamp(modification_meta["timestamp"]) + modification_anno:setComment(safe_string(modification_meta["comment"])) + modification_anno:addToIndexes() + end) +end + +function add_model_metadata(inputCas, results) + local model_meta = nil + + pcall(function() + model_meta = luajava.newInstance("org.texttechnologylab.annotation.model.MetaData", inputCas) + model_meta:setModelVersion(safe_string(results["model_version"])) + model_meta:setModelName(safe_string(results["model_name"])) + model_meta:setSource(safe_string(results["model_source"])) + model_meta:setLang(safe_string(results["model_lang"])) + model_meta:addToIndexes() + end) + + return model_meta +end + +function add_annotation_comment(inputCas, reference, key, value) + if value == nil then + return + end + + pcall(function() + local comment = luajava.newInstance("org.texttechnologylab.annotation.AnnotationComment", inputCas) + comment:setReference(reference) + comment:setKey(safe_string(key)) + comment:setValue(safe_string(value)) + comment:addToIndexes() + end) +end + +function get_time_type_from_tag(tag) + local tag_type = tag["time_type"] + if tag_type ~= nil and 
-- Return the list of tag tables contained in a decoded response.
-- When the response carries a "tags" field it is returned unchanged. Otherwise
-- the tags are rebuilt from the flattened parallel arrays, one tag per entry
-- of results["results"]; a missing array is treated as empty.
function get_tags(results)
    local ready = results["tags"]
    if ready ~= nil then
        return ready
    end

    -- Fetch one of the flattened columns, defaulting to an empty table.
    local function column(name)
        return results[name] or {}
    end

    local starts = column("begin")
    local stops = column("end")
    local values = column("timex_value")
    local type_names = column("time_type")
    local texts = column("covered_text")
    local scores = column("factors")
    local model_names = column("model")

    local rebuilt = {}
    for idx, kind in ipairs(column("results")) do
        local tag = {}
        tag["begin"] = starts[idx]
        tag["end"] = stops[idx]
        tag["value"] = values[idx]
        tag["timex_type"] = kind
        tag["time_type"] = type_names[idx]
        tag["covered_text"] = texts[idx]
        tag["score"] = scores[idx]
        tag["model_name"] = model_names[idx]
        rebuilt[idx] = tag
    end

    return rebuilt
end
+ add_annotation_comment(inputCas, annotation, "score", tag["score"]) + add_annotation_comment(inputCas, annotation, "covered_text", tag["covered_text"]) + add_annotation_comment(inputCas, annotation, "model_name", tag["model_name"]) + add_annotation_comment(inputCas, annotation, "timex_type", tag["timex_type"]) + add_annotation_comment(inputCas, annotation, "time_type", tag["time_type"]) + end + end +end diff --git a/duui-TimeDetection/src/main/python/duui_time.py b/duui-TimeDetection/src/main/python/duui_time.py new file mode 100644 index 00000000..0c0441db --- /dev/null +++ b/duui-TimeDetection/src/main/python/duui_time.py @@ -0,0 +1,536 @@ +from __future__ import annotations + +import json +import logging +from functools import lru_cache +from threading import Lock +from time import time +from typing import Any, Dict, Final, Iterable, List, Optional, Tuple + +import torch +from fastapi import FastAPI, Request, Response +from fastapi.responses import JSONResponse, PlainTextResponse +from pydantic import BaseModel + +try: + from pydantic_settings import BaseSettings +except ImportError: # pydantic v1 fallback + from pydantic import BaseSettings # type: ignore + +from time_recognition_backend import MODEL_REGISTRY, create_time_recognizer, resolve_model_name + + +model_lock = Lock() + + +def _string_list(value: str) -> List[str]: + return [item.strip() for item in str(value).split(",") if item.strip()] + + +class UimaSentence(BaseModel): + text: str + begin: int + end: int + + +class UimaSentenceSelection(BaseModel): + selection: str + sentences: List[UimaSentence] + + +class Settings(BaseSettings): + annotator_name: str = "DUUI TimeX3" + annotator_version: str = "0.1.0" + log_level: str = "INFO" + + # Exactly one time recognizer per container. + # Use one registry alias or one exact Hugging Face model id. + # Comma-separated lists and "all" are intentionally rejected. 
class DUUIRequest(BaseModel):
    """Body of a ``POST /v1/process`` request as produced by the Lua serializer.

    Only ``selections`` is required. For ``threshold`` and ``batch_size`` the
    request handler falls back to the service settings when the field is
    ``None``; for the reference time it prefers ``document_creation_time`` and
    uses ``reference_time`` only when the former is empty.
    """

    # Document length reported by the client; not read by the request handler in this file.
    doc_len: Optional[int] = None
    # Document language; when set it must equal the language the service was started for.
    lang: Optional[str] = None
    # Sentence selections to annotate; an empty list is rejected with HTTP 400.
    selections: List[UimaSentenceSelection]

    # Runtime parameters passed through the Lua layer from .withParameter(...).
    # The handler rejects a threshold outside 0.0..1.0 and a batch_size below 1.
    threshold: Optional[float] = None
    batch_size: Optional[int] = None

    # Optional reference date/time for relative temporal expressions.
    # Accepts ISO-like values such as 2026-06-09 or 2026-06-09T00:00:00+02:00.
    document_creation_time: Optional[str] = None
    reference_time: Optional[str] = None

    # Optional service URLs passed via DUUI .withParameter(...).
    # They are relevant only for the duckling and sutime backends.
    # Empty strings are normalised to None by the request handler.
    duckling_url: Optional[str] = None
    duckling_timezone: Optional[str] = None
    corenlp_url: Optional[str] = None
def read_required_text_file(filename: str) -> str:
    """Read a UTF-8 text file that must not be blank.

    Args:
        filename: Path of the file to read.

    Returns:
        The complete file content, unmodified.

    Raises:
        RuntimeError: If the file is empty or contains only whitespace.
        OSError: If the file cannot be opened (propagated from ``open``).
    """
    with open(filename, "r", encoding="utf-8") as handle:
        text = handle.read()

    # Whitespace-only content counts as empty.
    if text.strip():
        return text
    raise RuntimeError(f"Required Lua communication script is empty: {filename}")
"bagci@em.uni-frankfurt.de", + }, + license_info={ + "name": "AGPL", + "url": "http://www.gnu.org/licenses/agpl-3.0.en.html", + }, +) + + +@app.get("/v1/typesystem") +def get_typesystem() -> Response: + return Response(content=type_system, media_type="application/xml") + + +@app.get("/v1/communication_layer", response_class=PlainTextResponse) +def get_communication_layer() -> str: + return lua_communication_script + + +@app.get("/v1/documentation") +def get_documentation() -> TextImagerDocumentation: + selected_model = get_selected_model_name(settings.model_name) + _, selected_cfg = resolve_model_name(selected_model) + return TextImagerDocumentation( + annotator_name=settings.annotator_name, + version=settings.annotator_version, + implementation_lang="Python", + meta={ + "device": device, + "available_models": MODEL_REGISTRY, + "selected_model": selected_model, + "selected_model_id": selected_cfg.get("model_id", selected_model), + "backend_module": "time_recognition_backend.py", + "time_types": TIME_TYPES, + }, + docker_container_id=None, + parameters={ + "model_name": "exactly one registry alias or one exact Hugging Face model id", + "model_lang": "exactly one language per running container instance", + "threshold": settings.threshold, + "batch_size": settings.batch_size, + "document_creation_time": "optional ISO reference date/time for relative expressions", + }, + capability=TextImagerCapability(supported_languages=[settings.model_lang], reproducible=True), + implementation_specific=None, + ) + + +def get_selected_model_name(model_name: str) -> str: + """Return exactly one configured recognizer for this container.""" + selected = (model_name or "").strip() + if not selected: + selected = "microsoft" + + if selected.lower() == "all" or "," in selected: + supported = ", ".join(sorted(MODEL_REGISTRY.keys())) + raise ValueError( + "This DUUI container supports exactly one MODEL_NAME. 
def iter_batches(items: List[UimaSentence], batch_size: int) -> Iterable[List[UimaSentence]]:
    """Yield consecutive slices of ``items`` holding at most ``batch_size`` elements.

    ``batch_size`` is converted with ``int()``; values below 1 are raised to 1,
    so every element is always yielded. An empty ``items`` yields nothing.
    """
    step = int(batch_size)
    if step < 1:
        step = 1

    total = len(items)
    offset = 0
    while offset < total:
        yield items[offset:offset + step]
        offset += step
time_type_out: List[str] = [] + covered_text_out: List[str] = [] + model_out: List[str] = [] + tags: List[Timex3Annotation] = [] + + for s in selection.sentences: + s.text = fix_unicode_problems(s.text) + + for batch in iter_batches(selection.sentences, batch_size): + texts = [s.text for s in batch] + + with model_lock: + recognizer = load_model(model_name, language) + predictions = recognizer.predict( + texts, + language=language, + document_creation_time=document_creation_time, + threshold=threshold, + batch_size=batch_size, + duckling_url=duckling_url, + duckling_timezone=duckling_timezone, + corenlp_url=corenlp_url, + ) + + for sentence, sentence_times in zip(batch, predictions): + for ent in sentence_times: + rel_start = int(ent["start"]) + rel_end = int(ent["end"]) + if rel_end <= rel_start: + continue + + abs_begin = sentence.begin + rel_start + abs_end = sentence.begin + rel_end + timex_type = str(ent.get("timex_type") or ent.get("label") or "UNKNOWN") + timex_value = ent.get("value") + score = float(ent.get("score", 1.0)) + covered = str(ent.get("text", sentence.text[rel_start:rel_end])) + time_type = str(ent.get("time_type") or TIME_TYPES.get(timex_type, TIME_BASE_TYPE)) + entity_model_name = str(ent.get("model_name", model_name)) + + tag = Timex3Annotation( + begin=abs_begin, + end=abs_end, + value=timex_value, + timex_type=timex_type, + time_type=time_type, + covered_text=covered, + score=score, + model_name=entity_model_name, + function_in_document=ent.get("function_in_document"), + temporal_function=ent.get("temporal_function"), + quant=ent.get("quant"), + freq=ent.get("freq"), + ) + + begin.append(abs_begin) + end.append(abs_end) + results_out.append(timex_type) + factors.append(score) + len_results.append(1) + timex_value_out.append(timex_value) + time_type_out.append(time_type) + covered_text_out.append(covered) + model_out.append(entity_model_name) + tags.append(tag) + + return { + "begin": begin, + "end": end, + "results": results_out, + 
"factors": factors, + "len_results": len_results, + "timex_value": timex_value_out, + "time_type": time_type_out, + "covered_text": covered_text_out, + "model": model_out, + "tags": tags, + } + + +def model_meta_values(model_name: str) -> Tuple[str, str]: + _, cfg = resolve_model_name(model_name) + model_source = settings.model_source or cfg.get("model_source", "") + model_lang = settings.model_lang + return model_source, model_lang + + +def _validate_duui_request_payload(payload: Any) -> DUUIRequest: + """Validate DUUI payloads that may arrive as object or JSON-encoded string. + + Some DUUI/Lua combinations send the serialized Lua output as a JSON string, + e.g. "{\"selections\":[...]}" instead of a JSON object. + FastAPI/Pydantic then returns 422 before our code runs. Reading the raw + request body and decoding once or twice makes the endpoint compatible with + both shapes. + """ + if isinstance(payload, (bytes, bytearray)): + payload = payload.decode("utf-8") + + if isinstance(payload, str): + payload = payload.strip() + if not payload: + raise ValueError("Request body is empty.") + payload = json.loads(payload) + + if isinstance(payload, str): + payload = json.loads(payload) + + if not isinstance(payload, dict): + raise ValueError("Request body must be a JSON object.") + + if hasattr(DUUIRequest, "model_validate"): + return DUUIRequest.model_validate(payload) + return DUUIRequest.parse_obj(payload) + + +@app.post("/v1/process", response_model=DUUIResponse) +async def post_process(raw_request: Request): + try: + request = _validate_duui_request_payload(await raw_request.body()) + except Exception as ex: + return JSONResponse(status_code=400, content={"message": f"Invalid request body: {ex}"}) + if not request.selections: + return JSONResponse(status_code=400, content={"message": "The request must contain sentence selections."}) + + try: + language = validate_language(request.lang) + model_name = get_selected_model_name(settings.model_name) + except Exception as 
ex: + return JSONResponse(status_code=400, content={"message": str(ex)}) + + effective_threshold = request.threshold if request.threshold is not None else settings.threshold + effective_batch_size = request.batch_size if request.batch_size is not None else settings.batch_size + reference_time = request.document_creation_time or request.reference_time + duckling_url = request.duckling_url or None + duckling_timezone = request.duckling_timezone or None + corenlp_url = request.corenlp_url or None + + if effective_threshold < 0.0 or effective_threshold > 1.0: + return JSONResponse(status_code=400, content={"message": "threshold must be between 0.0 and 1.0"}) + if effective_batch_size < 1: + return JSONResponse(status_code=400, content={"message": "batch_size must be >= 1"}) + + modification_timestamp_seconds = int(time()) + meta_model_name = model_name + meta = AnnotationMeta( + name=settings.annotator_name, + version=settings.annotator_version, + modelName=meta_model_name, + modelVersion=settings.model_version, + ) + modification_meta = DocumentModification( + user=settings.annotator_name, + timestamp=modification_timestamp_seconds, + comment=f"{settings.annotator_name} ({settings.annotator_version})", + ) + + begin: List[int] = [] + end: List[int] = [] + len_results: List[int] = [] + results: List[str] = [] + factors: List[float] = [] + timex_value: List[Optional[str]] = [] + time_type: List[str] = [] + covered_text: List[str] = [] + model: List[str] = [] + tags: List[Timex3Annotation] = [] + + try: + for selection in request.selections: + processed = process_selection( + model_name=model_name, + selection=selection, + language=language, + threshold=effective_threshold, + batch_size=effective_batch_size, + document_creation_time=reference_time, + duckling_url=duckling_url, + duckling_timezone=duckling_timezone, + corenlp_url=corenlp_url, + ) + begin += processed["begin"] + end += processed["end"] + len_results += processed["len_results"] + results += 
def main() -> None:
    """Load the configured Hugging Face model once, driven by environment variables.

    Reads MODEL_NAME, MODEL_SPECNAME, MODEL_SOURCE, MODEL_LANG and
    MODEL_VERSION, resolves them to a registry entry and, only when that entry
    uses the ``hf_token_classification`` backend, loads its tokenizer and model
    (the loaded objects are discarded). For every other backend a notice is
    printed and nothing is loaded.

    Raises:
        RuntimeError: If a Hugging Face backend is selected without a model id.
    """
    env = os.environ.get
    requested_version = env("MODEL_VERSION", "latest")

    alias, entry = resolve_model_name(env("MODEL_NAME", "microsoft"))
    entry = apply_model_settings(
        entry,
        model_specname=env("MODEL_SPECNAME", ""),
        model_source=env("MODEL_SOURCE", ""),
        model_lang=env("MODEL_LANG", ""),
    )

    backend = entry.get("backend")
    if backend == "hf_token_classification":
        model_id = entry.get("model_id")
        if not model_id:
            raise RuntimeError(
                "MODEL_SPECNAME or a registry model_id is required for Hugging Face backends."
            )

        # "latest" and the empty string both mean "no pinned revision".
        pinned = requested_version if requested_version not in {"", "latest"} else None

        # transformers is imported only on this path.
        from transformers import AutoModelForTokenClassification, AutoTokenizer

        AutoTokenizer.from_pretrained(model_id, use_fast=True, revision=pinned)
        AutoModelForTokenClassification.from_pretrained(model_id, revision=pinned)

        print(f"Preloaded Hugging Face model {model_id} revision={pinned or 'default'} for alias {alias}.")
    else:
        print(f"No build-time model preload needed for backend {backend}.")
class TimeRecognizer(ABC):
    """Common interface of the time-expression backends defined in this module.

    Concrete recognizers fill the metadata attributes below in their
    constructor and implement :meth:`predict`.
    """

    # Alias under which the recognizer was selected.
    model_name: str
    # Concrete model/engine id taken from the registry entry.
    model_id: str
    # Version label; the visible subclasses default it to "latest".
    model_version: str
    # Source URL from the registry entry (may be empty).
    model_source: str
    # Language code of the model, or "multi".
    model_lang: str

    @abstractmethod
    def predict(
        self,
        texts: List[str],
        language: str,
        document_creation_time: Optional[str] = None,
        threshold: float = 0.0,
        batch_size: int = 8,
        duckling_url: Optional[str] = None,
        corenlp_url: Optional[str] = None,
        duckling_timezone: Optional[str] = None,
    ) -> List[List[Dict[str, Any]]]:
        """Detect time expressions in every text.

        Args:
            texts: Sentences to analyse.
            language: Language code used to select the backend locale/culture.
            document_creation_time: Optional reference time; the Microsoft and
                Duckling implementations visible here do not read it.
            threshold: Score threshold; not applied by the Microsoft and
                Duckling implementations (they always report score 1.0).
            batch_size: Batch-size hint; unused by the Microsoft and Duckling
                implementations.
            duckling_url: Base URL of a Duckling server; required by the
                Duckling backend, which raises ``RuntimeError`` without it.
            corenlp_url: Service URL, presumably for the SUTime backend
                (implementation not visible here; confirm).
            duckling_timezone: Time zone sent to Duckling; that backend falls
                back to "Europe/Berlin".

        Returns:
            One list per input text, in input order, holding the expression
            dicts built by ``make_time_expression``. Their ``start``/``end``
            offsets are relative to the respective text.
        """
        raise NotImplementedError
def apply_model_settings(
    cfg: Dict[str, str],
    *,
    model_specname: Optional[str] = None,
    model_source: Optional[str] = None,
    model_lang: Optional[str] = None,
) -> Dict[str, str]:
    """Return a copy of a registry entry with build-time overrides applied.

    ``model_specname`` replaces ``model_id``, ``model_source`` replaces
    ``model_source`` and ``model_lang`` replaces ``model_lang``. Each override
    is stripped first and ignored when it is ``None`` or blank. ``cfg`` itself
    is never modified.
    """
    merged = dict(cfg)

    overrides = (
        ("model_id", model_specname),
        ("model_source", model_source),
        ("model_lang", model_lang),
    )
    for field, raw in overrides:
        cleaned = (raw or "").strip()
        if cleaned:
            merged[field] = cleaned

    return merged
# German word lists consulted by infer_timex_type_from_text (all lower-case).
GERMAN_MONTHS = {
    "januar", "jan", "februar", "feb", "märz", "maerz", "mrz",
    "april", "apr", "mai", "juni", "jun", "juli", "jul",
    "august", "aug", "september", "sep", "oktober", "okt",
    "november", "nov", "dezember", "dez",
}

GERMAN_WEEKDAYS = {
    "montag", "dienstag", "mittwoch", "donnerstag",
    "freitag", "samstag", "sonntag",
}

RELATIVE_DATE_WORDS_DE = {
    "heute", "morgen", "übermorgen", "uebermorgen", "gestern", "vorgestern",
}

SET_WORDS_DE = {
    "jeden", "jede", "jeder", "jedes", "täglich", "taeglich",
    "wöchentlich", "woechentlich", "monatlich", "jährlich", "jaehrlich",
}


def infer_timex_type_from_text(text: str, language: str = "de") -> str:
    """Guess the TIMEX3 type of a German time expression from its surface form.

    The checks run in a fixed order and the first hit wins: recurrence words
    (SET), clock times (TIME), a number combined with a time unit (DURATION),
    relative day words, month names, weekday names, numeric dates and bare
    four-digit years (all DATE).

    Args:
        text: Covered text of the expression; case and outer whitespace are ignored.
        language: Accepted for interface compatibility. It is currently not
            evaluated; only German word lists and patterns are used.

    Returns:
        "SET", "TIME", "DURATION", "DATE" or "UNKNOWN".
    """
    normalized = text.strip().lower()

    if not normalized:
        return "UNKNOWN"

    tokens = re.findall(r"\w+", normalized, flags=re.UNICODE)
    token_set = set(tokens)

    if token_set & SET_WORDS_DE:
        return "SET"

    # "14 uhr", "14:30"
    if re.search(r"\b\d{1,2}\s*(uhr|:\d{2})\b", normalized):
        return "TIME"

    # A unit word only counts as a duration together with a number. The plural
    # "sekunden" is accepted like the plurals of the other units.
    if re.search(r"\b(sekunden?|minuten?|stunden?|tage?|wochen?|monate?|jahre?)\b", normalized):
        if re.search(r"\b\d+\b", normalized):
            return "DURATION"

    if token_set & RELATIVE_DATE_WORDS_DE:
        return "DATE"

    if token_set & GERMAN_MONTHS:
        return "DATE"

    if token_set & GERMAN_WEEKDAYS:
        return "DATE"

    # "12.3.", "12. 03. 2024"
    if re.search(r"\b\d{1,2}\.\s*\d{1,2}\.?\s*(\d{2,4})?\b", normalized):
        return "DATE"

    # A bare four-digit number is read as a year.
    if re.fullmatch(r"\d{4}", normalized):
        return "DATE"

    return "UNKNOWN"
def normalize_resolution_type(value_type: Optional[str]) -> str:
    """Map a recognizer resolution ``type`` value to a TIMEX3 type.

    Matching ignores case and surrounding whitespace. Date-like types map to
    "DATE"; time-of-day and date-time types, including their ranges, map to
    "TIME". ``timerange`` is mapped to "TIME" like the other time ranges.

    Args:
        value_type: Resolution type such as "date", "timerange" or "duration";
            ``None`` and empty values are allowed.

    Returns:
        "DATE", "TIME", "DURATION", "SET" or "UNKNOWN" for anything else.
    """
    if not value_type:
        return "UNKNOWN"

    key = str(value_type).strip().lower()

    if key in {"date", "daterange", "dateperiod"}:
        return "DATE"
    if key in {"time", "timerange", "timeperiod", "datetime", "datetimerange"}:
        return "TIME"
    if key == "duration":
        return "DURATION"
    if key == "set":
        return "SET"

    return "UNKNOWN"
class MicrosoftRecognizer(TimeRecognizer):
    """Time recognizer backed by ``recognizers_date_time.recognize_datetime``."""

    def __init__(self, alias: str, cfg: Dict[str, str], model_version: str = ""):
        """Bind the recognizer function and copy the model metadata from ``cfg``.

        Args:
            alias: Name stored as ``model_name`` on every produced item.
            cfg: Model configuration; ``model_id`` is required, ``model_source``
                and ``model_lang`` are optional.
            model_version: Version label; an empty value becomes ``"latest"``.

        Raises:
            RuntimeError: If ``recognizers_date_time`` cannot be imported.
        """
        try:
            from recognizers_date_time import recognize_datetime
        except ImportError as exc:
            raise RuntimeError("Install recognizers-text-suite to use the Microsoft backend.") from exc

        self.recognize_datetime = recognize_datetime
        self.model_name = alias
        self.model_id = cfg["model_id"]
        self.model_version = model_version or "latest"
        self.model_source = cfg.get("model_source", "")
        self.model_lang = cfg.get("model_lang", "multi")

    def predict(
        self,
        texts: List[str],
        language: str,
        document_creation_time: Optional[str] = None,
        threshold: float = 0.0,
        batch_size: int = 8,
        duckling_url: Optional[str] = None,
        corenlp_url: Optional[str] = None,
        duckling_timezone: Optional[str] = None,
    ) -> List[List[Dict[str, Any]]]:
        """Recognise time expressions in every text.

        Only ``texts`` and ``language`` are used by this backend; the remaining
        parameters are accepted and ignored.

        Returns:
            One list of time-expression dicts per input text, in input order.
        """
        culture = LANGUAGE_TO_CULTURE.get(language.lower(), language)
        outputs: List[List[Dict[str, Any]]] = []

        for sentence in texts:
            items: List[Dict[str, Any]] = []
            for result in self.recognize_datetime(sentence, culture):
                result_text = getattr(result, "text", "") or ""
                start = int(getattr(result, "start", 0))
                if result_text:
                    # Prefer the matched text length to derive the exclusive end.
                    end = start + len(result_text)
                else:
                    # NOTE(review): the +1 presumes the library reports an
                    # inclusive end offset - confirm against the library docs.
                    end = int(getattr(result, "end", start)) + 1
                resolution = getattr(result, "resolution", {}) or {}
                type_name = getattr(result, "type_name", "") or ""
                item = make_time_expression(
                    sentence=sentence,
                    start=start,
                    end=end,
                    timex_type=self._map_type(type_name, resolution),
                    value=self._extract_value(resolution),
                    score=1.0,
                    model_name=self.model_name,
                    model_id=self.model_id,
                    model_version=self.model_version,
                    raw={"type_name": type_name, "resolution": resolution},
                )
                if item is not None:
                    items.append(item)
            outputs.append(deduplicate_and_prefer_longest(items))
        return outputs

    @staticmethod
    def _extract_value(resolution: Dict[str, Any]) -> Optional[str]:
        """Return the first truthy of ``timex``/``value``/``start``/``end`` from
        the first resolution value, or ``None`` when there is no usable entry."""
        values = resolution.get("values") or []
        if not values or not isinstance(values[0], dict):
            return None
        first = values[0]
        return first.get("timex") or first.get("value") or first.get("start") or first.get("end")

    @staticmethod
    def _map_type(type_name: str, resolution: Dict[str, Any]) -> str:
        """Derive the TIMEX3 type of a result.

        The ``type`` of the first resolution value is preferred; the result's
        ``type_name`` is only a fallback.

        Returns:
            ``"DATE"``, ``"TIME"``, ``"DURATION"``, ``"SET"`` or ``"UNKNOWN"``.
        """
        values = resolution.get("values") or []

        # The resolution value often carries the most reliable type,
        # e.g. {'type': 'daterange'} => DATE.
        if values and isinstance(values[0], dict):
            from_resolution = normalize_resolution_type(values[0].get("type"))
            if from_resolution != "UNKNOWN":
                return from_resolution

        # Classify on the subtype after the last dot only. With a prefixed name
        # such as "datetimeV2.date" a substring test on the whole name always
        # hits "datetime", which made every fallback TIME and the "date"
        # branch unreachable.
        name = type_name.lower().rsplit(".", 1)[-1]

        if "duration" in name:
            return "DURATION"
        if "set" in name:
            return "SET"
        if "daterange" in name or "dateperiod" in name:
            return "DATE"
        if "datetimerange" in name:
            return "TIME"
        if "datetime" in name:
            return "TIME"
        if "time" in name:
            return "TIME"
        if "date" in name:
            return "DATE"

        return "UNKNOWN"
cfg.get("model_source", "") + self.model_lang = cfg.get("model_lang", "multi") + self.default_url = "" + + def predict( + self, + texts: List[str], + language: str, + document_creation_time: Optional[str] = None, + threshold: float = 0.0, + batch_size: int = 8, + duckling_url: Optional[str] = None, + corenlp_url: Optional[str] = None, + duckling_timezone: Optional[str] = None, + ) -> List[List[Dict[str, Any]]]: + locale = LANGUAGE_TO_DUCKLING_LOCALE.get(language.lower(), "en_US") + url = (duckling_url or self.default_url).rstrip("/") + if not url: + raise RuntimeError( + "duckling_url is required for the duckling backend. " + "Pass it with DUUI .withParameter('duckling_url', 'http://duckling:8000')." + ) + timezone_name = duckling_timezone or "Europe/Berlin" + outputs: List[List[Dict[str, Any]]] = [] + + for sentence in texts: + response = requests.post( + f"{url}/parse", + data={ + "text": sentence, + "locale": locale, + "tz": timezone_name, + "dims": json.dumps(["time", "duration"]), + }, + timeout=30, + ) + response.raise_for_status() + items: List[Dict[str, Any]] = [] + for raw in response.json(): + dim = raw.get("dim") + if dim not in {"time", "duration"}: + continue + value = raw.get("value") or {} + item = make_time_expression( + sentence=sentence, + start=raw.get("start", 0), + end=raw.get("end", 0), + timex_type=self._map_type(dim, value), + value=self._extract_value(value), + score=1.0, + model_name=self.model_name, + model_id=self.model_id, + model_version=self.model_version, + raw=raw, + ) + if item is not None: + items.append(item) + outputs.append(deduplicate_and_prefer_longest(items)) + return outputs + + @staticmethod + def _map_type(dimension: str, value: Dict[str, Any]) -> str: + if dimension == "duration": + return "DURATION" + if value.get("type") == "interval": + return "DATE" + if value.get("grain") in {"second", "minute", "hour"}: + return "TIME" + return "DATE" + + @staticmethod + def _extract_value(value: Dict[str, Any]) -> Optional[str]: + 
class SutimeRecognizer(TimeRecognizer):
    """Time recognizer that posts each text to a CoreNLP HTTP endpoint and keeps
    the DATE/TIME/DURATION/SET entity mentions found in the JSON reply."""

    def __init__(self, alias: str, cfg: Dict[str, str], model_version: str = ""):
        """Copy the model metadata from ``cfg``; ``model_id`` is required."""
        self.model_name = alias
        self.model_id = cfg["model_id"]
        self.model_version = model_version if model_version else "latest"
        self.model_source = cfg.get("model_source", "")
        self.model_lang = cfg.get("model_lang", "multi")
        self.default_url = ""

    def predict(
        self,
        texts: List[str],
        language: str,
        document_creation_time: Optional[str] = None,
        threshold: float = 0.0,
        batch_size: int = 8,
        duckling_url: Optional[str] = None,
        corenlp_url: Optional[str] = None,
        duckling_timezone: Optional[str] = None,
    ) -> List[List[Dict[str, Any]]]:
        """Annotate every text through the CoreNLP endpoint.

        Uses ``texts``, ``language``, ``document_creation_time`` and
        ``corenlp_url``; the other parameters are accepted and ignored.

        Returns:
            One list of time-expression dicts per input text, in input order.

        Raises:
            RuntimeError: If neither ``corenlp_url`` nor ``default_url`` is set.
        """
        endpoint = (corenlp_url or self.default_url).rstrip("/")
        if not endpoint:
            raise RuntimeError(
                "corenlp_url is required for the sutime backend. "
                "Pass it with DUUI .withParameter('corenlp_url', 'http://corenlp:9000')."
            )

        temporal_labels = {"DATE", "TIME", "DURATION", "SET"}
        per_text: List[List[Dict[str, Any]]] = []

        for text in texts:
            props: Dict[str, Any] = {
                "annotators": "tokenize,ssplit,pos,lemma,ner",
                "outputFormat": "json",
            }
            if document_creation_time:
                # Only the first ten characters of the creation time are sent.
                props["sutime.referenceDate"] = document_creation_time[:10]
            if language.lower() == "de":
                props["tokenize.language"] = "de"

            reply = requests.post(
                endpoint,
                params={"properties": json.dumps(props)},
                data=text.encode("utf-8"),
                headers={"Content-Type": "text/plain; charset=utf-8"},
                timeout=60,
            )
            reply.raise_for_status()
            payload = reply.json()

            found: List[Dict[str, Any]] = []
            for parsed_sentence in payload.get("sentences", []):
                for mention in parsed_sentence.get("entitymentions", []):
                    label = mention.get("ner")
                    if label in temporal_labels:
                        timex = mention.get("timex") or {}
                        expression = make_time_expression(
                            sentence=text,
                            start=mention.get("characterOffsetBegin", 0),
                            end=mention.get("characterOffsetEnd", 0),
                            timex_type=timex.get("type") or label,
                            value=timex.get("value") or mention.get("normalizedNER"),
                            score=1.0,
                            model_name=self.model_name,
                            model_id=self.model_id,
                            model_version=self.model_version,
                            raw=mention,
                        )
                        if expression is not None:
                            found.append(expression)
            per_text.append(deduplicate_and_prefer_longest(found))

        return per_text
+ ) + + self.model_name = alias + self.model_id = model_id + self.model_version = model_version or "latest" + self.model_source = cfg.get("model_source") or f"https://huggingface.co/{model_id}" + self.model_lang = cfg.get("model_lang", "multi") + + import torch + + self.device = device + revision = None if self.model_version in {"", "latest"} else self.model_version + self.model = AutoModelForTokenClassification.from_pretrained(model_id, revision=revision) + self.tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True, revision=revision) + self.model.to(device) + self.model.eval() + pipe_device = 0 if str(device).startswith("cuda") else -1 + self.pipeline = pipeline( + "token-classification", + model=self.model, + tokenizer=self.tokenizer, + aggregation_strategy="simple", + device=pipe_device, + ) + self.torch = torch + + def predict( + self, + texts: List[str], + language: str, + document_creation_time: Optional[str] = None, + threshold: float = 0.0, + batch_size: int = 8, + duckling_url: Optional[str] = None, + corenlp_url: Optional[str] = None, + duckling_timezone: Optional[str] = None, + ) -> List[List[Dict[str, Any]]]: + with self.torch.no_grad(): + outputs = self.pipeline(texts, batch_size=batch_size) + + sentence_outputs = normalize_sentence_outputs(outputs, len(texts)) + result: List[List[Dict[str, Any]]] = [] + + for sentence, raw_items in zip(texts, sentence_outputs): + items: List[Dict[str, Any]] = [] + for raw in flatten_raw_items(raw_items): + score = float(raw.get("score", 1.0) or 0.0) + if score < threshold: + continue + label = raw.get("entity_group") or raw.get("entity") or raw.get("label") + timex_type = normalize_timex_type(str(label)) + if timex_type == "UNKNOWN": + continue + item = make_time_expression( + sentence=sentence, + start=raw.get("start", 0), + end=raw.get("end", 0), + timex_type=timex_type, + value=None, + score=score, + model_name=self.model_name, + model_id=self.model_id, + model_version=self.model_version, + raw=raw, + 
) + if item is not None: + items.append(item) + result.append(deduplicate_and_prefer_longest(items)) + return result + + +class SpacyTimeRecognizer(TimeRecognizer): + def __init__(self, alias: str, cfg: Dict[str, str], language: str, model_version: str = ""): + import importlib + import spacy + + self.model_name = alias + self.model_id = cfg["model_id"] + self.model_version = model_version or "latest" + self.model_source = cfg.get("model_source", "") + self.model_lang = cfg.get("model_lang", "multi") + + backend = cfg["backend"] + if backend == "spacy_tei2go": + model_name = os.getenv( + "TEI2GO_MODEL", + LANGUAGE_TO_TEI2GO_MODEL.get(language.lower(), "de_tei2go"), + ) + else: + model_name = os.getenv( + "SPACY_MODEL", + "de_core_news_sm" if language.lower() == "de" else "en_core_web_sm", + ) + + self.nlp = self._load_spacy_model( + importlib=importlib, + spacy=spacy, + model_name=model_name, + backend=backend, + language=language, + ) + + if backend == "spacy_timexy" and "timexy" not in self.nlp.pipe_names: + try: + from timexy import Timexy # noqa: F401 + except ImportError as exc: + raise RuntimeError( + "Install timexy to use the Timexy backend: python -m pip install timexy==0.1.3" + ) from exc + + timexy_config = { + "kb_id_type": "timex3", + "label": "TIMEX", + "overwrite": False, + } + + if "ner" in self.nlp.pipe_names: + self.nlp.add_pipe("timexy", config=timexy_config, before="ner") + else: + self.nlp.add_pipe("timexy", config=timexy_config) + + @staticmethod + def _load_spacy_model(importlib: Any, spacy: Any, model_name: str, backend: str, language: str) -> Any: + try: + module = importlib.import_module(model_name) + return module.load() if hasattr(module, "load") else spacy.load(model_name) + + except ModuleNotFoundError as exc: + if exc.name != model_name: + raise RuntimeError( + f"spaCy model '{model_name}' could not be imported because dependency " + f"{exc.name!r} is missing." 
+ ) from exc + + try: + return spacy.load(model_name) + except OSError as load_exc: + raise RuntimeError( + SpacyTimeRecognizer._missing_spacy_model_message( + backend=backend, + language=language, + model_name=model_name, + ) + ) from load_exc + + except ImportError as exc: + raise RuntimeError( + f"spaCy model '{model_name}' could not be imported: {exc}" + ) from exc + + except OSError as exc: + raise RuntimeError( + SpacyTimeRecognizer._missing_spacy_model_message( + backend=backend, + language=language, + model_name=model_name, + ) + ) from exc + + @staticmethod + def _missing_spacy_model_message(backend: str, language: str, model_name: str) -> str: + if backend == "spacy_tei2go": + wheel_url = ( + f"https://huggingface.co/hugosousa/{model_name}/resolve/main/" + f"{model_name}-any-py3-none-any.whl" + ) + local_wheel = f"/tmp/{model_name}-0.0.0-py3-none-any.whl" + + return ( + f"TEI2GO spaCy model '{model_name}' is not installed for language " + f"'{language}'. Install it with:\n" + f" curl -L -o {local_wheel} {wheel_url}\n" + f" python -m pip install --no-deps {local_wheel}\n" + "Or set TEI2GO_MODEL to an installed spaCy model package/path." + ) + + return ( + f"spaCy model '{model_name}' is not installed for language '{language}'. " + f"Install it with:\n" + f" python -m spacy download {model_name}\n" + "Or set SPACY_MODEL to an installed spaCy model package/path." + ) + + def predict( + self, + texts: List[str], + language: str, + document_creation_time: Optional[str] = None, + threshold: float = 0.0, + batch_size: int = 8, + duckling_url: Optional[str] = None, + corenlp_url: Optional[str] = None, + duckling_timezone: Optional[str] = None, + ) -> List[List[Dict[str, Any]]]: + outputs: List[List[Dict[str, Any]]] = [] + + # Wichtig: explizit über die Eingabetexte iterieren. + # So bleibt len(outputs) immer gleich len(texts). 
def normalize_sentence_outputs(outputs: Any, text_count: int) -> List[Any]:
    """Coerce raw pipeline output into a list with one entry per input text.

    A single dict is wrapped in a list. For ``text_count == 1`` a non-empty
    list made only of dicts is treated as the result for that one text and
    wrapped as well. Anything that is neither a dict nor a list counts as
    empty. The list is then padded with ``None`` or cut to ``text_count``.
    """
    if isinstance(outputs, dict):
        per_sentence: List[Any] = [outputs]
    elif isinstance(outputs, list):
        is_flat_single_result = (
            text_count == 1
            and bool(outputs)
            and all(isinstance(entry, dict) for entry in outputs)
        )
        per_sentence = [outputs] if is_flat_single_result else outputs
    else:
        per_sentence = []

    shortfall = text_count - len(per_sentence)
    if shortfall > 0:
        return per_sentence + [None] * shortfall
    if shortfall < 0:
        return per_sentence[:text_count]
    return per_sentence
def iter_spacy_temporal_spans(doc: Any) -> Iterable[Any]:
    """Yield candidate spans from a parsed document.

    Every element of ``doc.ents`` is yielded first, then the members of each
    ``doc.spans`` group whose name contains "time" or "timex"
    (case-insensitive). A missing attribute is treated as empty.
    """
    yield from getattr(doc, "ents", [])

    for group_name, group in getattr(doc, "spans", {}).items():
        key = str(group_name).lower()
        if "time" in key or "timex" in key:
            yield from group
def predict_time(
    model_name: str,
    texts: List[str],
    language: str = "de",
    document_creation_time: Optional[str] = None,
    device: str = "cpu",
    threshold: float = 0.0,
    batch_size: int = 8,
    model_version: str = "latest",
    model_specname: Optional[str] = None,
    model_source: Optional[str] = None,
    model_lang: Optional[str] = None,
    duckling_url: Optional[str] = None,
    corenlp_url: Optional[str] = None,
    duckling_timezone: Optional[str] = None,
) -> List[List[Dict[str, Any]]]:
    """Build a recognizer for ``model_name`` and run it on ``texts``.

    The model-related arguments go to ``create_time_recognizer``; the remaining
    arguments are forwarded to the recognizer's ``predict`` call, whose result
    is returned unchanged. A new recognizer is created on every call.
    """
    factory_options = {
        "language": language,
        "device": device,
        "model_version": model_version,
        "model_specname": model_specname,
        "model_source": model_source,
        "model_lang": model_lang,
    }
    run_options = {
        "language": language,
        "document_creation_time": document_creation_time,
        "threshold": threshold,
        "batch_size": batch_size,
        "duckling_url": duckling_url,
        "corenlp_url": corenlp_url,
        "duckling_timezone": duckling_timezone,
    }

    recognizer = create_time_recognizer(model_name, **factory_options)
    return recognizer.predict(texts, **run_options)
Mai 2024 statt.", + ] + + examples_en = [ + "We will meet tomorrow.", + "We will meet tomorrow at 2 pm.", + "We will meet tomorrow at 2 pm and then every Monday.", + "The meeting took place from May 3 to May 5, 2024.", + ] + + tests = [ + { + "name": "microsoft", + "examples": examples_de, + "language": "de", + "kwargs": {}, + }, + { + "name": "tei2go", + "examples": examples_de, + "language": "de", + "kwargs": {}, + }, + { + "name": "timexy", + "examples": examples_de, + "language": "de", + "kwargs": {}, + }, + { + "name": "german-gelectra", + "examples": examples_de, + "language": "de", + "kwargs": {}, + }, + { + "name": "bert-got-a-date", + "examples": examples_en, + "language": "en", + "kwargs": { + "model_specname": "satyaalmasian/temporal_tagger_BERT_tokenclassifier", + "model_source": "https://github.com/satya77/Transformer_Temporal_Tagger", + "model_lang": "en", + }, + }, + { + "name": "hf-token-classification", + "examples": examples_en, + "language": "en", + "kwargs": { + "model_specname": "satyaalmasian/temporal_tagger_BERT_tokenclassifier", + "model_source": "https://huggingface.co/satyaalmasian/temporal_tagger_BERT_tokenclassifier", + "model_lang": "en", + }, + }, + { + "name": "duckling", + "examples": examples_de, + "language": "de", + "kwargs": {}, + }, + { + "name": "sutime", + "examples": examples_de, + "language": "de", + "kwargs": {}, + }, + ] + + print("TIME_TYPE_MAP:") + for key, value in TIME_TYPE_MAP.items(): + print(f" {key:8} -> {value}") + + print("\n" + "=" * 80) + print("Running backend tests") + + for test in tests: + model_name = test["name"] + examples = test["examples"] + language = test["language"] + kwargs = test["kwargs"] + + print("\n" + "=" * 80) + print(f"Model: {model_name}") + print(f"Language: {language}") + print(f"Input length: {len(examples)}") + + try: + result = predict_time( + model_name, + examples, + language=language, + document_creation_time="2026-06-09", + device="cpu", + threshold=0.0, + batch_size=4, + 
**kwargs, + ) + + print(f"Output length: {len(result)}") + + for i, (text, items) in enumerate(zip(examples, result), start=1): + print("\n" + "-" * 80) + print(f"[{i}] Input: {text}") + + if not items: + print(" -> []") + continue + + for j, item in enumerate(items, start=1): + raw = item.get("raw") or {} + + raw_model_type = ( + raw.get("type_name") + or raw.get("label") + or raw.get("dim") + or raw.get("entity_group") + or raw.get("entity") + or raw.get("ner") + or None + ) + + # For Microsoft/Duckling/SUTime-like raw dicts, try to expose nested type too. + raw_resolution_type = None + resolution = raw.get("resolution") + if isinstance(resolution, dict): + values = resolution.get("values") or [] + if values and isinstance(values[0], dict): + raw_resolution_type = values[0].get("type") + + duckling_value = raw.get("value") + if isinstance(duckling_value, dict): + raw_resolution_type = duckling_value.get("type") or duckling_value.get("grain") + + print(f" Hit {j}:") + print(f" text: {item.get('text')!r}") + print(f" model_raw_type: {raw_model_type!r}") + print(f" model_value_type: {raw_resolution_type!r}") + print(f" normalized_type: {item.get('timex_type')!r}") + print(f" mapped_uima_type: {item.get('time_type')!r}") + print(f" value: {item.get('value')!r}") + print(f" score: {item.get('score')!r}") + print(f" span: ({item.get('start')}, {item.get('end')})") + + except Exception as exc: + print(f"FAILED: {type(exc).__name__}: {exc}") diff --git a/duui-TimeDetection/src/test/java/org/hucompute/textimager/uima/TimeDetection/TimeTest.java b/duui-TimeDetection/src/test/java/org/hucompute/textimager/uima/TimeDetection/TimeTest.java new file mode 100644 index 00000000..00773db7 --- /dev/null +++ b/duui-TimeDetection/src/test/java/org/hucompute/textimager/uima/TimeDetection/TimeTest.java @@ -0,0 +1,384 @@ +package org.hucompute.textimager.uima.TimeDetection; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import 
org.apache.commons.compress.compressors.CompressorException; +import org.apache.uima.UIMAException; +import org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.apache.uima.util.XmlCasSerializer; +import org.junit.jupiter.api.*; +import org.texttechnologylab.DockerUnifiedUIMAInterface.DUUIComposer; +import org.texttechnologylab.DockerUnifiedUIMAInterface.driver.DUUIRemoteDriver; +import org.texttechnologylab.DockerUnifiedUIMAInterface.lua.DUUILuaContext; +import org.texttechnologylab.annotation.semaf.isotimeml.TimeX3; +import org.xml.sax.SAXException; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.net.HttpURLConnection; +import java.net.URISyntaxException; +import java.net.URL; +import java.net.UnknownHostException; +import java.nio.charset.StandardCharsets; +import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import static org.junit.jupiter.api.Assertions.*; + +public class TimeTest { + static DUUIComposer composer; + static JCas cas; + + static String url = "http://127.0.0.1:9714"; + + private static final String DOCUMENT_CREATION_TIME = getenvOrDefault("DOCUMENT_CREATION_TIME", "2026-06-09"); + + @BeforeAll + static void beforeAll() throws URISyntaxException, IOException, UIMAException, SAXException, CompressorException { + composer = new DUUIComposer() + .withSkipVerification(true) + .withLuaContext(new DUUILuaContext().withJsonLibrary()); + + DUUIRemoteDriver remoteDriver = new DUUIRemoteDriver(); + composer.addDriver(remoteDriver); + + cas = JCasFactory.createJCas(); + } + + @AfterAll + static void afterAll() throws UnknownHostException { + composer.shutdown(); + } + + @AfterEach + public void afterEach() throws IOException, SAXException { + composer.resetPipeline(); + + ByteArrayOutputStream stream = new ByteArrayOutputStream(); + XmlCasSerializer.serialize(cas.getCas(), null, 
stream); + System.out.println(stream.toString(StandardCharsets.UTF_8)); + + cas.reset(); + } + + @Test + public void MicrosoftTimeTest() throws Exception { + runModelTest( + "microsoft", + "de", + germanBioFidSentences(), + Collections.emptyMap() + ); + } + + @Test + public void Tei2GoTimeTest() throws Exception { + runModelTest( + "tei2go", + "de", + germanBioFidSentences(), + Collections.emptyMap() + ); + } + + @Test + public void TimexyTimeTest() throws Exception { + runModelTest( + "timexy", + "de", + germanBioFidSentences(), + Collections.emptyMap() + ); + } + + @Test + public void GermanGelectraTimeTest() throws Exception { + runModelTest( + "german-gelectra", + "de", + germanBioFidSentences(), + Collections.emptyMap() + ); + } + + @Test + public void BertGotADateTimeTest() throws Exception { + runModelTest( + "bert-got-a-date", + "en", + englishSentences(), + Collections.emptyMap() + ); + } + + @Test + public void DucklingTimeTest() throws Exception { + Map parameters = new LinkedHashMap<>(); + parameters.put("duckling_url", getenvOrDefault("DUCKLING_URL", "http://127.0.0.1:8000")); + parameters.put("duckling_timezone", getenvOrDefault("DUCKLING_TIMEZONE", "Europe/Berlin")); + + runModelTest( + "duckling", + "de", + germanBioFidSentences(), + parameters + ); + } + + @Test + public void SutimeTimeTest() throws Exception { + Map parameters = new LinkedHashMap<>(); + parameters.put("corenlp_url", getenvOrDefault("CORENLP_URL", "http://127.0.0.1:9000")); + + runModelTest( + "sutime", + "de", + germanBioFidSentences(), + parameters + ); + } + + @Test + public void GenericHfTokenClassificationTimeTest() throws Exception { + runModelTest( + "hf-token-classification", + getenvOrDefault("HF_TOKEN_CLASSIFICATION_LANG", "de"), + germanBioFidSentences(), + Collections.emptyMap() + ); + } + + private void runModelTest( + String expectedModel, + String language, + List sentences, + Map extraParameters + ) throws Exception { + Assumptions.assumeTrue( + 
serviceAvailable(url), + "Skipping " + expectedModel + " because no DUUI Time service is reachable at " + url + ); + + String runningModel = getRunningModel(); + Assumptions.assumeTrue( + expectedModel.equals(runningModel), + "Skipping " + expectedModel + " because running model is " + runningModel + " at " + url + ); + + composer.add(createComponent(extraParameters)); + + createCasFromSentences(language, sentences); + + System.out.println("============================================================"); + System.out.println("Expected model: " + expectedModel); + System.out.println("Running model: " + runningModel); + System.out.println("URL: " + url); + System.out.println("Language: " + language); + System.out.println("Input document:"); + System.out.println(cas.getDocumentText()); + + composer.run(cas); + + Collection timeAnnotations = JCasUtil.select(cas, TimeX3.class); + Map> result = extractTimeResult(); + + for (TimeX3 timeAnnotation : timeAnnotations) { + String coveredText = timeAnnotation.getCoveredText(); + int begin = timeAnnotation.getBegin(); + int end = timeAnnotation.getEnd(); + String value = timeAnnotation.getValue(); + String typeName = timeAnnotation.getType().getName(); + + result.get("token").add(coveredText); + result.get("begin").add(begin); + result.get("end").add(end); + result.get("value").add(value); + result.get("type").add(typeName); + + System.out.println( + "TIMEX3 [" + runningModel + "]: '" + coveredText + "'" + + " (begin=" + begin + ", end=" + end + ")" + + " value='" + value + "'" + + " type='" + typeName + "'" + ); + + assertTrue(begin >= 0, "TimeX3 begin offset must be non-negative"); + assertTrue(end > begin, "TimeX3 end offset must be greater than begin offset"); + assertFalse(coveredText.isBlank(), "TimeX3 covered text must not be blank"); + + assertTrue( + typeName.equals("org.texttechnologylab.annotation.semaf.isotimeml.TimeX3") + || typeName.equals("org.texttechnologylab.annotation.semaf.isotimeml.time.Date") + || 
typeName.equals("org.texttechnologylab.annotation.semaf.isotimeml.time.Time") + || typeName.equals("org.texttechnologylab.annotation.semaf.isotimeml.time.Duration") + || typeName.equals("org.texttechnologylab.annotation.semaf.isotimeml.time.Set"), + "Unexpected TimeX3 type: " + typeName + ); + } + + assertFalse( + timeAnnotations.isEmpty(), + "The DUUI TimeX3 component should create at least one TimeX3 annotation for model " + runningModel + ); + } + + private DUUIRemoteDriver.Component createComponent(Map extraParameters) throws URISyntaxException, IOException { + DUUIRemoteDriver.Component component = new DUUIRemoteDriver.Component(url) + .withParameter( + "selection", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" + ) + .withParameter("document_creation_time", DOCUMENT_CREATION_TIME) + .withParameter("threshold", getenvOrDefault("TIME_THRESHOLD", "0.0")) + .withParameter("batch_size", getenvOrDefault("TIME_BATCH_SIZE", "8")); + + for (Map.Entry parameter : extraParameters.entrySet()) { + if (parameter.getValue() != null && !parameter.getValue().isBlank()) { + component = component.withParameter(parameter.getKey(), parameter.getValue()); + } + } + + return component; + } + + public void createCasFromSentences(String language, List sentences) { + cas.setDocumentLanguage(language); + + StringBuilder documentText = new StringBuilder(); + int offset = 0; + + for (int i = 0; i < sentences.size(); i++) { + String sentenceText = sentences.get(i); + + if (i > 0) { + documentText.append("\n"); + offset += 1; + } + + int begin = offset; + int end = begin + sentenceText.length(); + + documentText.append(sentenceText); + + Sentence sentence = new Sentence(cas, begin, end); + sentence.addToIndexes(); + + offset = end; + } + + cas.setDocumentText(documentText.toString()); + } + + private Map> extractTimeResult() { + Map> result = new LinkedHashMap<>(); + + result.put("begin", new ArrayList<>()); + result.put("end", new ArrayList<>()); + result.put("token", 
new ArrayList<>()); + result.put("value", new ArrayList<>()); + result.put("type", new ArrayList<>()); + + return result; + } + + private static List germanBioFidSentences() { + return Arrays.asList( + "Dr. Anna Weber begann die Exkursion am 12. Mai 2024 um 08:30 Uhr in Frankfurt am Main.", + "Bereits am frühen Morgen wurden die ersten Proben genommen.", + "Nach zwei Stunden erreichte das Team eine Streuobstwiese am Rand des Taunus.", + "Dort wurden zwischen 10:15 Uhr und 11:45 Uhr mehrere Insekten beobachtet.", + "Am Nachmittag dokumentierte BioFID weitere Funde im Labor.", + "Die zweite Untersuchung fand am folgenden Montag statt.", + "Jeden Dienstag wurden die Temperaturwerte erneut kontrolliert.", + "Im Sommer 2024 verglich das Team die Daten mit älteren Beobachtungen.", + "Vom 1. Juni bis zum 3. Juni wurden zusätzliche Bodenproben gesammelt.", + "Die Auswertung dauerte drei Wochen und wurde gestern abgeschlossen.", + "Morgen um 14 Uhr soll ein weiteres Treffen stattfinden.", + "In zwei Monaten sollen die Ergebnisse erneut überprüft werden.", + "Seit 2021 werden die Funde regelmäßig in einer Datenbank gespeichert.", + "Vor drei Jahren begann das Projekt mit einer Pilotstudie am Bodensee." + ); + } + + private static List englishSentences() { + return Arrays.asList( + "Dr. Anna Weber started the field trip on May 12, 2024 at 08:30 in Frankfurt am Main.", + "After two hours the team reached an orchard near the Taunus.", + "Several insects were observed between 10:15 and 11:45.", + "The second investigation took place the following Monday.", + "Every Tuesday the temperature values were checked again.", + "The evaluation lasted three weeks and was completed yesterday.", + "Tomorrow at 2 pm another meeting is scheduled.", + "In two months the results will be reviewed again.", + "Since 2021 the findings have been stored regularly in a database.", + "Three years ago the project began with a pilot study at Lake Constance." 
+ ); + } + + private static boolean serviceAvailable(String serviceUrl) { + try { + URL endpoint = new URL(serviceUrl + "/v1/communication_layer"); + HttpURLConnection connection = (HttpURLConnection) endpoint.openConnection(); + connection.setConnectTimeout(1000); + connection.setReadTimeout(1000); + connection.setRequestMethod("GET"); + + int responseCode = connection.getResponseCode(); + connection.disconnect(); + + return responseCode >= 200 && responseCode < 300; + } catch (Exception ignored) { + return false; + } + } + + private static String getRunningModel() { + try { + String documentation = httpGet(url + "/v1/documentation"); + + Pattern selectedModelPattern = Pattern.compile("\"selected_model\"\\s*:\\s*\"([^\"]+)\""); + Matcher selectedModelMatcher = selectedModelPattern.matcher(documentation); + if (selectedModelMatcher.find()) { + return selectedModelMatcher.group(1); + } + + Pattern modelNamePattern = Pattern.compile("\"model_name\"\\s*:\\s*\"([^\"]+)\""); + Matcher modelNameMatcher = modelNamePattern.matcher(documentation); + if (modelNameMatcher.find()) { + return modelNameMatcher.group(1); + } + + return "unknown"; + } catch (Exception ex) { + return "unknown"; + } + } + + private static String httpGet(String requestUrl) throws IOException { + URL endpoint = new URL(requestUrl); + HttpURLConnection connection = (HttpURLConnection) endpoint.openConnection(); + connection.setConnectTimeout(1000); + connection.setReadTimeout(1000); + connection.setRequestMethod("GET"); + + try (InputStream inputStream = connection.getInputStream()) { + return new String(inputStream.readAllBytes(), StandardCharsets.UTF_8); + } finally { + connection.disconnect(); + } + } + + private static String getenvOrDefault(String key, String defaultValue) { + String value = System.getenv(key); + if (value == null || value.isBlank()) { + value = System.getProperty(key); + } + if (value == null || value.isBlank()) { + return defaultValue; + } + return value; + } +} \ No newline at 
end of file From 8587767969d4de9470d72435da5e8baa0f579049 Mon Sep 17 00:00:00 2001 From: bagci Date: Wed, 10 Jun 2026 10:27:57 +0200 Subject: [PATCH 09/19] Add DUUI-based temporal expression detection components (duui-Time) Implement Dockerized DUUI components for selected temporal expression detection backends, including Microsoft Recognizers-Text, Duckling, Stanford SUTime, German GELECTRA, BERT Got-a-Date, TEI2GO, Timexy, and generic Hugging Face token-classification models. Each Docker image now builds a single model- and language-specific TimeX3 service with DUUI endpoints for type system, Lua communication, documentation, and processing. Add model metadata, runtime parameters, TimeX3/ISO-TimeML annotation mapping, per-model Docker build support, external Duckling/CoreNLP service configuration, Java test coverage, and usage documentation for DUUI integration. --- duui-TimeDetection/README.md | 0 duui-TimeDetection/Readme.md | 259 ++++++++++++++++++++++++----------- 2 files changed, 180 insertions(+), 79 deletions(-) delete mode 100644 duui-TimeDetection/README.md diff --git a/duui-TimeDetection/README.md b/duui-TimeDetection/README.md deleted file mode 100644 index e69de29b..00000000 diff --git a/duui-TimeDetection/Readme.md b/duui-TimeDetection/Readme.md index 3c078e07..60ce9543 100644 --- a/duui-TimeDetection/Readme.md +++ b/duui-TimeDetection/Readme.md @@ -1,124 +1,226 @@ -[![Version](https://img.shields.io/static/v1?label=duui-ner&message=0.1.0&color=blue)](https://docker.texttechnologylab.org/v2/duui-ner/tags/list) +[![Version](https://img.shields.io/static/v1?label=duui-time&message=0.1.0&color=blue)](https://docker.texttechnologylab.org/v2/duui-time/tags/list) [![Version](https://img.shields.io/static/v1?label=Python&message=3.12&color=green)]() -[![Version](https://img.shields.io/static/v1?label=Transformers&message=5.1.0&color=yellow)]() -[![Version](https://img.shields.io/static/v1?label=Torch&message=2.11.0&color=red)]() 
-[![Version](https://img.shields.io/static/v1?label=GLiNER&message=0.2.26&color=orange)]() -[![Version](https://img.shields.io/static/v1?label=GLiNER2&message=1.3.1&color=orange)]() +[![Version](https://img.shields.io/static/v1?label=FastAPI&message=0.115%2B&color=yellow)]() +[![Version](https://img.shields.io/static/v1?label=UIMA&message=TimeX3&color=red)]() -# Transformers NER +# DUUI Time Detection -DUUI implementation for selected transformer-based Named Entity Recognition (NER) models. The component is designed for use with the [Docker Unified UIMA Interface (DUUI)](https://github.com/texttechnologylab/DockerUnifiedUIMAInterface). +DUUI implementation for temporal expression detection and TimeX3 annotation. -The component supports one NER model per Docker image/container. Each image is built with a single `MODEL_NAME` and exposes the DUUI endpoints for type system, Lua communication layer, documentation, and processing. +The component detects temporal expressions in selected UIMA annotations or in the full document text and writes ISO-TimeML-compatible `TimeX3` annotations into the CAS. The implementation supports multiple backends. Each Docker image is built for exactly one model/backend and one language configuration. 
## Included Models -| Image suffix / `MODEL_SPECNAME` | `MODEL_NAME` | Model source | Model version | Languages | Backend | -| --- | --- | --- | --- | --- | --- | -| `gliner-multi-v2-1` | `gliner` | https://huggingface.co/urchade/gliner_multi-v2.1 | `443d26d654e0324125a96bebd8e796c14ff2efe6` | Multilingual | GLiNER | -| `gliner2-multi-v1` | `gliner2` | https://huggingface.co/fastino/gliner2-multi-v1 | `cc151f5b0ce4f7010c3ae8884527dd43dddf9d21` | Multilingual | GLiNER2 | -| `roberta-ner-multilingual` | `roberta-ner-multilingual` | https://huggingface.co/julian-schelb/roberta-ner-multilingual | `d0a19147f3bb0065c8091459e3d35405ce9d48da` | Multilingual | HuggingFace token-classification | -| `wikineural-multilingual-ner` | `wikineural-multilingual-ner` | https://huggingface.co/Babelscape/wikineural-multilingual-ner | `bed6ee7a45d2827b6c90a4fd7983f0241ae0a5c1` | Multilingual | HuggingFace token-classification | -| `xlm-r-ner-40-lang` | `xlm-r-ner-40-lang` | https://huggingface.co/nbroad/jplu-xlm-r-ner-40-lang | `7f7f0fe9bc946a9848611aff079f556387687216` | Multilingual / 40 languages | HuggingFace token-classification | +| Name | Backend | Model / Resource | Languages | Notes | +| ---- | ------- | ---------------- | --------- | ----- | +| `microsoft` | Microsoft Recognizers-Text | `recognizers-text-suite==1.0.2a2` | multilingual | Rule-based temporal recognition. | +| `duckling` | Duckling HTTP service | external Duckling server | multilingual | Requires a running Duckling container or service. | +| `sutime` | Stanford CoreNLP SUTime HTTP service | external CoreNLP/SUTime server | multilingual | Requires a running CoreNLP server. | +| `german-gelectra` | Hugging Face token classification | `satyaalmasian/temporal_tagger_German_GELECTRA` | DE | German temporal tagger. | +| `bert-got-a-date` | Hugging Face token classification | `satyaalmasian/temporal_tagger_BERT_tokenclassifier` | EN | English temporal tagger. 
| +| `hf-token-classification` | Hugging Face token classification | custom `MODEL_SPECNAME` | configurable | Generic Hugging Face token-classification backend. | +| `tei2go-de` | spaCy / TEI2GO | `de_tei2go` | DE | One image per language. | +| `tei2go-en` | spaCy / TEI2GO | `en_tei2go` | EN | One image per language. | +| `tei2go-es` | spaCy / TEI2GO | `es_tei2go` | ES | One image per language. | +| `tei2go-fr` | spaCy / TEI2GO | `fr_tei2go` | FR | One image per language. | +| `tei2go-it` | spaCy / TEI2GO | `it_tei2go` | IT | One image per language. | +| `tei2go-pt` | spaCy / TEI2GO | `pt_tei2go` | PT | One image per language. | +| `timexy-de` | spaCy / Timexy | `de_core_news_sm` | DE | One image per language. | +| `timexy-en` | spaCy / Timexy | `en_core_web_sm` | EN | One image per language. | +| `timexy-fr` | spaCy / Timexy | `fr_core_news_sm` | FR | One image per language. | + +## Build Images + +The build script creates one Docker image per model and language. + +Build one model: -## Annotation Types +```bash +./docker_build.sh microsoft de +``` -The component creates UIMA NER annotations from the model output. 
Standard NER labels are mapped to DKPro NER types where possible, for example: +Build all Timexy language variants: -| Label | UIMA type | -| --- | --- | -| `PER`, `person` | `de.tudarmstadt.ukp.dkpro.core.api.ner.type.Person` | -| `ORG`, `organization` | `de.tudarmstadt.ukp.dkpro.core.api.ner.type.Organization` | -| `LOC`, `location` | `de.tudarmstadt.ukp.dkpro.core.api.ner.type.Location` | -| `taxon`, `taxa` | `org.texttechnologylab.annotation.type.Taxon` | -| other labels | `de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity` | +```bash +./docker_build.sh timexy all +``` -The `taxon` label is mapped to the TTLab taxon type: +Build all default images: -```text -org.texttechnologylab.annotation.type.Taxon +```bash +./docker_build.sh all ``` -The delivered type system must include this type if taxon annotations should be created as `Taxon` instead of falling back to a generic `NamedEntity`. +Build a custom Hugging Face token-classification model: -## Requirements +```bash +./docker_build.sh hf-token-classification de satyaalmasian/temporal_tagger_German_GELECTRA +``` -The container uses Python 3.12 and the following core Python dependencies: +## Start Docker Container -| Package | Version | -| --- | --- | -| `gliner` | `0.2.26` | -| `gliner2[local]` | `1.3.1` | -| `transformers` | `5.1.0` | -| `torch` | `2.11.0` | -| `fastapi` | `0.110.0` | -| `dkpro-cassis` | `0.9.1` | -| `uvicorn[standard]` | `0.27.1` | -| `pydantic-settings` | `2.0.2` | +Run a DUUI Time Detection image locally: -See `requirements.txt` for the full dependency list. 
+```bash +docker run --rm -p 9714:9714 docker.texttechnologylab.org/duui-time-[modelname]-[lang]:latest +``` -# How To Use +Example: + +```bash +docker run --rm -p 9714:9714 docker.texttechnologylab.org/duui-time-microsoft-de:latest +``` -## Start Docker container +TEI2GO example: ```bash -docker run --rm -p 9714:9714 docker.texttechnologylab.org/duui-ner-[modelname]:latest +docker run --rm -p 9714:9714 docker.texttechnologylab.org/duui-time-tei2go-de:latest ``` -Example: +Timexy example: + +```bash +docker run --rm -p 9714:9714 docker.texttechnologylab.org/duui-time-timexy-de:latest +``` + +## External Services + +### Duckling + +Start Duckling: + +```bash +docker run --rm -p 8000:8000 rasa/duckling +``` + +Start the DUUI Time Duckling wrapper: + +```bash +docker run --rm -p 9714:9714 docker.texttechnologylab.org/duui-time-duckling-de:latest +``` + +In DUUI, pass the Duckling URL as runtime parameter: + +```java +.withParameter("duckling_url", "http://127.0.0.1:8000") +.withParameter("duckling_timezone", "Europe/Berlin") +``` + +If DUUI runs inside another Docker container, use the reachable host name, for example: + +```java +.withParameter("duckling_url", "http://host.docker.internal:8000") +``` + +### SUTime / CoreNLP + +Start CoreNLP: -```bash[duui_time.py](../../../Downloads/duui_time_tool/duui_time.py) -docker run --rm -p 9714:9714 docker.texttechnologylab.org/duui-ner-wikineural-multilingual-ner:latest +```bash +docker run --rm -p 9000:9000 --name corenlp nlpbox/corenlp +``` + +Start the DUUI Time SUTime wrapper: + +```bash +docker run --rm -p 9714:9714 docker.texttechnologylab.org/duui-time-sutime-de:latest +``` + +In DUUI, pass the CoreNLP URL as runtime parameter: + +```java +.withParameter("corenlp_url", "http://127.0.0.1:9000") +``` + +If DUUI runs inside another Docker container, use the reachable host name, for example: + +```java +.withParameter("corenlp_url", "http://host.docker.internal:9000") ``` ## Run within DUUI +For using DUUI Time Detection 
as a DUUI image it is necessary to use the [Docker Unified UIMA Interface (DUUI)](https://github.com/texttechnologylab/DockerUnifiedUIMAInterface). + +### Docker Driver + ```java composer.add( - new DUUIDockerDriver.Component("docker.texttechnologylab.org/duui-ner-[modelname]:latest") - .withParameter( - "selection", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" - ) + new DUUIDockerDriver.Component("docker.texttechnologylab.org/duui-time-microsoft-de:latest") + .withParameter("selection", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence") + .withParameter("document_creation_time", "2026-06-09") ); ``` -With optional runtime parameters: +### Remote Driver + +If the container or local Python service is already running on port `9714`: ```java composer.add( - new DUUIDockerDriver.Component("docker.texttechnologylab.org/duui-ner-[modelname]:latest") - .withParameter( - "selection", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" - ) - .withParameter("threshold", "0.5") - .withParameter("batch_size", "8") - .withParameter("labels", "person,organization,location,date,event,product,taxon,other") + new DUUIRemoteDriver.Component("http://127.0.0.1:9714") + .withParameter("selection", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence") + .withParameter("document_creation_time", "2026-06-09") ); ``` -### Parameters +### Duckling Remote Driver -| Name | Default | Description | -| --- | --- | --- | -| `selection` | required | Use `text` to process the full document text or any selectable UIMA type class name, e.g. `de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence`. | -| `threshold` | `0.5` | Confidence threshold for GLiNER/GLiNER2. HuggingFace token-classification models may ignore this value. | -| `batch_size` | `8` | Batch size used during prediction. | -| `labels` | `person,organization,location,date,event,product,taxon,other` | Candidate labels for GLiNER/GLiNER2. 
HuggingFace token-classification models use their trained label set. | +```java +composer.add( + new DUUIRemoteDriver.Component("http://127.0.0.1:9714") + .withParameter("selection", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence") + .withParameter("document_creation_time", "2026-06-09") + .withParameter("duckling_url", "http://127.0.0.1:8000") + .withParameter("duckling_timezone", "Europe/Berlin") +); +``` -## Runtime behavior +### SUTime Remote Driver -- Each Docker image/container uses exactly one model. -- `MODEL_NAME` selects the backend model alias used by the Python service. -- `MODEL_VERSION` is used as model metadata in the DUUI response. -- `MODEL_SOURCE` and `MODEL_LANG` are also returned as metadata. -- Runtime parameters such as `threshold`, `batch_size`, and `labels` are passed via DUUI `.withParameter(...)`. +```java +composer.add( + new DUUIRemoteDriver.Component("http://127.0.0.1:9714") + .withParameter("selection", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence") + .withParameter("document_creation_time", "2026-06-09") + .withParameter("corenlp_url", "http://127.0.0.1:9000") +); +``` + +## Parameters + +| Name | Description | +| ---- | ----------- | +| `selection` | Use `text` to process the full document text or any selectable UIMA type class name, for example `de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence`. | +| `document_creation_time` | Reference date for relative temporal expressions, for example `2026-06-09`. | +| `threshold` | Optional confidence threshold for Hugging Face token-classification models. | +| `batch_size` | Optional batch size for Hugging Face token-classification models. | +| `duckling_url` | Runtime URL of the Duckling HTTP service. Required for `MODEL_NAME=duckling`. | +| `duckling_timezone` | Runtime timezone for Duckling normalization, for example `Europe/Berlin`. | +| `corenlp_url` | Runtime URL of the CoreNLP/SUTime HTTP service. Required for `MODEL_NAME=sutime`. 
| + +## Local Development + +Start the service locally without Docker: + +```bash +export ANNOTATOR_NAME="duui-time" +export ANNOTATOR_VERSION="0.1.0" +export LOG_LEVEL="DEBUG" + +export MODEL_NAME="microsoft" +export MODEL_SPECNAME="recognizers-text-suite" +export MODEL_VERSION="1.0.2a2" +export MODEL_SOURCE="https://github.com/microsoft/Recognizers-Text" +export MODEL_LANG="de" +export MODEL_CACHE_SIZE="1" + +uvicorn duui_time:app --host 0.0.0.0 --port 9714 --workers 1 +``` -# Cite +## Cite -If you use this DUUI image, please cite DUUI as follows: +If you use this DUUI image, please cite it as follows: Alexander Leonhardt, Giuseppe Abrami, Daniel Baumartz and Alexander Mehler. (2023). "Unlocking the Heterogeneous Landscape of Big Data NLP with DUUI." Findings of the Association for Computational Linguistics: EMNLP 2023, 385–399. [[LINK](https://aclanthology.org/2023.findings-emnlp.29)] [[PDF](https://aclanthology.org/2023.findings-emnlp.29.pdf)] @@ -134,13 +236,12 @@ Alexander Leonhardt, Giuseppe Abrami, Daniel Baumartz and Alexander Mehler. 
(202 address = {Singapore}, publisher = {Association for Computational Linguistics}, url = {https://aclanthology.org/2023.findings-emnlp.29}, - pages = {385--399}, - pdf = {https://aclanthology.org/2023.findings-emnlp.29.pdf} + pages = {385--399} } @misc{Bagci:2026, author = {Bagci, Mevlüt}, - title = {Transformer-based Named Entity Recognition models as {DUUI} component}, + title = {Temporal expression detection models as {DUUI} component}, year = {2026}, howpublished = {https://github.com/texttechnologylab/duui-uima} } From 490927f99147ebfad78fdac8e535c0098183b7b4 Mon Sep 17 00:00:00 2001 From: bagci Date: Wed, 10 Jun 2026 13:00:37 +0200 Subject: [PATCH 10/19] Update gliner with batching --- duui-NER/docker_build.sh | 20 ++++----- duui-NER/src/main/docker/Dockerfile | 4 +- .../main/python/ner_classification_backend.py | 43 +++++++++++-------- .../textimager/uima/NER/NERTest.java | 2 +- 4 files changed, 37 insertions(+), 32 deletions(-) diff --git a/duui-NER/docker_build.sh b/duui-NER/docker_build.sh index 7b5d3ecb..6d5762b5 100644 --- a/duui-NER/docker_build.sh +++ b/duui-NER/docker_build.sh @@ -19,11 +19,11 @@ export BATCH_SIZE=8 # GLiNER # Passend dazu im Dockerfile aktivieren: # RUN python -c "from gliner import GLiNER; GLiNER.from_pretrained(model_id='urchade/gliner_multi-v2.1', map_location='cpu')" -#export MODEL_NAME="gliner" -#export MODEL_SPECNAME="gliner-multi-v2-1" -#export MODEL_VERSION="443d26d654e0324125a96bebd8e796c14ff2efe6" -#export MODEL_SOURCE="https://huggingface.co/urchade/gliner_multi-v2.1" -#export MODEL_LANG="Multi" +export MODEL_NAME="gliner" +export MODEL_SPECNAME="gliner-multi-v2-1" +export MODEL_VERSION="443d26d654e0324125a96bebd8e796c14ff2efe6" +export MODEL_SOURCE="https://huggingface.co/urchade/gliner_multi-v2.1" +export MODEL_LANG="Multi" ###--------------------------------------------------------------------- ###--------------------------------------------------------------------- @@ -63,11 +63,11 @@ export BATCH_SIZE=8 # XLM-R NER 
40 languages # Passend dazu im Dockerfile aktivieren: # RUN python -c "from transformers import pipeline; pipeline('token-classification', model='nbroad/jplu-xlm-r-ner-40-lang', aggregation_strategy='simple')" -export MODEL_NAME="xlm-r-ner-40-lang" -export MODEL_SPECNAME="xlm-r-ner-40-lang" -export MODEL_VERSION="7f7f0fe9bc946a9848611aff079f556387687216" -export MODEL_SOURCE="https://huggingface.co/nbroad/jplu-xlm-r-ner-40-lang" -export MODEL_LANG="Multi" +#export MODEL_NAME="xlm-r-ner-40-lang" +#export MODEL_SPECNAME="xlm-r-ner-40-lang" +#export MODEL_VERSION="7f7f0fe9bc946a9848611aff079f556387687216" +#export MODEL_SOURCE="https://huggingface.co/nbroad/jplu-xlm-r-ner-40-lang" +#export MODEL_LANG="Multi" ###--------------------------------------------------------------------- diff --git a/duui-NER/src/main/docker/Dockerfile b/duui-NER/src/main/docker/Dockerfile index 47aa2c08..a0dcefe6 100644 --- a/duui-NER/src/main/docker/Dockerfile +++ b/duui-NER/src/main/docker/Dockerfile @@ -9,7 +9,7 @@ COPY ./requirements.txt ./requirements.txt RUN pip install --no-cache-dir -r requirements.txt # MODEL_NAME=gliner -#RUN python -c "from gliner import GLiNER; GLiNER.from_pretrained(model_id='urchade/gliner_multi-v2.1', map_location='cpu')" +RUN python -c "from gliner import GLiNER; GLiNER.from_pretrained(model_id='urchade/gliner_multi-v2.1', map_location='cpu')" # MODEL_NAME=gliner2 #RUN python -c "from gliner2 import GLiNER2; GLiNER2.from_pretrained('fastino/gliner2-multi-v1')" # MODEL_NAME=roberta-ner-multilingual @@ -17,7 +17,7 @@ RUN pip install --no-cache-dir -r requirements.txt # MODEL_NAME=wikineural-multilingual-ner #RUN python -c "from transformers import pipeline; pipeline('token-classification', model='Babelscape/wikineural-multilingual-ner', aggregation_strategy='simple')" # MODEL_NAME=xlm-r-ner-40-lang -RUN python -c "from transformers import pipeline; pipeline('token-classification', model='nbroad/jplu-xlm-r-ner-40-lang', aggregation_strategy='simple')" +#RUN 
python -c "from transformers import pipeline; pipeline('token-classification', model='nbroad/jplu-xlm-r-ner-40-lang', aggregation_strategy='simple')" # copy DUUI NER scripts COPY ./src/main/python/TypeSystemNER.xml ./TypeSystemNER.xml diff --git a/duui-NER/src/main/python/ner_classification_backend.py b/duui-NER/src/main/python/ner_classification_backend.py index 9a36373b..26007477 100644 --- a/duui-NER/src/main/python/ner_classification_backend.py +++ b/duui-NER/src/main/python/ner_classification_backend.py @@ -227,26 +227,31 @@ def predict( threshold: float = 0.5, batch_size: int = 8, ) -> List[List[Dict[str, Any]]]: - results: List[List[Dict[str, Any]]] = [] with torch.no_grad(): - for sentence in text: - try: - output = self.model.predict_entities( - sentence, - labels, - threshold=threshold, - multi_label=True, - return_class_probs=True, - ) - except TypeError: - output = self.model.predict_entities( - sentence, - labels, - multi_label=True, - return_class_probs=True, - ) - results.append(normalize_entity_output(sentence, output, self.model_name, self.model_id)) - return results + try: + outputs = self.model.inference( + text, + labels, + threshold=threshold, + multi_label=True, + return_class_probs=True, + batch_size=batch_size, + ) + except TypeError: + outputs = self.model.inference( + text, + labels, + multi_label=True, + return_class_probs=True, + batch_size=batch_size, + ) + + sentence_outputs = _normalize_sentence_outputs(outputs, len(text)) + + return [ + normalize_entity_output(sentence, entities, self.model_name, self.model_id) + for sentence, entities in zip(text, sentence_outputs) + ] class NERClassificationGLiNER2: diff --git a/duui-NER/src/test/java/org/hucompute/textimager/uima/NER/NERTest.java b/duui-NER/src/test/java/org/hucompute/textimager/uima/NER/NERTest.java index fea607d6..14bc10ed 100644 --- a/duui-NER/src/test/java/org/hucompute/textimager/uima/NER/NERTest.java +++ b/duui-NER/src/test/java/org/hucompute/textimager/uima/NER/NERTest.java 
@@ -27,7 +27,7 @@ public class NERTest { static DUUIComposer composer; static JCas cas; - static String url = "http://127.0.0.1:8000"; + static String url = "http://127.0.0.1:9714"; @BeforeAll static void beforeAll() throws URISyntaxException, IOException, UIMAException, SAXException, CompressorException { From 2c17683e310d7a98a9b0c6aed7c96967cf806266 Mon Sep 17 00:00:00 2001 From: bagci Date: Thu, 11 Jun 2026 12:15:34 +0200 Subject: [PATCH 11/19] Update Huggingface Model with DUUI Parallelization --- duui-NER/docker_build.sh | 20 ++--- duui-NER/requirements.txt | 2 + duui-NER/src/main/docker/Dockerfile | 11 ++- duui-NER/src/main/python/duui_ner.py | 80 ++++++++++++++++++- .../main/python/ner_classification_backend.py | 15 +++- .../textimager/uima/NER/NERTest.java | 5 ++ 6 files changed, 115 insertions(+), 18 deletions(-) diff --git a/duui-NER/docker_build.sh b/duui-NER/docker_build.sh index 6d5762b5..e23bdc48 100644 --- a/duui-NER/docker_build.sh +++ b/duui-NER/docker_build.sh @@ -19,11 +19,11 @@ export BATCH_SIZE=8 # GLiNER # Passend dazu im Dockerfile aktivieren: # RUN python -c "from gliner import GLiNER; GLiNER.from_pretrained(model_id='urchade/gliner_multi-v2.1', map_location='cpu')" -export MODEL_NAME="gliner" -export MODEL_SPECNAME="gliner-multi-v2-1" -export MODEL_VERSION="443d26d654e0324125a96bebd8e796c14ff2efe6" -export MODEL_SOURCE="https://huggingface.co/urchade/gliner_multi-v2.1" -export MODEL_LANG="Multi" +#export MODEL_NAME="gliner" +#export MODEL_SPECNAME="gliner-multi-v2-1" +#export MODEL_VERSION="443d26d654e0324125a96bebd8e796c14ff2efe6" +#export MODEL_SOURCE="https://huggingface.co/urchade/gliner_multi-v2.1" +#export MODEL_LANG="Multi" ###--------------------------------------------------------------------- ###--------------------------------------------------------------------- @@ -41,11 +41,11 @@ export MODEL_LANG="Multi" # RoBERTa multilingual NER # Passend dazu im Dockerfile aktivieren: # RUN python -c "from transformers import pipeline; 
pipeline('token-classification', model='julian-schelb/roberta-ner-multilingual', aggregation_strategy='simple')" -#export MODEL_NAME="roberta-ner-multilingual" -#export MODEL_SPECNAME="roberta-ner-multilingual" -#export MODEL_VERSION="d0a19147f3bb0065c8091459e3d35405ce9d48da" -#export MODEL_SOURCE="https://huggingface.co/julian-schelb/roberta-ner-multilingual" -#export MODEL_LANG="Multi" +export MODEL_NAME="roberta-ner-multilingual" +export MODEL_SPECNAME="roberta-ner-multilingual" +export MODEL_VERSION="d0a19147f3bb0065c8091459e3d35405ce9d48da" +export MODEL_SOURCE="https://huggingface.co/julian-schelb/roberta-ner-multilingual" +export MODEL_LANG="Multi" ###--------------------------------------------------------------------- ###--------------------------------------------------------------------- diff --git a/duui-NER/requirements.txt b/duui-NER/requirements.txt index dc3f5cb2..09f819af 100644 --- a/duui-NER/requirements.txt +++ b/duui-NER/requirements.txt @@ -4,6 +4,8 @@ transformers==5.1.0 torch==2.11.0 torchvision==0.26.0 torchaudio==2.11.0 +httpx==0.28.1 +httpcore==1.0.9 nltk==3.9.4 termcolor==3.3.0 six==1.17.0 diff --git a/duui-NER/src/main/docker/Dockerfile b/duui-NER/src/main/docker/Dockerfile index a0dcefe6..57cb2974 100644 --- a/duui-NER/src/main/docker/Dockerfile +++ b/duui-NER/src/main/docker/Dockerfile @@ -6,14 +6,19 @@ EXPOSE 9714 # dependencies COPY ./requirements.txt ./requirements.txt -RUN pip install --no-cache-dir -r requirements.txt +RUN python -m pip install --upgrade pip setuptools wheel && \ + python -m pip install \ + --timeout 300 \ + --retries 30 \ + --progress-bar off \ + -r requirements.txt # MODEL_NAME=gliner -RUN python -c "from gliner import GLiNER; GLiNER.from_pretrained(model_id='urchade/gliner_multi-v2.1', map_location='cpu')" +#RUN python -c "from gliner import GLiNER; GLiNER.from_pretrained(model_id='urchade/gliner_multi-v2.1', map_location='cpu')" # MODEL_NAME=gliner2 #RUN python -c "from gliner2 import GLiNER2; 
GLiNER2.from_pretrained('fastino/gliner2-multi-v1')" # MODEL_NAME=roberta-ner-multilingual -#RUN python -c "from transformers import pipeline; pipeline('token-classification', model='julian-schelb/roberta-ner-multilingual', aggregation_strategy='simple')" +RUN python -c "from transformers import pipeline; pipeline('token-classification', model='julian-schelb/roberta-ner-multilingual', aggregation_strategy='simple')" # MODEL_NAME=wikineural-multilingual-ner #RUN python -c "from transformers import pipeline; pipeline('token-classification', model='Babelscape/wikineural-multilingual-ner', aggregation_strategy='simple')" # MODEL_NAME=xlm-r-ner-40-lang diff --git a/duui-NER/src/main/python/duui_ner.py b/duui-NER/src/main/python/duui_ner.py index 2bad4fa0..ba48dfec 100644 --- a/duui-NER/src/main/python/duui_ner.py +++ b/duui-NER/src/main/python/duui_ner.py @@ -1,5 +1,6 @@ from __future__ import annotations +import os import logging from functools import lru_cache from threading import Lock @@ -20,7 +21,8 @@ model_lock = Lock() - +os.environ["CUDA_LAUNCH_BLOCKING"]="1" +os.environ["TORCH_USE_CUDA_DSA"]="1" class UimaSentence(BaseModel): text: str @@ -54,6 +56,13 @@ class Settings(BaseSettings): threshold: float = 0.5 batch_size: int = 8 + # Runtime tokenizer/thread defaults. These can also be passed per /v1/process request. + tokenizers_parallelism: str = "false" + rayon_num_threads: int = 1 + omp_num_threads: int = 1 + mkl_num_threads: int = 1 + use_fast_tokenizer: bool = False + typesystem_filename: str = "TypeSystemNER.xml" lua_communication_script_filename: str = "duui_ner.lua" @@ -82,6 +91,14 @@ class DUUIRequest(BaseModel): batch_size: Optional[int] = None labels: Optional[Union[str, List[str]]] = None + # Runtime tokenizer/thread parameters passed through DUUI .withParameter(...). + # They are applied at the beginning of every /v1/process call. 
+ tokenizers_parallelism: Optional[Union[bool, str]] = None + rayon_num_threads: Optional[Union[int, str]] = None + omp_num_threads: Optional[Union[int, str]] = None + mkl_num_threads: Optional[Union[int, str]] = None + use_fast_tokenizer: Optional[Union[bool, str]] = None + class DocumentModification(BaseModel): user: str @@ -293,6 +310,62 @@ def get_ner_labels() -> List[str]: return parse_ner_labels(None) +_LAST_RUNTIME_ENV: Dict[str, str] = {} + + +def parse_bool_like(value: Optional[Union[bool, str]], default: bool) -> bool: + if value is None: + return default + if isinstance(value, bool): + return value + text = str(value).strip().lower() + if text in {"1", "true", "yes", "y", "on"}: + return True + if text in {"0", "false", "no", "n", "off"}: + return False + return default + + +def parse_positive_int_like(value: Optional[Union[int, str]], default: int) -> int: + if value is None: + return max(1, int(default)) + try: + return max(1, int(str(value).strip())) + except Exception: + return max(1, int(default)) + + +def apply_runtime_env_from_request(request: DUUIRequest) -> bool: + """Apply tokenizer/thread runtime settings for every process call. + + The values are intentionally written into os.environ on each request so DUUI + can pass them via .withParameter(...). If values change after a model has + been cached, the model cache is cleared so the backend can pick up the new + USE_FAST_TOKENIZER value on the next load. 
+ """ + values = { + "TOKENIZERS_PARALLELISM": "true" if parse_bool_like( + request.tokenizers_parallelism, + str(settings.tokenizers_parallelism).strip().lower() in {"1", "true", "yes", "y", "on"}, + ) else "false", + "RAYON_NUM_THREADS": str(parse_positive_int_like(request.rayon_num_threads, settings.rayon_num_threads)), + "OMP_NUM_THREADS": str(parse_positive_int_like(request.omp_num_threads, settings.omp_num_threads)), + "MKL_NUM_THREADS": str(parse_positive_int_like(request.mkl_num_threads, settings.mkl_num_threads)), + "USE_FAST_TOKENIZER": "true" if parse_bool_like(request.use_fast_tokenizer, settings.use_fast_tokenizer) else "false", + } + + changed = False + for key, value in values.items(): + if os.environ.get(key) != value: + os.environ[key] = value + changed = True + + global _LAST_RUNTIME_ENV + cache_relevant_changed = _LAST_RUNTIME_ENV.get("USE_FAST_TOKENIZER") != values["USE_FAST_TOKENIZER"] + _LAST_RUNTIME_ENV = values + return cache_relevant_changed + + def get_selected_model_name(model_name: str) -> str: """Return exactly one configured model for this container. 
@@ -439,6 +512,11 @@ def model_meta_values(model_name: str) -> Tuple[str, str]: @app.post("/v1/process", response_model=DUUIResponse) def post_process(request: DUUIRequest): + runtime_env_changed = apply_runtime_env_from_request(request) + if runtime_env_changed: + with model_lock: + load_model.cache_clear() + if not request.selections: return JSONResponse(status_code=400, content={"message": "The request must contain sentence selections."}) diff --git a/duui-NER/src/main/python/ner_classification_backend.py b/duui-NER/src/main/python/ner_classification_backend.py index 26007477..0cbc4343 100644 --- a/duui-NER/src/main/python/ner_classification_backend.py +++ b/duui-NER/src/main/python/ner_classification_backend.py @@ -1,7 +1,7 @@ from __future__ import annotations from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple - +import os import torch @@ -302,15 +302,23 @@ def __init__(self, model_name: str, device: str = "cpu"): self.model_id = model_name self.device = device + use_fast_tokenizer = str( + os.environ.get("USE_FAST_TOKENIZER", "true") + ).strip().lower() in {"1", "true", "yes", "y", "on"} + self.model = AutoModelForTokenClassification.from_pretrained(model_name) + try: self.tokenizer = AutoTokenizer.from_pretrained( model_name, - use_fast=True, + use_fast=use_fast_tokenizer, add_prefix_space=True, ) except TypeError: - self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True) + self.tokenizer = AutoTokenizer.from_pretrained( + model_name, + use_fast=use_fast_tokenizer, + ) self.model.to(device) self.model.eval() @@ -340,7 +348,6 @@ def predict( for sentence, sentence_output in zip(text, sentence_outputs) ] - def create_ner_classifier( model_name: str, device: str = "cpu", diff --git a/duui-NER/src/test/java/org/hucompute/textimager/uima/NER/NERTest.java b/duui-NER/src/test/java/org/hucompute/textimager/uima/NER/NERTest.java index 14bc10ed..77345369 100644 --- 
a/duui-NER/src/test/java/org/hucompute/textimager/uima/NER/NERTest.java +++ b/duui-NER/src/test/java/org/hucompute/textimager/uima/NER/NERTest.java @@ -96,6 +96,11 @@ public void DeBioFidNERTest() throws Exception { .withParameter("threshold", "0.5") .withParameter("batch_size", "8") .withParameter("labels", "person,organization,location,date,event,product,taxon,other") + .withParameter("tokenizers_parallelism", "false") + .withParameter("rayon_num_threads", "1") + .withParameter("omp_num_threads", "1") + .withParameter("mkl_num_threads", "1") + .withParameter("use_fast_tokenizer", "false") ); List textes = Arrays.asList( From 66d586596853e42301b67541c4123363f961a9ec Mon Sep 17 00:00:00 2001 From: bagci Date: Fri, 12 Jun 2026 18:04:12 +0200 Subject: [PATCH 12/19] Update Time --- duui-TimeDetection/src/main/python/duui_time.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/duui-TimeDetection/src/main/python/duui_time.py b/duui-TimeDetection/src/main/python/duui_time.py index 0c0441db..915370d2 100644 --- a/duui-TimeDetection/src/main/python/duui_time.py +++ b/duui-TimeDetection/src/main/python/duui_time.py @@ -11,6 +11,10 @@ from fastapi import FastAPI, Request, Response from fastapi.responses import JSONResponse, PlainTextResponse from pydantic import BaseModel +import os + +os.environ["CUDA_LAUNCH_BLOCKING"]="1" +os.environ["TORCH_USE_CUDA_DSA"]="1" try: from pydantic_settings import BaseSettings From 2c3f85db8f4bc8cdfa59e877d51ccec96b1018cb Mon Sep 17 00:00:00 2001 From: Mevluet Bagci Date: Tue, 16 Jun 2026 15:03:13 +0200 Subject: [PATCH 13/19] feat(duui-geonames-fst): add serialize/deserialize support and proxy setup - switch Lua communication layer to serialize/deserialize mode - add proxy for DUUI RemoteDriver requests without application/json header - serve communication_layer.lua through /v1/communication_layer - add build script for all GeoNames FST image variants - update Dockerfiles and entrypoint for proxy-based startup - revise README following 
the duui-FactChecking structure --- duui-geonames-fst/.idea/.gitignore | 10 + duui-geonames-fst/.idea/compiler.xml | 13 + duui-geonames-fst/.idea/jarRepositories.xml | 25 + duui-geonames-fst/.idea/misc.xml | 12 + duui-geonames-fst/.idea/vcs.xml | 6 + duui-geonames-fst/README.md | 154 + duui-geonames-fst/docker_build.sh | 57 + .../src/main/docker/eu.Dockerfile | 22 +- .../src/main/docker/europe-central.Dockerfile | 22 +- .../src/main/docker/europe.Dockerfile | 22 +- .../src/main/docker/single.Dockerfile | 19 +- .../main/resources/communication_layer.lua | 238 +- .../src/main/resources/duui_geonames_proxy.py | 191 + .../src/main/resources/entrypoint.sh | 56 + .../src/main/resources/requirements.txt | 3 + .../textimager/uima/rust/TestGeoNamesFst.java | 6 +- .../target/classes/communication_layer.lua | 201 + .../target/classes/duui_geonames_proxy.py | 191 + .../target/classes/entrypoint.sh | 56 + .../target/classes/requirements.txt | 3 + .../target/classes/typesystem.xml | 4137 +++++++++++++++++ .../uima/rust/TestGeoNamesFst.class | Bin 0 -> 7705 bytes 22 files changed, 5309 insertions(+), 135 deletions(-) create mode 100644 duui-geonames-fst/.idea/.gitignore create mode 100644 duui-geonames-fst/.idea/compiler.xml create mode 100644 duui-geonames-fst/.idea/jarRepositories.xml create mode 100644 duui-geonames-fst/.idea/misc.xml create mode 100644 duui-geonames-fst/.idea/vcs.xml create mode 100644 duui-geonames-fst/README.md create mode 100644 duui-geonames-fst/docker_build.sh create mode 100644 duui-geonames-fst/src/main/resources/duui_geonames_proxy.py create mode 100755 duui-geonames-fst/src/main/resources/entrypoint.sh create mode 100644 duui-geonames-fst/src/main/resources/requirements.txt create mode 100644 duui-geonames-fst/target/classes/communication_layer.lua create mode 100644 duui-geonames-fst/target/classes/duui_geonames_proxy.py create mode 100755 duui-geonames-fst/target/classes/entrypoint.sh create mode 100644 
duui-geonames-fst/target/classes/requirements.txt create mode 100644 duui-geonames-fst/target/classes/typesystem.xml create mode 100644 duui-geonames-fst/target/test-classes/org/texttechnologylab/textimager/uima/rust/TestGeoNamesFst.class diff --git a/duui-geonames-fst/.idea/.gitignore b/duui-geonames-fst/.idea/.gitignore new file mode 100644 index 00000000..7bc07ec2 --- /dev/null +++ b/duui-geonames-fst/.idea/.gitignore @@ -0,0 +1,10 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Editor-based HTTP Client requests +/httpRequests/ +# Environment-dependent path to Maven home directory +/mavenHomeManager.xml +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/duui-geonames-fst/.idea/compiler.xml b/duui-geonames-fst/.idea/compiler.xml new file mode 100644 index 00000000..295ce4a6 --- /dev/null +++ b/duui-geonames-fst/.idea/compiler.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/duui-geonames-fst/.idea/jarRepositories.xml b/duui-geonames-fst/.idea/jarRepositories.xml new file mode 100644 index 00000000..947ef884 --- /dev/null +++ b/duui-geonames-fst/.idea/jarRepositories.xml @@ -0,0 +1,25 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/duui-geonames-fst/.idea/misc.xml b/duui-geonames-fst/.idea/misc.xml new file mode 100644 index 00000000..9dc782bb --- /dev/null +++ b/duui-geonames-fst/.idea/misc.xml @@ -0,0 +1,12 @@ + + + + + + + + \ No newline at end of file diff --git a/duui-geonames-fst/.idea/vcs.xml b/duui-geonames-fst/.idea/vcs.xml new file mode 100644 index 00000000..6c0b8635 --- /dev/null +++ b/duui-geonames-fst/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/duui-geonames-fst/README.md b/duui-geonames-fst/README.md new file mode 100644 index 00000000..f135efd6 --- /dev/null +++ b/duui-geonames-fst/README.md @@ -0,0 +1,154 @@ 
+[![Version](https://img.shields.io/static/v1?label=duui-geonames-fst-de&message=0.3.3&color=blue)](https://docker.texttechnologylab.org/v2/duui-geonames-fst/de/tags/list) +[![Version](https://img.shields.io/static/v1?label=duui-geonames-fst-eu&message=0.3.3&color=blue)](https://docker.texttechnologylab.org/v2/duui-geonames-fst/eu/tags/list) +[![Version](https://img.shields.io/static/v1?label=duui-geonames-fst-europe&message=0.3.3&color=blue)](https://docker.texttechnologylab.org/v2/duui-geonames-fst/europe/tags/list) +[![Version](https://img.shields.io/static/v1?label=duui-geonames-fst-europe-central&message=0.3.3&color=blue)](https://docker.texttechnologylab.org/v2/duui-geonames-fst/europe-central/tags/list) +[![Version](https://img.shields.io/static/v1?label=DUUI&message=compatible&color=green)](https://github.com/texttechnologylab/DockerUnifiedUIMAInterface) + +# GeoNames FST DUUI + +DUUI implementation for GeoNames-based location linking using finite-state transducers. The component reads location annotations from a configured DUUI source view and writes resolved `org.texttechnologylab.annotation.geonames.GeoNamesEntity` annotations to the configured DUUI target view. 
+ +## Included Images + +| Name | Docker image | Description | +|--------------------------------------------------------------|--------------------------------------------------------------|----------------------------------------| +| de | `docker.texttechnologylab.org/duui-geonames-fst/de:latest` | GeoNames lookup for Germany | +| eu | `docker.texttechnologylab.org/duui-geonames-fst/eu:latest` | GeoNames lookup for the European Union | +| europe | `docker.texttechnologylab.org/duui-geonames-fst/europe:latest` | GeoNames lookup for Europe | +| europe-central | `docker.texttechnologylab.org/duui-geonames-fst/europe-central:latest` | GeoNames lookup for Central Europe | +| ------------------------------------------------------------ |--------------------------------------------------------------|----------------------------------------| + +# How To Use + +To use `duui-geonames-fst` as a DUUI image, you need the [Docker Unified UIMA Interface (DUUI)](https://github.com/texttechnologylab/DockerUnifiedUIMAInterface). 
+ +## Start Docker container + +```bash +docker run --rm -p 1000:9714 docker.texttechnologylab.org/duui-geonames-fst/europe:latest +``` + +Find all available image tags here: + +- https://docker.texttechnologylab.org/v2/duui-geonames-fst/de/tags/list +- https://docker.texttechnologylab.org/v2/duui-geonames-fst/eu/tags/list +- https://docker.texttechnologylab.org/v2/duui-geonames-fst/europe/tags/list +- https://docker.texttechnologylab.org/v2/duui-geonames-fst/europe-central/tags/list + +## Run within DUUI + +```java +composer.add( + new DUUIDockerDriver.Component( + "docker.texttechnologylab.org/duui-geonames-fst/europe:latest") + .withScale(iWorkers) + .withParameter("timeout", "5000") + .withParameter("mode", "levenshtein") + .withParameter("max_dist", "2") + .withParameter("min_length", "5") + .withParameter("result_selection", "first") + .withSourceView("roberta-ner-multilingual") + .withTargetView("geonames-roberta-ner-multilingual") + .withImageFetching() + .build() +); +``` + +## Parameters + +| Parameter | Default | Description | +|--------------------------------------------------------------|------------------------------------------|----------------------------------------| +| `mode` | `find` | Lookup mode. Supported modes include `find` and `levenshtein`. | +| `max_dist` | - | Maximum edit distance for `levenshtein` lookup. | +| `state_limit` | - | Optional state limit for `levenshtein` lookup. | +| `min_length` | - | Minimum query length. | +| `result_selection` | `first` | Result selection strategy. | +| `timeout` | - | Request timeout in milliseconds. | +| `annotation_type` | `de.tudarmstadt.ukp.dkpro.core.api.ner.type.Location` | Source annotation type read from the configured source view. 
| +| ------------------------------------------------------------ |------------------------------------------|----------------------------------------| + +## Input and Output + +The component expects location annotations in the configured DUUI source view: + +```java +.withSourceView("roberta-ner-multilingual") +``` + +Resolved GeoNames annotations are written to the configured DUUI target view: + +```java +.withTargetView("geonames-roberta-ner-multilingual") +``` + +The default input annotation type is: + +```text +de.tudarmstadt.ukp.dkpro.core.api.ner.type.Location +``` + +The output annotation type is: + +```text +org.texttechnologylab.annotation.geonames.GeoNamesEntity +``` + +## Build Docker images + +Build all variants locally: + +```bash +PUSH=false bash docker_build.sh 0.3.3 +``` + +Build and push all variants: + +```bash +docker login docker.texttechnologylab.org +PUSH=true bash docker_build.sh 0.3.3 +``` + +# Cite + +If you use this DUUI image, please cite it as follows: + +Alexander Leonhardt, Giuseppe Abrami, Daniel Baumartz and Alexander Mehler. (2023). "Unlocking the Heterogeneous Landscape of Big Data NLP with DUUI." Findings of the Association for Computational Linguistics: EMNLP 2023, 385–399. 
+[[LINK](https://aclanthology.org/2023.findings-emnlp.29)] [[PDF](https://aclanthology.org/2023.findings-emnlp.29.pdf)] + +## BibTeX + +```bibtex +@inproceedings{Leonhardt:et:al:2023, + title = {Unlocking the Heterogeneous Landscape of Big Data {NLP} with {DUUI}}, + author = {Leonhardt, Alexander and Abrami, Giuseppe and Baumartz, Daniel and Mehler, Alexander}, + editor = {Bouamor, Houda and Pino, Juan and Bali, Kalika}, + booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2023}, + year = {2023}, + address = {Singapore}, + publisher = {Association for Computational Linguistics}, + url = {https://aclanthology.org/2023.findings-emnlp.29}, + pages = {385--399}, + pdf = {https://aclanthology.org/2023.findings-emnlp.29.pdf}, + abstract = {Automatic analysis of large corpora is a complex task, especially + in terms of time efficiency. This complexity is increased by the + fact that flexible, extensible text analysis requires the continuous + integration of ever new tools. Since there are no adequate frameworks + for these purposes in the field of NLP, and especially in the + context of UIMA, that are not outdated or unusable for security + reasons, we present a new approach to address the latter task: + Docker Unified UIMA Interface (DUUI), a scalable, flexible, + lightweight, and feature-rich framework for automatic distributed + analysis of text corpora that leverages Big Data experience and + virtualization with Docker. 
We evaluate DUUI{'}s communication + approach against a state-of-the-art approach and demonstrate its + outstanding behavior in terms of time efficiency, enabling the + analysis of big text data.} +} + +@misc{Bagci:2026, + author = {Bagci, Mevlüt}, + title = {GeoNames FST as {DUUI} component}, + year = {2026}, + howpublished = {https://github.com/texttechnologylab/duui-uima/tree/main/duui-geonames-fst} +} +``` \ No newline at end of file diff --git a/duui-geonames-fst/docker_build.sh b/duui-geonames-fst/docker_build.sh new file mode 100644 index 00000000..f4d4b993 --- /dev/null +++ b/duui-geonames-fst/docker_build.sh @@ -0,0 +1,57 @@ +#!/usr/bin/env bash +set -euo pipefail + +VERSION="${1:-0.3.4}" +REGISTRY="docker.texttechnologylab.org" +IMAGE_NAME="duui-geonames-fst" +PUSH="${PUSH:-true}" + +build_image() { + local variant="$1" + local dockerfile="$2" + local country_arg="${3:-}" + + echo "============================================================" + echo "Building ${IMAGE_NAME}/${variant}:${VERSION}" + echo "Dockerfile: ${dockerfile}" + echo "============================================================" + + if [[ -n "${country_arg}" ]]; then + docker build \ + -f "${dockerfile}" \ + -t "${IMAGE_NAME}/${variant}:${VERSION}" \ + --build-arg COUNTRY="${country_arg}" \ + . + else + docker build \ + -f "${dockerfile}" \ + -t "${IMAGE_NAME}/${variant}:${VERSION}" \ + . 
+ fi + + docker tag \ + "${IMAGE_NAME}/${variant}:${VERSION}" \ + "${IMAGE_NAME}/${variant}:latest" + + docker tag \ + "${IMAGE_NAME}/${variant}:${VERSION}" \ + "${REGISTRY}/${IMAGE_NAME}/${variant}:${VERSION}" + + docker tag \ + "${IMAGE_NAME}/${variant}:${VERSION}" \ + "${REGISTRY}/${IMAGE_NAME}/${variant}:latest" + + if [[ "${PUSH}" == "true" ]]; then + docker push "${REGISTRY}/${IMAGE_NAME}/${variant}:${VERSION}" + docker push "${REGISTRY}/${IMAGE_NAME}/${variant}:latest" + fi +} + +build_image "de" "src/main/docker/single.Dockerfile" "DE" +build_image "eu" "src/main/docker/eu.Dockerfile" +build_image "europe" "src/main/docker/europe.Dockerfile" +build_image "europe-central" "src/main/docker/europe-central.Dockerfile" + +echo "============================================================" +echo "Done. Built version: ${VERSION}. Push enabled: ${PUSH}" +echo "============================================================" diff --git a/duui-geonames-fst/src/main/docker/eu.Dockerfile b/duui-geonames-fst/src/main/docker/eu.Dockerfile index dbad91d0..21f19b4f 100644 --- a/duui-geonames-fst/src/main/docker/eu.Dockerfile +++ b/duui-geonames-fst/src/main/docker/eu.Dockerfile @@ -1,13 +1,12 @@ ARG GEONAMES_FST_VERSION=0.3.1 -FROM docker.texttechnologylab.org/duui-geonames-fst/base:${GEONAMES_FST_VERSION} AS builder +FROM docker.texttechnologylab.org/duui-geonames-fst/base:${GEONAMES_FST_VERSION} AS builder WORKDIR /build/ RUN cargo build --release --no-default-features --features duui RUN chmod +x /build/target/release/geonames-fst FROM alpine:latest AS data RUN apk --update add unzip && rm -rf /var/cache/apk/* - ADD https://download.geonames.org/export/dump/AT.zip \ https://download.geonames.org/export/dump/BE.zip \ https://download.geonames.org/export/dump/BG.zip \ @@ -38,7 +37,6 @@ ADD https://download.geonames.org/export/dump/AT.zip \ https://download.geonames.org/export/dump/SI.zip \ https://download.geonames.org/export/dump/SK.zip \ /tmp/geonames/ - ADD 
https://download.geonames.org/export/dump/alternatenames/AT.zip \ https://download.geonames.org/export/dump/alternatenames/BE.zip \ https://download.geonames.org/export/dump/alternatenames/BG.zip \ @@ -69,21 +67,23 @@ ADD https://download.geonames.org/export/dump/alternatenames/AT.zip \ https://download.geonames.org/export/dump/alternatenames/SI.zip \ https://download.geonames.org/export/dump/alternatenames/SK.zip \ /tmp/alternateNames/ - RUN mkdir -p /data/geonames /data/alternateNames && \ for COUNTRY in AT BE BG CH CY CZ DE DK EE ES FI FR GR HR HU IE IT LT LU LV MT NL NO PL PT RO SE SI SK; do \ - unzip -d /data/geonames/ /tmp/geonames/$COUNTRY.zip $COUNTRY.txt; \ - unzip -d /data/alternateNames/ /tmp/alternateNames/$COUNTRY.zip $COUNTRY.txt; \ + unzip -d /data/geonames/ /tmp/geonames/$COUNTRY.zip $COUNTRY.txt; \ + unzip -d /data/alternateNames/ /tmp/alternateNames/$COUNTRY.zip $COUNTRY.txt; \ done -FROM cgr.dev/chainguard/glibc-dynamic:latest AS prod +FROM python:3.12-slim AS prod COPY --from=builder /build/target/release/geonames-fst /app/ COPY --from=data /data /app/data/ COPY src/main/resources/ /app/resources/ WORKDIR /app/ - +RUN pip install --no-cache-dir -r /app/resources/requirements.txt \ + && chmod +x /app/resources/entrypoint.sh ENV RUST_LOG="info,tower_http=debug,axum::rejection=trace" - +ENV PORT="9714" +ENV GEONAMES_BACKEND_PORT="9715" +ENV GEONAMES_BACKEND="http://127.0.0.1:9715" EXPOSE 9714 -ENTRYPOINT ["/app/geonames-fst", "--port", "9714", "/app/data/geonames/", "--alternate", "/app/data/alternateNames/"] -CMD [] \ No newline at end of file +ENTRYPOINT ["/app/resources/entrypoint.sh"] +CMD [] diff --git a/duui-geonames-fst/src/main/docker/europe-central.Dockerfile b/duui-geonames-fst/src/main/docker/europe-central.Dockerfile index 6d63a9bf..1c145381 100644 --- a/duui-geonames-fst/src/main/docker/europe-central.Dockerfile +++ b/duui-geonames-fst/src/main/docker/europe-central.Dockerfile @@ -1,13 +1,12 @@ ARG GEONAMES_FST_VERSION=0.3.1 -FROM 
docker.texttechnologylab.org/duui-geonames-fst/base:${GEONAMES_FST_VERSION} AS builder +FROM docker.texttechnologylab.org/duui-geonames-fst/base:${GEONAMES_FST_VERSION} AS builder WORKDIR /build/ RUN cargo build --release --no-default-features --features duui RUN chmod +x /build/target/release/geonames-fst FROM alpine:latest AS data RUN apk --update add unzip && rm -rf /var/cache/apk/* - ADD https://download.geonames.org/export/dump/CZ.zip \ https://download.geonames.org/export/dump/DK.zip \ https://download.geonames.org/export/dump/DE.zip \ @@ -18,7 +17,6 @@ ADD https://download.geonames.org/export/dump/CZ.zip \ https://download.geonames.org/export/dump/SK.zip \ https://download.geonames.org/export/dump/SL.zip \ /tmp/geonames/ - ADD https://download.geonames.org/export/dump/alternatenames/CZ.zip \ https://download.geonames.org/export/dump/alternatenames/DK.zip \ https://download.geonames.org/export/dump/alternatenames/DE.zip \ @@ -29,21 +27,23 @@ ADD https://download.geonames.org/export/dump/alternatenames/CZ.zip \ https://download.geonames.org/export/dump/alternatenames/SK.zip \ https://download.geonames.org/export/dump/alternatenames/SL.zip \ /tmp/alternateNames/ - RUN mkdir -p /data/geonames /data/alternateNames && \ for COUNTRY in CZ DK DE GB HU IE PL SK SL; do \ - unzip -d /data/geonames/ /tmp/geonames/$COUNTRY.zip $COUNTRY.txt; \ - unzip -d /data/alternateNames/ /tmp/alternateNames/$COUNTRY.zip $COUNTRY.txt; \ + unzip -d /data/geonames/ /tmp/geonames/$COUNTRY.zip $COUNTRY.txt; \ + unzip -d /data/alternateNames/ /tmp/alternateNames/$COUNTRY.zip $COUNTRY.txt; \ done -FROM cgr.dev/chainguard/glibc-dynamic:latest AS prod +FROM python:3.12-slim AS prod COPY --from=builder /build/target/release/geonames-fst /app/ COPY --from=data /data /app/data/ COPY src/main/resources/ /app/resources/ WORKDIR /app/ - +RUN pip install --no-cache-dir -r /app/resources/requirements.txt \ + && chmod +x /app/resources/entrypoint.sh ENV 
RUST_LOG="info,tower_http=debug,axum::rejection=trace" - +ENV PORT="9714" +ENV GEONAMES_BACKEND_PORT="9715" +ENV GEONAMES_BACKEND="http://127.0.0.1:9715" EXPOSE 9714 -ENTRYPOINT ["/app/geonames-fst", "--port", "9714", "/app/data/geonames/", "--alternate", "/app/data/alternateNames/"] -CMD [] \ No newline at end of file +ENTRYPOINT ["/app/resources/entrypoint.sh"] +CMD [] diff --git a/duui-geonames-fst/src/main/docker/europe.Dockerfile b/duui-geonames-fst/src/main/docker/europe.Dockerfile index 4663e03c..006eb6ed 100644 --- a/duui-geonames-fst/src/main/docker/europe.Dockerfile +++ b/duui-geonames-fst/src/main/docker/europe.Dockerfile @@ -1,13 +1,12 @@ ARG GEONAMES_FST_VERSION=0.3.1 -FROM docker.texttechnologylab.org/duui-geonames-fst/base:${GEONAMES_FST_VERSION} AS builder +FROM docker.texttechnologylab.org/duui-geonames-fst/base:${GEONAMES_FST_VERSION} AS builder WORKDIR /build/ RUN cargo build --release --no-default-features --features duui RUN chmod +x /build/target/release/geonames-fst FROM alpine:latest AS data RUN apk --update add unzip && rm -rf /var/cache/apk/* - ADD https://download.geonames.org/export/dump/DE.zip \ https://download.geonames.org/export/dump/FR.zip \ https://download.geonames.org/export/dump/GR.zip \ @@ -45,7 +44,6 @@ ADD https://download.geonames.org/export/dump/DE.zip \ https://download.geonames.org/export/dump/UA.zip \ https://download.geonames.org/export/dump/VA.zip \ /tmp/geonames/ - ADD https://download.geonames.org/export/dump/alternatenames/DE.zip \ https://download.geonames.org/export/dump/alternatenames/FR.zip \ https://download.geonames.org/export/dump/alternatenames/GR.zip \ @@ -83,21 +81,23 @@ ADD https://download.geonames.org/export/dump/alternatenames/DE.zip \ https://download.geonames.org/export/dump/alternatenames/UA.zip \ https://download.geonames.org/export/dump/alternatenames/VA.zip \ /tmp/alternateNames/ - RUN mkdir -p /data/geonames /data/alternateNames && \ for COUNTRY in DE FR GR HU BG DK EE CH CY CZ FI ES GB IT LI LT 
LU LV ME MK MT NL NO PL PT RO RS RU SE SI SL SK SM TR UA VA; do \ - unzip -d /data/geonames/ /tmp/geonames/$COUNTRY.zip $COUNTRY.txt; \ - unzip -d /data/alternateNames/ /tmp/alternateNames/$COUNTRY.zip $COUNTRY.txt; \ + unzip -d /data/geonames/ /tmp/geonames/$COUNTRY.zip $COUNTRY.txt; \ + unzip -d /data/alternateNames/ /tmp/alternateNames/$COUNTRY.zip $COUNTRY.txt; \ done -FROM cgr.dev/chainguard/glibc-dynamic:latest AS prod +FROM python:3.12-slim AS prod COPY --from=builder /build/target/release/geonames-fst /app/ COPY --from=data /data /app/data/ COPY src/main/resources/ /app/resources/ WORKDIR /app/ - +RUN pip install --no-cache-dir -r /app/resources/requirements.txt \ + && chmod +x /app/resources/entrypoint.sh ENV RUST_LOG="info,tower_http=debug,axum::rejection=trace" - +ENV PORT="9714" +ENV GEONAMES_BACKEND_PORT="9715" +ENV GEONAMES_BACKEND="http://127.0.0.1:9715" EXPOSE 9714 -ENTRYPOINT ["/app/geonames-fst", "--port", "9714", "/app/data/geonames/", "--alternate", "/app/data/alternateNames/"] -CMD [] \ No newline at end of file +ENTRYPOINT ["/app/resources/entrypoint.sh"] +CMD [] diff --git a/duui-geonames-fst/src/main/docker/single.Dockerfile b/duui-geonames-fst/src/main/docker/single.Dockerfile index 82beabee..19ae2390 100644 --- a/duui-geonames-fst/src/main/docker/single.Dockerfile +++ b/duui-geonames-fst/src/main/docker/single.Dockerfile @@ -1,32 +1,31 @@ ARG GEONAMES_FST_VERSION=0.3.1 -FROM docker.texttechnologylab.org/duui-geonames-fst/base:${GEONAMES_FST_VERSION} AS builder +FROM docker.texttechnologylab.org/duui-geonames-fst/base:${GEONAMES_FST_VERSION} AS builder WORKDIR /build/ RUN cargo build --release --no-default-features --features duui RUN chmod +x /build/target/release/geonames-fst FROM alpine:latest AS data RUN apk --update add unzip && rm -rf /var/cache/apk/* - ARG COUNTRY="DE" - ADD https://download.geonames.org/export/dump/${COUNTRY}.zip /data/geonames/ ADD https://download.geonames.org/export/dump/alternatenames/${COUNTRY}.zip 
/data/alternateNames/ - WORKDIR /data/geonames RUN unzip ${COUNTRY}.zip && rm -f ${COUNTRY}.zip readme.txt - WORKDIR /data/alternateNames RUN unzip ${COUNTRY}.zip && rm -f ${COUNTRY}.zip readme.txt -FROM cgr.dev/chainguard/glibc-dynamic:latest AS prod +FROM python:3.12-slim AS prod COPY --from=builder /build/target/release/geonames-fst /app/ COPY --from=data /data /app/data/ COPY src/main/resources/ /app/resources/ WORKDIR /app/ - +RUN pip install --no-cache-dir -r /app/resources/requirements.txt \ + && chmod +x /app/resources/entrypoint.sh ENV RUST_LOG="info,tower_http=debug,axum::rejection=trace" - +ENV PORT="9714" +ENV GEONAMES_BACKEND_PORT="9715" +ENV GEONAMES_BACKEND="http://127.0.0.1:9715" EXPOSE 9714 -ENTRYPOINT ["/app/geonames-fst", "--port", "9714", "/app/data/geonames/", "--alternate", "/app/data/alternateNames/"] -CMD [] \ No newline at end of file +ENTRYPOINT ["/app/resources/entrypoint.sh"] +CMD [] diff --git a/duui-geonames-fst/src/main/resources/communication_layer.lua b/duui-geonames-fst/src/main/resources/communication_layer.lua index 52a9914d..8c9605a4 100644 --- a/duui-geonames-fst/src/main/resources/communication_layer.lua +++ b/duui-geonames-fst/src/main/resources/communication_layer.lua @@ -1,70 +1,30 @@ --- Bind static classes from java +-- Bind static classes from Java StandardCharsets = luajava.bindClass("java.nio.charset.StandardCharsets") JCasUtil = luajava.bindClass("org.apache.uima.fit.util.JCasUtil") ----Indicates that this component supports the "new" `process` method. -SUPPORTS_PROCESS = true ----Indicates that this component does NOT support the old `serialize`/`deserialize` methods. -SUPPORTS_SERIALIZE = false +---This component does NOT support the new `process` method. +SUPPORTS_PROCESS = false ----Create and yield batches of elements from an iterator after applying a transform function. 
----@param iterator any an iterator over annotations ----@param batch_size integer size of each batch sent to the component -function get_batches(iterator, batch_size) - local entities, references = {}, {} - while iterator:hasNext() do - local entity = iterator:next() - - references[#references + 1] = entity - entities[#entities + 1] = { - reference = tostring(#references), - text = entity:getCoveredText() - } - - if #entities == batch_size then - coroutine.yield({ - entities, references - }) - entities, references = {}, {} - end - end - - if #entities > 0 then - coroutine.yield({ - entities, references - }) - end -end +---This component supports the old `serialize`/`deserialize` methods. +SUPPORTS_SERIALIZE = true ----Iterate over batches of elements from an iterator after applying a transform function. ----@param iterator any an iterator over annotations ----@param batch_size integer size of each batch ----@return fun(): table an iterator over batches to process -function batched(iterator, batch_size) - local co = coroutine.create(function() get_batches(iterator, batch_size) end) - return function() - local _, batch = coroutine.resume(co) - return batch - end -end - -REQUEST_BATCH_SIZE = 4096 ANNOTATION_TYPE = "de.tudarmstadt.ukp.dkpro.core.api.ner.type.Location" ----Process the sentences in the given JCas in small batches. ----@param sourceCas any JCas (view) to process ----@param handler any DuuiHttpRequestHandler with a connection to the running component +---Serialize annotations from the DUUI source view. +---DUUI resolves `.withSourceView(...)` before calling this function. +---Therefore inputCas is already the source view. 
+---@param inputCas any source JCas view +---@param outputStream any output stream to the remote component ---@param parameters table optional parameters ----@param targetCas any JCas (view) to write the results to (optional) -function process(sourceCas, handler, parameters, targetCas) +function serialize(inputCas, outputStream, parameters) parameters = parameters or {} - local batch_size = parameters.request_batch_size or REQUEST_BATCH_SIZE local annotation_type = parameters.annotation_type or ANNOTATION_TYPE local query = { mode = parameters.mode or "find", result_selection = parameters.result_selection or "first", + queries = {} } if parameters.filter ~= nil then @@ -79,65 +39,163 @@ function process(sourceCas, handler, parameters, targetCas) query.state_limit = tostring(parameters.state_limit) end - handler:setHeader("Content-Type", "application/json") + if parameters.min_length ~= nil then + query.min_length = tostring(parameters.min_length) + end - local iterator, results = JCasUtil:select(sourceCas, luajava.bindClass(annotation_type)):iterator() - for batch in batched(iterator, batch_size) do - local entities, references = table.unpack(batch) - query.queries = entities + local annotation_class = luajava.bindClass(annotation_type) + local iterator = JCasUtil:select(inputCas, annotation_class):iterator() - local response = handler:process(json.encode(query)) + local index = 1 - if not response:ok() then - error("Error " .. response:statusCode() .. " in communication with component: " .. response:bodyAsString()) + while iterator:hasNext() do + local entity = iterator:next() + + query.queries[#query.queries + 1] = { + reference = tostring(index), + text = entity:getCoveredText(), + begin = entity:getBegin(), + ["end"] = entity:getEnd() + } + + index = index + 1 + end + + outputStream:write(json.encode(query)) +end + +---Deserialize GeoNames results into the DUUI target view. +---DUUI resolves/creates `.withTargetView(...)` before calling this function. 
+---Therefore inputCas is already the target view. +---@param inputCas any target JCas view +---@param inputStream any response stream from the remote component +function deserialize(inputCas, inputStream) + local inputString = luajava.newInstance( + "java.lang.String", + inputStream:readAllBytes(), + StandardCharsets.UTF_8 + ) + + local results = json.decode(inputString) + + if results == nil then + return + end + + if results.results ~= nil then + for _, entity in ipairs(results.results) do + add_geonames_annotation(inputCas, entity) end - - results = json.decode(response:body()) - process_response(targetCas, results, references) end - local results_modification = results.modification - local document_modification = luajava.newInstance("org.texttechnologylab.annotation.DocumentModification", targetCas) - document_modification:setUser(results_modification.user) - document_modification:setTimestamp(results_modification.timestamp) - document_modification:setComment(results_modification.comment) - document_modification:addToIndexes() + if results.modification ~= nil then + add_document_modification(inputCas, results.modification) + end end ----Process the response from the component. ----@param targetCas any JCas ----@param results table the results from the component ----@param references table -function process_response(targetCas, results, references) - for _, entity in ipairs(results.results) do - local gn = entity.entry - - local annotation = luajava.newInstance("org.texttechnologylab.annotation.geonames.GeoNamesEntity", targetCas) +---Create one GeoNamesEntity annotation in the target view. 
+---@param targetCas any target JCas view +---@param entity table one result entry from the component +function add_geonames_annotation(targetCas, entity) + if entity == nil or entity.entry == nil then + return + end + + local gn = entity.entry + + local begin_pos = tonumber(entity.begin) + local end_pos = tonumber(entity["end"]) + + if begin_pos == nil or end_pos == nil then + error( + "Missing begin/end offsets in GeoNames response for reference: " .. + tostring(entity.reference) + ) + end + + local annotation = luajava.newInstance( + "org.texttechnologylab.annotation.geonames.GeoNamesEntity", + targetCas + ) + + annotation:setBegin(begin_pos) + annotation:setEnd(end_pos) + + if gn.id ~= nil then annotation:setId(tonumber(gn.id)) + end + + if gn.name ~= nil then annotation:setName(gn.name) + end + + if gn.latitude ~= nil then annotation:setLatitude(gn.latitude) + end + + if gn.longitude ~= nil then annotation:setLongitude(gn.longitude) + end + + if gn.feature_class ~= nil then annotation:setFeatureClass(gn.feature_class) + end + + if gn.feature_code ~= nil then annotation:setFeatureCode(gn.feature_code) + end + + if gn.country_code ~= nil then annotation:setCountryCode(gn.country_code) + end + + if gn.adm1 ~= nil then annotation:setAdm1(gn.adm1) + end + + if gn.adm2 ~= nil then annotation:setAdm2(gn.adm2) + end + + if gn.adm3 ~= nil then annotation:setAdm3(gn.adm3) + end + + if gn.adm4 ~= nil then annotation:setAdm4(gn.adm4) + end - if gn.elevation ~= nil then - annotation:setElevation(gn.elevation) - end + if gn.elevation ~= nil then + annotation:setElevation(gn.elevation) + end - local reference = references[tonumber(entity.reference)] - if reference == nil then - error("Failed to resolve reference annotation with index " .. 
entity.reference) - else - annotation:setReferenceAnnotation(reference) - annotation:setBegin(reference:getBegin()) - annotation:setEnd(reference:getEnd()) - end + annotation:addToIndexes() +end - annotation:addToIndexes() +---Add DocumentModification annotation to the target view. +---@param targetCas any target JCas view +---@param modification table modification metadata from component +function add_document_modification(targetCas, modification) + if modification == nil then + return end + + local document_modification = luajava.newInstance( + "org.texttechnologylab.annotation.DocumentModification", + targetCas + ) + + if modification.user ~= nil then + document_modification:setUser(modification.user) + end + + if modification.timestamp ~= nil then + document_modification:setTimestamp(modification.timestamp) + end + + if modification.comment ~= nil then + document_modification:setComment(modification.comment) + end + + document_modification:addToIndexes() end diff --git a/duui-geonames-fst/src/main/resources/duui_geonames_proxy.py b/duui-geonames-fst/src/main/resources/duui_geonames_proxy.py new file mode 100644 index 00000000..0209bfa7 --- /dev/null +++ b/duui-geonames-fst/src/main/resources/duui_geonames_proxy.py @@ -0,0 +1,191 @@ +#!/usr/bin/env python3 + +import os +from pathlib import Path +from typing import Any, Dict + +import httpx +import uvicorn +from fastapi import FastAPI, Request, Response +from fastapi.responses import JSONResponse, PlainTextResponse + + +BACKEND_URL = os.environ.get("GEONAMES_BACKEND", "http://127.0.0.1:9715") +RESOURCE_DIR = Path(os.environ.get("DUUI_RESOURCE_DIR", Path(__file__).resolve().parent)) +COMMUNICATION_LAYER = Path( + os.environ.get("DUUI_COMMUNICATION_LAYER", RESOURCE_DIR / "communication_layer.lua") +) + +app = FastAPI(title="DUUI GeoNames FST Proxy") + + +@app.get("/") +async def root() -> Dict[str, Any]: + return { + "name": "duui-geonames-fst-proxy", + "backend": BACKEND_URL, + "communication_layer": 
str(COMMUNICATION_LAYER), + "description": ( + "Proxy for DUUI serialize/deserialize mode. " + "Serves the Lua communication layer, adds Content-Type: application/json " + "and restores begin/end offsets." + ), + } + + +@app.get("/v1/communication_layer") +async def communication_layer() -> Response: + """DUUI fetches the Lua communication layer via GET before processing.""" + try: + content = COMMUNICATION_LAYER.read_text(encoding="utf-8") + except FileNotFoundError: + return PlainTextResponse( + f"communication_layer.lua not found at {COMMUNICATION_LAYER}", + status_code=500, + ) + + return PlainTextResponse(content, status_code=200, media_type="text/plain") + + +@app.get("/{path:path}") +async def forward_get(path: str, request: Request) -> Response: + """Forward non-Lua GET requests to the real backend.""" + return await forward_raw_get(request, f"/{path}") + + +@app.post("/") +async def process_root(request: Request) -> Response: + return await forward_json_post(request, "/") + + +@app.post("/{path:path}") +async def process_path(path: str, request: Request) -> Response: + return await forward_json_post(request, f"/{path}") + + +async def forward_raw_get(request: Request, path: str) -> Response: + backend_url = BACKEND_URL.rstrip("/") + path + + try: + async with httpx.AsyncClient(timeout=None) as client: + backend_response = await client.get( + backend_url, + params=dict(request.query_params), + headers={"Accept": request.headers.get("accept", "*/*")}, + ) + except Exception as exc: + return PlainTextResponse( + f"Proxy error while forwarding GET to {backend_url}: {exc}", + status_code=502, + ) + + return Response( + content=backend_response.content, + status_code=backend_response.status_code, + media_type=backend_response.headers.get("content-type", "text/plain"), + ) + + +async def forward_json_post(request: Request, path: str) -> Response: + body = await request.body() + + if not body: + return PlainTextResponse("Empty request body", status_code=400) + + 
try: + request_json = await request.json() + except Exception as exc: + return PlainTextResponse(f"Invalid JSON request body: {exc}", status_code=400) + + reference_offsets = build_reference_offset_map(request_json) + backend_url = BACKEND_URL.rstrip("/") + path + + try: + async with httpx.AsyncClient(timeout=None) as client: + backend_response = await client.post( + backend_url, + content=body, + headers={ + "Content-Type": "application/json", + "Accept": "application/json", + }, + ) + except Exception as exc: + return PlainTextResponse( + f"Proxy error while forwarding POST to {backend_url}: {exc}", + status_code=502, + ) + + if backend_response.status_code < 200 or backend_response.status_code >= 300: + return Response( + content=backend_response.content, + status_code=backend_response.status_code, + media_type=backend_response.headers.get("content-type", "text/plain"), + ) + + try: + response_json = backend_response.json() + except Exception: + return Response( + content=backend_response.content, + status_code=backend_response.status_code, + media_type=backend_response.headers.get("content-type", "application/json"), + ) + + response_json = enrich_response_with_offsets(response_json, reference_offsets) + + return JSONResponse( + content=response_json, + status_code=backend_response.status_code, + ) + + +def build_reference_offset_map(request_json: Dict[str, Any]) -> Dict[str, Dict[str, int]]: + result: Dict[str, Dict[str, int]] = {} + + for query in request_json.get("queries", []): + reference = query.get("reference") + begin = query.get("begin") + end = query.get("end") + + if reference is None or begin is None or end is None: + continue + + result[str(reference)] = { + "begin": int(begin), + "end": int(end), + } + + return result + + +def enrich_response_with_offsets( + response_json: Dict[str, Any], + reference_offsets: Dict[str, Dict[str, int]], +) -> Dict[str, Any]: + for result in response_json.get("results", []): + reference = result.get("reference") + + 
if reference is None: + continue + + offsets = reference_offsets.get(str(reference)) + + if offsets is None: + continue + + result["begin"] = offsets["begin"] + result["end"] = offsets["end"] + + return response_json + + +if __name__ == "__main__": + port = int(os.environ.get("PORT", "9714")) + + uvicorn.run( + app, + host="0.0.0.0", + port=port, + log_level=os.environ.get("LOG_LEVEL", "info"), + ) \ No newline at end of file diff --git a/duui-geonames-fst/src/main/resources/entrypoint.sh b/duui-geonames-fst/src/main/resources/entrypoint.sh new file mode 100755 index 00000000..38c36fbc --- /dev/null +++ b/duui-geonames-fst/src/main/resources/entrypoint.sh @@ -0,0 +1,56 @@ +#!/usr/bin/env sh +set -eu + +PORT="${PORT:-9714}" +GEONAMES_BACKEND_PORT="${GEONAMES_BACKEND_PORT:-9715}" +GEONAMES_BACKEND="${GEONAMES_BACKEND:-http://127.0.0.1:${GEONAMES_BACKEND_PORT}}" +LOG_LEVEL="${LOG_LEVEL:-info}" + +export PORT +export GEONAMES_BACKEND +export LOG_LEVEL + +backend_pid="" +proxy_pid="" + +cleanup() { + if [ -n "${proxy_pid}" ]; then + kill "${proxy_pid}" 2>/dev/null || true + fi + if [ -n "${backend_pid}" ]; then + kill "${backend_pid}" 2>/dev/null || true + fi +} + +trap cleanup INT TERM EXIT + +/app/geonames-fst \ + --port "${GEONAMES_BACKEND_PORT}" \ + /app/data/geonames/ \ + --alternate /app/data/alternateNames/ & +backend_pid="$!" + +python - <<'PY' +import os +import socket +import sys +import time + +host = "127.0.0.1" +port = int(os.environ.get("GEONAMES_BACKEND_PORT", "9715")) + +for _ in range(120): + try: + with socket.create_connection((host, port), timeout=1): + sys.exit(0) + except OSError: + time.sleep(0.5) + +print(f"GeoNames backend did not open {host}:{port}", file=sys.stderr) +sys.exit(1) +PY + +python /app/resources/duui_geonames_proxy.py & +proxy_pid="$!" 
+ +wait "${proxy_pid}" diff --git a/duui-geonames-fst/src/main/resources/requirements.txt b/duui-geonames-fst/src/main/resources/requirements.txt new file mode 100644 index 00000000..d9593de4 --- /dev/null +++ b/duui-geonames-fst/src/main/resources/requirements.txt @@ -0,0 +1,3 @@ +fastapi==0.115.13 +uvicorn[standard]==0.34.3 +httpx==0.28.1 diff --git a/duui-geonames-fst/src/test/java/org/texttechnologylab/textimager/uima/rust/TestGeoNamesFst.java b/duui-geonames-fst/src/test/java/org/texttechnologylab/textimager/uima/rust/TestGeoNamesFst.java index 3b3b4d64..ed25bdc9 100644 --- a/duui-geonames-fst/src/test/java/org/texttechnologylab/textimager/uima/rust/TestGeoNamesFst.java +++ b/duui-geonames-fst/src/test/java/org/texttechnologylab/textimager/uima/rust/TestGeoNamesFst.java @@ -75,7 +75,7 @@ public void test_find_explicit() throws Exception { composer.addDriver(new DUUIRemoteDriver(10000)); composer.add( new DUUIRemoteDriver.Component("http://localhost:9714") - .withName("duui-geonames-fst") +// .withName("duui-geonames-fst") .withParameter("mode", "find") .withParameter("result_selection", "first") ); @@ -110,16 +110,18 @@ public void test_levenshtein() throws Exception { composer.addDriver(new DUUIRemoteDriver(10000)); composer.add( new DUUIRemoteDriver.Component("http://localhost:9714") - .withName("duui-geonames-fst") +// .withName("duui-geonames-fst") .withParameter("mode", "levenshtein") .withParameter("max_dist", "2") .withParameter("result_selection", "first") + .withTargetView("levenshtein") ); JCas jCas = getJCas(); composer.run(jCas); composer.shutdown(); + jCas = jCas.getView("levenshtein"); List annotations = jCas.select(GeoNamesEntity.class).toList(); assert annotations.size() == 6; for (GeoNamesEntity gn : annotations) { diff --git a/duui-geonames-fst/target/classes/communication_layer.lua b/duui-geonames-fst/target/classes/communication_layer.lua new file mode 100644 index 00000000..8c9605a4 --- /dev/null +++ 
b/duui-geonames-fst/target/classes/communication_layer.lua @@ -0,0 +1,201 @@ +-- Bind static classes from Java +StandardCharsets = luajava.bindClass("java.nio.charset.StandardCharsets") +JCasUtil = luajava.bindClass("org.apache.uima.fit.util.JCasUtil") + +---This component does NOT support the new `process` method. +SUPPORTS_PROCESS = false + +---This component supports the old `serialize`/`deserialize` methods. +SUPPORTS_SERIALIZE = true + +ANNOTATION_TYPE = "de.tudarmstadt.ukp.dkpro.core.api.ner.type.Location" + +---Serialize annotations from the DUUI source view. +---DUUI resolves `.withSourceView(...)` before calling this function. +---Therefore inputCas is already the source view. +---@param inputCas any source JCas view +---@param outputStream any output stream to the remote component +---@param parameters table optional parameters +function serialize(inputCas, outputStream, parameters) + parameters = parameters or {} + + local annotation_type = parameters.annotation_type or ANNOTATION_TYPE + + local query = { + mode = parameters.mode or "find", + result_selection = parameters.result_selection or "first", + queries = {} + } + + if parameters.filter ~= nil then + query.filter = parameters.filter + end + + if query.mode ~= "find" and parameters.max_dist ~= nil then + query.max_dist = tostring(parameters.max_dist) + end + + if query.mode == "levenshtein" and parameters.state_limit ~= nil then + query.state_limit = tostring(parameters.state_limit) + end + + if parameters.min_length ~= nil then + query.min_length = tostring(parameters.min_length) + end + + local annotation_class = luajava.bindClass(annotation_type) + local iterator = JCasUtil:select(inputCas, annotation_class):iterator() + + local index = 1 + + while iterator:hasNext() do + local entity = iterator:next() + + query.queries[#query.queries + 1] = { + reference = tostring(index), + text = entity:getCoveredText(), + begin = entity:getBegin(), + ["end"] = entity:getEnd() + } + + index = index + 1 + end 
+ + outputStream:write(json.encode(query)) +end + +---Deserialize GeoNames results into the DUUI target view. +---DUUI resolves/creates `.withTargetView(...)` before calling this function. +---Therefore inputCas is already the target view. +---@param inputCas any target JCas view +---@param inputStream any response stream from the remote component +function deserialize(inputCas, inputStream) + local inputString = luajava.newInstance( + "java.lang.String", + inputStream:readAllBytes(), + StandardCharsets.UTF_8 + ) + + local results = json.decode(inputString) + + if results == nil then + return + end + + if results.results ~= nil then + for _, entity in ipairs(results.results) do + add_geonames_annotation(inputCas, entity) + end + end + + if results.modification ~= nil then + add_document_modification(inputCas, results.modification) + end +end + +---Create one GeoNamesEntity annotation in the target view. +---@param targetCas any target JCas view +---@param entity table one result entry from the component +function add_geonames_annotation(targetCas, entity) + if entity == nil or entity.entry == nil then + return + end + + local gn = entity.entry + + local begin_pos = tonumber(entity.begin) + local end_pos = tonumber(entity["end"]) + + if begin_pos == nil or end_pos == nil then + error( + "Missing begin/end offsets in GeoNames response for reference: " .. 
+ tostring(entity.reference) + ) + end + + local annotation = luajava.newInstance( + "org.texttechnologylab.annotation.geonames.GeoNamesEntity", + targetCas + ) + + annotation:setBegin(begin_pos) + annotation:setEnd(end_pos) + + if gn.id ~= nil then + annotation:setId(tonumber(gn.id)) + end + + if gn.name ~= nil then + annotation:setName(gn.name) + end + + if gn.latitude ~= nil then + annotation:setLatitude(gn.latitude) + end + + if gn.longitude ~= nil then + annotation:setLongitude(gn.longitude) + end + + if gn.feature_class ~= nil then + annotation:setFeatureClass(gn.feature_class) + end + + if gn.feature_code ~= nil then + annotation:setFeatureCode(gn.feature_code) + end + + if gn.country_code ~= nil then + annotation:setCountryCode(gn.country_code) + end + + if gn.adm1 ~= nil then + annotation:setAdm1(gn.adm1) + end + + if gn.adm2 ~= nil then + annotation:setAdm2(gn.adm2) + end + + if gn.adm3 ~= nil then + annotation:setAdm3(gn.adm3) + end + + if gn.adm4 ~= nil then + annotation:setAdm4(gn.adm4) + end + + if gn.elevation ~= nil then + annotation:setElevation(gn.elevation) + end + + annotation:addToIndexes() +end + +---Add DocumentModification annotation to the target view. 
+---@param targetCas any target JCas view +---@param modification table modification metadata from component +function add_document_modification(targetCas, modification) + if modification == nil then + return + end + + local document_modification = luajava.newInstance( + "org.texttechnologylab.annotation.DocumentModification", + targetCas + ) + + if modification.user ~= nil then + document_modification:setUser(modification.user) + end + + if modification.timestamp ~= nil then + document_modification:setTimestamp(modification.timestamp) + end + + if modification.comment ~= nil then + document_modification:setComment(modification.comment) + end + + document_modification:addToIndexes() +end diff --git a/duui-geonames-fst/target/classes/duui_geonames_proxy.py b/duui-geonames-fst/target/classes/duui_geonames_proxy.py new file mode 100644 index 00000000..0209bfa7 --- /dev/null +++ b/duui-geonames-fst/target/classes/duui_geonames_proxy.py @@ -0,0 +1,191 @@ +#!/usr/bin/env python3 + +import os +from pathlib import Path +from typing import Any, Dict + +import httpx +import uvicorn +from fastapi import FastAPI, Request, Response +from fastapi.responses import JSONResponse, PlainTextResponse + + +BACKEND_URL = os.environ.get("GEONAMES_BACKEND", "http://127.0.0.1:9715") +RESOURCE_DIR = Path(os.environ.get("DUUI_RESOURCE_DIR", Path(__file__).resolve().parent)) +COMMUNICATION_LAYER = Path( + os.environ.get("DUUI_COMMUNICATION_LAYER", RESOURCE_DIR / "communication_layer.lua") +) + +app = FastAPI(title="DUUI GeoNames FST Proxy") + + +@app.get("/") +async def root() -> Dict[str, Any]: + return { + "name": "duui-geonames-fst-proxy", + "backend": BACKEND_URL, + "communication_layer": str(COMMUNICATION_LAYER), + "description": ( + "Proxy for DUUI serialize/deserialize mode. " + "Serves the Lua communication layer, adds Content-Type: application/json " + "and restores begin/end offsets." 
+ ), + } + + +@app.get("/v1/communication_layer") +async def communication_layer() -> Response: + """DUUI fetches the Lua communication layer via GET before processing.""" + try: + content = COMMUNICATION_LAYER.read_text(encoding="utf-8") + except FileNotFoundError: + return PlainTextResponse( + f"communication_layer.lua not found at {COMMUNICATION_LAYER}", + status_code=500, + ) + + return PlainTextResponse(content, status_code=200, media_type="text/plain") + + +@app.get("/{path:path}") +async def forward_get(path: str, request: Request) -> Response: + """Forward non-Lua GET requests to the real backend.""" + return await forward_raw_get(request, f"/{path}") + + +@app.post("/") +async def process_root(request: Request) -> Response: + return await forward_json_post(request, "/") + + +@app.post("/{path:path}") +async def process_path(path: str, request: Request) -> Response: + return await forward_json_post(request, f"/{path}") + + +async def forward_raw_get(request: Request, path: str) -> Response: + backend_url = BACKEND_URL.rstrip("/") + path + + try: + async with httpx.AsyncClient(timeout=None) as client: + backend_response = await client.get( + backend_url, + params=dict(request.query_params), + headers={"Accept": request.headers.get("accept", "*/*")}, + ) + except Exception as exc: + return PlainTextResponse( + f"Proxy error while forwarding GET to {backend_url}: {exc}", + status_code=502, + ) + + return Response( + content=backend_response.content, + status_code=backend_response.status_code, + media_type=backend_response.headers.get("content-type", "text/plain"), + ) + + +async def forward_json_post(request: Request, path: str) -> Response: + body = await request.body() + + if not body: + return PlainTextResponse("Empty request body", status_code=400) + + try: + request_json = await request.json() + except Exception as exc: + return PlainTextResponse(f"Invalid JSON request body: {exc}", status_code=400) + + reference_offsets = 
build_reference_offset_map(request_json) + backend_url = BACKEND_URL.rstrip("/") + path + + try: + async with httpx.AsyncClient(timeout=None) as client: + backend_response = await client.post( + backend_url, + content=body, + headers={ + "Content-Type": "application/json", + "Accept": "application/json", + }, + ) + except Exception as exc: + return PlainTextResponse( + f"Proxy error while forwarding POST to {backend_url}: {exc}", + status_code=502, + ) + + if backend_response.status_code < 200 or backend_response.status_code >= 300: + return Response( + content=backend_response.content, + status_code=backend_response.status_code, + media_type=backend_response.headers.get("content-type", "text/plain"), + ) + + try: + response_json = backend_response.json() + except Exception: + return Response( + content=backend_response.content, + status_code=backend_response.status_code, + media_type=backend_response.headers.get("content-type", "application/json"), + ) + + response_json = enrich_response_with_offsets(response_json, reference_offsets) + + return JSONResponse( + content=response_json, + status_code=backend_response.status_code, + ) + + +def build_reference_offset_map(request_json: Dict[str, Any]) -> Dict[str, Dict[str, int]]: + result: Dict[str, Dict[str, int]] = {} + + for query in request_json.get("queries", []): + reference = query.get("reference") + begin = query.get("begin") + end = query.get("end") + + if reference is None or begin is None or end is None: + continue + + result[str(reference)] = { + "begin": int(begin), + "end": int(end), + } + + return result + + +def enrich_response_with_offsets( + response_json: Dict[str, Any], + reference_offsets: Dict[str, Dict[str, int]], +) -> Dict[str, Any]: + for result in response_json.get("results", []): + reference = result.get("reference") + + if reference is None: + continue + + offsets = reference_offsets.get(str(reference)) + + if offsets is None: + continue + + result["begin"] = offsets["begin"] + result["end"] 
= offsets["end"] + + return response_json + + +if __name__ == "__main__": + port = int(os.environ.get("PORT", "9714")) + + uvicorn.run( + app, + host="0.0.0.0", + port=port, + log_level=os.environ.get("LOG_LEVEL", "info"), + ) \ No newline at end of file diff --git a/duui-geonames-fst/target/classes/entrypoint.sh b/duui-geonames-fst/target/classes/entrypoint.sh new file mode 100755 index 00000000..38c36fbc --- /dev/null +++ b/duui-geonames-fst/target/classes/entrypoint.sh @@ -0,0 +1,56 @@ +#!/usr/bin/env sh +set -eu + +PORT="${PORT:-9714}" +GEONAMES_BACKEND_PORT="${GEONAMES_BACKEND_PORT:-9715}" +GEONAMES_BACKEND="${GEONAMES_BACKEND:-http://127.0.0.1:${GEONAMES_BACKEND_PORT}}" +LOG_LEVEL="${LOG_LEVEL:-info}" + +export PORT +export GEONAMES_BACKEND +export LOG_LEVEL + +backend_pid="" +proxy_pid="" + +cleanup() { + if [ -n "${proxy_pid}" ]; then + kill "${proxy_pid}" 2>/dev/null || true + fi + if [ -n "${backend_pid}" ]; then + kill "${backend_pid}" 2>/dev/null || true + fi +} + +trap cleanup INT TERM EXIT + +/app/geonames-fst \ + --port "${GEONAMES_BACKEND_PORT}" \ + /app/data/geonames/ \ + --alternate /app/data/alternateNames/ & +backend_pid="$!" + +python - <<'PY' +import os +import socket +import sys +import time + +host = "127.0.0.1" +port = int(os.environ.get("GEONAMES_BACKEND_PORT", "9715")) + +for _ in range(120): + try: + with socket.create_connection((host, port), timeout=1): + sys.exit(0) + except OSError: + time.sleep(0.5) + +print(f"GeoNames backend did not open {host}:{port}", file=sys.stderr) +sys.exit(1) +PY + +python /app/resources/duui_geonames_proxy.py & +proxy_pid="$!" 
+ +wait "${proxy_pid}" diff --git a/duui-geonames-fst/target/classes/requirements.txt b/duui-geonames-fst/target/classes/requirements.txt new file mode 100644 index 00000000..d9593de4 --- /dev/null +++ b/duui-geonames-fst/target/classes/requirements.txt @@ -0,0 +1,3 @@ +fastapi==0.115.13 +uvicorn[standard]==0.34.3 +httpx==0.28.1 diff --git a/duui-geonames-fst/target/classes/typesystem.xml b/duui-geonames-fst/target/classes/typesystem.xml new file mode 100644 index 00000000..7baa4cb4 --- /dev/null +++ b/duui-geonames-fst/target/classes/typesystem.xml @@ -0,0 +1,4137 @@ + + + GeoNamesTypeSystem + + 1.0 + + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + uima.tcas.Annotation + + + value + + uima.cas.String + + + identifier + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Location + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + org.texttechnologylab.annotation.geonames.GeoNamesEntity + GeoNames annotation base type. + uima.tcas.Annotation + + + id + Integer ID of this record in the GeoNames database. + uima.cas.Integer + + + name + Canonical name of this record, usually an English one. + uima.cas.String + + + featureClass + + Single character feature class, see: http://www.geonames.org/export/codes.html + + GeoNamesFeatureClass + + + featureCode + + Fine-grained feature code, see: + http://www.geonames.org/export/codes.html + + GeoNamesFeatureCode + + + countryCode + + ISO-3166 2-letter country code + + uima.cas.String + + + adm1 + + The code for top level administrative division, most of which are FIPS codes. + ISO codes are used for US, CH, BE and ME. + UK and Greece are using an additional level between country and fips code. + The code '00' stands for general features where no specific adm1 code is defined. + + uima.cas.String + + + adm2 + + The code for the second level administrative division, i.e. a county in the US. + + uima.cas.String + + + adm3 + + The code for third level administrative division. 
+ + uima.cas.String + + + adm4 + + The code for fourth level administrative division. + + uima.cas.String + + + latitude + + Latitude as a 32-bit floating point number. + + uima.cas.Float + + + longitude + + Longitude as a 32-bit floating point number. + + uima.cas.Float + + + elevation + + Elevation in meters above/below normal as a 16-bit signed integer number; + optional, defaults to 0. + + uima.cas.Short + + + referenceAnnotation + + The annotation this GeoName annotation is in reference to. By default, this should be a + 'de.tudarmstadt.ukp.dkpro.core.api.ner.type.Location' annotation. + + uima.tcas.Annotation + + + + + GeoNamesFeatureClass + A GeoNames feature class. + uima.cas.String + + + A + country, state, region,... + + + H + stream, lake, ... + + + L + parks,area, ... + + + P + city, village,... + + + R + road, railroad + + + S + spot, building, farm + + + T + mountain,hill,rock,... + + + U + undersea + + + V + forest,heath,... + + + + + GeoNamesFeatureCode + A GeoNames feature code. 
+ uima.cas.String + + + ADM1 + first-order administrative division; a primary administrative division of a country, + such as a state in the United States + + + + ADM1H + historical first-order administrative division; a former first-order administrative + division + + + + ADM2 + second-order administrative division; a subdivision of a first-order administrative + division + + + + ADM2H + historical second-order administrative division; a former second-order administrative + division + + + + ADM3 + third-order administrative division; a subdivision of a second-order administrative + division + + + + ADM3H + historical third-order administrative division; a former third-order administrative + division + + + + ADM4 + fourth-order administrative division; a subdivision of a third-order administrative + division + + + + ADM4H + historical fourth-order administrative division; a former fourth-order administrative + division + + + + ADM5 + fifth-order administrative division; a subdivision of a fourth-order administrative + division + + + + ADM5H + historical fifth-order administrative division; a former fifth-order administrative + division + + + + ADMD + administrative division; an administrative division of a country, undifferentiated as + to administrative level + + + + ADMDH + historical administrative division ; a former administrative division of a political + entity, undifferentiated as to administrative level + + + + LTER + leased area; a tract of land leased to another country, usually for military + installations + + + + PCL + political entity + + + PCLD + dependent political entity + + + PCLF + freely associated state + + + PCLH + historical political entity; a former political entity + + + PCLI + independent political entity + + + PCLIX + section of independent political entity + + + PCLS + semi-independent political entity + + + PRSH + parish; an ecclesiastical district + + + TERR + territory + + + ZN + zone + + + ZNB + buffer zone; a zone recognized as a 
buffer between two nations in which military + presence is minimal or absent + + + + AIRS + seaplane landing area; a place on a waterbody where floatplanes land and take off + + + + ANCH + anchorage; an area where vessels may anchor + + + BAY + bay; a coastal indentation between two capes or headlands, larger than a cove but + smaller than a gulf + + + + BAYS + bays; coastal indentations between two capes or headlands, larger than a cove but + smaller than a gulf + + + + BGHT + bight(s); an open body of water forming a slight recession in a coastline + + + BNK + bank(s); an elevation, typically located on a shelf, over which the depth of water is + relatively shallow but sufficient for most surface navigation + + + + BNKR + stream bank; a sloping margin of a stream channel which normally confines the stream to + its channel on land + + + + BNKX + section of bank + + + BOG + bog(s); a wetland characterized by peat forming sphagnum moss, sedge, and other + acid-water plants + + + + CAPG + icecap; a dome-shaped mass of glacial ice covering an area of mountain summits or other + high lands; smaller than an ice sheet + + + + CHN + channel; the deepest part of a stream, bay, lagoon, or strait, through which the main + current flows + + + + CHNL + lake channel(s); that part of a lake having water deep enough for navigation between + islands, shoals, etc. 
+ + + + CHNM + marine channel; that part of a body of water deep enough for navigation through an area + otherwise not suitable + + + + CHNN + navigation channel; a buoyed channel of sufficient depth for the safe navigation of + vessels + + + + CNFL + confluence; a place where two or more streams or intermittent streams flow together + + + + CNL + canal; an artificial watercourse + + + CNLA + aqueduct; a conduit used to carry water + + + CNLB + canal bend; a conspicuously curved or bent section of a canal + + + CNLD + drainage canal; an artificial waterway carrying water away from a wetland or from + drainage ditches + + + + CNLI + irrigation canal; a canal which serves as a main conduit for irrigation water + + + + CNLN + navigation canal(s); a watercourse constructed for navigation of vessels + + + CNLQ + abandoned canal + + + CNLSB + underground irrigation canal(s); a gently inclined underground tunnel bringing water + for irrigation from aquifers + + + + CNLX + section of canal + + + COVE + cove(s); a small coastal indentation, smaller than a bay + + + CRKT + tidal creek(s); a meandering channel in a coastal wetland subject to bi-directional + tidal currents + + + + CRNT + current; a horizontal flow of water in a given direction with uniform velocity + + + + CUTF + cutoff; a channel formed as a result of a stream cutting through a meander neck + + + + DCK + dock(s); a waterway between two piers, or cut into the land for the berthing of ships + + + + DCKB + docking basin; a part of a harbor where ships dock + + + DOMG + icecap dome; a comparatively elevated area on an icecap + + + DPRG + icecap depression; a comparatively depressed area on an icecap + + + DTCH + ditch; a small artificial watercourse dug for draining or irrigating the land + + + + DTCHD + drainage ditch; a ditch which serves to drain the land + + + DTCHI + irrigation ditch; a ditch which serves to distribute irrigation water + + + DTCHM + ditch mouth(s); an area where a drainage ditch enters a 
lagoon, lake or bay + + + + ESTY + estuary; a funnel-shaped stream mouth or embayment where fresh water mixes with sea + water under tidal influences + + + + FISH + fishing area; a fishing ground, bank or area where fishermen go to catch fish + + + + FJD + fjord; a long, narrow, steep-walled, deep-water arm of the sea at high latitudes, + usually along mountainous coasts + + + + FJDS + fjords; long, narrow, steep-walled, deep-water arms of the sea at high latitudes, + usually along mountainous coasts + + + + FLLS + waterfall(s); a perpendicular or very steep descent of the water of a stream + + + + FLLSX + section of waterfall(s) + + + FLTM + mud flat(s); a relatively level area of mud either between high and low tide lines, or + subject to flooding + + + + FLTT + tidal flat(s); a large flat area of mud or sand attached to the shore and alternately + covered and uncovered by the tide + + + + GLCR + glacier(s); a mass of ice, usually at high latitudes or high elevations, with + sufficient thickness to flow away from the source area in lobes, tongues, or masses + + + + GULF + gulf; a large recess in the coastline, larger than a bay + + + GYSR + geyser; a type of hot spring with intermittent eruptions of jets of hot water and + steam + + + + HBR + harbor(s); a haven or space of deep water so sheltered by the adjacent land as to + afford a safe anchorage for ships + + + + HBRX + section of harbor + + + INLT + inlet; a narrow waterway extending into the land, or connecting a bay or lagoon with a + larger body of water + + + + INLTQ + former inlet; an inlet which has been filled in, or blocked by deposits + + + LBED + lake bed(s); a dried up or drained area of a former lake + + + LGN + lagoon; a shallow coastal waterbody, completely or partly separated from a larger body + of water by a barrier island, coral reef or other depositional feature + + + + LGNS + lagoons; shallow coastal waterbodies, completely or partly separated from a larger body + of water by a barrier 
island, coral reef or other depositional feature + + + + LGNX + section of lagoon + + + LK + lake; a large inland body of standing water + + + LKC + crater lake; a lake in a crater or caldera + + + LKI + intermittent lake + + + LKN + salt lake; an inland body of salt water with no outlet + + + LKNI + intermittent salt lake + + + LKO + oxbow lake; a crescent-shaped lake commonly found adjacent to meandering streams + + + + LKOI + intermittent oxbow lake + + + LKS + lakes; large inland bodies of standing water + + + LKSB + underground lake; a standing body of water in a cave + + + LKSC + crater lakes; lakes in a crater or caldera + + + LKSI + intermittent lakes + + + LKSN + salt lakes; inland bodies of salt water with no outlet + + + LKSNI + intermittent salt lakes + + + LKX + section of lake + + + MFGN + salt evaporation ponds; diked salt ponds used in the production of solar evaporated + salt + + + + MGV + mangrove swamp; a tropical tidal mud flat characterized by mangrove vegetation + + + + MOOR + moor(s); an area of open ground overlaid with wet peaty soils + + + MRSH + marsh(es); a wetland dominated by grass-like vegetation + + + MRSHN + salt marsh; a flat area, subject to periodic salt water inundation, dominated by grassy + salt-tolerant plants + + + + NRWS + narrows; a navigable narrow part of a bay, strait, river, etc. 
+ + + OCN + ocean; one of the major divisions of the vast expanse of salt water covering part of + the earth + + + + OVF + overfalls; an area of breaking waves caused by the meeting of currents or by waves + moving against the current + + + + PND + pond; a small standing waterbody + + + PNDI + intermittent pond + + + PNDN + salt pond; a small standing body of salt water often in a marsh or swamp, usually along + a seacoast + + + + PNDNI + intermittent salt pond(s) + + + PNDS + ponds; small standing waterbodies + + + PNDSF + fishponds; ponds or enclosures in which fish are kept or raised + + + PNDSI + intermittent ponds + + + PNDSN + salt ponds; small standing bodies of salt water often in a marsh or swamp, usually + along a seacoast + + + + POOL + pool(s); a small and comparatively still, deep part of a larger body of water such as a + stream or harbor; or a small body of standing water + + + + POOLI + intermittent pool + + + RCH + reach; a straight section of a navigable stream or channel between two bends + + + + RDGG + icecap ridge; a linear elevation on an icecap + + + RDST + roadstead; an open anchorage affording less protection than a harbor + + + RF + reef(s); a surface-navigation hazard composed of consolidated material + + + RFC + coral reef(s); a surface-navigation hazard composed of coral + + + RFX + section of reef + + + RPDS + rapids; a turbulent section of a stream associated with a steep, irregular stream bed + + + + RSV + reservoir(s); an artificial pond or lake + + + RSVI + intermittent reservoir + + + RSVT + water tank; a contained pool or tank of water at, below, or above ground level + + + + RVN + ravine(s); a small, narrow, deep, steep-sided stream channel, smaller than a gorge + + + + SBKH + sabkha(s); a salt flat or salt encrusted plain subject to periodic inundation from + flooding or high tides + + + + SD + sound; a long arm of the sea forming a channel between the mainland and an island or + islands; or connecting two larger bodies of 
water + + + + SEA + sea; a large body of salt water more or less confined by continuous land or chains of + islands forming a subdivision of an ocean + + + + SHOL + shoal(s); a surface-navigation hazard composed of unconsolidated material + + + SILL + sill; the low part of an underwater gap or saddle separating basins, including a + similar feature at the mouth of a fjord + + + + SPNG + spring(s); a place where ground water flows naturally out of the ground + + + SPNS + sulphur spring(s); a place where sulphur ground water flows naturally out of the + ground + + + + SPNT + hot spring(s); a place where hot ground water flows naturally out of the ground + + + + STM + stream; a body of running water moving to a lower level in a channel on land + + + + STMA + anabranch; a diverging branch flowing out of a main stream and rejoining it + downstream + + + + STMB + stream bend; a conspicuously curved or bent segment of a stream + + + STMC + canalized stream; a stream that has been substantially ditched, diked, or + straightened + + + + STMD + distributary(-ies); a branch which flows away from the main stream, as in a delta or + irrigation canal + + + + STMH + headwaters; the source and upper part of a stream, including the upper drainage basin + + + + STMI + intermittent stream + + + STMIX + section of intermittent stream + + + STMM + stream mouth(s); a place where a stream discharges into a lagoon, lake, or the sea + + + + STMQ + abandoned watercourse; a former stream or distributary no longer carrying flowing + water, but still evident due to lakes, wetland, topographic or vegetation patterns + + + + STMS + streams; bodies of running water moving to a lower level in a channel on land + + + + STMSB + lost river; a surface stream that disappears into an underground channel, or dries up + in an arid area + + + + STMX + section of stream + + + STRT + strait; a relatively narrow waterway, usually narrower and less extensive than a sound, + connecting two larger bodies of 
water + + + + SWMP + swamp; a wetland dominated by tree vegetation + + + SYSI + irrigation system; a network of ditches and one or more of the following elements: + water supply, reservoir, canal, pump, well, drain, etc. + + + + TNLC + canal tunnel; a tunnel through which a canal passes + + + WAD + wadi; a valley or ravine, bounded by relatively steep banks, which in the rainy season + becomes a watercourse; found primarily in North Africa and the Middle East + + + + WADB + wadi bend; a conspicuously curved or bent segment of a wadi + + + WADJ + wadi junction; a place where two or more wadies join + + + WADM + wadi mouth; the lower terminus of a wadi where it widens into an adjoining floodplain, + depression, or waterbody + + + + WADS + wadies; valleys or ravines, bounded by relatively steep banks, which in the rainy + season become watercourses; found primarily in North Africa and the Middle East + + + + WADX + section of wadi + + + WHRL + whirlpool; a turbulent, rotating movement of water in a stream + + + WLL + well; a cylindrical hole, pit, or tunnel drilled or dug down to a depth from which + water, oil, or gas can be pumped or brought to the surface + + + + WLLQ + abandoned well + + + WLLS + wells; cylindrical holes, pits, or tunnels drilled or dug down to a depth from which + water, oil, or gas can be pumped or brought to the surface + + + + WTLD + wetland; an area subject to inundation, usually characterized by bog, marsh, or swamp + vegetation + + + + WTLDI + intermittent wetland + + + WTRC + watercourse; a natural, well-defined channel produced by flowing water, or an + artificial channel designed to carry flowing water + + + + WTRH + waterhole(s); a natural hole, hollow, or small depression that contains water, used by + man and animals, especially in arid areas + + + + AGRC + agricultural colony; a tract of land set aside for agricultural settlement + + + + AMUS + amusement park; Amusement Park are theme parks, adventure parks offering entertainment, + 
similar to funfairs but with a fix location + + + + AREA + area; a tract of land without homogeneous character or boundaries + + + BSND + drainage basin; an area drained by a stream + + + BSNP + petroleum basin; an area underlain by an oil-rich structural basin + + + BTL + battlefield; a site of a land battle of historical importance + + + CLG + clearing; an area in a forest with trees removed + + + CMN + common; a park or pasture for community use + + + CNS + concession area; a lease of land by a government for economic development, e.g., + mining, forestry + + + + COLF + coalfield; a region in which coal deposits of possible economic value occur + + + + CONT + continent; continent: Europe, Africa, Asia, North America, South America, Oceania, + Antarctica + + + + CST + coast; a zone of variable width straddling the shoreline + + + CTRB + business center; a place where a number of businesses are located + + + DEVH + housing development; a tract of land on which many houses of similar design are built + according to a development plan + + + + FLD + field(s); an open as opposed to wooded area + + + FLDI + irrigated field(s); a tract of level or terraced land which is irrigated + + + GASF + gasfield; an area containing a subterranean store of natural gas of economic value + + + + GRAZ + grazing area; an area of grasses and shrubs used for grazing + + + GVL + gravel area; an area covered with gravel + + + INDS + industrial area; an area characterized by industrial activity + + + LAND + arctic land; a tract of land in the Arctic + + + LCTY + locality; a minor area or place of unspecified or mixed character and indefinite + boundaries + + + + MILB + military base; a place used by an army or other armed service for storing arms and + supplies, and for accommodating and training troops, a base from which operations can be + initiated + + + + MNA + mining area; an area of mine sites where minerals and ores are extracted + + + MVA + maneuver area; a tract of land where 
military field exercises are carried out + + + + NVB + naval base; an area used to store supplies, provide barracks for troops and naval + personnel, a port for naval vessels, and from which operations are initiated + + + + OAS + oasis(-es); an area in a desert made productive by the availability of water + + + + OILF + oilfield; an area containing a subterranean store of petroleum of economic value + + + + PEAT + peat cutting area; an area where peat is harvested + + + PRK + park; an area, often of forested land, maintained as a place of beauty, or for + recreation + + + + PRT + port; a place provided with terminal and transfer facilities for loading and + discharging waterborne cargo or passengers, usually located in a harbor + + + + QCKS + quicksand; an area where loose sand with water moving through it may become unstable + when heavy objects are placed at the surface, causing them to sink + + + + RES + reserve; a tract of public land reserved for future use or restricted as to use + + + + RESA + agricultural reserve; a tract of land reserved for agricultural reclamation and/or + development + + + + RESF + forest reserve; a forested area set aside for preservation or controlled use + + + + RESH + hunting reserve; a tract of land used primarily for hunting + + + RESN + nature reserve; an area reserved for the maintenance of a natural habitat + + + RESP + palm tree reserve; an area of palm trees where use is controlled + + + RESV + reservation; a tract of land set aside for aboriginal, tribal, or native populations + + + + RESW + wildlife reserve; a tract of public land reserved for the preservation of wildlife + + + + RGN + region; an area distinguished by one or more observable physical or cultural + characteristics + + + + RGNE + economic region; a region of a country established for economic development or for + statistical purposes + + + + RGNH + historical region; a former historic area distinguished by one or more observable + physical or cultural 
characteristics + + + + RGNL + lake region; a tract of land distinguished by numerous lakes + + + RNGA + artillery range; a tract of land used for artillery firing practice + + + SALT + salt area; a shallow basin or flat where salt accumulates after periodic inundation + + + + SNOW + snowfield; an area of permanent snow and ice forming the accumulation area of a + glacier + + + + TRB + tribal area; a tract of land used by nomadic or other tribes + + + PPL + populated place; a city, town, village, or other agglomeration of buildings where + people live and work + + + + PPLA + seat of a first-order administrative division; seat of a first-order administrative + division (PPLC takes precedence over PPLA) + + + + PPLA2 + seat of a second-order administrative division + + + PPLA3 + seat of a third-order administrative division + + + PPLA4 + seat of a fourth-order administrative division + + + PPLA5 + seat of a fifth-order administrative division + + + PPLC + capital of a political entity + + + PPLCH + historical capital of a political entity; a former capital of a political entity + + + + PPLF + farm village; a populated place where the population is largely engaged in agricultural + activities + + + + PPLG + seat of government of a political entity + + + PPLH + historical populated place; a populated place that no longer exists + + + PPLL + populated locality; an area similar to a locality but with a small group of dwellings + or other buildings + + + + PPLQ + abandoned populated place + + + PPLR + religious populated place; a populated place whose population is largely engaged in + religious occupations + + + + PPLS + populated places; cities, towns, villages, or other agglomerations of buildings where + people live and work + + + + PPLW + destroyed populated place; a village, town or city destroyed by a natural disaster, or + by war + + + + PPLX + section of populated place + + + STLMT + israeli settlement + + + CSWY + causeway; a raised roadway across wet ground or 
shallow water + + + OILP + oil pipeline; a pipeline used for transporting oil + + + PRMN + promenade; a place for public walking, usually along a beach front + + + PTGE + portage; a place where boats, goods, etc., are carried overland between navigable + waters + + + + RD + road; an open way with improved surface for transportation of animals, people and + vehicles + + + + RDA + ancient road; the remains of a road used by ancient cultures + + + RDB + road bend; a conspicuously curved or bent section of a road + + + RDCUT + road cut; an excavation cut through a hill or ridge for a road + + + RDJCT + road junction; a place where two or more roads join + + + RJCT + railroad junction; a place where two or more railroad tracks join + + + RR + railroad; a permanent twin steel-rail track on which freight and passenger cars move + long distances + + + + RRQ + abandoned railroad + + + RTE + caravan route; the route taken by caravans + + + RYD + railroad yard; a system of tracks used for the making up of trains, and switching and + storing freight cars + + + + ST + street; a paved urban thoroughfare + + + STKR + stock route; a route taken by livestock herds + + + TNL + tunnel; a subterranean passageway for transportation + + + TNLN + natural tunnel; a cave that is open at both ends + + + TNLRD + road tunnel; a tunnel through which a road passes + + + TNLRR + railroad tunnel; a tunnel through which a railroad passes + + + TNLS + tunnels; subterranean passageways for transportation + + + TRL + trail; a path, track, or route used by pedestrians, animals, or off-road vehicles + + + + ADMF + administrative facility; a government building + + + AGRF + agricultural facility; a building and/or tract of land used for improving agriculture + + + + AIRB + airbase; an area used to store supplies, provide barracks for air force personnel, + hangars and runways for aircraft, and from which operations are initiated + + + + AIRF + airfield; a place on land where aircraft land and take off; 
no facilities provided for + the commercial handling of passengers and cargo + + + + AIRH + heliport; a place where helicopters land and take off + + + AIRP + airport; a place where aircraft regularly land and take off, with runways, navigational + aids, and major facilities for the commercial handling of passengers and cargo + + + + AIRQ + abandoned airfield + + + AIRT + terminal; airport facilities for the handling of freight and passengers + + + AMTH + amphitheater; an oval or circular structure with rising tiers of seats about a stage or + open space + + + + ANS + archaeological/prehistoric site; a place where archeological remains, old structures, + or cultural artifacts are located + + + + AQC + aquaculture facility; facility or area for the cultivation of aquatic animals and + plants, especially fish, shellfish, and seaweed, in natural or controlled marine or freshwater + environments; underwater agriculture + + + + ARCH + arch; a natural or man-made structure in the form of an arch + + + ARCHV + archive; a place or institution where documents are preserved + + + ART + piece of art; a piece of art, like a sculpture, painting. In contrast to monument + (MNMT) it is not commemorative. + + + + ASTR + astronomical station; a point on the earth whose position has been determined by + observations of celestial bodies + + + + ASYL + asylum; a facility where the insane are cared for and protected + + + ATHF + athletic field; a tract of land used for playing team sports, and athletic track and + field events + + + + ATM + automatic teller machine; An unattended electronic machine in a public place, connected + to a data system and related equipment and activated by a bank customer to obtain cash + withdrawals and other banking services. + + + + BANK + bank; A business establishment in which money is kept for saving or commercial purposes + or is invested, supplied for loans, or exchanged. 
+ + + + BCN + beacon; a fixed artificial navigation mark + + + BDG + bridge; a structure erected across an obstacle such as a stream, road, etc., in order + to carry roads, railroads, and pedestrians across + + + + BDGQ + ruined bridge; a destroyed or decayed bridge which is no longer functional + + + + BLDA + apartment building; a building containing several individual apartments + + + BLDG + building(s); a structure built for permanent use, as a house, factory, etc. + + + + BLDO + office building; commercial building where business and/or services are conducted + + + + BP + boundary marker; a fixture marking a point along a boundary + + + BRKS + barracks; a building for lodging military personnel + + + BRKW + breakwater; a structure erected to break the force of waves at the entrance to a harbor + or port + + + + BSTN + baling station; a facility for baling agricultural products + + + BTYD + boatyard; a waterside facility for servicing, repairing, and building small vessels + + + + BUR + burial cave(s); a cave used for human burials + + + BUSTN + bus station; a facility comprising ticket office, platforms, etc. 
for loading and + unloading passengers + + + + BUSTP + bus stop; a place lacking station facilities + + + CARN + cairn; a heap of stones erected as a landmark or for other purposes + + + CAVE + cave(s); an underground passageway or chamber, or cavity on the side of a cliff + + + + CH + church; a building for public Christian worship + + + CMP + camp(s); a site occupied by tents, huts, or other shelters for temporary use + + + + CMPL + logging camp; a camp used by loggers + + + CMPLA + labor camp; a camp used by migrant or temporary laborers + + + CMPMN + mining camp; a camp used by miners + + + CMPO + oil camp; a camp used by oilfield workers + + + CMPQ + abandoned camp + + + CMPRF + refugee camp; a camp used by refugees + + + CMTY + cemetery; a burial place or ground + + + COMC + communication center; a facility, including buildings, antennae, towers and electronic + equipment for receiving and transmitting information + + + + CRRL + corral(s); a pen or enclosure for confining or capturing animals + + + CSNO + casino; a building used for entertainment, especially gambling + + + CSTL + castle; a large fortified building or set of buildings + + + CSTM + customs house; a building in a port where customs and duties are paid, and where + vessels are entered and cleared + + + + CTHSE + courthouse; a building in which courts of law are held + + + CTRA + atomic center; a facility where atomic research is carried out + + + CTRCM + community center; a facility for community recreation and other activities + + + + CTRF + facility center; a place where more than one facility is situated + + + CTRM + medical center; a complex of health care buildings including two or more of the + following: hospital, medical school, clinic, pharmacy, doctor's offices, etc. 
+ + + + CTRR + religious center; a facility where more than one religious activity is carried out, + e.g., retreat, school, monastery, worship + + + + CTRS + space center; a facility for launching, tracking, or controlling satellites and space + vehicles + + + + CVNT + convent; a building where a community of nuns lives in seclusion + + + DAM + dam; a barrier constructed across a stream to impound water + + + DAMQ + ruined dam; a destroyed or decayed dam which is no longer functional + + + DAMSB + sub-surface dam; a dam put down to bedrock in a sand river + + + DARY + dairy; a facility for the processing, sale and distribution of milk or milk products + + + + DCKD + dry dock; a dock providing support for a vessel, and means for removing the water so + that the bottom of the vessel can be exposed + + + + DCKY + dockyard; a facility for servicing, building, or repairing ships + + + DIKE + dike; an earth or stone embankment usually constructed for flood or stream control + + + + DIP + diplomatic facility; office, residence, or facility of a foreign government, which may + include an embassy, consulate, chancery, office of charge d'affaires, or other diplomatic, + economic, military, or cultural mission + + + + DPOF + fuel depot; an area where fuel is stored + + + EST + estate(s); a large commercialized agricultural landholding with associated buildings + and other facilities + + + + ESTO + oil palm plantation; an estate specializing in the cultivation of oil palm trees + + + + ESTR + rubber plantation; an estate which specializes in growing and tapping rubber trees + + + + ESTSG + sugar plantation; an estate that specializes in growing sugar cane + + + ESTT + tea plantation; an estate which specializes in growing tea bushes + + + ESTX + section of estate + + + FCL + facility; a building or buildings housing a center, institute, foundation, hospital, + prison, mission, courthouse, etc. 
+ + + + FNDY + foundry; a building or works where metal casting is carried out + + + FRM + farm; a tract of land with associated buildings devoted to agriculture + + + FRMQ + abandoned farm + + + FRMS + farms; tracts of land with associated buildings devoted to agriculture + + + FRMT + farmstead; the buildings and adjacent service areas of a farm + + + FT + fort; a defensive structure or earthworks + + + FY + ferry; a boat or other floating conveyance and terminal facilities regularly used to + transport people and vehicles across a waterbody + + + + FYT + ferry terminal; a place where ferries pick-up and discharge passengers, vehicles and or + cargo + + + + GATE + gate; a controlled access entrance or exit + + + GDN + garden(s); an enclosure for displaying selected plant or animal life + + + GHAT + ghat; a set of steps leading to a river, which are of religious significance, and at + their base is usually a platform for bathing + + + + GHSE + guest house; a house used to provide lodging for paying guests + + + GOSP + gas-oil separator plant; a facility for separating gas from oil + + + GOVL + local government office; a facility housing local governmental offices, usually a city, + town, or village hall + + + + GRVE + grave; a burial site + + + HERM + hermitage; a secluded residence, usually for religious sects + + + HLT + halting place; a place where caravans stop for rest + + + HMSD + homestead; a residence, owner's or manager's, on a sheep or cattle station, woolshed, + outcamp, or Aboriginal outstation, specific to Australia and New Zealand + + + + HSE + house(s); a building used as a human habitation + + + HSEC + country house; a large house, mansion, or chateau, on a large estate + + + HSP + hospital; a building in which sick or injured, especially those confined to bed, are + medically treated + + + + HSPC + clinic; a medical facility associated with a hospital for outpatients + + + HSPD + dispensary; a building where medical or dental aid is dispensed + + + 
HSPL + leprosarium; an asylum or hospital for lepers + + + HSTS + historical site; a place of historical importance + + + HTL + hotel; a building providing lodging and/or meals for the public + + + HUT + hut; a small primitive house + + + HUTS + huts; small primitive houses + + + INSM + military installation; a facility for use of and control by armed forces + + + ITTR + research institute; a facility where research is carried out + + + JTY + jetty; a structure built out into the water at a river mouth or harbor entrance to + regulate currents and silting + + + + LDNG + landing; a place where boats receive or discharge passengers and freight, but lacking + most port facilities + + + + LEPC + leper colony; a settled area inhabited by lepers in relative isolation + + + LIBR + library; A place in which information resources such as books are kept for reading, + reference, or lending. + + + + LNDF + landfill; a place for trash and garbage disposal in which the waste is buried between + layers of earth to build up low-lying land + + + + LOCK + lock(s); a basin in a waterway with gates at each end by means of which vessels are + passed from one water level to another + + + + LTHSE + lighthouse; a distinctive structure exhibiting a major navigation light + + + MALL + mall; A large, often enclosed shopping complex containing various stores, businesses, + and restaurants usually accessible by common passageways. + + + + MAR + marina; a harbor facility for small boats, yachts, etc. 
+ + + MFG + factory; one or more buildings where goods are manufactured, processed or fabricated + + + + MFGB + brewery; one or more buildings where beer is brewed + + + MFGC + cannery; a building where food items are canned + + + MFGCU + copper works; a facility for processing copper ore + + + MFGLM + limekiln; a furnace in which limestone is reduced to lime + + + MFGM + munitions plant; a factory where ammunition is made + + + MFGPH + phosphate works; a facility for producing fertilizer + + + MFGQ + abandoned factory + + + MFGSG + sugar refinery; a facility for converting raw sugar into refined sugar + + + MKT + market; a place where goods are bought and sold at regular intervals + + + ML + mill(s); a building housing machines for transforming, shaping, finishing, grinding, or + extracting products + + + + MLM + ore treatment plant; a facility for improving the metal content of ore by + concentration + + + + MLO + olive oil mill; a mill where oil is extracted from olives + + + MLSG + sugar mill; a facility where sugar cane is processed into raw sugar + + + MLSGQ + former sugar mill; a sugar mill no longer used as a sugar mill + + + MLSW + sawmill; a mill where logs or lumber are sawn to specified shapes and sizes + + + + MLWND + windmill; a mill or water pump powered by wind + + + MLWTR + water mill; a mill powered by running water + + + MN + mine(s); a site where mineral ores are extracted from the ground by excavating surface + pits and subterranean passages + + + + MNAU + gold mine(s); a mine where gold ore, or alluvial gold is extracted + + + MNC + coal mine(s); a mine where coal is extracted + + + MNCR + chrome mine(s); a mine where chrome ore is extracted + + + MNCU + copper mine(s); a mine where copper ore is extracted + + + MNFE + iron mine(s); a mine where iron ore is extracted + + + MNMT + monument; a commemorative structure or statue + + + MNN + salt mine(s); a mine from which salt is extracted + + + MNQ + abandoned mine + + + MNQR + quarry(-ies); a 
surface mine where building stone or gravel and sand, etc. are + extracted + + + + MOLE + mole; a massive structure of masonry or large stones serving as a pier or breakwater + + + + MSQE + mosque; a building for public Islamic worship + + + MSSN + mission; a place characterized by dwellings, school, church, hospital and other + facilities operated by a religious group for the purpose of providing charitable services and to + propagate religion + + + + MSSNQ + abandoned mission + + + MSTY + monastery; a building and grounds where a community of monks lives in seclusion + + + + MTRO + metro station; metro station (Underground, Tube, or Metro) + + + MUS + museum; a building where objects of permanent interest in one or more of the arts and + sciences are preserved and exhibited + + + + NOV + novitiate; a religious house or school where novices are trained + + + NSY + nursery(-ies); a place where plants are propagated for transplanting or grafting + + + + OBPT + observation point; a wildlife or scenic observation point + + + OBS + observatory; a facility equipped for observation of atmospheric or space phenomena + + + + OBSR + radio observatory; a facility equipped with an array of antennae for receiving radio + waves from space + + + + OILJ + oil pipeline junction; a section of an oil pipeline where two or more pipes join + together + + + + OILQ + abandoned oil well + + + OILR + oil refinery; a facility for converting crude oil into refined petroleum products + + + + OILT + tank farm; a tract of land occupied by large, cylindrical, metal tanks in which oil or + liquid petrochemicals are stored + + + + OILW + oil well; a well from which oil may be pumped + + + OPRA + opera house; A theater designed chiefly for the performance of operas. 
+ + + PAL + palace; a large stately house, often a royal or presidential residence + + + PGDA + pagoda; a tower-like storied structure, usually a Buddhist shrine + + + PIER + pier; a structure built out into navigable water on piles providing berthing for ships + and recreation + + + + PKLT + parking lot; an area used for parking vehicles + + + PMPO + oil pumping station; a facility for pumping oil through a pipeline + + + PMPW + water pumping station; a facility for pumping water from a major well or through a + pipeline + + + + PO + post office; a public building in which mail is received, sorted and distributed + + + + PP + police post; a building in which police are stationed + + + PPQ + abandoned police post + + + PRKGT + park gate; a controlled access to a park + + + PRKHQ + park headquarters; a park administrative facility + + + PRN + prison; a facility for confining prisoners + + + PRNJ + reformatory; a facility for confining, training, and reforming young law offenders + + + + PRNQ + abandoned prison + + + PS + power station; a facility for generating electric power + + + PSH + hydroelectric power station; a building where electricity is generated from water + power + + + + PSN + nuclear power station; nuclear power station + + + PSTB + border post; a post or station at an international boundary for the regulation of + movement of people and goods + + + + PSTC + customs post; a building at an international boundary where customs and duties are paid + on goods + + + + PSTP + patrol post; a post from which patrols are sent out + + + PYR + pyramid; an ancient massive structure of square ground plan with four triangular faces + meeting at a point and used for enclosing tombs + + + + PYRS + pyramids; ancient massive structures of square ground plan with four triangular faces + meeting at a point and used for enclosing tombs + + + + QUAY + quay; a structure of solid construction along a shore or bank which provides berthing + for ships and which generally 
provides cargo handling facilities + + + + RDCR + traffic circle; a road junction formed around a central circle about which traffic + moves in one direction only + + + + RDIN + intersection; a junction of two or more highways by a system of separate levels that + permit traffic to pass from one to another without the crossing of traffic streams + + + + RECG + golf course; a recreation field where golf is played + + + RECR + racetrack; a track where races are held + + + REST + restaurant; A place where meals are served to the public + + + RET + store; a building where goods and/or services are offered for sale + + + RHSE + resthouse; a structure maintained for the rest and shelter of travelers + + + RKRY + rookery; a breeding place of a colony of birds or seals + + + RLG + religious site; an ancient site of significant religious importance + + + RLGR + retreat; a place of temporary seclusion, especially for religious groups + + + RNCH + ranch(es); a large farm specializing in extensive grazing of livestock + + + RSD + railroad siding; a short track parallel to and joining the main track + + + RSGNL + railroad signal; a signal at the entrance of a particular section of track governing + the movement of trains + + + + RSRT + resort; a specialized facility for vacation, health, or participation sports + activities + + + + RSTN + railroad station; a facility comprising ticket office, platforms, etc. 
for loading and + unloading train passengers and freight + + + + RSTNQ + abandoned railroad station + + + RSTP + railroad stop; a place lacking station facilities where trains stop to pick up and + unload passengers and freight + + + + RSTPQ + abandoned railroad stop + + + RUIN + ruin(s); a destroyed or decayed structure which is no longer functional + + + SCH + school; building(s) where instruction in one or more branches of knowledge takes + place + + + + SCHA + agricultural school; a school with a curriculum focused on agriculture + + + SCHC + college; the grounds and buildings of an institution of higher learning + + + SCHL + language school; Language Schools & Institutions + + + SCHM + military school; a school at which military science forms the core of the curriculum + + + + SCHN + maritime school; a school at which maritime sciences form the core of the curriculum + + + + SCHT + technical school; post-secondary school with a specifically technical or vocational + curriculum + + + + SECP + State Exam Prep Centre; state exam preparation centres + + + SHPF + sheepfold; a fence or wall enclosure for sheep and other small herd animals + + + + SHRN + shrine; a structure or place memorializing a person or religious concept + + + SHSE + storehouse; a building for storing goods, especially provisions + + + SLCE + sluice; a conduit or passage for carrying off surplus water from a waterbody, usually + regulated by means of a sluice gate + + + + SNTR + sanatorium; a facility where victims of physical or mental disorders are treated + + + + SPA + spa; a resort area usually developed around a medicinal spring + + + SPLY + spillway; a passage or outlet through which surplus water flows over, around or through + a dam + + + + SQR + square; a broad, open, public area near the center of a town or city + + + STBL + stable; a building for the shelter and feeding of farm animals, especially horses + + + + STDM + stadium; a structure with an enclosure for athletic games with 
tiers of seats for + spectators + + + + STNB + scientific research base; a scientific facility used as a base from which research is + carried out or monitored + + + + STNC + coast guard station; a facility from which the coast is guarded by armed vessels + + + + STNE + experiment station; a facility for carrying out experiments + + + STNF + forest station; a collection of buildings and facilities for carrying out forest + management + + + + STNI + inspection station; a station at which vehicles, goods, and people are inspected + + + + STNM + meteorological station; a station at which weather elements are recorded + + + STNR + radio station; a facility for producing and transmitting information by radio waves + + + + STNS + satellite station; a facility for tracking and communicating with orbiting satellites + + + + STNW + whaling station; a facility for butchering whales and processing train oil + + + + STPS + steps; stones or slabs placed for ease in ascending or descending a steep slope + + + + SWT + sewage treatment plant; facility for the processing of sewage and/or wastewater + + + + SYG + synagogue; a place for Jewish worship and religious instruction + + + THTR + theater; A building, room, or outdoor structure for the presentation of plays, films, + or other dramatic performances + + + + TMB + tomb(s); a structure for interring bodies + + + TMPL + temple(s); an edifice dedicated to religious worship + + + TNKD + cattle dipping tank; a small artificial pond used for immersing cattle in chemically + treated water for disease control + + + + TOLL + toll gate/barrier; highway toll collection station + + + TOWR + tower; a high conspicuous structure, typically much higher than its diameter + + + + TRAM + tram; rail vehicle along urban streets (also known as streetcar or trolley) + + + + TRANT + transit terminal; facilities for the handling of vehicular freight and passengers + + + + TRIG + triangulation station; a point on the earth whose position has been 
determined by + triangulation + + + + TRMO + oil pipeline terminal; a tank farm or loading facility at the end of an oil pipeline + + + + TWO + temp work office; Temporary Work Offices + + + UNIP + university prep school; University Preparation Schools & Institutions + + + UNIV + university; An institution for higher learning with teaching and research facilities + constituting a graduate school and professional schools that award master's degrees and + doctorates and an undergraduate division that awards bachelor's degrees. + + + + USGE + united states government establishment; a facility operated by the United States + Government in Panama + + + + VETF + veterinary facility; a building or camp at which veterinary services are available + + + + WALL + wall; a thick masonry structure, usually enclosing a field or building, or forming the + side of a structure + + + + WALLA + ancient wall; the remains of a linear defensive stone structure + + + WEIR + weir(s); a small dam in a stream, designed to raise the water level or to divert stream + flow through a desired channel + + + + WHRF + wharf(-ves); a structure of open rather than solid construction along a shore or a bank + which provides berthing for ships and cargo-handling facilities + + + + WRCK + wreck; the site of the remains of a wrecked vessel + + + WTRW + waterworks; a facility for supplying potable water through a water source and a system + of pumps and filtration beds + + + + ZNF + free trade zone; an area, usually a section of a port, where goods may be received and + shipped free of customs duty and of most customs regulations + + + + ZOO + zoo; a zoological garden or park where wild animals are kept for exhibition + + + + ASPH + asphalt lake; a small basin containing naturally occurring asphalt + + + ATOL + atoll(s); a ring-shaped coral reef which has closely spaced islands on it encircling a + lagoon + + + + BAR + bar; a shallow ridge or mound of coarse unconsolidated material in a stream channel, at 
+ the mouth of a stream, estuary, or lagoon and in the wave-break zone along coasts + + + + BCH + beach; a shore zone of coarse unconsolidated sediment that extends from the low-water + line to the highest reach of storm waves + + + + BCHS + beaches; a shore zone of coarse unconsolidated sediment that extends from the low-water + line to the highest reach of storm waves + + + + BDLD + badlands; an area characterized by a maze of very closely spaced, deep, narrow, + steep-sided ravines, and sharp crests and pinnacles + + + + BLDR + boulder field; a high altitude or high latitude bare, flat area covered with large + angular rocks + + + + BLHL + blowhole(s); a hole in coastal rock through which sea water is forced by a rising tide + or waves and spurted through an outlet into the air + + + + BLOW + blowout(s); a small depression in sandy terrain, caused by wind erosion + + + BNCH + bench; a long, narrow bedrock platform bounded by steeper slopes above and below, + usually overlooking a waterbody + + + + BUTE + butte(s); a small, isolated, usually flat-topped hill with steep sides + + + CAPE + cape; a land area, more prominent than a point, projecting into the sea and marking a + notable change in coastal direction + + + + CFT + cleft(s); a deep narrow slot, notch, or groove in a coastal cliff + + + CLDA + caldera; a depression measuring kilometers across formed by the collapse of a volcanic + mountain + + + + CLF + cliff(s); a high, steep to perpendicular slope overlooking a waterbody or lower area + + + + CNYN + canyon; a deep, narrow valley with steep sides cutting into a plateau or mountainous + area + + + + CONE + cone(s); a conical landform composed of mud or volcanic material + + + CRDR + corridor; a strip or area of land having significance as an access way + + + CRQ + cirque; a bowl-like hollow partially surrounded by cliffs or steep slopes at the head + of a glaciated valley + + + + CRQS + cirques; bowl-like hollows partially surrounded by cliffs or steep 
slopes at the head + of a glaciated valley + + + + CRTR + crater(s); a generally circular saucer or bowl-shaped depression caused by volcanic or + meteorite explosive action + + + + CUET + cuesta(s); an asymmetric ridge formed on tilted strata + + + DLTA + delta; a flat plain formed by alluvial deposits at the mouth of a stream + + + DPR + depression(s); a low area surrounded by higher land and usually characterized by + interior drainage + + + + DSRT + desert; a large area with little or no vegetation due to extreme environmental + conditions + + + + DUNE + dune(s); a wave form, ridge or star shape feature composed of sand + + + DVD + divide; a line separating adjacent drainage basins + + + ERG + sandy desert; an extensive tract of shifting sand and sand dunes + + + FAN + fan(s); a fan-shaped wedge of coarse alluvium with apex merging with a mountain stream + bed and the fan spreading out at a low angle slope onto an adjacent plain + + + + FORD + ford; a shallow part of a stream which can be crossed on foot or by land vehicle + + + + FSR + fissure; a crack associated with volcanism + + + GAP + gap; a low place in a ridge, not used for transportation + + + GRGE + gorge(s); a short, narrow, steep-sided section of a stream valley + + + HDLD + headland; a high projection of land extending into a large body of water beyond the + line of the coast + + + + HLL + hill; a rounded elevation of limited extent rising above the surrounding land with + local relief of less than 300m + + + + HLLS + hills; rounded elevations of limited extent rising above the surrounding land with + local relief of less than 300m + + + + HMCK + hammock(s); a patch of ground, distinct from and slightly above the surrounding plain + or wetland. 
Often occurs in groups + + + + HMDA + rock desert; a relatively sand-free, high bedrock plateau in a hot desert, with or + without a gravel veneer + + + + INTF + interfluve; a relatively undissected upland between adjacent stream valleys + + + + ISL + island; a tract of land, smaller than a continent, surrounded by water at high water + + + + ISLET + islet; small island, bigger than rock, smaller than island. + + + ISLF + artificial island; an island created by landfill or diking and filling in a wetland, + bay, or lagoon + + + + ISLM + mangrove island; a mangrove swamp surrounded by a waterbody + + + ISLS + islands; tracts of land, smaller than a continent, surrounded by water at high water + + + + ISLT + land-tied island; a coastal island connected to the mainland by barrier beaches, levees + or dikes + + + + ISLX + section of island + + + ISTH + isthmus; a narrow strip of land connecting two larger land masses and bordered by + water + + + + KRST + karst area; a distinctive landscape developed on soluble rock such as limestone + characterized by sinkholes, caves, disappearing streams, and underground drainage + + + + LAVA + lava area; an area of solidified lava + + + LEV + levee; a natural low embankment bordering a distributary or meandering stream; often + built up artificially to control floods + + + + MESA + mesa(s); a flat-topped, isolated elevation with steep slopes on all sides, less + extensive than a plateau + + + + MND + mound(s); a low, isolated, rounded hill + + + MRN + moraine; a mound, ridge, or other accumulation of glacial till + + + MT + mountain; an elevation standing high above the surrounding area with small summit area, + steep slopes and local relief of 300m or more + + + + MTS + mountains; a mountain range or a group of mountains or high ridges + + + NKM + meander neck; a narrow strip of land between the two limbs of a meander loop at its + narrowest point + + + + NTK + nunatak; a rock or mountain peak protruding through glacial ice + + + 
NTKS + nunataks; rocks or mountain peaks protruding through glacial ice + + + PAN + pan; a near-level shallow, natural depression or basin, usually containing an + intermittent lake, pond, or pool + + + + PANS + pans; a near-level shallow, natural depression or basin, usually containing an + intermittent lake, pond, or pool + + + + PASS + pass; a break in a mountain range or other high obstruction, used for transportation + from one side to the other [See also gap] + + + + PEN + peninsula; an elongate area of land projecting into a body of water and nearly + surrounded by water + + + + PENX + section of peninsula + + + PK + peak; a pointed elevation atop a mountain, ridge, or other hypsographic feature + + + + PKS + peaks; pointed elevations atop a mountain, ridge, or other hypsographic features + + + + PLAT + plateau; an elevated plain with steep slopes on one or more sides, and often with + incised streams + + + + PLATX + section of plateau + + + PLDR + polder; an area reclaimed from the sea by diking and draining + + + PLN + plain(s); an extensive area of comparatively level to gently undulating land, lacking + surface irregularities, and usually adjacent to a higher area + + + + PLNX + section of plain + + + PROM + promontory(-ies); a bluff or prominent hill overlooking or projecting into a lowland + + + + PT + point; a tapering piece of land projecting into a body of water, less prominent than a + cape + + + + PTS + points; tapering pieces of land projecting into a body of water, less prominent than a + cape + + + + RDGB + beach ridge; a ridge of sand just inland and parallel to the beach, usually in series + + + + RDGE + ridge(s); a long narrow elevation with steep sides, and a more or less continuous + crest + + + + REG + stony desert; a desert plain characterized by a surface veneer of gravel and stones + + + + RK + rock; a conspicuous, isolated rocky mass + + + RKFL + rockfall; an irregular mass of fallen rock at the base of a cliff or steep slope + + + + 
RKS + rocks; conspicuous, isolated rocky masses + + + SAND + sand area; a tract of land covered with sand + + + SBED + dry stream bed; a channel formerly containing the water of a stream + + + SCRP + escarpment; a long line of cliffs or steep slopes separating level surfaces above and + below + + + + SDL + saddle; a broad, open pass crossing a ridge or between hills or mountains + + + SHOR + shore; a narrow zone bordering a waterbody which covers and uncovers at high and low + water, respectively + + + + SINK + sinkhole; a small crater-shape depression in a karst area + + + SLID + slide; a mound of earth material, at the base of a slope and the associated scoured + area + + + + SLP + slope(s); a surface with a relatively uniform slope angle + + + SPIT + spit; a narrow, straight or curved continuation of a beach into a waterbody + + + + SPUR + spur(s); a subordinate ridge projecting outward from a hill, mountain or other + elevation + + + + TAL + talus slope; a steep concave slope formed by an accumulation of loose rock fragments at + the base of a cliff or steep slope + + + + TRGD + interdune trough(s); a long wind-swept trough between parallel longitudinal dunes + + + + TRR + terrace; a long, narrow alluvial platform bounded by steeper slopes above and below, + usually overlooking a waterbody + + + + UPLD + upland; an extensive interior region of high land with low to moderate surface relief + + + + VAL + valley; an elongated depression usually traversed by a stream + + + VALG + hanging valley; a valley the floor of which is notably higher than the valley or shore + to which it leads; most common in areas that have been glaciated + + + + VALS + valleys; elongated depressions usually traversed by a stream + + + VALX + section of valley + + + VLC + volcano; a conical elevation composed of volcanic materials with a crater at the top + + + + APNU + apron; a gentle slope, with a generally smooth surface, particularly found around + groups of islands and seamounts + + + 
+ ARCU + arch; a low bulge around the southeastern end of the island of Hawaii + + + ARRU + arrugado; an area of subdued corrugations off Baja California + + + BDLU + borderland; a region adjacent to a continent, normally occupied by or bordering a + shelf, that is highly irregular with depths well in excess of those typical of a shelf + + + + BKSU + banks; elevations, typically located on a shelf, over which the depth of water is + relatively shallow but sufficient for safe surface navigation + + + + BNKU + bank; an elevation, typically located on a shelf, over which the depth of water is + relatively shallow but sufficient for safe surface navigation + + + + BSNU + basin; a depression more or less equidimensional in plan and of variable extent + + + + CDAU + cordillera; an entire mountain system including the subordinate ranges, interior + plateaus, and basins + + + + CNSU + canyons; relatively narrow, deep depressions with steep sides, the bottom of which + generally has a continuous slope + + + + CNYU + canyon; a relatively narrow, deep depression with steep sides, the bottom of which + generally has a continuous slope + + + + CRSU + continental rise; a gentle slope rising from oceanic depths towards the foot of a + continental slope + + + + DEPU + deep; a localized deep area within the confines of a larger feature, such as a trough, + basin or trench + + + + EDGU + shelf edge; a line along which there is a marked increase of slope at the outer margin + of a continental shelf or island shelf + + + + ESCU + escarpment (or scarp); an elongated and comparatively steep slope separating flat or + gently sloping areas + + + + FANU + fan; a relatively smooth feature normally sloping away from the lower termination of a + canyon or canyon system + + + + FLTU + flat; a small level or nearly level area + + + FRZU + fracture zone; an extensive linear zone of irregular topography of the sea floor, + characterized by steep-sided or asymmetrical ridges, troughs, or 
escarpments + + + + FURU + furrow; a closed, linear, narrow, shallow depression + + + GAPU + gap; a narrow break in a ridge or rise + + + GLYU + gully; a small valley-like feature + + + HLLU + hill; an elevation rising generally less than 500 meters + + + HLSU + hills; elevations rising generally less than 500 meters + + + HOLU + hole; a small depression of the sea floor + + + KNLU + knoll; an elevation rising generally more than 500 meters and less than 1,000 meters + and of limited extent across the summit + + + + KNSU + knolls; elevations rising generally more than 500 meters and less than 1,000 meters and + of limited extent across the summits + + + + LDGU + ledge; a rocky projection or outcrop, commonly linear and near shore + + + LEVU + levee; an embankment bordering a canyon, valley, or seachannel + + + MESU + mesa; an isolated, extensive, flat-topped elevation on the shelf, with relatively steep + sides + + + + MNDU + mound; a low, isolated, rounded hill + + + MOTU + moat; an annular depression that may not be continuous, located at the base of many + seamounts, islands, and other isolated elevations + + + + MTU + mountain; a well-delineated subdivision of a large and complex positive feature + + + + PKSU + peaks; prominent elevations, part of a larger feature, either pointed or of very + limited extent across the summit + + + + PKU + peak; a prominent elevation, part of a larger feature, either pointed or of very + limited extent across the summit + + + + PLNU + plain; a flat, gently sloping or nearly level region + + + PLTU + plateau; a comparatively flat-topped feature of considerable extent, dropping off + abruptly on one or more sides + + + + PNLU + pinnacle; a high tower or spire-shaped pillar of rock or coral, alone or cresting a + summit + + + + PRVU + province; a region identifiable by a group of similar physiographic features whose + characteristics are markedly in contrast with surrounding areas + + + + RDGU + ridge; a long narrow elevation with 
steep sides + + + RDSU + ridges; long narrow elevations with steep sides + + + RFSU + reefs; surface-navigation hazards composed of consolidated material + + + RFU + reef; a surface-navigation hazard composed of consolidated material + + + RISU + rise; a broad elevation that rises gently, and generally smoothly, from the sea floor + + + + SCNU + seachannel; a continuously sloping, elongated depression commonly found in fans or + plains and customarily bordered by levees on one or two sides + + + + SCSU + seachannels; continuously sloping, elongated depressions commonly found in fans or + plains and customarily bordered by levees on one or two sides + + + + SDLU + saddle; a low part, resembling in shape a saddle, in a ridge or between contiguous + seamounts + + + + SHFU + shelf; a zone adjacent to a continent (or around an island) that extends from the low + water line to a depth at which there is usually a marked increase of slope towards oceanic + depths + + + + SHLU + shoal; a surface-navigation hazard composed of unconsolidated material + + + SHSU + shoals; hazards to surface navigation composed of unconsolidated material + + + SHVU + shelf valley; a valley on the shelf, generally the shoreward extension of a canyon + + + + SILU + sill; the low part of a gap or saddle separating basins + + + SLPU + slope; the slope seaward from the shelf edge to the beginning of a continental rise or + the point where there is a general reduction in slope + + + + SMSU + seamounts; elevations rising generally more than 1,000 meters and of limited extent + across the summit + + + + SMU + seamount; an elevation rising generally more than 1,000 meters and of limited extent + across the summit + + + + SPRU + spur; a subordinate elevation, ridge, or rise projecting outward from a larger + feature + + + + TERU + terrace; a relatively flat horizontal or gently inclined surface, sometimes long and + narrow, which is bounded by a steeper ascending slope on one side and by a steep 
descending + slope on the opposite side + + + + TMSU + tablemounts (or guyots); seamounts having a comparatively smooth, flat top + + + + TMTU + tablemount (or guyot); a seamount having a comparatively smooth, flat top + + + TNGU + tongue; an elongate (tongue-like) extension of a flat sea floor into an adjacent higher + feature + + + + TRGU + trough; a long depression of the sea floor characteristically flat bottomed and steep + sided, and normally shallower than a trench + + + + TRNU + trench; a long, narrow, characteristically very deep and asymmetrical depression of the + sea floor, with relatively steep sides + + + + VALU + valley; a relatively shallow, wide depression, the bottom of which usually has a + continuous gradient + + + + VLSU + valleys; a relatively shallow, wide depression, the bottom of which usually has a + continuous gradient + + + + BUSH + bush(es); a small clump of conspicuous bushes in an otherwise bare area + + + CULT + cultivated area; an area under cultivation + + + FRST + forest(s); an area dominated by tree vegetation + + + FRSTF + fossilized forest; a forest fossilized by geologic processes and now exposed at the + earth's surface + + + + GROVE + grove; a small wooded area or collection of trees growing closely together, occurring + naturally or deliberately planted + + + + GRSLD + grassland; an area dominated by grass vegetation + + + GRVC + coconut grove; a planting of coconut trees + + + GRVO + olive grove; a planting of olive trees + + + GRVP + palm grove; a planting of palm trees + + + GRVPN + pine grove; a planting of pine trees + + + HTH + heath; an upland moor or sandy area dominated by low shrubby vegetation including + heather + + + + MDW + meadow; a small, poorly drained area dominated by grassy vegetation + + + OCH + orchard(s); a planting of fruit or nut trees + + + SCRB + scrubland; an area of low trees, bushes, and shrubs stunted by some environmental + limitation + + + + TREE + tree(s); a conspicuous tree used as a 
landmark + + + TUND + tundra; a marshy, treeless, high latitude plain, dominated by mosses, lichens, and low + shrub vegetation under permafrost conditions + + + + VIN + vineyard; a planting of grapevines + + + VINS + vineyards; plantings of grapevines + + + ll + not available + + + + + diff --git a/duui-geonames-fst/target/test-classes/org/texttechnologylab/textimager/uima/rust/TestGeoNamesFst.class b/duui-geonames-fst/target/test-classes/org/texttechnologylab/textimager/uima/rust/TestGeoNamesFst.class new file mode 100644 index 0000000000000000000000000000000000000000..ada1a1d336d2f07a07191a6e082ca86f85ab7288 GIT binary patch literal 7705 zcmd5=XLuad6+KUF*6XpoT31mQM}Q<_t!V)y<0iKS$$%xx*apOCccdL@cV?NHwJnDZ zAqgeKfN2SKsD_Y&DQj#IJH&(}g!Donz4zV;ft)w9TCFU}V3~Yh_Q%efJFnb#&%Ni~ z_g;8?_Z|QX#q=ObP^zFzML8-2LOt4gEt=BI?&vw4J$l>~sF-V*hC5H7w5GN_hycbZ z2&xzlRp5Bb?v831EuPe)StG4Qy9_tlrNv##?u(w)q&Z9dwm>j$>zb=enZS`XwaxpF z?1^iRH>5Fui30nU+M3zhm9^c7mX5S&h8Zy&H=-M+9%*&8gd3q`v7U9Ec#`TS>iJ?v zmf6<6ZF*#>o=SA;cDFYqL3wgtE}nE;J+eqj6S@;I(vjAE&m=bJl$w#{y5s1k6Y17d zdbe&;(qg!-p6b>+4ZEA&+qA5ibs||a;fPlhnwd4s5r^lZM{K1@eRH$0LMF z1z{DFF-4$i)GZ54a&&jG70;%1({0l?((F@fng_9Hb#24!Zmew&V46TlLYGs|P*El4 zom@1wnaE|eZe79Q{NPQHb&XVXp>1n@&9q<;he^DT5;!@bN8M~fv(t{WXEfWJi6(k8 zwiS(AwoX)ysHxjgw=bhdo2|I!8kQNvk%&lFIYuB-a{vruG3hIZ;T{qG{^8kwe1R`0%uB$Tms`tnR*)))^%+=0_0qT}2{aw_JgKba&DEUMxD%;D5GSHk{++7goj8q7vcYhZ zXE~PHY;@Y1-6yatA4mtK@$i*svU;4Mq5+Kp6Xhy{S_oX{Lvg5U_#uB#vlQFb)A39d z^Dtjviqx^S*T}T%HZk#?Q(#fe+CyDMAt_?{L8F(jjr9x`sqjiYZMphlPY}ccG|6~b zEKpYC#Y8=ps#u2Q0^=AHem8-e$6(|9nlZVkKTp9c8xP$KX9=u)yUbAS1qf-?4WOAc zkaXQlLo}KqtE7^a<2Iae+KHzG5yJ{;t0@7TD^QupW{uh1x@Br<-I?7bsVjz+Dq7Jd z5ReYAf?ouZBN3%XO$X&`d28;E;GpWB1GY;(TP?6oj$TYnrT|DmtLX+a7PDJSz@||+b6SKw~8bTW(bElTGk~e z731EN(i|u6^~LpwEGk;P0ZUt*#_{MyNz7`HlxWu+C=9Nt-m9t&0;tE_zyBbVC8mlaUX+w02<>lP!)wo8%wJP3?>$ro# zwFkNo;-u!R@QMwpKy9zadsVy-*Hbgo6HghKyQm;+RB$6#9aPn?O%`RlSo#EUgG^Zo z+^pgQ*eozUW9zQlcP`5p+VQBGSZr8TS~}8q3b5FO~r?CFVUj^ 
zH(4ZgJ<&FTG4d5z7y|dJcmNLyOiydQI&(L((Q2}jtVdkUbm<_+6}##{EjP>%-VU~_ zcvu#7il=DG4B#V#BW)%006xlAmtiI(5q@07C-9iS1WAPFYBuAR&U?=ozMJQ9SO!%0J|AA zC&}GCqvDg2=Dk{^O=C&zwj25e$pd>;d#cjUQZ+J`*Z} z#LH6x#VAS9Xl^w)&uD4bLumqiI^Nr&WjrTP@S6du^>Rd!b*olpnY`0_yWtr0?S+Lb zPc@J5w%wB@e$k$6MrJebz$vQ`?kQXVrbFMDNf~j2tRM1(MeM<8_7`;&1vE#bv~*`e ztIk(4)pG<|4^+F1$#ZMl=@KSG`P4BDr#H-vr}8%}!B)?|ml~29rbuH9)IluC>LuN9 zQ;RIirL$_87Trx+2`7YmCRO0yvi5sT#h>u!5MF2S)Qm|T-Y}4ZF(DXNHJn-0jHRySKe%U51m1?CQH_{~I zgF+P(6fu#7ftVyvJI=qX|We-$T`*=up9_p{AjNp3o&j1eLg<5}WrE-Cv2l z1ERX!I1b36uI@1^RqAczBq|F`K^dm9HG{3ga0CwLt=17ZlLg8O9K|!)(X>s375*`} z88f|2PvgeQa1*|PZ*u$*JTrU?FVPU+=9@H#FL{}*S16HY_zu3uw;!-w>KW%2T3O+@ z{(jWc5Oex*A|EH`a7qrRm*rTN&=L#QR30DNiA6aqk%oz_37@?aE#Y%=I4_4)tDnaC zEwfI@;lf!tXeWddyU5@i;PfW9sZ(Af;Dq!}xw|;`Ep9rQ|Z^)v~IxUARiH zmOoRr-XmD;B^t{6aRYnZ)KF1XURqvNK{ek0IBuz`$l*@;EYD%-IzPqFXxjO>)Ptjh zGwb*{enCSIz~M`NNy)D`b~;L52hYm&3PK99uR&38^{Z&|uqxwJzs7IbO*;AS_!Rhk z0kv&;)SepyYLA2;&EaF=9pNYV_`ia$oq&PBOyrGOC6@3lM)=Mqe9Z*w90GGL0cmEB zRzlZ?7OcW~Sk1^d9~;OZTd)>ev5rTo3-A~&q%U2Bef&BA;no6#TZ$ljt^nbqc?g#m zAnX_hp)VN%;f8~N@DBwDpUp$~%9tR0NfJ>?uLZ2w^$R56jTo z9G9OS9+#b@G@zTn_7K=4flU!ulfYU8)+VqHL3Igtmhf7Hc_ZQNBcz)M3m$WInPezXYWR|+6Mn+LfK*X85X<@`hA)Rzna z`KE&a`HuyV_vAr-X$&Asb`3uwVJ#rNFZ`K7z(>G(H0CXPdzp7sgl{CmHxuETi1212 zd@B*YjeL7Mx%Cb#!kt9;E~0xk1Nb%u?mZ0Ft+<$+o4~#FgZpqj?q~QufE{>{-1-p1 zcRTa>!$kR!0>V2A2tQne@Jj`R_v8^?R|sQ!SQz_~AqZdpRuK*bW!+6amX%2Zb7~2n z6|?Gc_^eDt#ihwHp3*W)OCI;Lv~2waf93c9{>HXH&nbM$UuAer4lmpdrEELON}oVx z|0%;>?kGozx_~!i_&djx;2-!WpEFR(Vy6`U!mId?2y(_Z Date: Wed, 17 Jun 2026 14:58:36 +0200 Subject: [PATCH 14/19] Update HeidelTime --- duui-HeidelTimeExt/.idea/.gitignore | 10 + duui-HeidelTimeExt/.idea/compiler.xml | 13 + duui-HeidelTimeExt/.idea/encodings.xml | 7 + duui-HeidelTimeExt/.idea/jarRepositories.xml | 30 ++ duui-HeidelTimeExt/.idea/misc.xml | 12 + duui-HeidelTimeExt/.idea/modules.xml | 8 + duui-HeidelTimeExt/.idea/vcs.xml | 6 + duui-HeidelTimeExt/HeidelTimeExt.iml | 8 + duui-HeidelTimeExt/docker_build.sh | 46 ++ 
duui-HeidelTimeExt/dockerfile | 27 +- duui-HeidelTimeExt/pom.xml | 213 +++++++-- .../tools/HeidelTimeExt.java | 426 ++++++++++++++---- .../main/resources/communication_layer.lua | 17 + .../tools/HeidelTimeExtTest.java | 213 +++++++++ .../target/classes/communication_layer.lua | 17 + .../HeidelTimeExt$CommunicationLayer.class | Bin 1526 -> 0 bytes ...delTimeExt$CommunicationLayerHandler.class | Bin 0 -> 1819 bytes .../tools/HeidelTimeExt$IOHandler.class | Bin 2118 -> 1773 bytes .../tools/HeidelTimeExt$ProcessHandler.class | Bin 4138 -> 11001 bytes .../tools/HeidelTimeExt$RootHandler.class | Bin 0 -> 1682 bytes .../HeidelTimeExt$TypesystemHandler.class | Bin 2263 -> 2378 bytes .../tools/HeidelTimeExt.class | Bin 2004 -> 5254 bytes .../tools/HeidelTimeExtTest.class | Bin 0 -> 9939 bytes 23 files changed, 927 insertions(+), 126 deletions(-) create mode 100644 duui-HeidelTimeExt/.idea/.gitignore create mode 100644 duui-HeidelTimeExt/.idea/compiler.xml create mode 100644 duui-HeidelTimeExt/.idea/encodings.xml create mode 100644 duui-HeidelTimeExt/.idea/jarRepositories.xml create mode 100644 duui-HeidelTimeExt/.idea/misc.xml create mode 100644 duui-HeidelTimeExt/.idea/modules.xml create mode 100644 duui-HeidelTimeExt/.idea/vcs.xml create mode 100644 duui-HeidelTimeExt/HeidelTimeExt.iml create mode 100755 duui-HeidelTimeExt/docker_build.sh create mode 100644 duui-HeidelTimeExt/src/main/resources/communication_layer.lua create mode 100644 duui-HeidelTimeExt/src/main/test/java/org/texttechnology/tools/HeidelTimeExtTest.java create mode 100644 duui-HeidelTimeExt/target/classes/communication_layer.lua delete mode 100644 duui-HeidelTimeExt/target/classes/org/texttechnologylab/tools/HeidelTimeExt$CommunicationLayer.class create mode 100644 duui-HeidelTimeExt/target/classes/org/texttechnologylab/tools/HeidelTimeExt$CommunicationLayerHandler.class create mode 100644 duui-HeidelTimeExt/target/classes/org/texttechnologylab/tools/HeidelTimeExt$RootHandler.class create mode 100644 
duui-HeidelTimeExt/target/test-classes/org/texttechnology/tools/HeidelTimeExtTest.class diff --git a/duui-HeidelTimeExt/.idea/.gitignore b/duui-HeidelTimeExt/.idea/.gitignore new file mode 100644 index 00000000..7bc07ec2 --- /dev/null +++ b/duui-HeidelTimeExt/.idea/.gitignore @@ -0,0 +1,10 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Editor-based HTTP Client requests +/httpRequests/ +# Environment-dependent path to Maven home directory +/mavenHomeManager.xml +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/duui-HeidelTimeExt/.idea/compiler.xml b/duui-HeidelTimeExt/.idea/compiler.xml new file mode 100644 index 00000000..afd76b09 --- /dev/null +++ b/duui-HeidelTimeExt/.idea/compiler.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/duui-HeidelTimeExt/.idea/encodings.xml b/duui-HeidelTimeExt/.idea/encodings.xml new file mode 100644 index 00000000..aa00ffab --- /dev/null +++ b/duui-HeidelTimeExt/.idea/encodings.xml @@ -0,0 +1,7 @@ + + + + + + + \ No newline at end of file diff --git a/duui-HeidelTimeExt/.idea/jarRepositories.xml b/duui-HeidelTimeExt/.idea/jarRepositories.xml new file mode 100644 index 00000000..22dd35ce --- /dev/null +++ b/duui-HeidelTimeExt/.idea/jarRepositories.xml @@ -0,0 +1,30 @@ + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/duui-HeidelTimeExt/.idea/misc.xml b/duui-HeidelTimeExt/.idea/misc.xml new file mode 100644 index 00000000..4c7d54ea --- /dev/null +++ b/duui-HeidelTimeExt/.idea/misc.xml @@ -0,0 +1,12 @@ + + + + + + + + \ No newline at end of file diff --git a/duui-HeidelTimeExt/.idea/modules.xml b/duui-HeidelTimeExt/.idea/modules.xml new file mode 100644 index 00000000..cfbfba02 --- /dev/null +++ b/duui-HeidelTimeExt/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/duui-HeidelTimeExt/.idea/vcs.xml b/duui-HeidelTimeExt/.idea/vcs.xml new file mode 100644 index 00000000..6c0b8635 --- 
/dev/null +++ b/duui-HeidelTimeExt/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/duui-HeidelTimeExt/HeidelTimeExt.iml b/duui-HeidelTimeExt/HeidelTimeExt.iml new file mode 100644 index 00000000..30905331 --- /dev/null +++ b/duui-HeidelTimeExt/HeidelTimeExt.iml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/duui-HeidelTimeExt/docker_build.sh b/duui-HeidelTimeExt/docker_build.sh new file mode 100755 index 00000000..e48150ad --- /dev/null +++ b/duui-HeidelTimeExt/docker_build.sh @@ -0,0 +1,46 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Build and optionally push the DUUI HeidelTimeExt Docker image. +# +# Examples: +# ./docker_build.sh +# ./docker_build.sh 1.0 +# PUSH=true ./docker_build.sh 1.0 +# +# Optional environment variables: +# ANNOTATOR_NAME=duui-heideltime-ext +# DOCKER_REGISTRY=docker.texttechnologylab.org/ +# PUSH=true + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "${SCRIPT_DIR}" + +VERSION="${1:-${ANNOTATOR_VERSION:-1.0}}" +ANNOTATOR_NAME="${ANNOTATOR_NAME:-duui-heideltime-ext}" +DOCKER_REGISTRY="${DOCKER_REGISTRY:-docker.texttechnologylab.org/}" +PUSH="${PUSH:-false}" + +LOCAL_VERSION_TAG="${ANNOTATOR_NAME}:${VERSION}" +LOCAL_LATEST_TAG="${ANNOTATOR_NAME}:latest" +REMOTE_VERSION_TAG="${DOCKER_REGISTRY}${ANNOTATOR_NAME}:${VERSION}" +REMOTE_LATEST_TAG="${DOCKER_REGISTRY}${ANNOTATOR_NAME}:latest" + +printf '\nBuilding %s\n' "${LOCAL_VERSION_TAG}" +docker build -f dockerfile \ + --build-arg ANNOTATOR_VERSION="${VERSION}" \ + -t "${LOCAL_VERSION_TAG}" \ + . 
+ +docker tag "${LOCAL_VERSION_TAG}" "${LOCAL_LATEST_TAG}" +docker tag "${LOCAL_VERSION_TAG}" "${REMOTE_VERSION_TAG}" +docker tag "${LOCAL_VERSION_TAG}" "${REMOTE_LATEST_TAG}" + +printf '\nBuilt images:\n' +printf ' %s\n' "${LOCAL_VERSION_TAG}" "${LOCAL_LATEST_TAG}" "${REMOTE_VERSION_TAG}" "${REMOTE_LATEST_TAG}" + +if [[ "${PUSH}" == "true" ]]; then + printf '\nPushing images:\n' + docker push "${REMOTE_VERSION_TAG}" + docker push "${REMOTE_LATEST_TAG}" +fi diff --git a/duui-HeidelTimeExt/dockerfile b/duui-HeidelTimeExt/dockerfile index aa556fc2..2b8cb109 100644 --- a/duui-HeidelTimeExt/dockerfile +++ b/duui-HeidelTimeExt/dockerfile @@ -1,5 +1,22 @@ -FROM maven:3.8.5-jdk-11 -ADD src src -ADD pom.xml pom.xml -RUN mvn clean compile -CMD mvn exec:java -Dexec.mainClass="org.texttechnologylab.tools.HeidelTimeExt" +FROM maven:3.9.9-eclipse-temurin-21 AS build + +WORKDIR /build + +COPY pom.xml pom.xml +RUN mvn -U -q -P '!duui-tests' -Dmaven.test.skip=true -DskipTests dependency:go-offline || true + +COPY src/main src/main +RUN mvn -U -q -P '!duui-tests' -Dmaven.test.skip=true -DskipTests clean package + +FROM eclipse-temurin:21-jre + +WORKDIR /app + +COPY --from=build /build/target/duui-HeidelTimeExt-*.jar /app/HeidelTimeExt.jar + +ENV DUUI_PORT=9714 +ENV DUUI_WORKERS=1 + +EXPOSE 9714 + +ENTRYPOINT ["java", "-jar", "/app/HeidelTimeExt.jar"] diff --git a/duui-HeidelTimeExt/pom.xml b/duui-HeidelTimeExt/pom.xml index 0d54ddb3..60c378c3 100644 --- a/duui-HeidelTimeExt/pom.xml +++ b/duui-HeidelTimeExt/pom.xml @@ -1,69 +1,218 @@ + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd"> + 4.0.0 org.texttechnologylab.tools - HeidelTimeExt - 1.0 + duui-HeidelTimeExt + 0.1.0 + + + + AGPL-3.0-or-later + https://www.gnu.org/licenses/agpl.txt + repo + GNU Affero General Public License v3.0 or later + + + + + Texttechnology Lab + https://www.texttechnologylab.org + + + + + mehler + Prof. Dr. 
Alexander Mehler + mehler@em.uni-frankfurt.de + https://www.texttechnologylab.org/team/alexander-abrami/ + Goethe University Frankfurt / Texttechnology Lab + https://www.texttechnologylab.org + + head of department + + + + bagci + Mevlüt Bagci + bagci@em.uni-frankfurt.de + https://www.texttechnologylab.org/team/mevl%c3%bct-bagci/ + Goethe University Frankfurt / Texttechnology Lab + https://www.texttechnologylab.org + + lead developer + + Europe/Berlin + + + + + 21 + UTF-8 + 2.4.0 + - - central - Central Repository - https://repo.maven.apache.org/maven2 - default - - false - - - - ukp-oss-model-releases - https://zoidberg.ukp.informatik.tu-darmstadt.de/artifactory/public-model-releases-local - jitpack.io https://jitpack.io - - 11 - 11 - + + + + org.dkpro.core + dkpro-core-asl + ${dkpro.core.version} + pom + import + - + + + com.github.texttechnologylab + Utilities + 3.0.2 + + + + - org.apache.uima - uimaj-core - 2.11.0 + com.github.texttechnologylab + DockerUnifiedUIMAInterface + 1.5.5 + + + com.github.texttechnologylab + Utilities + + - org.apache.uima - uimafit-core - 2.5.0 + com.github.texttechnologylab + Utilities + 3.0.2 com.github.texttechnologylab UIMATypeSystem - 29fe3e0ab5 + 3.0.14 - com.github.texttechnologylab.textimager-uima - textimager-uima-heideltime-biofid - 9b70623c7f + com.github.mevbagci + heideltime + 4.0.7 + + + + org.dkpro.core + dkpro-core-api-segmentation-asl + + + + org.dkpro.core + dkpro-core-io-xmi-asl + + + + org.dkpro.core + dkpro-core-api-resources-asl org.json json - 20180813 + 20240303 + + + + org.junit.jupiter + junit-jupiter + 5.9.0 + test + + duui-HeidelTimeExt-${project.version} + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.13.0 + + ${maven.compiler.release} + + + + + org.apache.maven.plugins + maven-surefire-plugin + 2.22.0 + + + --illegal-access=permit + --add-opens java.base/java.util=ALL-UNNAMED + + + + + + org.apache.maven.plugins + maven-shade-plugin + 3.6.0 + + + package + + shade + + + false + + + + *:* + + 
META-INF/*.SF + META-INF/*.DSA + META-INF/*.RSA + + + + + + + org.texttechnologylab.tools.HeidelTimeExt + + + + + + + META-INF/org.apache.uima.fit/types.txt + + + META-INF/org.apache.uima.fit/typepriorities.txt + + + META-INF/org.apache.uima.fit/fsindexes.txt + + + + + + + + + \ No newline at end of file diff --git a/duui-HeidelTimeExt/src/main/java/org/texttechnologylab/tools/HeidelTimeExt.java b/duui-HeidelTimeExt/src/main/java/org/texttechnologylab/tools/HeidelTimeExt.java index 59dd9857..71ccdccf 100644 --- a/duui-HeidelTimeExt/src/main/java/org/texttechnologylab/tools/HeidelTimeExt.java +++ b/duui-HeidelTimeExt/src/main/java/org/texttechnologylab/tools/HeidelTimeExt.java @@ -1,12 +1,12 @@ package org.texttechnologylab.tools; +import com.sun.net.httpserver.Headers; import com.sun.net.httpserver.HttpExchange; import com.sun.net.httpserver.HttpHandler; import com.sun.net.httpserver.HttpServer; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import de.unihd.dbs.uima.annotator.heideltime.HeidelTime; -import de.unihd.dbs.uima.annotator.heideltime.biofid.HeidelTimeBioFID; import de.unihd.dbs.uima.types.heideltime.Timex3; import org.apache.uima.UIMAException; import org.apache.uima.analysis_engine.AnalysisEngine; @@ -22,149 +22,397 @@ import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.resource.metadata.TypeSystemDescription; import org.json.JSONArray; -import org.json.JSONException; import org.json.JSONObject; import org.texttechnologylab.annotation.type.Time; import org.xml.sax.SAXException; +import java.io.ByteArrayOutputStream; import java.io.IOException; +import java.io.InputStream; import java.io.OutputStream; import java.io.StringWriter; import java.net.InetSocketAddress; -import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.time.LocalDate; +import java.time.format.DateTimeFormatter; +import 
java.util.ArrayList; +import java.util.List; +import java.util.concurrent.Executors; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +/** + * DUUI wrapper for TTLab's HeidelTimeExt component. + * + * This wrapper uses the HeidelTimeExt repository directly via the heideltime artifact. + * It does not depend on textimager-uima-heideltime-biofid, because that wrapper can pull + * incompatible legacy UIMA dependencies into the runtime classpath. + * + * The service communicates with DUUI using XMI serialize/deserialize mode: + * - GET /v1/communication_layer returns communication_layer.lua + * - GET /v1/typesystem returns the UIMA type system + * - GET /v1/details/input_output returns declared input/output types + * - POST /v1/process accepts an XMI CAS and returns the processed XMI CAS + */ public class HeidelTimeExt { + private static final int DEFAULT_PORT = 9714; + private static final String COMMUNICATION_LAYER_RESOURCE = "/communication_layer.lua"; + private static final String DEFAULT_FILENAME = "duui-document"; + public static void main(String[] args) throws Exception { - HttpServer server = HttpServer.create(new InetSocketAddress(9714), 0); - server.createContext("/v1/communication_layer", new CommunicationLayer()); + int port = getIntEnv("DUUI_PORT", DEFAULT_PORT); + int workers = getIntEnv("DUUI_WORKERS", Runtime.getRuntime().availableProcessors()); + + HttpServer server = HttpServer.create(new InetSocketAddress(port), 0); + server.createContext("/v1/communication_layer", new CommunicationLayerHandler()); server.createContext("/v1/typesystem", new TypesystemHandler()); server.createContext("/v1/process", new ProcessHandler()); server.createContext("/v1/details/input_output", new IOHandler()); - - server.setExecutor(null); // creates a default executor + server.createContext("/", new RootHandler()); + server.setExecutor(Executors.newFixedThreadPool(Math.max(1, workers))); server.start(); - 
System.out.println(HeidelTimeExt.class.getSimpleName()+" ready"); + + System.out.println(HeidelTimeExt.class.getSimpleName() + " ready on port " + port + " with " + workers + " workers"); } - static class ProcessHandler implements HttpHandler { - static JCas jc; - private static AggregateBuilder pipeline = new AggregateBuilder(); - private static AnalysisEngine pAE = null; + private static int getIntEnv(String key, int fallback) { + String value = System.getenv(key); + if (value == null || value.isBlank()) { + return fallback; + } + try { + return Integer.parseInt(value.trim()); + } catch (NumberFormatException ignored) { + return fallback; + } + } - static { - try { - jc = JCasFactory.createJCas(); - pipeline.add(createEngineDescription(HeidelTimeBioFID.class)); + private static String getEnv(String key, String fallback) { + String value = System.getenv(key); + if (value == null || value.isBlank()) { + return fallback; + } + return value.trim(); + } - } catch (UIMAException e) { - e.printStackTrace(); - } + private static void writeResponse(HttpExchange exchange, int statusCode, String contentType, byte[] body) throws IOException { + Headers headers = exchange.getResponseHeaders(); + headers.set("Content-Type", contentType); + exchange.sendResponseHeaders(statusCode, body.length); + try (OutputStream os = exchange.getResponseBody()) { + os.write(body); + } + } + + private static void writeText(HttpExchange exchange, int statusCode, String contentType, String body) throws IOException { + writeResponse(exchange, statusCode, contentType, body.getBytes(StandardCharsets.UTF_8)); + } + + private static void methodNotAllowed(HttpExchange exchange) throws IOException { + writeText(exchange, 405, "text/plain; charset=utf-8", "Method not allowed"); + } + + private static class RootHandler implements HttpHandler { + @Override + public void handle(HttpExchange exchange) throws IOException { + JSONObject details = new JSONObject(); + details.put("name", "duui-heideltime-ext"); 
+ details.put("version", System.getenv().getOrDefault("ANNOTATOR_VERSION", "1.0")); + details.put("description", "DUUI wrapper for TTLab HeidelTimeExt"); + details.put("endpoints", new JSONArray() + .put("/v1/communication_layer") + .put("/v1/typesystem") + .put("/v1/details/input_output") + .put("/v1/process")); + writeText(exchange, 200, "application/json; charset=utf-8", details.toString()); } + } + + private static class ProcessHandler implements HttpHandler { @Override - public void handle(HttpExchange t) throws IOException { - try { - jc.reset(); + public void handle(HttpExchange exchange) throws IOException { + if (!"POST".equalsIgnoreCase(exchange.getRequestMethod())) { + methodNotAllowed(exchange); + return; + } + try { + JCas jCas = JCasFactory.createJCas(); XmiSerializationSharedData sharedData = new XmiSerializationSharedData(); - XmiCasDeserializer.deserialize(t.getRequestBody(), jc.getCas(), true, sharedData); - pAE = pipeline.createAggregate(); - SimplePipeline.runPipeline(jc, pAE); - - for (Timex3 timex3 : JCasUtil.select(jc, Timex3.class)) { - Time nTime = new Time(jc); - nTime.setBegin(timex3.getBegin()); - nTime.setEnd(timex3.getEnd()); - nTime.setValue(timex3.getTimexType()); - nTime.setIdentifier(timex3.getTimexValue()); - nTime.addToIndexes(); - } + XmiCasDeserializer.deserialize(exchange.getRequestBody(), jCas.getCas(), true, sharedData); + + ensureHeidelTimeInputAnnotations(jCas); - t.sendResponseHeaders(200, 0); - XmiCasSerializer.serialize(jc.getCas(), null, t.getResponseBody(), false, sharedData); + AnalysisEngine analysisEngine = createPipeline(jCas); + SimplePipeline.runPipeline(jCas, analysisEngine); + copyTimex3ToTTLabTime(jCas); - t.getResponseBody().close(); + ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); + XmiCasSerializer.serialize(jCas.getCas(), null, outputStream, false, sharedData); + + writeResponse(exchange, 200, "application/xmi+xml", outputStream.toByteArray()); } catch (Exception e) { 
e.printStackTrace(); - t.sendResponseHeaders(404, -1); + String message = "HeidelTimeExt processing failed: " + e.getMessage(); + writeText(exchange, 500, "text/plain; charset=utf-8", message); } + } - t.getResponseBody().close(); + private AnalysisEngine createPipeline(JCas jCas) throws UIMAException { + String language = getEnv("HEIDELTIME_LANGUAGE", normalizeLanguage(jCas.getDocumentLanguage())); + String typeToProcess = getEnv("HEIDELTIME_TYPE", "narrative"); + String locale = getEnv("HEIDELTIME_LOCALE", localeForLanguage(language)); + boolean findTemponyms = Boolean.parseBoolean(getEnv("HEIDELTIME_TEMPONYMS", "true")); + + AggregateBuilder pipeline = new AggregateBuilder(); + pipeline.add(createEngineDescription( + HeidelTime.class, + "Language", language, + "Type", typeToProcess, + "locale", locale, + "Date", true, + "Time", true, + "Duration", true, + "Set", true, + "Temponym", findTemponyms, + "ConvertDurations", true, + "Debugging", false + )); + return pipeline.createAggregate(); } - } - static class TypesystemHandler implements HttpHandler { - @Override - public void handle(HttpExchange t) throws IOException { - try { - TypeSystemDescription desc = TypeSystemDescriptionFactory.createTypeSystemDescription(); - StringWriter writer = new StringWriter(); - desc.toXML(writer); - String response = writer.getBuffer().toString(); + /** + * HeidelTime expects its own Sentence/Token types. DUUI pipelines often provide DKPro + * Sentence/Token annotations, so we mirror them into the HeidelTime type system before + * executing the HeidelTime annotator. 
+ */ + private void ensureHeidelTimeInputAnnotations(JCas jCas) { + if (JCasUtil.exists(jCas, de.unihd.dbs.uima.types.heideltime.Sentence.class) + && JCasUtil.exists(jCas, de.unihd.dbs.uima.types.heideltime.Token.class)) { + return; + } + + List dkproSentences = new ArrayList<>(JCasUtil.select(jCas, Sentence.class)); + if (dkproSentences.isEmpty() && jCas.getDocumentText() != null) { + Sentence sentence = new Sentence(jCas, 0, jCas.getDocumentText().length()); + sentence.addToIndexes(); + dkproSentences.add(sentence); + } - t.sendResponseHeaders(200, response.getBytes(Charset.defaultCharset()).length); + int sentenceId = 1; + for (Sentence dkproSentence : dkproSentences) { + de.unihd.dbs.uima.types.heideltime.Sentence heidelSentence = + new de.unihd.dbs.uima.types.heideltime.Sentence( + jCas, + dkproSentence.getBegin(), + dkproSentence.getEnd() + ); + heidelSentence.setFilename(DEFAULT_FILENAME); + heidelSentence.setSentenceId(sentenceId); + heidelSentence.addToIndexes(); + sentenceId++; + } - OutputStream os = t.getResponseBody(); - os.write(response.getBytes(Charset.defaultCharset())); + List dkproTokens = new ArrayList<>(JCasUtil.select(jCas, Token.class)); + if (dkproTokens.isEmpty()) { + createWhitespaceTokens(jCas); + dkproTokens = new ArrayList<>(JCasUtil.select(jCas, Token.class)); + } - } catch (ResourceInitializationException e) { - e.printStackTrace(); - t.sendResponseHeaders(404, -1); + int tokenId = 1; + for (Token dkproToken : dkproTokens) { + de.unihd.dbs.uima.types.heideltime.Token heidelToken = + new de.unihd.dbs.uima.types.heideltime.Token( + jCas, + dkproToken.getBegin(), + dkproToken.getEnd() + ); + heidelToken.setFilename(DEFAULT_FILENAME); + heidelToken.setTokenId(tokenId); + heidelToken.setSentId(findSentenceId(dkproSentences, dkproToken)); + heidelToken.setPos("NN"); + heidelToken.addToIndexes(); + tokenId++; + } + + ensureDct(jCas); + } + + private void createWhitespaceTokens(JCas jCas) { + String text = jCas.getDocumentText(); + if (text 
== null || text.isEmpty()) { return; - } catch (SAXException e) { - e.printStackTrace(); - } finally { - t.getResponseBody().close(); } + int tokenBegin = -1; + for (int i = 0; i <= text.length(); i++) { + boolean boundary = i == text.length() || Character.isWhitespace(text.charAt(i)); + + if (!boundary && tokenBegin < 0) { + tokenBegin = i; + } + + if (boundary && tokenBegin >= 0) { + Token token = new Token(jCas, tokenBegin, i); + token.addToIndexes(); + tokenBegin = -1; + } + } + } + + private int findSentenceId(List sentences, Token token) { + for (int i = 0; i < sentences.size(); i++) { + Sentence sentence = sentences.get(i); + if (token.getBegin() >= sentence.getBegin() && token.getEnd() <= sentence.getEnd()) { + return i + 1; + } + } + return 1; + } + + private void ensureDct(JCas jCas) { + if (JCasUtil.exists(jCas, de.unihd.dbs.uima.types.heideltime.Dct.class)) { + return; + } + + String today = LocalDate.now().format(DateTimeFormatter.BASIC_ISO_DATE); + de.unihd.dbs.uima.types.heideltime.Dct dct = new de.unihd.dbs.uima.types.heideltime.Dct(jCas, 0, 0); + dct.setFilename(DEFAULT_FILENAME); + dct.setTimexId("dct"); + dct.setValue(today); + dct.addToIndexes(); + } + + private String normalizeLanguage(String documentLanguage) { + if (documentLanguage == null || documentLanguage.isBlank()) { + return "german"; + } + + String language = documentLanguage.trim().toLowerCase(); + if (language.equals("de") || language.equals("deu") || language.equals("ger") || language.equals("german")) { + return "german"; + } + if (language.equals("en") || language.equals("eng") || language.equals("english")) { + return "english"; + } + if (language.equals("nl") || language.equals("nld") || language.equals("dut") || language.equals("dutch")) { + return "dutch"; + } + if (language.equals("es") || language.equals("spa") || language.equals("spanish")) { + return "spanish"; + } + if (language.equals("it") || language.equals("ita") || language.equals("italian")) { + return "italian"; + 
} + if (language.equals("fr") || language.equals("fra") || language.equals("fre") || language.equals("french")) { + return "french"; + } + if (language.equals("pt") || language.equals("por") || language.equals("portuguese")) { + return "portuguese"; + } + if (language.equals("ru") || language.equals("rus") || language.equals("russian")) { + return "russian"; + } + if (language.equals("zh") || language.equals("zho") || language.equals("chi") || language.equals("chinese")) { + return "chinese"; + } + if (language.equals("ar") || language.equals("ara") || language.equals("arabic")) { + return "arabic"; + } + if (language.equals("hr") || language.equals("hrv") || language.equals("croatian")) { + return "croatian"; + } + if (language.equals("et") || language.equals("est") || language.equals("estonian")) { + return "estonian"; + } + if (language.equals("vi") || language.equals("vie") || language.equals("vietnamese")) { + return "vietnamese"; + } + + return language; + } + + private String localeForLanguage(String language) { + if ("german".equalsIgnoreCase(language)) { + return "de_DE"; + } + if ("english".equalsIgnoreCase(language)) { + return "en_GB"; + } + return "en_GB"; + } + + private void copyTimex3ToTTLabTime(JCas jCas) { + for (Timex3 timex3 : JCasUtil.select(jCas, Timex3.class)) { + Time time = new Time(jCas); + time.setBegin(timex3.getBegin()); + time.setEnd(timex3.getEnd()); + time.setValue(timex3.getTimexType()); + time.setIdentifier(timex3.getTimexValue()); + time.addToIndexes(); + } } } - static class IOHandler implements HttpHandler { + private static class TypesystemHandler implements HttpHandler { @Override - public void handle(HttpExchange t) throws IOException { - try { - JSONObject rObject = new JSONObject(); - rObject.put("input", new JSONArray().put(Token.class.getName()).put(Sentence.class.getName())); - rObject.put("output", new JSONArray().put(Timex3.class.getName()).put(Time.class.getName())); - String response = rObject.toString(); - 
t.sendResponseHeaders(200, response.getBytes(Charset.defaultCharset()).length); - - OutputStream os = t.getResponseBody(); - os.write(response.getBytes(Charset.defaultCharset())); + public void handle(HttpExchange exchange) throws IOException { + if (!"GET".equalsIgnoreCase(exchange.getRequestMethod())) { + methodNotAllowed(exchange); + return; + } - } catch (JSONException e) { + try { + TypeSystemDescription description = TypeSystemDescriptionFactory.createTypeSystemDescription(); + StringWriter writer = new StringWriter(); + description.toXML(writer); + writeText(exchange, 200, "application/xml; charset=utf-8", writer.toString()); + } catch (ResourceInitializationException | SAXException e) { e.printStackTrace(); - t.sendResponseHeaders(404, -1); + writeText(exchange, 500, "text/plain; charset=utf-8", "Could not create type system: " + e.getMessage()); + } + } + } + + private static class IOHandler implements HttpHandler { + @Override + public void handle(HttpExchange exchange) throws IOException { + if (!"GET".equalsIgnoreCase(exchange.getRequestMethod())) { + methodNotAllowed(exchange); return; - } finally { - t.getResponseBody().close(); } + JSONObject response = new JSONObject(); + response.put("input", new JSONArray() + .put(Token.class.getName()) + .put(Sentence.class.getName())); + response.put("output", new JSONArray() + .put(Timex3.class.getName()) + .put(Time.class.getName())); + + writeText(exchange, 200, "application/json; charset=utf-8", response.toString()); } } - static class CommunicationLayer implements HttpHandler { + private static class CommunicationLayerHandler implements HttpHandler { @Override - public void handle(HttpExchange t) throws IOException { - String response = "serial = luajava.bindClass(\"org.apache.uima.cas.impl.XmiCasSerializer\")\n" + - "deserial = luajava.bindClass(\"org.apache.uima.cas.impl.XmiCasDeserializer\")" + - "function serialize(inputCas,outputStream,params)\n" + - " 
serial:serialize(inputCas:getCas(),outputStream)\n" + - "end\n" + - "\n" + - "function deserialize(inputCas,inputStream)\n" + - " inputCas:reset()\n" + - " deserial:deserialize(inputStream,inputCas:getCas(),true)\n" + - "end"; - t.sendResponseHeaders(200, response.length()); - OutputStream os = t.getResponseBody(); - os.write(response.getBytes()); - os.close(); + public void handle(HttpExchange exchange) throws IOException { + if (!"GET".equalsIgnoreCase(exchange.getRequestMethod())) { + methodNotAllowed(exchange); + return; + } + + try (InputStream inputStream = HeidelTimeExt.class.getResourceAsStream(COMMUNICATION_LAYER_RESOURCE)) { + if (inputStream == null) { + writeText(exchange, 500, "text/plain; charset=utf-8", "Missing resource: " + COMMUNICATION_LAYER_RESOURCE); + return; + } + byte[] response = inputStream.readAllBytes(); + writeResponse(exchange, 200, "text/plain; charset=utf-8", response); + } } } } diff --git a/duui-HeidelTimeExt/src/main/resources/communication_layer.lua b/duui-HeidelTimeExt/src/main/resources/communication_layer.lua new file mode 100644 index 00000000..551b8246 --- /dev/null +++ b/duui-HeidelTimeExt/src/main/resources/communication_layer.lua @@ -0,0 +1,17 @@ +-- DUUI communication layer for HeidelTimeExt. +-- This component uses the classic serialize/deserialize mode and transfers the CAS as XMI. 
+ +serial = luajava.bindClass("org.apache.uima.cas.impl.XmiCasSerializer") +deserial = luajava.bindClass("org.apache.uima.cas.impl.XmiCasDeserializer") + +SUPPORTS_PROCESS = false +SUPPORTS_SERIALIZE = true + +function serialize(inputCas, outputStream, params) + serial:serialize(inputCas:getCas(), outputStream) +end + +function deserialize(inputCas, inputStream) + inputCas:reset() + deserial:deserialize(inputStream, inputCas:getCas(), true) +end diff --git a/duui-HeidelTimeExt/src/main/test/java/org/texttechnology/tools/HeidelTimeExtTest.java b/duui-HeidelTimeExt/src/main/test/java/org/texttechnology/tools/HeidelTimeExtTest.java new file mode 100644 index 00000000..aa75c886 --- /dev/null +++ b/duui-HeidelTimeExt/src/main/test/java/org/texttechnology/tools/HeidelTimeExtTest.java @@ -0,0 +1,213 @@ +package org.texttechnology.tools; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import de.unihd.dbs.uima.types.heideltime.Timex3; +import org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.apache.uima.util.XmlCasSerializer; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.Assumptions; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.texttechnologylab.DockerUnifiedUIMAInterface.DUUIComposer; +import org.texttechnologylab.DockerUnifiedUIMAInterface.driver.DUUIRemoteDriver; +import org.texttechnologylab.DockerUnifiedUIMAInterface.lua.DUUILuaContext; +import org.texttechnologylab.annotation.type.Time; +import org.xml.sax.SAXException; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.net.HttpURLConnection; +import java.net.URL; +import java.net.UnknownHostException; +import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import 
java.util.Collection; +import java.util.List; + +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class HeidelTimeExtTest { + + static DUUIComposer composer; + static JCas cas; + static String url = getenvOrDefault("HEIDELTIME_EXT_URL", "http://127.0.0.1:9714"); + + @BeforeAll + static void beforeAll() throws Exception { + composer = new DUUIComposer() + .withSkipVerification(true) + .withLuaContext(new DUUILuaContext().withJsonLibrary()); + + DUUIRemoteDriver remoteDriver = new DUUIRemoteDriver(); + composer.addDriver(remoteDriver); + + cas = JCasFactory.createJCas(); + } + + @AfterEach + public void afterEach() throws IOException, SAXException { + composer.resetPipeline(); + + if (cas != null) { + ByteArrayOutputStream stream = new ByteArrayOutputStream(); + XmlCasSerializer.serialize(cas.getCas(), null, stream); + System.out.println(stream.toString(StandardCharsets.UTF_8)); + cas.reset(); + } + } + + @AfterAll + static void afterAll() throws UnknownHostException { + if (composer != null) { + composer.shutdown(); + } + } + + @Test + public void CommunicationLayerTest() throws Exception { + Assumptions.assumeTrue( + serviceAvailable(url), + "Skipping test because no HeidelTimeExt DUUI service is reachable at " + url + ); + + String communicationLayer = httpGet(url + "/v1/communication_layer"); + + assertTrue(communicationLayer.contains("SUPPORTS_SERIALIZE = true")); + assertTrue(communicationLayer.contains("function serialize")); + assertTrue(communicationLayer.contains("function deserialize")); + } + + @Test + public void HeidelTimeExtRemoteTest() throws Exception { + Assumptions.assumeTrue( + serviceAvailable(url), + "Skipping test because no HeidelTimeExt DUUI service is reachable at " + url + ); + + composer.add(new DUUIRemoteDriver.Component(url)); + + createCas( + "de", + Arrays.asList( + "Am 12. 
Mai 2024 begann die Exkursion.", + "Nach zwei Tagen wurden weitere Proben gesammelt.", + "Morgen um 14 Uhr soll ein weiteres Treffen stattfinden." + ) + ); + + composer.run(cas); + + Collection timexAnnotations = JCasUtil.select(cas, Timex3.class); + Collection