diff --git a/duui-Climate/.dockerignore b/duui-Climate/.dockerignore new file mode 100644 index 00000000..caab0c36 --- /dev/null +++ b/duui-Climate/.dockerignore @@ -0,0 +1,3 @@ +.idea/ +target/ +venv/ \ No newline at end of file diff --git a/duui-Climate/.gitignore b/duui-Climate/.gitignore new file mode 100644 index 00000000..d2092691 --- /dev/null +++ b/duui-Climate/.gitignore @@ -0,0 +1,3 @@ +.idea/ +target/ +venv*/ \ No newline at end of file diff --git a/duui-Climate/Readme.md b/duui-Climate/Readme.md new file mode 100644 index 00000000..52078b0b --- /dev/null +++ b/duui-Climate/Readme.md @@ -0,0 +1,90 @@ +[![Version](https://img.shields.io/static/v1?label=duui-climate&message=0.1.0&color=blue)](https://docker.texttechnologylab.org/v2/duui-transformers-topic/tags/list) +[![Version](https://img.shields.io/static/v1?label=Python&message=3.12&color=green)]() +[![Version](https://img.shields.io/static/v1?label=Transformers&message=5.9.0&color=yellow)]() +[![Version](https://img.shields.io/static/v1?label=Torch&message=2.11.0&color=red)]() + +# Transformers Climate + +DUUI implementation for selected Hugging-Face-based transformer [Climate tools](https://huggingface.co/models?sort=trending&search=climatebert) models. 
+## Included Models + +| Name | Source | Revision | Languages | +|-------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------|--------------------------------|----------| +| distilroberta-base-climate-detector | https://huggingface.co/climatebert/distilroberta-base-climate-detector | 2c3bc660d45a59e31b35f5d3e365ee4f59fdf76c | EN | +| distilroberta-base-climate-tcfd | https://huggingface.co/climatebert/distilroberta-base-climate-tcfd | 970630beedc21db81a84156448ad2e3ac860153d | EN | +| distilroberta-base-climate-commitment | https://huggingface.co/climatebert/distilroberta-base-climate-commitment | 17337c3292df16a8fe93b1505dfe4122d50a4c91 | EN | +| distilroberta-base-climate-sentiment | https://huggingface.co/climatebert/distilroberta-base-climate-sentiment | e9f9a94ee4263f5ad5cfc97b8539a497fc88aa7d | EN | +| distilroberta-base-climate-specificity | https://huggingface.co/climatebert/distilroberta-base-climate-specificity | 4ada96ed4bf5c3a7a711282e41f1ab9b29f0ddea | EN | + +# How To Use + +To use duui-climate as a DUUI image, you need the [Docker Unified UIMA Interface (DUUI)](https://github.com/texttechnologylab/DockerUnifiedUIMAInterface). 
+ +## Start Docker container + +``` +docker run --rm -p 9714:9714 docker.texttechnologylab.org/duui-climate-[modelname]:latest + +``` + +Find all available image tags here: [https://docker.texttechnologylab.org/v2/duui-climate-[modelname]/tags/list](https://docker.texttechnologylab.org/v2/duui-climate-[modelname]/tags/list) + +## Run within DUUI + +``` +composer.add( + new DUUIDockerDriver.Component("docker.texttechnologylab.org/duui-climate-[modelname]:latest") + .withParameter("selection", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence") +); +``` + +### Parameters + +| Name | Description | +| ---- | ----------- | +| `selection` | Use `text` to process the full document text or any selectable UIMA type class name | + +# Cite + +If you use this DUUI image, please cite it as follows: + +Alexander Leonhardt, Giuseppe Abrami, Daniel Baumartz and Alexander Mehler. (2023). "Unlocking the Heterogeneous Landscape of Big Data NLP with DUUI." Findings of the Association for Computational Linguistics: EMNLP 2023, 385–399. [[LINK](https://aclanthology.org/2023.findings-emnlp.29)] [[PDF](https://aclanthology.org/2023.findings-emnlp.29.pdf)] + +## BibTeX + +``` +@inproceedings{Leonhardt:et:al:2023, + title = {Unlocking the Heterogeneous Landscape of Big Data {NLP} with {DUUI}}, + author = {Leonhardt, Alexander and Abrami, Giuseppe and Baumartz, Daniel and Mehler, Alexander}, + editor = {Bouamor, Houda and Pino, Juan and Bali, Kalika}, + booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2023}, + year = {2023}, + address = {Singapore}, + publisher = {Association for Computational Linguistics}, + url = {https://aclanthology.org/2023.findings-emnlp.29}, + pages = {385--399}, + pdf = {https://aclanthology.org/2023.findings-emnlp.29.pdf}, + abstract = {Automatic analysis of large corpora is a complex task, especially + in terms of time efficiency. 
This complexity is increased by the + fact that flexible, extensible text analysis requires the continuous + integration of ever new tools. Since there are no adequate frameworks + for these purposes in the field of NLP, and especially in the + context of UIMA, that are not outdated or unusable for security + reasons, we present a new approach to address the latter task: + Docker Unified UIMA Interface (DUUI), a scalable, flexible, lightweight, + and feature-rich framework for automatic distributed analysis + of text corpora that leverages Big Data experience and virtualization + with Docker. We evaluate DUUI{'}s communication approach against + a state-of-the-art approach and demonstrate its outstanding behavior + in terms of time efficiency, enabling the analysis of big text + data.} +} + +@misc{Bagci:2024, + author = {Bagci, Mevlüt}, + title = {Hugging-Face-based climate models as {DUUI} component}, + year = {2026}, + howpublished = {https://github.com/texttechnologylab/duui-uima/tree/main/duui-Climate} +} + +``` diff --git a/duui-Climate/docker_build.sh b/duui-Climate/docker_build.sh new file mode 100644 index 00000000..0abfc296 --- /dev/null +++ b/duui-Climate/docker_build.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash +set -euo pipefail + +export ANNOTATOR_CUDA= +#export ANNOTATOR_CUDA="-cuda" + +export ANNOTATOR_NAME=duui-climate +export ANNOTATOR_VERSION=0.1.0 +export LOG_LEVEL=DEBUG +export MODEL_CACHE_SIZE=3 +export DOCKER_REGISTRY="docker.texttechnologylab.org/" + +###--------------------------------------------------------------------- +#export MODEL_NAME="climatebert/distilroberta-base-climate-detector" +#export MODEL_SPECNAME="distilroberta-base-climate-detector" +#export MODEL_VERSION="2c3bc660d45a59e31b35f5d3e365ee4f59fdf76c" +#export MODEL_SOURCE="https://huggingface.co/climatebert/distilroberta-base-climate-detector" +#export MODEL_LANG="EN" +###-------------------------------------------------------------------- + 
+###--------------------------------------------------------------------- +#export MODEL_NAME="climatebert/distilroberta-base-climate-tcfd" +#export MODEL_SPECNAME="distilroberta-base-climate-tcfd" +#export MODEL_VERSION="970630beedc21db81a84156448ad2e3ac860153d" +#export MODEL_SOURCE="https://huggingface.co/climatebert/distilroberta-base-climate-tcfd" +#export MODEL_LANG="EN" +###-------------------------------------------------------------------- + +###--------------------------------------------------------------------- +#export MODEL_NAME="climatebert/distilroberta-base-climate-commitment" +#export MODEL_SPECNAME="distilroberta-base-climate-commitment" +#export MODEL_VERSION="17337c3292df16a8fe93b1505dfe4122d50a4c91" +#export MODEL_SOURCE="https://huggingface.co/climatebert/distilroberta-base-climate-commitment" +#export MODEL_LANG="EN" +###-------------------------------------------------------------------- + +###--------------------------------------------------------------------- +#export MODEL_NAME="climatebert/distilroberta-base-climate-sentiment" +#export MODEL_SPECNAME="distilroberta-base-climate-sentiment" +#export MODEL_VERSION="e9f9a94ee4263f5ad5cfc97b8539a497fc88aa7d" +#export MODEL_SOURCE="https://huggingface.co/climatebert/distilroberta-base-climate-sentiment" +#export MODEL_LANG="EN" +###-------------------------------------------------------------------- + +##--------------------------------------------------------------------- +export MODEL_NAME="climatebert/distilroberta-base-climate-specificity" +export MODEL_SPECNAME="distilroberta-base-climate-specificity" +export MODEL_VERSION="4ada96ed4bf5c3a7a711282e41f1ab9b29f0ddea" +export MODEL_SOURCE="https://huggingface.co/climatebert/distilroberta-base-climate-specificity" +export MODEL_LANG="EN" +##-------------------------------------------------------------------- + + + +docker build \ + --build-arg ANNOTATOR_NAME \ + --build-arg ANNOTATOR_VERSION \ + --build-arg LOG_LEVEL \ + --build-arg 
MODEL_CACHE_SIZE \ + --build-arg MODEL_NAME \ + --build-arg MODEL_VERSION \ + --build-arg MODEL_SOURCE \ + --build-arg MODEL_LANG \ + -t ${DOCKER_REGISTRY}${ANNOTATOR_NAME}"-"${MODEL_SPECNAME}:${ANNOTATOR_VERSION}${ANNOTATOR_CUDA} \ + -f src/main/docker/Dockerfile${ANNOTATOR_CUDA} \ + . + +docker tag \ + ${DOCKER_REGISTRY}${ANNOTATOR_NAME}"-"${MODEL_SPECNAME}:${ANNOTATOR_VERSION}${ANNOTATOR_CUDA} \ + ${DOCKER_REGISTRY}${ANNOTATOR_NAME}"-"${MODEL_SPECNAME}:latest${ANNOTATOR_CUDA} diff --git a/duui-Climate/pom.xml b/duui-Climate/pom.xml new file mode 100644 index 00000000..23c49fe7 --- /dev/null +++ b/duui-Climate/pom.xml @@ -0,0 +1,157 @@ + + + 4.0.0 + + org.texttechnologylab.duui + duui-climate + 0.1.0 + + + + AGPL-3.0-or-later + https://www.gnu.org/licenses/agpl.txt + repo + GNU Affero General Public License v3.0 or later + + + + + Texttechnology Lab + https://www.texttechnologylab.org + + + + mehler + Prof. Dr. Alexander Mehler + mehler@em.uni-frankfurt.de + https://www.texttechnologylab.org/team/alexander-abrami/ + Goethe University Frankfurt / Texttechnology Lab + https://www.texttechnologylab.org + + head of department + + + + bagci + Mevlüt Bagci + bagci@em.uni-frankfurt.de + https://www.texttechnologylab.org/team/mevl%c3%bct-bagci/ + Goethe University Frankfurt / Texttechnology Lab + https://www.texttechnologylab.org + + lead developer + + Europe/Berlin + + + + + + + org.apache.maven.plugins + maven-surefire-plugin + 2.22.0 + + + --illegal-access=permit + --add-opens java.base/java.util=ALL-UNNAMED + + + + + + + + + + 17 + 17 + 2.4.0 + + + + + + + jitpack.io + https://jitpack.io + + + + + + + org.dkpro.core + dkpro-core-asl + ${dkpro.core.version} + pom + import + + + + + + + + com.github.mevbagci + DockerUnifiedUIMAInterface + + + ad501be374 + + + + + + + + + com.github.mevbagci + UIMATypeSystem + 3.0.23.1 + + + + + + + + + + + + + + + + org.junit.jupiter + junit-jupiter + 5.9.0 + test + + + + org.dkpro.core + dkpro-core-api-segmentation-asl + test + + + + 
org.dkpro.core + dkpro-core-io-xmi-asl + test + + + + org.dkpro.core + dkpro-core-api-resources-asl + test + + + \ No newline at end of file diff --git a/duui-Climate/requirements.txt b/duui-Climate/requirements.txt new file mode 100644 index 00000000..c8109fba --- /dev/null +++ b/duui-Climate/requirements.txt @@ -0,0 +1,14 @@ +torch==2.11.0 +torchaudio==2.11.0 +torchvision==0.26.0 +scipy==1.17.1 +transformers==5.9.0 +sentencepiece==0.2.1 +protobuf==4.25.3 +numpy==2.4.6 +scikit-learn==1.8.0 +fastapi==0.110.0 +dkpro-cassis==0.9.1 +uvicorn[standard]==0.27.1 +pydantic-settings==2.0.2 +torchmetrics==1.2.0 \ No newline at end of file diff --git a/duui-Climate/src/main/docker/Dockerfile b/duui-Climate/src/main/docker/Dockerfile new file mode 100644 index 00000000..69b89a12 --- /dev/null +++ b/duui-Climate/src/main/docker/Dockerfile @@ -0,0 +1,55 @@ +FROM python:3.12 + +WORKDIR /usr/src/app + +EXPOSE 9714 + +# dependencies +COPY ./requirements.txt ./requirements.txt +RUN pip install -r requirements.txt + +# copy scripts +COPY ./src/main/python/TypeSystemTopic.xml ./TypeSystemTopic.xml +COPY ./src/main/python/duui_climate.py ./duui_climate.py +COPY ./src/main/python/duui_climate.lua ./duui_climate.lua +COPY ./src/main/python/Climate.py ./Climate.py + +#RUN python -c "from transformers import AutoModelForSequenceClassification, AutoTokenizer; AutoTokenizer.from_pretrained('climatebert/distilroberta-base-climate-detector'); AutoModelForSequenceClassification.from_pretrained('climatebert/distilroberta-base-climate-detector')" +#RUN python -c "from transformers import AutoModelForSequenceClassification, AutoTokenizer; AutoTokenizer.from_pretrained('climatebert/distilroberta-base-climate-tcfd'); AutoModelForSequenceClassification.from_pretrained('climatebert/distilroberta-base-climate-tcfd')" +#RUN python -c "from transformers import AutoModelForSequenceClassification, AutoTokenizer; AutoTokenizer.from_pretrained('climatebert/distilroberta-base-climate-commitment'); 
AutoModelForSequenceClassification.from_pretrained('climatebert/distilroberta-base-climate-commitment')" +#RUN python -c "from transformers import AutoModelForSequenceClassification, AutoTokenizer; AutoTokenizer.from_pretrained('climatebert/distilroberta-base-climate-sentiment'); AutoModelForSequenceClassification.from_pretrained('climatebert/distilroberta-base-climate-sentiment')" +RUN python -c "from transformers import AutoModelForSequenceClassification, AutoTokenizer; AutoTokenizer.from_pretrained('climatebert/distilroberta-base-climate-specificity'); AutoModelForSequenceClassification.from_pretrained('climatebert/distilroberta-base-climate-specificity')" + +# log level +ARG LOG_LEVEL="DEBUG" +ENV LOG_LEVEL=$LOG_LEVEL + +# config +ARG MODEL_CACHE_SIZE=3 +ENV MODEL_CACHE_SIZE=$MODEL_CACHE_SIZE + +# meta data +ARG ANNOTATOR_NAME="duui-climate" +ENV ANNOTATOR_NAME=$ANNOTATOR_NAME +ARG ANNOTATOR_VERSION="unset" +ENV ANNOTATOR_VERSION=$ANNOTATOR_VERSION + +# Model Info +ARG MODEL_VERSION=0.1 +ENV MODEL_VERSION=$MODEL_VERSION +ARG MODEL_NAME="" +ENV MODEL_NAME=$MODEL_NAME +ARG MODEL_SOURCE="" +ENV MODEL_SOURCE=$MODEL_SOURCE +ARG MODEL_LANG="" +ENV MODEL_LANG=$MODEL_LANG + +# offline mode for huggingface +ARG TEXTIMAGER_DUUI_TRANSFORMERS_SENTIMENT_TRANSFORMERS_OFFLINE=1 +ENV TRANSFORMERS_OFFLINE=$TEXTIMAGER_DUUI_TRANSFORMERS_SENTIMENT_TRANSFORMERS_OFFLINE + + + + +ENTRYPOINT ["uvicorn", "duui_climate:app", "--host", "0.0.0.0", "--port" ,"9714"] +CMD ["--workers", "1"] diff --git a/duui-Climate/src/main/docker/Dockerfile-cuda b/duui-Climate/src/main/docker/Dockerfile-cuda new file mode 100644 index 00000000..8d902811 --- /dev/null +++ b/duui-Climate/src/main/docker/Dockerfile-cuda @@ -0,0 +1,74 @@ +FROM nvidia/cuda:11.8.0-base-ubuntu22.04 + +RUN apt update && \ + DEBIAN_FRONTEND=noninteractive \ + apt install --no-install-recommends -y build-essential software-properties-common && \ + add-apt-repository -y ppa:deadsnakes/ppa && \ + apt install --no-install-recommends 
-y python3.10 python3-pip python3-setuptools python3-distutils && \ + apt clean && rm -rf /var/lib/apt/lists/* + +RUN ln -s /usr/bin/python3 /usr/bin/python +RUN python -m pip install --upgrade pip + +WORKDIR /usr/src/app + +EXPOSE 9714 + +# dependencies +RUN pip install setuptools wheel +COPY ./requirements.txt ./requirements.txt +RUN apt remove -y python3-blinker || true +RUN pip install -r requirements.txt + + + +# dependencies +COPY ./requirements.txt ./requirements.txt +RUN pip install -r requirements.txt + +# copy scripts (same files as the CPU Dockerfile; the ENTRYPOINT below starts duui_climate:app) +COPY ./src/main/python/TypeSystemTopic.xml ./TypeSystemTopic.xml +COPY ./src/main/python/duui_climate.py ./duui_climate.py +COPY ./src/main/python/duui_climate.lua ./duui_climate.lua +COPY ./src/main/python/Climate.py ./Climate.py + +#RUN python -c "from transformers import AutoModelForSequenceClassification, AutoTokenizer; AutoTokenizer.from_pretrained('climatebert/distilroberta-base-climate-detector'); AutoModelForSequenceClassification.from_pretrained('climatebert/distilroberta-base-climate-detector')" +#RUN python -c "from transformers import AutoModelForSequenceClassification, AutoTokenizer; AutoTokenizer.from_pretrained('climatebert/distilroberta-base-climate-tcfd'); AutoModelForSequenceClassification.from_pretrained('climatebert/distilroberta-base-climate-tcfd')" +#RUN python -c "from transformers import AutoModelForSequenceClassification, AutoTokenizer; AutoTokenizer.from_pretrained('climatebert/distilroberta-base-climate-commitment'); AutoModelForSequenceClassification.from_pretrained('climatebert/distilroberta-base-climate-commitment')" +#RUN python -c "from transformers import AutoModelForSequenceClassification, AutoTokenizer; AutoTokenizer.from_pretrained('climatebert/distilroberta-base-climate-sentiment'); AutoModelForSequenceClassification.from_pretrained('climatebert/distilroberta-base-climate-sentiment')" +RUN python -c "from transformers import AutoModelForSequenceClassification, AutoTokenizer; 
AutoTokenizer.from_pretrained('climatebert/distilroberta-base-climate-specificity'); AutoModelForSequenceClassification.from_pretrained('climatebert/distilroberta-base-climate-specificity')" + +# log level +ARG LOG_LEVEL="DEBUG" +ENV LOG_LEVEL=$LOG_LEVEL + +# config +ARG MODEL_CACHE_SIZE=3 +ENV MODEL_CACHE_SIZE=$MODEL_CACHE_SIZE + +# meta data +ARG ANNOTATOR_NAME="duui-climate" +ENV ANNOTATOR_NAME=$ANNOTATOR_NAME +ARG ANNOTATOR_VERSION="unset" +ENV ANNOTATOR_VERSION=$ANNOTATOR_VERSION + +# Model Info +ARG MODEL_VERSION=0.1 +ENV MODEL_VERSION=$MODEL_VERSION +ARG MODEL_NAME="" +ENV MODEL_NAME=$MODEL_NAME +ARG MODEL_SOURCE="" +ENV MODEL_SOURCE=$MODEL_SOURCE +ARG MODEL_LANG="" +ENV MODEL_LANG=$MODEL_LANG + +# offline mode for huggingface +ARG TEXTIMAGER_DUUI_TRANSFORMERS_SENTIMENT_TRANSFORMERS_OFFLINE=1 +ENV TRANSFORMERS_OFFLINE=$TEXTIMAGER_DUUI_TRANSFORMERS_SENTIMENT_TRANSFORMERS_OFFLINE + + + + +ENTRYPOINT ["uvicorn", "duui_climate:app", "--host", "0.0.0.0", "--port" ,"9714"] +CMD ["--workers", "1"] + diff --git a/duui-Climate/src/main/python/Climate.py b/duui-Climate/src/main/python/Climate.py new file mode 100644 index 00000000..b9ed6344 --- /dev/null +++ b/duui-Climate/src/main/python/Climate.py @@ -0,0 +1,53 @@ +import torch +import math +from transformers import AutoModelForSequenceClassification, AutoTokenizer +from scipy.special import softmax +import numpy as np +from typing import List + +model_name_map = { + "climatebert/distilroberta-base-climate-detector": "ClimateDetector", + "climatebert/distilroberta-base-climate-tcfd": "ClimateTCFD", + "climatebert/distilroberta-base-climate-commitment": "ClimateCommitment", + "climatebert/distilroberta-base-climate-sentiment": "ClimateSentiment", + "climatebert/distilroberta-base-climate-specificity": "ClimateSpecificity", +} + +def sigmoid(x): + return 1 / (1 + math.exp(-x)) + +class ClimateBert: + def __init__(self, model_name: str, device='cuda:0'): + self.device = device + self.tokenizer = 
AutoTokenizer.from_pretrained(model_name) + self.model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device) + self.class_mapping = self.model.config.id2label + self.labels = list(self.class_mapping.values()) + + def prediction(self, texts: List[str]): + with torch.no_grad(): + inputs = self.tokenizer( + texts, + return_tensors="pt", + padding=True, + truncation=True, + max_length=512 + ).to(self.device) + + outputs = self.model(**inputs) + logits = outputs[0].float() # convert bfloat16 -> float32 + probs = torch.softmax(logits, dim=-1) + + score_list = [] + + for prob in probs.cpu(): + ranking = torch.argsort(prob, descending=True) + + score_dict_i = { + self.labels[i]: float(prob[i]) + for i in ranking + } + + score_list.append(score_dict_i) + return score_list + diff --git a/duui-Climate/src/main/python/TypeSystemTopic.xml b/duui-Climate/src/main/python/TypeSystemTopic.xml new file mode 100644 index 00000000..dc052a36 --- /dev/null +++ b/duui-Climate/src/main/python/TypeSystemTopic.xml @@ -0,0 +1,132 @@ + + + + + org.texttechnologylab.annotation.AnnotatorMetaData + + uima.cas.AnnotationBase + + + reference + + uima.cas.TOP + + + name + + uima.cas.String + + + version + + uima.cas.String + + + modelName + + uima.cas.String + + + modelVersion + + uima.cas.String + + + + + uima.tcas.DocumentAnnotation + + uima.tcas.Annotation + + + language + + uima.cas.String + + + + + org.texttechnologylab.annotation.DocumentModification + + uima.cas.AnnotationBase + + + user + + uima.cas.String + + + timestamp + + uima.cas.Long + + + comment + + uima.cas.String + + + + + org.hucompute.textimager.uima.type.Sentiment + + uima.tcas.Annotation + + + sentiment + + uima.cas.Double + + + subjectivity + + uima.cas.Double + + + + + org.hucompute.textimager.uima.type.CategorizedSentiment + + org.hucompute.textimager.uima.type.Sentiment + + + pos + + uima.cas.Double + + + neu + + uima.cas.Double + + + neg + + uima.cas.Double + + + + + 
org.texttechnologylab.annotation.AnnotationComment + + uima.cas.AnnotationBase + + + reference + + uima.cas.TOP + + + value + + uima.cas.String + + + key + + uima.cas.String + + + + + diff --git a/duui-Climate/src/main/python/duui_climate.lua b/duui-Climate/src/main/python/duui_climate.lua new file mode 100644 index 00000000..fcd1740f --- /dev/null +++ b/duui-Climate/src/main/python/duui_climate.lua @@ -0,0 +1,133 @@ +StandardCharsets = luajava.bindClass("java.nio.charset.StandardCharsets") +Class = luajava.bindClass("java.lang.Class") +JCasUtil = luajava.bindClass("org.apache.uima.fit.util.JCasUtil") +TopicUtils = luajava.bindClass("org.texttechnologylab.DockerUnifiedUIMAInterface.lua.DUUILuaUtils") + +function serialize(inputCas, outputStream, parameters) + local doc_lang = inputCas:getDocumentLanguage() + local doc_text = inputCas:getDocumentText() + local doc_len = TopicUtils:getDocumentTextLength(inputCas) + + local selection_types = parameters["selection"] + + local selections = {} + local selections_count = 1 + for selection_type in string.gmatch(selection_types, "([^,]+)") do + local sentences = {} + if selection_type == "text" then + local s = { + text = doc_text, + begin = 0, + ['end'] = doc_len + } + sentences[1] = s + else + local sentences_count = 1 + local clazz = Class:forName(selection_type); + local sentences_it = JCasUtil:select(inputCas, clazz):iterator() + while sentences_it:hasNext() do + local sentence = sentences_it:next() + local s = { + text = sentence:getCoveredText(), + begin = sentence:getBegin(), + ['end'] = sentence:getEnd() + } + sentences[sentences_count] = s + sentences_count = sentences_count + 1 + end + end + + local selection = { + sentences = sentences, + selection = selection_type + } + selections[selections_count] = selection + selections_count = selections_count + 1 + end + + outputStream:write(json.encode({ + selections = selections, + lang = doc_lang, + doc_len = doc_len + })) +end + +function deserialize(inputCas, 
inputStream) + local inputString = luajava.newInstance("java.lang.String", inputStream:readAllBytes(), StandardCharsets.UTF_8) + local results = json.decode(inputString) + if results["modification_meta"] ~= nil and results["meta"] ~= nil and results["results"] ~= nil then + -- print("GetInfo") + local source = results["model_source"] + local model_version = results["model_version"] + local model_name = results["model_name"] + local model_lang = results["model_lang"] + -- print("meta") + local modification_meta = results["modification_meta"] + local modification_anno = luajava.newInstance("org.texttechnologylab.annotation.DocumentModification", inputCas) + modification_anno:setUser(modification_meta["user"]) + modification_anno:setTimestamp(modification_meta["timestamp"]) + modification_anno:setComment(modification_meta["comment"]) + modification_anno:addToIndexes() + + -- print("setMetaData") + local model_meta = luajava.newInstance("org.texttechnologylab.annotation.model.MetaData", inputCas) + model_meta:setModelVersion(model_version) + -- print(model_version) + model_meta:setModelName(model_name) + -- print(model_name) + model_meta:setSource(source) + -- print(source) + model_meta:setLang(model_lang) + -- print(model_lang) + model_meta:addToIndexes() + + local meta = results["meta"] + -- print("meta") + local begin_climate = results["begin"] + -- print("begin_emo") + local end_climate = results["end"] + -- print("end_emo") + local res_out = results["results"] +-- print("results") + local res_len = results["len_results"] + -- print("Len_results") + local factors = results["factors"] + local maptype = results["model_type"] +-- print(factors) + for index_i, res in ipairs(res_out) do + -- print(res) + local begin_climate_i = begin_climate[index_i] + -- print(begin_climate_i) + local end_climate_i = end_climate[index_i] + -- print(end_climate_i) + local len_i = res_len[index_i] + -- print(len_i) + -- print(type(len_i)) + local climate_i = 
luajava.newInstance("org.texttechnologylab.annotation.Climate", inputCas, begin_climate_i, end_climate_i) + -- print(climate_i) + local fsarray = luajava.newInstance("org.apache.uima.jcas.cas.FSArray", inputCas, len_i) + -- print(fsarray) + climate_i:setClimates(fsarray) + local counter = 0 + local factor_i = factors[index_i] + -- print(factor_i) + for index_j, climate_j in ipairs(res) do + -- print(climate_j) + local factor_j = factor_i[index_j] + -- print(factor_j) + climate_in_i = luajava.newInstance("org.texttechnologylab.annotation.AnnotationComment", inputCas) + climate_in_i:setReference(climate_i) + climate_in_i:setKey(climate_j) + climate_in_i:setValue(factor_j) + climate_in_i:addToIndexes() + climate_i:setClimates(counter, climate_in_i) + counter = counter + 1 + end + climate_i:setModel(model_meta) + climate_i:setClimateType(maptype) + climate_i:addToIndexes() + -- print("add") + end + end + -- print("end") + end diff --git a/duui-Climate/src/main/python/duui_climate.py b/duui-Climate/src/main/python/duui_climate.py new file mode 100644 index 00000000..1fd9390c --- /dev/null +++ b/duui-Climate/src/main/python/duui_climate.py @@ -0,0 +1,286 @@ +from pydantic import BaseModel +from pydantic_settings import BaseSettings +from typing import List, Optional, Dict, Union +import logging +from time import time +from fastapi import FastAPI, Response +from cassis import load_typesystem +import torch +from threading import Lock +from functools import lru_cache +from Climate import ClimateBert,model_name_map +# from sp_correction import SentenceBestPrediction + +# Settings +# These are automatically loaded from env variables +from starlette.responses import PlainTextResponse + +model_lock = Lock() + + +class UimaSentence(BaseModel): + text: str + begin: int + end: int + + +class UimaSentenceSelection(BaseModel): + selection: str + sentences: List[UimaSentence] + + +class Settings(BaseSettings): + # Name of this annotator + annotator_name: str + # Version of this 
annotator + annotator_version: str + # Log level + log_level: str + # model_name + model_name: str + # Name of this annotator + model_version: str + #cach_size + model_cache_size: int + # url of the model + model_source: str + # language of the model + model_lang: str + + +# Load settings from env vars +settings = Settings() +lru_cache_with_size = lru_cache(maxsize=settings.model_cache_size) +logging.basicConfig(level=settings.log_level) +logger = logging.getLogger(__name__) + +device = "cuda:0" if torch.cuda.is_available() else "cpu" +# device = "cpu" +logger.info(f'USING {device}') +# Load the predefined typesystem that is needed for this annotator to work +typesystem_filename = 'TypeSystemTopic.xml' +logger.debug("Loading typesystem from \"%s\"", typesystem_filename) +with open(typesystem_filename, 'rb') as f: + typesystem = load_typesystem(f) + logger.debug("Base typesystem:") + logger.debug(typesystem.to_xml()) + +# Load the Lua communication script +lua_communication_script_filename = "duui_climate.lua" +logger.debug("Loading Lua communication script from \"%s\"", lua_communication_script_filename) + + +# Request sent by DUUI +# Note, this is transformed by the Lua script +class DUUIRequest(BaseModel): + # The texts language + doc_len: int + # + lang: str + # + selections: List[UimaSentenceSelection] + # + + +# UIMA type: mark modification of the document +class DocumentModification(BaseModel): + user: str + timestamp: int + comment: str + + +# UIMA type: adds metadata to each annotation +class AnnotationMeta(BaseModel): + name: str + version: str + modelName: str + modelVersion: str + + +# Response sent by DUUI +# Note, this is transformed by the Lua script +class DUUIResponse(BaseModel): + # Symspelloutput + # List of Sentence with every token + # Every token is a dictionary with following Infos: + # Symspelloutput right if the token is correct, wrong if the token is incorrect, skipped if the token was skipped, unkownn if token can corrected with Symspell + 
# If token is unknown it will be predicted with BERT Three output pos: + # 1. Best Prediction with BERT MASKED + # 2. Best Cos-sim with Sentence-Bert and with predicted words of BERT MASK + # 3. Option 1 and 2 together + meta: AnnotationMeta + # Modification meta, one per document + modification_meta: DocumentModification + begin: List[int] + end: List[int] + results: List + factors: List + len_results: List[int] + model_name: str + model_version: str + model_source: str + model_lang: str + model_type: str + + +app = FastAPI( + openapi_url="/openapi.json", + docs_url="/api", + redoc_url=None, + title=settings.annotator_name, + description="Climate annotator", + version=settings.annotator_version, + terms_of_service="https://www.texttechnologylab.org/legal_notice/", + contact={ + "name": "TTLab Team", + "url": "https://texttechnologylab.org", + "email": "bagci@em.uni-frankfurt.de", + }, + license_info={ + "name": "AGPL", + "url": "http://www.gnu.org/licenses/agpl-3.0.en.html", + }, +) + +# Read the Lua script once at startup; it is returned verbatim by /v1/communication_layer +with open(lua_communication_script_filename, 'rb') as f: + lua_communication_script = f.read().decode("utf-8") +logger.debug("Lua communication script:") +logger.debug(lua_communication_script) + + +# Get typesystem of this annotator +@app.get("/v1/typesystem") +def get_typesystem() -> Response: + # TODO remove cassis dependency, as only needed for typesystem at the moment? 
+ xml = typesystem.to_xml() + xml_content = xml.encode("utf-8") + + return Response( + content=xml_content, + media_type="application/xml" + ) + + +# Return Lua communication script +@app.get("/v1/communication_layer", response_class=PlainTextResponse) +def get_communication_layer() -> str: + return lua_communication_script + + +# Return documentation info +@app.get("/v1/documentation") +def get_documentation(): + return "Test" + + +@lru_cache_with_size +def load_model(model_name): + model_i = ClimateBert(model_name, device) + return model_i + + +def fix_unicode_problems(text): + # fix emoji in python string and prevent json error on response + # File "/usr/local/lib/python3.8/site-packages/starlette/responses.py", line 190, in render + # UnicodeEncodeError: 'utf-8' codec can't encode characters in position xx-yy: surrogates not allowed + clean_text = text.encode('utf-16', 'surrogatepass').decode('utf-16', 'surrogateescape') + return clean_text + + +def process_selection(model_name, selection): + begin = [] + end = [] + results_out = [] + factors = [] + len_results = [] + for s in selection.sentences: + s.text = fix_unicode_problems(s.text) + + texts = [ + s.text + for s in selection.sentences + ] + logger.debug("Preprocessed texts:") + logger.debug(texts) + model_map = "others" + + with model_lock: + if model_name in model_name_map: + model_map = model_name_map[model_name] + classifier = load_model(model_name) + + results = classifier.prediction(texts) + for c, res in enumerate(results): + res_i = [] + factor_i = [] + sentence_i = selection.sentences[c] + begin_i = sentence_i.begin + end_i = sentence_i.end + len_rel = len(res) + begin.append(begin_i) + end.append(end_i) + for i in res: + res_i.append(i) + factor_i.append(res[i]) + len_results.append(len_rel) + results_out.append(res_i) + factors.append(factor_i) + output = { + "begin": begin, + "end": end, + "len_results": len_results, + "results": results_out, + "factors": factors, + "model_type": model_map + } + 
+ return output + + +# Process request from DUUI +@app.post("/v1/process") +def post_process(request: DUUIRequest): + # Return data + meta = None + begin = [] + end = [] + len_results = [] + results = [] + factors = [] + model_type = "others" + # Save modification start time for later + modification_timestamp_seconds = int(time()) + try: + model_source = settings.model_source + model_lang = settings.model_lang + model_version = settings.model_version + # set meta Informations + meta = AnnotationMeta( + name=settings.annotator_name, + version=settings.annotator_version, + modelName=settings.model_name, + modelVersion=model_version, + ) + if settings.model_name in model_name_map: + model_type = model_name_map[settings.model_name] + # Add modification info + modification_meta_comment = f"{settings.annotator_name} ({settings.annotator_version}))" + modification_meta = DocumentModification( + user=settings.annotator_name, + timestamp=modification_timestamp_seconds, + comment=modification_meta_comment + ) + mv = "" + + for selection in request.selections: + processed_sentences = process_selection(settings.model_name, selection) + begin = begin + processed_sentences["begin"] + end = end + processed_sentences["end"] + len_results = len_results + processed_sentences["len_results"] + results = results + processed_sentences["results"] + factors = factors + processed_sentences["factors"] + except Exception as ex: + logger.exception(ex) + return DUUIResponse(meta=meta, modification_meta=modification_meta, begin=begin, end=end, results=results, + len_results=len_results, factors=factors, model_name=settings.model_name, + model_version=model_version, model_source=model_source, model_lang=model_lang, model_type=model_type) diff --git a/duui-Climate/src/test/java/org/hucompute/textimager/uima/climate/ClimateTest.java b/duui-Climate/src/test/java/org/hucompute/textimager/uima/climate/ClimateTest.java new file mode 100644 index 00000000..23db65e7 --- /dev/null +++ 
b/duui-Climate/src/test/java/org/hucompute/textimager/uima/climate/ClimateTest.java @@ -0,0 +1,183 @@ +package org.hucompute.textimager.uima.climate; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import org.apache.commons.compress.compressors.CompressorException; +import org.apache.uima.UIMAException; +import org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.cas.FSArray; +import org.apache.uima.util.XmlCasSerializer; +import org.junit.jupiter.api.*; +import org.texttechnologylab.DockerUnifiedUIMAInterface.DUUIComposer; +import org.texttechnologylab.DockerUnifiedUIMAInterface.driver.DUUIRemoteDriver; +import org.texttechnologylab.DockerUnifiedUIMAInterface.lua.DUUILuaContext; +import org.xml.sax.SAXException; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.net.URISyntaxException; +import java.net.UnknownHostException; +import java.nio.charset.StandardCharsets; +import java.util.*; + +import static org.junit.jupiter.api.Assertions.assertArrayEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; + +import org.texttechnologylab.annotation.Climate; +import org.texttechnologylab.annotation.AnnotationComment; + +public class ClimateTest { + static DUUIComposer composer; + static JCas cas; + + static String url = "http://127.0.0.1:9714"; +// static String url = "http://tweentopic.service.component.duui.texttechnologylab.org"; +// static String model = "chkla/parlbert-topic-german"; + + @BeforeAll + static void beforeAll() throws URISyntaxException, IOException, UIMAException, SAXException, CompressorException { + composer = new DUUIComposer() + .withSkipVerification(true) + .withLuaContext(new DUUILuaContext().withJsonLibrary()); + + DUUIRemoteDriver remoteDriver = new DUUIRemoteDriver(); + composer.addDriver(remoteDriver); +// DUUIDockerDriver docker_driver = new DUUIDockerDriver(); +// 
composer.addDriver(docker_driver); + + + cas = JCasFactory.createJCas(); + } + + @AfterAll + static void afterAll() throws UnknownHostException { + composer.shutdown(); + } + + @AfterEach + public void afterEach() throws IOException, SAXException { + composer.resetPipeline(); + + ByteArrayOutputStream stream = new ByteArrayOutputStream(); + XmlCasSerializer.serialize(cas.getCas(), null, stream); + System.out.println(stream.toString(StandardCharsets.UTF_8)); + + cas.reset(); + } + + public void createCas(String language, List sentences) throws UIMAException { + cas.setDocumentLanguage(language); + + StringBuilder sb = new StringBuilder(); + for (String sentence : sentences) { + Sentence sentenceAnnotation = new Sentence(cas, sb.length(), sb.length()+sentence.length()); + sentenceAnnotation.addToIndexes(); + sb.append(sentence).append(" "); + } + + cas.setDocumentText(sb.toString()); + } + + @Test + public void DeTest() throws Exception { + HashMap> expected1 = new HashMap<>(); + ArrayList expected2 = new ArrayList<>(); + expected2.add("Domestic"); + expected2.add("Technology"); + expected1.put("test", expected2); + composer.add( + new DUUIRemoteDriver.Component(url) + .withParameter("selection", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence") + ); + + List sentences = Arrays.asList( + "Ich bin ein Profi-Fußballspieler und spiele bei FC Barcelona in Spanien.", + "Das sind die Aktuellen Neuigkeiten aus den USA. Joe Biden hat die Wahl gewonnen." 
+ ); + + createCas("de", sentences); + composer.run(cas); + + Collection all_climates = JCasUtil.select(cas, Climate.class); + ArrayList> expected = new ArrayList>(); + for (Climate climate: all_climates){ + System.out.println(climate.getCoveredText()); + Map climates = new HashMap(); + FSArray climates_all = climate.getClimates(); + for (AnnotationComment comment_i: climates_all){ + climates.put(comment_i.getKey(), Float.parseFloat(comment_i.getValue())); + System.out.println("key:"+comment_i.getKey()+"; Value:"+comment_i.getValue()); + } + expected.add(climates); + } + + for (Map topic: expected){ + // highest value + String key = Collections.max(topic.entrySet(), Map.Entry.comparingByValue()).getKey(); + Assertions.assertEquals(expected1.get("test").get(expected.indexOf(topic)), key); + } + } + + @Test + public void EnTest() throws Exception { + composer.add( + new DUUIRemoteDriver.Component(url) + .withParameter("selection", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence") + ); + + List sentences = Arrays.asList( + "I will guide through the Labyrinth. First you need to find the entrance. Then you need to find the exit.", + "These are the latest news from the USA. Joe Biden has won the election." 
+ ); + + createCas("de", sentences); + composer.run(cas); + + Collection all_climates = JCasUtil.select(cas, Climate.class); + ArrayList> expected = new ArrayList>(); + for (Climate climate: all_climates){ + System.out.println(climate.getCoveredText()); + Map climates = new HashMap(); + String model_name = climate.getModel().getModelName(); + String type_name = climate.getClimateType(); + System.out.println(model_name); + System.out.println(type_name); + FSArray climates_all = climate.getClimates(); + for (AnnotationComment comment_i: climates_all){ + climates.put(comment_i.getKey(), Float.parseFloat(comment_i.getValue())); + System.out.println("key:"+comment_i.getKey()+"; Value:"+comment_i.getValue()); + } + expected.add(climates); + } + +// HashMap> expected = new HashMap<>(); +// Collection topics = JCasUtil.select(cas, CategoryCoveredTagged.class); +//// System.out.println(topics.size()); +// for (CategoryCoveredTagged topic: topics){ +// int start = topic.getBegin(); +// int end = topic.getEnd(); +// String coveredText = topic.getCoveredText(); +// String value = topic.getValue(); +// double score = topic.getScore(); +// String key1 = start + "_" + end; +// HashMap value1 = new HashMap<>(); +// value1.put(value, score); +// if (expected.containsKey(key1)){ +// expected.get(key1).put(value, score); +// } else { +// expected.put(key1, value1); +// } +// } +// HashMap expected1 = new HashMap<>(); +// expected1.put("0_104", "Instruction"); +// expected1.put("105_176", "News"); +// for (Map.Entry> entry: expected.entrySet()){ +// String key = Collections.max(entry.getValue().entrySet(), Map.Entry.comparingByValue()).getKey(); +// // compare the expected with same index in the actual +// String expectedValue = expected1.get(entry.getKey()); +// assertEquals(expectedValue, key); +// } + + } +} diff --git a/duui-Coreference/.dockerignore b/duui-Coreference/.dockerignore new file mode 100644 index 00000000..caab0c36 --- /dev/null +++ b/duui-Coreference/.dockerignore @@ 
# Build and tag the duui-coreference image for one language/variant combination.
# Run from the duui-Coreference directory (the Dockerfile path and build context are relative).

# Stop on the first failing command or unset variable, so a failed build is never tagged.
set -eu

export ANNOTATOR_NAME=duui-coreference
export ANNOTATOR_VERSION=0.2.0
export LOG_LEVEL=INFO
export MODEL_CACHE_SIZE=3

##---------------------------------------------------------------------
# Model selection. MODEL_NAME, MODEL_SPECNAME, MODEL_VERSION and MODEL_SOURCE
# are the same for every build; only MODEL_LANG and MODEL_VARIANT change.
#   MODEL_LANG:    en | de | fr | pl
#   MODEL_VARIANT: sm | lg
export MODEL_NAME="coreferee"
export MODEL_SPECNAME="coreferee"
export MODEL_VERSION="3ee6f2781e54988d6c3593c6b8f37cc3bae8f982"
export MODEL_SOURCE="https://github.com/richardpaulhudson/coreferee"
export MODEL_LANG="de"
export MODEL_VARIANT="sm"
##---------------------------------------------------------------------

export DOCKER_REGISTRY="docker.texttechnologylab.org/"
# Leave empty for the CPU image; set to "-cuda" to build from Dockerfile-cuda.
export DUUI_CUDA=
#export DUUI_CUDA="-cuda"

# Image reference without tag, shared by the versioned and the "latest" tag.
IMAGE="${DOCKER_REGISTRY}${ANNOTATOR_NAME}-${MODEL_SPECNAME}-${MODEL_LANG}-${MODEL_VARIANT}"

# --build-arg NAME without a value forwards the exported environment variable.
docker build \
  --build-arg ANNOTATOR_NAME \
  --build-arg ANNOTATOR_VERSION \
  --build-arg LOG_LEVEL \
  --build-arg MODEL_CACHE_SIZE \
  --build-arg MODEL_NAME \
  --build-arg MODEL_VERSION \
  --build-arg MODEL_SOURCE \
  --build-arg MODEL_LANG \
  --build-arg MODEL_VARIANT \
  -t "${IMAGE}:${ANNOTATOR_VERSION}${DUUI_CUDA}" \
  -f "src/main/docker/Dockerfile${DUUI_CUDA}" \
  .

docker tag \
  "${IMAGE}:${ANNOTATOR_VERSION}${DUUI_CUDA}" \
  "${IMAGE}:latest${DUUI_CUDA}"
Alexander Mehler + mehler@em.uni-frankfurt.de + https://www.texttechnologylab.org/team/alexander-abrami/ + Goethe University Frankfurt / Texttechnology Lab + https://www.texttechnologylab.org + + head of department + + + + bagci + Mevlüt Bagci + bagci@em.uni-frankfurt.de + https://www.texttechnologylab.org/team/mevl%c3%bct-bagci/ + Goethe University Frankfurt / Texttechnology Lab + https://www.texttechnologylab.org + + lead developer + + Europe/Berlin + + + + + + + org.apache.maven.plugins + maven-surefire-plugin + 2.22.0 + + + --illegal-access=permit + --add-opens java.base/java.util=ALL-UNNAMED + + + + + + + + + + 17 + 17 + 2.4.0 + + 2789ba29fa1f236b64b0402315ffe1cf5d81b654 + + + + + jitpack.io + https://jitpack.io + + + + + + + org.dkpro.core + dkpro-core-asl + ${dkpro.core.version} + pom + import + + + + + + + + com.github.texttechnologylab + DockerUnifiedUIMAInterface + 7cef2433b5 + + + + + + + + + com.github.texttechnologylab + UIMATypeSystem + 3.0.14 + + + + + + + + + + + + + + + + org.junit.jupiter + junit-jupiter + 5.9.0 + test + + + + org.dkpro.core + dkpro-core-api-segmentation-asl + test + + + + org.dkpro.core + dkpro-core-io-xmi-asl + test + + + + org.dkpro.core + dkpro-core-api-resources-asl + test + + + \ No newline at end of file diff --git a/duui-Coreference/requirements.txt b/duui-Coreference/requirements.txt new file mode 100644 index 00000000..bb516d09 --- /dev/null +++ b/duui-Coreference/requirements.txt @@ -0,0 +1,9 @@ +spacy==3.2.0 +coreferee==1.4.1 +numpy==1.26.4 +setuptools<70 +pydantic>=1.7.4,<1.11.0 +regex==2023.12.25 +fastapi==0.110.0 +uvicorn[standard]==0.27.1 +dkpro-cassis==0.9.1 \ No newline at end of file diff --git a/duui-Coreference/src/.dockerignore b/duui-Coreference/src/.dockerignore new file mode 100644 index 00000000..caab0c36 --- /dev/null +++ b/duui-Coreference/src/.dockerignore @@ -0,0 +1,3 @@ +.idea/ +target/ +venv/ \ No newline at end of file diff --git a/duui-Coreference/src/main/docker/Dockerfile 
b/duui-Coreference/src/main/docker/Dockerfile new file mode 100644 index 00000000..f6dd18c9 --- /dev/null +++ b/duui-Coreference/src/main/docker/Dockerfile @@ -0,0 +1,80 @@ +FROM python:3.10 + +WORKDIR /usr/src/app + +EXPOSE 9714 + +# dependencies +COPY ./requirements.txt ./requirements.txt +RUN pip install -r requirements.txt + +RUN python -m pip install --no-cache-dir \ + "spacy==3.2.0" \ + "coreferee" \ + "en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl" \ + "de-core-news-sm @ https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.2.0/de_core_news_sm-3.2.0-py3-none-any.whl" \ + "fr-core-news-sm @ https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.2.0/fr_core_news_sm-3.2.0-py3-none-any.whl" \ + "pl-core-news-md @ https://github.com/explosion/spacy-models/releases/download/pl_core_news_md-3.2.0/pl_core_news_md-3.2.0-py3-none-any.whl" && \ + python -m spacy validate && \ + python -m coreferee install en && \ + python -m coreferee install de && \ + python -m coreferee install fr && \ + python -m coreferee install pl + + +#RUN python -m pip install --no-cache-dir \ +# "spacy==3.2.0" \ +# "coreferee" \ +# "en-core-web-lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.2.0/en_core_web_lg-3.2.0-py3-none-any.whl" \ +# "de-core-news-lg @ https://github.com/explosion/spacy-models/releases/download/de_core_news_lg-3.2.0/de_core_news_lg-3.2.0-py3-none-any.whl" \ +# "fr-core-news-lg @ https://github.com/explosion/spacy-models/releases/download/fr_core_news_lg-3.2.0/fr_core_news_lg-3.2.0-py3-none-any.whl" \ +# "pl-core-news-lg @ https://github.com/explosion/spacy-models/releases/download/pl_core_news_lg-3.2.0/pl_core_news_lg-3.2.0-py3-none-any.whl" && \ +# python -m spacy validate && \ +# python -m coreferee install en && \ +# python -m coreferee install de && \ +# python -m coreferee install fr && \ +# python 
# CUDA variant of the image, selected by docker_build.sh when DUUI_CUDA="-cuda".
#
# NOTE(review): this file appears to be copied from the fact-checking component and
# not yet adapted to duui-Coreference - confirm before building with it:
#   - It COPYs TypeSystemFactChecking.xml, scorer.py, evaluator.py, utils.py,
#     factchecker.py, duui_fact.lua and duui_fact.py, while the CPU Dockerfile of this
#     module copies TypeSystemCoreference.xml, duui_coreference.py,
#     Coreferee_resolver.py and duui_coreference.lua.
#   - The entrypoint starts "duui_fact:app"; the CPU Dockerfile starts "duui_coreference:app".
#   - It declares FACT_* build arguments, but docker_build.sh passes ANNOTATOR_NAME,
#     ANNOTATOR_VERSION, LOG_LEVEL, MODEL_CACHE_SIZE, MODEL_NAME, MODEL_VERSION,
#     MODEL_SOURCE, MODEL_LANG and MODEL_VARIANT.
#   - It copies "./reqiurements.txt"; the file added for this module is "requirements.txt".
#   - No spaCy models and no coreferee language data are installed here, unlike in
#     the CPU Dockerfile.
FROM nvidia/cuda:11.0.3-base-ubuntu20.04

# System packages plus Python from the deadsnakes PPA.
RUN apt update && \
    DEBIAN_FRONTEND=noninteractive \
    apt install --no-install-recommends -y build-essential software-properties-common && \
    add-apt-repository -y ppa:deadsnakes/ppa && \
    apt install --no-install-recommends -y python3.8 python3-pip python3-setuptools python3-distutils && \
    apt clean && rm -rf /var/lib/apt/lists/*

# Make "python" resolve to python3 and bring pip up to date.
RUN ln -s /usr/bin/python3 /usr/bin/python
RUN python -m pip install --upgrade pip

WORKDIR /usr/src/app

# Port uvicorn listens on (see ENTRYPOINT).
EXPOSE 9714


# meta data
ARG FACT_ANNOTATOR_NAME="duui-Factchecking:app"
ENV FACT_ANNOTATOR_NAME=$FACT_ANNOTATOR_NAME
ARG FACT_ANNOTATOR_VERSION="unset"
ENV FACT_ANNOTATOR_VERSION=$FACT_ANNOTATOR_VERSION

# log level
ARG FACT_LOG_LEVEL="DEBUG"
ENV FACT_LOG_LEVEL=$FACT_LOG_LEVEL

# config
ARG FACT_MODEL_CACHE_SIZE=3
ENV FACT_MODEL_CACHE_SIZE=$FACT_MODEL_CACHE_SIZE

# Model Info
ARG FACT_MODEL_NAME=""
ENV FACT_MODEL_NAME=$FACT_MODEL_NAME
ARG FACT_MODEL_VERSION=0.1
ENV FACT_MODEL_VERSION=$FACT_MODEL_VERSION

# service script
COPY ./src/main/python/TypeSystemFactChecking.xml ./TypeSystemFactChecking.xml
COPY ./src/main/python/scorer.py ./scorer.py
COPY ./src/main/python/evaluator.py ./evaluator.py
COPY ./src/main/python/utils.py ./utils.py
COPY ./src/main/python/factchecker.py ./factchecker.py
COPY ./src/main/python/duui_fact.lua ./duui_fact.lua
COPY ./src/main/python/duui_fact.py ./duui_fact.py
COPY ./reqiurements.txt ./reqiurements.txt

# Python dependencies; torch is installed from the CUDA 11.8 wheel index.
RUN pip install torch==2.2.0 torchvision==0.17.0 torchaudio==2.2.0 --index-url https://download.pytorch.org/whl/cu118
RUN pip install -r reqiurements.txt
RUN python -m nltk.downloader punkt
# Runs get_evaluator once at build time (presumably to pre-download the model - confirm).
RUN python -c "from evaluator import get_evaluator; get_evaluator('fact', device='cpu')"
#RUN python -c "from nubia_score import Nubia; nubia = Nubia()"


# Arguments given to "docker run" replace CMD and are appended to ENTRYPOINT.
ENTRYPOINT ["uvicorn", "duui_fact:app", "--host", "0.0.0.0", "--port" ,"9714"]
CMD ["--workers", "1"]
class CorefereeResolver:
    """
    Coreferee wrapper with one fixed language per instance.

    Input:
        - tokens: list[str]
        - begins: list[int]
        - ends: list[int]

    Output:
        {
            "begin": [...],
            "end": [...],
            "begin_resolve": [...],
            "end_resolve": [...],
            "token": [...],
            "token_resolve": [...],
        }

    The language is set once during initialization.
    Runtime language switching is intentionally not supported.
    """

    # variant -> language code -> spaCy model name
    DEFAULT_MODELS = {
        "sm": {
            "en": "en_core_web_sm",
            "de": "de_core_news_sm",
            "fr": "fr_core_news_sm",
            "pl": "pl_core_news_md",
        },
        "lg": {
            "en": "en_core_web_lg",
            "de": "de_core_news_lg",
            "fr": "fr_core_news_lg",
            "pl": "pl_core_news_lg",
        },
    }

    # accepted spelling -> language code
    LANG_ALIASES = {
        "en": "en",
        "english": "en",
        "englisch": "en",

        "de": "de",
        "german": "de",
        "deutsch": "de",

        "fr": "fr",
        "french": "fr",
        "französisch": "fr",
        "franzoesisch": "fr",

        "pl": "pl",
        "polish": "pl",
        "polnisch": "pl",
    }

    def __init__(
        self,
        language: str,
        variant: str,
        model_overrides: Optional[dict[str, str]] = None,
    ):
        """
        Args:
            language:
                Language code or alias (see LANG_ALIASES).

            variant:
                Model size key of the model table, e.g. "sm" or "lg".

            model_overrides:
                Optional replacements for model names. Either
                {language: model_name}, applied to ``variant``, or
                {variant: {language: model_name}}.

        Raises:
            ValueError: If the language is not supported.
            RuntimeError: If the spaCy model or Coreferee cannot be loaded.
        """
        self._language = self._normalize_language(language)
        self.variant = variant
        self.models = self._merge_model_overrides(variant, model_overrides)
        self.nlp = self._load_pipeline()

    @property
    def language(self) -> str:
        return self._language

    @classmethod
    def _merge_model_overrides(cls, variant, model_overrides):
        """
        Return a private copy of DEFAULT_MODELS with the overrides applied.

        The defaults are nested (variant -> language -> model), so a plain
        dict.update() with language keys would add entries that are never
        looked up. Language-keyed overrides are therefore written into the
        table of the selected variant; dict values are merged into the
        variant they name.
        """
        # Copy the inner tables too, so the class-level defaults are never mutated.
        models = {name: dict(table) for name, table in cls.DEFAULT_MODELS.items()}

        for key, value in (model_overrides or {}).items():
            if isinstance(value, dict):
                models.setdefault(key, {}).update(value)
            else:
                lang = cls.LANG_ALIASES.get(str(key).strip().lower(), key)
                models.setdefault(variant, {})[lang] = value

        return models

    def _normalize_language(self, language: str) -> str:
        """Map a language code or alias to its two-letter code."""
        lang = language.strip().lower()

        if lang not in self.LANG_ALIASES:
            # List language codes here; the keys of DEFAULT_MODELS are variants.
            supported = ", ".join(sorted(set(self.LANG_ALIASES.values())))
            raise ValueError(
                f"Unsupported language: {language!r}. "
                f"Supported languages are: {supported}"
            )

        return self.LANG_ALIASES[lang]

    def _load_pipeline(self):
        """Load the spaCy model for this variant/language and ensure Coreferee is in the pipeline."""
        model_name = self.models[self.variant][self.language]

        try:
            nlp = spacy.load(model_name)
        except OSError as exc:
            raise RuntimeError(
                f"spaCy model not found: {model_name!r}\n\n"
                f"Install it with:\n"
                f"python -m spacy download {model_name}\n\n"
                f"Or override the model name, for example:\n"
                f"CorefereeResolver({self.language!r}, {self.variant!r}, "
                f"model_overrides={{{self.language!r}: 'en_core_web_sm'}})"
            ) from exc

        if "coreferee" not in nlp.pipe_names:
            try:
                nlp.add_pipe("coreferee")
            except Exception as exc:
                raise RuntimeError(
                    f"Could not load Coreferee for language {self.language!r}.\n\n"
                    f"Install the Coreferee language data with:\n"
                    f"python -m coreferee install {self.language}"
                ) from exc

        return nlp

    def process_text(self, text: str) -> Doc:
        """
        Process raw text.

        This is optional. If you already have tokens/begins/ends,
        use process_tokens instead.

        For raw text, external offsets are taken from spaCy token offsets.
        """
        if not text or not text.strip():
            raise ValueError("text must not be empty.")

        doc = self.nlp(text)

        doc._.external_token_offsets = [
            {
                "begin": token.idx,
                "end": token.idx + len(token.text),
            }
            for token in doc
        ]

        return doc

    def process_tokens(
        self,
        tokens: list[str],
        begins: list[int],
        ends: list[int],
        spaces: Optional[list[bool]] = None,
    ) -> Doc:
        """
        Process pre-tokenized input with external begin/end offsets.

        Args:
            tokens:
                Separate list of token strings.

            begins:
                Separate list of begin offsets.

            ends:
                Separate list of end offsets.

            spaces:
                Optional whitespace information.
                If None, spaces are inferred from begin/end offsets.

        Important:
            This method does not call self.nlp(" ".join(tokens)),
            because that would let spaCy tokenize the text again.
        """
        self._validate_token_offsets(tokens, begins, ends)

        if spaces is None:
            spaces = self._infer_spaces_from_offsets(begins, ends)

        if len(tokens) != len(spaces):
            raise ValueError(
                "tokens and spaces must have the same length. "
                f"tokens={len(tokens)}, spaces={len(spaces)}"
            )

        doc = Doc(self.nlp.vocab, words=tokens, spaces=spaces)

        doc._.external_token_offsets = [
            {
                "begin": int(begin),
                "end": int(end),
            }
            for begin, end in zip(begins, ends)
        ]

        # Run the pipeline components directly on the prepared Doc (tokenizer is skipped).
        for _, component in self.nlp.pipeline:
            doc = component(doc)

        return doc

    def process(
        self,
        input_data: Union[str, list[str]],
        begins: Optional[list[int]] = None,
        ends: Optional[list[int]] = None,
        spaces: Optional[list[bool]] = None,
    ) -> Doc:
        """
        Generic input processor.

        Supported:
            - str
            - list[str] with begins and ends

        If input_data is list[str], begins and ends are required.
        """
        if isinstance(input_data, str):
            return self.process_text(input_data)

        if isinstance(input_data, list):
            if begins is None or ends is None:
                raise ValueError(
                    "begins and ends are required when input_data is a token list."
                )

            return self.process_tokens(
                tokens=input_data,
                begins=begins,
                ends=ends,
                spaces=spaces,
            )

        raise TypeError("input_data must be either a string or a list of tokens.")

    def get_coreference_dict(
        self,
        doc: Doc,
        include_self: bool = False,
        expand_noun_chunks: bool = True,
    ) -> dict[str, list]:
        """
        Return all detected coreferences as a dictionary with six lists.

        Output:
            {
                "begin": [...],
                "end": [...],
                "begin_resolve": [...],
                "end_resolve": [...],
                "token": [...],
                "token_resolve": [...],
            }

        Meaning:
            begin[i], end[i], token[i]
                The detected mention.

            begin_resolve[i], end_resolve[i], token_resolve[i]
                The resolved mention of the same coreference chain.

        No pronoun list is required.

        Raises:
            RuntimeError: If the Doc carries no external token offsets.
        """
        if doc._.external_token_offsets is None:
            raise RuntimeError(
                "The Doc has no external offsets. "
                "Use process_tokens(tokens, begins, ends) or process_text(text)."
            )

        result: dict[str, list] = {
            "begin": [],
            "end": [],
            "begin_resolve": [],
            "end_resolve": [],
            "token": [],
            "token_resolve": [],
        }

        # (mention begin, mention end, resolved begin, resolved end) already emitted
        seen: set[tuple[int, int, int, int]] = set()

        # Treat a missing chain holder like "no chains found".
        for chain in doc._.coref_chains or ():
            mentions = self._get_chain_mentions(chain)

            if not mentions:
                continue

            representative_index = getattr(
                chain,
                "most_specific_mention_index",
                0,
            )

            # Fall back to the first mention when the index is missing or out of range.
            if representative_index is None:
                representative_index = 0

            if representative_index < 0 or representative_index >= len(mentions):
                representative_index = 0

            representative_mention = mentions[representative_index]

            resolved_span = self._mention_to_external_span(
                doc=doc,
                mention=representative_mention,
                expand_noun_chunks=expand_noun_chunks,
            )

            for mention in mentions:
                mention_span = self._mention_to_external_span(
                    doc=doc,
                    mention=mention,
                    expand_noun_chunks=expand_noun_chunks,
                )

                same_span = (
                    mention_span["begin"] == resolved_span["begin"]
                    and mention_span["end"] == resolved_span["end"]
                )

                if same_span and not include_self:
                    continue

                key = (
                    mention_span["begin"],
                    mention_span["end"],
                    resolved_span["begin"],
                    resolved_span["end"],
                )

                if key in seen:
                    continue

                seen.add(key)

                result["begin"].append(mention_span["begin"])
                result["end"].append(mention_span["end"])
                result["begin_resolve"].append(resolved_span["begin"])
                result["end_resolve"].append(resolved_span["end"])
                result["token"].append(mention_span["text"])
                result["token_resolve"].append(resolved_span["text"])

        return result

    def _get_chain_mentions(self, chain) -> list:
        """
        Return mentions from a Coreferee chain.

        Coreferee chains behave like lists, but some versions also expose
        a .mentions attribute. This helper supports both variants.
        """
        if hasattr(chain, "mentions"):
            return list(chain.mentions)

        return list(chain)

    def _mention_to_external_span(
        self,
        doc: Doc,
        mention,
        expand_noun_chunks: bool,
    ) -> dict[str, Any]:
        """
        Convert a Coreferee mention to external begin/end offsets.

        A Coreferee mention is usually a list of token indices, for example:
            [14]
            [16, 19]

        Returns:
            dict with "begin", "end", "text" and "token_indices".

        Raises:
            ValueError: If the mention contains no token indices.
        """
        token_indices = self._mention_to_token_indices(mention)

        if not token_indices:
            raise ValueError("Coreferee mention does not contain token indices.")

        if expand_noun_chunks:
            token_indices = self._expand_indices_to_noun_chunks(
                doc=doc,
                token_indices=token_indices,
            )

        first_i = min(token_indices)
        last_i = max(token_indices)

        offsets = doc._.external_token_offsets

        begin = offsets[first_i]["begin"]
        end = offsets[last_i]["end"]

        contiguous_indices = list(range(first_i, last_i + 1))
        is_contiguous = token_indices == contiguous_indices

        if is_contiguous:
            text = doc[first_i:last_i + 1].text
        else:
            # Non-contiguous mention: join only the tokens that belong to it.
            text = " ".join(doc[i].text for i in token_indices)

        return {
            "begin": begin,
            "end": end,
            "text": text,
            "token_indices": token_indices,
        }

    def _mention_to_token_indices(self, mention) -> list[int]:
        """
        Normalize a Coreferee mention to a list of token indices.
        """
        if mention is None:
            return []

        if hasattr(mention, "token_indexes"):
            return [int(i) for i in mention.token_indexes]

        if hasattr(mention, "token_indices"):
            return [int(i) for i in mention.token_indices]

        if isinstance(mention, int):
            return [int(mention)]

        return [int(i) for i in mention]

    def _expand_indices_to_noun_chunks(
        self,
        doc: Doc,
        token_indices: list[int],
    ) -> list[int]:
        """
        Expand token indices to their noun chunks if possible.

        Examples:
            token index for "cactus" -> indices for "a cactus"
            token index for "vase"   -> indices for "The vase"
        """
        expanded_indices = set(token_indices)

        try:
            noun_chunks = list(doc.noun_chunks)
        except Exception:
            # Best effort: without noun chunks the mention is returned unexpanded.
            return sorted(expanded_indices)

        for token_index in token_indices:
            for chunk in noun_chunks:
                if chunk.start <= token_index < chunk.end:
                    expanded_indices.update(range(chunk.start, chunk.end))
                    break

        return sorted(expanded_indices)

    @staticmethod
    def _infer_spaces_from_offsets(
        begins: list[int],
        ends: list[int],
    ) -> list[bool]:
        """
        Infer spaCy spaces from begin/end offsets.

        If the next token starts after the current token ends,
        there is whitespace between them. The last token never
        has trailing whitespace.
        """
        last = len(begins) - 1

        return [
            i != last and ends[i] < begins[i + 1]
            for i in range(len(begins))
        ]

    @staticmethod
    def _validate_token_offsets(
        tokens: list[str],
        begins: list[int],
        ends: list[int],
    ) -> None:
        """
        Validate that tokens, begins and ends are aligned.

        Raises:
            ValueError: If the lists are empty or differ in length, a token
                is empty, an offset is negative or inverted, or two
                consecutive tokens overlap.
        """
        if not tokens:
            raise ValueError("tokens must not be empty.")

        if len(tokens) != len(begins) or len(tokens) != len(ends):
            raise ValueError(
                "tokens, begins and ends must have the same length. "
                f"tokens={len(tokens)}, begins={len(begins)}, ends={len(ends)}"
            )

        for i, (token, begin, end) in enumerate(zip(tokens, begins, ends)):
            if not token:
                raise ValueError(f"token must not be empty at index {i}.")

            if begin < 0:
                raise ValueError(f"begin must be >= 0 at index {i}.")

            if end < begin:
                raise ValueError(f"end must be >= begin at index {i}.")

            if i > 0 and begin < ends[i - 1]:
                raise ValueError(
                    f"Token offsets must not overlap. "
                    f"Problem at index {i}: begin={begin}, previous_end={ends[i - 1]}"
                )
+ + + org.texttechnologylab.uima.type.Topic + + + + + + org.texttechnologylab.uima.type.Classification + + + + + + + + + topic + + + + + + uima.cas.String + + + + + + + + + score + + + + + + uima.cas.Double + + + + + + + + + + + + + + + org.texttechnologylab.uima.type.Sentiment + + + + + + org.texttechnologylab.uima.type.Classification + + + + + + + + + sentiment + + + + + + uima.cas.Double + + + + + + + + + subjectivity + + + + + + uima.cas.Double + + + + + + + + + + + + + + + org.texttechnologylab.uima.type.CategorizedSentiment + + + + + + org.texttechnologylab.uima.type.Sentiment + + + + + + + + + pos + + + + + + uima.cas.Double + + + + + + + + + neu + + + + + + uima.cas.Double + + + + + + + + + neg + + + + + + uima.cas.Double + + + + + + + + + + + + + + + org.texttechnologylab.uima.type.StarSentiment + + + + + + org.texttechnologylab.uima.type.Sentiment + + + + + + + + + OneStar + + + + + + uima.cas.Double + + + + + + + + + TwoStars + + + + + + uima.cas.Double + + + + + + + + + ThreeStars + + + + + + uima.cas.Double + + + + + + + + + FourStars + + + + + + uima.cas.Double + + + + + + + + + FiveStars + + + + + + uima.cas.Double + + + + + + + + + + + + org.texttechnologylab.annotation.MetaData + + + + uima.tcas.Annotation + + + + + + Lang + + Language of the method or the Model + + uima.cas.String + + + + + + Source + + Link of the used resource + + uima.cas.String + + + + + + + + + + org.texttechnologylab.annotation.model.MetaData + + + + org.texttechnologylab.annotation.MetaData + + + + + + ModelVersion + + Version of the Model + + uima.cas.String + + + + + + ModelName + + Name of the Model + + uima.cas.String + + + + + + + + + + org.texttechnologylab.annotation.model.SpacyMetaData + + + + org.texttechnologylab.annotation.model.MetaData + + + + + + SpacyVersion + + Spacy Libary Version + + uima.cas.String + + + + + + ModelSpacyGitVersion + + Explicit Spacy git version + + uima.cas.String + + + + + + + + + + 
org.texttechnologylab.annotation.model.HuggingfaceMetaData + + + + org.texttechnologylab.annotation.model.MetaData + + + + + + HuggingfaceVersion + + Transformer Library Version + + uima.cas.String + + + + + + DependeciesVersion + + Dependency Library Version e.g. Pytorch... + + uima.cas.StringArray + + + + + + + + org.texttechnologylab.annotation.Claim + One Claim for different facts + uima.tcas.Annotation + + + value + Information of Claim + uima.cas.String + + + Facts + Set of Fact + uima.cas.FSArray + org.texttechnologylab.annotation.Fact + + + + + org.texttechnologylab.annotation.Fact + One Fact for different claims + uima.tcas.Annotation + + + value + Information for the fact + uima.cas.String + + + Claims + Set of Claims + uima.cas.FSArray + org.texttechnologylab.annotation.Claim + + + + + org.texttechnologylab.annotation.model.FactCheckingMetaData + + org.texttechnologylab.annotation.model.MetaData + + + DependeciesVersion + Dependency Library Version e.g. Pytorch... + uima.cas.StringArray + + + + + org.texttechnologylab.annotation.FactChecking + Does the assertion confirm the statement + uima.tcas.Annotation + + + Fact + + org.texttechnologylab.annotation.Fact + + + Claim + + org.texttechnologylab.annotation.Claim + + + consistency + + uima.cas.Double + + + model + + org.texttechnologylab.annotation.model.MetaData + + + + + \ No newline at end of file diff --git a/duui-Coreference/src/main/python/duui_coreference.lua b/duui-Coreference/src/main/python/duui_coreference.lua new file mode 100644 index 00000000..051b1a53 --- /dev/null +++ b/duui-Coreference/src/main/python/duui_coreference.lua @@ -0,0 +1,144 @@ +-- Bind static classes from java +StandardCharsets = luajava.bindClass("java.nio.charset.StandardCharsets") +Class = luajava.bindClass("java.lang.Class") +JCasUtil = luajava.bindClass("org.apache.uima.fit.util.JCasUtil") +DUUIutils = luajava.bindClass("org.texttechnologylab.DockerUnifiedUIMAInterface.lua.DUUILuaUtils") +Token = 
luajava.bindClass("de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token")
Coreference = luajava.bindClass("org.texttechnologylab.annotation.Coreference")

-- This "serialize" function is called to transform the CAS object into a stream that is sent to the annotator.
-- It collects every Token annotation of the CAS and sends, as one JSON object:
--   tokens      - covered text of each token
--   lang        - document language of the CAS
--   begin_token - begin offset of each token (parallel to tokens)
--   end_token   - end offset of each token (parallel to tokens)
-- Inputs:
--  - inputCas: The actual CAS object to serialize
--  - outputStream: Stream that is sent to the annotator, can be e.g. a string, JSON payload, ...
function serialize(inputCas, outputStream)
    local doc_lang = inputCas:getDocumentLanguage()

    -- Parallel arrays; Lua tables are 1-indexed, so the running index starts at 1.
    local tokens = {}
    local begin_token = {}
    local end_token = {}
    local tokens_count = 1

    -- Copy the selection into an ArrayList to obtain a Java list iterator that Lua can drive.
    local tokens_it = luajava.newInstance("java.util.ArrayList", JCasUtil:select(inputCas, Token)):listIterator()
    while tokens_it:hasNext() do
        local token = tokens_it:next()
        tokens[tokens_count] = token:getCoveredText()
        begin_token[tokens_count] = token:getBegin()
        end_token[tokens_count] = token:getEnd()
        tokens_count = tokens_count + 1
    end

    outputStream:write(json.encode({
        tokens = tokens,
        lang = doc_lang,
        begin_token = begin_token,
        end_token = end_token,
    }))
end

-- This "deserialize" function is called on receiving the results from the annotator that have to be transformed into a CAS object
-- Inputs:
--  - inputCas: The actual CAS object to deserialize into
--  - inputStream: Stream that is received from the annotator, can be e.g. a string, JSON payload, ...
+function deserialize(inputCas, inputStream) + local inputString = luajava.newInstance("java.lang.String", inputStream:readAllBytes(), StandardCharsets.UTF_8) + local results = json.decode(inputString) +-- print("begin_deserialize") + + if results["modification_meta"] ~= nil and results["meta"] ~= nil and results["begin_resolve"] ~= nil then +-- print("GetInfo") + local source = results["model_source"] + local model_version = results["model_version"] + local model_name = results["model_name"] + local model_lang = results["model_lang"] +-- print("meta") + local modification_meta = results["modification_meta"] + local modification_anno = luajava.newInstance("org.texttechnologylab.annotation.DocumentModification", inputCas) + modification_anno:setUser(modification_meta["user"]) + modification_anno:setTimestamp(modification_meta["timestamp"]) + modification_anno:setComment(modification_meta["comment"]) + modification_anno:addToIndexes() + +-- print("setMetaData") + local model_meta = luajava.newInstance("org.texttechnologylab.annotation.model.MetaData", inputCas) + model_meta:setModelVersion(model_version) +-- print(model_version) + model_meta:setModelName(model_name) +-- print(model_name) + model_meta:setSource(source) +-- print(source) + model_meta:setLang(model_lang) +-- print(model_lang) + model_meta:addToIndexes() + + local meta = results["meta"] +-- print("meta") + local begin = results["begin"] + local end_token = results["end"] + local begin_resolve = results["begin_resolve"] + local end_resolve = results["end_resolve"] + for index_i, begin_i in ipairs(begin) do + local end_i = end_token[index_i] + local begin_resolve_i = begin_resolve[index_i] + local end_resolve_i = end_resolve[index_i] + local coref_resolve = JCasUtil:selectAt(inputCas, Coreference, begin_resolve_i, end_resolve_i) +-- print(coref_resolve) + if coref_resolve:size() == 0 then + coref_resolve = luajava.newInstance("org.texttechnologylab.annotation.Coreference", inputCas, begin_resolve_i, 
end_resolve_i) + else + coref_resolve = coref_resolve:iterator():next() + end +-- print(coref_resolve) + local coref_anno = luajava.newInstance("org.texttechnologylab.annotation.Coreference", inputCas, begin_i, end_i) + coref_anno:setLink(coref_resolve) + coref_anno:addToIndexes() + end + +-- local meta = results["meta"] +-- -- print("meta") +-- local begin_claims = results["begin_claims"] +-- -- print("begin_claims") +-- local end_claims = results["end_claims"] +-- -- print("end_claims") +-- local begin_facts = results["begin_facts"] +-- -- print("begin_facts") +-- local end_facts = results["end_facts"] +-- -- print("end_facts") +-- local consistency = results["consistency"] +-- -- print("consistency") +-- for index_i, cons in ipairs(consistency) do +-- -- print(cons) +-- local begin_claim_i = begin_claims[index_i] +-- -- print(begin_claim_i) +-- local end_claim_i = end_claims[index_i] +-- -- print(end_claim_i) +-- local begin_fact_i = begin_facts[index_i] +-- -- print(begin_fact_i) +-- local end_fact_i = end_facts[index_i] +-- -- print(end_fact_i) +-- local claim_i = util:selectAt(inputCas, claims, begin_claim_i, end_claim_i):iterator():next() +-- -- print(claim_i) +-- local fact_i = util:selectAt(inputCas, facts, begin_fact_i, end_fact_i):iterator():next() +-- -- print(fact_i) +-- local factcheck_i = luajava.newInstance("org.texttechnologylab.annotation.FactChecking", inputCas) +-- -- print("FactCheck") +-- factcheck_i:setClaim(claim_i) +-- -- print("claim") +-- factcheck_i:setFact(fact_i) +-- -- print("fact") +-- factcheck_i:setConsistency(cons) +-- -- print("cons") +-- factcheck_i:setModel(model_meta) +-- -- print("setModel") +-- factcheck_i:addToIndexes() +-- -- print(factcheck_i) +-- end + end +end diff --git a/duui-Coreference/src/main/python/duui_coreference.py b/duui-Coreference/src/main/python/duui_coreference.py new file mode 100644 index 00000000..5e4b4ec5 --- /dev/null +++ b/duui-Coreference/src/main/python/duui_coreference.py @@ -0,0 +1,242 @@ +# 
from pydantic import BaseModel +# from pydantic_settings import BaseSettings +from pydantic import BaseModel, BaseSettings +from typing import List, Optional, Dict, Union +import logging +from time import time +from fastapi import FastAPI, Response +from cassis import load_typesystem +from threading import Lock +from functools import lru_cache + +from Coreferee_resolver import CorefereeResolver + +# from Climate import ClimateBert,model_name_map +# from sp_correction import SentenceBestPrediction + +# Settings +# These are automatically loaded from env variables +from starlette.responses import PlainTextResponse + +model_lock = Lock() + + +class UimaSentence(BaseModel): + text: str + begin: int + end: int + + +class UimaSentenceSelection(BaseModel): + selection: str + sentences: List[UimaSentence] + + +class Settings(BaseSettings): + # Name of this annotator + annotator_name: str + # Version of this annotator + annotator_version: str + # Log level + log_level: str + # model_name + model_name: str + # Name of this annotator + model_version: str + #cach_size + model_cache_size: int + # url of the model + model_source: str + # language of the model + model_lang: str + # sm or lg + model_variant: str + + +# Load settings from env vars +settings = Settings() +lru_cache_with_size = lru_cache(maxsize=settings.model_cache_size) +logging.basicConfig(level=settings.log_level) +logger = logging.getLogger(__name__) + + +# Load the predefined typesystem that is needed for this annotator to work +typesystem_filename = 'TypeSystemCoreference.xml' +logger.debug("Loading typesystem from \"%s\"", typesystem_filename) +with open(typesystem_filename, 'rb') as f: + typesystem = load_typesystem(f) + logger.debug("Base typesystem:") + logger.debug(typesystem.to_xml()) + +# Load the Lua communication script +lua_communication_script_filename = "duui_coreference.lua" +logger.debug("Loading Lua communication script from \"%s\"", lua_communication_script_filename) + + +# Request sent by DUUI 
+# Note, this is transformed by the Lua script +class DUUIRequest(BaseModel): + # + tokens: List[str] + # + lang: str + # + begin_token: List[int] + # + end_token: List[int] + + + +# UIMA type: mark modification of the document +class DocumentModification(BaseModel): + user: str + timestamp: int + comment: str + + +# UIMA type: adds metadata to each annotation +class AnnotationMeta(BaseModel): + name: str + version: str + modelName: str + modelVersion: str + + +# Response sent by DUUI +# Note, this is transformed by the Lua script +class DUUIResponse(BaseModel): + # Symspelloutput + # List of Sentence with every token + # Every token is a dictionary with following Infos: + # Symspelloutput right if the token is correct, wrong if the token is incorrect, skipped if the token was skipped, unkownn if token can corrected with Symspell + # If token is unkown it will be predicted with BERT Three output pos: + # 1. Best Prediction with BERT MASKED + # 2. Best Cos-sim with Sentence-Bert and with perdicted words of BERT MASK + # 3. 
Option 1 and 2 together + meta: AnnotationMeta + # Modification meta, one per document + modification_meta: DocumentModification + begin: List[int] + end: List[int] + begin_resolve: List[int] + end_resolve: List[int] + model_name: str + model_version: str + model_source: str + model_lang: str + model_variant: str + + +app = FastAPI( + openapi_url="/openapi.json", + docs_url="/api", + redoc_url=None, + title=settings.annotator_name, + description="Factuality annotator", + version=settings.annotator_version, + terms_of_service="https://www.texttechnologylab.org/legal_notice/", + contact={ + "name": "TTLab Team", + "url": "https://texttechnologylab.org", + "email": "bagci@em.uni-frankfurt.de", + }, + license_info={ + "name": "AGPL", + "url": "http://www.gnu.org/licenses/agpl-3.0.en.html", + }, +) + +with open(lua_communication_script_filename, 'rb') as f: + lua_communication_script = f.read().decode("utf-8") +logger.debug("Lua communication script:") +logger.debug(lua_communication_script_filename) + + +# Get typesystem of this annotator +@app.get("/v1/typesystem") +def get_typesystem() -> Response: + # TODO remove cassis dependency, as only needed for typesystem at the moment? 
+ xml = typesystem.to_xml() + xml_content = xml.encode("utf-8") + + return Response( + content=xml_content, + media_type="application/xml" + ) + + +# Return Lua communication script +@app.get("/v1/communication_layer", response_class=PlainTextResponse) +def get_communication_layer() -> str: + return lua_communication_script + + +# Return documentation info +@app.get("/v1/documentation") +def get_documentation(): + return "Test" + +@lru_cache_with_size +def load_model(language, variant): + model_i = CorefereeResolver(language, variant) + return model_i + + +# @lru_cache_with_size +# def load_model(model_name): +# model_i = ClimateBert(model_name, device) +# return model_i + + +def fix_unicode_problems(text): + # fix emoji in python string and prevent json error on response + # File "/usr/local/lib/python3.8/site-packages/starlette/responses.py", line 190, in render + # UnicodeEncodeError: 'utf-8' codec can't encode characters in position xx-yy: surrogates not allowed + clean_text = text.encode('utf-16', 'surrogatepass').decode('utf-16', 'surrogateescape') + return clean_text + + +# Process request from DUUI +@app.post("/v1/process") +def post_process(request: DUUIRequest): + # Return data + meta = None + begin = [] + end = [] + begin_resolve = [] + end_resolve = [] + # Save modification start time for later + modification_timestamp_seconds = int(time()) + try: + model_source = settings.model_source + model_lang = settings.model_lang + model_version = settings.model_version + # set meta Informations + meta = AnnotationMeta( + name=settings.annotator_name, + version=settings.annotator_version, + modelName=settings.model_name, + modelVersion=model_version, + ) + # Add modification info + modification_meta_comment = f"{settings.annotator_name} ({settings.annotator_version}))" + modification_meta = DocumentModification( + user=settings.annotator_name, + timestamp=modification_timestamp_seconds, + comment=modification_meta_comment + ) + mv = "" + + with model_lock: + 
coreference_resolver = load_model(model_lang, settings.model_variant.lower()) + doc = coreference_resolver.process_tokens(request.tokens, request.begin_token, request.end_token) + result = coreference_resolver.get_coreference_dict(doc, include_self=False, expand_noun_chunks=True) + begin = result["begin"] + end = result["end"] + begin_resolve = result["begin_resolve"] + end_resolve = result["end_resolve"] + + except Exception as ex: + logger.exception(ex) + return DUUIResponse(meta=meta, modification_meta=modification_meta, begin=begin, end=end, begin_resolve=begin_resolve, end_resolve=end_resolve, model_name=settings.model_name, + model_version=model_version, model_source=model_source, model_lang=model_lang, model_variant=settings.model_variant) diff --git a/duui-Coreference/src/test/java/org/hucompute/textimager/uima/Coreference/CoreferenceTest.java b/duui-Coreference/src/test/java/org/hucompute/textimager/uima/Coreference/CoreferenceTest.java new file mode 100644 index 00000000..4fd42bd7 --- /dev/null +++ b/duui-Coreference/src/test/java/org/hucompute/textimager/uima/Coreference/CoreferenceTest.java @@ -0,0 +1,242 @@ +package org.hucompute.textimager.uima.Coreference; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import org.apache.uima.fit.util.JCasUtil; +import org.texttechnologylab.uima.type.spacy.SpacyToken; +import org.texttechnologylab.annotation.Coreference; + +import org.apache.commons.compress.compressors.CompressorException; +import org.apache.uima.UIMAException; +import org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.jcas.JCas; +import org.apache.uima.util.XmlCasSerializer; + +import org.junit.jupiter.api.*; + +import org.texttechnologylab.DockerUnifiedUIMAInterface.DUUIComposer; +import org.texttechnologylab.DockerUnifiedUIMAInterface.driver.DUUIRemoteDriver; +import org.texttechnologylab.DockerUnifiedUIMAInterface.lua.DUUILuaContext; + +import org.xml.sax.SAXException; + +import 
java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.net.URISyntaxException; +import java.net.UnknownHostException; +import java.nio.charset.StandardCharsets; +import java.util.*; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; + +public class CoreferenceTest { + static DUUIComposer composer; + static JCas cas; + + static String url = "http://127.0.0.1:9714"; + + @BeforeAll + static void beforeAll() throws URISyntaxException, IOException, UIMAException, SAXException, CompressorException { + composer = new DUUIComposer() + .withSkipVerification(true) + .withLuaContext(new DUUILuaContext().withJsonLibrary()); + + DUUIRemoteDriver remoteDriver = new DUUIRemoteDriver(); + composer.addDriver(remoteDriver); + + cas = JCasFactory.createJCas(); + } + + @AfterAll + static void afterAll() throws UnknownHostException { + composer.shutdown(); + } + + @AfterEach + public void afterEach() throws IOException, SAXException { + composer.resetPipeline(); + + ByteArrayOutputStream stream = new ByteArrayOutputStream(); + XmlCasSerializer.serialize(cas.getCas(), null, stream); + System.out.println(stream.toString(StandardCharsets.UTF_8)); + + cas.reset(); + } + + public void createCas( + String language, + List tokens, + List begins, + List ends + ) throws UIMAException { + validateInput(tokens, begins, ends); + + cas.setDocumentLanguage(language); + + String documentText = buildDocumentText(tokens, begins, ends); + cas.setDocumentText(documentText); + + addTokens(tokens, begins, ends); + addSentencesFromPunctuation(tokens, begins, ends); + } + + private void addTokens( + List tokens, + List begins, + List ends + ) { + for (int i = 0; i < tokens.size(); i++) { + SpacyToken token = new SpacyToken(cas, begins.get(i), ends.get(i)); + token.addToIndexes(); + } + } + + private void addSentencesFromPunctuation( + List tokens, + List begins, + List ends + ) { + int sentenceBegin = begins.get(0); + 
+ for (int i = 0; i < tokens.size(); i++) { + String token = tokens.get(i); + + if (token.equals(".") || token.equals("!") || token.equals("?")) { + int sentenceEnd = ends.get(i); + + Sentence sentence = new Sentence(cas, sentenceBegin, sentenceEnd); + sentence.addToIndexes(); + + if (i + 1 < tokens.size()) { + sentenceBegin = begins.get(i + 1); + } + } + } + } + + private String buildDocumentText( + List tokens, + List begins, + List ends + ) { + int documentLength = ends.get(ends.size() - 1); + char[] chars = new char[documentLength]; + Arrays.fill(chars, ' '); + + for (int i = 0; i < tokens.size(); i++) { + String token = tokens.get(i); + int begin = begins.get(i); + int end = ends.get(i); + + for (int j = 0; j < token.length(); j++) { + chars[begin + j] = token.charAt(j); + } + } + + return new String(chars); + } + + private void validateInput( + List tokens, + List begins, + List ends + ) { + assertEquals(tokens.size(), begins.size()); + assertEquals(tokens.size(), ends.size()); + + for (int i = 0; i < tokens.size(); i++) { + String token = tokens.get(i); + int begin = begins.get(i); + int end = ends.get(i); + + if (token.length() != end - begin) { + throw new IllegalArgumentException( + "Token length does not match offsets at index " + i + + ": token='" + token + "'" + + ", begin=" + begin + + ", end=" + end + ); + } + + if (i > 0 && begin < ends.get(i - 1)) { + throw new IllegalArgumentException( + "Token offsets overlap at index " + i + ); + } + } + } + + @Test + public void EnTest() throws Exception { + composer.add( + new DUUIRemoteDriver.Component(url) + .withParameter( + "selection", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" + ) + ); + + List tokens = Arrays.asList( + "Anna", "bought", "a", "cactus", ".", + "The", "plant", "needed", "sunlight", ".", + "She", "put", "a", "vase", "on", "the", "table", ".", + "The", "vase", "was", "old", ",", "but", "it", "was", "beautiful", ".", + "The", "cactus", "grew", "quickly", "because", "it", 
"got", "enough", "light", "." + ); + + List begins = Arrays.asList( + 0, 5, 12, 14, 20, + 22, 26, 32, 39, 47, + 49, 53, 57, 59, 64, 67, 71, 76, + 78, 82, 87, 91, 94, 96, 100, 103, 107, 116, + 118, 122, 129, 134, 142, 150, 153, 157, 164, 169 + ); + + List ends = Arrays.asList( + 4, 11, 13, 20, 21, + 25, 31, 38, 47, 48, + 52, 56, 58, 63, 66, 70, 76, 77, + 81, 86, 90, 94, 95, 99, 102, 106, 116, 117, + 121, 128, 133, 141, 149, 152, 156, 163, 169, 170 + ); + + createCas("en", tokens, begins, ends); + + System.out.println("Input document:"); + System.out.println(cas.getDocumentText()); + SpacyToken h = JCasUtil.selectAt(cas, SpacyToken.class, 0, 4).iterator().next(); + composer.run(cas); + + Collection coreferences = JCasUtil.select(cas, Coreference.class); + Map> result = extractCoreferenceResult(); + for (Coreference coreference : coreferences) { + String token = coreference.getCoveredText(); + int begin = coreference.getBegin(); + int end = coreference.getEnd(); + + result.get("token").add(token); + result.get("begin").add(begin); + result.get("end").add(end); + + if (coreference.getLink() != null) { + String token_resolve = coreference.getLink().getCoveredText(); + int begin_resolve = coreference.getLink().getBegin(); + int end_resolve = coreference.getLink().getEnd(); + System.out.println("Coreference: '" + token + "' (begin=" + begin + ", end=" + end + ")" + " -> '" + token_resolve + "' (begin=" + begin_resolve + ", end=" + end_resolve + ")"); + } + + } + } + + private Map> extractCoreferenceResult() { + Map> result = new LinkedHashMap<>(); + + result.put("begin", new ArrayList<>()); + result.put("end", new ArrayList<>()); + result.put("begin_resolve", new ArrayList<>()); + result.put("end_resolve", new ArrayList<>()); + result.put("token", new ArrayList<>()); + result.put("token_resolve", new ArrayList<>()); + return result; + } +} \ No newline at end of file diff --git a/duui-Genre/.dockerignore b/duui-Genre/.dockerignore new file mode 100644 index 
00000000..caab0c36 --- /dev/null +++ b/duui-Genre/.dockerignore @@ -0,0 +1,3 @@ +.idea/ +target/ +venv/ \ No newline at end of file diff --git a/duui-Genre/.gitignore b/duui-Genre/.gitignore new file mode 100644 index 00000000..d2092691 --- /dev/null +++ b/duui-Genre/.gitignore @@ -0,0 +1,3 @@ +.idea/ +target/ +venv*/ \ No newline at end of file diff --git a/duui-Genre/Readme.md b/duui-Genre/Readme.md new file mode 100644 index 00000000..48df6bfe --- /dev/null +++ b/duui-Genre/Readme.md @@ -0,0 +1,90 @@ +[![Version](https://img.shields.io/static/v1?label=duui-genre&message=0.1.0&color=blue)](https://docker.texttechnologylab.org/v2/duui-transformers-topic/tags/list) +[![Version](https://img.shields.io/static/v1?label=Python&message=3.12&color=green)]() +[![Version](https://img.shields.io/static/v1?label=Transformers&message=5.9.0&color=yellow)]() +[![Version](https://img.shields.io/static/v1?label=Torch&message=2.11.0&color=red)]() + +# Transformers Genre + +DUUI implementation for selected Hugging-Face-based transformer [Genre tools](https://huggingface.co/models?sort=trending&search=genre) models. 
+## Included Models + +| Name | Source | Revision | Languages | +|-------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------|--------------------------------|--------------| +| turkunlp-genre-multi | https://huggingface.co/TurkuNLP/web-register-classification-multilingual | a22ad8b652f6825ec1505dab779979e0f255d7ae | Multilingual | +| turkunlp-genre-en | https://huggingface.co/TurkuNLP/web-register-classification-en | 93969151434144dc8505865d31823c79bd385167 | EN | +| turkunlp-genre-finerweb |https://huggingface.co/TurkuNLP/finerweb-quality-classifier| 93d1635105c974a675e3be8c636d7a5cac6f7b11 | EN | +| ssharoff-genre |https://huggingface.co/ssharoff/genres| 93d1635105c974a675e3be8c636d7a5cac6f7b11| EN | +| x-genre-classifier |https://huggingface.co/classla/xlm-roberta-base-multilingual-text-genre-classifier| ebe54ca322f6fd4dc95700705b99f23e3437c8d0 | Multilingual | +# How To Use + +To use duui-genre as a DUUI image, you need the [Docker Unified UIMA Interface (DUUI)](https://github.com/texttechnologylab/DockerUnifiedUIMAInterface). 
+ +## Start Docker container + +``` +docker run --rm -p 9714:9714 docker.texttechnologylab.org/duui-genre-[modelname]:latest + +``` + +Find all available image tags here: [https://docker.texttechnologylab.org/v2/duui-genre-[modelname]/tags/list](https://docker.texttechnologylab.org/v2/duui-genre-[modelname]/tags/list) + +## Run within DUUI + +``` +composer.add( + new DUUIDockerDriver.Component("docker.texttechnologylab.org/duui-genre-[modelname]:latest") + .withParameter("selection", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence") +); +``` + +### Parameters + +| Name | Description | +| ---- | ----------- | +| `selection` | Use `text` to process the full document text or any selectable UIMA type class name | + +# Cite + +If you use the DUUI image, please cite it as follows: + +Alexander Leonhardt, Giuseppe Abrami, Daniel Baumartz and Alexander Mehler. (2023). "Unlocking the Heterogeneous Landscape of Big Data NLP with DUUI." Findings of the Association for Computational Linguistics: EMNLP 2023, 385–399. [[LINK](https://aclanthology.org/2023.findings-emnlp.29)] [[PDF](https://aclanthology.org/2023.findings-emnlp.29.pdf)] + +## BibTeX + +``` +@inproceedings{Leonhardt:et:al:2023, + title = {Unlocking the Heterogeneous Landscape of Big Data {NLP} with {DUUI}}, + author = {Leonhardt, Alexander and Abrami, Giuseppe and Baumartz, Daniel and Mehler, Alexander}, + editor = {Bouamor, Houda and Pino, Juan and Bali, Kalika}, + booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2023}, + year = {2023}, + address = {Singapore}, + publisher = {Association for Computational Linguistics}, + url = {https://aclanthology.org/2023.findings-emnlp.29}, + pages = {385--399}, + pdf = {https://aclanthology.org/2023.findings-emnlp.29.pdf}, + abstract = {Automatic analysis of large corpora is a complex task, especially + in terms of time efficiency. 
This complexity is increased by the + fact that flexible, extensible text analysis requires the continuous + integration of ever new tools. Since there are no adequate frameworks + for these purposes in the field of NLP, and especially in the + context of UIMA, that are not outdated or unusable for security + reasons, we present a new approach to address the latter task: + Docker Unified UIMA Interface (DUUI), a scalable, flexible, lightweight, + and feature-rich framework for automatic distributed analysis + of text corpora that leverages Big Data experience and virtualization + with Docker. We evaluate DUUI{'}s communication approach against + a state-of-the-art approach and demonstrate its outstanding behavior + in terms of time efficiency, enabling the analysis of big text + data.} +} + +@misc{Bagci:2024, + author = {Bagci, Mevlüt}, + title = {Hugging-Face-based genre models as {DUUI} component}, + year = {2024}, + howpublished = {https://github.com/texttechnologylab/duui-uima/tree/main/duui-Genre} +} + +``` diff --git a/duui-Genre/docker_build.sh b/duui-Genre/docker_build.sh new file mode 100644 index 00000000..558615b0 --- /dev/null +++ b/duui-Genre/docker_build.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash +set -euo pipefail + +export ANNOTATOR_CUDA= +#export ANNOTATOR_CUDA="-cuda" + +export ANNOTATOR_NAME=duui-genre +export ANNOTATOR_VERSION=0.1.0 +export LOG_LEVEL=DEBUG +export MODEL_CACHE_SIZE=3 +export DOCKER_REGISTRY="docker.texttechnologylab.org/" + +###--------------------------------------------------------------------- +#export MODEL_NAME="TurkuNLP/web-register-classification-multilingual" +#export MODEL_SPECNAME="turkunlp-genre-multi" +#export MODEL_VERSION="a22ad8b652f6825ec1505dab779979e0f255d7ae" +#export MODEL_SOURCE="https://huggingface.co/TurkuNLP/web-register-classification-multilingual" +#export MODEL_LANG="Multi" +###-------------------------------------------------------------------- + 
+###--------------------------------------------------------------------- +#export MODEL_NAME="TurkuNLP/web-register-classification-en" +#export MODEL_SPECNAME="turkunlp-genre-en" +#export MODEL_VERSION="93969151434144dc8505865d31823c79bd385167" +#export MODEL_SOURCE="https://huggingface.co/TurkuNLP/web-register-classification-en" +#export MODEL_LANG="EN" +###-------------------------------------------------------------------- + +###--------------------------------------------------------------------- +#export MODEL_NAME="TurkuNLP/finerweb-quality-classifier" +#export MODEL_SPECNAME="turkunlp-genre-finerweb" +#export MODEL_VERSION="93d1635105c974a675e3be8c636d7a5cac6f7b11" +#export MODEL_SOURCE="https://huggingface.co/TurkuNLP/finerweb-quality-classifier" +#export MODEL_LANG="EN" +###-------------------------------------------------------------------- + +###--------------------------------------------------------------------- +#export MODEL_NAME="ssharoff/genres" +#export MODEL_SPECNAME="ssharoff-genre" +#export MODEL_VERSION="93d1635105c974a675e3be8c636d7a5cac6f7b11" +#export MODEL_SOURCE="https://huggingface.co/ssharoff/genres" +#export MODEL_LANG="EN" +###-------------------------------------------------------------------- + +##--------------------------------------------------------------------- +export MODEL_NAME="classla/xlm-roberta-base-multilingual-text-genre-classifier" +export MODEL_SPECNAME="x-genre-classifier" +export MODEL_VERSION="ebe54ca322f6fd4dc95700705b99f23e3437c8d0" +export MODEL_SOURCE="https://huggingface.co/classla/xlm-roberta-base-multilingual-text-genre-classifier" +export MODEL_LANG="Multi" +##-------------------------------------------------------------------- + + + +docker build \ + --build-arg ANNOTATOR_NAME \ + --build-arg ANNOTATOR_VERSION \ + --build-arg LOG_LEVEL \ + --build-arg MODEL_CACHE_SIZE \ + --build-arg MODEL_NAME \ + --build-arg MODEL_VERSION \ + --build-arg MODEL_SOURCE \ + --build-arg MODEL_LANG \ + -t 
${DOCKER_REGISTRY}${ANNOTATOR_NAME}"-"${MODEL_SPECNAME}:${ANNOTATOR_VERSION}${ANNOTATOR_CUDA} \ + -f src/main/docker/Dockerfile${ANNOTATOR_CUDA} \ + . + +docker tag \ + ${DOCKER_REGISTRY}${ANNOTATOR_NAME}"-"${MODEL_SPECNAME}:${ANNOTATOR_VERSION}${ANNOTATOR_CUDA} \ + ${DOCKER_REGISTRY}${ANNOTATOR_NAME}"-"${MODEL_SPECNAME}:latest${ANNOTATOR_CUDA} diff --git a/duui-Genre/pom.xml b/duui-Genre/pom.xml new file mode 100644 index 00000000..4cf8be75 --- /dev/null +++ b/duui-Genre/pom.xml @@ -0,0 +1,155 @@ + + + 4.0.0 + + org.texttechnologylab.duui + duui-genre + 0.2.0 + + + + AGPL-3.0-or-later + https://www.gnu.org/licenses/agpl.txt + repo + GNU Affero General Public License v3.0 or later + + + + + Texttechnology Lab + https://www.texttechnologylab.org + + + + mehler + Prof. Dr. Alexander Mehler + mehler@em.uni-frankfurt.de + https://www.texttechnologylab.org/team/alexander-abrami/ + Goethe University Frankfurt / Texttechnology Lab + https://www.texttechnologylab.org + + head of department + + + + bagci + Mevlüt Bagci + bagci@em.uni-frankfurt.de + https://www.texttechnologylab.org/team/mevl%c3%bct-bagci/ + Goethe University Frankfurt / Texttechnology Lab + https://www.texttechnologylab.org + + lead developer + + Europe/Berlin + + + + + + + org.apache.maven.plugins + maven-surefire-plugin + 2.22.0 + + + --illegal-access=permit + --add-opens java.base/java.util=ALL-UNNAMED + + + + + + + + + + 17 + 17 + 2.4.0 + + + + + + + jitpack.io + https://jitpack.io + + + + + + + org.dkpro.core + dkpro-core-asl + ${dkpro.core.version} + pom + import + + + + + + + + com.github.texttechnologylab + DockerUnifiedUIMAInterface + 7cef2433b5 + + + + + + + + + com.github.texttechnologylab + UIMATypeSystem + 3.0.14 + + + + + + + + + + + + + + + + org.junit.jupiter + junit-jupiter + 5.9.0 + test + + + + org.dkpro.core + dkpro-core-api-segmentation-asl + test + + + + org.dkpro.core + dkpro-core-io-xmi-asl + test + + + + org.dkpro.core + dkpro-core-api-resources-asl + test + + + \ No newline at end 
of file diff --git a/duui-Genre/requirements.txt b/duui-Genre/requirements.txt new file mode 100644 index 00000000..c8109fba --- /dev/null +++ b/duui-Genre/requirements.txt @@ -0,0 +1,14 @@ +torch==2.11.0 +torchaudio==2.11.0 +torchvision==0.26.0 +scipy==1.17.1 +transformers==5.9.0 +sentencepiece==0.2.1 +protobuf==4.25.3 +numpy==2.4.6 +scikit-learn==1.8.0 +fastapi==0.110.0 +dkpro-cassis==0.9.1 +uvicorn[standard]==0.27.1 +pydantic-settings==2.0.2 +torchmetrics==1.2.0 \ No newline at end of file diff --git a/duui-Genre/service_start.sh b/duui-Genre/service_start.sh new file mode 100644 index 00000000..34cc130e --- /dev/null +++ b/duui-Genre/service_start.sh @@ -0,0 +1,5 @@ +TEXTIMAGER_DUUI_TRANSFORMERS_TOPIC_ANNOTATOR_NAME="textimager-duui-transformers-topic" \ +TEXTIMAGER_DUUI_TRANSFORMERS_TOPIC_ANNOTATOR_VERSION="unset" \ +TEXTIMAGER_DUUI_TRANSFORMERS_TOPIC_LOG_LEVEL="DEBUG" \ +TEXTIMAGER_DUUI_TRANSFORMERS_TOPIC_MODEL_CACHE_SIZE="1" \ +uvicorn src.main.python.textimager_duui_transformers_topic:app --host 0.0.0.0 --port 9714 --workers 1 diff --git a/duui-Genre/src/main/docker/Dockerfile b/duui-Genre/src/main/docker/Dockerfile new file mode 100644 index 00000000..8b3bd8ec --- /dev/null +++ b/duui-Genre/src/main/docker/Dockerfile @@ -0,0 +1,55 @@ +FROM python:3.12 + +WORKDIR /usr/src/app + +EXPOSE 9714 + +# dependencies +COPY ./requirements.txt ./requirements.txt +RUN pip install -r requirements.txt + +# copy scripts +COPY ./src/main/python/TypeSystemTopic.xml ./TypeSystemTopic.xml +COPY ./src/main/python/duui_genre.py ./duui_genre.py +COPY ./src/main/python/duui_genre.lua ./duui_genre.lua +COPY ./src/main/python/GenreSpeech.py ./GenreSpeech.py + +#RUN python -c "from transformers import pipeline; pipeline('text-classification', model='TurkuNLP/web-register-classification-multilingual'); pipeline('fill-mask', model='FacebookAI/xlm-roberta-large')" +#RUN python -c "from transformers import pipeline; pipeline('text-classification', 
model='TurkuNLP/web-register-classification-en'); pipeline('fill-mask', model='FacebookAI/xlm-roberta-large')" +#RUN python -c "from transformers import pipeline; pipeline('text-classification', model='TurkuNLP/finerweb-quality-classifier')" +#RUN python -c "from transformers import pipeline; pipeline('text-classification', model='ssharoff/genres')" +RUN python -c "from transformers import pipeline; pipeline('text-classification', model='classla/xlm-roberta-base-multilingual-text-genre-classifier')" + +# log level +ARG LOG_LEVEL="DEBUG" +ENV LOG_LEVEL=$LOG_LEVEL + +# config +ARG MODEL_CACHE_SIZE=3 +ENV MODEL_CACHE_SIZE=$MODEL_CACHE_SIZE + +# meta data +ARG ANNOTATOR_NAME="duui-transformers-topic" +ENV ANNOTATOR_NAME=$ANNOTATOR_NAME +ARG ANNOTATOR_VERSION="unset" +ENV ANNOTATOR_VERSION=$ANNOTATOR_VERSION + +# Model Info +ARG MODEL_VERSION=0.1 +ENV MODEL_VERSION=$MODEL_VERSION +ARG MODEL_NAME="" +ENV MODEL_NAME=$MODEL_NAME +ARG MODEL_SOURCE="" +ENV MODEL_SOURCE=$MODEL_SOURCE +ARG MODEL_LANG="" +ENV MODEL_LANG=$MODEL_LANG + +# offline mode for huggingface +ARG TEXTIMAGER_DUUI_TRANSFORMERS_SENTIMENT_TRANSFORMERS_OFFLINE=1 +ENV TRANSFORMERS_OFFLINE=$TEXTIMAGER_DUUI_TRANSFORMERS_SENTIMENT_TRANSFORMERS_OFFLINE + + + + +ENTRYPOINT ["uvicorn", "duui_genre:app", "--host", "0.0.0.0", "--port" ,"9714"] +CMD ["--workers", "1"] diff --git a/duui-Genre/src/main/docker/Dockerfile-cuda b/duui-Genre/src/main/docker/Dockerfile-cuda new file mode 100644 index 00000000..e64d175e --- /dev/null +++ b/duui-Genre/src/main/docker/Dockerfile-cuda @@ -0,0 +1,70 @@ +FROM nvidia/cuda:11.8.0-base-ubuntu22.04 + +RUN apt update && \ + DEBIAN_FRONTEND=noninteractive \ + apt install --no-install-recommends -y build-essential software-properties-common && \ + add-apt-repository -y ppa:deadsnakes/ppa && \ + apt install --no-install-recommends -y python3.10 python3-pip python3-setuptools python3-distutils && \ + apt clean && rm -rf /var/lib/apt/lists/* + +RUN ln -s /usr/bin/python3 /usr/bin/python +RUN 
python -m pip install --upgrade pip + +WORKDIR /usr/src/app + +EXPOSE 9714 + +# dependencies +RUN pip install setuptools wheel +COPY ./requirements.txt ./requirements.txt +RUN apt remove -y python3-blinker || true +RUN pip install -r requirements.txt + + + +# dependencies +COPY ./requirements.txt ./requirements.txt +RUN pip install -r requirements.txt + +# copy scripts +COPY ./src/main/python/TypeSystemTopic.xml ./TypeSystemTopic.xml +COPY ./src/main/python/duui_genre.py ./duui_genre.py +COPY ./src/main/python/duui_genre.lua ./duui_genre.lua +COPY ./src/main/python/GenreSpeech.py ./GenreSpeech.py + +RUN python -c "from transformers import pipeline; pipeline('text-classification', model='TurkuNLP/web-register-classification-multilingual')" + + +# log level +ARG LOG_LEVEL="DEBUG" +ENV LOG_LEVEL=$LOG_LEVEL + +# config +ARG MODEL_CACHE_SIZE=3 +ENV MODEL_CACHE_SIZE=$MODEL_CACHE_SIZE + +# meta data +ARG ANNOTATOR_NAME="duui-transformers-topic" +ENV ANNOTATOR_NAME=$ANNOTATOR_NAME +ARG ANNOTATOR_VERSION="unset" +ENV ANNOTATOR_VERSION=$ANNOTATOR_VERSION + +# Model Info +ARG MODEL_VERSION=0.1 +ENV MODEL_VERSION=$MODEL_VERSION +ARG MODEL_NAME="" +ENV MODEL_NAME=$MODEL_NAME +ARG MODEL_SOURCE="" +ENV MODEL_SOURCE=$MODEL_SOURCE +ARG MODEL_LANG="" +ENV MODEL_LANG=$MODEL_LANG + +# offline mode for huggingface +ARG TEXTIMAGER_DUUI_TRANSFORMERS_SENTIMENT_TRANSFORMERS_OFFLINE=1 +ENV TRANSFORMERS_OFFLINE=$TEXTIMAGER_DUUI_TRANSFORMERS_SENTIMENT_TRANSFORMERS_OFFLINE + + + + +ENTRYPOINT ["uvicorn", "duui_genre:app", "--host", "0.0.0.0", "--port" ,"9714"] +CMD ["--workers", "1"] diff --git a/duui-Genre/src/main/python/GenreSpeech.py b/duui-Genre/src/main/python/GenreSpeech.py new file mode 100644 index 00000000..e6c112b1 --- /dev/null +++ b/duui-Genre/src/main/python/GenreSpeech.py @@ -0,0 +1,70 @@ +import torch +import math +from transformers import AutoModelForSequenceClassification, AutoTokenizer +from scipy.special import softmax +import numpy as np +from typing import List + 
# Class-id -> label table applied to every model whose name contains "ssharoff";
# it replaces the id2label mapping of the model config for those models.
# NOTE(review): presumably the label order the ssharoff genre model was trained
# with -- confirm against the model card.
ssharoff_genres = {
    0: "argum",
    1: "fictive",
    2: "instruct",
    3: "reporting",
    4: "legal",
    5: "personal",
    6: "commercial",
    7: "academic",
    8: "info",
    9: "reviews",
}


def sigmoid(x):
    """Return the logistic function 1 / (1 + e^-x) of a scalar.

    The negative branch is rewritten as e^x / (1 + e^x) so that very negative
    inputs underflow to 0.0 instead of raising OverflowError in math.exp.
    """
    if x >= 0:
        return 1 / (1 + math.exp(-x))
    z = math.exp(x)
    return z / (1 + z)


class GenreCheck:
    """Sequence-classification wrapper that scores texts against genre labels."""

    # Models that are loaded together with the xlm-roberta-large tokenizer.
    # NOTE(review): presumably these repositories ship no tokenizer files -- confirm.
    _XLMR_TOKENIZER_MODELS = (
        "TurkuNLP/web-register-classification-en",
        "TurkuNLP/web-register-classification-multilingual",
    )

    def __init__(self, model_name: str, device='cuda:0'):
        """Load tokenizer and model for ``model_name`` and move the model to ``device``."""
        self.device = device
        if model_name in self._XLMR_TOKENIZER_MODELS:
            self.tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-large")
        else:
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)
        if "ssharoff" in model_name:
            self.class_mapping = ssharoff_genres
        else:
            self.class_mapping = self.model.config.id2label
        self.labels = self._ordered_labels(self.class_mapping)

    @staticmethod
    def _ordered_labels(class_mapping):
        """Return the labels sorted by class id.

        ``genre_prediction`` indexes this list with logit column numbers, so the
        order must follow the ids, not the insertion order of the mapping.
        """
        return [class_mapping[class_id] for class_id in sorted(class_mapping)]

    def genre_prediction(self, texts: List[str]):
        """Classify ``texts`` in one batch.

        Returns one dict per input text mapping label -> softmax probability,
        with keys inserted in descending order of probability. Inputs are
        truncated to 512 tokens. An empty input list yields an empty list.
        """
        if not texts:
            # Nothing to score; skip the tokenizer and model call entirely.
            return []

        with torch.no_grad():
            inputs = self.tokenizer(
                texts,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512
            ).to(self.device)

            outputs = self.model(**inputs)
            logits = outputs[0].float()  # convert bfloat16 -> float32
            probs = torch.softmax(logits, dim=-1).cpu()

        score_list = []
        for prob in probs:
            ranking = torch.argsort(prob, descending=True)
            # int(...) turns the 0-dim index tensor into a plain list index.
            score_list.append({
                self.labels[int(i)]: float(prob[int(i)])
                for i in ranking
            })
        return score_list
-- Java classes used by the communication layer.
StandardCharsets = luajava.bindClass("java.nio.charset.StandardCharsets")
Class = luajava.bindClass("java.lang.Class")
JCasUtil = luajava.bindClass("org.apache.uima.fit.util.JCasUtil")
TopicUtils = luajava.bindClass("org.texttechnologylab.DockerUnifiedUIMAInterface.lua.DUUILuaUtils")

-- Serialize the CAS into the JSON request consumed by the Python service.
--   inputCas:     JCas to read from
--   outputStream: stream the JSON request is written to
--   parameters:   must contain "selection", a comma-separated list whose entries
--                 are either "text" (the whole document as one unit) or a fully
--                 qualified annotation class name (one unit per annotation)
function serialize(inputCas, outputStream, parameters)
    local doc_lang = inputCas:getDocumentLanguage()
    local doc_text = inputCas:getDocumentText()
    local doc_len = TopicUtils:getDocumentTextLength(inputCas)

    local selection_types = parameters["selection"]
    if selection_types == nil then
        -- Fail with a readable message instead of the gmatch argument error.
        error("duui_genre: missing required parameter 'selection'")
    end

    local selections = {}
    for selection_type in string.gmatch(selection_types, "([^,]+)") do
        local sentences = {}
        if selection_type == "text" then
            sentences[1] = {
                text = doc_text,
                begin = 0,
                ['end'] = doc_len
            }
        else
            local clazz = Class:forName(selection_type)
            local sentences_it = JCasUtil:select(inputCas, clazz):iterator()
            while sentences_it:hasNext() do
                local sentence = sentences_it:next()
                sentences[#sentences + 1] = {
                    text = sentence:getCoveredText(),
                    begin = sentence:getBegin(),
                    ['end'] = sentence:getEnd()
                }
            end
        end

        selections[#selections + 1] = {
            sentences = sentences,
            selection = selection_type
        }
    end

    outputStream:write(json.encode({
        selections = selections,
        lang = doc_lang,
        doc_len = doc_len
    }))
end

-- Read the JSON response of the Python service and write it into the CAS:
-- one DocumentModification, one model MetaData, and per result a Genre
-- annotation whose FSArray holds one AnnotationComment (key = label,
-- value = score) per label. Nothing is written unless the response contains
-- "modification_meta", "meta" and "results".
function deserialize(inputCas, inputStream)
    local inputString = luajava.newInstance("java.lang.String", inputStream:readAllBytes(), StandardCharsets.UTF_8)
    local results = json.decode(inputString)

    if results["modification_meta"] ~= nil and results["meta"] ~= nil and results["results"] ~= nil then
        local source = results["model_source"]
        local model_version = results["model_version"]
        local model_name = results["model_name"]
        local model_lang = results["model_lang"]

        local modification_meta = results["modification_meta"]
        local modification_anno = luajava.newInstance("org.texttechnologylab.annotation.DocumentModification", inputCas)
        modification_anno:setUser(modification_meta["user"])
        modification_anno:setTimestamp(modification_meta["timestamp"])
        modification_anno:setComment(modification_meta["comment"])
        modification_anno:addToIndexes()

        local model_meta = luajava.newInstance("org.texttechnologylab.annotation.model.MetaData", inputCas)
        model_meta:setModelVersion(model_version)
        model_meta:setModelName(model_name)
        model_meta:setSource(source)
        model_meta:setLang(model_lang)
        model_meta:addToIndexes()

        -- Parallel arrays, one entry per annotated span.
        local begin_genre = results["begin"]
        local end_genre = results["end"]
        local res_out = results["results"]
        local res_len = results["len_results"]
        local factors = results["factors"]

        for index_i, res in ipairs(res_out) do
            local genre_i = luajava.newInstance("org.texttechnologylab.annotation.Genre", inputCas, begin_genre[index_i], end_genre[index_i])
            local fsarray = luajava.newInstance("org.apache.uima.jcas.cas.FSArray", inputCas, res_len[index_i])
            genre_i:setGenres(fsarray)

            local factor_i = factors[index_i]
            for index_j, genre_j in ipairs(res) do
                -- `local` keeps the annotation out of the global environment.
                local genre_in_i = luajava.newInstance("org.texttechnologylab.annotation.AnnotationComment", inputCas)
                genre_in_i:setReference(genre_i)
                genre_in_i:setKey(genre_j)
                genre_in_i:setValue(factor_i[index_j])
                genre_in_i:addToIndexes()
                -- FSArray slots are 0-based, Lua indices are 1-based.
                genre_i:setGenres(index_j - 1, genre_in_i)
            end

            genre_i:setModel(model_meta)
            genre_i:addToIndexes()
        end
    end
end
from pydantic import BaseModel
from pydantic_settings import BaseSettings
from typing import List, Optional, Dict, Union
import logging
from time import time
from fastapi import FastAPI, Response
from cassis import load_typesystem
import torch
from threading import Lock
from functools import lru_cache
from GenreSpeech import GenreCheck

from starlette.responses import PlainTextResponse

# Guards model loading so concurrent requests do not load the same model twice.
model_lock = Lock()


class UimaSentence(BaseModel):
    """One text unit taken from the CAS, with its character offsets."""
    text: str
    begin: int
    end: int


class UimaSentenceSelection(BaseModel):
    """All text units produced for one requested selection type."""
    selection: str
    sentences: List[UimaSentence]


# Settings
# These are automatically loaded from env variables
class Settings(BaseSettings):
    # Name of this annotator
    annotator_name: str
    # Version of this annotator
    annotator_version: str
    # Log level
    log_level: str
    # Name of the model to load
    model_name: str
    # Version of the model
    model_version: str
    # Number of loaded models kept in the LRU cache
    model_cache_size: int
    # URL of the model
    model_source: str
    # Language of the model
    model_lang: str


# Load settings from env vars
settings = Settings()
lru_cache_with_size = lru_cache(maxsize=settings.model_cache_size)
logging.basicConfig(level=settings.log_level)
logger = logging.getLogger(__name__)

device = "cuda:0" if torch.cuda.is_available() else "cpu"
logger.info(f'USING {device}')
# Load the predefined typesystem that is needed for this annotator to work
typesystem_filename = 'TypeSystemTopic.xml'
logger.debug("Loading typesystem from \"%s\"", typesystem_filename)
with open(typesystem_filename, 'rb') as f:
    typesystem = load_typesystem(f)
    logger.debug("Base typesystem:")
    logger.debug(typesystem.to_xml())

# Load the Lua communication script
lua_communication_script_filename = "duui_genre.lua"
logger.debug("Loading Lua communication script from \"%s\"", lua_communication_script_filename)


# Request sent by DUUI
# Note, this is transformed by the Lua script
class DUUIRequest(BaseModel):
    # Length of the document text
    doc_len: int
    # Language of the document
    lang: str
    # Text units to classify, grouped by selection type
    selections: List[UimaSentenceSelection]


# UIMA type: mark modification of the document
class DocumentModification(BaseModel):
    user: str
    timestamp: int
    comment: str


# UIMA type: adds metadata to each annotation
class AnnotationMeta(BaseModel):
    name: str
    version: str
    modelName: str
    modelVersion: str


# Response sent by DUUI
# Note, this is transformed by the Lua script
class DUUIResponse(BaseModel):
    # Annotator / model meta data
    meta: AnnotationMeta
    # Modification meta, one per document
    modification_meta: DocumentModification
    # The following five lists are parallel, one entry per classified text unit:
    # character offsets of the unit
    begin: List[int]
    end: List[int]
    # labels of the unit
    results: List
    # scores of the unit, aligned with `results`
    factors: List
    # number of labels of the unit
    len_results: List[int]
    # Model information copied from the settings
    model_name: str
    model_version: str
    model_source: str
    model_lang: str


app = FastAPI(
    openapi_url="/openapi.json",
    docs_url="/api",
    redoc_url=None,
    title=settings.annotator_name,
    description="Genre annotator",
    version=settings.annotator_version,
    terms_of_service="https://www.texttechnologylab.org/legal_notice/",
    contact={
        "name": "TTLab Team",
        "url": "https://texttechnologylab.org",
        "email": "bagci@em.uni-frankfurt.de",
    },
    license_info={
        "name": "AGPL",
        "url": "http://www.gnu.org/licenses/agpl-3.0.en.html",
    },
)

with open(lua_communication_script_filename, 'rb') as f:
    lua_communication_script = f.read().decode("utf-8")
logger.debug("Lua communication script:")
# Log the script itself; the file name was already logged above.
logger.debug(lua_communication_script)


# Get
# typesystem of this annotator
@app.get("/v1/typesystem")
def get_typesystem() -> Response:
    """Return the annotator's UIMA typesystem as XML."""
    # TODO remove cassis dependency, as only needed for typesystem at the moment?
    xml = typesystem.to_xml()
    xml_content = xml.encode("utf-8")

    return Response(
        content=xml_content,
        media_type="application/xml"
    )


# Return Lua communication script
@app.get("/v1/communication_layer", response_class=PlainTextResponse)
def get_communication_layer() -> str:
    """Return the Lua script DUUI uses to talk to this service."""
    return lua_communication_script


# Return documentation info
@app.get("/v1/documentation")
def get_documentation():
    """Return the documentation placeholder."""
    return "Test"


@lru_cache_with_size
def load_model(model_name):
    """Load (or fetch from the LRU cache) the classifier for ``model_name``."""
    return GenreCheck(model_name, device)


def fix_unicode_problems(text):
    """Round-trip ``text`` through UTF-16 to avoid JSON encoding errors."""
    # fix emoji in python string and prevent json error on response
    # File "/usr/local/lib/python3.8/site-packages/starlette/responses.py", line 190, in render
    # UnicodeEncodeError: 'utf-8' codec can't encode characters in position xx-yy: surrogates not allowed
    return text.encode('utf-16', 'surrogatepass').decode('utf-16', 'surrogateescape')


def process_selection(model_name, selection):
    """Classify every text unit of ``selection``.

    Returns a dict of five parallel lists with one entry per unit:
    ``begin`` / ``end`` offsets, ``results`` (labels), ``factors`` (scores
    aligned with the labels) and ``len_results`` (number of labels).
    """
    output = {
        "begin": [],
        "end": [],
        "len_results": [],
        "results": [],
        "factors": []
    }

    for s in selection.sentences:
        s.text = fix_unicode_problems(s.text)

    texts = [s.text for s in selection.sentences]
    logger.debug("Preprocessed texts:")
    logger.debug(texts)

    if not texts:
        # Nothing selected: do not load the model or call it with an empty batch.
        return output

    # The lock only covers loading; inference runs outside of it.
    with model_lock:
        classifier = load_model(model_name)

    predictions = classifier.genre_prediction(texts)
    for sentence, scores in zip(selection.sentences, predictions):
        output["begin"].append(sentence.begin)
        output["end"].append(sentence.end)
        output["len_results"].append(len(scores))
        output["results"].append(list(scores.keys()))
        output["factors"].append(list(scores.values()))

    return output


# Process request from DUUI
@app.post("/v1/process")
def post_process(request: DUUIRequest):
    """Classify all selections of the request.

    Errors during classification are logged and the results collected so far
    are returned; the metadata is always present in the response.
    """
    begin = []
    end = []
    len_results = []
    results = []
    factors = []
    # Save modification start time for later
    modification_timestamp_seconds = int(time())

    # Everything the response needs is built before the try block, so the
    # return statement below can never reference an unbound name.
    model_source = settings.model_source
    model_lang = settings.model_lang
    model_version = settings.model_version
    meta = AnnotationMeta(
        name=settings.annotator_name,
        version=settings.annotator_version,
        modelName=settings.model_name,
        modelVersion=model_version,
    )
    modification_meta = DocumentModification(
        user=settings.annotator_name,
        timestamp=modification_timestamp_seconds,
        comment=f"{settings.annotator_name} ({settings.annotator_version})"
    )

    try:
        for selection in request.selections:
            processed_sentences = process_selection(settings.model_name, selection)
            begin.extend(processed_sentences["begin"])
            end.extend(processed_sentences["end"])
            len_results.extend(processed_sentences["len_results"])
            results.extend(processed_sentences["results"])
            factors.extend(processed_sentences["factors"])
    except Exception as ex:
        # Best effort: keep what was collected before the failure.
        logger.exception(ex)

    return DUUIResponse(meta=meta, modification_meta=modification_meta, begin=begin, end=end, results=results,
                        len_results=len_results, factors=factors, model_name=settings.model_name,
                        model_version=model_version, model_source=model_source, model_lang=model_lang)
directory +/mavenHomeManager.xml +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/duui-HeidelTimeExt/.idea/compiler.xml b/duui-HeidelTimeExt/.idea/compiler.xml new file mode 100644 index 00000000..afd76b09 --- /dev/null +++ b/duui-HeidelTimeExt/.idea/compiler.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/duui-HeidelTimeExt/.idea/encodings.xml b/duui-HeidelTimeExt/.idea/encodings.xml new file mode 100644 index 00000000..aa00ffab --- /dev/null +++ b/duui-HeidelTimeExt/.idea/encodings.xml @@ -0,0 +1,7 @@ + + + + + + + \ No newline at end of file diff --git a/duui-HeidelTimeExt/.idea/jarRepositories.xml b/duui-HeidelTimeExt/.idea/jarRepositories.xml new file mode 100644 index 00000000..22dd35ce --- /dev/null +++ b/duui-HeidelTimeExt/.idea/jarRepositories.xml @@ -0,0 +1,30 @@ + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/duui-HeidelTimeExt/.idea/misc.xml b/duui-HeidelTimeExt/.idea/misc.xml new file mode 100644 index 00000000..4c7d54ea --- /dev/null +++ b/duui-HeidelTimeExt/.idea/misc.xml @@ -0,0 +1,12 @@ + + + + + + + + \ No newline at end of file diff --git a/duui-HeidelTimeExt/.idea/modules.xml b/duui-HeidelTimeExt/.idea/modules.xml new file mode 100644 index 00000000..cfbfba02 --- /dev/null +++ b/duui-HeidelTimeExt/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/duui-HeidelTimeExt/.idea/vcs.xml b/duui-HeidelTimeExt/.idea/vcs.xml new file mode 100644 index 00000000..6c0b8635 --- /dev/null +++ b/duui-HeidelTimeExt/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/duui-HeidelTimeExt/HeidelTimeExt.iml b/duui-HeidelTimeExt/HeidelTimeExt.iml new file mode 100644 index 00000000..30905331 --- /dev/null +++ b/duui-HeidelTimeExt/HeidelTimeExt.iml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/duui-HeidelTimeExt/docker_build.sh b/duui-HeidelTimeExt/docker_build.sh new 
file mode 100755 index 00000000..e48150ad --- /dev/null +++ b/duui-HeidelTimeExt/docker_build.sh @@ -0,0 +1,46 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Build and optionally push the DUUI HeidelTimeExt Docker image. +# +# Examples: +# ./docker_build.sh +# ./docker_build.sh 1.0 +# PUSH=true ./docker_build.sh 1.0 +# +# Optional environment variables: +# ANNOTATOR_NAME=duui-heideltime-ext +# DOCKER_REGISTRY=docker.texttechnologylab.org/ +# PUSH=true + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "${SCRIPT_DIR}" + +VERSION="${1:-${ANNOTATOR_VERSION:-1.0}}" +ANNOTATOR_NAME="${ANNOTATOR_NAME:-duui-heideltime-ext}" +DOCKER_REGISTRY="${DOCKER_REGISTRY:-docker.texttechnologylab.org/}" +PUSH="${PUSH:-false}" + +LOCAL_VERSION_TAG="${ANNOTATOR_NAME}:${VERSION}" +LOCAL_LATEST_TAG="${ANNOTATOR_NAME}:latest" +REMOTE_VERSION_TAG="${DOCKER_REGISTRY}${ANNOTATOR_NAME}:${VERSION}" +REMOTE_LATEST_TAG="${DOCKER_REGISTRY}${ANNOTATOR_NAME}:latest" + +printf '\nBuilding %s\n' "${LOCAL_VERSION_TAG}" +docker build -f dockerfile \ + --build-arg ANNOTATOR_VERSION="${VERSION}" \ + -t "${LOCAL_VERSION_TAG}" \ + . 
+ +docker tag "${LOCAL_VERSION_TAG}" "${LOCAL_LATEST_TAG}" +docker tag "${LOCAL_VERSION_TAG}" "${REMOTE_VERSION_TAG}" +docker tag "${LOCAL_VERSION_TAG}" "${REMOTE_LATEST_TAG}" + +printf '\nBuilt images:\n' +printf ' %s\n' "${LOCAL_VERSION_TAG}" "${LOCAL_LATEST_TAG}" "${REMOTE_VERSION_TAG}" "${REMOTE_LATEST_TAG}" + +if [[ "${PUSH}" == "true" ]]; then + printf '\nPushing images:\n' + docker push "${REMOTE_VERSION_TAG}" + docker push "${REMOTE_LATEST_TAG}" +fi diff --git a/duui-HeidelTimeExt/dockerfile b/duui-HeidelTimeExt/dockerfile index aa556fc2..2b8cb109 100644 --- a/duui-HeidelTimeExt/dockerfile +++ b/duui-HeidelTimeExt/dockerfile @@ -1,5 +1,22 @@ -FROM maven:3.8.5-jdk-11 -ADD src src -ADD pom.xml pom.xml -RUN mvn clean compile -CMD mvn exec:java -Dexec.mainClass="org.texttechnologylab.tools.HeidelTimeExt" +FROM maven:3.9.9-eclipse-temurin-21 AS build + +WORKDIR /build + +COPY pom.xml pom.xml +RUN mvn -U -q -P '!duui-tests' -Dmaven.test.skip=true -DskipTests dependency:go-offline || true + +COPY src/main src/main +RUN mvn -U -q -P '!duui-tests' -Dmaven.test.skip=true -DskipTests clean package + +FROM eclipse-temurin:21-jre + +WORKDIR /app + +COPY --from=build /build/target/duui-HeidelTimeExt-*.jar /app/HeidelTimeExt.jar + +ENV DUUI_PORT=9714 +ENV DUUI_WORKERS=1 + +EXPOSE 9714 + +ENTRYPOINT ["java", "-jar", "/app/HeidelTimeExt.jar"] diff --git a/duui-HeidelTimeExt/pom.xml b/duui-HeidelTimeExt/pom.xml index 0d54ddb3..60c378c3 100644 --- a/duui-HeidelTimeExt/pom.xml +++ b/duui-HeidelTimeExt/pom.xml @@ -1,69 +1,218 @@ + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd"> + 4.0.0 org.texttechnologylab.tools - HeidelTimeExt - 1.0 + duui-HeidelTimeExt + 0.1.0 + + + + AGPL-3.0-or-later + https://www.gnu.org/licenses/agpl.txt + repo + GNU Affero General Public License v3.0 or later + + + + + Texttechnology Lab + https://www.texttechnologylab.org + + + + + mehler + Prof. Dr. 
Alexander Mehler + mehler@em.uni-frankfurt.de + https://www.texttechnologylab.org/team/alexander-abrami/ + Goethe University Frankfurt / Texttechnology Lab + https://www.texttechnologylab.org + + head of department + + + + bagci + Mevlüt Bagci + bagci@em.uni-frankfurt.de + https://www.texttechnologylab.org/team/mevl%c3%bct-bagci/ + Goethe University Frankfurt / Texttechnology Lab + https://www.texttechnologylab.org + + lead developer + + Europe/Berlin + + + + + 21 + UTF-8 + 2.4.0 + - - central - Central Repository - https://repo.maven.apache.org/maven2 - default - - false - - - - ukp-oss-model-releases - https://zoidberg.ukp.informatik.tu-darmstadt.de/artifactory/public-model-releases-local - jitpack.io https://jitpack.io - - 11 - 11 - + + + + org.dkpro.core + dkpro-core-asl + ${dkpro.core.version} + pom + import + - + + + com.github.texttechnologylab + Utilities + 3.0.2 + + + + - org.apache.uima - uimaj-core - 2.11.0 + com.github.texttechnologylab + DockerUnifiedUIMAInterface + 1.5.5 + + + com.github.texttechnologylab + Utilities + + - org.apache.uima - uimafit-core - 2.5.0 + com.github.texttechnologylab + Utilities + 3.0.2 com.github.texttechnologylab UIMATypeSystem - 29fe3e0ab5 + 3.0.14 - com.github.texttechnologylab.textimager-uima - textimager-uima-heideltime-biofid - 9b70623c7f + com.github.mevbagci + heideltime + 4.0.7 + + + + org.dkpro.core + dkpro-core-api-segmentation-asl + + + + org.dkpro.core + dkpro-core-io-xmi-asl + + + + org.dkpro.core + dkpro-core-api-resources-asl org.json json - 20180813 + 20240303 + + + + org.junit.jupiter + junit-jupiter + 5.9.0 + test + + duui-HeidelTimeExt-${project.version} + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.13.0 + + ${maven.compiler.release} + + + + + org.apache.maven.plugins + maven-surefire-plugin + 2.22.0 + + + --illegal-access=permit + --add-opens java.base/java.util=ALL-UNNAMED + + + + + + org.apache.maven.plugins + maven-shade-plugin + 3.6.0 + + + package + + shade + + + false + + + + *:* + + 
META-INF/*.SF + META-INF/*.DSA + META-INF/*.RSA + + + + + + + org.texttechnologylab.tools.HeidelTimeExt + + + + + + + META-INF/org.apache.uima.fit/types.txt + + + META-INF/org.apache.uima.fit/typepriorities.txt + + + META-INF/org.apache.uima.fit/fsindexes.txt + + + + + + + + + \ No newline at end of file diff --git a/duui-HeidelTimeExt/src/main/java/org/texttechnologylab/tools/HeidelTimeExt.java b/duui-HeidelTimeExt/src/main/java/org/texttechnologylab/tools/HeidelTimeExt.java index 59dd9857..71ccdccf 100644 --- a/duui-HeidelTimeExt/src/main/java/org/texttechnologylab/tools/HeidelTimeExt.java +++ b/duui-HeidelTimeExt/src/main/java/org/texttechnologylab/tools/HeidelTimeExt.java @@ -1,12 +1,12 @@ package org.texttechnologylab.tools; +import com.sun.net.httpserver.Headers; import com.sun.net.httpserver.HttpExchange; import com.sun.net.httpserver.HttpHandler; import com.sun.net.httpserver.HttpServer; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import de.unihd.dbs.uima.annotator.heideltime.HeidelTime; -import de.unihd.dbs.uima.annotator.heideltime.biofid.HeidelTimeBioFID; import de.unihd.dbs.uima.types.heideltime.Timex3; import org.apache.uima.UIMAException; import org.apache.uima.analysis_engine.AnalysisEngine; @@ -22,149 +22,397 @@ import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.resource.metadata.TypeSystemDescription; import org.json.JSONArray; -import org.json.JSONException; import org.json.JSONObject; import org.texttechnologylab.annotation.type.Time; import org.xml.sax.SAXException; +import java.io.ByteArrayOutputStream; import java.io.IOException; +import java.io.InputStream; import java.io.OutputStream; import java.io.StringWriter; import java.net.InetSocketAddress; -import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.time.LocalDate; +import java.time.format.DateTimeFormatter; +import 
java.util.ArrayList; +import java.util.List; +import java.util.concurrent.Executors; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +/** + * DUUI wrapper for TTLab's HeidelTimeExt component. + * + * This wrapper uses the HeidelTimeExt repository directly via the heideltime artifact. + * It does not depend on textimager-uima-heideltime-biofid, because that wrapper can pull + * incompatible legacy UIMA dependencies into the runtime classpath. + * + * The service communicates with DUUI using XMI serialize/deserialize mode: + * - GET /v1/communication_layer returns communication_layer.lua + * - GET /v1/typesystem returns the UIMA type system + * - GET /v1/details/input_output returns declared input/output types + * - POST /v1/process accepts an XMI CAS and returns the processed XMI CAS + */ public class HeidelTimeExt { + private static final int DEFAULT_PORT = 9714; + private static final String COMMUNICATION_LAYER_RESOURCE = "/communication_layer.lua"; + private static final String DEFAULT_FILENAME = "duui-document"; + public static void main(String[] args) throws Exception { - HttpServer server = HttpServer.create(new InetSocketAddress(9714), 0); - server.createContext("/v1/communication_layer", new CommunicationLayer()); + int port = getIntEnv("DUUI_PORT", DEFAULT_PORT); + int workers = getIntEnv("DUUI_WORKERS", Runtime.getRuntime().availableProcessors()); + + HttpServer server = HttpServer.create(new InetSocketAddress(port), 0); + server.createContext("/v1/communication_layer", new CommunicationLayerHandler()); server.createContext("/v1/typesystem", new TypesystemHandler()); server.createContext("/v1/process", new ProcessHandler()); server.createContext("/v1/details/input_output", new IOHandler()); - - server.setExecutor(null); // creates a default executor + server.createContext("/", new RootHandler()); + server.setExecutor(Executors.newFixedThreadPool(Math.max(1, workers))); server.start(); - 
System.out.println(HeidelTimeExt.class.getSimpleName()+" ready"); + + System.out.println(HeidelTimeExt.class.getSimpleName() + " ready on port " + port + " with " + workers + " workers"); } - static class ProcessHandler implements HttpHandler { - static JCas jc; - private static AggregateBuilder pipeline = new AggregateBuilder(); - private static AnalysisEngine pAE = null; + private static int getIntEnv(String key, int fallback) { + String value = System.getenv(key); + if (value == null || value.isBlank()) { + return fallback; + } + try { + return Integer.parseInt(value.trim()); + } catch (NumberFormatException ignored) { + return fallback; + } + } - static { - try { - jc = JCasFactory.createJCas(); - pipeline.add(createEngineDescription(HeidelTimeBioFID.class)); + private static String getEnv(String key, String fallback) { + String value = System.getenv(key); + if (value == null || value.isBlank()) { + return fallback; + } + return value.trim(); + } - } catch (UIMAException e) { - e.printStackTrace(); - } + private static void writeResponse(HttpExchange exchange, int statusCode, String contentType, byte[] body) throws IOException { + Headers headers = exchange.getResponseHeaders(); + headers.set("Content-Type", contentType); + exchange.sendResponseHeaders(statusCode, body.length); + try (OutputStream os = exchange.getResponseBody()) { + os.write(body); + } + } + + private static void writeText(HttpExchange exchange, int statusCode, String contentType, String body) throws IOException { + writeResponse(exchange, statusCode, contentType, body.getBytes(StandardCharsets.UTF_8)); + } + + private static void methodNotAllowed(HttpExchange exchange) throws IOException { + writeText(exchange, 405, "text/plain; charset=utf-8", "Method not allowed"); + } + + private static class RootHandler implements HttpHandler { + @Override + public void handle(HttpExchange exchange) throws IOException { + JSONObject details = new JSONObject(); + details.put("name", "duui-heideltime-ext"); 
+ details.put("version", System.getenv().getOrDefault("ANNOTATOR_VERSION", "1.0")); + details.put("description", "DUUI wrapper for TTLab HeidelTimeExt"); + details.put("endpoints", new JSONArray() + .put("/v1/communication_layer") + .put("/v1/typesystem") + .put("/v1/details/input_output") + .put("/v1/process")); + writeText(exchange, 200, "application/json; charset=utf-8", details.toString()); } + } + + private static class ProcessHandler implements HttpHandler { @Override - public void handle(HttpExchange t) throws IOException { - try { - jc.reset(); + public void handle(HttpExchange exchange) throws IOException { + if (!"POST".equalsIgnoreCase(exchange.getRequestMethod())) { + methodNotAllowed(exchange); + return; + } + try { + JCas jCas = JCasFactory.createJCas(); XmiSerializationSharedData sharedData = new XmiSerializationSharedData(); - XmiCasDeserializer.deserialize(t.getRequestBody(), jc.getCas(), true, sharedData); - pAE = pipeline.createAggregate(); - SimplePipeline.runPipeline(jc, pAE); - - for (Timex3 timex3 : JCasUtil.select(jc, Timex3.class)) { - Time nTime = new Time(jc); - nTime.setBegin(timex3.getBegin()); - nTime.setEnd(timex3.getEnd()); - nTime.setValue(timex3.getTimexType()); - nTime.setIdentifier(timex3.getTimexValue()); - nTime.addToIndexes(); - } + XmiCasDeserializer.deserialize(exchange.getRequestBody(), jCas.getCas(), true, sharedData); + + ensureHeidelTimeInputAnnotations(jCas); - t.sendResponseHeaders(200, 0); - XmiCasSerializer.serialize(jc.getCas(), null, t.getResponseBody(), false, sharedData); + AnalysisEngine analysisEngine = createPipeline(jCas); + SimplePipeline.runPipeline(jCas, analysisEngine); + copyTimex3ToTTLabTime(jCas); - t.getResponseBody().close(); + ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); + XmiCasSerializer.serialize(jCas.getCas(), null, outputStream, false, sharedData); + + writeResponse(exchange, 200, "application/xmi+xml", outputStream.toByteArray()); } catch (Exception e) { 
e.printStackTrace(); - t.sendResponseHeaders(404, -1); + String message = "HeidelTimeExt processing failed: " + e.getMessage(); + writeText(exchange, 500, "text/plain; charset=utf-8", message); } + } - t.getResponseBody().close(); + private AnalysisEngine createPipeline(JCas jCas) throws UIMAException { + String language = getEnv("HEIDELTIME_LANGUAGE", normalizeLanguage(jCas.getDocumentLanguage())); + String typeToProcess = getEnv("HEIDELTIME_TYPE", "narrative"); + String locale = getEnv("HEIDELTIME_LOCALE", localeForLanguage(language)); + boolean findTemponyms = Boolean.parseBoolean(getEnv("HEIDELTIME_TEMPONYMS", "true")); + + AggregateBuilder pipeline = new AggregateBuilder(); + pipeline.add(createEngineDescription( + HeidelTime.class, + "Language", language, + "Type", typeToProcess, + "locale", locale, + "Date", true, + "Time", true, + "Duration", true, + "Set", true, + "Temponym", findTemponyms, + "ConvertDurations", true, + "Debugging", false + )); + return pipeline.createAggregate(); } - } - static class TypesystemHandler implements HttpHandler { - @Override - public void handle(HttpExchange t) throws IOException { - try { - TypeSystemDescription desc = TypeSystemDescriptionFactory.createTypeSystemDescription(); - StringWriter writer = new StringWriter(); - desc.toXML(writer); - String response = writer.getBuffer().toString(); + /** + * HeidelTime expects its own Sentence/Token types. DUUI pipelines often provide DKPro + * Sentence/Token annotations, so we mirror them into the HeidelTime type system before + * executing the HeidelTime annotator. 
+ */ + private void ensureHeidelTimeInputAnnotations(JCas jCas) { + if (JCasUtil.exists(jCas, de.unihd.dbs.uima.types.heideltime.Sentence.class) + && JCasUtil.exists(jCas, de.unihd.dbs.uima.types.heideltime.Token.class)) { + return; + } + + List dkproSentences = new ArrayList<>(JCasUtil.select(jCas, Sentence.class)); + if (dkproSentences.isEmpty() && jCas.getDocumentText() != null) { + Sentence sentence = new Sentence(jCas, 0, jCas.getDocumentText().length()); + sentence.addToIndexes(); + dkproSentences.add(sentence); + } - t.sendResponseHeaders(200, response.getBytes(Charset.defaultCharset()).length); + int sentenceId = 1; + for (Sentence dkproSentence : dkproSentences) { + de.unihd.dbs.uima.types.heideltime.Sentence heidelSentence = + new de.unihd.dbs.uima.types.heideltime.Sentence( + jCas, + dkproSentence.getBegin(), + dkproSentence.getEnd() + ); + heidelSentence.setFilename(DEFAULT_FILENAME); + heidelSentence.setSentenceId(sentenceId); + heidelSentence.addToIndexes(); + sentenceId++; + } - OutputStream os = t.getResponseBody(); - os.write(response.getBytes(Charset.defaultCharset())); + List dkproTokens = new ArrayList<>(JCasUtil.select(jCas, Token.class)); + if (dkproTokens.isEmpty()) { + createWhitespaceTokens(jCas); + dkproTokens = new ArrayList<>(JCasUtil.select(jCas, Token.class)); + } - } catch (ResourceInitializationException e) { - e.printStackTrace(); - t.sendResponseHeaders(404, -1); + int tokenId = 1; + for (Token dkproToken : dkproTokens) { + de.unihd.dbs.uima.types.heideltime.Token heidelToken = + new de.unihd.dbs.uima.types.heideltime.Token( + jCas, + dkproToken.getBegin(), + dkproToken.getEnd() + ); + heidelToken.setFilename(DEFAULT_FILENAME); + heidelToken.setTokenId(tokenId); + heidelToken.setSentId(findSentenceId(dkproSentences, dkproToken)); + heidelToken.setPos("NN"); + heidelToken.addToIndexes(); + tokenId++; + } + + ensureDct(jCas); + } + + private void createWhitespaceTokens(JCas jCas) { + String text = jCas.getDocumentText(); + if (text 
== null || text.isEmpty()) { return; - } catch (SAXException e) { - e.printStackTrace(); - } finally { - t.getResponseBody().close(); } + int tokenBegin = -1; + for (int i = 0; i <= text.length(); i++) { + boolean boundary = i == text.length() || Character.isWhitespace(text.charAt(i)); + + if (!boundary && tokenBegin < 0) { + tokenBegin = i; + } + + if (boundary && tokenBegin >= 0) { + Token token = new Token(jCas, tokenBegin, i); + token.addToIndexes(); + tokenBegin = -1; + } + } + } + + private int findSentenceId(List sentences, Token token) { + for (int i = 0; i < sentences.size(); i++) { + Sentence sentence = sentences.get(i); + if (token.getBegin() >= sentence.getBegin() && token.getEnd() <= sentence.getEnd()) { + return i + 1; + } + } + return 1; + } + + private void ensureDct(JCas jCas) { + if (JCasUtil.exists(jCas, de.unihd.dbs.uima.types.heideltime.Dct.class)) { + return; + } + + String today = LocalDate.now().format(DateTimeFormatter.BASIC_ISO_DATE); + de.unihd.dbs.uima.types.heideltime.Dct dct = new de.unihd.dbs.uima.types.heideltime.Dct(jCas, 0, 0); + dct.setFilename(DEFAULT_FILENAME); + dct.setTimexId("dct"); + dct.setValue(today); + dct.addToIndexes(); + } + + private String normalizeLanguage(String documentLanguage) { + if (documentLanguage == null || documentLanguage.isBlank()) { + return "german"; + } + + String language = documentLanguage.trim().toLowerCase(); + if (language.equals("de") || language.equals("deu") || language.equals("ger") || language.equals("german")) { + return "german"; + } + if (language.equals("en") || language.equals("eng") || language.equals("english")) { + return "english"; + } + if (language.equals("nl") || language.equals("nld") || language.equals("dut") || language.equals("dutch")) { + return "dutch"; + } + if (language.equals("es") || language.equals("spa") || language.equals("spanish")) { + return "spanish"; + } + if (language.equals("it") || language.equals("ita") || language.equals("italian")) { + return "italian"; + 
} + if (language.equals("fr") || language.equals("fra") || language.equals("fre") || language.equals("french")) { + return "french"; + } + if (language.equals("pt") || language.equals("por") || language.equals("portuguese")) { + return "portuguese"; + } + if (language.equals("ru") || language.equals("rus") || language.equals("russian")) { + return "russian"; + } + if (language.equals("zh") || language.equals("zho") || language.equals("chi") || language.equals("chinese")) { + return "chinese"; + } + if (language.equals("ar") || language.equals("ara") || language.equals("arabic")) { + return "arabic"; + } + if (language.equals("hr") || language.equals("hrv") || language.equals("croatian")) { + return "croatian"; + } + if (language.equals("et") || language.equals("est") || language.equals("estonian")) { + return "estonian"; + } + if (language.equals("vi") || language.equals("vie") || language.equals("vietnamese")) { + return "vietnamese"; + } + + return language; + } + + private String localeForLanguage(String language) { + if ("german".equalsIgnoreCase(language)) { + return "de_DE"; + } + if ("english".equalsIgnoreCase(language)) { + return "en_GB"; + } + return "en_GB"; + } + + private void copyTimex3ToTTLabTime(JCas jCas) { + for (Timex3 timex3 : JCasUtil.select(jCas, Timex3.class)) { + Time time = new Time(jCas); + time.setBegin(timex3.getBegin()); + time.setEnd(timex3.getEnd()); + time.setValue(timex3.getTimexType()); + time.setIdentifier(timex3.getTimexValue()); + time.addToIndexes(); + } } } - static class IOHandler implements HttpHandler { + private static class TypesystemHandler implements HttpHandler { @Override - public void handle(HttpExchange t) throws IOException { - try { - JSONObject rObject = new JSONObject(); - rObject.put("input", new JSONArray().put(Token.class.getName()).put(Sentence.class.getName())); - rObject.put("output", new JSONArray().put(Timex3.class.getName()).put(Time.class.getName())); - String response = rObject.toString(); - 
t.sendResponseHeaders(200, response.getBytes(Charset.defaultCharset()).length); - - OutputStream os = t.getResponseBody(); - os.write(response.getBytes(Charset.defaultCharset())); + public void handle(HttpExchange exchange) throws IOException { + if (!"GET".equalsIgnoreCase(exchange.getRequestMethod())) { + methodNotAllowed(exchange); + return; + } - } catch (JSONException e) { + try { + TypeSystemDescription description = TypeSystemDescriptionFactory.createTypeSystemDescription(); + StringWriter writer = new StringWriter(); + description.toXML(writer); + writeText(exchange, 200, "application/xml; charset=utf-8", writer.toString()); + } catch (ResourceInitializationException | SAXException e) { e.printStackTrace(); - t.sendResponseHeaders(404, -1); + writeText(exchange, 500, "text/plain; charset=utf-8", "Could not create type system: " + e.getMessage()); + } + } + } + + private static class IOHandler implements HttpHandler { + @Override + public void handle(HttpExchange exchange) throws IOException { + if (!"GET".equalsIgnoreCase(exchange.getRequestMethod())) { + methodNotAllowed(exchange); return; - } finally { - t.getResponseBody().close(); } + JSONObject response = new JSONObject(); + response.put("input", new JSONArray() + .put(Token.class.getName()) + .put(Sentence.class.getName())); + response.put("output", new JSONArray() + .put(Timex3.class.getName()) + .put(Time.class.getName())); + + writeText(exchange, 200, "application/json; charset=utf-8", response.toString()); } } - static class CommunicationLayer implements HttpHandler { + private static class CommunicationLayerHandler implements HttpHandler { @Override - public void handle(HttpExchange t) throws IOException { - String response = "serial = luajava.bindClass(\"org.apache.uima.cas.impl.XmiCasSerializer\")\n" + - "deserial = luajava.bindClass(\"org.apache.uima.cas.impl.XmiCasDeserializer\")" + - "function serialize(inputCas,outputStream,params)\n" + - " 
serial:serialize(inputCas:getCas(),outputStream)\n" + - "end\n" + - "\n" + - "function deserialize(inputCas,inputStream)\n" + - " inputCas:reset()\n" + - " deserial:deserialize(inputStream,inputCas:getCas(),true)\n" + - "end"; - t.sendResponseHeaders(200, response.length()); - OutputStream os = t.getResponseBody(); - os.write(response.getBytes()); - os.close(); + public void handle(HttpExchange exchange) throws IOException { + if (!"GET".equalsIgnoreCase(exchange.getRequestMethod())) { + methodNotAllowed(exchange); + return; + } + + try (InputStream inputStream = HeidelTimeExt.class.getResourceAsStream(COMMUNICATION_LAYER_RESOURCE)) { + if (inputStream == null) { + writeText(exchange, 500, "text/plain; charset=utf-8", "Missing resource: " + COMMUNICATION_LAYER_RESOURCE); + return; + } + byte[] response = inputStream.readAllBytes(); + writeResponse(exchange, 200, "text/plain; charset=utf-8", response); + } } } } diff --git a/duui-HeidelTimeExt/src/main/resources/communication_layer.lua b/duui-HeidelTimeExt/src/main/resources/communication_layer.lua new file mode 100644 index 00000000..551b8246 --- /dev/null +++ b/duui-HeidelTimeExt/src/main/resources/communication_layer.lua @@ -0,0 +1,17 @@ +-- DUUI communication layer for HeidelTimeExt. +-- This component uses the classic serialize/deserialize mode and transfers the CAS as XMI. 
+ +serial = luajava.bindClass("org.apache.uima.cas.impl.XmiCasSerializer") +deserial = luajava.bindClass("org.apache.uima.cas.impl.XmiCasDeserializer") + +SUPPORTS_PROCESS = false +SUPPORTS_SERIALIZE = true + +function serialize(inputCas, outputStream, params) + serial:serialize(inputCas:getCas(), outputStream) +end + +function deserialize(inputCas, inputStream) + inputCas:reset() + deserial:deserialize(inputStream, inputCas:getCas(), true) +end diff --git a/duui-HeidelTimeExt/src/main/test/java/org/texttechnology/tools/HeidelTimeExtTest.java b/duui-HeidelTimeExt/src/main/test/java/org/texttechnology/tools/HeidelTimeExtTest.java new file mode 100644 index 00000000..aa75c886 --- /dev/null +++ b/duui-HeidelTimeExt/src/main/test/java/org/texttechnology/tools/HeidelTimeExtTest.java @@ -0,0 +1,213 @@ +package org.texttechnology.tools; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import de.unihd.dbs.uima.types.heideltime.Timex3; +import org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.apache.uima.util.XmlCasSerializer; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.Assumptions; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.texttechnologylab.DockerUnifiedUIMAInterface.DUUIComposer; +import org.texttechnologylab.DockerUnifiedUIMAInterface.driver.DUUIRemoteDriver; +import org.texttechnologylab.DockerUnifiedUIMAInterface.lua.DUUILuaContext; +import org.texttechnologylab.annotation.type.Time; +import org.xml.sax.SAXException; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.net.HttpURLConnection; +import java.net.URL; +import java.net.UnknownHostException; +import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import 
java.util.Collection; +import java.util.List; + +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class HeidelTimeExtTest { + + static DUUIComposer composer; + static JCas cas; + static String url = getenvOrDefault("HEIDELTIME_EXT_URL", "http://127.0.0.1:9714"); + + @BeforeAll + static void beforeAll() throws Exception { + composer = new DUUIComposer() + .withSkipVerification(true) + .withLuaContext(new DUUILuaContext().withJsonLibrary()); + + DUUIRemoteDriver remoteDriver = new DUUIRemoteDriver(); + composer.addDriver(remoteDriver); + + cas = JCasFactory.createJCas(); + } + + @AfterEach + public void afterEach() throws IOException, SAXException { + composer.resetPipeline(); + + if (cas != null) { + ByteArrayOutputStream stream = new ByteArrayOutputStream(); + XmlCasSerializer.serialize(cas.getCas(), null, stream); + System.out.println(stream.toString(StandardCharsets.UTF_8)); + cas.reset(); + } + } + + @AfterAll + static void afterAll() throws UnknownHostException { + if (composer != null) { + composer.shutdown(); + } + } + + @Test + public void CommunicationLayerTest() throws Exception { + Assumptions.assumeTrue( + serviceAvailable(url), + "Skipping test because no HeidelTimeExt DUUI service is reachable at " + url + ); + + String communicationLayer = httpGet(url + "/v1/communication_layer"); + + assertTrue(communicationLayer.contains("SUPPORTS_SERIALIZE = true")); + assertTrue(communicationLayer.contains("function serialize")); + assertTrue(communicationLayer.contains("function deserialize")); + } + + @Test + public void HeidelTimeExtRemoteTest() throws Exception { + Assumptions.assumeTrue( + serviceAvailable(url), + "Skipping test because no HeidelTimeExt DUUI service is reachable at " + url + ); + + composer.add(new DUUIRemoteDriver.Component(url)); + + createCas( + "de", + Arrays.asList( + "Am 12. 
Mai 2024 begann die Exkursion.", + "Nach zwei Tagen wurden weitere Proben gesammelt.", + "Morgen um 14 Uhr soll ein weiteres Treffen stattfinden." + ) + ); + + composer.run(cas); + + Collection timexAnnotations = JCasUtil.select(cas, Timex3.class); + Collection