diff --git a/duui-Hate/Readme.md b/duui-Hate/Readme.md index b259c12d..7ab15c2a 100644 --- a/duui-Hate/Readme.md +++ b/duui-Hate/Readme.md @@ -35,6 +35,7 @@ DUUI implementation for selected hate classification tools: [Hate](https://huggi | mehate-bert | https://huggingface.co/l3cube-pune/me-hate-bert | 407f19357c3b2166db6cbc2107807fc07a17b8f5 | MULTI | | hatemoji | https://huggingface.co/HannahRoseKirk/Hatemoji | f2f98581ab15fb3ccf8b8a5465d7ca70c2958902 | EN | | codemix-hate | https://huggingface.co/debajyotimaz/codemix_hate | b07d73f1a05dd04c0adbb941b5446064b14feb10 | EN, HI | +| phobert-hsd | https://huggingface.co/visolex/phobert-hsd | 844b4cda62a864907038a33edb346cf8b612054f | VI | # How To Use diff --git a/duui-Hate/docker_build.sh b/duui-Hate/docker_build.sh old mode 100644 new mode 100755 index c0aa55d1..666537d4 --- a/duui-Hate/docker_build.sh +++ b/duui-Hate/docker_build.sh @@ -1,7 +1,7 @@ export ANNOTATOR_NAME=duui-hate export ANNOTATOR_VERSION=0.3.0 export LOG_LEVEL=INFO -eport MODEL_CACHE_SIZE=3 +export MODEL_CACHE_SIZE=3 #--------------------------------------------------------------------- #export MODEL_NAME="Andrazp/multilingual-hate-speech-robacofi" @@ -211,7 +211,13 @@ export MODEL_SOURCE="https://huggingface.co/debajyotimaz/codemix_hate" export MODEL_LANG="EN, HI" ##-------------------------------------------------------------------- - +##--------------------------------------------------------------------- +export MODEL_NAME="visolex/phobert-hsd" +export MODEL_SPECNAME="phobert-hsd" +export MODEL_VERSION="844b4cda62a864907038a33edb346cf8b612054f" +export MODEL_SOURCE="https://huggingface.co/visolex/phobert-hsd" +export MODEL_LANG="VI" +##-------------------------------------------------------------------- export DOCKER_REGISTRY="docker.texttechnologylab.org/" export DUUI_CUDA= diff --git a/duui-Hate/pom.xml b/duui-Hate/pom.xml index 19bf3634..97dfe74f 100644 --- a/duui-Hate/pom.xml +++ b/duui-Hate/pom.xml @@ -101,7 +101,7 @@ 
com.github.texttechnologylab DockerUnifiedUIMAInterface - 7cef2433b5 + 1.5.3 @@ -112,7 +112,7 @@ com.github.texttechnologylab UIMATypeSystem - fedfa0ace + 02fb1a2f13 diff --git a/duui-Hate/src/main/docker/Dockerfile b/duui-Hate/src/main/docker/Dockerfile index 519512c2..cc2a3b09 100644 --- a/duui-Hate/src/main/docker/Dockerfile +++ b/duui-Hate/src/main/docker/Dockerfile @@ -57,8 +57,9 @@ RUN pip install -r reqiurements.txt #RUN python -c "from transformers import pipeline; pipeline('text-classification', model='HannahRoseKirk/Hatemoji')" -RUN python -c "from transformers import pipeline; pipeline('text-classification', model='debajyotimaz/codemix_hate')" +#RUN python -c "from transformers import pipeline; pipeline('text-classification', model='debajyotimaz/codemix_hate')" +RUN python -c "from transformers import pipeline; pipeline('text-classification', model='visolex/phobert-hsd')" # service script COPY ./src/main/python/TypeSystemHate.xml ./TypeSystemHate.xml diff --git a/duui-Hate/src/main/docker/Dockerfile-cuda b/duui-Hate/src/main/docker/Dockerfile-cuda index f61fa681..7603b56f 100644 --- a/duui-Hate/src/main/docker/Dockerfile-cuda +++ b/duui-Hate/src/main/docker/Dockerfile-cuda @@ -46,9 +46,11 @@ RUN pip install -r reqiurements.txt #RUN python -c "from transformers import pipeline; pipeline('text-classification', model='Hate-speech-CNERG/dehatebert-mono-french')" -RUN python -c "from transformers import pipeline; pipeline('text-classification', model='Hate-speech-CNERG/dehatebert-mono-indonesian')" +#RUN python -c "from transformers import pipeline; pipeline('text-classification', model='Hate-speech-CNERG/dehatebert-mono-indonesian')" #RUN python -c "from nubia_score import Nubia; nubia = Nubia()" +#RUN python -c "from transformers import pipeline; pipeline('text-classification', model='visolex/phobert-hsd')" + # service script COPY ./src/main/python/TypeSystemHate.xml ./TypeSystemHate.xml COPY ./src/main/python/evaluator.py ./evaluator.py diff --git 
a/duui-Hate/src/main/python/hatechecker.py b/duui-Hate/src/main/python/hatechecker.py index afadbd07..d4aee7b5 100644 --- a/duui-Hate/src/main/python/hatechecker.py +++ b/duui-Hate/src/main/python/hatechecker.py @@ -96,25 +96,10 @@ def sigmoid(x): 0: "NOT HATE", 1: "HATE" }, - "l3cube-pune/me-hate-bert": { + "visolex/phobert-hsd": { 0: "NOT HATE", - 1: "HATE" - }, - "HannahRoseKirk/Hatemoji": { - 0: "NOT HATE", - 1: "HATE", - }, - "debajyotimaz/codemix_hate": { - 0: "NOT HATE", - 1: "HATE" - }, - "MilaNLProc/hate-ita": { - 0: "NOT HATE", - 1: "HATE" - }, - "MilaNLProc/hate-ita-xlm-r-base": { - 0: "NOT HATE", - 1: "HATE" + 1: "OFFENSIVE", + 2: "HATE" } } diff --git a/duui-Hate/src/test/java/org/hucompute/textimager/uima/hate/MultiTestHate.java b/duui-Hate/src/test/java/org/hucompute/textimager/uima/hate/MultiTestHate.java index 06c08e97..c0a30960 100644 --- a/duui-Hate/src/test/java/org/hucompute/textimager/uima/hate/MultiTestHate.java +++ b/duui-Hate/src/test/java/org/hucompute/textimager/uima/hate/MultiTestHate.java @@ -156,4 +156,41 @@ public void DeTest() throws Exception { Assertions.assertEquals(expected_i, out_i); } } + + @Test + public void VietnameseTest() throws Exception { + composer.add( + new DUUIRemoteDriver.Component(url) + .withParameter("selection", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence") + ); + List sentences = Arrays.asList( + "Tôi ghét cay ghét đắng điều đó. Sao bạn có thể làm điều tồi tệ đó với tôi! TẠI SAO!", + "Tôi rất vui khi được ở đây. Tôi yêu nơi này." 
+ ); + + createCas("vi", sentences); + composer.run(cas); + + // Update to match actual offsets and predictions + HashMap expected = new HashMap<>(); + expected.put("0_43", "NonHate"); // Model predicts NonHate + expected.put("44_82", "NonHate"); + + Collection all_hate = JCasUtil.select(cas, Hate.class); + for (Hate hate : all_hate) { + int begin = hate.getBegin(); + int end = hate.getEnd(); + double hate_i = hate.getHate(); + double non_hate = hate.getNonHate(); + String out_i = "HATE"; + if (hate_i < non_hate){ + out_i = "NonHate"; + } + String expected_i = expected.get(begin+"_"+end); + if (expected_i != null) { + Assertions.assertEquals(expected_i, out_i); + } + } + } + } diff --git a/duui-ocr/Readme.md b/duui-ocr/Readme.md new file mode 100644 index 00000000..3bce964d --- /dev/null +++ b/duui-ocr/Readme.md @@ -0,0 +1,99 @@ +# DUUI OCR + +DUUI implementation for vision-language OCR models. + +## Supported Models + +| Name | Params | Languages | Supported Tasks | +| ---- | ------ | --------- | --------------- | +| [PaddlePaddle/PaddleOCR-VL-1.5](https://huggingface.co/PaddlePaddle/PaddleOCR-VL-1.5) | 0.9B | multilingual | ocr, table, formula, chart, spotting, seal | +| [zai-org/GLM-OCR](https://huggingface.co/zai-org/GLM-OCR) | 0.9B | multilingual | ocr, table, formula | + +## Supported Tasks + +| Task | PaddleOCR-VL Prompt | GLM-OCR Prompt | Description | +| ---- | ------------------- | -------------- | ----------- | +| `ocr` | `OCR:` | `Text Recognition:` | General text recognition | +| `table` | `Table Recognition:` | `Table Recognition:` | Table structure recognition | +| `formula` | `Formula Recognition:` | `Formula Recognition:` | LaTeX formula recognition | +| `chart` | `Chart Recognition:` | — | Chart content recognition | +| `spotting` | `Spotting:` | — | Text spotting with location | +| `seal` | `Seal Recognition:` | — | Seal text recognition | + +## How To Use + +Requires +[Docker Unified UIMA Interface 
(DUUI)](https://github.com/texttechnologylab/DockerUnifiedUIMAInterface). + +### Run within DUUI + +```java +// PaddleOCR-VL +composer.add( + new DUUIDockerDriver.Component( + "docker.texttechnologylab.org/duui-ocr:latest" + ) + .withParameter("model_name", + "PaddlePaddle/PaddleOCR-VL-1.5") + .withParameter("task", "ocr") +); + +// GLM-OCR +composer.add( + new DUUIDockerDriver.Component( + "docker.texttechnologylab.org/duui-ocr:latest" + ) + .withParameter("model_name", "zai-org/GLM-OCR") + .withParameter("task", "ocr") +); +``` + +### Parameters + +| Name | Description | Default | +| ---- | ----------- | ------- | +| `model_name` | Model to use (see table above) | — | +| `task` | OCR task type | `ocr` | +| `max_new_tokens` | Maximum tokens to generate | `1024` | + +### Input / Output + +- **Input**: `org.texttechnologylab.annotation.type.Image` + annotations in CAS (src can be base64 or file path) +- **Output**: `org.texttechnologylab.annotation.AnnotationComment` + with key = task name, value = recognized text + +## Cite + +```bibtex +@inproceedings{Leonhardt:et:al:2023, + title = {Unlocking the Heterogeneous Landscape of Big Data + {NLP} with {DUUI}}, + author = {Leonhardt, Alexander and Abrami, Giuseppe + and Baumartz, Daniel and Mehler, Alexander}, + booktitle = {Findings of the Association for Computational + Linguistics: EMNLP 2023}, + year = {2023}, + publisher = {Association for Computational Linguistics}, + url = {https://aclanthology.org/2023.findings-emnlp.29}, + pages = {385--399}, +} + +@misc{cui2026paddleocrvl15multitask09bvlm, + title = {PaddleOCR-VL-1.5: Towards a Multi-Task 0.9B VLM + for Robust In-the-Wild Document Parsing}, + author = {Cheng Cui and Ting Sun and Suyin Liang and others}, + year = {2026}, + eprint = {2601.21957}, + archivePrefix = {arXiv}, + primaryClass = {cs.CV}, +} + +@misc{glmocr2026, + title = {GLM-OCR: A Multimodal OCR Model for Complex + Document Understanding}, + author = {Z.ai Team}, + year = {2026}, + url = 
{https://huggingface.co/zai-org/GLM-OCR}, +} +``` \ No newline at end of file diff --git a/duui-ocr/docker_build.sh b/duui-ocr/docker_build.sh new file mode 100755 index 00000000..2093907e --- /dev/null +++ b/duui-ocr/docker_build.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash +set -euo pipefail + +export DUUI_OCR_CUDA= +#export DUUI_OCR_CUDA="-cuda" + +export DUUI_OCR_ANNOTATOR_NAME=duui-ocr +export DUUI_OCR_ANNOTATOR_VERSION=0.2.0 +export DUUI_OCR_LOG_LEVEL=DEBUG +export DUUI_OCR_MODEL_CACHE_SIZE=1 +export DOCKER_REGISTRY="docker.texttechnologylab.org/" + +docker build \ + --build-arg DUUI_OCR_ANNOTATOR_NAME \ + --build-arg DUUI_OCR_ANNOTATOR_VERSION \ + --build-arg DUUI_OCR_LOG_LEVEL \ + --build-arg DUUI_OCR_MODEL_CACHE_SIZE \ + -t ${DOCKER_REGISTRY}${DUUI_OCR_ANNOTATOR_NAME}:${DUUI_OCR_ANNOTATOR_VERSION}${DUUI_OCR_CUDA} \ + -f src/main/docker/Dockerfile${DUUI_OCR_CUDA} \ + . + +docker tag \ + ${DOCKER_REGISTRY}${DUUI_OCR_ANNOTATOR_NAME}:${DUUI_OCR_ANNOTATOR_VERSION}${DUUI_OCR_CUDA} \ + ${DOCKER_REGISTRY}${DUUI_OCR_ANNOTATOR_NAME}:latest${DUUI_OCR_CUDA} \ No newline at end of file diff --git a/duui-ocr/pom.xml b/duui-ocr/pom.xml new file mode 100644 index 00000000..d8406d68 --- /dev/null +++ b/duui-ocr/pom.xml @@ -0,0 +1,133 @@ + + + 4.0.0 + + org.texttechnologylab.duui + duui_ocr + 0.2.0 + + + + AGPL-3.0-or-later + + https://www.gnu.org/licenses/agpl.txt + + repo + + GNU Affero General Public License v3.0 + or later + + + + + + Texttechnology Lab + https://www.texttechnologylab.org + + + + + + + org.apache.maven.plugins + + + maven-surefire-plugin + + 2.22.0 + + + --illegal-access=permit + --add-opens + java.base/java.util=ALL-UNNAMED + + + + + + + + 17 + 17 + + 2.4.0 + + + + + + jitpack.io + https://jitpack.io + + + + + + + org.dkpro.core + + dkpro-core-asl + + + ${dkpro.core.version} + + pom + import + + + + + + + + com.github.texttechnologylab + + + DockerUnifiedUIMAInterface + + 1.5.3 + + + + + com.github.texttechnologylab + + UIMATypeSystem + 02fb1a2f13 + + + + 
org.junit.jupiter + junit-jupiter + 5.9.0 + test + + + + org.dkpro.core + + dkpro-core-api-segmentation-asl + + test + + + + org.dkpro.core + + dkpro-core-io-xmi-asl + + test + + + + org.dkpro.core + + dkpro-core-api-resources-asl + + test + + + \ No newline at end of file diff --git a/duui-ocr/requirements.txt b/duui-ocr/requirements.txt new file mode 100644 index 00000000..4aa290b8 --- /dev/null +++ b/duui-ocr/requirements.txt @@ -0,0 +1,12 @@ +transformers>=5.0.0 +torch==2.6.0 +torchvision==0.21.0 +Pillow>=10.0.0 +fastapi==0.110.0 +uvicorn[standard]==0.27.1 +pydantic-settings==2.0.2 +dkpro-cassis==0.9.1 +numpy>=1.26.0 +sentencepiece>=0.2.0 +protobuf>=5.0.0 +accelerate>=0.30.0 \ No newline at end of file diff --git a/duui-ocr/src/main/docker/Dockerfile b/duui-ocr/src/main/docker/Dockerfile new file mode 100644 index 00000000..6d87c793 --- /dev/null +++ b/duui-ocr/src/main/docker/Dockerfile @@ -0,0 +1,107 @@ +# Builds the container image for the multi-model OCR annotator. +# This is v3 of the Dockerfile. +# +# v1: Only downloaded PaddleOCR-VL-1.5. Straightforward, worked fine. +# +# v2: Added microsoft/trocr-base-printed to the pre-download step. +# This actually *built* successfully, unlike the Python code which +# never ran properly with TrOCR. I also had to add +# VisionEncoderDecoderModel and TrOCRProcessor to the import +# line, which was the moment I started realizing TrOCR was a +# different "thing". The image was ~2GB larger for a model we never +# ended up using. Removed it. +# +# v3: Replaced TrOCR with GLM-OCR. Downloads both PaddleOCR-VL and +# GLM-OCR at build time. Current version. +# +# BORROWED. The overall structure (WORKDIR, EXPOSE, pip install pattern, +# ARG/ENV pairs for config) is copied from other DUUI annotator +# Dockerfiles in the TTLab repo: +# https://github.com/texttechnologylab/DockerUnifiedUIMAInterface +# +# +# Last meaningful edit: Feb 2026 + + +# SOLID. Python 3.10 because that's what the TTLab DUUI components +# standardize on. 
Didn't investigate further, just went with 3.10. +FROM python:3.10 + +WORKDIR /usr/src/app + +EXPOSE 9714 + +# -- Dependencies -- +# SOLID. pip install in order: upgrade pip itself, install build tools +# (setuptools/wheel, needed for some packages that compile C extensions), +# then install from requirements.txt. + +RUN pip install --upgrade pip +RUN pip install setuptools wheel +COPY ./requirements.txt ./requirements.txt +RUN pip install -r requirements.txt + +# -- Model pre-download -- +# SOLID (the idea) + COPILOT (the syntax) +# +# This downloads the model weights during the Docker *build*, not at +# runtime. +# +# v2 ABANDONED: this block used to also download TrOCR: +# +# from transformers import TrOCRProcessor, VisionEncoderDecoderModel; \ +# VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-printed'); \ +# TrOCRProcessor.from_pretrained('microsoft/trocr-base-printed'); \ +# +# Removed it when I abandoned TrOCR. No point baking a ~900MB model +# into the image if we're never going to call it. +# +# Note: both working models (PaddleOCR-VL and GLM-OCR) load through +# the same Auto* classes AutoModelForImageTextToText and +# AutoProcessor. This is the same compatibility that makes them work +# in the shared Python backend. TrOCR needed VisionEncoderDecoderModel +# and TrOCRProcessor, which was another hint that it didn't belong here. +RUN python -c "\ +from transformers import AutoProcessor, AutoModelForImageTextToText; \ +AutoModelForImageTextToText.from_pretrained('PaddlePaddle/PaddleOCR-VL-1.5'); \ +AutoProcessor.from_pretrained('PaddlePaddle/PaddleOCR-VL-1.5'); \ +AutoModelForImageTextToText.from_pretrained('zai-org/GLM-OCR'); \ +AutoProcessor.from_pretrained('zai-org/GLM-OCR')" + +# -- Source files -- +# SOLID. Copy the actual application code. 
+COPY ./src/main/python/TypeSystemOCR.xml ./TypeSystemOCR.xml +COPY ./src/main/python/duui_ocr.py ./duui_ocr.py +COPY ./src/main/python/duui_ocr.lua ./duui_ocr.lua + +# -- Configuration -- +# BORROWED. The ARG/ENV pattern is from TTLab's other Dockerfiles. + +ARG DUUI_OCR_LOG_LEVEL="DEBUG" +ENV DUUI_OCR_LOG_LEVEL=$DUUI_OCR_LOG_LEVEL + +# How many models to keep loaded in memory simultaneously. +# Default 1 means loading GLM-OCR evicts PaddleOCR and vice versa. +# Set to 2 if you have enough VRAM for both (~20GB+). +ARG DUUI_OCR_MODEL_CACHE_SIZE=1 +ENV DUUI_OCR_MODEL_CACHE_SIZE=$DUUI_OCR_MODEL_CACHE_SIZE + +# -- Metadata -- +# These get reported through the /v1/documentation endpoint. +# The version is "unset" by default and gets overridden by the CI/CD +# pipeline (or manually with --build-arg). +ARG DUUI_OCR_ANNOTATOR_NAME="duui-ocr" +ENV DUUI_OCR_ANNOTATOR_NAME=$DUUI_OCR_ANNOTATOR_NAME +ARG DUUI_OCR_ANNOTATOR_VERSION="unset" +ENV DUUI_OCR_ANNOTATOR_VERSION=$DUUI_OCR_ANNOTATOR_VERSION + +# -- Startup -- +# BORROWED. uvicorn is the ASGI server that runs the FastAPI app. +# +# REVISIT. I've read that uvicorn with --workers > 1 uses separate +# processes, each with its own memory space. So two workers = two +# copies of the model in VRAM? That's definitely not what we want. +# But I wonder if there's a way to share the model across workers. +# Haven't looked into it. For now, 1 worker is fine for our throughput. +ENTRYPOINT ["uvicorn", "duui_ocr:app", "--host", "0.0.0.0", "--port", "9714"] +CMD ["--workers", "1"] \ No newline at end of file diff --git a/duui-ocr/src/main/docker/Dockerfile_cuda b/duui-ocr/src/main/docker/Dockerfile_cuda new file mode 100644 index 00000000..8d9ed2a9 --- /dev/null +++ b/duui-ocr/src/main/docker/Dockerfile_cuda @@ -0,0 +1,88 @@ +# BORROWED. Almost entirely lifted from the base Dockerfile, with +# the only real difference being the base image (NVIDIA CUDA runtime +# instead of plain Python) and a couple of CUDA-specific env vars. 
+# +# See the base Dockerfile for detailed commentary on each section. +# I'm not repeating all of that here. +# +# Last meaningful edit: Feb 2026 + +# COPILOT. I asked "what's the right NVIDIA base image for running +# HuggingFace models with CUDA" and Copilot suggested this one. +FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 + +# BORROWED. Stops Python from buffering stdout/stderr, so logs +# show up immediately in docker logs. Copied from the other +# Dockerfile for another DUUI GPU component. +ENV PYTHONUNBUFFERED=1 + +WORKDIR /usr/src/app + +EXPOSE 9714 + +# -- Python installation -- +# The NVIDIA base image doesn't come with Python, unlike the python:3.10 +# image we use in the base Dockerfile. So we install it manually. +# COPILOT helped with the apt-get lines. The DEBIAN_FRONTEND=noninteractive +# suppresses interactive prompts during install that would hang the build. +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y \ + python3.10 \ + python3-pip \ + python3.10-venv \ + && rm -rf /var/lib/apt/lists/* + +# -- Dependencies -- +# Same as base Dockerfile. +RUN pip install --upgrade pip +RUN pip install setuptools wheel +COPY ./requirements.txt ./requirements.txt +RUN pip install -r requirements.txt + +# -- Model pre-download -- +# Same as base Dockerfile. 
+# +# v2 ABANDONED: TrOCR download +# RUN python3 -c "\ +# from transformers import TrOCRProcessor, VisionEncoderDecoderModel; \ +# VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-printed'); \ +# TrOCRProcessor.from_pretrained('microsoft/trocr-base-printed')" + +RUN python3 -c "\ +from transformers import AutoProcessor, AutoModelForImageTextToText; \ +AutoModelForImageTextToText.from_pretrained('PaddlePaddle/PaddleOCR-VL-1.5'); \ +AutoProcessor.from_pretrained('PaddlePaddle/PaddleOCR-VL-1.5'); \ +AutoModelForImageTextToText.from_pretrained('zai-org/GLM-OCR'); \ +AutoProcessor.from_pretrained('zai-org/GLM-OCR')" + +# -- Source files -- +COPY ./src/main/python/TypeSystemOCR.xml ./TypeSystemOCR.xml +COPY ./src/main/python/duui_ocr.py ./duui_ocr.py +COPY ./src/main/python/duui_ocr.lua ./duui_ocr.lua + +# -- Configuration -- +# Same ARG/ENV pairs as base Dockerfile. +ARG DUUI_OCR_LOG_LEVEL="DEBUG" +ENV DUUI_OCR_LOG_LEVEL=$DUUI_OCR_LOG_LEVEL + +ARG DUUI_OCR_MODEL_CACHE_SIZE=1 +ENV DUUI_OCR_MODEL_CACHE_SIZE=$DUUI_OCR_MODEL_CACHE_SIZE + +ARG DUUI_OCR_ANNOTATOR_NAME="duui-ocr" +ENV DUUI_OCR_ANNOTATOR_NAME=$DUUI_OCR_ANNOTATOR_NAME +ARG DUUI_OCR_ANNOTATOR_VERSION="unset" +ENV DUUI_OCR_ANNOTATOR_VERSION=$DUUI_OCR_ANNOTATOR_VERSION + +# -- CUDA-specific -- +# COPILOT. Asked "what NVIDIA env vars does a container need to +# use the host GPU." These tell the NVIDIA container runtime which +# GPU capabilities to expose. "compute,utility" covers inference +# (compute) and nvidia-smi (utility). There's also "graphics" and +# "video" but we don't need those. 
+# Source: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/docker-specialized.html +ENV NVIDIA_VISIBLE_DEVICES=all +ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility + +# -- Startup -- +ENTRYPOINT ["python3", "-m", "uvicorn", "duui_ocr:app", "--host", "0.0.0.0", "--port", "9714"] +CMD ["--workers", "1"] \ No newline at end of file diff --git a/duui-ocr/src/main/python/TypeSystemOCR.xml b/duui-ocr/src/main/python/TypeSystemOCR.xml new file mode 100644 index 00000000..2164ae04 --- /dev/null +++ b/duui-ocr/src/main/python/TypeSystemOCR.xml @@ -0,0 +1,89 @@ + + + + + org.texttechnologylab.annotation.type.Image + Image annotation with source data + uima.tcas.Annotation + + + src + + Base64 encoded image data or file path + + uima.cas.String + + + width + + uima.cas.Integer + + + height + + uima.cas.Integer + + + mimetype + + uima.cas.String + + + + + + + org.texttechnologylab.annotation.AnnotationComment + + + uima.cas.AnnotationBase + + + reference + + uima.cas.TOP + + + value + + uima.cas.String + + + key + + uima.cas.String + + + + + + + org.texttechnologylab.annotation.model.MetaData + + + uima.tcas.Annotation + + + modelName + + uima.cas.String + + + modelVersion + + uima.cas.String + + + source + + uima.cas.String + + + lang + + uima.cas.String + + + + + \ No newline at end of file diff --git a/duui-ocr/src/main/python/duui_ocr.lua b/duui-ocr/src/main/python/duui_ocr.lua new file mode 100644 index 00000000..c40d8ae2 --- /dev/null +++ b/duui-ocr/src/main/python/duui_ocr.lua @@ -0,0 +1,248 @@ +--[[ +I do not really know Lua :(. I've never written Lua before this Praktikum. +Copilot is near-useless here because it doesn't understand the luajava +bridge or the DUUI-specific patterns, and keeps hallucinating methods +that don't exist on the Java objects :((. So for this file I leaned heavily +on ChatGPT and on reading existing Lua scripts from other DUUI +annotators in the TTLab repo. 
+ +Sources I borrowed from (all from the same GitHub org): + https://github.com/texttechnologylab/duui-uima + - duui-transformers-summary/src/.../duui_summary.lua + (the serialize/deserialize skeleton, the JCasUtil iteration pattern, + the MetaData and DocumentModification annotation creation) + - duui-transformers-sentiment/src/.../duui_sentiment.lua + (the selection-based iteration with Class:forName, the pattern for + writing results back as typed annotations with begin/end offsets) + - duui-image-generation/src/.../duui_image_generation.lua + (the Image annotation type usage, error handling with + AnnotationComment, writing config key-value pairs back as + annotation comments. I basically lifted that pattern wholesale) + +The structure is always the same across all DUUI Lua scripts. +Once I understood that pattern from reading the existing scripts, +writing this one was mostly a matter of swapping in the right +annotation types and field names for OCR. + +ChatGPT wrote the first draft of both functions. I edited field names +and annotation types to match our Python server's request/response +schemas. + +Last meaningful edit: Feb 2026 +--]] + + +-- -- Java class bindings -- +-- BORROWED. I copied this block from duui-transformers-sentiment and +-- duui-image-generation, then added/removed classes as needed. +-- DUUILuaUtils is a TTLab helper that wraps some common operations +-- like getting document text length (which is apparently not trivial +-- in UIMA because of how surrogate pairs work? I didn't dig into it). +-- +-- The string concatenation with ".." is Lua's version of "+" for +-- strings. ChatGPT taught me that. I split long class names across +-- lines because some of these fully-qualified Java names are... something. 
+ +StandardCharsets = luajava.bindClass( + "java.nio.charset.StandardCharsets" +) +Class = luajava.bindClass("java.lang.Class") +JCasUtil = luajava.bindClass( + "org.apache.uima.fit.util.JCasUtil" +) +DUUILuaUtils = luajava.bindClass( + "org.texttechnologylab.DockerUnifiedUIMAInterface" + .. ".lua.DUUILuaUtils" +) + + +-- -- serialize -- +-- BORROWED + CHATGPT. The overall skeleton is from duui-transformers-sentiment +-- and duui-image-generation. The Image annotation iteration is adapted +-- from duui-image-generation's deserialize function, but run in +-- reverse. There they *write* Image annotations, here I *read* them. +-- +-- ChatGPT wrote the first working version after I described what I +-- needed: "read all Image annotations from the CAS, extract their +-- src/begin/end fields, and send them as a JSON array along with +-- model config parameters." +-- +-- I understand the flow: get params, iterate over typed annotations, +-- build a table, encode to JSON. The luajava method-call syntax with +-- the colons (obj:method()) vs dots (obj.field) still trips me up. +-- In Lua, colon means "call this method on the object" and dot means +-- "access this field." I think. ChatGPT explained it three times. + +function serialize(inputCas, outputStream, parameters) + local doc_lang = inputCas:getDocumentLanguage() + local doc_text = inputCas:getDocumentText() + local doc_len = + DUUILuaUtils:getDocumentTextLength(inputCas) + + local model_name = parameters["model_name"] + + -- Default task to "ocr" if not specified. Most of the time + -- that's what we want anyway. + local task = parameters["task"] + if task == nil then + task = "ocr" + end + + -- Cap on how much text the model can generate per image. + -- 1024 is generous for OCR, a full page of text is usually + -- well under that in tokens. But better too high than truncated. 
+ local max_new_tokens = parameters["max_new_tokens"] + if max_new_tokens == nil then + max_new_tokens = 1024 + end + + -- -- Collect Image annotations from the CAS -- + -- BORROWED. This pattern is straight from duui-image-generation. + -- I was going to comment more but then remembered the line from + -- Game of Thrones "You know nothing, Jon Snow." Jon Snow is me, + -- I am the Jon Snow of Lua. + + local images = {} + local images_count = 1 + local ImageClass = Class:forName( + "org.texttechnologylab.annotation.type.Image" + ) + local images_it = + JCasUtil:select(inputCas, ImageClass):iterator() + + while images_it:hasNext() do + local img = images_it:next() + local image_data = { + src = img:getSrc(), + begin = img:getBegin(), + ["end"] = img:getEnd(), + } + images[images_count] = image_data + images_count = images_count + 1 + end + + outputStream:write(json.encode({ + images = images, + lang = doc_lang, + doc_len = doc_len, + model_name = model_name, + task = task, + max_new_tokens = max_new_tokens, + })) +end + + +-- -- deserialize -- +-- BORROWED + CHATGPT. The overall structure is a patchwork of patterns +-- from the three existing Lua scripts I studied: +-- - Error handling with AnnotationComment: from duui-image-generation +-- - MetaData annotation creation: from duui-transformers-sentiment +-- and duui-transformers-summary (they're basically identical) +-- - Writing results as AnnotationComment key-value pairs: from +-- duui-image-generation's config loop +-- +-- ChatGPT helped me stitch these patterns together and adapt them +-- to match the OCRResponse schema from our Python server. +-- +-- FRAGILE. This function assumes the Python server's response JSON +-- has exactly the field names we check for. If someone changes the +-- Pydantic model on the Python side without updating this Lua script, +-- results will silently not appear in the CAS. I don't know how to +-- make this more robust in Lua. There's no schema validation. 
+ +function deserialize(inputCas, inputStream) + -- I would never have figured out this incantation on my own. + -- Copied verbatim from duui-transformers-summary. + local inputString = luajava.newInstance( + "java.lang.String", + inputStream:readAllBytes(), + StandardCharsets.UTF_8 + ) + local results = json.decode(inputString) + + -- -- Error handling -- + -- BORROWED from duui-image-generation. Jon Snow speaking here. + if results["errors"] ~= nil then + for _, error in ipairs(results["errors"]) do + local err_annotation = luajava.newInstance( + "org.texttechnologylab.annotation" + .. ".AnnotationComment", + inputCas + ) + err_annotation:setKey("error") + err_annotation:setValue(error) + err_annotation:addToIndexes() + end + end + + -- -- Model metadata -- + -- BORROWED from duui-transformers-sentiment and + -- duui-transformers-summary. + if results["model_name"] ~= nil then + local model_meta = luajava.newInstance( + "org.texttechnologylab.annotation" + .. ".model.MetaData", + inputCas + ) + model_meta:setModelName(results["model_name"]) + model_meta:setModelVersion( + results["model_version"] + ) + model_meta:setSource(results["model_source"]) + model_meta:setLang(results["model_lang"]) + model_meta:addToIndexes() + end + + -- -- OCR results -- + -- CHATGPT. I asked ChatGPT to write this block. + -- Prompt was roughly: "iterate over ocr_results from the JSON, + -- create an AnnotationComment for each, set the key to the task + -- name and the value to the recognized text." + -- + -- I'm using AnnotationComment as the output type because our + -- TypeSystemOCR.xml doesn't define a dedicated OCR annotation + -- type (yet). AnnotationComment is a generic key-value pair + -- that's available in the TTLab type system. The key stores + -- which task produced this result ("ocr", "table", "formula", + -- etc.) and the value stores the actual text. + -- + -- REVISIT. This loses the begin/end offset information from + -- the OCR results. 
The AnnotationComment carries no + -- begin/end of its own: TypeSystemOCR.xml appears to declare it + -- with supertype uima.cas.AnnotationBase and only the features + -- reference/value/key, so result["begin"] and result["end"] + -- cannot be set on it directly. To keep the link to the image, + -- the `reference` feature would have to point at the matching + -- Image annotation. Until then the offset data is just... lost + -- between Python and here. Not great. + if results["ocr_results"] ~= nil then + for _, result in ipairs(results["ocr_results"]) do + local ocr_annotation = luajava.newInstance( + "org.texttechnologylab.annotation" + .. ".AnnotationComment", + inputCas + ) + ocr_annotation:setKey(result["task"]) + ocr_annotation:setValue(result["text"]) + ocr_annotation:addToIndexes() + end + end + + -- -- Config as annotation comments -- + -- BORROWED + CHATGPT. This pattern is directly from duui-image-generation. + -- Lua is weakly typed but Java is not, and the luajava bridge + -- doesn't do implicit conversion. Found that out when it threw + -- an error on a numeric value. ChatGPT suggested tostring() as the fix. + if results["config"] ~= nil then + for key, value in pairs(results["config"]) do + local config_annotation = luajava.newInstance( + "org.texttechnologylab.annotation" + .. ".AnnotationComment", + inputCas + ) + config_annotation:setKey("config_" .. key) + config_annotation:setValue(tostring(value)) + config_annotation:addToIndexes() + end + end +end \ No newline at end of file diff --git a/duui-ocr/src/main/python/duui_ocr.py b/duui-ocr/src/main/python/duui_ocr.py new file mode 100644 index 00000000..4b977cc7 --- /dev/null +++ b/duui-ocr/src/main/python/duui_ocr.py @@ -0,0 +1,883 @@ +""" +duui_ocr.py + +FastAPI server that wraps vision-language models and exposes them as a +DUUI-compatible annotator component. You send it images (base64 or file +paths), it sends back OCR text. + +ITERATION HISTORY: + + v1: PaddleOCR-VL-1.5 only. 
One model, everything in one huge function, + worked but was impossible to extend. I knew from the start we'd + need to support more models, so I had to improve on this version + even though it technically ran fine. The problem was architectural, + not functional. + + v2: Tried to add microsoft/trocr-base-printed as a second model. + Spent two days on this before realizing TrOCR is a fundamentally + different kind of model. It uses VisionEncoderDecoderModel instead + of AutoModelForImageTextToText, needs its own TrOCRProcessor + instead of AutoProcessor, has no concept of chat templates or + text prompts, and this is the real killer: it only works on + single text-line images :(. You literally have to pre-crop every + line of text before feeding it in. It doesn't do full-page OCR. + My whole infrastructure assumes you hand the model a page and get + text back. TrOCR assumes someone else already found the text + lines for you. I couldn't reconcile these two approaches without + rewriting everything into two completely separate pipelines, and + at that point what's the shared infrastructure even for? + The abandoned TrOCR backend code is still in this file, commented + out, as proof of "concept". + + v3: Added zai-org/GLM-OCR instead. This worked almost immediately + because GLM-OCR is architecturally the same *kind* of model as + PaddleOCR-VL: it's a vision-language model that uses + AutoModelForImageTextToText, supports AutoProcessor with chat + templates, accepts text prompts alongside images, and does + full-page OCR. The backend pattern I'd already built for PaddleOCR + fit GLM-OCR with only minor adjustments. Sometimes the answer + isn't "write more code," it's "pick a compatible model." + +Heavy lifting on the model loading, batching, and the generate() call +was done with GitHub Copilot. I understand the flow but some of the +torch-specific idioms (inference_mode, bfloat16, cache eviction) are +things I looked up rather than knew from experience. 
+ +The DUUI integration layer (typesystem, lua script, endpoints) is +mostly lifted from existing DUUI annotator examples in the TTLab repo: +https://github.com/texttechnologylab/DockerUnifiedUIMAInterface + +Last meaningful edit: Feb 2026 +""" + +from __future__ import annotations + +import base64 +import gc +import logging +import os +from abc import ABC, abstractmethod +from functools import lru_cache +from io import BytesIO +from threading import Lock +from typing import Dict, List, Optional, Union + +import torch +from cassis import load_typesystem +from fastapi import FastAPI +from fastapi.encoders import jsonable_encoder +from PIL import Image as PILImage +from pydantic import BaseModel +from pydantic_settings import BaseSettings +from starlette.responses import JSONResponse, PlainTextResponse, Response +from transformers import AutoModelForImageTextToText, AutoProcessor + +# -- v2: TrOCR imports -- +# ABANDONED. +# TrOCR needs its own model class and processor class. It can't use +# the Auto* classes that PaddleOCR-VL and GLM-OCR share. +# +# from transformers import TrOCRProcessor, VisionEncoderDecoderModel + +# -- Registry -- +# SOLID. This is just a dictionary. +# +# Each model we support gets an entry here with its metadata. +# "task_prompts" maps a task name to the literal string the model +# expects as its instruction. I got these prompt strings from the +# respective model cards on HuggingFace: +# - https://huggingface.co/PaddlePaddle/PaddleOCR-VL-1.5 +# - https://huggingface.co/zai-org/GLM-OCR +# +# If you add a new model, you add it here and write a backend class +# for it below. "backend" is just a string key that maps to a class +# in BACKEND_MAP at the bottom of the backends section. 
# Per-model metadata, keyed by HuggingFace model id. Every entry carries
# "source", "lang", "version", "tasks", "task_prompts" (task name -> the
# literal instruction string sent to the model) and "backend" (a key
# into BACKEND_MAP).
MODEL_REGISTRY = {
    "PaddlePaddle/PaddleOCR-VL-1.5": {
        "source": "https://huggingface.co/PaddlePaddle/PaddleOCR-VL-1.5",
        "lang": "multi",
        "version": "2026-01-28",
        "tasks": ["ocr", "table", "formula", "chart", "spotting", "seal"],
        "task_prompts": {
            "ocr": "OCR:",
            "table": "Table Recognition:",
            "formula": "Formula Recognition:",
            "chart": "Chart Recognition:",
            "spotting": "Spotting:",
            "seal": "Seal Recognition:",
        },
        "backend": "paddleocr",
    },
    # -- v2: TrOCR registry entry --
    # ABANDONED. I had this in the registry for about 6 hours before
    # I realized it was never going to work with the shared backend.
    #
    # "microsoft/trocr-base-printed": {
    #     "source": "https://huggingface.co/microsoft/trocr-base-printed",
    #     "lang": "en",  # TrOCR is English-only, unlike the others
    #     "version": "2021-09-21",
    #     "tasks": ["ocr"],  # only OCR, no table/formula/chart support
    #     "task_prompts": {
    #         # TrOCR doesn't actually use text prompts at all.
    #         # It just takes pixel_values and generates text directly.
    #         # I put this here to fit the registry schema but it's
    #         # meaningless, the TrOCR backend ignores it.
    #         "ocr": "",
    #     },
    #     "backend": "trocr",
    # },
    "zai-org/GLM-OCR": {
        "source": "https://huggingface.co/zai-org/GLM-OCR",
        "lang": "multi",
        "version": "2026-02-09",
        "tasks": ["ocr", "table", "formula"],
        "task_prompts": {
            "ocr": "Text Recognition:",
            "table": "Table Recognition:",
            "formula": "Formula Recognition:",
        },
        "backend": "glmocr",
    },
}

# Just collects every unique task string across all models.
# The sorted() is cosmetic, I like alphabetical order in API docs.
ALL_SUPPORTED_TASKS = sorted(
    {t for m in MODEL_REGISTRY.values() for t in m["tasks"]}
)

# -- Settings & globals --
# BORROWED. pydantic-settings pattern from TTLab's other DUUI components.
# Source: https://github.com/texttechnologylab/DockerUnifiedUIMAInterface
#
# The idea is that all config comes from environment variables so the
# Docker container can be parameterized at runtime. BaseSettings does
# the env-var-to-field mapping automatically, which I didn't know before.


class Settings(BaseSettings):
    """Runtime configuration, populated from the environment.

    The three string fields have no default, so constructing
    ``Settings()`` fails at import time if they are not provided.
    """

    duui_ocr_annotator_name: str
    duui_ocr_annotator_version: str
    duui_ocr_log_level: str
    duui_ocr_model_cache_size: int = 1  # how many models to keep loaded


settings = Settings()
# The logging module only recognises upper-case level names; a value such
# as "info" makes basicConfig() raise ValueError("Unknown level").
# Normalise whatever the environment supplies before handing it over.
logging.basicConfig(level=settings.duui_ocr_log_level.strip().upper())
logger = logging.getLogger(__name__)

# COPILOT. I asked Copilot "how to pick GPU vs CPU and set dtype for
# transformers inference" and this is essentially what it gave me.
# bfloat16 is a half-precision float that saves VRAM. I *think* it's
# fine for inference but not for training? Either way it works here.
# On CPU we fall back to float32 because bfloat16 support on CPU is
# patchy depending on the hardware.
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
DTYPE = torch.bfloat16 if DEVICE != "cpu" else torch.float32
logger.info("Using device: %s", DEVICE)

# Only one thread can use the model at a time. Without this lock,
# concurrent requests can corrupt the GPU state and you get cryptic
# CUDA errors. Learned that the hard way during testing.
model_lock = Lock()

# BORROWED, DUUI boilerplate. Every DUUI annotator needs a UIMA type
# system (XML) and a Lua communication script. These are loaded once
# at startup. The format is dictated by the DUUI framework.
# Both paths are relative, so they resolve against the process's
# working directory.
with open("TypeSystemOCR.xml", "rb") as f:
    typesystem = load_typesystem(f)
with open("duui_ocr.lua", "rb") as f:
    lua_communication_script = f.read().decode("utf-8")

# -- Schemas --
# SOLID. These are just data shapes for the API. Pydantic validates
# incoming JSON against these classes automatically, which is genuinely
# one of the nicest things about FastAPI.
#
# "begin" and "end" are character offsets in the original UIMA document.
# They travel with the image so we can attach the OCR result back to
# the right spot in the document.


class ImageInput(BaseModel):
    # One image to recognise, plus where it sits in the source document.
    src: str  # base64-encoded image data or a file path
    begin: int
    end: int


class OCRResult(BaseModel):
    # Recognised text for one image; begin/end are copied from its ImageInput.
    text: str  # the recognized text
    task: str  # which task produced this ("ocr", "table", etc.)
    begin: int
    end: int


class OCRRequest(BaseModel):
    # Body of POST /v1/process.
    images: List[ImageInput]
    lang: str
    doc_len: int
    model_name: str
    task: str = "ocr"
    max_new_tokens: int = 1024  # upper bound on model output length


class OCRResponse(BaseModel):
    # Reply of POST /v1/process. Failures are reported in `errors`
    # rather than through an HTTP error status.
    ocr_results: List[OCRResult]
    model_name: str
    model_version: str
    model_source: str
    model_lang: str
    errors: List[str]  # we collect errors instead of crashing
    config: Dict[str, Union[str, int, bool]]


# BORROWED, DUUI documentation schema. Every annotator must describe
# itself through this endpoint. Copied from existing annotators.
class TextImagerDocumentation(BaseModel):
    annotator_name: str
    version: str
    implementation_lang: Optional[str] = None
    meta: Optional[dict] = None
    parameters: Optional[dict] = None


# -- Helpers --


def decode_image(src: str) -> PILImage.Image:
    """
    Turn a file path or a base64 string into an RGB PIL image.

    ``src`` is treated as a path when it names an existing file and as
    base64-encoded image bytes otherwise. The result is always converted
    to RGB because some PNGs arrive as RGBA or palette mode and the
    models choke on that.

    ``Image.open`` is lazy and keeps the underlying file open, while
    ``convert`` returns a new, independent image. The source image is
    therefore closed before returning so the file-path branch does not
    leak a file handle per request.

    Raises whatever ``base64.b64decode`` / ``PIL.Image.open`` raise for
    malformed input; the caller records that as a per-image error.

    NOTE(review): ``src`` comes straight from the request body, so a
    client can make the server open any local file it can read. Confirm
    that is acceptable for the deployment, or restrict it to a known
    directory.
    """
    if os.path.isfile(src):
        with PILImage.open(src) as source:
            return source.convert("RGB")
    with PILImage.open(BytesIO(base64.b64decode(src))) as source:
        return source.convert("RGB")


def to_device(mapping: dict) -> dict:
    """
    COPILOT. Moves all tensors in a dict to the target device (GPU/CPU).

    Returns a new dict; non-tensor values are passed through unchanged
    because the processor output can also contain values that have no
    ``.to()`` method. Only the device changes, never the dtype.
    """
    return {
        k: v.to(DEVICE) if isinstance(v, torch.Tensor) else v
        for k, v in mapping.items()
    }


def generate(model, inputs: dict, max_new_tokens: int):
    """
    COPILOT. Wraps model.generate() with the settings we want.

    - inference_mode: runs generation without autograd tracking.
    - do_sample=False: deterministic output, same image = same text.
    - use_cache=True: reuses intermediate computations during token
      generation, which makes it faster.

    Returns whatever ``model.generate`` returns (the backends treat it
    as a batch of token-id sequences).
    """
    with torch.inference_mode():
        return model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            use_cache=True,
        )


# -- Backends --
# This is the part I'm least confident about architecturally.
# The idea: each model family has slightly different ways of building
# the input prompt and decoding the output. So each one gets its own
# "backend" class that knows how to talk to that specific model.
#
# The abstract base class defines the interface. Subclasses fill in
# the details. I learned this pattern from the original code in the
# DUUI repo.
#
# v1 had just PaddleOCR, no abstraction needed.
# v2 is where I introduced the ABC because I thought TrOCR would be
# a second subclass. It wasn't :((. TrOCR's interface was too different.
# v3 kept the ABC because GLM-OCR actually fits it perfectly.
# So the abstraction turned out to be useful, just not for the model
# I originally designed it for.
class OCRBackend(ABC):
    """
    Common interface for one loaded model family.

    A backend bundles the model, its processor and the registry entry
    for ``model_name``. Subclasses must implement ``run_single`` and may
    override ``_run_batch_impl`` with real batching.
    """

    def __init__(self, model_name: str, model, processor):
        self.model_name = model_name
        self.model = model
        self.processor = processor
        # Raises KeyError for a model that is not registered.
        self.meta = MODEL_REGISTRY[model_name]

    def get_prompt(self, task: str) -> str:
        """
        Look up the prompt string for ``task``.

        Falls back to this model's "ocr" prompt, and finally to the
        literal "OCR:", when the task is unknown. Tasks are validated
        before we get here, so this is only a safety net.
        """
        prompts = self.meta["task_prompts"]
        return prompts.get(task, prompts.get("ocr", "OCR:"))

    @abstractmethod
    def run_single(
        self, image: PILImage.Image, task: str, max_new_tokens: int
    ) -> str: ...

    def run_batch(
        self,
        images: List[PILImage.Image],
        task: str,
        max_new_tokens: int,
    ) -> List[str]:
        """
        Recognise ``images`` and return one string per image, in order.

        Tries batch processing first, and if that fails for any reason
        (OOM, padding issues, whatever) falls back to processing images
        one at a time. The sequential fallback is slower, and an error
        raised by ``run_single`` during it still propagates.

        An empty ``images`` list returns ``[]`` immediately. Previously
        it was pushed through the batch path, which could fail inside
        the processor and log a misleading "Batch failed" warning
        before returning the same empty list.
        """
        if not images:
            return []
        try:
            return self._run_batch_impl(images, task, max_new_tokens)
        except Exception as e:
            logger.warning("Batch failed, falling back to sequential: %s", e)
            return [
                self.run_single(img, task, max_new_tokens) for img in images
            ]

    def _run_batch_impl(
        self,
        images: List[PILImage.Image],
        task: str,
        max_new_tokens: int,
    ) -> List[str]:
        """Default: just loops. Subclasses override with real batching."""
        return [
            self.run_single(img, task, max_new_tokens) for img in images
        ]


class PaddleOCRBackend(OCRBackend):
    """
    BORROWED + COPILOT. Backend for PaddlePaddle/PaddleOCR-VL-1.5.
    This was the first model I got working (v1). The chat template
    pattern (apply_chat_template) comes from the HuggingFace model
    card example:
    https://huggingface.co/PaddlePaddle/PaddleOCR-VL-1.5

    Copilot helped me adapt it for batch processing. The _decode
    method with the .split(chat_text)[-1] trick is from the model
    card too. The model repeats the prompt in its output so you
    have to strip it. Took me a while to figure out why I was
    getting the prompt text echoed back in my results.
    """

    def _chat_text(self, task: str) -> str:
        # Builds the chat-formatted prompt string the model expects.
        # The structure with "role" / "content" / list of dicts is
        # the HuggingFace chat template convention.
        # {"type": "image"} is a placeholder — the actual pixel data
        # gets passed separately to the processor.
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": self.get_prompt(task)},
                ],
            }
        ]
        return self.processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )

    def _decode(self, generated, chat_text: str) -> List[str]:
        # FRAGILE. The split-on-prompt-text approach assumes the
        # model always echoes the prompt. If a future model version
        # changes this behavior, results will break silently: when
        # chat_text is not found, split() returns the whole decoded
        # string and the prompt stays in the result.
        # NOTE(review): decoding uses skip_special_tokens=True while
        # chat_text is the raw template output — confirm the two still
        # match textually for this model.
        decoded = self.processor.batch_decode(
            generated, skip_special_tokens=True
        )
        return [r.split(chat_text)[-1].strip() for r in decoded]

    def run_single(self, image, task, max_new_tokens):
        # One prompt, one image; returns the single decoded string.
        text = self._chat_text(task)
        inputs = to_device(
            self.processor(text=[text], images=[image], return_tensors="pt")
        )
        out = generate(self.model, inputs, max_new_tokens)
        return self._decode(out, text)[0]

    def _run_batch_impl(self, images, task, max_new_tokens):
        # Same as run_single but we pass all images at once with
        # padding=True so the processor pads shorter sequences to
        # match the longest one. Faster on GPU because it processes
        # in parallel (I think).
        text = self._chat_text(task)
        inputs = to_device(
            self.processor(
                text=[text] * len(images),
                images=images,
                return_tensors="pt",
                padding=True,
            )
        )
        out = generate(self.model, inputs, max_new_tokens)
        return self._decode(out, text)


# v2 ABANDONED: TrOCR Backend
#
# I spent a full weekend trying to make this work. Leaving it here
# commented out as documentation of why it failed, in case anyone
# else gets the same idea.
#
# The core problem: TrOCR (microsoft/trocr-base-printed) is a
# VisionEncoderDecoderModel, not an AutoModelForImageTextToText.
#
# Our whole pipeline sends full page images. TrOCR expects someone
# to have already detected and cropped individual text lines. I'd
# need to add a whole text detection step before TrOCR, basically
# building a separate pipeline.
#
# I also couldn't get it to load through AutoModelForImageTextToText
# without it throwing architecture mismatch errors. Copilot kept
# suggesting workarounds that compiled but produced garbage output.
#
# Source that finally made me understand the difference:
# https://huggingface.co/docs/transformers/en/model_doc/trocr
# https://huggingface.co/docs/transformers/v4.16.2/en/model_doc/vision-encoder-decoder
# Also this HF discussion where someone asks the same question I had:
# https://huggingface.co/microsoft/trocr-base-printed/discussions/3
#
# class TrOCRBackend(OCRBackend):
#     """
#     ABANDONED, backend for microsoft/trocr-base-printed.
#
#     This doesn't actually inherit from OCRBackend cleanly because
#     the interface is too different. I tried to force it to fit by
#     ignoring the task parameter and skipping the prompt, but the
#     real issue is deeper: TrOCR only does single-line OCR.
#
#     Model card: https://huggingface.co/microsoft/trocr-base-printed
#     Paper: https://arxiv.org/abs/2109.10282
#     """
#
#     def __init__(self, model_name: str, model, processor):
#         # Can't call super().__init__() cleanly because the parent
#         # expects self.processor to have apply_chat_template(), which
#         # TrOCRProcessor doesn't have. Already a bad sign.
#         self.model_name = model_name
#         self.model = model
#         self.processor = processor
#         self.meta = MODEL_REGISTRY[model_name]
#
#     def run_single(self, image, task, max_new_tokens):
#         # TrOCR ignores the task parameter entirely. It only does OCR.
#         # No table recognition, no formula recognition, nothing.
#         #
#         # The processor here is TrOCRProcessor, which only takes images.
#         # No text= argument. No chat template. Just pixel_values.
#         pixel_values = self.processor(
#             images=image, return_tensors="pt"
#         ).pixel_values.to(DEVICE)
#
#         with torch.inference_mode():
#             generated_ids = self.model.generate(
#                 pixel_values,
#                 max_new_tokens=max_new_tokens,
#             )
#
#         return self.processor.batch_decode(
#             generated_ids, skip_special_tokens=True
#         )[0]
#
#     def _run_batch_impl(self, images, task, max_new_tokens):
#         # FRAGILE: TrOCR batching. I got this working but the results
#         # were garbage on full-page images. The model would output
#         # random fragments or repeat the same word over and over.
#         #
#         # In hindsight this is obvious: the model was trained on
#         # cropped single-line images at 384x384 resolution.
#         pixel_values = self.processor(
#             images=images, return_tensors="pt", padding=True
#         ).pixel_values.to(DEVICE)
#
#         with torch.inference_mode():
#             generated_ids = self.model.generate(
#                 pixel_values,
#                 max_new_tokens=max_new_tokens,
#             )
#
#         return self.processor.batch_decode(
#             generated_ids, skip_special_tokens=True
#         )
#
# End of abandoned TrOCR code.


class GlmOCRBackend(OCRBackend):
    """
    BORROWED + COPILOT. Backend for zai-org/GLM-OCR (v3 addition).

    GLM-OCR slotted into the existing backend pattern because it is the
    same *kind* of model as PaddleOCR-VL:
      1. It loads with AutoModelForImageTextToText, like PaddleOCR-VL.
      2. Its processor supports apply_chat_template(), so the prompt
         lookup in OCRBackend.get_prompt() is reused as-is.
      3. It takes full document pages; no pre-cropping of text lines.
      4. It supports several tasks (ocr, table, formula).
    That is exactly what TrOCR lacked. Lesson learned: check the model
    architecture *class* before you check the model's benchmarks.

    Model card: https://huggingface.co/zai-org/GLM-OCR
    GitHub/SDK: https://github.com/zai-org/GLM-OCR

    Differences from PaddleOCRBackend in terms of code:
      - the PIL image object travels inside the messages dict
        ({"type": "image", "image": img}) instead of a placeholder
        plus a separate images argument;
      - apply_chat_template tokenizes directly (tokenize=True,
        return_dict=True) instead of in a separate processor call;
      - the prompt is removed by slicing off the input tokens, not by
        splitting the decoded text.
    The message format follows the model card example.
    """

    def _build_messages(self, images: List[PILImage.Image], task: str):
        # Returns one single-turn conversation per image, all sharing
        # the same task prompt.
        instruction = self.get_prompt(task)
        conversations = []
        for picture in images:
            user_turn = {
                "role": "user",
                "content": [
                    {"type": "image", "image": picture},
                    {"type": "text", "text": instruction},
                ],
            }
            conversations.append([user_turn])
        return conversations

    def _generate_and_decode(self, images, task, max_new_tokens):
        # Shared path for single and batched calls: tokenize the
        # conversations, generate, drop the prompt tokens, decode.
        conversations = self._build_messages(images, task)
        encoded = self.processor.apply_chat_template(
            conversations,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt",
            padding=True,
            return_dict=True,
        )
        inputs = to_device(encoded)
        out = generate(self.model, inputs, max_new_tokens)

        # The output starts with the input sequence; everything past
        # the input length is what the model actually generated.
        prompt_len = inputs["input_ids"].shape[-1]
        new_tokens = out[:, prompt_len:]

        decoded = self.processor.batch_decode(
            new_tokens, skip_special_tokens=True
        )
        return [piece.strip() for piece in decoded]

    def run_single(self, image, task, max_new_tokens):
        # A single image is just a batch of one.
        texts = self._generate_and_decode([image], task, max_new_tokens)
        return texts[0]

    def _run_batch_impl(self, images, task, max_new_tokens):
        return self._generate_and_decode(images, task, max_new_tokens)


# Maps the "backend" string from MODEL_REGISTRY to the actual class.
# TrOCR was going to be "trocr": TrOCRBackend here. Now it's just
# the two that actually work.
BACKEND_MAP = {
    "paddleocr": PaddleOCRBackend,
    "glmocr": GlmOCRBackend,
}

# -- Model loading --

# -- v2 ABANDONED: TrOCR loader --
# ABANDONED. TrOCR needs its own loading function because it uses
# different classes. This was a telling sign it wasn't going to fit.
#
# def load_trocr(model_name: str):
#     """
#     Loads TrOCR with VisionEncoderDecoderModel instead of
#     AutoModelForImageTextToText.
#     I tried using Auto* classes first
#     and got:
#       ValueError: Unrecognized configuration class
#       <class 'transformers.models.vision_encoder_decoder.
#       configuration_vision_encoder_decoder.VisionEncoderDecoderConfig'>
#       for this kind of AutoModel: AutoModelForImageTextToText.
#     (The class name in that message was lost from this note at some
#     point; the line above is reconstructed from the usual transformers
#     wording, so treat the exact module path as approximate.)
#
#     Source for the correct loading pattern:
#       https://huggingface.co/microsoft/trocr-base-printed
#     """
#     from transformers import TrOCRProcessor, VisionEncoderDecoderModel
#     processor = TrOCRProcessor.from_pretrained(model_name)
#     model = VisionEncoderDecoderModel.from_pretrained(
#         model_name, torch_dtype=DTYPE
#     )
#     model.to(DEVICE).eval()
#     return TrOCRBackend(model_name, model, processor)


from collections import OrderedDict

# Loaded backends, least recently used first. Nothing in here is
# locked: load_backend() must be called while holding model_lock,
# which post_process does.
_BACKEND_CACHE: "OrderedDict[str, OCRBackend]" = OrderedDict()


def _evict_backends(keep: int) -> None:
    """Drop least-recently-used backends until at most ``keep`` remain,
    then ask Python and CUDA to actually release their memory."""
    keep = max(keep, 0)
    evicted = False
    while len(_BACKEND_CACHE) > keep:
        oldest = next(iter(_BACKEND_CACHE))
        logger.info("Evicting model: %s", oldest)
        # Delete by key so no local variable keeps the backend alive
        # while gc.collect() runs below.
        del _BACKEND_CACHE[oldest]
        evicted = True
    if evicted:
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()


def load_backend(model_name: str) -> OCRBackend:
    """
    COPILOT + BORROWED. Return the backend for ``model_name``, loading
    the model and processor from HuggingFace on first use.

    Up to ``settings.duui_ocr_model_cache_size`` backends stay loaded
    (default 1); a size of 0 or less disables caching.

    This used to be ``functools.lru_cache``. That had two problems when
    switching models: the old entry was only evicted *after* the new
    model had been loaded, so both were resident at the peak, and
    eviction merely dropped a reference without releasing GPU memory.
    Here the cache makes room *before* loading and follows eviction
    with gc.collect() + torch.cuda.empty_cache().

    A backend that a caller still holds stays alive until that caller
    drops it; eviction only removes the cache's own reference. If
    loading fails after eviction the cache is simply left smaller and
    the next call reloads.

    Raises ValueError for a model that is not in MODEL_REGISTRY.

    The AutoProcessor / AutoModelForImageTextToText pattern is from the
    HuggingFace transformers docs:
    https://huggingface.co/docs/transformers/model_doc/auto
    It only works for models those Auto* classes can resolve, which is
    a big part of why the TrOCR attempt (v2) failed.
    """
    if model_name not in MODEL_REGISTRY:
        raise ValueError(
            f"Unknown model: {model_name}. "
            f"Choose from: {list(MODEL_REGISTRY.keys())}"
        )

    cached = _BACKEND_CACHE.get(model_name)
    if cached is not None:
        _BACKEND_CACHE.move_to_end(model_name)  # mark as most recent
        return cached

    meta = MODEL_REGISTRY[model_name]
    capacity = settings.duui_ocr_model_cache_size

    # Free a slot first so old and new model never coexist in memory.
    _evict_backends(capacity - 1)

    # v2 remnant: I had a special case here for TrOCR.
    # if meta["backend"] == "trocr":
    #     return load_trocr(model_name)

    logger.info("Loading model: %s", model_name)
    processor = AutoProcessor.from_pretrained(model_name)
    model = AutoModelForImageTextToText.from_pretrained(
        model_name, torch_dtype=DTYPE
    )
    # .eval() puts the model in inference mode (disables dropout etc.)
    # .to(DEVICE) moves all parameters to the chosen device.
    model.to(DEVICE).eval()
    logger.info("Model loaded on %s", DEVICE)

    backend = BACKEND_MAP[meta["backend"]](model_name, model, processor)
    if capacity > 0:
        _BACKEND_CACHE[model_name] = backend
    return backend


# Kept so code written against the old lru_cache wrapper can still
# drop every loaded model.
load_backend.cache_clear = lambda: _evict_backends(0)


# -- FastAPI --
# BORROWED. The app setup and DUUI endpoint structure is standard
# across all DUUI annotators.

app = FastAPI(
    openapi_url="/openapi.json",
    docs_url="/api",
    redoc_url=None,
    title=settings.duui_ocr_annotator_name,
    description="Multi-model OCR Component for DUUI",
    version=settings.duui_ocr_annotator_version,
    terms_of_service="https://www.texttechnologylab.org/legal_notice/",
    contact={"name": "TTLab Team", "url": "https://texttechnologylab.org"},
    license_info={
        "name": "AGPL",
        "url": "http://www.gnu.org/licenses/agpl-3.0.en.html",
    },
)


# The next four endpoints are pure DUUI boilerplate. They just serve
# static content that the DUUI framework needs to discover and
# configure this annotator. Nothing interesting happens here.
+ + +@app.get("/v1/typesystem") +def get_typesystem() -> Response: + return Response( + content=typesystem.to_xml().encode("utf-8"), + media_type="application/xml", + ) + + +@app.get("/v1/communication_layer", response_class=PlainTextResponse) +def get_communication_layer() -> str: + return lua_communication_script + + +@app.get("/v1/documentation") +def get_documentation(): + return TextImagerDocumentation( + annotator_name=settings.duui_ocr_annotator_name, + version=settings.duui_ocr_annotator_version, + implementation_lang="Python", + meta={ + "models": { + name: {k: m[k] for k in ("source", "lang", "version", "tasks")} + for name, m in MODEL_REGISTRY.items() + }, + "supported_tasks": ALL_SUPPORTED_TASKS, + }, + parameters={ + "model_name": "Model to use: " + ", ".join(MODEL_REGISTRY), + "task": "OCR task: " + ", ".join(ALL_SUPPORTED_TASKS), + "max_new_tokens": "Maximum tokens to generate", + }, + ) + + +@app.get("/v1/details/input_output") +def get_input_output() -> JSONResponse: + return JSONResponse( + content=jsonable_encoder( + { + "inputs": ["org.texttechnologylab.annotation.type.Image"], + "outputs": [ + "org.texttechnologylab.annotation.AnnotationComment" + ], + } + ) + ) + + +@app.post("/v1/process") +def post_process(request: OCRRequest): + """ + SOLID (mostly). This is where the actual OCR happens. + + The flow: + 1. Check the requested task is valid for the chosen model + 2. Decode all images from base64/filepath to PIL + 3. Acquire the model lock (one request at a time on the GPU) + 4. Run the OCR backend on the batch + 5. Pair each result back with its original document offsets + 6. Clean up GPU memory + + I collect errors in a list instead of raising exceptions because + DUUI expects a response even if some images failed. A partial + result (3 out of 5 images worked) is more useful than a crash. + + FRAGILE. The finally block with cuda.empty_cache() and gc.collect() + is my attempt at preventing memory leaks between requests. 
I'm not + 100% sure it's sufficient. During long runs the VRAM usage seems + to creep up slowly. Might be a leak in the processor or in PIL. + Haven't had time to profile it properly. + """ + meta = MODEL_REGISTRY.get(request.model_name, {}) + ocr_results: List[OCRResult] = [] + errors: List[str] = [] + + try: + # Validate task before we do any heavy work + supported = meta.get("tasks", []) + if request.task not in supported: + errors.append( + f"Task '{request.task}' not supported by " + f"{request.model_name}. Choose from: {supported}" + ) + else: + # Decode images: keep track of which ones succeeded so we + # can match results back to the right request indices later. + # Bad images (corrupt base64, missing files) get logged as + # errors but don't kill the whole batch. + pil_images, valid_indices = [], [] + for i, img_in in enumerate(request.images): + try: + pil_images.append(decode_image(img_in.src)) + valid_indices.append(i) + except Exception as e: + logger.error("Failed to decode image %d: %s", i, e) + errors.append(f"Image {i}: {e}") + + if pil_images: + with model_lock: + backend = load_backend(request.model_name) + texts = backend.run_batch( + pil_images, request.task, request.max_new_tokens + ) + + # Pair each OCR result with the original image's + # document offsets (begin/end). The zip with + # valid_indices is how we skip over failed images. + for idx, text in zip(valid_indices, texts): + img_in = request.images[idx] + ocr_results.append( + OCRResult( + text=text, + task=request.task, + begin=img_in.begin, + end=img_in.end, + ) + ) + + # Close PIL images to free memory. I kept forgetting + # this and wondering why RAM usage kept growing. + for img in pil_images: + img.close() + except Exception as ex: + logger.exception(ex) + errors.append(str(ex)) + finally: + # COPILOT. Asked "how to free GPU memory after inference in + # pytorch" and got this. 
empty_cache releases unused cached + # memory back to CUDA, gc.collect nudges Python's garbage + # collector. Belt and suspenders. + if torch.cuda.is_available(): + torch.cuda.empty_cache() + gc.collect() + + return OCRResponse( + ocr_results=ocr_results, + model_name=request.model_name, + model_version=meta.get("version", "Unknown"), + model_source=meta.get("source", "Unknown"), + model_lang=meta.get("lang", "Unknown"), + errors=errors, + config={ + "task": request.task, + "max_new_tokens": request.max_new_tokens, + }, + ) \ No newline at end of file diff --git a/duui-ocr/src/test/java/org/hucompute/textimager/uima/ocr/ocr/DUUIOCRTest.java b/duui-ocr/src/test/java/org/hucompute/textimager/uima/ocr/ocr/DUUIOCRTest.java new file mode 100644 index 00000000..af443156 --- /dev/null +++ b/duui-ocr/src/test/java/org/hucompute/textimager/uima/ocr/ocr/DUUIOCRTest.java @@ -0,0 +1,654 @@ +/* + * I do not really know Java. I know Java ish but not enough for this. + * I'm a Python person who got handed a Java test and tried to + * make it work. Most of my understanding of JUnit, JCas, and the + * DUUI test patterns comes from three places: + * + * 1. ChatGPT: I asked it to explain basically everything. + * "What does @BeforeAll do?" "Why does JCas need a factory?" + * "What's the difference between @AfterEach and @AfterAll?" + * I used it as a tutor more than a code generator here. + * + * 2. GitHub Copilot: wrote most of the repetitive test methods + * after I got the first one working. Once Copilot saw the + * pattern for testPaddleOCRWithBase64, it basically generated + * the GLM variants with minimal prompting. + * + * 3. Existing DUUI test files: this is where the real skeleton + * came from. 
I studied and borrowed heavily from: + * - GermanSummaryTest.java (the createCas + Sentence pattern) + * - SentimentTest.java (the @BeforeAll/@AfterEach lifecycle, + * the composer setup, the general structure of everything) + * - TextToImageTest.java (the base64 image encoding/decoding, + * which was exactly what I needed for OCR image input) + * These three files are the real authors of the test scaffolding. + * I adapted their patterns for OCR. + * + * ITERATION HISTORY (mirrors the Python server's evolution): + * + * v1: Tests for PaddleOCR-VL-1.5 only. This was the first model + * I got working end-to-end (Python server → DUUI → Java test). + * Took longer than it should have because I was learning JUnit + * and UIMA at the same time. But PaddleOCR-VL cooperated. + * The test structure comes from this iteration. + * + * v2: Tried to add tests for microsoft/trocr-base-printed. Wrote + * the test methods, ran them, got results that were garbage. + * Spent time debugging on the Java side before realizing the + * problem wasn't here at all. Commented out below with notes + * on what went wrong. See the Python server file for the full + * post-mortem. Or don't, it's kinda embarrassing. + * + * v3: Added tests for zai-org/GLM-OCR. Worked almost immediately. + * GLM-OCR is architecturally the same *kind* of model as + * PaddleOCR-VL (AutoModelForImageTextToText, chat templates, + * full-page OCR), so the test pattern from v1 transferred + * directly. Copilot generated most of these by pattern-matching + * on the Paddle tests. This is the current working state.
+ * + * Source for DUUI test conventions: + * https://github.com/texttechnologylab/DockerUnifiedUIMAInterface + * + * Last meaningful edit: Feb 2026 + */ + +package org.hucompute.textimager.uima.ocr; + +import org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.apache.uima.util.XmlCasSerializer; +import org.junit.jupiter.api.*; +import org.texttechnologylab.DockerUnifiedUIMAInterface + .DUUIComposer; +import org.texttechnologylab.DockerUnifiedUIMAInterface + .driver.DUUIRemoteDriver; +import org.texttechnologylab.DockerUnifiedUIMAInterface + .lua.DUUILuaContext; +import org.texttechnologylab.annotation.AnnotationComment; +import org.texttechnologylab.annotation.type.Image; +import org.xml.sax.SAXException; + +import javax.imageio.ImageIO; +import java.awt.image.BufferedImage; +import java.io.*; +import java.net.UnknownHostException; +import java.nio.charset.StandardCharsets; +import java.util.*; + +import static org.junit.jupiter.api.Assertions.*; + +public class DUUIOCRTest { + + // ── Static fields / shared state ──────────────────────────────── + // BORROWED. This entire static block pattern is from SentimentTest. + // + // I asked ChatGPT why these are static and it explained that + // @BeforeAll methods must be static in JUnit 5 (unless you change + // the lifecycle mode), so anything they initialize also has to be + // static. Coming from Python's pytest fixtures this felt weird, + // but I get it now. Mostly. + + static DUUIComposer composer; + static JCas cas; + + // Port 9714 is what I set in my docker-compose for the OCR server. + // If you're running the server somewhere else, change this. + static String url = "http://127.0.0.1:9714"; + static String paddleModel = + "PaddlePaddle/PaddleOCR-VL-1.5"; + + // ── v2: TrOCR model string ────────────────────────────────── + // ABANDONED. This model exists and loads fine. 
The problem is + // what it *does* with full-page images (spoiler: nothing useful). + // Kept here as documentation of the attempt. + // + // Source: https://huggingface.co/microsoft/trocr-base-printed + // + // static String trOcrModel = + // "microsoft/trocr-base-printed"; + + // v3: GLM-OCR, the model that actually worked as a second option. + // Source: https://huggingface.co/zai-org/GLM-OCR + static String glmModel = "zai-org/GLM-OCR"; + + // I keep test images in a local directory. They're not committed + // to the repo because they're 5-10MB each. You need to put your + // own test images here before running these tests. + static String testImageDir = "src/test_images"; + + // ── Lifecycle methods ─────────────────────────────────────────── + + @BeforeAll + static void beforeAll() throws Exception { + // BORROWED. Copied almost verbatim from SentimentTest.beforeAll(). + // withSkipVerification(true) skips some DUUI handshake checks + // that were failing locally. ChatGPT told me this is fine for + // testing but you probably shouldn't do it in production. + // withJsonLibrary() adds JSON support to the Lua context, + // which the communication script needs. Concerning... + composer = new DUUIComposer() + .withSkipVerification(true) + .withLuaContext( + new DUUILuaContext().withJsonLibrary() + ); + + DUUIRemoteDriver remoteDriver = + new DUUIRemoteDriver(); + composer.addDriver(remoteDriver); + + // ChatGPT explained: JCasFactory.createJCas() needs a type + // system on the classpath. If this line throws a + // ResourceInitializationException, it usually means the + // type system XML isn't where UIMA expects it. I spent an + // hour on that before realizing my pom.xml was missing a + // dependency. Not my proudest debugging session. 
+ cas = JCasFactory.createJCas(); + } + + @AfterAll + static void afterAll() throws UnknownHostException { + composer.shutdown(); + } + + @AfterEach + public void afterEach() + throws IOException, SAXException { + // BORROWED from SentimentTest. Nothing else to say. + composer.resetPipeline(); + + ByteArrayOutputStream stream = + new ByteArrayOutputStream(); + XmlCasSerializer.serialize( + cas.getCas(), null, stream + ); + System.out.println( + stream.toString(StandardCharsets.UTF_8) + ); + + cas.reset(); + } + + // ── Helper methods ────────────────────────────────────────────── + + private static String imageToBase64(String imagePath) { + // BORROWED. This is essentially the same as + // convertImageToBase64() from TextToImageTest.java, just + // renamed to match my naming convention. The pattern is: + // read file → BufferedImage → write to ByteArrayOutputStream + // as PNG → base64-encode the bytes. + // + // I asked ChatGPT why we can't just read the raw file bytes + // and encode those directly. Answer: we *could*, but going + // through BufferedImage + ImageIO normalizes the format. + // A JPEG file's raw bytes would be JPEG-encoded, and this + // way we always send PNG regardless of the source format. + // Honestly not sure if the server cares, but TextToImageTest + // did it this way and it works. + try { + File file = new File(imagePath); + BufferedImage bufferedImage = + ImageIO.read(file); + ByteArrayOutputStream baos = + new ByteArrayOutputStream(); + ImageIO.write(bufferedImage, "png", baos); + return Base64.getEncoder() + .encodeToString(baos.toByteArray()); + } catch (IOException e) { + e.printStackTrace(); + return null; + } + } + + private List collectTestImagePaths() { + // COPILOT + CHATGPT. I told Copilot "find all image files + // in a directory" and it generated most of this. ChatGPT + // helped me understand the Objects.requireNonNull() part. + // + // FRAGILE. This only checks file extensions, not actual + // file content. 
A corrupted PNG with a .png extension will + // pass this filter and fail later in imageToBase64(). I was + // hungry so this is good enough for testing. + List paths = new ArrayList<>(); + File dir = new File(testImageDir); + if (dir.exists() && dir.isDirectory()) { + for (File file : + Objects.requireNonNull( + dir.listFiles() + )) { + String name = file.getName().toLowerCase(); + if (name.endsWith(".png") + || name.endsWith(".jpg") + || name.endsWith(".jpeg")) { + paths.add(file.getAbsolutePath()); + } + } + } + return paths; + } + + private void createCasWithBase64Images( + String language, List imagePaths) { + // BORROWED + COPILOT. Adapted from the createCas() pattern in + // GermanSummaryTest and SentimentTest, but instead of adding + // Sentence annotations, I'm adding Image annotations with + // base64-encoded src data. + // + // The "OCR document" placeholder text is there because UIMA + // requires a document text. You can't have a CAS with no text. + // ChatGPT explained that begin/end offsets on annotations must + // fall within the document text range, so I set all images to + // span the full "OCR document" string (0 to length). + cas.setDocumentLanguage(language); + cas.setDocumentText("OCR document"); + for (String path : imagePaths) { + String b64 = imageToBase64(path); + if (b64 != null) { + Image img = new Image( + cas, 0, cas.getDocumentText().length() + ); + img.setSrc(b64); + img.addToIndexes(); + } + } + } + + private void createCasWithFilePaths( + String language, List imagePaths) { + // Same as above but passes file paths instead of base64 data. + cas.setDocumentLanguage(language); + cas.setDocumentText("OCR document"); + for (String path : imagePaths) { + Image img = new Image( + cas, 0, cas.getDocumentText().length() + ); + img.setSrc(path); + img.addToIndexes(); + } + } + + private void printResults() { + // BORROWED. The pattern of selecting annotations by type and + // printing them is everywhere in the DUUI test files. 
+ // SentimentTest does this with SentimentModel, TextToImageTest + // with Image, and I do it with AnnotationComment. + Collection results = + JCasUtil.select(cas, AnnotationComment.class); + for (AnnotationComment c : results) { + System.out.println( + "Key: " + c.getKey() + + " | Value: " + c.getValue() + ); + } + } + + // ── PaddleOCR-VL tests (v1) ──────────────────────────────────── + // These were the first tests I wrote. The whole test structure I + // use everywhere else in this file crystallized during this + // iteration: compose a pipeline with a model name and task, load + // images into the CAS, run the pipeline, check that + // AnnotationComments came back. + // + // Everything after this section is a variation on this pattern. + + @Test + public void testPaddleOCRWithBase64() throws Exception { + // COPILOT + CHATGPT. This is the first test I got running. + // It took an embarrassingly long time. The composer.add() + // pattern with .withParameter() is from SentimentTest. ChatGPT + // walked me through how DUUIRemoteDriver.Component works: + // you give it a URL and parameters, and those parameters get + // passed to the Python server as part of the DUUI protocol. + // + // The assertions are minimal - I just check that: + // 1. I actually have test images (otherwise what are we testing?) + // 2. At least one AnnotationComment came back + // + // I don't check the *content* of the OCR results because that + // depends on what test images you have. SentimentTest checks + // exact probability values, which is possible because text + // input is deterministic. OCR results vary with the image, so + // I just verify something came back and eyeball the printResults() + // output. Not ideal, I know. 
+ composer.add( + new DUUIRemoteDriver.Component(url) + .withParameter("model_name", paddleModel) + .withParameter("task", "ocr") + ); + List paths = collectTestImagePaths(); + assertFalse(paths.isEmpty(), + "No test images in " + testImageDir); + createCasWithBase64Images("en", paths); + composer.run(cas); + assertFalse( + JCasUtil.select( + cas, AnnotationComment.class + ).isEmpty(), + "No OCR results returned" + ); + printResults(); + } + + @Test + public void testPaddleOCRWithFilePaths() + throws Exception { + // Same as testPaddleOCRWithBase64 but sends file paths instead + // of base64. Copilot generated this after seeing the base64 + // version. It just swapped createCasWithBase64Images for + // createCasWithFilePaths. Amazeballs. + // + // This test will fail if the server is containerized and + // can't see your local filesystem. + composer.add( + new DUUIRemoteDriver.Component(url) + .withParameter("model_name", paddleModel) + .withParameter("task", "ocr") + ); + List paths = collectTestImagePaths(); + assertFalse(paths.isEmpty()); + createCasWithFilePaths("en", paths); + composer.run(cas); + assertFalse( + JCasUtil.select( + cas, AnnotationComment.class + ).isEmpty() + ); + printResults(); + } + + @Test + public void testPaddleTableRecognition() + throws Exception { + // COPILOT generated this and the formula test below almost + // entirely on its own after seeing the OCR tests above. + // The only difference is .withParameter("task", "table"). + // + // I only send a single image here (paths.get(0)) because table + // recognition is slower and I don't need to batch-test it. + // ChatGPT told me Collections.singletonList() is the Java way + // to make a one-element list. In Python I'd just write [paths[0]]. + // Java is... verbose. + // + // REVISIT. I should really use an image that actually contains + // a table for this test. Right now I'm just sending whatever + // the first image in the directory is, which is probably a + // regular text page. 
The model still returns *something*, but + // it's not a meaningful test of table recognition quality. + composer.add( + new DUUIRemoteDriver.Component(url) + .withParameter("model_name", paddleModel) + .withParameter("task", "table") + ); + List paths = collectTestImagePaths(); + assertFalse(paths.isEmpty()); + createCasWithBase64Images( + "en", + Collections.singletonList(paths.get(0)) + ); + composer.run(cas); + printResults(); + } + + @Test + public void testPaddleFormulaRecognition() + throws Exception { + // COPILOT. Same pattern as testPaddleTableRecognition, just + // with task="formula". Copilot's autocomplete at this point + // was finishing entire test methods before I could type the + // method name. Saved me a lot of time, all hail the AI-Overlord. + // + // Same REVISIT as above: should use an image with actual + // mathematical formulas. + composer.add( + new DUUIRemoteDriver.Component(url) + .withParameter("model_name", paddleModel) + .withParameter("task", "formula") + ); + List paths = collectTestImagePaths(); + assertFalse(paths.isEmpty()); + createCasWithBase64Images( + "en", + Collections.singletonList(paths.get(0)) + ); + composer.run(cas); + printResults(); + } + + // ── TrOCR tests (v2) - ABANDONED :( ────────────────────────────── + // + // This is why I should do more reading and research before doing. + + // What happened: + // I wanted a second model to count towards my duui-module count. + // microsoft/trocr-base-printed seemed perfect: well-known, + // well-documented, specifically designed for OCR. I wrote the + // tests below by duplicating the PaddleOCR tests and swapping + // in the TrOCR model name. + // + // The tests *ran*, but the output was garbage. I assumed I had a bug in how I + // was building the CAS, or in the base64 encoding, or in the Lua + // communication script. I spent hours debugging before giving up. + // I commented everything out. Moved on to GLM-OCR (v3), + // which turned out to be the right call.
+ // + // Leaving these tests here as documentation. If anyone in the + // future wants to integrate TrOCR, they need to know it requires + // a fundamentally different approach: pre-crop text lines first, + // then feed each line to TrOCR individually. + // + // BORROWED. Test structure is identical to the Paddle tests. + // The problem was never in the test code. It was in the assumption + // that TrOCR could handle the same input as PaddleOCR. + + // @Test + // public void testTrOCRWithBase64() throws Exception { + // // ABANDONED. Too sad to comment further. + // composer.add( + // new DUUIRemoteDriver.Component(url) + // .withParameter("model_name", trOcrModel) + // .withParameter("task", "ocr") + // ); + // List paths = collectTestImagePaths(); + // assertFalse(paths.isEmpty(), + // "No test images in " + testImageDir); + // createCasWithBase64Images("en", paths); + // composer.run(cas); + // // This assertion *passes* — results do come back. + // // They're just meaningless. + // assertFalse( + // JCasUtil.select( + // cas, AnnotationComment.class + // ).isEmpty(), + // "No OCR results returned" + // ); + // printResults(); + // } + + // @Test + // public void testTrOCRWithFilePaths() throws Exception { + // // ABANDONED. Same story as testTrOCRWithBase64. + // composer.add( + // new DUUIRemoteDriver.Component(url) + // .withParameter("model_name", trOcrModel) + // .withParameter("task", "ocr") + // ); + // List paths = collectTestImagePaths(); + // assertFalse(paths.isEmpty()); + // createCasWithFilePaths("en", paths); + // composer.run(cas); + // assertFalse( + // JCasUtil.select( + // cas, AnnotationComment.class + // ).isEmpty() + // ); + // printResults(); + // } + + // @Test + // public void testTrOCRTableRecognition() throws Exception { + // // ABANDONED. I wrote this knowing it probably wouldn't work, + // // because TrOCR only supports OCR, so no table recognition, + // // no formula recognition, nothing. It's a single-task model. 
+ // // On the Python side, the TrOCR backend ignores the task + // // parameter entirely. But I wrote the test anyway to see + // // what would happen. + // // + // // What happened: the server accepted the request (it falls + // // back to basic OCR when the task isn't supported), and + // // returned the same garbage as the OCR test. No table structure, + // // obviously. + // // + // // This was the moment I started suspecting the problem was + // // deeper than a bug. Three different test configurations, + // // all returning the same kind of fragmented output. That's + // // not a bug, that's a model doing what it was designed to do + // // on input it was never designed to handle. + // composer.add( + // new DUUIRemoteDriver.Component(url) + // .withParameter("model_name", trOcrModel) + // .withParameter("task", "table") + // ); + // List paths = collectTestImagePaths(); + // assertFalse(paths.isEmpty()); + // createCasWithBase64Images( + // "en", + // Collections.singletonList(paths.get(0)) + // ); + // composer.run(cas); + // printResults(); + // } + + // @Test + // public void testTrOCRFormulaRecognition() throws Exception { + // // ABANDONED. Same as table, TrOCR doesn't do formulas. + // // Included for completeness. + // composer.add( + // new DUUIRemoteDriver.Component(url) + // .withParameter("model_name", trOcrModel) + // .withParameter("task", "formula") + // ); + // List paths = collectTestImagePaths(); + // assertFalse(paths.isEmpty()); + // createCasWithBase64Images( + // "en", + // Collections.singletonList(paths.get(0)) + // ); + // composer.run(cas); + // printResults(); + // } + // + // End of abandoned TrOCR tests. + + // ── GLM-OCR tests (v3) ───────────────────────────────────────── + // After the TrOCR failure I was genuinely nervous about trying a + // third model. But GLM-OCR (zai-org/GLM-OCR) turned out to be + // almost suspiciously easy. These tests worked on the first run.
+ // + // Source: https://huggingface.co/zai-org/GLM-OCR + // + // BORROWED. The strategy of mirroring every test across models + // is something I saw in SentimentTest, which has separate tests + // for different languages and model variants (DeTest, EnTest, + // EnCadriffNLPTest, VietnamesePhoBertTest, etc.). Each one is + // basically the same flow with different parameters. Repetitive + // but easy to read and debug. + // + // Copilot generated all four of these by pattern-matching on the + // Paddle tests above. I only had to change the model string and + // the method names. + + @Test + public void testGlmOCRWithBase64() throws Exception { + // COPILOT. Generated by duplicating testPaddleOCRWithBase64 + // and changing paddleModel to glmModel. + composer.add( + new DUUIRemoteDriver.Component(url) + .withParameter("model_name", glmModel) + .withParameter("task", "ocr") + ); + List paths = collectTestImagePaths(); + assertFalse(paths.isEmpty(), + "No test images in " + testImageDir); + createCasWithBase64Images("en", paths); + composer.run(cas); + assertFalse( + JCasUtil.select( + cas, AnnotationComment.class + ).isEmpty(), + "No OCR results returned" + ); + printResults(); + } + + @Test + public void testGlmOCRWithFilePaths() + throws Exception { + // COPILOT generated. Same file path caveat as the Paddle + // file path test (Docker path resolution). + composer.add( + new DUUIRemoteDriver.Component(url) + .withParameter("model_name", glmModel) + .withParameter("task", "ocr") + ); + List paths = collectTestImagePaths(); + assertFalse(paths.isEmpty()); + createCasWithFilePaths("en", paths); + composer.run(cas); + assertFalse( + JCasUtil.select( + cas, AnnotationComment.class + ).isEmpty() + ); + printResults(); + } + + @Test + public void testGlmTableRecognition() + throws Exception { + // COPILOT generated. Same REVISIT about using a real table + // image applies here. 
+ composer.add( + new DUUIRemoteDriver.Component(url) + .withParameter("model_name", glmModel) + .withParameter("task", "table") + ); + List paths = collectTestImagePaths(); + assertFalse(paths.isEmpty()); + createCasWithBase64Images( + "en", + Collections.singletonList(paths.get(0)) + ); + composer.run(cas); + printResults(); + } + + @Test + public void testGlmFormulaRecognition() + throws Exception { + // COPILOT generated. Last of the mirrored tests. At this + // point Copilot was basically writing the whole method from + // the method name alone. + // + // REVISIT. I have eight active test methods (four Paddle, + // four GLM) and they're all structurally identical. ChatGPT + // suggested using @ParameterizedTest with a @MethodSource to + // collapse these into one or two parameterized methods + // (model × task × input mode). That would be cleaner but I + // haven't learned JUnit parameterized tests yet. The + // SentimentTest and TextToImageTest files both use separate + // methods per scenario too, so at least I'm in good company. 
+ composer.add( + new DUUIRemoteDriver.Component(url) + .withParameter("model_name", glmModel) + .withParameter("task", "formula") + ); + List paths = collectTestImagePaths(); + assertFalse(paths.isEmpty()); + createCasWithBase64Images( + "en", + Collections.singletonList(paths.get(0)) + ); + composer.run(cas); + printResults(); + } +} \ No newline at end of file diff --git a/duui-ocr/src/test_images/test_1.png b/duui-ocr/src/test_images/test_1.png new file mode 100644 index 00000000..343b92b6 Binary files /dev/null and b/duui-ocr/src/test_images/test_1.png differ diff --git a/duui-ocr/src/test_images/test_2.png b/duui-ocr/src/test_images/test_2.png new file mode 100644 index 00000000..87d343b2 Binary files /dev/null and b/duui-ocr/src/test_images/test_2.png differ diff --git a/duui-pos-ancient-greek/.gitignore b/duui-pos-ancient-greek/.gitignore new file mode 100644 index 00000000..2a679d94 --- /dev/null +++ b/duui-pos-ancient-greek/.gitignore @@ -0,0 +1,47 @@ +# Python +.venv/ +venv/ +__pycache__/ +*.pyc +*.pyo +*.egg-info/ +*.egg +dist/ +build/ + +# Model artifacts (large files) +model/ +*.bin +*.safetensors +checkpoint*/ + +# IDE +.idea/ +*.iml +.DS_Store +.vscode/ +*.swp +*.swo + +# Java build +target/ +*.class + +# Data (cloned separately) +data/ +UD_Ancient_Greek-*/ + +# Docker +*.tar +*.tar.gz + +# Test outputs +evaluation_report.txt +test_output.json + +# Cache +.cache/ +.pytest_cache/ + +# Logs +*.log \ No newline at end of file diff --git a/duui-pos-ancient-greek/README.md b/duui-pos-ancient-greek/README.md new file mode 100644 index 00000000..ea1f8a4f --- /dev/null +++ b/duui-pos-ancient-greek/README.md @@ -0,0 +1,97 @@ +[![Version](https://img.shields.io/static/v1?label=Python&message=3.10&color=green)]() +[![Version](https://img.shields.io/static/v1?label=Transformers&message=5.2.0&color=yellow)]() +[![Version](https://img.shields.io/static/v1?label=Torch&message=2.6.0&color=red)]() + +# Ancient Greek Part-of-Speech Tagger + +DUUI implementation for
Ancient Greek Part-of-Speech (POS) tagging. This component utilizes a fine-tuned `xlm-roberta-base` model trained on the Universal Dependencies [Ancient Greek Perseus treebank](https://github.com/UniversalDependencies/UD_Ancient_Greek-Perseus), achieving a 91.38% test accuracy for 17 Universal POS tags. + +## 1. Annotations + +The following is a list of Annotations that are needed as Input for the Docker-Image and are returned as Output by the Docker-Image: +- ### Input (Optional): + - `de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence` (If sentences are provided, tagging is performed per sentence. Otherwise, the whole document text is processed). +- ### Output: + - `de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS` + +## 2. Included Models + +| Name | Source | Revision | Languages | +|---------------|--------------------------------------------------------------------------------|------------------------------------------|-----------| +| ancient-greek-pos-xlmr | https://huggingface.co/qbnguyen/ancient-greek-pos-xlmr | a297f1e9bffaa7831ce6f2f58d8f6f3a22948952 | Ancient Greek | + + +# How To Use + +For using duui-pos-ancient-greek as a DUUI image it is necessary to use the [Docker Unified UIMA Interface (DUUI)](https://github.com/texttechnologylab/DockerUnifiedUIMAInterface). 
+ +## Start Docker container + +```bash +docker run --rm -p 9714:9714 duui-pos-ancient-greek:latest +``` + +*(Note: If deployed to the TTLab registry, replace `duui-pos-ancient-greek:latest` with `docker.texttechnologylab.org/duui-pos-ancient-greek:latest`)* + +## Run within DUUI + +```java +composer.add( + new DUUIDockerDriver.Component("duui-pos-ancient-greek:latest") + .withScale(iWorkers) + .withImageFetching() + // Optional: specify a different HF model ID or local path + // .withParameter("model_name", "qbnguyen/ancient-greek-pos-xlmr") +); +``` + +### Parameters + +| Name | Description | +|--------------|------------------------------------| +| `model_name` | Model to use. Default is `qbnguyen/ancient-greek-pos-xlmr` | + + +# Cite + +If you use this DUUI image, please cite it as follows: + +Alexander Leonhardt, Giuseppe Abrami, Daniel Baumartz and Alexander Mehler. (2023). "Unlocking the Heterogeneous Landscape of Big Data NLP with DUUI." Findings of the Association for Computational Linguistics: EMNLP 2023, 385–399. [[LINK](https://aclanthology.org/2023.findings-emnlp.29)] [[PDF](https://aclanthology.org/2023.findings-emnlp.29.pdf)] + +## BibTeX + +```bibtex +@inproceedings{Leonhardt:et:al:2023, + title = {Unlocking the Heterogeneous Landscape of Big Data {NLP} with {DUUI}}, + author = {Leonhardt, Alexander and Abrami, Giuseppe and Baumartz, Daniel and Mehler, Alexander}, + editor = {Bouamor, Houda and Pino, Juan and Bali, Kalika}, + booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2023}, + year = {2023}, + address = {Singapore}, + publisher = {Association for Computational Linguistics}, + url = {https://aclanthology.org/2023.findings-emnlp.29}, + pages = {385--399}, + pdf = {https://aclanthology.org/2023.findings-emnlp.29.pdf}, + abstract = {Automatic analysis of large corpora is a complex task, especially + in terms of time efficiency.
This complexity is increased by the + fact that flexible, extensible text analysis requires the continuous + integration of ever new tools. Since there are no adequate frameworks + for these purposes in the field of NLP, and especially in the + context of UIMA, that are not outdated or unusable for security + reasons, we present a new approach to address the latter task: + Docker Unified UIMA Interface (DUUI), a scalable, flexible, lightweight, + and feature-rich framework for automatic distributed analysis + of text corpora that leverages Big Data experience and virtualization + with Docker. We evaluate DUUI{'}s communication approach against + a state-of-the-art approach and demonstrate its outstanding behavior + in terms of time efficiency, enabling the analysis of big text + data.} +} + +@misc{Nguyen:2026, + author = {Nguyen, Quoc-Bao}, + title = {Ancient Greek POS Tagger as {DUUI} component}, + year = {2026}, + howpublished = {https://github.com/texttechnologylab/duui-uima/tree/main/duui-pos-ancient-greek} +} +``` \ No newline at end of file diff --git a/duui-pos-ancient-greek/docker_build.sh b/duui-pos-ancient-greek/docker_build.sh new file mode 100755 index 00000000..d1772975 --- /dev/null +++ b/duui-pos-ancient-greek/docker_build.sh @@ -0,0 +1,28 @@ +#!/bin/bash +set -e + +DUUI_POS_AG_ANNOTATOR_NAME="duui-pos-ancient-greek" +DUUI_POS_AG_ANNOTATOR_VERSION="0.1.0" + +IMAGE_NAME="${DUUI_POS_AG_ANNOTATOR_NAME}" +IMAGE_TAG="${DUUI_POS_AG_ANNOTATOR_VERSION}" + +echo "=============================================" +echo "Building: ${IMAGE_NAME}:${IMAGE_TAG}" +echo "=============================================" + +# Build from project root, using the Dockerfile in src/main/docker/ +docker build \ + -t "${IMAGE_NAME}:${IMAGE_TAG}" \ + -t "${IMAGE_NAME}:latest" \ + -f src/main/docker/Dockerfile \ + . 
+ +echo "" +echo "=============================================" +echo " Build complete" +echo " Image: ${IMAGE_NAME}:${IMAGE_TAG}" +echo "" +echo "Run with:" +echo " docker run -p 9714:9714 ${IMAGE_NAME}:${IMAGE_TAG}" +echo "=============================================" \ No newline at end of file diff --git a/duui-pos-ancient-greek/pom.xml b/duui-pos-ancient-greek/pom.xml new file mode 100644 index 00000000..bec5766a --- /dev/null +++ b/duui-pos-ancient-greek/pom.xml @@ -0,0 +1,105 @@ + + + 4.0.0 + + org.hucompute.textimager.uima + duui_pos_ancient_greek + 0.1.0 + jar + + DUUI POS Ancient Greek + + DUUI component for Ancient Greek POS tagging + using a fine-tuned XLM-RoBERTa model. + + + + 17 + 17 + UTF-8 + 5.10.2 + + + + + jitpack.io + https://jitpack.io + + + + + + com.github.texttechnologylab + DockerUnifiedUIMAInterface + 1.5.3 + + + + com.github.texttechnologylab + UIMATypeSystem + 02fb1a2f13 + + + + org.apache.uima + uimaj-core + 3.5.0 + + + + org.apache.uima + uimafit-core + 3.5.0 + + + + org.dkpro.core + dkpro-core-api-lexmorph-asl + 2.4.0 + + + + org.dkpro.core + dkpro-core-api-segmentation-asl + 2.4.0 + + + + org.junit.jupiter + junit-jupiter-api + ${junit.version} + test + + + org.junit.jupiter + junit-jupiter-engine + ${junit.version} + test + + + + + + + + org.apache.maven.plugins + maven-surefire-plugin + 3.2.5 + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.12.1 + + 17 + 17 + + + + + + \ No newline at end of file diff --git a/duui-pos-ancient-greek/requirements.txt b/duui-pos-ancient-greek/requirements.txt new file mode 100644 index 00000000..7174c16d --- /dev/null +++ b/duui-pos-ancient-greek/requirements.txt @@ -0,0 +1,9 @@ +transformers>=4.40.0 +torch==2.6.0 +fastapi==0.115.0 +uvicorn[standard]==0.27.1 +pydantic-settings==2.0.2 +numpy>=1.26.0 +sentencepiece>=0.2.0 +protobuf>=5.0.0 +accelerate>=0.30.0 \ No newline at end of file diff --git a/duui-pos-ancient-greek/src/main/docker/Dockerfile 
b/duui-pos-ancient-greek/src/main/docker/Dockerfile new file mode 100644 index 00000000..c0afc32e --- /dev/null +++ b/duui-pos-ancient-greek/src/main/docker/Dockerfile @@ -0,0 +1,115 @@ +# Builds the container image for the Ancient Greek POS tagger. +# Uses a fine-tuned XLM-RoBERTa model (qbnguyen/ancient-greek-pos-xlmr) +# to tag parts of speech in Ancient Greek text, served as a DUUI +# annotator component. + + +# BORROWED. The overall skeleton (WORKDIR, EXPOSE, pip pattern, +# ARG/ENV pairs, uvicorn entrypoint) is lifted from existing +# DUUI annotator Dockerfiles. Specifically I had these open while +# writing this: +# - duui-sentencizer-spacy (the spaCy one with all the model downloads) +# - duui-flair-pos (the Flair POS tagger, for the ARG/ENV pattern) +# - my own duui-ocr Dockerfile (for the HF model pre-download trick) +# All from: https://github.com/texttechnologylab/DockerUnifiedUIMAInterface +# +# Last meaningful edit: Feb 2026 + + +# BORROWED. python:3.10-slim instead of the full 3.10 image that other +# DUUI components normally use. ChatGPT suggested this when I complained about image +# size. The "-slim" variant strips out a lot of system tooling we +# don't need (man pages, extra locales, etc). I stuck with 3.10 +# specifically because that's what other DUUI components are often on. +FROM python:3.10-slim + +WORKDIR /usr/src/app + +EXPOSE 9714 + +# -- System dependencies -- +# FRAGILE. This block exists because python:3.10-slim doesn't include +# a C compiler, and at least one of our pip dependencies (I think it's +# a tokenizers thing? or maybe sentencepiece?) needs to compile from +# source. +# +# I only found this out because `pip install` was failing with a +# cryptic "error: command 'gcc' not found" buried in the output. +# Asked ChatGPT what was going on and it explained that slim images +# don't ship build tools. +# +# The --no-install-recommends flag and the rm -rf cleanup at the end +# are from ChatGPT's suggestion to keep the image small.
The idea is: +# install gcc and friends, let pip use them to compile whatever it +# needs, and the compiled .so files stay even though we could +# theoretically remove build-essential afterward. I haven't bothered +# with that cleanup because multi-stage builds seem like a rabbit hole +# I don't need right now. +RUN apt-get update && \ + apt-get install -y --no-install-recommends build-essential && \ + rm -rf /var/lib/apt/lists/* + +# -- Python dependencies -- +# SOLID. Same pattern as every other DUUI Dockerfile. +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# -- Model pre-download -- +# BORROWED (the pattern) from my OCR Dockerfile, which I originally +# got from Copilot. AI is eating itself ouroboros style I guess. +# +# Downloads the fine-tuned XLM-RoBERTa model for Ancient Greek POS +# tagging at *build time* so the container doesn't need internet +# access at runtime. This is the same trick the OCR Dockerfile uses +# for PaddleOCR-VL and GLM-OCR. +# +# The model itself is one I published on Hugging Face. It's XLM-R +# fine-tuned on Ancient Greek POS data. I trained it myself and +# uploaded it to Hugging Face instead of loading it from a local +# path, which fits the DUUI framework so far. +# +# Note: unlike the OCR models which use AutoModelForImageTextToText, +# this one uses AutoModelForTokenClassification because POS tagging +# is a token classification task. +RUN python -c "\ +from transformers import AutoTokenizer, AutoModelForTokenClassification; \ +AutoModelForTokenClassification.from_pretrained('qbnguyen/ancient-greek-pos-xlmr'); \ +AutoTokenizer.from_pretrained('qbnguyen/ancient-greek-pos-xlmr')" + +# -- Source files -- +# Just listing the source files, nothing more to comment. +COPY src/main/python/duui_pos_ancient_greek.py . +COPY src/main/python/duui_pos_ancient_greek.lua . +COPY src/main/python/TypeSystemPOS.xml . + +# -- Configuration -- +# BORROWED. The ENV pattern is from every other DUUI Dockerfile.
+# The other Dockerfiles use the ARG/ENV pair pattern (ARG with a +# default, then ENV=$ARG) so you can override at build time with +# --build-arg. I simplified to just ENV here because I don't +# actually need build-time overrides yet. I'm the only one building +# this. If this ever goes into TTLab's pipeline I should +# probably switch to the ARG/ENV pair pattern to match the others. +# +# REVISIT: should DUUI_POS_AG_MODEL_PATH be configurable? Right now +# it's hardcoded to the one model. But if someone fine-tunes a better +# Ancient Greek POS model later, being able to swap it via env var +# without rebuilding the image would be nice. Leaving it as ENV for +# that reason even though currently there's only one option. +ENV DUUI_POS_AG_ANNOTATOR_NAME="duui-pos-ancient-greek" +ENV DUUI_POS_AG_ANNOTATOR_VERSION="0.1.0" +ENV DUUI_POS_AG_LOG_LEVEL="DEBUG" +ENV DUUI_POS_AG_MODEL_PATH="qbnguyen/ancient-greek-pos-xlmr" + +# -- Startup -- +# BORROWED. Identical pattern to every other DUUI annotator. +# uvicorn runs the FastAPI app on port 9714. +# +# Not setting --workers here (the other Dockerfiles put it in CMD). +# I should probably add CMD ["--workers", "1"] to match the +# convention, but it defaults to 1 anyway so it's not breaking +# anything. In the OCR Dockerfile I made a note about multiple workers +# each loading their own copy of the model into VRAM. Same concern +# applies here, so 1 worker is correct for now. +ENTRYPOINT ["uvicorn", "duui_pos_ancient_greek:app", \ + "--host", "0.0.0.0", "--port", "9714"] \ No newline at end of file diff --git a/duui-pos-ancient-greek/src/main/python/TypeSystemPOS.xml b/duui-pos-ancient-greek/src/main/python/TypeSystemPOS.xml new file mode 100644 index 00000000..2f064b08 --- /dev/null +++ b/duui-pos-ancient-greek/src/main/python/TypeSystemPOS.xml @@ -0,0 +1,41 @@ + + + TypeSystemPOS + + Type system for the Ancient Greek POS tagger DUUI component. + Defines POS annotation type. 
DocumentModification and AnnotationComment + are provided by the UIMATypeSystem dependency. + + 0.1.0 + TTLab / Goethe University Frankfurt + + + + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + Part-of-speech tag annotation + uima.tcas.Annotation + + + PosValue + Fine-grained POS tag value + uima.cas.String + + + coarseValue + Coarse-grained POS tag value (UPOS) + uima.cas.String + + + + + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence + Sentence annotation + uima.tcas.Annotation + + + + \ No newline at end of file diff --git a/duui-pos-ancient-greek/src/main/python/duui_pos_ancient_greek.lua b/duui-pos-ancient-greek/src/main/python/duui_pos_ancient_greek.lua new file mode 100644 index 00000000..a2f378a8 --- /dev/null +++ b/duui-pos-ancient-greek/src/main/python/duui_pos_ancient_greek.lua @@ -0,0 +1,163 @@ +--[[ +I struggled with this Lua file just as I did with the other Lua files. I don't +really know Lua, and the `luajava` bridge feels like dark magic. GitHub Copilot +was actively harmful here. It kept hallucinating Lua/Java syntax that doesn't +actually work. I ended up relying entirely on ChatGPT for debugging and pieced +this together by studying the existing DUUI components (especially the Flair +POS tagger and the Emotion annotator). + +Last meaningful edit: Feb 2026 +]] + +-- Java class bindings -- +-- BORROWED. Standard boilerplate from literally every DUUI script. +StandardCharsets = luajava.bindClass("java.nio.charset.StandardCharsets") + +-- FRAGILE. I wasted an hour trying to `require("json")` because ChatGPT +-- told me to, which broke the whole pipeline. Turns out DUUI injects `json` +-- as a global variable at runtime. + +-- SERIALIZE: CAS → JSON request -- +function serialize(inputCas, outputStream, parameters) + -- 1.
Extract document text and language + local doc_text = inputCas:getDocumentText() + local doc_lang = inputCas:getDocumentLanguage() + + -- I force default to "grc" (Greek) if unspecified, because + -- sometimes the upstream reader drops the language tag before + -- the text reaches this component. + if doc_lang == nil or doc_lang == "x-unspecified" then + doc_lang = "grc" + end + local doc_len = #doc_text + + -- 2. Extract model_name from parameters + local model_name = nil + if parameters ~= nil and parameters["model_name"] ~= nil then + model_name = parameters["model_name"] + end + + -- 3. Extract existing Sentence annotations + -- + -- SOLID / CHATGPT. This chunk took three iterations. Originally, I copied + -- the `JCasUtil:select(inputCas, Sentence):iterator()` pattern from the + -- Flair POS script. But it threw a massive Java 17 "InaccessibleObjectException" + -- about ArrayList iterators. + -- + -- ChatGPT explained that Java 17 blocks reflection on certain native Java + -- classes, and suggested using UIMA's native index instead of JCasUtil + -- to bypass the security block. I don't fully grasp UIMA's index internals, + -- but this approach doesn't crash. + local sentences = {} + local sent_counter = 1 + local has_sentences = false + local sentence_type = "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" + + local uimaType = inputCas:getTypeSystem():getType(sentence_type) + if uimaType ~= nil then + local sent_index = inputCas:getAnnotationIndex(uimaType) + if sent_index ~= nil then + local it = sent_index:iterator() + while it:hasNext() do + local sent = it:next() + sentences[sent_counter] = { + begin = sent:getBegin(), + ["end"] = sent:getEnd(), + text = sent:getCoveredText() + } + sent_counter = sent_counter + 1 + has_sentences = true + end + end + end + + -- 4. 
Build JSON request + local request = { + doc_text = doc_text, + doc_len = doc_len, + lang = doc_lang, + model_name = model_name + } + + if has_sentences then + request.sentences = sentences + end + + -- 5. Write to output stream + outputStream:write(json.encode(request)) +end + +-- DESERIALIZE: JSON response → CAS annotations -- +function deserialize(inputCas, inputStream) + -- 1. Read and parse the JSON response + local javaString = luajava.newInstance("java.lang.String", inputStream:readAllBytes(), StandardCharsets.UTF_8) + + -- CHATGPT (CRITICAL FIX). The java.lang.String returned above looks + -- like a string to Lua, but it's actually a Java object reference. + -- `json.decode` was failing silently and returning nil. ChatGPT caught + -- this typing mismatch. You *must* cast it to a native Lua string. + local inputString = tostring(javaString) + local response = json.decode(inputString) + + if response == nil then + print("LUA ERROR: json.decode returned nil. Cannot parse response.") + return + end + + -- DEBUG PRINT. I added this because I kept getting silent failures + -- when the Python inference server crashed. This forces Python errors + -- into the TextImager Java logs. + if response["errors"] ~= nil and #response["errors"] > 0 then + for _, err in ipairs(response["errors"]) do + print("PYTHON API ERROR: " .. tostring(err)) + end + end + + -- 2. Get type references + local pos_type = "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS" + + -- 3. Create POS annotations for each token + -- BORROWED. The instantiation and `.addToIndexes()` pattern is lifted + -- almost verbatim from the DUUI Flair POS script. + if response["tokens"] ~= nil then + for _, token in ipairs(response["tokens"]) do + local pos = luajava.newInstance(pos_type, inputCas) + pos:setBegin(token["begin"]) + pos:setEnd(token["end"]) + pos:setPosValue(token["pos_value"]) + + -- Unlike Flair, I'm setting the coarse value too since my Python + -- script returns it. 
+ pos:setCoarseValue(token["pos_coarse_value"]) + pos:addToIndexes() + end + else + print("LUA WARNING: 'tokens' array is nil or missing in the response.") + end + + -- 4. Create MetaData annotation + -- BORROWED. I took this `DocumentModification` block from the Emotion + -- and spaCy sentencizer scripts. It leaves an audit trail in the CAS + -- so that my tags show up properly with a timestamp and model version + -- in the TextImager UI. + local meta_type = "org.texttechnologylab.annotation.DocumentModification" + local meta = luajava.newInstance(meta_type, inputCas) + meta:setUser(response["model_name"] or "duui-pos-ancient-greek") + meta:setTimestamp(os.time()) + meta:setComment( + "POS tagging by " .. (response["model_name"] or "unknown") + .. " v" .. (response["model_version"] or "0.1.0") + ) + meta:addToIndexes() + + -- 5. Create AnnotationComment for any errors + if response["errors"] ~= nil and #response["errors"] > 0 then + local comment_type = "org.texttechnologylab.annotation.AnnotationComment" + for _, err in ipairs(response["errors"]) do + local comment = luajava.newInstance(comment_type, inputCas) + comment:setKey("error") + comment:setValue(err) + comment:addToIndexes() + end + end +end \ No newline at end of file diff --git a/duui-pos-ancient-greek/src/main/python/duui_pos_ancient_greek.py b/duui-pos-ancient-greek/src/main/python/duui_pos_ancient_greek.py new file mode 100644 index 00000000..776bef3f --- /dev/null +++ b/duui-pos-ancient-greek/src/main/python/duui_pos_ancient_greek.py @@ -0,0 +1,437 @@ +""" +POS tagger for Ancient Greek, built as a DUUI component for the +TextImager pipeline. Uses a fine-tuned XLM-RoBERTa model (trained +on the UD Perseus treebank) to tag tokens with Universal POS labels. + +I did not write this from nothing. 
The DUUI boilerplate: the endpoint +structure, the Lua communication layer, the typesystem handshake, +is borrowed heavily from the existing TTLab components, especially +the Flair POS tagger and the spaCy sentencizer . I studied those to +understand how DUUI components are supposed to be wired up, then +adapted the skeleton for my own model. + +The actual inference logic (tokenisation, subword-to-word alignment) +was written with a lot of help from GitHub Copilot and several rounds +of asking ChatGPT "why is word_ids() returning None for special tokens." + +Last meaningful edit: Feb 2026 +""" + +import logging +import os +import re +from functools import lru_cache +from pathlib import Path +from typing import Optional + +import torch +import uvicorn +from fastapi import FastAPI +from fastapi.responses import PlainTextResponse +from pydantic import BaseModel +from transformers import AutoModelForTokenClassification, AutoTokenizer + +# -- Config -- +# BORROWED. This env-var-based config pattern is straight from the +# DUUI emotion and sentiment components. +# I liked it better than the pydantic BaseSettings approach used in +# the spaCy sentencizer, mostly because I don't fully understand how +# pydantic settings auto-loads from env vars and I didn't want to +# debug that on top of everything else. + +ANNOTATOR_NAME = os.environ.get( + "DUUI_POS_AG_ANNOTATOR_NAME", "duui-pos-ancient-greek" +) +ANNOTATOR_VERSION = os.environ.get( + "DUUI_POS_AG_ANNOTATOR_VERSION", "0.1.0" +) +LOG_LEVEL = os.environ.get("DUUI_POS_AG_LOG_LEVEL", "DEBUG") +MODEL_PATH = os.environ.get( + "DUUI_POS_AG_MODEL_PATH", "qbnguyen/ancient-greek-pos-xlmr" +) + +COMPONENT_ROOT = Path(__file__).parent + +logging.basicConfig(level=getattr(logging, LOG_LEVEL)) +logger = logging.getLogger(__name__) + +# BORROWED. The Flair POS component and the spaCy sentencizer both have +# this. I assume it's for performance but honestly I just copied the +# pattern because it seemed like the right thing to do. 
+_TYPESYSTEM_XML = (COMPONENT_ROOT / "TypeSystemPOS.xml").read_text("utf-8") +_LUA_SCRIPT = ( + (COMPONENT_ROOT / "duui_pos_ancient_greek.lua").read_text("utf-8") +) + +# Punctuation pattern for the tokenizer. +# FRAGILE. I assembled this character class myself by looking at what +# shows up in my Ancient Greek test corpus. The middle dot (·) and the +# Greek question mark (;) are the ones that kept tripping me up. +# There are probably more punctuation marks in Unicode Greek ranges +# that I'm missing. If tokens start looking wrong, check here first. +_PUNCT = r"""[,.:;!?·;()\[\]«»\u201c\u201d\u2018\u2019]+""" + +# -- Schemas -- +# BORROWED. The request/response schema pattern comes from the DUUI +# components. The Flair tagger uses DkproSentence / DkproPos, the +# emotion component uses UimaSentence, etc. I renamed things to match +# what my component actually does but the shape is the same. +# +# I asked ChatGPT: "what is the difference between a Pydantic BaseModel +# and a regular dataclass" and the answer was helpful enough that I +# stopped worrying and just used BaseModel like everyone else. + + +class Sentence(BaseModel): + begin: int + end: int + text: str + + +class PosRequest(BaseModel): + doc_text: str + doc_len: int + lang: str = "grc" + model_name: Optional[str] = None + sentences: Optional[list[Sentence]] = None + + +class TokenPOS(BaseModel): + begin: int + end: int + pos_value: str + pos_coarse_value: str + + +class PosResponse(BaseModel): + tokens: list[TokenPOS] + model_name: str + model_version: str + model_source: str + model_lang: str + errors: list[str] + + +# -- Model loading -- + +# BORROWED. The lru_cache trick for model loading appears in every +# single DUUI component I looked at. The Flair tagger has a +# configurable cache size, the emotion component uses a lock + cache +# combo. I went with the simplest version: cache one model, no lock. +# +# REVISIT. 
The emotion and sentiment components use a threading Lock +# around model loading/inference. I'm not doing that because I only +# run one worker (see uvicorn config at the bottom), but if I ever +# scale this up I'll need to add locking. I only half-understand why +# concurrent access to a pytorch model is dangerous. + +@lru_cache(maxsize=1) +def load_model(model_path: str): + logger.info("Loading model from %s", model_path) + tokenizer = AutoTokenizer.from_pretrained(model_path) + model = AutoModelForTokenClassification.from_pretrained(model_path) + model.eval() + logger.info("Model loaded successfully on CPU") + return model, tokenizer + + +# -- Tokenisation + +# SOLID (mostly). I wrote this function myself. It's the part I +# understand best because it's basically text processing, which is +# closer to my wheelhouse than the ML inference stuff. +# +# The idea: split on whitespace, then peel off leading and trailing +# punctuation as separate tokens. I need character offsets because +# DUUI maps annotations back onto the original document by position. +# +# I went through about four versions of this. First attempt used +# spaCy's tokenizer but it was overkill and slow for Ancient Greek. +# Second attempt was a naive whitespace split that broke on «εἶπεν» +# because the guillemets stayed glued to the word. Current version +# handles that. +# +# The _append helper inside the function is a pattern Copilot +# suggested when I kept repeating the dict construction. I would +# have just written it out each time, but this is cleaner. + +def tokenize_raw_text(text: str) -> list[dict]: + """Split *text* into word tokens with character offsets, + separating leading/trailing punctuation.""" + tokens: list[dict] = [] + + def _append(form: str, start: int): + tokens.append({"form": form, "begin": start, "end": start + len(form)}) + + for m in re.finditer(r"\S+", text): + word, ws = m.group(), m.start() + + # peel off leading punctuation — «, (, [, etc. 
+ lead = re.match(f"^({_PUNCT})", word) + if lead: + _append(lead.group(1), ws) + ws += lead.end() + word = word[lead.end() :] + if not word: + continue + + # peel off trailing punctuation — same idea, from the right + trail = re.search(f"({_PUNCT})$", word) + trail_tok = None + if trail: + trail_tok = (trail.group(1), ws + trail.start()) + word = word[: trail.start()] + + if word: + _append(word, ws) + if trail_tok: + _append(*trail_tok) + + return tokens + + +# -- POS inference -- + +# COPILOT wrote the first draft of this function. My prompt was +# roughly: "given a list of pre-tokenized words, run them through +# a HuggingFace token classification model and map subword predictions +# back to the original words using word_ids()" +# +# I then rewrote parts of it after spending a long time reading: +# https://huggingface.co/docs/transformers/tasks/token_classification +# and this Stack Overflow answer about word_ids() alignment: +# https://stackoverflow.com/a/75903065 +# +# The key thing I learned (from ChatGPT, after staring at wrong output +# for two hours): when you pass is_split_into_words=True, the tokenizer +# may split a single word into multiple subword tokens. word_ids() +# tells you which original word each subword belongs to. We only want +# the prediction for the *first* subword of each word. That's what +# the `seen` set is for. I understand this now but I would not have +# figured it out without help. +# +# FRAGILE. The max_length=256 truncation means very long sentences +# will lose tokens at the end silently. My corpus doesn't have +# sentences that long, but if yours does, raise this. I don't know +# what the actual max is for XLM-RoBERTa. + +def predict_pos( + text: str, offset: int, model, tokenizer +) -> list[TokenPOS]: + if not text or not text.strip(): + return [] + + word_tokens = tokenize_raw_text(text) + if not word_tokens: + return [] + + words = [t["form"] for t in word_tokens] + + # Tokenize with the model's subword tokenizer. 
+ # is_split_into_words=True tells it we already split on whitespace. + encoding = tokenizer( + words, + is_split_into_words=True, + truncation=True, + max_length=256, + return_tensors="pt", + ) + + # Run inference: no gradient computation needed, we're just predicting + with torch.no_grad(): + logits = model(**encoding).logits + + # argmax gives us the most likely label index for each subword token + preds = torch.argmax(logits, dim=-1)[0].tolist() + word_ids = encoding.word_ids() + id2label = model.config.id2label + + # Map subword predictions back to our original word tokens. + # We only take the first subword's prediction for each word. + # COPILOT. This loop structure is mostly Copilot's. I added the + # offset arithmetic to make the character positions absolute + # (relative to the full document, not just this sentence). + results: list[TokenPOS] = [] + seen: set[int] = set() + for sw_idx, wid in enumerate(word_ids): + if wid is None or wid in seen: + continue + seen.add(wid) + tok = word_tokens[wid] + label = id2label[preds[sw_idx]] + results.append( + TokenPOS( + begin=tok["begin"] + offset, + end=tok["end"] + offset, + pos_value=label, + # I'm setting coarse and fine to the same value because the + # model only outputs Universal POS tags. The DUUI type system + # expects both fields. The Flair POS component leaves + # coarse_value empty (""), but I figured identical values + # are more informative than blank. + pos_coarse_value=label, + ) + ) + return results + + +# -- Helpers -- + +# SOLID. Just bundles the response. Nothing clever happening here. +def _make_response( + tokens: list[TokenPOS], + model_path: str, + errors: list[str], +) -> PosResponse: + return PosResponse( + tokens=tokens, + model_name=model_path, + model_version=ANNOTATOR_VERSION, + model_source=model_path, + model_lang="grc", + errors=errors, + ) + + +# -- FastAPI -- +# BORROWED. The endpoint structure is required by the DUUI protocol. +# Every DUUI component follows this pattern. 
I copied the skeleton from +# the Flair POS tagger and the spaCy sentencizer, then filled in my +# own details. + +app = FastAPI( + title=ANNOTATOR_NAME, + version=ANNOTATOR_VERSION, + description="DUUI component for Ancient Greek POS tagging", +) + + +# Returns the UIMA type system XML. +# The Flair component returns this with media_type="application/xml", +# but I'm using PlainTextResponse like the simpler components do. +@app.get("/v1/typesystem", response_class=PlainTextResponse) +def get_typesystem(): + return _TYPESYSTEM_XML + + +@app.get("/v1/communication_layer", response_class=PlainTextResponse) +def get_communication_layer(): + return _LUA_SCRIPT + + +# BORROWED. The documentation endpoint structure is adapted from the +# Flair POS component. The Flair version has a proper TextImagerDocumentation +# Pydantic model with a capabilities field. The spaCy version does too. +# I simplified mine to a plain dict because the emotion component by +# Bagci literally just returns the string "Test" for this endpoint and +# apparently that's fine? So I figured a real dict is already an +# improvement. +# +# REVISIT. Should probably add a TextImagerCapability model like the +# Flair and spaCy components do. Right now this is just a dict. +@app.get("/v1/documentation") +def get_documentation(): + return { + "annotator_name": ANNOTATOR_NAME, + "version": ANNOTATOR_VERSION, + "implementation_lang": "Python", + "meta": { + "description": ( + "Part-of-Speech tagger for Ancient Greek using a " + "fine-tuned XLM-RoBERTa model on UD Perseus treebank." + ), + "language": "grc", + "model": "xlm-roberta-base (fine-tuned)", + "training_data": "UD_Ancient_Greek-Perseus", + "tagset": "Universal POS (17 tags)", + }, + "parameters": { + "model_name": { + "type": "string", + "description": "Path or HF Hub ID for the model", + "default": MODEL_PATH, + } + }, + } + + +# BORROWED. From the Flair POS component. 
Maps DUUI input/output types +# so the Java pipeline knows what annotations this component reads and +# produces. +@app.get("/v1/details/input_output") +def get_input_output(): + return { + "inputs": [ + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" + ], + "outputs": [ + "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS" + ], + } + + +# BORROWED. The overall structure (try to load model, iterate over +# sentences, collect results, catch exceptions into an error list) +# is modelled on the emotion and sentiment components. +# Those components process "selections" of sentences; mine is simpler +# because I only handle one selection type (sentences). +# +# The fallback path (when no sentences are provided) splits on newlines. +# I added this because during testing I kept sending raw text without +# pre-segmented sentences and getting empty results back. Took me +# embarrassingly long to realise the sentence list was just empty. +# +# FRAGILE. The newline fallback uses `cur += len(line) + 1` to track +# character offsets. The +1 is for the "\n" that split() removed. With +# \r\n endings the "\r" stays in `line` and is counted by len(), so the +# offsets still line up; text using bare "\r" is not split at all. +@app.post("/v1/process", response_model=PosResponse) +def process(request: PosRequest): + model_path = request.model_name or MODEL_PATH + + try: + model, tokenizer = load_model(model_path) + except Exception as e: + logger.error("Failed to load model: %s", e) + return _make_response([], model_path, [f"Model load error: {e}"]) + + all_tokens: list[TokenPOS] = [] + errors: list[str] = [] + + try: + if request.sentences: + for sent in request.sentences: + all_tokens.extend( + predict_pos(sent.text, sent.begin, model, tokenizer) + ) + else: + # No pre-segmented sentences, fall back to line-by-line. + # Not ideal but better than returning nothing.
+ cur = 0 + for line in request.doc_text.split("\n"): + if line.strip(): + all_tokens.extend( + predict_pos(line, cur, model, tokenizer) + ) + cur += len(line) + 1 # +1 for the newline character + except Exception as e: + logger.error("Inference error: %s", e, exc_info=True) + errors.append(f"Inference error: {e}") + + return _make_response(all_tokens, model_path, errors) + + +# -- Entry point -- +# SOLID. Standard uvicorn startup. workers=1 because I don't want to +# deal with concurrent model access. Port 9714 was chosen arbitrarily. +# The other DUUI components each seem to pick their own port and I just +# made sure mine didn't collide with any of the ones I saw in their +# docker-compose files. + +if __name__ == "__main__": + uvicorn.run( + "duui_pos_ancient_greek:app", + host="0.0.0.0", + port=9714, + workers=1, + ) \ No newline at end of file diff --git a/duui-pos-ancient-greek/src/test/java/org/hucompute/textimager/uima/pos/AncientGreekPOSTest.java b/duui-pos-ancient-greek/src/test/java/org/hucompute/textimager/uima/pos/AncientGreekPOSTest.java new file mode 100644 index 00000000..92518b2e --- /dev/null +++ b/duui-pos-ancient-greek/src/test/java/org/hucompute/textimager/uima/pos/AncientGreekPOSTest.java @@ -0,0 +1,218 @@ +package org.hucompute.textimager.uima.pos; + +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; +import org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.junit.jupiter.api.*; +import org.texttechnologylab.DockerUnifiedUIMAInterface.DUUIComposer; +import org.texttechnologylab.DockerUnifiedUIMAInterface.driver.DUUIRemoteDriver; +import org.texttechnologylab.DockerUnifiedUIMAInterface.lua.DUUILuaContext; + +import java.util.Collection; + +import static org.junit.jupiter.api.Assertions.*; + +/** + * Integration tests for the Ancient Greek POS tagger DUUI component.
+ */ +@TestMethodOrder(MethodOrderer.OrderAnnotation.class) +class AncientGreekPOSTest { + + private static DUUIComposer composer; + private static final String ENDPOINT = "http://localhost:9714"; + + @BeforeAll + static void setUp() throws Exception { + // Initialize DUUI composer with a remote driver + DUUILuaContext ctx = new DUUILuaContext().withJsonLibrary(); + + composer = new DUUIComposer() + .withLuaContext(ctx) + .withSkipVerification(true); + + DUUIRemoteDriver remoteDriver = new DUUIRemoteDriver(); + composer.addDriver(remoteDriver); + + // Add the remote POS component + composer.add(new DUUIRemoteDriver.Component(ENDPOINT)); + + System.out.println("DUUI Composer initialized, endpoint: " + ENDPOINT); + } + + @AfterAll + static void tearDown() throws Exception { + if (composer != null) { + composer.shutdown(); + } + } + + /** + * Test 1: Simple single-line Ancient Greek sentence. + * Verifies POS annotations are created and cover known words. + */ + @Test + @Order(1) + @DisplayName("Test simple Iliad opening line") + void testSimpleSentence() throws Exception { + String text = "Μῆνιν ἄειδε θεὰ Πηληϊάδεω Ἀχιλῆος"; + + JCas jCas = JCasFactory.createJCas(); + jCas.setDocumentText(text); + jCas.setDocumentLanguage("grc"); + + // Run the pipeline + composer.run(jCas); + + // Collect POS annotations + Collection posAnnotations = JCasUtil.select(jCas, POS.class); + + System.out.println("\n--- Test 1: Simple Sentence ---"); + System.out.println("Text: " + text); + System.out.println("POS annotations found: " + posAnnotations.size()); + + for (POS pos : posAnnotations) { + String word = pos.getCoveredText(); + System.out.printf(" [%d:%d] %-20s → %s%n", + pos.getBegin(), pos.getEnd(), word, pos.getPosValue()); + } + + // Assertions + assertFalse(posAnnotations.isEmpty(), + "Should have at least one POS annotation"); + assertTrue(posAnnotations.size() >= 5, + "Expected at least 5 tokens, got " + posAnnotations.size()); + + // Verify every annotation has a valid POS 
value + for (POS pos : posAnnotations) { + assertNotNull(pos.getPosValue(), + "POS value should not be null for: " + pos.getCoveredText()); + assertFalse(pos.getPosValue().isEmpty(), + "POS value should not be empty for: " + pos.getCoveredText()); + assertTrue(pos.getBegin() >= 0, "Begin offset should be >= 0"); + assertTrue(pos.getEnd() <= text.length(), + "End offset should be <= text length"); + assertTrue(pos.getBegin() < pos.getEnd(), + "Begin should be < End"); + } + } + + /** + * Test 2: Multi-line passage from the Iliad. + * Verifies annotations cover the entire document. + */ + @Test + @Order(2) + @DisplayName("Test multi-line Iliad passage") + void testMultiLineSentence() throws Exception { + String text = + "οὐλομένην, ἣ μυρί' Ἀχαιοῖς ἄλγε' ἔθηκε\n" + + "πολλὰς δ' ἰφθίμους ψυχὰς Ἄϊδι προΐαψεν ,\n" + + "ἡρώων, αὐτοὺς δὲ ἑλώρια τεῦχε κύνεσσιν"; + + JCas jCas = JCasFactory.createJCas(); + jCas.setDocumentText(text); + jCas.setDocumentLanguage("grc"); + + composer.run(jCas); + + Collection posAnnotations = JCasUtil.select(jCas, POS.class); + + System.out.println("\n--- Test 2: Multi-line Passage ---"); + System.out.println("Text length: " + text.length()); + System.out.println("POS annotations found: " + posAnnotations.size()); + + for (POS pos : posAnnotations) { + String word = pos.getCoveredText(); + System.out.printf(" [%d:%d] %-20s → %s%n", + pos.getBegin(), pos.getEnd(), word, pos.getPosValue()); + } + + // Should have tokens from all three lines + assertTrue(posAnnotations.size() >= 15, + "Expected at least 15 tokens across 3 lines, got " + + posAnnotations.size()); + + // Verify last annotation's end offset is within the text + POS lastPos = null; + for (POS pos : posAnnotations) { + lastPos = pos; + } + assertNotNull(lastPos); + assertTrue(lastPos.getEnd() <= text.length(), + "Last token's end should be within text bounds"); + + // Verify covered text matches the document + for (POS pos : posAnnotations) { + String covered = 
text.substring(pos.getBegin(), pos.getEnd()); + assertEquals(covered, pos.getCoveredText(), + "Covered text mismatch at offset " + pos.getBegin()); + } + } + + /** + * Test 3: Empty text input. + * Verifies the component handles it gracefully without crashing. + */ + @Test + @Order(3) + @DisplayName("Test empty text handling") + void testEmptyText() throws Exception { + JCas jCas = JCasFactory.createJCas(); + jCas.setDocumentText(""); + jCas.setDocumentLanguage("grc"); + + // Should not throw + assertDoesNotThrow(() -> composer.run(jCas)); + + Collection posAnnotations = JCasUtil.select(jCas, POS.class); + + System.out.println("\n--- Test 3: Empty Text ---"); + System.out.println("POS annotations found: " + posAnnotations.size()); + + assertEquals(0, posAnnotations.size(), + "Empty text should produce no POS annotations"); + } + + /** + * Test 4: Verify specific POS tag for a known word. + * "Μῆνιν" (wrath, accusative) should be tagged as NOUN. + */ + @Test + @Order(4) + @DisplayName("Test known word POS prediction") + void testKnownWordTag() throws Exception { + String text = "Μῆνιν ἄειδε θεά"; + + JCas jCas = JCasFactory.createJCas(); + jCas.setDocumentText(text); + jCas.setDocumentLanguage("grc"); + + composer.run(jCas); + + Collection posAnnotations = JCasUtil.select(jCas, POS.class); + + System.out.println("\n--- Test 4: Known Word POS ---"); + for (POS pos : posAnnotations) { + System.out.printf(" %-20s → %s%n", + pos.getCoveredText(), pos.getPosValue()); + } + + // Find the first token (should be Μῆνιν) + POS firstToken = posAnnotations.iterator().next(); + assertEquals("Μῆνιν", firstToken.getCoveredText()); + assertEquals("NOUN", firstToken.getPosValue(), + "Μῆνιν (wrath/acc) should be tagged as NOUN"); + + // Verify ἄειδε is VERB + boolean foundVerb = false; + for (POS pos : posAnnotations) { + if ("ἄειδε".equals(pos.getCoveredText())) { + assertEquals("VERB", pos.getPosValue(), + "ἄειδε (sing!) 
should be tagged as VERB"); + foundVerb = true; + } + } + assertTrue(foundVerb, "Should find ἄειδε in annotations"); + } +} \ No newline at end of file diff --git a/duui-transformers-Emotion/Readme.md b/duui-transformers-Emotion/Readme.md index 7655adee..c1de6021 100644 --- a/duui-transformers-Emotion/Readme.md +++ b/duui-transformers-Emotion/Readme.md @@ -53,6 +53,7 @@ DUUI implementation for selected Hugging-Face-based transformer [Emotion tools]( | universal-joy-pt-small | https://github.com/sotlampr/universal-joy | 6ab01e98c8106e610247e5e8f0712af08c007b67 | PT | | universal-joy-tl-small | https://github.com/sotlampr/universal-joy | 6ab01e98c8106e610247e5e8f0712af08c007b67 | TL | | universal-joy-zh-small | https://github.com/sotlampr/universal-joy | 6ab01e98c8106e610247e5e8f0712af08c007b67 | ZH | +| phobert-emotion | https://huggingface.co/visolex/phobert-emotion | 6099c5a6f91fc6c8175818e37f96fecad0c96b63 | VI | # How To Use diff --git a/duui-transformers-Emotion/docker_build.sh b/duui-transformers-Emotion/docker_build.sh old mode 100644 new mode 100755 index 89229e33..777aac30 --- a/duui-transformers-Emotion/docker_build.sh +++ b/duui-transformers-Emotion/docker_build.sh @@ -377,6 +377,13 @@ export MODEL_LANG="DE" #export MODEL_LANG="ZH" ####-------------------------------------------------------------------- +####--------------------------------------------------------------------- +export MODEL_NAME="visolex/phobert-emotion" +export MODEL_SPECNAME="phobert-emotion" +export MODEL_VERSION="90460fb946cf640ef9c56ae484cabb49d48ef14e" +export MODEL_SOURCE="https://huggingface.co/visolex/phobert-emotion" +export MODEL_LANG="VI" +####-------------------------------------------------------------------- docker build \ --build-arg ANNOTATOR_NAME \ diff --git a/duui-transformers-Emotion/pom.xml b/duui-transformers-Emotion/pom.xml index 4cfec8fc..a265f77a 100644 --- a/duui-transformers-Emotion/pom.xml +++ b/duui-transformers-Emotion/pom.xml @@ -99,9 +99,9 @@ 
${ttlab.duui.version} --> - com.github.mevbagci + com.github.texttechnologylab DockerUnifiedUIMAInterface - 1.4.9 + 1.5.3 @@ -110,9 +110,9 @@ - com.github.mevbagci + com.github.texttechnologylab UIMATypeSystem - 3.0.13 + 02fb1a2f13 diff --git a/duui-transformers-Emotion/src/main/docker/Dockerfile b/duui-transformers-Emotion/src/main/docker/Dockerfile index aa9f53c9..77acfd92 100644 --- a/duui-transformers-Emotion/src/main/docker/Dockerfile +++ b/duui-transformers-Emotion/src/main/docker/Dockerfile @@ -44,7 +44,8 @@ RUN python -c "from transformers import pipeline; pipeline('text-classification' #RUN python -c "from transformers import pipeline; pipeline('text-classification', model='alex-shvets/roberta-large-emopillars-contextual-emocontext')" #RUN python -c "from transformers import pipeline; pipeline('text-classification', model='AdapterHub/bert-base-uncased-pf-emo')" #RUN python -c "from pytorch_transformers import (BertTokenizer, BertModel, BertConfig,); BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False); BertModel.from_pretrained('bert-base-multilingual-cased')" - +RUN python -c "from transformers import pipeline; pipeline('text-classification', \ + model='visolex/phobert-emotion')" # copy scripts COPY ./src/main/python/TypeSystemEmotion.xml ./TypeSystemEmotion.xml @@ -101,4 +102,4 @@ ENV TRANSFORMERS_OFFLINE=$DUUI_TRANSFORMERS_TRANSFORMERS_OFFLINE ENTRYPOINT ["uvicorn", "duui_transformers_emotion:app", "--host", "0.0.0.0", "--port" ,"9714"] -CMD ["--workers", "1"] \ No newline at end of file +CMD ["--workers", "1"] diff --git a/duui-transformers-Emotion/src/main/docker/Dockerfile-cuda b/duui-transformers-Emotion/src/main/docker/Dockerfile-cuda index 33bfef7e..9aceb468 100644 --- a/duui-transformers-Emotion/src/main/docker/Dockerfile-cuda +++ b/duui-transformers-Emotion/src/main/docker/Dockerfile-cuda @@ -56,6 +56,8 @@ RUN python -c "from transformers import pipeline; pipeline('text-classification' #RUN python -c "from 
transformers import pipeline; pipeline('text-classification', model='alex-shvets/roberta-large-emopillars-contextual-emocontext')" #RUN python -c "from transformers import pipeline; pipeline('text-classification', model='AdapterHub/bert-base-uncased-pf-emo')" #RUN python -c "from pytorch_transformers import (BertTokenizer, BertModel, BertConfig,); BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False); BertModel.from_pretrained('bert-base-multilingual-cased')" +RUN python -c "from transformers import pipeline; pipeline('text-classification', \ + model='visolex/phobert-emotion')" # copy scripts COPY ./src/main/python/TypeSystemEmotion.xml ./TypeSystemEmotion.xml @@ -112,4 +114,4 @@ ENV TRANSFORMERS_OFFLINE=$DUUI_TRANSFORMERS_TRANSFORMERS_OFFLINE ENTRYPOINT ["uvicorn", "duui_transformers_emotion:app", "--host", "0.0.0.0", "--port" ,"9714"] -CMD ["--workers", "1"] \ No newline at end of file +CMD ["--workers", "1"] diff --git a/duui-transformers-Emotion/src/main/python/EmotionDetection.py b/duui-transformers-Emotion/src/main/python/EmotionDetection.py index 7802f8f1..63c112c8 100644 --- a/duui-transformers-Emotion/src/main/python/EmotionDetection.py +++ b/duui-transformers-Emotion/src/main/python/EmotionDetection.py @@ -34,7 +34,8 @@ def sigmoid(x): "SamLowe": "SamLowe/roberta-base-go_emotions", "michellejieli": "michellejieli/emotion_text_classifier", "EmoAtlas": "EmoAtlas", - "MRM8488": "mrm8488/t5-base-finetuned-emotion" + "MRM8488": "mrm8488/t5-base-finetuned-emotion", + "PhoBERT": "visolex/phobert-emotion" } map_emotion = { "DReAMy-lib/xlm-roberta-large-DreamBank-emotion-presence": { @@ -393,6 +394,14 @@ def sigmoid(x): 1: "happy", 2: "sad", 3: "angry" + }, + "visolex/phobert-emotion": { + 0: "enjoyment", + 1: "sadness", + 2: "anger", + 3: "fear", + 4: "disgust", + 5: "surprise" } } @@ -492,7 +501,10 @@ def __init__(self, model_name: str, device='cuda:0'): self.tokenizer = AutoTokenizer.from_pretrained(model_name) self.model = 
AutoModelForSequenceClassification.from_pretrained(model_name).to(device) self.class_mapping = self.model.config.id2label - self.labels = list(map_emotion[model_name].values()) + if model_name in map_emotion and len(map_emotion[model_name]) == len(self.class_mapping): + self.labels = list(map_emotion[model_name].values()) + else: + self.labels = [self.class_mapping[i] for i in sorted(self.class_mapping.keys())] def emotion_prediction(self, texts: List[str]): with torch.no_grad(): diff --git a/duui-transformers-Emotion/src/main/python/duui_transformers_emotion.py b/duui-transformers-Emotion/src/main/python/duui_transformers_emotion.py index 1b962f3a..d27b82b0 100644 --- a/duui-transformers-Emotion/src/main/python/duui_transformers_emotion.py +++ b/duui-transformers-Emotion/src/main/python/duui_transformers_emotion.py @@ -30,6 +30,7 @@ "mrm8488/t5-base-finetuned-emotion": "https://huggingface.co/mrm8488/t5-base-finetuned-emotion", "EmoAtlas": "https://github.com/alfonsosemeraro/emoatlas", "pysentimiento": "https://github.com/pysentimiento/pysentimiento/", + "visolex/phobert-emotion": "https://huggingface.co/visolex/phobert-emotion", } languages = { @@ -46,6 +47,7 @@ "mrm8488/t5-base-finetuned-emotion": "en", "SamLowe/roberta-base-go_emotions": "en", "ActivationAI/distilbert-base-uncased-finetuned-emotion": "en", + "visolex/phobert-emotion": "vi", } versions = { @@ -62,6 +64,7 @@ "mrm8488/t5-base-finetuned-emotion": "e44a316825f11230724b36412fbf1899c76e82de", "EmoAtlas": "adae44a80dd55c1d1c467c4e72bdb2d8cf63bf28", "pysentimiento": "60822acfd805ad5d95437c695daa33c18dbda060", + "visolex/phobert-emotion": "90460fb946cf640ef9c56ae484cabb49d48ef14e", } diff --git a/duui-transformers-Emotion/src/test/java/org/hucompute/textimager/uima/transformers/emotion/EmotionTest.java b/duui-transformers-Emotion/src/test/java/org/hucompute/textimager/uima/transformers/emotion/EmotionTest.java index d0547ea1..9ff6e22a 100644 --- 
a/duui-transformers-Emotion/src/test/java/org/hucompute/textimager/uima/transformers/emotion/EmotionTest.java +++ b/duui-transformers-Emotion/src/test/java/org/hucompute/textimager/uima/transformers/emotion/EmotionTest.java @@ -225,4 +225,48 @@ public void TurkishTest() throws Exception { Assertions.assertEquals(expected_emotions.get(expected.indexOf(emotion)), key); } } + + + @Test + public void VietnameseTest() throws Exception { + composer.add( + new DUUIRemoteDriver.Component(url) + .withParameter("selection", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence") + ); + + List sentences = Arrays.asList( + "Tao ghét mày. Tao đang rất tức giận", + "Tôi rất vui khi được ở đây. Tôi yêu nơi này." + ); + + createCas("vi", sentences); + composer.run(cas); + + Collection all_emotions = JCasUtil.select(cas, Emotion.class); + ArrayList> expected = new ArrayList<>(); + + for (Emotion emotion : all_emotions) { + System.out.println("Text: " + emotion.getCoveredText()); + Map emotions = new HashMap<>(); + FSArray emotions_all = emotion.getEmotions(); + + for (AnnotationComment comment : emotions_all) { + emotions.put(comment.getKey(), Float.parseFloat(comment.getValue())); + System.out.println(" " + comment.getKey() + ": " + comment.getValue()); + } + expected.add(emotions); + } + + // Expected: Anger, Enjoyment (CAPITALIZED to match model output) + ArrayList expected_emotions = new ArrayList<>(Arrays.asList("Anger", "Enjoyment")); + + for (int i = 0; i < expected.size(); i++) { + String top_emotion = Collections.max( + expected.get(i).entrySet(), + Map.Entry.comparingByValue() + ).getKey(); + Assertions.assertEquals(expected_emotions.get(i), top_emotion); + } + } + } \ No newline at end of file diff --git a/duui-transformers-sentiment-atomar/Readme.md b/duui-transformers-sentiment-atomar/Readme.md index aa00640f..ce23c9dd 100644 --- a/duui-transformers-sentiment-atomar/Readme.md +++ b/duui-transformers-sentiment-atomar/Readme.md @@ -18,6 +18,7 @@ DUUI
implementation for selected Hugging-Face-based transformer [Sentiment tools | roberta-based-en | https://huggingface.co/j-hartmann/sentiment-roberta-large-english-3-classes | 81cdc0fe3eee1bc18d95ffdfb56b2151a39c9007 | EN | | finance-sentiment-de | https://huggingface.co/bardsai/finance-sentiment-de-base | 51b3d03f716eaa093dc42130f675839675a07b9a | DE | | german-sentiment-bert | https://huggingface.co/oliverguhr/german-sentiment-bert | b1177ff59e305c966836ba2825d3dc2efc53f125 | DE | +| phobert-base-vietnamese-sentiment | https://huggingface.co/wonrax/phobert-base-vietnamese-sentiment | 9076a5896971b5d551588fe8a51c722c89731d36 | VI | # How To Use diff --git a/duui-transformers-sentiment-atomar/docker_build.sh b/duui-transformers-sentiment-atomar/docker_build.sh index 29afb41f..df74e5b2 100644 --- a/duui-transformers-sentiment-atomar/docker_build.sh +++ b/duui-transformers-sentiment-atomar/docker_build.sh @@ -67,6 +67,14 @@ export MODEL_SOURCE="https://huggingface.co/oliverguhr/german-sentiment-bert" export MODEL_LANG="DE" ###-------------------------------------------------------------------- +###--------------------------------------------------------------------- +#export MODEL_NAME="wonrax/phobert-base-vietnamese-sentiment" +#export MODEL_SPECNAME="phobert-vietnamese" +#export MODEL_VERSION="b9f2ff6ba0e1cdaec8b0e4149ebeae7c46b78d4f" +#export MODEL_SOURCE="https://huggingface.co/wonrax/phobert-base-vietnamese-sentiment" +#export MODEL_LANG="VI" +###--------------------------------------------------------------------- + export DOCKER_REGISTRY="docker.texttechnologylab.org/" export DUUI_CUDA= diff --git a/duui-transformers-sentiment-atomar/pom.xml b/duui-transformers-sentiment-atomar/pom.xml index 17185f9e..b4328b05 100644 --- a/duui-transformers-sentiment-atomar/pom.xml +++ b/duui-transformers-sentiment-atomar/pom.xml @@ -101,7 +101,7 @@ com.github.texttechnologylab DockerUnifiedUIMAInterface - 7cef2433b5 + 1.5.3 diff --git 
a/duui-transformers-sentiment-atomar/src/main/docker/Dockerfile b/duui-transformers-sentiment-atomar/src/main/docker/Dockerfile index ae52cb29..6427d1aa 100644 --- a/duui-transformers-sentiment-atomar/src/main/docker/Dockerfile +++ b/duui-transformers-sentiment-atomar/src/main/docker/Dockerfile @@ -15,6 +15,7 @@ RUN pip install -r requirements.txt #RUN python -c "from transformers import pipeline; pipeline('text-classification', model='cardiffnlp/twitter-roberta-base-sentiment-latest')" #RUN python -c "from transformers import pipeline; pipeline('text-classification', model='j-hartmann/sentiment-roberta-large-english-3-classes')" #RUN python -c "from transformers import pipeline; pipeline('text-classification', model='bardsai/finance-sentiment-de-base')" +RUN python -c "from transformers import pipeline; pipeline('text-classification', model='wonrax/phobert-base-vietnamese-sentiment')" RUN python -c "from germansentiment import SentimentModel; model = SentimentModel()" # copy scripts diff --git a/duui-transformers-sentiment-atomar/src/main/docker/Dockerfile-cuda b/duui-transformers-sentiment-atomar/src/main/docker/Dockerfile-cuda index 12d392f0..cf51c1c2 100644 --- a/duui-transformers-sentiment-atomar/src/main/docker/Dockerfile-cuda +++ b/duui-transformers-sentiment-atomar/src/main/docker/Dockerfile-cuda @@ -38,6 +38,7 @@ RUN pip install -r requirements.txt #RUN python -c "from transformers import AutoModelForSequenceClassification, AutoTokenizer; AutoModelForSequenceClassification.from_pretrained('poltextlab/xlm-roberta-large-party-cap-v3', trust_remote_code=True); AutoTokenizer.from_pretrained('xlm-roberta-large')" #RUN python -c "from transformers import pipeline; pipeline('text-classification', model='cardiffnlp/tweet-topic-latest-single')" RUN python -c "from transformers import pipeline; pipeline('text-classification', model='cardiffnlp/tweet-topic-large-multilingual')" +#RUN python -c "from transformers import pipeline; pipeline('text-classification', 
model='wonrax/phobert-base-vietnamese-sentiment')" # log level ARG LOG_LEVEL="DEBUG" diff --git a/duui-transformers-sentiment-atomar/src/main/python/SentimentSpeech.py b/duui-transformers-sentiment-atomar/src/main/python/SentimentSpeech.py index a6e3bfab..907833a4 100644 --- a/duui-transformers-sentiment-atomar/src/main/python/SentimentSpeech.py +++ b/duui-transformers-sentiment-atomar/src/main/python/SentimentSpeech.py @@ -48,6 +48,11 @@ def sigmoid(x): 0: "positive", 1: "neutral", 2: "negative" + }, + "wonrax/phobert-base-vietnamese-sentiment": { + 0: "negative", + 1: "positive", + 2: "neutral" } } diff --git a/duui-transformers-sentiment-atomar/src/test/java/org/hucompute/textimager/uima/transformers/sentiment/SentimentTest.java b/duui-transformers-sentiment-atomar/src/test/java/org/hucompute/textimager/uima/transformers/sentiment/SentimentTest.java index d254d29e..43179668 100644 --- a/duui-transformers-sentiment-atomar/src/test/java/org/hucompute/textimager/uima/transformers/sentiment/SentimentTest.java +++ b/duui-transformers-sentiment-atomar/src/test/java/org/hucompute/textimager/uima/transformers/sentiment/SentimentTest.java @@ -229,4 +229,31 @@ public void EnTest() throws Exception { System.out.println("Positive: " + positive); } } + @Test + public void VietnamesePhoBertTest() throws Exception { + composer.add( + new DUUIRemoteDriver.Component(url) + .withParameter("selection", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence") + ); + + List sentences = Arrays.asList( + "Tôi rất yêu thích bộ phim này. Nó tuyệt vời!", + "Sản phẩm này không tốt. Tôi rất thất vọng.", + "Món hàng này cũng bình thường, không tốt không xấu." 
+ ); + + createCas("vi", sentences); + composer.run(cas); + + Collection all_sentiment = JCasUtil.select(cas, SentimentModel.class); + for (SentimentModel sentiment_i : all_sentiment) { + System.out.println(sentiment_i.getCoveredText()); + Double negative = sentiment_i.getProbabilityNegative(); + Double neutral = sentiment_i.getProbabilityNeutral(); + Double positive = sentiment_i.getProbabilityPositive(); + System.out.println("Negative: " + negative); + System.out.println("Neutral: " + neutral); + System.out.println("Positive: " + positive); + } + } } diff --git a/duui-transformers-summary/Readme.md b/duui-transformers-summary/Readme.md index 75e46bda..368b33ec 100644 --- a/duui-transformers-summary/Readme.md +++ b/duui-transformers-summary/Readme.md @@ -13,6 +13,7 @@ DUUI implementation for selected Hugging-Face-based transformer summary tools mo | MT5 | https://huggingface.co/csebuetnlp/mT5_multilingual_XLSum | 2437a524effdbadc327ced84595508f1e32025b3 | Multilingual | | Google T5 | https://huggingface.co/google/flan-t5-base | 7bcac572ce56db69c1ea7c8af255c5d7c9672fc2 | Multilingual | | MDML | https://github.com/airKlizz/mdmls | 60f9eadb55d20eae889332035daa884205971566 | Multilingual | +| Pegasus Financial | https://huggingface.co/human-centered-summarization/financial-summarization-pegasus | 734fe2da8db6e4d7272ad553cb3343ed59a566d7 | English | # How To Use diff --git a/duui-transformers-summary/docker_build.sh b/duui-transformers-summary/docker_build.sh old mode 100644 new mode 100755 diff --git a/duui-transformers-summary/pom.xml b/duui-transformers-summary/pom.xml index 8bb12fe4..499016c6 100644 --- a/duui-transformers-summary/pom.xml +++ b/duui-transformers-summary/pom.xml @@ -101,7 +101,7 @@ com.github.texttechnologylab DockerUnifiedUIMAInterface - 7cef2433b5 + 1.5.3 @@ -109,18 +109,18 @@ - - - - - - - - org.texttechnologylab.annotation - typesystem - 3.0.1 + + com.github.texttechnologylab + UIMATypeSystem + 02fb1a2f13 + + + + + + diff --git
a/duui-transformers-summary/src/main/docker/Dockerfile b/duui-transformers-summary/src/main/docker/Dockerfile index 0091c928..6f2885dc 100644 --- a/duui-transformers-summary/src/main/docker/Dockerfile +++ b/duui-transformers-summary/src/main/docker/Dockerfile @@ -9,9 +9,10 @@ RUN pip install setuptools wheel COPY ./requirements.txt ./requirements.txt RUN pip install -r requirements.txt -RUN python -c "from transformers import pipeline; pipeline('text2text-generation', model='csebuetnlp/mT5_multilingual_XLSum')" -RUN python -c "from transformers import pipeline; pipeline('text2text-generation', model='google/flan-t5-base')" -RUN python -c "from mdmls import Summarizer; Summarizer(device=-1)" +#RUN python -c "from transformers import pipeline; pipeline('text2text-generation', model='csebuetnlp/mT5_multilingual_XLSum')" +#RUN python -c "from transformers import pipeline; pipeline('text2text-generation', model='google/flan-t5-base')" +#RUN python -c "from mdmls import Summarizer; Summarizer(device=-1)" +RUN python -c "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM; AutoTokenizer.from_pretrained('human-centered-summarization/financial-summarization-pegasus'); AutoModelForSeq2SeqLM.from_pretrained('human-centered-summarization/financial-summarization-pegasus')" # copy scripts COPY ./src/main/python/TypeSystemSummary.xml ./TypeSystemSummary.xml diff --git a/duui-transformers-summary/src/main/docker/Dockerfile_cuda b/duui-transformers-summary/src/main/docker/Dockerfile_cuda index 55942a37..f87edc31 100644 --- a/duui-transformers-summary/src/main/docker/Dockerfile_cuda +++ b/duui-transformers-summary/src/main/docker/Dockerfile_cuda @@ -20,6 +20,7 @@ RUN pip install -r requirements.txt RUN python -c "from transformers import pipeline; pipeline('text2text-generation', model='csebuetnlp/mT5_multilingual_XLSum')" RUN python -c "from transformers import pipeline; pipeline('text2text-generation', model='google/flan-t5-base')" RUN python -c "from mdmls import 
Summarizer; Summarizer(device=-1)" +RUN python -c "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM; AutoTokenizer.from_pretrained('human-centered-summarization/financial-summarization-pegasus'); AutoModelForSeq2SeqLM.from_pretrained('human-centered-summarization/financial-summarization-pegasus')" # copy scripts COPY ./src/main/python/TypeSystemSummary.xml ./TypeSystemSummary.xml diff --git a/duui-transformers-summary/src/main/python/duui_transformers_summary.py b/duui-transformers-summary/src/main/python/duui_transformers_summary.py index 458bcad6..31e4a81d 100644 --- a/duui-transformers-summary/src/main/python/duui_transformers_summary.py +++ b/duui-transformers-summary/src/main/python/duui_transformers_summary.py @@ -8,23 +8,26 @@ import torch from threading import Lock from functools import lru_cache -from summarization import Summarization, MDMLSummarization, MT5Summarization +from summarization import Summarization, MDMLSummarization, MT5Summarization, PegasusSummarization import numpy as np sources = { "MT5": "https://huggingface.co/csebuetnlp/mT5_multilingual_XLSum", "MDML": "https://github.com/airKlizz/mdmls", "Google T5": "https://huggingface.co/google/flan-t5-base", + "Pegasus Financial": "https://huggingface.co/human-centered-summarization/financial-summarization-pegasus", } languages = { "MT5": "Multi", "MDML": "Multi", "Google T5": "Multi", + "Pegasus Financial": "English", } versions = { "MT5": "2437a524effdbadc327ced84595508f1e32025b3", "MDML": "60f9eadb55d20eae889332035daa884205971566", "Google T5": "7bcac572ce56db69c1ea7c8af255c5d7c9672fc2", + "Pegasus Financial": "734fe2da8db6e4d7272ad553cb3343ed59a566d7", } # Settings # These are automatically loaded from env variables @@ -182,6 +185,8 @@ def load_model(model_name): model_i = MT5Summarization("csebuetnlp/mT5_multilingual_XLSum", device) case "Google T5": model_i = Summarization("google/flan-t5-base", device) + case "Pegasus Financial": + model_i = 
PegasusSummarization("human-centered-summarization/financial-summarization-pegasus", device) return model_i diff --git a/duui-transformers-summary/src/main/python/summarization.py b/duui-transformers-summary/src/main/python/summarization.py index ec453109..2fe2e279 100644 --- a/duui-transformers-summary/src/main/python/summarization.py +++ b/duui-transformers-summary/src/main/python/summarization.py @@ -68,6 +68,40 @@ def summarize(self, text, sum_len=84): return summary +class PegasusSummarization: + def __init__(self, model_name, device='cuda:0'): + self.device = device + self.tokenizer = AutoTokenizer.from_pretrained(model_name) + self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device) + + def summarize(self, text, sum_len=128): + with torch.no_grad(): + inputs = self.tokenizer( + text, + max_length=512, + truncation=True, + padding="max_length", + return_tensors='pt' + ).to(self.device) + + preds = self.model.generate( + input_ids=inputs['input_ids'], + attention_mask=inputs['attention_mask'], + max_length=min(sum_len, 256), + min_length=int(sum_len * 0.3), + num_beams=4, + early_stopping=True, + length_penalty=2.0, + no_repeat_ngram_size=3 + ) + + decoded_predictions = self.tokenizer.batch_decode( + preds, + skip_special_tokens=True + ) + return decoded_predictions[0] + + if __name__ == '__main__': text = """The Apollo program, also known as Project Apollo, was the third United States human spaceflight program carried out by the National Aeronautics and Space Administration (NASA), which accomplished landing the first humans on the Moon from 1969 to 1972.""" model_i = "csebuetnlp/mT5_multilingual_XLSum" diff --git a/duui-transformers-summary/src/test/java/org/hucompute/textimager/uima/transformers/summary/FinancialSummaryTest.java b/duui-transformers-summary/src/test/java/org/hucompute/textimager/uima/transformers/summary/FinancialSummaryTest.java new file mode 100644 index 00000000..5444995c --- /dev/null +++ 
b/duui-transformers-summary/src/test/java/org/hucompute/textimager/uima/transformers/summary/FinancialSummaryTest.java @@ -0,0 +1,67 @@ +package org.hucompute.textimager.uima.transformers.summary; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import org.apache.uima.UIMAException; +import org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.tcas.Annotation; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.texttechnologylab.DockerUnifiedUIMAInterface.DUUIComposer; +import org.texttechnologylab.DockerUnifiedUIMAInterface.driver.DUUIRemoteDriver; +import org.texttechnologylab.DockerUnifiedUIMAInterface.lua.DUUILuaContext; +import org.texttechnologylab.annotation.Summary; + +import java.io.IOException; +import java.net.URISyntaxException; +import java.net.UnknownHostException; +import java.util.Collection; + +public class FinancialSummaryTest { + static DUUIComposer composer; + static String summary_url = "http://127.0.0.1:1000"; + static String model = "Pegasus Financial"; + + @BeforeAll + static void beforeAll() throws URISyntaxException, IOException, UIMAException { + composer = new DUUIComposer() + .withSkipVerification(true) + .withLuaContext(new DUUILuaContext().withJsonLibrary()); + + DUUIRemoteDriver remoteDriver = new DUUIRemoteDriver(); + composer.addDriver(remoteDriver); + } + + @AfterAll + static void afterAll() throws UnknownHostException { + composer.shutdown(); + } + + @Test + public void sentencesTest() throws Exception { + composer.add( + new DUUIRemoteDriver.Component(summary_url) + .withParameter("model_name", model) + .withParameter("summary_length", "60") + ); + + String Text = "Everyone is gloomy about America’s jobs market. 
Investors talk of a “K-shaped” economy, in which growth is buoyed by an exuberant stockmarket and artificial-intelligence investment, while ordinary Americans languish. Job creation and overall economic growth, which usually move in tandem, have diverged. The Federal Reserve has cut interest rates at its two most recent meetings. Jerome Powell, the central bank’s chair, calls the loosening “risk management”, or insurance against a deeper downturn. Christopher Waller, a contender to replace Mr Powell, is pushing for further and faster cuts, beginning at the next meeting on December 10th, to support a weakening labour market."; + JCas cas = JCasFactory.createText(Text, "en"); + + Annotation sentence1 = new Sentence(cas, 0, 250); + sentence1.addToIndexes(); + Annotation sentence2 = new Sentence(cas, 251, Text.length()); + sentence2.addToIndexes(); + + composer.run(cas); + + Collection summaries = JCasUtil.select(cas, Summary.class); + + for (Summary summary : summaries) { + System.out.println(summary.getSummary()); + } + assert summaries.size() > 0; + } +} \ No newline at end of file