diff --git a/duui-Hate/Readme.md b/duui-Hate/Readme.md
index b259c12d..7ab15c2a 100644
--- a/duui-Hate/Readme.md
+++ b/duui-Hate/Readme.md
@@ -35,6 +35,7 @@ DUUI implementation for selected hate classification tools: [Hate](https://huggi
| mehate-bert | https://huggingface.co/l3cube-pune/me-hate-bert | 407f19357c3b2166db6cbc2107807fc07a17b8f5 | MULTI |
| hatemoji | https://huggingface.co/HannahRoseKirk/Hatemoji | f2f98581ab15fb3ccf8b8a5465d7ca70c2958902 | EN |
| codemix-hate | https://huggingface.co/debajyotimaz/codemix_hate | b07d73f1a05dd04c0adbb941b5446064b14feb10 | EN, HI |
+| phobert-hsd | https://huggingface.co/visolex/phobert-hsd | 844b4cda62a864907038a33edb346cf8b612054f | VI |
# How To Use
diff --git a/duui-Hate/docker_build.sh b/duui-Hate/docker_build.sh
old mode 100644
new mode 100755
index c0aa55d1..666537d4
--- a/duui-Hate/docker_build.sh
+++ b/duui-Hate/docker_build.sh
@@ -1,7 +1,7 @@
export ANNOTATOR_NAME=duui-hate
export ANNOTATOR_VERSION=0.3.0
export LOG_LEVEL=INFO
-eport MODEL_CACHE_SIZE=3
+export MODEL_CACHE_SIZE=3
#---------------------------------------------------------------------
#export MODEL_NAME="Andrazp/multilingual-hate-speech-robacofi"
@@ -211,7 +211,13 @@ export MODEL_SOURCE="https://huggingface.co/debajyotimaz/codemix_hate"
export MODEL_LANG="EN, HI"
##--------------------------------------------------------------------
-
+##---------------------------------------------------------------------
+export MODEL_NAME="visolex/phobert-hsd"
+export MODEL_SPECNAME="phobert-hsd"
+export MODEL_VERSION="844b4cda62a864907038a33edb346cf8b612054f"
+export MODEL_SOURCE="https://huggingface.co/visolex/phobert-hsd"
+export MODEL_LANG="VI"
+##--------------------------------------------------------------------
export DOCKER_REGISTRY="docker.texttechnologylab.org/"
export DUUI_CUDA=
diff --git a/duui-Hate/pom.xml b/duui-Hate/pom.xml
index 19bf3634..97dfe74f 100644
--- a/duui-Hate/pom.xml
+++ b/duui-Hate/pom.xml
@@ -101,7 +101,7 @@
com.github.texttechnologylabDockerUnifiedUIMAInterface
- 7cef2433b5
+ 1.5.3
@@ -112,7 +112,7 @@
com.github.texttechnologylabUIMATypeSystem
- fedfa0ace
+ 02fb1a2f13
diff --git a/duui-Hate/src/main/docker/Dockerfile b/duui-Hate/src/main/docker/Dockerfile
index 519512c2..cc2a3b09 100644
--- a/duui-Hate/src/main/docker/Dockerfile
+++ b/duui-Hate/src/main/docker/Dockerfile
@@ -57,8 +57,9 @@ RUN pip install -r reqiurements.txt
#RUN python -c "from transformers import pipeline; pipeline('text-classification', model='HannahRoseKirk/Hatemoji')"
-RUN python -c "from transformers import pipeline; pipeline('text-classification', model='debajyotimaz/codemix_hate')"
+#RUN python -c "from transformers import pipeline; pipeline('text-classification', model='debajyotimaz/codemix_hate')"
+RUN python -c "from transformers import pipeline; pipeline('text-classification', model='visolex/phobert-hsd')"
# service script
COPY ./src/main/python/TypeSystemHate.xml ./TypeSystemHate.xml
diff --git a/duui-Hate/src/main/docker/Dockerfile-cuda b/duui-Hate/src/main/docker/Dockerfile-cuda
index f61fa681..7603b56f 100644
--- a/duui-Hate/src/main/docker/Dockerfile-cuda
+++ b/duui-Hate/src/main/docker/Dockerfile-cuda
@@ -46,9 +46,11 @@ RUN pip install -r reqiurements.txt
#RUN python -c "from transformers import pipeline; pipeline('text-classification', model='Hate-speech-CNERG/dehatebert-mono-french')"
-RUN python -c "from transformers import pipeline; pipeline('text-classification', model='Hate-speech-CNERG/dehatebert-mono-indonesian')"
+#RUN python -c "from transformers import pipeline; pipeline('text-classification', model='Hate-speech-CNERG/dehatebert-mono-indonesian')"
#RUN python -c "from nubia_score import Nubia; nubia = Nubia()"
+#RUN python -c "from transformers import pipeline; pipeline('text-classification', model='visolex/phobert-hsd')"
+
# service script
COPY ./src/main/python/TypeSystemHate.xml ./TypeSystemHate.xml
COPY ./src/main/python/evaluator.py ./evaluator.py
diff --git a/duui-Hate/src/main/python/hatechecker.py b/duui-Hate/src/main/python/hatechecker.py
index afadbd07..d4aee7b5 100644
--- a/duui-Hate/src/main/python/hatechecker.py
+++ b/duui-Hate/src/main/python/hatechecker.py
@@ -96,25 +96,31 @@ def sigmoid(x):
0: "NOT HATE",
1: "HATE"
},
+ # Three-way label set, unlike the binary entries around it.
+ "visolex/phobert-hsd": {
+ 0: "NOT HATE",
+ 1: "OFFENSIVE",
+ 2: "HATE"
+ },
"l3cube-pune/me-hate-bert": {
0: "NOT HATE",
1: "HATE"
},
"HannahRoseKirk/Hatemoji": {
0: "NOT HATE",
1: "HATE",
},
"debajyotimaz/codemix_hate": {
0: "NOT HATE",
1: "HATE"
},
"MilaNLProc/hate-ita": {
0: "NOT HATE",
1: "HATE"
},
"MilaNLProc/hate-ita-xlm-r-base": {
0: "NOT HATE",
1: "HATE"
}
}
diff --git a/duui-Hate/src/test/java/org/hucompute/textimager/uima/hate/MultiTestHate.java b/duui-Hate/src/test/java/org/hucompute/textimager/uima/hate/MultiTestHate.java
index 06c08e97..c0a30960 100644
--- a/duui-Hate/src/test/java/org/hucompute/textimager/uima/hate/MultiTestHate.java
+++ b/duui-Hate/src/test/java/org/hucompute/textimager/uima/hate/MultiTestHate.java
@@ -156,4 +156,41 @@ public void DeTest() throws Exception {
Assertions.assertEquals(expected_i, out_i);
}
}
+
+ @Test
+ public void VietnameseTest() throws Exception {
+ composer.add(
+ new DUUIRemoteDriver.Component(url)
+ .withParameter("selection", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence")
+ );
+ List sentences = Arrays.asList(
+ "Tôi ghét cay ghét đắng điều đó. Sao bạn có thể làm điều tồi tệ đó với tôi! TẠI SAO!",
+ "Tôi rất vui khi được ở đây. Tôi yêu nơi này."
+ );
+
+ createCas("vi", sentences);
+ composer.run(cas);
+
+ // TODO: confirm these "begin_end" keys match the real offsets; spans missing from the map are skipped, not asserted.
+ HashMap expected = new HashMap<>();
+ expected.put("0_43", "NonHate"); // Model predicts NonHate
+ expected.put("44_82", "NonHate");
+
+ Collection all_hate = JCasUtil.select(cas, Hate.class);
+ for (Hate hate : all_hate) {
+ int begin = hate.getBegin();
+ int end = hate.getEnd();
+ double hate_i = hate.getHate();
+ double non_hate = hate.getNonHate();
+ String out_i = "HATE";
+ if (hate_i < non_hate){
+ out_i = "NonHate";
+ }
+ String expected_i = expected.get(begin+"_"+end);
+ if (expected_i != null) {
+ Assertions.assertEquals(expected_i, out_i);
+ }
+ }
+ }
+
}
diff --git a/duui-ocr/Readme.md b/duui-ocr/Readme.md
new file mode 100644
index 00000000..3bce964d
--- /dev/null
+++ b/duui-ocr/Readme.md
@@ -0,0 +1,99 @@
+# DUUI OCR
+
+DUUI implementation for vision-language OCR models.
+
+## Supported Models
+
+| Name | Params | Languages | Supported Tasks |
+| ---- | ------ | --------- | --------------- |
+| [PaddlePaddle/PaddleOCR-VL-1.5](https://huggingface.co/PaddlePaddle/PaddleOCR-VL-1.5) | 0.9B | multilingual | ocr, table, formula, chart, spotting, seal |
+| [zai-org/GLM-OCR](https://huggingface.co/zai-org/GLM-OCR) | 0.9B | multilingual | ocr, table, formula |
+
+## Supported Tasks
+
+| Task | PaddleOCR-VL Prompt | GLM-OCR Prompt | Description |
+| ---- | ------------------- | -------------- | ----------- |
+| `ocr` | `OCR:` | `Text Recognition:` | General text recognition |
+| `table` | `Table Recognition:` | `Table Recognition:` | Table structure recognition |
+| `formula` | `Formula Recognition:` | `Formula Recognition:` | LaTeX formula recognition |
+| `chart` | `Chart Recognition:` | — | Chart content recognition |
+| `spotting` | `Spotting:` | — | Text spotting with location |
+| `seal` | `Seal Recognition:` | — | Seal text recognition |
+
+## How To Use
+
+Requires
+[Docker Unified UIMA Interface (DUUI)](https://github.com/texttechnologylab/DockerUnifiedUIMAInterface).
+
+### Run within DUUI
+
+```java
+// PaddleOCR-VL
+composer.add(
+ new DUUIDockerDriver.Component(
+ "docker.texttechnologylab.org/duui-ocr:latest"
+ )
+ .withParameter("model_name",
+ "PaddlePaddle/PaddleOCR-VL-1.5")
+ .withParameter("task", "ocr")
+);
+
+// GLM-OCR
+composer.add(
+ new DUUIDockerDriver.Component(
+ "docker.texttechnologylab.org/duui-ocr:latest"
+ )
+ .withParameter("model_name", "zai-org/GLM-OCR")
+ .withParameter("task", "ocr")
+);
+```
+
+### Parameters
+
+| Name | Description | Default |
+| ---- | ----------- | ------- |
+| `model_name` | Model to use (see table above) | — |
+| `task` | OCR task type | `ocr` |
+| `max_new_tokens` | Maximum tokens to generate | `1024` |
+
+### Input / Output
+
+- **Input**: `org.texttechnologylab.annotation.type.Image`
+ annotations in CAS (src can be base64 or file path)
+- **Output**: `org.texttechnologylab.annotation.AnnotationComment`
+ with key = task name, value = recognized text
+
+## Cite
+
+```bibtex
+@inproceedings{Leonhardt:et:al:2023,
+ title = {Unlocking the Heterogeneous Landscape of Big Data
+ {NLP} with {DUUI}},
+ author = {Leonhardt, Alexander and Abrami, Giuseppe
+ and Baumartz, Daniel and Mehler, Alexander},
+ booktitle = {Findings of the Association for Computational
+ Linguistics: EMNLP 2023},
+ year = {2023},
+ publisher = {Association for Computational Linguistics},
+ url = {https://aclanthology.org/2023.findings-emnlp.29},
+ pages = {385--399},
+}
+
+@misc{cui2026paddleocrvl15multitask09bvlm,
+ title = {PaddleOCR-VL-1.5: Towards a Multi-Task 0.9B VLM
+ for Robust In-the-Wild Document Parsing},
+ author = {Cheng Cui and Ting Sun and Suyin Liang and others},
+ year = {2026},
+ eprint = {2601.21957},
+ archivePrefix = {arXiv},
+ primaryClass = {cs.CV},
+}
+
+@misc{glmocr2026,
+ title = {GLM-OCR: A Multimodal OCR Model for Complex
+ Document Understanding},
+ author = {Z.ai Team},
+ year = {2026},
+ url = {https://huggingface.co/zai-org/GLM-OCR},
+}
+```
\ No newline at end of file
diff --git a/duui-ocr/docker_build.sh b/duui-ocr/docker_build.sh
new file mode 100755
index 00000000..2093907e
--- /dev/null
+++ b/duui-ocr/docker_build.sh
@@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+set -euo pipefail
+# Leave DUUI_OCR_CUDA empty for the CPU image; set it to "-cuda" for the GPU image.
+export DUUI_OCR_CUDA=
+#export DUUI_OCR_CUDA="-cuda"
+
+export DUUI_OCR_ANNOTATOR_NAME=duui-ocr
+export DUUI_OCR_ANNOTATOR_VERSION=0.2.0
+export DUUI_OCR_LOG_LEVEL=DEBUG
+export DUUI_OCR_MODEL_CACHE_SIZE=1
+export DOCKER_REGISTRY="docker.texttechnologylab.org/"
+# The tag suffix is "-cuda" but the GPU Dockerfile is named Dockerfile_cuda, so "-" is mapped to "_" for -f.
+docker build \
+ --build-arg DUUI_OCR_ANNOTATOR_NAME \
+ --build-arg DUUI_OCR_ANNOTATOR_VERSION \
+ --build-arg DUUI_OCR_LOG_LEVEL \
+ --build-arg DUUI_OCR_MODEL_CACHE_SIZE \
+ -t "${DOCKER_REGISTRY}${DUUI_OCR_ANNOTATOR_NAME}:${DUUI_OCR_ANNOTATOR_VERSION}${DUUI_OCR_CUDA}" \
+ -f "src/main/docker/Dockerfile${DUUI_OCR_CUDA/-/_}" \
+ .
+# Alias the freshly built image under the moving "latest" tag (same CUDA suffix).
+docker tag \
+ "${DOCKER_REGISTRY}${DUUI_OCR_ANNOTATOR_NAME}:${DUUI_OCR_ANNOTATOR_VERSION}${DUUI_OCR_CUDA}" \
+ "${DOCKER_REGISTRY}${DUUI_OCR_ANNOTATOR_NAME}:latest${DUUI_OCR_CUDA}"
\ No newline at end of file
diff --git a/duui-ocr/pom.xml b/duui-ocr/pom.xml
new file mode 100644
index 00000000..d8406d68
--- /dev/null
+++ b/duui-ocr/pom.xml
@@ -0,0 +1,133 @@
+
+
+ 4.0.0
+
+ org.texttechnologylab.duui
+ duui_ocr
+ 0.2.0
+
+
+
+ AGPL-3.0-or-later
+
+ https://www.gnu.org/licenses/agpl.txt
+
+ repo
+
+ GNU Affero General Public License v3.0
+ or later
+
+
+
+
+
+ Texttechnology Lab
+ https://www.texttechnologylab.org
+
+
+
+
+
+
+ org.apache.maven.plugins
+
+
+ maven-surefire-plugin
+
+ 2.22.0
+
+
+ --illegal-access=permit
+ --add-opens
+ java.base/java.util=ALL-UNNAMED
+
+
+
+
+
+
+
+ 17
+ 17
+
+ 2.4.0
+
+
+
+
+
+ jitpack.io
+ https://jitpack.io
+
+
+
+
+
+
+ org.dkpro.core
+
+ dkpro-core-asl
+
+
+ ${dkpro.core.version}
+
+ pom
+ import
+
+
+
+
+
+
+
+ com.github.texttechnologylab
+
+
+ DockerUnifiedUIMAInterface
+
+ 1.5.3
+
+
+
+
+ com.github.texttechnologylab
+
+ UIMATypeSystem
+ 02fb1a2f13
+
+
+
+ org.junit.jupiter
+ junit-jupiter
+ 5.9.0
+ test
+
+
+
+ org.dkpro.core
+
+ dkpro-core-api-segmentation-asl
+
+ test
+
+
+
+ org.dkpro.core
+
+ dkpro-core-io-xmi-asl
+
+ test
+
+
+
+ org.dkpro.core
+
+ dkpro-core-api-resources-asl
+
+ test
+
+
+
\ No newline at end of file
diff --git a/duui-ocr/requirements.txt b/duui-ocr/requirements.txt
new file mode 100644
index 00000000..4aa290b8
--- /dev/null
+++ b/duui-ocr/requirements.txt
@@ -0,0 +1,12 @@
+transformers>=5.0.0
+torch==2.6.0
+torchvision==0.21.0
+Pillow>=10.0.0
+fastapi==0.110.0
+uvicorn[standard]==0.27.1
+pydantic-settings==2.0.2
+dkpro-cassis==0.9.1
+numpy>=1.26.0
+sentencepiece>=0.2.0
+protobuf>=5.0.0
+accelerate>=0.30.0
\ No newline at end of file
diff --git a/duui-ocr/src/main/docker/Dockerfile b/duui-ocr/src/main/docker/Dockerfile
new file mode 100644
index 00000000..6d87c793
--- /dev/null
+++ b/duui-ocr/src/main/docker/Dockerfile
@@ -0,0 +1,107 @@
+# Builds the container image for the multi-model OCR annotator.
+# This is v3 of the Dockerfile.
+#
+# v1: Only downloaded PaddleOCR-VL-1.5. Straightforward, worked fine.
+#
+# v2: Added microsoft/trocr-base-printed to the pre-download step.
+# This actually *built* successfully, unlike the Python code which
+# never ran properly with TrOCR. I also had to add
+# VisionEncoderDecoderModel and TrOCRProcessor to the import
+# line, which was the moment I started realizing TrOCR was a
+# different "thing". The image was ~2GB larger for a model we never
+# ended up using. Removed it.
+#
+# v3: Replaced TrOCR with GLM-OCR. Downloads both PaddleOCR-VL and
+# GLM-OCR at build time. Current version.
+#
+# BORROWED. The overall structure (WORKDIR, EXPOSE, pip install pattern,
+# ARG/ENV pairs for config) is copied from other DUUI annotator
+# Dockerfiles in the TTLab repo:
+# https://github.com/texttechnologylab/DockerUnifiedUIMAInterface
+#
+#
+# Last meaningful edit: Feb 2026
+
+
+# SOLID. Python 3.10 because that's what the TTLab DUUI components
+# standardize on. Didn't investigate further, just went with 3.10.
+FROM python:3.10
+
+WORKDIR /usr/src/app
+
+EXPOSE 9714
+
+# -- Dependencies --
+# SOLID. pip install in order: upgrade pip itself, install build tools
+# (setuptools/wheel, needed for some packages that compile C extensions),
+# then install from requirements.txt.
+
+RUN pip install --upgrade pip
+RUN pip install setuptools wheel
+COPY ./requirements.txt ./requirements.txt
+RUN pip install -r requirements.txt
+
+# -- Model pre-download --
+# SOLID (the idea) + COPILOT (the syntax)
+#
+# This downloads the model weights during the Docker *build*, not at
+# runtime.
+#
+# v2 ABANDONED: this block used to also download TrOCR:
+#
+# from transformers import TrOCRProcessor, VisionEncoderDecoderModel; \
+# VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-printed'); \
+# TrOCRProcessor.from_pretrained('microsoft/trocr-base-printed'); \
+#
+# Removed it when I abandoned TrOCR. No point baking a ~900MB model
+# into the image if we're never going to call it.
+#
+# Note: both working models (PaddleOCR-VL and GLM-OCR) load through
+# the same Auto* classes AutoModelForImageTextToText and
+# AutoProcessor. This is the same compatibility that makes them work
+# in the shared Python backend. TrOCR needed VisionEncoderDecoderModel
+# and TrOCRProcessor, which was another hint that it didn't belong here.
+RUN python -c "\
+from transformers import AutoProcessor, AutoModelForImageTextToText; \
+AutoModelForImageTextToText.from_pretrained('PaddlePaddle/PaddleOCR-VL-1.5'); \
+AutoProcessor.from_pretrained('PaddlePaddle/PaddleOCR-VL-1.5'); \
+AutoModelForImageTextToText.from_pretrained('zai-org/GLM-OCR'); \
+AutoProcessor.from_pretrained('zai-org/GLM-OCR')"
+
+# -- Source files --
+# SOLID. Copy the actual application code.
+COPY ./src/main/python/TypeSystemOCR.xml ./TypeSystemOCR.xml
+COPY ./src/main/python/duui_ocr.py ./duui_ocr.py
+COPY ./src/main/python/duui_ocr.lua ./duui_ocr.lua
+
+# -- Configuration --
+# BORROWED. The ARG/ENV pattern is from TTLab's other Dockerfiles.
+
+ARG DUUI_OCR_LOG_LEVEL="DEBUG"
+ENV DUUI_OCR_LOG_LEVEL=$DUUI_OCR_LOG_LEVEL
+
+# How many models to keep loaded in memory simultaneously.
+# Default 1 means loading GLM-OCR evicts PaddleOCR and vice versa.
+# Set to 2 if you have enough VRAM for both (~20GB+).
+ARG DUUI_OCR_MODEL_CACHE_SIZE=1
+ENV DUUI_OCR_MODEL_CACHE_SIZE=$DUUI_OCR_MODEL_CACHE_SIZE
+
+# -- Metadata --
+# These get reported through the /v1/documentation endpoint.
+# The version is "unset" by default and gets overridden by the CI/CD
+# pipeline (or manually with --build-arg).
+ARG DUUI_OCR_ANNOTATOR_NAME="duui-ocr"
+ENV DUUI_OCR_ANNOTATOR_NAME=$DUUI_OCR_ANNOTATOR_NAME
+ARG DUUI_OCR_ANNOTATOR_VERSION="unset"
+ENV DUUI_OCR_ANNOTATOR_VERSION=$DUUI_OCR_ANNOTATOR_VERSION
+
+# -- Startup --
+# BORROWED. uvicorn is the ASGI server that runs the FastAPI app.
+#
+# REVISIT. I've read that uvicorn with --workers > 1 uses separate
+# processes, each with its own memory space. So two workers = two
+# copies of the model in VRAM? That's definitely not what we want.
+# But I wonder if there's a way to share the model across workers.
+# Haven't looked into it. For now, 1 worker is fine for our throughput.
+ENTRYPOINT ["uvicorn", "duui_ocr:app", "--host", "0.0.0.0", "--port", "9714"]
+CMD ["--workers", "1"]
\ No newline at end of file
diff --git a/duui-ocr/src/main/docker/Dockerfile_cuda b/duui-ocr/src/main/docker/Dockerfile_cuda
new file mode 100644
index 00000000..8d9ed2a9
--- /dev/null
+++ b/duui-ocr/src/main/docker/Dockerfile_cuda
@@ -0,0 +1,88 @@
+# BORROWED. Almost entirely lifted from the base Dockerfile, with
+# the only real difference being the base image (NVIDIA CUDA runtime
+# instead of plain Python) and a couple of CUDA-specific env vars.
+#
+# See the base Dockerfile for detailed commentary on each section.
+# I'm not repeating all of that here.
+#
+# Last meaningful edit: Feb 2026
+
+# COPILOT. I asked "what's the right NVIDIA base image for running
+# HuggingFace models with CUDA" and Copilot suggested this one.
+FROM nvidia/cuda:12.4.1-devel-ubuntu22.04
+
+# BORROWED. Stops Python from buffering stdout/stderr, so logs
+# show up immediately in docker logs. Copied from the other
+# Dockerfile for another DUUI GPU component.
+ENV PYTHONUNBUFFERED=1
+
+WORKDIR /usr/src/app
+
+EXPOSE 9714
+
+# -- Python installation --
+# The NVIDIA base image doesn't come with Python, unlike the python:3.10
+# image we use in the base Dockerfile. So we install it manually.
+# COPILOT helped with the apt-get lines. The DEBIAN_FRONTEND=noninteractive
+# suppresses interactive prompts during install that would hang the build.
+RUN apt-get update && \
+ DEBIAN_FRONTEND=noninteractive apt-get install -y \
+ python3.10 \
+ python3-pip \
+ python3.10-venv \
+ && rm -rf /var/lib/apt/lists/*
+
+# -- Dependencies --
+# Same as base Dockerfile.
+RUN pip install --upgrade pip
+RUN pip install setuptools wheel
+COPY ./requirements.txt ./requirements.txt
+RUN pip install -r requirements.txt
+
+# -- Model pre-download --
+# Same as base Dockerfile.
+#
+# v2 ABANDONED: TrOCR download
+# RUN python3 -c "\
+# from transformers import TrOCRProcessor, VisionEncoderDecoderModel; \
+# VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-printed'); \
+# TrOCRProcessor.from_pretrained('microsoft/trocr-base-printed')"
+
+RUN python3 -c "\
+from transformers import AutoProcessor, AutoModelForImageTextToText; \
+AutoModelForImageTextToText.from_pretrained('PaddlePaddle/PaddleOCR-VL-1.5'); \
+AutoProcessor.from_pretrained('PaddlePaddle/PaddleOCR-VL-1.5'); \
+AutoModelForImageTextToText.from_pretrained('zai-org/GLM-OCR'); \
+AutoProcessor.from_pretrained('zai-org/GLM-OCR')"
+
+# -- Source files --
+COPY ./src/main/python/TypeSystemOCR.xml ./TypeSystemOCR.xml
+COPY ./src/main/python/duui_ocr.py ./duui_ocr.py
+COPY ./src/main/python/duui_ocr.lua ./duui_ocr.lua
+
+# -- Configuration --
+# Same ARG/ENV pairs as base Dockerfile.
+ARG DUUI_OCR_LOG_LEVEL="DEBUG"
+ENV DUUI_OCR_LOG_LEVEL=$DUUI_OCR_LOG_LEVEL
+
+ARG DUUI_OCR_MODEL_CACHE_SIZE=1
+ENV DUUI_OCR_MODEL_CACHE_SIZE=$DUUI_OCR_MODEL_CACHE_SIZE
+
+ARG DUUI_OCR_ANNOTATOR_NAME="duui-ocr"
+ENV DUUI_OCR_ANNOTATOR_NAME=$DUUI_OCR_ANNOTATOR_NAME
+ARG DUUI_OCR_ANNOTATOR_VERSION="unset"
+ENV DUUI_OCR_ANNOTATOR_VERSION=$DUUI_OCR_ANNOTATOR_VERSION
+
+# -- CUDA-specific --
+# COPILOT. Asked "what NVIDIA env vars does a container need to
+# use the host GPU." These tell the NVIDIA container runtime which
+# GPU capabilities to expose. "compute,utility" covers inference
+# (compute) and nvidia-smi (utility). There's also "graphics" and
+# "video" but we don't need those.
+# Source: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/docker-specialized.html
+ENV NVIDIA_VISIBLE_DEVICES=all
+ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
+
+# -- Startup --
+ENTRYPOINT ["python3", "-m", "uvicorn", "duui_ocr:app", "--host", "0.0.0.0", "--port", "9714"]
+CMD ["--workers", "1"]
\ No newline at end of file
diff --git a/duui-ocr/src/main/python/TypeSystemOCR.xml b/duui-ocr/src/main/python/TypeSystemOCR.xml
new file mode 100644
index 00000000..2164ae04
--- /dev/null
+++ b/duui-ocr/src/main/python/TypeSystemOCR.xml
@@ -0,0 +1,89 @@
+
+
+
+
+ org.texttechnologylab.annotation.type.Image
+ Image annotation with source data
+ uima.tcas.Annotation
+
+
+ src
+
+ Base64 encoded image data or file path
+
+ uima.cas.String
+
+
+ width
+
+ uima.cas.Integer
+
+
+ height
+
+ uima.cas.Integer
+
+
+ mimetype
+
+ uima.cas.String
+
+
+
+
+
+
+ org.texttechnologylab.annotation.AnnotationComment
+
+
+ uima.cas.AnnotationBase
+
+
+ reference
+
+ uima.cas.TOP
+
+
+ value
+
+ uima.cas.String
+
+
+ key
+
+ uima.cas.String
+
+
+
+
+
+
+ org.texttechnologylab.annotation.model.MetaData
+
+
+ uima.tcas.Annotation
+
+
+ modelName
+
+ uima.cas.String
+
+
+ modelVersion
+
+ uima.cas.String
+
+
+ source
+
+ uima.cas.String
+
+
+ lang
+
+ uima.cas.String
+
+
+
+
+
\ No newline at end of file
diff --git a/duui-ocr/src/main/python/duui_ocr.lua b/duui-ocr/src/main/python/duui_ocr.lua
new file mode 100644
index 00000000..c40d8ae2
--- /dev/null
+++ b/duui-ocr/src/main/python/duui_ocr.lua
@@ -0,0 +1,248 @@
+--[[
+I do not really know Lua :(. I've never written Lua before this Praktikum.
+Copilot is near-useless here because it doesn't understand the luajava
+bridge or the DUUI-specific patterns, and keeps hallucinating methods
+that don't exist on the Java objects :((. So for this file I leaned heavily
+on ChatGPT and on reading existing Lua scripts from other DUUI
+annotators in the TTLab repo.
+
+Sources I borrowed from (all from the same GitHub org):
+ https://github.com/texttechnologylab/duui-uima
+ - duui-transformers-summary/src/.../duui_summary.lua
+ (the serialize/deserialize skeleton, the JCasUtil iteration pattern,
+ the MetaData and DocumentModification annotation creation)
+ - duui-transformers-sentiment/src/.../duui_sentiment.lua
+ (the selection-based iteration with Class:forName, the pattern for
+ writing results back as typed annotations with begin/end offsets)
+ - duui-image-generation/src/.../duui_image_generation.lua
+ (the Image annotation type usage, error handling with
+ AnnotationComment, writing config key-value pairs back as
+ annotation comments. I basically lifted that pattern wholesale)
+
+The structure is always the same across all DUUI Lua scripts.
+Once I understood that pattern from reading the existing scripts,
+writing this one was mostly a matter of swapping in the right
+annotation types and field names for OCR.
+
+ChatGPT wrote the first draft of both functions. I edited field names
+and annotation types to match our Python server's request/response
+schemas.
+
+Last meaningful edit: Feb 2026
+--]]
+
+
+-- -- Java class bindings --
+-- BORROWED. I copied this block from duui-transformers-sentiment and
+-- duui-image-generation, then added/removed classes as needed.
+-- DUUILuaUtils is a TTLab helper that wraps some common operations
+-- like getting document text length (which is apparently not trivial
+-- in UIMA because of how surrogate pairs work? I didn't dig into it).
+--
+-- The string concatenation with ".." is Lua's version of "+" for
+-- strings. ChatGPT taught me that. I split long class names across
+-- lines because some of these fully-qualified Java names are... something.
+
+StandardCharsets = luajava.bindClass(
+ "java.nio.charset.StandardCharsets"
+)
+Class = luajava.bindClass("java.lang.Class")
+JCasUtil = luajava.bindClass(
+ "org.apache.uima.fit.util.JCasUtil"
+)
+DUUILuaUtils = luajava.bindClass(
+ "org.texttechnologylab.DockerUnifiedUIMAInterface"
+ .. ".lua.DUUILuaUtils"
+)
+
+
+-- -- serialize --
+-- BORROWED + CHATGPT. The overall skeleton is from duui-transformers-sentiment
+-- and duui-image-generation. The Image annotation iteration is adapted
+-- from duui-image-generation's deserialize function, but run in
+-- reverse. There they *write* Image annotations, here I *read* them.
+--
+-- ChatGPT wrote the first working version after I described what I
+-- needed: "read all Image annotations from the CAS, extract their
+-- src/begin/end fields, and send them as a JSON array along with
+-- model config parameters."
+--
+-- I understand the flow: get params, iterate over typed annotations,
+-- build a table, encode to JSON. The luajava method-call syntax with
+-- the colons (obj:method()) vs dots (obj.field) still trips me up.
+-- In Lua, colon means "call this method on the object" and dot means
+-- "access this field." I think. ChatGPT explained it three times.
+
+function serialize(inputCas, outputStream, parameters)
+ local doc_lang = inputCas:getDocumentLanguage()
+ local doc_text = inputCas:getDocumentText() -- NOTE(review): never read below; candidate for removal
+ local doc_len =
+ DUUILuaUtils:getDocumentTextLength(inputCas)
+
+ local model_name = parameters["model_name"]
+
+ -- Default task to "ocr" if not specified. Most of the time
+ -- that's what we want anyway.
+ local task = parameters["task"]
+ if task == nil then
+ task = "ocr"
+ end
+
+ -- Cap on how much text the model can generate per image.
+ -- 1024 is generous for OCR, a full page of text is usually
+ -- well under that in tokens. But better too high than truncated.
+ local max_new_tokens = parameters["max_new_tokens"]
+ if max_new_tokens == nil then
+ max_new_tokens = 1024
+ end
+
+ -- -- Collect Image annotations from the CAS --
+ -- BORROWED. This pattern is straight from duui-image-generation.
+ -- Select every org.texttechnologylab.annotation.type.Image in the
+ -- CAS and copy its src plus begin/end offsets into a 1-based Lua
+ -- array, which is what gets sent as the "images" field below.
+
+ local images = {}
+ local images_count = 1
+ local ImageClass = Class:forName(
+ "org.texttechnologylab.annotation.type.Image"
+ )
+ local images_it =
+ JCasUtil:select(inputCas, ImageClass):iterator()
+
+ while images_it:hasNext() do
+ local img = images_it:next()
+ local image_data = {
+ src = img:getSrc(),
+ begin = img:getBegin(),
+ ["end"] = img:getEnd(),
+ }
+ images[images_count] = image_data
+ images_count = images_count + 1
+ end
+
+ outputStream:write(json.encode({
+ images = images,
+ lang = doc_lang,
+ doc_len = doc_len,
+ model_name = model_name,
+ task = task,
+ max_new_tokens = max_new_tokens,
+ }))
+end
+
+
+-- -- deserialize --
+-- BORROWED + CHATGPT. The overall structure is a patchwork of patterns
+-- from the three existing Lua scripts I studied:
+-- - Error handling with AnnotationComment: from duui-image-generation
+-- - MetaData annotation creation: from duui-transformers-sentiment
+-- and duui-transformers-summary (they're basically identical)
+-- - Writing results as AnnotationComment key-value pairs: from
+-- duui-image-generation's config loop
+--
+-- ChatGPT helped me stitch these patterns together and adapt them
+-- to match the OCRResponse schema from our Python server.
+--
+-- FRAGILE. This function assumes the Python server's response JSON
+-- has exactly the field names we check for. If someone changes the
+-- Pydantic model on the Python side without updating this Lua script,
+-- results will silently not appear in the CAS. I don't know how to
+-- make this more robust in Lua. There's no schema validation.
+
+function deserialize(inputCas, inputStream)
+ -- Read the whole response into a java.lang.String (decoded as
+ -- UTF-8) and parse it as JSON. From duui-transformers-summary.
+ local inputString = luajava.newInstance(
+ "java.lang.String",
+ inputStream:readAllBytes(),
+ StandardCharsets.UTF_8
+ )
+ local results = json.decode(inputString)
+
+ -- -- Error handling --
+ -- BORROWED from duui-image-generation. One comment per error entry.
+ if results["errors"] ~= nil then
+ for _, err_msg in ipairs(results["errors"]) do
+ local err_annotation = luajava.newInstance(
+ "org.texttechnologylab.annotation"
+ .. ".AnnotationComment",
+ inputCas
+ )
+ err_annotation:setKey("error")
+ err_annotation:setValue(tostring(err_msg))
+ err_annotation:addToIndexes()
+ end
+ end
+
+ -- -- Model metadata --
+ -- BORROWED from duui-transformers-sentiment and
+ -- duui-transformers-summary.
+ if results["model_name"] ~= nil then
+ local model_meta = luajava.newInstance(
+ "org.texttechnologylab.annotation"
+ .. ".model.MetaData",
+ inputCas
+ )
+ model_meta:setModelName(results["model_name"])
+ model_meta:setModelVersion(
+ results["model_version"]
+ )
+ model_meta:setSource(results["model_source"])
+ model_meta:setLang(results["model_lang"])
+ model_meta:addToIndexes()
+ end
+
+ -- -- OCR results --
+ -- CHATGPT. I asked ChatGPT to write this block.
+ -- Prompt was roughly: "iterate over ocr_results from the JSON,
+ -- create an AnnotationComment for each, set the key to the task
+ -- name and the value to the recognized text."
+ --
+ -- I'm using AnnotationComment as the output type because our
+ -- TypeSystemOCR.xml doesn't define a dedicated OCR annotation
+ -- type (yet). AnnotationComment is a generic key-value pair
+ -- that's available in the TTLab type system. The key stores
+ -- which task produced this result ("ocr", "table", "formula",
+ -- etc.) and the value stores the actual text.
+ --
+ -- REVISIT. This drops the begin/end offsets that identify which
+ -- image a result belongs to. TypeSystemOCR.xml declares
+ -- AnnotationComment with supertype uima.cas.AnnotationBase, so
+ -- (per that file) it has no begin/end features of its own; it
+ -- does declare a `reference` feature (uima.cas.TOP). Pointing
+ -- `reference` at the source Image annotation looks like the way
+ -- to keep the link, but that needs the Image looked up here by
+ -- offsets; not done yet, so results are currently unlinked.
+ -- NOTE(review): confirm the server echoes begin/end per result.
+ if results["ocr_results"] ~= nil then
+ for _, result in ipairs(results["ocr_results"]) do
+ local ocr_annotation = luajava.newInstance(
+ "org.texttechnologylab.annotation"
+ .. ".AnnotationComment",
+ inputCas
+ )
+ ocr_annotation:setKey(result["task"])
+ ocr_annotation:setValue(result["text"])
+ ocr_annotation:addToIndexes()
+ end
+ end
+
+ -- -- Config as annotation comments --
+ -- BORROWED + CHATGPT. This pattern is directly from duui-image-generation.
+ -- Lua is dynamically typed but Java is not, and the luajava bridge
+ -- doesn't do implicit conversion. Found that out when it threw
+ -- an error on a numeric value. ChatGPT suggested tostring() as the fix.
+ if results["config"] ~= nil then
+ for key, value in pairs(results["config"]) do
+ local config_annotation = luajava.newInstance(
+ "org.texttechnologylab.annotation"
+ .. ".AnnotationComment",
+ inputCas
+ )
+ config_annotation:setKey("config_" .. key)
+ config_annotation:setValue(tostring(value))
+ config_annotation:addToIndexes()
+ end
+ end
+end
\ No newline at end of file
diff --git a/duui-ocr/src/main/python/duui_ocr.py b/duui-ocr/src/main/python/duui_ocr.py
new file mode 100644
index 00000000..4b977cc7
--- /dev/null
+++ b/duui-ocr/src/main/python/duui_ocr.py
@@ -0,0 +1,883 @@
+"""
+duui_ocr.py
+
+FastAPI server that wraps vision-language models and exposes them as a
+DUUI-compatible annotator component. You send it images (base64 or file
+paths), it sends back OCR text.
+
+ITERATION HISTORY:
+
+ v1: PaddleOCR-VL-1.5 only. One model, everything in one huge function,
+ worked but was impossible to extend. I knew from the start we'd
+ need to support more models, so I had to improve on this version
+ even though it technically ran fine. The problem was architectural,
+ not functional.
+
+ v2: Tried to add microsoft/trocr-base-printed as a second model.
+ Spent two days on this before realizing TrOCR is a fundamentally
+ different kind of model. It uses VisionEncoderDecoderModel instead
+ of AutoModelForImageTextToText, needs its own TrOCRProcessor
+ instead of AutoProcessor, has no concept of chat templates or
+ text prompts, and this is the real killer: it only works on
+ single text-line images :(. You literally have to pre-crop every
+ line of text before feeding it in. It doesn't do full-page OCR.
+ My whole infrastructure assumes you hand the model a page and get
+ text back. TrOCR assumes someone else already found the text
+ lines for you. I couldn't reconcile these two approaches without
+ rewriting everything into two completely separate pipelines, and
+ at that point what's the shared infrastructure even for?
+ The abandoned TrOCR backend code is still in this file, commented
+ out, as proof of "concept".
+
+ v3: Added zai-org/GLM-OCR instead. This worked almost immediately
+ because GLM-OCR is architecturally the same *kind* of model as
+ PaddleOCR-VL: it's a vision-language model that uses
+ AutoModelForImageTextToText, supports AutoProcessor with chat
+ templates, accepts text prompts alongside images, and does
+ full-page OCR. The backend pattern I'd already built for PaddleOCR
+ fit GLM-OCR with only minor adjustments. Sometimes the answer
+ isn't "write more code," it's "pick a compatible model."
+
+Heavy lifting on the model loading, batching, and the generate() call
+was done with GitHub Copilot. I understand the flow but some of the
+torch-specific idioms (inference_mode, bfloat16, cache eviction) are
+things I looked up rather than knew from experience.
+
+The DUUI integration layer (typesystem, lua script, endpoints) is
+mostly lifted from existing DUUI annotator examples in the TTLab repo:
+https://github.com/texttechnologylab/DockerUnifiedUIMAInterface
+
+Last meaningful edit: Feb 2026
+"""
+
+from __future__ import annotations
+
+import base64
+import gc
+import logging
+import os
+from abc import ABC, abstractmethod
+from functools import lru_cache
+from io import BytesIO
+from threading import Lock
+from typing import Dict, List, Optional, Union
+
+import torch
+from cassis import load_typesystem
+from fastapi import FastAPI
+from fastapi.encoders import jsonable_encoder
+from PIL import Image as PILImage
+from pydantic import BaseModel
+from pydantic_settings import BaseSettings
+from starlette.responses import JSONResponse, PlainTextResponse, Response
+from transformers import AutoModelForImageTextToText, AutoProcessor
+
+# -- v2: TrOCR imports --
+# ABANDONED.
+# TrOCR needs its own model class and processor class. It can't use
+# the Auto* classes that PaddleOCR-VL and GLM-OCR share.
+#
+# from transformers import TrOCRProcessor, VisionEncoderDecoderModel
+
+# -- Registry --
+# SOLID. This is just a dictionary.
+#
+# Each model we support gets an entry here with its metadata.
+# "task_prompts" maps a task name to the literal string the model
+# expects as its instruction. I got these prompt strings from the
+# respective model cards on HuggingFace:
+# - https://huggingface.co/PaddlePaddle/PaddleOCR-VL-1.5
+# - https://huggingface.co/zai-org/GLM-OCR
+#
+# If you add a new model, you add it here and write a backend class
+# for it below. "backend" is just a string key that maps to a class
+# in BACKEND_MAP at the bottom of the backends section.
+
+# Keys of each entry and where this module consumes them:
+#   source / lang / version -> echoed in every OCRResponse and exposed by
+#                              /v1/documentation
+#   tasks                   -> exposed by /v1/documentation and used by
+#                              post_process to validate the requested task
+#   task_prompts            -> looked up by OCRBackend.get_prompt
+#   backend                 -> key into BACKEND_MAP, resolved in load_backend
+MODEL_REGISTRY = {
+    "PaddlePaddle/PaddleOCR-VL-1.5": {
+        "source": "https://huggingface.co/PaddlePaddle/PaddleOCR-VL-1.5",
+        "lang": "multi",
+        "version": "2026-01-28",
+        "tasks": ["ocr", "table", "formula", "chart", "spotting", "seal"],
+        "task_prompts": {
+            "ocr": "OCR:",
+            "table": "Table Recognition:",
+            "formula": "Formula Recognition:",
+            "chart": "Chart Recognition:",
+            "spotting": "Spotting:",
+            "seal": "Seal Recognition:",
+        },
+        "backend": "paddleocr",
+    },
+    # -- v2: TrOCR registry entry --
+    # ABANDONED. I had this in the registry for about 6 hours before
+    # I realized it was never going to work with the shared backend.
+    #
+    # "microsoft/trocr-base-printed": {
+    #     "source": "https://huggingface.co/microsoft/trocr-base-printed",
+    #     "lang": "en",  # TrOCR is English-only, unlike the others
+    #     "version": "2021-09-21",
+    #     "tasks": ["ocr"],  # only OCR, no table/formula/chart support
+    #     "task_prompts": {
+    #         # TrOCR doesn't actually use text prompts at all.
+    #         # It just takes pixel_values and generates text directly.
+    #         # I put this here to fit the registry schema but it's
+    #         # meaningless, the TrOCR backend ignores it.
+    #         "ocr": "",
+    #     },
+    #     "backend": "trocr",
+    # },
+    "zai-org/GLM-OCR": {
+        "source": "https://huggingface.co/zai-org/GLM-OCR",
+        "lang": "multi",
+        "version": "2026-02-09",
+        "tasks": ["ocr", "table", "formula"],
+        "task_prompts": {
+            "ocr": "Text Recognition:",
+            "table": "Table Recognition:",
+            "formula": "Formula Recognition:",
+        },
+        "backend": "glmocr",
+    },
+}
+
+# Just collects every unique task string across all models.
+# The sorted() is cosmetic, I like alphabetical order in API docs.
+# Union of the "tasks" lists of every registered model, de-duplicated and
+# alphabetically ordered.
+ALL_SUPPORTED_TASKS = sorted(
+    set().union(*(entry["tasks"] for entry in MODEL_REGISTRY.values()))
+)
+
+# -- Settings & globals --
+# BORROWED. pydantic-settings pattern from TTLab's other DUUI components.
+# Source: https://github.com/texttechnologylab/DockerUnifiedUIMAInterface
+#
+# The idea is that all config comes from environment variables so the
+# Docker container can be parameterized at runtime. BaseSettings does
+# the env-var-to-field mapping automatically, which I didn't know before.
+
+
+class Settings(BaseSettings):
+    """
+    Runtime configuration of the annotator, filled in by pydantic-settings
+    from environment variables that correspond to the field names.
+    """
+
+    # Used as the FastAPI title and reported by /v1/documentation.
+    duui_ocr_annotator_name: str
+    # Used as the FastAPI version and reported by /v1/documentation.
+    duui_ocr_annotator_version: str
+    # Passed unchanged to logging.basicConfig(level=...).
+    duui_ocr_log_level: str
+    # Upper bound on the number of backends load_backend keeps cached.
+    duui_ocr_model_cache_size: int = 1  # how many models to keep loaded
+
+
+# Everything below runs once at import time.
+settings = Settings()
+logging.basicConfig(level=settings.duui_ocr_log_level)
+logger = logging.getLogger(__name__)
+
+# COPILOT. I asked Copilot "how to pick GPU vs CPU and set dtype for
+# transformers inference" and this is essentially what it gave me.
+# bfloat16 is a half-precision float that saves VRAM. I *think* it's
+# fine for inference but not for training? Either way it works here.
+# On CPU we fall back to float32 because bfloat16 support on CPU is
+# patchy depending on the hardware.
+# Only the first CUDA device ("cuda:0") is ever selected.
+DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
+DTYPE = torch.bfloat16 if DEVICE != "cpu" else torch.float32
+logger.info("Using device: %s", DEVICE)
+
+# Only one thread can use the model at a time. Without this lock,
+# concurrent requests can corrupt the GPU state and you get cryptic
+# CUDA errors. Learned that the hard way during testing.
+# post_process holds it around both load_backend and run_batch.
+model_lock = Lock()
+
+# BORROWED, DUUI boilerplate. Every DUUI annotator needs a UIMA type
+# system (XML) and a Lua communication script. These are loaded once
+# at startup. The format is dictated by the DUUI framework.
+# Both paths are relative, so they resolve against the working directory
+# the server process is started from.
+with open("TypeSystemOCR.xml", "rb") as f:
+    typesystem = load_typesystem(f)
+with open("duui_ocr.lua", "rb") as f:
+    lua_communication_script = f.read().decode("utf-8")
+
+# -- Schemas --
+# SOLID. These are just data shapes for the API. Pydantic validates
+# incoming JSON against these classes automatically, which is genuinely
+# one of the nicest things about FastAPI.
+#
+# "begin" and "end" are character offsets in the original UIMA document.
+# They travel with the image so we can attach the OCR result back to
+# the right spot in the document.
+
+
+class ImageInput(BaseModel):
+    """One image to recognize plus the document span it belongs to."""
+
+    src: str  # base64-encoded image data or a file path
+    begin: int  # start character offset in the original UIMA document
+    end: int  # end character offset in the original UIMA document
+
+
+class OCRResult(BaseModel):
+    """Recognized text for one image, carrying that image's offsets back."""
+
+    text: str  # the recognized text
+    task: str  # which task produced this ("ocr", "table", etc.)
+    begin: int  # copied from the corresponding ImageInput.begin
+    end: int  # copied from the corresponding ImageInput.end
+
+
+class OCRRequest(BaseModel):
+    """Body of POST /v1/process."""
+
+    images: List[ImageInput]
+    # NOTE(review): lang and doc_len are required by the schema but are not
+    # read by post_process in this module.
+    lang: str
+    doc_len: int
+    model_name: str  # expected to be a key of MODEL_REGISTRY
+    task: str = "ocr"  # must be listed in the chosen model's "tasks"
+    max_new_tokens: int = 1024  # upper bound on model output length
+
+
+class OCRResponse(BaseModel):
+    """Body returned by POST /v1/process, including partial failures."""
+
+    ocr_results: List[OCRResult]  # one entry per successfully processed image
+    model_name: str  # echoed from the request
+    # The next three come from MODEL_REGISTRY, or "Unknown" if the model
+    # name is not registered.
+    model_version: str
+    model_source: str
+    model_lang: str
+    errors: List[str]  # we collect errors instead of crashing
+    config: Dict[str, Union[str, int, bool]]  # request's task and max_new_tokens
+
+
+# BORROWED, DUUI documentation schema. Every annotator must describe
+# itself through this endpoint. Copied from existing annotators.
+class TextImagerDocumentation(BaseModel):
+    """Self-description returned by GET /v1/documentation."""
+
+    annotator_name: str
+    version: str
+    implementation_lang: Optional[str] = None
+    meta: Optional[dict] = None  # here: registered models and supported tasks
+    parameters: Optional[dict] = None  # here: request parameter descriptions
+
+
+# -- Helpers --
+
+
+def decode_image(src: str) -> PILImage.Image:
+    """
+    Load an image from a file path or a base64 string as an RGB PIL image.
+
+    Args:
+        src: Path of an existing file, or base64-encoded image bytes. The
+            value is treated as a path whenever os.path.isfile(src) is true.
+
+    Returns:
+        A new, fully loaded RGB image that does not depend on the source
+        file or buffer any more.
+
+    Raises:
+        Whatever base64.b64decode or PIL raise for malformed input; the
+        caller (post_process) records these per image.
+
+    The .convert("RGB") is important because some PNGs come in as RGBA or
+    palette mode and the models choke on that.
+
+    NOTE(review): src comes straight from the request, so a client can make
+    the server open any file readable by this process. Confirm that is
+    acceptable for the deployment, or restrict paths to a known directory.
+    """
+    if os.path.isfile(src):
+        # PIL opens lazily and keeps the file handle on the source image.
+        # convert() returns an independent copy, so the source can be closed
+        # right away instead of waiting for garbage collection.
+        with PILImage.open(src) as opened:
+            return opened.convert("RGB")
+    with PILImage.open(BytesIO(base64.b64decode(src))) as opened:
+        return opened.convert("RGB")
+
+
+def to_device(mapping: dict) -> dict:
+    """
+    Return a new dict with every torch.Tensor value moved to DEVICE.
+
+    Values that are not tensors are passed through untouched, because the
+    processor output can contain entries that have no .to() method. The
+    input mapping itself is not modified.
+    """
+    moved = {}
+    for name, value in mapping.items():
+        if isinstance(value, torch.Tensor):
+            moved[name] = value.to(DEVICE)
+        else:
+            moved[name] = value
+    return moved
+
+
+def generate(model, inputs: dict, max_new_tokens: int):
+    """
+    Call model.generate() on the prepared inputs with fixed decoding options.
+
+    Args:
+        model: Object exposing a transformers-style generate(**kwargs).
+        inputs: Keyword arguments for generate (processor output on DEVICE).
+        max_new_tokens: Upper bound on the number of generated tokens.
+
+    Returns:
+        Whatever model.generate returns.
+
+    Decoding options:
+      - do_sample=False: deterministic output, same image = same text.
+      - use_cache=True: reuse intermediate computations between decoding
+        steps, which makes generation faster.
+    The call runs under torch.inference_mode(), so autograd is disabled.
+    """
+    decoding_options = {
+        "max_new_tokens": max_new_tokens,
+        "do_sample": False,
+        "use_cache": True,
+    }
+    with torch.inference_mode():
+        return model.generate(**inputs, **decoding_options)
+
+
+# -- Backends --
+# This is the part I'm least confident about architecturally.
+# The idea: each model family has slightly different ways of building
+# the input prompt and decoding the output. So each one gets its own
+# "backend" class that knows how to talk to that specific model.
+#
+# The abstract base class defines the interface. Subclasses fill in
+# the details. I learned this pattern from the original code in the
+# DUUI repo.
+#
+# v1 had just PaddleOCR, no abstraction needed.
+# v2 is where I introduced the ABC because I thought TrOCR would be
+# a second subclass. It wasn't :((. TrOCR's interface was too different.
+# v3 kept the ABC because GLM-OCR actually fits it perfectly.
+# So the abstraction turned out to be useful, just not for the model
+# I originally designed it for.
+
+
+class OCRBackend(ABC):
+    """
+    Common interface of all model backends.
+
+    A backend bundles a loaded model, its processor and the model's
+    MODEL_REGISTRY entry. Subclasses must implement run_single and may
+    override _run_batch_impl with true batched inference.
+    """
+
+    def __init__(self, model_name: str, model, processor):
+        # model_name must be a MODEL_REGISTRY key; a KeyError is raised
+        # otherwise.
+        self.meta = MODEL_REGISTRY[model_name]
+        self.model_name = model_name
+        self.model = model
+        self.processor = processor
+
+    def get_prompt(self, task: str) -> str:
+        """
+        Return the instruction string this model expects for ``task``.
+
+        Unknown tasks fall back to the model's "ocr" prompt, and to the
+        literal "OCR:" if the model defines no "ocr" prompt either.
+        """
+        prompts = self.meta["task_prompts"]
+        if task in prompts:
+            return prompts[task]
+        return prompts.get("ocr", "OCR:")
+
+    @abstractmethod
+    def run_single(
+        self, image: PILImage.Image, task: str, max_new_tokens: int
+    ) -> str: ...
+
+    def run_batch(
+        self,
+        images: List[PILImage.Image],
+        task: str,
+        max_new_tokens: int,
+    ) -> List[str]:
+        """
+        Recognize all images, preferring one batched call.
+
+        If the batched implementation raises for any reason, the failure is
+        logged as a warning and every image is processed individually with
+        run_single instead. An exception raised during that sequential
+        pass is not caught here.
+        """
+        try:
+            texts = self._run_batch_impl(images, task, max_new_tokens)
+        except Exception as e:
+            logger.warning("Batch failed, falling back to sequential: %s", e)
+            texts = []
+            for img in images:
+                texts.append(self.run_single(img, task, max_new_tokens))
+        return texts
+
+    def _run_batch_impl(
+        self,
+        images: List[PILImage.Image],
+        task: str,
+        max_new_tokens: int,
+    ) -> List[str]:
+        """Default: one run_single call per image; subclasses batch for real."""
+        texts = []
+        for img in images:
+            texts.append(self.run_single(img, task, max_new_tokens))
+        return texts
+
+
+class PaddleOCRBackend(OCRBackend):
+    """
+    BORROWED + COPILOT. Backend for PaddlePaddle/PaddleOCR-VL-1.5.
+    This was the first model I got working (v1). The chat template
+    pattern (apply_chat_template) comes from the HuggingFace model
+    card example:
+    https://huggingface.co/PaddlePaddle/PaddleOCR-VL-1.5
+
+    The model repeats the prompt in its output, so it has to be stripped.
+    The prompt is removed at token level: generate() returns the input ids
+    followed by the new ids, so everything before the input length is cut
+    off before decoding. This is the same approach GlmOCRBackend uses and
+    it does not depend on the decoded prompt matching the template text
+    character for character.
+    """
+
+    def _chat_text(self, task: str) -> str:
+        """Render the single-turn chat prompt for ``task`` as a string."""
+        # {"type": "image"} is only a placeholder; the pixel data is passed
+        # to the processor separately.
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image"},
+                    {"type": "text", "text": self.get_prompt(task)},
+                ],
+            }
+        ]
+        return self.processor.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
+        )
+
+    def _decode(
+        self, generated, chat_text: str, prompt_len: int = 0
+    ) -> List[str]:
+        """
+        Decode generated ids into stripped strings without the prompt.
+
+        Args:
+            generated: Output of generate(), one row per image.
+            chat_text: The rendered prompt, used for the textual fallback.
+            prompt_len: Number of leading positions that hold the (padded)
+                input ids. 0 disables token-level stripping.
+        """
+        if prompt_len > 0:
+            generated = generated[:, prompt_len:]
+        decoded = self.processor.batch_decode(
+            generated, skip_special_tokens=True
+        )
+        # Fallback kept from the original implementation: if the rendered
+        # prompt still appears in the text, keep only what follows it.
+        return [r.split(chat_text)[-1].strip() for r in decoded]
+
+    def _recognize(self, images, task, max_new_tokens, **processor_kwargs):
+        """Shared body of run_single and _run_batch_impl."""
+        text = self._chat_text(task)
+        inputs = to_device(
+            self.processor(
+                text=[text] * len(images),
+                images=images,
+                return_tensors="pt",
+                **processor_kwargs,
+            )
+        )
+        out = generate(self.model, inputs, max_new_tokens)
+        return self._decode(out, text, inputs["input_ids"].shape[-1])
+
+    def run_single(self, image, task, max_new_tokens):
+        return self._recognize([image], task, max_new_tokens)[0]
+
+    def _run_batch_impl(self, images, task, max_new_tokens):
+        # padding=True makes the processor pad shorter sequences to the
+        # longest one so all images can go through a single generate() call.
+        return self._recognize(images, task, max_new_tokens, padding=True)
+
+
+
+# v2 ABANDONED: TrOCR Backend
+#
+# I spent a full weekend trying to make this work. Leaving it here
+# commented out as documentation of why it failed, in case anyone
+# else gets the same idea.
+#
+# The core problem: TrOCR (microsoft/trocr-base-printed) is a
+# VisionEncoderDecoderModel, not an AutoModelForImageTextToText.
+#
+# Our whole pipeline sends full page images. TrOCR expects someone
+# to have already detected and cropped individual text lines. I'd
+# need to add a whole text detection step before TrOCR, basically
+# building a separate pipeline.
+#
+# I also couldn't get it to load through AutoModelForImageTextToText
+# without it throwing architecture mismatch errors. Copilot kept
+# suggesting workarounds that compiled but produced garbage output.
+#
+# Source that finally made me understand the difference:
+# https://huggingface.co/docs/transformers/en/model_doc/trocr
+# https://huggingface.co/docs/transformers/v4.16.2/en/model_doc/vision-encoder-decoder
+# Also this HF discussion where someone asks the same question I had:
+# https://huggingface.co/microsoft/trocr-base-printed/discussions/3
+#
+# class TrOCRBackend(OCRBackend):
+# """
+# ABANDONED, backend for microsoft/trocr-base-printed.
+#
+# This doesn't actually inherit from OCRBackend cleanly because
+# the interface is too different. I tried to force it to fit by
+# ignoring the task parameter and skipping the prompt, but the
+# real issue is deeper: TrOCR only does single-line OCR.
+#
+# Model card: https://huggingface.co/microsoft/trocr-base-printed
+# Paper: https://arxiv.org/abs/2109.10282
+# """
+#
+# def __init__(self, model_name: str, model, processor):
+# # Can't call super().__init__() cleanly because the parent
+# # expects self.processor to have apply_chat_template(), which
+# # TrOCRProcessor doesn't have. Already a bad sign.
+# self.model_name = model_name
+# self.model = model
+# self.processor = processor
+# self.meta = MODEL_REGISTRY[model_name]
+#
+# def run_single(self, image, task, max_new_tokens):
+# # TrOCR ignores the task parameter entirely. It only does OCR.
+# # No table recognition, no formula recognition, nothing.
+# #
+# # The processor here is TrOCRProcessor, which only takes images.
+# # No text= argument. No chat template. Just pixel_values.
+# pixel_values = self.processor(
+# images=image, return_tensors="pt"
+# ).pixel_values.to(DEVICE)
+#
+# with torch.inference_mode():
+# generated_ids = self.model.generate(
+# pixel_values,
+# max_new_tokens=max_new_tokens,
+# )
+#
+# return self.processor.batch_decode(
+# generated_ids, skip_special_tokens=True
+# )[0]
+#
+# def _run_batch_impl(self, images, task, max_new_tokens):
+# # FRAGILE: TrOCR batching. I got this working but the results
+# # were garbage on full-page images. The model would output
+# # random fragments or repeat the same word over and over.
+# #
+# # In hindsight this is obvious: the model was trained on
+# # cropped single-line images at 384x384 resolution.
+# pixel_values = self.processor(
+# images=images, return_tensors="pt", padding=True
+# ).pixel_values.to(DEVICE)
+#
+# with torch.inference_mode():
+# generated_ids = self.model.generate(
+# pixel_values,
+# max_new_tokens=max_new_tokens,
+# )
+#
+# return self.processor.batch_decode(
+# generated_ids, skip_special_tokens=True
+# )
+#
+# End of abandoned TrOCR code.
+
+
+class GlmOCRBackend(OCRBackend):
+    """
+    BORROWED + COPILOT. Backend for zai-org/GLM-OCR (v3 addition).
+
+    GLM-OCR is the same kind of model as PaddleOCR-VL: it loads through
+    AutoModelForImageTextToText, its AutoProcessor supports chat templates,
+    it takes full-page images together with a text prompt, and it offers
+    several tasks (ocr, table, formula). That is why it fits OCRBackend
+    where TrOCR did not.
+
+    Model card: https://huggingface.co/zai-org/GLM-OCR
+    GitHub/SDK: https://github.com/zai-org/GLM-OCR
+
+    Differences from PaddleOCRBackend in terms of code:
+      - the PIL image object is placed inside the message
+        ({"type": "image", "image": img}) instead of a bare placeholder
+        with the images passed separately;
+      - apply_chat_template tokenizes directly (tokenize=True,
+        return_dict=True) rather than in a separate processor call.
+    The message format follows the model card example.
+    """
+
+    def _build_messages(self, images: List[PILImage.Image], task: str):
+        """Build one single-turn conversation per image."""
+        prompt = self.get_prompt(task)
+        conversations = []
+        for img in images:
+            user_turn = {
+                "role": "user",
+                "content": [
+                    {"type": "image", "image": img},
+                    {"type": "text", "text": prompt},
+                ],
+            }
+            conversations.append([user_turn])
+        return conversations
+
+    def _generate_and_decode(self, images, task, max_new_tokens):
+        """Tokenize, generate and decode; returns one string per image."""
+        encoded = self.processor.apply_chat_template(
+            self._build_messages(images, task),
+            tokenize=True,
+            add_generation_prompt=True,
+            return_tensors="pt",
+            padding=True,
+            return_dict=True,
+        )
+        inputs = to_device(encoded)
+        out = generate(self.model, inputs, max_new_tokens)
+
+        # The output rows start with the input ids; keep only the part the
+        # model appended after them.
+        prompt_len = inputs["input_ids"].shape[-1]
+        decoded = self.processor.batch_decode(
+            out[:, prompt_len:], skip_special_tokens=True
+        )
+        return [t.strip() for t in decoded]
+
+    def run_single(self, image, task, max_new_tokens):
+        return self._generate_and_decode([image], task, max_new_tokens)[0]
+
+    def _run_batch_impl(self, images, task, max_new_tokens):
+        return self._generate_and_decode(images, task, max_new_tokens)
+
+
+# Maps the "backend" string from MODEL_REGISTRY to the actual class.
+# TrOCR was going to be "trocr": TrOCRBackend here. Now it's just
+# the two that actually work.
+BACKEND_MAP = {
+    "paddleocr": PaddleOCRBackend,
+    "glmocr": GlmOCRBackend,
+}
+
+# -- Model loading --
+
+# -- v2 ABANDONED: TrOCR loader --
+# ABANDONED. TrOCR needs its own loading function because it uses
+# different classes. This was a telling sign it wasn't going to fit.
+#
+# def load_trocr(model_name: str):
+# """
+# Loads TrOCR with VisionEncoderDecoderModel instead of
+# AutoModelForImageTextToText. I tried using Auto* classes first
+# and got:
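+#     ValueError: Unrecognized configuration class
+#     <class 'transformers.models.vision_encoder_decoder.configuration_vision_encoder_decoder.VisionEncoderDecoderConfig'>
+#     for this kind of AutoModel: AutoModelForImageTextToText.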
+#
+# Source for the correct loading pattern:
+# https://huggingface.co/microsoft/trocr-base-printed
+# """
+# from transformers import TrOCRProcessor, VisionEncoderDecoderModel
+# processor = TrOCRProcessor.from_pretrained(model_name)
+# model = VisionEncoderDecoderModel.from_pretrained(
+# model_name, torch_dtype=DTYPE
+# )
+# model.to(DEVICE).eval()
+# return TrOCRBackend(model_name, model, processor)
+
+
+# Loaded backends keyed by model name, least recently used first (dicts keep
+# insertion order). Not thread-safe on its own: post_process only calls
+# load_backend while holding model_lock.
+_BACKEND_CACHE: Dict[str, OCRBackend] = {}
+
+
+def _evict_backends(keep: int) -> None:
+    """Drop least-recently-used backends until at most ``keep`` remain."""
+    evicted = False
+    while len(_BACKEND_CACHE) > max(keep, 0):
+        oldest = next(iter(_BACKEND_CACHE))
+        logger.info("Evicting model: %s", oldest)
+        del _BACKEND_CACHE[oldest]
+        evicted = True
+    if evicted:
+        # Collect first so the dropped model's tensors are actually freed,
+        # then hand the now-unused cached blocks back to CUDA.
+        gc.collect()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+
+
+def load_backend(model_name: str) -> OCRBackend:
+    """
+    Return the backend for ``model_name``, loading the model if necessary.
+
+    At most settings.duui_ocr_model_cache_size backends stay loaded. Unlike
+    functools.lru_cache, which this replaces, room is made *before* the new
+    model is loaded and the evicted model's memory is released explicitly.
+    With lru_cache the old model was only dropped after the new one was
+    already on the device, which caused CUDA OOM errors when switching
+    models.
+
+    Args:
+        model_name: A key of MODEL_REGISTRY.
+
+    Returns:
+        The cached or freshly created OCRBackend subclass instance.
+
+    Raises:
+        ValueError: If ``model_name`` is not registered.
+
+    The AutoProcessor / AutoModelForImageTextToText pattern is from the
+    HuggingFace transformers docs:
+    https://huggingface.co/docs/transformers/model_doc/auto
+    It only works for models that support AutoModelForImageTextToText,
+    which TrOCR does not.
+    """
+    if model_name not in MODEL_REGISTRY:
+        raise ValueError(
+            f"Unknown model: {model_name}. "
+            f"Choose from: {list(MODEL_REGISTRY.keys())}"
+        )
+
+    cached = _BACKEND_CACHE.pop(model_name, None)
+    if cached is not None:
+        # Re-insert so this entry becomes the most recently used one.
+        _BACKEND_CACHE[model_name] = cached
+        return cached
+
+    meta = MODEL_REGISTRY[model_name]
+    capacity = settings.duui_ocr_model_cache_size
+    _evict_backends(capacity - 1)
+
+    logger.info("Loading model: %s", model_name)
+    processor = AutoProcessor.from_pretrained(model_name)
+    model = AutoModelForImageTextToText.from_pretrained(
+        model_name, torch_dtype=DTYPE
+    )
+    # .eval() disables dropout etc.; .to(DEVICE) moves all parameters to
+    # the selected device.
+    model.to(DEVICE).eval()
+    logger.info("Model loaded on %s", DEVICE)
+
+    backend = BACKEND_MAP[meta["backend"]](model_name, model, processor)
+    # A cache size of 0 or less means "do not cache", as with lru_cache.
+    if capacity > 0:
+        _BACKEND_CACHE[model_name] = backend
+    return backend
+
+
+# Kept for callers that relied on the lru_cache API.
+load_backend.cache_clear = lambda: _evict_backends(0)
+
+
+# -- FastAPI --
+# BORROWED. The app setup and DUUI endpoint structure is standard
+# across all DUUI annotators.
+
+# The ASGI application. The OpenAPI schema is served at /openapi.json and
+# the interactive docs at /api; ReDoc is switched off (redoc_url=None).
+# Title and version are taken from the environment-backed settings.
+app = FastAPI(
+    openapi_url="/openapi.json",
+    docs_url="/api",
+    redoc_url=None,
+    title=settings.duui_ocr_annotator_name,
+    description="Multi-model OCR Component for DUUI",
+    version=settings.duui_ocr_annotator_version,
+    terms_of_service="https://www.texttechnologylab.org/legal_notice/",
+    contact={"name": "TTLab Team", "url": "https://texttechnologylab.org"},
+    license_info={
+        "name": "AGPL",
+        "url": "http://www.gnu.org/licenses/agpl-3.0.en.html",
+    },
+)
+
+
+# The next four endpoints are pure DUUI boilerplate. They just serve
+# static content that the DUUI framework needs to discover and
+# configure this annotator. Nothing interesting happens here.
+
+
+@app.get("/v1/typesystem")
+def get_typesystem() -> Response:
+    """Serve the UIMA type system loaded at startup as UTF-8 XML."""
+    xml_bytes = typesystem.to_xml().encode("utf-8")
+    return Response(content=xml_bytes, media_type="application/xml")
+
+
+@app.get("/v1/communication_layer", response_class=PlainTextResponse)
+def get_communication_layer() -> str:
+    """Serve the Lua communication script read at startup as plain text."""
+    script = lua_communication_script
+    return script
+
+
+@app.get("/v1/documentation")
+def get_documentation():
+    """
+    Describe this annotator: name, version, registered models with their
+    public metadata, the union of supported tasks, and request parameters.
+    """
+    exposed_fields = ("source", "lang", "version", "tasks")
+    model_summaries = {}
+    for name, entry in MODEL_REGISTRY.items():
+        model_summaries[name] = {
+            field: entry[field] for field in exposed_fields
+        }
+
+    parameter_help = {
+        "model_name": "Model to use: " + ", ".join(MODEL_REGISTRY),
+        "task": "OCR task: " + ", ".join(ALL_SUPPORTED_TASKS),
+        "max_new_tokens": "Maximum tokens to generate",
+    }
+
+    return TextImagerDocumentation(
+        annotator_name=settings.duui_ocr_annotator_name,
+        version=settings.duui_ocr_annotator_version,
+        implementation_lang="Python",
+        meta={
+            "models": model_summaries,
+            "supported_tasks": ALL_SUPPORTED_TASKS,
+        },
+        parameters=parameter_help,
+    )
+
+
+@app.get("/v1/details/input_output")
+def get_input_output() -> JSONResponse:
+    """Report the UIMA types this annotator consumes and produces."""
+    type_contract = {
+        "inputs": ["org.texttechnologylab.annotation.type.Image"],
+        "outputs": ["org.texttechnologylab.annotation.AnnotationComment"],
+    }
+    return JSONResponse(content=jsonable_encoder(type_contract))
+
+
+@app.post("/v1/process")
+def post_process(request: OCRRequest):
+    """
+    Run OCR on every image of the request and return the recognized text.
+
+    The flow:
+      1. Check that the model is registered and supports the requested task
+      2. Decode all images from base64/filepath to PIL
+      3. Acquire the model lock (one request at a time on the GPU)
+      4. Run the OCR backend on the batch
+      5. Pair each result back with its original document offsets
+      6. Close the images and release cached GPU memory
+
+    Failures never raise out of this handler: they are collected as
+    strings in OCRResponse.errors, because DUUI expects a response even if
+    some images failed. Images that cannot be decoded are skipped and
+    reported individually; any other exception is logged and reported once.
+
+    Returns:
+        OCRResponse with one OCRResult per successfully processed image.
+        Model metadata fields are "Unknown" for an unregistered model.
+    """
+    meta = MODEL_REGISTRY.get(request.model_name, {})
+    ocr_results: List[OCRResult] = []
+    errors: List[str] = []
+    # Declared outside the try block so the finally clause can always close
+    # whatever was opened, including when inference raises.
+    pil_images: List[PILImage.Image] = []
+    valid_indices: List[int] = []
+
+    try:
+        supported = meta.get("tasks", [])
+        if request.model_name not in MODEL_REGISTRY:
+            # Report the real problem instead of "task not supported ...
+            # Choose from: []". Same wording as load_backend's ValueError.
+            errors.append(
+                f"Unknown model: {request.model_name}. "
+                f"Choose from: {list(MODEL_REGISTRY.keys())}"
+            )
+        elif request.task not in supported:
+            errors.append(
+                f"Task '{request.task}' not supported by "
+                f"{request.model_name}. Choose from: {supported}"
+            )
+        else:
+            # Decode images and remember which request indices succeeded so
+            # results can be matched back later. Bad images (corrupt
+            # base64, missing files) are reported but do not stop the batch.
+            for i, img_in in enumerate(request.images):
+                try:
+                    pil_images.append(decode_image(img_in.src))
+                    valid_indices.append(i)
+                except Exception as e:
+                    logger.error("Failed to decode image %d: %s", i, e)
+                    errors.append(f"Image {i}: {e}")
+
+            if pil_images:
+                with model_lock:
+                    backend = load_backend(request.model_name)
+                    texts = backend.run_batch(
+                        pil_images, request.task, request.max_new_tokens
+                    )
+
+                if len(texts) != len(pil_images):
+                    # zip() below would silently drop the surplus.
+                    errors.append(
+                        f"Backend returned {len(texts)} results for "
+                        f"{len(pil_images)} images"
+                    )
+
+                # Pair each text with the offsets of the image it came
+                # from; valid_indices skips the images that failed to load.
+                for idx, text in zip(valid_indices, texts):
+                    img_in = request.images[idx]
+                    ocr_results.append(
+                        OCRResult(
+                            text=text,
+                            task=request.task,
+                            begin=img_in.begin,
+                            end=img_in.end,
+                        )
+                    )
+    except Exception as ex:
+        logger.exception(ex)
+        errors.append(str(ex))
+    finally:
+        # Close PIL images on every path, not only after a successful run.
+        for img in pil_images:
+            img.close()
+        # Collect garbage first so dropped tensors are really freed, then
+        # return the unused cached blocks to CUDA.
+        gc.collect()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+
+    return OCRResponse(
+        ocr_results=ocr_results,
+        model_name=request.model_name,
+        model_version=meta.get("version", "Unknown"),
+        model_source=meta.get("source", "Unknown"),
+        model_lang=meta.get("lang", "Unknown"),
+        errors=errors,
+        config={
+            "task": request.task,
+            "max_new_tokens": request.max_new_tokens,
+        },
+    )
\ No newline at end of file
diff --git a/duui-ocr/src/test/java/org/hucompute/textimager/uima/ocr/ocr/DUUIOCRTest.java b/duui-ocr/src/test/java/org/hucompute/textimager/uima/ocr/ocr/DUUIOCRTest.java
new file mode 100644
index 00000000..af443156
--- /dev/null
+++ b/duui-ocr/src/test/java/org/hucompute/textimager/uima/ocr/ocr/DUUIOCRTest.java
@@ -0,0 +1,654 @@
+/*
+ * I do not really know Java. I know Java-ish, but not enough for this.
+ * I'm a Python person who got handed a Java test and tried to
+ * make it work. Most of my understanding of JUnit, JCas, and the
+ * DUUI test patterns comes from three places:
+ *
+ * 1. ChatGPT: I asked it to explain basically everything.
+ * "What does @BeforeAll do?" "Why does JCas need a factory?"
+ * "What's the difference between @AfterEach and @AfterAll?"
+ * I used it as a tutor more than a code generator here.
+ *
+ * 2. GitHub Copilot: wrote most of the repetitive test methods
+ * after I got the first one working. Once Copilot saw the
+ * pattern for testPaddleOCRWithBase64, it basically generated
+ * the GLM variants with minimal prompting.
+ *
+ * 3. Existing DUUI test files: this is where the real skeleton
+ * came from. I studied and borrowed heavily from:
+ * - GermanSummaryTest.java (the createCas + Sentence pattern)
+ * - SentimentTest.java (the @BeforeAll/@AfterEach lifecycle,
+ * the composer setup, the general structure of everything)
+ * - TextToImageTest.java (the base64 image encoding/decoding,
+ * which was exactly what I needed for OCR image input)
+ * These three files are the real authors of the test scaffolding.
+ * I adapted their patterns for OCR.
+ *
+ * ITERATION HISTORY (mirrors the Python server's evolution):
+ *
+ * v1: Tests for PaddleOCR-VL-1.5 only. This was the first model
+ * I got working end-to-end (Python server → DUUI → Java test).
+ * Took longer than it should have because I was learning JUnit
+ * and UIMA at the same time. But PaddleOCR-VL cooperated.
+ * The test structure comes from this iteration.
+ *
+ * v2: Tried to add tests for microsoft/trocr-base-printed. Wrote
+ * the test methods, ran them, got results that were garbage.
+ * Spent time debugging on the Java side before realizing the
+ * problem wasn't here at all. Commented out below with notes
+ * on what went wrong. See the Python server file for the full
+ * post-mortem. Or don't, it's kinda embarrassing.
+ *
+ * v3: Added tests for zai-org/GLM-OCR. Worked almost immediately.
+ * GLM-OCR is architecturally the same *kind* of model as
+ * PaddleOCR-VL (AutoModelForImageTextToText, chat templates,
+ * full-page OCR), so the test pattern from v1 transferred
+ * directly. Copilot generated most of these by pattern-matching
+ * on the Paddle tests. This is the current working state.
+ *
+ * Source for DUUI test conventions:
+ * https://github.com/texttechnologylab/DockerUnifiedUIMAInterface
+ *
+ * Last meaningful edit: Feb 2026
+ */
+
+package org.hucompute.textimager.uima.ocr;
+
+import org.apache.uima.fit.factory.JCasFactory;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.util.XmlCasSerializer;
+import org.junit.jupiter.api.*;
+import org.texttechnologylab.DockerUnifiedUIMAInterface
+ .DUUIComposer;
+import org.texttechnologylab.DockerUnifiedUIMAInterface
+ .driver.DUUIRemoteDriver;
+import org.texttechnologylab.DockerUnifiedUIMAInterface
+ .lua.DUUILuaContext;
+import org.texttechnologylab.annotation.AnnotationComment;
+import org.texttechnologylab.annotation.type.Image;
+import org.xml.sax.SAXException;
+
+import javax.imageio.ImageIO;
+import java.awt.image.BufferedImage;
+import java.io.*;
+import java.net.UnknownHostException;
+import java.nio.charset.StandardCharsets;
+import java.util.*;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+public class DUUIOCRTest {
+
+ // ── Static fields / shared state ────────────────────────────────
+ // BORROWED. This entire static block pattern is from SentimentTest.
+ //
+ // I asked ChatGPT why these are static and it explained that
+ // @BeforeAll methods must be static in JUnit 5 (unless you change
+ // the lifecycle mode), so anything they initialize also has to be
+ // static. Coming from Python's pytest fixtures this felt weird,
+ // but I get it now. Mostly.
+
+ static DUUIComposer composer;
+ static JCas cas;
+
+ // Port 9714 is what I set in my docker-compose for the OCR server.
+ // If you're running the server somewhere else, change this.
+ static String url = "http://127.0.0.1:9714";
+ static String paddleModel =
+ "PaddlePaddle/PaddleOCR-VL-1.5";
+
+ // ── v2: TrOCR model string ──────────────────────────────────
+ // ABANDONED. This model exists and loads fine. The problem is
+ // what it *does* with full-page images (spoiler: nothing useful).
+ // Kept here as documentation of the attempt.
+ //
+ // Source: https://huggingface.co/microsoft/trocr-base-printed
+ //
+ // static String trOcrModel =
+ // "microsoft/trocr-base-printed";
+
+ // v3: GLM-OCR, the model that actually worked as a second option.
+ // Source: https://huggingface.co/zai-org/GLM-OCR
+ static String glmModel = "zai-org/GLM-OCR";
+
+ // I keep test images in a local directory. They're not committed
+ // to the repo because they're 5-10MB each. You need to put your
+ // own test images here before running these tests.
+ static String testImageDir = "src/test_images";
+
+ // ── Lifecycle methods ───────────────────────────────────────────
+
+ @BeforeAll
+ static void beforeAll() throws Exception {
+     // Shared fixture setup, run once before any test in this class.
+     // The lifecycle pattern follows SentimentTest.beforeAll().
+     //
+     // Step 1: composer with verification skipped. The DUUI handshake
+     // checks were failing locally; acceptable for tests, but not
+     // something to carry over into production pipelines.
+     DUUIComposer verificationSkipped =
+         new DUUIComposer().withSkipVerification(true);
+
+     // Step 2: Lua context with the JSON library enabled, which the
+     // communication script needs.
+     composer = verificationSkipped.withLuaContext(
+         new DUUILuaContext().withJsonLibrary()
+     );
+
+     // Step 3: register the remote driver; the tests add their
+     // components as DUUIRemoteDriver.Component(url).
+     composer.addDriver(new DUUIRemoteDriver());
+
+     // Step 4: one CAS reused by every test (afterEach() resets it).
+     // A ResourceInitializationException here usually means the type
+     // system is not on the classpath (e.g. a missing pom dependency).
+     cas = JCasFactory.createJCas();
+ }
+
+ @AfterAll
+ static void afterAll() throws UnknownHostException {
+     if (composer != null) composer.shutdown(); // null if beforeAll() threw
+ }
+
+ @AfterEach
+ public void afterEach() throws IOException, SAXException {
+     // Per-test teardown (same pattern as SentimentTest):
+     //   1. reset the composer's pipeline,
+     //   2. dump the annotated CAS as XML for manual inspection,
+     //   3. reset the shared CAS for the next test.
+     composer.resetPipeline();
+
+     ByteArrayOutputStream xmlBuffer = new ByteArrayOutputStream();
+     XmlCasSerializer.serialize(cas.getCas(), null, xmlBuffer);
+
+     // Decode as UTF-8 rather than the platform default charset.
+     String casXml = xmlBuffer.toString(StandardCharsets.UTF_8);
+     System.out.println(casXml);
+
+     cas.reset();
+ }
+
+ // ── Helper methods ──────────────────────────────────────────────
+
+ private static String imageToBase64(String imagePath) {
+     // Reads an image file and returns it as a base64-encoded PNG, or
+     // null when the file cannot be read or decoded. Callers skip
+     // null results.
+     //
+     // BORROWED. Essentially convertImageToBase64() from
+     // TextToImageTest.java: file -> BufferedImage -> PNG bytes ->
+     // base64. Re-encoding through ImageIO (instead of encoding the raw
+     // file bytes) normalizes the format, so a PNG is always sent
+     // regardless of the source format.
+     try {
+         BufferedImage bufferedImage = ImageIO.read(new File(imagePath));
+         // ImageIO.read() returns null (it does not throw) when no
+         // registered reader can decode the file, e.g. a corrupted
+         // image. Passing that null on to ImageIO.write() would throw
+         // an IllegalArgumentException that the catch below misses.
+         if (bufferedImage == null) {
+             System.err.println("Cannot decode image: " + imagePath);
+             return null;
+         }
+         ByteArrayOutputStream baos = new ByteArrayOutputStream();
+         ImageIO.write(bufferedImage, "png", baos);
+         return Base64.getEncoder().encodeToString(baos.toByteArray());
+     } catch (IOException e) {
+         e.printStackTrace();
+         return null;
+     }
+ }
+
+ private List<String> collectTestImagePaths() {
+     // Returns the absolute paths of all .png/.jpg/.jpeg files directly
+     // inside testImageDir, sorted, or an empty list when the directory
+     // does not exist.
+     //
+     // FRAGILE. Only the file extension is checked, not the content. A
+     // corrupted PNG with a .png extension passes this filter and only
+     // fails later in imageToBase64().
+     List<String> paths = new ArrayList<>();
+     File dir = new File(testImageDir);
+     if (dir.exists() && dir.isDirectory()) {
+         // listFiles() returns null on an I/O error; fail loudly then.
+         for (File file : Objects.requireNonNull(dir.listFiles())) {
+             String name = file.getName().toLowerCase();
+             if (name.endsWith(".png")
+                     || name.endsWith(".jpg")
+                     || name.endsWith(".jpeg")) {
+                 paths.add(file.getAbsolutePath());
+             }
+         }
+     }
+     // listFiles() guarantees no particular order. Sort so the tests
+     // that only use paths.get(0) pick the same image on every machine.
+     Collections.sort(paths);
+     return paths;
+ }
+
+ private void createCasWithBase64Images(
+         String language, List<String> imagePaths) {
+     // Fills the shared CAS with one Image annotation per readable
+     // image, carrying the base64-encoded image data in its src
+     // feature. Adapted from the createCas() pattern in
+     // GermanSummaryTest and SentimentTest, with Image annotations in
+     // place of Sentence annotations.
+     //
+     // UIMA needs a document text, and annotation offsets must fall
+     // inside it, so a placeholder text is set and every image spans
+     // the whole placeholder (0 to its length).
+     cas.setDocumentLanguage(language);
+     cas.setDocumentText("OCR document");
+     int docEnd = cas.getDocumentText().length();
+     for (String path : imagePaths) {
+         String b64 = imageToBase64(path);
+         if (b64 == null) {
+             // Unreadable image; imageToBase64() already reported it.
+             continue;
+         }
+         Image img = new Image(cas, 0, docEnd);
+         img.setSrc(b64);
+         img.addToIndexes();
+     }
+ }
+
+ private void createCasWithFilePaths(
+         String language, List<String> imagePaths) {
+     // Like createCasWithBase64Images(), but src holds the file path
+     // itself, so the server must be able to read that path.
+     cas.setDocumentLanguage(language);
+     cas.setDocumentText("OCR document");
+     int docEnd = cas.getDocumentText().length();
+     for (String path : imagePaths) {
+         Image img = new Image(cas, 0, docEnd);
+         img.setSrc(path);
+         img.addToIndexes();
+     }
+ }
+
+ private void printResults() {
+     // Prints the key and value of every AnnotationComment in the
+     // shared CAS, one per line, for manual inspection of the output.
+     // BORROWED. Select-by-type-and-print is the common pattern in the
+     // DUUI test files (SentimentTest, TextToImageTest).
+     Collection<AnnotationComment> results =
+         JCasUtil.select(cas, AnnotationComment.class);
+     for (AnnotationComment c : results) {
+         System.out.println(
+             "Key: " + c.getKey()
+                 + " | Value: " + c.getValue()
+         );
+     }
+ }
+
+ // ── PaddleOCR-VL tests (v1) ────────────────────────────────────
+ // These were the first tests I wrote. The whole test structure I
+ // use everywhere else in this file crystallized during this
+ // iteration: compose a pipeline with a model name and task, load
+ // images into the CAS, run the pipeline, check that
+ // AnnotationComments came back.
+ //
+ // Everything after this section is a variation on this pattern.
+
+ @Test
+ public void testPaddleOCRWithBase64() throws Exception {
+ // COPILOT + CHATGPT. This is the first test I got running.
+ // It took an embarrassingly long time. The composer.add()
+ // pattern with .withParameter() is from SentimentTest. ChatGPT
+ // walked me through how DUUIRemoteDriver.Component works:
+ // you give it a URL and parameters, and those parameters get
+ // passed to the Python server as part of the DUUI protocol.
+ //
+ // The assertions are minimal - I just check that:
+ // 1. I actually have test images (otherwise what are we testing?)
+ // 2. At least one AnnotationComment came back
+ //
+ // I don't check the *content* of the OCR results because that
+ // depends on what test images you have. SentimentTest checks
+ // exact probability values, which is possible because text
+ // input is deterministic. OCR results vary with the image, so
+ // I just verify something came back and eyeball the printResults()
+ // output. Not ideal, I know.
+ composer.add(
+ new DUUIRemoteDriver.Component(url)
+ .withParameter("model_name", paddleModel)
+ .withParameter("task", "ocr")
+ );
+ List paths = collectTestImagePaths();
+ assertFalse(paths.isEmpty(),
+ "No test images in " + testImageDir);
+ createCasWithBase64Images("en", paths);
+ composer.run(cas);
+ assertFalse(
+ JCasUtil.select(
+ cas, AnnotationComment.class
+ ).isEmpty(),
+ "No OCR results returned"
+ );
+ printResults();
+ }
+
+ @Test
+ public void testPaddleOCRWithFilePaths()
+ throws Exception {
+ // Same as testPaddleOCRWithBase64 but sends file paths instead
+ // of base64. Copilot generated this after seeing the base64
+ // version. It just swapped createCasWithBase64Images for
+ // createCasWithFilePaths. Amazeballs.
+ //
+ // This test will fail if the server is containerized and
+ // can't see your local filesystem.
+ composer.add(
+ new DUUIRemoteDriver.Component(url)
+ .withParameter("model_name", paddleModel)
+ .withParameter("task", "ocr")
+ );
+ List paths = collectTestImagePaths();
+ assertFalse(paths.isEmpty());
+ createCasWithFilePaths("en", paths);
+ composer.run(cas);
+ assertFalse(
+ JCasUtil.select(
+ cas, AnnotationComment.class
+ ).isEmpty()
+ );
+ printResults();
+ }
+
+ @Test
+ public void testPaddleTableRecognition() throws Exception {
+     // Runs PaddleOCR-VL with task="table" on a single image and checks
+     // that some result comes back. Only one image is sent because
+     // table recognition is slow and does not need batch coverage.
+     //
+     // REVISIT. Use an image that actually contains a table. Right now
+     // the first image in testImageDir is sent, which is probably a
+     // regular text page, so this is not a meaningful test of table
+     // recognition quality.
+     composer.add(
+         new DUUIRemoteDriver.Component(url)
+             .withParameter("model_name", paddleModel)
+             .withParameter("task", "table")
+     );
+     List<String> paths = collectTestImagePaths();
+     assertFalse(paths.isEmpty(), "No test images in " + testImageDir);
+     createCasWithBase64Images(
+         "en",
+         Collections.singletonList(paths.get(0))
+     );
+     composer.run(cas);
+     // Without this assertion the test could only fail by throwing,
+     // so an empty response from the server would go unnoticed.
+     assertFalse(
+         JCasUtil.select(
+             cas, AnnotationComment.class
+         ).isEmpty(),
+         "No table results returned"
+     );
+     printResults();
+ }
+
+ @Test
+ public void testPaddleFormulaRecognition() throws Exception {
+     // Same flow as testPaddleTableRecognition with task="formula".
+     // REVISIT. Should use an image with actual mathematical formulas.
+     composer.add(
+         new DUUIRemoteDriver.Component(url)
+             .withParameter("model_name", paddleModel)
+             .withParameter("task", "formula")
+     );
+     List<String> paths = collectTestImagePaths();
+     assertFalse(paths.isEmpty(), "No test images in " + testImageDir);
+     createCasWithBase64Images(
+         "en", Collections.singletonList(paths.get(0))
+     );
+     composer.run(cas);
+     // Fail on an empty response instead of passing silently.
+     assertFalse(
+         JCasUtil.select(
+             cas, AnnotationComment.class
+         ).isEmpty(),
+         "No formula results returned"
+     );
+     printResults();
+ }
+
+ // ── TrOCR tests (v2) - ABANDONED :( ──────────────────────────────
+ //
+ // This is why I should do more reading and research before doing.
+
+ // What happened:
+ // I wanted a second model to count towards my duui-module count.
+ // microsoft/trocr-base-printed seemed perfect: well-known,
+ // well-documented, specifically designed for OCR. I wrote the
+ // tests below by duplicating the PaddleOCR tests and swapping
+ // in the TrOCR model name.
+ //
+ // The tests *ran* but failed. I assumed I had a bug in how I
+ // was building the CAS, or in the base64 encoding, or in the Lua
+ // communication script. I spent hours debugging before giving up.
+ // I commented everything out. Moved on to GLM-OCR (v3),
+ // which turned out to be the right call.
+ //
+ // Leaving these tests here as documentation. If anyone in the
+ // future wants to integrate TrOCR, they need to know it requires
+ // a fundamentally different approach: pre-crop text lines first,
+ // then feed each line to TrOCR individually.
+ //
+ // BORROWED. Test structure is identical to the Paddle tests.
+ // The problem was never in the test code. It was in the assumption
+ // that TrOCR could handle the same input as PaddleOCR.
+
+ // @Test
+ // public void testTrOCRWithBase64() throws Exception {
+ // // ABANDONED. Too sad to comment further.
+ // composer.add(
+ // new DUUIRemoteDriver.Component(url)
+ // .withParameter("model_name", trOcrModel)
+ // .withParameter("task", "ocr")
+ // );
+ // List paths = collectTestImagePaths();
+ // assertFalse(paths.isEmpty(),
+ // "No test images in " + testImageDir);
+ // createCasWithBase64Images("en", paths);
+ // composer.run(cas);
+ // // This assertion *passes* — results do come back.
+ // // They're just meaningless.
+ // assertFalse(
+ // JCasUtil.select(
+ // cas, AnnotationComment.class
+ // ).isEmpty(),
+ // "No OCR results returned"
+ // );
+ // printResults();
+ // }
+
+ // @Test
+ // public void testTrOCRWithFilePaths() throws Exception {
+ // // ABANDONED. Same story as testTrOCRWithBase64.
+ // composer.add(
+ // new DUUIRemoteDriver.Component(url)
+ // .withParameter("model_name", trOcrModel)
+ // .withParameter("task", "ocr")
+ // );
+ // List paths = collectTestImagePaths();
+ // assertFalse(paths.isEmpty());
+ // createCasWithFilePaths("en", paths);
+ // composer.run(cas);
+ // assertFalse(
+ // JCasUtil.select(
+ // cas, AnnotationComment.class
+ // ).isEmpty()
+ // );
+ // printResults();
+ // }
+
+ // @Test
+ // public void testTrOCRTableRecognition() throws Exception {
+ // // ABANDONED. I wrote this knowing it probably wouldn't work,
+ // // because TrOCR only supports OCR, so no table recognition,
+ // // no formula recognition, nothing. It's a single-task model.
+ // // On the Python side, the TrOCR backend ignores the task
+ // // parameter entirely. But I wrote the test anyway to see
+ // // what would happen.
+ // //
+ // // What happened: the server accepted the request (it falls
+ // // back to basic OCR when the task isn't supported), and
+ // // returned the same garbage as the OCR test. No table structure,
+ // // obviously.
+ // //
+ // // This was the moment I started suspecting the problem was
+ // // deeper than a bug. Three different test configurations,
+ // // all returning the same kind of fragmented output. That's
+ // // not a bug, that's a model doing what it was designed to do
+ // // on input it was never designed to handle.
+ // composer.add(
+ // new DUUIRemoteDriver.Component(url)
+ // .withParameter("model_name", trOcrModel)
+ // .withParameter("task", "table")
+ // );
+ // List paths = collectTestImagePaths();
+ // assertFalse(paths.isEmpty());
+ // createCasWithBase64Images(
+ // "en",
+ // Collections.singletonList(paths.get(0))
+ // );
+ // composer.run(cas);
+ // printResults();
+ // }
+
+ // @Test
+ // public void testTrOCRFormulaRecognition() throws Exception {
+ // // ABANDONED. Same as table, TrOCR doesn't do formulas.
+ // // Included for completeness.
+ // composer.add(
+ // new DUUIRemoteDriver.Component(url)
+ // .withParameter("model_name", trOcrModel)
+ // .withParameter("task", "formula")
+ // );
+ // List paths = collectTestImagePaths();
+ // assertFalse(paths.isEmpty());
+ // createCasWithBase64Images(
+ // "en",
+ // Collections.singletonList(paths.get(0))
+ // );
+ // composer.run(cas);
+ // printResults();
+ // }
+ //
+ // End of abandoned TrOCR tests.
+
+ // ── GLM-OCR tests (v3) ─────────────────────────────────────────
+ // After the TrOCR failure I was genuinely nervous about trying a
+ // third model. But GLM-OCR (zai-org/GLM-OCR) turned out to be
+ // almost suspiciously easy. These tests worked on the first run.
+ //
+ // Source: https://huggingface.co/zai-org/GLM-OCR
+ //
+ // BORROWED. The strategy of mirroring every test across models
+ // is something I saw in SentimentTest, which has separate tests
+ // for different languages and model variants (DeTest, EnTest,
+ // EnCadriffNLPTest, VietnamesePhoBertTest, etc.). Each one is
+ // basically the same flow with different parameters. Repetitive
+ // but easy to read and debug.
+ //
+ // Copilot generated all four of these by pattern-matching on the
+ // Paddle tests above. I only had to change the model string and
+ // the method names.
+
+ @Test
+ public void testGlmOCRWithBase64() throws Exception {
+     // GLM-OCR counterpart of testPaddleOCRWithBase64; only the model
+     // name differs.
+     composer.add(
+         new DUUIRemoteDriver.Component(url)
+             .withParameter("model_name", glmModel)
+             .withParameter("task", "ocr")
+     );
+
+     List<String> imagePaths = collectTestImagePaths();
+     assertFalse(imagePaths.isEmpty(), "No test images in " + testImageDir);
+
+     createCasWithBase64Images("en", imagePaths);
+     composer.run(cas);
+
+     Collection<AnnotationComment> comments =
+         JCasUtil.select(cas, AnnotationComment.class);
+     assertFalse(comments.isEmpty(), "No OCR results returned");
+
+     printResults();
+ }
+
+ @Test
+ public void testGlmOCRWithFilePaths() throws Exception {
+     // GLM-OCR counterpart of testPaddleOCRWithFilePaths. The same
+     // caveat applies: the server must be able to read the local paths.
+     composer.add(
+         new DUUIRemoteDriver.Component(url)
+             .withParameter("model_name", glmModel)
+             .withParameter("task", "ocr")
+     );
+
+     List<String> imagePaths = collectTestImagePaths();
+     assertFalse(imagePaths.isEmpty());
+     createCasWithFilePaths("en", imagePaths);
+     composer.run(cas);
+
+     Collection<AnnotationComment> comments =
+         JCasUtil.select(cas, AnnotationComment.class);
+     assertFalse(comments.isEmpty());
+
+     printResults();
+ }
+
+ @Test
+ public void testGlmTableRecognition() throws Exception {
+     // GLM-OCR with task="table" on one image. REVISIT: use a real table.
+     composer.add(
+         new DUUIRemoteDriver.Component(url)
+             .withParameter("model_name", glmModel)
+             .withParameter("task", "table")
+     );
+     List<String> paths = collectTestImagePaths();
+     assertFalse(paths.isEmpty(), "No test images in " + testImageDir);
+     createCasWithBase64Images("en", Collections.singletonList(paths.get(0)));
+     composer.run(cas);
+     // Fail on an empty response instead of passing silently.
+     assertFalse(
+         JCasUtil.select(cas, AnnotationComment.class).isEmpty(),
+         "No table results returned"
+     );
+     printResults();
+ }
+
+ @Test
+ public void testGlmFormulaRecognition() throws Exception {
+     // GLM-OCR with task="formula" on one image.
+     //
+     // REVISIT. The eight active test methods (four Paddle, four GLM)
+     // are structurally identical. A @ParameterizedTest with a
+     // @MethodSource could collapse them into one or two methods
+     // (model x task x input mode). SentimentTest and TextToImageTest
+     // also use one method per scenario, so this matches the existing
+     // convention for now.
+     composer.add(
+         new DUUIRemoteDriver.Component(url)
+             .withParameter("model_name", glmModel)
+             .withParameter("task", "formula")
+     );
+     List<String> paths = collectTestImagePaths();
+     assertFalse(paths.isEmpty(), "No test images in " + testImageDir);
+     createCasWithBase64Images(
+         "en",
+         Collections.singletonList(paths.get(0))
+     );
+     composer.run(cas);
+     // Fail on an empty response instead of passing silently.
+     assertFalse(
+         JCasUtil.select(cas, AnnotationComment.class).isEmpty(),
+         "No formula results returned"
+     );
+     printResults();
+ }
+}
\ No newline at end of file
diff --git a/duui-ocr/src/test_images/test_1.png b/duui-ocr/src/test_images/test_1.png
new file mode 100644
index 00000000..343b92b6
Binary files /dev/null and b/duui-ocr/src/test_images/test_1.png differ
diff --git a/duui-ocr/src/test_images/test_2.png b/duui-ocr/src/test_images/test_2.png
new file mode 100644
index 00000000..87d343b2
Binary files /dev/null and b/duui-ocr/src/test_images/test_2.png differ
diff --git a/duui-pos-ancient-greek/.gitignore b/duui-pos-ancient-greek/.gitignore
new file mode 100644
index 00000000..2a679d94
--- /dev/null
+++ b/duui-pos-ancient-greek/.gitignore
@@ -0,0 +1,47 @@
+# Python
+.venv/
+venv/
+__pycache__/
+*.pyc
+*.pyo
+*.egg-info/
+*.egg
+dist/
+build/
+
+# Model artifacts (large files)
+model/
+*.bin
+*.safetensors
+checkpoint*/
+
+# IDE
+.idea/
+*.iml
+.DS_Store
+.vscode/
+*.swp
+*.swo
+
+# Java build
+target/
+*.class
+
+# Data (cloned separately)
+data/
+UD_Ancient_Greek-*/
+
+# Docker
+*.tar
+*.tar.gz
+
+# Test outputs
+evaluation_report.txt
+test_output.json
+
+# Cache
+.cache/
+.pytest_cache/
+
+# Logs
+*.log
\ No newline at end of file
diff --git a/duui-pos-ancient-greek/README.md b/duui-pos-ancient-greek/README.md
new file mode 100644
index 00000000..ea1f8a4f
--- /dev/null
+++ b/duui-pos-ancient-greek/README.md
@@ -0,0 +1,97 @@
+[]()
+[]()
+[]()
+
+# Ancient Greek Part-of-Speech Tagger
+
+DUUI implementation for Ancient Greek Part-of-Speech (POS) tagging. This component utilizes a fine-tuned `xlm-roberta-base` model trained on the Universal Dependencies [Ancient Greek Perseus treebank](https://github.com/UniversalDependencies/UD_Ancient_Greek-Perseus), achieving a 91.38% test accuracy for 17 Universal POS tags.
+
+## 1. Annotations
+
+The following is a list of Annotations that are needed as Input for the Docker-Image and are returned as Output by the Docker-Image:
+- ### Input (Optional):
+ - `de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence` (If sentences are provided, tagging is performed per sentence. Otherwise, the whole document text is processed).
+- ### Output:
+ - `de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS`
+
+## 2. Included Models
+
+| Name | Source | Revision | Languages |
+|---------------|--------------------------------------------------------------------------------|------------------------------------------|-----------|
+| ancient-greek-pos-xlmr | https://huggingface.co/qbnguyen/ancient-greek-pos-xlmr | a297f1e9bffaa7831ce6f2f58d8f6f3a22948952 | Ancient Greek |
+
+
+# How To Use
+
+For using duui-pos-ancient-greek as a DUUI image it is necessary to use the [Docker Unified UIMA Interface (DUUI)](https://github.com/texttechnologylab/DockerUnifiedUIMAInterface).
+
+## Start Docker container
+
+```bash
+docker run --rm -p 9714:9714 duui-pos-ancient-greek:latest
+```
+
+*(Note: If deployed to the TTLab registry, replace `duui-pos-ancient-greek:latest` with `docker.texttechnologylab.org/duui-pos-ancient-greek:latest`)*
+
+## Run within DUUI
+
+```java
+composer.add(
+ new DUUIDockerDriver.Component("duui-pos-ancient-greek:latest")
+ .withScale(iWorkers)
+ .withImageFetching()
+ // Optional: specify a different HF model ID or local path
+ // .withParameter("model_name", "qbnguyen/ancient-greek-pos-xlmr")
+);
+```
+
+### Parameters
+
+| Name | Description |
+|--------------|------------------------------------|
+| `model_name` | Model to use. Default is `qbnguyen/ancient-greek-pos-xlmr` |
+
+
+# Cite
+
+If you want to use the DUUI image please cite it as follows:
+
+Alexander Leonhardt, Giuseppe Abrami, Daniel Baumartz and Alexander Mehler. (2023). "Unlocking the Heterogeneous Landscape of Big Data NLP with DUUI." Findings of the Association for Computational Linguistics: EMNLP 2023, 385–399. [[LINK](https://aclanthology.org/2023.findings-emnlp.29)] [[PDF](https://aclanthology.org/2023.findings-emnlp.29.pdf)]
+
+## BibTeX
+
+```bibtex
+@inproceedings{Leonhardt:et:al:2023,
+ title = {Unlocking the Heterogeneous Landscape of Big Data {NLP} with {DUUI}},
+ author = {Leonhardt, Alexander and Abrami, Giuseppe and Baumartz, Daniel and Mehler, Alexander},
+ editor = {Bouamor, Houda and Pino, Juan and Bali, Kalika},
+ booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2023},
+ year = {2023},
+ address = {Singapore},
+ publisher = {Association for Computational Linguistics},
+ url = {https://aclanthology.org/2023.findings-emnlp.29},
+ pages = {385--399},
+ pdf = {https://aclanthology.org/2023.findings-emnlp.29.pdf},
+ abstract = {Automatic analysis of large corpora is a complex task, especially
+ in terms of time efficiency. This complexity is increased by the
+ fact that flexible, extensible text analysis requires the continuous
+ integration of ever new tools. Since there are no adequate frameworks
+ for these purposes in the field of NLP, and especially in the
+ context of UIMA, that are not outdated or unusable for security
+ reasons, we present a new approach to address the latter task:
+ Docker Unified UIMA Interface (DUUI), a scalable, flexible, lightweight,
+ and feature-rich framework for automatic distributed analysis
+ of text corpora that leverages Big Data experience and virtualization
+ with Docker. We evaluate DUUI{'}s communication approach against
+ a state-of-the-art approach and demonstrate its outstanding behavior
+ in terms of time efficiency, enabling the analysis of big text
+ data.}
+}
+
+@misc{Nguyen:2026,
+ author = {Nguyen, Quoc-Bao},
+ title = {Ancient Greek POS Tagger as {DUUI} component},
+ year = {2026},
+ howpublished = {https://github.com/texttechnologylab/duui-uima/tree/main/duui-pos-ancient-greek}
+}
+```
\ No newline at end of file
diff --git a/duui-pos-ancient-greek/docker_build.sh b/duui-pos-ancient-greek/docker_build.sh
new file mode 100755
index 00000000..d1772975
--- /dev/null
+++ b/duui-pos-ancient-greek/docker_build.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+set -euo pipefail
+
+# Build from the directory containing this script, so the relative
+# Dockerfile path and build context resolve from any working directory.
+cd "$(dirname "${BASH_SOURCE[0]}")"
+
+DUUI_POS_AG_ANNOTATOR_NAME="duui-pos-ancient-greek"
+DUUI_POS_AG_ANNOTATOR_VERSION="0.1.0"
+readonly IMAGE_NAME="${DUUI_POS_AG_ANNOTATOR_NAME}"
+readonly IMAGE_TAG="${DUUI_POS_AG_ANNOTATOR_VERSION}"
+
+echo "============================================="
+echo "Building: ${IMAGE_NAME}:${IMAGE_TAG}"
+echo "============================================="
+docker build \
+  -t "${IMAGE_NAME}:${IMAGE_TAG}" \
+  -t "${IMAGE_NAME}:latest" \
+  -f src/main/docker/Dockerfile \
+  .
+echo ""
+echo "============================================="
+echo " Build complete"
+echo " Image: ${IMAGE_NAME}:${IMAGE_TAG}"
+echo ""
+echo "Run with:"
+echo " docker run -p 9714:9714 ${IMAGE_NAME}:${IMAGE_TAG}"
+echo "============================================="
\ No newline at end of file
diff --git a/duui-pos-ancient-greek/pom.xml b/duui-pos-ancient-greek/pom.xml
new file mode 100644
index 00000000..bec5766a
--- /dev/null
+++ b/duui-pos-ancient-greek/pom.xml
@@ -0,0 +1,105 @@
+
+
+ 4.0.0
+
+ org.hucompute.textimager.uima
+ duui_pos_ancient_greek
+ 0.1.0
+ jar
+
+ DUUI POS Ancient Greek
+
+ DUUI component for Ancient Greek POS tagging
+ using a fine-tuned XLM-RoBERTa model.
+
+
+
+ 17
+ 17
+ UTF-8
+ 5.10.2
+
+
+
+
+ jitpack.io
+ https://jitpack.io
+
+
+
+
+
+ com.github.texttechnologylab
+ DockerUnifiedUIMAInterface
+ 1.5.3
+
+
+
+ com.github.texttechnologylab
+ UIMATypeSystem
+ 02fb1a2f13
+
+
+
+ org.apache.uima
+ uimaj-core
+ 3.5.0
+
+
+
+ org.apache.uima
+ uimafit-core
+ 3.5.0
+
+
+
+ org.dkpro.core
+ dkpro-core-api-lexmorph-asl
+ 2.4.0
+
+
+
+ org.dkpro.core
+ dkpro-core-api-segmentation-asl
+ 2.4.0
+
+
+
+ org.junit.jupiter
+ junit-jupiter-api
+ ${junit.version}
+ test
+
+
+ org.junit.jupiter
+ junit-jupiter-engine
+ ${junit.version}
+ test
+
+
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-surefire-plugin
+ 3.2.5
+
+
+
+ org.apache.maven.plugins
+ maven-compiler-plugin
+ 3.12.1
+
+ 17
+ 17
+
+
+
+
+
+
\ No newline at end of file
diff --git a/duui-pos-ancient-greek/requirements.txt b/duui-pos-ancient-greek/requirements.txt
new file mode 100644
index 00000000..7174c16d
--- /dev/null
+++ b/duui-pos-ancient-greek/requirements.txt
@@ -0,0 +1,9 @@
+transformers>=4.40.0
+torch==2.6.0
+fastapi==0.115.0
+uvicorn[standard]==0.27.1
+pydantic-settings==2.0.2
+numpy>=1.26.0
+sentencepiece>=0.2.0
+protobuf>=5.0.0
+accelerate>=0.30.0
\ No newline at end of file
diff --git a/duui-pos-ancient-greek/src/main/docker/Dockerfile b/duui-pos-ancient-greek/src/main/docker/Dockerfile
new file mode 100644
index 00000000..c0afc32e
--- /dev/null
+++ b/duui-pos-ancient-greek/src/main/docker/Dockerfile
@@ -0,0 +1,115 @@
+# Builds the container image for the Ancient Greek POS tagger.
+# Uses a fine-tuned XLM-RoBERTa model (qbnguyen/ancient-greek-pos-xlmr)
+# to tag parts of speech in Ancient Greek text, served as a DUUI
+# annotator component.
+
+
+# BORROWED. The overall skeleton (WORKDIR, EXPOSE, pip pattern,
+# ARG/ENV pairs, uvicorn entrypoint) is lifted from existing
+# DUUI annotator Dockerfiles. Specifically I had these open while
+# writing this:
+# - duui-sentencizer-spacy (the spaCy one with all the model downloads)
+# - duui-flair-pos (the Flair POS tagger, for the ARG/ENV pattern)
+# - my own duui-ocr Dockerfile (for the HF model pre-download trick)
+# All from: https://github.com/texttechnologylab/DockerUnifiedUIMAInterface
+#
+# Last meaningful edit: Feb 2026
+
+
+# BORROWED. python:3.10-slim instead of the full 3.10 image that other
+# DUUI components normally use. ChatGPT suggested this when I complained
+# about image size. The "-slim" variant strips out a lot of system tooling
+# we don't need (man pages, extra locales, etc). I stuck with 3.10
+# specifically because that's what the other DUUI components usually run on.
+FROM python:3.10-slim
+
+WORKDIR /usr/src/app
+
+EXPOSE 9714
+
+# -- System dependencies --
+# FRAGILE. This block exists because python:3.10-slim doesn't include
+# a C compiler, and at least one of our pip dependencies (I think it's
+# a tokenizers thing? or maybe sentencepiece?) needs to compile from
+# source.
+#
+# I only found this out because `pip install` was failing with a
+# cryptic "error: command 'gcc' not found" buried in the output.
+# Asked ChatGPT what was going on and it explained that slim images
+# don't ship build tools.
+#
+# The --no-install-recommends flag and the rm -rf cleanup at the end
+# are from ChatGPT's suggestion to keep the image small. The idea is:
+# install gcc and friends, let pip use them to compile whatever it
+# needs, and the compiled .so files stay even though we could
+# theoretically remove build-essential afterward. I haven't bothered
+# with that cleanup because multi-stage builds seem like a rabbit hole
+# I don't need right now.
+RUN apt-get update && \
+ apt-get install -y --no-install-recommends build-essential && \
+ rm -rf /var/lib/apt/lists/*
+
+# -- Python dependencies --
+# SOLID. Same pattern as every other DUUI Dockerfile.
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+# -- Model pre-download --
+# BORROWED (the pattern) from my OCR Dockerfile, which I originally
+# got from Copilot. AI is eating itself ouroboros style I guess.
+#
+# Downloads the fine-tuned XLM-RoBERTa model for Ancient Greek POS
+# tagging at *build time* so the container doesn't need internet
+# access at runtime. This is the same trick the OCR Dockerfile uses
+# for PaddleOCR-VL and GLM-OCR.
+#
+# The model itself is my own, hosted on Hugging Face. It's XLM-R
+# fine-tuned on Ancient Greek POS data. I trained it myself and
+# uploaded it to Hugging Face instead of loading it from a local
+# path, which fits the DUUI framework so far.
+#
+# Note: unlike the OCR models which use AutoModelForImageTextToText,
+# this one uses AutoModelForTokenClassification because POS tagging
+# is a token classification task.
+RUN python -c "\
+from transformers import AutoTokenizer, AutoModelForTokenClassification; \
+AutoModelForTokenClassification.from_pretrained('qbnguyen/ancient-greek-pos-xlmr'); \
+AutoTokenizer.from_pretrained('qbnguyen/ancient-greek-pos-xlmr')"
+
+# -- Source files --
+# Just listing the source files, nothing more to comment.
+COPY src/main/python/duui_pos_ancient_greek.py .
+COPY src/main/python/duui_pos_ancient_greek.lua .
+COPY src/main/python/TypeSystemPOS.xml .
+
+# -- Configuration --
+# BORROWED. The ENV pattern is from every other DUUI Dockerfile.
+# The other Dockerfiles use the ARG/ENV pair pattern (ARG with a
+# default, then ENV=$ARG) so you can override at build time with
+# --build-arg. I simplified to just ENV here because I don't
+# actually need build-time overrides yet. I'm the only one building
+# this. If this ever goes into TTLab's pipeline I should
+# probably switch to the ARG/ENV pair pattern to match the others.
+#
+# REVISIT: should DUUI_POS_AG_MODEL_PATH be configurable? Right now
+# it's hardcoded to the one model. But if someone fine-tunes a better
+# Ancient Greek POS model later, being able to swap it via env var
+# without rebuilding the image would be nice. Leaving it as ENV for
+# that reason even though currently there's only one option.
+ENV DUUI_POS_AG_ANNOTATOR_NAME="duui-pos-ancient-greek"
+ENV DUUI_POS_AG_ANNOTATOR_VERSION="0.1.0"
+ENV DUUI_POS_AG_LOG_LEVEL="DEBUG"
+ENV DUUI_POS_AG_MODEL_PATH="qbnguyen/ancient-greek-pos-xlmr"
+
+# -- Startup --
+# BORROWED. Identical pattern to every other DUUI annotator.
+# uvicorn runs the FastAPI app on port 9714.
+#
+# Not setting --workers here (the other Dockerfiles put it in CMD).
+# I should probably add CMD ["--workers", "1"] to match the
+# convention, but it defaults to 1 anyway so it's not breaking
+# anything. In the OCR Dockerfile I made a note about multiple workers
+# each loading their own copy of the model into VRAM. Same concern
+# applies here, so 1 worker is correct for now.
+ENTRYPOINT ["uvicorn", "duui_pos_ancient_greek:app", \
+ "--host", "0.0.0.0", "--port", "9714"]
\ No newline at end of file
diff --git a/duui-pos-ancient-greek/src/main/python/TypeSystemPOS.xml b/duui-pos-ancient-greek/src/main/python/TypeSystemPOS.xml
new file mode 100644
index 00000000..2f064b08
--- /dev/null
+++ b/duui-pos-ancient-greek/src/main/python/TypeSystemPOS.xml
@@ -0,0 +1,41 @@
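+<?xml version="1.0" encoding="UTF-8"?>
+<typeSystemDescription xmlns="http://uima.apache.org/resourceSpecifier">
+    <name>TypeSystemPOS</name>
+    <description>
+        Type system for the Ancient Greek POS tagger DUUI component.
+        Defines POS annotation type. DocumentModification and AnnotationComment
+        are provided by the UIMATypeSystem dependency.
+    </description>
+    <version>0.1.0</version>
+    <vendor>TTLab / Goethe University Frankfurt</vendor>
+
+    <types>
+
+        <!-- POS annotation -->
+        <typeDescription>
+            <name>de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS</name>
+            <description>Part-of-speech tag annotation</description>
+            <supertypeName>uima.tcas.Annotation</supertypeName>
+            <features>
+                <featureDescription>
+                    <name>PosValue</name>
+                    <description>Fine-grained POS tag value</description>
+                    <rangeTypeName>uima.cas.String</rangeTypeName>
+                </featureDescription>
+                <featureDescription>
+                    <name>coarseValue</name>
+                    <description>Coarse-grained POS tag value (UPOS)</description>
+                    <rangeTypeName>uima.cas.String</rangeTypeName>
+                </featureDescription>
+            </features>
+        </typeDescription>
+
+        <!-- Sentence annotation -->
+        <typeDescription>
+            <name>de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence</name>
+            <description>Sentence annotation</description>
+            <supertypeName>uima.tcas.Annotation</supertypeName>
+        </typeDescription>
+
+    </types>
+</typeSystemDescription>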
\ No newline at end of file
diff --git a/duui-pos-ancient-greek/src/main/python/duui_pos_ancient_greek.lua b/duui-pos-ancient-greek/src/main/python/duui_pos_ancient_greek.lua
new file mode 100644
index 00000000..a2f378a8
--- /dev/null
+++ b/duui-pos-ancient-greek/src/main/python/duui_pos_ancient_greek.lua
@@ -0,0 +1,163 @@
+--[[
+I struggled with this Lua file just like with the other Lua files. I don't
+really know Lua, and the `luajava` bridge feels like dark magic. GitHub Copilot
+was actively harmful here. It kept hallucinating Lua/Java syntax that doesn't
+actually work. I ended up relying entirely on ChatGPT for debugging and pieced
+this together by studying the existing DUUI components (especially the Flair
+POS tagger and the Emotion annotator).
+
+Last meaningful edit: Feb 2026
+]]
+
+-- Java class bindings --
+-- BORROWED. Standard boilerplate from literally every DUUI script.
+StandardCharsets = luajava.bindClass("java.nio.charset.StandardCharsets")
+
+-- FRAGILE. I wasted an hour trying to `require("json")` because ChatGPT
+-- told me to, which broke the whole pipeline. Turns out DUUI injects `json`
+-- as a global variable at runtime.
+
+-- SERIALIZE: CAS → JSON request --
+function serialize(inputCas, outputStream, parameters)
+    -- 1. Extract document text ("" when the CAS has none) and language
+    local doc_text = inputCas:getDocumentText() or ""
+    local doc_lang = inputCas:getDocumentLanguage()
+
+    -- I force default to "grc" (Greek) if unspecified, because
+    -- sometimes the upstream reader drops the language tag before
+    -- the text reaches this component.
+    if doc_lang == nil or doc_lang == "x-unspecified" then
+        doc_lang = "grc"
+    end
+    local doc_len = #doc_text
+
+    -- 2. Extract model_name from parameters
+    local model_name = nil
+    if parameters ~= nil and parameters["model_name"] ~= nil then
+        model_name = parameters["model_name"]
+    end
+
+    -- 3. Extract existing Sentence annotations
+    --
+    -- SOLID / CHATGPT. This chunk took three iterations. Originally, I copied
+    -- the `JCasUtil:select(inputCas, Sentence):iterator()` pattern from the
+    -- Flair POS script. But it threw a massive Java 17 "InaccessibleObjectException"
+    -- about ArrayList iterators.
+    --
+    -- ChatGPT explained that Java 17 blocks reflection on certain native Java
+    -- classes, and suggested using UIMA's native index instead of JCasUtil
+    -- to bypass the security block. I don't fully grasp UIMA's index internals,
+    -- but this approach doesn't crash.
+    local sentences = {}
+    local sent_counter = 1
+    local has_sentences = false
+    local sentence_type = "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence"
+
+    local uimaType = inputCas:getTypeSystem():getType(sentence_type)
+    if uimaType ~= nil then
+        local sent_index = inputCas:getAnnotationIndex(uimaType)
+        if sent_index ~= nil then
+            local it = sent_index:iterator()
+            while it:hasNext() do
+                local sent = it:next()
+                sentences[sent_counter] = {
+                    begin = sent:getBegin(),
+                    ["end"] = sent:getEnd(),
+                    text = sent:getCoveredText()
+                }
+                sent_counter = sent_counter + 1
+                has_sentences = true
+            end
+        end
+    end
+
+    -- 4. Build JSON request
+    local request = {
+        doc_text = doc_text,
+        doc_len = doc_len,
+        lang = doc_lang,
+        model_name = model_name
+    }
+
+    if has_sentences then
+        request.sentences = sentences
+    end
+
+    -- 5. Write to output stream
+    outputStream:write(json.encode(request))
+end
+
+-- DESERIALIZE: JSON response → CAS annotations --
+function deserialize(inputCas, inputStream)
+ -- 1. Read and parse the JSON response
+ local javaString = luajava.newInstance("java.lang.String", inputStream:readAllBytes(), StandardCharsets.UTF_8)
+
+ -- CHATGPT (CRITICAL FIX). The java.lang.String returned above looks
+ -- like a string to Lua, but it's actually a Java object reference.
+ -- `json.decode` was failing silently and returning nil. ChatGPT caught
+ -- this typing mismatch. You *must* cast it to a native Lua string.
+ local inputString = tostring(javaString)
+ local response = json.decode(inputString)
+
+ if response == nil then
+ print("LUA ERROR: json.decode returned nil. Cannot parse response.")
+ return
+ end
+
+ -- DEBUG PRINT. I added this because I kept getting silent failures
+ -- when the Python inference server crashed. This forces Python errors
+ -- into the TextImager Java logs.
+ if response["errors"] ~= nil and #response["errors"] > 0 then
+ for _, err in ipairs(response["errors"]) do
+ print("PYTHON API ERROR: " .. tostring(err))
+ end
+ end
+
+ -- 2. Get type references
+ local pos_type = "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS"
+
+ -- 3. Create POS annotations for each token
+ -- BORROWED. The instantiation and `.addToIndexes()` pattern is lifted
+ -- almost verbatim from the DUUI Flair POS script.
+ if response["tokens"] ~= nil then
+ for _, token in ipairs(response["tokens"]) do
+ local pos = luajava.newInstance(pos_type, inputCas)
+ pos:setBegin(token["begin"])
+ pos:setEnd(token["end"])
+ pos:setPosValue(token["pos_value"])
+
+ -- Unlike Flair, I'm setting the coarse value too since my Python
+ -- script returns it.
+ pos:setCoarseValue(token["pos_coarse_value"])
+ pos:addToIndexes()
+ end
+ else
+ print("LUA WARNING: 'tokens' array is nil or missing in the response.")
+ end
+
+ -- 4. Create MetaData annotation
+ -- BORROWED. I took this `DocumentModification` block from the Emotion
+ -- and spaCy sentencizer scripts. It leaves an audit trail in the CAS
+ -- so that my tags show up properly with a timestamp and model version
+ -- in the TextImager UI.
+ local meta_type = "org.texttechnologylab.annotation.DocumentModification"
+ local meta = luajava.newInstance(meta_type, inputCas)
+ meta:setUser(response["model_name"] or "duui-pos-ancient-greek")
+ meta:setTimestamp(os.time())
+ meta:setComment(
+ "POS tagging by " .. (response["model_name"] or "unknown")
+ .. " v" .. (response["model_version"] or "0.1.0")
+ )
+ meta:addToIndexes()
+
+ -- 5. Create AnnotationComment for any errors
+ if response["errors"] ~= nil and #response["errors"] > 0 then
+ local comment_type = "org.texttechnologylab.annotation.AnnotationComment"
+ for _, err in ipairs(response["errors"]) do
+ local comment = luajava.newInstance(comment_type, inputCas)
+ comment:setKey("error")
+ comment:setValue(err)
+ comment:addToIndexes()
+ end
+ end
+end
\ No newline at end of file
diff --git a/duui-pos-ancient-greek/src/main/python/duui_pos_ancient_greek.py b/duui-pos-ancient-greek/src/main/python/duui_pos_ancient_greek.py
new file mode 100644
index 00000000..776bef3f
--- /dev/null
+++ b/duui-pos-ancient-greek/src/main/python/duui_pos_ancient_greek.py
@@ -0,0 +1,437 @@
+"""
+POS tagger for Ancient Greek, built as a DUUI component for the
+TextImager pipeline. Uses a fine-tuned XLM-RoBERTa model (trained
+on the UD Perseus treebank) to tag tokens with Universal POS labels.
+
+I did not write this from nothing. The DUUI boilerplate: the endpoint
+structure, the Lua communication layer, the typesystem handshake,
+is borrowed heavily from the existing TTLab components, especially
+the Flair POS tagger and the spaCy sentencizer. I studied those to
+understand how DUUI components are supposed to be wired up, then
+adapted the skeleton for my own model.
+
+The actual inference logic (tokenisation, subword-to-word alignment)
+was written with a lot of help from GitHub Copilot and several rounds
+of asking ChatGPT "why is word_ids() returning None for special tokens."
+
+Last meaningful edit: Feb 2026
+"""
+
+import logging
+import os
+import re
+from functools import lru_cache
+from pathlib import Path
+from typing import Optional
+
+import torch
+import uvicorn
+from fastapi import FastAPI
+from fastapi.responses import PlainTextResponse
+from pydantic import BaseModel
+from transformers import AutoModelForTokenClassification, AutoTokenizer
+
+# -- Config --
+# BORROWED. This env-var-based config pattern is straight from the
+# DUUI emotion and sentiment components.
+# I liked it better than the pydantic BaseSettings approach used in
+# the spaCy sentencizer, mostly because I don't fully understand how
+# pydantic settings auto-loads from env vars and I didn't want to
+# debug that on top of everything else.
+
+ANNOTATOR_NAME = os.environ.get(
+ "DUUI_POS_AG_ANNOTATOR_NAME", "duui-pos-ancient-greek"
+)
+ANNOTATOR_VERSION = os.environ.get(
+ "DUUI_POS_AG_ANNOTATOR_VERSION", "0.1.0"
+)
+LOG_LEVEL = os.environ.get("DUUI_POS_AG_LOG_LEVEL", "DEBUG")
+MODEL_PATH = os.environ.get(
+ "DUUI_POS_AG_MODEL_PATH", "qbnguyen/ancient-greek-pos-xlmr"
+)
+
+COMPONENT_ROOT = Path(__file__).parent
+
+logging.basicConfig(level=getattr(logging, LOG_LEVEL))
+logger = logging.getLogger(__name__)
+
+# BORROWED. The Flair POS component and the spaCy sentencizer both have
+# this. I assume it's for performance but honestly I just copied the
+# pattern because it seemed like the right thing to do.
+_TYPESYSTEM_XML = (COMPONENT_ROOT / "TypeSystemPOS.xml").read_text("utf-8")
+_LUA_SCRIPT = (
+ (COMPONENT_ROOT / "duui_pos_ancient_greek.lua").read_text("utf-8")
+)
+
+# Punctuation pattern for the tokenizer.
+# FRAGILE. I assembled this character class myself by looking at what
+# shows up in my Ancient Greek test corpus. The middle dot (·) and the
+# Greek question mark (;) are the ones that kept tripping me up.
+# There are probably more punctuation marks in Unicode Greek ranges
+# that I'm missing. If tokens start looking wrong, check here first.
+_PUNCT = r"""[,.:;!?·;()\[\]«»\u201c\u201d\u2018\u2019]+"""
+
+# -- Schemas --
+# BORROWED. The request/response schema pattern comes from the DUUI
+# components. The Flair tagger uses DkproSentence / DkproPos, the
+# emotion component uses UimaSentence, etc. I renamed things to match
+# what my component actually does but the shape is the same.
+#
+# I asked ChatGPT: "what is the difference between a Pydantic BaseModel
+# and a regular dataclass" and the answer was helpful enough that I
+# stopped worrying and just used BaseModel like everyone else.
+
+
+class Sentence(BaseModel):
+ begin: int
+ end: int
+ text: str
+
+
+class PosRequest(BaseModel):
+ doc_text: str
+ doc_len: int
+ lang: str = "grc"
+ model_name: Optional[str] = None
+ sentences: Optional[list[Sentence]] = None
+
+
+class TokenPOS(BaseModel):
+ begin: int
+ end: int
+ pos_value: str
+ pos_coarse_value: str
+
+
+class PosResponse(BaseModel):
+ tokens: list[TokenPOS]
+ model_name: str
+ model_version: str
+ model_source: str
+ model_lang: str
+ errors: list[str]
+
+
+# -- Model loading --
+
+# BORROWED. The lru_cache trick for model loading appears in every
+# single DUUI component I looked at. The Flair tagger has a
+# configurable cache size, the emotion component uses a lock + cache
+# combo. I went with the simplest version: cache one model, no lock.
+#
+# REVISIT. The emotion and sentiment components use a threading Lock
+# around model loading/inference. I'm not doing that because I only
+# run one worker (see uvicorn config at the bottom), but if I ever
+# scale this up I'll need to add locking. I only half-understand why
+# concurrent access to a pytorch model is dangerous.
+
+@lru_cache(maxsize=1)
+def load_model(model_path: str):
+ logger.info("Loading model from %s", model_path)
+ tokenizer = AutoTokenizer.from_pretrained(model_path)
+ model = AutoModelForTokenClassification.from_pretrained(model_path)
+ model.eval()
+ logger.info("Model loaded successfully on CPU")
+ return model, tokenizer
+
+
+# -- Tokenisation
+
+# SOLID (mostly). I wrote this function myself. It's the part I
+# understand best because it's basically text processing, which is
+# closer to my wheelhouse than the ML inference stuff.
+#
+# The idea: split on whitespace, then peel off leading and trailing
+# punctuation as separate tokens. I need character offsets because
+# DUUI maps annotations back onto the original document by position.
+#
+# I went through about four versions of this. First attempt used
+# spaCy's tokenizer but it was overkill and slow for Ancient Greek.
+# Second attempt was a naive whitespace split that broke on «εἶπεν»
+# because the guillemets stayed glued to the word. Current version
+# handles that.
+#
+# The _append helper inside the function is a pattern Copilot
+# suggested when I kept repeating the dict construction. I would
+# have just written it out each time, but this is cleaner.
+
+def tokenize_raw_text(text: str) -> list[dict]:
+ """Split *text* into word tokens with character offsets,
+ separating leading/trailing punctuation."""
+ tokens: list[dict] = []
+
+ def _append(form: str, start: int):
+ tokens.append({"form": form, "begin": start, "end": start + len(form)})
+
+ for m in re.finditer(r"\S+", text):
+ word, ws = m.group(), m.start()
+
+ # peel off leading punctuation — «, (, [, etc.
+ lead = re.match(f"^({_PUNCT})", word)
+ if lead:
+ _append(lead.group(1), ws)
+ ws += lead.end()
+ word = word[lead.end() :]
+ if not word:
+ continue
+
+ # peel off trailing punctuation — same idea, from the right
+ trail = re.search(f"({_PUNCT})$", word)
+ trail_tok = None
+ if trail:
+ trail_tok = (trail.group(1), ws + trail.start())
+ word = word[: trail.start()]
+
+ if word:
+ _append(word, ws)
+ if trail_tok:
+ _append(*trail_tok)
+
+ return tokens
+
+
+# -- POS inference --
+
+# COPILOT wrote the first draft of this function. My prompt was
+# roughly: "given a list of pre-tokenized words, run them through
+# a HuggingFace token classification model and map subword predictions
+# back to the original words using word_ids()"
+#
+# I then rewrote parts of it after spending a long time reading:
+# https://huggingface.co/docs/transformers/tasks/token_classification
+# and this Stack Overflow answer about word_ids() alignment:
+# https://stackoverflow.com/a/75903065
+#
+# The key thing I learned (from ChatGPT, after staring at wrong output
+# for two hours): when you pass is_split_into_words=True, the tokenizer
+# may split a single word into multiple subword tokens. word_ids()
+# tells you which original word each subword belongs to. We only want
+# the prediction for the *first* subword of each word. That's what
+# the `seen` set is for. I understand this now but I would not have
+# figured it out without help.
+#
+# FRAGILE. The max_length=256 truncation means very long sentences
+# will lose tokens at the end silently. My corpus doesn't have
+# sentences that long, but if yours does, raise this. I don't know
+# what the actual max is for XLM-RoBERTa.
+
+def predict_pos(
+ text: str, offset: int, model, tokenizer
+) -> list[TokenPOS]:
+ if not text or not text.strip():
+ return []
+
+ word_tokens = tokenize_raw_text(text)
+ if not word_tokens:
+ return []
+
+ words = [t["form"] for t in word_tokens]
+
+ # Tokenize with the model's subword tokenizer.
+ # is_split_into_words=True tells it we already split on whitespace.
+ encoding = tokenizer(
+ words,
+ is_split_into_words=True,
+ truncation=True,
+ max_length=256,
+ return_tensors="pt",
+ )
+
+ # Run inference: no gradient computation needed, we're just predicting
+ with torch.no_grad():
+ logits = model(**encoding).logits
+
+ # argmax gives us the most likely label index for each subword token
+ preds = torch.argmax(logits, dim=-1)[0].tolist()
+ word_ids = encoding.word_ids()
+ id2label = model.config.id2label
+
+ # Map subword predictions back to our original word tokens.
+ # We only take the first subword's prediction for each word.
+ # COPILOT. This loop structure is mostly Copilot's. I added the
+ # offset arithmetic to make the character positions absolute
+ # (relative to the full document, not just this sentence).
+ results: list[TokenPOS] = []
+ seen: set[int] = set()
+ for sw_idx, wid in enumerate(word_ids):
+ if wid is None or wid in seen:
+ continue
+ seen.add(wid)
+ tok = word_tokens[wid]
+ label = id2label[preds[sw_idx]]
+ results.append(
+ TokenPOS(
+ begin=tok["begin"] + offset,
+ end=tok["end"] + offset,
+ pos_value=label,
+ # I'm setting coarse and fine to the same value because the
+ # model only outputs Universal POS tags. The DUUI type system
+ # expects both fields. The Flair POS component leaves
+ # coarse_value empty (""), but I figured identical values
+ # are more informative than blank.
+ pos_coarse_value=label,
+ )
+ )
+ return results
+
+
+# -- Helpers --
+
+# SOLID. Just bundles the response. Nothing clever happening here.
+def _make_response(
+ tokens: list[TokenPOS],
+ model_path: str,
+ errors: list[str],
+) -> PosResponse:
+ return PosResponse(
+ tokens=tokens,
+ model_name=model_path,
+ model_version=ANNOTATOR_VERSION,
+ model_source=model_path,
+ model_lang="grc",
+ errors=errors,
+ )
+
+
+# -- FastAPI --
+# BORROWED. The endpoint structure is required by the DUUI protocol.
+# Every DUUI component follows this pattern. I copied the skeleton from
+# the Flair POS tagger and the spaCy sentencizer, then filled in my
+# own details.
+
+app = FastAPI(
+ title=ANNOTATOR_NAME,
+ version=ANNOTATOR_VERSION,
+ description="DUUI component for Ancient Greek POS tagging",
+)
+
+
+# Returns the UIMA type system XML.
+# The Flair component returns this with media_type="application/xml",
+# but I'm using PlainTextResponse like the simpler components do.
+@app.get("/v1/typesystem", response_class=PlainTextResponse)
+def get_typesystem():
+ return _TYPESYSTEM_XML
+
+
+@app.get("/v1/communication_layer", response_class=PlainTextResponse)
+def get_communication_layer():
+ return _LUA_SCRIPT
+
+
+# BORROWED. The documentation endpoint structure is adapted from the
+# Flair POS component. The Flair version has a proper TextImagerDocumentation
+# Pydantic model with a capabilities field. The spaCy version does too.
+# I simplified mine to a plain dict because the emotion component by
+# Bagci literally just returns the string "Test" for this endpoint and
+# apparently that's fine? So I figured a real dict is already an
+# improvement.
+#
+# REVISIT. Should probably add a TextImagerCapability model like the
+# Flair and spaCy components do. Right now this is just a dict.
+@app.get("/v1/documentation")
+def get_documentation():
+ return {
+ "annotator_name": ANNOTATOR_NAME,
+ "version": ANNOTATOR_VERSION,
+ "implementation_lang": "Python",
+ "meta": {
+ "description": (
+ "Part-of-Speech tagger for Ancient Greek using a "
+ "fine-tuned XLM-RoBERTa model on UD Perseus treebank."
+ ),
+ "language": "grc",
+ "model": "xlm-roberta-base (fine-tuned)",
+ "training_data": "UD_Ancient_Greek-Perseus",
+ "tagset": "Universal POS (17 tags)",
+ },
+ "parameters": {
+ "model_name": {
+ "type": "string",
+ "description": "Path or HF Hub ID for the model",
+ "default": MODEL_PATH,
+ }
+ },
+ }
+
+
+# BORROWED. From the Flair POS component. Maps DUUI input/output types
+# so the Java pipeline knows what annotations this component reads and
+# produces.
+@app.get("/v1/details/input_output")
+def get_input_output():
+ return {
+ "inputs": [
+ "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence"
+ ],
+ "outputs": [
+ "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS"
+ ],
+ }
+
+
+# BORROWED. The overall structure (try to load model, iterate over
+# sentences, collect results, catch exceptions into an error list)
+# is modelled on the emotion and sentiment components.
+# Those components process "selections" of sentences; mine is simpler
+# because I only handle one selection type (sentences).
+#
+# The fallback path (when no sentences are provided) splits on newlines.
+# I added this because during testing I kept sending raw text without
+# pre-segmented sentences and getting empty results back. Took me
+# embarrassingly long to realise the sentence list was just empty.
+#
+# NOTE. The newline fallback uses `cur += len(line) + 1` to track
+# character offsets. The +1 is for the "\n" that split() removed. \r\n
+# input is fine too: the "\r" stays at the end of `line`, so len(line)
+# counts it and the regex tokenizer skips it as whitespace.
+@app.post("/v1/process", response_model=PosResponse)
+def process(request: PosRequest):
+ model_path = request.model_name or MODEL_PATH
+
+ try:
+ model, tokenizer = load_model(model_path)
+ except Exception as e:
+ logger.error("Failed to load model: %s", e)
+ return _make_response([], model_path, [f"Model load error: {e}"])
+
+ all_tokens: list[TokenPOS] = []
+ errors: list[str] = []
+
+ try:
+ if request.sentences:
+ for sent in request.sentences:
+ all_tokens.extend(
+ predict_pos(sent.text, sent.begin, model, tokenizer)
+ )
+ else:
+ # No pre-segmented sentences, fall back to line-by-line.
+ # Not ideal but better than returning nothing.
+ cur = 0
+ for line in request.doc_text.split("\n"):
+ if line.strip():
+ all_tokens.extend(
+ predict_pos(line, cur, model, tokenizer)
+ )
+ cur += len(line) + 1 # +1 for the newline character
+ except Exception as e:
+ logger.error("Inference error: %s", e, exc_info=True)
+ errors.append(f"Inference error: {e}")
+
+ return _make_response(all_tokens, model_path, errors)
+
+
+# -- Entry point --
+# SOLID. Standard uvicorn startup. workers=1 because I don't want to
+# deal with concurrent model access. Port 9714 was chosen arbitrarily.
+# The other DUUI components each seem to pick their own port and I just
+# made sure mine didn't collide with any of the ones I saw in their
+# docker-compose files.
+
+if __name__ == "__main__":
+ uvicorn.run(
+ "duui_pos_ancient_greek:app",
+ host="0.0.0.0",
+ port=9714,
+ workers=1,
+ )
\ No newline at end of file
diff --git a/duui-pos-ancient-greek/src/test/java/org/hucompute/textimager/uima/pos/AncientGreekPOSTest.java b/duui-pos-ancient-greek/src/test/java/org/hucompute/textimager/uima/pos/AncientGreekPOSTest.java
new file mode 100644
index 00000000..92518b2e
--- /dev/null
+++ b/duui-pos-ancient-greek/src/test/java/org/hucompute/textimager/uima/pos/AncientGreekPOSTest.java
@@ -0,0 +1,218 @@
+package org.hucompute.textimager.uima.pos;
+
+import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS;
+import org.apache.uima.fit.factory.JCasFactory;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.junit.jupiter.api.*;
+import org.texttechnologylab.DockerUnifiedUIMAInterface.DUUIComposer;
+import org.texttechnologylab.DockerUnifiedUIMAInterface.driver.DUUIRemoteDriver;
+import org.texttechnologylab.DockerUnifiedUIMAInterface.lua.DUUILuaContext;
+
+import java.util.Collection;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+/**
+ * Integration tests for the Ancient Greek POS tagger DUUI component.
+ */
+@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
+class AncientGreekPOSTest {
+
+ private static DUUIComposer composer;
+ private static final String ENDPOINT = "http://localhost:9714";
+
+ @BeforeAll
+ static void setUp() throws Exception {
+ // Initialize DUUI composer with a remote driver
+ DUUILuaContext ctx = new DUUILuaContext().withJsonLibrary();
+
+ composer = new DUUIComposer()
+ .withLuaContext(ctx)
+ .withSkipVerification(true);
+
+ DUUIRemoteDriver remoteDriver = new DUUIRemoteDriver();
+ composer.addDriver(remoteDriver);
+
+ // Add the remote POS component
+ composer.add(new DUUIRemoteDriver.Component(ENDPOINT));
+
+ System.out.println("DUUI Composer initialized, endpoint: " + ENDPOINT);
+ }
+
+ @AfterAll
+ static void tearDown() throws Exception {
+ if (composer != null) {
+ composer.shutdown();
+ }
+ }
+
+    /**
+     * Test 1: Simple single-line Ancient Greek sentence.
+     * Verifies POS annotations are created and cover known words.
+     */
+    @Test
+    @Order(1)
+    @DisplayName("Test simple Iliad opening line")
+    void testSimpleSentence() throws Exception {
+        String text = "Μῆνιν ἄειδε θεὰ Πηληϊάδεω Ἀχιλῆος";
+
+        JCas jCas = JCasFactory.createJCas();
+        jCas.setDocumentText(text);
+        jCas.setDocumentLanguage("grc");
+
+        // Run the pipeline
+        composer.run(jCas);
+
+        // Collect POS annotations (typed, so the for-each loops below compile)
+        Collection<POS> posAnnotations = JCasUtil.select(jCas, POS.class);
+
+        System.out.println("\n--- Test 1: Simple Sentence ---");
+        System.out.println("Text: " + text);
+        System.out.println("POS annotations found: " + posAnnotations.size());
+
+        for (POS pos : posAnnotations) {
+            String word = pos.getCoveredText();
+            System.out.printf(" [%d:%d] %-20s → %s%n",
+                    pos.getBegin(), pos.getEnd(), word, pos.getPosValue());
+        }
+
+        // Assertions
+        assertFalse(posAnnotations.isEmpty(),
+                "Should have at least one POS annotation");
+        assertTrue(posAnnotations.size() >= 5,
+                "Expected at least 5 tokens, got " + posAnnotations.size());
+
+        // Verify every annotation has a valid POS value
+        for (POS pos : posAnnotations) {
+            assertNotNull(pos.getPosValue(),
+                    "POS value should not be null for: " + pos.getCoveredText());
+            assertFalse(pos.getPosValue().isEmpty(),
+                    "POS value should not be empty for: " + pos.getCoveredText());
+            assertTrue(pos.getBegin() >= 0, "Begin offset should be >= 0");
+            assertTrue(pos.getEnd() <= text.length(),
+                    "End offset should be <= text length");
+            assertTrue(pos.getBegin() < pos.getEnd(),
+                    "Begin should be < End");
+        }
+    }
+
+    /**
+     * Test 2: Multi-line passage from the Iliad.
+     * Verifies annotations cover the entire document.
+     */
+    @Test
+    @Order(2)
+    @DisplayName("Test multi-line Iliad passage")
+    void testMultiLineSentence() throws Exception {
+        String text =
+                "οὐλομένην, ἣ μυρί' Ἀχαιοῖς ἄλγε' ἔθηκε\n" +
+                "πολλὰς δ' ἰφθίμους ψυχὰς Ἄϊδι προΐαψεν ,\n" +
+                "ἡρώων, αὐτοὺς δὲ ἑλώρια τεῦχε κύνεσσιν";
+
+        JCas jCas = JCasFactory.createJCas();
+        jCas.setDocumentText(text);
+        jCas.setDocumentLanguage("grc");
+
+        composer.run(jCas);
+
+        Collection<POS> posAnnotations = JCasUtil.select(jCas, POS.class);
+
+        System.out.println("\n--- Test 2: Multi-line Passage ---");
+        System.out.println("Text length: " + text.length());
+        System.out.println("POS annotations found: " + posAnnotations.size());
+
+        for (POS pos : posAnnotations) {
+            String word = pos.getCoveredText();
+            System.out.printf(" [%d:%d] %-20s → %s%n",
+                    pos.getBegin(), pos.getEnd(), word, pos.getPosValue());
+        }
+
+        // Should have tokens from all three lines
+        assertTrue(posAnnotations.size() >= 15,
+                "Expected at least 15 tokens across 3 lines, got "
+                        + posAnnotations.size());
+
+        // Verify last annotation's end offset is within the text
+        POS lastPos = null;
+        for (POS pos : posAnnotations) {
+            lastPos = pos;
+        }
+        assertNotNull(lastPos);
+        assertTrue(lastPos.getEnd() <= text.length(),
+                "Last token's end should be within text bounds");
+
+        // Verify covered text matches the document
+        for (POS pos : posAnnotations) {
+            String covered = text.substring(pos.getBegin(), pos.getEnd());
+            assertEquals(covered, pos.getCoveredText(),
+                    "Covered text mismatch at offset " + pos.getBegin());
+        }
+    }
+
+    /**
+     * Test 3: Empty text input.
+     * Verifies the component handles it gracefully without crashing.
+     */
+    @Test
+    @Order(3)
+    @DisplayName("Test empty text handling")
+    void testEmptyText() throws Exception {
+        JCas jCas = JCasFactory.createJCas();
+        jCas.setDocumentText("");
+        jCas.setDocumentLanguage("grc");
+
+        // Should not throw
+        assertDoesNotThrow(() -> composer.run(jCas));
+
+        Collection<POS> posAnnotations = JCasUtil.select(jCas, POS.class);
+
+        System.out.println("\n--- Test 3: Empty Text ---");
+        System.out.println("POS annotations found: " + posAnnotations.size());
+
+        assertEquals(0, posAnnotations.size(),
+                "Empty text should produce no POS annotations");
+    }
+
+    /**
+     * Test 4: Verify specific POS tag for a known word.
+     * "Μῆνιν" (wrath, accusative) should be tagged as NOUN.
+     */
+    @Test
+    @Order(4)
+    @DisplayName("Test known word POS prediction")
+    void testKnownWordTag() throws Exception {
+        String text = "Μῆνιν ἄειδε θεά";
+
+        JCas jCas = JCasFactory.createJCas();
+        jCas.setDocumentText(text);
+        jCas.setDocumentLanguage("grc");
+
+        composer.run(jCas);
+
+        Collection<POS> posAnnotations = JCasUtil.select(jCas, POS.class);
+
+        System.out.println("\n--- Test 4: Known Word POS ---");
+        for (POS pos : posAnnotations) {
+            System.out.printf(" %-20s → %s%n",
+                    pos.getCoveredText(), pos.getPosValue());
+        }
+
+        // Find the first token (should be Μῆνιν)
+        POS firstToken = posAnnotations.iterator().next();
+        assertEquals("Μῆνιν", firstToken.getCoveredText());
+        assertEquals("NOUN", firstToken.getPosValue(),
+                "Μῆνιν (wrath/acc) should be tagged as NOUN");
+
+        // Verify ἄειδε is VERB
+        boolean foundVerb = false;
+        for (POS pos : posAnnotations) {
+            if ("ἄειδε".equals(pos.getCoveredText())) {
+                assertEquals("VERB", pos.getPosValue(),
+                        "ἄειδε (sing!) should be tagged as VERB");
+                foundVerb = true;
+            }
+        }
+        assertTrue(foundVerb, "Should find ἄειδε in annotations");
+    }
+}
\ No newline at end of file
diff --git a/duui-transformers-Emotion/Readme.md b/duui-transformers-Emotion/Readme.md
index 7655adee..c1de6021 100644
--- a/duui-transformers-Emotion/Readme.md
+++ b/duui-transformers-Emotion/Readme.md
@@ -53,6 +53,7 @@ DUUI implementation for selected Hugging-Face-based transformer [Emotion tools](
| universal-joy-pt-small | https://github.com/sotlampr/universal-joy | 6ab01e98c8106e610247e5e8f0712af08c007b67 | PT |
| universal-joy-tl-small | https://github.com/sotlampr/universal-joy | 6ab01e98c8106e610247e5e8f0712af08c007b67 | TL |
| universal-joy-zh-small | https://github.com/sotlampr/universal-joy | 6ab01e98c8106e610247e5e8f0712af08c007b67 | ZH |
+| phobert-emotion | https://huggingface.co/visolex/phobert-emotion | 90460fb946cf640ef9c56ae484cabb49d48ef14e | VI |
# How To Use
diff --git a/duui-transformers-Emotion/docker_build.sh b/duui-transformers-Emotion/docker_build.sh
old mode 100644
new mode 100755
index 89229e33..777aac30
--- a/duui-transformers-Emotion/docker_build.sh
+++ b/duui-transformers-Emotion/docker_build.sh
@@ -377,6 +377,13 @@ export MODEL_LANG="DE"
#export MODEL_LANG="ZH"
####--------------------------------------------------------------------
+####---------------------------------------------------------------------
+export MODEL_NAME="visolex/phobert-emotion"
+export MODEL_SPECNAME="phobert-emotion"
+export MODEL_VERSION="90460fb946cf640ef9c56ae484cabb49d48ef14e"
+export MODEL_SOURCE="https://huggingface.co/visolex/phobert-emotion"
+export MODEL_LANG="VI"
+####--------------------------------------------------------------------
docker build \
--build-arg ANNOTATOR_NAME \
diff --git a/duui-transformers-Emotion/pom.xml b/duui-transformers-Emotion/pom.xml
index 4cfec8fc..a265f77a 100644
--- a/duui-transformers-Emotion/pom.xml
+++ b/duui-transformers-Emotion/pom.xml
@@ -99,9 +99,9 @@
${ttlab.duui.version}
-->
- com.github.mevbagci
+ com.github.texttechnologylabDockerUnifiedUIMAInterface
- 1.4.9
+ 1.5.3
@@ -110,9 +110,9 @@
- com.github.mevbagci
+ com.github.texttechnologylabUIMATypeSystem
- 3.0.13
+ 02fb1a2f13
diff --git a/duui-transformers-Emotion/src/main/docker/Dockerfile b/duui-transformers-Emotion/src/main/docker/Dockerfile
index aa9f53c9..77acfd92 100644
--- a/duui-transformers-Emotion/src/main/docker/Dockerfile
+++ b/duui-transformers-Emotion/src/main/docker/Dockerfile
@@ -44,7 +44,8 @@ RUN python -c "from transformers import pipeline; pipeline('text-classification'
#RUN python -c "from transformers import pipeline; pipeline('text-classification', model='alex-shvets/roberta-large-emopillars-contextual-emocontext')"
#RUN python -c "from transformers import pipeline; pipeline('text-classification', model='AdapterHub/bert-base-uncased-pf-emo')"
#RUN python -c "from pytorch_transformers import (BertTokenizer, BertModel, BertConfig,); BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False); BertModel.from_pretrained('bert-base-multilingual-cased')"
-
+RUN python -c "from transformers import pipeline; pipeline('text-classification', \
+ model='visolex/phobert-emotion')"
# copy scripts
COPY ./src/main/python/TypeSystemEmotion.xml ./TypeSystemEmotion.xml
@@ -101,4 +102,4 @@ ENV TRANSFORMERS_OFFLINE=$DUUI_TRANSFORMERS_TRANSFORMERS_OFFLINE
ENTRYPOINT ["uvicorn", "duui_transformers_emotion:app", "--host", "0.0.0.0", "--port" ,"9714"]
-CMD ["--workers", "1"]
\ No newline at end of file
+CMD ["--workers", "1"]
diff --git a/duui-transformers-Emotion/src/main/docker/Dockerfile-cuda b/duui-transformers-Emotion/src/main/docker/Dockerfile-cuda
index 33bfef7e..9aceb468 100644
--- a/duui-transformers-Emotion/src/main/docker/Dockerfile-cuda
+++ b/duui-transformers-Emotion/src/main/docker/Dockerfile-cuda
@@ -56,6 +56,8 @@ RUN python -c "from transformers import pipeline; pipeline('text-classification'
#RUN python -c "from transformers import pipeline; pipeline('text-classification', model='alex-shvets/roberta-large-emopillars-contextual-emocontext')"
#RUN python -c "from transformers import pipeline; pipeline('text-classification', model='AdapterHub/bert-base-uncased-pf-emo')"
#RUN python -c "from pytorch_transformers import (BertTokenizer, BertModel, BertConfig,); BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False); BertModel.from_pretrained('bert-base-multilingual-cased')"
+RUN python -c "from transformers import pipeline; pipeline('text-classification', \
+ model='visolex/phobert-emotion')"
# copy scripts
COPY ./src/main/python/TypeSystemEmotion.xml ./TypeSystemEmotion.xml
@@ -112,4 +114,4 @@ ENV TRANSFORMERS_OFFLINE=$DUUI_TRANSFORMERS_TRANSFORMERS_OFFLINE
ENTRYPOINT ["uvicorn", "duui_transformers_emotion:app", "--host", "0.0.0.0", "--port" ,"9714"]
-CMD ["--workers", "1"]
\ No newline at end of file
+CMD ["--workers", "1"]
diff --git a/duui-transformers-Emotion/src/main/python/EmotionDetection.py b/duui-transformers-Emotion/src/main/python/EmotionDetection.py
index 7802f8f1..63c112c8 100644
--- a/duui-transformers-Emotion/src/main/python/EmotionDetection.py
+++ b/duui-transformers-Emotion/src/main/python/EmotionDetection.py
@@ -34,7 +34,8 @@ def sigmoid(x):
"SamLowe": "SamLowe/roberta-base-go_emotions",
"michellejieli": "michellejieli/emotion_text_classifier",
"EmoAtlas": "EmoAtlas",
- "MRM8488": "mrm8488/t5-base-finetuned-emotion"
+ "MRM8488": "mrm8488/t5-base-finetuned-emotion",
+ "PhoBERT": "visolex/phobert-emotion"
}
map_emotion = {
"DReAMy-lib/xlm-roberta-large-DreamBank-emotion-presence": {
@@ -393,6 +394,14 @@ def sigmoid(x):
1: "happy",
2: "sad",
3: "angry"
+ },
+ "visolex/phobert-emotion": {
+ 0: "enjoyment",
+ 1: "sadness",
+ 2: "anger",
+ 3: "fear",
+ 4: "disgust",
+ 5: "surprise"
}
}
@@ -492,7 +501,10 @@ def __init__(self, model_name: str, device='cuda:0'):
self.tokenizer = AutoTokenizer.from_pretrained(model_name)
self.model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)
self.class_mapping = self.model.config.id2label
- self.labels = list(map_emotion[model_name].values())
+ if model_name in map_emotion and len(map_emotion[model_name]) == len(self.class_mapping):
+ self.labels = list(map_emotion[model_name].values())
+ else:
+ self.labels = [self.class_mapping[i] for i in sorted(self.class_mapping.keys())]
def emotion_prediction(self, texts: List[str]):
with torch.no_grad():
diff --git a/duui-transformers-Emotion/src/main/python/duui_transformers_emotion.py b/duui-transformers-Emotion/src/main/python/duui_transformers_emotion.py
index 1b962f3a..d27b82b0 100644
--- a/duui-transformers-Emotion/src/main/python/duui_transformers_emotion.py
+++ b/duui-transformers-Emotion/src/main/python/duui_transformers_emotion.py
@@ -30,6 +30,7 @@
"mrm8488/t5-base-finetuned-emotion": "https://huggingface.co/mrm8488/t5-base-finetuned-emotion",
"EmoAtlas": "https://github.com/alfonsosemeraro/emoatlas",
"pysentimiento": "https://github.com/pysentimiento/pysentimiento/",
+ "visolex/phobert-emotion": "https://huggingface.co/visolex/phobert-emotion",
}
languages = {
@@ -46,6 +47,7 @@
"mrm8488/t5-base-finetuned-emotion": "en",
"SamLowe/roberta-base-go_emotions": "en",
"ActivationAI/distilbert-base-uncased-finetuned-emotion": "en",
+ "visolex/phobert-emotion": "vi",
}
versions = {
@@ -62,6 +64,7 @@
"mrm8488/t5-base-finetuned-emotion": "e44a316825f11230724b36412fbf1899c76e82de",
"EmoAtlas": "adae44a80dd55c1d1c467c4e72bdb2d8cf63bf28",
"pysentimiento": "60822acfd805ad5d95437c695daa33c18dbda060",
+ "visolex/phobert-emotion": "90460fb946cf640ef9c56ae484cabb49d48ef14e",
}
diff --git a/duui-transformers-Emotion/src/test/java/org/hucompute/textimager/uima/transformers/emotion/EmotionTest.java b/duui-transformers-Emotion/src/test/java/org/hucompute/textimager/uima/transformers/emotion/EmotionTest.java
index d0547ea1..9ff6e22a 100644
--- a/duui-transformers-Emotion/src/test/java/org/hucompute/textimager/uima/transformers/emotion/EmotionTest.java
+++ b/duui-transformers-Emotion/src/test/java/org/hucompute/textimager/uima/transformers/emotion/EmotionTest.java
@@ -225,4 +225,48 @@ public void TurkishTest() throws Exception {
Assertions.assertEquals(expected_emotions.get(expected.indexOf(emotion)), key);
}
}
+
+
+ @Test
+ public void VietnameseTest() throws Exception {
+ composer.add(
+ new DUUIRemoteDriver.Component(url)
+ .withParameter("selection", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence")
+ );
+
+ List sentences = Arrays.asList(
+ "Tao ghét mày. Tao đang rất tức giận",
+ "Tôi rất vui khi được ở đây. Tôi yêu nơi này."
+ );
+
+ createCas("vi", sentences);
+ composer.run(cas);
+
+ Collection all_emotions = JCasUtil.select(cas, Emotion.class);
+ ArrayList