texttechnologylab · quocbao2303 · Dec 15, 2025 · Dec 15, 2025 · Dec 17, 2025 · Dec 22, 2025
diff --git a/duui-Hate/Readme.md b/duui-Hate/Readme.md
@@ -35,6 +35,7 @@ DUUI implementation for selected hate classification tools: [Hate](https://huggi
 | mehate-bert | https://huggingface.co/l3cube-pune/me-hate-bert                         | 407f19357c3b2166db6cbc2107807fc07a17b8f5 | MULTI     |
 | hatemoji | https://huggingface.co/HannahRoseKirk/Hatemoji                         | f2f98581ab15fb3ccf8b8a5465d7ca70c2958902 | EN        |
 | codemix-hate   | https://huggingface.co/debajyotimaz/codemix_hate                          | b07d73f1a05dd04c0adbb941b5446064b14feb10 | EN, HI    |
+| phobert-hsd | https://huggingface.co/visolex/phobert-hsd | 844b4cda62a864907038a33edb346cf8b612054f | VI |
 
 # How To Use
 

diff --git a/duui-Hate/docker_build.sh b/duui-Hate/docker_build.sh
@@ -1,7 +1,7 @@
 export ANNOTATOR_NAME=duui-hate
 export ANNOTATOR_VERSION=0.3.0
 export LOG_LEVEL=INFO
-eport MODEL_CACHE_SIZE=3
+export MODEL_CACHE_SIZE=3
 
 #---------------------------------------------------------------------
 #export  MODEL_NAME="Andrazp/multilingual-hate-speech-robacofi"
@@ -211,7 +211,13 @@ export MODEL_SOURCE="https://huggingface.co/debajyotimaz/codemix_hate"
 export MODEL_LANG="EN, HI"
 ##--------------------------------------------------------------------
 
-
+##---------------------------------------------------------------------
+export  MODEL_NAME="visolex/phobert-hsd"
+export MODEL_SPECNAME="phobert-hsd"
+export MODEL_VERSION="844b4cda62a864907038a33edb346cf8b612054f"
+export MODEL_SOURCE="https://huggingface.co/visolex/phobert-hsd"
+export MODEL_LANG="VI"
+##--------------------------------------------------------------------
 
 export DOCKER_REGISTRY="docker.texttechnologylab.org/"
 export DUUI_CUDA=

diff --git a/duui-Hate/pom.xml b/duui-Hate/pom.xml
@@ -101,7 +101,7 @@
         <dependency>
             <groupId>com.github.texttechnologylab</groupId>
             <artifactId>DockerUnifiedUIMAInterface</artifactId>
-            <version>7cef2433b5</version>
+            <version>1.5.3</version>
         </dependency>
 <!--        <dependency>-->
 <!--            <groupId>com.github.texttechnologylab.textimager-uima</groupId>-->
@@ -112,7 +112,7 @@
         <dependency>
             <groupId>com.github.texttechnologylab</groupId>
             <artifactId>UIMATypeSystem</artifactId>
-            <version>fedfa0ace</version>
+            <version>02fb1a2f13</version>
         </dependency>
 
 <!--        <dependency>-->

diff --git a/duui-Hate/src/main/docker/Dockerfile b/duui-Hate/src/main/docker/Dockerfile
@@ -57,8 +57,9 @@ RUN pip install -r reqiurements.txt
 
 #RUN python -c "from transformers import pipeline; pipeline('text-classification', model='HannahRoseKirk/Hatemoji')"
 
-RUN python -c "from transformers import pipeline; pipeline('text-classification', model='debajyotimaz/codemix_hate')"
+#RUN python -c "from transformers import pipeline; pipeline('text-classification', model='debajyotimaz/codemix_hate')"
 
+RUN python -c "from transformers import pipeline; pipeline('text-classification', model='visolex/phobert-hsd')"
 
 # service script
 COPY ./src/main/python/TypeSystemHate.xml ./TypeSystemHate.xml

diff --git a/duui-Hate/src/main/docker/Dockerfile-cuda b/duui-Hate/src/main/docker/Dockerfile-cuda
@@ -46,9 +46,11 @@ RUN pip install -r reqiurements.txt
 
 #RUN python -c "from transformers import pipeline; pipeline('text-classification', model='Hate-speech-CNERG/dehatebert-mono-french')"
 
-RUN python -c "from transformers import pipeline; pipeline('text-classification', model='Hate-speech-CNERG/dehatebert-mono-indonesian')"
+#RUN python -c "from transformers import pipeline; pipeline('text-classification', model='Hate-speech-CNERG/dehatebert-mono-indonesian')"
 #RUN python -c "from nubia_score import Nubia; nubia = Nubia()"
 
+#RUN python -c "from transformers import pipeline; pipeline('text-classification', model='visolex/phobert-hsd')"
+
 # service script
 COPY ./src/main/python/TypeSystemHate.xml ./TypeSystemHate.xml
 COPY ./src/main/python/evaluator.py ./evaluator.py

diff --git a/duui-Hate/src/main/python/hatechecker.py b/duui-Hate/src/main/python/hatechecker.py
@@ -96,25 +96,10 @@ def sigmoid(x):
         0: "NOT HATE",
         1: "HATE"
     },
-    "l3cube-pune/me-hate-bert": {
+    "visolex/phobert-hsd": {
         0: "NOT HATE",
-        1: "HATE"
-    },
-    "HannahRoseKirk/Hatemoji": {
-        0: "NOT HATE",
-        1: "HATE",
-    },
-    "debajyotimaz/codemix_hate": {
-        0: "NOT HATE",
-        1: "HATE"
-    },
-    "MilaNLProc/hate-ita": {
-        0: "NOT HATE",
-        1: "HATE"
-    },
-    "MilaNLProc/hate-ita-xlm-r-base": {
-        0: "NOT HATE",
-        1: "HATE"
+        1: "OFFENSIVE",
+        2: "HATE"
     }
 }
 

diff --git a/duui-Hate/src/test/java/org/hucompute/textimager/uima/hate/MultiTestHate.java b/duui-Hate/src/test/java/org/hucompute/textimager/uima/hate/MultiTestHate.java
@@ -156,4 +156,41 @@ public void DeTest() throws Exception {
             Assertions.assertEquals(expected_i, out_i);
         }
     }
+
+    @Test
+    public void VietnameseTest() throws Exception { // runs two Vietnamese inputs through the remote component and checks the Hate/NonHate decision per span
+        composer.add(
+                new DUUIRemoteDriver.Component(url)
+                        .withParameter("selection", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence")
+        );
+        List<String> sentences = Arrays.asList(
+                "Tôi ghét cay ghét đắng điều đó. Sao bạn có thể làm điều tồi tệ đó với tôi! TẠI SAO!",
+                "Tôi rất vui khi được ở đây. Tôi yêu nơi này."
+        );
+
+        createCas("vi", sentences);
+        composer.run(cas);
+
+        // Expected label keyed by "begin_end" of each Hate annotation. NOTE(review): confirm these offsets against the spans createCas actually produces.
+        HashMap<String, String> expected = new HashMap<>();
+        expected.put("0_43", "NonHate");  // Model predicts NonHate
+        expected.put("44_82", "NonHate");
+
+        Collection<Hate> all_hate = JCasUtil.select(cas, Hate.class);
+        for (Hate hate : all_hate) {
+            int begin = hate.getBegin();
+            int end = hate.getEnd();
+            double hate_i = hate.getHate();
+            double non_hate = hate.getNonHate();
+            String out_i = "HATE"; // default; also kept when both scores are equal
+            if (hate_i < non_hate){
+                out_i = "NonHate";
+            }
+            String expected_i = expected.get(begin+"_"+end);
+            if (expected_i != null) { // spans not listed in expected are skipped, so the test passes without asserting if no key matches
+                Assertions.assertEquals(expected_i, out_i);
+            }
+        }
+    }
+
 }
diff --git a/duui-ocr/Readme.md b/duui-ocr/Readme.md
@@ -0,0 +1,99 @@
+# DUUI OCR
+
+DUUI implementation for vision-language OCR models.
+
+## Supported Models
+
+| Name | Params | Languages | Supported Tasks |
+| ---- | ------ | --------- | --------------- |
+| [PaddlePaddle/PaddleOCR-VL-1.5](https://huggingface.co/PaddlePaddle/PaddleOCR-VL-1.5) | 0.9B | multilingual | ocr, table, formula, chart, spotting, seal |
+| [zai-org/GLM-OCR](https://huggingface.co/zai-org/GLM-OCR) | 0.9B | multilingual | ocr, table, formula |
+
+## Supported Tasks
+
+| Task | PaddleOCR-VL Prompt | GLM-OCR Prompt | Description |
+| ---- | ------------------- | -------------- | ----------- |
+| `ocr` | `OCR:` | `Text Recognition:` | General text recognition |
+| `table` | `Table Recognition:` | `Table Recognition:` | Table structure recognition |
+| `formula` | `Formula Recognition:` | `Formula Recognition:` | LaTeX formula recognition |
+| `chart` | `Chart Recognition:` | — | Chart content recognition |
+| `spotting` | `Spotting:` | — | Text spotting with location |
+| `seal` | `Seal Recognition:` | — | Seal text recognition |
+
+## How To Use
+
+Requires
+[Docker Unified UIMA Interface (DUUI)](https://github.com/texttechnologylab/DockerUnifiedUIMAInterface).
+
+### Run within DUUI
+
+```java
+// PaddleOCR-VL
+composer.add(
+    new DUUIDockerDriver.Component(
+            "docker.texttechnologylab.org/duui-ocr:latest"
+        )
+        .withParameter("model_name",
+            "PaddlePaddle/PaddleOCR-VL-1.5")
+        .withParameter("task", "ocr")
+);
+
+// GLM-OCR
+composer.add(
+    new DUUIDockerDriver.Component(
+            "docker.texttechnologylab.org/duui-ocr:latest"
+        )
+        .withParameter("model_name", "zai-org/GLM-OCR")
+        .withParameter("task", "ocr")
+);
+```
+
+### Parameters
+
+| Name | Description | Default |
+| ---- | ----------- | ------- |
+| `model_name` | Model to use (see table above) | — |
+| `task` | OCR task type | `ocr` |
+| `max_new_tokens` | Maximum tokens to generate | `1024` |
+
+### Input / Output
+
+- **Input**: `org.texttechnologylab.annotation.type.Image`
+  annotations in the CAS (`src` may be base64 data or a file path)
+- **Output**: one `org.texttechnologylab.annotation.AnnotationComment`
+  whose key is the task name and whose value is the recognized text
+
+## Cite
+
+```bibtex
+@inproceedings{Leonhardt:et:al:2023,
+  title     = {Unlocking the Heterogeneous Landscape of Big Data
+               {NLP} with {DUUI}},
+  author    = {Leonhardt, Alexander and Abrami, Giuseppe
+               and Baumartz, Daniel and Mehler, Alexander},
+  booktitle = {Findings of the Association for Computational
+               Linguistics: EMNLP 2023},
+  year      = {2023},
+  publisher = {Association for Computational Linguistics},
+  url       = {https://aclanthology.org/2023.findings-emnlp.29},
+  pages     = {385--399},
+}
+
+@misc{cui2026paddleocrvl15multitask09bvlm,
+  title   = {PaddleOCR-VL-1.5: Towards a Multi-Task 0.9B VLM
+             for Robust In-the-Wild Document Parsing},
+  author  = {Cheng Cui and Ting Sun and Suyin Liang and others},
+  year    = {2026},
+  eprint  = {2601.21957},
+  archivePrefix = {arXiv},
+  primaryClass  = {cs.CV},
+}
+
+@misc{glmocr2026,
+  title   = {GLM-OCR: A Multimodal OCR Model for Complex
+             Document Understanding},
+  author  = {Z.ai Team},
+  year    = {2026},
+  url     = {https://huggingface.co/zai-org/GLM-OCR},
+}
+```
diff --git a/duui-ocr/docker_build.sh b/duui-ocr/docker_build.sh
@@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+export DUUI_OCR_CUDA=
+#export DUUI_OCR_CUDA="-cuda"
+
+export DUUI_OCR_ANNOTATOR_NAME=duui-ocr
+export DUUI_OCR_ANNOTATOR_VERSION=0.2.0
+export DUUI_OCR_LOG_LEVEL=DEBUG
+export DUUI_OCR_MODEL_CACHE_SIZE=1
+export DOCKER_REGISTRY="docker.texttechnologylab.org/"
+
+docker build \
+  --build-arg DUUI_OCR_ANNOTATOR_NAME \
+  --build-arg DUUI_OCR_ANNOTATOR_VERSION \
+  --build-arg DUUI_OCR_LOG_LEVEL \
+  --build-arg DUUI_OCR_MODEL_CACHE_SIZE \
+  -t ${DOCKER_REGISTRY}${DUUI_OCR_ANNOTATOR_NAME}:${DUUI_OCR_ANNOTATOR_VERSION}${DUUI_OCR_CUDA} \
+  -f src/main/docker/Dockerfile${DUUI_OCR_CUDA} \
+  .
+
+docker tag \
+  ${DOCKER_REGISTRY}${DUUI_OCR_ANNOTATOR_NAME}:${DUUI_OCR_ANNOTATOR_VERSION}${DUUI_OCR_CUDA} \
+  ${DOCKER_REGISTRY}${DUUI_OCR_ANNOTATOR_NAME}:latest${DUUI_OCR_CUDA}