Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions duui-Hate/Readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ DUUI implementation for selected hate classification tools: [Hate](https://huggi
| mehate-bert | https://huggingface.co/l3cube-pune/me-hate-bert | 407f19357c3b2166db6cbc2107807fc07a17b8f5 | MULTI |
| hatemoji | https://huggingface.co/HannahRoseKirk/Hatemoji | f2f98581ab15fb3ccf8b8a5465d7ca70c2958902 | EN |
| codemix-hate | https://huggingface.co/debajyotimaz/codemix_hate | b07d73f1a05dd04c0adbb941b5446064b14feb10 | EN, HI |
| phobert-hsd | https://huggingface.co/visolex/phobert-hsd | 844b4cda62a864907038a33edb346cf8b612054f | VI |

# How To Use

Expand Down
10 changes: 8 additions & 2 deletions duui-Hate/docker_build.sh
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
export ANNOTATOR_NAME=duui-hate
export ANNOTATOR_VERSION=0.3.0
export LOG_LEVEL=INFO
eport MODEL_CACHE_SIZE=3
export MODEL_CACHE_SIZE=3

#---------------------------------------------------------------------
#export MODEL_NAME="Andrazp/multilingual-hate-speech-robacofi"
Expand Down Expand Up @@ -211,7 +211,13 @@ export MODEL_SOURCE="https://huggingface.co/debajyotimaz/codemix_hate"
export MODEL_LANG="EN, HI"
##--------------------------------------------------------------------


##---------------------------------------------------------------------
export MODEL_NAME="visolex/phobert-hsd"
export MODEL_SPECNAME="phobert-hsd"
export MODEL_VERSION="844b4cda62a864907038a33edb346cf8b612054f"
export MODEL_SOURCE="https://huggingface.co/visolex/phobert-hsd"
export MODEL_LANG="VI"
##--------------------------------------------------------------------

export DOCKER_REGISTRY="docker.texttechnologylab.org/"
export DUUI_CUDA=
Expand Down
4 changes: 2 additions & 2 deletions duui-Hate/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@
<dependency>
<groupId>com.github.texttechnologylab</groupId>
<artifactId>DockerUnifiedUIMAInterface</artifactId>
<version>7cef2433b5</version>
<version>1.5.3</version>
</dependency>
<!-- <dependency>-->
<!-- <groupId>com.github.texttechnologylab.textimager-uima</groupId>-->
Expand All @@ -112,7 +112,7 @@
<dependency>
<groupId>com.github.texttechnologylab</groupId>
<artifactId>UIMATypeSystem</artifactId>
<version>fedfa0ace</version>
<version>02fb1a2f13</version>
</dependency>

<!-- <dependency>-->
Expand Down
3 changes: 2 additions & 1 deletion duui-Hate/src/main/docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,9 @@ RUN pip install -r reqiurements.txt

#RUN python -c "from transformers import pipeline; pipeline('text-classification', model='HannahRoseKirk/Hatemoji')"

RUN python -c "from transformers import pipeline; pipeline('text-classification', model='debajyotimaz/codemix_hate')"
#RUN python -c "from transformers import pipeline; pipeline('text-classification', model='debajyotimaz/codemix_hate')"

RUN python -c "from transformers import pipeline; pipeline('text-classification', model='visolex/phobert-hsd')"

# service script
COPY ./src/main/python/TypeSystemHate.xml ./TypeSystemHate.xml
Expand Down
4 changes: 3 additions & 1 deletion duui-Hate/src/main/docker/Dockerfile-cuda
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,11 @@ RUN pip install -r reqiurements.txt

#RUN python -c "from transformers import pipeline; pipeline('text-classification', model='Hate-speech-CNERG/dehatebert-mono-french')"

RUN python -c "from transformers import pipeline; pipeline('text-classification', model='Hate-speech-CNERG/dehatebert-mono-indonesian')"
#RUN python -c "from transformers import pipeline; pipeline('text-classification', model='Hate-speech-CNERG/dehatebert-mono-indonesian')"
#RUN python -c "from nubia_score import Nubia; nubia = Nubia()"

#RUN python -c "from transformers import pipeline; pipeline('text-classification', model='visolex/phobert-hsd')"

# service script
COPY ./src/main/python/TypeSystemHate.xml ./TypeSystemHate.xml
COPY ./src/main/python/evaluator.py ./evaluator.py
Expand Down
21 changes: 3 additions & 18 deletions duui-Hate/src/main/python/hatechecker.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,25 +96,10 @@ def sigmoid(x):
0: "NOT HATE",
1: "HATE"
},
"l3cube-pune/me-hate-bert": {
"visolex/phobert-hsd": {
0: "NOT HATE",
1: "HATE"
},
"HannahRoseKirk/Hatemoji": {
0: "NOT HATE",
1: "HATE",
},
"debajyotimaz/codemix_hate": {
0: "NOT HATE",
1: "HATE"
},
"MilaNLProc/hate-ita": {
0: "NOT HATE",
1: "HATE"
},
"MilaNLProc/hate-ita-xlm-r-base": {
0: "NOT HATE",
1: "HATE"
1: "OFFENSIVE",
2: "HATE"
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -156,4 +156,41 @@ public void DeTest() throws Exception {
Assertions.assertEquals(expected_i, out_i);
}
}

/**
 * Runs the remote component over Vietnamese sample text (CAS language "vi")
 * and compares the hate / non-hate decision of each resulting {@code Hate}
 * annotation with an expectation table keyed by "begin_end" offsets.
 *
 * An annotation counts as "NonHate" only when its hate score is strictly
 * lower than its non-hate score; otherwise it counts as "HATE".
 *
 * NOTE(review): annotations whose offsets are missing from the expectation
 * table are skipped, so this test asserts nothing if no offsets match.
 * Confirm the offsets against the actual sentence segmentation.
 *
 * @throws Exception propagated from CAS setup or the composer run
 */
@Test
public void VietnameseTest() throws Exception {
    // Register the remote component; it is parameterised to select Sentence annotations.
    composer.add(
            new DUUIRemoteDriver.Component(url)
                    .withParameter("selection", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence")
    );

    List<String> inputTexts = Arrays.asList(
            "Tôi ghét cay ghét đắng điều đó. Sao bạn có thể làm điều tồi tệ đó với tôi! TẠI SAO!",
            "Tôi rất vui khi được ở đây. Tôi yêu nơi này."
    );
    createCas("vi", inputTexts);
    composer.run(cas);

    // Expected label per annotation, keyed by "<begin>_<end>".
    // Update to match actual offsets and predictions.
    HashMap<String, String> expectedByOffsets = new HashMap<>();
    expectedByOffsets.put("0_43", "NonHate"); // Model predicts NonHate
    expectedByOffsets.put("44_82", "NonHate");

    for (Hate annotation : JCasUtil.select(cas, Hate.class)) {
        String offsetKey = annotation.getBegin() + "_" + annotation.getEnd();
        String predictedLabel = annotation.getHate() < annotation.getNonHate() ? "NonHate" : "HATE";

        String expectedLabel = expectedByOffsets.get(offsetKey);
        if (expectedLabel == null) {
            // No expectation recorded for this span; nothing to verify.
            continue;
        }
        Assertions.assertEquals(expectedLabel, predictedLabel);
    }
}

}
99 changes: 99 additions & 0 deletions duui-ocr/Readme.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
# DUUI OCR

DUUI implementation for vision-language OCR models.

## Supported Models

| Name | Params | Languages | Supported Tasks |
| ---- | ------ | --------- | --------------- |
| [PaddlePaddle/PaddleOCR-VL-1.5](https://huggingface.co/PaddlePaddle/PaddleOCR-VL-1.5) | 0.9B | multilingual | ocr, table, formula, chart, spotting, seal |
| [zai-org/GLM-OCR](https://huggingface.co/zai-org/GLM-OCR) | 0.9B | multilingual | ocr, table, formula |

## Supported Tasks

| Task | PaddleOCR-VL Prompt | GLM-OCR Prompt | Description |
| ---- | ------------------- | -------------- | ----------- |
| `ocr` | `OCR:` | `Text Recognition:` | General text recognition |
| `table` | `Table Recognition:` | `Table Recognition:` | Table structure recognition |
| `formula` | `Formula Recognition:` | `Formula Recognition:` | LaTeX formula recognition |
| `chart` | `Chart Recognition:` | — | Chart content recognition |
| `spotting` | `Spotting:` | — | Text spotting with location |
| `seal` | `Seal Recognition:` | — | Seal text recognition |

## How To Use

Requires the
[Docker Unified UIMA Interface (DUUI)](https://github.com/texttechnologylab/DockerUnifiedUIMAInterface).

### Run within DUUI

```java
// PaddleOCR-VL
composer.add(
new DUUIDockerDriver.Component(
"docker.texttechnologylab.org/duui-ocr:latest"
)
.withParameter("model_name",
"PaddlePaddle/PaddleOCR-VL-1.5")
.withParameter("task", "ocr")
);

// GLM-OCR
composer.add(
new DUUIDockerDriver.Component(
"docker.texttechnologylab.org/duui-ocr:latest"
)
.withParameter("model_name", "zai-org/GLM-OCR")
.withParameter("task", "ocr")
);
```

### Parameters

| Name | Description | Default |
| ---- | ----------- | ------- |
| `model_name` | Model to use (see table above) | — |
| `task` | OCR task type | `ocr` |
| `max_new_tokens` | Maximum tokens to generate | `1024` |

### Input / Output

- **Input**: `org.texttechnologylab.annotation.type.Image`
annotations in the CAS (`src` may be a base64-encoded image or a file path)
- **Output**: `org.texttechnologylab.annotation.AnnotationComment`
with key = task name, value = recognized text

## Cite

```bibtex
@inproceedings{Leonhardt:et:al:2023,
title = {Unlocking the Heterogeneous Landscape of Big Data
{NLP} with {DUUI}},
author = {Leonhardt, Alexander and Abrami, Giuseppe
and Baumartz, Daniel and Mehler, Alexander},
booktitle = {Findings of the Association for Computational
Linguistics: EMNLP 2023},
year = {2023},
publisher = {Association for Computational Linguistics},
url = {https://aclanthology.org/2023.findings-emnlp.29},
pages = {385--399},
}

@misc{cui2026paddleocrvl15multitask09bvlm,
title = {PaddleOCR-VL-1.5: Towards a Multi-Task 0.9B VLM
for Robust In-the-Wild Document Parsing},
author = {Cheng Cui and Ting Sun and Suyin Liang and others},
year = {2026},
eprint = {2601.21957},
archivePrefix = {arXiv},
primaryClass = {cs.CV},
}

@misc{glmocr2026,
title = {GLM-OCR: A Multimodal OCR Model for Complex
Document Understanding},
author = {Z.ai Team},
year = {2026},
url = {https://huggingface.co/zai-org/GLM-OCR},
}
```
24 changes: 24 additions & 0 deletions duui-ocr/docker_build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#!/usr/bin/env bash
#
# Build the duui-ocr Docker image and additionally tag it as "latest".
#
# Must be run from the directory that contains src/main/docker/, because the
# build context is "." and the Dockerfile path is relative.
set -euo pipefail

# Variant suffix appended to both the Dockerfile name and the image tags.
# Empty selects src/main/docker/Dockerfile; "-cuda" selects Dockerfile-cuda.
export DUUI_OCR_CUDA=
#export DUUI_OCR_CUDA="-cuda"

# Exported so that the value-less "--build-arg NAME" flags below pick the
# values up from this script's environment.
export DUUI_OCR_ANNOTATOR_NAME=duui-ocr
export DUUI_OCR_ANNOTATOR_VERSION=0.2.0
export DUUI_OCR_LOG_LEVEL=DEBUG
export DUUI_OCR_MODEL_CACHE_SIZE=1
export DOCKER_REGISTRY="docker.texttechnologylab.org/"

# Assemble each image reference once so the build and tag steps cannot drift.
image_repo="${DOCKER_REGISTRY}${DUUI_OCR_ANNOTATOR_NAME}"
versioned_image="${image_repo}:${DUUI_OCR_ANNOTATOR_VERSION}${DUUI_OCR_CUDA}"
latest_image="${image_repo}:latest${DUUI_OCR_CUDA}"
readonly image_repo versioned_image latest_image

docker build \
  --build-arg DUUI_OCR_ANNOTATOR_NAME \
  --build-arg DUUI_OCR_ANNOTATOR_VERSION \
  --build-arg DUUI_OCR_LOG_LEVEL \
  --build-arg DUUI_OCR_MODEL_CACHE_SIZE \
  -t "${versioned_image}" \
  -f "src/main/docker/Dockerfile${DUUI_OCR_CUDA}" \
  .

docker tag "${versioned_image}" "${latest_image}"
Loading