From f0c47e06e73c78dcaac2ad20a7e55b1a4f5190a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vil=C3=A9m=20Zouhar?= Date: Fri, 2 Aug 2024 20:12:08 +0200 Subject: [PATCH 01/51] fix typo --- EvalView/templates/EvalView/_instructions-esa.html | 2 +- EvalView/templates/EvalView/_instructions-mqm.html | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/EvalView/templates/EvalView/_instructions-esa.html b/EvalView/templates/EvalView/_instructions-esa.html index 62c2d143..04ec0f4d 100644 --- a/EvalView/templates/EvalView/_instructions-esa.html +++ b/EvalView/templates/EvalView/_instructions-esa.html @@ -1,7 +1,7 @@
-  • Higlighting errors:
+  • Highlighting errors:
    • Highlight the text fragment where you have identified a translation error (drag or click start & end). diff --git a/EvalView/templates/EvalView/_instructions-mqm.html b/EvalView/templates/EvalView/_instructions-mqm.html index 2f36b694..284beed1 100644 --- a/EvalView/templates/EvalView/_instructions-mqm.html +++ b/EvalView/templates/EvalView/_instructions-mqm.html @@ -1,7 +1,7 @@
-      • Higlighting errors:
+      • Highlighting errors:
        • Highlight the text fragment where you have identified a translation error (drag or click start & end). From d64659b142ad16d88262cdec5d4fac9b9708e687 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vil=C3=A9m=20Zouhar?= Date: Mon, 5 Aug 2024 10:29:24 +0200 Subject: [PATCH 02/51] clarify annotator tokens --- INSTALL.md | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index fffcf124..33809a75 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -1,4 +1,4 @@ -## Setup +# Setup 1. Basic setup: @@ -39,6 +39,9 @@ python3 manage.py StartNewCampaign Examples/MQM+ESA/manifest.json \ python3 manage.py CreateInviteTokens test_group 20 --create-group test_group ``` +Add `--task-confirmation-tokens` if you with to show annotators tokens at the end. +See [quality control](#Quality control) for more details. + 5. Optionally clean up everything ``` @@ -122,4 +125,13 @@ For task: - `batchNo`: task number - `randomSeed`: number used in batch generation - `requiredAnnotations`: how many annotations does a task need, in most cases use 1 -- `source/targetLanguage`: source and target language \ No newline at end of file +- `source/targetLanguage`: source and target language + +## Quality control + +With `--task-confirmation-tokens`, the annotators will be shown a random one if they fail the quality control and a correct one (matching the one in the CSV output) if they succeed. +The quality control checks if the perturbed samples (`itemType=BAD`) have statistically lower scores than the original ones (`itemType=TGT`). +Even without the switch, the campaign status page will show a p-value (last column for staff account) that corresponds to the outcome of this test. +If it's close to 1, then the annotator is annotating randomly and is of poor quality. +For values close to 0, the annotations are good. +The threshold to generate the true token for annotators is currently p<=10%. From 2666cb22ef5376f9da33e7829a7d63b3ef5082fe Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Mon, 19 Aug 2024 10:38:18 +0100 Subject: [PATCH 03/51] Update INSTALL.md --- INSTALL.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/INSTALL.md b/INSTALL.md index 33809a75..5c2c32a7 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -39,7 +39,7 @@ python3 manage.py StartNewCampaign Examples/MQM+ESA/manifest.json \ python3 manage.py CreateInviteTokens test_group 20 --create-group test_group ``` -Add `--task-confirmation-tokens` if you with to show annotators tokens at the end. +Add `--task-confirmation-tokens` if you want to generate annotator confirmation tokens. See [quality control](#Quality control) for more details. 5. Optionally clean up everything From 82e9eab7e19dfa35f12af3aaf17cd6db47172d3b Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Mon, 19 Aug 2024 10:38:24 +0100 Subject: [PATCH 04/51] Update INSTALL.md --- INSTALL.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/INSTALL.md b/INSTALL.md index 5c2c32a7..6b11109e 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -129,7 +129,7 @@ For task: ## Quality control -With `--task-confirmation-tokens`, the annotators will be shown a random one if they fail the quality control and a correct one (matching the one in the CSV output) if they succeed. +With `--task-confirmation-tokens`, the annotators will be shown a random key/token if they fail the quality control and a correct one (matching the one in the CSV output with credentials) if they succeed. 
The quality control checks if the perturbed samples (`itemType=BAD`) have statistically lower scores than the original ones (`itemType=TGT`). Even without the switch, the campaign status page will show a p-value (last column for staff account) that corresponds to the outcome of this test. If it's close to 1, then the annotator is annotating randomly and is of poor quality. From 715ade94b35c9fe0364d07e55ed79586b6deca54 Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Mon, 19 Aug 2024 10:38:30 +0100 Subject: [PATCH 05/51] Update INSTALL.md --- INSTALL.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/INSTALL.md b/INSTALL.md index 6b11109e..a85f6d10 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -134,4 +134,4 @@ The quality control checks if the perturbed samples (`itemType=BAD`) have statis Even without the switch, the campaign status page will show a p-value (last column for staff account) that corresponds to the outcome of this test. If it's close to 1, then the annotator is annotating randomly and is of poor quality. For values close to 0, the annotations are good. -The threshold to generate the true token for annotators is currently p<=10%. +The threshold to generate the valid token for annotators is currently p<=10%. From d07015d9ec24f048b327a9845324628258b196ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vil=C3=A9m=20Zouhar?= Date: Sat, 4 Jan 2025 11:41:45 +0100 Subject: [PATCH 06/51] remove constraints from loader --- EvalData/models/data_assessment.py | 8 -------- EvalData/models/direct_assessment.py | 4 ---- EvalData/models/direct_assessment_context.py | 7 ------- EvalData/models/direct_assessment_document.py | 7 ------- EvalData/models/multi_modal_assessment.py | 7 ------- EvalData/models/pairwise_assessment.py | 5 ----- EvalData/models/pairwise_assessment_document.py | 7 ------- INSTALL.md | 2 +- 8 files changed, 1 insertion(+), 46 deletions(-) diff --git a/EvalData/models/data_assessment.py b/EvalData/models/data_assessment.py index 1fcd72bb..9eb9ea00 100644 --- a/EvalData/models/data_assessment.py +++ b/EvalData/models/data_assessment.py @@ -429,14 +429,6 @@ def import_from_json(cls, campaign, batch_user, batch_data, max_count): ) new_items.append(new_item) - if not len(new_items) == 100: - _msg = 'Expected 100 items for task but found {0}'.format( - len(new_items) - ) - LOGGER.warn(_msg) - print(_msg) - continue - current_count += 1 # for new_item in new_items: diff --git a/EvalData/models/direct_assessment.py b/EvalData/models/direct_assessment.py index 18d32e71..2f2ada09 100644 --- a/EvalData/models/direct_assessment.py +++ b/EvalData/models/direct_assessment.py @@ -311,10 +311,6 @@ def import_from_json(cls, campaign, batch_user, batch_data, max_count): ) new_items.append(new_item) - if len(new_items) != 100: - LOGGER.error(f'Expected 100 items for task but found {len(new_items)}') - continue - current_count += 1 batch_meta.textpair_set.add(*new_items, bulk=False) batch_meta.save() diff --git a/EvalData/models/direct_assessment_context.py b/EvalData/models/direct_assessment_context.py index 9c231b71..988c59c6 100644 --- a/EvalData/models/direct_assessment_context.py +++ b/EvalData/models/direct_assessment_context.py @@ -367,13 +367,6 @@ def import_from_json(cls, campaign, batch_user, batch_data, max_count): if item['isCompleteDocument']: doc_items += 1 - if (len(new_items) - doc_items) != 100: - _msg = 'Expected 100 items for task but found {0}'.format( - len(new_items) - doc_items - ) - LOGGER.warn(_msg) - continue - current_count += 1 for new_item in 
new_items: diff --git a/EvalData/models/direct_assessment_document.py b/EvalData/models/direct_assessment_document.py index 861f0755..ab88b01e 100644 --- a/EvalData/models/direct_assessment_document.py +++ b/EvalData/models/direct_assessment_document.py @@ -462,13 +462,6 @@ def import_from_json(cls, campaign, batch_user, batch_data, max_count): if item['isCompleteDocument']: doc_items += 1 - if (len(new_items) - doc_items) != 100: - _msg = 'Expected 100 items for task but found {0}'.format( - len(new_items) - doc_items - ) - LOGGER.warn(_msg) - continue - current_count += 1 for new_item in new_items: diff --git a/EvalData/models/multi_modal_assessment.py b/EvalData/models/multi_modal_assessment.py index 17778e1f..65bebc1b 100644 --- a/EvalData/models/multi_modal_assessment.py +++ b/EvalData/models/multi_modal_assessment.py @@ -348,13 +348,6 @@ def import_from_json(cls, campaign, batch_user, batch_data, max_count): ) new_items.append(new_item) - if not len(new_items) == 100: - _msg = 'Expected 100 items for task but found {0}'.format( - len(new_items) - ) - LOGGER.warn(_msg) - continue - current_count += 1 # for new_item in new_items: diff --git a/EvalData/models/pairwise_assessment.py b/EvalData/models/pairwise_assessment.py index 7158e001..11934524 100644 --- a/EvalData/models/pairwise_assessment.py +++ b/EvalData/models/pairwise_assessment.py @@ -346,11 +346,6 @@ def import_from_json(cls, campaign, batch_user, batch_data, max_count): ) new_items.append(new_item) - if not len(new_items) == 100: - _msg = 'Expected 100 items for task but found {0}'.format(count_items) - LOGGER.warn(_msg) - continue - current_count += 1 # for new_item in new_items: diff --git a/EvalData/models/pairwise_assessment_document.py b/EvalData/models/pairwise_assessment_document.py index f834e815..26097a2e 100644 --- a/EvalData/models/pairwise_assessment_document.py +++ b/EvalData/models/pairwise_assessment_document.py @@ -471,13 +471,6 @@ def import_from_json(cls, campaign, batch_user, batch_data, max_count): if item['isCompleteDocument']: doc_items += 1 - if (len(new_items) - doc_items) != 100: - _msg = 'Expected 100 items for task but found {0}'.format( - len(new_items) - doc_items - ) - LOGGER.warn(_msg) - continue - current_count += 1 for new_item in new_items: diff --git a/INSTALL.md b/INSTALL.md index a85f6d10..42304238 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -78,7 +78,7 @@ The manifest looks like this: - In the associated data we had only one En-De task. The combination of redundancy of 2 and the first 2 in the task distribution simply creates two accounts with the same single task (redundant). If there were e.g. 5 tasks and we wanted no redundancy, the line would be `["eng", "deu", "uniform", 5, 5]`. Alternatively to manual manifests, a Django command can be created instead of the manifest file, see `Campaign/management/commands/InitCampaigh*.py`. -The batches file is a list of tasks with items and task descriptions. As a rule, there are exactly 100 segments in a task. An example for ESA/MQM: +The batches file is a list of tasks with items and task descriptions. There are usually at least 100 segments in a task. 
An example for ESA/MQM: ``` [ { From d6ff83c6698c2590eb78628188a0b5b19add99e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vil=C3=A9m=20Zouhar?= Date: Sat, 4 Jan 2025 11:53:25 +0100 Subject: [PATCH 07/51] rename hit to set --- Dashboard/templates/Dashboard/dashboard.html | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/Dashboard/templates/Dashboard/dashboard.html b/Dashboard/templates/Dashboard/dashboard.html index 2c9f893e..d1eb5bea 100644 --- a/Dashboard/templates/Dashboard/dashboard.html +++ b/Dashboard/templates/Dashboard/dashboard.html @@ -3,11 +3,6 @@ {% block content %}
          -

          Dashboard

          Evaluation campaign for shared tasks hosted at the 9th Conference on Machine Translation (WMT24)

          @@ -18,12 +13,12 @@

          This is Appraise

          {% if current_task %}
-          Current HIT
+          Current set

          Continue annotation for {{current_task.campaign}}:{{current_task.marketTargetLanguage}}.

          {% elif all_languages %} {% for _, languages in all_languages.items %} {% if languages %}
-          Next HIT
+          Next set

          Start annotation for: {% for code, language, campaign, task_url in languages %} {{campaign}}:{{language}}{% if not forloop.last %} · {% endif %} @@ -46,11 +41,11 @@

          Work completed

      {% else %}
-      Next HIT
+      Next set

      We are currently finalising the registration process for annotator accounts. Once this has been completed, direct assessment tasks will become available from this page. Please check back in a little while.

      {% endif %}

      User status

-      {{annotations}} annotation{{annotations|pluralize}}, {{hits}} HIT{{hits|pluralize}} completed. Total annotation duration {% if days %}{{days|stringformat:"02d"}}d{% endif %}{{hours|stringformat:"02d"}}h{{minutes|stringformat:"02d"}}m{{seconds|stringformat:"02d"}}s.
+      {{annotations}} annotation{{annotations|pluralize}}, {{hits}} set{{hits|pluralize}} completed. Total annotation duration {% if days %}{{days|stringformat:"02d"}}d{% endif %}{{hours|stringformat:"02d"}}h{{minutes|stringformat:"02d"}}m{{seconds|stringformat:"02d"}}s.

From 1954d7b90a861d285a5b27afea6ab9e5d9b2630c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vil=C3=A9m=20Zouhar?= Date: Sat, 4 Jan 2025 13:09:34 +0100 Subject: [PATCH 08/51] implement character-level alignment --- .../direct-assessment-document-mqm-esa.css | 1 - .../js/direct-assessment-document-mqm-esa.js | 53 +++++++++++++++---- .../direct-assessment-document-mqm-esa.html | 3 +- 3 files changed, 46 insertions(+), 11 deletions(-) diff --git a/EvalView/static/EvalView/css/direct-assessment-document-mqm-esa.css b/EvalView/static/EvalView/css/direct-assessment-document-mqm-esa.css index 89d25f78..d17ab3e7 100644 --- a/EvalView/static/EvalView/css/direct-assessment-document-mqm-esa.css +++ b/EvalView/static/EvalView/css/direct-assessment-document-mqm-esa.css @@ -136,7 +136,6 @@ min-width: 130px; } - .mqm_char:hover:not([selected]):not([in_mqm]) { outline: 2px solid #ccc; } diff --git a/EvalView/static/EvalView/js/direct-assessment-document-mqm-esa.js b/EvalView/static/EvalView/js/direct-assessment-document-mqm-esa.js index 7c9afc93..05c22eff 100644 --- a/EvalView/static/EvalView/js/direct-assessment-document-mqm-esa.js +++ b/EvalView/static/EvalView/js/direct-assessment-document-mqm-esa.js @@ -287,7 +287,8 @@ class MQMItemHandler { this.initialize() } - initialize() { + async initialize() { + this.el_source = this.el.find(".source-text") this.el_target = this.el.find(".target-text") this.el_slider = this.el.find('.slider') // for Appraise reasons it's a JSON string encoding JSON @@ -306,6 +307,9 @@ class MQMItemHandler { } this.mqm_submitted = structuredClone(this.mqm) this.mqm_orig = JSON.parse(JSON.parse(this.el.children('#mqm-payload-orig').html())) + this.text_source_orig = decodeEntitiesPreservingTags(JSON.parse(this.el.children('#text-source-payload').html()).trim()) + this.source_video = JSON.parse(this.el.children('#text-source-payload').html()).trim().startsWith(" v[0]) - let html_candidate = split_text.map((v, i) => { - return `${v}` - }).join("") + " [MISSING]" - this.el_target.html(html_candidate) + // setup_span_structure + let html_target = this.text_target_orig.split("").map((v, i) => { + return `${v}` + }).join("") + " [MISSING]" + this.el_target.html(html_target) this.redraw_mqm() @@ -350,6 +352,39 @@ class MQMItemHandler { this.el_slider.slider('value', score); } + // handle character alignment estimation + if (!this.source_video) { + let html_source = this.text_source_orig.split("").map((v, i) => { + return `${v}` + }).join("") + this.el_source.html(html_source) + + await waitout_js_loop() + + let len_src = this.text_source_orig.split("").length + let len_tgt = this.text_target_orig.split("").length + this.el_target.children(".mqm_char").each((i, el) => { + // on hover + $(el).on("mouseenter", () => { + // get char position from attribute + let tgt_char_i = Number.parseInt($(el).attr("char_id")) + // approximate position + let src_char_i = Math.floor(tgt_char_i * len_src / len_tgt) + // remove underline from all mqm + this.el_source.children(".mqm_char_src").css("text-decoration", "") + // set underline to the corresponding character and its neighbours + this.el_source.children(`#source_char_${src_char_i}`).css("text-decoration", "underline 10%") + this.el_source.children(`#source_char_${src_char_i-1}`).css("text-decoration", "underline 10%") + this.el_source.children(`#source_char_${src_char_i+1}`).css("text-decoration", "underline 10%") + }) + // on leave remove all decorations + $(el).on("mouseleave", () => { + this.el_source.children(".mqm_char_src").css("text-decoration", 
"") + }) + }) + } + + // slider bubble handling this.el_slider.find(".ui-slider-handle").append("
100
") let refresh_bubble = () => { diff --git a/EvalView/templates/EvalView/direct-assessment-document-mqm-esa.html b/EvalView/templates/EvalView/direct-assessment-document-mqm-esa.html index 3b380283..753cff37 100644 --- a/EvalView/templates/EvalView/direct-assessment-document-mqm-esa.html +++ b/EvalView/templates/EvalView/direct-assessment-document-mqm-esa.html @@ -60,6 +60,7 @@ {{ scores.mqm|json_script:"mqm-payload" }} {{ scores.mqm_orig|json_script:"mqm-payload-orig" }} + {{ item.sourceText|json_script:"text-source-payload" }} {{ item.targetText|json_script:"text-target-payload" }} {{ scores.score|json_script:"score-payload" }} @@ -79,7 +80,7 @@
- + {{ item.sourceText|safe }}
From d8d89682b3494eca9b81fad4d814511036d59d71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vil=C3=A9m=20Zouhar?= Date: Tue, 7 Jan 2025 11:03:35 +0100 Subject: [PATCH 09/51] make src-tgt character highlights more prominent --- .../js/direct-assessment-document-mqm-esa.js | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/EvalView/static/EvalView/js/direct-assessment-document-mqm-esa.js b/EvalView/static/EvalView/js/direct-assessment-document-mqm-esa.js index 05c22eff..857fc2c3 100644 --- a/EvalView/static/EvalView/js/direct-assessment-document-mqm-esa.js +++ b/EvalView/static/EvalView/js/direct-assessment-document-mqm-esa.js @@ -261,7 +261,11 @@ async function submit_finish_document(override_tutorial_check=false) { await new Promise(resolve => setTimeout(resolve, 5_000)) $("#button-next-doc").prop('disabled', false); } - +} +function decodeEntities(html) { + var txt = document.createElement("textarea"); + txt.innerHTML = html; + return txt.value; } function _show_error_box(text, timeout = 2000) { @@ -307,7 +311,7 @@ class MQMItemHandler { } this.mqm_submitted = structuredClone(this.mqm) this.mqm_orig = JSON.parse(JSON.parse(this.el.children('#mqm-payload-orig').html())) - this.text_source_orig = decodeEntitiesPreservingTags(JSON.parse(this.el.children('#text-source-payload').html()).trim()) + this.text_source_orig = decodeEntities(JSON.parse(this.el.children('#text-source-payload').html()).trim()) this.source_video = JSON.parse(this.el.children('#text-source-payload').html()).trim().startsWith(" 0; range--) { + // extrapolate range between #111 and #ddd + let color = (Math.floor(range/5 * (0xd - 0x1))+0x1).toString(16) + for (let i = Math.max(0, src_char_i - range); i < Math.min(len_src, src_char_i + range); i++) { + this.el_source.children(`#source_char_${i}`).css("text-decoration", `underline 25% #${color}${color}${color} solid`) + } + } }) // on leave remove all decorations $(el).on("mouseleave", () => { From a9f42215a867791ed36c6277414b69cf5f8274d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vil=C3=A9m=20Zouhar?= Date: Wed, 8 Jan 2025 09:35:46 +0100 Subject: [PATCH 10/51] add prompt dialog for style of src-tgt highlight (testing only) --- .../js/direct-assessment-document-mqm-esa.js | 52 ++++++++++--------- 1 file changed, 27 insertions(+), 25 deletions(-) diff --git a/EvalView/static/EvalView/js/direct-assessment-document-mqm-esa.js b/EvalView/static/EvalView/js/direct-assessment-document-mqm-esa.js index 857fc2c3..f744dfd8 100644 --- a/EvalView/static/EvalView/js/direct-assessment-document-mqm-esa.js +++ b/EvalView/static/EvalView/js/direct-assessment-document-mqm-esa.js @@ -127,7 +127,19 @@ async function get_error_type() { return error_stack } +var TMP_HIGHLIGHT_MODE = null +var TMP_HIGHLIGHT_WIDTH = null + $(document).ready(() => { + // TODO: only temporary, remove once decided + // native dialog box to select highlight mode + while(!["thin", "normal", "bold", "wavy", "dotted"].includes(TMP_HIGHLIGHT_MODE)) { + TMP_HIGHLIGHT_MODE = prompt('Please select highlight mode: "thin", "normal" (default), "bold", "wavy", "dotted"', "normal") + } + while(isNaN(parseInt(TMP_HIGHLIGHT_WIDTH)) || TMP_HIGHLIGHT_WIDTH < 1) { + TMP_HIGHLIGHT_WIDTH = parseInt(prompt('Please select how many characters to highlight. 
Default is 8.', 8)) + } + MQM_TYPE = JSON.parse($('#mqm-type-payload').html()) // sliders are present only for ESA @@ -377,12 +389,23 @@ class MQMItemHandler { // remove underline from all mqm this.el_source.children(".mqm_char_src").css("text-decoration", "") + let highlight_width = Math.floor(TMP_HIGHLIGHT_WIDTH / 2) // set underline to the corresponding character and its neighbours - for (let range = 5; range > 0; range--) { + for (let range = highlight_width; range > 0; range--) { // extrapolate range between #111 and #ddd - let color = (Math.floor(range/5 * (0xd - 0x1))+0x1).toString(16) - for (let i = Math.max(0, src_char_i - range); i < Math.min(len_src, src_char_i + range); i++) { - this.el_source.children(`#source_char_${i}`).css("text-decoration", `underline 25% #${color}${color}${color} solid`) + let color = (Math.floor((range-1)/highlight_width * (0xd - 0x1))+0x1).toString(16) + for (let i = Math.max(0, src_char_i - range); i <= Math.min(len_src, src_char_i + range); i++) { + if (TMP_HIGHLIGHT_MODE == "bold") { + this.el_source.children(`#source_char_${i}`).css("text-decoration", `underline 25% #${color}${color}${color} solid`) + } else if (TMP_HIGHLIGHT_MODE == "wavy") { + this.el_source.children(`#source_char_${i}`).css("text-decoration", `underline 15% #${color}${color}${color} wavy`) + } else if (TMP_HIGHLIGHT_MODE == "dotted") { + this.el_source.children(`#source_char_${i}`).css("text-decoration", `underline 15% #${color}${color}${color} dotted`) + } else if (TMP_HIGHLIGHT_MODE == "normal") { + this.el_source.children(`#source_char_${i}`).css("text-decoration", `underline 15% #${color}${color}${color} solid`) + } else if (TMP_HIGHLIGHT_MODE == "thin") { + this.el_source.children(`#source_char_${i}`).css("text-decoration", `underline 5% #${color}${color}${color} solid`) + } } } }) @@ -443,10 +466,6 @@ class MQMItemHandler { // store currently displayed version this.el.find('input[name="mqm"]').val(JSON.stringify(this.mqm)); - // NOTE: do not automatically recompute - // should be in range [0, 100] - // this.el_slider.slider('value', this.current_mqm_score(true)) - // redraw this.el_target.children(".mqm_char").each((i, el) => { el = $(el) @@ -558,23 +577,6 @@ class MQMItemHandler { alert(`Please follow the tutorial instructions.\n(${this.text_target_orig.substring(0, 60)}...)`); return false } - // skip other messages in the tutorial - // if (this.tutorial) { - // return true - // } - - // if (this.mqm.some((x) => x["severity"] == "undecided")) { - // alert('There are some segments without severity (in blue). Click on them to change their severities.'); - // return false - // } - - // remove dialogs - // if (this.mqm.length == 0 && !confirm("There are no annotated text fragments. Are you sure you want to submit?")) { - // return false - // } - // if (MQM_TYPE == "ESA" && this.current_mqm_score(true) == Number.parseFloat(this.el.find("input[name='score']").val()) && !confirm("You did not change the original translation score. 
Are you sure you want to submit?")) { - // return false - // } return true; } From 8268137bf6c2756d9d4742f393b0273763d4c58d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vil=C3=A9m=20Zouhar?= Date: Mon, 17 Feb 2025 18:13:02 +0100 Subject: [PATCH 11/51] add tests for more or less than 100 items; add info about the number of loaded items --- .github/workflows/tests.yml | 2 +- EvalData/models/data_assessment.py | 1 + EvalData/models/direct_assessment.py | 1 + EvalData/models/direct_assessment_context.py | 1 + EvalData/models/direct_assessment_document.py | 3 +- EvalData/models/multi_modal_assessment.py | 1 + EvalData/models/pairwise_assessment.py | 3 +- .../models/pairwise_assessment_document.py | 3 +- .../special/example_gt100.scores.csv.expected | 110 ++++++++++++++++++ .../special/example_lt100.scores.csv.expected | 10 ++ .../tests/special/manifest_gt100.json | 14 +++ .../tests/special/manifest_lt100.json | 14 +++ .../tests/special/test_examples_gt100.sh | 37 ++++++ .../tests/special/test_examples_lt100.sh | 48 ++++++++ 14 files changed, 244 insertions(+), 4 deletions(-) create mode 100644 RegressionTests/tests/special/example_gt100.scores.csv.expected create mode 100644 RegressionTests/tests/special/example_lt100.scores.csv.expected create mode 100644 RegressionTests/tests/special/manifest_gt100.json create mode 100644 RegressionTests/tests/special/manifest_lt100.json create mode 100644 RegressionTests/tests/special/test_examples_gt100.sh create mode 100644 RegressionTests/tests/special/test_examples_lt100.sh diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 1fff83db..ee50a6fe 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -31,7 +31,7 @@ jobs: cat listing.txt 7z a -tzip regression-tests-appraise.zip @listing.txt - name: Publish outputs - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: regression-tests-appraise path: regression-tests-appraise.zip diff --git a/EvalData/models/data_assessment.py b/EvalData/models/data_assessment.py index 9eb9ea00..08b0397f 100644 --- a/EvalData/models/data_assessment.py +++ b/EvalData/models/data_assessment.py @@ -429,6 +429,7 @@ def import_from_json(cls, campaign, batch_user, batch_data, max_count): ) new_items.append(new_item) + LOGGER.info(f'The task has {len(new_items)} items') current_count += 1 # for new_item in new_items: diff --git a/EvalData/models/direct_assessment.py b/EvalData/models/direct_assessment.py index 2f2ada09..a605b6d2 100644 --- a/EvalData/models/direct_assessment.py +++ b/EvalData/models/direct_assessment.py @@ -311,6 +311,7 @@ def import_from_json(cls, campaign, batch_user, batch_data, max_count): ) new_items.append(new_item) + LOGGER.info(f'The task has {len(new_items)} items') current_count += 1 batch_meta.textpair_set.add(*new_items, bulk=False) batch_meta.save() diff --git a/EvalData/models/direct_assessment_context.py b/EvalData/models/direct_assessment_context.py index 988c59c6..cb0a581d 100644 --- a/EvalData/models/direct_assessment_context.py +++ b/EvalData/models/direct_assessment_context.py @@ -367,6 +367,7 @@ def import_from_json(cls, campaign, batch_user, batch_data, max_count): if item['isCompleteDocument']: doc_items += 1 + LOGGER.info(f'The task has {len(new_items)} items') current_count += 1 for new_item in new_items: diff --git a/EvalData/models/direct_assessment_document.py b/EvalData/models/direct_assessment_document.py index ab88b01e..38ccfb8a 100644 --- a/EvalData/models/direct_assessment_document.py +++ 
b/EvalData/models/direct_assessment_document.py @@ -461,7 +461,8 @@ def import_from_json(cls, campaign, batch_user, batch_data, max_count): new_items.append(new_item) if item['isCompleteDocument']: doc_items += 1 - + + LOGGER.info(f'The task has {len(new_items)} items') current_count += 1 for new_item in new_items: diff --git a/EvalData/models/multi_modal_assessment.py b/EvalData/models/multi_modal_assessment.py index 65bebc1b..42b609d8 100644 --- a/EvalData/models/multi_modal_assessment.py +++ b/EvalData/models/multi_modal_assessment.py @@ -348,6 +348,7 @@ def import_from_json(cls, campaign, batch_user, batch_data, max_count): ) new_items.append(new_item) + LOGGER.info(f'The task has {len(new_items)} items') current_count += 1 # for new_item in new_items: diff --git a/EvalData/models/pairwise_assessment.py b/EvalData/models/pairwise_assessment.py index 11934524..8ad8c987 100644 --- a/EvalData/models/pairwise_assessment.py +++ b/EvalData/models/pairwise_assessment.py @@ -345,7 +345,8 @@ def import_from_json(cls, campaign, batch_user, batch_data, max_count): contextRight=context_right, ) new_items.append(new_item) - + + LOGGER.info(f'The task has {len(new_items)} items') current_count += 1 # for new_item in new_items: diff --git a/EvalData/models/pairwise_assessment_document.py b/EvalData/models/pairwise_assessment_document.py index 26097a2e..69c71088 100644 --- a/EvalData/models/pairwise_assessment_document.py +++ b/EvalData/models/pairwise_assessment_document.py @@ -470,7 +470,8 @@ def import_from_json(cls, campaign, batch_user, batch_data, max_count): new_items.append(new_item) if item['isCompleteDocument']: doc_items += 1 - + + LOGGER.info(f'The task has {len(new_items)} items') current_count += 1 for new_item in new_items: diff --git a/RegressionTests/tests/special/example_gt100.scores.csv.expected b/RegressionTests/tests/special/example_gt100.scores.csv.expected new file mode 100644 index 00000000..b325c99d --- /dev/null +++ b/RegressionTests/tests/special/example_gt100.scores.csv.expected @@ -0,0 +1,110 @@ +engdeu9604,ende-tutorial1,1000000,TGT,eng,deu,1,ende-tutorial1,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,ende-tutorial1,1000001,TGT,eng,deu,2,ende-tutorial1,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,ende-tutorial1,1000002,TGT,eng,deu,3,ende-tutorial1,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,ende-tutorial2,1000003,TGT,eng,deu,4,ende-tutorial2,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,ende-tutorial2,1000004,TGT,eng,deu,5,ende-tutorial2,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,ende-tutorial2,1000005,TGT,eng,deu,6,ende-tutorial2,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Claude-3.5,706,TGT,eng,deu,7,test-en-speech_392RoIzR2Fs_001,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,ONLINE-B,778,TGT,eng,deu,8,test-en-speech_TBPP-za78BQ_000,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Llama3-70B,5,TGT,eng,deu,9,test-en-news_brisbanetimes.com.au.228963,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Llama3-70B,6,TGT,eng,deu,10,test-en-news_brisbanetimes.com.au.228963,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Llama3-70B,7,TGT,eng,deu,11,test-en-news_brisbanetimes.com.au.228963,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Llama3-70B,8,TGT,eng,deu,12,test-en-news_brisbanetimes.com.au.228963,False,"{'start_i': 0, 'end_i': 
50, 'severity': 'major'}" +engdeu9604,Llama3-70B,9,TGT,eng,deu,13,test-en-news_brisbanetimes.com.au.228963,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Dubformer,451,BAD,eng,deu,14,test-en-social_112122127346453600#bad,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Dubformer,452,BAD,eng,deu,15,test-en-social_112122127346453600#bad,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Dubformer,453,BAD,eng,deu,16,test-en-social_112122127346453600#bad,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Dubformer,454,BAD,eng,deu,17,test-en-social_112122127346453600#bad,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Dubformer,455,BAD,eng,deu,18,test-en-social_112122127346453600#bad,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Dubformer,456,BAD,eng,deu,19,test-en-social_112122127346453600#bad,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Dubformer,457,BAD,eng,deu,20,test-en-social_112122127346453600#bad,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Dubformer,458,BAD,eng,deu,21,test-en-social_112122127346453600#bad,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Dubformer,459,BAD,eng,deu,22,test-en-social_112122127346453600#bad,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Dubformer,460,BAD,eng,deu,23,test-en-social_112122127346453600#bad,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Claude-3.5,899,TGT,eng,deu,24,test-en-literary_forever_snow_chunk_2_words_986,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Claude-3.5,900,TGT,eng,deu,25,test-en-literary_forever_snow_chunk_2_words_986,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Claude-3.5,901,TGT,eng,deu,26,test-en-literary_forever_snow_chunk_2_words_986,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Claude-3.5,902,TGT,eng,deu,27,test-en-literary_forever_snow_chunk_2_words_986,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Claude-3.5,903,TGT,eng,deu,28,test-en-literary_forever_snow_chunk_2_words_986,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Claude-3.5,904,TGT,eng,deu,29,test-en-literary_forever_snow_chunk_2_words_986,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Claude-3.5,905,TGT,eng,deu,30,test-en-literary_forever_snow_chunk_2_words_986,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Claude-3.5,906,TGT,eng,deu,31,test-en-literary_forever_snow_chunk_2_words_986,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Claude-3.5,907,TGT,eng,deu,32,test-en-literary_forever_snow_chunk_2_words_986,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Claude-3.5,908,TGT,eng,deu,33,test-en-literary_forever_snow_chunk_2_words_986,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Dubformer,724,TGT,eng,deu,34,test-en-speech_6JeSS_CODZ0_000,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IOL-Research,689,TGT,eng,deu,35,test-en-speech_07FOJFFqOYc_002,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IOL-Research,373,BAD,eng,deu,36,test-en-social_112111346044907536#bad,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IOL-Research,374,BAD,eng,deu,37,test-en-social_112111346044907536#bad,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" 
+engdeu9604,Llama3-70B,789,TGT,eng,deu,38,test-en-speech_XwIQLLbD7SI_001,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IKUN-C,969,TGT,eng,deu,39,test-en-literary_the_other_side_stormfall_chunk_2_words_956,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IKUN-C,970,TGT,eng,deu,40,test-en-literary_the_other_side_stormfall_chunk_2_words_956,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IKUN-C,971,TGT,eng,deu,41,test-en-literary_the_other_side_stormfall_chunk_2_words_956,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IKUN-C,972,TGT,eng,deu,42,test-en-literary_the_other_side_stormfall_chunk_2_words_956,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IKUN-C,973,TGT,eng,deu,43,test-en-literary_the_other_side_stormfall_chunk_2_words_956,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IKUN-C,974,TGT,eng,deu,44,test-en-literary_the_other_side_stormfall_chunk_2_words_956,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IKUN-C,975,TGT,eng,deu,45,test-en-literary_the_other_side_stormfall_chunk_2_words_956,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IKUN-C,976,TGT,eng,deu,46,test-en-literary_the_other_side_stormfall_chunk_2_words_956,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IKUN-C,977,TGT,eng,deu,47,test-en-literary_the_other_side_stormfall_chunk_2_words_956,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IKUN-C,978,TGT,eng,deu,48,test-en-literary_the_other_side_stormfall_chunk_2_words_956,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IOL-Research,373,TGT,eng,deu,49,test-en-social_112111346044907536,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IOL-Research,374,TGT,eng,deu,50,test-en-social_112111346044907536,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IOL-Research,375,TGT,eng,deu,51,test-en-social_112111346044907536,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IOL-Research,376,TGT,eng,deu,52,test-en-social_112111346044907536,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IOL-Research,377,TGT,eng,deu,53,test-en-social_112111346044907536,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IOL-Research,378,TGT,eng,deu,54,test-en-social_112111346044907536,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IOL-Research,379,TGT,eng,deu,55,test-en-social_112111346044907536,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IOL-Research,380,TGT,eng,deu,56,test-en-social_112111346044907536,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IOL-Research,381,TGT,eng,deu,57,test-en-social_112111346044907536,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IOL-Research,382,TGT,eng,deu,58,test-en-social_112111346044907536,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Llama3-70B,19,TGT,eng,deu,59,test-en-news_economist.14223#incomplete,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Llama3-70B,20,TGT,eng,deu,60,test-en-news_economist.14223#incomplete,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Claude-3.5,42,TGT,eng,deu,61,test-en-news_newsweek.63908,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Claude-3.5,43,TGT,eng,deu,62,test-en-news_newsweek.63908,False,"{'start_i': 0, 'end_i': 50, 'severity': 
'major'}" +engdeu9604,Claude-3.5,44,TGT,eng,deu,63,test-en-news_newsweek.63908,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Claude-3.5,45,TGT,eng,deu,64,test-en-news_newsweek.63908,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Claude-3.5,46,TGT,eng,deu,65,test-en-news_newsweek.63908,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Claude-3.5,47,TGT,eng,deu,66,test-en-news_newsweek.63908,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Claude-3.5,48,TGT,eng,deu,67,test-en-news_newsweek.63908,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Claude-3.5,49,TGT,eng,deu,68,test-en-news_newsweek.63908,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Claude-3.5,50,TGT,eng,deu,69,test-en-news_newsweek.63908,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Claude-3.5,51,TGT,eng,deu,70,test-en-news_newsweek.63908,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IKUN-C,10,TGT,eng,deu,71,test-en-news_csmonitor.com.7750,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IKUN-C,11,TGT,eng,deu,72,test-en-news_csmonitor.com.7750,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IKUN-C,12,TGT,eng,deu,73,test-en-news_csmonitor.com.7750,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IKUN-C,13,TGT,eng,deu,74,test-en-news_csmonitor.com.7750,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IKUN-C,14,TGT,eng,deu,75,test-en-news_csmonitor.com.7750,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IKUN-C,15,TGT,eng,deu,76,test-en-news_csmonitor.com.7750,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IKUN-C,16,TGT,eng,deu,77,test-en-news_csmonitor.com.7750,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IKUN-C,17,TGT,eng,deu,78,test-en-news_csmonitor.com.7750,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IKUN-C,18,TGT,eng,deu,79,test-en-news_csmonitor.com.7750,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IOL-Research,424,TGT,eng,deu,80,test-en-social_112121157211696272,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IOL-Research,425,TGT,eng,deu,81,test-en-social_112121157211696272,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IOL-Research,426,TGT,eng,deu,82,test-en-social_112121157211696272,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IOL-Research,427,TGT,eng,deu,83,test-en-social_112121157211696272,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IOL-Research,428,TGT,eng,deu,84,test-en-social_112121157211696272,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IOL-Research,429,TGT,eng,deu,85,test-en-social_112121157211696272,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IOL-Research,430,TGT,eng,deu,86,test-en-social_112121157211696272,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IOL-Research,431,TGT,eng,deu,87,test-en-social_112121157211696272,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IOL-Research,432,TGT,eng,deu,88,test-en-social_112121157211696272,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IOL-Research,433,TGT,eng,deu,89,test-en-social_112121157211696272,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" 
+engdeu9604,Dubformer,451,TGT,eng,deu,90,test-en-social_112122127346453600,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Dubformer,452,TGT,eng,deu,91,test-en-social_112122127346453600,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Dubformer,453,TGT,eng,deu,92,test-en-social_112122127346453600,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Dubformer,454,TGT,eng,deu,93,test-en-social_112122127346453600,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Dubformer,455,TGT,eng,deu,94,test-en-social_112122127346453600,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Dubformer,456,TGT,eng,deu,95,test-en-social_112122127346453600,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Dubformer,457,TGT,eng,deu,96,test-en-social_112122127346453600,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Dubformer,458,TGT,eng,deu,97,test-en-social_112122127346453600,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Dubformer,459,TGT,eng,deu,98,test-en-social_112122127346453600,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Dubformer,460,TGT,eng,deu,99,test-en-social_112122127346453600,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IOL-Research,767,TGT,eng,deu,100,test-en-speech_QaueRRYecxo_000,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Dubformer,453,TGT,eng,deu,101,test-en-social_112122127346453600,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Dubformer,454,TGT,eng,deu,102,test-en-social_112122127346453600,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Dubformer,455,TGT,eng,deu,103,test-en-social_112122127346453600,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Dubformer,456,TGT,eng,deu,104,test-en-social_112122127346453600,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Dubformer,457,TGT,eng,deu,105,test-en-social_112122127346453600,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Dubformer,458,TGT,eng,deu,106,test-en-social_112122127346453600,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Dubformer,459,TGT,eng,deu,107,test-en-social_112122127346453600,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Dubformer,460,TGT,eng,deu,108,test-en-social_112122127346453600,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Dubformer,461,TGT,eng,deu,109,test-en-social_112122127346453600,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IOL-Research,768,TGT,eng,deu,110,test-en-speech_QaueRRYecxo_000,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" diff --git a/RegressionTests/tests/special/example_lt100.scores.csv.expected b/RegressionTests/tests/special/example_lt100.scores.csv.expected new file mode 100644 index 00000000..cc6b9db4 --- /dev/null +++ b/RegressionTests/tests/special/example_lt100.scores.csv.expected @@ -0,0 +1,10 @@ +engdeu9704,ende-tutorial1,1000000,TGT,eng,deu,1,ende-tutorial1,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9704,ende-tutorial1,1000001,TGT,eng,deu,2,ende-tutorial1,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9704,ende-tutorial1,1000002,TGT,eng,deu,3,ende-tutorial1,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9704,ende-tutorial2,1000003,TGT,eng,deu,4,ende-tutorial2,False,"{'start_i': 0, 'end_i': 50, 'severity': 
'major'}" +engdeu9704,ende-tutorial2,1000004,TGT,eng,deu,5,ende-tutorial2,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9704,ende-tutorial2,1000005,TGT,eng,deu,6,ende-tutorial2,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9704,Claude-3.5,706,TGT,eng,deu,7,test-en-speech_392RoIzR2Fs_001,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9704,ONLINE-B,778,TGT,eng,deu,8,test-en-speech_TBPP-za78BQ_000,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9704,Llama3-70B,5,TGT,eng,deu,9,test-en-news_brisbanetimes.com.au.228963,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9704,Llama3-70B,6,TGT,eng,deu,10,test-en-news_brisbanetimes.com.au.228963,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" diff --git a/RegressionTests/tests/special/manifest_gt100.json b/RegressionTests/tests/special/manifest_gt100.json new file mode 100644 index 00000000..41f301b8 --- /dev/null +++ b/RegressionTests/tests/special/manifest_gt100.json @@ -0,0 +1,14 @@ +{ + "CAMPAIGN_URL": "http://127.0.0.1:8000/dashboard/sso/", + "CAMPAIGN_NAME": "example15esaGT100", + "CAMPAIGN_KEY": "example15esaGT100", + "CAMPAIGN_NO": 150, + "REDUNDANCY": 2, + + "TASKS_TO_ANNOTATORS": [ + ["eng", "deu", "uniform", 4, 2] + ], + + "TASK_TYPE": "Document", + "TASK_OPTIONS": "ESA" +} diff --git a/RegressionTests/tests/special/manifest_lt100.json b/RegressionTests/tests/special/manifest_lt100.json new file mode 100644 index 00000000..3ae9a6ba --- /dev/null +++ b/RegressionTests/tests/special/manifest_lt100.json @@ -0,0 +1,14 @@ +{ + "CAMPAIGN_URL": "http://127.0.0.1:8000/dashboard/sso/", + "CAMPAIGN_NAME": "example15esaLT100", + "CAMPAIGN_KEY": "example15esaLT100", + "CAMPAIGN_NO": 151, + "REDUNDANCY": 2, + + "TASKS_TO_ANNOTATORS": [ + ["eng", "deu", "uniform", 4, 2] + ], + + "TASK_TYPE": "Document", + "TASK_OPTIONS": "ESA" +} diff --git a/RegressionTests/tests/special/test_examples_gt100.sh b/RegressionTests/tests/special/test_examples_gt100.sh new file mode 100644 index 00000000..fdfd618d --- /dev/null +++ b/RegressionTests/tests/special/test_examples_gt100.sh @@ -0,0 +1,37 @@ +#!/usr/bin/env bash -x + +# Exit on error +set -eo pipefail + +prefix=example_gt100 + +mkdir -p Batches + +# duplicate the last 10 examples in the list but increase "itemID" by one +python3 < ${prefix}.scores.csv +diff --strip-trailing-cr ${prefix}.scores.csv ${prefix}.scores.csv.expected + +# Exit with success code +exit $EXIT_CODE_SUCCESS diff --git a/RegressionTests/tests/special/test_examples_lt100.sh b/RegressionTests/tests/special/test_examples_lt100.sh new file mode 100644 index 00000000..c13e7426 --- /dev/null +++ b/RegressionTests/tests/special/test_examples_lt100.sh @@ -0,0 +1,48 @@ +#!/usr/bin/env bash -x + +# Exit on error +set -eo pipefail + +prefix=example_lt100 + +mkdir -p Batches + +# $APPRAISE_EXAMPLES/MQM+ESA/batches_esa.json is a list of dictionaries, each containing among other fields a field "items", which is a list +# read $APPRAISE_EXAMPLES/MQM+ESA/batches_esa.json and keep only the first 10 examples in each "items" list +python3 < ${prefix}.scores.csv +diff --strip-trailing-cr ${prefix}.scores.csv ${prefix}.scores.csv.expected + +# Make two more annotations, should not create any new entries in the scores file +for score in $( seq 1 3 ); do + $APPRAISE_MANAGE MakeAnnotation engdeu9704:17d9e109 Document $score --mqm '[{"start_i": 0, "end_i": 50, "severity": "major"}]' +done + +# the output should remain the same +$APPRAISE_MANAGE ExportSystemScoresToCSV 
example15esaLT100 | sed "s/, /| /g" | cut -f-10 -d, | sed "s/| /, /g" > ${prefix}.scores2.csv +diff --strip-trailing-cr ${prefix}.scores2.csv ${prefix}.scores.csv.expected + + +# Exit with success code +exit $EXIT_CODE_SUCCESS From d97c1b2d8d1297552071665998cae49f8d2a72f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vil=C3=A9m=20Zouhar?= Date: Mon, 17 Feb 2025 18:31:31 +0100 Subject: [PATCH 12/51] set src-tgt char alignment as discussed --- .../js/direct-assessment-document-mqm-esa.js | 26 ++----------------- 1 file changed, 2 insertions(+), 24 deletions(-) diff --git a/EvalView/static/EvalView/js/direct-assessment-document-mqm-esa.js b/EvalView/static/EvalView/js/direct-assessment-document-mqm-esa.js index f744dfd8..3a7e9e77 100644 --- a/EvalView/static/EvalView/js/direct-assessment-document-mqm-esa.js +++ b/EvalView/static/EvalView/js/direct-assessment-document-mqm-esa.js @@ -127,19 +127,7 @@ async function get_error_type() { return error_stack } -var TMP_HIGHLIGHT_MODE = null -var TMP_HIGHLIGHT_WIDTH = null - $(document).ready(() => { - // TODO: only temporary, remove once decided - // native dialog box to select highlight mode - while(!["thin", "normal", "bold", "wavy", "dotted"].includes(TMP_HIGHLIGHT_MODE)) { - TMP_HIGHLIGHT_MODE = prompt('Please select highlight mode: "thin", "normal" (default), "bold", "wavy", "dotted"', "normal") - } - while(isNaN(parseInt(TMP_HIGHLIGHT_WIDTH)) || TMP_HIGHLIGHT_WIDTH < 1) { - TMP_HIGHLIGHT_WIDTH = parseInt(prompt('Please select how many characters to highlight. Default is 8.', 8)) - } - MQM_TYPE = JSON.parse($('#mqm-type-payload').html()) // sliders are present only for ESA @@ -389,23 +377,13 @@ class MQMItemHandler { // remove underline from all mqm this.el_source.children(".mqm_char_src").css("text-decoration", "") - let highlight_width = Math.floor(TMP_HIGHLIGHT_WIDTH / 2) + let highlight_width = Math.floor(16 / 2) // set underline to the corresponding character and its neighbours for (let range = highlight_width; range > 0; range--) { // extrapolate range between #111 and #ddd let color = (Math.floor((range-1)/highlight_width * (0xd - 0x1))+0x1).toString(16) for (let i = Math.max(0, src_char_i - range); i <= Math.min(len_src, src_char_i + range); i++) { - if (TMP_HIGHLIGHT_MODE == "bold") { - this.el_source.children(`#source_char_${i}`).css("text-decoration", `underline 25% #${color}${color}${color} solid`) - } else if (TMP_HIGHLIGHT_MODE == "wavy") { - this.el_source.children(`#source_char_${i}`).css("text-decoration", `underline 15% #${color}${color}${color} wavy`) - } else if (TMP_HIGHLIGHT_MODE == "dotted") { - this.el_source.children(`#source_char_${i}`).css("text-decoration", `underline 15% #${color}${color}${color} dotted`) - } else if (TMP_HIGHLIGHT_MODE == "normal") { - this.el_source.children(`#source_char_${i}`).css("text-decoration", `underline 15% #${color}${color}${color} solid`) - } else if (TMP_HIGHLIGHT_MODE == "thin") { - this.el_source.children(`#source_char_${i}`).css("text-decoration", `underline 5% #${color}${color}${color} solid`) - } + this.el_source.children(`#source_char_${i}`).css("text-decoration", `underline 15% #${color}${color}${color} solid`) } } }) From bdd3840ec2386bffa0fecc3ea448535e5c54b1ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vil=C3=A9m=20Zouhar?= Date: Mon, 17 Feb 2025 18:33:42 +0100 Subject: [PATCH 13/51] remove unused/duplicate files --- scripts/create_iwslt22_tasks.py | 764 ----------------------- scripts/create_wmt19_tasks.py | 546 ----------------- scripts/create_wmt21_tasks.py | 709 
--------------------- scripts/create_wmt22_tasks.py | 1023 ------------------------------- 4 files changed, 3042 deletions(-) delete mode 100644 scripts/create_iwslt22_tasks.py delete mode 100644 scripts/create_wmt19_tasks.py delete mode 100644 scripts/create_wmt21_tasks.py delete mode 100644 scripts/create_wmt22_tasks.py diff --git a/scripts/create_iwslt22_tasks.py b/scripts/create_iwslt22_tasks.py deleted file mode 100644 index fb26f639..00000000 --- a/scripts/create_iwslt22_tasks.py +++ /dev/null @@ -1,764 +0,0 @@ -# pylint: disable=C0103,C0111,C0330,E1101 -import sys -from collections import OrderedDict -from copy import deepcopy -from glob import iglob -from json import dumps as json_dumps -from os.path import basename -from os.path import join -from random import choice -from random import randint -from random import seed -from random import shuffle -from typing import Any -from typing import Dict -from typing import List -from typing import Text -from typing import Tuple - -from lxml import etree - - -MAX_TASK_SIZE = 100 # No support for tasks over 100 items -MAX_DOC_LENGTH = 70 # We do not support documents longer than 70 segments - -MISSING_TRANSLATION_MESSAGE = ("NO TRANSLATION AVAILABLE",) -DEFAULT_TRANSLATOR = "DEFAULT" -# If False, documents with control items will be very last ones in each batch -SHUFFLE_DOCS_WITH_CONTROL_ITEMS = True -# If True, add references as additional system outputs -INCLUDE_REFERENCES_AS_SYSTEMS = True -# If True, documents may be oversampled to form the last batch -USE_ALL_DOCUMENTS_AND_ALL_SYSTEMS = True -REFERENCE_AS_SYSTEM_PREFIX = 'translator-' - - -def unwrap_xml( - xml_file, - missing_message=MISSING_TRANSLATION_MESSAGE, - encoding='utf-8', -): - """ - Unwraps an xml file in WMT format, producing source and (if present) reference files - - :param xml_file: The xml file (or fd) - :param missing_message: The message to insert when no reference - - :returns: src_lang, src_lines, ref_lang, ref_lines, hyp_lang, hyp_lines - - ref_lines maps translator to document to tuples of segment id and line text - hyp_lines maps system to document to tuples of segment id and line text - - ref_lang and hyp_lang may be None, and then their lines are empty - note: a single language is assumed for each of sources, refs and hyps - - This function has been extracted from - https://github.com/wmt-conference/wmt-format-tools/wmtformat/unwrap.py with - some modifications - """ - tree = etree.parse(xml_file) - - # Find and check the documents (src, ref, hyp) - src_langs, ref_langs, hyp_langs, translators, systems = ( - set(), - set(), - set(), - set(), - set(), - ) - - for src_doc in tree.getroot().findall(".//src"): - src_langs.add(src_doc.get("lang")) - - for ref_doc in tree.getroot().findall(".//ref"): - ref_langs.add(ref_doc.get("lang")) - translator = ref_doc.get("translator") - if translator: - translators.add(translator) - - for hyp_doc in tree.getroot().findall(".//hyp"): - hyp_langs.add(hyp_doc.get("lang")) - systems.add(hyp_doc.get("system")) - - if len(src_langs) > 1: - raise RuntimeError("Multiple source languages found") - - if len(src_langs) == 0: - raise RuntimeError("No source languages found") - - src_lang = src_langs.pop() - src_docs = OrderedDict() - - if len(ref_langs) > 1: - raise RuntimeError("Multiple reference languages found") - - translators = list(translators) - if len(ref_langs) > 0: - if len(translators) == 0: - print("No translator identifiers found") - translators.append(DEFAULT_TRANSLATOR) - ref_lang = ref_langs.pop() - ref_docs = 
OrderedDict( - (translator, OrderedDict()) for translator in translators - ) - else: - print("No references found") - ref_lang = None - ref_docs = OrderedDict() - - if len(hyp_langs) > 1: - raise RuntimeError("Multiple hypothesis languages found") - - systems = list(systems) - if len(hyp_langs) > 0: - hyp_docs = OrderedDict((system, OrderedDict()) for system in systems) - hyp_lang = hyp_langs.pop() - else: - hyp_docs = OrderedDict() - hyp_lang = None - - # Extract text - src_sent_count, doc_count = 0, 0 - for doc in tree.getroot().findall(".//doc"): - doc_id = doc.get("id") - src = [] - if "testsuite" in doc.attrib: - continue - doc_count += 1 - src_sents = {int(seg.get("id")): seg.text for seg in doc.findall(".//src//seg")} - - def get_sents(doc): - return { - int(seg.get("id")): seg.text if seg.text else "" - for seg in doc.findall(f".//seg") - } - - if ref_lang: - _ref_docs = doc.findall(".//ref") - trans_to_ref = {} - - # If no translator identifiers, we just read one reference (if any) - # If there are translator identifiers, we add a reference for each translator - if len(translators) == 1 and DEFAULT_TRANSLATOR in translators: - if len(_ref_docs): - trans_to_ref[DEFAULT_TRANSLATOR] = get_ref_sents(_ref_docs[0]) - else: - trans_to_ref[DEFAULT_TRANSLATOR] = {} - else: - trans_to_ref = { - ref_doc.get("translator"): get_sents(ref_doc) - for ref_doc in _ref_docs - } - - if hyp_lang: - _hyp_docs = doc.findall(".//hyp") - system_to_ref = { - hyp_doc.get("system"): get_sents(hyp_doc) for hyp_doc in _hyp_docs - } - - for seg_id in sorted(src_sents.keys()): - src.append([seg_id, src_sents[seg_id]]) - src_sent_count += 1 - if ref_lang: - for translator in translators: - if doc_id not in ref_docs[translator]: - ref_docs[translator][doc_id] = [] - - # _ref_text = trans_to_ref.get(translator, {translator: {}}).get( - _ref_text = trans_to_ref[translator].get(seg_id, missing_message) - ref_docs[translator][doc_id].append((seg_id, _ref_text)) - - if _ref_text == MISSING_TRANSLATION_MESSAGE: - print( - f'Warning: missing reference for translator {translator}, ' - f'document {doc_id}, segment {seg_id}' - ) - if hyp_lang: - for system in systems: - if doc_id not in hyp_docs[system]: - hyp_docs[system][doc_id] = [] - - # _hyp_text = system_to_ref.get(system, {system: {}}).get( - _hyp_text = system_to_ref[system].get(seg_id, missing_message) - hyp_docs[system][doc_id].append((seg_id, _hyp_text)) - - if _hyp_text == MISSING_TRANSLATION_MESSAGE: - print( - f'Warning: missing translation from {system}, ' - f'document {doc_id}, segment {seg_id}' - ) - - src_docs[doc_id] = src - - print( - f"Extracted {doc_count} document(s) containing {src_sent_count} sentences in {src_lang}" - ) - - return src_lang, src_docs, ref_lang, ref_docs, hyp_lang, hyp_docs - - -def _create_bad_ref(seg_text: str, ref_text: str, character_based: bool = False) -> str: - """ - Creates bad reference for given text. - - Segment length (a, b] to phrase length (excluding a, including b) - mapping defined as follows: - ( 0, 1] : 1 - ( 1, 5] : 2 - ( 5, 8] : 3 - ( 8, 15] : 4 - (15, 20] : 5 - (20, max] : 6 - - For character-based languages, which do not support tokenisation - by whitespace, the resulting phrase length will be doubled, and - is interpreted as a character length. 
- """ - seg_data = seg_text.split(' ') - ref_data = ref_text.split(' ')[1:] # Don't use the first word - - if character_based: - seg_data = [x for x in seg_text] - ref_data = [x for x in ref_text] - - seg_len = len(seg_data) - ref_len = len(ref_data) - - # Determine length of bad phrase, relative to segment length. - _seg_to_bad_mapping = { - (None, 1): 2, - (1, 5): 2, - (5, 8): 3, - (8, 15): 4, - (15, 20): 5, - (20, None): 6, - } - - bad_len = 0 - for seg_pair in _seg_to_bad_mapping: - left, right = seg_pair - - # seg_len == right; left edge case - if not left: - if seg_len == right: - bad_len = _seg_to_bad_mapping[seg_pair] - break - - # left < seg_len; right edge case - elif not right: - if left < seg_len: - bad_len = _seg_to_bad_mapping[seg_pair] - break - - # left < seg_len <= right; middle cases - elif left < seg_len <= right: - bad_len = _seg_to_bad_mapping[seg_pair] - break - - # Double length of bad phrase for character-based languages. - if character_based: - bad_len = 2 * bad_len - - # Determine random replacement position. For segments longer than - # (bad_len + 1), we enforce that this cannot be sentence initial - # or final, so positions 0 and (seg_len - bad_len -1) are invalid - # and we use an embedded bad_pos in [1, (seg_len - bad_len - 1)]. - # This happens for all seg_len > 3. - bad_pos = 1 - _xs = max(1, seg_len - bad_len - 1) - bad_pos = choice([x + 1 for x in range(_xs)]) - - ref_pos = 1 - if ref_len - bad_len > 0: - _xs = max(1, ref_len - bad_len - 1) - ref_pos = choice(range(_xs)) - - bad_data = ( - seg_data[:bad_pos] - + ref_data[ref_pos : ref_pos + bad_len] - + seg_data[bad_pos + bad_len :] - ) - bad_text = ' '.join(bad_data) - if character_based: - bad_text = ''.join(bad_data) - - # print(seg_text) - # print(bad_text) - # print('------------') - return bad_text - - -def create_bad_refs( - docs: Dict[str, List[Tuple[str, str]]], - refs: Dict[str, List[Tuple[str, str]]], - character_based: bool = False, -) -> Dict[str, List[Tuple[str, str]]]: - """ - Creates bad references for given documents. - - For each segment in the given documents, this creates a so-called - ``bad reference'' which is constructed by replacing an embedded - phrase p with a randomly placed phrase p' of the same length, - taken from a different segment contained in refs. The length of - the phrase is relative to the full segment length. - - See _create_bad_ref() definition for length mapping details. - """ - # Create mapping from f'{doc_id}_{seg_id}' to reference text. - all_refs = {} - for curr_doc_id, curr_doc in refs.items(): - for curr_seg_id, curr_ref_text in curr_doc: - all_refs[f'{curr_doc_id}_{curr_seg_id}'] = curr_ref_text - - # Create list of f'{doc_id}_{seg_id}' ids, to be used for random - # choice later when we want to identify a reference to work with. - all_keys = list(all_refs.keys()) - - # Iterate through documents and create bad references. - bad_docs: Dict[str, List[Tuple[str, str]]] = OrderedDict() - for curr_doc_id, curr_doc in docs.items(): - if not curr_doc_id in bad_docs: - bad_docs[curr_doc_id] = [] - - print(f'doc_id: {curr_doc_id},\tdoc_len: {len(curr_doc)}') - for curr_seg in curr_doc: - curr_seg_id, curr_seg_text = curr_seg - - # Bad reference id may not be identical to current id. - bad_id = choice(all_keys) - while bad_id == f'{curr_doc_id}_{curr_seg_id}': - bad_id = choice(all_keys) - - curr_bad_text = _create_bad_ref( - curr_seg_text, - all_refs[bad_id], - character_based=character_based, - ) - - # Ensure that keys can be reused. 
- all_keys.append(bad_id) - - bad_docs[curr_doc_id].append((curr_seg_id, curr_bad_text)) - - return bad_docs - - -if __name__ == "__main__": - if len(sys.argv) < 8: - print('Example usage:') - print( - f' {sys.argv[0]} newstest2021.en-de.all.xml batches.en-de enu deu 50 True False' - ) - exit() - - XML_FILE = sys.argv[1] # Path to .xml file with sources, references and outputs - OUT_NAME = sys.argv[2] # Prefix for .csv and .json output files - SRC_LANG = sys.argv[3] # Code for source language, e.g. eng - TGT_LANG = sys.argv[4] # Code for target language, e.g. deu - TASK_MAX = int(sys.argv[5]) # Maximum number of tasks - CONTROLS = sys.argv[6].lower() not in ['', '0', 'false', 'off'] # Generate QC items - CHARLANG = sys.argv[7].lower() in ['1', 'true', 'on'] # Character-based - print(f'Character based={CHARLANG}') - - ENC = 'utf-8' - - RND_SEED = 1234567 - # RND_SEED = 11111 - seed(RND_SEED) - - print(f'Quality control={CONTROLS}') - if CONTROLS: - REQUIRED_SEGS = 92 - else: - REQUIRED_SEGS = 100 - print(f'Setting REQUIRED_SEGS={REQUIRED_SEGS}') - - SYS_DOCS: Dict[str, Dict[str, List[Tuple[str, str]]]] = OrderedDict() - BAD_DOCS: Dict[str, Dict[str, List[Tuple[str, str]]]] = OrderedDict() - print(f'Loading docs from {XML_FILE}') - src_lang, SRC_DOCS, ref_lang, REF_DOCS, hyp_lang, SYS_DOCS = unwrap_xml( - XML_FILE, encoding=ENC - ) - - # This reference will be used for generating BAD items - REF_ID = sorted(list(REF_DOCS.keys()))[0] - print(f'Using reference "{REF_ID}"') - - # Add references as additional system outputs - if INCLUDE_REFERENCES_AS_SYSTEMS: - for ref_id in sorted(list(REF_DOCS.keys())): - sys_id = REFERENCE_AS_SYSTEM_PREFIX + ref_id - print(f'Adding reference "{ref_id}" as system output "{sys_id}"') - SYS_DOCS[sys_id] = REF_DOCS[ref_id] - - # List of system names that can be iterated deterministically - SYS_IDS = sorted(list(SYS_DOCS.keys())) - print("SYS IDS size:", len(SYS_IDS)) - - for sys_id in SYS_IDS: - print(f'Generating bad references for {sys_id}') - BAD_DOCS[sys_id] = create_bad_refs( - SYS_DOCS[sys_id], REF_DOCS[REF_ID], character_based=CHARLANG - ) - - # pylint: disable-msg=invalid-name - some_sys_id = choice(SYS_IDS) - some_doc_id = choice(sorted(list(SYS_DOCS[some_sys_id].keys()))) - some_sys_text = SYS_DOCS[some_sys_id][some_doc_id] - some_bad_text = BAD_DOCS[some_sys_id][some_doc_id] - print("Example:", some_sys_id, some_doc_id) - - for _s, _b in zip(some_sys_text, some_bad_text): - print(_s) - print(_b) - print('---') - - DOC_STATS: Dict[int, List[Tuple[int, str, str]]] = OrderedDict() - for sys_id in SYS_IDS: - for doc_id in SYS_DOCS[sys_id].keys(): - doc_len = len(SYS_DOCS[sys_id][doc_id]) - - # We do not support documents longer than 70 segments. - if doc_len > MAX_DOC_LENGTH: - print("!!! DOCUMENT TOO LONG:", doc_id) - continue - - if not doc_len in DOC_STATS.keys(): - DOC_STATS[doc_len] = [] - DOC_STATS[doc_len].append((doc_len, doc_id, sys_id)) - - # Randomise system order - for doc_len in DOC_STATS: - shuffle(DOC_STATS[doc_len]) - - print("Doc. 
stats (doc.len/count):", DOC_STATS.keys()) - total_docs = 0 - total_sys = set() - for doc_len in DOC_STATS.keys(): - print(f' {doc_len}:\t{len(DOC_STATS[doc_len])}') - total_docs += len(DOC_STATS[doc_len]) - for x in DOC_STATS[doc_len]: - total_sys.add(x[2]) - print("total docs:", total_docs) - print("total sys:", total_sys) - - all_systems = list(total_sys) - sampled_tasks: List[Tuple[Tuple[int, str, str], ...]] = [] - CURR_LEN = 0 - CURR_SYS = 0 - curr_task: List[Tuple[int, str, str]] = [] - DOC_STATS_COPY = deepcopy(DOC_STATS) - last_task = False - while DOC_STATS.keys(): - ALL_KEYS = sorted(list(DOC_STATS.keys())) - # Maximum allowed length of a document to not exceed 100 segments in this task - max_delta = REQUIRED_SEGS - CURR_LEN - valid_keys = [x for x in ALL_KEYS if x <= max_delta] - - if not valid_keys: - print(" #segments in current task:", CURR_LEN) - for _doc in curr_task: - print(" ", _doc) - print('------') - sampled_tasks.append(tuple(curr_task)) - CURR_LEN = 0 - curr_task = [] - if last_task: # Stop if this was the last task with - break - continue - - # Take the document that fill in the allowed size perfectly, or random - if max_delta in valid_keys: - curr_key = max_delta - else: - curr_key = choice(valid_keys) - - CURR_LEN += curr_key - curr_val = DOC_STATS[curr_key].pop(0) # This takes a random system. - # print(' ... selected ', curr_val) - # print(' .. left systems', sum( len(DOC_STATS[k]) for k in DOC_STATS )) - - # Below code would pick systems one after the other - # curr_val = None - # for iter_val in DOC_STATS[curr_key]: - # if iter_val[2] == all_systems[CURR_SYS]: - # curr_val = iter_val - # DOC_STATS[curr_key].remove(iter_val) - # break - - # if not curr_val: - # curr_val = DOC_STATS[curr_key].pop(0) - # CURR_SYS = all_systems.index(curr_val[2]) - # CURR_SYS = (CURR_SYS + 1) % len(all_systems) - - curr_task.append(curr_val) - if not DOC_STATS[curr_key]: - DOC_STATS.pop(curr_key) - - # If there are some documents left that cannot form a full task with - # 100 segments, take random documents to create the last task. - # This ensures that all documents have been used at least once. 
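-        # (Illustratively: if the pool of unused documents runs out while the
-        # current task holds, say, only 40 of the required segments, documents
-        # are re-drawn from a copy of the original pool so that this final
-        # task is also filled up to the required size.)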
- if ( - USE_ALL_DOCUMENTS_AND_ALL_SYSTEMS - and len(DOC_STATS) == 0 - and len(curr_task) > 0 - ): - DOC_STATS = DOC_STATS_COPY - last_task = True - print('Creating last batch with padded documents') - - # print("------------") - # print("Left docs:") - # print(DOC_STATS) - # print("------------") - - # Print documents per system - _all_tasks = [] - for _tup in sampled_tasks: - _all_tasks += list(_tup) - _docs_by_sys: Dict[str, Any] = {} - for (_, docid, sysid) in _all_tasks: - if sysid not in _docs_by_sys: - _docs_by_sys[sysid] = [] - _docs_by_sys[sysid].append(docid) - for i, sysid in enumerate(_docs_by_sys): - print(i, sysid) - for j, docid in enumerate(sorted(_docs_by_sys[sysid])): - print(" ", j, docid) - - # Shuffle order of tasks - shuffle(sampled_tasks) - print("Total number of tasks:", len(sampled_tasks)) - - padded_tasks: List[Tuple[Tuple[int, str, str], ...]] = [] - for tid, task in enumerate(sampled_tasks): - task_docs = len(task) - task_len = sum([x[0] for x in task]) - print(f'task_len: {task_len}') - if task_len > MAX_TASK_SIZE: - raise NotImplementedError( - 'No support for tasks >{0} items!'.format(MAX_TASK_SIZE) - ) - - elif task_len < MAX_TASK_SIZE: - pad_size = MAX_TASK_SIZE - task_len - pad_data: List[Tuple[int, str, str]] = list(task) - pad_pos = 0 - while pad_size > 0: - print(f'pad_size: {pad_size}') - print(f'pad_pos: {pad_pos}') - pad_data.append(tuple(list(pad_data[pad_pos]) + [True])) # type: ignore - print(pad_data[-1]) - pad_size -= pad_data[-1][0] - pad_pos = (pad_pos + 1) % task_docs - if pad_size < 0: - print(f'pad_size: {pad_size}') - print(f'pad_pos: {pad_pos}') - - last_doc: Tuple[int, str, str] = pad_data[-1] - print(last_doc[0], '-->', last_doc[0] + pad_size) - fixed_doc = (last_doc[0] + pad_size, *last_doc[1:]) - pad_data[-1] = fixed_doc - print(pad_data[-1][0]) - padded_tasks.append(tuple(pad_data)) - print("Padded tasks:") - for _pad in padded_tasks[-1]: - print(" ", _pad) - - else: - print(f'WARNING: no control items in task no. {tid}') - # raise NotImplementedError('Needs isControl=True update!') - padded_tasks.append(tuple(task)) # TODO: does this ever occur? 
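-
-    # Sketch of the CSV rows emitted below (values are illustrative):
-    #   task_id,doc_len,doc_id,sys_id,isControl
-    #   e.g. "3,12,some-doc,some-system,False"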
- - csv_data = [] - task_id = 0 - for task in padded_tasks: - task_id += 1 - task_len = sum([x[0] for x in task]) - print(f'>>> task_len: {task_len}') - - for _doc in task: - _data = [str(task_id)] - for x in _doc: # type: ignore - _data.append(str(x)) - - if _data[-1] != 'True': - _data.append('False') # isControl=False - print('>>> ', ' '.join(_data)) - csv_data.append(','.join(_data)) - - with open(f'{OUT_NAME}.csv', mode='w') as _file: - for csv_line in csv_data: - _file.write(csv_line) - _file.write('\n') - - json_data = [] - batch_id = 0 - for task in padded_tasks[:TASK_MAX]: - # Remember, batch numbers are one-based - task_data = OrderedDict( - { - 'batchNo': batch_id + 1, - 'batchSize': 100, - 'sourceLanguage': SRC_LANG, - 'targetLanguage': TGT_LANG, - 'requiredAnnotations': 1, - 'randomSeed': RND_SEED, - } - ) - - source_id = basename(XML_FILE) - - items_data: List[List[Dict[str, Any]]] = [] # Keeps items grouped into document - _item = 0 - doc_counter = 0 - for doc_data in task: - items_data.append([]) # Add a new bucket for items from this documents - has_control_item = False - - doc_len, doc_id, sys_id, *rest = doc_data # type: ignore - - isControl = rest is not None and rest - - target_id = sys_id - - _src = {} - _ref = {} - _bad = {} - _tgt = {} - - for item_id, item_src in SRC_DOCS[doc_id]: - seg_id = f'{doc_id}_{item_id}' - _src[seg_id] = item_src - - for item_id, item_ref in REF_DOCS[REF_ID][doc_id]: - seg_id = f'{doc_id}_{item_id}' - _ref[seg_id] = item_ref - - for item_id, item_bad in BAD_DOCS[sys_id][doc_id]: - seg_id = f'{doc_id}_{item_id}' - _bad[seg_id] = item_bad - - for item_id, item_tgt in SYS_DOCS[sys_id][doc_id]: - seg_id = f'{doc_id}_{item_id}' - _tgt[seg_id] = item_tgt - - seg_counter = 0 - context_src: List[Text] = [] - context_ref: List[Text] = [] - context_bad: List[Text] = [] - context_tgt: List[Text] = [] - for seg_id in _src: - if seg_counter >= doc_len: # Padding tasks are shorter! 
- break - item_src = _src[seg_id] - item_ref = _ref[seg_id] - item_bad = _bad[seg_id] - item_tgt = _tgt[seg_id] - - target_text = item_tgt - target_type = 'TGT' - - # Do not generate any BAD items if QC is disabled - if CONTROLS and isControl: - randomCoinFlip = choice( - [ - False, - False, - False, - True, - True, - True, - True, - True, - True, - True, - ] # 7:3 chance - ) - if randomCoinFlip: - target_text = item_bad - target_type = 'BAD' - has_control_item = True - - obj: Dict[str, Any] = OrderedDict() - obj['_item'] = _item - obj['_block'] = -1 - obj['sourceID'] = source_id - obj['sourceContextLeft'] = ' '.join(context_src) - obj['sourceText'] = item_src - obj['targetID'] = target_id - obj['targetContextLeft'] = ' '.join(context_tgt) - obj['targetText'] = target_text - obj['itemID'] = seg_counter - obj['itemType'] = target_type - obj['documentID'] = doc_id - obj['isCompleteDocument'] = False - - # print(seg_id) - # print(' '.join(context_src)) - # print(item_src) - # print('...') - # print(' '.join(context_tgt)) - # print(item_tgt.encode('utf-8')) - # print('---') - - context_src.append(item_src) - context_ref.append(item_ref) - context_bad.append(item_bad) - context_tgt.append(target_text) - - items_data[-1].append(obj) - _item += 1 - seg_counter += 1 - - obj = OrderedDict() - obj['_item'] = _item - obj['_block'] = -1 - obj['sourceID'] = source_id - obj['sourceText'] = ' '.join(context_src) # full document - obj['targetID'] = target_id - obj['targetText'] = ' '.join(context_tgt) # full document - obj['itemID'] = item_id - obj['itemType'] = 'TGT' - obj['documentID'] = doc_id - obj['isCompleteDocument'] = True - items_data[-1].append(obj) - - if has_control_item and SHUFFLE_DOCS_WITH_CONTROL_ITEMS: - # Move the document with control items to a random position so - # that they are not accumulated as very last documents - _bad_doc = items_data.pop() - _pos = randint(0, len(items_data) - 1) - print(f' Moving the last QC document to position {_pos}') - items_data.insert(_pos, _bad_doc) - - # Extract items from documents - _items_data = [item for doc_items in items_data for item in doc_items] - # Re-assign _item numbers - if SHUFFLE_DOCS_WITH_CONTROL_ITEMS: - _item = 0 - for i in range(len(_items_data)): - _items_data[i]['_item'] = _item - if _items_data[i]['isCompleteDocument'] == False: - _item += 1 - - output_data = OrderedDict({'task': task_data, 'items': _items_data}) - - json_data.append(output_data) - - # write out JSON - json_text = json_dumps(json_data, indent=2, sort_keys=True) - - json_file_name = f'{OUT_NAME}.json' - with open(json_file_name, mode='w', encoding='utf8') as out_file: - sys.stdout.write( - 'Creating {0}, batch no. {1} ... 
'.format(json_file_name, batch_id + 1), - ) - out_file.write(str(json_text)) - sys.stdout.write('OK\n') - - batch_id += 1 - - print(f'Total tasks: {len(sampled_tasks)}') - print(f'Total docs: {total_docs}') - print(f'Total sys: {len(total_sys)} {sorted(list(total_sys))}') diff --git a/scripts/create_wmt19_tasks.py b/scripts/create_wmt19_tasks.py deleted file mode 100644 index 77b56cde..00000000 --- a/scripts/create_wmt19_tasks.py +++ /dev/null @@ -1,546 +0,0 @@ -# pylint: disable=C0103,C0111,C0330,E1101 -import sys -from collections import defaultdict -from collections import OrderedDict -from glob import iglob -from json import dumps as json_dumps -from os.path import basename -from os.path import join -from random import choice -from random import seed -from random import shuffle -from typing import Any -from typing import Dict -from typing import List -from typing import Text -from typing import Tuple - -from bs4 import BeautifulSoup # type: ignore - - -MAX_TASK_SIZE = 100 # No support for tasks over 100 items -MAX_DOC_LENGTH = 70 # We do not support documents longer than 70 segments - - -def load_docs_from_sgml( - file_path: str, encoding='utf-8' -) -> Dict[str, List[Tuple[str, str]]]: - """ - Loads documents from given SGML file. - - Returns dict mapping document ids to list of segments [segments]. - Each segment is a tuple (segment id, segment text). - """ - soup = None - - with open(file_path, encoding=encoding) as _file: - soup = BeautifulSoup(_file, features='lxml') - - all_docs: Dict[str, List[Tuple[str, str]]] = OrderedDict() - for curr_doc in soup.find_all('doc'): - curr_doc_id = curr_doc.attrs['docid'] - if not curr_doc_id in all_docs: - all_docs[curr_doc_id] = [] - - for curr_seg in curr_doc.find_all('seg'): - curr_seg_id = curr_seg.attrs['id'] - curr_seg_text = curr_seg.get_text() - all_docs[curr_doc_id].append((curr_seg_id, curr_seg_text)) - - return all_docs - - -def _create_bad_ref(seg_text: str, ref_text: str, character_based: bool = False) -> str: - """ - Creates bad reference for given text. - - Segment length (a, b] to phrase length (excluding a, including b) - mapping defined as follows: - ( 0, 1] : 1 - ( 1, 5] : 2 - ( 5, 8] : 3 - ( 8, 15] : 4 - (15, 20] : 5 - (20, max] : 6 - - For character-based languages, which do not support tokenisation - by whitespace, the resulting phrase length will be doubled, and - is interpreted as a character length. - """ - seg_data = seg_text.split(' ') - ref_data = ref_text.split(' ') - - if character_based: - seg_data = [x for x in seg_text] - ref_data = [x for x in ref_text] - - seg_len = len(seg_data) - ref_len = len(ref_data) - - # Determine length of bad phrase, relative to segment length. - _seg_to_bad_mapping = { - (None, 1): 1, - (1, 5): 2, - (5, 8): 3, - (8, 15): 4, - (15, 20): 5, - (20, None): 6, - } - - bad_len = 0 - for seg_pair in _seg_to_bad_mapping: - left, right = seg_pair - - # seg_len == right; left edge case - if not left: - if seg_len == right: - bad_len = _seg_to_bad_mapping[seg_pair] - break - - # left < seg_len; right edge case - elif not right: - if left < seg_len: - bad_len = _seg_to_bad_mapping[seg_pair] - break - - # left < seg_len <= right; middle cases - elif left < seg_len <= right: - bad_len = _seg_to_bad_mapping[seg_pair] - break - - # Double length of bad phrase for character-based languages. - if character_based: - bad_len = 2 * bad_len - - # Determine random replacement position. 
For segments longer than - # (bad_len + 1), we enforce that this cannot be sentence initial - # or final, so positions 0 and (seg_len - bad_len -1) are invalid - # and we use an embedded bad_pos in [1, (seg_len - bad_len - 1)]. - # This happens for all seg_len > 3. - bad_pos = 0 - if seg_len - bad_len > 0: - bad_pos = choice(range(seg_len - bad_len)) - - elif seg_len > 3: - bad_pos = choice([x + 1 for x in range(seg_len - bad_len - 1)]) - - ref_pos = 0 - if ref_len - bad_len > 0: - ref_pos = choice(range(ref_len - bad_len)) - - bad_data = ( - seg_data[:bad_pos] - + ref_data[ref_pos : ref_pos + bad_len] - + seg_data[bad_pos + bad_len :] - ) - bad_text = ' '.join(bad_data) - if character_based: - bad_text = ''.join(bad_data) - - return bad_text - - -def create_bad_refs( - docs: Dict[str, List[Tuple[str, str]]], - refs: Dict[str, List[Tuple[str, str]]], - character_based: bool = False, -) -> Dict[str, List[Tuple[str, str]]]: - """ - Creates bad references for given documents. - - For each segment in the given documents, this creates a so-called - ``bad reference'' which is constructed by replacing an embedded - phrase p with a randomly placed phrase p' of the same length, - taken from a different segment contained in refs. The length of - the phrase is relative to the full segment length. - - See _create_bad_ref() definition for length mapping details. - """ - # Create mapping from f'{doc_id}_{seg_id}' to reference text. - all_refs = {} - for curr_doc_id, curr_doc in refs.items(): - for curr_seg_id, curr_ref_text in curr_doc: - all_refs[f'{curr_doc_id}_{curr_seg_id}'] = curr_ref_text - - # Create list of f'{doc_id}_{seg_id}' ids, to be used for random - # choice later when we want to identify a reference to work with. - all_keys = list(all_refs.keys()) - - # Iterate through documents and create bad references. - bad_docs: Dict[str, List[Tuple[str, str]]] = OrderedDict() - for curr_doc_id, curr_doc in docs.items(): - if not curr_doc_id in bad_docs: - bad_docs[curr_doc_id] = [] - - print(f'doc_id: {curr_doc_id},\tdoc_len: {len(curr_doc)}') - for curr_seg in curr_doc: - curr_seg_id, curr_seg_text = curr_seg - - # Bad reference id may not be identical to current id. - bad_id = choice(all_keys) - while bad_id == f'{curr_doc_id}_{curr_seg_id}': - bad_id = choice(all_keys) - - curr_bad_text = _create_bad_ref( - curr_seg_text, - all_refs[bad_id], - character_based=character_based, - ) - - # Ensure that keys can be reused. - all_keys.append(bad_id) - - bad_docs[curr_doc_id].append((curr_seg_id, curr_bad_text)) - - return bad_docs - - -def process_sgml(file_path: str) -> Dict[int, List[str]]: - """ - Extracts document stats from given SGML file. - - Returns dict mapping number of segments to list of document [ids]. - Each referenced document has the respective number of segments. 
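-
-    For example (illustrative ids only), a test set with two 10-segment
-    documents and one 23-segment document would yield:
-
-        {10: ['doc-a', 'doc-b'], 23: ['doc-c']}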
- """ - soup = None - - with open(file_path) as _file: - soup = BeautifulSoup(_file, features='lxml') - - all_docs = [] - stats: Dict[int, List[str]] = defaultdict(list) - for curr_doc in soup.find_all('doc'): - curr_doc_id = curr_doc.attrs['docid'] - seg_count = len(curr_doc.find_all('seg')) - stats[seg_count].append(curr_doc_id) - all_docs.append(seg_count) - - curr_len = 0 - for doc in all_docs: - if curr_len + doc > REQUIRED_SEGS: - print(curr_len) - curr_len = 0 - curr_len += doc - print(curr_len) - - return stats - - -if __name__ == "__main__": - SRC_SGML = sys.argv[1] # Path to source .sgm file - REF_SGML = sys.argv[2] # Path to reference .sgm file - SYS_PATH = sys.argv[3] # Path to the directory with system outputs - SYS_GLOB = sys.argv[4] # Pattern for .sgm files, e.g '*.sgm' - OUT_NAME = sys.argv[5] # Prefix for .csv and .json output files - SRC_LANG = sys.argv[6] # Code for source language, e.g. eng - TGT_LANG = sys.argv[7] # Code for target language, e.g. deu - TASK_MAX = int(sys.argv[8]) # Maximum number of tasks - CONTROLS = sys.argv[9].lower() not in ['', '0', 'false', 'off'] - ENC = 'utf-8' - - RND_SEED = 123456 - seed(RND_SEED) - - print(f'Quality control={CONTROLS}') - if CONTROLS: - REQUIRED_SEGS = 80 - else: - REQUIRED_SEGS = 100 - print(f'Setting REQUIRED_SEGS={REQUIRED_SEGS}') - - print(f'Loading source docs from {SRC_SGML}') - SRC_DOCS = load_docs_from_sgml(SRC_SGML, encoding=ENC) - print(f'Loading reference docs from {SRC_SGML}') - REF_DOCS = load_docs_from_sgml(REF_SGML, encoding=ENC) - - SYS_DOCS: Dict[str, Dict[str, List[Tuple[str, str]]]] = {} - BAD_DOCS: Dict[str, Dict[str, List[Tuple[str, str]]]] = {} - for SYS_SGML in iglob(join(SYS_PATH, SYS_GLOB)): - SYS_ID = basename(SYS_SGML) - print(f'Loading outputs of {SYS_ID}') - - SYS_DOCS[SYS_ID] = load_docs_from_sgml(SYS_SGML, encoding=ENC) - BAD_DOCS[SYS_ID] = create_bad_refs(SYS_DOCS[SYS_ID], REF_DOCS) - - # pylint: disable-msg=invalid-name - some_sys_id = choice(list(SYS_DOCS.keys())) - some_doc_id = choice(list(SYS_DOCS[some_sys_id].keys())) - some_sys_text = SYS_DOCS[some_sys_id][some_doc_id] - some_bad_text = BAD_DOCS[some_sys_id][some_doc_id] - print(some_sys_id, some_doc_id) - - for _s, _b in zip(some_sys_text, some_bad_text): - print(_s) - print(_b) - print('---') - - DOC_STATS: Dict[int, List[Tuple[int, str, str]]] = {} - for sys_id in SYS_DOCS: - for doc_id in SYS_DOCS[sys_id]: - doc_len = len(SYS_DOCS[sys_id][doc_id]) - - # We do not support documents longer than 70 segments. 
- if doc_len > MAX_DOC_LENGTH: - continue - - if not doc_len in DOC_STATS.keys(): - DOC_STATS[doc_len] = [] - - DOC_STATS[doc_len].append((doc_len, doc_id, sys_id)) - - # Randomise system order - for doc_len in DOC_STATS: - shuffle(DOC_STATS[doc_len]) - - print(sorted(DOC_STATS.keys())) - total_docs = 0 - total_sys = set() - for doc_len in sorted(DOC_STATS.keys()): - print(f'{doc_len}:\t{len(DOC_STATS[doc_len])}') - total_docs += len(DOC_STATS[doc_len]) - for x in DOC_STATS[doc_len]: - total_sys.add(x[2]) - - all_systems = list(total_sys) - sampled_tasks: List[Tuple[Tuple[int, str, str], ...]] = [] - CURR_LEN = 0 - CURR_SYS = 0 - curr_task: List[Tuple[int, str, str]] = [] - while DOC_STATS.keys(): - ALL_KEYS = list(DOC_STATS.keys()) - max_delta = REQUIRED_SEGS - CURR_LEN - valid_keys = [x for x in ALL_KEYS if x <= max_delta] - - if not valid_keys: - print(CURR_LEN) - print(curr_task) - print('------') - sampled_tasks.append(tuple(curr_task)) - CURR_LEN = 0 - curr_task = [] - continue - - if max_delta in valid_keys: - curr_key = max_delta - else: - curr_key = choice(valid_keys) - - CURR_LEN += curr_key - - curr_val = DOC_STATS[curr_key].pop(0) # This takes a random system. - - # Below code would pick systems one after the other - # - # curr_val = None - # for iter_val in DOC_STATS[curr_key]: - # if iter_val[2] == all_systems[CURR_SYS]: - # curr_val = iter_val - # DOC_STATS[curr_key].remove(iter_val) - # break - # - # if not curr_val: - # curr_val = DOC_STATS[curr_key].pop(0) - # CURR_SYS = all_systems.index(curr_val[2]) - # CURR_SYS = (CURR_SYS + 1) % len(all_systems) - - curr_task.append(curr_val) - if not DOC_STATS[curr_key]: - DOC_STATS.pop(curr_key) - - # Shuffle order of tasks - shuffle(sampled_tasks) - - padded_tasks: List[Tuple[Tuple[int, str, str], ...]] = [] - for tid, task in enumerate(sampled_tasks): - task_docs = len(task) - task_len = sum([x[0] for x in task]) - print(f'task_len: {task_len}') - if task_len > MAX_TASK_SIZE: - raise NotImplementedError( - 'No support for tasks >{0} items!'.format(MAX_TASK_SIZE) - ) - - elif task_len < MAX_TASK_SIZE: - pad_size = MAX_TASK_SIZE - task_len - pad_data: List[Tuple[int, str, str]] = list(task) - pad_pos = 0 - while pad_size > 0: - print(f'pad_size: {pad_size}') - print(f'pad_pos: {pad_pos}') - pad_data.append(tuple(list(pad_data[pad_pos]) + [True])) # type: ignore - print(pad_data[-1]) - pad_size -= pad_data[-1][0] - pad_pos = (pad_pos + 1) % task_docs - if pad_size < 0: - print(f'pad_size: {pad_size}') - print(f'pad_pos: {pad_pos}') - - last_doc: Tuple[int, str, str] = pad_data[-1] - print(last_doc[0], '-->', last_doc[0] + pad_size) - fixed_doc = (last_doc[0] + pad_size, *last_doc[1:]) - pad_data[-1] = fixed_doc - print(pad_data[-1][0]) - padded_tasks.append(tuple(pad_data)) - print(padded_tasks[-1]) - - else: - print(f'WARNING: no control items in task no. {tid}') - # raise NotImplementedError('Needs isControl=True update!') - padded_tasks.append(tuple(task)) # TODO: does this ever occur? 
- - csv_data = [] - task_id = 0 - for task in padded_tasks: - task_id += 1 - task_len = sum([x[0] for x in task]) - print(f'task_len: {task_len}') - - for _doc in task: - _data = [str(task_id)] - for x in _doc: # type: ignore - _data.append(str(x)) - - if _data[-1] != 'True': - _data.append('False') # isControl=False - print(_data) - csv_data.append(','.join(_data)) - - with open(f'{OUT_NAME}.csv', mode='w') as _file: - for csv_line in csv_data: - _file.write(csv_line) - _file.write('\n') - - json_data = [] - batch_id = 0 - for task in padded_tasks[:TASK_MAX]: - # Remember, batch numbers are one-based - task_data = OrderedDict( - { - 'batchNo': batch_id + 1, - 'batchSize': 100, - 'sourceLanguage': SRC_LANG, - 'targetLanguage': TGT_LANG, - 'requiredAnnotations': 1, - 'randomSeed': RND_SEED, - } - ) - - source_id = basename(SRC_SGML) - - items_data = [] - _item = 0 - for doc_data in task: - doc_len, doc_id, sys_id, *rest = doc_data # type: ignore - - isControl = rest is not None and rest - - target_id = sys_id - - _src = {} - _ref = {} - _bad = {} - _tgt = {} - - for item_id, item_src in SRC_DOCS[doc_id]: - seg_id = f'{doc_id}_{item_id}' - _src[seg_id] = item_src - - for item_id, item_ref in REF_DOCS[doc_id]: - seg_id = f'{doc_id}_{item_id}' - _ref[seg_id] = item_ref - - for item_id, item_bad in BAD_DOCS[sys_id][doc_id]: - seg_id = f'{doc_id}_{item_id}' - _bad[seg_id] = item_bad - - for item_id, item_tgt in SYS_DOCS[sys_id][doc_id]: - seg_id = f'{doc_id}_{item_id}' - _tgt[seg_id] = item_tgt - - seg_counter = 0 - context_src: List[Text] = [] - context_ref: List[Text] = [] - context_bad: List[Text] = [] - context_tgt: List[Text] = [] - for seg_id in _src: - if seg_counter >= doc_len: # Padding tasks are shorter! - break - item_src = _src[seg_id] - item_ref = _ref[seg_id] - item_bad = _bad[seg_id] - item_tgt = _tgt[seg_id] - - target_text = item_tgt - target_type = 'TGT' - if ( - CONTROLS and isControl - ): # Do not generate any BAD items if QC is disabled - randomCoinFlip = choice( - [False, False, True, True, True] - ) # 60:40 chance - if randomCoinFlip: - target_text = item_bad - target_type = 'BAD' - - obj: Dict[str, Any] = OrderedDict() - obj['_item'] = _item - obj['_block'] = -1 - obj['sourceID'] = source_id - obj['sourceContextLeft'] = ' '.join(context_src) - obj['sourceText'] = item_src - obj['targetID'] = target_id - obj['targetContextLeft'] = ' '.join(context_tgt) - obj['targetText'] = target_text - obj['itemID'] = seg_counter - obj['itemType'] = target_type - obj['documentID'] = doc_id - obj['isCompleteDocument'] = False - - print(seg_id) - print(' '.join(context_src)) - print(item_src) - print('...') - print(' '.join(context_tgt)) - print(item_tgt.encode('utf-8')) - print('---') - - context_src.append(item_src) - context_ref.append(item_ref) - context_bad.append(item_bad) - context_tgt.append(target_text) - - items_data.append(obj) - _item += 1 - seg_counter += 1 - - obj = OrderedDict() - obj['_item'] = _item - obj['_block'] = -1 - obj['sourceID'] = source_id - obj['sourceText'] = ' '.join(context_src) # full document - obj['targetID'] = target_id - obj['targetText'] = ' '.join(context_tgt) # full document - obj['itemID'] = item_id - obj['itemType'] = 'TGT' - obj['documentID'] = doc_id - obj['isCompleteDocument'] = True - items_data.append(obj) - - output_data = OrderedDict({'task': task_data, 'items': items_data}) - - json_data.append(output_data) - - # write out JSON - json_text = json_dumps(json_data, indent=2, sort_keys=True) - - json_file_name = f'{OUT_NAME}.json' - with 
open(json_file_name, mode='w', encoding='utf8') as out_file: - sys.stdout.write('Creating {0} ... '.format(json_file_name, ending='')) # type: ignore - out_file.write(str(json_text)) - sys.stdout.write('OK\n') - - batch_id += 1 - - print(f'Total tasks: {len(sampled_tasks)}') - print(f'Total docs: {total_docs}') - print(f'Total sys: {len(total_sys)} {total_sys}') diff --git a/scripts/create_wmt21_tasks.py b/scripts/create_wmt21_tasks.py deleted file mode 100644 index ea9fd14a..00000000 --- a/scripts/create_wmt21_tasks.py +++ /dev/null @@ -1,709 +0,0 @@ -# pylint: disable=C0103,C0111,C0330,E1101 -import sys -from collections import defaultdict -from collections import OrderedDict -from glob import iglob -from json import dumps as json_dumps -from os.path import basename -from os.path import join -from random import choice -from random import randint -from random import seed -from random import shuffle -from typing import Any -from typing import Dict -from typing import List -from typing import Text -from typing import Tuple - -from lxml import etree - - -MAX_TASK_SIZE = 100 # No support for tasks over 100 items -MAX_DOC_LENGTH = 70 # We do not support documents longer than 70 segments - -MISSING_TRANSLATION_MESSAGE = ("NO TRANSLATION AVAILABLE",) -DEFAULT_TRANSLATOR = "DEFAULT" -# If False, documents with control items will be very last ones in each batch -SHUFFLE_DOCS_WITH_CONTROL_ITEMS = True -# If True, add references as additional system outputs -INCLUDE_REFERENCES_AS_SYSTEMS = True -REFERENCE_AS_SYSTEM_PREFIX = 'translator-' - - -def unwrap_xml( - xml_file, - missing_message=MISSING_TRANSLATION_MESSAGE, - encoding='utf-8', -): - """ - Unwraps an xml file in WMT format, producing source and (if present) reference files - - :param xml_file: The xml file (or fd) - :param missing_message: The message to insert when no reference - - :returns: src_lang, src_lines, ref_lang, ref_lines, hyp_lang, hyp_lines - - ref_lines maps translator to document to tuples of segment id and line text - hyp_lines maps system to document to tuples of segment id and line text - - ref_lang and hyp_lang may be None, and then their lines are empty - note: a single language is assumed for each of sources, refs and hyps - - This function has been extracted from - https://github.com/wmt-conference/wmt-format-tools/wmtformat/unwrap.py with - some modifications - """ - tree = etree.parse(xml_file) - - # Find and check the documents (src, ref, hyp) - src_langs, ref_langs, hyp_langs, translators, systems = ( - set(), - set(), - set(), - set(), - set(), - ) - - for src_doc in tree.getroot().findall(".//src"): - src_langs.add(src_doc.get("lang")) - - for ref_doc in tree.getroot().findall(".//ref"): - ref_langs.add(ref_doc.get("lang")) - translator = ref_doc.get("translator") - if translator: - translators.add(translator) - - for hyp_doc in tree.getroot().findall(".//hyp"): - hyp_langs.add(hyp_doc.get("lang")) - systems.add(hyp_doc.get("system")) - - if len(src_langs) > 1: - raise RuntimeError("Multiple source languages found") - - if len(src_langs) == 0: - raise RuntimeError("No source languages found") - - src_lang = src_langs.pop() - src_docs = OrderedDict() - - if len(ref_langs) > 1: - raise RuntimeError("Multiple reference languages found") - - translators = list(translators) - if len(ref_langs) > 0: - if len(translators) == 0: - print("No translator identifiers found") - translators.append(DEFAULT_TRANSLATOR) - ref_lang = ref_langs.pop() - ref_docs = OrderedDict( - (translator, OrderedDict()) for translator in 
translators - ) - else: - print("No references found") - ref_lang = None - ref_docs = OrderedDict() - - if len(hyp_langs) > 1: - raise RuntimeError("Multiple hypothesis languages found") - - systems = list(systems) - if len(hyp_langs) > 0: - hyp_docs = OrderedDict((system, OrderedDict()) for system in systems) - hyp_lang = hyp_langs.pop() - else: - hyp_docs = OrderedDict() - hyp_lang = None - - # Extract text - src_sent_count, doc_count = 0, 0 - for doc in tree.getroot().findall(".//doc"): - doc_id = doc.get("id") - src = [] - if "testsuite" in doc.attrib: - continue - doc_count += 1 - src_sents = {int(seg.get("id")): seg.text for seg in doc.findall(".//src//seg")} - - def get_sents(doc): - return { - int(seg.get("id")): seg.text if seg.text else "" - for seg in doc.findall(f".//seg") - } - - if ref_lang: - _ref_docs = doc.findall(".//ref") - trans_to_ref = {} - - # If no translator identifiers, we just read one reference (if any) - # If there are translator identifiers, we add a reference for each translator - if len(translators) == 1 and DEFAULT_TRANSLATOR in translators: - if len(_ref_docs): - trans_to_ref[DEFAULT_TRANSLATOR] = get_ref_sents(_ref_docs[0]) - else: - trans_to_ref[DEFAULT_TRANSLATOR] = {} - else: - trans_to_ref = { - ref_doc.get("translator"): get_sents(ref_doc) - for ref_doc in _ref_docs - } - - if hyp_lang: - _hyp_docs = doc.findall(".//hyp") - system_to_ref = { - hyp_doc.get("system"): get_sents(hyp_doc) for hyp_doc in _hyp_docs - } - - for seg_id in sorted(src_sents.keys()): - src.append([seg_id, src_sents[seg_id]]) - src_sent_count += 1 - if ref_lang: - for translator in translators: - if doc_id not in ref_docs[translator]: - ref_docs[translator][doc_id] = [] - - # _ref_text = trans_to_ref.get(translator, {translator: {}}).get( - _ref_text = trans_to_ref[translator].get(seg_id, missing_message) - ref_docs[translator][doc_id].append((seg_id, _ref_text)) - - if _ref_text == MISSING_TRANSLATION_MESSAGE: - print( - f'Warning: missing reference for translator {translator}, ' - f'document {doc_id}, segment {seg_id}' - ) - if hyp_lang: - for system in systems: - if doc_id not in hyp_docs[system]: - hyp_docs[system][doc_id] = [] - - # _hyp_text = system_to_ref.get(system, {system: {}}).get( - _hyp_text = system_to_ref[system].get(seg_id, missing_message) - hyp_docs[system][doc_id].append((seg_id, _hyp_text)) - - if _hyp_text == MISSING_TRANSLATION_MESSAGE: - print( - f'Warning: missing translation from {system}, ' - f'document {doc_id}, segment {seg_id}' - ) - - src_docs[doc_id] = src - - print( - f"Extracted {doc_count} document(s) containing {src_sent_count} sentences in {src_lang}" - ) - - return src_lang, src_docs, ref_lang, ref_docs, hyp_lang, hyp_docs - - -def _create_bad_ref(seg_text: str, ref_text: str, character_based: bool = False) -> str: - """ - Creates bad reference for given text. - - Segment length (a, b] to phrase length (excluding a, including b) - mapping defined as follows: - ( 0, 1] : 1 - ( 1, 5] : 2 - ( 5, 8] : 3 - ( 8, 15] : 4 - (15, 20] : 5 - (20, max] : 6 - - For character-based languages, which do not support tokenisation - by whitespace, the resulting phrase length will be doubled, and - is interpreted as a character length. - """ - seg_data = seg_text.split(' ') - ref_data = ref_text.split(' ') - - if character_based: - seg_data = [x for x in seg_text] - ref_data = [x for x in ref_text] - - seg_len = len(seg_data) - ref_len = len(ref_data) - - # Determine length of bad phrase, relative to segment length. 
- _seg_to_bad_mapping = { - (None, 1): 1, - (1, 5): 2, - (5, 8): 3, - (8, 15): 4, - (15, 20): 5, - (20, None): 6, - } - - bad_len = 0 - for seg_pair in _seg_to_bad_mapping: - left, right = seg_pair - - # seg_len == right; left edge case - if not left: - if seg_len == right: - bad_len = _seg_to_bad_mapping[seg_pair] - break - - # left < seg_len; right edge case - elif not right: - if left < seg_len: - bad_len = _seg_to_bad_mapping[seg_pair] - break - - # left < seg_len <= right; middle cases - elif left < seg_len <= right: - bad_len = _seg_to_bad_mapping[seg_pair] - break - - # Double length of bad phrase for character-based languages. - if character_based: - bad_len = 2 * bad_len - - # Determine random replacement position. For segments longer than - # (bad_len + 1), we enforce that this cannot be sentence initial - # or final, so positions 0 and (seg_len - bad_len -1) are invalid - # and we use an embedded bad_pos in [1, (seg_len - bad_len - 1)]. - # This happens for all seg_len > 3. - bad_pos = 0 - if seg_len - bad_len > 0: - bad_pos = choice(range(seg_len - bad_len)) - - elif seg_len > 3: - _xs = max(1, seg_len - bad_len - 1) - bad_pos = choice([x + 1 for x in range(_xs)]) - - ref_pos = 0 - if ref_len - bad_len > 0: - ref_pos = choice(range(ref_len - bad_len)) - - bad_data = ( - seg_data[:bad_pos] - + ref_data[ref_pos : ref_pos + bad_len] - + seg_data[bad_pos + bad_len :] - ) - bad_text = ' '.join(bad_data) - if character_based: - bad_text = ''.join(bad_data) - - # print(seg_text) - # print(bad_text) - # print('------------') - return bad_text - - -def create_bad_refs( - docs: Dict[str, List[Tuple[str, str]]], - refs: Dict[str, List[Tuple[str, str]]], - character_based: bool = False, -) -> Dict[str, List[Tuple[str, str]]]: - """ - Creates bad references for given documents. - - For each segment in the given documents, this creates a so-called - ``bad reference'' which is constructed by replacing an embedded - phrase p with a randomly placed phrase p' of the same length, - taken from a different segment contained in refs. The length of - the phrase is relative to the full segment length. - - See _create_bad_ref() definition for length mapping details. - """ - # Create mapping from f'{doc_id}_{seg_id}' to reference text. - all_refs = {} - for curr_doc_id, curr_doc in refs.items(): - for curr_seg_id, curr_ref_text in curr_doc: - all_refs[f'{curr_doc_id}_{curr_seg_id}'] = curr_ref_text - - # Create list of f'{doc_id}_{seg_id}' ids, to be used for random - # choice later when we want to identify a reference to work with. - all_keys = list(all_refs.keys()) - - # Iterate through documents and create bad references. - bad_docs: Dict[str, List[Tuple[str, str]]] = OrderedDict() - for curr_doc_id, curr_doc in docs.items(): - if not curr_doc_id in bad_docs: - bad_docs[curr_doc_id] = [] - - print(f'doc_id: {curr_doc_id},\tdoc_len: {len(curr_doc)}') - for curr_seg in curr_doc: - curr_seg_id, curr_seg_text = curr_seg - - # Bad reference id may not be identical to current id. - bad_id = choice(all_keys) - while bad_id == f'{curr_doc_id}_{curr_seg_id}': - bad_id = choice(all_keys) - - curr_bad_text = _create_bad_ref( - curr_seg_text, - all_refs[bad_id], - character_based=character_based, - ) - - # Ensure that keys can be reused. 
- all_keys.append(bad_id) - - bad_docs[curr_doc_id].append((curr_seg_id, curr_bad_text)) - - return bad_docs - - -if __name__ == "__main__": - if len(sys.argv) < 8: - print('Example usage:') - print( - f' {sys.argv[0]} newstest2021.en-de.all.xml batches.en-de enu deu 50 True False' - ) - exit() - - XML_FILE = sys.argv[1] # Path to .xml file with sources, references and outputs - OUT_NAME = sys.argv[2] # Prefix for .csv and .json output files - SRC_LANG = sys.argv[3] # Code for source language, e.g. eng - TGT_LANG = sys.argv[4] # Code for target language, e.g. deu - TASK_MAX = int(sys.argv[5]) # Maximum number of tasks - CONTROLS = sys.argv[6].lower() not in ['', '0', 'false', 'off'] # Generate QC items - CHARLANG = sys.argv[7].lower() in ['1', 'true', 'on'] # Character-based - print(f'Character based={CHARLANG}') - - ENC = 'utf-8' - - RND_SEED = 123456 - seed(RND_SEED) - - print(f'Quality control={CONTROLS}') - if CONTROLS: - REQUIRED_SEGS = 80 - else: - REQUIRED_SEGS = 100 - print(f'Setting REQUIRED_SEGS={REQUIRED_SEGS}') - - SYS_DOCS: Dict[str, Dict[str, List[Tuple[str, str]]]] = OrderedDict() - BAD_DOCS: Dict[str, Dict[str, List[Tuple[str, str]]]] = OrderedDict() - print(f'Loading docs from {XML_FILE}') - src_lang, SRC_DOCS, ref_lang, REF_DOCS, hyp_lang, SYS_DOCS = unwrap_xml( - XML_FILE, encoding=ENC - ) - - # This reference will be used for generating BAD items - REF_ID = sorted(list(REF_DOCS.keys()))[0] - print(f'Using reference "{REF_ID}"') - - # Add references as additional system outputs - if INCLUDE_REFERENCES_AS_SYSTEMS: - for ref_id in sorted(list(REF_DOCS.keys())): - sys_id = REFERENCE_AS_SYSTEM_PREFIX + ref_id - print(f'Adding reference "{ref_id}" as system output "{sys_id}"') - SYS_DOCS[sys_id] = REF_DOCS[ref_id] - - # List of system names that can be iterated deterministically - SYS_IDS = sorted(list(SYS_DOCS.keys())) - - for sys_id in SYS_IDS: - print(f'Generating bad references for {sys_id}') - BAD_DOCS[sys_id] = create_bad_refs( - SYS_DOCS[sys_id], REF_DOCS[REF_ID], character_based=CHARLANG - ) - - # pylint: disable-msg=invalid-name - some_sys_id = choice(SYS_IDS) - some_doc_id = choice(sorted(list(SYS_DOCS[some_sys_id].keys()))) - some_sys_text = SYS_DOCS[some_sys_id][some_doc_id] - some_bad_text = BAD_DOCS[some_sys_id][some_doc_id] - print(some_sys_id, some_doc_id) - - for _s, _b in zip(some_sys_text, some_bad_text): - print(_s) - print(_b) - print('---') - - DOC_STATS: Dict[int, List[Tuple[int, str, str]]] = OrderedDict() - for sys_id in SYS_IDS: - for doc_id in SYS_DOCS[sys_id].keys(): - doc_len = len(SYS_DOCS[sys_id][doc_id]) - - # We do not support documents longer than 70 segments. 
- if doc_len > MAX_DOC_LENGTH: - continue - - if not doc_len in DOC_STATS.keys(): - DOC_STATS[doc_len] = [] - - DOC_STATS[doc_len].append((doc_len, doc_id, sys_id)) - - # Randomise system order - for doc_len in DOC_STATS: - shuffle(DOC_STATS[doc_len]) - - print(DOC_STATS.keys()) - total_docs = 0 - total_sys = set() - for doc_len in DOC_STATS.keys(): - print(f'{doc_len}:\t{len(DOC_STATS[doc_len])}') - total_docs += len(DOC_STATS[doc_len]) - for x in DOC_STATS[doc_len]: - total_sys.add(x[2]) - - all_systems = list(total_sys) - sampled_tasks: List[Tuple[Tuple[int, str, str], ...]] = [] - CURR_LEN = 0 - CURR_SYS = 0 - curr_task: List[Tuple[int, str, str]] = [] - while DOC_STATS.keys(): - ALL_KEYS = sorted(list(DOC_STATS.keys())) - max_delta = REQUIRED_SEGS - CURR_LEN - valid_keys = [x for x in ALL_KEYS if x <= max_delta] - - if not valid_keys: - print(CURR_LEN) - print(curr_task) - print('------') - sampled_tasks.append(tuple(curr_task)) - CURR_LEN = 0 - curr_task = [] - continue - - if max_delta in valid_keys: - curr_key = max_delta - else: - curr_key = choice(valid_keys) - - CURR_LEN += curr_key - - curr_val = DOC_STATS[curr_key].pop(0) # This takes a random system. - - # Below code would pick systems one after the other - # - # curr_val = None - # for iter_val in DOC_STATS[curr_key]: - # if iter_val[2] == all_systems[CURR_SYS]: - # curr_val = iter_val - # DOC_STATS[curr_key].remove(iter_val) - # break - # - # if not curr_val: - # curr_val = DOC_STATS[curr_key].pop(0) - # CURR_SYS = all_systems.index(curr_val[2]) - # CURR_SYS = (CURR_SYS + 1) % len(all_systems) - - curr_task.append(curr_val) - if not DOC_STATS[curr_key]: - DOC_STATS.pop(curr_key) - - # Shuffle order of tasks - shuffle(sampled_tasks) - - padded_tasks: List[Tuple[Tuple[int, str, str], ...]] = [] - for tid, task in enumerate(sampled_tasks): - task_docs = len(task) - task_len = sum([x[0] for x in task]) - print(f'task_len: {task_len}') - if task_len > MAX_TASK_SIZE: - raise NotImplementedError( - 'No support for tasks >{0} items!'.format(MAX_TASK_SIZE) - ) - - elif task_len < MAX_TASK_SIZE: - pad_size = MAX_TASK_SIZE - task_len - pad_data: List[Tuple[int, str, str]] = list(task) - pad_pos = 0 - while pad_size > 0: - print(f'pad_size: {pad_size}') - print(f'pad_pos: {pad_pos}') - pad_data.append(tuple(list(pad_data[pad_pos]) + [True])) # type: ignore - print(pad_data[-1]) - pad_size -= pad_data[-1][0] - pad_pos = (pad_pos + 1) % task_docs - if pad_size < 0: - print(f'pad_size: {pad_size}') - print(f'pad_pos: {pad_pos}') - - last_doc: Tuple[int, str, str] = pad_data[-1] - print(last_doc[0], '-->', last_doc[0] + pad_size) - fixed_doc = (last_doc[0] + pad_size, *last_doc[1:]) - pad_data[-1] = fixed_doc - print(pad_data[-1][0]) - padded_tasks.append(tuple(pad_data)) - print(padded_tasks[-1]) - - else: - print(f'WARNING: no control items in task no. {tid}') - # raise NotImplementedError('Needs isControl=True update!') - padded_tasks.append(tuple(task)) # TODO: does this ever occur? 
- - csv_data = [] - task_id = 0 - for task in padded_tasks: - task_id += 1 - task_len = sum([x[0] for x in task]) - print(f'task_len: {task_len}') - - for _doc in task: - _data = [str(task_id)] - for x in _doc: # type: ignore - _data.append(str(x)) - - if _data[-1] != 'True': - _data.append('False') # isControl=False - print(_data) - csv_data.append(','.join(_data)) - - with open(f'{OUT_NAME}.csv', mode='w') as _file: - for csv_line in csv_data: - _file.write(csv_line) - _file.write('\n') - - json_data = [] - batch_id = 0 - for task in padded_tasks[:TASK_MAX]: - # Remember, batch numbers are one-based - task_data = OrderedDict( - { - 'batchNo': batch_id + 1, - 'batchSize': 100, - 'sourceLanguage': SRC_LANG, - 'targetLanguage': TGT_LANG, - 'requiredAnnotations': 1, - 'randomSeed': RND_SEED, - } - ) - - source_id = basename(XML_FILE) - - items_data: List[List[Dict[str, Any]]] = [] # Keeps items grouped into document - _item = 0 - doc_counter = 0 - for doc_data in task: - items_data.append([]) # Add a new bucket for items from this documents - has_control_item = False - - doc_len, doc_id, sys_id, *rest = doc_data # type: ignore - - isControl = rest is not None and rest - - target_id = sys_id - - _src = {} - _ref = {} - _bad = {} - _tgt = {} - - for item_id, item_src in SRC_DOCS[doc_id]: - seg_id = f'{doc_id}_{item_id}' - _src[seg_id] = item_src - - for item_id, item_ref in REF_DOCS[REF_ID][doc_id]: - seg_id = f'{doc_id}_{item_id}' - _ref[seg_id] = item_ref - - for item_id, item_bad in BAD_DOCS[sys_id][doc_id]: - seg_id = f'{doc_id}_{item_id}' - _bad[seg_id] = item_bad - - for item_id, item_tgt in SYS_DOCS[sys_id][doc_id]: - seg_id = f'{doc_id}_{item_id}' - _tgt[seg_id] = item_tgt - - seg_counter = 0 - context_src: List[Text] = [] - context_ref: List[Text] = [] - context_bad: List[Text] = [] - context_tgt: List[Text] = [] - for seg_id in _src: - if seg_counter >= doc_len: # Padding tasks are shorter! 
- break - item_src = _src[seg_id] - item_ref = _ref[seg_id] - item_bad = _bad[seg_id] - item_tgt = _tgt[seg_id] - - target_text = item_tgt - target_type = 'TGT' - - # Do not generate any BAD items if QC is disabled - if CONTROLS and isControl: - randomCoinFlip = choice( - [False, False, True, True, True] # 60:40 chance - ) - if randomCoinFlip: - target_text = item_bad - target_type = 'BAD' - has_control_item = True - - obj: Dict[str, Any] = OrderedDict() - obj['_item'] = _item - obj['_block'] = -1 - obj['sourceID'] = source_id - obj['sourceContextLeft'] = ' '.join(context_src) - obj['sourceText'] = item_src - obj['targetID'] = target_id - obj['targetContextLeft'] = ' '.join(context_tgt) - obj['targetText'] = target_text - obj['itemID'] = seg_counter - obj['itemType'] = target_type - obj['documentID'] = doc_id - obj['isCompleteDocument'] = False - - # print(seg_id) - # print(' '.join(context_src)) - # print(item_src) - # print('...') - # print(' '.join(context_tgt)) - # print(item_tgt.encode('utf-8')) - # print('---') - - context_src.append(item_src) - context_ref.append(item_ref) - context_bad.append(item_bad) - context_tgt.append(target_text) - - items_data[-1].append(obj) - _item += 1 - seg_counter += 1 - - obj = OrderedDict() - obj['_item'] = _item - obj['_block'] = -1 - obj['sourceID'] = source_id - obj['sourceText'] = ' '.join(context_src) # full document - obj['targetID'] = target_id - obj['targetText'] = ' '.join(context_tgt) # full document - obj['itemID'] = item_id - obj['itemType'] = 'TGT' - obj['documentID'] = doc_id - obj['isCompleteDocument'] = True - items_data[-1].append(obj) - - if has_control_item and SHUFFLE_DOCS_WITH_CONTROL_ITEMS: - # Move the document with control items to a random position so - # that they are not accumulated as very last documents - _bad_doc = items_data.pop() - _pos = randint(0, len(items_data) - 1) - print(f' Moving the last QC document to position {_pos}') - items_data.insert(_pos, _bad_doc) - - # Extract items from documents - _items_data = [item for doc_items in items_data for item in doc_items] - # Re-assign _item numbers - if SHUFFLE_DOCS_WITH_CONTROL_ITEMS: - _item = 0 - for i in range(len(_items_data)): - _items_data[i]['_item'] = _item - if _items_data[i]['isCompleteDocument'] == False: - _item += 1 - - output_data = OrderedDict({'task': task_data, 'items': _items_data}) - - json_data.append(output_data) - - # write out JSON - json_text = json_dumps(json_data, indent=2, sort_keys=True) - - json_file_name = f'{OUT_NAME}.json' - with open(json_file_name, mode='w', encoding='utf8') as out_file: - sys.stdout.write( - 'Creating {0}, batch no. {1} ... 
'.format(json_file_name, batch_id + 1), - ) - out_file.write(str(json_text)) - sys.stdout.write('OK\n') - - batch_id += 1 - - print(f'Total tasks: {len(sampled_tasks)}') - print(f'Total docs: {total_docs}') - print(f'Total sys: {len(total_sys)} {sorted(list(total_sys))}') diff --git a/scripts/create_wmt22_tasks.py b/scripts/create_wmt22_tasks.py deleted file mode 100644 index 43e8e0bd..00000000 --- a/scripts/create_wmt22_tasks.py +++ /dev/null @@ -1,1023 +0,0 @@ -# pylint: disable=C0103,C0111,C0330,E1101 -import argparse -import sys -from collections import OrderedDict -from copy import deepcopy -from glob import iglob -from json import dumps as json_dumps -from os.path import basename -from os.path import join -from random import choice -from random import randint -from random import seed -from random import shuffle -from typing import Any -from typing import Dict -from typing import List -from typing import Text -from typing import Tuple - -from lxml import etree - - -MAX_TASK_SIZE = 100 # No support for tasks over 100 items -MAX_DOC_LENGTH = 70 # We do not support documents longer than 70 segments - -MISSING_TRANSLATION_MESSAGE = ("NO TRANSLATION AVAILABLE",) -DEFAULT_TRANSLATOR = "DEFAULT" -# If False, documents with control items will be very last ones in each batch -SHUFFLE_DOCS_WITH_CONTROL_ITEMS = True -# If True, add references as additional system outputs -INCLUDE_REFERENCES_AS_SYSTEMS = True -# If True, documents may be oversampled to form the last batch -USE_ALL_DOCUMENTS_AND_ALL_SYSTEMS = True -REFERENCE_AS_SYSTEM_PREFIX = 'translator-' - - -def unwrap_xml( - xml_file, - missing_message=MISSING_TRANSLATION_MESSAGE, - encoding='utf-8', -): - """ - Unwraps an xml file in WMT format, producing source and (if present) reference files - - :param xml_file: The xml file (or fd) - :param missing_message: The message to insert when no reference - - :returns: src_lang, src_lines, ref_lang, ref_lines, hyp_lang, hyp_lines - - ref_lines maps translator to document to tuples of segment id and line text - hyp_lines maps system to document to tuples of segment id and line text - - ref_lang and hyp_lang may be None, and then their lines are empty - note: a single language is assumed for each of sources, refs and hyps - - This function has been extracted from - https://github.com/wmt-conference/wmt-format-tools/wmtformat/unwrap.py with - some modifications - """ - tree = etree.parse(xml_file) - - # Find and check the documents (src, ref, hyp) - src_langs, ref_langs, hyp_langs, translators, systems = ( - set(), - set(), - set(), - set(), - set(), - ) - - for src_doc in tree.getroot().findall(".//src"): - src_langs.add(src_doc.get("lang")) - - for ref_doc in tree.getroot().findall(".//ref"): - ref_langs.add(ref_doc.get("lang")) - translator = ref_doc.get("translator") - if translator: - translators.add(translator) - - for hyp_doc in tree.getroot().findall(".//hyp"): - hyp_langs.add(hyp_doc.get("lang")) - systems.add(hyp_doc.get("system")) - - if len(src_langs) > 1: - raise RuntimeError("Multiple source languages found") - - if len(src_langs) == 0: - raise RuntimeError("No source languages found") - - src_lang = src_langs.pop() - src_docs = OrderedDict() - - if len(ref_langs) > 1: - raise RuntimeError("Multiple reference languages found") - - translators = list(translators) - if len(ref_langs) > 0: - if len(translators) == 0: - print("No translator identifiers found") - translators.append(DEFAULT_TRANSLATOR) - ref_lang = ref_langs.pop() - ref_docs = OrderedDict( - (translator, OrderedDict()) for 
translator in translators - ) - else: - print("No references found") - ref_lang = None - ref_docs = OrderedDict() - - if len(hyp_langs) > 1: - raise RuntimeError(f"Multiple hypothesis languages found: {hyp_langs}") - - systems = list(systems) - if len(hyp_langs) > 0: - hyp_docs = OrderedDict((system, OrderedDict()) for system in systems) - hyp_lang = hyp_langs.pop() - else: - hyp_docs = OrderedDict() - hyp_lang = None - - # Extract text - src_sent_count, doc_count = 0, 0 - for doc in tree.getroot().findall(".//doc"): - doc_id = doc.get("id") - src = [] - if "testsuite" in doc.attrib: - continue - doc_count += 1 - src_sents = {int(seg.get("id")): seg.text for seg in doc.findall(".//src//seg")} - - def get_sents(doc): - return { - int(seg.get("id")): seg.text if seg.text else "" - for seg in doc.findall(f".//seg") - } - - if ref_lang: - _ref_docs = doc.findall(".//ref") - trans_to_ref = {} - - # If no translator identifiers, we just read one reference (if any) - # If there are translator identifiers, we add a reference for each translator - if len(translators) == 1 and DEFAULT_TRANSLATOR in translators: - if len(_ref_docs): - trans_to_ref[DEFAULT_TRANSLATOR] = get_ref_sents(_ref_docs[0]) - else: - trans_to_ref[DEFAULT_TRANSLATOR] = {} - else: - trans_to_ref = { - ref_doc.get("translator"): get_sents(ref_doc) - for ref_doc in _ref_docs - } - - if hyp_lang: - _hyp_docs = doc.findall(".//hyp") - system_to_ref = { - hyp_doc.get("system"): get_sents(hyp_doc) for hyp_doc in _hyp_docs - } - - for seg_id in sorted(src_sents.keys()): - src.append([seg_id, src_sents[seg_id]]) - src_sent_count += 1 - if ref_lang: - for translator in translators: - if doc_id not in ref_docs[translator]: - ref_docs[translator][doc_id] = [] - - # _ref_text = trans_to_ref.get(translator, {translator: {}}).get( - _ref_text = trans_to_ref[translator].get(seg_id, missing_message) - ref_docs[translator][doc_id].append((seg_id, _ref_text)) - - if _ref_text == MISSING_TRANSLATION_MESSAGE: - print( - f'Warning: missing reference for translator {translator}, ' - f'document {doc_id}, segment {seg_id}' - ) - if hyp_lang: - for system in systems: - if doc_id not in hyp_docs[system]: - hyp_docs[system][doc_id] = [] - - # _hyp_text = system_to_ref.get(system, {system: {}}).get( - _hyp_text = system_to_ref[system].get(seg_id, missing_message) - hyp_docs[system][doc_id].append((seg_id, _hyp_text)) - - if _hyp_text == MISSING_TRANSLATION_MESSAGE: - print( - f'Warning: missing translation from {system}, ' - f'document {doc_id}, segment {seg_id}' - ) - - src_docs[doc_id] = src - - print( - f"Extracted {doc_count} document(s) containing {src_sent_count} sentences in {src_lang}" - ) - - return src_lang, src_docs, ref_lang, ref_docs, hyp_lang, hyp_docs - - -def chop_docs(orig_src_docs, orig_ref_docs, orig_hyp_docs, max_length=10): - """ - Split documents into chunks of max_length size. 
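-
-    For example (illustrative only), a 25-segment document "docA" split with
-    max_length=10 produces chunks "docA.0" (segments 1-10), "docA.1"
-    (segments 11-20) and "docA.2" (segments 21-25); each chunk also keeps
-    the segments before and after it as left/right context.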
- """ - src_docs = OrderedDict() - src_prev = OrderedDict() - src_next = OrderedDict() - for doc_id, segs in orig_src_docs.items(): - for chunk_id, (chunk, prev_ctx, next_ctx) in enumerate( - _split_list(segs, max_length) - ): - src_docs[f"{doc_id}.{chunk_id}"] = list(chunk) - src_prev[f"{doc_id}.{chunk_id}"] = list(prev_ctx) - src_next[f"{doc_id}.{chunk_id}"] = list(next_ctx) - - ref_docs = OrderedDict() - hyp_prev = OrderedDict() - hyp_next = OrderedDict() - for translator in orig_ref_docs: - ref_docs[translator] = OrderedDict() - hyp_prev[REFERENCE_AS_SYSTEM_PREFIX + translator] = OrderedDict() - hyp_next[REFERENCE_AS_SYSTEM_PREFIX + translator] = OrderedDict() - for doc_id, segs in orig_ref_docs[translator].items(): - for chunk_id, (chunk, prev_ctx, next_ctx) in enumerate( - _split_list(segs, max_length) - ): - ref_docs[translator][f"{doc_id}.{chunk_id}"] = list(chunk) - hyp_prev[REFERENCE_AS_SYSTEM_PREFIX + translator][ - f"{doc_id}.{chunk_id}" - ] = list(prev_ctx) - hyp_next[REFERENCE_AS_SYSTEM_PREFIX + translator][ - f"{doc_id}.{chunk_id}" - ] = list(next_ctx) - - hyp_docs = OrderedDict() - for system in orig_hyp_docs: - hyp_docs[system] = OrderedDict() - hyp_prev[system] = OrderedDict() - hyp_next[system] = OrderedDict() - for doc_id, segs in orig_hyp_docs[system].items(): - for chunk_id, (chunk, prev_ctx, next_ctx) in enumerate( - _split_list(segs, max_length) - ): - hyp_docs[system][f"{doc_id}.{chunk_id}"] = list(chunk) - hyp_prev[system][f"{doc_id}.{chunk_id}"] = list(prev_ctx) - hyp_next[system][f"{doc_id}.{chunk_id}"] = list(next_ctx) - - # print(src_prev) - return src_docs, ref_docs, hyp_docs, src_prev, src_next, hyp_prev, hyp_next - - -def select_docs(orig_src_docs, orig_ref_docs, orig_hyp_docs, tsv_file): - """ - Extract preselected segments from given documents and corresponding contexts. 
- """ - selected_docs = [] - print("Selecting the following documents only:") - with open(tsv_file, "r", encoding="utf8") as tsv: - for line in tsv: - _docid, _segid_first, _segid_last = line.strip().split("\t") - selected_docs.append((_docid, int(_segid_first), int(_segid_last))) - print(f" {selected_docs[-1]}") - - src_docs = OrderedDict() - src_prev = OrderedDict() - src_next = OrderedDict() - for doc_id, seg_id_1, seg_id_2 in selected_docs: - if doc_id not in orig_src_docs: - print( - f"Error: the selected document {doc_id} not found in the XML file/src" - ) - exit() - segs = orig_src_docs[doc_id] - chunk = segs[seg_id_1 - 1 : seg_id_2] - prev_ctx = segs[0 : seg_id_1 - 1] - next_ctx = segs[seg_id_2:] - chunk_id = f"#{seg_id_1}-{seg_id_2}" - - src_docs[f"{doc_id}{chunk_id}"] = chunk - src_prev[f"{doc_id}{chunk_id}"] = prev_ctx - src_next[f"{doc_id}{chunk_id}"] = next_ctx - - ref_docs = OrderedDict() - hyp_prev = OrderedDict() - hyp_next = OrderedDict() - for translator in orig_ref_docs: - ref_docs[translator] = OrderedDict() - hyp_prev[REFERENCE_AS_SYSTEM_PREFIX + translator] = OrderedDict() - hyp_next[REFERENCE_AS_SYSTEM_PREFIX + translator] = OrderedDict() - - for doc_id, seg_id_1, seg_id_2 in selected_docs: - if doc_id not in orig_ref_docs[translator]: - print( - f"Error: the selected document {doc_id} not found in the XML file/ref" - ) - exit() - - segs = orig_ref_docs[translator][doc_id] - chunk = segs[seg_id_1 - 1 : seg_id_2] - prev_ctx = segs[0 : seg_id_1 - 1] - next_ctx = segs[seg_id_2:] - chunk_id = f"#{seg_id_1}-{seg_id_2}" - - ref_docs[translator][f"{doc_id}{chunk_id}"] = chunk - hyp_prev[REFERENCE_AS_SYSTEM_PREFIX + translator][ - f"{doc_id}{chunk_id}" - ] = prev_ctx - hyp_next[REFERENCE_AS_SYSTEM_PREFIX + translator][ - f"{doc_id}{chunk_id}" - ] = next_ctx - - hyp_docs = OrderedDict() - for system in orig_hyp_docs: - hyp_docs[system] = OrderedDict() - hyp_prev[system] = OrderedDict() - hyp_next[system] = OrderedDict() - - for doc_id, seg_id_1, seg_id_2 in selected_docs: - if doc_id not in orig_hyp_docs[system]: - print( - f"Error: the selected document {doc_id} not found in the XML file/hyp" - ) - exit() - - segs = orig_hyp_docs[system][doc_id] - chunk = segs[seg_id_1 - 1 : seg_id_2] - prev_ctx = segs[0 : seg_id_1 - 1] - next_ctx = segs[seg_id_2:] - chunk_id = f"#{seg_id_1}-{seg_id_2}" - - hyp_docs[system][f"{doc_id}{chunk_id}"] = chunk - hyp_prev[system][f"{doc_id}{chunk_id}"] = prev_ctx - hyp_next[system][f"{doc_id}{chunk_id}"] = next_ctx - - return src_docs, ref_docs, hyp_docs, src_prev, src_next, hyp_prev, hyp_next - - -def _split_list(list_a, chunk_size): - for i in range(0, len(list_a), chunk_size): - prev_context = list_a[0:i] - next_context = list_a[i + chunk_size :] - yield list_a[i : i + chunk_size], prev_context, next_context - - -def _create_bad_ref(seg_text: str, ref_text: str, character_based: bool = False) -> str: - """ - Creates bad reference for given text. - - Segment length (a, b] to phrase length (excluding a, including b) - mapping defined as follows: - ( 0, 1] : 1 - ( 1, 5] : 2 - ( 5, 8] : 3 - ( 8, 15] : 4 - (15, 20] : 5 - (20, max] : 6 - - For character-based languages, which do not support tokenisation - by whitespace, the resulting phrase length will be doubled, and - is interpreted as a character length. 
- """ - seg_data = seg_text.split(' ') - ref_data = ref_text.split(' ') - - if character_based: - seg_data = [x for x in seg_text] - ref_data = [x for x in ref_text] - - seg_len = len(seg_data) - ref_len = len(ref_data) - - # Determine length of bad phrase, relative to segment length. - _seg_to_bad_mapping = { - (None, 1): 2, - (1, 5): 2, - (5, 8): 3, - (8, 15): 4, - (15, 20): 5, - (20, None): 6, - } - - bad_len = 0 - for seg_pair in _seg_to_bad_mapping: - left, right = seg_pair - - # seg_len == right; left edge case - if not left: - if seg_len == right: - bad_len = _seg_to_bad_mapping[seg_pair] - break - - # left < seg_len; right edge case - elif not right: - if left < seg_len: - bad_len = _seg_to_bad_mapping[seg_pair] - break - - # left < seg_len <= right; middle cases - elif left < seg_len <= right: - bad_len = _seg_to_bad_mapping[seg_pair] - break - - # Double length of bad phrase for character-based languages. - if character_based: - bad_len = 2 * bad_len - - # Determine random replacement position. For segments longer than - # (bad_len + 1), we enforce that this cannot be sentence initial - # or final, so positions 0 and (seg_len - bad_len -1) are invalid - # and we use an embedded bad_pos in [1, (seg_len - bad_len - 1)]. - # This happens for all seg_len > 3. - bad_pos = 0 - if seg_len - bad_len > 0: - bad_pos = choice(range(seg_len - bad_len)) - - elif seg_len > 3: - _xs = max(1, seg_len - bad_len - 1) - bad_pos = choice([x + 1 for x in range(_xs)]) - - ref_pos = 0 - if ref_len - bad_len > 0: - ref_pos = choice(range(ref_len - bad_len)) - - bad_data = ( - seg_data[:bad_pos] - + ref_data[ref_pos : ref_pos + bad_len] - + seg_data[bad_pos + bad_len :] - ) - bad_text = ' '.join(bad_data) - if character_based: - bad_text = ''.join(bad_data) - - # print(seg_text) - # print(bad_text) - # print('------------') - return bad_text - - -def create_bad_refs( - docs: Dict[str, List[Tuple[str, str]]], - refs: Dict[str, List[Tuple[str, str]]], - character_based: bool = False, -) -> Dict[str, List[Tuple[str, str]]]: - """ - Creates bad references for given documents. - - For each segment in the given documents, this creates a so-called - ``bad reference'' which is constructed by replacing an embedded - phrase p with a randomly placed phrase p' of the same length, - taken from a different segment contained in refs. The length of - the phrase is relative to the full segment length. - - See _create_bad_ref() definition for length mapping details. - """ - # Create mapping from f'{doc_id}_{seg_id}' to reference text. - all_refs = {} - for curr_doc_id, curr_doc in refs.items(): - for curr_seg_id, curr_ref_text in curr_doc: - all_refs[f'{curr_doc_id}_{curr_seg_id}'] = curr_ref_text - - # Create list of f'{doc_id}_{seg_id}' ids, to be used for random - # choice later when we want to identify a reference to work with. - all_keys = list(all_refs.keys()) - - # Iterate through documents and create bad references. - bad_docs: Dict[str, List[Tuple[str, str]]] = OrderedDict() - for curr_doc_id, curr_doc in docs.items(): - if not curr_doc_id in bad_docs: - bad_docs[curr_doc_id] = [] - - print(f'doc_id: {curr_doc_id},\tdoc_len: {len(curr_doc)}') - for curr_seg in curr_doc: - curr_seg_id, curr_seg_text = curr_seg - - # Bad reference id may not be identical to current id. 
- bad_id = choice(all_keys) - while bad_id == f'{curr_doc_id}_{curr_seg_id}': - bad_id = choice(all_keys) - - curr_bad_text = _create_bad_ref( - curr_seg_text, - all_refs[bad_id], - character_based=character_based, - ) - - # Ensure that keys can be reused. - all_keys.append(bad_id) - - bad_docs[curr_doc_id].append((curr_seg_id, curr_bad_text)) - - return bad_docs - - -def parse_cmd_args(): - parser = argparse.ArgumentParser() - parser.add_argument( - "-f", - "--xml-file", - help="path to .xml file with sources, references and system outputs", - required=True, - ) - parser.add_argument( - "-o", - "--output-prefix", - help="prefix for .csv and .json output files", - required=True, - ) - parser.add_argument( - "-s", - "--src-lang", - help="ISO code for source language for Appraise", - required=True, - ) - parser.add_argument( - "-t", - "--tgt-lang", - help="ISO code for target language for Appraise", - required=True, - ) - parser.add_argument( - "-c", - "--char-based", - help="target language is character-based", - action="store_true", - ) - parser.add_argument( - "--no-qc", - help="do not generate BAD references as quality control items", - action="store_true", - ) - parser.add_argument( - "--max-tasks", - help="maximum number of tasks to generate, default: 100", - type=int, - default=100, - ) - parser.add_argument( - "--max-segs", - help="maximum number of sentences per document", - type=int, - default=MAX_DOC_LENGTH, - ) - parser.add_argument( - "--rng-seed", - help="seed for random number generator", - type=int, - default=123456, - ) - parser.add_argument( - "--selected-docs", - help="path to a file with preselected documents; format: docid segid1 segid2", - ) - parser.add_argument( - "--static-context", - help="number of preceding/succesive segments to show as a static context", - type=int, - default=MAX_DOC_LENGTH, # a large number should use all available segments - ) - parser.add_argument( - "--even", - help="duplicate one task is necessary to keep the total number of tasks even", - action="store_true", - ) - args = parser.parse_args() - return ( - args.xml_file, - args.output_prefix, - args.src_lang, - args.tgt_lang, - args.char_based, - not args.no_qc, - args.max_tasks, - args.max_segs, - args.rng_seed, - args.selected_docs, - args.static_context, - args.even, - ) - - -if __name__ == "__main__": - """ - Example usage: - python3 create_wmt22_tasks.py -f newstest2021.en-de.all.xml -o batches.en-de -s enu -t deu -m 50 - """ - - ( - XML_FILE, - OUT_NAME, - SRC_LANG, - TGT_LANG, - CHARLANG, - CONTROLS, - TASK_MAX, - MAX_SEGS, - RND_SEED, - SELECTED, - CTX_SIZE, - EVEN_NUM, - ) = parse_cmd_args() - - print(f'Character based={CHARLANG}') - ENC = 'utf-8' - seed(RND_SEED) - - print(f'Quality control={CONTROLS}') - if not CONTROLS or TGT_LANG == 'sgg': # no BAD refs if the target size has videos - REQUIRED_SEGS = 100 - else: - REQUIRED_SEGS = 80 - print(f'Setting REQUIRED_SEGS={REQUIRED_SEGS}') - - SYS_DOCS: Dict[str, Dict[str, List[Tuple[str, str]]]] = OrderedDict() - BAD_DOCS: Dict[str, Dict[str, List[Tuple[str, str]]]] = OrderedDict() - print(f'Loading docs from {XML_FILE}') - src_lang, SRC_DOCS, ref_lang, REF_DOCS, hyp_lang, SYS_DOCS = unwrap_xml( - XML_FILE, encoding=ENC - ) - - if SELECTED: - docs_tuple = select_docs(SRC_DOCS, REF_DOCS, SYS_DOCS, SELECTED) - else: - docs_tuple = chop_docs(SRC_DOCS, REF_DOCS, SYS_DOCS, MAX_SEGS) - - ( - SRC_DOCS, - REF_DOCS, - SYS_DOCS, - SRC_PREV, - SRC_NEXT, - SYS_PREV, - SYS_NEXT, - ) = docs_tuple - - # This reference will be used for generating BAD 
items - REF_ID = sorted(list(REF_DOCS.keys()))[0] - print(f'Using reference "{REF_ID}"') - - # Add references as additional system outputs - if INCLUDE_REFERENCES_AS_SYSTEMS: - for ref_id in sorted(list(REF_DOCS.keys())): - sys_id = REFERENCE_AS_SYSTEM_PREFIX + ref_id - print(f'Adding reference "{ref_id}" as system output "{sys_id}"') - SYS_DOCS[sys_id] = REF_DOCS[ref_id] - - # List of system names that can be iterated deterministically - SYS_IDS = sorted(list(SYS_DOCS.keys())) - print("SYS IDS size:", len(SYS_IDS)) - - for sys_id in SYS_IDS: - print(f'Generating bad references for {sys_id}') - BAD_DOCS[sys_id] = create_bad_refs( - SYS_DOCS[sys_id], REF_DOCS[REF_ID], character_based=CHARLANG - ) - - # pylint: disable-msg=invalid-name - some_sys_id = choice(SYS_IDS) - some_doc_id = choice(sorted(list(SYS_DOCS[some_sys_id].keys()))) - some_sys_text = SYS_DOCS[some_sys_id][some_doc_id] - some_bad_text = BAD_DOCS[some_sys_id][some_doc_id] - print("Example:", some_sys_id, some_doc_id) - - for _s, _b in zip(some_sys_text, some_bad_text): - print(_s) - print(_b) - print('---') - - DOC_STATS: Dict[int, List[Tuple[int, str, str]]] = OrderedDict() - for sys_id in SYS_IDS: - for doc_id in SYS_DOCS[sys_id].keys(): - doc_len = len(SYS_DOCS[sys_id][doc_id]) - - # We do not support documents longer than 70 segments. - if doc_len > MAX_DOC_LENGTH: - print("!!! DOCUMENT TOO LONG:", doc_id) - continue - - if not doc_len in DOC_STATS.keys(): - DOC_STATS[doc_len] = [] - DOC_STATS[doc_len].append((doc_len, doc_id, sys_id)) - - # Randomise system order - for doc_len in DOC_STATS: - shuffle(DOC_STATS[doc_len]) - - print("Doc. stats (doc.len/count):", DOC_STATS.keys()) - total_docs = 0 - total_sys = set() - for doc_len in DOC_STATS.keys(): - print(f' {doc_len}:\t{len(DOC_STATS[doc_len])}') - total_docs += len(DOC_STATS[doc_len]) - for x in DOC_STATS[doc_len]: - total_sys.add(x[2]) - print("total docs:", total_docs) - print("total sys:", total_sys) - - all_systems = list(total_sys) - sampled_tasks: List[Tuple[Tuple[int, str, str], ...]] = [] - CURR_LEN = 0 - CURR_SYS = 0 - curr_task: List[Tuple[int, str, str]] = [] - DOC_STATS_COPY = deepcopy(DOC_STATS) - last_task = False - while DOC_STATS.keys(): - ALL_KEYS = sorted(list(DOC_STATS.keys())) - # Maximum allowed length of a document to not exceed 100 segments in this task - max_delta = REQUIRED_SEGS - CURR_LEN - valid_keys = [x for x in ALL_KEYS if x <= max_delta] - - if not valid_keys: - print(" #segments in current task:", CURR_LEN) - for _doc in curr_task: - print(" ", _doc) - print('------') - sampled_tasks.append(tuple(curr_task)) - CURR_LEN = 0 - curr_task = [] - if last_task: # Stop if this was the last task with - break - continue - - # Take the document that fill in the allowed size perfectly, or random - if max_delta in valid_keys: - curr_key = max_delta - else: - curr_key = choice(valid_keys) - - CURR_LEN += curr_key - curr_val = DOC_STATS[curr_key].pop(0) # This takes a random system. - # print(' ... selected ', curr_val) - # print(' .. 
left systems', sum( len(DOC_STATS[k]) for k in DOC_STATS )) - - # Below code would pick systems one after the other - # curr_val = None - # for iter_val in DOC_STATS[curr_key]: - # if iter_val[2] == all_systems[CURR_SYS]: - # curr_val = iter_val - # DOC_STATS[curr_key].remove(iter_val) - # break - - # if not curr_val: - # curr_val = DOC_STATS[curr_key].pop(0) - # CURR_SYS = all_systems.index(curr_val[2]) - # CURR_SYS = (CURR_SYS + 1) % len(all_systems) - - curr_task.append(curr_val) - if not DOC_STATS[curr_key]: - DOC_STATS.pop(curr_key) - - # If there are some documents left that cannot form a full task with - # 100 segments, take random documents to create the last task. - # This ensures that all documents have been used at least once. - if ( - USE_ALL_DOCUMENTS_AND_ALL_SYSTEMS - and len(DOC_STATS) == 0 - and len(curr_task) > 0 - ): - DOC_STATS = DOC_STATS_COPY - last_task = True - print('Creating last batch with padded documents') - - # print("------------") - # print("Left docs:") - # print(DOC_STATS) - # print("------------") - - # Print documents per system - _all_tasks = [] - for _tup in sampled_tasks: - _all_tasks += list(_tup) - _docs_by_sys: Dict[str, Any] = {} - for (_, docid, sysid) in _all_tasks: - if sysid not in _docs_by_sys: - _docs_by_sys[sysid] = [] - _docs_by_sys[sysid].append(docid) - for i, sysid in enumerate(_docs_by_sys): - print(i, sysid) - for j, docid in enumerate(sorted(_docs_by_sys[sysid])): - print(" ", j, docid) - - # Shuffle order of tasks - shuffle(sampled_tasks) - print("Total number of tasks:", len(sampled_tasks)) - - padded_tasks: List[Tuple[Tuple[int, str, str], ...]] = [] - for tid, task in enumerate(sampled_tasks): - task_docs = len(task) - task_len = sum([x[0] for x in task]) - print(f'task_len: {task_len}') - if task_len > MAX_TASK_SIZE: - raise NotImplementedError( - 'No support for tasks >{0} items!'.format(MAX_TASK_SIZE) - ) - - elif task_len < MAX_TASK_SIZE: - pad_size = MAX_TASK_SIZE - task_len - pad_data: List[Tuple[int, str, str]] = list(task) - pad_pos = 0 - while pad_size > 0: - print(f'pad_size: {pad_size}') - print(f'pad_pos: {pad_pos}') - pad_data.append(tuple(list(pad_data[pad_pos]) + [True])) # type: ignore - print(pad_data[-1]) - pad_size -= pad_data[-1][0] - pad_pos = (pad_pos + 1) % task_docs - if pad_size < 0: - print(f'pad_size: {pad_size}') - print(f'pad_pos: {pad_pos}') - - last_doc: Tuple[int, str, str] = pad_data[-1] - print(last_doc[0], '-->', last_doc[0] + pad_size) - fixed_doc = (last_doc[0] + pad_size, *last_doc[1:]) - pad_data[-1] = fixed_doc - print(pad_data[-1][0]) - padded_tasks.append(tuple(pad_data)) - print("Padded tasks:") - for _pad in padded_tasks[-1]: - print(" ", _pad) - - else: - print(f'WARNING: no control items in task no. {tid}') - # raise NotImplementedError('Needs isControl=True update!') - padded_tasks.append(tuple(task)) # TODO: does this ever occur? 
- - if EVEN_NUM and len(padded_tasks) % 2 == 1: - print('Duplicating one batch to keep the number of tasks even') - padded_tasks.append(padded_tasks[0]) - print(f'Number of tasks now is {len(padded_tasks)}') - - csv_data = [] - task_id = 0 - for task in padded_tasks: - task_id += 1 - task_len = sum([x[0] for x in task]) - print(f'>>> task_len: {task_len}') - - for _doc in task: - _data = [str(task_id)] - for x in _doc: # type: ignore - _data.append(str(x)) - - if _data[-1] != 'True': - _data.append('False') # isControl=False - print('>>> ', ' '.join(_data)) - csv_data.append(','.join(_data)) - - with open(f'{OUT_NAME}.csv', mode='w') as _file: - for csv_line in csv_data: - _file.write(csv_line) - _file.write('\n') - - json_data = [] - batch_id = 0 - for task in padded_tasks[:TASK_MAX]: - # Remember, batch numbers are one-based - task_data = OrderedDict( - { - 'batchNo': batch_id + 1, - 'batchSize': 100, - 'sourceLanguage': SRC_LANG, - 'targetLanguage': TGT_LANG, - 'requiredAnnotations': 1, - 'randomSeed': RND_SEED, - } - ) - - source_id = basename(XML_FILE) - - items_data: List[List[Dict[str, Any]]] = [] # Keeps items grouped into document - _item = 0 - doc_counter = 0 - for doc_data in task: - items_data.append([]) # Add a new bucket for items from this documents - has_control_item = False - - doc_len, doc_id, sys_id, *rest = doc_data # type: ignore - - isControl = rest is not None and rest - - target_id = sys_id - - _src = {} - _ref = {} - _bad = {} - _tgt = {} - - for item_id, item_src in SRC_DOCS[doc_id]: - seg_id = f'{doc_id}_{item_id}' - _src[seg_id] = item_src - - for item_id, item_ref in REF_DOCS[REF_ID][doc_id]: - seg_id = f'{doc_id}_{item_id}' - _ref[seg_id] = item_ref - - for item_id, item_bad in BAD_DOCS[sys_id][doc_id]: - seg_id = f'{doc_id}_{item_id}' - _bad[seg_id] = item_bad - - for item_id, item_tgt in SYS_DOCS[sys_id][doc_id]: - seg_id = f'{doc_id}_{item_id}' - _tgt[seg_id] = item_tgt - - seg_counter = 0 - context_src: List[Text] = [] - context_ref: List[Text] = [] - context_bad: List[Text] = [] - context_tgt: List[Text] = [] - for seg_id in _src: - if seg_counter >= doc_len: # Padding tasks are shorter! 
- break - item_src = _src[seg_id] - item_ref = _ref[seg_id] - item_bad = _bad[seg_id] - item_tgt = _tgt[seg_id] - - target_text = item_tgt - target_type = 'TGT' - - # Do not generate any BAD items if QC is disabled - if CONTROLS and isControl: - randomCoinFlip = choice( - [False, False, True, True, True] # 60:40 chance - ) - if randomCoinFlip: - target_text = item_bad - target_type = 'BAD' - has_control_item = True - - src_ctx = [] - tgt_ctx = [] - if seg_counter == 0: - src_ctx = [txt for _, txt in SRC_PREV[doc_id]][-CTX_SIZE:] - tgt_ctx = [txt for _, txt in SYS_PREV[sys_id][doc_id]][-CTX_SIZE:] - - obj: Dict[str, Any] = OrderedDict() - obj['_item'] = _item - obj['_block'] = -1 - obj['sourceID'] = source_id - obj['sourceContextLeft'] = '\n'.join(src_ctx) - obj['sourceText'] = item_src - obj['targetID'] = target_id - obj['targetContextLeft'] = '\n'.join(tgt_ctx) - obj['targetText'] = target_text - obj['itemID'] = seg_counter - obj['itemType'] = target_type - obj['documentID'] = doc_id - obj['isCompleteDocument'] = False - - # print(seg_id) - # print(' '.join(context_src)) - # print(item_src) - # print('...') - # print(' '.join(context_tgt)) - # print(item_tgt.encode('utf-8')) - # print('---') - - context_src.append(item_src) - context_ref.append(item_ref) - context_bad.append(item_bad) - context_tgt.append(target_text) - - items_data[-1].append(obj) - _item += 1 - seg_counter += 1 - - src_ctx = [] - tgt_ctx = [] - src_ctx = [txt for _, txt in SRC_NEXT[doc_id]][:CTX_SIZE] - tgt_ctx = [txt for _, txt in SYS_NEXT[sys_id][doc_id]][:CTX_SIZE] - - obj = OrderedDict() - obj['_item'] = _item - obj['_block'] = -1 - obj['sourceContextLeft'] = '\n'.join(src_ctx) - obj['sourceID'] = source_id - obj['sourceText'] = ' '.join(context_src) # full document - obj['targetContextLeft'] = '\n'.join(tgt_ctx) - obj['targetID'] = target_id - obj['targetText'] = ' '.join(context_tgt) # full document - obj['itemID'] = item_id - obj['itemType'] = 'BAD' if has_control_item else 'TGT' - obj['documentID'] = doc_id - obj['isCompleteDocument'] = True - items_data[-1].append(obj) - - if has_control_item and SHUFFLE_DOCS_WITH_CONTROL_ITEMS: - # Move the document with control items to a random position so - # that they are not accumulated as very last documents - _bad_doc = items_data.pop() - _pos = randint(0, len(items_data) - 1) - print(f' Moving the last QC document to position {_pos}') - items_data.insert(_pos, _bad_doc) - - # Extract items from documents - _items_data = [item for doc_items in items_data for item in doc_items] - # Re-assign _item numbers - if SHUFFLE_DOCS_WITH_CONTROL_ITEMS: - _item = 0 - for i in range(len(_items_data)): - _items_data[i]['_item'] = _item - if _items_data[i]['isCompleteDocument'] == False: - _item += 1 - - output_data = OrderedDict({'task': task_data, 'items': _items_data}) - - json_data.append(output_data) - - # write out JSON - json_text = json_dumps(json_data, indent=2, sort_keys=True) - - json_file_name = f'{OUT_NAME}.json' - with open(json_file_name, mode='w', encoding='utf8') as out_file: - sys.stdout.write( - 'Creating {0}, batch no. {1} ... 
'.format(json_file_name, batch_id + 1), - ) - out_file.write(str(json_text)) - sys.stdout.write('OK\n') - - batch_id += 1 - - print(f'Total tasks: {len(sampled_tasks)}') - print(f'Total docs: {total_docs}') - print(f'Total sys: {len(total_sys)} {sorted(list(total_sys))}') From 5a95f92dc02a84b6145e91683be67753a3a328e1 Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Fri, 11 Jul 2025 15:51:42 -0700 Subject: [PATCH 14/51] upgrade requirements --- requirements-dev.txt | 2 +- requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index 5a8770b1..fe86d4a5 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,5 +1,5 @@ -r requirements.txt -black==22.3.0 +black==24.3.0 mypy pylint pylint-django diff --git a/requirements.txt b/requirements.txt index 3cf747d8..95a77972 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ beautifulsoup4 -django==4.1 +django==4.2.22 django-stubs lxml psycopg2-binary From dfbba0651bd8188460e840bd450d4b1abe35df58 Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Fri, 11 Jul 2025 15:53:50 -0700 Subject: [PATCH 15/51] upgrade Python to 3.12 --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 2e0c0ca2..142b348c 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -7,7 +7,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.9] + python-version: [3.12] steps: - uses: actions/checkout@v4 From 53e848cc4cae22da644538102df20b654368be02 Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Fri, 11 Jul 2025 15:57:33 -0700 Subject: [PATCH 16/51] reformat with newer version of black --- Appraise/settings.py | 5 +- Appraise/urls.py | 1 + Appraise/utils.py | 7 +-- Appraise/wsgi.py | 1 + Campaign/admin.py | 1 + .../commands/ComputeSystemScores.py | 1 + .../commands/ComputeWMT21Results.py | 2 +- .../management/commands/ComputeZScores.py | 4 +- .../commands/InitCampaignMMT18Task1.py | 1 + .../commands/InitCampaignMMT18Task1b.py | 1 + .../commands/InitCampaignMMT18Task1bv2.py | 1 + .../commands/InitCampaignMMT18Task1bv3.py | 1 + .../commands/InitCampaignMMT18Task1v2.py | 1 + .../commands/InitCampaignMMT18Task1v3.py | 1 + .../commands/InitCampaignWMT18RefDA.py | 1 + .../commands/InitCampaignWMT18RefDA2.py | 1 + .../commands/InitCampaignWMT18RefDA3.py | 1 + .../commands/InitCampaignWMT18RefDA4.py | 1 + .../commands/InitCampaignWMT18SrcDA.py | 1 + .../management/commands/MakeAnnotation.py | 4 +- .../management/commands/StartNewCampaign.py | 2 + .../commands/UpdateCampaignModels.py | 2 + Campaign/management/commands/init_campaign.py | 2 + .../commands/validatecampaigndata.py | 1 + Campaign/models.py | 1 + Campaign/tests.py | 1 + Campaign/utils.py | 1 + Campaign/views.py | 21 ++++---- Dashboard/admin.py | 1 + Dashboard/apps.py | 1 + .../management/commands/CreateInviteTokens.py | 1 + .../commands/UpdateDashboardModels.py | 2 + Dashboard/models.py | 1 + Dashboard/tests.py | 1 + Dashboard/utils.py | 5 +- Dashboard/views.py | 5 +- EvalData/admin.py | 1 + EvalData/apps.py | 2 + EvalData/error_types.py | 1 + .../commands/CombineSubsetTextData.py | 2 + .../commands/CreateDirectAssessmentData.py | 2 + .../CreateDirectAssessmentDataWMT17.py | 2 + .../management/commands/CreateFakeBadRefs.py | 2 + .../management/commands/CreateIdsFiles.py | 2 + .../CreateMultiModalAssessmentData.py | 2 + .../commands/CreateSubsetTextData.py | 1 + 
.../management/commands/DumpAllResults.py | 2 + .../commands/DumpScoresAndMetadata.py | 2 + .../commands/PatchDirectAssessmentData.py | 1 + .../commands/UnlinkDirectAssessmentTasks.py | 2 + .../commands/UpdateEvalDataModels.py | 2 + .../commands/ValidateDirectAssessmentData.py | 1 + EvalData/models/__init__.py | 1 + EvalData/models/base_models.py | 1 + EvalData/models/data_assessment.py | 1 + EvalData/models/direct_assessment.py | 1 + EvalData/models/direct_assessment_context.py | 2 +- EvalData/models/direct_assessment_document.py | 53 ++++++++++--------- EvalData/models/multi_modal_assessment.py | 1 + EvalData/models/pairwise_assessment.py | 3 +- .../models/pairwise_assessment_document.py | 3 +- EvalData/models/task_agenda.py | 1 + EvalData/views.py | 1 + EvalView/admin.py | 1 + EvalView/apps.py | 2 + EvalView/models.py | 1 + EvalView/tests.py | 1 + EvalView/views.py | 25 ++++----- Makefile | 3 ++ Scripts/create_iwslt22_tasks.py | 2 +- Scripts/create_wmt22_pairwise_tasks.py | 35 ++++++++---- Scripts/create_wmt22_tasks.py | 2 +- deprecated.py | 5 +- 73 files changed, 176 insertions(+), 81 deletions(-) diff --git a/Appraise/settings.py b/Appraise/settings.py index b0347d44..59c9c413 100644 --- a/Appraise/settings.py +++ b/Appraise/settings.py @@ -9,6 +9,7 @@ For the full list of settings and their values, see https://docs.djangoproject.com/en/1.11/ref/settings/ """ + import logging import os import warnings @@ -37,7 +38,9 @@ ALLOWED_HOSTS = os.environ.get('APPRAISE_ALLOWED_HOSTS', '127.0.0.1').split(',') -CSRF_TRUSTED_ORIGINS = os.environ.get('APPRAISE_CSRF_TRUSTED_ORIGINS', 'https://*.127.0.0.1').split(',') +CSRF_TRUSTED_ORIGINS = os.environ.get( + 'APPRAISE_CSRF_TRUSTED_ORIGINS', 'https://*.127.0.0.1' +).split(',') WSGI_APPLICATION = os.environ.get( 'APPRAISE_WSGI_APPLICATION', 'Appraise.wsgi.application' diff --git a/Appraise/urls.py b/Appraise/urls.py index 2274e1c6..54a122be 100644 --- a/Appraise/urls.py +++ b/Appraise/urls.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + # pylint: disable=unused-import,import-error from django.conf.urls import handler404 from django.conf.urls import handler500 diff --git a/Appraise/utils.py b/Appraise/utils.py index 4e9093e2..27540d8a 100644 --- a/Appraise/utils.py +++ b/Appraise/utils.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + import logging from Appraise.settings import LOG_HANDLER @@ -33,8 +34,8 @@ def _compute_user_total_annotation_time(timestamps): def _clamp_time(seconds): # if a segment takes longer than 10 minutes, set it to 5 minutes # it's likely due to inactivity - if seconds >= 10*60: - return 5*60 + if seconds >= 10 * 60: + return 5 * 60 else: return seconds @@ -54,4 +55,4 @@ def _clamp_time(seconds): # Update the previous end timestamp previous_end_timestamp = end_timestamp - return total_annotation_time \ No newline at end of file + return total_annotation_time diff --git a/Appraise/wsgi.py b/Appraise/wsgi.py index 7ec10196..6c2ef408 100644 --- a/Appraise/wsgi.py +++ b/Appraise/wsgi.py @@ -6,6 +6,7 @@ For more information on this file, see https://docs.djangoproject.com/en/1.11/howto/deployment/wsgi/ """ + import os from django.core.wsgi import get_wsgi_application diff --git a/Campaign/admin.py b/Campaign/admin.py index 55d82a1c..5e39b963 100644 --- a/Campaign/admin.py +++ b/Campaign/admin.py @@ -1,6 +1,7 @@ """ Campaign admin.py """ + # pylint: disable=C0330,import-error from django.contrib import admin from django.contrib.admin.filters import AllValuesFieldListFilter diff --git 
a/Campaign/management/commands/ComputeSystemScores.py b/Campaign/management/commands/ComputeSystemScores.py index 4898fba1..9e948135 100644 --- a/Campaign/management/commands/ComputeSystemScores.py +++ b/Campaign/management/commands/ComputeSystemScores.py @@ -10,6 +10,7 @@ from EvalData.models import DirectAssessmentResult from EvalData.models import DirectAssessmentTask + # pylint: disable=C0111,C0330,E1101 class Command(BaseCommand): help = 'Computes system scores over all results' diff --git a/Campaign/management/commands/ComputeWMT21Results.py b/Campaign/management/commands/ComputeWMT21Results.py index d5c6dab6..76eaa645 100644 --- a/Campaign/management/commands/ComputeWMT21Results.py +++ b/Campaign/management/commands/ComputeWMT21Results.py @@ -463,7 +463,7 @@ def handle(self, *args, **options): wins_for_system = defaultdict(list) losses_for_system = defaultdict(list) p_level = 0.05 - for (sysA, sysB) in combinations_with_replacement(system_ids, 2): + for sysA, sysB in combinations_with_replacement(system_ids, 2): sysA_ids = set([x[0] for x in system_z_scores[sysA]]) sysB_ids = set([x[0] for x in system_z_scores[sysB]]) good_ids = set.intersection(sysA_ids, sysB_ids) diff --git a/Campaign/management/commands/ComputeZScores.py b/Campaign/management/commands/ComputeZScores.py index f11be90e..183e0ee2 100644 --- a/Campaign/management/commands/ComputeZScores.py +++ b/Campaign/management/commands/ComputeZScores.py @@ -427,7 +427,7 @@ def handle(self, *args, **options): wins_for_system = defaultdict(list) p_level = 0.05 - for (sysA, sysB) in combinations_with_replacement(system_ids, 2): + for sysA, sysB in combinations_with_replacement(system_ids, 2): sysA_ids = set([x[0] for x in system_z_scores[sysA]]) sysB_ids = set([x[0] for x in system_z_scores[sysB]]) good_ids = set.intersection(sysA_ids, sysB_ids) @@ -577,7 +577,7 @@ def sort_by_wins_and_z_score(x, y): key = system_id[:4].upper() vsystems[key].extend(system_z_scores[system_id]) - for (sysA, sysB) in combinations_with_replacement( + for sysA, sysB in combinations_with_replacement( ['GOOG', 'CAND', 'PROD'], 2 ): sysA_scores = [x[1] for x in vsystems[sysA]] diff --git a/Campaign/management/commands/InitCampaignMMT18Task1.py b/Campaign/management/commands/InitCampaignMMT18Task1.py index d43dd536..26338ed9 100644 --- a/Campaign/management/commands/InitCampaignMMT18Task1.py +++ b/Campaign/management/commands/InitCampaignMMT18Task1.py @@ -31,6 +31,7 @@ } REDUNDANCY = 1 + # pylint: disable=C0111,C0330,E1101 class Command(BaseCommand): help = 'Initialises campaign MMT18 Task #1' diff --git a/Campaign/management/commands/InitCampaignMMT18Task1b.py b/Campaign/management/commands/InitCampaignMMT18Task1b.py index 6db7b98f..e6829bda 100644 --- a/Campaign/management/commands/InitCampaignMMT18Task1b.py +++ b/Campaign/management/commands/InitCampaignMMT18Task1b.py @@ -27,6 +27,7 @@ } REDUNDANCY = 1 + # pylint: disable=C0111,C0330,E1101 class Command(BaseCommand): help = 'Initialises campaign MMT18 Task #1.b' diff --git a/Campaign/management/commands/InitCampaignMMT18Task1bv2.py b/Campaign/management/commands/InitCampaignMMT18Task1bv2.py index a09ffca5..653cb06f 100644 --- a/Campaign/management/commands/InitCampaignMMT18Task1bv2.py +++ b/Campaign/management/commands/InitCampaignMMT18Task1bv2.py @@ -27,6 +27,7 @@ } REDUNDANCY = 1 + # pylint: disable=C0111,C0330,E1101 class Command(BaseCommand): help = 'Initialises campaign MMT18 Task #1.b v2' diff --git a/Campaign/management/commands/InitCampaignMMT18Task1bv3.py 
b/Campaign/management/commands/InitCampaignMMT18Task1bv3.py index 46c8e2c7..7503db6c 100644 --- a/Campaign/management/commands/InitCampaignMMT18Task1bv3.py +++ b/Campaign/management/commands/InitCampaignMMT18Task1bv3.py @@ -27,6 +27,7 @@ } REDUNDANCY = 1 + # pylint: disable=C0111,C0330,E1101 class Command(BaseCommand): help = 'Initialises campaign MMT18 Task #1.b v3' diff --git a/Campaign/management/commands/InitCampaignMMT18Task1v2.py b/Campaign/management/commands/InitCampaignMMT18Task1v2.py index 99c7908f..bd5fb2c2 100644 --- a/Campaign/management/commands/InitCampaignMMT18Task1v2.py +++ b/Campaign/management/commands/InitCampaignMMT18Task1v2.py @@ -31,6 +31,7 @@ } REDUNDANCY = 1 + # pylint: disable=C0111,C0330,E1101 class Command(BaseCommand): help = 'Initialises campaign MMT18 Task #1 v2' diff --git a/Campaign/management/commands/InitCampaignMMT18Task1v3.py b/Campaign/management/commands/InitCampaignMMT18Task1v3.py index 9d364728..2caa2932 100644 --- a/Campaign/management/commands/InitCampaignMMT18Task1v3.py +++ b/Campaign/management/commands/InitCampaignMMT18Task1v3.py @@ -27,6 +27,7 @@ } REDUNDANCY = 1 + # pylint: disable=C0111,C0330,E1101 class Command(BaseCommand): help = 'Initialises campaign MMT18 Task #1 v3' diff --git a/Campaign/management/commands/InitCampaignWMT18RefDA.py b/Campaign/management/commands/InitCampaignWMT18RefDA.py index 7513039a..117c58d3 100644 --- a/Campaign/management/commands/InitCampaignWMT18RefDA.py +++ b/Campaign/management/commands/InitCampaignWMT18RefDA.py @@ -23,6 +23,7 @@ TASKS = 100 REDUNDANCY = 1 + # pylint: disable=C0111,C0330,E1101 class Command(BaseCommand): help = 'Initialises campaign WMT18 RefDA' diff --git a/Campaign/management/commands/InitCampaignWMT18RefDA2.py b/Campaign/management/commands/InitCampaignWMT18RefDA2.py index c46a0b15..22479855 100644 --- a/Campaign/management/commands/InitCampaignWMT18RefDA2.py +++ b/Campaign/management/commands/InitCampaignWMT18RefDA2.py @@ -23,6 +23,7 @@ TASKS = 100 REDUNDANCY = 1 + # pylint: disable=C0111,C0330,E1101 class Command(BaseCommand): help = 'Initialises campaign WMT18 RefDA2' diff --git a/Campaign/management/commands/InitCampaignWMT18RefDA3.py b/Campaign/management/commands/InitCampaignWMT18RefDA3.py index 4a890793..c5ceaf37 100644 --- a/Campaign/management/commands/InitCampaignWMT18RefDA3.py +++ b/Campaign/management/commands/InitCampaignWMT18RefDA3.py @@ -23,6 +23,7 @@ TASKS = 34 REDUNDANCY = 1 + # pylint: disable=C0111,C0330,E1101 class Command(BaseCommand): help = 'Initialises campaign WMT18 RefDA3' diff --git a/Campaign/management/commands/InitCampaignWMT18RefDA4.py b/Campaign/management/commands/InitCampaignWMT18RefDA4.py index 8d2e4852..b58d4d0d 100644 --- a/Campaign/management/commands/InitCampaignWMT18RefDA4.py +++ b/Campaign/management/commands/InitCampaignWMT18RefDA4.py @@ -23,6 +23,7 @@ TASKS = 100 REDUNDANCY = 1 + # pylint: disable=C0111,C0330,E1101 class Command(BaseCommand): help = 'Initialises campaign WMT18 RefDA4' diff --git a/Campaign/management/commands/InitCampaignWMT18SrcDA.py b/Campaign/management/commands/InitCampaignWMT18SrcDA.py index f292a99b..e066880d 100644 --- a/Campaign/management/commands/InitCampaignWMT18SrcDA.py +++ b/Campaign/management/commands/InitCampaignWMT18SrcDA.py @@ -23,6 +23,7 @@ TASKS = 34 REDUNDANCY = 1 + # pylint: disable=C0111,C0330,E1101 class Command(BaseCommand): help = 'Initialises campaign WMT18 SrcDA' diff --git a/Campaign/management/commands/MakeAnnotation.py b/Campaign/management/commands/MakeAnnotation.py index 9faad8c5..9c0be3de 100644 
--- a/Campaign/management/commands/MakeAnnotation.py +++ b/Campaign/management/commands/MakeAnnotation.py @@ -123,9 +123,7 @@ def handle(self, *args, **options): exit() if options["verbosity"] > 1: - self.stdout.write( - f"Available context keys: {response.context.keys()}" - ) + self.stdout.write(f"Available context keys: {response.context.keys()}") # Each task has different context, so the POST request needs to be # built separately for each task type diff --git a/Campaign/management/commands/StartNewCampaign.py b/Campaign/management/commands/StartNewCampaign.py index 8680b7b7..3f172e13 100644 --- a/Campaign/management/commands/StartNewCampaign.py +++ b/Campaign/management/commands/StartNewCampaign.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + from datetime import datetime from os import path @@ -26,6 +27,7 @@ from Dashboard.utils import generate_confirmation_token from EvalData.management.commands.UpdateEvalDataModels import _update_eval_data_models + # pylint: disable=C0111,C0330,E1101 class Command(BaseCommand): help = 'A single command for creating a new campaign based on manifest file' diff --git a/Campaign/management/commands/UpdateCampaignModels.py b/Campaign/management/commands/UpdateCampaignModels.py index 796db261..04715028 100644 --- a/Campaign/management/commands/UpdateCampaignModels.py +++ b/Campaign/management/commands/UpdateCampaignModels.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + # pylint: disable=W0611 from os import path @@ -21,6 +22,7 @@ INFO_MSG = 'INFO: ' WARNING_MSG = 'WARN: ' + # pylint: disable=C0111,C0330,E1101 class Command(BaseCommand): help = 'Updates object instances required for Campaign app' diff --git a/Campaign/management/commands/init_campaign.py b/Campaign/management/commands/init_campaign.py index c8846ea6..537b8eff 100644 --- a/Campaign/management/commands/init_campaign.py +++ b/Campaign/management/commands/init_campaign.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + from datetime import datetime from django.core.management.base import BaseCommand @@ -21,6 +22,7 @@ from Campaign.utils import CAMPAIGN_TASK_TYPES from Dashboard.utils import generate_confirmation_token + # pylint: disable=C0111,C0330,E1101 class Command(BaseCommand): help = 'Initialises campaign based on manifest file' diff --git a/Campaign/management/commands/validatecampaigndata.py b/Campaign/management/commands/validatecampaigndata.py index 7432425f..4331c3f7 100644 --- a/Campaign/management/commands/validatecampaigndata.py +++ b/Campaign/management/commands/validatecampaigndata.py @@ -1,6 +1,7 @@ """ Appraise """ + # pylint: disable=C0103,C0111,C0330,E1101 import sys from json import loads diff --git a/Campaign/models.py b/Campaign/models.py index ed2632f1..46be62ed 100644 --- a/Campaign/models.py +++ b/Campaign/models.py @@ -1,6 +1,7 @@ """ Campaign models.py """ + # pylint: disable=C0111,C0330,E1101 from json import JSONDecodeError from json import loads diff --git a/Campaign/tests.py b/Campaign/tests.py index 35568675..7f23c473 100644 --- a/Campaign/tests.py +++ b/Campaign/tests.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + from pathlib import Path from django.contrib.auth.models import User diff --git a/Campaign/utils.py b/Campaign/utils.py index 18b17ecc..bcabdb08 100644 --- a/Campaign/utils.py +++ b/Campaign/utils.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + from collections import defaultdict from collections import OrderedDict from hashlib import md5 diff --git a/Campaign/views.py b/Campaign/views.py index d1198771..ce7cd863 
100644 --- a/Campaign/views.py +++ b/Campaign/views.py @@ -70,7 +70,6 @@ def campaign_status(request, campaign_name, sort_key=2): ) is_mqm_or_esa = False - # Exclude document scores in document-level tasks, because we want to keep # the numbers reported on the campaign status page consistent across # accounts, which usually include different numbers of document @@ -107,9 +106,10 @@ def campaign_status(request, campaign_name, sort_key=2): ) # compute time override based on document times import collections + _time_pairs = collections.defaultdict(list) for x in _data: - _time_pairs[x[7]+ " ||| " +x[4]].append((x[0], x[1])) + _time_pairs[x[7] + " ||| " + x[4]].append((x[0], x[1])) _time_pairs = [ (min([x[0] for x in doc_v]), max([x[1] for x in doc_v])) for doc, doc_v in _time_pairs.items() @@ -132,17 +132,15 @@ def campaign_status(request, campaign_name, sort_key=2): ) # compute time override based on document times import collections + _time_pairs = collections.defaultdict(list) for x in _data: - _time_pairs[x[7]+ " ||| " +x[4]].append((x[0], x[1])) + _time_pairs[x[7] + " ||| " + x[4]].append((x[0], x[1])) _time_pairs = [ (min([x[0] for x in doc_v]), max([x[1] for x in doc_v])) for doc, doc_v in _time_pairs.items() ] - _data = [ - (x[0], x[1], x[2], x[3], x[4], x[5], x[6]) - for x in _data - ] + _data = [(x[0], x[1], x[2], x[3], x[4], x[5], x[6]) for x in _data] else: _data = _data.values_list( 'start_time', @@ -171,7 +169,7 @@ def campaign_status(request, campaign_name, sort_key=2): _first_modified = str(_date_modified).split('.')[0] else: _first_modified = 'Never' - + # Compute last modified time _last_modified_raw = ( seconds_to_timedelta(max(_end_times)) if _end_times else None @@ -185,8 +183,10 @@ def campaign_status(request, campaign_name, sort_key=2): # Compute total annotation time if is_mqm_or_esa and _first_modified_raw and _last_modified_raw: # for MQM and ESA compute the lower and upper annotation times - # use only the end times - _annotation_time_upper = (_last_modified_raw-_first_modified_raw).seconds + # use only the end times + _annotation_time_upper = ( + _last_modified_raw - _first_modified_raw + ).seconds _hours = int(floor(_annotation_time_upper / 3600)) _minutes = int(floor((_annotation_time_upper % 3600) / 60)) _annotation_time_upper = f'{_hours:0>2d}h{_minutes:0>2d}m' @@ -206,7 +206,6 @@ def campaign_status(request, campaign_name, sort_key=2): else: _annotation_time = 'n/a' - _item = ( user.username, user.is_active, diff --git a/Dashboard/admin.py b/Dashboard/admin.py index 36289357..c09c0302 100644 --- a/Dashboard/admin.py +++ b/Dashboard/admin.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + # pylint: disable=import-error from django.contrib import admin diff --git a/Dashboard/apps.py b/Dashboard/apps.py index b44cea51..3105d180 100644 --- a/Dashboard/apps.py +++ b/Dashboard/apps.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + from django.apps import AppConfig diff --git a/Dashboard/management/commands/CreateInviteTokens.py b/Dashboard/management/commands/CreateInviteTokens.py index 275a6995..129c461c 100644 --- a/Dashboard/management/commands/CreateInviteTokens.py +++ b/Dashboard/management/commands/CreateInviteTokens.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + # pylint: disable=C0103 from collections import defaultdict from csv import DictReader diff --git a/Dashboard/management/commands/UpdateDashboardModels.py b/Dashboard/management/commands/UpdateDashboardModels.py index 77db1ce0..7a71f0d7 100644 --- 
a/Dashboard/management/commands/UpdateDashboardModels.py +++ b/Dashboard/management/commands/UpdateDashboardModels.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + # pylint: disable=C0103,W0611 from os.path import basename @@ -20,6 +21,7 @@ INFO_MSG = 'INFO: ' WARNING_MSG = 'WARN: ' + # pylint: disable=C0111,C0330 class Command(BaseCommand): help = 'Updates object instances required for Dashboard app' diff --git a/Dashboard/models.py b/Dashboard/models.py index a0305879..248dc916 100644 --- a/Dashboard/models.py +++ b/Dashboard/models.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + from uuid import uuid4 from django.contrib.auth.models import Group diff --git a/Dashboard/tests.py b/Dashboard/tests.py index 53b26aae..aae6718c 100644 --- a/Dashboard/tests.py +++ b/Dashboard/tests.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + # pylint: disable=unused-import from django.db import models diff --git a/Dashboard/utils.py b/Dashboard/utils.py index e7c76cf1..44848e13 100644 --- a/Dashboard/utils.py +++ b/Dashboard/utils.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + from collections import defaultdict from datetime import datetime from hashlib import md5 @@ -65,7 +66,9 @@ def run_quality_control(username): _data = _type.objects.filter(createdBy__username=username, completed=True) # Get the first result task type available: might not work in all scenarios if _data: - campaign_opts = set((_data[0].task.campaign.campaignOptions or "").lower().split(";")) + campaign_opts = set( + (_data[0].task.campaign.campaignOptions or "").lower().split(";") + ) result_type = _type break diff --git a/Dashboard/views.py b/Dashboard/views.py index 959cbda6..fc3f66c3 100644 --- a/Dashboard/views.py +++ b/Dashboard/views.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + from datetime import datetime from hashlib import md5 from inspect import currentframe @@ -83,7 +84,7 @@ def sso_login(request, username, password): logout(request) user = authenticate(username=username, password=password) - + # login failed if user is None: return redirect('dashboard') @@ -510,4 +511,4 @@ def dashboard(request): } ) - return render(request, 'Dashboard/dashboard.html', template_context) \ No newline at end of file + return render(request, 'Dashboard/dashboard.html', template_context) diff --git a/EvalData/admin.py b/EvalData/admin.py index 62a54039..62948ac2 100644 --- a/EvalData/admin.py +++ b/EvalData/admin.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + # pylint: disable=C0330 from datetime import timezone diff --git a/EvalData/apps.py b/EvalData/apps.py index a669ed4b..4ebf9a1d 100644 --- a/EvalData/apps.py +++ b/EvalData/apps.py @@ -3,8 +3,10 @@ See LICENSE for usage details """ + from django.apps import AppConfig + # pylint: disable=missing-docstring class EvaldataConfig(AppConfig): name = 'EvalData' diff --git a/EvalData/error_types.py b/EvalData/error_types.py index ab0b0a3e..604838f3 100644 --- a/EvalData/error_types.py +++ b/EvalData/error_types.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + from collections import OrderedDict ERROR_TYPES = { diff --git a/EvalData/management/commands/CombineSubsetTextData.py b/EvalData/management/commands/CombineSubsetTextData.py index caf8e853..512e4de6 100644 --- a/EvalData/management/commands/CombineSubsetTextData.py +++ b/EvalData/management/commands/CombineSubsetTextData.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + from collections import defaultdict from collections import OrderedDict from glob import iglob @@ -17,6 +18,7 @@ 
INFO_MSG = 'INFO: ' + # pylint: disable=C0111 class Command(BaseCommand): help = 'Creates combined subset text file based on given CSV file' diff --git a/EvalData/management/commands/CreateDirectAssessmentData.py b/EvalData/management/commands/CreateDirectAssessmentData.py index 0a6945b5..db1c723e 100644 --- a/EvalData/management/commands/CreateDirectAssessmentData.py +++ b/EvalData/management/commands/CreateDirectAssessmentData.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + import hashlib import json from collections import defaultdict @@ -25,6 +26,7 @@ # pylint: disable=E0401,W0611 + # pylint: disable=C0111 class Command(BaseCommand): help = 'Creates JSON file containing DirectAssessmentTask data' diff --git a/EvalData/management/commands/CreateDirectAssessmentDataWMT17.py b/EvalData/management/commands/CreateDirectAssessmentDataWMT17.py index ac838a47..72375c5e 100644 --- a/EvalData/management/commands/CreateDirectAssessmentDataWMT17.py +++ b/EvalData/management/commands/CreateDirectAssessmentDataWMT17.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + # pylint: disable=W0611 import json from collections import defaultdict @@ -19,6 +20,7 @@ from Dashboard.models import LANGUAGE_CODES_AND_NAMES + # pylint: disable=C0111 class Command(BaseCommand): help = 'Creates JSON file containing DirectAssessmentTask data' diff --git a/EvalData/management/commands/CreateFakeBadRefs.py b/EvalData/management/commands/CreateFakeBadRefs.py index c614471a..14a9b764 100644 --- a/EvalData/management/commands/CreateFakeBadRefs.py +++ b/EvalData/management/commands/CreateFakeBadRefs.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + # pylint: disable=W0611 from collections import defaultdict from collections import OrderedDict @@ -25,6 +26,7 @@ EXTENSION_FOR_BAD_FILES = 'bad' EXTENSION_FOR_IDS_FILES = 'ids' + # pylint: disable=C0111 class Command(BaseCommand): help = 'Creates fake bad references data' diff --git a/EvalData/management/commands/CreateIdsFiles.py b/EvalData/management/commands/CreateIdsFiles.py index 0333edf5..8b707ddd 100644 --- a/EvalData/management/commands/CreateIdsFiles.py +++ b/EvalData/management/commands/CreateIdsFiles.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + # pylint: disable=W0611 from collections import defaultdict from collections import OrderedDict @@ -25,6 +26,7 @@ EXTENSION_FOR_BAD_FILES = 'bad' EXTENSION_FOR_IDS_FILES = 'ids' + # pylint: disable=C0111 class Command(BaseCommand): help = 'Creates ids files' diff --git a/EvalData/management/commands/CreateMultiModalAssessmentData.py b/EvalData/management/commands/CreateMultiModalAssessmentData.py index c94b4bb2..f2f3c667 100644 --- a/EvalData/management/commands/CreateMultiModalAssessmentData.py +++ b/EvalData/management/commands/CreateMultiModalAssessmentData.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + # pylint: disable=W0611 import json from collections import defaultdict @@ -17,6 +18,7 @@ from Dashboard.models import LANGUAGE_CODES_AND_NAMES + # pylint: disable=C0111 class Command(BaseCommand): help = 'Creates JSON file containing MultiModalAssessmentTask data' diff --git a/EvalData/management/commands/CreateSubsetTextData.py b/EvalData/management/commands/CreateSubsetTextData.py index 1bf5b99b..2ad95c68 100644 --- a/EvalData/management/commands/CreateSubsetTextData.py +++ b/EvalData/management/commands/CreateSubsetTextData.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + # pylint: disable=W0611 from collections import defaultdict from collections import OrderedDict diff --git 
a/EvalData/management/commands/DumpAllResults.py b/EvalData/management/commands/DumpAllResults.py index 0045afa7..a5bea6d2 100644 --- a/EvalData/management/commands/DumpAllResults.py +++ b/EvalData/management/commands/DumpAllResults.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + from os.path import basename from django.core.management.base import BaseCommand @@ -17,6 +18,7 @@ INFO_MSG = 'INFO: ' WARNING_MSG = 'WARN: ' + # pylint: disable=C0111,C0330 class Command(BaseCommand): help = 'Dumps all DirectAssessmentResult and MultiModalAssessmentResult instances' diff --git a/EvalData/management/commands/DumpScoresAndMetadata.py b/EvalData/management/commands/DumpScoresAndMetadata.py index c9689667..49bfb0d8 100644 --- a/EvalData/management/commands/DumpScoresAndMetadata.py +++ b/EvalData/management/commands/DumpScoresAndMetadata.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + from gzip import open as gz_open from os.path import basename @@ -17,6 +18,7 @@ INFO_MSG = 'INFO: ' WARNING_MSG = 'WARN: ' + # pylint: disable=C0111,C0330 class Command(BaseCommand): help = 'Dumps all DirectAssessmentResult scores and associated metadata' diff --git a/EvalData/management/commands/PatchDirectAssessmentData.py b/EvalData/management/commands/PatchDirectAssessmentData.py index 70573561..6dc7402e 100644 --- a/EvalData/management/commands/PatchDirectAssessmentData.py +++ b/EvalData/management/commands/PatchDirectAssessmentData.py @@ -11,6 +11,7 @@ from EvalData.models import DirectAssessmentResult from EvalData.models import DirectAssessmentTask + # pylint: disable=C0111,C0330,E1101 class Command(BaseCommand): help = 'Validates Direct Assessment JSON data files' diff --git a/EvalData/management/commands/UnlinkDirectAssessmentTasks.py b/EvalData/management/commands/UnlinkDirectAssessmentTasks.py index 09fc707f..bb17c5fb 100644 --- a/EvalData/management/commands/UnlinkDirectAssessmentTasks.py +++ b/EvalData/management/commands/UnlinkDirectAssessmentTasks.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + # pylint: disable=W0611 from datetime import datetime from datetime import timedelta @@ -30,6 +31,7 @@ INFO_MSG = 'INFO: ' WARNING_MSG = 'WARN: ' + # pylint: disable=C0111,C0330 class Command(BaseCommand): help = 'Unlinks DirectAssessmentTask instances as needed' diff --git a/EvalData/management/commands/UpdateEvalDataModels.py b/EvalData/management/commands/UpdateEvalDataModels.py index 5a8604e8..6153596b 100644 --- a/EvalData/management/commands/UpdateEvalDataModels.py +++ b/EvalData/management/commands/UpdateEvalDataModels.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + # pylint: disable=W0611 from datetime import datetime from os import path @@ -26,6 +27,7 @@ INFO_MSG = 'INFO: ' WARNING_MSG = 'WARN: ' + # pylint: disable=C0111,C0330 class Command(BaseCommand): help = 'Updates object instances required for EvalData app' diff --git a/EvalData/management/commands/ValidateDirectAssessmentData.py b/EvalData/management/commands/ValidateDirectAssessmentData.py index a267a1b3..f53a4a4d 100644 --- a/EvalData/management/commands/ValidateDirectAssessmentData.py +++ b/EvalData/management/commands/ValidateDirectAssessmentData.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + # pylint: disable=W0611 from collections import defaultdict from json import load diff --git a/EvalData/models/__init__.py b/EvalData/models/__init__.py index 92943e4c..706486b6 100644 --- a/EvalData/models/__init__.py +++ b/EvalData/models/__init__.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + from .base_models 
import * from .data_assessment import * from .direct_assessment import * diff --git a/EvalData/models/base_models.py b/EvalData/models/base_models.py index fef7b737..7b83ad15 100644 --- a/EvalData/models/base_models.py +++ b/EvalData/models/base_models.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + # pylint: disable=C0103,C0330,no-member from datetime import timezone diff --git a/EvalData/models/data_assessment.py b/EvalData/models/data_assessment.py index 1d239386..3de5292b 100644 --- a/EvalData/models/data_assessment.py +++ b/EvalData/models/data_assessment.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + # pylint: disable=C0103,C0330,no-member import sys from collections import defaultdict diff --git a/EvalData/models/direct_assessment.py b/EvalData/models/direct_assessment.py index a605b6d2..801d54d7 100644 --- a/EvalData/models/direct_assessment.py +++ b/EvalData/models/direct_assessment.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + # pylint: disable=C0103,C0330,no-member import sys from collections import defaultdict diff --git a/EvalData/models/direct_assessment_context.py b/EvalData/models/direct_assessment_context.py index c6766b7d..37666068 100644 --- a/EvalData/models/direct_assessment_context.py +++ b/EvalData/models/direct_assessment_context.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + # pylint: disable=C0103,C0330,no-member import sys from collections import defaultdict @@ -499,7 +500,6 @@ def get_time_for_user(cls, user): timestamps.append((result.start_time, result.end_time)) return seconds_to_timedelta(_compute_user_total_annotation_time(timestamps)) - @classmethod def get_system_annotations(cls): diff --git a/EvalData/models/direct_assessment_document.py b/EvalData/models/direct_assessment_document.py index 3115fe2d..410b736c 100644 --- a/EvalData/models/direct_assessment_document.py +++ b/EvalData/models/direct_assessment_document.py @@ -265,23 +265,21 @@ def next_document_for_user_mqmesa(self, user): # get all items (100) and try to find resul all_items = [ ( - item, + item, DirectAssessmentDocumentResult.objects.filter( item=item, activated=False, completed=True, createdBy=user - ).last() + ).last(), ) for item in self.items.all().order_by('id') ] unfinished_items = [i for i, r in all_items if not r] - + docs_total = len({i.documentID for i, r in all_items}) - items_completed = len([ - i for i, r in all_items if r and r.completed - ]) - docs_completed = docs_total - len({ - i.documentID for i, r in all_items if r is None or not r.completed - }) - + items_completed = len([i for i, r in all_items if r and r.completed]) + docs_completed = docs_total - len( + {i.documentID for i, r in all_items if r is None or not r.completed} + ) + if not unfinished_items: return ( None, @@ -295,7 +293,8 @@ def next_document_for_user_mqmesa(self, user): # things are ordered with batch order next_item = unfinished_items[0] doc_items_all = [ - (i, r) for i, r in all_items + (i, r) + for i, r in all_items # match document name and system if i.documentID == next_item.documentID and i.targetID == next_item.targetID ] @@ -308,12 +307,12 @@ def next_document_for_user_mqmesa(self, user): ) return ( - next_item, # the first unannotated item for the user - items_completed, # the number of completed items in the task - docs_completed, # the number of completed documents in the task - doc_items, # all items from the current document - doc_items_results, # all score results from the current document - docs_total, # the total number of documents in the task + next_item, 
# the first unannotated item for the user + items_completed, # the number of completed items in the task + docs_completed, # the number of completed documents in the task + doc_items, # all items from the current document + doc_items_results, # all score results from the current document + docs_total, # the total number of documents in the task ) def get_results_for_each_item(self, block_items, user): @@ -457,7 +456,7 @@ def import_from_json(cls, campaign, batch_user, batch_data, max_count): new_items.append(new_item) if item['isCompleteDocument']: doc_items += 1 - + LOGGER.info(f'The task has {len(new_items)} items') current_count += 1 @@ -592,18 +591,23 @@ def get_hit_status_for_user(cls, user): @classmethod def get_time_for_user(cls, user): results = cls.objects.filter(createdBy=user, activated=False, completed=True) - is_esa_or_mqm = any([ - "esa" in result.task.campaign.campaignOptions.lower().split(";") or - "mqm" in result.task.campaign.campaignOptions.lower().split(";") - for result in results - ]) + is_esa_or_mqm = any( + [ + "esa" in result.task.campaign.campaignOptions.lower().split(";") + or "mqm" in result.task.campaign.campaignOptions.lower().split(";") + for result in results + ] + ) if is_esa_or_mqm: # for ESA or MQM, do minimum and maximum from each doc import collections + timestamps = collections.defaultdict(list) for result in results: - timestamps[result.item.documentID+" ||| "+result.item.targetID].append((result.start_time, result.end_time)) + timestamps[ + result.item.documentID + " ||| " + result.item.targetID + ].append((result.start_time, result.end_time)) # timestamps are document-level now, but that does not change anything later on timestamps = [ @@ -615,7 +619,6 @@ def get_time_for_user(cls, user): for result in results: timestamps.append((result.start_time, result.end_time)) - return seconds_to_timedelta(_compute_user_total_annotation_time(timestamps)) @classmethod diff --git a/EvalData/models/multi_modal_assessment.py b/EvalData/models/multi_modal_assessment.py index 7bbccc0f..9112c7ed 100644 --- a/EvalData/models/multi_modal_assessment.py +++ b/EvalData/models/multi_modal_assessment.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + # pylint: disable=C0103,C0330,no-member import sys from collections import defaultdict diff --git a/EvalData/models/pairwise_assessment.py b/EvalData/models/pairwise_assessment.py index fb952ff9..ef7e1cf2 100644 --- a/EvalData/models/pairwise_assessment.py +++ b/EvalData/models/pairwise_assessment.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + # pylint: disable=C0103,C0330,no-member import sys from collections import defaultdict @@ -341,7 +342,7 @@ def import_from_json(cls, campaign, batch_user, batch_data, max_count): contextRight=context_right, ) new_items.append(new_item) - + LOGGER.info(f'The task has {len(new_items)} items') current_count += 1 diff --git a/EvalData/models/pairwise_assessment_document.py b/EvalData/models/pairwise_assessment_document.py index 69c71088..b538de4a 100644 --- a/EvalData/models/pairwise_assessment_document.py +++ b/EvalData/models/pairwise_assessment_document.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + # pylint: disable=C0103,C0330,no-member import sys from collections import defaultdict @@ -470,7 +471,7 @@ def import_from_json(cls, campaign, batch_user, batch_data, max_count): new_items.append(new_item) if item['isCompleteDocument']: doc_items += 1 - + LOGGER.info(f'The task has {len(new_items)} items') current_count += 1 diff --git a/EvalData/models/task_agenda.py 
b/EvalData/models/task_agenda.py index a452cff8..ccc976fb 100644 --- a/EvalData/models/task_agenda.py +++ b/EvalData/models/task_agenda.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + # pylint: disable=C0103,C0330,no-member from inspect import currentframe from inspect import getframeinfo diff --git a/EvalData/views.py b/EvalData/views.py index 8065fa8d..29a6b45a 100644 --- a/EvalData/views.py +++ b/EvalData/views.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + from django.contrib import messages from django.contrib.auth.decorators import login_required from django.contrib.auth.decorators import permission_required diff --git a/EvalView/admin.py b/EvalView/admin.py index d4952b9f..c1639906 100644 --- a/EvalView/admin.py +++ b/EvalView/admin.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + # pylint: disable=unused-import from django.contrib import admin diff --git a/EvalView/apps.py b/EvalView/apps.py index 652f2c91..3ae78d03 100644 --- a/EvalView/apps.py +++ b/EvalView/apps.py @@ -3,8 +3,10 @@ See LICENSE for usage details """ + from django.apps import AppConfig + # pylint: disable=missing-docstring class EvalviewConfig(AppConfig): name = 'EvalView' diff --git a/EvalView/models.py b/EvalView/models.py index 53b26aae..aae6718c 100644 --- a/EvalView/models.py +++ b/EvalView/models.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + # pylint: disable=unused-import from django.db import models diff --git a/EvalView/tests.py b/EvalView/tests.py index faac1ca5..2a876eef 100644 --- a/EvalView/tests.py +++ b/EvalView/tests.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + # pylint: disable=unused-import from django.test import TestCase diff --git a/EvalView/views.py b/EvalView/views.py index 48e48a61..6f14b4d4 100644 --- a/EvalView/views.py +++ b/EvalView/views.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + from datetime import datetime from datetime import timezone @@ -1089,17 +1090,13 @@ def direct_assessment_document_mqmesa(campaign, current_task, request): end_timestamp = request.POST.get('end_timestamp', None) ajax = bool(request.POST.get('ajax', None) == 'True') - db_item = current_task.items.filter( itemID=item_id, id=task_id, ) - if len(db_item) == 0: - error_msg = ( - f'We could not find item {item_id} in task {task_id}.' - ) + error_msg = f'We could not find item {item_id} in task {task_id}.' LOGGER.error(error_msg) item_saved = False elif len(db_item) > 1: @@ -1185,10 +1182,10 @@ def direct_assessment_document_mqmesa(campaign, current_task, request): if 'contrastiveesa' in campaign_opts: # escape
<br/> tags in the source and target texts for item in doc_items: - item.sourceText = item.sourceText \ - .replace("<eos>", "<eos>") \ - .replace("<br/>", "<br/>") - # HTML-esaping on the target text will not work because MQM/ESA tag insertion prevents it + item.sourceText = item.sourceText.replace( + "<eos>", "<eos>" + ).replace("<br/>", "<br/>") + # HTML-esaping on the target text will not work because MQM/ESA tag insertion prevents it guidelines = ( 'You are provided with a text in {0} and its candidate translation(s) into {1}. ' 'Please assess the quality of the translation(s) following the detailed guidelines below. '.format( @@ -2281,7 +2278,7 @@ def pairwise_assessment_document(request, code=None, campaign_name=None): new_ui = 'newui' in campaign_opts escape_eos = 'escapeeos' in campaign_opts escape_br = 'escapebr' in campaign_opts - highlight_style ='highlightstyle' in campaign_opts + highlight_style = 'highlightstyle' in campaign_opts # Get item scores from the latest corresponding results block_scores = [] @@ -2310,12 +2307,8 @@ def pairwise_assessment_document(request, code=None, campaign_name=None): if escape_br: _source_text = _source_text.replace("<br/>", "<br/>") - _candidate1_text = _candidate1_text.replace( - "<br/>", "<br/>" - ) - _candidate2_text = _candidate2_text.replace( - "<br/>", "<br/>" - ) + _candidate1_text = _candidate1_text.replace("<br/>", "<br/>") + _candidate2_text = _candidate2_text.replace("<br/>", "
") item_scores = { 'completed': bool(result and result.score1 > -1), diff --git a/Makefile b/Makefile index e0cc7870..17a141c2 100644 --- a/Makefile +++ b/Makefile @@ -34,4 +34,7 @@ test: install: requirements-dev.txt pip install -r $< +reformat: + black -S -l $(BLACK_LINE_MAXLEN) . --force-exclude '/migrations/' + .PHONY: all check check-black check-pylint check-mypy check-safety run test diff --git a/Scripts/create_iwslt22_tasks.py b/Scripts/create_iwslt22_tasks.py index fb26f639..9019893c 100644 --- a/Scripts/create_iwslt22_tasks.py +++ b/Scripts/create_iwslt22_tasks.py @@ -517,7 +517,7 @@ def create_bad_refs( for _tup in sampled_tasks: _all_tasks += list(_tup) _docs_by_sys: Dict[str, Any] = {} - for (_, docid, sysid) in _all_tasks: + for _, docid, sysid in _all_tasks: if sysid not in _docs_by_sys: _docs_by_sys[sysid] = [] _docs_by_sys[sysid].append(docid) diff --git a/Scripts/create_wmt22_pairwise_tasks.py b/Scripts/create_wmt22_pairwise_tasks.py index a4fa03ae..e2346a14 100644 --- a/Scripts/create_wmt22_pairwise_tasks.py +++ b/Scripts/create_wmt22_pairwise_tasks.py @@ -218,14 +218,18 @@ def unwrap_tsv( for line in tsv: fields = line.rstrip("\n").split('\t') if len(fields) < 5: - print(f"Error: too few fields in {tsv_file}, required fields: DocID, src, ref, sysA, sysB") + print( + f"Error: too few fields in {tsv_file}, required fields: DocID, src, ref, sysA, sysB" + ) exit() docid, src, ref, sysA, sysB = fields[:5] if docid not in src_docs: src_docs[docid] = [] - segid = len(src_docs[docid]) + 1 # segment ID is 1-based to keep it consistent with XML format + segid = ( + len(src_docs[docid]) + 1 + ) # segment ID is 1-based to keep it consistent with XML format src_docs[docid].append((segid, src)) if docid not in ref_docs['A']: @@ -684,7 +688,9 @@ def parse_cmd_args(): print(f'Loading docs from {XML_FILE}') if TSV_FILE: - SRC_DOCS, REF_DOCS, SYS_DOCS = unwrap_tsv(XML_FILE, encoding=ENC, system_A=SYSTEM_A, system_B=SYSTEM_B) + SRC_DOCS, REF_DOCS, SYS_DOCS = unwrap_tsv( + XML_FILE, encoding=ENC, system_A=SYSTEM_A, system_B=SYSTEM_B + ) else: src_lang, SRC_DOCS, ref_lang, REF_DOCS, hyp_lang, SYS_DOCS = unwrap_xml( XML_FILE, encoding=ENC @@ -849,7 +855,9 @@ def parse_cmd_args(): elif task_len < MAX_TASK_SIZE: pad_size = MAX_TASK_SIZE - task_len - pad_data: List[Tuple[int, str, bool]] = [(tup[0], tup[1], False) for tup in task] + pad_data: List[Tuple[int, str, bool]] = [ + (tup[0], tup[1], False) for tup in task + ] pad_pos = 0 while pad_size > 0: print(f'pad_size: {pad_size}') @@ -863,7 +871,12 @@ def parse_cmd_args(): print(f'pad_pos: {pad_pos}') last_doc: Tuple[int, str, bool] = pad_data[-1] - print('Making the last doc smaller', last_doc[0], '-->', last_doc[0] + pad_size) + print( + 'Making the last doc smaller', + last_doc[0], + '-->', + last_doc[0] + pad_size, + ) fixed_doc = (last_doc[0] + pad_size, *last_doc[1:]) pad_data[-1] = fixed_doc # print(pad_data[-1][0]) @@ -874,7 +887,9 @@ def parse_cmd_args(): else: print(f'WARNING: no control items in task no. 
{tid}') - pad_data: List[Tuple[int, str, bool]] = [(tup[0], tup[1], False) for tup in task] + pad_data: List[Tuple[int, str, bool]] = [ + (tup[0], tup[1], False) for tup in task + ] padded_tasks.append(tuple(pad_data)) if EVEN_NUM and len(padded_tasks) % 2 == 1: @@ -971,8 +986,8 @@ def parse_cmd_args(): item_src = _src[seg_id] item_ref = _ref[seg_id] - item_bads = { sys_id: _bads[sys_id][seg_id] for sys_id in SYS_IDS } - item_tgts = { sys_id: _tgts[sys_id][seg_id] for sys_id in SYS_IDS } + item_bads = {sys_id: _bads[sys_id][seg_id] for sys_id in SYS_IDS} + item_tgts = {sys_id: _tgts[sys_id][seg_id] for sys_id in SYS_IDS} item_type = 'TGT' # Do not generate any BAD items if QC is disabled @@ -1005,7 +1020,9 @@ def parse_cmd_args(): for tgt_idx, sys_id in enumerate(_shuffled_sys_ids): tgt_ctx = [] if seg_counter == 0: - tgt_ctx = [txt for _, txt in SYS_PREV[sys_id][doc_id]][-CTX_SIZE:] + tgt_ctx = [txt for _, txt in SYS_PREV[sys_id][doc_id]][ + -CTX_SIZE: + ] tobj = OrderedDict() tobj['_itemAll'] = _itemAll diff --git a/Scripts/create_wmt22_tasks.py b/Scripts/create_wmt22_tasks.py index 43e8e0bd..a7e3215d 100644 --- a/Scripts/create_wmt22_tasks.py +++ b/Scripts/create_wmt22_tasks.py @@ -769,7 +769,7 @@ def parse_cmd_args(): for _tup in sampled_tasks: _all_tasks += list(_tup) _docs_by_sys: Dict[str, Any] = {} - for (_, docid, sysid) in _all_tasks: + for _, docid, sysid in _all_tasks: if sysid not in _docs_by_sys: _docs_by_sys[sysid] = [] _docs_by_sys[sysid].append(docid) diff --git a/deprecated.py b/deprecated.py index ec2db15f..255995ae 100644 --- a/deprecated.py +++ b/deprecated.py @@ -10,10 +10,11 @@ Use get_deprecated_methods() to retrieve set of deprecated methods. """ + from typing import Set -_DEPRECATED_METHOD_REGISTRY : Set[str] = set() +_DEPRECATED_METHOD_REGISTRY: Set[str] = set() def add_deprecated_method(func): @@ -28,4 +29,4 @@ def get_deprecated_methods(): """ Get deprecated methods from registry. """ - return _DEPRECATED_METHOD_REGISTRY \ No newline at end of file + return _DEPRECATED_METHOD_REGISTRY From 1c04c0c59553367ae2dc3bc9577ba1c942e2d72a Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Fri, 11 Jul 2025 16:46:39 -0700 Subject: [PATCH 17/51] bump version to #wmt25dev --- Appraise/settings.py | 2 +- Dashboard/templates/Dashboard/frontpage.html | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/Appraise/settings.py b/Appraise/settings.py index 59c9c413..6a307b1e 100644 --- a/Appraise/settings.py +++ b/Appraise/settings.py @@ -211,7 +211,7 @@ # Base context for all views. BASE_CONTEXT = { - 'commit_tag': '#wmt24dev', + 'commit_tag': '#wmt25dev', 'title': 'Appraise evaluation system', 'static_url': STATIC_URL, } diff --git a/Dashboard/templates/Dashboard/frontpage.html b/Dashboard/templates/Dashboard/frontpage.html index 5f4ac5ae..0802f7fa 100644 --- a/Dashboard/templates/Dashboard/frontpage.html +++ b/Dashboard/templates/Dashboard/frontpage.html @@ -11,7 +11,11 @@

An open-source system for manual evaluation of MT output

This is Appraise

-

It supports collaborative collection of human feedback for MT evaluation. It implements tasks such as Translation Quality Checking, Ranking and Error Classification, and Manual Post-Editing. For WMT17, we added support for Direct Assessment. For WMT19, evaluation is focused on source-based Direct Assessment on document level.

+

+ It supports collaborative collection of human feedback for MT evaluation. It implements tasks such as + Direct Assessment (DA), Scalar Quality Metric (SQM), Multidimensional Quality Metric (MQM), Error Span Annotation (ESA), + in various settings, such as source or reference based, contrastive, document-level, multimodal, and others. +

From a10669cdc559de456377b8730c114bfb28bfbc3a Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Fri, 11 Jul 2025 17:01:25 -0700 Subject: [PATCH 18/51] wmt24 -> wmt25 --- Dashboard/templates/Dashboard/dashboard.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dashboard/templates/Dashboard/dashboard.html b/Dashboard/templates/Dashboard/dashboard.html index d1eb5bea..19a1da85 100644 --- a/Dashboard/templates/Dashboard/dashboard.html +++ b/Dashboard/templates/Dashboard/dashboard.html @@ -5,7 +5,7 @@

Dashboard

-

Evaluation campaign for shared tasks hosted at the 9th Conference on Machine Translation (WMT24)

+

Evaluation campaign for shared tasks hosted at the 10th Conference on Machine Translation (WMT25)

From 3298c23adf9318c77b1e3cf6441f4cc7a29d2b45 Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Fri, 11 Jul 2025 17:13:53 -0700 Subject: [PATCH 19/51] dummy change --- Dashboard/templates/Dashboard/frontpage.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dashboard/templates/Dashboard/frontpage.html b/Dashboard/templates/Dashboard/frontpage.html index 0802f7fa..2d24a985 100644 --- a/Dashboard/templates/Dashboard/frontpage.html +++ b/Dashboard/templates/Dashboard/frontpage.html @@ -14,7 +14,7 @@

This is Appraise

It supports collaborative collection of human feedback for MT evaluation. It implements tasks such as Direct Assessment (DA), Scalar Quality Metric (SQM), Multidimensional Quality Metric (MQM), Error Span Annotation (ESA), - in various settings, such as source or reference based, contrastive, document-level, multimodal, and others. + in various settings, such as reference or source based, contrastive, document-level, with video context, and others.

From e9234f225d8df660adf68c642da04a64d4cf2b62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vil=C3=A9m=20Zouhar?= Date: Sun, 20 Jul 2025 13:48:10 -0700 Subject: [PATCH 20/51] don't escape image context, ref #185' --- EvalView/views.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/EvalView/views.py b/EvalView/views.py index 6f14b4d4..40b4e7b5 100644 --- a/EvalView/views.py +++ b/EvalView/views.py @@ -1154,8 +1154,8 @@ def direct_assessment_document_mqmesa(campaign, current_task, request): # TODO: hotfix for WMT24 # Tracking issue: https://github.com/AppraiseDev/Appraise/issues/185 for item in doc_items: - # don't escape HTML video - if item.sourceText.strip().startswith(" Date: Sun, 20 Jul 2025 16:28:58 -0700 Subject: [PATCH 21/51] add img { + display: block; + margin-left: auto; + margin-right: auto; + width: 45%; +} + .tutorial-text { text-align: center; color: #257; diff --git a/EvalView/static/EvalView/js/direct-assessment-document-mqm-esa.js b/EvalView/static/EvalView/js/direct-assessment-document-mqm-esa.js index 3a7e9e77..8f2c3844 100644 --- a/EvalView/static/EvalView/js/direct-assessment-document-mqm-esa.js +++ b/EvalView/static/EvalView/js/direct-assessment-document-mqm-esa.js @@ -75,6 +75,8 @@ const ERROR_TYPES = { }, "Other": {}, } + + Object.keys(SEVERITY_TO_COLOR).map((key) => { $(`#instruction_sev_${key}`).css("background-color", SEVERITY_TO_COLOR[key]) }) @@ -311,8 +313,14 @@ class MQMItemHandler { } this.mqm_submitted = structuredClone(this.mqm) this.mqm_orig = JSON.parse(JSON.parse(this.el.children('#mqm-payload-orig').html())) - this.text_source_orig = decodeEntities(JSON.parse(this.el.children('#text-source-payload').html()).trim()) - this.source_video = JSON.parse(this.el.children('#text-source-payload').html()).trim().startsWith(" { + if (v == "\n") { + return "
" // preserve newlines + } return `${v}` }).join("") + " [MISSING]" this.el_target.html(html_target) @@ -357,8 +367,11 @@ class MQMItemHandler { } // handle character alignment estimation - if (!this.source_video) { + if (!this.source_is_multimodal) { let html_source = this.text_source_orig.split("").map((v, i) => { + if (v == "\n") { + return "
" // preserve newlines + } return `${v}` }).join("") this.el_source.html(html_source) diff --git a/EvalView/templates/EvalView/direct-assessment-document-mqm-esa.html b/EvalView/templates/EvalView/direct-assessment-document-mqm-esa.html index 3de8e146..c9df8bba 100644 --- a/EvalView/templates/EvalView/direct-assessment-document-mqm-esa.html +++ b/EvalView/templates/EvalView/direct-assessment-document-mqm-esa.html @@ -22,7 +22,7 @@ Completed {{docs_completed}}/{{docs_total}} documents, - {{items_completed}}/100 segments + {{items_completed}}/{{items_total}} segments diff --git a/EvalView/views.py b/EvalView/views.py index 40b4e7b5..fbe85cff 100644 --- a/EvalView/views.py +++ b/EvalView/views.py @@ -1133,10 +1133,11 @@ def direct_assessment_document_mqmesa(campaign, current_task, request): ( next_item, items_completed, + items_total, docs_completed, + docs_total, doc_items, doc_items_results, - docs_total, ) = current_task.next_document_for_user_mqmesa(request.user) if not next_item: @@ -1151,11 +1152,15 @@ def direct_assessment_document_mqmesa(campaign, current_task, request): # Send response to the Ajax POST request return JsonResponse(context) - # TODO: hotfix for WMT24 + # TODO: hotfix for WMT24 and WMT25 # Tracking issue: https://github.com/AppraiseDev/Appraise/issues/185 for item in doc_items: - # don't escape HTML video or images - if item.sourceText.strip().startswith(" Date: Sun, 20 Jul 2025 19:01:46 -0700 Subject: [PATCH 22/51] add Maasai language --- Dashboard/models.py | 1 + 1 file changed, 1 insertion(+) diff --git a/Dashboard/models.py b/Dashboard/models.py index 248dc916..b9eb8252 100644 --- a/Dashboard/models.py +++ b/Dashboard/models.py @@ -225,6 +225,7 @@ 'kas': 'Kashmiri (كٲشُر)', 'mni': 'Meitei (ꯃꯩꯇꯩꯂꯣꯟ)', 'sat': 'Santali (ᱥᱟᱱᱛᱟᱲᱤ)', + 'mas': 'Maasai (Ol Maa)', } # All sign language codes From 50c6e5f05365559a66ed6e9b8af7ec140bf2c2ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vil=C3=A9m=20Zouhar?= Date: Mon, 21 Jul 2025 20:13:27 -0700 Subject: [PATCH 23/51] minor styling & instructions --- .../static/EvalView/css/direct-assessment-document-mqm-esa.css | 2 +- EvalView/templates/EvalView/_instructions-esa.html | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/EvalView/static/EvalView/css/direct-assessment-document-mqm-esa.css b/EvalView/static/EvalView/css/direct-assessment-document-mqm-esa.css index f7acdc0b..91742033 100644 --- a/EvalView/static/EvalView/css/direct-assessment-document-mqm-esa.css +++ b/EvalView/static/EvalView/css/direct-assessment-document-mqm-esa.css @@ -77,7 +77,7 @@ display: block; margin-left: auto; margin-right: auto; - width: 45%; + width: 500px; } .tutorial-text { diff --git a/EvalView/templates/EvalView/_instructions-esa.html b/EvalView/templates/EvalView/_instructions-esa.html index 05e1fa0b..ea8b1526 100644 --- a/EvalView/templates/EvalView/_instructions-esa.html +++ b/EvalView/templates/EvalView/_instructions-esa.html @@ -14,7 +14,8 @@
  • Missing content: If something is missing, highlight the word [MISSING] to mark the error.
  • -
  • Tip: Highlight the word or general area of the error---it doesn’t need to be exact. Use separate highlights for different errors.
  • +
  • Tip: Highlight the word or general area of the error (it doesn't need to be exact). Use multiple highlights for different errors.
  • +
  • Tip: Pay particular attention to translation consistency across the whole document.
  • Score the translation: After marking errors, please use the slider and set an overall score based on meaning preservation and general quality:
    • 0: No meaning preserved: most information is lost.
    • From c4fc6822c782a832ac03dfce6a46ace33f5caa0c Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Tue, 22 Jul 2025 18:35:13 -0700 Subject: [PATCH 24/51] increase the max length for campaign names --- ...market_domainname_alter_market_marketid.py | 27 +++++++++++++++++++ EvalData/models/base_models.py | 2 +- 2 files changed, 28 insertions(+), 1 deletion(-) create mode 100644 EvalData/migrations/0055_alter_market_domainname_alter_market_marketid.py diff --git a/EvalData/migrations/0055_alter_market_domainname_alter_market_marketid.py b/EvalData/migrations/0055_alter_market_domainname_alter_market_marketid.py new file mode 100644 index 00000000..f534d783 --- /dev/null +++ b/EvalData/migrations/0055_alter_market_domainname_alter_market_marketid.py @@ -0,0 +1,27 @@ +# Generated by Django 4.2.22 on 2025-07-23 01:33 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("EvalData", "0054_alter_dataassessmentresult_activatedby_and_more"), + ] + + operations = [ + migrations.AlterField( + model_name="market", + name="domainName", + field=models.CharField( + help_text="(max. 50 characters)", + max_length=50, + verbose_name="Domain name", + ), + ), + migrations.AlterField( + model_name="market", + name="marketID", + field=models.CharField(editable=False, max_length=72, unique=True), + ), + ] diff --git a/EvalData/models/base_models.py b/EvalData/models/base_models.py index 7b83ad15..56a3bace 100644 --- a/EvalData/models/base_models.py +++ b/EvalData/models/base_models.py @@ -26,7 +26,7 @@ # TODO: Unclear if these are needed? # from Appraise.settings import STATIC_URL, BASE_CONTEXT -MAX_DOMAINNAME_LENGTH = 20 +MAX_DOMAINNAME_LENGTH = 50 MAX_LANGUAGECODE_LENGTH = 10 MAX_CORPUSNAME_LENGTH = 100 MAX_VERSIONINFO_LENGTH = 20 From 92492a206b2a7f5844d9d519a0249a659c417043 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vil=C3=A9m=20Zouhar?= Date: Thu, 24 Jul 2025 10:22:03 -0700 Subject: [PATCH 25/51] update next document button --- .../templates/EvalView/direct-assessment-document-mqm-esa.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/EvalView/templates/EvalView/direct-assessment-document-mqm-esa.html b/EvalView/templates/EvalView/direct-assessment-document-mqm-esa.html index c9df8bba..8782642b 100644 --- a/EvalView/templates/EvalView/direct-assessment-document-mqm-esa.html +++ b/EvalView/templates/EvalView/direct-assessment-document-mqm-esa.html @@ -143,7 +143,7 @@ id="button-next-doc-fake" title="Please first complete all items in the document (error spans + scores)." > - Continue to next document (unavailable) + Continue to next document (finish all segments first) {% endblock %} \ No newline at end of file From 54ea604d79be2b4a6e4ca77e4fada89379fd713c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vil=C3=A9m=20Zouhar?= Date: Thu, 24 Jul 2025 10:29:56 -0700 Subject: [PATCH 26/51] update ESA slider anchor instructions --- EvalView/templates/EvalView/_instructions-esa.html | 10 +++++----- EvalView/templates/EvalView/_slider-mqm-esa.html | 8 ++++---- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/EvalView/templates/EvalView/_instructions-esa.html b/EvalView/templates/EvalView/_instructions-esa.html index 05e1fa0b..106a1526 100644 --- a/EvalView/templates/EvalView/_instructions-esa.html +++ b/EvalView/templates/EvalView/_instructions-esa.html @@ -14,13 +14,13 @@
  • Missing content: If something is missing, highlight the word [MISSING] to mark the error.
  • -
  • Tip: Highlight the word or general area of the error---it doesn’t need to be exact. Use separate highlights for different errors.
  • +
  • Tip: Highlight the word or general area of the error (it doesn't need to be exact). Use separate highlights for different errors.
  • Score the translation: After marking errors, please use the slider and set an overall score based on meaning preservation and general quality:
    • -
    • 0: No meaning preserved: most information is lost.
    • -
    • 33%: Some meaning preserved: major gaps and narrative issues.
    • -
    • 66%: Most meaning preserved: minor issues with grammar or consistency.
    • -
    • 100%: Perfect: meaning and grammar align completely with the source.
    • +
    • 0: Broken/poor translation.
    • +
    • 33%: Flawed: significant issues
    • +
    • 66%: Good: insignificant issues with grammar, fluency, or consistency
    • +
    • 100%: Perfect: meaning and style aligned completely with the source
    diff --git a/EvalView/templates/EvalView/_slider-mqm-esa.html b/EvalView/templates/EvalView/_slider-mqm-esa.html index e2eecea8..44ecf29b 100644 --- a/EvalView/templates/EvalView/_slider-mqm-esa.html +++ b/EvalView/templates/EvalView/_slider-mqm-esa.html @@ -1,9 +1,9 @@
    - +
    - - - + + +
    0%: No meaning preserved33%: Some meaning preserved66%: Most meaning preserved0%: Broken/poor33%: Flawed (significant issues)66%: Good (insignificant issues) 100%: Perfect
    From 9021b50a085dce78341d32008e6a4613f3358f4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vil=C3=A9m=20Zouhar?= Date: Thu, 24 Jul 2025 10:40:25 -0700 Subject: [PATCH 27/51] fix vertical videos --- .../static/EvalView/css/direct-assessment-document-mqm-esa.css | 1 + 1 file changed, 1 insertion(+) diff --git a/EvalView/static/EvalView/css/direct-assessment-document-mqm-esa.css b/EvalView/static/EvalView/css/direct-assessment-document-mqm-esa.css index f7acdc0b..321c14a7 100644 --- a/EvalView/static/EvalView/css/direct-assessment-document-mqm-esa.css +++ b/EvalView/static/EvalView/css/direct-assessment-document-mqm-esa.css @@ -71,6 +71,7 @@ .source-text > audio, .source-text > video { width: 100%; + max-height: 550px; } .source-text > img { From 31e1a9a16da5b83772712e3ebe91aec38bf78c85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vil=C3=A9m=20Zouhar?= Date: Thu, 24 Jul 2025 10:44:34 -0700 Subject: [PATCH 28/51] show ESA instructions by default --- .../EvalView/js/direct-assessment-document-mqm-esa.js | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/EvalView/static/EvalView/js/direct-assessment-document-mqm-esa.js b/EvalView/static/EvalView/js/direct-assessment-document-mqm-esa.js index 8f2c3844..8cc42c7d 100644 --- a/EvalView/static/EvalView/js/direct-assessment-document-mqm-esa.js +++ b/EvalView/static/EvalView/js/direct-assessment-document-mqm-esa.js @@ -174,8 +174,14 @@ $(document).ready(() => { // show submit button only on MQM and not ESA $(".button-submit").toggle(MQM_TYPE == "MQM") - let instructions_show = localStorage.getItem("appraise-instructions-show") == "true" + let instructions_show = localStorage.getItem("appraise-instructions-show") if (instructions_show == null) instructions_show = true; + else instructions_show = instructions_show == "true"; + console.log( + localStorage.getItem("appraise-instructions-show"), + localStorage.getItem("appraise-instructions-show") == null, + instructions_show, + ) $("#instructions-show").on("click", () => { instructions_show = !instructions_show; From 8268e3078c1826a273ad0ef375d6790a070a51ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vil=C3=A9m=20Zouhar?= Date: Thu, 24 Jul 2025 13:49:19 -0700 Subject: [PATCH 29/51] update ESA slider anchors --- EvalView/templates/EvalView/_slider-mqm-esa.html | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/EvalView/templates/EvalView/_slider-mqm-esa.html b/EvalView/templates/EvalView/_slider-mqm-esa.html index 44ecf29b..363e8a57 100644 --- a/EvalView/templates/EvalView/_slider-mqm-esa.html +++ b/EvalView/templates/EvalView/_slider-mqm-esa.html @@ -2,8 +2,8 @@ - - + +
    0%: Broken/poor33%: Flawed (significant issues)66%: Good (insignificant issues)33%: Flawed66%: Good 100%: Perfect
    From 3a90994ed6018ea0d478c1659709ab17bb781464 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vil=C3=A9m=20Zouhar?= Date: Thu, 24 Jul 2025 16:54:27 -0700 Subject: [PATCH 30/51] add note on LLM usage, resolve #201 --- EvalView/templates/EvalView/_instructions-esa.html | 1 + 1 file changed, 1 insertion(+) diff --git a/EvalView/templates/EvalView/_instructions-esa.html b/EvalView/templates/EvalView/_instructions-esa.html index 2cd34472..38abbe41 100644 --- a/EvalView/templates/EvalView/_instructions-esa.html +++ b/EvalView/templates/EvalView/_instructions-esa.html @@ -24,6 +24,7 @@
  • 100%: Perfect: meaning and style aligned completely with the source
  • +
  • Using external tools for annotations (chatbots, LLMs) is not allowed.
  • \ No newline at end of file From e79e4e9448281ad0466a1e49b343c544354d3562 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vil=C3=A9m=20Zouhar?= Date: Fri, 25 Jul 2025 12:16:59 -0700 Subject: [PATCH 31/51] ESA styling, add language tags on the side, resolve #50 --- .../direct-assessment-document-mqm-esa.css | 37 ++++++++++++++++++- .../js/direct-assessment-document-mqm-esa.js | 5 --- .../direct-assessment-document-mqm-esa.html | 20 ++++++---- 3 files changed, 48 insertions(+), 14 deletions(-) diff --git a/EvalView/static/EvalView/css/direct-assessment-document-mqm-esa.css b/EvalView/static/EvalView/css/direct-assessment-document-mqm-esa.css index bbaf26fb..4339ea75 100644 --- a/EvalView/static/EvalView/css/direct-assessment-document-mqm-esa.css +++ b/EvalView/static/EvalView/css/direct-assessment-document-mqm-esa.css @@ -34,8 +34,23 @@ color: black; } -.quotelike { - border-left: 5px solid #ddd; + +.language_tag_holder { + position: relative; +} + + +.language_tag { + /* transform: rotate(-90deg); */ + transform-origin: top left; + width: 200px; + display: inline-block; + position: absolute; + text-align: right; + left: -210px; + top: 15px; + color: #257; + font-size: small; } .quotelike { @@ -52,6 +67,7 @@ .item-box { margin-bottom: 20px; + border-radius: 4px; } .target-text { @@ -127,6 +143,12 @@ color: black; } +#instructions { + background-color: #d9edf7; + padding: 10px; + border-radius: 4px; +} + .alert_message { position: fixed; top: 25px; @@ -191,4 +213,15 @@ .ui-widget-content { border: none !important; +} + + +/* override defaults */ +.alert-info { + border: none; + color: #257; +} +.navbar-fixed-top { + position: absolute; + top: -2px; } \ No newline at end of file diff --git a/EvalView/static/EvalView/js/direct-assessment-document-mqm-esa.js b/EvalView/static/EvalView/js/direct-assessment-document-mqm-esa.js index 8cc42c7d..4256ca56 100644 --- a/EvalView/static/EvalView/js/direct-assessment-document-mqm-esa.js +++ b/EvalView/static/EvalView/js/direct-assessment-document-mqm-esa.js @@ -177,11 +177,6 @@ $(document).ready(() => { let instructions_show = localStorage.getItem("appraise-instructions-show") if (instructions_show == null) instructions_show = true; else instructions_show = instructions_show == "true"; - console.log( - localStorage.getItem("appraise-instructions-show"), - localStorage.getItem("appraise-instructions-show") == null, - instructions_show, - ) $("#instructions-show").on("click", () => { instructions_show = !instructions_show; diff --git a/EvalView/templates/EvalView/direct-assessment-document-mqm-esa.html b/EvalView/templates/EvalView/direct-assessment-document-mqm-esa.html index 8782642b..b37ee003 100644 --- a/EvalView/templates/EvalView/direct-assessment-document-mqm-esa.html +++ b/EvalView/templates/EvalView/direct-assessment-document-mqm-esa.html @@ -35,14 +35,13 @@ -
    + {% if guidelines %}

    {{ guidelines }}

    {% endif %} @@ -81,11 +80,18 @@
    +
    +
    {{source_language}}
    +
    +
    {{ item.sourceText|safe }}
    - +
    +
    +
    {{target_language}}
    +
    {{item.targetText}}
    From 35bcf1eb8af199df051c448620caf5abea5fbb65 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vil=C3=A9m=20Zouhar?= Date: Fri, 25 Jul 2025 12:45:20 -0700 Subject: [PATCH 32/51] make ESA interface neater by moving icons to the side --- .../direct-assessment-document-mqm-esa.css | 44 ++++++++++++++----- .../js/direct-assessment-document-mqm-esa.js | 15 +++---- .../direct-assessment-document-mqm-esa.html | 23 +++------- 3 files changed, 47 insertions(+), 35 deletions(-) diff --git a/EvalView/static/EvalView/css/direct-assessment-document-mqm-esa.css b/EvalView/static/EvalView/css/direct-assessment-document-mqm-esa.css index 4339ea75..384f7075 100644 --- a/EvalView/static/EvalView/css/direct-assessment-document-mqm-esa.css +++ b/EvalView/static/EvalView/css/direct-assessment-document-mqm-esa.css @@ -35,21 +35,45 @@ } -.language_tag_holder { - position: relative; +.status-indicator { + display: inline-block; + position: absolute; + left: -35px; + top: 10px; + font-size: small; + width: 20px; + margin: 0px; + padding: 0px; +} + +.button-reset { + display: inline-block; + position: absolute; + left: -40px; + top: 40px; + font-size: small; + background-color: transparent !important; + border: none; + width: 20px; + margin: 0px; + padding: 0px; } +.button-submit { + display: block; + margin-left: auto; + margin-right: auto; +} + +.target-box { + position: relative; + margin-bottom: -10px; +} .language_tag { /* transform: rotate(-90deg); */ - transform-origin: top left; - width: 200px; - display: inline-block; - position: absolute; - text-align: right; - left: -210px; - top: 15px; - color: #257; + float: right; + color: #777; font-size: small; } diff --git a/EvalView/static/EvalView/js/direct-assessment-document-mqm-esa.js b/EvalView/static/EvalView/js/direct-assessment-document-mqm-esa.js index 4256ca56..a01cd352 100644 --- a/EvalView/static/EvalView/js/direct-assessment-document-mqm-esa.js +++ b/EvalView/static/EvalView/js/direct-assessment-document-mqm-esa.js @@ -195,9 +195,8 @@ function _all_sentences_scored() { return items_left == 0; } -function _change_item_status_icon(item_box, icon_name, status_text) { +function _change_item_status_icon(item_box, icon_name) { let icon_box = item_box.find('.status-indicator').removeClass('glyphicon-refresh glyphicon-ok glyphicon-flag'); - item_box.find(".status-text").text(status_text) icon_box.addClass(`glyphicon-${icon_name}`) } @@ -211,21 +210,21 @@ function submit_form_ajax(item_box) { dataType: 'json', beforeSend: function () { console.log('Sending AJAX request, item-id=', item_box.data('item-id')); - _change_item_status_icon(item_box, 'refresh', "Uploading"); + _change_item_status_icon(item_box, 'refresh'); }, success: function (data) { console.log(`Success, saved=${data.saved} next_item=${data.item_id}`); if (data.saved) { - _change_item_status_icon(item_box, 'ok', "Completed"); + _change_item_status_icon(item_box, 'ok'); } else { - _change_item_status_icon(item_box, 'none', "Upload failed"); + _change_item_status_icon(item_box, 'warning-sign'); _show_error_box(data.error_msg, 10_000); } }, error: function (x, s, t) { console.log('Error:', x, s, t); - _change_item_status_icon(item_box, 'none', "Upload failed"); + _change_item_status_icon(item_box, 'warning-sign'); _show_error_box( 'An unrecognized error has occured. ' + 'Please reload the page or try again in a moment. 
', @@ -514,10 +513,10 @@ class MQMItemHandler { check_status() { if (this.el.attr("data-item-completed") == "True") { - _change_item_status_icon(this.el, "ok", "Completed") + _change_item_status_icon(this.el, "ok") this.el.find(".button-submit").hide() } else { - _change_item_status_icon(this.el, "flag", "Unfinished") + _change_item_status_icon(this.el, "flag") } } diff --git a/EvalView/templates/EvalView/direct-assessment-document-mqm-esa.html b/EvalView/templates/EvalView/direct-assessment-document-mqm-esa.html index b37ee003..ef500e95 100644 --- a/EvalView/templates/EvalView/direct-assessment-document-mqm-esa.html +++ b/EvalView/templates/EvalView/direct-assessment-document-mqm-esa.html @@ -101,23 +101,12 @@
    {% include 'EvalView/_slider-mqm-esa.html' %}
    - - - - - - - -
    - - - - Item status - - -
    + + + +
    From 3c75e973f2189b0f0b77ece28afadcbbaf754c0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vil=C3=A9m=20Zouhar?= Date: Fri, 25 Jul 2025 19:03:22 -0700 Subject: [PATCH 33/51] create campaign status page for ESA --- Campaign/views.py | 125 ++++++++++++++++++++++++++++++++++------------ 1 file changed, 92 insertions(+), 33 deletions(-) diff --git a/Campaign/views.py b/Campaign/views.py index ce7cd863..6126ef8b 100644 --- a/Campaign/views.py +++ b/Campaign/views.py @@ -23,6 +23,7 @@ from EvalData.models import PairwiseAssessmentResult from EvalData.models import seconds_to_timedelta from EvalData.models import TASK_DEFINITIONS +from EvalData.models.direct_assessment_document import DirectAssessmentDocumentTask # pylint: disable=import-error @@ -51,19 +52,27 @@ def campaign_status(request, campaign_name, sort_key=2): _msg = 'Failure to identify campaign {0}'.format(campaign_name) return HttpResponse(_msg, content_type='text/plain') + try: + campaign_opts = campaign.campaignOptions.lower().split(";") + # may raise KeyError + result_type = RESULT_TYPE_BY_CLASS_NAME[campaign.get_campaign_type()] + except KeyError as exc: + LOGGER.debug( + f'Invalid campaign type {campaign.get_campaign_type()} for campaign {campaign.campaignName}' + ) + LOGGER.error(exc) + return HttpResponse( + 'Invalid campaign type for campaign {0}'.format(campaign.campaignName), + content_type='text/plain', + ) + + # special handling for ESA + if "esa" in campaign_opts: + return campaign_status_esa(campaign) + _out = [] for team in campaign.teams.all(): for user in team.members.all(): - try: - campaign_opts = campaign.campaignOptions.lower().split(";") - # may raise KeyError - result_type = RESULT_TYPE_BY_CLASS_NAME[campaign.get_campaign_type()] - except KeyError as exc: - LOGGER.debug( - f'Invalid campaign type {campaign.get_campaign_type()} for campaign {campaign.campaignName}' - ) - LOGGER.error(exc) - continue _data = result_type.objects.filter( createdBy=user, completed=True, task__campaign=campaign.id @@ -118,29 +127,6 @@ def campaign_status(request, campaign_name, sort_key=2): (x[0], x[1], -len(json.loads(x[2])), x[3], x[4], x[5], x[6]) for x in _data ] - elif "esa" in campaign_opts: - is_mqm_or_esa = True - _data = _data.values_list( - 'start_time', - 'end_time', - 'score', - 'item__itemID', - 'item__targetID', - 'item__itemType', - 'item__id', - 'item__documentID', - ) - # compute time override based on document times - import collections - - _time_pairs = collections.defaultdict(list) - for x in _data: - _time_pairs[x[7] + " ||| " + x[4]].append((x[0], x[1])) - _time_pairs = [ - (min([x[0] for x in doc_v]), max([x[1] for x in doc_v])) - for doc, doc_v in _time_pairs.items() - ] - _data = [(x[0], x[1], x[2], x[3], x[4], x[5], x[6]) for x in _data] else: _data = _data.values_list( 'start_time', @@ -245,6 +231,79 @@ def campaign_status(request, campaign_name, sort_key=2): return HttpResponse(u'\n'.join(_txt), content_type='text/plain') +def campaign_status_esa(campaign) -> str: + import collections + out_str = """ + + + + """ + out_str += "\n" + out_str += "\n" + + for team in campaign.teams.all(): + for user in team.members.all(): + if user.is_staff: + continue + + out_str += "" + _data = DirectAssessmentDocumentResult.objects.filter( + createdBy=user, completed=True, task__campaign=campaign.id + ) + + total_count = None + if _data: + _data_all = DirectAssessmentDocumentTask.objects.filter(campaign=campaign.id) + # brute-force try to find if any task has at least one item annotated by this user + for task in _data_all: 
+ for item in task.items.all(): + item = DirectAssessmentDocumentResult.objects.filter( + item=item, createdBy=user + ).last() + if item: + total_count = task.items.count() + break + if total_count: + break + if total_count is None: + out_str += f"" + out_str += f"" + out_str += "" + out_str += "" + else: + if total_count == len(_data): + out_str += f"" + else: + out_str += f"" + out_str += f"" + first_modified = min([x.start_time for x in _data]) + last_modified = max([x.end_time for x in _data]) + out_str += f"" + out_str += f"" + + times = collections.defaultdict() + for item in _data: + times[(item.item.documentID, item.item.targetID)] = (item.start_time, item.end_time) + annotation_time = sum([b-a for a, b in times.values()]) + annotation_time = f'{int(floor(annotation_time / 3600)):0>2d}h{int(floor((annotation_time % 3600) / 60)):0>2d}m' + + annotation_time_upper = last_modified - first_modified + annotation_time_upper = f'{int(floor(annotation_time_upper / 3600)):0>2d}h{int(floor((annotation_time_upper % 3600) / 60)):0>2d}m' + + out_str += f"" + out_str += "\n" + + out_str += "
    UsernameProgressFirst ModifiedLast ModifiedAnnotation Time
    {user.username} 💤0%{user.username} ✅{user.username} 🛠️{len(_data)}/{total_count} ({len(_data) / total_count:.0%}){str(datetime(1970, 1, 1) + seconds_to_timedelta(first_modified)).split('.')[0]}{str(datetime(1970, 1, 1) + seconds_to_timedelta(last_modified)).split('.')[0]}{annotation_time} - {annotation_time_upper}
    " + return HttpResponse(out_str, content_type='text/html') + + def stat_reliable_testing(_data, campaign_opts, result_type): _annotations = len(set([x[6] for x in _data])) _user_mean = sum([x[2] for x in _data]) / (_annotations or 1) From 55715c817da422d5bcd7e3716e21d8e9b9f16298 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vil=C3=A9m=20Zouhar?= Date: Fri, 25 Jul 2025 19:25:29 -0700 Subject: [PATCH 34/51] update campaign-status styling --- Campaign/views.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/Campaign/views.py b/Campaign/views.py index 6126ef8b..616dce54 100644 --- a/Campaign/views.py +++ b/Campaign/views.py @@ -243,8 +243,12 @@ def campaign_status_esa(campaign) -> str: td, th { padding: 5px; } - + * { + font-family: monospace; + } + \n """ + out_str += f"

    {campaign.campaignName}

    \n" out_str += "\n" out_str += "\n" @@ -252,12 +256,10 @@ def campaign_status_esa(campaign) -> str: for user in team.members.all(): if user.is_staff: continue - out_str += "" _data = DirectAssessmentDocumentResult.objects.filter( createdBy=user, completed=True, task__campaign=campaign.id ) - total_count = None if _data: _data_all = DirectAssessmentDocumentTask.objects.filter(campaign=campaign.id) @@ -274,7 +276,7 @@ def campaign_status_esa(campaign) -> str: break if total_count is None: out_str += f"" - out_str += f"" + out_str += "" out_str += "" out_str += "" else: @@ -285,8 +287,15 @@ def campaign_status_esa(campaign) -> str: out_str += f"" first_modified = min([x.start_time for x in _data]) last_modified = max([x.end_time for x in _data]) - out_str += f"" - out_str += f"" + + first_modified_str = str(datetime(1970, 1, 1) + seconds_to_timedelta(first_modified)).split('.')[0] + last_modified_str = str(datetime(1970, 1, 1) + seconds_to_timedelta(last_modified)).split('.')[0] + # remove seconds + first_modified_str = ":".join(first_modified_str.split(":")[:-1]) + last_modified_str = ":".join(last_modified_str.split(":")[:-1]) + + out_str += f"" + out_str += f"" times = collections.defaultdict() for item in _data: From ed89e7e4d3c1aa351f8ab476bc12cd1936a752e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vil=C3=A9m=20Zouhar?= Date: Fri, 25 Jul 2025 19:43:13 -0700 Subject: [PATCH 35/51] speed-up campaign-status and next document fetch --- Campaign/views.py | 18 +++--------------- EvalData/models/direct_assessment_document.py | 11 +++++++---- 2 files changed, 10 insertions(+), 19 deletions(-) diff --git a/Campaign/views.py b/Campaign/views.py index 616dce54..a37b8db0 100644 --- a/Campaign/views.py +++ b/Campaign/views.py @@ -260,26 +260,14 @@ def campaign_status_esa(campaign) -> str: _data = DirectAssessmentDocumentResult.objects.filter( createdBy=user, completed=True, task__campaign=campaign.id ) - total_count = None - if _data: - _data_all = DirectAssessmentDocumentTask.objects.filter(campaign=campaign.id) - # brute-force try to find if any task has at least one item annotated by this user - for task in _data_all: - for item in task.items.all(): - item = DirectAssessmentDocumentResult.objects.filter( - item=item, createdBy=user - ).last() - if item: - total_count = task.items.count() - break - if total_count: - break - if total_count is None: + if not _data: out_str += f"" out_str += "" out_str += "" out_str += "" else: + task = DirectAssessmentDocumentTask.objects.filter(id=_data[0].task_id).first() + total_count = task.items.count() if total_count == len(_data): out_str += f"" else: diff --git a/EvalData/models/direct_assessment_document.py b/EvalData/models/direct_assessment_document.py index 4499bf6a..2a3a56dd 100644 --- a/EvalData/models/direct_assessment_document.py +++ b/EvalData/models/direct_assessment_document.py @@ -263,13 +263,16 @@ def next_document_for_user_mqmesa(self, user): doc_items_results, """ - # get all items (100) and try to find resul + # get all items and try to find a matching result + # TODO: probably can be optimized better + + items_user = DirectAssessmentDocumentResult.objects.filter( + activated=False, completed=True, createdBy=user + ) all_items = [ ( item, - DirectAssessmentDocumentResult.objects.filter( - item=item, activated=False, completed=True, createdBy=user - ).last(), + items_user.filter(item=item).last(), ) for item in self.items.all().order_by('id') ] From 7e8b9fa2321d12a3a2bf8b721e0be2af4d77257b Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Vil=C3=A9m=20Zouhar?= Date: Fri, 25 Jul 2025 20:03:56 -0700 Subject: [PATCH 36/51] edit style of times --- Campaign/views.py | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/Campaign/views.py b/Campaign/views.py index a37b8db0..e112e0e9 100644 --- a/Campaign/views.py +++ b/Campaign/views.py @@ -250,7 +250,9 @@ def campaign_status_esa(campaign) -> str: """ out_str += f"

    {campaign.campaignName}

    \n" out_str += "
    UsernameProgressFirst ModifiedLast ModifiedAnnotation Time
    {user.username} 💤0%{len(_data)}/{total_count} ({len(_data) / total_count:.0%}){str(datetime(1970, 1, 1) + seconds_to_timedelta(first_modified)).split('.')[0]}{str(datetime(1970, 1, 1) + seconds_to_timedelta(last_modified)).split('.')[0]}{first_modified_str}{last_modified_str}{user.username} 💤{user.username} ✅
    \n" - out_str += "\n" + out_str += "" + "".join( + f"" for x in ["Username", "Progress", "First Modified", "Last Modified", "Time (Last-First)", "Time (Real)"] + ) + "\n" for team in campaign.teams.all(): for user in team.members.all(): @@ -265,6 +267,8 @@ def campaign_status_esa(campaign) -> str: out_str += "" out_str += "" out_str += "" + out_str += "" + out_str += "" else: task = DirectAssessmentDocumentTask.objects.filter(id=_data[0].task_id).first() total_count = task.items.count() @@ -284,17 +288,23 @@ def campaign_status_esa(campaign) -> str: out_str += f"" out_str += f"" + annotation_time_upper = last_modified - first_modified + annotation_time_upper = f'{int(floor(annotation_time_upper / 3600)):0>2d}h {int(floor((annotation_time_upper % 3600) / 60)):0>2d}m' + out_str += f"" - times = collections.defaultdict() + times = collections.defaultdict(list) for item in _data: - times[(item.item.documentID, item.item.targetID)] = (item.start_time, item.end_time) - annotation_time = sum([b-a for a, b in times.values()]) - annotation_time = f'{int(floor(annotation_time / 3600)):0>2d}h{int(floor((annotation_time % 3600) / 60)):0>2d}m' + times[(item.item.documentID, item.item.targetID)].append((item.start_time, item.end_time)) + times = [ + (min([x[0] for x in doc_v]), max([x[1] for x in doc_v])) + for doc, doc_v in times.items() + ] - annotation_time_upper = last_modified - first_modified - annotation_time_upper = f'{int(floor(annotation_time_upper / 3600)):0>2d}h{int(floor((annotation_time_upper % 3600) / 60)):0>2d}m' + annotation_time = sum([b-a for a, b in times]) + annotation_time = f'{int(floor(annotation_time / 3600)):0>2d}h {int(floor((annotation_time % 3600) / 60)):0>2d}m' + + out_str += f"" - out_str += f"" out_str += "\n" out_str += "
    UsernameProgressFirst ModifiedLast ModifiedAnnotation Time
    {x}
    {first_modified_str}{last_modified_str}{annotation_time_upper}{annotation_time}{annotation_time} - {annotation_time_upper}
    " From 11cd25995bccf85bd9e5f01cc205827c968f47e3 Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Sat, 26 Jul 2025 09:45:42 -0700 Subject: [PATCH 37/51] use TaskAgenda to get the task for a user; display 0/xxx even if no annotations made --- Campaign/views.py | 58 ++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 52 insertions(+), 6 deletions(-) diff --git a/Campaign/views.py b/Campaign/views.py index e112e0e9..36c7f927 100644 --- a/Campaign/views.py +++ b/Campaign/views.py @@ -23,6 +23,7 @@ from EvalData.models import PairwiseAssessmentResult from EvalData.models import seconds_to_timedelta from EvalData.models import TASK_DEFINITIONS +from EvalData.models import TaskAgenda from EvalData.models.direct_assessment_document import DirectAssessmentDocumentTask # pylint: disable=import-error @@ -65,7 +66,7 @@ def campaign_status(request, campaign_name, sort_key=2): 'Invalid campaign type for campaign {0}'.format(campaign.campaignName), content_type='text/plain', ) - + # special handling for ESA if "esa" in campaign_opts: return campaign_status_esa(campaign) @@ -237,7 +238,7 @@ def campaign_status_esa(campaign) -> str: