From f0c47e06e73c78dcaac2ad20a7e55b1a4f5190a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vil=C3=A9m=20Zouhar?= Date: Fri, 2 Aug 2024 20:12:08 +0200 Subject: [PATCH 01/51] fix typo --- EvalView/templates/EvalView/_instructions-esa.html | 2 +- EvalView/templates/EvalView/_instructions-mqm.html | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/EvalView/templates/EvalView/_instructions-esa.html b/EvalView/templates/EvalView/_instructions-esa.html index 62c2d143..04ec0f4d 100644 --- a/EvalView/templates/EvalView/_instructions-esa.html +++ b/EvalView/templates/EvalView/_instructions-esa.html @@ -1,7 +1,7 @@
-  • Higlighting errors:
+  • Highlighting errors:
    • Highlight the text fragment where you have identified a translation error (drag or click start & end). diff --git a/EvalView/templates/EvalView/_instructions-mqm.html b/EvalView/templates/EvalView/_instructions-mqm.html index 2f36b694..284beed1 100644 --- a/EvalView/templates/EvalView/_instructions-mqm.html +++ b/EvalView/templates/EvalView/_instructions-mqm.html @@ -1,7 +1,7 @@
-      • Higlighting errors:
+      • Highlighting errors:
        • Highlight the text fragment where you have identified a translation error (drag or click start & end). From d64659b142ad16d88262cdec5d4fac9b9708e687 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vil=C3=A9m=20Zouhar?= Date: Mon, 5 Aug 2024 10:29:24 +0200 Subject: [PATCH 02/51] clarify annotator tokens --- INSTALL.md | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index fffcf124..33809a75 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -1,4 +1,4 @@ -## Setup +# Setup 1. Basic setup: @@ -39,6 +39,9 @@ python3 manage.py StartNewCampaign Examples/MQM+ESA/manifest.json \ python3 manage.py CreateInviteTokens test_group 20 --create-group test_group ``` +Add `--task-confirmation-tokens` if you with to show annotators tokens at the end. +See [quality control](#Quality control) for more details. + 5. Optionally clean up everything ``` @@ -122,4 +125,13 @@ For task: - `batchNo`: task number - `randomSeed`: number used in batch generation - `requiredAnnotations`: how many annotations does a task need, in most cases use 1 -- `source/targetLanguage`: source and target language \ No newline at end of file +- `source/targetLanguage`: source and target language + +## Quality control + +With `--task-confirmation-tokens`, the annotators will be shown a random one if they fail the quality control and a correct one (matching the one in the CSV output) if they succeed. +The quality control checks if the perturbed samples (`itemType=BAD`) have statistically lower scores than the original ones (`itemType=TGT`). +Even without the switch, the campaign status page will show a p-value (last column for staff account) that corresponds to the outcome of this test. +If it's close to 1, then the annotator is annotating randomly and is of poor quality. +For values close to 0, the annotations are good. +The threshold to generate the true token for annotators is currently p<=10%. From 2666cb22ef5376f9da33e7829a7d63b3ef5082fe Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Mon, 19 Aug 2024 10:38:18 +0100 Subject: [PATCH 03/51] Update INSTALL.md --- INSTALL.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/INSTALL.md b/INSTALL.md index 33809a75..5c2c32a7 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -39,7 +39,7 @@ python3 manage.py StartNewCampaign Examples/MQM+ESA/manifest.json \ python3 manage.py CreateInviteTokens test_group 20 --create-group test_group ``` -Add `--task-confirmation-tokens` if you with to show annotators tokens at the end. +Add `--task-confirmation-tokens` if you want to generate annotator confirmation tokens. See [quality control](#Quality control) for more details. 5. Optionally clean up everything From 82e9eab7e19dfa35f12af3aaf17cd6db47172d3b Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Mon, 19 Aug 2024 10:38:24 +0100 Subject: [PATCH 04/51] Update INSTALL.md --- INSTALL.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/INSTALL.md b/INSTALL.md index 5c2c32a7..6b11109e 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -129,7 +129,7 @@ For task: ## Quality control -With `--task-confirmation-tokens`, the annotators will be shown a random one if they fail the quality control and a correct one (matching the one in the CSV output) if they succeed. +With `--task-confirmation-tokens`, the annotators will be shown a random key/token if they fail the quality control and a correct one (matching the one in the CSV output with credentials) if they succeed. 
The quality control checks if the perturbed samples (`itemType=BAD`) have statistically lower scores than the original ones (`itemType=TGT`). Even without the switch, the campaign status page will show a p-value (last column for staff account) that corresponds to the outcome of this test. If it's close to 1, then the annotator is annotating randomly and is of poor quality. From 715ade94b35c9fe0364d07e55ed79586b6deca54 Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Mon, 19 Aug 2024 10:38:30 +0100 Subject: [PATCH 05/51] Update INSTALL.md --- INSTALL.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/INSTALL.md b/INSTALL.md index 6b11109e..a85f6d10 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -134,4 +134,4 @@ The quality control checks if the perturbed samples (`itemType=BAD`) have statis Even without the switch, the campaign status page will show a p-value (last column for staff account) that corresponds to the outcome of this test. If it's close to 1, then the annotator is annotating randomly and is of poor quality. For values close to 0, the annotations are good. -The threshold to generate the true token for annotators is currently p<=10%. +The threshold to generate the valid token for annotators is currently p<=10%. From d07015d9ec24f048b327a9845324628258b196ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vil=C3=A9m=20Zouhar?= Date: Sat, 4 Jan 2025 11:41:45 +0100 Subject: [PATCH 06/51] remove constraints from loader --- EvalData/models/data_assessment.py | 8 -------- EvalData/models/direct_assessment.py | 4 ---- EvalData/models/direct_assessment_context.py | 7 ------- EvalData/models/direct_assessment_document.py | 7 ------- EvalData/models/multi_modal_assessment.py | 7 ------- EvalData/models/pairwise_assessment.py | 5 ----- EvalData/models/pairwise_assessment_document.py | 7 ------- INSTALL.md | 2 +- 8 files changed, 1 insertion(+), 46 deletions(-) diff --git a/EvalData/models/data_assessment.py b/EvalData/models/data_assessment.py index 1fcd72bb..9eb9ea00 100644 --- a/EvalData/models/data_assessment.py +++ b/EvalData/models/data_assessment.py @@ -429,14 +429,6 @@ def import_from_json(cls, campaign, batch_user, batch_data, max_count): ) new_items.append(new_item) - if not len(new_items) == 100: - _msg = 'Expected 100 items for task but found {0}'.format( - len(new_items) - ) - LOGGER.warn(_msg) - print(_msg) - continue - current_count += 1 # for new_item in new_items: diff --git a/EvalData/models/direct_assessment.py b/EvalData/models/direct_assessment.py index 18d32e71..2f2ada09 100644 --- a/EvalData/models/direct_assessment.py +++ b/EvalData/models/direct_assessment.py @@ -311,10 +311,6 @@ def import_from_json(cls, campaign, batch_user, batch_data, max_count): ) new_items.append(new_item) - if len(new_items) != 100: - LOGGER.error(f'Expected 100 items for task but found {len(new_items)}') - continue - current_count += 1 batch_meta.textpair_set.add(*new_items, bulk=False) batch_meta.save() diff --git a/EvalData/models/direct_assessment_context.py b/EvalData/models/direct_assessment_context.py index 9c231b71..988c59c6 100644 --- a/EvalData/models/direct_assessment_context.py +++ b/EvalData/models/direct_assessment_context.py @@ -367,13 +367,6 @@ def import_from_json(cls, campaign, batch_user, batch_data, max_count): if item['isCompleteDocument']: doc_items += 1 - if (len(new_items) - doc_items) != 100: - _msg = 'Expected 100 items for task but found {0}'.format( - len(new_items) - doc_items - ) - LOGGER.warn(_msg) - continue - current_count += 1 for new_item in 
new_items: diff --git a/EvalData/models/direct_assessment_document.py b/EvalData/models/direct_assessment_document.py index 861f0755..ab88b01e 100644 --- a/EvalData/models/direct_assessment_document.py +++ b/EvalData/models/direct_assessment_document.py @@ -462,13 +462,6 @@ def import_from_json(cls, campaign, batch_user, batch_data, max_count): if item['isCompleteDocument']: doc_items += 1 - if (len(new_items) - doc_items) != 100: - _msg = 'Expected 100 items for task but found {0}'.format( - len(new_items) - doc_items - ) - LOGGER.warn(_msg) - continue - current_count += 1 for new_item in new_items: diff --git a/EvalData/models/multi_modal_assessment.py b/EvalData/models/multi_modal_assessment.py index 17778e1f..65bebc1b 100644 --- a/EvalData/models/multi_modal_assessment.py +++ b/EvalData/models/multi_modal_assessment.py @@ -348,13 +348,6 @@ def import_from_json(cls, campaign, batch_user, batch_data, max_count): ) new_items.append(new_item) - if not len(new_items) == 100: - _msg = 'Expected 100 items for task but found {0}'.format( - len(new_items) - ) - LOGGER.warn(_msg) - continue - current_count += 1 # for new_item in new_items: diff --git a/EvalData/models/pairwise_assessment.py b/EvalData/models/pairwise_assessment.py index 7158e001..11934524 100644 --- a/EvalData/models/pairwise_assessment.py +++ b/EvalData/models/pairwise_assessment.py @@ -346,11 +346,6 @@ def import_from_json(cls, campaign, batch_user, batch_data, max_count): ) new_items.append(new_item) - if not len(new_items) == 100: - _msg = 'Expected 100 items for task but found {0}'.format(count_items) - LOGGER.warn(_msg) - continue - current_count += 1 # for new_item in new_items: diff --git a/EvalData/models/pairwise_assessment_document.py b/EvalData/models/pairwise_assessment_document.py index f834e815..26097a2e 100644 --- a/EvalData/models/pairwise_assessment_document.py +++ b/EvalData/models/pairwise_assessment_document.py @@ -471,13 +471,6 @@ def import_from_json(cls, campaign, batch_user, batch_data, max_count): if item['isCompleteDocument']: doc_items += 1 - if (len(new_items) - doc_items) != 100: - _msg = 'Expected 100 items for task but found {0}'.format( - len(new_items) - doc_items - ) - LOGGER.warn(_msg) - continue - current_count += 1 for new_item in new_items: diff --git a/INSTALL.md b/INSTALL.md index a85f6d10..42304238 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -78,7 +78,7 @@ The manifest looks like this: - In the associated data we had only one En-De task. The combination of redundancy of 2 and the first 2 in the task distribution simply creates two accounts with the same single task (redundant). If there were e.g. 5 tasks and we wanted no redundancy, the line would be `["eng", "deu", "uniform", 5, 5]`. Alternatively to manual manifests, a Django command can be created instead of the manifest file, see `Campaign/management/commands/InitCampaigh*.py`. -The batches file is a list of tasks with items and task descriptions. As a rule, there are exactly 100 segments in a task. An example for ESA/MQM: +The batches file is a list of tasks with items and task descriptions. There are usually at least 100 segments in a task. 
An example for ESA/MQM: ``` [ { From d6ff83c6698c2590eb78628188a0b5b19add99e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vil=C3=A9m=20Zouhar?= Date: Sat, 4 Jan 2025 11:53:25 +0100 Subject: [PATCH 07/51] rename hit to set --- Dashboard/templates/Dashboard/dashboard.html | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/Dashboard/templates/Dashboard/dashboard.html b/Dashboard/templates/Dashboard/dashboard.html index 2c9f893e..d1eb5bea 100644 --- a/Dashboard/templates/Dashboard/dashboard.html +++ b/Dashboard/templates/Dashboard/dashboard.html @@ -3,11 +3,6 @@ {% block content %}
          -

          Dashboard

          Evaluation campaign for shared tasks hosted at the 9th Conference on Machine Translation (WMT24)

          @@ -18,12 +13,12 @@

          This is Appraise

          {% if current_task %}
-          Current HIT
+          Current set

          Continue annotation for {{current_task.campaign}}:{{current_task.marketTargetLanguage}}.

          {% elif all_languages %} {% for _, languages in all_languages.items %} {% if languages %}
-          Next HIT
+          Next set

          Start annotation for: {% for code, language, campaign, task_url in languages %} {{campaign}}:{{language}}{% if not forloop.last %} · {% endif %} @@ -46,11 +41,11 @@

          Work completed

      {% else %}
-      Next HIT
+      Next set

      We are currently finalising the registration process for annotator accounts. Once this has been completed, direct assessment tasks will become available from this page. Please check back in a little while.

      {% endif %}

      User status

-      {{annotations}} annotation{{annotations|pluralize}}, {{hits}} HIT{{hits|pluralize}} completed. Total annotation duration {% if days %}{{days|stringformat:"02d"}}d{% endif %}{{hours|stringformat:"02d"}}h{{minutes|stringformat:"02d"}}m{{seconds|stringformat:"02d"}}s.
+      {{annotations}} annotation{{annotations|pluralize}}, {{hits}} set{{hits|pluralize}} completed. Total annotation duration {% if days %}{{days|stringformat:"02d"}}d{% endif %}{{hours|stringformat:"02d"}}h{{minutes|stringformat:"02d"}}m{{seconds|stringformat:"02d"}}s.

From 1954d7b90a861d285a5b27afea6ab9e5d9b2630c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vil=C3=A9m=20Zouhar?= Date: Sat, 4 Jan 2025 13:09:34 +0100 Subject: [PATCH 08/51] implement character-level alignment --- .../direct-assessment-document-mqm-esa.css | 1 - .../js/direct-assessment-document-mqm-esa.js | 53 +++++++++++++++---- .../direct-assessment-document-mqm-esa.html | 3 +- 3 files changed, 46 insertions(+), 11 deletions(-) diff --git a/EvalView/static/EvalView/css/direct-assessment-document-mqm-esa.css b/EvalView/static/EvalView/css/direct-assessment-document-mqm-esa.css index 89d25f78..d17ab3e7 100644 --- a/EvalView/static/EvalView/css/direct-assessment-document-mqm-esa.css +++ b/EvalView/static/EvalView/css/direct-assessment-document-mqm-esa.css @@ -136,7 +136,6 @@ min-width: 130px; } - .mqm_char:hover:not([selected]):not([in_mqm]) { outline: 2px solid #ccc; } diff --git a/EvalView/static/EvalView/js/direct-assessment-document-mqm-esa.js b/EvalView/static/EvalView/js/direct-assessment-document-mqm-esa.js index 7c9afc93..05c22eff 100644 --- a/EvalView/static/EvalView/js/direct-assessment-document-mqm-esa.js +++ b/EvalView/static/EvalView/js/direct-assessment-document-mqm-esa.js @@ -287,7 +287,8 @@ class MQMItemHandler { this.initialize() } - initialize() { + async initialize() { + this.el_source = this.el.find(".source-text") this.el_target = this.el.find(".target-text") this.el_slider = this.el.find('.slider') // for Appraise reasons it's a JSON string encoding JSON @@ -306,6 +307,9 @@ class MQMItemHandler { } this.mqm_submitted = structuredClone(this.mqm) this.mqm_orig = JSON.parse(JSON.parse(this.el.children('#mqm-payload-orig').html())) + this.text_source_orig = decodeEntitiesPreservingTags(JSON.parse(this.el.children('#text-source-payload').html()).trim()) + this.source_video = JSON.parse(this.el.children('#text-source-payload').html()).trim().startsWith(" v[0]) - let html_candidate = split_text.map((v, i) => { - return `${v}` - }).join("") + " [MISSING]" - this.el_target.html(html_candidate) + // setup_span_structure + let html_target = this.text_target_orig.split("").map((v, i) => { + return `${v}` + }).join("") + " [MISSING]" + this.el_target.html(html_target) this.redraw_mqm() @@ -350,6 +352,39 @@ class MQMItemHandler { this.el_slider.slider('value', score); } + // handle character alignment estimation + if (!this.source_video) { + let html_source = this.text_source_orig.split("").map((v, i) => { + return `${v}` + }).join("") + this.el_source.html(html_source) + + await waitout_js_loop() + + let len_src = this.text_source_orig.split("").length + let len_tgt = this.text_target_orig.split("").length + this.el_target.children(".mqm_char").each((i, el) => { + // on hover + $(el).on("mouseenter", () => { + // get char position from attribute + let tgt_char_i = Number.parseInt($(el).attr("char_id")) + // approximate position + let src_char_i = Math.floor(tgt_char_i * len_src / len_tgt) + // remove underline from all mqm + this.el_source.children(".mqm_char_src").css("text-decoration", "") + // set underline to the corresponding character and its neighbours + this.el_source.children(`#source_char_${src_char_i}`).css("text-decoration", "underline 10%") + this.el_source.children(`#source_char_${src_char_i-1}`).css("text-decoration", "underline 10%") + this.el_source.children(`#source_char_${src_char_i+1}`).css("text-decoration", "underline 10%") + }) + // on leave remove all decorations + $(el).on("mouseleave", () => { + this.el_source.children(".mqm_char_src").css("text-decoration", 
"") + }) + }) + } + + // slider bubble handling this.el_slider.find(".ui-slider-handle").append("
100
") let refresh_bubble = () => { diff --git a/EvalView/templates/EvalView/direct-assessment-document-mqm-esa.html b/EvalView/templates/EvalView/direct-assessment-document-mqm-esa.html index 3b380283..753cff37 100644 --- a/EvalView/templates/EvalView/direct-assessment-document-mqm-esa.html +++ b/EvalView/templates/EvalView/direct-assessment-document-mqm-esa.html @@ -60,6 +60,7 @@ {{ scores.mqm|json_script:"mqm-payload" }} {{ scores.mqm_orig|json_script:"mqm-payload-orig" }} + {{ item.sourceText|json_script:"text-source-payload" }} {{ item.targetText|json_script:"text-target-payload" }} {{ scores.score|json_script:"score-payload" }} @@ -79,7 +80,7 @@
- + {{ item.sourceText|safe }}
From d8d89682b3494eca9b81fad4d814511036d59d71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vil=C3=A9m=20Zouhar?= Date: Tue, 7 Jan 2025 11:03:35 +0100 Subject: [PATCH 09/51] make src-tgt character highlights more prominent --- .../js/direct-assessment-document-mqm-esa.js | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/EvalView/static/EvalView/js/direct-assessment-document-mqm-esa.js b/EvalView/static/EvalView/js/direct-assessment-document-mqm-esa.js index 05c22eff..857fc2c3 100644 --- a/EvalView/static/EvalView/js/direct-assessment-document-mqm-esa.js +++ b/EvalView/static/EvalView/js/direct-assessment-document-mqm-esa.js @@ -261,7 +261,11 @@ async function submit_finish_document(override_tutorial_check=false) { await new Promise(resolve => setTimeout(resolve, 5_000)) $("#button-next-doc").prop('disabled', false); } - +} +function decodeEntities(html) { + var txt = document.createElement("textarea"); + txt.innerHTML = html; + return txt.value; } function _show_error_box(text, timeout = 2000) { @@ -307,7 +311,7 @@ class MQMItemHandler { } this.mqm_submitted = structuredClone(this.mqm) this.mqm_orig = JSON.parse(JSON.parse(this.el.children('#mqm-payload-orig').html())) - this.text_source_orig = decodeEntitiesPreservingTags(JSON.parse(this.el.children('#text-source-payload').html()).trim()) + this.text_source_orig = decodeEntities(JSON.parse(this.el.children('#text-source-payload').html()).trim()) this.source_video = JSON.parse(this.el.children('#text-source-payload').html()).trim().startsWith(" 0; range--) { + // extrapolate range between #111 and #ddd + let color = (Math.floor(range/5 * (0xd - 0x1))+0x1).toString(16) + for (let i = Math.max(0, src_char_i - range); i < Math.min(len_src, src_char_i + range); i++) { + this.el_source.children(`#source_char_${i}`).css("text-decoration", `underline 25% #${color}${color}${color} solid`) + } + } }) // on leave remove all decorations $(el).on("mouseleave", () => { From a9f42215a867791ed36c6277414b69cf5f8274d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vil=C3=A9m=20Zouhar?= Date: Wed, 8 Jan 2025 09:35:46 +0100 Subject: [PATCH 10/51] add prompt dialog for style of src-tgt highlight (testing only) --- .../js/direct-assessment-document-mqm-esa.js | 52 ++++++++++--------- 1 file changed, 27 insertions(+), 25 deletions(-) diff --git a/EvalView/static/EvalView/js/direct-assessment-document-mqm-esa.js b/EvalView/static/EvalView/js/direct-assessment-document-mqm-esa.js index 857fc2c3..f744dfd8 100644 --- a/EvalView/static/EvalView/js/direct-assessment-document-mqm-esa.js +++ b/EvalView/static/EvalView/js/direct-assessment-document-mqm-esa.js @@ -127,7 +127,19 @@ async function get_error_type() { return error_stack } +var TMP_HIGHLIGHT_MODE = null +var TMP_HIGHLIGHT_WIDTH = null + $(document).ready(() => { + // TODO: only temporary, remove once decided + // native dialog box to select highlight mode + while(!["thin", "normal", "bold", "wavy", "dotted"].includes(TMP_HIGHLIGHT_MODE)) { + TMP_HIGHLIGHT_MODE = prompt('Please select highlight mode: "thin", "normal" (default), "bold", "wavy", "dotted"', "normal") + } + while(isNaN(parseInt(TMP_HIGHLIGHT_WIDTH)) || TMP_HIGHLIGHT_WIDTH < 1) { + TMP_HIGHLIGHT_WIDTH = parseInt(prompt('Please select how many characters to highlight. 
Default is 8.', 8)) + } + MQM_TYPE = JSON.parse($('#mqm-type-payload').html()) // sliders are present only for ESA @@ -377,12 +389,23 @@ class MQMItemHandler { // remove underline from all mqm this.el_source.children(".mqm_char_src").css("text-decoration", "") + let highlight_width = Math.floor(TMP_HIGHLIGHT_WIDTH / 2) // set underline to the corresponding character and its neighbours - for (let range = 5; range > 0; range--) { + for (let range = highlight_width; range > 0; range--) { // extrapolate range between #111 and #ddd - let color = (Math.floor(range/5 * (0xd - 0x1))+0x1).toString(16) - for (let i = Math.max(0, src_char_i - range); i < Math.min(len_src, src_char_i + range); i++) { - this.el_source.children(`#source_char_${i}`).css("text-decoration", `underline 25% #${color}${color}${color} solid`) + let color = (Math.floor((range-1)/highlight_width * (0xd - 0x1))+0x1).toString(16) + for (let i = Math.max(0, src_char_i - range); i <= Math.min(len_src, src_char_i + range); i++) { + if (TMP_HIGHLIGHT_MODE == "bold") { + this.el_source.children(`#source_char_${i}`).css("text-decoration", `underline 25% #${color}${color}${color} solid`) + } else if (TMP_HIGHLIGHT_MODE == "wavy") { + this.el_source.children(`#source_char_${i}`).css("text-decoration", `underline 15% #${color}${color}${color} wavy`) + } else if (TMP_HIGHLIGHT_MODE == "dotted") { + this.el_source.children(`#source_char_${i}`).css("text-decoration", `underline 15% #${color}${color}${color} dotted`) + } else if (TMP_HIGHLIGHT_MODE == "normal") { + this.el_source.children(`#source_char_${i}`).css("text-decoration", `underline 15% #${color}${color}${color} solid`) + } else if (TMP_HIGHLIGHT_MODE == "thin") { + this.el_source.children(`#source_char_${i}`).css("text-decoration", `underline 5% #${color}${color}${color} solid`) + } } } }) @@ -443,10 +466,6 @@ class MQMItemHandler { // store currently displayed version this.el.find('input[name="mqm"]').val(JSON.stringify(this.mqm)); - // NOTE: do not automatically recompute - // should be in range [0, 100] - // this.el_slider.slider('value', this.current_mqm_score(true)) - // redraw this.el_target.children(".mqm_char").each((i, el) => { el = $(el) @@ -558,23 +577,6 @@ class MQMItemHandler { alert(`Please follow the tutorial instructions.\n(${this.text_target_orig.substring(0, 60)}...)`); return false } - // skip other messages in the tutorial - // if (this.tutorial) { - // return true - // } - - // if (this.mqm.some((x) => x["severity"] == "undecided")) { - // alert('There are some segments without severity (in blue). Click on them to change their severities.'); - // return false - // } - - // remove dialogs - // if (this.mqm.length == 0 && !confirm("There are no annotated text fragments. Are you sure you want to submit?")) { - // return false - // } - // if (MQM_TYPE == "ESA" && this.current_mqm_score(true) == Number.parseFloat(this.el.find("input[name='score']").val()) && !confirm("You did not change the original translation score. 
Are you sure you want to submit?")) { - // return false - // } return true; } From 8268137bf6c2756d9d4742f393b0273763d4c58d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vil=C3=A9m=20Zouhar?= Date: Mon, 17 Feb 2025 18:13:02 +0100 Subject: [PATCH 11/51] add tests for more or less than 100 items; add info about the number of loaded items --- .github/workflows/tests.yml | 2 +- EvalData/models/data_assessment.py | 1 + EvalData/models/direct_assessment.py | 1 + EvalData/models/direct_assessment_context.py | 1 + EvalData/models/direct_assessment_document.py | 3 +- EvalData/models/multi_modal_assessment.py | 1 + EvalData/models/pairwise_assessment.py | 3 +- .../models/pairwise_assessment_document.py | 3 +- .../special/example_gt100.scores.csv.expected | 110 ++++++++++++++++++ .../special/example_lt100.scores.csv.expected | 10 ++ .../tests/special/manifest_gt100.json | 14 +++ .../tests/special/manifest_lt100.json | 14 +++ .../tests/special/test_examples_gt100.sh | 37 ++++++ .../tests/special/test_examples_lt100.sh | 48 ++++++++ 14 files changed, 244 insertions(+), 4 deletions(-) create mode 100644 RegressionTests/tests/special/example_gt100.scores.csv.expected create mode 100644 RegressionTests/tests/special/example_lt100.scores.csv.expected create mode 100644 RegressionTests/tests/special/manifest_gt100.json create mode 100644 RegressionTests/tests/special/manifest_lt100.json create mode 100644 RegressionTests/tests/special/test_examples_gt100.sh create mode 100644 RegressionTests/tests/special/test_examples_lt100.sh diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 1fff83db..ee50a6fe 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -31,7 +31,7 @@ jobs: cat listing.txt 7z a -tzip regression-tests-appraise.zip @listing.txt - name: Publish outputs - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: regression-tests-appraise path: regression-tests-appraise.zip diff --git a/EvalData/models/data_assessment.py b/EvalData/models/data_assessment.py index 9eb9ea00..08b0397f 100644 --- a/EvalData/models/data_assessment.py +++ b/EvalData/models/data_assessment.py @@ -429,6 +429,7 @@ def import_from_json(cls, campaign, batch_user, batch_data, max_count): ) new_items.append(new_item) + LOGGER.info(f'The task has {len(new_items)} items') current_count += 1 # for new_item in new_items: diff --git a/EvalData/models/direct_assessment.py b/EvalData/models/direct_assessment.py index 2f2ada09..a605b6d2 100644 --- a/EvalData/models/direct_assessment.py +++ b/EvalData/models/direct_assessment.py @@ -311,6 +311,7 @@ def import_from_json(cls, campaign, batch_user, batch_data, max_count): ) new_items.append(new_item) + LOGGER.info(f'The task has {len(new_items)} items') current_count += 1 batch_meta.textpair_set.add(*new_items, bulk=False) batch_meta.save() diff --git a/EvalData/models/direct_assessment_context.py b/EvalData/models/direct_assessment_context.py index 988c59c6..cb0a581d 100644 --- a/EvalData/models/direct_assessment_context.py +++ b/EvalData/models/direct_assessment_context.py @@ -367,6 +367,7 @@ def import_from_json(cls, campaign, batch_user, batch_data, max_count): if item['isCompleteDocument']: doc_items += 1 + LOGGER.info(f'The task has {len(new_items)} items') current_count += 1 for new_item in new_items: diff --git a/EvalData/models/direct_assessment_document.py b/EvalData/models/direct_assessment_document.py index ab88b01e..38ccfb8a 100644 --- a/EvalData/models/direct_assessment_document.py +++ 
b/EvalData/models/direct_assessment_document.py @@ -461,7 +461,8 @@ def import_from_json(cls, campaign, batch_user, batch_data, max_count): new_items.append(new_item) if item['isCompleteDocument']: doc_items += 1 - + + LOGGER.info(f'The task has {len(new_items)} items') current_count += 1 for new_item in new_items: diff --git a/EvalData/models/multi_modal_assessment.py b/EvalData/models/multi_modal_assessment.py index 65bebc1b..42b609d8 100644 --- a/EvalData/models/multi_modal_assessment.py +++ b/EvalData/models/multi_modal_assessment.py @@ -348,6 +348,7 @@ def import_from_json(cls, campaign, batch_user, batch_data, max_count): ) new_items.append(new_item) + LOGGER.info(f'The task has {len(new_items)} items') current_count += 1 # for new_item in new_items: diff --git a/EvalData/models/pairwise_assessment.py b/EvalData/models/pairwise_assessment.py index 11934524..8ad8c987 100644 --- a/EvalData/models/pairwise_assessment.py +++ b/EvalData/models/pairwise_assessment.py @@ -345,7 +345,8 @@ def import_from_json(cls, campaign, batch_user, batch_data, max_count): contextRight=context_right, ) new_items.append(new_item) - + + LOGGER.info(f'The task has {len(new_items)} items') current_count += 1 # for new_item in new_items: diff --git a/EvalData/models/pairwise_assessment_document.py b/EvalData/models/pairwise_assessment_document.py index 26097a2e..69c71088 100644 --- a/EvalData/models/pairwise_assessment_document.py +++ b/EvalData/models/pairwise_assessment_document.py @@ -470,7 +470,8 @@ def import_from_json(cls, campaign, batch_user, batch_data, max_count): new_items.append(new_item) if item['isCompleteDocument']: doc_items += 1 - + + LOGGER.info(f'The task has {len(new_items)} items') current_count += 1 for new_item in new_items: diff --git a/RegressionTests/tests/special/example_gt100.scores.csv.expected b/RegressionTests/tests/special/example_gt100.scores.csv.expected new file mode 100644 index 00000000..b325c99d --- /dev/null +++ b/RegressionTests/tests/special/example_gt100.scores.csv.expected @@ -0,0 +1,110 @@ +engdeu9604,ende-tutorial1,1000000,TGT,eng,deu,1,ende-tutorial1,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,ende-tutorial1,1000001,TGT,eng,deu,2,ende-tutorial1,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,ende-tutorial1,1000002,TGT,eng,deu,3,ende-tutorial1,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,ende-tutorial2,1000003,TGT,eng,deu,4,ende-tutorial2,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,ende-tutorial2,1000004,TGT,eng,deu,5,ende-tutorial2,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,ende-tutorial2,1000005,TGT,eng,deu,6,ende-tutorial2,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Claude-3.5,706,TGT,eng,deu,7,test-en-speech_392RoIzR2Fs_001,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,ONLINE-B,778,TGT,eng,deu,8,test-en-speech_TBPP-za78BQ_000,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Llama3-70B,5,TGT,eng,deu,9,test-en-news_brisbanetimes.com.au.228963,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Llama3-70B,6,TGT,eng,deu,10,test-en-news_brisbanetimes.com.au.228963,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Llama3-70B,7,TGT,eng,deu,11,test-en-news_brisbanetimes.com.au.228963,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Llama3-70B,8,TGT,eng,deu,12,test-en-news_brisbanetimes.com.au.228963,False,"{'start_i': 0, 'end_i': 
50, 'severity': 'major'}" +engdeu9604,Llama3-70B,9,TGT,eng,deu,13,test-en-news_brisbanetimes.com.au.228963,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Dubformer,451,BAD,eng,deu,14,test-en-social_112122127346453600#bad,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Dubformer,452,BAD,eng,deu,15,test-en-social_112122127346453600#bad,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Dubformer,453,BAD,eng,deu,16,test-en-social_112122127346453600#bad,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Dubformer,454,BAD,eng,deu,17,test-en-social_112122127346453600#bad,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Dubformer,455,BAD,eng,deu,18,test-en-social_112122127346453600#bad,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Dubformer,456,BAD,eng,deu,19,test-en-social_112122127346453600#bad,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Dubformer,457,BAD,eng,deu,20,test-en-social_112122127346453600#bad,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Dubformer,458,BAD,eng,deu,21,test-en-social_112122127346453600#bad,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Dubformer,459,BAD,eng,deu,22,test-en-social_112122127346453600#bad,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Dubformer,460,BAD,eng,deu,23,test-en-social_112122127346453600#bad,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Claude-3.5,899,TGT,eng,deu,24,test-en-literary_forever_snow_chunk_2_words_986,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Claude-3.5,900,TGT,eng,deu,25,test-en-literary_forever_snow_chunk_2_words_986,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Claude-3.5,901,TGT,eng,deu,26,test-en-literary_forever_snow_chunk_2_words_986,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Claude-3.5,902,TGT,eng,deu,27,test-en-literary_forever_snow_chunk_2_words_986,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Claude-3.5,903,TGT,eng,deu,28,test-en-literary_forever_snow_chunk_2_words_986,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Claude-3.5,904,TGT,eng,deu,29,test-en-literary_forever_snow_chunk_2_words_986,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Claude-3.5,905,TGT,eng,deu,30,test-en-literary_forever_snow_chunk_2_words_986,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Claude-3.5,906,TGT,eng,deu,31,test-en-literary_forever_snow_chunk_2_words_986,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Claude-3.5,907,TGT,eng,deu,32,test-en-literary_forever_snow_chunk_2_words_986,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Claude-3.5,908,TGT,eng,deu,33,test-en-literary_forever_snow_chunk_2_words_986,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Dubformer,724,TGT,eng,deu,34,test-en-speech_6JeSS_CODZ0_000,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IOL-Research,689,TGT,eng,deu,35,test-en-speech_07FOJFFqOYc_002,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IOL-Research,373,BAD,eng,deu,36,test-en-social_112111346044907536#bad,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IOL-Research,374,BAD,eng,deu,37,test-en-social_112111346044907536#bad,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" 
+engdeu9604,Llama3-70B,789,TGT,eng,deu,38,test-en-speech_XwIQLLbD7SI_001,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IKUN-C,969,TGT,eng,deu,39,test-en-literary_the_other_side_stormfall_chunk_2_words_956,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IKUN-C,970,TGT,eng,deu,40,test-en-literary_the_other_side_stormfall_chunk_2_words_956,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IKUN-C,971,TGT,eng,deu,41,test-en-literary_the_other_side_stormfall_chunk_2_words_956,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IKUN-C,972,TGT,eng,deu,42,test-en-literary_the_other_side_stormfall_chunk_2_words_956,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IKUN-C,973,TGT,eng,deu,43,test-en-literary_the_other_side_stormfall_chunk_2_words_956,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IKUN-C,974,TGT,eng,deu,44,test-en-literary_the_other_side_stormfall_chunk_2_words_956,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IKUN-C,975,TGT,eng,deu,45,test-en-literary_the_other_side_stormfall_chunk_2_words_956,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IKUN-C,976,TGT,eng,deu,46,test-en-literary_the_other_side_stormfall_chunk_2_words_956,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IKUN-C,977,TGT,eng,deu,47,test-en-literary_the_other_side_stormfall_chunk_2_words_956,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IKUN-C,978,TGT,eng,deu,48,test-en-literary_the_other_side_stormfall_chunk_2_words_956,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IOL-Research,373,TGT,eng,deu,49,test-en-social_112111346044907536,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IOL-Research,374,TGT,eng,deu,50,test-en-social_112111346044907536,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IOL-Research,375,TGT,eng,deu,51,test-en-social_112111346044907536,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IOL-Research,376,TGT,eng,deu,52,test-en-social_112111346044907536,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IOL-Research,377,TGT,eng,deu,53,test-en-social_112111346044907536,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IOL-Research,378,TGT,eng,deu,54,test-en-social_112111346044907536,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IOL-Research,379,TGT,eng,deu,55,test-en-social_112111346044907536,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IOL-Research,380,TGT,eng,deu,56,test-en-social_112111346044907536,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IOL-Research,381,TGT,eng,deu,57,test-en-social_112111346044907536,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IOL-Research,382,TGT,eng,deu,58,test-en-social_112111346044907536,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Llama3-70B,19,TGT,eng,deu,59,test-en-news_economist.14223#incomplete,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Llama3-70B,20,TGT,eng,deu,60,test-en-news_economist.14223#incomplete,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Claude-3.5,42,TGT,eng,deu,61,test-en-news_newsweek.63908,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Claude-3.5,43,TGT,eng,deu,62,test-en-news_newsweek.63908,False,"{'start_i': 0, 'end_i': 50, 'severity': 
'major'}" +engdeu9604,Claude-3.5,44,TGT,eng,deu,63,test-en-news_newsweek.63908,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Claude-3.5,45,TGT,eng,deu,64,test-en-news_newsweek.63908,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Claude-3.5,46,TGT,eng,deu,65,test-en-news_newsweek.63908,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Claude-3.5,47,TGT,eng,deu,66,test-en-news_newsweek.63908,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Claude-3.5,48,TGT,eng,deu,67,test-en-news_newsweek.63908,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Claude-3.5,49,TGT,eng,deu,68,test-en-news_newsweek.63908,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Claude-3.5,50,TGT,eng,deu,69,test-en-news_newsweek.63908,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Claude-3.5,51,TGT,eng,deu,70,test-en-news_newsweek.63908,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IKUN-C,10,TGT,eng,deu,71,test-en-news_csmonitor.com.7750,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IKUN-C,11,TGT,eng,deu,72,test-en-news_csmonitor.com.7750,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IKUN-C,12,TGT,eng,deu,73,test-en-news_csmonitor.com.7750,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IKUN-C,13,TGT,eng,deu,74,test-en-news_csmonitor.com.7750,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IKUN-C,14,TGT,eng,deu,75,test-en-news_csmonitor.com.7750,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IKUN-C,15,TGT,eng,deu,76,test-en-news_csmonitor.com.7750,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IKUN-C,16,TGT,eng,deu,77,test-en-news_csmonitor.com.7750,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IKUN-C,17,TGT,eng,deu,78,test-en-news_csmonitor.com.7750,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IKUN-C,18,TGT,eng,deu,79,test-en-news_csmonitor.com.7750,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IOL-Research,424,TGT,eng,deu,80,test-en-social_112121157211696272,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IOL-Research,425,TGT,eng,deu,81,test-en-social_112121157211696272,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IOL-Research,426,TGT,eng,deu,82,test-en-social_112121157211696272,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IOL-Research,427,TGT,eng,deu,83,test-en-social_112121157211696272,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IOL-Research,428,TGT,eng,deu,84,test-en-social_112121157211696272,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IOL-Research,429,TGT,eng,deu,85,test-en-social_112121157211696272,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IOL-Research,430,TGT,eng,deu,86,test-en-social_112121157211696272,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IOL-Research,431,TGT,eng,deu,87,test-en-social_112121157211696272,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IOL-Research,432,TGT,eng,deu,88,test-en-social_112121157211696272,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IOL-Research,433,TGT,eng,deu,89,test-en-social_112121157211696272,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" 
+engdeu9604,Dubformer,451,TGT,eng,deu,90,test-en-social_112122127346453600,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Dubformer,452,TGT,eng,deu,91,test-en-social_112122127346453600,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Dubformer,453,TGT,eng,deu,92,test-en-social_112122127346453600,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Dubformer,454,TGT,eng,deu,93,test-en-social_112122127346453600,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Dubformer,455,TGT,eng,deu,94,test-en-social_112122127346453600,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Dubformer,456,TGT,eng,deu,95,test-en-social_112122127346453600,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Dubformer,457,TGT,eng,deu,96,test-en-social_112122127346453600,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Dubformer,458,TGT,eng,deu,97,test-en-social_112122127346453600,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Dubformer,459,TGT,eng,deu,98,test-en-social_112122127346453600,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Dubformer,460,TGT,eng,deu,99,test-en-social_112122127346453600,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IOL-Research,767,TGT,eng,deu,100,test-en-speech_QaueRRYecxo_000,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Dubformer,453,TGT,eng,deu,101,test-en-social_112122127346453600,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Dubformer,454,TGT,eng,deu,102,test-en-social_112122127346453600,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Dubformer,455,TGT,eng,deu,103,test-en-social_112122127346453600,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Dubformer,456,TGT,eng,deu,104,test-en-social_112122127346453600,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Dubformer,457,TGT,eng,deu,105,test-en-social_112122127346453600,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Dubformer,458,TGT,eng,deu,106,test-en-social_112122127346453600,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Dubformer,459,TGT,eng,deu,107,test-en-social_112122127346453600,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Dubformer,460,TGT,eng,deu,108,test-en-social_112122127346453600,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,Dubformer,461,TGT,eng,deu,109,test-en-social_112122127346453600,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9604,IOL-Research,768,TGT,eng,deu,110,test-en-speech_QaueRRYecxo_000,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" diff --git a/RegressionTests/tests/special/example_lt100.scores.csv.expected b/RegressionTests/tests/special/example_lt100.scores.csv.expected new file mode 100644 index 00000000..cc6b9db4 --- /dev/null +++ b/RegressionTests/tests/special/example_lt100.scores.csv.expected @@ -0,0 +1,10 @@ +engdeu9704,ende-tutorial1,1000000,TGT,eng,deu,1,ende-tutorial1,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9704,ende-tutorial1,1000001,TGT,eng,deu,2,ende-tutorial1,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9704,ende-tutorial1,1000002,TGT,eng,deu,3,ende-tutorial1,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9704,ende-tutorial2,1000003,TGT,eng,deu,4,ende-tutorial2,False,"{'start_i': 0, 'end_i': 50, 'severity': 
'major'}" +engdeu9704,ende-tutorial2,1000004,TGT,eng,deu,5,ende-tutorial2,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9704,ende-tutorial2,1000005,TGT,eng,deu,6,ende-tutorial2,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9704,Claude-3.5,706,TGT,eng,deu,7,test-en-speech_392RoIzR2Fs_001,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9704,ONLINE-B,778,TGT,eng,deu,8,test-en-speech_TBPP-za78BQ_000,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9704,Llama3-70B,5,TGT,eng,deu,9,test-en-news_brisbanetimes.com.au.228963,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" +engdeu9704,Llama3-70B,6,TGT,eng,deu,10,test-en-news_brisbanetimes.com.au.228963,False,"{'start_i': 0, 'end_i': 50, 'severity': 'major'}" diff --git a/RegressionTests/tests/special/manifest_gt100.json b/RegressionTests/tests/special/manifest_gt100.json new file mode 100644 index 00000000..41f301b8 --- /dev/null +++ b/RegressionTests/tests/special/manifest_gt100.json @@ -0,0 +1,14 @@ +{ + "CAMPAIGN_URL": "http://127.0.0.1:8000/dashboard/sso/", + "CAMPAIGN_NAME": "example15esaGT100", + "CAMPAIGN_KEY": "example15esaGT100", + "CAMPAIGN_NO": 150, + "REDUNDANCY": 2, + + "TASKS_TO_ANNOTATORS": [ + ["eng", "deu", "uniform", 4, 2] + ], + + "TASK_TYPE": "Document", + "TASK_OPTIONS": "ESA" +} diff --git a/RegressionTests/tests/special/manifest_lt100.json b/RegressionTests/tests/special/manifest_lt100.json new file mode 100644 index 00000000..3ae9a6ba --- /dev/null +++ b/RegressionTests/tests/special/manifest_lt100.json @@ -0,0 +1,14 @@ +{ + "CAMPAIGN_URL": "http://127.0.0.1:8000/dashboard/sso/", + "CAMPAIGN_NAME": "example15esaLT100", + "CAMPAIGN_KEY": "example15esaLT100", + "CAMPAIGN_NO": 151, + "REDUNDANCY": 2, + + "TASKS_TO_ANNOTATORS": [ + ["eng", "deu", "uniform", 4, 2] + ], + + "TASK_TYPE": "Document", + "TASK_OPTIONS": "ESA" +} diff --git a/RegressionTests/tests/special/test_examples_gt100.sh b/RegressionTests/tests/special/test_examples_gt100.sh new file mode 100644 index 00000000..fdfd618d --- /dev/null +++ b/RegressionTests/tests/special/test_examples_gt100.sh @@ -0,0 +1,37 @@ +#!/usr/bin/env bash -x + +# Exit on error +set -eo pipefail + +prefix=example_gt100 + +mkdir -p Batches + +# duplicate the last 10 examples in the list but increase "itemID" by one +python3 < ${prefix}.scores.csv +diff --strip-trailing-cr ${prefix}.scores.csv ${prefix}.scores.csv.expected + +# Exit with success code +exit $EXIT_CODE_SUCCESS diff --git a/RegressionTests/tests/special/test_examples_lt100.sh b/RegressionTests/tests/special/test_examples_lt100.sh new file mode 100644 index 00000000..c13e7426 --- /dev/null +++ b/RegressionTests/tests/special/test_examples_lt100.sh @@ -0,0 +1,48 @@ +#!/usr/bin/env bash -x + +# Exit on error +set -eo pipefail + +prefix=example_lt100 + +mkdir -p Batches + +# $APPRAISE_EXAMPLES/MQM+ESA/batches_esa.json is a list of dictionaries, each containing among other fields a field "items", which is a list +# read $APPRAISE_EXAMPLES/MQM+ESA/batches_esa.json and keep only the first 10 examples in each "items" list +python3 < ${prefix}.scores.csv +diff --strip-trailing-cr ${prefix}.scores.csv ${prefix}.scores.csv.expected + +# Make two more annotations, should not create any new entries in the scores file +for score in $( seq 1 3 ); do + $APPRAISE_MANAGE MakeAnnotation engdeu9704:17d9e109 Document $score --mqm '[{"start_i": 0, "end_i": 50, "severity": "major"}]' +done + +# the output should remain the same +$APPRAISE_MANAGE ExportSystemScoresToCSV 
example15esaLT100 | sed "s/, /| /g" | cut -f-10 -d, | sed "s/| /, /g" > ${prefix}.scores2.csv +diff --strip-trailing-cr ${prefix}.scores2.csv ${prefix}.scores.csv.expected + + +# Exit with success code +exit $EXIT_CODE_SUCCESS From d97c1b2d8d1297552071665998cae49f8d2a72f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vil=C3=A9m=20Zouhar?= Date: Mon, 17 Feb 2025 18:31:31 +0100 Subject: [PATCH 12/51] set src-tgt char alignment as discussed --- .../js/direct-assessment-document-mqm-esa.js | 26 ++----------------- 1 file changed, 2 insertions(+), 24 deletions(-) diff --git a/EvalView/static/EvalView/js/direct-assessment-document-mqm-esa.js b/EvalView/static/EvalView/js/direct-assessment-document-mqm-esa.js index f744dfd8..3a7e9e77 100644 --- a/EvalView/static/EvalView/js/direct-assessment-document-mqm-esa.js +++ b/EvalView/static/EvalView/js/direct-assessment-document-mqm-esa.js @@ -127,19 +127,7 @@ async function get_error_type() { return error_stack } -var TMP_HIGHLIGHT_MODE = null -var TMP_HIGHLIGHT_WIDTH = null - $(document).ready(() => { - // TODO: only temporary, remove once decided - // native dialog box to select highlight mode - while(!["thin", "normal", "bold", "wavy", "dotted"].includes(TMP_HIGHLIGHT_MODE)) { - TMP_HIGHLIGHT_MODE = prompt('Please select highlight mode: "thin", "normal" (default), "bold", "wavy", "dotted"', "normal") - } - while(isNaN(parseInt(TMP_HIGHLIGHT_WIDTH)) || TMP_HIGHLIGHT_WIDTH < 1) { - TMP_HIGHLIGHT_WIDTH = parseInt(prompt('Please select how many characters to highlight. Default is 8.', 8)) - } - MQM_TYPE = JSON.parse($('#mqm-type-payload').html()) // sliders are present only for ESA @@ -389,23 +377,13 @@ class MQMItemHandler { // remove underline from all mqm this.el_source.children(".mqm_char_src").css("text-decoration", "") - let highlight_width = Math.floor(TMP_HIGHLIGHT_WIDTH / 2) + let highlight_width = Math.floor(16 / 2) // set underline to the corresponding character and its neighbours for (let range = highlight_width; range > 0; range--) { // extrapolate range between #111 and #ddd let color = (Math.floor((range-1)/highlight_width * (0xd - 0x1))+0x1).toString(16) for (let i = Math.max(0, src_char_i - range); i <= Math.min(len_src, src_char_i + range); i++) { - if (TMP_HIGHLIGHT_MODE == "bold") { - this.el_source.children(`#source_char_${i}`).css("text-decoration", `underline 25% #${color}${color}${color} solid`) - } else if (TMP_HIGHLIGHT_MODE == "wavy") { - this.el_source.children(`#source_char_${i}`).css("text-decoration", `underline 15% #${color}${color}${color} wavy`) - } else if (TMP_HIGHLIGHT_MODE == "dotted") { - this.el_source.children(`#source_char_${i}`).css("text-decoration", `underline 15% #${color}${color}${color} dotted`) - } else if (TMP_HIGHLIGHT_MODE == "normal") { - this.el_source.children(`#source_char_${i}`).css("text-decoration", `underline 15% #${color}${color}${color} solid`) - } else if (TMP_HIGHLIGHT_MODE == "thin") { - this.el_source.children(`#source_char_${i}`).css("text-decoration", `underline 5% #${color}${color}${color} solid`) - } + this.el_source.children(`#source_char_${i}`).css("text-decoration", `underline 15% #${color}${color}${color} solid`) } } }) From bdd3840ec2386bffa0fecc3ea448535e5c54b1ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vil=C3=A9m=20Zouhar?= Date: Mon, 17 Feb 2025 18:33:42 +0100 Subject: [PATCH 13/51] remove unused/duplicate files --- scripts/create_iwslt22_tasks.py | 764 ----------------------- scripts/create_wmt19_tasks.py | 546 ----------------- scripts/create_wmt21_tasks.py | 709 
--------------------- scripts/create_wmt22_tasks.py | 1023 ------------------------------- 4 files changed, 3042 deletions(-) delete mode 100644 scripts/create_iwslt22_tasks.py delete mode 100644 scripts/create_wmt19_tasks.py delete mode 100644 scripts/create_wmt21_tasks.py delete mode 100644 scripts/create_wmt22_tasks.py diff --git a/scripts/create_iwslt22_tasks.py b/scripts/create_iwslt22_tasks.py deleted file mode 100644 index fb26f639..00000000 --- a/scripts/create_iwslt22_tasks.py +++ /dev/null @@ -1,764 +0,0 @@ -# pylint: disable=C0103,C0111,C0330,E1101 -import sys -from collections import OrderedDict -from copy import deepcopy -from glob import iglob -from json import dumps as json_dumps -from os.path import basename -from os.path import join -from random import choice -from random import randint -from random import seed -from random import shuffle -from typing import Any -from typing import Dict -from typing import List -from typing import Text -from typing import Tuple - -from lxml import etree - - -MAX_TASK_SIZE = 100 # No support for tasks over 100 items -MAX_DOC_LENGTH = 70 # We do not support documents longer than 70 segments - -MISSING_TRANSLATION_MESSAGE = ("NO TRANSLATION AVAILABLE",) -DEFAULT_TRANSLATOR = "DEFAULT" -# If False, documents with control items will be very last ones in each batch -SHUFFLE_DOCS_WITH_CONTROL_ITEMS = True -# If True, add references as additional system outputs -INCLUDE_REFERENCES_AS_SYSTEMS = True -# If True, documents may be oversampled to form the last batch -USE_ALL_DOCUMENTS_AND_ALL_SYSTEMS = True -REFERENCE_AS_SYSTEM_PREFIX = 'translator-' - - -def unwrap_xml( - xml_file, - missing_message=MISSING_TRANSLATION_MESSAGE, - encoding='utf-8', -): - """ - Unwraps an xml file in WMT format, producing source and (if present) reference files - - :param xml_file: The xml file (or fd) - :param missing_message: The message to insert when no reference - - :returns: src_lang, src_lines, ref_lang, ref_lines, hyp_lang, hyp_lines - - ref_lines maps translator to document to tuples of segment id and line text - hyp_lines maps system to document to tuples of segment id and line text - - ref_lang and hyp_lang may be None, and then their lines are empty - note: a single language is assumed for each of sources, refs and hyps - - This function has been extracted from - https://github.com/wmt-conference/wmt-format-tools/wmtformat/unwrap.py with - some modifications - """ - tree = etree.parse(xml_file) - - # Find and check the documents (src, ref, hyp) - src_langs, ref_langs, hyp_langs, translators, systems = ( - set(), - set(), - set(), - set(), - set(), - ) - - for src_doc in tree.getroot().findall(".//src"): - src_langs.add(src_doc.get("lang")) - - for ref_doc in tree.getroot().findall(".//ref"): - ref_langs.add(ref_doc.get("lang")) - translator = ref_doc.get("translator") - if translator: - translators.add(translator) - - for hyp_doc in tree.getroot().findall(".//hyp"): - hyp_langs.add(hyp_doc.get("lang")) - systems.add(hyp_doc.get("system")) - - if len(src_langs) > 1: - raise RuntimeError("Multiple source languages found") - - if len(src_langs) == 0: - raise RuntimeError("No source languages found") - - src_lang = src_langs.pop() - src_docs = OrderedDict() - - if len(ref_langs) > 1: - raise RuntimeError("Multiple reference languages found") - - translators = list(translators) - if len(ref_langs) > 0: - if len(translators) == 0: - print("No translator identifiers found") - translators.append(DEFAULT_TRANSLATOR) - ref_lang = ref_langs.pop() - ref_docs = 
OrderedDict( - (translator, OrderedDict()) for translator in translators - ) - else: - print("No references found") - ref_lang = None - ref_docs = OrderedDict() - - if len(hyp_langs) > 1: - raise RuntimeError("Multiple hypothesis languages found") - - systems = list(systems) - if len(hyp_langs) > 0: - hyp_docs = OrderedDict((system, OrderedDict()) for system in systems) - hyp_lang = hyp_langs.pop() - else: - hyp_docs = OrderedDict() - hyp_lang = None - - # Extract text - src_sent_count, doc_count = 0, 0 - for doc in tree.getroot().findall(".//doc"): - doc_id = doc.get("id") - src = [] - if "testsuite" in doc.attrib: - continue - doc_count += 1 - src_sents = {int(seg.get("id")): seg.text for seg in doc.findall(".//src//seg")} - - def get_sents(doc): - return { - int(seg.get("id")): seg.text if seg.text else "" - for seg in doc.findall(f".//seg") - } - - if ref_lang: - _ref_docs = doc.findall(".//ref") - trans_to_ref = {} - - # If no translator identifiers, we just read one reference (if any) - # If there are translator identifiers, we add a reference for each translator - if len(translators) == 1 and DEFAULT_TRANSLATOR in translators: - if len(_ref_docs): - trans_to_ref[DEFAULT_TRANSLATOR] = get_ref_sents(_ref_docs[0]) - else: - trans_to_ref[DEFAULT_TRANSLATOR] = {} - else: - trans_to_ref = { - ref_doc.get("translator"): get_sents(ref_doc) - for ref_doc in _ref_docs - } - - if hyp_lang: - _hyp_docs = doc.findall(".//hyp") - system_to_ref = { - hyp_doc.get("system"): get_sents(hyp_doc) for hyp_doc in _hyp_docs - } - - for seg_id in sorted(src_sents.keys()): - src.append([seg_id, src_sents[seg_id]]) - src_sent_count += 1 - if ref_lang: - for translator in translators: - if doc_id not in ref_docs[translator]: - ref_docs[translator][doc_id] = [] - - # _ref_text = trans_to_ref.get(translator, {translator: {}}).get( - _ref_text = trans_to_ref[translator].get(seg_id, missing_message) - ref_docs[translator][doc_id].append((seg_id, _ref_text)) - - if _ref_text == MISSING_TRANSLATION_MESSAGE: - print( - f'Warning: missing reference for translator {translator}, ' - f'document {doc_id}, segment {seg_id}' - ) - if hyp_lang: - for system in systems: - if doc_id not in hyp_docs[system]: - hyp_docs[system][doc_id] = [] - - # _hyp_text = system_to_ref.get(system, {system: {}}).get( - _hyp_text = system_to_ref[system].get(seg_id, missing_message) - hyp_docs[system][doc_id].append((seg_id, _hyp_text)) - - if _hyp_text == MISSING_TRANSLATION_MESSAGE: - print( - f'Warning: missing translation from {system}, ' - f'document {doc_id}, segment {seg_id}' - ) - - src_docs[doc_id] = src - - print( - f"Extracted {doc_count} document(s) containing {src_sent_count} sentences in {src_lang}" - ) - - return src_lang, src_docs, ref_lang, ref_docs, hyp_lang, hyp_docs - - -def _create_bad_ref(seg_text: str, ref_text: str, character_based: bool = False) -> str: - """ - Creates bad reference for given text. - - Segment length (a, b] to phrase length (excluding a, including b) - mapping defined as follows: - ( 0, 1] : 1 - ( 1, 5] : 2 - ( 5, 8] : 3 - ( 8, 15] : 4 - (15, 20] : 5 - (20, max] : 6 - - For character-based languages, which do not support tokenisation - by whitespace, the resulting phrase length will be doubled, and - is interpreted as a character length. 
- """ - seg_data = seg_text.split(' ') - ref_data = ref_text.split(' ')[1:] # Don't use the first word - - if character_based: - seg_data = [x for x in seg_text] - ref_data = [x for x in ref_text] - - seg_len = len(seg_data) - ref_len = len(ref_data) - - # Determine length of bad phrase, relative to segment length. - _seg_to_bad_mapping = { - (None, 1): 2, - (1, 5): 2, - (5, 8): 3, - (8, 15): 4, - (15, 20): 5, - (20, None): 6, - } - - bad_len = 0 - for seg_pair in _seg_to_bad_mapping: - left, right = seg_pair - - # seg_len == right; left edge case - if not left: - if seg_len == right: - bad_len = _seg_to_bad_mapping[seg_pair] - break - - # left < seg_len; right edge case - elif not right: - if left < seg_len: - bad_len = _seg_to_bad_mapping[seg_pair] - break - - # left < seg_len <= right; middle cases - elif left < seg_len <= right: - bad_len = _seg_to_bad_mapping[seg_pair] - break - - # Double length of bad phrase for character-based languages. - if character_based: - bad_len = 2 * bad_len - - # Determine random replacement position. For segments longer than - # (bad_len + 1), we enforce that this cannot be sentence initial - # or final, so positions 0 and (seg_len - bad_len -1) are invalid - # and we use an embedded bad_pos in [1, (seg_len - bad_len - 1)]. - # This happens for all seg_len > 3. - bad_pos = 1 - _xs = max(1, seg_len - bad_len - 1) - bad_pos = choice([x + 1 for x in range(_xs)]) - - ref_pos = 1 - if ref_len - bad_len > 0: - _xs = max(1, ref_len - bad_len - 1) - ref_pos = choice(range(_xs)) - - bad_data = ( - seg_data[:bad_pos] - + ref_data[ref_pos : ref_pos + bad_len] - + seg_data[bad_pos + bad_len :] - ) - bad_text = ' '.join(bad_data) - if character_based: - bad_text = ''.join(bad_data) - - # print(seg_text) - # print(bad_text) - # print('------------') - return bad_text - - -def create_bad_refs( - docs: Dict[str, List[Tuple[str, str]]], - refs: Dict[str, List[Tuple[str, str]]], - character_based: bool = False, -) -> Dict[str, List[Tuple[str, str]]]: - """ - Creates bad references for given documents. - - For each segment in the given documents, this creates a so-called - ``bad reference'' which is constructed by replacing an embedded - phrase p with a randomly placed phrase p' of the same length, - taken from a different segment contained in refs. The length of - the phrase is relative to the full segment length. - - See _create_bad_ref() definition for length mapping details. - """ - # Create mapping from f'{doc_id}_{seg_id}' to reference text. - all_refs = {} - for curr_doc_id, curr_doc in refs.items(): - for curr_seg_id, curr_ref_text in curr_doc: - all_refs[f'{curr_doc_id}_{curr_seg_id}'] = curr_ref_text - - # Create list of f'{doc_id}_{seg_id}' ids, to be used for random - # choice later when we want to identify a reference to work with. - all_keys = list(all_refs.keys()) - - # Iterate through documents and create bad references. - bad_docs: Dict[str, List[Tuple[str, str]]] = OrderedDict() - for curr_doc_id, curr_doc in docs.items(): - if not curr_doc_id in bad_docs: - bad_docs[curr_doc_id] = [] - - print(f'doc_id: {curr_doc_id},\tdoc_len: {len(curr_doc)}') - for curr_seg in curr_doc: - curr_seg_id, curr_seg_text = curr_seg - - # Bad reference id may not be identical to current id. - bad_id = choice(all_keys) - while bad_id == f'{curr_doc_id}_{curr_seg_id}': - bad_id = choice(all_keys) - - curr_bad_text = _create_bad_ref( - curr_seg_text, - all_refs[bad_id], - character_based=character_based, - ) - - # Ensure that keys can be reused. 
- all_keys.append(bad_id) - - bad_docs[curr_doc_id].append((curr_seg_id, curr_bad_text)) - - return bad_docs - - -if __name__ == "__main__": - if len(sys.argv) < 8: - print('Example usage:') - print( - f' {sys.argv[0]} newstest2021.en-de.all.xml batches.en-de enu deu 50 True False' - ) - exit() - - XML_FILE = sys.argv[1] # Path to .xml file with sources, references and outputs - OUT_NAME = sys.argv[2] # Prefix for .csv and .json output files - SRC_LANG = sys.argv[3] # Code for source language, e.g. eng - TGT_LANG = sys.argv[4] # Code for target language, e.g. deu - TASK_MAX = int(sys.argv[5]) # Maximum number of tasks - CONTROLS = sys.argv[6].lower() not in ['', '0', 'false', 'off'] # Generate QC items - CHARLANG = sys.argv[7].lower() in ['1', 'true', 'on'] # Character-based - print(f'Character based={CHARLANG}') - - ENC = 'utf-8' - - RND_SEED = 1234567 - # RND_SEED = 11111 - seed(RND_SEED) - - print(f'Quality control={CONTROLS}') - if CONTROLS: - REQUIRED_SEGS = 92 - else: - REQUIRED_SEGS = 100 - print(f'Setting REQUIRED_SEGS={REQUIRED_SEGS}') - - SYS_DOCS: Dict[str, Dict[str, List[Tuple[str, str]]]] = OrderedDict() - BAD_DOCS: Dict[str, Dict[str, List[Tuple[str, str]]]] = OrderedDict() - print(f'Loading docs from {XML_FILE}') - src_lang, SRC_DOCS, ref_lang, REF_DOCS, hyp_lang, SYS_DOCS = unwrap_xml( - XML_FILE, encoding=ENC - ) - - # This reference will be used for generating BAD items - REF_ID = sorted(list(REF_DOCS.keys()))[0] - print(f'Using reference "{REF_ID}"') - - # Add references as additional system outputs - if INCLUDE_REFERENCES_AS_SYSTEMS: - for ref_id in sorted(list(REF_DOCS.keys())): - sys_id = REFERENCE_AS_SYSTEM_PREFIX + ref_id - print(f'Adding reference "{ref_id}" as system output "{sys_id}"') - SYS_DOCS[sys_id] = REF_DOCS[ref_id] - - # List of system names that can be iterated deterministically - SYS_IDS = sorted(list(SYS_DOCS.keys())) - print("SYS IDS size:", len(SYS_IDS)) - - for sys_id in SYS_IDS: - print(f'Generating bad references for {sys_id}') - BAD_DOCS[sys_id] = create_bad_refs( - SYS_DOCS[sys_id], REF_DOCS[REF_ID], character_based=CHARLANG - ) - - # pylint: disable-msg=invalid-name - some_sys_id = choice(SYS_IDS) - some_doc_id = choice(sorted(list(SYS_DOCS[some_sys_id].keys()))) - some_sys_text = SYS_DOCS[some_sys_id][some_doc_id] - some_bad_text = BAD_DOCS[some_sys_id][some_doc_id] - print("Example:", some_sys_id, some_doc_id) - - for _s, _b in zip(some_sys_text, some_bad_text): - print(_s) - print(_b) - print('---') - - DOC_STATS: Dict[int, List[Tuple[int, str, str]]] = OrderedDict() - for sys_id in SYS_IDS: - for doc_id in SYS_DOCS[sys_id].keys(): - doc_len = len(SYS_DOCS[sys_id][doc_id]) - - # We do not support documents longer than 70 segments. - if doc_len > MAX_DOC_LENGTH: - print("!!! DOCUMENT TOO LONG:", doc_id) - continue - - if not doc_len in DOC_STATS.keys(): - DOC_STATS[doc_len] = [] - DOC_STATS[doc_len].append((doc_len, doc_id, sys_id)) - - # Randomise system order - for doc_len in DOC_STATS: - shuffle(DOC_STATS[doc_len]) - - print("Doc. 
stats (doc.len/count):", DOC_STATS.keys()) - total_docs = 0 - total_sys = set() - for doc_len in DOC_STATS.keys(): - print(f' {doc_len}:\t{len(DOC_STATS[doc_len])}') - total_docs += len(DOC_STATS[doc_len]) - for x in DOC_STATS[doc_len]: - total_sys.add(x[2]) - print("total docs:", total_docs) - print("total sys:", total_sys) - - all_systems = list(total_sys) - sampled_tasks: List[Tuple[Tuple[int, str, str], ...]] = [] - CURR_LEN = 0 - CURR_SYS = 0 - curr_task: List[Tuple[int, str, str]] = [] - DOC_STATS_COPY = deepcopy(DOC_STATS) - last_task = False - while DOC_STATS.keys(): - ALL_KEYS = sorted(list(DOC_STATS.keys())) - # Maximum allowed length of a document to not exceed 100 segments in this task - max_delta = REQUIRED_SEGS - CURR_LEN - valid_keys = [x for x in ALL_KEYS if x <= max_delta] - - if not valid_keys: - print(" #segments in current task:", CURR_LEN) - for _doc in curr_task: - print(" ", _doc) - print('------') - sampled_tasks.append(tuple(curr_task)) - CURR_LEN = 0 - curr_task = [] - if last_task: # Stop if this was the last task with - break - continue - - # Take the document that fill in the allowed size perfectly, or random - if max_delta in valid_keys: - curr_key = max_delta - else: - curr_key = choice(valid_keys) - - CURR_LEN += curr_key - curr_val = DOC_STATS[curr_key].pop(0) # This takes a random system. - # print(' ... selected ', curr_val) - # print(' .. left systems', sum( len(DOC_STATS[k]) for k in DOC_STATS )) - - # Below code would pick systems one after the other - # curr_val = None - # for iter_val in DOC_STATS[curr_key]: - # if iter_val[2] == all_systems[CURR_SYS]: - # curr_val = iter_val - # DOC_STATS[curr_key].remove(iter_val) - # break - - # if not curr_val: - # curr_val = DOC_STATS[curr_key].pop(0) - # CURR_SYS = all_systems.index(curr_val[2]) - # CURR_SYS = (CURR_SYS + 1) % len(all_systems) - - curr_task.append(curr_val) - if not DOC_STATS[curr_key]: - DOC_STATS.pop(curr_key) - - # If there are some documents left that cannot form a full task with - # 100 segments, take random documents to create the last task. - # This ensures that all documents have been used at least once. 
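-        # (Illustratively: if the pool of unused documents runs out while the
-        # current task holds, say, only 40 of the required segments, documents
-        # are re-drawn from a copy of the original pool so that this final
-        # task is also filled up to the required size.)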
- if ( - USE_ALL_DOCUMENTS_AND_ALL_SYSTEMS - and len(DOC_STATS) == 0 - and len(curr_task) > 0 - ): - DOC_STATS = DOC_STATS_COPY - last_task = True - print('Creating last batch with padded documents') - - # print("------------") - # print("Left docs:") - # print(DOC_STATS) - # print("------------") - - # Print documents per system - _all_tasks = [] - for _tup in sampled_tasks: - _all_tasks += list(_tup) - _docs_by_sys: Dict[str, Any] = {} - for (_, docid, sysid) in _all_tasks: - if sysid not in _docs_by_sys: - _docs_by_sys[sysid] = [] - _docs_by_sys[sysid].append(docid) - for i, sysid in enumerate(_docs_by_sys): - print(i, sysid) - for j, docid in enumerate(sorted(_docs_by_sys[sysid])): - print(" ", j, docid) - - # Shuffle order of tasks - shuffle(sampled_tasks) - print("Total number of tasks:", len(sampled_tasks)) - - padded_tasks: List[Tuple[Tuple[int, str, str], ...]] = [] - for tid, task in enumerate(sampled_tasks): - task_docs = len(task) - task_len = sum([x[0] for x in task]) - print(f'task_len: {task_len}') - if task_len > MAX_TASK_SIZE: - raise NotImplementedError( - 'No support for tasks >{0} items!'.format(MAX_TASK_SIZE) - ) - - elif task_len < MAX_TASK_SIZE: - pad_size = MAX_TASK_SIZE - task_len - pad_data: List[Tuple[int, str, str]] = list(task) - pad_pos = 0 - while pad_size > 0: - print(f'pad_size: {pad_size}') - print(f'pad_pos: {pad_pos}') - pad_data.append(tuple(list(pad_data[pad_pos]) + [True])) # type: ignore - print(pad_data[-1]) - pad_size -= pad_data[-1][0] - pad_pos = (pad_pos + 1) % task_docs - if pad_size < 0: - print(f'pad_size: {pad_size}') - print(f'pad_pos: {pad_pos}') - - last_doc: Tuple[int, str, str] = pad_data[-1] - print(last_doc[0], '-->', last_doc[0] + pad_size) - fixed_doc = (last_doc[0] + pad_size, *last_doc[1:]) - pad_data[-1] = fixed_doc - print(pad_data[-1][0]) - padded_tasks.append(tuple(pad_data)) - print("Padded tasks:") - for _pad in padded_tasks[-1]: - print(" ", _pad) - - else: - print(f'WARNING: no control items in task no. {tid}') - # raise NotImplementedError('Needs isControl=True update!') - padded_tasks.append(tuple(task)) # TODO: does this ever occur? 
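-
-    # Sketch of the CSV rows emitted below (values are illustrative):
-    #   task_id,doc_len,doc_id,sys_id,isControl
-    #   e.g. "3,12,some-doc,some-system,False"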
- - csv_data = [] - task_id = 0 - for task in padded_tasks: - task_id += 1 - task_len = sum([x[0] for x in task]) - print(f'>>> task_len: {task_len}') - - for _doc in task: - _data = [str(task_id)] - for x in _doc: # type: ignore - _data.append(str(x)) - - if _data[-1] != 'True': - _data.append('False') # isControl=False - print('>>> ', ' '.join(_data)) - csv_data.append(','.join(_data)) - - with open(f'{OUT_NAME}.csv', mode='w') as _file: - for csv_line in csv_data: - _file.write(csv_line) - _file.write('\n') - - json_data = [] - batch_id = 0 - for task in padded_tasks[:TASK_MAX]: - # Remember, batch numbers are one-based - task_data = OrderedDict( - { - 'batchNo': batch_id + 1, - 'batchSize': 100, - 'sourceLanguage': SRC_LANG, - 'targetLanguage': TGT_LANG, - 'requiredAnnotations': 1, - 'randomSeed': RND_SEED, - } - ) - - source_id = basename(XML_FILE) - - items_data: List[List[Dict[str, Any]]] = [] # Keeps items grouped into document - _item = 0 - doc_counter = 0 - for doc_data in task: - items_data.append([]) # Add a new bucket for items from this documents - has_control_item = False - - doc_len, doc_id, sys_id, *rest = doc_data # type: ignore - - isControl = rest is not None and rest - - target_id = sys_id - - _src = {} - _ref = {} - _bad = {} - _tgt = {} - - for item_id, item_src in SRC_DOCS[doc_id]: - seg_id = f'{doc_id}_{item_id}' - _src[seg_id] = item_src - - for item_id, item_ref in REF_DOCS[REF_ID][doc_id]: - seg_id = f'{doc_id}_{item_id}' - _ref[seg_id] = item_ref - - for item_id, item_bad in BAD_DOCS[sys_id][doc_id]: - seg_id = f'{doc_id}_{item_id}' - _bad[seg_id] = item_bad - - for item_id, item_tgt in SYS_DOCS[sys_id][doc_id]: - seg_id = f'{doc_id}_{item_id}' - _tgt[seg_id] = item_tgt - - seg_counter = 0 - context_src: List[Text] = [] - context_ref: List[Text] = [] - context_bad: List[Text] = [] - context_tgt: List[Text] = [] - for seg_id in _src: - if seg_counter >= doc_len: # Padding tasks are shorter! 
- break - item_src = _src[seg_id] - item_ref = _ref[seg_id] - item_bad = _bad[seg_id] - item_tgt = _tgt[seg_id] - - target_text = item_tgt - target_type = 'TGT' - - # Do not generate any BAD items if QC is disabled - if CONTROLS and isControl: - randomCoinFlip = choice( - [ - False, - False, - False, - True, - True, - True, - True, - True, - True, - True, - ] # 7:3 chance - ) - if randomCoinFlip: - target_text = item_bad - target_type = 'BAD' - has_control_item = True - - obj: Dict[str, Any] = OrderedDict() - obj['_item'] = _item - obj['_block'] = -1 - obj['sourceID'] = source_id - obj['sourceContextLeft'] = ' '.join(context_src) - obj['sourceText'] = item_src - obj['targetID'] = target_id - obj['targetContextLeft'] = ' '.join(context_tgt) - obj['targetText'] = target_text - obj['itemID'] = seg_counter - obj['itemType'] = target_type - obj['documentID'] = doc_id - obj['isCompleteDocument'] = False - - # print(seg_id) - # print(' '.join(context_src)) - # print(item_src) - # print('...') - # print(' '.join(context_tgt)) - # print(item_tgt.encode('utf-8')) - # print('---') - - context_src.append(item_src) - context_ref.append(item_ref) - context_bad.append(item_bad) - context_tgt.append(target_text) - - items_data[-1].append(obj) - _item += 1 - seg_counter += 1 - - obj = OrderedDict() - obj['_item'] = _item - obj['_block'] = -1 - obj['sourceID'] = source_id - obj['sourceText'] = ' '.join(context_src) # full document - obj['targetID'] = target_id - obj['targetText'] = ' '.join(context_tgt) # full document - obj['itemID'] = item_id - obj['itemType'] = 'TGT' - obj['documentID'] = doc_id - obj['isCompleteDocument'] = True - items_data[-1].append(obj) - - if has_control_item and SHUFFLE_DOCS_WITH_CONTROL_ITEMS: - # Move the document with control items to a random position so - # that they are not accumulated as very last documents - _bad_doc = items_data.pop() - _pos = randint(0, len(items_data) - 1) - print(f' Moving the last QC document to position {_pos}') - items_data.insert(_pos, _bad_doc) - - # Extract items from documents - _items_data = [item for doc_items in items_data for item in doc_items] - # Re-assign _item numbers - if SHUFFLE_DOCS_WITH_CONTROL_ITEMS: - _item = 0 - for i in range(len(_items_data)): - _items_data[i]['_item'] = _item - if _items_data[i]['isCompleteDocument'] == False: - _item += 1 - - output_data = OrderedDict({'task': task_data, 'items': _items_data}) - - json_data.append(output_data) - - # write out JSON - json_text = json_dumps(json_data, indent=2, sort_keys=True) - - json_file_name = f'{OUT_NAME}.json' - with open(json_file_name, mode='w', encoding='utf8') as out_file: - sys.stdout.write( - 'Creating {0}, batch no. {1} ... 
'.format(json_file_name, batch_id + 1), - ) - out_file.write(str(json_text)) - sys.stdout.write('OK\n') - - batch_id += 1 - - print(f'Total tasks: {len(sampled_tasks)}') - print(f'Total docs: {total_docs}') - print(f'Total sys: {len(total_sys)} {sorted(list(total_sys))}') diff --git a/scripts/create_wmt19_tasks.py b/scripts/create_wmt19_tasks.py deleted file mode 100644 index 77b56cde..00000000 --- a/scripts/create_wmt19_tasks.py +++ /dev/null @@ -1,546 +0,0 @@ -# pylint: disable=C0103,C0111,C0330,E1101 -import sys -from collections import defaultdict -from collections import OrderedDict -from glob import iglob -from json import dumps as json_dumps -from os.path import basename -from os.path import join -from random import choice -from random import seed -from random import shuffle -from typing import Any -from typing import Dict -from typing import List -from typing import Text -from typing import Tuple - -from bs4 import BeautifulSoup # type: ignore - - -MAX_TASK_SIZE = 100 # No support for tasks over 100 items -MAX_DOC_LENGTH = 70 # We do not support documents longer than 70 segments - - -def load_docs_from_sgml( - file_path: str, encoding='utf-8' -) -> Dict[str, List[Tuple[str, str]]]: - """ - Loads documents from given SGML file. - - Returns dict mapping document ids to list of segments [segments]. - Each segment is a tuple (segment id, segment text). - """ - soup = None - - with open(file_path, encoding=encoding) as _file: - soup = BeautifulSoup(_file, features='lxml') - - all_docs: Dict[str, List[Tuple[str, str]]] = OrderedDict() - for curr_doc in soup.find_all('doc'): - curr_doc_id = curr_doc.attrs['docid'] - if not curr_doc_id in all_docs: - all_docs[curr_doc_id] = [] - - for curr_seg in curr_doc.find_all('seg'): - curr_seg_id = curr_seg.attrs['id'] - curr_seg_text = curr_seg.get_text() - all_docs[curr_doc_id].append((curr_seg_id, curr_seg_text)) - - return all_docs - - -def _create_bad_ref(seg_text: str, ref_text: str, character_based: bool = False) -> str: - """ - Creates bad reference for given text. - - Segment length (a, b] to phrase length (excluding a, including b) - mapping defined as follows: - ( 0, 1] : 1 - ( 1, 5] : 2 - ( 5, 8] : 3 - ( 8, 15] : 4 - (15, 20] : 5 - (20, max] : 6 - - For character-based languages, which do not support tokenisation - by whitespace, the resulting phrase length will be doubled, and - is interpreted as a character length. - """ - seg_data = seg_text.split(' ') - ref_data = ref_text.split(' ') - - if character_based: - seg_data = [x for x in seg_text] - ref_data = [x for x in ref_text] - - seg_len = len(seg_data) - ref_len = len(ref_data) - - # Determine length of bad phrase, relative to segment length. - _seg_to_bad_mapping = { - (None, 1): 1, - (1, 5): 2, - (5, 8): 3, - (8, 15): 4, - (15, 20): 5, - (20, None): 6, - } - - bad_len = 0 - for seg_pair in _seg_to_bad_mapping: - left, right = seg_pair - - # seg_len == right; left edge case - if not left: - if seg_len == right: - bad_len = _seg_to_bad_mapping[seg_pair] - break - - # left < seg_len; right edge case - elif not right: - if left < seg_len: - bad_len = _seg_to_bad_mapping[seg_pair] - break - - # left < seg_len <= right; middle cases - elif left < seg_len <= right: - bad_len = _seg_to_bad_mapping[seg_pair] - break - - # Double length of bad phrase for character-based languages. - if character_based: - bad_len = 2 * bad_len - - # Determine random replacement position. 
For segments longer than - # (bad_len + 1), we enforce that this cannot be sentence initial - # or final, so positions 0 and (seg_len - bad_len -1) are invalid - # and we use an embedded bad_pos in [1, (seg_len - bad_len - 1)]. - # This happens for all seg_len > 3. - bad_pos = 0 - if seg_len - bad_len > 0: - bad_pos = choice(range(seg_len - bad_len)) - - elif seg_len > 3: - bad_pos = choice([x + 1 for x in range(seg_len - bad_len - 1)]) - - ref_pos = 0 - if ref_len - bad_len > 0: - ref_pos = choice(range(ref_len - bad_len)) - - bad_data = ( - seg_data[:bad_pos] - + ref_data[ref_pos : ref_pos + bad_len] - + seg_data[bad_pos + bad_len :] - ) - bad_text = ' '.join(bad_data) - if character_based: - bad_text = ''.join(bad_data) - - return bad_text - - -def create_bad_refs( - docs: Dict[str, List[Tuple[str, str]]], - refs: Dict[str, List[Tuple[str, str]]], - character_based: bool = False, -) -> Dict[str, List[Tuple[str, str]]]: - """ - Creates bad references for given documents. - - For each segment in the given documents, this creates a so-called - ``bad reference'' which is constructed by replacing an embedded - phrase p with a randomly placed phrase p' of the same length, - taken from a different segment contained in refs. The length of - the phrase is relative to the full segment length. - - See _create_bad_ref() definition for length mapping details. - """ - # Create mapping from f'{doc_id}_{seg_id}' to reference text. - all_refs = {} - for curr_doc_id, curr_doc in refs.items(): - for curr_seg_id, curr_ref_text in curr_doc: - all_refs[f'{curr_doc_id}_{curr_seg_id}'] = curr_ref_text - - # Create list of f'{doc_id}_{seg_id}' ids, to be used for random - # choice later when we want to identify a reference to work with. - all_keys = list(all_refs.keys()) - - # Iterate through documents and create bad references. - bad_docs: Dict[str, List[Tuple[str, str]]] = OrderedDict() - for curr_doc_id, curr_doc in docs.items(): - if not curr_doc_id in bad_docs: - bad_docs[curr_doc_id] = [] - - print(f'doc_id: {curr_doc_id},\tdoc_len: {len(curr_doc)}') - for curr_seg in curr_doc: - curr_seg_id, curr_seg_text = curr_seg - - # Bad reference id may not be identical to current id. - bad_id = choice(all_keys) - while bad_id == f'{curr_doc_id}_{curr_seg_id}': - bad_id = choice(all_keys) - - curr_bad_text = _create_bad_ref( - curr_seg_text, - all_refs[bad_id], - character_based=character_based, - ) - - # Ensure that keys can be reused. - all_keys.append(bad_id) - - bad_docs[curr_doc_id].append((curr_seg_id, curr_bad_text)) - - return bad_docs - - -def process_sgml(file_path: str) -> Dict[int, List[str]]: - """ - Extracts document stats from given SGML file. - - Returns dict mapping number of segments to list of document [ids]. - Each referenced document has the respective number of segments. 
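-
-    For example (illustrative ids only), a test set with two 10-segment
-    documents and one 23-segment document would yield:
-
-        {10: ['doc-a', 'doc-b'], 23: ['doc-c']}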
- """ - soup = None - - with open(file_path) as _file: - soup = BeautifulSoup(_file, features='lxml') - - all_docs = [] - stats: Dict[int, List[str]] = defaultdict(list) - for curr_doc in soup.find_all('doc'): - curr_doc_id = curr_doc.attrs['docid'] - seg_count = len(curr_doc.find_all('seg')) - stats[seg_count].append(curr_doc_id) - all_docs.append(seg_count) - - curr_len = 0 - for doc in all_docs: - if curr_len + doc > REQUIRED_SEGS: - print(curr_len) - curr_len = 0 - curr_len += doc - print(curr_len) - - return stats - - -if __name__ == "__main__": - SRC_SGML = sys.argv[1] # Path to source .sgm file - REF_SGML = sys.argv[2] # Path to reference .sgm file - SYS_PATH = sys.argv[3] # Path to the directory with system outputs - SYS_GLOB = sys.argv[4] # Pattern for .sgm files, e.g '*.sgm' - OUT_NAME = sys.argv[5] # Prefix for .csv and .json output files - SRC_LANG = sys.argv[6] # Code for source language, e.g. eng - TGT_LANG = sys.argv[7] # Code for target language, e.g. deu - TASK_MAX = int(sys.argv[8]) # Maximum number of tasks - CONTROLS = sys.argv[9].lower() not in ['', '0', 'false', 'off'] - ENC = 'utf-8' - - RND_SEED = 123456 - seed(RND_SEED) - - print(f'Quality control={CONTROLS}') - if CONTROLS: - REQUIRED_SEGS = 80 - else: - REQUIRED_SEGS = 100 - print(f'Setting REQUIRED_SEGS={REQUIRED_SEGS}') - - print(f'Loading source docs from {SRC_SGML}') - SRC_DOCS = load_docs_from_sgml(SRC_SGML, encoding=ENC) - print(f'Loading reference docs from {SRC_SGML}') - REF_DOCS = load_docs_from_sgml(REF_SGML, encoding=ENC) - - SYS_DOCS: Dict[str, Dict[str, List[Tuple[str, str]]]] = {} - BAD_DOCS: Dict[str, Dict[str, List[Tuple[str, str]]]] = {} - for SYS_SGML in iglob(join(SYS_PATH, SYS_GLOB)): - SYS_ID = basename(SYS_SGML) - print(f'Loading outputs of {SYS_ID}') - - SYS_DOCS[SYS_ID] = load_docs_from_sgml(SYS_SGML, encoding=ENC) - BAD_DOCS[SYS_ID] = create_bad_refs(SYS_DOCS[SYS_ID], REF_DOCS) - - # pylint: disable-msg=invalid-name - some_sys_id = choice(list(SYS_DOCS.keys())) - some_doc_id = choice(list(SYS_DOCS[some_sys_id].keys())) - some_sys_text = SYS_DOCS[some_sys_id][some_doc_id] - some_bad_text = BAD_DOCS[some_sys_id][some_doc_id] - print(some_sys_id, some_doc_id) - - for _s, _b in zip(some_sys_text, some_bad_text): - print(_s) - print(_b) - print('---') - - DOC_STATS: Dict[int, List[Tuple[int, str, str]]] = {} - for sys_id in SYS_DOCS: - for doc_id in SYS_DOCS[sys_id]: - doc_len = len(SYS_DOCS[sys_id][doc_id]) - - # We do not support documents longer than 70 segments. 
- if doc_len > MAX_DOC_LENGTH: - continue - - if not doc_len in DOC_STATS.keys(): - DOC_STATS[doc_len] = [] - - DOC_STATS[doc_len].append((doc_len, doc_id, sys_id)) - - # Randomise system order - for doc_len in DOC_STATS: - shuffle(DOC_STATS[doc_len]) - - print(sorted(DOC_STATS.keys())) - total_docs = 0 - total_sys = set() - for doc_len in sorted(DOC_STATS.keys()): - print(f'{doc_len}:\t{len(DOC_STATS[doc_len])}') - total_docs += len(DOC_STATS[doc_len]) - for x in DOC_STATS[doc_len]: - total_sys.add(x[2]) - - all_systems = list(total_sys) - sampled_tasks: List[Tuple[Tuple[int, str, str], ...]] = [] - CURR_LEN = 0 - CURR_SYS = 0 - curr_task: List[Tuple[int, str, str]] = [] - while DOC_STATS.keys(): - ALL_KEYS = list(DOC_STATS.keys()) - max_delta = REQUIRED_SEGS - CURR_LEN - valid_keys = [x for x in ALL_KEYS if x <= max_delta] - - if not valid_keys: - print(CURR_LEN) - print(curr_task) - print('------') - sampled_tasks.append(tuple(curr_task)) - CURR_LEN = 0 - curr_task = [] - continue - - if max_delta in valid_keys: - curr_key = max_delta - else: - curr_key = choice(valid_keys) - - CURR_LEN += curr_key - - curr_val = DOC_STATS[curr_key].pop(0) # This takes a random system. - - # Below code would pick systems one after the other - # - # curr_val = None - # for iter_val in DOC_STATS[curr_key]: - # if iter_val[2] == all_systems[CURR_SYS]: - # curr_val = iter_val - # DOC_STATS[curr_key].remove(iter_val) - # break - # - # if not curr_val: - # curr_val = DOC_STATS[curr_key].pop(0) - # CURR_SYS = all_systems.index(curr_val[2]) - # CURR_SYS = (CURR_SYS + 1) % len(all_systems) - - curr_task.append(curr_val) - if not DOC_STATS[curr_key]: - DOC_STATS.pop(curr_key) - - # Shuffle order of tasks - shuffle(sampled_tasks) - - padded_tasks: List[Tuple[Tuple[int, str, str], ...]] = [] - for tid, task in enumerate(sampled_tasks): - task_docs = len(task) - task_len = sum([x[0] for x in task]) - print(f'task_len: {task_len}') - if task_len > MAX_TASK_SIZE: - raise NotImplementedError( - 'No support for tasks >{0} items!'.format(MAX_TASK_SIZE) - ) - - elif task_len < MAX_TASK_SIZE: - pad_size = MAX_TASK_SIZE - task_len - pad_data: List[Tuple[int, str, str]] = list(task) - pad_pos = 0 - while pad_size > 0: - print(f'pad_size: {pad_size}') - print(f'pad_pos: {pad_pos}') - pad_data.append(tuple(list(pad_data[pad_pos]) + [True])) # type: ignore - print(pad_data[-1]) - pad_size -= pad_data[-1][0] - pad_pos = (pad_pos + 1) % task_docs - if pad_size < 0: - print(f'pad_size: {pad_size}') - print(f'pad_pos: {pad_pos}') - - last_doc: Tuple[int, str, str] = pad_data[-1] - print(last_doc[0], '-->', last_doc[0] + pad_size) - fixed_doc = (last_doc[0] + pad_size, *last_doc[1:]) - pad_data[-1] = fixed_doc - print(pad_data[-1][0]) - padded_tasks.append(tuple(pad_data)) - print(padded_tasks[-1]) - - else: - print(f'WARNING: no control items in task no. {tid}') - # raise NotImplementedError('Needs isControl=True update!') - padded_tasks.append(tuple(task)) # TODO: does this ever occur? 
- - csv_data = [] - task_id = 0 - for task in padded_tasks: - task_id += 1 - task_len = sum([x[0] for x in task]) - print(f'task_len: {task_len}') - - for _doc in task: - _data = [str(task_id)] - for x in _doc: # type: ignore - _data.append(str(x)) - - if _data[-1] != 'True': - _data.append('False') # isControl=False - print(_data) - csv_data.append(','.join(_data)) - - with open(f'{OUT_NAME}.csv', mode='w') as _file: - for csv_line in csv_data: - _file.write(csv_line) - _file.write('\n') - - json_data = [] - batch_id = 0 - for task in padded_tasks[:TASK_MAX]: - # Remember, batch numbers are one-based - task_data = OrderedDict( - { - 'batchNo': batch_id + 1, - 'batchSize': 100, - 'sourceLanguage': SRC_LANG, - 'targetLanguage': TGT_LANG, - 'requiredAnnotations': 1, - 'randomSeed': RND_SEED, - } - ) - - source_id = basename(SRC_SGML) - - items_data = [] - _item = 0 - for doc_data in task: - doc_len, doc_id, sys_id, *rest = doc_data # type: ignore - - isControl = rest is not None and rest - - target_id = sys_id - - _src = {} - _ref = {} - _bad = {} - _tgt = {} - - for item_id, item_src in SRC_DOCS[doc_id]: - seg_id = f'{doc_id}_{item_id}' - _src[seg_id] = item_src - - for item_id, item_ref in REF_DOCS[doc_id]: - seg_id = f'{doc_id}_{item_id}' - _ref[seg_id] = item_ref - - for item_id, item_bad in BAD_DOCS[sys_id][doc_id]: - seg_id = f'{doc_id}_{item_id}' - _bad[seg_id] = item_bad - - for item_id, item_tgt in SYS_DOCS[sys_id][doc_id]: - seg_id = f'{doc_id}_{item_id}' - _tgt[seg_id] = item_tgt - - seg_counter = 0 - context_src: List[Text] = [] - context_ref: List[Text] = [] - context_bad: List[Text] = [] - context_tgt: List[Text] = [] - for seg_id in _src: - if seg_counter >= doc_len: # Padding tasks are shorter! - break - item_src = _src[seg_id] - item_ref = _ref[seg_id] - item_bad = _bad[seg_id] - item_tgt = _tgt[seg_id] - - target_text = item_tgt - target_type = 'TGT' - if ( - CONTROLS and isControl - ): # Do not generate any BAD items if QC is disabled - randomCoinFlip = choice( - [False, False, True, True, True] - ) # 60:40 chance - if randomCoinFlip: - target_text = item_bad - target_type = 'BAD' - - obj: Dict[str, Any] = OrderedDict() - obj['_item'] = _item - obj['_block'] = -1 - obj['sourceID'] = source_id - obj['sourceContextLeft'] = ' '.join(context_src) - obj['sourceText'] = item_src - obj['targetID'] = target_id - obj['targetContextLeft'] = ' '.join(context_tgt) - obj['targetText'] = target_text - obj['itemID'] = seg_counter - obj['itemType'] = target_type - obj['documentID'] = doc_id - obj['isCompleteDocument'] = False - - print(seg_id) - print(' '.join(context_src)) - print(item_src) - print('...') - print(' '.join(context_tgt)) - print(item_tgt.encode('utf-8')) - print('---') - - context_src.append(item_src) - context_ref.append(item_ref) - context_bad.append(item_bad) - context_tgt.append(target_text) - - items_data.append(obj) - _item += 1 - seg_counter += 1 - - obj = OrderedDict() - obj['_item'] = _item - obj['_block'] = -1 - obj['sourceID'] = source_id - obj['sourceText'] = ' '.join(context_src) # full document - obj['targetID'] = target_id - obj['targetText'] = ' '.join(context_tgt) # full document - obj['itemID'] = item_id - obj['itemType'] = 'TGT' - obj['documentID'] = doc_id - obj['isCompleteDocument'] = True - items_data.append(obj) - - output_data = OrderedDict({'task': task_data, 'items': items_data}) - - json_data.append(output_data) - - # write out JSON - json_text = json_dumps(json_data, indent=2, sort_keys=True) - - json_file_name = f'{OUT_NAME}.json' - with 
open(json_file_name, mode='w', encoding='utf8') as out_file: - sys.stdout.write('Creating {0} ... '.format(json_file_name, ending='')) # type: ignore - out_file.write(str(json_text)) - sys.stdout.write('OK\n') - - batch_id += 1 - - print(f'Total tasks: {len(sampled_tasks)}') - print(f'Total docs: {total_docs}') - print(f'Total sys: {len(total_sys)} {total_sys}') diff --git a/scripts/create_wmt21_tasks.py b/scripts/create_wmt21_tasks.py deleted file mode 100644 index ea9fd14a..00000000 --- a/scripts/create_wmt21_tasks.py +++ /dev/null @@ -1,709 +0,0 @@ -# pylint: disable=C0103,C0111,C0330,E1101 -import sys -from collections import defaultdict -from collections import OrderedDict -from glob import iglob -from json import dumps as json_dumps -from os.path import basename -from os.path import join -from random import choice -from random import randint -from random import seed -from random import shuffle -from typing import Any -from typing import Dict -from typing import List -from typing import Text -from typing import Tuple - -from lxml import etree - - -MAX_TASK_SIZE = 100 # No support for tasks over 100 items -MAX_DOC_LENGTH = 70 # We do not support documents longer than 70 segments - -MISSING_TRANSLATION_MESSAGE = ("NO TRANSLATION AVAILABLE",) -DEFAULT_TRANSLATOR = "DEFAULT" -# If False, documents with control items will be very last ones in each batch -SHUFFLE_DOCS_WITH_CONTROL_ITEMS = True -# If True, add references as additional system outputs -INCLUDE_REFERENCES_AS_SYSTEMS = True -REFERENCE_AS_SYSTEM_PREFIX = 'translator-' - - -def unwrap_xml( - xml_file, - missing_message=MISSING_TRANSLATION_MESSAGE, - encoding='utf-8', -): - """ - Unwraps an xml file in WMT format, producing source and (if present) reference files - - :param xml_file: The xml file (or fd) - :param missing_message: The message to insert when no reference - - :returns: src_lang, src_lines, ref_lang, ref_lines, hyp_lang, hyp_lines - - ref_lines maps translator to document to tuples of segment id and line text - hyp_lines maps system to document to tuples of segment id and line text - - ref_lang and hyp_lang may be None, and then their lines are empty - note: a single language is assumed for each of sources, refs and hyps - - This function has been extracted from - https://github.com/wmt-conference/wmt-format-tools/wmtformat/unwrap.py with - some modifications - """ - tree = etree.parse(xml_file) - - # Find and check the documents (src, ref, hyp) - src_langs, ref_langs, hyp_langs, translators, systems = ( - set(), - set(), - set(), - set(), - set(), - ) - - for src_doc in tree.getroot().findall(".//src"): - src_langs.add(src_doc.get("lang")) - - for ref_doc in tree.getroot().findall(".//ref"): - ref_langs.add(ref_doc.get("lang")) - translator = ref_doc.get("translator") - if translator: - translators.add(translator) - - for hyp_doc in tree.getroot().findall(".//hyp"): - hyp_langs.add(hyp_doc.get("lang")) - systems.add(hyp_doc.get("system")) - - if len(src_langs) > 1: - raise RuntimeError("Multiple source languages found") - - if len(src_langs) == 0: - raise RuntimeError("No source languages found") - - src_lang = src_langs.pop() - src_docs = OrderedDict() - - if len(ref_langs) > 1: - raise RuntimeError("Multiple reference languages found") - - translators = list(translators) - if len(ref_langs) > 0: - if len(translators) == 0: - print("No translator identifiers found") - translators.append(DEFAULT_TRANSLATOR) - ref_lang = ref_langs.pop() - ref_docs = OrderedDict( - (translator, OrderedDict()) for translator in 
translators - ) - else: - print("No references found") - ref_lang = None - ref_docs = OrderedDict() - - if len(hyp_langs) > 1: - raise RuntimeError("Multiple hypothesis languages found") - - systems = list(systems) - if len(hyp_langs) > 0: - hyp_docs = OrderedDict((system, OrderedDict()) for system in systems) - hyp_lang = hyp_langs.pop() - else: - hyp_docs = OrderedDict() - hyp_lang = None - - # Extract text - src_sent_count, doc_count = 0, 0 - for doc in tree.getroot().findall(".//doc"): - doc_id = doc.get("id") - src = [] - if "testsuite" in doc.attrib: - continue - doc_count += 1 - src_sents = {int(seg.get("id")): seg.text for seg in doc.findall(".//src//seg")} - - def get_sents(doc): - return { - int(seg.get("id")): seg.text if seg.text else "" - for seg in doc.findall(f".//seg") - } - - if ref_lang: - _ref_docs = doc.findall(".//ref") - trans_to_ref = {} - - # If no translator identifiers, we just read one reference (if any) - # If there are translator identifiers, we add a reference for each translator - if len(translators) == 1 and DEFAULT_TRANSLATOR in translators: - if len(_ref_docs): - trans_to_ref[DEFAULT_TRANSLATOR] = get_ref_sents(_ref_docs[0]) - else: - trans_to_ref[DEFAULT_TRANSLATOR] = {} - else: - trans_to_ref = { - ref_doc.get("translator"): get_sents(ref_doc) - for ref_doc in _ref_docs - } - - if hyp_lang: - _hyp_docs = doc.findall(".//hyp") - system_to_ref = { - hyp_doc.get("system"): get_sents(hyp_doc) for hyp_doc in _hyp_docs - } - - for seg_id in sorted(src_sents.keys()): - src.append([seg_id, src_sents[seg_id]]) - src_sent_count += 1 - if ref_lang: - for translator in translators: - if doc_id not in ref_docs[translator]: - ref_docs[translator][doc_id] = [] - - # _ref_text = trans_to_ref.get(translator, {translator: {}}).get( - _ref_text = trans_to_ref[translator].get(seg_id, missing_message) - ref_docs[translator][doc_id].append((seg_id, _ref_text)) - - if _ref_text == MISSING_TRANSLATION_MESSAGE: - print( - f'Warning: missing reference for translator {translator}, ' - f'document {doc_id}, segment {seg_id}' - ) - if hyp_lang: - for system in systems: - if doc_id not in hyp_docs[system]: - hyp_docs[system][doc_id] = [] - - # _hyp_text = system_to_ref.get(system, {system: {}}).get( - _hyp_text = system_to_ref[system].get(seg_id, missing_message) - hyp_docs[system][doc_id].append((seg_id, _hyp_text)) - - if _hyp_text == MISSING_TRANSLATION_MESSAGE: - print( - f'Warning: missing translation from {system}, ' - f'document {doc_id}, segment {seg_id}' - ) - - src_docs[doc_id] = src - - print( - f"Extracted {doc_count} document(s) containing {src_sent_count} sentences in {src_lang}" - ) - - return src_lang, src_docs, ref_lang, ref_docs, hyp_lang, hyp_docs - - -def _create_bad_ref(seg_text: str, ref_text: str, character_based: bool = False) -> str: - """ - Creates bad reference for given text. - - Segment length (a, b] to phrase length (excluding a, including b) - mapping defined as follows: - ( 0, 1] : 1 - ( 1, 5] : 2 - ( 5, 8] : 3 - ( 8, 15] : 4 - (15, 20] : 5 - (20, max] : 6 - - For character-based languages, which do not support tokenisation - by whitespace, the resulting phrase length will be doubled, and - is interpreted as a character length. - """ - seg_data = seg_text.split(' ') - ref_data = ref_text.split(' ') - - if character_based: - seg_data = [x for x in seg_text] - ref_data = [x for x in ref_text] - - seg_len = len(seg_data) - ref_len = len(ref_data) - - # Determine length of bad phrase, relative to segment length. 
- _seg_to_bad_mapping = { - (None, 1): 1, - (1, 5): 2, - (5, 8): 3, - (8, 15): 4, - (15, 20): 5, - (20, None): 6, - } - - bad_len = 0 - for seg_pair in _seg_to_bad_mapping: - left, right = seg_pair - - # seg_len == right; left edge case - if not left: - if seg_len == right: - bad_len = _seg_to_bad_mapping[seg_pair] - break - - # left < seg_len; right edge case - elif not right: - if left < seg_len: - bad_len = _seg_to_bad_mapping[seg_pair] - break - - # left < seg_len <= right; middle cases - elif left < seg_len <= right: - bad_len = _seg_to_bad_mapping[seg_pair] - break - - # Double length of bad phrase for character-based languages. - if character_based: - bad_len = 2 * bad_len - - # Determine random replacement position. For segments longer than - # (bad_len + 1), we enforce that this cannot be sentence initial - # or final, so positions 0 and (seg_len - bad_len -1) are invalid - # and we use an embedded bad_pos in [1, (seg_len - bad_len - 1)]. - # This happens for all seg_len > 3. - bad_pos = 0 - if seg_len - bad_len > 0: - bad_pos = choice(range(seg_len - bad_len)) - - elif seg_len > 3: - _xs = max(1, seg_len - bad_len - 1) - bad_pos = choice([x + 1 for x in range(_xs)]) - - ref_pos = 0 - if ref_len - bad_len > 0: - ref_pos = choice(range(ref_len - bad_len)) - - bad_data = ( - seg_data[:bad_pos] - + ref_data[ref_pos : ref_pos + bad_len] - + seg_data[bad_pos + bad_len :] - ) - bad_text = ' '.join(bad_data) - if character_based: - bad_text = ''.join(bad_data) - - # print(seg_text) - # print(bad_text) - # print('------------') - return bad_text - - -def create_bad_refs( - docs: Dict[str, List[Tuple[str, str]]], - refs: Dict[str, List[Tuple[str, str]]], - character_based: bool = False, -) -> Dict[str, List[Tuple[str, str]]]: - """ - Creates bad references for given documents. - - For each segment in the given documents, this creates a so-called - ``bad reference'' which is constructed by replacing an embedded - phrase p with a randomly placed phrase p' of the same length, - taken from a different segment contained in refs. The length of - the phrase is relative to the full segment length. - - See _create_bad_ref() definition for length mapping details. - """ - # Create mapping from f'{doc_id}_{seg_id}' to reference text. - all_refs = {} - for curr_doc_id, curr_doc in refs.items(): - for curr_seg_id, curr_ref_text in curr_doc: - all_refs[f'{curr_doc_id}_{curr_seg_id}'] = curr_ref_text - - # Create list of f'{doc_id}_{seg_id}' ids, to be used for random - # choice later when we want to identify a reference to work with. - all_keys = list(all_refs.keys()) - - # Iterate through documents and create bad references. - bad_docs: Dict[str, List[Tuple[str, str]]] = OrderedDict() - for curr_doc_id, curr_doc in docs.items(): - if not curr_doc_id in bad_docs: - bad_docs[curr_doc_id] = [] - - print(f'doc_id: {curr_doc_id},\tdoc_len: {len(curr_doc)}') - for curr_seg in curr_doc: - curr_seg_id, curr_seg_text = curr_seg - - # Bad reference id may not be identical to current id. - bad_id = choice(all_keys) - while bad_id == f'{curr_doc_id}_{curr_seg_id}': - bad_id = choice(all_keys) - - curr_bad_text = _create_bad_ref( - curr_seg_text, - all_refs[bad_id], - character_based=character_based, - ) - - # Ensure that keys can be reused. 
- all_keys.append(bad_id) - - bad_docs[curr_doc_id].append((curr_seg_id, curr_bad_text)) - - return bad_docs - - -if __name__ == "__main__": - if len(sys.argv) < 8: - print('Example usage:') - print( - f' {sys.argv[0]} newstest2021.en-de.all.xml batches.en-de enu deu 50 True False' - ) - exit() - - XML_FILE = sys.argv[1] # Path to .xml file with sources, references and outputs - OUT_NAME = sys.argv[2] # Prefix for .csv and .json output files - SRC_LANG = sys.argv[3] # Code for source language, e.g. eng - TGT_LANG = sys.argv[4] # Code for target language, e.g. deu - TASK_MAX = int(sys.argv[5]) # Maximum number of tasks - CONTROLS = sys.argv[6].lower() not in ['', '0', 'false', 'off'] # Generate QC items - CHARLANG = sys.argv[7].lower() in ['1', 'true', 'on'] # Character-based - print(f'Character based={CHARLANG}') - - ENC = 'utf-8' - - RND_SEED = 123456 - seed(RND_SEED) - - print(f'Quality control={CONTROLS}') - if CONTROLS: - REQUIRED_SEGS = 80 - else: - REQUIRED_SEGS = 100 - print(f'Setting REQUIRED_SEGS={REQUIRED_SEGS}') - - SYS_DOCS: Dict[str, Dict[str, List[Tuple[str, str]]]] = OrderedDict() - BAD_DOCS: Dict[str, Dict[str, List[Tuple[str, str]]]] = OrderedDict() - print(f'Loading docs from {XML_FILE}') - src_lang, SRC_DOCS, ref_lang, REF_DOCS, hyp_lang, SYS_DOCS = unwrap_xml( - XML_FILE, encoding=ENC - ) - - # This reference will be used for generating BAD items - REF_ID = sorted(list(REF_DOCS.keys()))[0] - print(f'Using reference "{REF_ID}"') - - # Add references as additional system outputs - if INCLUDE_REFERENCES_AS_SYSTEMS: - for ref_id in sorted(list(REF_DOCS.keys())): - sys_id = REFERENCE_AS_SYSTEM_PREFIX + ref_id - print(f'Adding reference "{ref_id}" as system output "{sys_id}"') - SYS_DOCS[sys_id] = REF_DOCS[ref_id] - - # List of system names that can be iterated deterministically - SYS_IDS = sorted(list(SYS_DOCS.keys())) - - for sys_id in SYS_IDS: - print(f'Generating bad references for {sys_id}') - BAD_DOCS[sys_id] = create_bad_refs( - SYS_DOCS[sys_id], REF_DOCS[REF_ID], character_based=CHARLANG - ) - - # pylint: disable-msg=invalid-name - some_sys_id = choice(SYS_IDS) - some_doc_id = choice(sorted(list(SYS_DOCS[some_sys_id].keys()))) - some_sys_text = SYS_DOCS[some_sys_id][some_doc_id] - some_bad_text = BAD_DOCS[some_sys_id][some_doc_id] - print(some_sys_id, some_doc_id) - - for _s, _b in zip(some_sys_text, some_bad_text): - print(_s) - print(_b) - print('---') - - DOC_STATS: Dict[int, List[Tuple[int, str, str]]] = OrderedDict() - for sys_id in SYS_IDS: - for doc_id in SYS_DOCS[sys_id].keys(): - doc_len = len(SYS_DOCS[sys_id][doc_id]) - - # We do not support documents longer than 70 segments. 
- if doc_len > MAX_DOC_LENGTH: - continue - - if not doc_len in DOC_STATS.keys(): - DOC_STATS[doc_len] = [] - - DOC_STATS[doc_len].append((doc_len, doc_id, sys_id)) - - # Randomise system order - for doc_len in DOC_STATS: - shuffle(DOC_STATS[doc_len]) - - print(DOC_STATS.keys()) - total_docs = 0 - total_sys = set() - for doc_len in DOC_STATS.keys(): - print(f'{doc_len}:\t{len(DOC_STATS[doc_len])}') - total_docs += len(DOC_STATS[doc_len]) - for x in DOC_STATS[doc_len]: - total_sys.add(x[2]) - - all_systems = list(total_sys) - sampled_tasks: List[Tuple[Tuple[int, str, str], ...]] = [] - CURR_LEN = 0 - CURR_SYS = 0 - curr_task: List[Tuple[int, str, str]] = [] - while DOC_STATS.keys(): - ALL_KEYS = sorted(list(DOC_STATS.keys())) - max_delta = REQUIRED_SEGS - CURR_LEN - valid_keys = [x for x in ALL_KEYS if x <= max_delta] - - if not valid_keys: - print(CURR_LEN) - print(curr_task) - print('------') - sampled_tasks.append(tuple(curr_task)) - CURR_LEN = 0 - curr_task = [] - continue - - if max_delta in valid_keys: - curr_key = max_delta - else: - curr_key = choice(valid_keys) - - CURR_LEN += curr_key - - curr_val = DOC_STATS[curr_key].pop(0) # This takes a random system. - - # Below code would pick systems one after the other - # - # curr_val = None - # for iter_val in DOC_STATS[curr_key]: - # if iter_val[2] == all_systems[CURR_SYS]: - # curr_val = iter_val - # DOC_STATS[curr_key].remove(iter_val) - # break - # - # if not curr_val: - # curr_val = DOC_STATS[curr_key].pop(0) - # CURR_SYS = all_systems.index(curr_val[2]) - # CURR_SYS = (CURR_SYS + 1) % len(all_systems) - - curr_task.append(curr_val) - if not DOC_STATS[curr_key]: - DOC_STATS.pop(curr_key) - - # Shuffle order of tasks - shuffle(sampled_tasks) - - padded_tasks: List[Tuple[Tuple[int, str, str], ...]] = [] - for tid, task in enumerate(sampled_tasks): - task_docs = len(task) - task_len = sum([x[0] for x in task]) - print(f'task_len: {task_len}') - if task_len > MAX_TASK_SIZE: - raise NotImplementedError( - 'No support for tasks >{0} items!'.format(MAX_TASK_SIZE) - ) - - elif task_len < MAX_TASK_SIZE: - pad_size = MAX_TASK_SIZE - task_len - pad_data: List[Tuple[int, str, str]] = list(task) - pad_pos = 0 - while pad_size > 0: - print(f'pad_size: {pad_size}') - print(f'pad_pos: {pad_pos}') - pad_data.append(tuple(list(pad_data[pad_pos]) + [True])) # type: ignore - print(pad_data[-1]) - pad_size -= pad_data[-1][0] - pad_pos = (pad_pos + 1) % task_docs - if pad_size < 0: - print(f'pad_size: {pad_size}') - print(f'pad_pos: {pad_pos}') - - last_doc: Tuple[int, str, str] = pad_data[-1] - print(last_doc[0], '-->', last_doc[0] + pad_size) - fixed_doc = (last_doc[0] + pad_size, *last_doc[1:]) - pad_data[-1] = fixed_doc - print(pad_data[-1][0]) - padded_tasks.append(tuple(pad_data)) - print(padded_tasks[-1]) - - else: - print(f'WARNING: no control items in task no. {tid}') - # raise NotImplementedError('Needs isControl=True update!') - padded_tasks.append(tuple(task)) # TODO: does this ever occur? 
- - csv_data = [] - task_id = 0 - for task in padded_tasks: - task_id += 1 - task_len = sum([x[0] for x in task]) - print(f'task_len: {task_len}') - - for _doc in task: - _data = [str(task_id)] - for x in _doc: # type: ignore - _data.append(str(x)) - - if _data[-1] != 'True': - _data.append('False') # isControl=False - print(_data) - csv_data.append(','.join(_data)) - - with open(f'{OUT_NAME}.csv', mode='w') as _file: - for csv_line in csv_data: - _file.write(csv_line) - _file.write('\n') - - json_data = [] - batch_id = 0 - for task in padded_tasks[:TASK_MAX]: - # Remember, batch numbers are one-based - task_data = OrderedDict( - { - 'batchNo': batch_id + 1, - 'batchSize': 100, - 'sourceLanguage': SRC_LANG, - 'targetLanguage': TGT_LANG, - 'requiredAnnotations': 1, - 'randomSeed': RND_SEED, - } - ) - - source_id = basename(XML_FILE) - - items_data: List[List[Dict[str, Any]]] = [] # Keeps items grouped into document - _item = 0 - doc_counter = 0 - for doc_data in task: - items_data.append([]) # Add a new bucket for items from this documents - has_control_item = False - - doc_len, doc_id, sys_id, *rest = doc_data # type: ignore - - isControl = rest is not None and rest - - target_id = sys_id - - _src = {} - _ref = {} - _bad = {} - _tgt = {} - - for item_id, item_src in SRC_DOCS[doc_id]: - seg_id = f'{doc_id}_{item_id}' - _src[seg_id] = item_src - - for item_id, item_ref in REF_DOCS[REF_ID][doc_id]: - seg_id = f'{doc_id}_{item_id}' - _ref[seg_id] = item_ref - - for item_id, item_bad in BAD_DOCS[sys_id][doc_id]: - seg_id = f'{doc_id}_{item_id}' - _bad[seg_id] = item_bad - - for item_id, item_tgt in SYS_DOCS[sys_id][doc_id]: - seg_id = f'{doc_id}_{item_id}' - _tgt[seg_id] = item_tgt - - seg_counter = 0 - context_src: List[Text] = [] - context_ref: List[Text] = [] - context_bad: List[Text] = [] - context_tgt: List[Text] = [] - for seg_id in _src: - if seg_counter >= doc_len: # Padding tasks are shorter! 
- break - item_src = _src[seg_id] - item_ref = _ref[seg_id] - item_bad = _bad[seg_id] - item_tgt = _tgt[seg_id] - - target_text = item_tgt - target_type = 'TGT' - - # Do not generate any BAD items if QC is disabled - if CONTROLS and isControl: - randomCoinFlip = choice( - [False, False, True, True, True] # 60:40 chance - ) - if randomCoinFlip: - target_text = item_bad - target_type = 'BAD' - has_control_item = True - - obj: Dict[str, Any] = OrderedDict() - obj['_item'] = _item - obj['_block'] = -1 - obj['sourceID'] = source_id - obj['sourceContextLeft'] = ' '.join(context_src) - obj['sourceText'] = item_src - obj['targetID'] = target_id - obj['targetContextLeft'] = ' '.join(context_tgt) - obj['targetText'] = target_text - obj['itemID'] = seg_counter - obj['itemType'] = target_type - obj['documentID'] = doc_id - obj['isCompleteDocument'] = False - - # print(seg_id) - # print(' '.join(context_src)) - # print(item_src) - # print('...') - # print(' '.join(context_tgt)) - # print(item_tgt.encode('utf-8')) - # print('---') - - context_src.append(item_src) - context_ref.append(item_ref) - context_bad.append(item_bad) - context_tgt.append(target_text) - - items_data[-1].append(obj) - _item += 1 - seg_counter += 1 - - obj = OrderedDict() - obj['_item'] = _item - obj['_block'] = -1 - obj['sourceID'] = source_id - obj['sourceText'] = ' '.join(context_src) # full document - obj['targetID'] = target_id - obj['targetText'] = ' '.join(context_tgt) # full document - obj['itemID'] = item_id - obj['itemType'] = 'TGT' - obj['documentID'] = doc_id - obj['isCompleteDocument'] = True - items_data[-1].append(obj) - - if has_control_item and SHUFFLE_DOCS_WITH_CONTROL_ITEMS: - # Move the document with control items to a random position so - # that they are not accumulated as very last documents - _bad_doc = items_data.pop() - _pos = randint(0, len(items_data) - 1) - print(f' Moving the last QC document to position {_pos}') - items_data.insert(_pos, _bad_doc) - - # Extract items from documents - _items_data = [item for doc_items in items_data for item in doc_items] - # Re-assign _item numbers - if SHUFFLE_DOCS_WITH_CONTROL_ITEMS: - _item = 0 - for i in range(len(_items_data)): - _items_data[i]['_item'] = _item - if _items_data[i]['isCompleteDocument'] == False: - _item += 1 - - output_data = OrderedDict({'task': task_data, 'items': _items_data}) - - json_data.append(output_data) - - # write out JSON - json_text = json_dumps(json_data, indent=2, sort_keys=True) - - json_file_name = f'{OUT_NAME}.json' - with open(json_file_name, mode='w', encoding='utf8') as out_file: - sys.stdout.write( - 'Creating {0}, batch no. {1} ... 
'.format(json_file_name, batch_id + 1), - ) - out_file.write(str(json_text)) - sys.stdout.write('OK\n') - - batch_id += 1 - - print(f'Total tasks: {len(sampled_tasks)}') - print(f'Total docs: {total_docs}') - print(f'Total sys: {len(total_sys)} {sorted(list(total_sys))}') diff --git a/scripts/create_wmt22_tasks.py b/scripts/create_wmt22_tasks.py deleted file mode 100644 index 43e8e0bd..00000000 --- a/scripts/create_wmt22_tasks.py +++ /dev/null @@ -1,1023 +0,0 @@ -# pylint: disable=C0103,C0111,C0330,E1101 -import argparse -import sys -from collections import OrderedDict -from copy import deepcopy -from glob import iglob -from json import dumps as json_dumps -from os.path import basename -from os.path import join -from random import choice -from random import randint -from random import seed -from random import shuffle -from typing import Any -from typing import Dict -from typing import List -from typing import Text -from typing import Tuple - -from lxml import etree - - -MAX_TASK_SIZE = 100 # No support for tasks over 100 items -MAX_DOC_LENGTH = 70 # We do not support documents longer than 70 segments - -MISSING_TRANSLATION_MESSAGE = ("NO TRANSLATION AVAILABLE",) -DEFAULT_TRANSLATOR = "DEFAULT" -# If False, documents with control items will be very last ones in each batch -SHUFFLE_DOCS_WITH_CONTROL_ITEMS = True -# If True, add references as additional system outputs -INCLUDE_REFERENCES_AS_SYSTEMS = True -# If True, documents may be oversampled to form the last batch -USE_ALL_DOCUMENTS_AND_ALL_SYSTEMS = True -REFERENCE_AS_SYSTEM_PREFIX = 'translator-' - - -def unwrap_xml( - xml_file, - missing_message=MISSING_TRANSLATION_MESSAGE, - encoding='utf-8', -): - """ - Unwraps an xml file in WMT format, producing source and (if present) reference files - - :param xml_file: The xml file (or fd) - :param missing_message: The message to insert when no reference - - :returns: src_lang, src_lines, ref_lang, ref_lines, hyp_lang, hyp_lines - - ref_lines maps translator to document to tuples of segment id and line text - hyp_lines maps system to document to tuples of segment id and line text - - ref_lang and hyp_lang may be None, and then their lines are empty - note: a single language is assumed for each of sources, refs and hyps - - This function has been extracted from - https://github.com/wmt-conference/wmt-format-tools/wmtformat/unwrap.py with - some modifications - """ - tree = etree.parse(xml_file) - - # Find and check the documents (src, ref, hyp) - src_langs, ref_langs, hyp_langs, translators, systems = ( - set(), - set(), - set(), - set(), - set(), - ) - - for src_doc in tree.getroot().findall(".//src"): - src_langs.add(src_doc.get("lang")) - - for ref_doc in tree.getroot().findall(".//ref"): - ref_langs.add(ref_doc.get("lang")) - translator = ref_doc.get("translator") - if translator: - translators.add(translator) - - for hyp_doc in tree.getroot().findall(".//hyp"): - hyp_langs.add(hyp_doc.get("lang")) - systems.add(hyp_doc.get("system")) - - if len(src_langs) > 1: - raise RuntimeError("Multiple source languages found") - - if len(src_langs) == 0: - raise RuntimeError("No source languages found") - - src_lang = src_langs.pop() - src_docs = OrderedDict() - - if len(ref_langs) > 1: - raise RuntimeError("Multiple reference languages found") - - translators = list(translators) - if len(ref_langs) > 0: - if len(translators) == 0: - print("No translator identifiers found") - translators.append(DEFAULT_TRANSLATOR) - ref_lang = ref_langs.pop() - ref_docs = OrderedDict( - (translator, OrderedDict()) for 
translator in translators - ) - else: - print("No references found") - ref_lang = None - ref_docs = OrderedDict() - - if len(hyp_langs) > 1: - raise RuntimeError(f"Multiple hypothesis languages found: {hyp_langs}") - - systems = list(systems) - if len(hyp_langs) > 0: - hyp_docs = OrderedDict((system, OrderedDict()) for system in systems) - hyp_lang = hyp_langs.pop() - else: - hyp_docs = OrderedDict() - hyp_lang = None - - # Extract text - src_sent_count, doc_count = 0, 0 - for doc in tree.getroot().findall(".//doc"): - doc_id = doc.get("id") - src = [] - if "testsuite" in doc.attrib: - continue - doc_count += 1 - src_sents = {int(seg.get("id")): seg.text for seg in doc.findall(".//src//seg")} - - def get_sents(doc): - return { - int(seg.get("id")): seg.text if seg.text else "" - for seg in doc.findall(f".//seg") - } - - if ref_lang: - _ref_docs = doc.findall(".//ref") - trans_to_ref = {} - - # If no translator identifiers, we just read one reference (if any) - # If there are translator identifiers, we add a reference for each translator - if len(translators) == 1 and DEFAULT_TRANSLATOR in translators: - if len(_ref_docs): - trans_to_ref[DEFAULT_TRANSLATOR] = get_ref_sents(_ref_docs[0]) - else: - trans_to_ref[DEFAULT_TRANSLATOR] = {} - else: - trans_to_ref = { - ref_doc.get("translator"): get_sents(ref_doc) - for ref_doc in _ref_docs - } - - if hyp_lang: - _hyp_docs = doc.findall(".//hyp") - system_to_ref = { - hyp_doc.get("system"): get_sents(hyp_doc) for hyp_doc in _hyp_docs - } - - for seg_id in sorted(src_sents.keys()): - src.append([seg_id, src_sents[seg_id]]) - src_sent_count += 1 - if ref_lang: - for translator in translators: - if doc_id not in ref_docs[translator]: - ref_docs[translator][doc_id] = [] - - # _ref_text = trans_to_ref.get(translator, {translator: {}}).get( - _ref_text = trans_to_ref[translator].get(seg_id, missing_message) - ref_docs[translator][doc_id].append((seg_id, _ref_text)) - - if _ref_text == MISSING_TRANSLATION_MESSAGE: - print( - f'Warning: missing reference for translator {translator}, ' - f'document {doc_id}, segment {seg_id}' - ) - if hyp_lang: - for system in systems: - if doc_id not in hyp_docs[system]: - hyp_docs[system][doc_id] = [] - - # _hyp_text = system_to_ref.get(system, {system: {}}).get( - _hyp_text = system_to_ref[system].get(seg_id, missing_message) - hyp_docs[system][doc_id].append((seg_id, _hyp_text)) - - if _hyp_text == MISSING_TRANSLATION_MESSAGE: - print( - f'Warning: missing translation from {system}, ' - f'document {doc_id}, segment {seg_id}' - ) - - src_docs[doc_id] = src - - print( - f"Extracted {doc_count} document(s) containing {src_sent_count} sentences in {src_lang}" - ) - - return src_lang, src_docs, ref_lang, ref_docs, hyp_lang, hyp_docs - - -def chop_docs(orig_src_docs, orig_ref_docs, orig_hyp_docs, max_length=10): - """ - Split documents into chunks of max_length size. 
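-
-    For example (illustrative only), a 25-segment document "docA" split with
-    max_length=10 produces chunks "docA.0" (segments 1-10), "docA.1"
-    (segments 11-20) and "docA.2" (segments 21-25); each chunk also keeps
-    the segments before and after it as left/right context.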
- """ - src_docs = OrderedDict() - src_prev = OrderedDict() - src_next = OrderedDict() - for doc_id, segs in orig_src_docs.items(): - for chunk_id, (chunk, prev_ctx, next_ctx) in enumerate( - _split_list(segs, max_length) - ): - src_docs[f"{doc_id}.{chunk_id}"] = list(chunk) - src_prev[f"{doc_id}.{chunk_id}"] = list(prev_ctx) - src_next[f"{doc_id}.{chunk_id}"] = list(next_ctx) - - ref_docs = OrderedDict() - hyp_prev = OrderedDict() - hyp_next = OrderedDict() - for translator in orig_ref_docs: - ref_docs[translator] = OrderedDict() - hyp_prev[REFERENCE_AS_SYSTEM_PREFIX + translator] = OrderedDict() - hyp_next[REFERENCE_AS_SYSTEM_PREFIX + translator] = OrderedDict() - for doc_id, segs in orig_ref_docs[translator].items(): - for chunk_id, (chunk, prev_ctx, next_ctx) in enumerate( - _split_list(segs, max_length) - ): - ref_docs[translator][f"{doc_id}.{chunk_id}"] = list(chunk) - hyp_prev[REFERENCE_AS_SYSTEM_PREFIX + translator][ - f"{doc_id}.{chunk_id}" - ] = list(prev_ctx) - hyp_next[REFERENCE_AS_SYSTEM_PREFIX + translator][ - f"{doc_id}.{chunk_id}" - ] = list(next_ctx) - - hyp_docs = OrderedDict() - for system in orig_hyp_docs: - hyp_docs[system] = OrderedDict() - hyp_prev[system] = OrderedDict() - hyp_next[system] = OrderedDict() - for doc_id, segs in orig_hyp_docs[system].items(): - for chunk_id, (chunk, prev_ctx, next_ctx) in enumerate( - _split_list(segs, max_length) - ): - hyp_docs[system][f"{doc_id}.{chunk_id}"] = list(chunk) - hyp_prev[system][f"{doc_id}.{chunk_id}"] = list(prev_ctx) - hyp_next[system][f"{doc_id}.{chunk_id}"] = list(next_ctx) - - # print(src_prev) - return src_docs, ref_docs, hyp_docs, src_prev, src_next, hyp_prev, hyp_next - - -def select_docs(orig_src_docs, orig_ref_docs, orig_hyp_docs, tsv_file): - """ - Extract preselected segments from given documents and corresponding contexts. 
- """ - selected_docs = [] - print("Selecting the following documents only:") - with open(tsv_file, "r", encoding="utf8") as tsv: - for line in tsv: - _docid, _segid_first, _segid_last = line.strip().split("\t") - selected_docs.append((_docid, int(_segid_first), int(_segid_last))) - print(f" {selected_docs[-1]}") - - src_docs = OrderedDict() - src_prev = OrderedDict() - src_next = OrderedDict() - for doc_id, seg_id_1, seg_id_2 in selected_docs: - if doc_id not in orig_src_docs: - print( - f"Error: the selected document {doc_id} not found in the XML file/src" - ) - exit() - segs = orig_src_docs[doc_id] - chunk = segs[seg_id_1 - 1 : seg_id_2] - prev_ctx = segs[0 : seg_id_1 - 1] - next_ctx = segs[seg_id_2:] - chunk_id = f"#{seg_id_1}-{seg_id_2}" - - src_docs[f"{doc_id}{chunk_id}"] = chunk - src_prev[f"{doc_id}{chunk_id}"] = prev_ctx - src_next[f"{doc_id}{chunk_id}"] = next_ctx - - ref_docs = OrderedDict() - hyp_prev = OrderedDict() - hyp_next = OrderedDict() - for translator in orig_ref_docs: - ref_docs[translator] = OrderedDict() - hyp_prev[REFERENCE_AS_SYSTEM_PREFIX + translator] = OrderedDict() - hyp_next[REFERENCE_AS_SYSTEM_PREFIX + translator] = OrderedDict() - - for doc_id, seg_id_1, seg_id_2 in selected_docs: - if doc_id not in orig_ref_docs[translator]: - print( - f"Error: the selected document {doc_id} not found in the XML file/ref" - ) - exit() - - segs = orig_ref_docs[translator][doc_id] - chunk = segs[seg_id_1 - 1 : seg_id_2] - prev_ctx = segs[0 : seg_id_1 - 1] - next_ctx = segs[seg_id_2:] - chunk_id = f"#{seg_id_1}-{seg_id_2}" - - ref_docs[translator][f"{doc_id}{chunk_id}"] = chunk - hyp_prev[REFERENCE_AS_SYSTEM_PREFIX + translator][ - f"{doc_id}{chunk_id}" - ] = prev_ctx - hyp_next[REFERENCE_AS_SYSTEM_PREFIX + translator][ - f"{doc_id}{chunk_id}" - ] = next_ctx - - hyp_docs = OrderedDict() - for system in orig_hyp_docs: - hyp_docs[system] = OrderedDict() - hyp_prev[system] = OrderedDict() - hyp_next[system] = OrderedDict() - - for doc_id, seg_id_1, seg_id_2 in selected_docs: - if doc_id not in orig_hyp_docs[system]: - print( - f"Error: the selected document {doc_id} not found in the XML file/hyp" - ) - exit() - - segs = orig_hyp_docs[system][doc_id] - chunk = segs[seg_id_1 - 1 : seg_id_2] - prev_ctx = segs[0 : seg_id_1 - 1] - next_ctx = segs[seg_id_2:] - chunk_id = f"#{seg_id_1}-{seg_id_2}" - - hyp_docs[system][f"{doc_id}{chunk_id}"] = chunk - hyp_prev[system][f"{doc_id}{chunk_id}"] = prev_ctx - hyp_next[system][f"{doc_id}{chunk_id}"] = next_ctx - - return src_docs, ref_docs, hyp_docs, src_prev, src_next, hyp_prev, hyp_next - - -def _split_list(list_a, chunk_size): - for i in range(0, len(list_a), chunk_size): - prev_context = list_a[0:i] - next_context = list_a[i + chunk_size :] - yield list_a[i : i + chunk_size], prev_context, next_context - - -def _create_bad_ref(seg_text: str, ref_text: str, character_based: bool = False) -> str: - """ - Creates bad reference for given text. - - Segment length (a, b] to phrase length (excluding a, including b) - mapping defined as follows: - ( 0, 1] : 1 - ( 1, 5] : 2 - ( 5, 8] : 3 - ( 8, 15] : 4 - (15, 20] : 5 - (20, max] : 6 - - For character-based languages, which do not support tokenisation - by whitespace, the resulting phrase length will be doubled, and - is interpreted as a character length. 
- """ - seg_data = seg_text.split(' ') - ref_data = ref_text.split(' ') - - if character_based: - seg_data = [x for x in seg_text] - ref_data = [x for x in ref_text] - - seg_len = len(seg_data) - ref_len = len(ref_data) - - # Determine length of bad phrase, relative to segment length. - _seg_to_bad_mapping = { - (None, 1): 2, - (1, 5): 2, - (5, 8): 3, - (8, 15): 4, - (15, 20): 5, - (20, None): 6, - } - - bad_len = 0 - for seg_pair in _seg_to_bad_mapping: - left, right = seg_pair - - # seg_len == right; left edge case - if not left: - if seg_len == right: - bad_len = _seg_to_bad_mapping[seg_pair] - break - - # left < seg_len; right edge case - elif not right: - if left < seg_len: - bad_len = _seg_to_bad_mapping[seg_pair] - break - - # left < seg_len <= right; middle cases - elif left < seg_len <= right: - bad_len = _seg_to_bad_mapping[seg_pair] - break - - # Double length of bad phrase for character-based languages. - if character_based: - bad_len = 2 * bad_len - - # Determine random replacement position. For segments longer than - # (bad_len + 1), we enforce that this cannot be sentence initial - # or final, so positions 0 and (seg_len - bad_len -1) are invalid - # and we use an embedded bad_pos in [1, (seg_len - bad_len - 1)]. - # This happens for all seg_len > 3. - bad_pos = 0 - if seg_len - bad_len > 0: - bad_pos = choice(range(seg_len - bad_len)) - - elif seg_len > 3: - _xs = max(1, seg_len - bad_len - 1) - bad_pos = choice([x + 1 for x in range(_xs)]) - - ref_pos = 0 - if ref_len - bad_len > 0: - ref_pos = choice(range(ref_len - bad_len)) - - bad_data = ( - seg_data[:bad_pos] - + ref_data[ref_pos : ref_pos + bad_len] - + seg_data[bad_pos + bad_len :] - ) - bad_text = ' '.join(bad_data) - if character_based: - bad_text = ''.join(bad_data) - - # print(seg_text) - # print(bad_text) - # print('------------') - return bad_text - - -def create_bad_refs( - docs: Dict[str, List[Tuple[str, str]]], - refs: Dict[str, List[Tuple[str, str]]], - character_based: bool = False, -) -> Dict[str, List[Tuple[str, str]]]: - """ - Creates bad references for given documents. - - For each segment in the given documents, this creates a so-called - ``bad reference'' which is constructed by replacing an embedded - phrase p with a randomly placed phrase p' of the same length, - taken from a different segment contained in refs. The length of - the phrase is relative to the full segment length. - - See _create_bad_ref() definition for length mapping details. - """ - # Create mapping from f'{doc_id}_{seg_id}' to reference text. - all_refs = {} - for curr_doc_id, curr_doc in refs.items(): - for curr_seg_id, curr_ref_text in curr_doc: - all_refs[f'{curr_doc_id}_{curr_seg_id}'] = curr_ref_text - - # Create list of f'{doc_id}_{seg_id}' ids, to be used for random - # choice later when we want to identify a reference to work with. - all_keys = list(all_refs.keys()) - - # Iterate through documents and create bad references. - bad_docs: Dict[str, List[Tuple[str, str]]] = OrderedDict() - for curr_doc_id, curr_doc in docs.items(): - if not curr_doc_id in bad_docs: - bad_docs[curr_doc_id] = [] - - print(f'doc_id: {curr_doc_id},\tdoc_len: {len(curr_doc)}') - for curr_seg in curr_doc: - curr_seg_id, curr_seg_text = curr_seg - - # Bad reference id may not be identical to current id. 
- bad_id = choice(all_keys) - while bad_id == f'{curr_doc_id}_{curr_seg_id}': - bad_id = choice(all_keys) - - curr_bad_text = _create_bad_ref( - curr_seg_text, - all_refs[bad_id], - character_based=character_based, - ) - - # Ensure that keys can be reused. - all_keys.append(bad_id) - - bad_docs[curr_doc_id].append((curr_seg_id, curr_bad_text)) - - return bad_docs - - -def parse_cmd_args(): - parser = argparse.ArgumentParser() - parser.add_argument( - "-f", - "--xml-file", - help="path to .xml file with sources, references and system outputs", - required=True, - ) - parser.add_argument( - "-o", - "--output-prefix", - help="prefix for .csv and .json output files", - required=True, - ) - parser.add_argument( - "-s", - "--src-lang", - help="ISO code for source language for Appraise", - required=True, - ) - parser.add_argument( - "-t", - "--tgt-lang", - help="ISO code for target language for Appraise", - required=True, - ) - parser.add_argument( - "-c", - "--char-based", - help="target language is character-based", - action="store_true", - ) - parser.add_argument( - "--no-qc", - help="do not generate BAD references as quality control items", - action="store_true", - ) - parser.add_argument( - "--max-tasks", - help="maximum number of tasks to generate, default: 100", - type=int, - default=100, - ) - parser.add_argument( - "--max-segs", - help="maximum number of sentences per document", - type=int, - default=MAX_DOC_LENGTH, - ) - parser.add_argument( - "--rng-seed", - help="seed for random number generator", - type=int, - default=123456, - ) - parser.add_argument( - "--selected-docs", - help="path to a file with preselected documents; format: docid segid1 segid2", - ) - parser.add_argument( - "--static-context", - help="number of preceding/succesive segments to show as a static context", - type=int, - default=MAX_DOC_LENGTH, # a large number should use all available segments - ) - parser.add_argument( - "--even", - help="duplicate one task is necessary to keep the total number of tasks even", - action="store_true", - ) - args = parser.parse_args() - return ( - args.xml_file, - args.output_prefix, - args.src_lang, - args.tgt_lang, - args.char_based, - not args.no_qc, - args.max_tasks, - args.max_segs, - args.rng_seed, - args.selected_docs, - args.static_context, - args.even, - ) - - -if __name__ == "__main__": - """ - Example usage: - python3 create_wmt22_tasks.py -f newstest2021.en-de.all.xml -o batches.en-de -s enu -t deu -m 50 - """ - - ( - XML_FILE, - OUT_NAME, - SRC_LANG, - TGT_LANG, - CHARLANG, - CONTROLS, - TASK_MAX, - MAX_SEGS, - RND_SEED, - SELECTED, - CTX_SIZE, - EVEN_NUM, - ) = parse_cmd_args() - - print(f'Character based={CHARLANG}') - ENC = 'utf-8' - seed(RND_SEED) - - print(f'Quality control={CONTROLS}') - if not CONTROLS or TGT_LANG == 'sgg': # no BAD refs if the target size has videos - REQUIRED_SEGS = 100 - else: - REQUIRED_SEGS = 80 - print(f'Setting REQUIRED_SEGS={REQUIRED_SEGS}') - - SYS_DOCS: Dict[str, Dict[str, List[Tuple[str, str]]]] = OrderedDict() - BAD_DOCS: Dict[str, Dict[str, List[Tuple[str, str]]]] = OrderedDict() - print(f'Loading docs from {XML_FILE}') - src_lang, SRC_DOCS, ref_lang, REF_DOCS, hyp_lang, SYS_DOCS = unwrap_xml( - XML_FILE, encoding=ENC - ) - - if SELECTED: - docs_tuple = select_docs(SRC_DOCS, REF_DOCS, SYS_DOCS, SELECTED) - else: - docs_tuple = chop_docs(SRC_DOCS, REF_DOCS, SYS_DOCS, MAX_SEGS) - - ( - SRC_DOCS, - REF_DOCS, - SYS_DOCS, - SRC_PREV, - SRC_NEXT, - SYS_PREV, - SYS_NEXT, - ) = docs_tuple - - # This reference will be used for generating BAD 
items - REF_ID = sorted(list(REF_DOCS.keys()))[0] - print(f'Using reference "{REF_ID}"') - - # Add references as additional system outputs - if INCLUDE_REFERENCES_AS_SYSTEMS: - for ref_id in sorted(list(REF_DOCS.keys())): - sys_id = REFERENCE_AS_SYSTEM_PREFIX + ref_id - print(f'Adding reference "{ref_id}" as system output "{sys_id}"') - SYS_DOCS[sys_id] = REF_DOCS[ref_id] - - # List of system names that can be iterated deterministically - SYS_IDS = sorted(list(SYS_DOCS.keys())) - print("SYS IDS size:", len(SYS_IDS)) - - for sys_id in SYS_IDS: - print(f'Generating bad references for {sys_id}') - BAD_DOCS[sys_id] = create_bad_refs( - SYS_DOCS[sys_id], REF_DOCS[REF_ID], character_based=CHARLANG - ) - - # pylint: disable-msg=invalid-name - some_sys_id = choice(SYS_IDS) - some_doc_id = choice(sorted(list(SYS_DOCS[some_sys_id].keys()))) - some_sys_text = SYS_DOCS[some_sys_id][some_doc_id] - some_bad_text = BAD_DOCS[some_sys_id][some_doc_id] - print("Example:", some_sys_id, some_doc_id) - - for _s, _b in zip(some_sys_text, some_bad_text): - print(_s) - print(_b) - print('---') - - DOC_STATS: Dict[int, List[Tuple[int, str, str]]] = OrderedDict() - for sys_id in SYS_IDS: - for doc_id in SYS_DOCS[sys_id].keys(): - doc_len = len(SYS_DOCS[sys_id][doc_id]) - - # We do not support documents longer than 70 segments. - if doc_len > MAX_DOC_LENGTH: - print("!!! DOCUMENT TOO LONG:", doc_id) - continue - - if not doc_len in DOC_STATS.keys(): - DOC_STATS[doc_len] = [] - DOC_STATS[doc_len].append((doc_len, doc_id, sys_id)) - - # Randomise system order - for doc_len in DOC_STATS: - shuffle(DOC_STATS[doc_len]) - - print("Doc. stats (doc.len/count):", DOC_STATS.keys()) - total_docs = 0 - total_sys = set() - for doc_len in DOC_STATS.keys(): - print(f' {doc_len}:\t{len(DOC_STATS[doc_len])}') - total_docs += len(DOC_STATS[doc_len]) - for x in DOC_STATS[doc_len]: - total_sys.add(x[2]) - print("total docs:", total_docs) - print("total sys:", total_sys) - - all_systems = list(total_sys) - sampled_tasks: List[Tuple[Tuple[int, str, str], ...]] = [] - CURR_LEN = 0 - CURR_SYS = 0 - curr_task: List[Tuple[int, str, str]] = [] - DOC_STATS_COPY = deepcopy(DOC_STATS) - last_task = False - while DOC_STATS.keys(): - ALL_KEYS = sorted(list(DOC_STATS.keys())) - # Maximum allowed length of a document to not exceed 100 segments in this task - max_delta = REQUIRED_SEGS - CURR_LEN - valid_keys = [x for x in ALL_KEYS if x <= max_delta] - - if not valid_keys: - print(" #segments in current task:", CURR_LEN) - for _doc in curr_task: - print(" ", _doc) - print('------') - sampled_tasks.append(tuple(curr_task)) - CURR_LEN = 0 - curr_task = [] - if last_task: # Stop if this was the last task with - break - continue - - # Take the document that fill in the allowed size perfectly, or random - if max_delta in valid_keys: - curr_key = max_delta - else: - curr_key = choice(valid_keys) - - CURR_LEN += curr_key - curr_val = DOC_STATS[curr_key].pop(0) # This takes a random system. - # print(' ... selected ', curr_val) - # print(' .. 
left systems', sum( len(DOC_STATS[k]) for k in DOC_STATS )) - - # Below code would pick systems one after the other - # curr_val = None - # for iter_val in DOC_STATS[curr_key]: - # if iter_val[2] == all_systems[CURR_SYS]: - # curr_val = iter_val - # DOC_STATS[curr_key].remove(iter_val) - # break - - # if not curr_val: - # curr_val = DOC_STATS[curr_key].pop(0) - # CURR_SYS = all_systems.index(curr_val[2]) - # CURR_SYS = (CURR_SYS + 1) % len(all_systems) - - curr_task.append(curr_val) - if not DOC_STATS[curr_key]: - DOC_STATS.pop(curr_key) - - # If there are some documents left that cannot form a full task with - # 100 segments, take random documents to create the last task. - # This ensures that all documents have been used at least once. - if ( - USE_ALL_DOCUMENTS_AND_ALL_SYSTEMS - and len(DOC_STATS) == 0 - and len(curr_task) > 0 - ): - DOC_STATS = DOC_STATS_COPY - last_task = True - print('Creating last batch with padded documents') - - # print("------------") - # print("Left docs:") - # print(DOC_STATS) - # print("------------") - - # Print documents per system - _all_tasks = [] - for _tup in sampled_tasks: - _all_tasks += list(_tup) - _docs_by_sys: Dict[str, Any] = {} - for (_, docid, sysid) in _all_tasks: - if sysid not in _docs_by_sys: - _docs_by_sys[sysid] = [] - _docs_by_sys[sysid].append(docid) - for i, sysid in enumerate(_docs_by_sys): - print(i, sysid) - for j, docid in enumerate(sorted(_docs_by_sys[sysid])): - print(" ", j, docid) - - # Shuffle order of tasks - shuffle(sampled_tasks) - print("Total number of tasks:", len(sampled_tasks)) - - padded_tasks: List[Tuple[Tuple[int, str, str], ...]] = [] - for tid, task in enumerate(sampled_tasks): - task_docs = len(task) - task_len = sum([x[0] for x in task]) - print(f'task_len: {task_len}') - if task_len > MAX_TASK_SIZE: - raise NotImplementedError( - 'No support for tasks >{0} items!'.format(MAX_TASK_SIZE) - ) - - elif task_len < MAX_TASK_SIZE: - pad_size = MAX_TASK_SIZE - task_len - pad_data: List[Tuple[int, str, str]] = list(task) - pad_pos = 0 - while pad_size > 0: - print(f'pad_size: {pad_size}') - print(f'pad_pos: {pad_pos}') - pad_data.append(tuple(list(pad_data[pad_pos]) + [True])) # type: ignore - print(pad_data[-1]) - pad_size -= pad_data[-1][0] - pad_pos = (pad_pos + 1) % task_docs - if pad_size < 0: - print(f'pad_size: {pad_size}') - print(f'pad_pos: {pad_pos}') - - last_doc: Tuple[int, str, str] = pad_data[-1] - print(last_doc[0], '-->', last_doc[0] + pad_size) - fixed_doc = (last_doc[0] + pad_size, *last_doc[1:]) - pad_data[-1] = fixed_doc - print(pad_data[-1][0]) - padded_tasks.append(tuple(pad_data)) - print("Padded tasks:") - for _pad in padded_tasks[-1]: - print(" ", _pad) - - else: - print(f'WARNING: no control items in task no. {tid}') - # raise NotImplementedError('Needs isControl=True update!') - padded_tasks.append(tuple(task)) # TODO: does this ever occur? 
- - if EVEN_NUM and len(padded_tasks) % 2 == 1: - print('Duplicating one batch to keep the number of tasks even') - padded_tasks.append(padded_tasks[0]) - print(f'Number of tasks now is {len(padded_tasks)}') - - csv_data = [] - task_id = 0 - for task in padded_tasks: - task_id += 1 - task_len = sum([x[0] for x in task]) - print(f'>>> task_len: {task_len}') - - for _doc in task: - _data = [str(task_id)] - for x in _doc: # type: ignore - _data.append(str(x)) - - if _data[-1] != 'True': - _data.append('False') # isControl=False - print('>>> ', ' '.join(_data)) - csv_data.append(','.join(_data)) - - with open(f'{OUT_NAME}.csv', mode='w') as _file: - for csv_line in csv_data: - _file.write(csv_line) - _file.write('\n') - - json_data = [] - batch_id = 0 - for task in padded_tasks[:TASK_MAX]: - # Remember, batch numbers are one-based - task_data = OrderedDict( - { - 'batchNo': batch_id + 1, - 'batchSize': 100, - 'sourceLanguage': SRC_LANG, - 'targetLanguage': TGT_LANG, - 'requiredAnnotations': 1, - 'randomSeed': RND_SEED, - } - ) - - source_id = basename(XML_FILE) - - items_data: List[List[Dict[str, Any]]] = [] # Keeps items grouped into document - _item = 0 - doc_counter = 0 - for doc_data in task: - items_data.append([]) # Add a new bucket for items from this documents - has_control_item = False - - doc_len, doc_id, sys_id, *rest = doc_data # type: ignore - - isControl = rest is not None and rest - - target_id = sys_id - - _src = {} - _ref = {} - _bad = {} - _tgt = {} - - for item_id, item_src in SRC_DOCS[doc_id]: - seg_id = f'{doc_id}_{item_id}' - _src[seg_id] = item_src - - for item_id, item_ref in REF_DOCS[REF_ID][doc_id]: - seg_id = f'{doc_id}_{item_id}' - _ref[seg_id] = item_ref - - for item_id, item_bad in BAD_DOCS[sys_id][doc_id]: - seg_id = f'{doc_id}_{item_id}' - _bad[seg_id] = item_bad - - for item_id, item_tgt in SYS_DOCS[sys_id][doc_id]: - seg_id = f'{doc_id}_{item_id}' - _tgt[seg_id] = item_tgt - - seg_counter = 0 - context_src: List[Text] = [] - context_ref: List[Text] = [] - context_bad: List[Text] = [] - context_tgt: List[Text] = [] - for seg_id in _src: - if seg_counter >= doc_len: # Padding tasks are shorter! 
- break - item_src = _src[seg_id] - item_ref = _ref[seg_id] - item_bad = _bad[seg_id] - item_tgt = _tgt[seg_id] - - target_text = item_tgt - target_type = 'TGT' - - # Do not generate any BAD items if QC is disabled - if CONTROLS and isControl: - randomCoinFlip = choice( - [False, False, True, True, True] # 60:40 chance - ) - if randomCoinFlip: - target_text = item_bad - target_type = 'BAD' - has_control_item = True - - src_ctx = [] - tgt_ctx = [] - if seg_counter == 0: - src_ctx = [txt for _, txt in SRC_PREV[doc_id]][-CTX_SIZE:] - tgt_ctx = [txt for _, txt in SYS_PREV[sys_id][doc_id]][-CTX_SIZE:] - - obj: Dict[str, Any] = OrderedDict() - obj['_item'] = _item - obj['_block'] = -1 - obj['sourceID'] = source_id - obj['sourceContextLeft'] = '\n'.join(src_ctx) - obj['sourceText'] = item_src - obj['targetID'] = target_id - obj['targetContextLeft'] = '\n'.join(tgt_ctx) - obj['targetText'] = target_text - obj['itemID'] = seg_counter - obj['itemType'] = target_type - obj['documentID'] = doc_id - obj['isCompleteDocument'] = False - - # print(seg_id) - # print(' '.join(context_src)) - # print(item_src) - # print('...') - # print(' '.join(context_tgt)) - # print(item_tgt.encode('utf-8')) - # print('---') - - context_src.append(item_src) - context_ref.append(item_ref) - context_bad.append(item_bad) - context_tgt.append(target_text) - - items_data[-1].append(obj) - _item += 1 - seg_counter += 1 - - src_ctx = [] - tgt_ctx = [] - src_ctx = [txt for _, txt in SRC_NEXT[doc_id]][:CTX_SIZE] - tgt_ctx = [txt for _, txt in SYS_NEXT[sys_id][doc_id]][:CTX_SIZE] - - obj = OrderedDict() - obj['_item'] = _item - obj['_block'] = -1 - obj['sourceContextLeft'] = '\n'.join(src_ctx) - obj['sourceID'] = source_id - obj['sourceText'] = ' '.join(context_src) # full document - obj['targetContextLeft'] = '\n'.join(tgt_ctx) - obj['targetID'] = target_id - obj['targetText'] = ' '.join(context_tgt) # full document - obj['itemID'] = item_id - obj['itemType'] = 'BAD' if has_control_item else 'TGT' - obj['documentID'] = doc_id - obj['isCompleteDocument'] = True - items_data[-1].append(obj) - - if has_control_item and SHUFFLE_DOCS_WITH_CONTROL_ITEMS: - # Move the document with control items to a random position so - # that they are not accumulated as very last documents - _bad_doc = items_data.pop() - _pos = randint(0, len(items_data) - 1) - print(f' Moving the last QC document to position {_pos}') - items_data.insert(_pos, _bad_doc) - - # Extract items from documents - _items_data = [item for doc_items in items_data for item in doc_items] - # Re-assign _item numbers - if SHUFFLE_DOCS_WITH_CONTROL_ITEMS: - _item = 0 - for i in range(len(_items_data)): - _items_data[i]['_item'] = _item - if _items_data[i]['isCompleteDocument'] == False: - _item += 1 - - output_data = OrderedDict({'task': task_data, 'items': _items_data}) - - json_data.append(output_data) - - # write out JSON - json_text = json_dumps(json_data, indent=2, sort_keys=True) - - json_file_name = f'{OUT_NAME}.json' - with open(json_file_name, mode='w', encoding='utf8') as out_file: - sys.stdout.write( - 'Creating {0}, batch no. {1} ... 
'.format(json_file_name, batch_id + 1), - ) - out_file.write(str(json_text)) - sys.stdout.write('OK\n') - - batch_id += 1 - - print(f'Total tasks: {len(sampled_tasks)}') - print(f'Total docs: {total_docs}') - print(f'Total sys: {len(total_sys)} {sorted(list(total_sys))}') From 5a95f92dc02a84b6145e91683be67753a3a328e1 Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Fri, 11 Jul 2025 15:51:42 -0700 Subject: [PATCH 14/51] upgrade requirements --- requirements-dev.txt | 2 +- requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index 5a8770b1..fe86d4a5 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,5 +1,5 @@ -r requirements.txt -black==22.3.0 +black==24.3.0 mypy pylint pylint-django diff --git a/requirements.txt b/requirements.txt index 3cf747d8..95a77972 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ beautifulsoup4 -django==4.1 +django==4.2.22 django-stubs lxml psycopg2-binary From dfbba0651bd8188460e840bd450d4b1abe35df58 Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Fri, 11 Jul 2025 15:53:50 -0700 Subject: [PATCH 15/51] upgrade Python to 3.12 --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 2e0c0ca2..142b348c 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -7,7 +7,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.9] + python-version: [3.12] steps: - uses: actions/checkout@v4 From 53e848cc4cae22da644538102df20b654368be02 Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Fri, 11 Jul 2025 15:57:33 -0700 Subject: [PATCH 16/51] reformat with newer version of black --- Appraise/settings.py | 5 +- Appraise/urls.py | 1 + Appraise/utils.py | 7 +-- Appraise/wsgi.py | 1 + Campaign/admin.py | 1 + .../commands/ComputeSystemScores.py | 1 + .../commands/ComputeWMT21Results.py | 2 +- .../management/commands/ComputeZScores.py | 4 +- .../commands/InitCampaignMMT18Task1.py | 1 + .../commands/InitCampaignMMT18Task1b.py | 1 + .../commands/InitCampaignMMT18Task1bv2.py | 1 + .../commands/InitCampaignMMT18Task1bv3.py | 1 + .../commands/InitCampaignMMT18Task1v2.py | 1 + .../commands/InitCampaignMMT18Task1v3.py | 1 + .../commands/InitCampaignWMT18RefDA.py | 1 + .../commands/InitCampaignWMT18RefDA2.py | 1 + .../commands/InitCampaignWMT18RefDA3.py | 1 + .../commands/InitCampaignWMT18RefDA4.py | 1 + .../commands/InitCampaignWMT18SrcDA.py | 1 + .../management/commands/MakeAnnotation.py | 4 +- .../management/commands/StartNewCampaign.py | 2 + .../commands/UpdateCampaignModels.py | 2 + Campaign/management/commands/init_campaign.py | 2 + .../commands/validatecampaigndata.py | 1 + Campaign/models.py | 1 + Campaign/tests.py | 1 + Campaign/utils.py | 1 + Campaign/views.py | 21 ++++---- Dashboard/admin.py | 1 + Dashboard/apps.py | 1 + .../management/commands/CreateInviteTokens.py | 1 + .../commands/UpdateDashboardModels.py | 2 + Dashboard/models.py | 1 + Dashboard/tests.py | 1 + Dashboard/utils.py | 5 +- Dashboard/views.py | 5 +- EvalData/admin.py | 1 + EvalData/apps.py | 2 + EvalData/error_types.py | 1 + .../commands/CombineSubsetTextData.py | 2 + .../commands/CreateDirectAssessmentData.py | 2 + .../CreateDirectAssessmentDataWMT17.py | 2 + .../management/commands/CreateFakeBadRefs.py | 2 + .../management/commands/CreateIdsFiles.py | 2 + .../CreateMultiModalAssessmentData.py | 2 + .../commands/CreateSubsetTextData.py | 1 + 
.../management/commands/DumpAllResults.py | 2 + .../commands/DumpScoresAndMetadata.py | 2 + .../commands/PatchDirectAssessmentData.py | 1 + .../commands/UnlinkDirectAssessmentTasks.py | 2 + .../commands/UpdateEvalDataModels.py | 2 + .../commands/ValidateDirectAssessmentData.py | 1 + EvalData/models/__init__.py | 1 + EvalData/models/base_models.py | 1 + EvalData/models/data_assessment.py | 1 + EvalData/models/direct_assessment.py | 1 + EvalData/models/direct_assessment_context.py | 2 +- EvalData/models/direct_assessment_document.py | 53 ++++++++++--------- EvalData/models/multi_modal_assessment.py | 1 + EvalData/models/pairwise_assessment.py | 3 +- .../models/pairwise_assessment_document.py | 3 +- EvalData/models/task_agenda.py | 1 + EvalData/views.py | 1 + EvalView/admin.py | 1 + EvalView/apps.py | 2 + EvalView/models.py | 1 + EvalView/tests.py | 1 + EvalView/views.py | 25 ++++----- Makefile | 3 ++ Scripts/create_iwslt22_tasks.py | 2 +- Scripts/create_wmt22_pairwise_tasks.py | 35 ++++++++---- Scripts/create_wmt22_tasks.py | 2 +- deprecated.py | 5 +- 73 files changed, 176 insertions(+), 81 deletions(-) diff --git a/Appraise/settings.py b/Appraise/settings.py index b0347d44..59c9c413 100644 --- a/Appraise/settings.py +++ b/Appraise/settings.py @@ -9,6 +9,7 @@ For the full list of settings and their values, see https://docs.djangoproject.com/en/1.11/ref/settings/ """ + import logging import os import warnings @@ -37,7 +38,9 @@ ALLOWED_HOSTS = os.environ.get('APPRAISE_ALLOWED_HOSTS', '127.0.0.1').split(',') -CSRF_TRUSTED_ORIGINS = os.environ.get('APPRAISE_CSRF_TRUSTED_ORIGINS', 'https://*.127.0.0.1').split(',') +CSRF_TRUSTED_ORIGINS = os.environ.get( + 'APPRAISE_CSRF_TRUSTED_ORIGINS', 'https://*.127.0.0.1' +).split(',') WSGI_APPLICATION = os.environ.get( 'APPRAISE_WSGI_APPLICATION', 'Appraise.wsgi.application' diff --git a/Appraise/urls.py b/Appraise/urls.py index 2274e1c6..54a122be 100644 --- a/Appraise/urls.py +++ b/Appraise/urls.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + # pylint: disable=unused-import,import-error from django.conf.urls import handler404 from django.conf.urls import handler500 diff --git a/Appraise/utils.py b/Appraise/utils.py index 4e9093e2..27540d8a 100644 --- a/Appraise/utils.py +++ b/Appraise/utils.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + import logging from Appraise.settings import LOG_HANDLER @@ -33,8 +34,8 @@ def _compute_user_total_annotation_time(timestamps): def _clamp_time(seconds): # if a segment takes longer than 10 minutes, set it to 5 minutes # it's likely due to inactivity - if seconds >= 10*60: - return 5*60 + if seconds >= 10 * 60: + return 5 * 60 else: return seconds @@ -54,4 +55,4 @@ def _clamp_time(seconds): # Update the previous end timestamp previous_end_timestamp = end_timestamp - return total_annotation_time \ No newline at end of file + return total_annotation_time diff --git a/Appraise/wsgi.py b/Appraise/wsgi.py index 7ec10196..6c2ef408 100644 --- a/Appraise/wsgi.py +++ b/Appraise/wsgi.py @@ -6,6 +6,7 @@ For more information on this file, see https://docs.djangoproject.com/en/1.11/howto/deployment/wsgi/ """ + import os from django.core.wsgi import get_wsgi_application diff --git a/Campaign/admin.py b/Campaign/admin.py index 55d82a1c..5e39b963 100644 --- a/Campaign/admin.py +++ b/Campaign/admin.py @@ -1,6 +1,7 @@ """ Campaign admin.py """ + # pylint: disable=C0330,import-error from django.contrib import admin from django.contrib.admin.filters import AllValuesFieldListFilter diff --git 
a/Campaign/management/commands/ComputeSystemScores.py b/Campaign/management/commands/ComputeSystemScores.py index 4898fba1..9e948135 100644 --- a/Campaign/management/commands/ComputeSystemScores.py +++ b/Campaign/management/commands/ComputeSystemScores.py @@ -10,6 +10,7 @@ from EvalData.models import DirectAssessmentResult from EvalData.models import DirectAssessmentTask + # pylint: disable=C0111,C0330,E1101 class Command(BaseCommand): help = 'Computes system scores over all results' diff --git a/Campaign/management/commands/ComputeWMT21Results.py b/Campaign/management/commands/ComputeWMT21Results.py index d5c6dab6..76eaa645 100644 --- a/Campaign/management/commands/ComputeWMT21Results.py +++ b/Campaign/management/commands/ComputeWMT21Results.py @@ -463,7 +463,7 @@ def handle(self, *args, **options): wins_for_system = defaultdict(list) losses_for_system = defaultdict(list) p_level = 0.05 - for (sysA, sysB) in combinations_with_replacement(system_ids, 2): + for sysA, sysB in combinations_with_replacement(system_ids, 2): sysA_ids = set([x[0] for x in system_z_scores[sysA]]) sysB_ids = set([x[0] for x in system_z_scores[sysB]]) good_ids = set.intersection(sysA_ids, sysB_ids) diff --git a/Campaign/management/commands/ComputeZScores.py b/Campaign/management/commands/ComputeZScores.py index f11be90e..183e0ee2 100644 --- a/Campaign/management/commands/ComputeZScores.py +++ b/Campaign/management/commands/ComputeZScores.py @@ -427,7 +427,7 @@ def handle(self, *args, **options): wins_for_system = defaultdict(list) p_level = 0.05 - for (sysA, sysB) in combinations_with_replacement(system_ids, 2): + for sysA, sysB in combinations_with_replacement(system_ids, 2): sysA_ids = set([x[0] for x in system_z_scores[sysA]]) sysB_ids = set([x[0] for x in system_z_scores[sysB]]) good_ids = set.intersection(sysA_ids, sysB_ids) @@ -577,7 +577,7 @@ def sort_by_wins_and_z_score(x, y): key = system_id[:4].upper() vsystems[key].extend(system_z_scores[system_id]) - for (sysA, sysB) in combinations_with_replacement( + for sysA, sysB in combinations_with_replacement( ['GOOG', 'CAND', 'PROD'], 2 ): sysA_scores = [x[1] for x in vsystems[sysA]] diff --git a/Campaign/management/commands/InitCampaignMMT18Task1.py b/Campaign/management/commands/InitCampaignMMT18Task1.py index d43dd536..26338ed9 100644 --- a/Campaign/management/commands/InitCampaignMMT18Task1.py +++ b/Campaign/management/commands/InitCampaignMMT18Task1.py @@ -31,6 +31,7 @@ } REDUNDANCY = 1 + # pylint: disable=C0111,C0330,E1101 class Command(BaseCommand): help = 'Initialises campaign MMT18 Task #1' diff --git a/Campaign/management/commands/InitCampaignMMT18Task1b.py b/Campaign/management/commands/InitCampaignMMT18Task1b.py index 6db7b98f..e6829bda 100644 --- a/Campaign/management/commands/InitCampaignMMT18Task1b.py +++ b/Campaign/management/commands/InitCampaignMMT18Task1b.py @@ -27,6 +27,7 @@ } REDUNDANCY = 1 + # pylint: disable=C0111,C0330,E1101 class Command(BaseCommand): help = 'Initialises campaign MMT18 Task #1.b' diff --git a/Campaign/management/commands/InitCampaignMMT18Task1bv2.py b/Campaign/management/commands/InitCampaignMMT18Task1bv2.py index a09ffca5..653cb06f 100644 --- a/Campaign/management/commands/InitCampaignMMT18Task1bv2.py +++ b/Campaign/management/commands/InitCampaignMMT18Task1bv2.py @@ -27,6 +27,7 @@ } REDUNDANCY = 1 + # pylint: disable=C0111,C0330,E1101 class Command(BaseCommand): help = 'Initialises campaign MMT18 Task #1.b v2' diff --git a/Campaign/management/commands/InitCampaignMMT18Task1bv3.py 
b/Campaign/management/commands/InitCampaignMMT18Task1bv3.py index 46c8e2c7..7503db6c 100644 --- a/Campaign/management/commands/InitCampaignMMT18Task1bv3.py +++ b/Campaign/management/commands/InitCampaignMMT18Task1bv3.py @@ -27,6 +27,7 @@ } REDUNDANCY = 1 + # pylint: disable=C0111,C0330,E1101 class Command(BaseCommand): help = 'Initialises campaign MMT18 Task #1.b v3' diff --git a/Campaign/management/commands/InitCampaignMMT18Task1v2.py b/Campaign/management/commands/InitCampaignMMT18Task1v2.py index 99c7908f..bd5fb2c2 100644 --- a/Campaign/management/commands/InitCampaignMMT18Task1v2.py +++ b/Campaign/management/commands/InitCampaignMMT18Task1v2.py @@ -31,6 +31,7 @@ } REDUNDANCY = 1 + # pylint: disable=C0111,C0330,E1101 class Command(BaseCommand): help = 'Initialises campaign MMT18 Task #1 v2' diff --git a/Campaign/management/commands/InitCampaignMMT18Task1v3.py b/Campaign/management/commands/InitCampaignMMT18Task1v3.py index 9d364728..2caa2932 100644 --- a/Campaign/management/commands/InitCampaignMMT18Task1v3.py +++ b/Campaign/management/commands/InitCampaignMMT18Task1v3.py @@ -27,6 +27,7 @@ } REDUNDANCY = 1 + # pylint: disable=C0111,C0330,E1101 class Command(BaseCommand): help = 'Initialises campaign MMT18 Task #1 v3' diff --git a/Campaign/management/commands/InitCampaignWMT18RefDA.py b/Campaign/management/commands/InitCampaignWMT18RefDA.py index 7513039a..117c58d3 100644 --- a/Campaign/management/commands/InitCampaignWMT18RefDA.py +++ b/Campaign/management/commands/InitCampaignWMT18RefDA.py @@ -23,6 +23,7 @@ TASKS = 100 REDUNDANCY = 1 + # pylint: disable=C0111,C0330,E1101 class Command(BaseCommand): help = 'Initialises campaign WMT18 RefDA' diff --git a/Campaign/management/commands/InitCampaignWMT18RefDA2.py b/Campaign/management/commands/InitCampaignWMT18RefDA2.py index c46a0b15..22479855 100644 --- a/Campaign/management/commands/InitCampaignWMT18RefDA2.py +++ b/Campaign/management/commands/InitCampaignWMT18RefDA2.py @@ -23,6 +23,7 @@ TASKS = 100 REDUNDANCY = 1 + # pylint: disable=C0111,C0330,E1101 class Command(BaseCommand): help = 'Initialises campaign WMT18 RefDA2' diff --git a/Campaign/management/commands/InitCampaignWMT18RefDA3.py b/Campaign/management/commands/InitCampaignWMT18RefDA3.py index 4a890793..c5ceaf37 100644 --- a/Campaign/management/commands/InitCampaignWMT18RefDA3.py +++ b/Campaign/management/commands/InitCampaignWMT18RefDA3.py @@ -23,6 +23,7 @@ TASKS = 34 REDUNDANCY = 1 + # pylint: disable=C0111,C0330,E1101 class Command(BaseCommand): help = 'Initialises campaign WMT18 RefDA3' diff --git a/Campaign/management/commands/InitCampaignWMT18RefDA4.py b/Campaign/management/commands/InitCampaignWMT18RefDA4.py index 8d2e4852..b58d4d0d 100644 --- a/Campaign/management/commands/InitCampaignWMT18RefDA4.py +++ b/Campaign/management/commands/InitCampaignWMT18RefDA4.py @@ -23,6 +23,7 @@ TASKS = 100 REDUNDANCY = 1 + # pylint: disable=C0111,C0330,E1101 class Command(BaseCommand): help = 'Initialises campaign WMT18 RefDA4' diff --git a/Campaign/management/commands/InitCampaignWMT18SrcDA.py b/Campaign/management/commands/InitCampaignWMT18SrcDA.py index f292a99b..e066880d 100644 --- a/Campaign/management/commands/InitCampaignWMT18SrcDA.py +++ b/Campaign/management/commands/InitCampaignWMT18SrcDA.py @@ -23,6 +23,7 @@ TASKS = 34 REDUNDANCY = 1 + # pylint: disable=C0111,C0330,E1101 class Command(BaseCommand): help = 'Initialises campaign WMT18 SrcDA' diff --git a/Campaign/management/commands/MakeAnnotation.py b/Campaign/management/commands/MakeAnnotation.py index 9faad8c5..9c0be3de 100644 
--- a/Campaign/management/commands/MakeAnnotation.py +++ b/Campaign/management/commands/MakeAnnotation.py @@ -123,9 +123,7 @@ def handle(self, *args, **options): exit() if options["verbosity"] > 1: - self.stdout.write( - f"Available context keys: {response.context.keys()}" - ) + self.stdout.write(f"Available context keys: {response.context.keys()}") # Each task has different context, so the POST request needs to be # built separately for each task type diff --git a/Campaign/management/commands/StartNewCampaign.py b/Campaign/management/commands/StartNewCampaign.py index 8680b7b7..3f172e13 100644 --- a/Campaign/management/commands/StartNewCampaign.py +++ b/Campaign/management/commands/StartNewCampaign.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + from datetime import datetime from os import path @@ -26,6 +27,7 @@ from Dashboard.utils import generate_confirmation_token from EvalData.management.commands.UpdateEvalDataModels import _update_eval_data_models + # pylint: disable=C0111,C0330,E1101 class Command(BaseCommand): help = 'A single command for creating a new campaign based on manifest file' diff --git a/Campaign/management/commands/UpdateCampaignModels.py b/Campaign/management/commands/UpdateCampaignModels.py index 796db261..04715028 100644 --- a/Campaign/management/commands/UpdateCampaignModels.py +++ b/Campaign/management/commands/UpdateCampaignModels.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + # pylint: disable=W0611 from os import path @@ -21,6 +22,7 @@ INFO_MSG = 'INFO: ' WARNING_MSG = 'WARN: ' + # pylint: disable=C0111,C0330,E1101 class Command(BaseCommand): help = 'Updates object instances required for Campaign app' diff --git a/Campaign/management/commands/init_campaign.py b/Campaign/management/commands/init_campaign.py index c8846ea6..537b8eff 100644 --- a/Campaign/management/commands/init_campaign.py +++ b/Campaign/management/commands/init_campaign.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + from datetime import datetime from django.core.management.base import BaseCommand @@ -21,6 +22,7 @@ from Campaign.utils import CAMPAIGN_TASK_TYPES from Dashboard.utils import generate_confirmation_token + # pylint: disable=C0111,C0330,E1101 class Command(BaseCommand): help = 'Initialises campaign based on manifest file' diff --git a/Campaign/management/commands/validatecampaigndata.py b/Campaign/management/commands/validatecampaigndata.py index 7432425f..4331c3f7 100644 --- a/Campaign/management/commands/validatecampaigndata.py +++ b/Campaign/management/commands/validatecampaigndata.py @@ -1,6 +1,7 @@ """ Appraise """ + # pylint: disable=C0103,C0111,C0330,E1101 import sys from json import loads diff --git a/Campaign/models.py b/Campaign/models.py index ed2632f1..46be62ed 100644 --- a/Campaign/models.py +++ b/Campaign/models.py @@ -1,6 +1,7 @@ """ Campaign models.py """ + # pylint: disable=C0111,C0330,E1101 from json import JSONDecodeError from json import loads diff --git a/Campaign/tests.py b/Campaign/tests.py index 35568675..7f23c473 100644 --- a/Campaign/tests.py +++ b/Campaign/tests.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + from pathlib import Path from django.contrib.auth.models import User diff --git a/Campaign/utils.py b/Campaign/utils.py index 18b17ecc..bcabdb08 100644 --- a/Campaign/utils.py +++ b/Campaign/utils.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + from collections import defaultdict from collections import OrderedDict from hashlib import md5 diff --git a/Campaign/views.py b/Campaign/views.py index d1198771..ce7cd863 
100644 --- a/Campaign/views.py +++ b/Campaign/views.py @@ -70,7 +70,6 @@ def campaign_status(request, campaign_name, sort_key=2): ) is_mqm_or_esa = False - # Exclude document scores in document-level tasks, because we want to keep # the numbers reported on the campaign status page consistent across # accounts, which usually include different numbers of document @@ -107,9 +106,10 @@ def campaign_status(request, campaign_name, sort_key=2): ) # compute time override based on document times import collections + _time_pairs = collections.defaultdict(list) for x in _data: - _time_pairs[x[7]+ " ||| " +x[4]].append((x[0], x[1])) + _time_pairs[x[7] + " ||| " + x[4]].append((x[0], x[1])) _time_pairs = [ (min([x[0] for x in doc_v]), max([x[1] for x in doc_v])) for doc, doc_v in _time_pairs.items() @@ -132,17 +132,15 @@ def campaign_status(request, campaign_name, sort_key=2): ) # compute time override based on document times import collections + _time_pairs = collections.defaultdict(list) for x in _data: - _time_pairs[x[7]+ " ||| " +x[4]].append((x[0], x[1])) + _time_pairs[x[7] + " ||| " + x[4]].append((x[0], x[1])) _time_pairs = [ (min([x[0] for x in doc_v]), max([x[1] for x in doc_v])) for doc, doc_v in _time_pairs.items() ] - _data = [ - (x[0], x[1], x[2], x[3], x[4], x[5], x[6]) - for x in _data - ] + _data = [(x[0], x[1], x[2], x[3], x[4], x[5], x[6]) for x in _data] else: _data = _data.values_list( 'start_time', @@ -171,7 +169,7 @@ def campaign_status(request, campaign_name, sort_key=2): _first_modified = str(_date_modified).split('.')[0] else: _first_modified = 'Never' - + # Compute last modified time _last_modified_raw = ( seconds_to_timedelta(max(_end_times)) if _end_times else None @@ -185,8 +183,10 @@ def campaign_status(request, campaign_name, sort_key=2): # Compute total annotation time if is_mqm_or_esa and _first_modified_raw and _last_modified_raw: # for MQM and ESA compute the lower and upper annotation times - # use only the end times - _annotation_time_upper = (_last_modified_raw-_first_modified_raw).seconds + # use only the end times + _annotation_time_upper = ( + _last_modified_raw - _first_modified_raw + ).seconds _hours = int(floor(_annotation_time_upper / 3600)) _minutes = int(floor((_annotation_time_upper % 3600) / 60)) _annotation_time_upper = f'{_hours:0>2d}h{_minutes:0>2d}m' @@ -206,7 +206,6 @@ def campaign_status(request, campaign_name, sort_key=2): else: _annotation_time = 'n/a' - _item = ( user.username, user.is_active, diff --git a/Dashboard/admin.py b/Dashboard/admin.py index 36289357..c09c0302 100644 --- a/Dashboard/admin.py +++ b/Dashboard/admin.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + # pylint: disable=import-error from django.contrib import admin diff --git a/Dashboard/apps.py b/Dashboard/apps.py index b44cea51..3105d180 100644 --- a/Dashboard/apps.py +++ b/Dashboard/apps.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + from django.apps import AppConfig diff --git a/Dashboard/management/commands/CreateInviteTokens.py b/Dashboard/management/commands/CreateInviteTokens.py index 275a6995..129c461c 100644 --- a/Dashboard/management/commands/CreateInviteTokens.py +++ b/Dashboard/management/commands/CreateInviteTokens.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + # pylint: disable=C0103 from collections import defaultdict from csv import DictReader diff --git a/Dashboard/management/commands/UpdateDashboardModels.py b/Dashboard/management/commands/UpdateDashboardModels.py index 77db1ce0..7a71f0d7 100644 --- 
a/Dashboard/management/commands/UpdateDashboardModels.py +++ b/Dashboard/management/commands/UpdateDashboardModels.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + # pylint: disable=C0103,W0611 from os.path import basename @@ -20,6 +21,7 @@ INFO_MSG = 'INFO: ' WARNING_MSG = 'WARN: ' + # pylint: disable=C0111,C0330 class Command(BaseCommand): help = 'Updates object instances required for Dashboard app' diff --git a/Dashboard/models.py b/Dashboard/models.py index a0305879..248dc916 100644 --- a/Dashboard/models.py +++ b/Dashboard/models.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + from uuid import uuid4 from django.contrib.auth.models import Group diff --git a/Dashboard/tests.py b/Dashboard/tests.py index 53b26aae..aae6718c 100644 --- a/Dashboard/tests.py +++ b/Dashboard/tests.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + # pylint: disable=unused-import from django.db import models diff --git a/Dashboard/utils.py b/Dashboard/utils.py index e7c76cf1..44848e13 100644 --- a/Dashboard/utils.py +++ b/Dashboard/utils.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + from collections import defaultdict from datetime import datetime from hashlib import md5 @@ -65,7 +66,9 @@ def run_quality_control(username): _data = _type.objects.filter(createdBy__username=username, completed=True) # Get the first result task type available: might not work in all scenarios if _data: - campaign_opts = set((_data[0].task.campaign.campaignOptions or "").lower().split(";")) + campaign_opts = set( + (_data[0].task.campaign.campaignOptions or "").lower().split(";") + ) result_type = _type break diff --git a/Dashboard/views.py b/Dashboard/views.py index 959cbda6..fc3f66c3 100644 --- a/Dashboard/views.py +++ b/Dashboard/views.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + from datetime import datetime from hashlib import md5 from inspect import currentframe @@ -83,7 +84,7 @@ def sso_login(request, username, password): logout(request) user = authenticate(username=username, password=password) - + # login failed if user is None: return redirect('dashboard') @@ -510,4 +511,4 @@ def dashboard(request): } ) - return render(request, 'Dashboard/dashboard.html', template_context) \ No newline at end of file + return render(request, 'Dashboard/dashboard.html', template_context) diff --git a/EvalData/admin.py b/EvalData/admin.py index 62a54039..62948ac2 100644 --- a/EvalData/admin.py +++ b/EvalData/admin.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + # pylint: disable=C0330 from datetime import timezone diff --git a/EvalData/apps.py b/EvalData/apps.py index a669ed4b..4ebf9a1d 100644 --- a/EvalData/apps.py +++ b/EvalData/apps.py @@ -3,8 +3,10 @@ See LICENSE for usage details """ + from django.apps import AppConfig + # pylint: disable=missing-docstring class EvaldataConfig(AppConfig): name = 'EvalData' diff --git a/EvalData/error_types.py b/EvalData/error_types.py index ab0b0a3e..604838f3 100644 --- a/EvalData/error_types.py +++ b/EvalData/error_types.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + from collections import OrderedDict ERROR_TYPES = { diff --git a/EvalData/management/commands/CombineSubsetTextData.py b/EvalData/management/commands/CombineSubsetTextData.py index caf8e853..512e4de6 100644 --- a/EvalData/management/commands/CombineSubsetTextData.py +++ b/EvalData/management/commands/CombineSubsetTextData.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + from collections import defaultdict from collections import OrderedDict from glob import iglob @@ -17,6 +18,7 @@ 
INFO_MSG = 'INFO: ' + # pylint: disable=C0111 class Command(BaseCommand): help = 'Creates combined subset text file based on given CSV file' diff --git a/EvalData/management/commands/CreateDirectAssessmentData.py b/EvalData/management/commands/CreateDirectAssessmentData.py index 0a6945b5..db1c723e 100644 --- a/EvalData/management/commands/CreateDirectAssessmentData.py +++ b/EvalData/management/commands/CreateDirectAssessmentData.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + import hashlib import json from collections import defaultdict @@ -25,6 +26,7 @@ # pylint: disable=E0401,W0611 + # pylint: disable=C0111 class Command(BaseCommand): help = 'Creates JSON file containing DirectAssessmentTask data' diff --git a/EvalData/management/commands/CreateDirectAssessmentDataWMT17.py b/EvalData/management/commands/CreateDirectAssessmentDataWMT17.py index ac838a47..72375c5e 100644 --- a/EvalData/management/commands/CreateDirectAssessmentDataWMT17.py +++ b/EvalData/management/commands/CreateDirectAssessmentDataWMT17.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + # pylint: disable=W0611 import json from collections import defaultdict @@ -19,6 +20,7 @@ from Dashboard.models import LANGUAGE_CODES_AND_NAMES + # pylint: disable=C0111 class Command(BaseCommand): help = 'Creates JSON file containing DirectAssessmentTask data' diff --git a/EvalData/management/commands/CreateFakeBadRefs.py b/EvalData/management/commands/CreateFakeBadRefs.py index c614471a..14a9b764 100644 --- a/EvalData/management/commands/CreateFakeBadRefs.py +++ b/EvalData/management/commands/CreateFakeBadRefs.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + # pylint: disable=W0611 from collections import defaultdict from collections import OrderedDict @@ -25,6 +26,7 @@ EXTENSION_FOR_BAD_FILES = 'bad' EXTENSION_FOR_IDS_FILES = 'ids' + # pylint: disable=C0111 class Command(BaseCommand): help = 'Creates fake bad references data' diff --git a/EvalData/management/commands/CreateIdsFiles.py b/EvalData/management/commands/CreateIdsFiles.py index 0333edf5..8b707ddd 100644 --- a/EvalData/management/commands/CreateIdsFiles.py +++ b/EvalData/management/commands/CreateIdsFiles.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + # pylint: disable=W0611 from collections import defaultdict from collections import OrderedDict @@ -25,6 +26,7 @@ EXTENSION_FOR_BAD_FILES = 'bad' EXTENSION_FOR_IDS_FILES = 'ids' + # pylint: disable=C0111 class Command(BaseCommand): help = 'Creates ids files' diff --git a/EvalData/management/commands/CreateMultiModalAssessmentData.py b/EvalData/management/commands/CreateMultiModalAssessmentData.py index c94b4bb2..f2f3c667 100644 --- a/EvalData/management/commands/CreateMultiModalAssessmentData.py +++ b/EvalData/management/commands/CreateMultiModalAssessmentData.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + # pylint: disable=W0611 import json from collections import defaultdict @@ -17,6 +18,7 @@ from Dashboard.models import LANGUAGE_CODES_AND_NAMES + # pylint: disable=C0111 class Command(BaseCommand): help = 'Creates JSON file containing MultiModalAssessmentTask data' diff --git a/EvalData/management/commands/CreateSubsetTextData.py b/EvalData/management/commands/CreateSubsetTextData.py index 1bf5b99b..2ad95c68 100644 --- a/EvalData/management/commands/CreateSubsetTextData.py +++ b/EvalData/management/commands/CreateSubsetTextData.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + # pylint: disable=W0611 from collections import defaultdict from collections import OrderedDict diff --git 
a/EvalData/management/commands/DumpAllResults.py b/EvalData/management/commands/DumpAllResults.py index 0045afa7..a5bea6d2 100644 --- a/EvalData/management/commands/DumpAllResults.py +++ b/EvalData/management/commands/DumpAllResults.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + from os.path import basename from django.core.management.base import BaseCommand @@ -17,6 +18,7 @@ INFO_MSG = 'INFO: ' WARNING_MSG = 'WARN: ' + # pylint: disable=C0111,C0330 class Command(BaseCommand): help = 'Dumps all DirectAssessmentResult and MultiModalAssessmentResult instances' diff --git a/EvalData/management/commands/DumpScoresAndMetadata.py b/EvalData/management/commands/DumpScoresAndMetadata.py index c9689667..49bfb0d8 100644 --- a/EvalData/management/commands/DumpScoresAndMetadata.py +++ b/EvalData/management/commands/DumpScoresAndMetadata.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + from gzip import open as gz_open from os.path import basename @@ -17,6 +18,7 @@ INFO_MSG = 'INFO: ' WARNING_MSG = 'WARN: ' + # pylint: disable=C0111,C0330 class Command(BaseCommand): help = 'Dumps all DirectAssessmentResult scores and associated metadata' diff --git a/EvalData/management/commands/PatchDirectAssessmentData.py b/EvalData/management/commands/PatchDirectAssessmentData.py index 70573561..6dc7402e 100644 --- a/EvalData/management/commands/PatchDirectAssessmentData.py +++ b/EvalData/management/commands/PatchDirectAssessmentData.py @@ -11,6 +11,7 @@ from EvalData.models import DirectAssessmentResult from EvalData.models import DirectAssessmentTask + # pylint: disable=C0111,C0330,E1101 class Command(BaseCommand): help = 'Validates Direct Assessment JSON data files' diff --git a/EvalData/management/commands/UnlinkDirectAssessmentTasks.py b/EvalData/management/commands/UnlinkDirectAssessmentTasks.py index 09fc707f..bb17c5fb 100644 --- a/EvalData/management/commands/UnlinkDirectAssessmentTasks.py +++ b/EvalData/management/commands/UnlinkDirectAssessmentTasks.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + # pylint: disable=W0611 from datetime import datetime from datetime import timedelta @@ -30,6 +31,7 @@ INFO_MSG = 'INFO: ' WARNING_MSG = 'WARN: ' + # pylint: disable=C0111,C0330 class Command(BaseCommand): help = 'Unlinks DirectAssessmentTask instances as needed' diff --git a/EvalData/management/commands/UpdateEvalDataModels.py b/EvalData/management/commands/UpdateEvalDataModels.py index 5a8604e8..6153596b 100644 --- a/EvalData/management/commands/UpdateEvalDataModels.py +++ b/EvalData/management/commands/UpdateEvalDataModels.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + # pylint: disable=W0611 from datetime import datetime from os import path @@ -26,6 +27,7 @@ INFO_MSG = 'INFO: ' WARNING_MSG = 'WARN: ' + # pylint: disable=C0111,C0330 class Command(BaseCommand): help = 'Updates object instances required for EvalData app' diff --git a/EvalData/management/commands/ValidateDirectAssessmentData.py b/EvalData/management/commands/ValidateDirectAssessmentData.py index a267a1b3..f53a4a4d 100644 --- a/EvalData/management/commands/ValidateDirectAssessmentData.py +++ b/EvalData/management/commands/ValidateDirectAssessmentData.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + # pylint: disable=W0611 from collections import defaultdict from json import load diff --git a/EvalData/models/__init__.py b/EvalData/models/__init__.py index 92943e4c..706486b6 100644 --- a/EvalData/models/__init__.py +++ b/EvalData/models/__init__.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + from .base_models 
import * from .data_assessment import * from .direct_assessment import * diff --git a/EvalData/models/base_models.py b/EvalData/models/base_models.py index fef7b737..7b83ad15 100644 --- a/EvalData/models/base_models.py +++ b/EvalData/models/base_models.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + # pylint: disable=C0103,C0330,no-member from datetime import timezone diff --git a/EvalData/models/data_assessment.py b/EvalData/models/data_assessment.py index 1d239386..3de5292b 100644 --- a/EvalData/models/data_assessment.py +++ b/EvalData/models/data_assessment.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + # pylint: disable=C0103,C0330,no-member import sys from collections import defaultdict diff --git a/EvalData/models/direct_assessment.py b/EvalData/models/direct_assessment.py index a605b6d2..801d54d7 100644 --- a/EvalData/models/direct_assessment.py +++ b/EvalData/models/direct_assessment.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + # pylint: disable=C0103,C0330,no-member import sys from collections import defaultdict diff --git a/EvalData/models/direct_assessment_context.py b/EvalData/models/direct_assessment_context.py index c6766b7d..37666068 100644 --- a/EvalData/models/direct_assessment_context.py +++ b/EvalData/models/direct_assessment_context.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + # pylint: disable=C0103,C0330,no-member import sys from collections import defaultdict @@ -499,7 +500,6 @@ def get_time_for_user(cls, user): timestamps.append((result.start_time, result.end_time)) return seconds_to_timedelta(_compute_user_total_annotation_time(timestamps)) - @classmethod def get_system_annotations(cls): diff --git a/EvalData/models/direct_assessment_document.py b/EvalData/models/direct_assessment_document.py index 3115fe2d..410b736c 100644 --- a/EvalData/models/direct_assessment_document.py +++ b/EvalData/models/direct_assessment_document.py @@ -265,23 +265,21 @@ def next_document_for_user_mqmesa(self, user): # get all items (100) and try to find resul all_items = [ ( - item, + item, DirectAssessmentDocumentResult.objects.filter( item=item, activated=False, completed=True, createdBy=user - ).last() + ).last(), ) for item in self.items.all().order_by('id') ] unfinished_items = [i for i, r in all_items if not r] - + docs_total = len({i.documentID for i, r in all_items}) - items_completed = len([ - i for i, r in all_items if r and r.completed - ]) - docs_completed = docs_total - len({ - i.documentID for i, r in all_items if r is None or not r.completed - }) - + items_completed = len([i for i, r in all_items if r and r.completed]) + docs_completed = docs_total - len( + {i.documentID for i, r in all_items if r is None or not r.completed} + ) + if not unfinished_items: return ( None, @@ -295,7 +293,8 @@ def next_document_for_user_mqmesa(self, user): # things are ordered with batch order next_item = unfinished_items[0] doc_items_all = [ - (i, r) for i, r in all_items + (i, r) + for i, r in all_items # match document name and system if i.documentID == next_item.documentID and i.targetID == next_item.targetID ] @@ -308,12 +307,12 @@ def next_document_for_user_mqmesa(self, user): ) return ( - next_item, # the first unannotated item for the user - items_completed, # the number of completed items in the task - docs_completed, # the number of completed documents in the task - doc_items, # all items from the current document - doc_items_results, # all score results from the current document - docs_total, # the total number of documents in the task + next_item, 
# the first unannotated item for the user + items_completed, # the number of completed items in the task + docs_completed, # the number of completed documents in the task + doc_items, # all items from the current document + doc_items_results, # all score results from the current document + docs_total, # the total number of documents in the task ) def get_results_for_each_item(self, block_items, user): @@ -457,7 +456,7 @@ def import_from_json(cls, campaign, batch_user, batch_data, max_count): new_items.append(new_item) if item['isCompleteDocument']: doc_items += 1 - + LOGGER.info(f'The task has {len(new_items)} items') current_count += 1 @@ -592,18 +591,23 @@ def get_hit_status_for_user(cls, user): @classmethod def get_time_for_user(cls, user): results = cls.objects.filter(createdBy=user, activated=False, completed=True) - is_esa_or_mqm = any([ - "esa" in result.task.campaign.campaignOptions.lower().split(";") or - "mqm" in result.task.campaign.campaignOptions.lower().split(";") - for result in results - ]) + is_esa_or_mqm = any( + [ + "esa" in result.task.campaign.campaignOptions.lower().split(";") + or "mqm" in result.task.campaign.campaignOptions.lower().split(";") + for result in results + ] + ) if is_esa_or_mqm: # for ESA or MQM, do minimum and maximum from each doc import collections + timestamps = collections.defaultdict(list) for result in results: - timestamps[result.item.documentID+" ||| "+result.item.targetID].append((result.start_time, result.end_time)) + timestamps[ + result.item.documentID + " ||| " + result.item.targetID + ].append((result.start_time, result.end_time)) # timestamps are document-level now, but that does not change anything later on timestamps = [ @@ -615,7 +619,6 @@ def get_time_for_user(cls, user): for result in results: timestamps.append((result.start_time, result.end_time)) - return seconds_to_timedelta(_compute_user_total_annotation_time(timestamps)) @classmethod diff --git a/EvalData/models/multi_modal_assessment.py b/EvalData/models/multi_modal_assessment.py index 7bbccc0f..9112c7ed 100644 --- a/EvalData/models/multi_modal_assessment.py +++ b/EvalData/models/multi_modal_assessment.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + # pylint: disable=C0103,C0330,no-member import sys from collections import defaultdict diff --git a/EvalData/models/pairwise_assessment.py b/EvalData/models/pairwise_assessment.py index fb952ff9..ef7e1cf2 100644 --- a/EvalData/models/pairwise_assessment.py +++ b/EvalData/models/pairwise_assessment.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + # pylint: disable=C0103,C0330,no-member import sys from collections import defaultdict @@ -341,7 +342,7 @@ def import_from_json(cls, campaign, batch_user, batch_data, max_count): contextRight=context_right, ) new_items.append(new_item) - + LOGGER.info(f'The task has {len(new_items)} items') current_count += 1 diff --git a/EvalData/models/pairwise_assessment_document.py b/EvalData/models/pairwise_assessment_document.py index 69c71088..b538de4a 100644 --- a/EvalData/models/pairwise_assessment_document.py +++ b/EvalData/models/pairwise_assessment_document.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + # pylint: disable=C0103,C0330,no-member import sys from collections import defaultdict @@ -470,7 +471,7 @@ def import_from_json(cls, campaign, batch_user, batch_data, max_count): new_items.append(new_item) if item['isCompleteDocument']: doc_items += 1 - + LOGGER.info(f'The task has {len(new_items)} items') current_count += 1 diff --git a/EvalData/models/task_agenda.py 
b/EvalData/models/task_agenda.py index a452cff8..ccc976fb 100644 --- a/EvalData/models/task_agenda.py +++ b/EvalData/models/task_agenda.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + # pylint: disable=C0103,C0330,no-member from inspect import currentframe from inspect import getframeinfo diff --git a/EvalData/views.py b/EvalData/views.py index 8065fa8d..29a6b45a 100644 --- a/EvalData/views.py +++ b/EvalData/views.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + from django.contrib import messages from django.contrib.auth.decorators import login_required from django.contrib.auth.decorators import permission_required diff --git a/EvalView/admin.py b/EvalView/admin.py index d4952b9f..c1639906 100644 --- a/EvalView/admin.py +++ b/EvalView/admin.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + # pylint: disable=unused-import from django.contrib import admin diff --git a/EvalView/apps.py b/EvalView/apps.py index 652f2c91..3ae78d03 100644 --- a/EvalView/apps.py +++ b/EvalView/apps.py @@ -3,8 +3,10 @@ See LICENSE for usage details """ + from django.apps import AppConfig + # pylint: disable=missing-docstring class EvalviewConfig(AppConfig): name = 'EvalView' diff --git a/EvalView/models.py b/EvalView/models.py index 53b26aae..aae6718c 100644 --- a/EvalView/models.py +++ b/EvalView/models.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + # pylint: disable=unused-import from django.db import models diff --git a/EvalView/tests.py b/EvalView/tests.py index faac1ca5..2a876eef 100644 --- a/EvalView/tests.py +++ b/EvalView/tests.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + # pylint: disable=unused-import from django.test import TestCase diff --git a/EvalView/views.py b/EvalView/views.py index 48e48a61..6f14b4d4 100644 --- a/EvalView/views.py +++ b/EvalView/views.py @@ -3,6 +3,7 @@ See LICENSE for usage details """ + from datetime import datetime from datetime import timezone @@ -1089,17 +1090,13 @@ def direct_assessment_document_mqmesa(campaign, current_task, request): end_timestamp = request.POST.get('end_timestamp', None) ajax = bool(request.POST.get('ajax', None) == 'True') - db_item = current_task.items.filter( itemID=item_id, id=task_id, ) - if len(db_item) == 0: - error_msg = ( - f'We could not find item {item_id} in task {task_id}.' - ) + error_msg = f'We could not find item {item_id} in task {task_id}.' LOGGER.error(error_msg) item_saved = False elif len(db_item) > 1: @@ -1185,10 +1182,10 @@ def direct_assessment_document_mqmesa(campaign, current_task, request): if 'contrastiveesa' in campaign_opts: # escape
<br/> tags in the source and target texts for item in doc_items: - item.sourceText = item.sourceText \ - .replace("<eos>", "<eos>") \ - .replace("<br/>", "<br/>") - # HTML-esaping on the target text will not work because MQM/ESA tag insertion prevents it + item.sourceText = item.sourceText.replace( + "<eos>", "<eos>" + ).replace("<br/>", "<br/>") + # HTML-esaping on the target text will not work because MQM/ESA tag insertion prevents it guidelines = ( 'You are provided with a text in {0} and its candidate translation(s) into {1}. ' 'Please assess the quality of the translation(s) following the detailed guidelines below. '.format( @@ -2281,7 +2278,7 @@ def pairwise_assessment_document(request, code=None, campaign_name=None): new_ui = 'newui' in campaign_opts escape_eos = 'escapeeos' in campaign_opts escape_br = 'escapebr' in campaign_opts - highlight_style ='highlightstyle' in campaign_opts + highlight_style = 'highlightstyle' in campaign_opts # Get item scores from the latest corresponding results block_scores = [] @@ -2310,12 +2307,8 @@ def pairwise_assessment_document(request, code=None, campaign_name=None): if escape_br: _source_text = _source_text.replace("<br/>", "<br/>") - _candidate1_text = _candidate1_text.replace( - "<br/>", "<br/>" - ) - _candidate2_text = _candidate2_text.replace( - "<br/>", "<br/>" - ) + _candidate1_text = _candidate1_text.replace("<br/>", "<br/>") + _candidate2_text = _candidate2_text.replace("<br/>", "
") item_scores = { 'completed': bool(result and result.score1 > -1), diff --git a/Makefile b/Makefile index e0cc7870..17a141c2 100644 --- a/Makefile +++ b/Makefile @@ -34,4 +34,7 @@ test: install: requirements-dev.txt pip install -r $< +reformat: + black -S -l $(BLACK_LINE_MAXLEN) . --force-exclude '/migrations/' + .PHONY: all check check-black check-pylint check-mypy check-safety run test diff --git a/Scripts/create_iwslt22_tasks.py b/Scripts/create_iwslt22_tasks.py index fb26f639..9019893c 100644 --- a/Scripts/create_iwslt22_tasks.py +++ b/Scripts/create_iwslt22_tasks.py @@ -517,7 +517,7 @@ def create_bad_refs( for _tup in sampled_tasks: _all_tasks += list(_tup) _docs_by_sys: Dict[str, Any] = {} - for (_, docid, sysid) in _all_tasks: + for _, docid, sysid in _all_tasks: if sysid not in _docs_by_sys: _docs_by_sys[sysid] = [] _docs_by_sys[sysid].append(docid) diff --git a/Scripts/create_wmt22_pairwise_tasks.py b/Scripts/create_wmt22_pairwise_tasks.py index a4fa03ae..e2346a14 100644 --- a/Scripts/create_wmt22_pairwise_tasks.py +++ b/Scripts/create_wmt22_pairwise_tasks.py @@ -218,14 +218,18 @@ def unwrap_tsv( for line in tsv: fields = line.rstrip("\n").split('\t') if len(fields) < 5: - print(f"Error: too few fields in {tsv_file}, required fields: DocID, src, ref, sysA, sysB") + print( + f"Error: too few fields in {tsv_file}, required fields: DocID, src, ref, sysA, sysB" + ) exit() docid, src, ref, sysA, sysB = fields[:5] if docid not in src_docs: src_docs[docid] = [] - segid = len(src_docs[docid]) + 1 # segment ID is 1-based to keep it consistent with XML format + segid = ( + len(src_docs[docid]) + 1 + ) # segment ID is 1-based to keep it consistent with XML format src_docs[docid].append((segid, src)) if docid not in ref_docs['A']: @@ -684,7 +688,9 @@ def parse_cmd_args(): print(f'Loading docs from {XML_FILE}') if TSV_FILE: - SRC_DOCS, REF_DOCS, SYS_DOCS = unwrap_tsv(XML_FILE, encoding=ENC, system_A=SYSTEM_A, system_B=SYSTEM_B) + SRC_DOCS, REF_DOCS, SYS_DOCS = unwrap_tsv( + XML_FILE, encoding=ENC, system_A=SYSTEM_A, system_B=SYSTEM_B + ) else: src_lang, SRC_DOCS, ref_lang, REF_DOCS, hyp_lang, SYS_DOCS = unwrap_xml( XML_FILE, encoding=ENC @@ -849,7 +855,9 @@ def parse_cmd_args(): elif task_len < MAX_TASK_SIZE: pad_size = MAX_TASK_SIZE - task_len - pad_data: List[Tuple[int, str, bool]] = [(tup[0], tup[1], False) for tup in task] + pad_data: List[Tuple[int, str, bool]] = [ + (tup[0], tup[1], False) for tup in task + ] pad_pos = 0 while pad_size > 0: print(f'pad_size: {pad_size}') @@ -863,7 +871,12 @@ def parse_cmd_args(): print(f'pad_pos: {pad_pos}') last_doc: Tuple[int, str, bool] = pad_data[-1] - print('Making the last doc smaller', last_doc[0], '-->', last_doc[0] + pad_size) + print( + 'Making the last doc smaller', + last_doc[0], + '-->', + last_doc[0] + pad_size, + ) fixed_doc = (last_doc[0] + pad_size, *last_doc[1:]) pad_data[-1] = fixed_doc # print(pad_data[-1][0]) @@ -874,7 +887,9 @@ def parse_cmd_args(): else: print(f'WARNING: no control items in task no. 
{tid}') - pad_data: List[Tuple[int, str, bool]] = [(tup[0], tup[1], False) for tup in task] + pad_data: List[Tuple[int, str, bool]] = [ + (tup[0], tup[1], False) for tup in task + ] padded_tasks.append(tuple(pad_data)) if EVEN_NUM and len(padded_tasks) % 2 == 1: @@ -971,8 +986,8 @@ def parse_cmd_args(): item_src = _src[seg_id] item_ref = _ref[seg_id] - item_bads = { sys_id: _bads[sys_id][seg_id] for sys_id in SYS_IDS } - item_tgts = { sys_id: _tgts[sys_id][seg_id] for sys_id in SYS_IDS } + item_bads = {sys_id: _bads[sys_id][seg_id] for sys_id in SYS_IDS} + item_tgts = {sys_id: _tgts[sys_id][seg_id] for sys_id in SYS_IDS} item_type = 'TGT' # Do not generate any BAD items if QC is disabled @@ -1005,7 +1020,9 @@ def parse_cmd_args(): for tgt_idx, sys_id in enumerate(_shuffled_sys_ids): tgt_ctx = [] if seg_counter == 0: - tgt_ctx = [txt for _, txt in SYS_PREV[sys_id][doc_id]][-CTX_SIZE:] + tgt_ctx = [txt for _, txt in SYS_PREV[sys_id][doc_id]][ + -CTX_SIZE: + ] tobj = OrderedDict() tobj['_itemAll'] = _itemAll diff --git a/Scripts/create_wmt22_tasks.py b/Scripts/create_wmt22_tasks.py index 43e8e0bd..a7e3215d 100644 --- a/Scripts/create_wmt22_tasks.py +++ b/Scripts/create_wmt22_tasks.py @@ -769,7 +769,7 @@ def parse_cmd_args(): for _tup in sampled_tasks: _all_tasks += list(_tup) _docs_by_sys: Dict[str, Any] = {} - for (_, docid, sysid) in _all_tasks: + for _, docid, sysid in _all_tasks: if sysid not in _docs_by_sys: _docs_by_sys[sysid] = [] _docs_by_sys[sysid].append(docid) diff --git a/deprecated.py b/deprecated.py index ec2db15f..255995ae 100644 --- a/deprecated.py +++ b/deprecated.py @@ -10,10 +10,11 @@ Use get_deprecated_methods() to retrieve set of deprecated methods. """ + from typing import Set -_DEPRECATED_METHOD_REGISTRY : Set[str] = set() +_DEPRECATED_METHOD_REGISTRY: Set[str] = set() def add_deprecated_method(func): @@ -28,4 +29,4 @@ def get_deprecated_methods(): """ Get deprecated methods from registry. """ - return _DEPRECATED_METHOD_REGISTRY \ No newline at end of file + return _DEPRECATED_METHOD_REGISTRY From 1c04c0c59553367ae2dc3bc9577ba1c942e2d72a Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Fri, 11 Jul 2025 16:46:39 -0700 Subject: [PATCH 17/51] bump version to #wmt25dev --- Appraise/settings.py | 2 +- Dashboard/templates/Dashboard/frontpage.html | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/Appraise/settings.py b/Appraise/settings.py index 59c9c413..6a307b1e 100644 --- a/Appraise/settings.py +++ b/Appraise/settings.py @@ -211,7 +211,7 @@ # Base context for all views. BASE_CONTEXT = { - 'commit_tag': '#wmt24dev', + 'commit_tag': '#wmt25dev', 'title': 'Appraise evaluation system', 'static_url': STATIC_URL, } diff --git a/Dashboard/templates/Dashboard/frontpage.html b/Dashboard/templates/Dashboard/frontpage.html index 5f4ac5ae..0802f7fa 100644 --- a/Dashboard/templates/Dashboard/frontpage.html +++ b/Dashboard/templates/Dashboard/frontpage.html @@ -11,7 +11,11 @@

An open-source system for manual evaluation of MT output

This is Appraise

-

It supports collaborative collection of human feedback for MT evaluation. It implements tasks such as Translation Quality Checking, Ranking and Error Classification, and Manual Post-Editing. For WMT17, we added support for Direct Assessment. For WMT19, evaluation is focused on source-based Direct Assessment on document level.

+

+ It supports collaborative collection of human feedback for MT evaluation. It implements tasks such as + Direct Assessment (DA), Scalar Quality Metric (SQM), Multidimensional Quality Metric (MQM), Error Span Annotation (ESA), + in various settings, such as source or reference based, contrastive, document-level, multimodal, and others. +

From a10669cdc559de456377b8730c114bfb28bfbc3a Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Fri, 11 Jul 2025 17:01:25 -0700 Subject: [PATCH 18/51] wmt24 -> wmt25 --- Dashboard/templates/Dashboard/dashboard.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dashboard/templates/Dashboard/dashboard.html b/Dashboard/templates/Dashboard/dashboard.html index d1eb5bea..19a1da85 100644 --- a/Dashboard/templates/Dashboard/dashboard.html +++ b/Dashboard/templates/Dashboard/dashboard.html @@ -5,7 +5,7 @@

Dashboard

-

Evaluation campaign for shared tasks hosted at the 9th Conference on Machine Translation (WMT24)

+

Evaluation campaign for shared tasks hosted at the 10th Conference on Machine Translation (WMT25)

From 3298c23adf9318c77b1e3cf6441f4cc7a29d2b45 Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Fri, 11 Jul 2025 17:13:53 -0700 Subject: [PATCH 19/51] dummy change --- Dashboard/templates/Dashboard/frontpage.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dashboard/templates/Dashboard/frontpage.html b/Dashboard/templates/Dashboard/frontpage.html index 0802f7fa..2d24a985 100644 --- a/Dashboard/templates/Dashboard/frontpage.html +++ b/Dashboard/templates/Dashboard/frontpage.html @@ -14,7 +14,7 @@

This is Appraise

It supports collaborative collection of human feedback for MT evaluation. It implements tasks such as Direct Assessment (DA), Scalar Quality Metric (SQM), Multidimensional Quality Metric (MQM), Error Span Annotation (ESA), - in various settings, such as source or reference based, contrastive, document-level, multimodal, and others. + in various settings, such as reference or source based, contrastive, document-level, with video context, and others.

From e9234f225d8df660adf68c642da04a64d4cf2b62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vil=C3=A9m=20Zouhar?= Date: Sun, 20 Jul 2025 13:48:10 -0700 Subject: [PATCH 20/51] don't escape image context, ref #185' --- EvalView/views.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/EvalView/views.py b/EvalView/views.py index 6f14b4d4..40b4e7b5 100644 --- a/EvalView/views.py +++ b/EvalView/views.py @@ -1154,8 +1154,8 @@ def direct_assessment_document_mqmesa(campaign, current_task, request): # TODO: hotfix for WMT24 # Tracking issue: https://github.com/AppraiseDev/Appraise/issues/185 for item in doc_items: - # don't escape HTML video - if item.sourceText.strip().startswith(" Date: Sun, 20 Jul 2025 16:28:58 -0700 Subject: [PATCH 21/51] add img { + display: block; + margin-left: auto; + margin-right: auto; + width: 45%; +} + .tutorial-text { text-align: center; color: #257; diff --git a/EvalView/static/EvalView/js/direct-assessment-document-mqm-esa.js b/EvalView/static/EvalView/js/direct-assessment-document-mqm-esa.js index 3a7e9e77..8f2c3844 100644 --- a/EvalView/static/EvalView/js/direct-assessment-document-mqm-esa.js +++ b/EvalView/static/EvalView/js/direct-assessment-document-mqm-esa.js @@ -75,6 +75,8 @@ const ERROR_TYPES = { }, "Other": {}, } + + Object.keys(SEVERITY_TO_COLOR).map((key) => { $(`#instruction_sev_${key}`).css("background-color", SEVERITY_TO_COLOR[key]) }) @@ -311,8 +313,14 @@ class MQMItemHandler { } this.mqm_submitted = structuredClone(this.mqm) this.mqm_orig = JSON.parse(JSON.parse(this.el.children('#mqm-payload-orig').html())) - this.text_source_orig = decodeEntities(JSON.parse(this.el.children('#text-source-payload').html()).trim()) - this.source_video = JSON.parse(this.el.children('#text-source-payload').html()).trim().startsWith(" { + if (v == "\n") { + return "
" // preserve newlines + } return `${v}` }).join("") + " [MISSING]" this.el_target.html(html_target) @@ -357,8 +367,11 @@ class MQMItemHandler { } // handle character alignment estimation - if (!this.source_video) { + if (!this.source_is_multimodal) { let html_source = this.text_source_orig.split("").map((v, i) => { + if (v == "\n") { + return "
" // preserve newlines + } return `${v}` }).join("") this.el_source.html(html_source) diff --git a/EvalView/templates/EvalView/direct-assessment-document-mqm-esa.html b/EvalView/templates/EvalView/direct-assessment-document-mqm-esa.html index 3de8e146..c9df8bba 100644 --- a/EvalView/templates/EvalView/direct-assessment-document-mqm-esa.html +++ b/EvalView/templates/EvalView/direct-assessment-document-mqm-esa.html @@ -22,7 +22,7 @@ Completed {{docs_completed}}/{{docs_total}} documents, - {{items_completed}}/100 segments + {{items_completed}}/{{items_total}} segments diff --git a/EvalView/views.py b/EvalView/views.py index 40b4e7b5..fbe85cff 100644 --- a/EvalView/views.py +++ b/EvalView/views.py @@ -1133,10 +1133,11 @@ def direct_assessment_document_mqmesa(campaign, current_task, request): ( next_item, items_completed, + items_total, docs_completed, + docs_total, doc_items, doc_items_results, - docs_total, ) = current_task.next_document_for_user_mqmesa(request.user) if not next_item: @@ -1151,11 +1152,15 @@ def direct_assessment_document_mqmesa(campaign, current_task, request): # Send response to the Ajax POST request return JsonResponse(context) - # TODO: hotfix for WMT24 + # TODO: hotfix for WMT24 and WMT25 # Tracking issue: https://github.com/AppraiseDev/Appraise/issues/185 for item in doc_items: - # don't escape HTML video or images - if item.sourceText.strip().startswith(" Date: Sun, 20 Jul 2025 19:01:46 -0700 Subject: [PATCH 22/51] add Maasai language --- Dashboard/models.py | 1 + 1 file changed, 1 insertion(+) diff --git a/Dashboard/models.py b/Dashboard/models.py index 248dc916..b9eb8252 100644 --- a/Dashboard/models.py +++ b/Dashboard/models.py @@ -225,6 +225,7 @@ 'kas': 'Kashmiri (كٲشُر)', 'mni': 'Meitei (ꯃꯩꯇꯩꯂꯣꯟ)', 'sat': 'Santali (ᱥᱟᱱᱛᱟᱲᱤ)', + 'mas': 'Maasai (Ol Maa)', } # All sign language codes From 50c6e5f05365559a66ed6e9b8af7ec140bf2c2ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vil=C3=A9m=20Zouhar?= Date: Mon, 21 Jul 2025 20:13:27 -0700 Subject: [PATCH 23/51] minor styling & instructions --- .../static/EvalView/css/direct-assessment-document-mqm-esa.css | 2 +- EvalView/templates/EvalView/_instructions-esa.html | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/EvalView/static/EvalView/css/direct-assessment-document-mqm-esa.css b/EvalView/static/EvalView/css/direct-assessment-document-mqm-esa.css index f7acdc0b..91742033 100644 --- a/EvalView/static/EvalView/css/direct-assessment-document-mqm-esa.css +++ b/EvalView/static/EvalView/css/direct-assessment-document-mqm-esa.css @@ -77,7 +77,7 @@ display: block; margin-left: auto; margin-right: auto; - width: 45%; + width: 500px; } .tutorial-text { diff --git a/EvalView/templates/EvalView/_instructions-esa.html b/EvalView/templates/EvalView/_instructions-esa.html index 05e1fa0b..ea8b1526 100644 --- a/EvalView/templates/EvalView/_instructions-esa.html +++ b/EvalView/templates/EvalView/_instructions-esa.html @@ -14,7 +14,8 @@
  • Missing content: If something is missing, highlight the word [MISSING] to mark the error.
  • -
  • Tip: Highlight the word or general area of the error---it doesn’t need to be exact. Use separate highlights for different errors.
  • +
  • Tip: Highlight the word or general area of the error (it doesn't need to be exact). Use multiple highlights for different errors.
  • +
  • Tip: Pay particular attention to translation consistency across the whole document.
  • Score the translation: After marking errors, please use the slider and set an overall score based on meaning preservation and general quality:
    • 0: No meaning preserved: most information is lost.
    • From c4fc6822c782a832ac03dfce6a46ace33f5caa0c Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Tue, 22 Jul 2025 18:35:13 -0700 Subject: [PATCH 24/51] increase the max length for campaign names --- ...market_domainname_alter_market_marketid.py | 27 +++++++++++++++++++ EvalData/models/base_models.py | 2 +- 2 files changed, 28 insertions(+), 1 deletion(-) create mode 100644 EvalData/migrations/0055_alter_market_domainname_alter_market_marketid.py diff --git a/EvalData/migrations/0055_alter_market_domainname_alter_market_marketid.py b/EvalData/migrations/0055_alter_market_domainname_alter_market_marketid.py new file mode 100644 index 00000000..f534d783 --- /dev/null +++ b/EvalData/migrations/0055_alter_market_domainname_alter_market_marketid.py @@ -0,0 +1,27 @@ +# Generated by Django 4.2.22 on 2025-07-23 01:33 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("EvalData", "0054_alter_dataassessmentresult_activatedby_and_more"), + ] + + operations = [ + migrations.AlterField( + model_name="market", + name="domainName", + field=models.CharField( + help_text="(max. 50 characters)", + max_length=50, + verbose_name="Domain name", + ), + ), + migrations.AlterField( + model_name="market", + name="marketID", + field=models.CharField(editable=False, max_length=72, unique=True), + ), + ] diff --git a/EvalData/models/base_models.py b/EvalData/models/base_models.py index 7b83ad15..56a3bace 100644 --- a/EvalData/models/base_models.py +++ b/EvalData/models/base_models.py @@ -26,7 +26,7 @@ # TODO: Unclear if these are needed? # from Appraise.settings import STATIC_URL, BASE_CONTEXT -MAX_DOMAINNAME_LENGTH = 20 +MAX_DOMAINNAME_LENGTH = 50 MAX_LANGUAGECODE_LENGTH = 10 MAX_CORPUSNAME_LENGTH = 100 MAX_VERSIONINFO_LENGTH = 20 From 92492a206b2a7f5844d9d519a0249a659c417043 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vil=C3=A9m=20Zouhar?= Date: Thu, 24 Jul 2025 10:22:03 -0700 Subject: [PATCH 25/51] update next document button --- .../templates/EvalView/direct-assessment-document-mqm-esa.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/EvalView/templates/EvalView/direct-assessment-document-mqm-esa.html b/EvalView/templates/EvalView/direct-assessment-document-mqm-esa.html index c9df8bba..8782642b 100644 --- a/EvalView/templates/EvalView/direct-assessment-document-mqm-esa.html +++ b/EvalView/templates/EvalView/direct-assessment-document-mqm-esa.html @@ -143,7 +143,7 @@ id="button-next-doc-fake" title="Please first complete all items in the document (error spans + scores)." > - Continue to next document (unavailable) + Continue to next document (finish all segments first) {% endblock %} \ No newline at end of file From 54ea604d79be2b4a6e4ca77e4fada89379fd713c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vil=C3=A9m=20Zouhar?= Date: Thu, 24 Jul 2025 10:29:56 -0700 Subject: [PATCH 26/51] update ESA slider anchor instructions --- EvalView/templates/EvalView/_instructions-esa.html | 10 +++++----- EvalView/templates/EvalView/_slider-mqm-esa.html | 8 ++++---- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/EvalView/templates/EvalView/_instructions-esa.html b/EvalView/templates/EvalView/_instructions-esa.html index 05e1fa0b..106a1526 100644 --- a/EvalView/templates/EvalView/_instructions-esa.html +++ b/EvalView/templates/EvalView/_instructions-esa.html @@ -14,13 +14,13 @@
  • Missing content: If something is missing, highlight the word [MISSING] to mark the error.
  • -
  • Tip: Highlight the word or general area of the error---it doesn’t need to be exact. Use separate highlights for different errors.
  • +
  • Tip: Highlight the word or general area of the error (it doesn't need to be exact). Use separate highlights for different errors.
  • Score the translation: After marking errors, please use the slider and set an overall score based on meaning preservation and general quality:
    • -
    • 0: No meaning preserved: most information is lost.
    • -
    • 33%: Some meaning preserved: major gaps and narrative issues.
    • -
    • 66%: Most meaning preserved: minor issues with grammar or consistency.
    • -
    • 100%: Perfect: meaning and grammar align completely with the source.
    • +
    • 0: Broken/poor translation.
    • +
    • 33%: Flawed: significant issues
    • +
    • 66%: Good: insignificant issues with grammar, fluency, or consistency
    • +
    • 100%: Perfect: meaning and style aligned completely with the source
    diff --git a/EvalView/templates/EvalView/_slider-mqm-esa.html b/EvalView/templates/EvalView/_slider-mqm-esa.html index e2eecea8..44ecf29b 100644 --- a/EvalView/templates/EvalView/_slider-mqm-esa.html +++ b/EvalView/templates/EvalView/_slider-mqm-esa.html @@ -1,9 +1,9 @@
    - +
    - - - + + +
    0%: No meaning preserved33%: Some meaning preserved66%: Most meaning preserved0%: Broken/poor33%: Flawed (significant issues)66%: Good (insignificant issues) 100%: Perfect
    From 9021b50a085dce78341d32008e6a4613f3358f4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vil=C3=A9m=20Zouhar?= Date: Thu, 24 Jul 2025 10:40:25 -0700 Subject: [PATCH 27/51] fix vertical videos --- .../static/EvalView/css/direct-assessment-document-mqm-esa.css | 1 + 1 file changed, 1 insertion(+) diff --git a/EvalView/static/EvalView/css/direct-assessment-document-mqm-esa.css b/EvalView/static/EvalView/css/direct-assessment-document-mqm-esa.css index f7acdc0b..321c14a7 100644 --- a/EvalView/static/EvalView/css/direct-assessment-document-mqm-esa.css +++ b/EvalView/static/EvalView/css/direct-assessment-document-mqm-esa.css @@ -71,6 +71,7 @@ .source-text > audio, .source-text > video { width: 100%; + max-height: 550px; } .source-text > img { From 31e1a9a16da5b83772712e3ebe91aec38bf78c85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vil=C3=A9m=20Zouhar?= Date: Thu, 24 Jul 2025 10:44:34 -0700 Subject: [PATCH 28/51] show ESA instructions by default --- .../EvalView/js/direct-assessment-document-mqm-esa.js | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/EvalView/static/EvalView/js/direct-assessment-document-mqm-esa.js b/EvalView/static/EvalView/js/direct-assessment-document-mqm-esa.js index 8f2c3844..8cc42c7d 100644 --- a/EvalView/static/EvalView/js/direct-assessment-document-mqm-esa.js +++ b/EvalView/static/EvalView/js/direct-assessment-document-mqm-esa.js @@ -174,8 +174,14 @@ $(document).ready(() => { // show submit button only on MQM and not ESA $(".button-submit").toggle(MQM_TYPE == "MQM") - let instructions_show = localStorage.getItem("appraise-instructions-show") == "true" + let instructions_show = localStorage.getItem("appraise-instructions-show") if (instructions_show == null) instructions_show = true; + else instructions_show = instructions_show == "true"; + console.log( + localStorage.getItem("appraise-instructions-show"), + localStorage.getItem("appraise-instructions-show") == null, + instructions_show, + ) $("#instructions-show").on("click", () => { instructions_show = !instructions_show; From 8268e3078c1826a273ad0ef375d6790a070a51ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vil=C3=A9m=20Zouhar?= Date: Thu, 24 Jul 2025 13:49:19 -0700 Subject: [PATCH 29/51] update ESA slider anchors --- EvalView/templates/EvalView/_slider-mqm-esa.html | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/EvalView/templates/EvalView/_slider-mqm-esa.html b/EvalView/templates/EvalView/_slider-mqm-esa.html index 44ecf29b..363e8a57 100644 --- a/EvalView/templates/EvalView/_slider-mqm-esa.html +++ b/EvalView/templates/EvalView/_slider-mqm-esa.html @@ -2,8 +2,8 @@ - - + +
    0%: Broken/poor33%: Flawed (significant issues)66%: Good (insignificant issues)33%: Flawed66%: Good 100%: Perfect
    From 3a90994ed6018ea0d478c1659709ab17bb781464 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vil=C3=A9m=20Zouhar?= Date: Thu, 24 Jul 2025 16:54:27 -0700 Subject: [PATCH 30/51] add note on LLM usage, resolve #201 --- EvalView/templates/EvalView/_instructions-esa.html | 1 + 1 file changed, 1 insertion(+) diff --git a/EvalView/templates/EvalView/_instructions-esa.html b/EvalView/templates/EvalView/_instructions-esa.html index 2cd34472..38abbe41 100644 --- a/EvalView/templates/EvalView/_instructions-esa.html +++ b/EvalView/templates/EvalView/_instructions-esa.html @@ -24,6 +24,7 @@
  • 100%: Perfect: meaning and style aligned completely with the source
  • +
  • Using external tools for annotations (chatbots, LLMs) is not allowed.
  • \ No newline at end of file From e79e4e9448281ad0466a1e49b343c544354d3562 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vil=C3=A9m=20Zouhar?= Date: Fri, 25 Jul 2025 12:16:59 -0700 Subject: [PATCH 31/51] ESA styling, add language tags on the side, resolve #50 --- .../direct-assessment-document-mqm-esa.css | 37 ++++++++++++++++++- .../js/direct-assessment-document-mqm-esa.js | 5 --- .../direct-assessment-document-mqm-esa.html | 20 ++++++---- 3 files changed, 48 insertions(+), 14 deletions(-) diff --git a/EvalView/static/EvalView/css/direct-assessment-document-mqm-esa.css b/EvalView/static/EvalView/css/direct-assessment-document-mqm-esa.css index bbaf26fb..4339ea75 100644 --- a/EvalView/static/EvalView/css/direct-assessment-document-mqm-esa.css +++ b/EvalView/static/EvalView/css/direct-assessment-document-mqm-esa.css @@ -34,8 +34,23 @@ color: black; } -.quotelike { - border-left: 5px solid #ddd; + +.language_tag_holder { + position: relative; +} + + +.language_tag { + /* transform: rotate(-90deg); */ + transform-origin: top left; + width: 200px; + display: inline-block; + position: absolute; + text-align: right; + left: -210px; + top: 15px; + color: #257; + font-size: small; } .quotelike { @@ -52,6 +67,7 @@ .item-box { margin-bottom: 20px; + border-radius: 4px; } .target-text { @@ -127,6 +143,12 @@ color: black; } +#instructions { + background-color: #d9edf7; + padding: 10px; + border-radius: 4px; +} + .alert_message { position: fixed; top: 25px; @@ -191,4 +213,15 @@ .ui-widget-content { border: none !important; +} + + +/* override defaults */ +.alert-info { + border: none; + color: #257; +} +.navbar-fixed-top { + position: absolute; + top: -2px; } \ No newline at end of file diff --git a/EvalView/static/EvalView/js/direct-assessment-document-mqm-esa.js b/EvalView/static/EvalView/js/direct-assessment-document-mqm-esa.js index 8cc42c7d..4256ca56 100644 --- a/EvalView/static/EvalView/js/direct-assessment-document-mqm-esa.js +++ b/EvalView/static/EvalView/js/direct-assessment-document-mqm-esa.js @@ -177,11 +177,6 @@ $(document).ready(() => { let instructions_show = localStorage.getItem("appraise-instructions-show") if (instructions_show == null) instructions_show = true; else instructions_show = instructions_show == "true"; - console.log( - localStorage.getItem("appraise-instructions-show"), - localStorage.getItem("appraise-instructions-show") == null, - instructions_show, - ) $("#instructions-show").on("click", () => { instructions_show = !instructions_show; diff --git a/EvalView/templates/EvalView/direct-assessment-document-mqm-esa.html b/EvalView/templates/EvalView/direct-assessment-document-mqm-esa.html index 8782642b..b37ee003 100644 --- a/EvalView/templates/EvalView/direct-assessment-document-mqm-esa.html +++ b/EvalView/templates/EvalView/direct-assessment-document-mqm-esa.html @@ -35,14 +35,13 @@ -
    + {% if guidelines %}

    {{ guidelines }}

    {% endif %} @@ -81,11 +80,18 @@
    +
    +
    {{source_language}}
    +
    +
    {{ item.sourceText|safe }}
    - +
    +
    +
    {{target_language}}
    +
    {{item.targetText}}
    From 35bcf1eb8af199df051c448620caf5abea5fbb65 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vil=C3=A9m=20Zouhar?= Date: Fri, 25 Jul 2025 12:45:20 -0700 Subject: [PATCH 32/51] make ESA interface neater by moving icons to the side --- .../direct-assessment-document-mqm-esa.css | 44 ++++++++++++++----- .../js/direct-assessment-document-mqm-esa.js | 15 +++---- .../direct-assessment-document-mqm-esa.html | 23 +++------- 3 files changed, 47 insertions(+), 35 deletions(-) diff --git a/EvalView/static/EvalView/css/direct-assessment-document-mqm-esa.css b/EvalView/static/EvalView/css/direct-assessment-document-mqm-esa.css index 4339ea75..384f7075 100644 --- a/EvalView/static/EvalView/css/direct-assessment-document-mqm-esa.css +++ b/EvalView/static/EvalView/css/direct-assessment-document-mqm-esa.css @@ -35,21 +35,45 @@ } -.language_tag_holder { - position: relative; +.status-indicator { + display: inline-block; + position: absolute; + left: -35px; + top: 10px; + font-size: small; + width: 20px; + margin: 0px; + padding: 0px; +} + +.button-reset { + display: inline-block; + position: absolute; + left: -40px; + top: 40px; + font-size: small; + background-color: transparent !important; + border: none; + width: 20px; + margin: 0px; + padding: 0px; } +.button-submit { + display: block; + margin-left: auto; + margin-right: auto; +} + +.target-box { + position: relative; + margin-bottom: -10px; +} .language_tag { /* transform: rotate(-90deg); */ - transform-origin: top left; - width: 200px; - display: inline-block; - position: absolute; - text-align: right; - left: -210px; - top: 15px; - color: #257; + float: right; + color: #777; font-size: small; } diff --git a/EvalView/static/EvalView/js/direct-assessment-document-mqm-esa.js b/EvalView/static/EvalView/js/direct-assessment-document-mqm-esa.js index 4256ca56..a01cd352 100644 --- a/EvalView/static/EvalView/js/direct-assessment-document-mqm-esa.js +++ b/EvalView/static/EvalView/js/direct-assessment-document-mqm-esa.js @@ -195,9 +195,8 @@ function _all_sentences_scored() { return items_left == 0; } -function _change_item_status_icon(item_box, icon_name, status_text) { +function _change_item_status_icon(item_box, icon_name) { let icon_box = item_box.find('.status-indicator').removeClass('glyphicon-refresh glyphicon-ok glyphicon-flag'); - item_box.find(".status-text").text(status_text) icon_box.addClass(`glyphicon-${icon_name}`) } @@ -211,21 +210,21 @@ function submit_form_ajax(item_box) { dataType: 'json', beforeSend: function () { console.log('Sending AJAX request, item-id=', item_box.data('item-id')); - _change_item_status_icon(item_box, 'refresh', "Uploading"); + _change_item_status_icon(item_box, 'refresh'); }, success: function (data) { console.log(`Success, saved=${data.saved} next_item=${data.item_id}`); if (data.saved) { - _change_item_status_icon(item_box, 'ok', "Completed"); + _change_item_status_icon(item_box, 'ok'); } else { - _change_item_status_icon(item_box, 'none', "Upload failed"); + _change_item_status_icon(item_box, 'warning-sign'); _show_error_box(data.error_msg, 10_000); } }, error: function (x, s, t) { console.log('Error:', x, s, t); - _change_item_status_icon(item_box, 'none', "Upload failed"); + _change_item_status_icon(item_box, 'warning-sign'); _show_error_box( 'An unrecognized error has occured. ' + 'Please reload the page or try again in a moment. 
', @@ -514,10 +513,10 @@ class MQMItemHandler { check_status() { if (this.el.attr("data-item-completed") == "True") { - _change_item_status_icon(this.el, "ok", "Completed") + _change_item_status_icon(this.el, "ok") this.el.find(".button-submit").hide() } else { - _change_item_status_icon(this.el, "flag", "Unfinished") + _change_item_status_icon(this.el, "flag") } } diff --git a/EvalView/templates/EvalView/direct-assessment-document-mqm-esa.html b/EvalView/templates/EvalView/direct-assessment-document-mqm-esa.html index b37ee003..ef500e95 100644 --- a/EvalView/templates/EvalView/direct-assessment-document-mqm-esa.html +++ b/EvalView/templates/EvalView/direct-assessment-document-mqm-esa.html @@ -101,23 +101,12 @@
    {% include 'EvalView/_slider-mqm-esa.html' %}
    - - - - - - - -
    - - - - Item status - - -
    + + + +
    From 3c75e973f2189b0f0b77ece28afadcbbaf754c0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vil=C3=A9m=20Zouhar?= Date: Fri, 25 Jul 2025 19:03:22 -0700 Subject: [PATCH 33/51] create campaign status page for ESA --- Campaign/views.py | 125 ++++++++++++++++++++++++++++++++++------------ 1 file changed, 92 insertions(+), 33 deletions(-) diff --git a/Campaign/views.py b/Campaign/views.py index ce7cd863..6126ef8b 100644 --- a/Campaign/views.py +++ b/Campaign/views.py @@ -23,6 +23,7 @@ from EvalData.models import PairwiseAssessmentResult from EvalData.models import seconds_to_timedelta from EvalData.models import TASK_DEFINITIONS +from EvalData.models.direct_assessment_document import DirectAssessmentDocumentTask # pylint: disable=import-error @@ -51,19 +52,27 @@ def campaign_status(request, campaign_name, sort_key=2): _msg = 'Failure to identify campaign {0}'.format(campaign_name) return HttpResponse(_msg, content_type='text/plain') + try: + campaign_opts = campaign.campaignOptions.lower().split(";") + # may raise KeyError + result_type = RESULT_TYPE_BY_CLASS_NAME[campaign.get_campaign_type()] + except KeyError as exc: + LOGGER.debug( + f'Invalid campaign type {campaign.get_campaign_type()} for campaign {campaign.campaignName}' + ) + LOGGER.error(exc) + return HttpResponse( + 'Invalid campaign type for campaign {0}'.format(campaign.campaignName), + content_type='text/plain', + ) + + # special handling for ESA + if "esa" in campaign_opts: + return campaign_status_esa(campaign) + _out = [] for team in campaign.teams.all(): for user in team.members.all(): - try: - campaign_opts = campaign.campaignOptions.lower().split(";") - # may raise KeyError - result_type = RESULT_TYPE_BY_CLASS_NAME[campaign.get_campaign_type()] - except KeyError as exc: - LOGGER.debug( - f'Invalid campaign type {campaign.get_campaign_type()} for campaign {campaign.campaignName}' - ) - LOGGER.error(exc) - continue _data = result_type.objects.filter( createdBy=user, completed=True, task__campaign=campaign.id @@ -118,29 +127,6 @@ def campaign_status(request, campaign_name, sort_key=2): (x[0], x[1], -len(json.loads(x[2])), x[3], x[4], x[5], x[6]) for x in _data ] - elif "esa" in campaign_opts: - is_mqm_or_esa = True - _data = _data.values_list( - 'start_time', - 'end_time', - 'score', - 'item__itemID', - 'item__targetID', - 'item__itemType', - 'item__id', - 'item__documentID', - ) - # compute time override based on document times - import collections - - _time_pairs = collections.defaultdict(list) - for x in _data: - _time_pairs[x[7] + " ||| " + x[4]].append((x[0], x[1])) - _time_pairs = [ - (min([x[0] for x in doc_v]), max([x[1] for x in doc_v])) - for doc, doc_v in _time_pairs.items() - ] - _data = [(x[0], x[1], x[2], x[3], x[4], x[5], x[6]) for x in _data] else: _data = _data.values_list( 'start_time', @@ -245,6 +231,79 @@ def campaign_status(request, campaign_name, sort_key=2): return HttpResponse(u'\n'.join(_txt), content_type='text/plain') +def campaign_status_esa(campaign) -> str: + import collections + out_str = """ + + + + """ + out_str += "\n" + out_str += "\n" + + for team in campaign.teams.all(): + for user in team.members.all(): + if user.is_staff: + continue + + out_str += "" + _data = DirectAssessmentDocumentResult.objects.filter( + createdBy=user, completed=True, task__campaign=campaign.id + ) + + total_count = None + if _data: + _data_all = DirectAssessmentDocumentTask.objects.filter(campaign=campaign.id) + # brute-force try to find if any task has at least one item annotated by this user + for task in _data_all: 
+ for item in task.items.all(): + item = DirectAssessmentDocumentResult.objects.filter( + item=item, createdBy=user + ).last() + if item: + total_count = task.items.count() + break + if total_count: + break + if total_count is None: + out_str += f"" + out_str += f"" + out_str += "" + out_str += "" + else: + if total_count == len(_data): + out_str += f"" + else: + out_str += f"" + out_str += f"" + first_modified = min([x.start_time for x in _data]) + last_modified = max([x.end_time for x in _data]) + out_str += f"" + out_str += f"" + + times = collections.defaultdict() + for item in _data: + times[(item.item.documentID, item.item.targetID)] = (item.start_time, item.end_time) + annotation_time = sum([b-a for a, b in times.values()]) + annotation_time = f'{int(floor(annotation_time / 3600)):0>2d}h{int(floor((annotation_time % 3600) / 60)):0>2d}m' + + annotation_time_upper = last_modified - first_modified + annotation_time_upper = f'{int(floor(annotation_time_upper / 3600)):0>2d}h{int(floor((annotation_time_upper % 3600) / 60)):0>2d}m' + + out_str += f"" + out_str += "\n" + + out_str += "
    UsernameProgressFirst ModifiedLast ModifiedAnnotation Time
    {user.username} 💤0%{user.username} ✅{user.username} 🛠️{len(_data)}/{total_count} ({len(_data) / total_count:.0%}){str(datetime(1970, 1, 1) + seconds_to_timedelta(first_modified)).split('.')[0]}{str(datetime(1970, 1, 1) + seconds_to_timedelta(last_modified)).split('.')[0]}{annotation_time} - {annotation_time_upper}
    " + return HttpResponse(out_str, content_type='text/html') + + def stat_reliable_testing(_data, campaign_opts, result_type): _annotations = len(set([x[6] for x in _data])) _user_mean = sum([x[2] for x in _data]) / (_annotations or 1) From 55715c817da422d5bcd7e3716e21d8e9b9f16298 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vil=C3=A9m=20Zouhar?= Date: Fri, 25 Jul 2025 19:25:29 -0700 Subject: [PATCH 34/51] update campaign-status styling --- Campaign/views.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/Campaign/views.py b/Campaign/views.py index 6126ef8b..616dce54 100644 --- a/Campaign/views.py +++ b/Campaign/views.py @@ -243,8 +243,12 @@ def campaign_status_esa(campaign) -> str: td, th { padding: 5px; } - + * { + font-family: monospace; + } + \n """ + out_str += f"

    {campaign.campaignName}

    \n" out_str += "\n" out_str += "\n" @@ -252,12 +256,10 @@ def campaign_status_esa(campaign) -> str: for user in team.members.all(): if user.is_staff: continue - out_str += "" _data = DirectAssessmentDocumentResult.objects.filter( createdBy=user, completed=True, task__campaign=campaign.id ) - total_count = None if _data: _data_all = DirectAssessmentDocumentTask.objects.filter(campaign=campaign.id) @@ -274,7 +276,7 @@ def campaign_status_esa(campaign) -> str: break if total_count is None: out_str += f"" - out_str += f"" + out_str += "" out_str += "" out_str += "" else: @@ -285,8 +287,15 @@ def campaign_status_esa(campaign) -> str: out_str += f"" first_modified = min([x.start_time for x in _data]) last_modified = max([x.end_time for x in _data]) - out_str += f"" - out_str += f"" + + first_modified_str = str(datetime(1970, 1, 1) + seconds_to_timedelta(first_modified)).split('.')[0] + last_modified_str = str(datetime(1970, 1, 1) + seconds_to_timedelta(last_modified)).split('.')[0] + # remove seconds + first_modified_str = ":".join(first_modified_str.split(":")[:-1]) + last_modified_str = ":".join(last_modified_str.split(":")[:-1]) + + out_str += f"" + out_str += f"" times = collections.defaultdict() for item in _data: From ed89e7e4d3c1aa351f8ab476bc12cd1936a752e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vil=C3=A9m=20Zouhar?= Date: Fri, 25 Jul 2025 19:43:13 -0700 Subject: [PATCH 35/51] speed-up campaign-status and next document fetch --- Campaign/views.py | 18 +++--------------- EvalData/models/direct_assessment_document.py | 11 +++++++---- 2 files changed, 10 insertions(+), 19 deletions(-) diff --git a/Campaign/views.py b/Campaign/views.py index 616dce54..a37b8db0 100644 --- a/Campaign/views.py +++ b/Campaign/views.py @@ -260,26 +260,14 @@ def campaign_status_esa(campaign) -> str: _data = DirectAssessmentDocumentResult.objects.filter( createdBy=user, completed=True, task__campaign=campaign.id ) - total_count = None - if _data: - _data_all = DirectAssessmentDocumentTask.objects.filter(campaign=campaign.id) - # brute-force try to find if any task has at least one item annotated by this user - for task in _data_all: - for item in task.items.all(): - item = DirectAssessmentDocumentResult.objects.filter( - item=item, createdBy=user - ).last() - if item: - total_count = task.items.count() - break - if total_count: - break - if total_count is None: + if not _data: out_str += f"" out_str += "" out_str += "" out_str += "" else: + task = DirectAssessmentDocumentTask.objects.filter(id=_data[0].task_id).first() + total_count = task.items.count() if total_count == len(_data): out_str += f"" else: diff --git a/EvalData/models/direct_assessment_document.py b/EvalData/models/direct_assessment_document.py index 4499bf6a..2a3a56dd 100644 --- a/EvalData/models/direct_assessment_document.py +++ b/EvalData/models/direct_assessment_document.py @@ -263,13 +263,16 @@ def next_document_for_user_mqmesa(self, user): doc_items_results, """ - # get all items (100) and try to find resul + # get all items and try to find a matching result + # TODO: probably can be optimized better + + items_user = DirectAssessmentDocumentResult.objects.filter( + activated=False, completed=True, createdBy=user + ) all_items = [ ( item, - DirectAssessmentDocumentResult.objects.filter( - item=item, activated=False, completed=True, createdBy=user - ).last(), + items_user.filter(item=item).last(), ) for item in self.items.all().order_by('id') ] From 7e8b9fa2321d12a3a2bf8b721e0be2af4d77257b Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Vil=C3=A9m=20Zouhar?= Date: Fri, 25 Jul 2025 20:03:56 -0700 Subject: [PATCH 36/51] edit style of times --- Campaign/views.py | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/Campaign/views.py b/Campaign/views.py index a37b8db0..e112e0e9 100644 --- a/Campaign/views.py +++ b/Campaign/views.py @@ -250,7 +250,9 @@ def campaign_status_esa(campaign) -> str: """ out_str += f"

    {campaign.campaignName}

    \n" out_str += "
    UsernameProgressFirst ModifiedLast ModifiedAnnotation Time
    {user.username} 💤0%{len(_data)}/{total_count} ({len(_data) / total_count:.0%}){str(datetime(1970, 1, 1) + seconds_to_timedelta(first_modified)).split('.')[0]}{str(datetime(1970, 1, 1) + seconds_to_timedelta(last_modified)).split('.')[0]}{first_modified_str}{last_modified_str}{user.username} 💤{user.username} ✅
    \n" - out_str += "\n" + out_str += "" + "".join( + f"" for x in ["Username", "Progress", "First Modified", "Last Modified", "Time (Last-First)", "Time (Real)"] + ) + "\n" for team in campaign.teams.all(): for user in team.members.all(): @@ -265,6 +267,8 @@ def campaign_status_esa(campaign) -> str: out_str += "" out_str += "" out_str += "" + out_str += "" + out_str += "" else: task = DirectAssessmentDocumentTask.objects.filter(id=_data[0].task_id).first() total_count = task.items.count() @@ -284,17 +288,23 @@ def campaign_status_esa(campaign) -> str: out_str += f"" out_str += f"" + annotation_time_upper = last_modified - first_modified + annotation_time_upper = f'{int(floor(annotation_time_upper / 3600)):0>2d}h {int(floor((annotation_time_upper % 3600) / 60)):0>2d}m' + out_str += f"" - times = collections.defaultdict() + times = collections.defaultdict(list) for item in _data: - times[(item.item.documentID, item.item.targetID)] = (item.start_time, item.end_time) - annotation_time = sum([b-a for a, b in times.values()]) - annotation_time = f'{int(floor(annotation_time / 3600)):0>2d}h{int(floor((annotation_time % 3600) / 60)):0>2d}m' + times[(item.item.documentID, item.item.targetID)].append((item.start_time, item.end_time)) + times = [ + (min([x[0] for x in doc_v]), max([x[1] for x in doc_v])) + for doc, doc_v in times.items() + ] - annotation_time_upper = last_modified - first_modified - annotation_time_upper = f'{int(floor(annotation_time_upper / 3600)):0>2d}h{int(floor((annotation_time_upper % 3600) / 60)):0>2d}m' + annotation_time = sum([b-a for a, b in times]) + annotation_time = f'{int(floor(annotation_time / 3600)):0>2d}h {int(floor((annotation_time % 3600) / 60)):0>2d}m' + + out_str += f"" - out_str += f"" out_str += "\n" out_str += "
    UsernameProgressFirst ModifiedLast ModifiedAnnotation Time
    {x}
    {first_modified_str}{last_modified_str}{annotation_time_upper}{annotation_time}{annotation_time} - {annotation_time_upper}
    " From 11cd25995bccf85bd9e5f01cc205827c968f47e3 Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Sat, 26 Jul 2025 09:45:42 -0700 Subject: [PATCH 37/51] use TaskAgenda to get the task for a user; display 0/xxx even if no annotations made --- Campaign/views.py | 58 ++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 52 insertions(+), 6 deletions(-) diff --git a/Campaign/views.py b/Campaign/views.py index e112e0e9..36c7f927 100644 --- a/Campaign/views.py +++ b/Campaign/views.py @@ -23,6 +23,7 @@ from EvalData.models import PairwiseAssessmentResult from EvalData.models import seconds_to_timedelta from EvalData.models import TASK_DEFINITIONS +from EvalData.models import TaskAgenda from EvalData.models.direct_assessment_document import DirectAssessmentDocumentTask # pylint: disable=import-error @@ -65,7 +66,7 @@ def campaign_status(request, campaign_name, sort_key=2): 'Invalid campaign type for campaign {0}'.format(campaign.campaignName), content_type='text/plain', ) - + # special handling for ESA if "esa" in campaign_opts: return campaign_status_esa(campaign) @@ -237,7 +238,7 @@ def campaign_status_esa(campaign) -> str: