29 changes: 19 additions & 10 deletions .github/workflows/python-build-test.yml
@@ -13,27 +13,36 @@ permissions:

env:
PYTHONUTF8: "1"

jobs:
build:
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: [ubuntu-latest, macos-latest] # windows-latest
python-version: ["3.7", "3.8", "3.9", "3.10"]
os: [ubuntu-latest, macos-latest] # windows-latest
python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"]
exclude:
- os: macos-latest
python-version: '3.7'
- os: ubuntu-latest
python-version: '3.7'
# os x py versions here: https://raw.githubusercontent.com/actions/python-versions/main/versions-manifest.json
steps:
- uses: actions/checkout@v3
- name: Set up Python 3.10
uses: actions/setup-python@v3
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: "3.10"
python-version: "${{ matrix.python-version }}"
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install setuptools==61.2 flake8 pytest-cov
pip install --upgrade pip
pip install setuptools==61.2 flake8
python --version
pip --version
- name: Install module
run: |
python3 setup.py install
pip install .[hf,test]
- name: Test with pytest
run: |
python3 -m pytest
1 change: 1 addition & 0 deletions .gitignore
@@ -1,5 +1,6 @@
*.egg-info
*.pytest_cache
mtdata/resource/huggingface-datasets.all.jsonl

# Byte-compiled / optimized / DLL files
/tmp
21 changes: 0 additions & 21 deletions .travis.yml.bak

This file was deleted.

13 changes: 13 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,18 @@
# Change Log

## 0.4.3 - 20250330
* Add preliminary support for huggingface datasets; currently wmt24++ is the only supported dataset
* Update setup.py -> pyproject.toml; hf datasets is an optional dependency
* Add `mtdata index` subcommand; deprecate `mtdata --reindex <cmd>`
* Add a field named `meta` of type dictionary to the Entry class; it stores arbitrary key-vals which may be useful for downloading and parsing datasets
* Support for document id (currently one among the many `meta` fields) in `.meta.jsonl.gz`
* OPUS index updated
* `mtdata score` subcommand added; supports QE scoring via pymarian


## v0.4.2
- minor fixes

## v0.4.1 - 20240425
* Better parallelization: parallel and mono data are scheduled at once (previously it was one after the other)
* `mtdata cache` added. Improves concurrency by supporting multiple recipes
28 changes: 24 additions & 4 deletions README.md
@@ -53,12 +53,12 @@ These are the summary of datasets from various sources (Updated: Feb 2022).

| Source | Dataset Count |
|-------------:|--------------:|
| OPUS | 151,753|
| OPUS | 156,257|
| Flores | 51,714|
| Microsoft | 8,128|
| Leipzig | 5,893|
| Neulab | 4,455|
| Statmt | 1,784|
| Statmt | 1,798|
| Facebook | 1,617|
| AllenAi | 1,611|
| ELRC | 1,575|
@@ -69,18 +69,19 @@ These are the summary of datasets from various sources (Updated: Feb 2022).
| AI4Bharath | 192|
| ParaCrawl | 127|
| Lindat | 56|
| Google | 55|
| UN | 30|
| JoshuaDec | 29|
| StanfordNLP | 15|
| ParIce | 8|
| LangUk | 5|
| KECL | 4|
| Phontron | 4|
| NRC_CA | 4|
| KECL | 3|
| IITB | 3|
| WAT | 3|
| Masakhane | 2|
| **Total** | **231,157** |
| **Total** | **235,731**|


## Usecases
@@ -278,6 +279,25 @@ mtdata list-recipe # see all recipes
mtdata get-recipe -ri <recipe_id> -o <out_dir> # get recipe, recreate dataset
```

## QE Scoring

> Since v0.4.3 (WMT25)

We support scoring parallel segments with any quality-estimation (QE) metric via a subprocess invocation.
The subprocess command must satisfy three assumptions:
1. STDIN-to-STDOUT mapping: it reads `source\ttarget` lines from STDIN and prints one score per line to STDOUT
2. 1:1 mapping: the number of output lines matches the number of input lines
3. It preserves input order

Here is an example with `pymarian`

```bash
pip install pymarian
metric="wmt22-cometkiwi-da"
cmd="pymarian-eval --stdin --fields src mt --workspace -8000 --model wmt22-cometkiwi-da --mini-batch 64"
python -m mtdata score -l eng-isl -o wmt25-eng-isl -c "$cmd" -n "$metric"
```
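Any command that honors this contract can be plugged in via `-c`. As an illustration, here is a hypothetical, self-contained scorer (a toy length-ratio heuristic, not a real QE model, and not part of mtdata) that satisfies all three assumptions:

```python
import sys


def score_line(line: str) -> float:
    # Toy metric: target-to-source character-length ratio, capped at 1.0.
    # A real QE metric (e.g. cometkiwi via pymarian) would go here instead.
    src, tgt = line.rstrip("\n").split("\t", maxsplit=1)
    if not src:
        return 0.0
    return min(len(tgt) / len(src), 1.0)


def score_stream(lines):
    # 1:1 and order-preserving: yields exactly one score per input line
    for line in lines:
        yield f"{score_line(line):.4f}"


if __name__ == "__main__":
    # STDIN -> STDOUT, as required by `mtdata score -c ...`
    sys.stdout.writelines(s + "\n" for s in score_stream(sys.stdin))
```

Saved as, say, `toy_scorer.py` (a hypothetical filename), it could be passed as `-c "python toy_scorer.py"` in place of the pymarian command above.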

## Language Name Standardization
### ISO 639 3
Internally, all language codes are mapped to ISO-639 3 codes.
14 changes: 13 additions & 1 deletion mtdata/__init__.py
@@ -4,7 +4,7 @@
# Created: 4/4/20


__version__ = '0.4.2'
__version__ = '0.4.3'
__description__ = 'mtdata is a tool to download datasets for machine translation'
__author__ = 'Thamme Gowda'

@@ -27,6 +27,18 @@
class MTDataException(Exception):
pass


class MTDataUserError(MTDataException):
"""
This exception is for the cases where printing the whole stack trace is bad UI
and we want to show a user-friendly message. https://github.com/thammegowda/mtdata/issues/162
"""
def __init__(self, msg, exitcode=1, *args):
super().__init__(*args)
self.msg = msg
self.exitcode = exitcode


class Defaults:
FILE_LOCK_TIMEOUT = 2 * 60 * 60 # 2 hours
PBAR_REFRESH_INTERVAL = 1 # seconds
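The point of `MTDataUserError` (per issue #162) is that a top-level entry point can catch it and print only the friendly message, exiting with the stored code instead of dumping a stack trace. A minimal sketch of that pattern — the classes are copied from the diff above, but `run`/`main` here are hypothetical, not mtdata's actual CLI code:

```python
import sys


class MTDataException(Exception):
    pass


class MTDataUserError(MTDataException):
    """User-facing error: show a friendly message instead of a stack trace."""
    def __init__(self, msg, exitcode=1, *args):
        super().__init__(*args)
        self.msg = msg
        self.exitcode = exitcode


def run(args):
    # Hypothetical work function; raises MTDataUserError on bad user input
    if not args:
        raise MTDataUserError("No dataset ID given. See 'mtdata list'.", exitcode=2)
    return 0


def main(args):
    try:
        return run(args)
    except MTDataUserError as e:
        # Friendly one-liner on stderr, no traceback; propagate the exit code
        print(e.msg, file=sys.stderr)
        return e.exitcode
```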
32 changes: 30 additions & 2 deletions mtdata/cache.py
@@ -1,6 +1,6 @@
#!/usr/bin/env python
#
# Author: Thamme Gowda [tg (at) isi (dot) edu]
# Author: Thamme Gowda [tg (at) isi (dot) edu]
# Created: 4/4/20
import zipfile
import tarfile
@@ -45,7 +45,7 @@ def get_entry(self, entry: Entry, fix_missing=True) -> Union[Path, List[Path]]:
else:
assert isinstance(entry.url, str)
local = self.get_local_path(entry.url, filename=entry.filename, fix_missing=fix_missing, entry=entry)
if zipfile.is_zipfile(local) or tarfile.is_tarfile(local):
if isinstance(local, Path) and (zipfile.is_zipfile(local) or tarfile.is_tarfile(local)):
# look inside the archives and get the desired files
local = self.get_local_in_paths(path=local, entry=entry)
return local
@@ -133,6 +133,11 @@ def get_flag_file(self, file: Path):

def get_local_path(self, url, filename=None, fix_missing=True, entry=None):
hostname = urlparse(url).hostname or 'nohost'
if hostname == "huggingface.co":
# HF is special-cased by delegating the task to the huggingface sdk.
# I considered not adding the dependency, but there are many file formats and
# some sharded datasets require custom logic; custom code might not be future-proof
return self.get_hf_dataset(url, entry=entry)
filename = filename or url.split('/')[-1]
assert hostname and filename
mdf5_sum = md5(url.encode('utf-8')).hexdigest()
@@ -145,6 +150,29 @@ def get_local_path(self, url, filename=None, fix_missing=True, entry=None):
raise
return local

def get_hf_dataset(self, url: str, entry=None):
# dataset lib has a lot of transient dependencies, so lazily load it
# and only when needed
try:
from datasets import load_dataset
except ImportError as e:
raise MTDataException(f"huggingface datasets library is required to access {entry.did}, but it is missing. "
f"Run: 'pip install datasets' and try again") from e
hf_id = entry.meta["orig_id"]
config = entry.meta.get("config", None)
split = entry.meta.get("split", None)
cache_dir = self.root / 'huggingface' / 'datasets'
args = dict(
name=config,
split=split,
cache_dir=cache_dir,
streaming=False,
trust_remote_code=False,
)
log.debug(f"Loading dataset {hf_id} with args: {args}")
ds = load_dataset(hf_id, **args)
return ds

@classmethod
def match_globs(cls, names, globs, meta=''):
result = []