Merged
Commits
23 commits
842fb17
v0.4.4-dev; add eng-bho; Fixes #174
thammegowda Apr 27, 2025
ed8b6a2
Merge branch 'main' into develop
thammegowda May 27, 2025
1997e09
add mtdata-map CLI; add tests
thammegowda Sep 17, 2025
b66871a
mtdata-map: fix: do not edit input line, i.e., don't fix \t
thammegowda Sep 18, 2025
2f857e1
mtdata-map: improve log messages readability
thammegowda Sep 18, 2025
f4471ed
bump versions: drop py3.9, add py3.14, loosen deps version to allow m…
thammegowda Mar 22, 2026
a880011
refactor: replace multiprocessing.Queue with queue.Queue in SubprocMa…
thammegowda Mar 22, 2026
7e50904
enable windows-latest and ubuntu-22.04
thammegowda Mar 22, 2026
3b98771
reduce CI jobs -- remove windows and exclude odd python versions from…
thammegowda Mar 22, 2026
cc53b8a
better pbar: enlighten to rich
thammegowda Mar 22, 2026
da0be80
refactor: replace queue with multiprocessing.Queue in SubprocMapper
thammegowda Mar 22, 2026
146332c
refactor: implement singleton pattern for SENTINEL and improve contro…
thammegowda Mar 29, 2026
92449cc
add new datasets for wmt26
thammegowda Apr 5, 2026
c784098
update wmt25 recipes (from wmt25 website)
thammegowda Apr 5, 2026
8ecb2c3
update OPUS index. add wmt26 recipes
thammegowda Apr 5, 2026
69b3c1a
Merge pull request #179 from thammegowda/tg/new-data
thammegowda Apr 5, 2026
b58c9f9
set version 0.5.0-dev
thammegowda Apr 5, 2026
6c9957f
fix pbar for parallel download
thammegowda Apr 6, 2026
c37e793
refactor logging format for improved readability and consistency
thammegowda Apr 6, 2026
cb83718
refactor progress bar usage for consistency and improved readability
thammegowda Apr 6, 2026
63d9a64
fix pbar args: remove extra arg
thammegowda Apr 6, 2026
b604f77
improve processing speed. use xz and bzip subproc when available
thammegowda Apr 7, 2026
9a03f71
bump version to 0.5.0
thammegowda Apr 13, 2026
32 changes: 20 additions & 12 deletions .github/workflows/python-build-test.yml
@@ -20,33 +20,41 @@ jobs:
strategy:
fail-fast: false
matrix:
os: [ubuntu-latest, macos-latest] # windows-latest
python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
#exclude:
# - os: macos-latest
# python-version: '3.7'
# - os: ubuntu-latest
# python-version: '3.7'
# os x py versions here: https://raw.githubusercontent.com/actions/python-versions/main/versions-manifest.json
os: [ubuntu-22.04, ubuntu-latest, macos-latest]
python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
exclude: #exclude some to reduce number of jobs
- os: ubuntu-latest
python-version: "3.11"
- os: ubuntu-latest
python-version: "3.13"
- os: macos-latest
python-version: "3.11"
- os: macos-latest
python-version: "3.13"
steps:
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: "${{ matrix.python-version }}"
cache: pip
- name: Cache mtdata datasets
uses: actions/cache@v4
with:
path: ~/.mtdata
key: mtdata-cache-${{ matrix.os }}
- name: Install dependencies
run: |
pip install --upgrade pip
pip install setuptools==61.2 flake8
pip install --upgrade pip setuptools
python --version
pip --version
- name: Install module
run: |
pip install .[hf,test]
- name: Test with pytest
run: |
python3 -m pytest
python -m pytest
- name: Test CLI from other dir
run: |
cd iso-langs
python3 -m mtdata -ri list -id -l eng-kan
python -m mtdata -ri list -id -l eng-kan
30 changes: 30 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,35 @@
# Change Log

## 0.5.0 - 20250413

**Breaking Changes:**
* Drop Python 3.9 support; minimum is now Python 3.10
* Replace `enlighten` progress bars with `rich`; logging now uses `RichHandler`
* Dependency versions updated: `rich~=14.0` replaces `enlighten`, `portalocker~=3.2`, `pybtex~=0.25`, `ruamel.yaml~=0.18`
* Removed `setuptools` workaround for pybtex on Python 3.12+

**New Features:**
* `mtdata-map` CLI added for applying subprocess-based transformations to parallel data; registered as console script
* Subprocess-based decompression for `.xz` (via `xz -T0`) and `.bz2` (via `pbzip2`/`lbzip2`/`bzip2`) for faster I/O (see the sketch after this list)
* Generalized `SubprocessCompressor` base class in `pigz.py`; `pigz`, `xz_subprocess`, and `bzip2_subprocess` are now subclasses
* HuggingFace index loader generalized to support arbitrary datasets (no longer hard-coded to `google/wmt24pp` only)
* Write buffering (1 MiB) for subprocess compressors
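
To illustrate the subprocess-based decompression above: the real implementation is the `SubprocessCompressor` base class in `pigz.py`; this is only a minimal sketch of the idea, and everything beyond the tool names (`xz -T0`, `pbzip2`/`lbzip2`/`bzip2`) and the ~1 MiB buffering is illustrative.

```python
import shutil
import subprocess

def open_xz_read(path):
    """Read a .xz file via the xz CLI (-T0 = use all cores), with a pure-Python fallback."""
    if shutil.which('xz'):
        proc = subprocess.Popen(['xz', '-dc', '-T0', str(path)],
                                stdout=subprocess.PIPE, bufsize=2**20)  # ~1 MiB buffer
        return proc.stdout
    import lzma  # fallback when the binary is unavailable
    return lzma.open(path, 'rb')

def open_bz2_read(path):
    """Read a .bz2 file via the first available parallel tool, else plain bzip2, else Python."""
    for tool in ('pbzip2', 'lbzip2', 'bzip2'):
        if shutil.which(tool):
            proc = subprocess.Popen([tool, '-dc', str(path)],
                                    stdout=subprocess.PIPE, bufsize=2**20)
            return proc.stdout
    import bz2
    return bz2.open(path, 'rb')
```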

**Data Updates:**
* OPUS index updated: ~60k new entries (159k → 219k)
* Added WMT26 constrained recipes
* Added new datasets for WMT26
* Added English-Bhojpuri parallel and monolingual corpora (BHLTR); Fixes #174

**Improvements:**
* Progress bars rewritten with `rich`: multi-task support, spinner, rate columns, coordinated logging
* Singleton `_Sentinel` pattern in `SubprocMapper` preserves identity across pickling (see the sketch after this list)
* `SubprocMapper`: improved control message handling and queue draining assertions
* `mtdata-map`: do not modify input lines (no longer replaces `\t` in data)
* Improved log message readability across `mtdata-map` and data pipeline
* Muted third-party loggers (`httpx`, `datasets`, `huggingface_hub`, `fsspec`, `urllib3`) to WARNING level
* CI: add Python 3.14, test on ubuntu-22.04
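
The `_Sentinel` singleton mentioned above presumably follows the standard identity-preserving pattern; here is a minimal sketch (the names `_Sentinel`/`SENTINEL` come from this changelog, the body is illustrative):

```python
import pickle

class _Sentinel:
    """Queue end-marker whose identity survives pickling."""
    _instance = None

    def __new__(cls):
        if cls._instance is None:      # create at most one instance per process
            cls._instance = super().__new__(cls)
        return cls._instance

    def __reduce__(self):
        return (_Sentinel, ())         # unpickling re-enters __new__, returning the singleton

SENTINEL = _Sentinel()
assert pickle.loads(pickle.dumps(SENTINEL)) is SENTINEL  # identity preserved
```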

## 0.4.3 - 20250330
* Add preliminary support for huggingface datasets; currently wmt24++ is the only supported dataset
* Update setup.py -> pyproject.toml; hf datasets is optional dependency
21 changes: 16 additions & 5 deletions mtdata/__init__.py
@@ -4,25 +4,36 @@
# Created: 4/4/20


__version__ = '0.4.3'
__version__ = '0.5.0'
__description__ = 'mtdata is a tool to download datasets for machine translation'
__author__ = 'Thamme Gowda'

import logging as log
from pathlib import Path
import os
import enlighten
from ruamel.yaml import YAML

yaml = YAML()
debug_mode = False
_log_format = '%(asctime)s %(module)s.%(funcName)s:%(lineno)s %(levelname)s:: %(message)s'
log.basicConfig(level=log.INFO, datefmt='%Y-%m-%d %H:%M:%S', format=_log_format)
#_log_format = '%(module)s.%(funcName)s:%(lineno)s %(message)s'
from mtdata.pbar import get_log_handler # noqa: E402
log.basicConfig(level=log.INFO, format='%(message)s', datefmt='%Y%m%d %H:%M:%S',
handlers=[get_log_handler()])

_THIRD_PARTY_LOGGERS = ('httpx', 'datasets', 'huggingface_hub', 'fsspec', 'urllib3')

def set_third_party_log_level(level=log.WARNING):
for name in _THIRD_PARTY_LOGGERS:
log.getLogger(name).setLevel(level)

set_third_party_log_level(log.WARNING)
cache_dir = Path(os.environ.get('MTDATA', '~/.mtdata')).expanduser()
recipes_dir = Path(os.getenv('MTDATA_RECIPES', '.')).resolve()
cached_index_file = cache_dir / f'mtdata.index.{__version__}.pkl'
resource_dir:Path = Path(__file__).parent / 'resource'
pbar_man = enlighten.get_manager()

from mtdata.pbar import pbar_man # noqa: E402


class MTDataException(Exception):
pass
51 changes: 48 additions & 3 deletions mtdata/cache.py
@@ -146,7 +146,7 @@ def get_local_path(self, url, filename=None, fix_missing=True, entry=None):
try:
self.download(url, local, entry=entry)
except:
log.error(f'Error downloading {entry and entry.did}\nURL: {url}\nPath:{local}')
log.error(f'Error downloading {entry and entry.did} | url={url} | path={local}')
raise
return local

@@ -162,6 +162,11 @@ def get_hf_dataset(self, url: str, entry=None):
config = entry.meta.get("config", None)
split = entry.meta.get("split", None)
cache_dir = self.root / 'huggingface' / 'datasets'

if isinstance(config, list):
# Cross-config alignment: load two configs, join by a shared field
return self._get_hf_cross_config(entry)

args = dict(
name=config,
split=split,
@@ -171,8 +176,48 @@
)
log.debug(f"Loading dataset {hf_id} with args: {args}")
ds = load_dataset(hf_id, **args)
if split is None and hasattr(ds, 'keys'):
# load_dataset returns DatasetDict when split=None
keys = list(ds.keys())
assert len(keys) == 1, (f"Multiple splits found in {hf_id}: {keys}."
f" Specify 'split' in the resource file.")
ds = ds[keys[0]]
return ds

def _get_hf_cross_config(self, entry):
"""Load two HF configs and align rows by a join field, yielding combined dicts."""
from datasets import load_dataset
hf_id = entry.meta["orig_id"]
configs = entry.meta["config"]
split = entry.meta.get("split", None)
assert len(configs) == 2, f"Expected 2 configs for cross-config, got {configs}"
join_field = entry.meta.get("join_field", "id")
src_config, tgt_config = configs

cache_dir = self.root / 'huggingface' / 'datasets'
common_args = dict(cache_dir=cache_dir, streaming=False, trust_remote_code=False)
log.debug(f"Loading cross-config: {hf_id} [{src_config}] + [{tgt_config}]")
ds1 = load_dataset(hf_id, name=src_config, split=split, **common_args)
ds2 = load_dataset(hf_id, name=tgt_config, split=split, **common_args)

# Build lookup from second config, keyed by join field
tgt_lookup = {}
text_field = entry.meta.get("text_field", "text")
for row in ds2:
key = row[join_field]
tgt_lookup[key] = row[text_field]

# Yield aligned rows as dicts with config names as keys
class CrossConfigDataset:
"""Iterable wrapper that yields aligned rows from two HF configs."""
def __iter__(self_inner):
for row in ds1:
key = row[join_field]
if key in tgt_lookup:
yield {src_config: row[text_field], tgt_config: tgt_lookup[key]}

return CrossConfigDataset()
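
# Illustration (not part of this diff): a cross-config entry is expected to carry
# meta along these lines -- dataset id and config names are hypothetical, the
# field names come from the code above:
#   meta = {'orig_id': 'some-org/some-dataset',  # HF dataset id
#           'config': ['src_cfg', 'tgt_cfg'],    # exactly two configs to align
#           'split': 'test',
#           'join_field': 'id',                  # shared key (default 'id')
#           'text_field': 'text'}                # text column (default 'text')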

@classmethod
def match_globs(cls, names, globs, meta=''):
result = []
@@ -226,8 +271,8 @@ def download(self, url: str, save_at: Path, timeout=(5, 10), entry=None):
parts[2][:24], '...', parts[-1][-24:], # host ... filename
]
desc = ''.join(desc)
with pbar_man.counter(color='green', total=tot_bytes//2**10, unit='KiB', leave=False, position=2,
min_delta=Defaults.PBAR_REFRESH_INTERVAL, desc=f"{desc}"
with pbar_man.counter(total=tot_bytes//2**10, unit='KiB',
desc=f"{desc}"
) as pbar, open(save_at, 'wb', buffering=2**24) as out:
for chunk in resp.iter_content(chunk_size=buf_size):
out.write(chunk)