esm-tools · siligam · May 13, 2026 · May 13, 2026
diff --git a/environment.yaml b/environment.yaml
@@ -20,3 +20,4 @@ dependencies:
   - pip:
     - imohash
     - tqdm
+    - xxhash
diff --git a/ptool/analyse.py b/ptool/analyse.py
@@ -1,10 +1,12 @@
-import os
 import itertools
-import pandas as pd
-import numpy as np
-import humanize
+import os
 from collections import defaultdict
 
+import click
+import humanize
+import numpy as np
+import pandas as pd
+
 __all__ = [
     "read_csv",
     "compare",
@@ -27,10 +29,11 @@ def read_csv(filename, ignore=None, drop_duplicates=False):
     df["rparent"] = df.rpath.apply(os.path.dirname)
     for name, dtype in df.dtypes.items():
         if dtype == "object":
-            df[name] = df[name].astype("str[pyarrow]")
+            df[name] = df[name].astype("string[pyarrow]")
     if ignore:
         df = df[~df.rparent.str.contains(ignore)]
         df = df[~df.fname.str.contains(ignore)]
+    df["checksum_type"] = df.checksum.str.split(":").str[0]
     df = df.sort_values(by=["checksum", "mtime"])
     dups = df[
         df.duplicated(subset=["checksum", "fname"]).values
@@ -74,7 +77,18 @@ def directory_map(m):
     return (m[["rparent_left", "rparent_right"]]).drop_duplicates()
 
 
+def _assert_compatible_checksums(left, right):
+    """Ensure both snapshots were produced with the same checksum algorithm.
+
+    Checksums from different algorithms can never match, so comparing such
+    snapshots would report every file as changed.  The ``checksum_type``
+    column of *left* and *right* must therefore hold the same set of values.
+    Every row is inspected, not just the first one, and an empty snapshot is
+    accepted because it holds no checksum that could conflict.
+
+    Raises:
+        click.UsageError: if the checksum types of the two snapshots differ.
+    """
+    left_types = sorted(set(left.checksum_type.dropna()))
+    right_types = sorted(set(right.checksum_type.dropna()))
+    # Nothing to verify when either side has no rows; indexing the first row
+    # of an empty frame would raise an unrelated IndexError instead.
+    if not left_types or not right_types:
+        return
+    if left_types != right_types:
+        lt = ", ".join(left_types)
+        rt = ", ".join(right_types)
+        raise click.UsageError(
+            f"Checksum type mismatch: left CSV uses '{lt}' but right CSV uses '{rt}'. "
+            "Re-generate both snapshots with the same --checksum-type before comparing."
+        )
+
+
 def compare(left, right, relabel=False, threshold=0.1):
+    _assert_compatible_checksums(left, right)
     by_hash = merge(left, right)
     by_name = merge(left, right, on="fname")
     by_hash["flag"] = ""

diff --git a/ptool/checksums.py b/ptool/checksums.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 
 import fnmatch
+import hashlib
 import os
 import re
 import sys
@@ -12,6 +13,8 @@
 from imohash import hashfile
 from tqdm.contrib.concurrent import process_map
 
+# Sample size, in bytes, handed to imohash's ``hashfile`` by ``_imohash``.
+IMOHASH_SAMPLE_SIZE = 64 * 1024  # upgraded from 16 KB to reduce false-negative risk
+
 not_hidden_files_or_dirs = re.compile(r"^[^.]").match
 
 
@@ -117,21 +120,52 @@ def result(self):
         return self.value
 
 
-def hasher(filename):
-    "Calucates imohash for a given file"
-    return f"imohash:{hashfile(filename, hexdigest=True)}"
+def _imohash(filename):
+    """Return the imohash hex digest of *filename*, prefixed with ``imohash-64k:``."""
+    digest = hashfile(filename, sample_size=IMOHASH_SAMPLE_SIZE, hexdigest=True)
+    return "imohash-64k:" + digest
+
+
+def _md5(filename):
+    """Return the MD5 hex digest of the whole file, prefixed with ``md5:``."""
+    chunk_size = 8 * 1024 * 1024  # read in 8 MiB pieces to bound memory use
+    digest = hashlib.md5()
+    with open(filename, "rb") as stream:
+        while True:
+            block = stream.read(chunk_size)
+            if not block:
+                break
+            digest.update(block)
+    return "md5:" + digest.hexdigest()
+
+
+def _xxhash(filename):
+    """Return the ``xxh3_64`` hex digest of the whole file, prefixed with ``xxhash:``."""
+    import xxhash
+
+    chunk_size = 8 * 1024 * 1024  # read in 8 MiB pieces to bound memory use
+    state = xxhash.xxh3_64()
+    with open(filename, "rb") as stream:
+        while True:
+            block = stream.read(chunk_size)
+            if not block:
+                break
+            state.update(block)
+    return "xxhash:" + state.hexdigest()
+
+
+# Registry mapping each supported checksum type name to the function that
+# computes it.  Each function takes a filename and returns "<type>:<hexdigest>".
+HASHERS = {
+    "imohash-64k": _imohash,
+    "md5": _md5,
+    "xxhash": _xxhash,
+}
 
 
-def stats(fpath, stat=os.stat):
-    "Generates record with imohash and file stats information"
-    try:
-        checksum = hasher(fpath)
-        st = stat(fpath)
-        record = f"{checksum},{st.st_size},{st.st_mtime},{fpath}"
-        record = Results(value=record)
-    except Exception as e:
-        record = Results(exc=f"{str(e)}")
-    return record
+def _stats_record(checksum_type, fpath, stat=os.stat):
+    """Build the ``checksum,size,mtime,path`` record for a single file.
+
+    Defined at module level rather than as a closure so that it can be
+    pickled and sent to the worker processes used by ``process_map``.
+    Any exception is captured in the returned ``Results`` instead of being
+    raised, so one unreadable file does not abort the whole run.
+    """
+    try:
+        checksum = HASHERS[checksum_type](fpath)
+        st = stat(fpath)
+        return Results(value=f"{checksum},{st.st_size},{st.st_mtime},{fpath}")
+    except Exception as e:
+        return Results(exc=f"{str(e)}")
+
+
+def make_stats(checksum_type):
+    """Return a ``stats(fpath, stat=os.stat)`` callable for *checksum_type*.
+
+    The callable is a ``functools.partial`` over a module-level function.
+    A nested function cannot be pickled, and process pools must pickle the
+    callable they are given, so a closure would fail when passed to
+    ``process_map``.
+
+    Raises:
+        KeyError: if *checksum_type* is not a key of ``HASHERS``.
+    """
+    import functools
+
+    # Fail fast on an unknown type instead of recording an error per file.
+    if checksum_type not in HASHERS:
+        raise KeyError(checksum_type)
+    return functools.partial(_stats_record, checksum_type)
+
+
+# default stats function for backwards-compatible use
+stats = make_stats("imohash-64k")
 
 
 def scanner(path, ignore=None, ignore_dirs=None, drop_hidden_files=True):
@@ -178,8 +212,8 @@ def get_files(path, ignore=None, ignore_dirs=None, drop_hidden_files=True):
     return list(files_iter)
 
 
-def main(path, outfile, ignore=None, ignore_dirs=None, drop_hidden_files=True):
-    "Calculates hashs of all the files in parallel"
+def main(path, outfile, ignore=None, ignore_dirs=None, drop_hidden_files=True, checksum_type="imohash-64k"):
+    "Calculates hashes of all the files in parallel"
     echo("Gathering files...")
     with timethis("getting files"):
         if os.path.isdir(path):
@@ -194,11 +228,12 @@ def main(path, outfile, ignore=None, ignore_dirs=None, drop_hidden_files=True):
     nfiles = len(files)
     echo(f"nfiles: {nfiles}")
     results = ["checksum,fsize,mtime,fpath"]
-    echo("Calculating hashes...")
+    echo(f"Calculating hashes ({checksum_type})...")
     errors = []
+    stats_fn = make_stats(checksum_type)
     with timethis("calculating hashes"):
         futures = process_map(
-            stats, files, chunksize=10, max_workers=os.cpu_count(), unit="files"
+            stats_fn, files, chunksize=10, max_workers=os.cpu_count(), unit="files"
         )
         for item in futures:
             if item.has_error():
@@ -230,11 +265,20 @@ def main(path, outfile, ignore=None, ignore_dirs=None, drop_hidden_files=True):
 @click.option(
     "-o", "--outfile", type=click.File("w"), default="-", help="output filename"
 )
+@click.option(
+    "--checksum-type",
+    type=click.Choice(list(HASHERS)),
+    default="imohash-64k",
+    show_default=True,
+    help="checksum algorithm to use. imohash-64k is fast (samples 3×64KB) but "
+         "may miss changes in unsampled regions of large files. xxhash and md5 "
+         "read the full file and are collision-free but slower on large files.",
+)
 @click.argument("path")
-def cli(path, outfile, ignore, ignore_dirs, drop_hidden_files):
+def cli(path, outfile, ignore, ignore_dirs, drop_hidden_files, checksum_type):
     """path to file or folder.
 
-    Calculates imohash checksum of file(s) at the given path.
+    Calculates checksum of file(s) at the given path.
     Results are presented as csv.
     """
     path = os.path.expanduser(path)
@@ -244,6 +288,7 @@ def cli(path, outfile, ignore, ignore_dirs, drop_hidden_files):
         ignore=ignore,
         ignore_dirs=ignore_dirs,
         drop_hidden_files=drop_hidden_files,
+        checksum_type=checksum_type,
     )
 
 

diff --git a/ptool/cli.py b/ptool/cli.py
@@ -285,19 +285,28 @@ def prepare_rsync(outfile, ignore, Flag, threshold, lefthost, righthost, left, r
 @click.option(
     "-o", "--outfile", type=click.File("w"), default="-", help="output filename"
 )
+@click.option(
+    "--checksum-type",
+    type=click.Choice(["imohash-64k", "md5", "xxhash"]),
+    default="imohash-64k",
+    show_default=True,
+    help="checksum algorithm. imohash-64k is fast but samples only 3×64KB of "
+         "large files. xxhash and md5 read files fully and are collision-free "
+         "but slower. Both CSVs being compared must use the same algorithm.",
+)
 @click.argument("path")
-def checksums(path, outfile, ignore, ignore_dirs, drop_hidden_files):
-    """Calculates imohash checksum of file(s) at the given path.
+def checksums(path, outfile, ignore, ignore_dirs, drop_hidden_files, checksum_type):
+    """Calculates checksum of file(s) at the given path.
     Results are presented as csv.
 
     `--ignore` and `--ignore-dirs` support *wildcards* in filtering down the
     matches.  If no *wildcards* are provided, then it performs a literal
     match. For multiple patterns, use comma separation.
     """
-    from . import checksums
+    from . import checksums as cs
 
     path = os.path.expanduser(path)
-    checksums.main(path, outfile, ignore, ignore_dirs, drop_hidden_files)
+    cs.main(path, outfile, ignore, ignore_dirs, drop_hidden_files, checksum_type)
 
 
 if __name__ == "__main__":

diff --git a/setup.py b/setup.py
@@ -23,7 +23,11 @@
         "pyarrow",
         "imohash",
         "tqdm",
+        "xxhash",
     ],
+    extras_require={
+        "dev": ["pytest"],
+    },
     entry_points="""
         [console_scripts]
         ptool=ptool.cli:cli

diff --git a/tests/__init__.py b/tests/__init__.py
diff --git a/tests/test_checksum_types.py b/tests/test_checksum_types.py
@@ -0,0 +1,124 @@
+import pytest
+
+from ptool.checksums import make_stats, HASHERS, IMOHASH_SAMPLE_SIZE
+
+
+def _write(path, content=b"\x00" * 1024):
+    """Write *content* (1 KiB of zero bytes by default) to *path* and return *path*."""
+    with open(path, "wb") as fh:
+        fh.write(content)
+    return path
+
+
+class TestHasherPrefixes:
+    """The record produced by each hasher starts with that hasher's type name."""
+
+    @staticmethod
+    def _record_for(tmp_path, checksum_type):
+        """Hash a fresh 1 KiB file and return its record, failing on hasher errors."""
+        target = _write(tmp_path / "f.bin")
+        outcome = make_stats(checksum_type)(str(target))
+        assert not outcome.has_error()
+        return outcome.value
+
+    def test_imohash_prefix(self, tmp_path):
+        record = self._record_for(tmp_path, "imohash-64k")
+        assert record.startswith("imohash-64k:")
+
+    def test_md5_prefix(self, tmp_path):
+        record = self._record_for(tmp_path, "md5")
+        assert record.startswith("md5:")
+
+    def test_xxhash_prefix(self, tmp_path):
+        record = self._record_for(tmp_path, "xxhash")
+        assert record.startswith("xxhash:")
+
+
+class TestHasherCorrectness:
+    """Full-file hashers (md5, xxhash) must distinguish files that imohash misses."""
+
+    @staticmethod
+    def _checksum(checksum_type, path):
+        """Return only the checksum field of the record produced for *path*."""
+        return make_stats(checksum_type)(str(path)).value.split(",")[0]
+
+    def _make_false_negative_pair(self, tmp_path):
+        """Two large files identical in imohash sample windows but different in between."""
+        size = 512 * 1024
+        # Flip one byte 1 KiB past the first sample window: 65 KB with the
+        # 64 KB sample size, i.e. after the first window (0–64 KB) and before
+        # the middle one (256–320 KB).
+        offset = IMOHASH_SAMPLE_SIZE + 1024
+        # Fail loudly if a future sample size moves the byte out of that gap.
+        assert offset < size // 2
+        base = bytearray(size)
+        variant = bytearray(base)
+        variant[offset] = 0xFF
+        file_a = tmp_path / "a.bin"
+        file_b = tmp_path / "b.bin"
+        file_a.write_bytes(bytes(base))
+        file_b.write_bytes(bytes(variant))
+        return file_a, file_b
+
+    def test_imohash_misses_unsampled_difference(self, tmp_path):
+        file_a, file_b = self._make_false_negative_pair(tmp_path)
+        hash_a = self._checksum("imohash-64k", file_a)
+        hash_b = self._checksum("imohash-64k", file_b)
+        assert hash_a == hash_b
+
+    def test_md5_catches_unsampled_difference(self, tmp_path):
+        file_a, file_b = self._make_false_negative_pair(tmp_path)
+        hash_a = self._checksum("md5", file_a)
+        hash_b = self._checksum("md5", file_b)
+        assert hash_a != hash_b
+
+    def test_xxhash_catches_unsampled_difference(self, tmp_path):
+        file_a, file_b = self._make_false_negative_pair(tmp_path)
+        hash_a = self._checksum("xxhash", file_a)
+        hash_b = self._checksum("xxhash", file_b)
+        assert hash_a != hash_b
+
+    def test_identical_files_agree_across_all_types(self, tmp_path):
+        """All hashers should agree that truly identical files have the same hash."""
+        content = b"\xAB" * (512 * 1024)
+        file_a = tmp_path / "a.bin"
+        file_b = tmp_path / "b.bin"
+        file_a.write_bytes(content)
+        file_b.write_bytes(content)
+        for checksum_type in HASHERS:
+            hash_a = self._checksum(checksum_type, file_a)
+            hash_b = self._checksum(checksum_type, file_b)
+            assert hash_a == hash_b, f"{checksum_type} disagreed on identical files"
+
+
+class TestChecksumTypeMismatchGuard:
+    """compare() must reject CSVs generated with different checksum types."""
+
+    def _make_csv(self, tmp_path, checksum_type, filename="snapshot.csv"):
+        """Hash a 1 KiB zero-filled file and write a one-record CSV; return its path."""
+        data_file = tmp_path / "data.bin"
+        data_file.write_bytes(b"\x00" * 1024)
+        record = make_stats(checksum_type)(str(data_file)).value
+        csv_path = tmp_path / filename
+        csv_path.write_text("checksum,fsize,mtime,fpath\n" + record + "\n")
+        return str(csv_path)
+
+    def _load_snapshots(self, tmp_path, left_type, right_type):
+        """Create left/right snapshot CSVs in separate directories and load both."""
+        from ptool.analyse import read_csv
+
+        frames = []
+        for side, checksum_type in (("left", left_type), ("right", right_type)):
+            side_dir = tmp_path / side
+            side_dir.mkdir()
+            frame, _ = read_csv(self._make_csv(side_dir, checksum_type))
+            frames.append(frame)
+        return frames
+
+    def test_mismatched_types_raise_error(self, tmp_path):
+        import click
+        from ptool.analyse import compare
+
+        left, right = self._load_snapshots(tmp_path, "imohash-64k", "md5")
+
+        with pytest.raises(click.UsageError, match="Checksum type mismatch"):
+            compare(left, right)
+
+    def test_matching_types_do_not_raise(self, tmp_path):
+        from ptool.analyse import compare
+
+        left, right = self._load_snapshots(tmp_path, "xxhash", "xxhash")
+
+        # should not raise
+        compare(left, right)
-Original file line number
+Diff line change
@@ Expand Up / @@ -20,3 +20,4 @@ dependencies: @@
       - pip:
         - imohash
         - tqdm
+        - xxhash