Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions environment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,4 @@ dependencies:
- pip:
- imohash
- tqdm
- xxhash
24 changes: 19 additions & 5 deletions ptool/analyse.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
import os
import itertools
import pandas as pd
import numpy as np
import humanize
import os
from collections import defaultdict

import click
import humanize
import numpy as np
import pandas as pd

__all__ = [
"read_csv",
"compare",
Expand All @@ -27,10 +29,11 @@ def read_csv(filename, ignore=None, drop_duplicates=False):
df["rparent"] = df.rpath.apply(os.path.dirname)
for name, dtype in df.dtypes.items():
if dtype == "object":
df[name] = df[name].astype("str[pyarrow]")
df[name] = df[name].astype("string[pyarrow]")
if ignore:
df = df[~df.rparent.str.contains(ignore)]
df = df[~df.fname.str.contains(ignore)]
df["checksum_type"] = df.checksum.str.split(":").str[0]
df = df.sort_values(by=["checksum", "mtime"])
dups = df[
df.duplicated(subset=["checksum", "fname"]).values
Expand Down Expand Up @@ -74,7 +77,18 @@ def directory_map(m):
return (m[["rparent_left", "rparent_right"]]).drop_duplicates()


def _assert_compatible_checksums(left, right):
lt = left.checksum_type.iloc[0]
rt = right.checksum_type.iloc[0]
if lt != rt:
raise click.UsageError(
f"Checksum type mismatch: left CSV uses '{lt}' but right CSV uses '{rt}'. "
"Re-generate both snapshots with the same --checksum-type before comparing."
)


def compare(left, right, relabel=False, threshold=0.1):
_assert_compatible_checksums(left, right)
by_hash = merge(left, right)
by_name = merge(left, right, on="fname")
by_hash["flag"] = ""
Expand Down
83 changes: 64 additions & 19 deletions ptool/checksums.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#!/usr/bin/env python

import fnmatch
import hashlib
import os
import re
import sys
Expand All @@ -12,6 +13,8 @@
from imohash import hashfile
from tqdm.contrib.concurrent import process_map

IMOHASH_SAMPLE_SIZE = 64 * 1024 # upgraded from 16 KB to reduce false-negative risk

not_hidden_files_or_dirs = re.compile(r"^[^.]").match


Expand Down Expand Up @@ -117,21 +120,52 @@ def result(self):
return self.value


def hasher(filename):
"Calucates imohash for a given file"
return f"imohash:{hashfile(filename, hexdigest=True)}"
def _imohash(filename):
return f"imohash-64k:{hashfile(filename, sample_size=IMOHASH_SAMPLE_SIZE, hexdigest=True)}"


def _md5(filename):
h = hashlib.md5()
with open(filename, "rb") as f:
for chunk in iter(lambda: f.read(8 * 1024 * 1024), b""):
h.update(chunk)
return f"md5:{h.hexdigest()}"


def _xxhash(filename):
import xxhash
h = xxhash.xxh3_64()
with open(filename, "rb") as f:
for chunk in iter(lambda: f.read(8 * 1024 * 1024), b""):
h.update(chunk)
return f"xxhash:{h.hexdigest()}"


HASHERS = {
"imohash-64k": _imohash,
"md5": _md5,
"xxhash": _xxhash,
}


def stats(fpath, stat=os.stat):
"Generates record with imohash and file stats information"
try:
checksum = hasher(fpath)
st = stat(fpath)
record = f"{checksum},{st.st_size},{st.st_mtime},{fpath}"
record = Results(value=record)
except Exception as e:
record = Results(exc=f"{str(e)}")
return record
def make_stats(checksum_type):
"Returns a stats function bound to the given checksum type"
hasher = HASHERS[checksum_type]

def stats(fpath, stat=os.stat):
try:
checksum = hasher(fpath)
st = stat(fpath)
record = f"{checksum},{st.st_size},{st.st_mtime},{fpath}"
return Results(value=record)
except Exception as e:
return Results(exc=f"{str(e)}")

return stats


# default stats function for backwards-compatible use
stats = make_stats("imohash-64k")


def scanner(path, ignore=None, ignore_dirs=None, drop_hidden_files=True):
Expand Down Expand Up @@ -178,8 +212,8 @@ def get_files(path, ignore=None, ignore_dirs=None, drop_hidden_files=True):
return list(files_iter)


def main(path, outfile, ignore=None, ignore_dirs=None, drop_hidden_files=True):
"Calculates hashs of all the files in parallel"
def main(path, outfile, ignore=None, ignore_dirs=None, drop_hidden_files=True, checksum_type="imohash-64k"):
"Calculates hashes of all the files in parallel"
echo("Gathering files...")
with timethis("getting files"):
if os.path.isdir(path):
Expand All @@ -194,11 +228,12 @@ def main(path, outfile, ignore=None, ignore_dirs=None, drop_hidden_files=True):
nfiles = len(files)
echo(f"nfiles: {nfiles}")
results = ["checksum,fsize,mtime,fpath"]
echo("Calculating hashes...")
echo(f"Calculating hashes ({checksum_type})...")
errors = []
stats_fn = make_stats(checksum_type)
with timethis("calculating hashes"):
futures = process_map(
stats, files, chunksize=10, max_workers=os.cpu_count(), unit="files"
stats_fn, files, chunksize=10, max_workers=os.cpu_count(), unit="files"
)
for item in futures:
if item.has_error():
Expand Down Expand Up @@ -230,11 +265,20 @@ def main(path, outfile, ignore=None, ignore_dirs=None, drop_hidden_files=True):
@click.option(
"-o", "--outfile", type=click.File("w"), default="-", help="output filename"
)
@click.option(
"--checksum-type",
type=click.Choice(list(HASHERS)),
default="imohash-64k",
show_default=True,
help="checksum algorithm to use. imohash-64k is fast (samples 3×64KB) but "
"may miss changes in unsampled regions of large files. xxhash and md5 "
"read the full file and are collision-free but slower on large files.",
)
@click.argument("path")
def cli(path, outfile, ignore, ignore_dirs, drop_hidden_files):
def cli(path, outfile, ignore, ignore_dirs, drop_hidden_files, checksum_type):
"""path to file or folder.

Calculates imohash checksum of file(s) at the given path.
Calculates checksum of file(s) at the given path.
Results are presented as csv.
"""
path = os.path.expanduser(path)
Expand All @@ -244,6 +288,7 @@ def cli(path, outfile, ignore, ignore_dirs, drop_hidden_files):
ignore=ignore,
ignore_dirs=ignore_dirs,
drop_hidden_files=drop_hidden_files,
checksum_type=checksum_type,
)


Expand Down
17 changes: 13 additions & 4 deletions ptool/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,19 +285,28 @@ def prepare_rsync(outfile, ignore, Flag, threshold, lefthost, righthost, left, r
@click.option(
"-o", "--outfile", type=click.File("w"), default="-", help="output filename"
)
@click.option(
"--checksum-type",
type=click.Choice(["imohash-64k", "md5", "xxhash"]),
default="imohash-64k",
show_default=True,
help="checksum algorithm. imohash-64k is fast but samples only 3×64KB of "
"large files. xxhash and md5 read files fully and are collision-free "
"but slower. Both CSVs being compared must use the same algorithm.",
)
@click.argument("path")
def checksums(path, outfile, ignore, ignore_dirs, drop_hidden_files):
"""Calculates imohash checksum of file(s) at the given path.
def checksums(path, outfile, ignore, ignore_dirs, drop_hidden_files, checksum_type):
"""Calculates checksum of file(s) at the given path.
Results are presented as csv.

`--ignore` and `--ignore-dirs` support *wildcards* in filtering down the
matches. If no *wildcards* are provided, then it performs a literal
match. For multiple patterns, use comma separation.
"""
from . import checksums
from . import checksums as cs

path = os.path.expanduser(path)
checksums.main(path, outfile, ignore, ignore_dirs, drop_hidden_files)
cs.main(path, outfile, ignore, ignore_dirs, drop_hidden_files, checksum_type)


if __name__ == "__main__":
Expand Down
4 changes: 4 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,11 @@
"pyarrow",
"imohash",
"tqdm",
"xxhash",
],
extras_require={
"dev": ["pytest"],
},
entry_points="""
[console_scripts]
ptool=ptool.cli:cli
Expand Down
Empty file added tests/__init__.py
Empty file.
124 changes: 124 additions & 0 deletions tests/test_checksum_types.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
import pytest

from ptool.checksums import make_stats, HASHERS, IMOHASH_SAMPLE_SIZE


def _write(path, content=b"\x00" * 1024):
path.write_bytes(content)
return path


class TestHasherPrefixes:
"""Each hasher must embed its type name as a prefix in the output."""

def test_imohash_prefix(self, tmp_path):
f = _write(tmp_path / "f.bin")
result = make_stats("imohash-64k")(str(f))
assert not result.has_error()
assert result.value.startswith("imohash-64k:")

def test_md5_prefix(self, tmp_path):
f = _write(tmp_path / "f.bin")
result = make_stats("md5")(str(f))
assert not result.has_error()
assert result.value.startswith("md5:")

def test_xxhash_prefix(self, tmp_path):
f = _write(tmp_path / "f.bin")
result = make_stats("xxhash")(str(f))
assert not result.has_error()
assert result.value.startswith("xxhash:")


class TestHasherCorrectness:
"""Full-file hashers (md5, xxhash) must distinguish files that imohash misses."""

def _make_false_negative_pair(self, tmp_path):
"""Two large files identical in imohash sample windows but different in between."""
size = 512 * 1024
base = bytearray(size)
variant = bytearray(base)
# Byte at 17 KB — safely between first (0–64 KB) and middle (256–320 KB) windows
variant[IMOHASH_SAMPLE_SIZE + 1024] = 0xFF
file_a = tmp_path / "a.bin"
file_b = tmp_path / "b.bin"
file_a.write_bytes(bytes(base))
file_b.write_bytes(bytes(variant))
return file_a, file_b

def test_imohash_misses_unsampled_difference(self, tmp_path):
file_a, file_b = self._make_false_negative_pair(tmp_path)
hash_a = make_stats("imohash-64k")(str(file_a)).value.split(",")[0]
hash_b = make_stats("imohash-64k")(str(file_b)).value.split(",")[0]
assert hash_a == hash_b

def test_md5_catches_unsampled_difference(self, tmp_path):
file_a, file_b = self._make_false_negative_pair(tmp_path)
hash_a = make_stats("md5")(str(file_a)).value.split(",")[0]
hash_b = make_stats("md5")(str(file_b)).value.split(",")[0]
assert hash_a != hash_b

def test_xxhash_catches_unsampled_difference(self, tmp_path):
file_a, file_b = self._make_false_negative_pair(tmp_path)
hash_a = make_stats("xxhash")(str(file_a)).value.split(",")[0]
hash_b = make_stats("xxhash")(str(file_b)).value.split(",")[0]
assert hash_a != hash_b

def test_identical_files_agree_across_all_types(self, tmp_path):
"""All hashers should agree that truly identical files have the same hash."""
content = b"\xAB" * (512 * 1024)
file_a = tmp_path / "a.bin"
file_b = tmp_path / "b.bin"
file_a.write_bytes(content)
file_b.write_bytes(content)
for checksum_type in HASHERS:
hash_a = make_stats(checksum_type)(str(file_a)).value.split(",")[0]
hash_b = make_stats(checksum_type)(str(file_b)).value.split(",")[0]
assert hash_a == hash_b, f"{checksum_type} disagreed on identical files"


class TestChecksumTypeMismatchGuard:
"""compare() must reject CSVs generated with different checksum types."""

def _make_csv(self, tmp_path, checksum_type, filename="snapshot.csv"):
f = tmp_path / "data.bin"
f.write_bytes(b"\x00" * 1024)
record = make_stats(checksum_type)(str(f)).value
csv_path = tmp_path / filename
csv_path.write_text(f"checksum,fsize,mtime,fpath\n{record}\n")
return str(csv_path)

def test_mismatched_types_raise_error(self, tmp_path):
import click
from ptool.analyse import read_csv, compare

left_dir = tmp_path / "left"
right_dir = tmp_path / "right"
left_dir.mkdir()
right_dir.mkdir()

left_csv = self._make_csv(left_dir, "imohash-64k")
right_csv = self._make_csv(right_dir, "md5")

left, _ = read_csv(left_csv)
right, _ = read_csv(right_csv)

with pytest.raises(click.UsageError, match="Checksum type mismatch"):
compare(left, right)

def test_matching_types_do_not_raise(self, tmp_path):
from ptool.analyse import read_csv, compare

left_dir = tmp_path / "left"
right_dir = tmp_path / "right"
left_dir.mkdir()
right_dir.mkdir()

left_csv = self._make_csv(left_dir, "xxhash")
right_csv = self._make_csv(right_dir, "xxhash")

left, _ = read_csv(left_csv)
right, _ = read_csv(right_csv)

# should not raise
compare(left, right)
Loading