Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 14 additions & 10 deletions hamilton/caching/fingerprinting.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,11 +261,15 @@ def hash_pandas_obj(obj, *args, depth: int = 0, **kwargs) -> str:

@hash_value.register(h_databackends.AbstractPolarsDataFrame)
def hash_polars_dataframe(obj, *args, depth: int = 0, **kwargs) -> str:
    """Hash a polars DataFrame from its schema and its per-row hashes.

    The schema (column names and dtypes) is hashed separately from the row
    contents and the two digests are then combined, so DataFrames holding
    identical cell values under different column names or dtypes do not
    collide.

    :param obj: the polars DataFrame to fingerprint.
    :param depth: current recursion depth; the row hashes are dispatched to
        ``hash_sequence`` one level deeper.
    :return: compact string digest covering both schema and rows.

    NOTE(review): polars documents ``hash_rows`` as stable only within a
    single polars version -- confirm this is acceptable if fingerprints are
    persisted across environments.
    """
    # NOTE(review): "," and ":" are not escaped, so a column *name* that
    # contains them can render the same schema string as a different schema.
    # Kept as-is so existing fingerprints stay unchanged.
    schema_str = ",".join(f"{name}:{dtype}" for name, dtype in obj.schema.items())
    schema_hash = hash_bytes(schema_str.encode())

    row_hash = hash_sequence(obj.hash_rows().to_list(), depth=depth + 1)

    # Both partial hashes are strings; fold them into one digest.
    combined = hashlib.md5(schema_hash.encode() + row_hash.encode())
    return _compact_hash(combined.digest())


@hash_value.register(h_databackends.AbstractPolarsColumn)
Expand All @@ -277,11 +281,11 @@ def hash_polars_column(obj, *args, depth: int = 0, **kwargs) -> str:

@hash_value.register(h_databackends.AbstractNumpyArray)
def hash_numpy_array(obj, *args, depth: int = 0, **kwargs) -> str:
    """Hash a numpy array from its shape, dtype and raw data bytes.

    Without the metadata prefix, arrays with the same raw bytes but different
    shapes or dtypes (e.g., shape=(6,) vs shape=(2,3), or float32 vs int32
    with identical bit patterns) would produce identical hashes.

    :param obj: the numpy array to fingerprint.
    :param depth: current recursion depth, forwarded unchanged.
    :return: string digest produced by ``hash_bytes``.

    NOTE(review): higher-level numpy objects that share the same underlying
    data (e.g., masked arrays) may still hash identically -- an area to
    investigate.
    NOTE(review): for object-dtype arrays ``tobytes()`` serializes object
    pointers rather than values, so the digest would not reflect content --
    confirm such arrays never reach this function.
    """
    metadata = f"{obj.shape}:{obj.dtype}".encode()
    # use the same depth because we're simply dispatching to another implementation
    return hash_bytes(metadata + obj.tobytes(), depth=depth)
33 changes: 32 additions & 1 deletion tests/caching/test_fingerprinting.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,37 @@ def test_hash_pandas():

def test_hash_numpy():
    """Pin the fingerprint of a small 2x2 integer array."""
    # The fingerprint covers the dtype name and the raw bytes, and numpy's
    # default integer dtype is platform-dependent, so fix it explicitly.
    array = np.array([[0, 1], [2, 3]], dtype=np.int64)
    expected_hash = "tVIm5kJ7G0GZaaifSEtrOQ=="
    fingerprint = fingerprinting.hash_value(array)
    assert fingerprint == expected_hash


def test_hash_numpy_different_shapes_differ():
    """A flat array and a 2-D array holding the same values must not share a hash."""
    flat = np.array([1, 2, 3, 4, 5, 6])
    matrix = np.array([[1, 2, 3], [4, 5, 6]])

    flat_hash = fingerprinting.hash_value(flat)
    matrix_hash = fingerprinting.hash_value(matrix)

    assert flat_hash != matrix_hash


def test_hash_numpy_different_dtypes_differ():
    """Two arrays sharing one bit pattern under different dtypes must not share a hash."""
    as_float = np.array([1.0], dtype=np.float32)
    # 1065353216 == 0x3F800000, the bit pattern of float32(1.0)
    as_int = np.array([1065353216], dtype=np.int32)

    # Precondition: the raw buffers really are identical.
    assert as_float.tobytes() == as_int.tobytes()

    float_hash = fingerprinting.hash_value(as_float)
    int_hash = fingerprinting.hash_value(as_int)
    assert float_hash != int_hash


def test_hash_polars_different_columns_differ():
    """Same cell values under different column names must not share a hash."""
    polars = pytest.importorskip("polars")

    sales = polars.DataFrame({"region": ["East", "West"], "revenue": [100, 200]})
    students = polars.DataFrame({"student": ["East", "West"], "height_cm": [100, 200]})

    sales_hash = fingerprinting.hash_value(sales)
    students_hash = fingerprinting.hash_value(students)
    assert sales_hash != students_hash


def test_hash_polars_same_schema_same_data_matches():
    """Two separately built but identical DataFrames must share a hash."""
    polars = pytest.importorskip("polars")

    first = polars.DataFrame({"x": [1, 2], "y": [3, 4]})
    second = polars.DataFrame({"x": [1, 2], "y": [3, 4]})

    first_hash = fingerprinting.hash_value(first)
    second_hash = fingerprinting.hash_value(second)
    assert first_hash == second_hash
Loading