Skip to content

Commit d722bdc

Browse files
timsaucerclaude
andcommitted
fix: drop Python callable from UDF Hash impls
`Python::attach(|py| self.func.bind(py).hash().unwrap_or(0))` silently collapsed every unhashable closure (e.g. a lambda with an unhashable captured value) to the same hash bucket — the worst case for hashmap lookups, and arrived at via `unwrap_or` rather than a deliberate design choice. Hash on the identifying header (name + signature + return field / state fields) only. The Rust `Hash` contract requires `a == b ⇒ hash(a) == hash(b)`, not the converse, so a coarser hash is still sound — `PartialEq` (which still calls Python `eq` on the callable) disambiguates two UDFs that share a header but differ in their callable. Applied symmetrically to scalar, aggregate, and window UDFs. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 0b8672e commit d722bdc

3 files changed

Lines changed: 15 additions & 14 deletions

File tree

crates/core/src/udaf.rs

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -249,16 +249,15 @@ impl PartialEq for PythonFunctionAggregateUDF {
249249

250250
impl std::hash::Hash for PythonFunctionAggregateUDF {
251251
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
252+
// See `PythonFunctionScalarUDF`'s `Hash` impl for the
253+
// rationale: hash the identifying header only and let
254+
// `PartialEq` disambiguate callables.
252255
self.name.hash(state);
253256
self.signature.hash(state);
254257
self.return_type.hash(state);
255258
for f in &self.state_fields {
256259
f.hash(state);
257260
}
258-
Python::attach(|py| {
259-
let py_hash = self.accumulator.bind(py).hash().unwrap_or(0);
260-
state.write_isize(py_hash);
261-
});
262261
}
263262
}
264263

crates/core/src/udf.rs

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -106,15 +106,18 @@ impl PartialEq for PythonFunctionScalarUDF {
106106

107107
impl Hash for PythonFunctionScalarUDF {
108108
fn hash<H: Hasher>(&self, state: &mut H) {
109+
// Hash only the identifying header (name + signature + return
110+
// field). Skipping `func` is intentional: the Rust `Hash`
111+
// contract requires `a == b ⇒ hash(a) == hash(b)`, not the
112+
// converse, so a coarser hash is sound — `PartialEq` still
113+
// disambiguates two UDFs with the same header but distinct
114+
// callables. Falling back to a sentinel on `py_hash` failure
115+
// (as a prior revision did) silently mapped every unhashable
116+
// closure to the same bucket; that is the worst case for a
117+
// hashmap and is what this rewrite avoids.
109118
self.name.hash(state);
110119
self.signature.hash(state);
111120
self.return_field.hash(state);
112-
113-
Python::attach(|py| {
114-
let py_hash = self.func.bind(py).hash().unwrap_or(0); // Handle unhashable objects
115-
116-
state.write_isize(py_hash);
117-
});
118121
}
119122
}
120123

crates/core/src/udwf.rs

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -340,13 +340,12 @@ impl PartialEq for PythonFunctionWindowUDF {
340340

341341
impl std::hash::Hash for PythonFunctionWindowUDF {
342342
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
343+
// See `PythonFunctionScalarUDF`'s `Hash` impl for the
344+
// rationale: hash the identifying header only and let
345+
// `PartialEq` disambiguate evaluators.
343346
self.name.hash(state);
344347
self.signature.hash(state);
345348
self.return_type.hash(state);
346-
Python::attach(|py| {
347-
let py_hash = self.evaluator.bind(py).hash().unwrap_or(0);
348-
state.write_isize(py_hash);
349-
});
350349
}
351350
}
352351

0 commit comments

Comments
 (0)