Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 19 additions & 15 deletions datafusion/common/src/utils/proxy.rs
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,8 @@ pub trait HashTableAllocExt {
///
/// Returns the bucket where the element was inserted.
/// Note that allocation counts capacity, not size.
/// Panics:
/// Assumes the element is not already present, and may panic if it does
///
/// # Example:
/// ```
Expand All @@ -134,7 +136,7 @@ pub trait HashTableAllocExt {
/// assert_eq!(allocated, 64);
///
/// // insert more values
/// for i in 0..100 {
/// for i in 2..100 {
/// table.insert_accounted(i, hash_fn, &mut allocated);
/// }
/// assert_eq!(allocated, 400);
Expand All @@ -161,22 +163,24 @@ where
) {
let hash = hasher(&x);

// NOTE: `find_entry` does NOT grow!
match self.find_entry(hash, |y| y == &x) {
Ok(_occupied) => {}
Err(_absent) => {
if self.len() == self.capacity() {
// need to request more memory
let bump_elements = self.capacity().max(16);
let bump_size = bump_elements * size_of::<T>();
*accounting = (*accounting).checked_add(bump_size).expect("overflow");
if cfg!(debug_assertions) {
// In debug mode, check that the element is not already present
debug_assert!(
self.find_entry(hash, |y| y == &x).is_err(),
"attempted to insert duplicate element into HashTableAllocExt::insert_accounted"
);
}

self.reserve(bump_elements, &hasher);
}
if self.len() == self.capacity() {
// need to request more memory
let bump_elements = self.capacity().max(16);
let bump_size = bump_elements * size_of::<T>();
*accounting = (*accounting).checked_add(bump_size).expect("overflow");

// still need to insert the element since first try failed
self.entry(hash, |y| y == &x, hasher).insert(x);
}
self.reserve(bump_elements, &hasher);
}

// We assume the element is not already present
self.insert_unique(hash, x, hasher);
}
}
4 changes: 2 additions & 2 deletions datafusion/physical-expr-common/src/binary_map.rs
Original file line number Diff line number Diff line change
Expand Up @@ -389,7 +389,7 @@ where
// is value is already present in the set?
let entry = self.map.find_mut(hash, |header| {
// compare value if hashes match
if header.len != value_len {
if header.hash != hash || header.len != value_len {
return false;
}
// value is stored inline so no need to consult buffer
Expand Down Expand Up @@ -427,7 +427,7 @@ where
// Check if the value is already present in the set
let entry = self.map.find_mut(hash, |header| {
// compare value if hashes match
if header.len != value_len {
if header.hash != hash {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should save some random access if a lot of values share the same length

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we also need to update this place to check header.len

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Dandandan do we need to update this location too with a check for length?

Suggested change
if header.hash != hash {
if header.hash != hash || header.len != value_len {

return false;
}
// Need to compare the bytes in the buffer
Expand Down
5 changes: 2 additions & 3 deletions datafusion/physical-expr-common/src/binary_view_map.rs
Original file line number Diff line number Diff line change
Expand Up @@ -273,11 +273,10 @@ where
let value: &[u8] = value.as_ref();

let entry = self.map.find_mut(hash, |header| {
let v = self.builder.get_value(header.view_idx);

if v.len() != value.len() {
if header.hash != hash {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should save some random access if a lot of values share the same length

return false;
}
let v = self.builder.get_value(header.view_idx);
Copy link
Contributor Author

@Dandandan Dandandan Jan 23, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I saw in profiles this is pretty expensive (also self.builder.append_value and always comparing by bytes - I think we are not really using the potential of binary views here, I opened #19961


v == value
});
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,9 @@ where
let hash = key.hash(state);
let insert = self.map.entry(
hash,
|&(g, _)| unsafe { self.values.get_unchecked(g).is_eq(key) },
|&(g, h)| unsafe {
hash == h && self.values.get_unchecked(g).is_eq(key)
Copy link
Contributor

@alamb alamb Jan 23, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I double checked that Hashbrown says

https://docs.rs/hashbrown/latest/hashbrown/struct.HashTable.html#method.entry

This method will call eq for all entries with the given hash, but may also call it for entries with a different hash. eq should only return true for the desired entry, at which point the search is stopped.

So checking the hash first makes senes to me as it will avoid a memory access

},
|&(_, h)| h,
);

Expand Down