Merged
1 change: 1 addition & 0 deletions Cargo.toml
@@ -4,6 +4,7 @@ members = [
"crates/*",
"crates/bpe/benchmarks",
"crates/bpe/tests",
"crates/hash-sorted-map/benchmarks",
]
resolver = "2"

2 changes: 1 addition & 1 deletion crates/bpe/benchmarks/equivalence.rs
@@ -30,7 +30,7 @@ fn test_compare_dictionary() {
hugging_tokens.remove(added_token);
}
let mut hugging_tokens: Vec<_> = hugging_tokens.into_iter().collect();
hugging_tokens.sort_by(|(_, a), (_, b)| a.cmp(b));
hugging_tokens.sort_by_key(|(_, a)| *a);
let hugging_tokens: Vec<_> = hugging_tokens
.into_iter()
.map(|(token, _)| token.chars().map(char_to_byte).collect())
10 changes: 10 additions & 0 deletions crates/hash-sorted-map/Cargo.toml
@@ -0,0 +1,10 @@
[package]
name = "hash-sorted-map"
authors = ["The blackbird team <support@github.com>"]
version = "0.1.0"
edition = "2021"
description = "A hash map with hash-ordered iteration and linear-time merge, designed for search-index term maps."
repository = "https://github.com/github/rust-gems"
license = "MIT"
keywords = ["hashmap", "sorted", "merge", "simd"]
categories = ["algorithms", "data-structures"]
171 changes: 171 additions & 0 deletions crates/hash-sorted-map/OPTIMIZATIONS.md
@@ -0,0 +1,171 @@
# HashSortedMap vs. Rust Swiss Table (hashbrown): Optimization Analysis

## Executive Summary

`HashSortedMap` is a Swiss-table-inspired hash map that uses **overflow
chaining** (instead of open addressing), **SIMD group scanning** (NEON/SSE2),
a **slot-hint fast path**, and an **optimized growth strategy**. It is generic
over key type, value type, and hash builder.

This document analyzes the design trade-offs versus
[hashbrown](https://github.com/rust-lang/hashbrown) and records the
experimental results that guided the current design.

---

## Architecture Comparison

```
┌──────────────────────────────────────────────────────────────────┐
│ hashbrown Swiss Table │
│ │
│ Single contiguous allocation (SoA): │
│ [Padding] [T_n ... T_1 T_0] [CT_0 CT_1 ... CT_n] [CT_extra] │
│ data control bytes (mirrored) │
│ │
│ • Open addressing, triangular probing │
│ • 16-byte groups (SSE2) or 8-byte groups (NEON/generic) │
│ • EMPTY / DELETED / FULL tag states │
└──────────────────────────────────────────────────────────────────┘

┌──────────────────────────────────────────────────────────────────┐
│ HashSortedMap │
│ │
│ Vec<Group<K,V>> where each Group (AoS): │
│ { ctrl: [u8; 8], keys: [MaybeUninit<K>; 8], │
│ values: [MaybeUninit<V>; 8], overflow: u32 } │
│ │
│ • Overflow chaining (linked groups) │
│ • 8-byte groups with NEON/SSE2/scalar SIMD scan │
│ • EMPTY / FULL tag states only (insertion-only, no deletion) │
│ • Slot-hint fast path │
└──────────────────────────────────────────────────────────────────┘
```

---

## Optimizations Investigated

### 1. SIMD Group Scanning ✅ Implemented

Platform-specific SIMD for control byte matching:
- **aarch64**: NEON `vceq_u8` + `vreinterpret_u64_u8` (8-byte groups)
- **x86_64**: SSE2 `_mm_cmpeq_epi8` + `_mm_movemask_epi8` (16-byte groups)
- **Fallback**: Scalar u64 zero-byte detection trick

**Benchmark result**: ~5% faster than scalar on Apple M-series. The gain is
modest because the slot-hint fast path often skips the group scan entirely.
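The scalar fallback's zero-byte trick can be sketched as follows: XOR the 8-byte control word against a broadcast copy of the tag so that matching bytes become zero, then flag zero bytes with bit arithmetic. This is an illustrative sketch, not the crate's exact code.

```rust
/// Return a bitmask with bit i set iff ctrl[i] == tag (scalar SWAR sketch).
fn match_tag(ctrl: [u8; 8], tag: u8) -> u8 {
    const LO: u64 = 0x0101_0101_0101_0101;
    const HI7: u64 = 0x7F7F_7F7F_7F7F_7F7F;
    let word = u64::from_le_bytes(ctrl);
    // XOR turns each matching byte into 0x00.
    let x = word ^ (u64::from(tag) * LO);
    // Exact zero-byte detection: 0x80 in every byte that is zero.
    let y = (x & HI7) + HI7;
    let mut zeros = !(y | x | HI7);
    // Compress the per-byte high bits into one mask bit per slot.
    let mut mask = 0u8;
    while zeros != 0 {
        mask |= 1 << (zeros.trailing_zeros() / 8);
        zeros &= zeros - 1; // clear lowest set bit
    }
    mask
}

fn main() {
    // Slots 0, 2 and 6 hold tag 5 -> bits 0, 2 and 6 set.
    assert_eq!(match_tag([5, 0xFF, 5, 1, 2, 3, 5, 9], 5), 0b0100_0101);
}
```

A group scan then takes the lowest set bit of the mask to find the first candidate slot.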

### 2. Open Addressing with Triangular Probing ❌ Rejected

This is not really an option for this hash map, since probing into neighbouring
groups would break the hash-ordered layout that efficient sorted iteration
depends on. We also observed no performance improvement over the linked
overflow buffer approach. The biggest benefit of triangular probing is that it
allows a much higher load factor, i.e. lower memory consumption, which is not
our main concern here.

**Benchmark result**: **40% slower** than overflow chaining. With the AoS
layout, each group is ~112 bytes, so probing to the next group jumps over
large memory regions. Overflow chaining with the slot-hint fast path is
faster because most inserts land in the first group.
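For context, triangular probing visits offsets that grow by one each step (0, 1, 3, 6, 10, ...); with a power-of-two table this sequence is guaranteed to visit every index exactly once. A minimal sketch, independent of either implementation:

```rust
/// Triangular probe sequence modulo a power-of-two table length:
/// offsets 0, 1, 3, 6, 10, ... cover every index exactly once.
fn probe_sequence(start: usize, len: usize) -> Vec<usize> {
    assert!(len.is_power_of_two());
    let mut pos = start;
    (0..len)
        .map(|step| {
            let current = pos & (len - 1); // cheap modulo for power of two
            pos += step + 1;               // increment grows by 1 each probe
            current
        })
        .collect()
}

fn main() {
    let mut seq = probe_sequence(3, 8);
    seq.sort();
    assert_eq!(seq, vec![0, 1, 2, 3, 4, 5, 6, 7]); // full coverage
}
```

The coverage guarantee is what makes this attractive for open addressing; the 40% slowdown above comes from those probes jumping across ~112-byte groups.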

### 3. SoA Memory Layout ❌ Rejected

Tested a SoA variant (`SoaHashSortedMap`) with separate control byte and
key/value arrays, combined with triangular probing.

**Benchmark result**: **Slowest variant** — even slower than AoS open
addressing. The two-Vec SoA layout doubles TLB/cache pressure versus
hashbrown's single-allocation layout. Without the single-allocation trick,
SoA is worse than AoS for this use case.

### 4. Capacity Sizing ✅ Implemented

Without correct initial sizing, even pre-sized maps always paid the penalty of a grow operation.

**Fix**: Changed to ~70% max load factor. This was the **single biggest improvement** — HashSortedMap went from 2× slower to matching hashbrown.
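The ~70% sizing rule can be sketched as follows. The 8-slot group size comes from the design above; the power-of-two rounding (so that group indexing can use hash bits) is an assumption for illustration, not necessarily the crate's exact policy, and the hypothetical helper name is mine.

```rust
/// Number of 8-slot groups needed so that `n` entries stay under a ~70%
/// load factor (illustrative sketch of the sizing rule).
fn groups_for_capacity(n: usize) -> usize {
    let slots = n * 10 / 7 + 1;       // ceil-ish n / 0.7 in integer math
    let groups = (slots + 7) / 8;     // ceil-divide by 8 slots per group
    groups.next_power_of_two()        // assumed: power of two for indexing
}

fn main() {
    // 1000 entries -> at least 1429 slots -> 179 groups -> 256 groups.
    assert_eq!(groups_for_capacity(1000), 256);
}
```

Note that the power-of-two rounding can leave the effective load factor well below 70%; the 70% figure is the ceiling before a grow is triggered.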

### 5. Optimized Growth ✅ Implemented

The original `grow()` called the full `insert()` for each element (including
duplicate checking and overflow traversal). hashbrown uses:
- `find_insert_index` (skip duplicate check)
- `ptr::copy_nonoverlapping` (raw memory copy)
- Bulk counter updates

**Fix**: Added `insert_for_grow()` that skips duplicate checking, uses raw
pointer copies, and iterates occupied slots via bitmask.

**Benchmark result**: Growth is now **2× faster** than hashbrown (4.8 µs vs
9.8 µs for 3 resize rounds).
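The "iterates occupied slots via bitmask" step can be sketched with a hypothetical helper (illustrative, not the crate's actual code): each group's occupancy is an 8-bit mask, and clearing the lowest set bit walks the full slots without scanning empties.

```rust
/// Visit every occupied slot index encoded in an 8-bit occupancy mask
/// (bit i set = slot i full), lowest index first.
fn occupied_slots(mut mask: u8) -> Vec<usize> {
    let mut slots = Vec::new();
    while mask != 0 {
        slots.push(mask.trailing_zeros() as usize);
        mask &= mask - 1; // clear the lowest set bit
    }
    slots
}

fn main() {
    assert_eq!(occupied_slots(0b1010_0110), vec![1, 2, 5, 7]);
}
```

During a grow, each index produced this way would be copied straight into the new table without a duplicate check.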

### 6. Branch Prediction Hints ⚠️ Mixed Results

Added `likely()`/`unlikely()` annotations and `#[cold] #[inline(never)]` on
the overflow path.

**Benchmark result**: Helped the scalar version (~2–6% faster) but **hurt the
SIMD version** by pessimizing NEON code generation. Removed from the SIMD
implementation, kept in the scalar version.
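For reference, `likely()`/`unlikely()` intrinsics are nightly-only in Rust; the `#[cold] #[inline(never)]` pattern named above is available on stable and looks like this minimal sketch (types and names are hypothetical):

```rust
/// Keeping the overflow path out of line encourages the compiler to lay
/// out the hot path as straight-line fall-through code.
#[cold]
#[inline(never)]
fn overflow_insert(chain: &mut Vec<u32>, key: u32) {
    chain.push(key);
}

fn insert(primary: &mut Option<u32>, chain: &mut Vec<u32>, key: u32) {
    if primary.is_none() {
        *primary = Some(key); // hot path: direct insert into the group
    } else {
        overflow_insert(chain, key); // cold path: overflow chain
    }
}

fn main() {
    let (mut p, mut c) = (None, Vec::new());
    insert(&mut p, &mut c, 7);
    insert(&mut p, &mut c, 8);
    assert_eq!((p, c), (Some(7), vec![8]));
}
```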

### 7. Slot Hint Fast Path (Unique to HashSortedMap)

HashSortedMap checks a preferred slot before scanning the group:
```rust
let hint = slot_hint(hash); // 3 bits from hash → slot index
if ctrl[hint] == EMPTY { /* direct insert */ }
if ctrl[hint] == tag && keys[hint] == key { /* direct hit */ }
```

hashbrown does **not** have this optimization — it always does a full SIMD
group scan. The performance difference is most likely explained by the
different overflow strategies and load factors rather than the hint itself.

### 8. Overflow Reserve Sizing ✅ Validated

Tested overflow reserves from 0% to 100% of primary groups:

| Reserve | Growth scenario (µs) |
|---------|----------------------|
| m/8 (12.5%, default) | 8.04 |
| m/4 (25%) | 8.33 |
| m/2 (50%) | 8.93 |
| m/1 (100%) | 10.31 |
| 0 (grow immediately) | 6.96 |

**Conclusion**: Smaller reserves are faster — growing early is cheaper than
traversing overflow chains.

### 9. IdentityHasher Fix ✅ Implemented

The original `IdentityHasher` zero-extended u32 to u64, putting zeros in the
top 32 bits. Since hashbrown derives the 7-bit tag from `hash >> 57`, every
entry got the same tag — completely defeating control byte filtering.

**Fix**: Use `folded_multiply` to expand u32 keys to u64 with independent
entropy in both halves. Also changed trigram generation to use
`folded_multiply` instead of murmur3.
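To see why zero extension defeats the tag: the 7-bit tag is taken from the top of the hash (`hash >> 57`, per the text above), and the top 32 bits of a zero-extended u32 are always zero. A small demonstration, reusing the `folded_multiply` definition and constant from the benchmark lib:

```rust
/// Folded multiply: full u64 x u64 -> u128, then XOR the two halves.
fn folded_multiply(x: u64, y: u64) -> u64 {
    let full = (x as u128).wrapping_mul(y as u128);
    (full as u64) ^ ((full >> 64) as u64)
}

fn main() {
    const ARBITRARY0: u64 = 0x243f6a8885a308d3; // constant from the bench lib
    // Zero-extended u32 keys: the 7-bit tag is 0 for every key.
    assert!((1u32..100).all(|k| ((k as u64) >> 57) == 0));
    // After folded_multiply, the top bits carry entropy and tags vary.
    let mixed: Vec<u64> = (1u32..100)
        .map(|k| folded_multiply(k as u64, ARBITRARY0) >> 57)
        .collect();
    assert!(mixed.windows(2).any(|w| w[0] != w[1]));
}
```

With every entry sharing tag 0, the control-byte filter matches every slot and each probe degenerates into full key comparisons.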

---

## Optimizations Not Implemented (and Why)

| Optimization | Reason |
|---------------------------------|------------------------------------------|
| **Tombstone / DELETED support** | Insertion-only map — no deletions needed |
| **In-place rehashing** | No tombstones to reclaim |
| **Control byte mirroring** | Not needed with overflow chaining (no wrap-around) |
| **Custom allocator support** | Out of scope for benchmarking |
| **Over-allocation utilization** | Uses `Vec` (no raw allocator control) |

---

## Summary of Impact

| Change | Effect on insert time |
|----------------------------|------------------------------|
| Capacity sizing fix | **−50%** (biggest win) |
| Optimized growth path | **−10%** on growth scenarios |
| SIMD group scanning | **−5%** |
| Branch hints (scalar only) | **−2–6%** |
| IdentityHasher fix | Enabled fair comparison |

The current HashSortedMap **matches hashbrown+FxHash** on pre-sized inserts,
**beats all hashbrown variants** on overwrites, and has **2× faster growth**.
89 changes: 89 additions & 0 deletions crates/hash-sorted-map/README.md
@@ -0,0 +1,89 @@
# hash-sorted-map

A hash map whose groups are ordered by hash prefix, enabling efficient
sorted-order iteration and linear-time merging of two maps.

## Motivation

In a search index, each document produces a **term map** (term → frequency).
At index time, term maps from many documents must be **merged** into a single
posting list, and the result is **serialized in hash-key order** so that
lookups can use a skip-list approach, leveraging the hash ordering to
efficiently jump to the right region of the serialized data.

A conventional hash map stores entries in arbitrary order, so merging two maps
requires collecting, sorting, and reshuffling all entries — an expensive step
that dominates indexing time for large term maps typical of code search, where
documents contain massive numbers of tokens.

`HashSortedMap` avoids this by organizing its groups by hash prefix.
Iterating through the groups in order yields entries sorted by their hashed
keys, which means:

- **Merging** two maps is a single linear scan (like merge-sort's merge step).
- **Serialization** in hash-key order requires no extra sorting or copying.
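The merge step can be sketched with plain sorted vectors standing in for the map's hash-ordered iterators (an illustrative sketch, not the crate's API), summing frequencies when the same hashed term appears in both maps:

```rust
use std::cmp::Ordering;

/// Merge two (hash, count) sequences sorted by hash, summing counts on
/// equal hashes; a single linear scan, like merge-sort's merge step.
fn merge(a: &[(u64, u32)], b: &[(u64, u32)]) -> Vec<(u64, u32)> {
    let (mut i, mut j) = (0, 0);
    let mut out = Vec::with_capacity(a.len() + b.len());
    while i < a.len() && j < b.len() {
        match a[i].0.cmp(&b[j].0) {
            Ordering::Less => { out.push(a[i]); i += 1; }
            Ordering::Greater => { out.push(b[j]); j += 1; }
            Ordering::Equal => {
                out.push((a[i].0, a[i].1 + b[j].1)); // combine frequencies
                i += 1;
                j += 1;
            }
        }
    }
    out.extend_from_slice(&a[i..]);
    out.extend_from_slice(&b[j..]);
    out
}

fn main() {
    let a = [(1, 2), (5, 1), (9, 4)];
    let b = [(5, 3), (7, 1)];
    assert_eq!(merge(&a, &b), vec![(1, 2), (5, 4), (7, 1), (9, 4)]);
}
```

Because both inputs are already in hash order, the output is too, so it can be serialized directly.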

## Design

`HashSortedMap<K, V, S>` is a Swiss-table-inspired hash map that uses:

- **Overflow chaining** instead of open addressing — groups that fill up link
to overflow groups rather than probing into neighbours.
- **Slot hint** — a preferred slot index derived from the hash, checked before
scanning the group. Gives a direct hit on most inserts at low load.
- **SIMD group scanning** — uses NEON on aarch64, SSE2 on x86\_64, and a
scalar fallback elsewhere to scan 8–16 control bytes in parallel.
- **AoS group layout** — each group stores its control bytes, keys, and values
together, keeping a single insert's data within 1–2 cache lines.
- **Optimized growth** — during resize, elements are re-inserted without
duplicate checking and copied via raw pointers.
- **Generic key/value/hasher** — supports any `K: Hash + Eq`, any
`S: BuildHasher`, and `Borrow<Q>`-based lookups.
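To illustrate how the hash-prefix ordering and the slot hint fit together, here is a sketch of one way a 64-bit hash could be split; the exact bit positions are assumptions for illustration, not the crate's actual layout.

```rust
/// Split a hash into a group index (top bits, so group order follows hash
/// order) and a 3-bit slot hint (sketch; bit choices are illustrative).
fn group_and_hint(hash: u64, num_groups: usize) -> (usize, usize) {
    assert!(num_groups.is_power_of_two() && num_groups > 1);
    let bits = num_groups.trailing_zeros();
    let group = (hash >> (64 - bits)) as usize; // hash prefix => sorted groups
    let hint = (hash & 0b111) as usize;         // low 3 bits => slot 0..8
    (group, hint)
}

fn main() {
    // A larger hash prefix lands in a larger group index, so iterating
    // groups in order yields entries sorted by hash.
    let (g1, _) = group_and_hint(0x1000_0000_0000_0005, 256);
    let (g2, _) = group_and_hint(0xF000_0000_0000_0002, 256);
    assert!(g1 < g2);
}
```

Using the top bits for the group index is what makes in-order group iteration equivalent to hash-sorted iteration.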

## Benchmark results

All benchmarks insert 1000 random trigram hashes (scrambled with
`folded_multiply`) into maps with various configurations. Measured on Apple
M-series (aarch64).

### Insert 1000 trigrams — pre-sized, no growth

| Rank | Map | Time (µs) | vs best |
|------|-----|-----------|---------|
| 🥇 | FoldHashMap | 2.44 | — |
| 🥈 | FxHashMap | 2.61 | +7% |
| 🥉 | hashbrown::HashMap | 2.67 | +9% |
| 4 | **HashSortedMap** | **2.71** | +11% |
| 5 | hashbrown+Identity | 2.74 | +12% |
| 6 | AHashMap | 3.22 | +32% |
| 7 | std::HashMap+FNV | 3.27 | +34% |
| 8 | std::HashMap | 8.49 | +248% |

### Re-insert same keys (all overwrites)

| Map | Time (µs) |
|-----|-----------|
| **HashSortedMap** | **2.36** ✅ |
| hashbrown+Identity | 2.58 |

### Growth from small (`with_capacity(128)`, 3 resize rounds)

| Map | Time (µs) | Growth penalty |
|-----|-----------|----------------|
| **HashSortedMap** | **4.85** | +2.14 |
| hashbrown+Identity | 9.77 | +7.03 |

### Key takeaways

- **HashSortedMap matches the fastest hashbrown configurations** on pre-sized
first-time inserts and is **the fastest for overwrites**.
- **Growth is ~2× faster** than hashbrown thanks to the optimized
`insert_for_grow` path that skips duplicate checking and uses raw copies.
- The remaining gap to FoldHashMap (~11%) comes from foldhash's extremely
efficient hash function that pipelines well with hashbrown's SIMD scan.

## Running

```sh
cargo bench --bench hashmap_insert
```
23 changes: 23 additions & 0 deletions crates/hash-sorted-map/benchmarks/Cargo.toml
@@ -0,0 +1,23 @@
[package]
name = "hash-sorted-map-benchmarks"
edition = "2021"

[lib]
path = "lib.rs"
test = false

[[bench]]
name = "performance"
path = "performance.rs"
harness = false
test = false

[dependencies]
hash-sorted-map = { path = ".." }
criterion = "0.8"
rand = "0.10"
rustc-hash = "2"
ahash = "0.8"
hashbrown = "0.15"
foldhash = "0.1"
fnv = "1"
46 changes: 46 additions & 0 deletions crates/hash-sorted-map/benchmarks/lib.rs
@@ -0,0 +1,46 @@
use std::hash::{BuildHasherDefault, Hasher};

use rand::RngExt;

const ARBITRARY0: u64 = 0x243f6a8885a308d3;

/// Folded multiply: full u64×u64→u128, then XOR the two halves.
#[inline(always)]
pub fn folded_multiply(x: u64, y: u64) -> u64 {
let full = (x as u128).wrapping_mul(y as u128);
(full as u64) ^ ((full >> 64) as u64)
}

/// A hasher that passes through u32 keys without hashing, suitable for
/// keys that are already well-distributed.
#[derive(Default)]
pub struct IdentityHasher(u64);

impl Hasher for IdentityHasher {
fn write(&mut self, _bytes: &[u8]) {
unimplemented!("IdentityHasher only supports write_u32");
}
fn write_u32(&mut self, i: u32) {
self.0 = (i as u64) | ((i as u64) << 32);
}
fn finish(&self) -> u64 {
self.0
}
}

pub type IdentityBuildHasher = BuildHasherDefault<IdentityHasher>;

/// Generate `n` random trigrams as well-distributed u32 hashes.
/// Each trigram is packed into a u32, then scrambled with folded_multiply.
pub fn random_trigram_hashes(n: usize) -> Vec<u32> {
let mut rng = rand::rng();
(0..n)
.map(|_| {
let a = rng.random_range(b'a'..=b'z') as u32;
let b = rng.random_range(b'a'..=b'z') as u32;
let c = rng.random_range(b'a'..=b'z') as u32;
let packed = a | (b << 8) | (c << 16);
folded_multiply(packed as u64, ARBITRARY0) as u32
})
.collect()
}