fix(embedded): race-free len, full alias docstring, faster brute force

polaz · polaz · commit e0cc001e33a4 · 2026-05-23T01:12:06.000+03:00
- hnsw.rs: `__len__` now reads the count from the underlying HnswIndex
  instead of `next_id`. The previous derivation could observe phantom
  IDs under concurrent fit/__len__ — `fit` bumps next_id under its own
  mutex, releases it, then acquires the index lock for the actual
  inserts (with the GIL released around the build). A concurrent
  `__len__` that ran after the bump but before the inserts completed
  would see the bumped counter ahead of the index contents. Locking
  `inner` makes the count reflect committed inserts only.

- hnsw.rs: `__repr__` now reports the same len source plus a `<busy>`
  marker when the index lock is contended (via `try_lock`). Stops a
  debug REPL from blocking on a concurrent build.

- hnsw.rs: docstring for the `metric` constructor argument now lists
  every accepted alias (cosine/angular, euclidean/l2, dot/dot_product/
  ip/inner_product, manhattan/l1) instead of a partial list. The error
  message and the parser were already exhaustive; the docstring just
  needed to catch up.

- tests/unit/test_hnsw.py: the brute-force top-k helper uses
  `np.argpartition` (O(N)) instead of `np.argsort` (O(N log N)) — only
  the SET of nearest k matters for the recall metric, not the order
  inside it.

- tests/unit/test_hnsw.py: the recall test allocates the 10K × 16 float
  matrix directly as float32 via `standard_normal(dtype=np.float32)`
  instead of allocating float64 then `.astype(np.float32)`. This avoids
  the temporary float64 array, which is twice the size of the final matrix.
diff --git a/coordinode-embedded/src/hnsw.rs b/coordinode-embedded/src/hnsw.rs
@@ -64,9 +64,14 @@ impl Hnsw {
     ///
     /// # Arguments
     /// * `dim` — embedding dimension (must match the vectors passed to `fit` / `knn_query`).
-    /// * `metric` — distance metric. One of `cosine` / `angular`, `euclidean` / `l2`,
-    ///   `dot` / `inner_product`, `manhattan` / `l1`. Names mirror ann-benchmarks
-    ///   conventions so existing harnesses pass their `space` argument unchanged.
+    /// * `metric` — distance metric. Accepted aliases (all case-insensitive):
+    ///     - cosine similarity: `cosine`, `angular`
+    ///     - Euclidean (L2):    `euclidean`, `l2`
+    ///     - dot product:       `dot`, `dot_product`, `ip`, `inner_product`
+    ///     - Manhattan (L1):    `manhattan`, `l1`
+    ///
+    ///   Spellings track ann-benchmarks and VectorDBBench so existing
+    ///   harnesses pass their `space` argument unchanged.
     /// * `M` — max connections per element per layer (HNSW spec). Default 16.
     /// * `ef_construction` — candidate list size during build. Default 200.
     /// * `max_elements` — hint to pre-allocate node storage. Default 1_000_000.
@@ -208,23 +213,31 @@ impl Hnsw {
 
     /// Number of vectors indexed.
     fn __len__(&self) -> PyResult<usize> {
-        // `next_id` is monotonically incremented per insert, so it doubles
-        // as the count without us reaching into HnswIndex internals.
-        let next = self
-            .next_id
+        // Read the count from the HnswIndex itself, NOT from `next_id`.
+        // `next_id` is bumped under its own mutex before the inserts happen
+        // under `inner`; with the GIL released around the build, a concurrent
+        // `__len__` call would otherwise observe phantom IDs that haven't
+        // actually landed in the index.  Locking `inner` makes the count
+        // reflect committed inserts only.
+        let index = self
+            .inner
             .lock()
-            .map_err(|e| PyRuntimeError::new_err(format!("next_id lock poisoned: {e}")))?;
-        Ok(*next as usize)
+            .map_err(|e| PyRuntimeError::new_err(format!("index lock poisoned: {e}")))?;
+        Ok(index.len())
     }
 
     fn __repr__(&self) -> String {
-        // `__len__` surfaces a poisoned mutex as RuntimeError; `__repr__` can't
-        // raise (Python expects it to always return a string) so we emit a
-        // visible marker instead of silently reporting len=0.  Hiding a poisoned
-        // lock would mask real concurrency bugs during debugging.
-        let len_repr = match self.next_id.lock() {
-            Ok(g) => g.to_string(),
-            Err(_) => "<poisoned>".to_owned(),
+        // `__len__` surfaces a poisoned mutex as RuntimeError; `__repr__`
+        // can't raise (Python expects it to always return a string), so a
+        // poison is rendered as a visible marker rather than a silent
+        // `len=0` that would mask real concurrency bugs during debugging.
+        // `try_lock` is intentional: `__repr__` is typically invoked from a
+        // REPL or debugger and must not block on a concurrent build that
+        // holds the lock — we'd rather show `<busy>` than hang the REPL.
+        let len_repr = match self.inner.try_lock() {
+            Ok(idx) => idx.len().to_string(),
+            Err(std::sync::TryLockError::WouldBlock) => "<busy>".to_owned(),
+            Err(std::sync::TryLockError::Poisoned(_)) => "<poisoned>".to_owned(),
         };
         format!("Hnsw(dim={}, len={len_repr})", self.dim)
     }
diff --git a/tests/unit/test_hnsw.py b/tests/unit/test_hnsw.py
@@ -14,8 +14,11 @@
 
 
 def _brute_force_topk(X, q, k: int):
+    # argpartition selects the k smallest distances in O(N), vs argsort's
+    # O(N log N). Only the SET of nearest k matters for the recall metric,
+    # not the order inside it. Note: argpartition(dists, k) needs k < len(X).
     dists = ((X - q) ** 2).sum(axis=1)
-    return set(np.argsort(dists)[:k].tolist())
+    return set(np.argpartition(dists, k)[:k].tolist())
 
 
 def test_metric_parsing_and_dim_validation() -> None:
@@ -59,8 +62,10 @@ def test_recall_at_10_geq_0_95() -> None:
     we hold queries out of the training set).
     """
     rng = np.random.default_rng(42)
-    X = rng.standard_normal((10_000, 16)).astype(np.float32)
-    queries = rng.standard_normal((50, 16)).astype(np.float32)
+    # `dtype=` on standard_normal skips the float64-then-astype round-trip,
+    # halving the allocation for this 10K × 16 matrix.
+    X = rng.standard_normal((10_000, 16), dtype=np.float32)
+    queries = rng.standard_normal((50, 16), dtype=np.float32)
 
     idx = ce.Hnsw(dim=16, metric="euclidean", M=16, ef_construction=200)
     idx.fit(X)