feat(embedded): add native Hnsw class for ann-benchmarks integration

polaz · polaz · commit f6b57c842e83 · 2026-05-22T22:32:51.000+03:00
The existing LocalClient routes through Cypher (parser + planner + executor), which carries overhead that's not comparable with in-process HNSW libraries like hnswlib, FAISS-HNSW, ScaNN, or Annoy. To put CoordiNode on the canonical ann-benchmarks.com leaderboard — and to drive the per-commit vector benchmark on docs.coordinode.com — we need a Cypher-bypass path that calls coordinode_vector::hnsw::HnswIndex directly.

This commit:

- Adds `coordinode-vector` and `numpy` as deps on coordinode-embedded.
- Adds a new `Hnsw` PyO3 class with fit / set_ef / knn_query, mirroring the hnswlib BaseANN surface so the ann-benchmarks adapter is a one-line wrapper around it.
- Numpy-native I/O: fit takes `np.ndarray[float32, 2D]`, knn_query returns `np.ndarray[int64, 1D]`. Zero per-row Python-side conversion on the query hot path.
- GIL released around the build and search calls.
- Internal IDs auto-assigned sequentially; fit returns the (start, end) range so callers can map their own labels.
- Build uses per-item `insert` rather than `insert_batch`. The batch path's known recall divergence vs serial (engine parity bar is 0.7 top-10 agreement) is unacceptable for fair library-tier comparisons; a `fit_fast` opt-in can land later if a real workload needs the build-throughput trade.

Bumps the `coordinode-rs` submodule to current main (engine HEAD has `HnswIndex::insert_batch` and `set_ef_search`, which earlier pinned SHAs did not yet have, plus the f763f86 saturation-path back-edge fix).

Tests at tests/unit/test_hnsw.py cover metric parsing, dimension validation, ID-range bookkeeping, and a recall@10 bar at 10K vectors (≥ 0.95 at ef=50, ≥ 0.99 at ef=200).

Closes #69
diff --git a/coordinode-embedded/Cargo.lock b/coordinode-embedded/Cargo.lock
diff --git a/coordinode-embedded/Cargo.toml b/coordinode-embedded/Cargo.toml
@@ -14,8 +14,10 @@ crate-type = ["cdylib"]
 
 [dependencies]
 pyo3 = { version = "0.23", features = ["extension-module", "abi3-py311"] }
+numpy = "0.23"
 coordinode-embed = { path = "../coordinode-rs/crates/coordinode-embed" }
 coordinode-core = { path = "../coordinode-rs/crates/coordinode-core" }
+coordinode-vector = { path = "../coordinode-rs/crates/coordinode-vector" }
 rmpv = "1"
 tempfile = "3"
 
diff --git a/coordinode-embedded/python/coordinode_embedded/__init__.py b/coordinode-embedded/python/coordinode_embedded/__init__.py
@@ -21,6 +21,6 @@
     db.close()
 """
 
-from ._coordinode_embedded import LocalClient
+from ._coordinode_embedded import Hnsw, LocalClient
 
-__all__ = ["LocalClient"]
+__all__ = ["Hnsw", "LocalClient"]
diff --git a/coordinode-embedded/python/coordinode_embedded/_coordinode_embedded.pyi b/coordinode-embedded/python/coordinode_embedded/_coordinode_embedded.pyi
@@ -2,6 +2,69 @@
 
 from typing import Any
 
+import numpy as np
+import numpy.typing as npt
+
+class Hnsw:
+    """In-process HNSW index — fast-path bypass around Cypher.
+
+    Use this when you want library-grade vector search throughput without
+    the Cypher parser/planner cost.  Mirrors the hnswlib / FAISS-HNSW
+    surface used by the ann-benchmarks harness.
+
+    Vector IDs are assigned automatically and sequentially, starting at 0.
+    The search-time ``ef`` starts at 50 until ``set_ef`` is called.
+
+    Args:
+        dim: Embedding dimension.  Must be > 0 and must match the vectors
+             passed to ``fit`` and ``knn_query``.
+        metric: Distance metric, matched case-insensitively — one of
+                ``"cosine"`` / ``"angular"``, ``"euclidean"`` / ``"l2"``,
+                ``"dot"`` / ``"dot_product"`` / ``"ip"`` / ``"inner_product"``,
+                ``"manhattan"`` / ``"l1"``.
+        M: Max connections per element per layer (HNSW spec).  Must be > 0.
+           Default 16.
+        ef_construction: Candidate list size during build. Default 200.
+        max_elements: Hint to pre-allocate node storage. Default 1_000_000.
+
+    Raises:
+        ValueError: If ``dim`` or ``M`` is 0, or ``metric`` is not one of the
+            names listed above.
+
+    Example::
+
+        import numpy as np
+        from coordinode_embedded import Hnsw
+
+        rng = np.random.default_rng(42)
+        X = rng.standard_normal((10_000, 128), dtype=np.float32)
+        q = rng.standard_normal(128, dtype=np.float32)
+
+        idx = Hnsw(dim=128, metric="euclidean", M=16, ef_construction=200)
+        idx.fit(X)
+        idx.set_ef(80)
+        labels = idx.knn_query(q, k=10)   # int64 ndarray, shape (10,)
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        metric: str,
+        M: int = 16,
+        ef_construction: int = 200,
+        max_elements: int = 1_000_000,
+    ) -> None: ...
+    def fit(self, vectors: npt.NDArray[np.float32]) -> tuple[int, int]:
+        """Bulk-insert a 2-D float32 array of shape ``(N, dim)``.
+
+        Returns the contiguous, half-open ``(start, end)`` ID range assigned
+        to this batch: row ``i`` receives ID ``start + i``.  Multiple ``fit``
+        calls extend the index rather than replacing it.
+
+        Raises:
+            ValueError: If the second axis of ``vectors`` differs from the
+                index ``dim``.
+        """
+        ...
+
+    def set_ef(self, ef: int) -> None:
+        """Update runtime ``ef_search``. Higher ef = higher recall, lower QPS.
+
+        The new value applies to subsequent ``knn_query`` calls.
+        """
+        ...
+
+    def knn_query(
+        self, query: npt.NDArray[np.float32], k: int
+    ) -> npt.NDArray[np.int64]:
+        """k-NN query for a single 1-D float32 vector of length ``dim``.
+
+        Returns the IDs of the nearest neighbours as a 1-D int64 array,
+        ordered nearest-first.  The array is shorter than ``k`` when the
+        index holds fewer than ``k`` vectors.
+
+        Raises:
+            ValueError: If ``query`` does not have exactly ``dim`` elements.
+        """
+        ...
+
+    # Number of IDs handed out by ``fit`` so far (i.e. vectors submitted).
+    def __len__(self) -> int: ...
+    # Short summary of the form ``Hnsw(dim=<dim>, len=<count>)``.
+    def __repr__(self) -> str: ...
+
 class LocalClient:
     """In-process CoordiNode database — no server, no Docker required.
 
diff --git a/coordinode-embedded/src/hnsw.rs b/coordinode-embedded/src/hnsw.rs
@@ -0,0 +1,214 @@
+//! Native PyO3 wrapper around [`coordinode_vector::hnsw::HnswIndex`].
+//!
+//! This is the fast-path bypass used by the ann-benchmarks Docker adapter
+//! and any in-process HNSW workload. It avoids Cypher's parser/planner
+//! overhead so the resulting QPS / recall numbers are directly comparable
+//! with library benchmarks like hnswlib, FAISS-HNSW, ScaNN and Annoy.
+//!
+//! For Cypher-flavoured access (`CREATE VECTOR INDEX`, `MATCH … ORDER BY
+//! vector_similarity(...)`) use `LocalClient` instead.
+
+use std::sync::Mutex;
+
+use coordinode_core::graph::types::VectorMetric;
+use coordinode_vector::hnsw::{HnswConfig, HnswIndex};
+use numpy::{PyArray1, PyReadonlyArray1, PyReadonlyArray2};
+use pyo3::exceptions::{PyRuntimeError, PyValueError};
+use pyo3::prelude::*;
+
+/// Translate a user-supplied metric name into the engine's [`VectorMetric`].
+///
+/// Matching is ASCII case-insensitive and accepts the ann-benchmarks style
+/// aliases for each metric.  Any other name yields a Python `ValueError`
+/// whose message echoes the lower-cased input.
+fn parse_metric(s: &str) -> PyResult<VectorMetric> {
+    let name = s.to_ascii_lowercase();
+    let metric = match name.as_str() {
+        "cosine" | "angular" => VectorMetric::Cosine,
+        "euclidean" | "l2" => VectorMetric::L2,
+        "dot" | "dot_product" | "ip" | "inner_product" => VectorMetric::DotProduct,
+        "manhattan" | "l1" => VectorMetric::L1,
+        _ => {
+            return Err(PyValueError::new_err(format!(
+                "unknown metric '{name}' — expected one of: cosine/angular, euclidean/l2, dot, manhattan/l1"
+            )));
+        }
+    };
+    Ok(metric)
+}
+
+/// In-process HNSW index — PyO3 binding around the CoordiNode native engine.
+///
+/// # Example
+///
+/// ```python
+/// import numpy as np
+/// from coordinode_embedded import Hnsw
+///
+/// rng = np.random.default_rng(42)
+/// X = rng.standard_normal((10_000, 128), dtype=np.float32)
+/// q = rng.standard_normal(128, dtype=np.float32)
+///
+/// idx = Hnsw(dim=128, metric="euclidean", M=16, ef_construction=200)
+/// idx.fit(X)
+/// idx.set_ef(80)
+/// labels = idx.knn_query(q, k=10)   # numpy int64 array, shape (10,)
+/// ```
+#[pyclass]
+pub struct Hnsw {
+    /// The engine index.  Every method takes `&self`, so the mutex supplies
+    /// the interior mutability needed by `fit` / `set_ef` and serialises
+    /// them against `knn_query`.
+    inner: Mutex<HnswIndex>,
+    /// Next ID to hand out.  Starts at 0 and is advanced by `fit`; it is also
+    /// what `__len__` and `__repr__` report as the element count.
+    next_id: Mutex<u64>,
+    /// Embedding dimension fixed at construction; `fit` and `knn_query`
+    /// reject inputs whose width differs.
+    dim: u32,
+}
+
+#[pymethods]
+impl Hnsw {
+    /// Build a new, empty HNSW index.
+    ///
+    /// # Arguments
+    /// * `dim` — embedding dimension (must match the vectors passed to `fit` / `knn_query`).
+    /// * `metric` — distance metric. One of `cosine` / `angular`, `euclidean` / `l2`,
+    ///   `dot` / `dot_product` / `ip` / `inner_product`, `manhattan` / `l1`
+    ///   (case-insensitive). Names mirror ann-benchmarks conventions so existing
+    ///   harnesses pass their `space` argument unchanged.
+    /// * `M` — max connections per element per layer (HNSW spec). Default 16.
+    /// * `ef_construction` — candidate list size during build. Default 200.
+    /// * `max_elements` — hint to pre-allocate node storage. Default 1_000_000.
+    ///
+    /// # Errors
+    /// Raises `ValueError` if `dim` or `M` is 0, if `metric` is not a known
+    /// name, or if `M` is so large that `2 * M` does not fit in `usize`.
+    #[new]
+    #[pyo3(signature = (dim, metric, M=16, ef_construction=200, max_elements=1_000_000))]
+    // `M` is the conventional HNSW parameter name and is part of the Python
+    // keyword API, so it cannot be renamed to snake_case.
+    #[allow(non_snake_case)]
+    fn new(
+        dim: u32,
+        metric: &str,
+        M: usize,
+        ef_construction: usize,
+        max_elements: u32,
+    ) -> PyResult<Self> {
+        if dim == 0 {
+            return Err(PyValueError::new_err("dim must be > 0"));
+        }
+        if M == 0 {
+            return Err(PyValueError::new_err("M must be > 0"));
+        }
+        let metric = parse_metric(metric)?;
+        // The layer-0 connection cap is 2·M.  Use a checked multiply so an
+        // absurd `M` surfaces as a ValueError instead of panicking (debug) or
+        // silently wrapping to a tiny cap (release).
+        let m_max0 = M
+            .checked_mul(2)
+            .ok_or_else(|| PyValueError::new_err("M is too large"))?;
+        let config = HnswConfig {
+            m: M,
+            m_max0,
+            ef_construction,
+            // Initial search-time ef; callers tune it afterwards via `set_ef`.
+            ef_search: 50,
+            metric,
+            max_dimensions: dim,
+            max_elements,
+            ..HnswConfig::default()
+        };
+        Ok(Self {
+            inner: Mutex::new(HnswIndex::new(config)),
+            next_id: Mutex::new(0),
+            dim,
+        })
+    }
+
+    /// Bulk-insert vectors. Accepts a 2-D float32 numpy array of shape `(N, dim)`.
+    ///
+    /// Each row gets an auto-assigned sequential ID starting from the next free
+    /// ID (so multiple `fit` calls extend the index instead of replacing it).
+    /// Returns the contiguous half-open range `[first_id, last_id+1)` as a
+    /// `(start, end)` tuple so callers can map their own labels onto our
+    /// internal IDs.  An empty batch returns `(next_id, next_id)` and leaves
+    /// the index untouched.
+    ///
+    /// # Errors
+    /// Raises `ValueError` if the array's second axis differs from the index
+    /// dimension, and `RuntimeError` if an internal lock is poisoned.
+    fn fit(&self, py: Python<'_>, vectors: PyReadonlyArray2<f32>) -> PyResult<(u64, u64)> {
+        let array = vectors.as_array();
+        let width = array.shape()[1];
+        // Checked conversion instead of `width as u32`: a truncating cast would
+        // let a width of `dim + 2^32` wrap around and pass this check.
+        if u32::try_from(width).ok() != Some(self.dim) {
+            return Err(PyValueError::new_err(format!(
+                "vector dimension mismatch: index dim={}, input dim={width}",
+                self.dim
+            )));
+        }
+
+        // Materialise the (id, vec) batch under the GIL, then release the GIL
+        // for the build.  IDs are reserved up front, while the counter lock is
+        // held, so overlapping `fit` calls always receive disjoint ranges.
+        let mut next = self
+            .next_id
+            .lock()
+            .map_err(|e| PyRuntimeError::new_err(format!("next_id lock poisoned: {e}")))?;
+        let start_id = *next;
+        let batch: Vec<(u64, Vec<f32>)> = (start_id..)
+            .zip(array.outer_iter().map(|row| row.to_vec()))
+            .collect();
+        let end_id = match batch.last() {
+            Some((last_id, _)) => last_id + 1,
+            // Nothing to insert: report an empty range at the current
+            // position rather than a misleading hard-coded (0, 0).
+            None => return Ok((start_id, start_id)),
+        };
+        *next = end_id;
+        drop(next);
+
+        // Use per-item `insert` rather than `insert_batch`: the batch path
+        // trades within-batch plan staleness for ~5-8× build throughput, but
+        // the resulting recall divergence (engine parity bar is 0.7 top-10
+        // agreement, not 1.0) is unacceptable for ann-benchmarks comparisons
+        // against serial-equivalent libraries like hnswlib.  We can expose a
+        // `fit_fast` opt-in later if a real workload needs the
+        // build-throughput trade.
+        py.allow_threads(|| -> PyResult<()> {
+            let mut index = self
+                .inner
+                .lock()
+                .map_err(|e| PyRuntimeError::new_err(format!("index lock poisoned: {e}")))?;
+            for (id, vec) in batch {
+                index.insert(id, vec);
+            }
+            Ok(())
+        })?;
+        Ok((start_id, end_id))
+    }
+
+    /// Update runtime `ef_search`. Larger ef = higher recall, lower QPS.
+    ///
+    /// The new value is used by subsequent `knn_query` calls.
+    ///
+    /// # Errors
+    /// Raises `RuntimeError` if the index lock is poisoned.
+    fn set_ef(&self, py: Python<'_>, ef: usize) -> PyResult<()> {
+        // Wait for the index lock with the GIL released, as `fit` and
+        // `knn_query` do.  Otherwise a long-running build on another thread
+        // would leave this call blocked on the mutex while still holding the
+        // GIL, stalling every Python thread until the build finishes.
+        py.allow_threads(|| -> PyResult<()> {
+            let mut index = self
+                .inner
+                .lock()
+                .map_err(|e| PyRuntimeError::new_err(format!("index lock poisoned: {e}")))?;
+            index.set_ef_search(ef);
+            Ok(())
+        })
+    }
+
+    /// k-NN query. Returns a 1-D int64 numpy array of length `k` with the IDs
+    /// of the nearest neighbours, ordered nearest-first. If the index has
+    /// fewer than `k` elements, the result is shorter accordingly.
+    ///
+    /// # Errors
+    /// Raises `ValueError` if `query` does not have exactly `dim` elements,
+    /// and `RuntimeError` if the index lock is poisoned.
+    fn knn_query<'py>(
+        &self,
+        py: Python<'py>,
+        query: PyReadonlyArray1<f32>,
+        k: usize,
+    ) -> PyResult<Bound<'py, PyArray1<i64>>> {
+        let q_view = query.as_array();
+        // Checked conversion instead of `len() as u32`: a truncating cast
+        // would let a length of `dim + 2^32` wrap around and pass this check.
+        if u32::try_from(q_view.len()).ok() != Some(self.dim) {
+            return Err(PyValueError::new_err(format!(
+                "query dimension mismatch: index dim={}, query dim={}",
+                self.dim,
+                q_view.len()
+            )));
+        }
+        // Copy the query out of numpy-owned memory so the search can run
+        // with the GIL released.
+        let q: Vec<f32> = q_view.to_vec();
+        let labels = py.allow_threads(|| -> PyResult<Vec<i64>> {
+            let index = self
+                .inner
+                .lock()
+                .map_err(|e| PyRuntimeError::new_err(format!("index lock poisoned: {e}")))?;
+            Ok(index
+                .search(&q, k)
+                .into_iter()
+                // IDs are handed out sequentially from 0 by `fit`, so they
+                // stay far below `i64::MAX` and this cast does not wrap.
+                .map(|r| r.id as i64)
+                .collect())
+        })?;
+        Ok(PyArray1::from_vec(py, labels))
+    }
+
+    /// Number of vectors submitted to the index so far.
+    ///
+    /// Raises `RuntimeError` if the ID-counter lock is poisoned.
+    fn __len__(&self) -> PyResult<usize> {
+        // The ID counter starts at 0 and advances by one per row handed to
+        // `fit`, so its current value is the element count — no need to
+        // reach into HnswIndex internals.
+        self.next_id
+            .lock()
+            .map(|counter| *counter as usize)
+            .map_err(|e| PyRuntimeError::new_err(format!("next_id lock poisoned: {e}")))
+    }
+
+    /// Short summary of the form `Hnsw(dim=<dim>, len=<count>)`.
+    fn __repr__(&self) -> String {
+        // Best effort: `repr()` never raises, so a poisoned counter lock is
+        // reported as a length of 0 rather than as an error.
+        let len = match self.next_id.lock() {
+            Ok(counter) => *counter,
+            Err(_) => 0,
+        };
+        format!("Hnsw(dim={}, len={})", self.dim, len)
+    }
+}
diff --git a/coordinode-embedded/src/lib.rs b/coordinode-embedded/src/lib.rs
@@ -1,3 +1,5 @@
+mod hnsw;
+
 /// CoordiNode embedded Python bindings.
 ///
 /// Exposes `LocalClient` — a `CoordinodeClient`-compatible interface that runs
@@ -313,5 +315,6 @@ impl LocalClient {
+// Extension-module entry point: registers each class that the Python package
+// re-exports (`LocalClient` and the native `Hnsw` index) on the module `m`.
 #[pymodule]
 fn _coordinode_embedded(m: &Bound<'_, PyModule>) -> PyResult<()> {
     m.add_class::<LocalClient>()?;
+    m.add_class::<hnsw::Hnsw>()?;
     Ok(())
 }
diff --git a/coordinode-rs b/coordinode-rs
@@ -1 +1 @@
-Subproject commit e0694e583a4ccb7e42fd29bfff89b51ce9964e72
+Subproject commit 8d4037c7564ffe787d763eabfadb1c1ed3da7a6b
diff --git a/tests/unit/test_hnsw.py b/tests/unit/test_hnsw.py