Skip to content

Commit f6b57c8

Browse files
committed
feat(embedded): add native Hnsw class for ann-benchmarks integration
The existing LocalClient routes through Cypher (parser + planner + executor), which carries overhead that's not comparable with in-process HNSW libraries like hnswlib, FAISS-HNSW, ScaNN, or Annoy. To put CoordiNode on the canonical ann-benchmarks.com leaderboard — and to drive the per-commit vector benchmark on docs.coordinode.com — we need a Cypher-bypass path that calls coordinode_vector::hnsw::HnswIndex directly. This commit: - Adds `coordinode-vector` and `numpy` as deps on coordinode-embedded. - New `Hnsw` PyO3 class with fit / set_ef / knn_query, mirroring the hnswlib BaseANN surface so the ann-benchmarks adapter is a one-line wrapper around it. - Numpy-native I/O: fit takes `np.ndarray[float32, 2D]`, knn_query returns `np.ndarray[int64, 1D]`. Zero per-row Python-side conversion on the query hot path. - GIL released around the build and search calls. - Internal IDs auto-assigned sequentially; fit returns the (start, end) range so callers can map their own labels. - Build uses per-item `insert` rather than `insert_batch`. The batch path's known recall divergence vs serial (engine parity bar is 0.7 top-10 agreement) is unacceptable for fair library-tier comparisons; a `fit_fast` opt-in can land later if a real workload needs the build-throughput trade. Bumps the `coordinode-rs` submodule to current main (engine HEAD has `HnswIndex::insert_batch` and `set_ef_search` which earlier pinned SHAs did not yet have, plus the f763f86 saturation-path back-edge fix). Tests at tests/unit/test_hnsw.py cover metric parsing, dimension validation, ID-range bookkeeping, and a recall@10 bar at 10K vectors (≥ 0.95 at ef=50, ≥ 0.99 at ef=200). Closes #69
1 parent b5c1ed2 commit f6b57c8

8 files changed

Lines changed: 707 additions & 53 deletions

File tree

coordinode-embedded/Cargo.lock

Lines changed: 335 additions & 50 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coordinode-embedded/Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,10 @@ crate-type = ["cdylib"]
1414

1515
[dependencies]
1616
pyo3 = { version = "0.23", features = ["extension-module", "abi3-py311"] }
17+
numpy = "0.23"
1718
coordinode-embed = { path = "../coordinode-rs/crates/coordinode-embed" }
1819
coordinode-core = { path = "../coordinode-rs/crates/coordinode-core" }
20+
coordinode-vector = { path = "../coordinode-rs/crates/coordinode-vector" }
1921
rmpv = "1"
2022
tempfile = "3"
2123

coordinode-embedded/python/coordinode_embedded/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,6 @@
2121
db.close()
2222
"""
2323

24-
from ._coordinode_embedded import LocalClient
24+
from ._coordinode_embedded import Hnsw, LocalClient
2525

26-
__all__ = ["LocalClient"]
26+
__all__ = ["Hnsw", "LocalClient"]

coordinode-embedded/python/coordinode_embedded/_coordinode_embedded.pyi

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,69 @@
22

33
from typing import Any
44

5+
import numpy as np
6+
import numpy.typing as npt
7+
8+
class Hnsw:
9+
"""In-process HNSW index — fast-path bypass around Cypher.
10+
11+
Use this when you want library-grade vector search throughput without
12+
the Cypher parser/planner cost. Mirrors the hnswlib / FAISS-HNSW
13+
surface used by the ann-benchmarks harness.
14+
15+
Args:
16+
dim: Embedding dimension. Must match the vectors passed to ``fit``
17+
and ``knn_query``.
18+
metric: Distance metric — one of ``"cosine"`` / ``"angular"``,
19+
``"euclidean"`` / ``"l2"``, ``"dot"`` / ``"inner_product"``,
20+
``"manhattan"`` / ``"l1"``.
21+
M: Max connections per element per layer (HNSW spec). Default 16.
22+
ef_construction: Candidate list size during build. Default 200.
23+
max_elements: Hint to pre-allocate node storage. Default 1_000_000.
24+
25+
Example::
26+
27+
import numpy as np
28+
from coordinode_embedded import Hnsw
29+
30+
rng = np.random.default_rng(42)
31+
X = rng.standard_normal((10_000, 128), dtype=np.float32)
32+
q = rng.standard_normal(128, dtype=np.float32)
33+
34+
idx = Hnsw(dim=128, metric="euclidean", M=16, ef_construction=200)
35+
idx.fit(X)
36+
idx.set_ef(80)
37+
labels = idx.knn_query(q, k=10) # int64 ndarray, shape (10,)
38+
"""
39+
40+
def __init__(
41+
self,
42+
dim: int,
43+
metric: str,
44+
M: int = 16,
45+
ef_construction: int = 200,
46+
max_elements: int = 1_000_000,
47+
) -> None: ...
48+
def fit(self, vectors: npt.NDArray[np.float32]) -> tuple[int, int]:
49+
"""Bulk-insert vectors. Returns the contiguous ``(start, end)`` ID range
50+
assigned to this batch. Multiple ``fit`` calls extend the index rather
51+
than replacing it.
52+
"""
53+
...
54+
55+
def set_ef(self, ef: int) -> None:
56+
"""Update runtime ``ef_search``. Higher ef = higher recall, lower QPS."""
57+
...
58+
59+
def knn_query(
60+
self, query: npt.NDArray[np.float32], k: int
61+
) -> npt.NDArray[np.int64]:
62+
"""k-NN query. Returns nearest neighbour IDs, ordered nearest-first."""
63+
...
64+
65+
def __len__(self) -> int: ...
66+
def __repr__(self) -> str: ...
67+
568
class LocalClient:
669
"""In-process CoordiNode database — no server, no Docker required.
770

coordinode-embedded/src/hnsw.rs

Lines changed: 214 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,214 @@
1+
//! Native PyO3 wrapper around [`coordinode_vector::hnsw::HnswIndex`].
2+
//!
3+
//! This is the fast-path bypass used by the ann-benchmarks Docker adapter
4+
//! and any in-process HNSW workload. It avoids Cypher's parser/planner
5+
//! overhead so the resulting QPS / recall numbers are directly comparable
6+
//! with library benchmarks like hnswlib, FAISS-HNSW, ScaNN and Annoy.
7+
//!
8+
//! For Cypher-flavoured access (`CREATE VECTOR INDEX`, `MATCH … ORDER BY
9+
//! vector_similarity(...)`) use `LocalClient` instead.
10+
11+
use std::sync::Mutex;
12+
13+
use coordinode_core::graph::types::VectorMetric;
14+
use coordinode_vector::hnsw::{HnswConfig, HnswIndex};
15+
use numpy::{PyArray1, PyReadonlyArray1, PyReadonlyArray2};
16+
use pyo3::exceptions::{PyRuntimeError, PyValueError};
17+
use pyo3::prelude::*;
18+
19+
fn parse_metric(s: &str) -> PyResult<VectorMetric> {
20+
match s.to_ascii_lowercase().as_str() {
21+
"cosine" | "angular" => Ok(VectorMetric::Cosine),
22+
"euclidean" | "l2" => Ok(VectorMetric::L2),
23+
"dot" | "dot_product" | "ip" | "inner_product" => Ok(VectorMetric::DotProduct),
24+
"manhattan" | "l1" => Ok(VectorMetric::L1),
25+
other => Err(PyValueError::new_err(format!(
26+
"unknown metric '{other}' — expected one of: cosine/angular, euclidean/l2, dot, manhattan/l1"
27+
))),
28+
}
29+
}
30+
31+
/// In-process HNSW index — PyO3 binding around the CoordiNode native engine.
32+
///
33+
/// # Example
34+
///
35+
/// ```python
36+
/// import numpy as np
37+
/// from coordinode_embedded import Hnsw
38+
///
39+
/// rng = np.random.default_rng(42)
40+
/// X = rng.standard_normal((10_000, 128), dtype=np.float32)
41+
/// q = rng.standard_normal(128, dtype=np.float32)
42+
///
43+
/// idx = Hnsw(dim=128, metric="euclidean", M=16, ef_construction=200)
44+
/// idx.fit(X)
45+
/// idx.set_ef(80)
46+
/// labels = idx.knn_query(q, k=10) # numpy int64 array, shape (10,)
47+
/// ```
48+
#[pyclass]
49+
pub struct Hnsw {
50+
inner: Mutex<HnswIndex>,
51+
next_id: Mutex<u64>,
52+
dim: u32,
53+
}
54+
55+
#[pymethods]
56+
impl Hnsw {
57+
/// Build a new HNSW index.
58+
///
59+
/// # Arguments
60+
/// * `dim` — embedding dimension (must match the vectors passed to `fit` / `knn_query`).
61+
/// * `metric` — distance metric. One of `cosine` / `angular`, `euclidean` / `l2`,
62+
/// `dot` / `inner_product`, `manhattan` / `l1`. Names mirror ann-benchmarks
63+
/// conventions so existing harnesses pass their `space` argument unchanged.
64+
/// * `M` — max connections per element per layer (HNSW spec). Default 16.
65+
/// * `ef_construction` — candidate list size during build. Default 200.
66+
/// * `max_elements` — hint to pre-allocate node storage. Default 1_000_000.
67+
#[new]
68+
#[pyo3(signature = (dim, metric, M=16, ef_construction=200, max_elements=1_000_000))]
69+
#[allow(non_snake_case)]
70+
fn new(
71+
dim: u32,
72+
metric: &str,
73+
M: usize,
74+
ef_construction: usize,
75+
max_elements: u32,
76+
) -> PyResult<Self> {
77+
if dim == 0 {
78+
return Err(PyValueError::new_err("dim must be > 0"));
79+
}
80+
if M == 0 {
81+
return Err(PyValueError::new_err("M must be > 0"));
82+
}
83+
let metric = parse_metric(metric)?;
84+
let config = HnswConfig {
85+
m: M,
86+
m_max0: M * 2,
87+
ef_construction,
88+
ef_search: 50,
89+
metric,
90+
max_dimensions: dim,
91+
max_elements,
92+
..HnswConfig::default()
93+
};
94+
Ok(Self {
95+
inner: Mutex::new(HnswIndex::new(config)),
96+
next_id: Mutex::new(0),
97+
dim,
98+
})
99+
}
100+
101+
/// Bulk-insert vectors. Accepts a 2-D float32 numpy array of shape `(N, dim)`.
102+
///
103+
/// Each row gets an auto-assigned sequential ID starting from the next free
104+
/// ID (so multiple `fit` calls extend the index instead of replacing it).
105+
/// Returns the contiguous range `[first_id, last_id+1)` as a (start, end) tuple
106+
/// so callers can map their own labels onto our internal IDs.
107+
fn fit(&self, py: Python<'_>, vectors: PyReadonlyArray2<f32>) -> PyResult<(u64, u64)> {
108+
let array = vectors.as_array();
109+
let (n, d) = (array.shape()[0], array.shape()[1]);
110+
if d as u32 != self.dim {
111+
return Err(PyValueError::new_err(format!(
112+
"vector dimension mismatch: index dim={}, input dim={d}",
113+
self.dim
114+
)));
115+
}
116+
if n == 0 {
117+
return Ok((0, 0));
118+
}
119+
// Materialise the (id, vec) batch under the GIL, then release the GIL
120+
// for the build. Use per-item `insert` rather than `insert_batch`:
121+
// the batch path trades within-batch plan staleness for ~5-8× build
122+
// throughput, but the resulting recall divergence (engine parity bar
123+
// is 0.7 top-10 agreement, not 1.0) is unacceptable for ann-benchmarks
124+
// comparisons against serial-equivalent libraries like hnswlib. We
125+
// can expose a `fit_fast` opt-in later if a real workload needs the
126+
// build-throughput trade.
127+
let mut next = self
128+
.next_id
129+
.lock()
130+
.map_err(|e| PyRuntimeError::new_err(format!("next_id lock poisoned: {e}")))?;
131+
let start_id = *next;
132+
let mut batch: Vec<(u64, Vec<f32>)> = Vec::with_capacity(n);
133+
for row in array.outer_iter() {
134+
batch.push((*next, row.to_vec()));
135+
*next += 1;
136+
}
137+
let end_id = *next;
138+
drop(next);
139+
140+
py.allow_threads(|| -> PyResult<()> {
141+
let mut index = self
142+
.inner
143+
.lock()
144+
.map_err(|e| PyRuntimeError::new_err(format!("index lock poisoned: {e}")))?;
145+
for (id, vec) in batch {
146+
index.insert(id, vec);
147+
}
148+
Ok(())
149+
})?;
150+
Ok((start_id, end_id))
151+
}
152+
153+
/// Update runtime `ef_search`. Larger ef = higher recall, lower QPS.
154+
fn set_ef(&self, ef: usize) -> PyResult<()> {
155+
let mut index = self
156+
.inner
157+
.lock()
158+
.map_err(|e| PyRuntimeError::new_err(format!("index lock poisoned: {e}")))?;
159+
index.set_ef_search(ef);
160+
Ok(())
161+
}
162+
163+
/// k-NN query. Returns a 1-D int64 numpy array of length `k` with the IDs
164+
/// of the nearest neighbours, ordered nearest-first. If the index has
165+
/// fewer than `k` elements, the result is shorter accordingly.
166+
fn knn_query<'py>(
167+
&self,
168+
py: Python<'py>,
169+
query: PyReadonlyArray1<f32>,
170+
k: usize,
171+
) -> PyResult<Bound<'py, PyArray1<i64>>> {
172+
let q_view = query.as_array();
173+
if q_view.len() as u32 != self.dim {
174+
return Err(PyValueError::new_err(format!(
175+
"query dimension mismatch: index dim={}, query dim={}",
176+
self.dim,
177+
q_view.len()
178+
)));
179+
}
180+
let q: Vec<f32> = q_view.iter().copied().collect();
181+
let labels = py.allow_threads(|| -> PyResult<Vec<i64>> {
182+
let index = self
183+
.inner
184+
.lock()
185+
.map_err(|e| PyRuntimeError::new_err(format!("index lock poisoned: {e}")))?;
186+
Ok(index
187+
.search(&q, k)
188+
.into_iter()
189+
.map(|r| r.id as i64)
190+
.collect())
191+
})?;
192+
Ok(PyArray1::from_vec(py, labels))
193+
}
194+
195+
/// Number of vectors indexed.
196+
fn __len__(&self) -> PyResult<usize> {
197+
// `next_id` is monotonically incremented per insert, so it doubles
198+
// as the count without us reaching into HnswIndex internals.
199+
let next = self
200+
.next_id
201+
.lock()
202+
.map_err(|e| PyRuntimeError::new_err(format!("next_id lock poisoned: {e}")))?;
203+
Ok(*next as usize)
204+
}
205+
206+
fn __repr__(&self) -> String {
207+
let n = self
208+
.next_id
209+
.lock()
210+
.map(|g| *g)
211+
.unwrap_or(0);
212+
format!("Hnsw(dim={}, len={})", self.dim, n)
213+
}
214+
}

coordinode-embedded/src/lib.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
mod hnsw;
2+
13
/// CoordiNode embedded Python bindings.
24
///
35
/// Exposes `LocalClient` — a `CoordinodeClient`-compatible interface that runs
@@ -313,5 +315,6 @@ impl LocalClient {
313315
#[pymodule]
314316
fn _coordinode_embedded(m: &Bound<'_, PyModule>) -> PyResult<()> {
315317
m.add_class::<LocalClient>()?;
318+
m.add_class::<hnsw::Hnsw>()?;
316319
Ok(())
317320
}

coordinode-rs

Submodule coordinode-rs updated 242 files

0 commit comments

Comments
 (0)