Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions datasketches/src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,12 @@ impl Error {
"invalid preamble longs: expected {expected}, got {actual}"
))
}

/// Creates an error, via `Self::deserial`, describing a preamble-ints mismatch.
///
/// `expected` is the value the caller required and `actual` is the value that
/// was encountered; both are embedded verbatim in the error message.
pub(crate) fn invalid_preamble_ints(expected: u8, actual: u8) -> Self {
    let message = format!("invalid preamble ints: expected {expected}, got {actual}");
    Self::deserial(message)
}
}

impl fmt::Debug for Error {
Expand Down
126 changes: 126 additions & 0 deletions datasketches/src/kll/helper.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

use std::cell::Cell;
use std::time::SystemTime;
use std::time::UNIX_EPOCH;

/// Lookup table of powers of three: entry `i` holds `3^i` for `i` in `0..=30`.
///
/// `int_cap_aux_aux` indexes this table by `depth`, which is why that function
/// rejects any depth greater than 30.
const POWERS_OF_THREE: [u64; 31] = [
1,
3,
9,
27,
81,
243,
729,
2187,
6561,
19683,
59049,
177147,
531441,
1594323,
4782969,
14348907,
43046721,
129140163,
387420489,
1162261467,
3486784401,
10460353203,
31381059609,
94143178827,
282429536481,
847288609443,
2541865828329,
7625597484987,
22876792454961,
68630377364883,
205891132094649,
];

/// Sums the capacities of every level of a sketch that has `num_levels` levels.
///
/// Each level's capacity comes from [`level_capacity`], with `m` passed as the
/// minimum width. Returns `0` when `num_levels` is `0`.
pub fn compute_total_capacity(k: u16, m: u8, num_levels: usize) -> u32 {
    (0..num_levels)
        .map(|height| level_capacity(k, num_levels, height, m))
        .sum()
}

/// Returns the capacity of the level at `height` in a sketch with `num_levels` levels.
///
/// The level's depth is `num_levels - height - 1`; the capacity is the value of
/// `int_cap_aux(k, depth)`, but never less than `min_wid`.
///
/// # Panics
///
/// Panics if `height >= num_levels`, or if the resulting depth is greater than
/// 60 (the limit enforced by `int_cap_aux`).
pub fn level_capacity(k: u16, num_levels: usize, height: usize, min_wid: u8) -> u32 {
    assert!(height < num_levels, "height must be < num_levels");
    let depth = num_levels - height - 1;
    // A bare `depth as u8` wraps modulo 256, so an oversized depth (e.g. 299 -> 43)
    // would slip past the range check in `int_cap_aux` and yield a bogus capacity.
    // Clamp first: every out-of-range depth becomes `u8::MAX`, which is rejected.
    let depth = depth.min(usize::from(u8::MAX)) as u8;
    let cap = u32::from(int_cap_aux(k, depth));
    cap.max(u32::from(min_wid))
}

/// Computes the capacity for a level at the given `depth`, for depths up to 60.
///
/// Depths of 30 or less are handled by a single call to [`int_cap_aux_aux`].
/// Larger depths are split into two parts (`depth / 2` and the remainder) and
/// the helper is applied twice, feeding the first result in as the new `k`.
///
/// # Panics
///
/// Panics if `depth > 60`.
pub fn int_cap_aux(k: u16, depth: u8) -> u16 {
    assert!(depth <= 60, "depth must be <= 60");
    if depth > 30 {
        let first = depth / 2;
        let second = depth - first;
        let partial = int_cap_aux_aux(k, first);
        int_cap_aux_aux(partial, second)
    } else {
        int_cap_aux_aux(k, depth)
    }
}

/// Computes `k * (2/3)^depth`, rounded to the nearest integer, for `depth <= 30`.
///
/// The computation is done in `u64`: `2 * k * 2^depth` is divided by `3^depth`
/// (taken from `POWERS_OF_THREE`), then one is added and the sum is halved to
/// round the result.
///
/// # Panics
///
/// Panics if `depth > 30`, or if the computed value would exceed `k`.
pub fn int_cap_aux_aux(k: u16, depth: u8) -> u16 {
    assert!(depth <= 30, "depth must be <= 30");
    let k_wide = u64::from(k);
    let doubled = k_wide * 2;
    let scaled = (doubled << depth) / POWERS_OF_THREE[usize::from(depth)];
    let rounded = (scaled + 1) / 2;
    assert!(rounded <= k_wide, "capacity result exceeds k");
    rounded as u16
}

/// Returns the weighted sum of the level sizes.
///
/// The entry at index `i` is weighted by `2^i`: the weight starts at 1 for the
/// first entry and doubles for each subsequent one. An empty slice sums to `0`.
pub fn sum_the_sample_weights(level_sizes: &[usize]) -> u64 {
    let (sum, _) = level_sizes
        .iter()
        .fold((0u64, 1u64), |(sum, weight), &count| {
            (sum + weight * count as u64, weight << 1)
        });
    sum
}

/// Produces a non-zero seed for the generator in `random_bit`.
///
/// The seed is the low 64 bits of the nanoseconds elapsed since the Unix epoch.
/// If the system clock reports a time before the epoch, the elapsed duration
/// defaults to zero; in that case (or if the low 64 bits happen to be zero) a
/// fixed non-zero constant is returned instead.
fn seed() -> u64 {
    // Zero must never be returned: the shift/xor update in `random_bit` maps a
    // zero state back to zero, so the generator would emit 0 forever.
    const FALLBACK_SEED: u64 = 0x9E37_79B9_7F4A_7C15;

    let nanos = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .unwrap_or_default()
        .as_nanos();
    // Keeping only the low 64 bits of the u128 is intentional.
    match nanos as u64 {
        0 => FALLBACK_SEED,
        low_bits => low_bits,
    }
}

/// Returns a pseudo-random bit (`0` or `1`).
///
/// Each thread keeps its own 64-bit state, initialised from `seed()` on first
/// use. Every call advances the state with three shift-and-xor steps (left 13,
/// right 7, left 17), stores it back, and returns the lowest bit of the new state.
pub fn random_bit() -> u32 {
    thread_local! {
        static RNG_STATE: Cell<u64> = Cell::new(seed());
    }

    RNG_STATE.with(|cell| {
        let s0 = cell.get();
        let s1 = s0 ^ (s0 << 13);
        let s2 = s1 ^ (s1 >> 7);
        let next = s2 ^ (s2 << 17);
        cell.set(next);
        (next & 1) as u32
    })
}
62 changes: 62 additions & 0 deletions datasketches/src/kll/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

//! KLL sketch implementation for estimating quantiles and ranks.
//!
//! KLL is a compact, streaming quantiles sketch with lazy compaction and
//! near-optimal accuracy per retained item. It supports one-pass updates,
//! approximate quantiles, ranks, PMF, and CDF queries.
//!
//! This implementation follows Apache DataSketches semantics (Java KllSketch
//! / KllPreambleUtil, C++ kll_sketch) and uses the same binary serialization
//! format as those implementations.
//!
//! # Usage
//!
//! ```rust
//! # use datasketches::kll::KllSketch;
//! let mut sketch = KllSketch::<f64>::new(200);
//! sketch.update(1.0);
//! sketch.update(2.0);
//! let q = sketch.quantile(0.5, true).unwrap();
//! assert!(q >= 1.0 && q <= 2.0);
//! ```

mod helper;
mod serialization;
mod sketch;
mod sorted_view;

pub use self::sketch::KllSketch;

/// KLL sketch specialized for `f64` items (alias for `KllSketch<f64>`).
pub type KllSketchF64 = KllSketch<f64>;
/// KLL sketch specialized for `f32` items (alias for `KllSketch<f32>`).
pub type KllSketchF32 = KllSketch<f32>;
/// KLL sketch specialized for `i64` items (alias for `KllSketch<i64>`).
pub type KllSketchI64 = KllSketch<i64>;
/// KLL sketch specialized for `String` items (alias for `KllSketch<String>`).
pub type KllSketchString = KllSketch<String>;

/// Default value of parameter k.
pub const DEFAULT_K: u16 = 200;
/// Default value of parameter m.
pub const DEFAULT_M: u8 = 8;
/// Minimum value of parameter k.
///
/// Defined as `DEFAULT_M` widened to `u16`, so the two stay in sync.
pub const MIN_K: u16 = DEFAULT_M as u16;
/// Maximum value of parameter k: the largest value representable in a `u16`.
pub const MAX_K: u16 = u16::MAX;
48 changes: 48 additions & 0 deletions datasketches/src/kll/serialization.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

//! Binary serialization format constants for KLL sketches.
//!
//! Naming and layout follow the Apache DataSketches Java implementation
//! (`KllPreambleUtil`) and the C++ `kll_sketch` serialization format.

/// Family ID for KLL sketches in DataSketches format (KllPreambleUtil.KLL_FAMILY).
pub const KLL_FAMILY_ID: u8 = 15;

/// Serialization version for empty or full sketches (KllPreambleUtil.SERIAL_VERSION_EMPTY_FULL).
pub const SERIAL_VERSION_1: u8 = 1;
/// Serialization version for single-item sketches (KllPreambleUtil.SERIAL_VERSION_SINGLE).
pub const SERIAL_VERSION_2: u8 = 2;

/// Preamble ints for empty and single-item sketches (KllPreambleUtil.PREAMBLE_INTS_EMPTY_SINGLE).
pub const PREAMBLE_INTS_SHORT: u8 = 2;
/// Preamble ints for sketches with more than one item (KllPreambleUtil.PREAMBLE_INTS_FULL).
pub const PREAMBLE_INTS_FULL: u8 = 5;

/// Flag indicating the sketch is empty (KllPreambleUtil.EMPTY_BIT_MASK).
///
/// Occupies bit 0 of the flags byte.
pub const FLAG_EMPTY: u8 = 1 << 0;
/// Flag indicating level zero is sorted (KllPreambleUtil.LEVEL_ZERO_SORTED_BIT_MASK).
///
/// Occupies bit 1 of the flags byte.
pub const FLAG_LEVEL_ZERO_SORTED: u8 = 1 << 1;
/// Flag indicating the sketch has a single item (KllPreambleUtil.SINGLE_ITEM_BIT_MASK).
///
/// Occupies bit 2 of the flags byte.
pub const FLAG_SINGLE_ITEM: u8 = 1 << 2;

/// Serialized size for an empty sketch in bytes (KllPreambleUtil.DATA_START_ADR_SINGLE_ITEM).
// NOTE(review): this cites the same Java constant as `DATA_START_SINGLE_ITEM`
// below; the values match (8), but confirm the reference is the intended one.
pub const EMPTY_SIZE_BYTES: usize = 8;
/// Data offset for single-item sketches (KllPreambleUtil.DATA_START_ADR_SINGLE_ITEM).
// Numerically equal to `PREAMBLE_INTS_SHORT * 4`.
pub const DATA_START_SINGLE_ITEM: usize = 8;
/// Data offset for sketches with more than one item (KllPreambleUtil.DATA_START_ADR).
// Numerically equal to `PREAMBLE_INTS_FULL * 4`.
pub const DATA_START: usize = 20;
Loading