microsoft · lhecker · Jan 20, 2026 · Jan 20, 2026 · DHowett · Jan 20, 2026
diff --git a/crates/edit/benches/lib.rs b/crates/edit/benches/lib.rs
@@ -3,14 +3,15 @@
 
 use std::hint::black_box;
 use std::io::Cursor;
+use std::ops::Range;
 use std::{mem, vec};
 
 use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_main};
 use edit::helpers::*;
 use edit::simd::MemsetSafe;
 use edit::{buffer, hash, oklab, simd, unicode};
 use serde::Deserialize;
-use stdext::arena;
+use stdext::{arena, varint};
 
 #[derive(Deserialize)]
 pub struct EditingTracePatch(pub usize, pub usize, pub String);
@@ -227,6 +228,73 @@ fn bench_unicode(c: &mut Criterion) {
         });
 }
 
+fn bench_varint(c: &mut Criterion) {
+    const BUFFER_SIZE: usize = MEBI;
+
+    let mut buffer = Vec::with_capacity(BUFFER_SIZE + 16);
+
+    // Knuth's MMIX LCG
+    let mut rng_state = 1442695040888963407u64;
+    let mut rng = || {
+        rng_state = rng_state.wrapping_mul(6364136223846793005).wrapping_add(1442695040888963407);
+        rng_state as u32
+    };
+
+    // Bitmask with Rejection (as used by Apple)
+    let mut rng_state = 1442695040888963407u64;
+    let mut rng_range = |range: Range<u32>| {
+        let range_size = range.len() as u32;
+        let mask = range_size.next_power_of_two() - 1;
+        loop {
+            rng_state =
+                rng_state.wrapping_mul(6364136223846793005).wrapping_add(1442695040888963407);
+            let value = rng_state as u32 & mask;
+            if value < range_size {
+                return range.start.wrapping_add(value);
+            }
+        }
+    };
+
+    loop {
+        // Generate values according to a non-uniform distribution.
+        // The distribution roughly corresponds to what LSH encounters.
+        let value = match rng() {
+            // ~35%: <=7 bits
+            0..1503238553 => rng_range(0..0x7F),
+            // ~40%: <=14 bits
+            1503238553..3221225472 => rng_range(0x80..0x3FFF),
+            // ~20%: <=21 bits
+            3221225472..4026531840 => rng_range(0x4000..0x1FFFFF),
+            // ~5%: u32::MAX
+            _ => (1 << 28) - 1,
+        };
+
+        buffer.extend(varint::encode(value));
+
+        if buffer.len() > BUFFER_SIZE {
+            break;
+        }
+    }
+
+    // As per the varint::decode() safety requirements, we need 8 bytes of padding.
+    // We pre-allocated `buffer` with extra capacity, so we technically fulfill that.
+    // _Technically_, however, we also make Rust unhappy, because it's uninitialized memory.
+    // It's just that I really really don't care about any such antics. It's memory.
+
+    c.benchmark_group("varint").bench_function("decode", |b| {
+        let mut off = 0;
+
+        b.iter(|| {
+            let (val, len) = unsafe { varint::decode(buffer.as_ptr().add(off)) };
+            black_box(val);
+            off += len;
+            if off >= buffer.len() {
+                off = 0;
+            }
+        });
+    });
+}
+
 fn bench(c: &mut Criterion) {
     arena::init(128 * MEBI).unwrap();
 
@@ -238,6 +306,7 @@ fn bench(c: &mut Criterion) {
     bench_simd_memset::<u32>(c);
     bench_simd_memset::<u8>(c);
     bench_unicode(c);
+    bench_varint(c);
 }
 
 criterion_group!(benches, bench);

diff --git a/crates/stdext/src/lib.rs b/crates/stdext/src/lib.rs
@@ -7,6 +7,7 @@
 
 pub mod arena;
 pub mod sys;
+pub mod varint;
 
 mod helpers;
 pub use helpers::*;
diff --git a/crates/stdext/src/varint.rs b/crates/stdext/src/varint.rs
@@ -0,0 +1,140 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+//! Variable-length `u32` encoding and decoding, with efficient storage of `u32::MAX`.
+//! `u32::MAX` is a common value in Microsoft Edit's syntax highlighter bytecode.
+//!
+//! # Format
+//!
+//! ```text
+//!       0-127        ( 7 bits): xxxxxxx0
+//!     128-16383      (14 bits): xxxxxx01 yyyyyyyx
+//!   16384-2097151    (21 bits): xxxxx011 yyyyyyxx zzzzzzzy
+//! 2097152-268435455  (28 bits): xxxx0111 yyyyyxxx zzzzzzyy wwwwwwwz
+//!         4294967295 (32 bits): ....1111
+//! ```
+//!
+//! The least significant bits indicate the length, in a format identical to UTF-8. The remaining bits store
+//! the value, in little-endian order. Little endian was chosen, as most architectures today use that.
+//!
+//! On x86, `tzcnt` (= `trailing_ones()` = what we need) has the benefit that its encoding is identical to `rep bsf`.
+//! Older CPUs without BMI1 will ignore the `rep` prefix and use `bsf`, while modern CPUs will use the faster `tzcnt`.
+//! So not just can we drop the need for `bswap` on x86, but we also speed up the bit count calculation.
+//! This makes this encoding faster than LEB128, Google Varint, and others.
+
+pub fn encode(val: u32) -> Vec<u8> {
+    let mut result = Vec::with_capacity(5);
+    let shift = match val {
+        0..0x80 => 0,
+        0x80..0x4000 => 1,
+        0x4000..0x200000 => 2,
+        0x200000..0x10000000 => 3,
+        _ => {
+            result.push(0xff);
+            return result;
+        }
+    };
+    let marker = (1u32 << shift) - 1;
+    let encoded = (val << (shift + 1)) | marker;
+    let bytes = encoded.to_le_bytes();
+    result.extend_from_slice(&bytes[..=shift]);
+    result
+}
+
+/// # Safety
+///
+/// The caller must ensure that `data..data+4` is valid memory.
+/// It doesn't need to be a valid value, but it must be readable.
+pub unsafe fn decode(data: *const u8) -> (u32, usize) {
+    // For inputs such as:
+    //   [0xff, 0xff, 0xff, 0xff]
+    // the shifts below will shift by more than 31 digits, which Rust considers undefined behavior.
+    // *We explicitly want UB here*.
+    //
+    // If we write an if condition here (like this one), LLVM will turn that into a proper branch. Since our inputs
+    // are relatively random, that branch will mispredict, hurting performance. The if condition at the end
+    // gets turned into conditional moves (good!), but that only works because it comes after the shifts.
+    // Unfortunately, there's no way to ask Rust for "platform-defined behavior" (`unchecked_shl/shr` is not it).
+    #[cfg(debug_assertions)]
+    unsafe {
+        if (*data & 0x0f) == 0x0f {
+            return (u32::MAX, 1);
+        }
+    }
+
+    unsafe {
+        let val = u32::from_le((data as *const u32).read_unaligned());
+        let ones = val.trailing_ones();
+
+        let mut len = ones as usize + 1;
+        let mut res = 'bextr: {
+            // Give LLVM a helping hand for x86 CPUs with BMI1. It's not smart enough to figure out that `bextr` can
+            // be used here. To be fair, it's not faster, so maybe that's why. It is _a lot_ more compact, however.
+            #[cfg(target_feature = "bmi1")]
+            break 'bextr std::arch::x86_64::_bextr_u32(val, len as u32, (7 * len) as u32);
+
+            // This is where you'd put more architecture-specific optimizations.
+            // In fact this is where I'd put my ARM optimizations, but it doesn't have anything like `bextr`. :(
+
+            let mut res = val;
+            // Shift out the bytes we read but don't need.
+            res <<= 32 - 8 * len;
+            // Shift back down and remove the trailing 0/10/110/1110/1111 length bits.
+            res >>= 32 - 7 * len;
+            break 'bextr res;
+        };
+
+        // If the lead byte indicates >28 bits, assume `u32::MAX`.
+        // This doubles as a simple form of error correction.
+        if len > 4 {
+            res = u32::MAX;
+            len = 1;
+        }
+
+        (res, len)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_encode_decode_roundtrip() {
+        // Test various boundary values
+        let test_values = [
+            0u32,
+            1,
+            123,
+            127, // Max 1 byte
+            128, // Min 2 bytes
+            1234,
+            16383,     // Max 2 bytes
+            16384,     // Min 3 bytes
+            2097151,   // Max 3 bytes
+            2097152,   // Min 4 bytes
+            268435455, // Max 4 bytes
+            u32::MAX,  // Special case
+        ];
+
+        for &val in &test_values {
+            let encoded = encode(val);
+            println!("Value {} encoded as: {:02X?}", val, encoded);
+            let (decoded, len) = unsafe { decode(encoded.as_ptr()) };
+            println!("  Decoded as: {} with length {}", decoded, len);
+            assert_eq!(decoded, val, "Failed roundtrip for value {}", val);
+            assert_eq!(len, encoded.len(), "Length mismatch for value {}", val);
+        }
+    }
+
+    #[test]
+    fn test_specific_encodings() {
+        // Test specific byte patterns
+        unsafe {
+            assert_eq!((0, 1), decode([0, 0xbb, 0xcc, 0xdd].as_ptr()));
+            assert_eq!((123, 1), decode([0xf6, 0xbb, 0xcc, 0xdd].as_ptr()));
+            assert_eq!((1234, 2), decode([0x49, 0x13, 0xcc, 0xdd].as_ptr()));
+            assert_eq!((u32::MAX, 1), decode([0xff, 0xbb, 0xcc, 0xdd].as_ptr()));
+        }
+    }
+}