linebender · Shnatsel · May 24, 2026
diff --git a/fearless_simd/Cargo.toml b/fearless_simd/Cargo.toml
@@ -26,13 +26,18 @@ rustdoc-args = [
 
 
 [features]
-default = ["std"]
+default = ["std", "sse4_2", "avx2"]
-# Get floating point functions from the standard library (likely using your targets libc).
+# Get floating point functions from the standard library (likely using your target's libc).
 # Also allows using `Level::new` on all platforms, to detect which target features are enabled
 std = []
 # Use floating point implementations from libm
 libm = ["dep:libm"]
 
+# Enable the SSE4.2/x86-64-v2 runtime SIMD level on x86 and x86_64.
+sse4_2 = []
+# Enable the AVX2/x86-64-v3 runtime SIMD level on x86 and x86_64.
+avx2 = []
+
 # Force the "fallback" SIMD level to be supported
 # This is primarily used for tests
 force_support_fallback = []

diff --git a/fearless_simd/README.md b/fearless_simd/README.md
@@ -158,6 +158,8 @@ The following crate [feature flags](https://doc.rust-lang.org/cargo/reference/fe
 - `std` (enabled by default): Get floating point functions from the standard library (likely using your target's libc).
   Also allows using [`Level::new`] on all platforms, to detect which target features are enabled.
 - `libm`: Use floating point implementations from [libm].
+- `sse4_2` (enabled by default): Enable the SSE4.2/x86-64-v2 runtime SIMD level on `x86` and `x86_64`.
+- `avx2` (enabled by default): Enable the AVX2/x86-64-v3 runtime SIMD level on `x86` and `x86_64`.
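-- `force_support_fallback`: Force scalar fallback, to be supported, even if your compilation target has a better baseline.
+- `force_support_fallback`: Force the scalar fallback to be supported, even if your compilation target has a better baseline.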
 
 At least one of `std` and `libm` is required; `std` overrides `libm`.

diff --git a/fearless_simd/src/generated.rs b/fearless_simd/src/generated.rs
@@ -44,27 +44,27 @@
 //!
 //! All files in this subdirectory are autogenerated by the `fearless_simd_gen` crate.
 
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+#[cfg(all(feature = "avx2", any(target_arch = "x86", target_arch = "x86_64")))]
 mod avx2;
 mod fallback;
 #[cfg(target_arch = "aarch64")]
 mod neon;
 mod ops;
 pub(crate) mod simd_trait;
 mod simd_types;
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+#[cfg(all(feature = "sse4_2", any(target_arch = "x86", target_arch = "x86_64")))]
 mod sse4_2;
 #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
 mod wasm;
 
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+#[cfg(all(feature = "avx2", any(target_arch = "x86", target_arch = "x86_64")))]
 pub use avx2::*;
 pub use fallback::*;
 #[cfg(target_arch = "aarch64")]
 pub use neon::*;
 pub use simd_trait::*;
 pub use simd_types::*;
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+#[cfg(all(feature = "sse4_2", any(target_arch = "x86", target_arch = "x86_64")))]
 pub use sse4_2::*;
 #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
 pub use wasm::*;
diff --git a/fearless_simd/src/generated/avx2.rs b/fearless_simd/src/generated/avx2.rs
@@ -8038,6 +8038,210 @@ impl Simd for Avx2 {
         )
     }
 }
+#[cfg(not(feature = "sse4_2"))]
+impl<S: Simd> SimdFrom<__m128, S> for f32x4<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: __m128) -> Self {
+        Self {
+            val: unsafe { core::mem::transmute_copy(&arch) },
+            simd,
+        }
+    }
+}
+#[cfg(not(feature = "sse4_2"))]
+impl<S: Simd> From<f32x4<S>> for __m128 {
+    #[inline(always)]
+    fn from(value: f32x4<S>) -> Self {
+        unsafe { core::mem::transmute_copy(&value.val) }
+    }
+}
+#[cfg(not(feature = "sse4_2"))]
+impl<S: Simd> SimdFrom<__m128i, S> for i8x16<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: __m128i) -> Self {
+        Self {
+            val: unsafe { core::mem::transmute_copy(&arch) },
+            simd,
+        }
+    }
+}
+#[cfg(not(feature = "sse4_2"))]
+impl<S: Simd> From<i8x16<S>> for __m128i {
+    #[inline(always)]
+    fn from(value: i8x16<S>) -> Self {
+        unsafe { core::mem::transmute_copy(&value.val) }
+    }
+}
+#[cfg(not(feature = "sse4_2"))]
+impl<S: Simd> SimdFrom<__m128i, S> for u8x16<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: __m128i) -> Self {
+        Self {
+            val: unsafe { core::mem::transmute_copy(&arch) },
+            simd,
+        }
+    }
+}
+#[cfg(not(feature = "sse4_2"))]
+impl<S: Simd> From<u8x16<S>> for __m128i {
+    #[inline(always)]
+    fn from(value: u8x16<S>) -> Self {
+        unsafe { core::mem::transmute_copy(&value.val) }
+    }
+}
+#[cfg(not(feature = "sse4_2"))]
+impl<S: Simd> SimdFrom<__m128i, S> for mask8x16<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: __m128i) -> Self {
+        Self {
+            val: unsafe { core::mem::transmute_copy(&arch) },
+            simd,
+        }
+    }
+}
+#[cfg(not(feature = "sse4_2"))]
+impl<S: Simd> From<mask8x16<S>> for __m128i {
+    #[inline(always)]
+    fn from(value: mask8x16<S>) -> Self {
+        unsafe { core::mem::transmute_copy(&value.val) }
+    }
+}
+#[cfg(not(feature = "sse4_2"))]
+impl<S: Simd> SimdFrom<__m128i, S> for i16x8<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: __m128i) -> Self {
+        Self {
+            val: unsafe { core::mem::transmute_copy(&arch) },
+            simd,
+        }
+    }
+}
+#[cfg(not(feature = "sse4_2"))]
+impl<S: Simd> From<i16x8<S>> for __m128i {
+    #[inline(always)]
+    fn from(value: i16x8<S>) -> Self {
+        unsafe { core::mem::transmute_copy(&value.val) }
+    }
+}
+#[cfg(not(feature = "sse4_2"))]
+impl<S: Simd> SimdFrom<__m128i, S> for u16x8<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: __m128i) -> Self {
+        Self {
+            val: unsafe { core::mem::transmute_copy(&arch) },
+            simd,
+        }
+    }
+}
+#[cfg(not(feature = "sse4_2"))]
+impl<S: Simd> From<u16x8<S>> for __m128i {
+    #[inline(always)]
+    fn from(value: u16x8<S>) -> Self {
+        unsafe { core::mem::transmute_copy(&value.val) }
+    }
+}
+#[cfg(not(feature = "sse4_2"))]
+impl<S: Simd> SimdFrom<__m128i, S> for mask16x8<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: __m128i) -> Self {
+        Self {
+            val: unsafe { core::mem::transmute_copy(&arch) },
+            simd,
+        }
+    }
+}
+#[cfg(not(feature = "sse4_2"))]
+impl<S: Simd> From<mask16x8<S>> for __m128i {
+    #[inline(always)]
+    fn from(value: mask16x8<S>) -> Self {
+        unsafe { core::mem::transmute_copy(&value.val) }
+    }
+}
+#[cfg(not(feature = "sse4_2"))]
+impl<S: Simd> SimdFrom<__m128i, S> for i32x4<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: __m128i) -> Self {
+        Self {
+            val: unsafe { core::mem::transmute_copy(&arch) },
+            simd,
+        }
+    }
+}
+#[cfg(not(feature = "sse4_2"))]
+impl<S: Simd> From<i32x4<S>> for __m128i {
+    #[inline(always)]
+    fn from(value: i32x4<S>) -> Self {
+        unsafe { core::mem::transmute_copy(&value.val) }
+    }
+}
+#[cfg(not(feature = "sse4_2"))]
+impl<S: Simd> SimdFrom<__m128i, S> for u32x4<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: __m128i) -> Self {
+        Self {
+            val: unsafe { core::mem::transmute_copy(&arch) },
+            simd,
+        }
+    }
+}
+#[cfg(not(feature = "sse4_2"))]
+impl<S: Simd> From<u32x4<S>> for __m128i {
+    #[inline(always)]
+    fn from(value: u32x4<S>) -> Self {
+        unsafe { core::mem::transmute_copy(&value.val) }
+    }
+}
+#[cfg(not(feature = "sse4_2"))]
+impl<S: Simd> SimdFrom<__m128i, S> for mask32x4<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: __m128i) -> Self {
+        Self {
+            val: unsafe { core::mem::transmute_copy(&arch) },
+            simd,
+        }
+    }
+}
+#[cfg(not(feature = "sse4_2"))]
+impl<S: Simd> From<mask32x4<S>> for __m128i {
+    #[inline(always)]
+    fn from(value: mask32x4<S>) -> Self {
+        unsafe { core::mem::transmute_copy(&value.val) }
+    }
+}
+#[cfg(not(feature = "sse4_2"))]
+impl<S: Simd> SimdFrom<__m128d, S> for f64x2<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: __m128d) -> Self {
+        Self {
+            val: unsafe { core::mem::transmute_copy(&arch) },
+            simd,
+        }
+    }
+}
+#[cfg(not(feature = "sse4_2"))]
+impl<S: Simd> From<f64x2<S>> for __m128d {
+    #[inline(always)]
+    fn from(value: f64x2<S>) -> Self {
+        unsafe { core::mem::transmute_copy(&value.val) }
+    }
+}
+#[cfg(not(feature = "sse4_2"))]
+impl<S: Simd> SimdFrom<__m128i, S> for mask64x2<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: __m128i) -> Self {
+        Self {
+            val: unsafe { core::mem::transmute_copy(&arch) },
+            simd,
+        }
+    }
+}
+#[cfg(not(feature = "sse4_2"))]
+impl<S: Simd> From<mask64x2<S>> for __m128i {
+    #[inline(always)]
+    fn from(value: mask64x2<S>) -> Self {
+        unsafe { core::mem::transmute_copy(&value.val) }
+    }
+}
 impl<S: Simd> SimdFrom<__m256, S> for f32x8<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: __m256) -> Self {

diff --git a/fearless_simd/src/generated/simd_trait.rs b/fearless_simd/src/generated/simd_trait.rs
@@ -150,7 +150,7 @@ pub trait Simd:
     fn neg_f32x4(self, a: f32x4<Self>) -> f32x4<Self>;
     #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."]
     fn sqrt_f32x4(self, a: f32x4<Self>) -> f32x4<Self>;
-    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
+    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
     fn approximate_recip_f32x4(self, a: f32x4<Self>) -> f32x4<Self>;
     #[doc = "Add two vectors element-wise."]
     fn add_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self>;
@@ -857,7 +857,7 @@ pub trait Simd:
     fn neg_f64x2(self, a: f64x2<Self>) -> f64x2<Self>;
     #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."]
     fn sqrt_f64x2(self, a: f64x2<Self>) -> f64x2<Self>;
-    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
+    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
     fn approximate_recip_f64x2(self, a: f64x2<Self>) -> f64x2<Self>;
     #[doc = "Add two vectors element-wise."]
     fn add_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self>;
@@ -984,7 +984,7 @@ pub trait Simd:
     fn neg_f32x8(self, a: f32x8<Self>) -> f32x8<Self>;
     #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."]
     fn sqrt_f32x8(self, a: f32x8<Self>) -> f32x8<Self>;
-    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
+    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
     fn approximate_recip_f32x8(self, a: f32x8<Self>) -> f32x8<Self>;
     #[doc = "Add two vectors element-wise."]
     fn add_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self>;
@@ -1713,7 +1713,7 @@ pub trait Simd:
     fn neg_f64x4(self, a: f64x4<Self>) -> f64x4<Self>;
     #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."]
     fn sqrt_f64x4(self, a: f64x4<Self>) -> f64x4<Self>;
-    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
+    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
     fn approximate_recip_f64x4(self, a: f64x4<Self>) -> f64x4<Self>;
     #[doc = "Add two vectors element-wise."]
     fn add_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self>;
@@ -1844,7 +1844,7 @@ pub trait Simd:
     fn neg_f32x16(self, a: f32x16<Self>) -> f32x16<Self>;
     #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."]
     fn sqrt_f32x16(self, a: f32x16<Self>) -> f32x16<Self>;
-    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
+    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
     fn approximate_recip_f32x16(self, a: f32x16<Self>) -> f32x16<Self>;
     #[doc = "Add two vectors element-wise."]
     fn add_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self>;
@@ -2567,7 +2567,7 @@ pub trait Simd:
     fn neg_f64x8(self, a: f64x8<Self>) -> f64x8<Self>;
     #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."]
     fn sqrt_f64x8(self, a: f64x8<Self>) -> f64x8<Self>;
-    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
+    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
     fn approximate_recip_f64x8(self, a: f64x8<Self>) -> f64x8<Self>;
     #[doc = "Add two vectors element-wise."]
     fn add_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self>;
@@ -2813,7 +2813,7 @@ pub trait SimdFloat<S: Simd>:
     fn abs(self) -> Self;
     #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."]
     fn sqrt(self) -> Self;
-    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
+    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
     fn approximate_recip(self) -> Self;
     #[doc = "Return a vector with the magnitude of `self` and the sign of `rhs` for each element.\n\nThis operation copies the sign bit, so if an input element is NaN, the output element will be a NaN with the same payload and a copied sign bit."]
     fn copysign(self, rhs: impl SimdInto<Self, S>) -> Self;

diff --git a/fearless_simd/src/generated/sse4_2.rs b/fearless_simd/src/generated/sse4_2.rs
@@ -86,6 +86,7 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn level(self) -> Level {
         #[cfg(not(all(
+            feature = "avx2",
             target_feature = "avx2",
             target_feature = "bmi1",
             target_feature = "bmi2",
@@ -99,6 +100,7 @@ impl Simd for Sse4_2 {
         )))]
         return Level::Sse4_2(self);
         #[cfg(all(
+            feature = "avx2",
             target_feature = "avx2",
             target_feature = "bmi1",
             target_feature = "bmi2",