Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion fearless_simd/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,18 @@ rustdoc-args = [


[features]
default = ["std"]
default = ["std", "sse4_2", "avx2"]
# Get floating point functions from the standard library (likely using your target's libc).
# Also allows using `Level::new` on all platforms, to detect which target features are enabled
std = []
# Use floating point implementations from libm
libm = ["dep:libm"]

# Enable the SSE4.2/x86-64-v2 runtime SIMD level on x86 and x86_64.
sse4_2 = []
# Enable the AVX2/x86-64-v3 runtime SIMD level on x86 and x86_64.
avx2 = []

# Force the "fallback" SIMD level to be supported
# This is primarily used for tests
force_support_fallback = []
Expand Down
2 changes: 2 additions & 0 deletions fearless_simd/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,8 @@ The following crate [feature flags](https://doc.rust-lang.org/cargo/reference/fe
- `std` (enabled by default): Get floating point functions from the standard library (likely using your target's libc).
Also allows using [`Level::new`] on all platforms, to detect which target features are enabled.
- `libm`: Use floating point implementations from [libm].
- `sse4_2` (enabled by default): Enable the SSE4.2/x86-64-v2 runtime SIMD level on `x86` and `x86_64`.
- `avx2` (enabled by default): Enable the AVX2/x86-64-v3 runtime SIMD level on `x86` and `x86_64`.
- `force_support_fallback`: Force the scalar fallback to be supported, even if your compilation target has a better baseline.

At least one of `std` and `libm` is required; `std` overrides `libm`.
Expand Down
8 changes: 4 additions & 4 deletions fearless_simd/src/generated.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,27 +44,27 @@
//!
//! All files in this subdirectory are autogenerated by the `fearless_simd_gen` crate.

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[cfg(all(feature = "avx2", any(target_arch = "x86", target_arch = "x86_64")))]
mod avx2;
mod fallback;
#[cfg(target_arch = "aarch64")]
mod neon;
mod ops;
pub(crate) mod simd_trait;
mod simd_types;
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[cfg(all(feature = "sse4_2", any(target_arch = "x86", target_arch = "x86_64")))]
mod sse4_2;
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
mod wasm;

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[cfg(all(feature = "avx2", any(target_arch = "x86", target_arch = "x86_64")))]
pub use avx2::*;
pub use fallback::*;
#[cfg(target_arch = "aarch64")]
pub use neon::*;
pub use simd_trait::*;
pub use simd_types::*;
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[cfg(all(feature = "sse4_2", any(target_arch = "x86", target_arch = "x86_64")))]
pub use sse4_2::*;
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
pub use wasm::*;
204 changes: 204 additions & 0 deletions fearless_simd/src/generated/avx2.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8038,6 +8038,210 @@ impl Simd for Avx2 {
)
}
}
#[cfg(not(feature = "sse4_2"))]
// Compiled only when the `sse4_2` feature is disabled.
// NOTE(review): presumably the `sse4_2` module supplies this impl otherwise — confirm.
impl<S: Simd> SimdFrom<__m128, S> for f32x4<S> {
    /// Builds an `f32x4` by bit-copying the raw `__m128` value `arch` into `val`
    /// and pairing it with the `simd` token.
    #[inline(always)]
    fn simd_from(simd: S, arch: __m128) -> Self {
        // SAFETY: NOTE(review): sound only if the `val` field is no larger than
        // `__m128` and accepts every bit pattern; the field type is declared
        // elsewhere — confirm.
        let val = unsafe { core::mem::transmute_copy(&arch) };
        Self { val, simd }
    }
}
#[cfg(not(feature = "sse4_2"))]
// Compiled only when the `sse4_2` feature is disabled.
impl<S: Simd> From<f32x4<S>> for __m128 {
    /// Extracts a raw `__m128` by bit-copying the vector's `val` field.
    #[inline(always)]
    fn from(value: f32x4<S>) -> Self {
        let payload = &value.val;
        // SAFETY: NOTE(review): sound only if `val` is at least as large as
        // `__m128`; the field type is declared elsewhere — confirm.
        unsafe { core::mem::transmute_copy::<_, Self>(payload) }
    }
}
#[cfg(not(feature = "sse4_2"))]
// Compiled only when the `sse4_2` feature is disabled.
// NOTE(review): presumably the `sse4_2` module supplies this impl otherwise — confirm.
impl<S: Simd> SimdFrom<__m128i, S> for i8x16<S> {
    /// Builds an `i8x16` by bit-copying the raw `__m128i` value `arch` into `val`
    /// and pairing it with the `simd` token.
    #[inline(always)]
    fn simd_from(simd: S, arch: __m128i) -> Self {
        // SAFETY: NOTE(review): sound only if the `val` field is no larger than
        // `__m128i` and accepts every bit pattern; the field type is declared
        // elsewhere — confirm.
        let val = unsafe { core::mem::transmute_copy(&arch) };
        Self { val, simd }
    }
}
#[cfg(not(feature = "sse4_2"))]
// Compiled only when the `sse4_2` feature is disabled.
impl<S: Simd> From<i8x16<S>> for __m128i {
    /// Extracts a raw `__m128i` by bit-copying the vector's `val` field.
    #[inline(always)]
    fn from(value: i8x16<S>) -> Self {
        let payload = &value.val;
        // SAFETY: NOTE(review): sound only if `val` is at least as large as
        // `__m128i`; the field type is declared elsewhere — confirm.
        unsafe { core::mem::transmute_copy::<_, Self>(payload) }
    }
}
#[cfg(not(feature = "sse4_2"))]
// Compiled only when the `sse4_2` feature is disabled.
// NOTE(review): presumably the `sse4_2` module supplies this impl otherwise — confirm.
impl<S: Simd> SimdFrom<__m128i, S> for u8x16<S> {
    /// Builds a `u8x16` by bit-copying the raw `__m128i` value `arch` into `val`
    /// and pairing it with the `simd` token.
    #[inline(always)]
    fn simd_from(simd: S, arch: __m128i) -> Self {
        // SAFETY: NOTE(review): sound only if the `val` field is no larger than
        // `__m128i` and accepts every bit pattern; the field type is declared
        // elsewhere — confirm.
        let val = unsafe { core::mem::transmute_copy(&arch) };
        Self { val, simd }
    }
}
#[cfg(not(feature = "sse4_2"))]
// Compiled only when the `sse4_2` feature is disabled.
impl<S: Simd> From<u8x16<S>> for __m128i {
    /// Extracts a raw `__m128i` by bit-copying the vector's `val` field.
    #[inline(always)]
    fn from(value: u8x16<S>) -> Self {
        let payload = &value.val;
        // SAFETY: NOTE(review): sound only if `val` is at least as large as
        // `__m128i`; the field type is declared elsewhere — confirm.
        unsafe { core::mem::transmute_copy::<_, Self>(payload) }
    }
}
#[cfg(not(feature = "sse4_2"))]
// Compiled only when the `sse4_2` feature is disabled.
// NOTE(review): presumably the `sse4_2` module supplies this impl otherwise — confirm.
impl<S: Simd> SimdFrom<__m128i, S> for mask8x16<S> {
    /// Builds a `mask8x16` by bit-copying the raw `__m128i` value `arch` into `val`
    /// and pairing it with the `simd` token.
    #[inline(always)]
    fn simd_from(simd: S, arch: __m128i) -> Self {
        // SAFETY: NOTE(review): sound only if the `val` field is no larger than
        // `__m128i` and accepts every bit pattern; the field type is declared
        // elsewhere — confirm.
        let val = unsafe { core::mem::transmute_copy(&arch) };
        Self { val, simd }
    }
}
#[cfg(not(feature = "sse4_2"))]
// Compiled only when the `sse4_2` feature is disabled.
impl<S: Simd> From<mask8x16<S>> for __m128i {
    /// Extracts a raw `__m128i` by bit-copying the mask's `val` field.
    #[inline(always)]
    fn from(value: mask8x16<S>) -> Self {
        let payload = &value.val;
        // SAFETY: NOTE(review): sound only if `val` is at least as large as
        // `__m128i`; the field type is declared elsewhere — confirm.
        unsafe { core::mem::transmute_copy::<_, Self>(payload) }
    }
}
#[cfg(not(feature = "sse4_2"))]
// Compiled only when the `sse4_2` feature is disabled.
// NOTE(review): presumably the `sse4_2` module supplies this impl otherwise — confirm.
impl<S: Simd> SimdFrom<__m128i, S> for i16x8<S> {
    /// Builds an `i16x8` by bit-copying the raw `__m128i` value `arch` into `val`
    /// and pairing it with the `simd` token.
    #[inline(always)]
    fn simd_from(simd: S, arch: __m128i) -> Self {
        // SAFETY: NOTE(review): sound only if the `val` field is no larger than
        // `__m128i` and accepts every bit pattern; the field type is declared
        // elsewhere — confirm.
        let val = unsafe { core::mem::transmute_copy(&arch) };
        Self { val, simd }
    }
}
#[cfg(not(feature = "sse4_2"))]
// Compiled only when the `sse4_2` feature is disabled.
impl<S: Simd> From<i16x8<S>> for __m128i {
    /// Extracts a raw `__m128i` by bit-copying the vector's `val` field.
    #[inline(always)]
    fn from(value: i16x8<S>) -> Self {
        let payload = &value.val;
        // SAFETY: NOTE(review): sound only if `val` is at least as large as
        // `__m128i`; the field type is declared elsewhere — confirm.
        unsafe { core::mem::transmute_copy::<_, Self>(payload) }
    }
}
#[cfg(not(feature = "sse4_2"))]
// Compiled only when the `sse4_2` feature is disabled.
// NOTE(review): presumably the `sse4_2` module supplies this impl otherwise — confirm.
impl<S: Simd> SimdFrom<__m128i, S> for u16x8<S> {
    /// Builds a `u16x8` by bit-copying the raw `__m128i` value `arch` into `val`
    /// and pairing it with the `simd` token.
    #[inline(always)]
    fn simd_from(simd: S, arch: __m128i) -> Self {
        // SAFETY: NOTE(review): sound only if the `val` field is no larger than
        // `__m128i` and accepts every bit pattern; the field type is declared
        // elsewhere — confirm.
        let val = unsafe { core::mem::transmute_copy(&arch) };
        Self { val, simd }
    }
}
#[cfg(not(feature = "sse4_2"))]
// Compiled only when the `sse4_2` feature is disabled.
impl<S: Simd> From<u16x8<S>> for __m128i {
    /// Extracts a raw `__m128i` by bit-copying the vector's `val` field.
    #[inline(always)]
    fn from(value: u16x8<S>) -> Self {
        let payload = &value.val;
        // SAFETY: NOTE(review): sound only if `val` is at least as large as
        // `__m128i`; the field type is declared elsewhere — confirm.
        unsafe { core::mem::transmute_copy::<_, Self>(payload) }
    }
}
#[cfg(not(feature = "sse4_2"))]
// Compiled only when the `sse4_2` feature is disabled.
// NOTE(review): presumably the `sse4_2` module supplies this impl otherwise — confirm.
impl<S: Simd> SimdFrom<__m128i, S> for mask16x8<S> {
    /// Builds a `mask16x8` by bit-copying the raw `__m128i` value `arch` into `val`
    /// and pairing it with the `simd` token.
    #[inline(always)]
    fn simd_from(simd: S, arch: __m128i) -> Self {
        // SAFETY: NOTE(review): sound only if the `val` field is no larger than
        // `__m128i` and accepts every bit pattern; the field type is declared
        // elsewhere — confirm.
        let val = unsafe { core::mem::transmute_copy(&arch) };
        Self { val, simd }
    }
}
#[cfg(not(feature = "sse4_2"))]
// Compiled only when the `sse4_2` feature is disabled.
impl<S: Simd> From<mask16x8<S>> for __m128i {
    /// Extracts a raw `__m128i` by bit-copying the mask's `val` field.
    #[inline(always)]
    fn from(value: mask16x8<S>) -> Self {
        let payload = &value.val;
        // SAFETY: NOTE(review): sound only if `val` is at least as large as
        // `__m128i`; the field type is declared elsewhere — confirm.
        unsafe { core::mem::transmute_copy::<_, Self>(payload) }
    }
}
#[cfg(not(feature = "sse4_2"))]
// Compiled only when the `sse4_2` feature is disabled.
// NOTE(review): presumably the `sse4_2` module supplies this impl otherwise — confirm.
impl<S: Simd> SimdFrom<__m128i, S> for i32x4<S> {
    /// Builds an `i32x4` by bit-copying the raw `__m128i` value `arch` into `val`
    /// and pairing it with the `simd` token.
    #[inline(always)]
    fn simd_from(simd: S, arch: __m128i) -> Self {
        // SAFETY: NOTE(review): sound only if the `val` field is no larger than
        // `__m128i` and accepts every bit pattern; the field type is declared
        // elsewhere — confirm.
        let val = unsafe { core::mem::transmute_copy(&arch) };
        Self { val, simd }
    }
}
#[cfg(not(feature = "sse4_2"))]
// Compiled only when the `sse4_2` feature is disabled.
impl<S: Simd> From<i32x4<S>> for __m128i {
    /// Extracts a raw `__m128i` by bit-copying the vector's `val` field.
    #[inline(always)]
    fn from(value: i32x4<S>) -> Self {
        let payload = &value.val;
        // SAFETY: NOTE(review): sound only if `val` is at least as large as
        // `__m128i`; the field type is declared elsewhere — confirm.
        unsafe { core::mem::transmute_copy::<_, Self>(payload) }
    }
}
#[cfg(not(feature = "sse4_2"))]
// Compiled only when the `sse4_2` feature is disabled.
// NOTE(review): presumably the `sse4_2` module supplies this impl otherwise — confirm.
impl<S: Simd> SimdFrom<__m128i, S> for u32x4<S> {
    /// Builds a `u32x4` by bit-copying the raw `__m128i` value `arch` into `val`
    /// and pairing it with the `simd` token.
    #[inline(always)]
    fn simd_from(simd: S, arch: __m128i) -> Self {
        // SAFETY: NOTE(review): sound only if the `val` field is no larger than
        // `__m128i` and accepts every bit pattern; the field type is declared
        // elsewhere — confirm.
        let val = unsafe { core::mem::transmute_copy(&arch) };
        Self { val, simd }
    }
}
#[cfg(not(feature = "sse4_2"))]
// Compiled only when the `sse4_2` feature is disabled.
impl<S: Simd> From<u32x4<S>> for __m128i {
    /// Extracts a raw `__m128i` by bit-copying the vector's `val` field.
    #[inline(always)]
    fn from(value: u32x4<S>) -> Self {
        let payload = &value.val;
        // SAFETY: NOTE(review): sound only if `val` is at least as large as
        // `__m128i`; the field type is declared elsewhere — confirm.
        unsafe { core::mem::transmute_copy::<_, Self>(payload) }
    }
}
#[cfg(not(feature = "sse4_2"))]
// Compiled only when the `sse4_2` feature is disabled.
// NOTE(review): presumably the `sse4_2` module supplies this impl otherwise — confirm.
impl<S: Simd> SimdFrom<__m128i, S> for mask32x4<S> {
    /// Builds a `mask32x4` by bit-copying the raw `__m128i` value `arch` into `val`
    /// and pairing it with the `simd` token.
    #[inline(always)]
    fn simd_from(simd: S, arch: __m128i) -> Self {
        // SAFETY: NOTE(review): sound only if the `val` field is no larger than
        // `__m128i` and accepts every bit pattern; the field type is declared
        // elsewhere — confirm.
        let val = unsafe { core::mem::transmute_copy(&arch) };
        Self { val, simd }
    }
}
#[cfg(not(feature = "sse4_2"))]
// Compiled only when the `sse4_2` feature is disabled.
impl<S: Simd> From<mask32x4<S>> for __m128i {
    /// Extracts a raw `__m128i` by bit-copying the mask's `val` field.
    #[inline(always)]
    fn from(value: mask32x4<S>) -> Self {
        let payload = &value.val;
        // SAFETY: NOTE(review): sound only if `val` is at least as large as
        // `__m128i`; the field type is declared elsewhere — confirm.
        unsafe { core::mem::transmute_copy::<_, Self>(payload) }
    }
}
#[cfg(not(feature = "sse4_2"))]
// Compiled only when the `sse4_2` feature is disabled.
// NOTE(review): presumably the `sse4_2` module supplies this impl otherwise — confirm.
impl<S: Simd> SimdFrom<__m128d, S> for f64x2<S> {
    /// Builds an `f64x2` by bit-copying the raw `__m128d` value `arch` into `val`
    /// and pairing it with the `simd` token.
    #[inline(always)]
    fn simd_from(simd: S, arch: __m128d) -> Self {
        // SAFETY: NOTE(review): sound only if the `val` field is no larger than
        // `__m128d` and accepts every bit pattern; the field type is declared
        // elsewhere — confirm.
        let val = unsafe { core::mem::transmute_copy(&arch) };
        Self { val, simd }
    }
}
#[cfg(not(feature = "sse4_2"))]
// Compiled only when the `sse4_2` feature is disabled.
impl<S: Simd> From<f64x2<S>> for __m128d {
    /// Extracts a raw `__m128d` by bit-copying the vector's `val` field.
    #[inline(always)]
    fn from(value: f64x2<S>) -> Self {
        let payload = &value.val;
        // SAFETY: NOTE(review): sound only if `val` is at least as large as
        // `__m128d`; the field type is declared elsewhere — confirm.
        unsafe { core::mem::transmute_copy::<_, Self>(payload) }
    }
}
#[cfg(not(feature = "sse4_2"))]
// Compiled only when the `sse4_2` feature is disabled.
// NOTE(review): presumably the `sse4_2` module supplies this impl otherwise — confirm.
impl<S: Simd> SimdFrom<__m128i, S> for mask64x2<S> {
    /// Builds a `mask64x2` by bit-copying the raw `__m128i` value `arch` into `val`
    /// and pairing it with the `simd` token.
    #[inline(always)]
    fn simd_from(simd: S, arch: __m128i) -> Self {
        // SAFETY: NOTE(review): sound only if the `val` field is no larger than
        // `__m128i` and accepts every bit pattern; the field type is declared
        // elsewhere — confirm.
        let val = unsafe { core::mem::transmute_copy(&arch) };
        Self { val, simd }
    }
}
#[cfg(not(feature = "sse4_2"))]
// Compiled only when the `sse4_2` feature is disabled.
impl<S: Simd> From<mask64x2<S>> for __m128i {
    /// Extracts a raw `__m128i` by bit-copying the mask's `val` field.
    #[inline(always)]
    fn from(value: mask64x2<S>) -> Self {
        let payload = &value.val;
        // SAFETY: NOTE(review): sound only if `val` is at least as large as
        // `__m128i`; the field type is declared elsewhere — confirm.
        unsafe { core::mem::transmute_copy::<_, Self>(payload) }
    }
}
impl<S: Simd> SimdFrom<__m256, S> for f32x8<S> {
#[inline(always)]
fn simd_from(simd: S, arch: __m256) -> Self {
Expand Down
14 changes: 7 additions & 7 deletions fearless_simd/src/generated/simd_trait.rs
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ pub trait Simd:
fn neg_f32x4(self, a: f32x4<Self>) -> f32x4<Self>;
#[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."]
fn sqrt_f32x4(self, a: f32x4<Self>) -> f32x4<Self>;
#[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
#[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
fn approximate_recip_f32x4(self, a: f32x4<Self>) -> f32x4<Self>;
#[doc = "Add two vectors element-wise."]
fn add_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self>;
Expand Down Expand Up @@ -857,7 +857,7 @@ pub trait Simd:
fn neg_f64x2(self, a: f64x2<Self>) -> f64x2<Self>;
#[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."]
fn sqrt_f64x2(self, a: f64x2<Self>) -> f64x2<Self>;
#[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
#[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
fn approximate_recip_f64x2(self, a: f64x2<Self>) -> f64x2<Self>;
#[doc = "Add two vectors element-wise."]
fn add_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self>;
Expand Down Expand Up @@ -984,7 +984,7 @@ pub trait Simd:
fn neg_f32x8(self, a: f32x8<Self>) -> f32x8<Self>;
#[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."]
fn sqrt_f32x8(self, a: f32x8<Self>) -> f32x8<Self>;
#[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
#[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
fn approximate_recip_f32x8(self, a: f32x8<Self>) -> f32x8<Self>;
#[doc = "Add two vectors element-wise."]
fn add_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self>;
Expand Down Expand Up @@ -1713,7 +1713,7 @@ pub trait Simd:
fn neg_f64x4(self, a: f64x4<Self>) -> f64x4<Self>;
#[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."]
fn sqrt_f64x4(self, a: f64x4<Self>) -> f64x4<Self>;
#[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
#[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
fn approximate_recip_f64x4(self, a: f64x4<Self>) -> f64x4<Self>;
#[doc = "Add two vectors element-wise."]
fn add_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self>;
Expand Down Expand Up @@ -1844,7 +1844,7 @@ pub trait Simd:
fn neg_f32x16(self, a: f32x16<Self>) -> f32x16<Self>;
#[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."]
fn sqrt_f32x16(self, a: f32x16<Self>) -> f32x16<Self>;
#[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
#[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
fn approximate_recip_f32x16(self, a: f32x16<Self>) -> f32x16<Self>;
#[doc = "Add two vectors element-wise."]
fn add_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self>;
Expand Down Expand Up @@ -2567,7 +2567,7 @@ pub trait Simd:
fn neg_f64x8(self, a: f64x8<Self>) -> f64x8<Self>;
#[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."]
fn sqrt_f64x8(self, a: f64x8<Self>) -> f64x8<Self>;
#[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
#[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
fn approximate_recip_f64x8(self, a: f64x8<Self>) -> f64x8<Self>;
#[doc = "Add two vectors element-wise."]
fn add_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self>;
Expand Down Expand Up @@ -2813,7 +2813,7 @@ pub trait SimdFloat<S: Simd>:
fn abs(self) -> Self;
#[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."]
fn sqrt(self) -> Self;
#[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
#[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
fn approximate_recip(self) -> Self;
#[doc = "Return a vector with the magnitude of `self` and the sign of `rhs` for each element.\n\nThis operation copies the sign bit, so if an input element is NaN, the output element will be a NaN with the same payload and a copied sign bit."]
fn copysign(self, rhs: impl SimdInto<Self, S>) -> Self;
Expand Down
2 changes: 2 additions & 0 deletions fearless_simd/src/generated/sse4_2.rs
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ impl Simd for Sse4_2 {
#[inline(always)]
fn level(self) -> Level {
#[cfg(not(all(
feature = "avx2",
target_feature = "avx2",
target_feature = "bmi1",
target_feature = "bmi2",
Expand All @@ -99,6 +100,7 @@ impl Simd for Sse4_2 {
)))]
return Level::Sse4_2(self);
#[cfg(all(
feature = "avx2",
target_feature = "avx2",
target_feature = "bmi1",
target_feature = "bmi2",
Expand Down
Loading
Loading