Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 18 additions & 9 deletions fearless_simd/src/generated/neon.rs
Original file line number Diff line number Diff line change
Expand Up @@ -780,7 +780,9 @@ impl Simd for Neon {
#[inline(always)]
fn from_bitmask_mask8x16(self, bits: u64) -> mask8x16<Self> {
unsafe {
let shifts = vld1q_s16([15, 14, 13, 12, 11, 10, 9, 8].as_ptr());
let shifts = crate::transmute::checked_transmute_copy::<[i16; 8], int16x8_t>(&[
15, 14, 13, 12, 11, 10, 9, 8,
]);
let lo = vshlq_u16(vdupq_n_u16(bits as u16), shifts);
let hi = vshlq_u16(vdupq_n_u16((bits >> 8) as u16), shifts);
let lo = vcltq_s16(vreinterpretq_s16_u16(lo), vdupq_n_s16(0));
Expand All @@ -795,8 +797,9 @@ impl Simd for Neon {
#[inline(always)]
fn to_bitmask_mask8x16(self, a: mask8x16<Self>) -> u64 {
unsafe {
let weights =
vld1q_u8([1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128].as_ptr());
let weights = crate::transmute::checked_transmute_copy::<[u8; 16], uint8x16_t>(&[
1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128,
]);
let bits = vandq_u8(vreinterpretq_u8_s8(a.into()), weights);
let lo = vaddv_u8(vget_low_u8(bits)) as u64;
let hi = vaddv_u8(vget_high_u8(bits)) as u64;
Expand Down Expand Up @@ -1284,7 +1287,9 @@ impl Simd for Neon {
#[inline(always)]
fn from_bitmask_mask16x8(self, bits: u64) -> mask16x8<Self> {
unsafe {
let shifts = vld1q_s16([15, 14, 13, 12, 11, 10, 9, 8].as_ptr());
let shifts = crate::transmute::checked_transmute_copy::<[i16; 8], int16x8_t>(&[
15, 14, 13, 12, 11, 10, 9, 8,
]);
let shifted = vshlq_u16(vdupq_n_u16(bits as u16), shifts);
let mask = vcltq_s16(vreinterpretq_s16_u16(shifted), vdupq_n_s16(0));
vreinterpretq_s16_u16(mask).simd_into(self)
Expand All @@ -1293,7 +1298,9 @@ impl Simd for Neon {
#[inline(always)]
fn to_bitmask_mask16x8(self, a: mask16x8<Self>) -> u64 {
unsafe {
let weights = vld1q_u16([1, 2, 4, 8, 16, 32, 64, 128].as_ptr());
let weights = crate::transmute::checked_transmute_copy::<[u16; 8], uint16x8_t>(&[
1, 2, 4, 8, 16, 32, 64, 128,
]);
let bits = vandq_u16(vreinterpretq_u16_s16(a.into()), weights);
vaddvq_u16(bits) as u64
}
Expand Down Expand Up @@ -1783,7 +1790,8 @@ impl Simd for Neon {
#[inline(always)]
fn from_bitmask_mask32x4(self, bits: u64) -> mask32x4<Self> {
unsafe {
let shifts = vld1q_s32([31, 30, 29, 28].as_ptr());
let shifts =
crate::transmute::checked_transmute_copy::<[i32; 4], int32x4_t>(&[31, 30, 29, 28]);
let shifted = vshlq_u32(vdupq_n_u32(bits as u32), shifts);
let mask = vcltq_s32(vreinterpretq_s32_u32(shifted), vdupq_n_s32(0));
vreinterpretq_s32_u32(mask).simd_into(self)
Expand All @@ -1792,7 +1800,8 @@ impl Simd for Neon {
#[inline(always)]
fn to_bitmask_mask32x4(self, a: mask32x4<Self>) -> u64 {
unsafe {
let weights = vld1q_u32([1, 2, 4, 8].as_ptr());
let weights =
crate::transmute::checked_transmute_copy::<[u32; 4], uint32x4_t>(&[1, 2, 4, 8]);
let bits = vandq_u32(vreinterpretq_u32_s32(a.into()), weights);
vaddvq_u32(bits) as u64
}
Expand Down Expand Up @@ -2103,7 +2112,7 @@ impl Simd for Neon {
#[inline(always)]
fn from_bitmask_mask64x2(self, bits: u64) -> mask64x2<Self> {
unsafe {
let shifts = vld1q_s64([63, 62].as_ptr());
let shifts = crate::transmute::checked_transmute_copy::<[i64; 2], int64x2_t>(&[63, 62]);
let shifted = vshlq_u64(vdupq_n_u64(bits), shifts);
let mask = vcltq_s64(vreinterpretq_s64_u64(shifted), vdupq_n_s64(0));
vreinterpretq_s64_u64(mask).simd_into(self)
Expand All @@ -2112,7 +2121,7 @@ impl Simd for Neon {
#[inline(always)]
fn to_bitmask_mask64x2(self, a: mask64x2<Self>) -> u64 {
unsafe {
let weights = vld1q_u64([1, 2].as_ptr());
let weights = crate::transmute::checked_transmute_copy::<[u64; 2], uint64x2_t>(&[1, 2]);
let bits = vandq_u64(vreinterpretq_u64_s64(a.into()), weights);
vaddvq_u64(bits)
}
Expand Down
2 changes: 2 additions & 0 deletions fearless_simd/src/transmute.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ use core::arch::aarch64::{
int8x16_t, int8x16x2_t, int8x16x4_t, int16x8_t, int16x8x2_t, int16x8x4_t, int32x4_t,
int32x4x2_t, int32x4x4_t, int64x2_t, int64x2x2_t, int64x2x4_t, uint8x16_t, uint8x16x2_t,
uint8x16x4_t, uint16x8_t, uint16x8x2_t, uint16x8x4_t, uint32x4_t, uint32x4x2_t, uint32x4x4_t,
uint64x2_t,
};
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
use core::arch::wasm32::v128;
Expand Down Expand Up @@ -160,6 +161,7 @@ const _: () = {
unsafe impl SimdPod for uint32x4_t {}
unsafe impl SimdPod for uint32x4x2_t {}
unsafe impl SimdPod for uint32x4x4_t {}
unsafe impl SimdPod for uint64x2_t {}
};

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
Expand Down
48 changes: 35 additions & 13 deletions fearless_simd_gen/src/mk_neon.rs
Original file line number Diff line number Diff line change
Expand Up @@ -580,7 +580,10 @@ impl Neon {
8 => quote! {
#method_sig {
unsafe {
let shifts = vld1q_s16([15, 14, 13, 12, 11, 10, 9, 8].as_ptr());
let shifts =
crate::transmute::checked_transmute_copy::<[i16; 8], int16x8_t>(
&[15, 14, 13, 12, 11, 10, 9, 8],
);
let lo = vshlq_u16(vdupq_n_u16(bits as u16), shifts);
let hi = vshlq_u16(vdupq_n_u16((bits >> 8) as u16), shifts);
let lo = vcltq_s16(vreinterpretq_s16_u16(lo), vdupq_n_s16(0));
Expand All @@ -595,7 +598,10 @@ impl Neon {
16 => quote! {
#method_sig {
unsafe {
let shifts = vld1q_s16([15, 14, 13, 12, 11, 10, 9, 8].as_ptr());
let shifts =
crate::transmute::checked_transmute_copy::<[i16; 8], int16x8_t>(
&[15, 14, 13, 12, 11, 10, 9, 8],
);
let shifted = vshlq_u16(vdupq_n_u16(bits as u16), shifts);
let mask = vcltq_s16(vreinterpretq_s16_u16(shifted), vdupq_n_s16(0));
vreinterpretq_s16_u16(mask).simd_into(self)
Expand All @@ -605,7 +611,10 @@ impl Neon {
32 => quote! {
#method_sig {
unsafe {
let shifts = vld1q_s32([31, 30, 29, 28].as_ptr());
let shifts =
crate::transmute::checked_transmute_copy::<[i32; 4], int32x4_t>(
&[31, 30, 29, 28],
);
let shifted = vshlq_u32(vdupq_n_u32(bits as u32), shifts);
let mask = vcltq_s32(vreinterpretq_s32_u32(shifted), vdupq_n_s32(0));
vreinterpretq_s32_u32(mask).simd_into(self)
Expand All @@ -615,7 +624,10 @@ impl Neon {
64 => quote! {
#method_sig {
unsafe {
let shifts = vld1q_s64([63, 62].as_ptr());
let shifts =
crate::transmute::checked_transmute_copy::<[i64; 2], int64x2_t>(
&[63, 62],
);
let shifted = vshlq_u64(vdupq_n_u64(bits), shifts);
let mask = vcltq_s64(vreinterpretq_s64_u64(shifted), vdupq_n_s64(0));
vreinterpretq_s64_u64(mask).simd_into(self)
Expand All @@ -642,10 +654,13 @@ impl Neon {
8 => quote! {
#method_sig {
unsafe {
let weights = vld1q_u8([
1, 2, 4, 8, 16, 32, 64, 128,
1, 2, 4, 8, 16, 32, 64, 128,
].as_ptr());
let weights =
crate::transmute::checked_transmute_copy::<[u8; 16], uint8x16_t>(
&[
1, 2, 4, 8, 16, 32, 64, 128,
1, 2, 4, 8, 16, 32, 64, 128,
],
);
let bits = vandq_u8(vreinterpretq_u8_s8(a.into()), weights);
let lo = vaddv_u8(vget_low_u8(bits)) as u64;
let hi = vaddv_u8(vget_high_u8(bits)) as u64;
Expand All @@ -656,9 +671,10 @@ impl Neon {
16 => quote! {
#method_sig {
unsafe {
let weights = vld1q_u16([
1, 2, 4, 8, 16, 32, 64, 128,
].as_ptr());
let weights =
crate::transmute::checked_transmute_copy::<[u16; 8], uint16x8_t>(
&[1, 2, 4, 8, 16, 32, 64, 128],
);
let bits = vandq_u16(vreinterpretq_u16_s16(a.into()), weights);
vaddvq_u16(bits) as u64
}
Expand All @@ -667,7 +683,10 @@ impl Neon {
32 => quote! {
#method_sig {
unsafe {
let weights = vld1q_u32([1, 2, 4, 8].as_ptr());
let weights =
crate::transmute::checked_transmute_copy::<[u32; 4], uint32x4_t>(
&[1, 2, 4, 8],
);
let bits = vandq_u32(vreinterpretq_u32_s32(a.into()), weights);
vaddvq_u32(bits) as u64
}
Expand All @@ -676,7 +695,10 @@ impl Neon {
64 => quote! {
#method_sig {
unsafe {
let weights = vld1q_u64([1, 2].as_ptr());
let weights =
crate::transmute::checked_transmute_copy::<[u64; 2], uint64x2_t>(
&[1, 2],
);
let bits = vandq_u64(vreinterpretq_u64_s64(a.into()), weights);
vaddvq_u64(bits)
}
Expand Down
Loading