Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 44 additions & 16 deletions fearless_simd/src/generated/avx2.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5540,11 +5540,18 @@ impl Simd for Avx2 {
}
#[inline(always)]
fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16<Self> {
let (chunks, []) = src.as_chunks::<4usize>() else {
unreachable!()
};
let v0: __m128 =
crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[0]);
let v1: __m128 =
crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[1]);
let v2: __m128 =
crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[2]);
let v3: __m128 =
crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[3]);
unsafe {
let v0 = _mm_loadu_ps(src.as_ptr() as *const _);
let v1 = _mm_loadu_ps(src.as_ptr().add(4usize) as *const _);
let v2 = _mm_loadu_ps(src.as_ptr().add(2 * 4usize) as *const _);
let v3 = _mm_loadu_ps(src.as_ptr().add(3 * 4usize) as *const _);
let tmp0 = _mm_unpacklo_ps(v0, v1);
let tmp1 = _mm_unpackhi_ps(v0, v1);
let tmp2 = _mm_unpacklo_ps(v2, v3);
Expand Down Expand Up @@ -6170,11 +6177,18 @@ impl Simd for Avx2 {
}
#[inline(always)]
fn load_interleaved_128_u8x64(self, src: &[u8; 64usize]) -> u8x64<Self> {
let (chunks, []) = src.as_chunks::<16usize>() else {
unreachable!()
};
let v0: __m128i =
crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[0]);
let v1: __m128i =
crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[1]);
let v2: __m128i =
crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[2]);
let v3: __m128i =
crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[3]);
unsafe {
let v0 = _mm_loadu_si128(src.as_ptr() as *const _);
let v1 = _mm_loadu_si128(src.as_ptr().add(16usize) as *const _);
let v2 = _mm_loadu_si128(src.as_ptr().add(2 * 16usize) as *const _);
let v3 = _mm_loadu_si128(src.as_ptr().add(3 * 16usize) as *const _);
let mask = _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
let v0 = _mm_shuffle_epi8(v0, mask);
let v1 = _mm_shuffle_epi8(v1, mask);
Expand Down Expand Up @@ -6931,11 +6945,18 @@ impl Simd for Avx2 {
}
#[inline(always)]
fn load_interleaved_128_u16x32(self, src: &[u16; 32usize]) -> u16x32<Self> {
let (chunks, []) = src.as_chunks::<8usize>() else {
unreachable!()
};
let v0: __m128i =
crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[0]);
let v1: __m128i =
crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[1]);
let v2: __m128i =
crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[2]);
let v3: __m128i =
crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[3]);
unsafe {
let v0 = _mm_loadu_si128(src.as_ptr() as *const _);
let v1 = _mm_loadu_si128(src.as_ptr().add(8usize) as *const _);
let v2 = _mm_loadu_si128(src.as_ptr().add(2 * 8usize) as *const _);
let v3 = _mm_loadu_si128(src.as_ptr().add(3 * 8usize) as *const _);
let mask = _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
let v0 = _mm_shuffle_epi8(v0, mask);
let v1 = _mm_shuffle_epi8(v1, mask);
Expand Down Expand Up @@ -7680,11 +7701,18 @@ impl Simd for Avx2 {
}
#[inline(always)]
fn load_interleaved_128_u32x16(self, src: &[u32; 16usize]) -> u32x16<Self> {
let (chunks, []) = src.as_chunks::<4usize>() else {
unreachable!()
};
let v0: __m128i =
crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[0]);
let v1: __m128i =
crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[1]);
let v2: __m128i =
crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[2]);
let v3: __m128i =
crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[3]);
unsafe {
let v0 = _mm_loadu_si128(src.as_ptr() as *const _);
let v1 = _mm_loadu_si128(src.as_ptr().add(4usize) as *const _);
let v2 = _mm_loadu_si128(src.as_ptr().add(2 * 4usize) as *const _);
let v3 = _mm_loadu_si128(src.as_ptr().add(3 * 4usize) as *const _);
let tmp0 = _mm_unpacklo_epi32(v0, v1);
let tmp1 = _mm_unpackhi_epi32(v0, v1);
let tmp2 = _mm_unpacklo_epi32(v2, v3);
Expand Down
60 changes: 44 additions & 16 deletions fearless_simd/src/generated/sse4_2.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5623,11 +5623,18 @@ impl Simd for Sse4_2 {
}
#[inline(always)]
fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16<Self> {
let (chunks, []) = src.as_chunks::<4usize>() else {
unreachable!()
};
let v0: __m128 =
crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[0]);
let v1: __m128 =
crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[1]);
let v2: __m128 =
crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[2]);
let v3: __m128 =
crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[3]);
unsafe {
let v0 = _mm_loadu_ps(src.as_ptr() as *const _);
let v1 = _mm_loadu_ps(src.as_ptr().add(4usize) as *const _);
let v2 = _mm_loadu_ps(src.as_ptr().add(2 * 4usize) as *const _);
let v3 = _mm_loadu_ps(src.as_ptr().add(3 * 4usize) as *const _);
let tmp0 = _mm_unpacklo_ps(v0, v1);
let tmp1 = _mm_unpackhi_ps(v0, v1);
let tmp2 = _mm_unpacklo_ps(v2, v3);
Expand Down Expand Up @@ -6253,11 +6260,18 @@ impl Simd for Sse4_2 {
}
#[inline(always)]
fn load_interleaved_128_u8x64(self, src: &[u8; 64usize]) -> u8x64<Self> {
let (chunks, []) = src.as_chunks::<16usize>() else {
unreachable!()
};
let v0: __m128i =
crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[0]);
let v1: __m128i =
crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[1]);
let v2: __m128i =
crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[2]);
let v3: __m128i =
crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[3]);
unsafe {
let v0 = _mm_loadu_si128(src.as_ptr() as *const _);
let v1 = _mm_loadu_si128(src.as_ptr().add(16usize) as *const _);
let v2 = _mm_loadu_si128(src.as_ptr().add(2 * 16usize) as *const _);
let v3 = _mm_loadu_si128(src.as_ptr().add(3 * 16usize) as *const _);
let mask = _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
let v0 = _mm_shuffle_epi8(v0, mask);
let v1 = _mm_shuffle_epi8(v1, mask);
Expand Down Expand Up @@ -7020,11 +7034,18 @@ impl Simd for Sse4_2 {
}
#[inline(always)]
fn load_interleaved_128_u16x32(self, src: &[u16; 32usize]) -> u16x32<Self> {
let (chunks, []) = src.as_chunks::<8usize>() else {
unreachable!()
};
let v0: __m128i =
crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[0]);
let v1: __m128i =
crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[1]);
let v2: __m128i =
crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[2]);
let v3: __m128i =
crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[3]);
unsafe {
let v0 = _mm_loadu_si128(src.as_ptr() as *const _);
let v1 = _mm_loadu_si128(src.as_ptr().add(8usize) as *const _);
let v2 = _mm_loadu_si128(src.as_ptr().add(2 * 8usize) as *const _);
let v3 = _mm_loadu_si128(src.as_ptr().add(3 * 8usize) as *const _);
let mask = _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
let v0 = _mm_shuffle_epi8(v0, mask);
let v1 = _mm_shuffle_epi8(v1, mask);
Expand Down Expand Up @@ -7761,11 +7782,18 @@ impl Simd for Sse4_2 {
}
#[inline(always)]
fn load_interleaved_128_u32x16(self, src: &[u32; 16usize]) -> u32x16<Self> {
let (chunks, []) = src.as_chunks::<4usize>() else {
unreachable!()
};
let v0: __m128i =
crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[0]);
let v1: __m128i =
crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[1]);
let v2: __m128i =
crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[2]);
let v3: __m128i =
crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[3]);
unsafe {
let v0 = _mm_loadu_si128(src.as_ptr() as *const _);
let v1 = _mm_loadu_si128(src.as_ptr().add(4usize) as *const _);
let v2 = _mm_loadu_si128(src.as_ptr().add(2 * 4usize) as *const _);
let v3 = _mm_loadu_si128(src.as_ptr().add(3 * 4usize) as *const _);
let tmp0 = _mm_unpacklo_epi32(v0, v1);
let tmp1 = _mm_unpackhi_epi32(v0, v1);
let tmp2 = _mm_unpacklo_epi32(v2, v3);
Expand Down
44 changes: 28 additions & 16 deletions fearless_simd/src/generated/wasm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5518,10 +5518,13 @@ impl Simd for WasmSimd128 {
}
#[inline(always)]
fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16<Self> {
let v0: v128 = unsafe { v128_load(src[0 * 4usize..].as_ptr() as *const v128) };
let v1: v128 = unsafe { v128_load(src[1 * 4usize..].as_ptr() as *const v128) };
let v2: v128 = unsafe { v128_load(src[2 * 4usize..].as_ptr() as *const v128) };
let v3: v128 = unsafe { v128_load(src[3 * 4usize..].as_ptr() as *const v128) };
let (chunks, []) = src.as_chunks::<4usize>() else {
unreachable!()
};
let v0: v128 = crate::transmute::checked_transmute_copy::<[f32; 4usize], v128>(&chunks[0]);
let v1: v128 = crate::transmute::checked_transmute_copy::<[f32; 4usize], v128>(&chunks[1]);
let v2: v128 = crate::transmute::checked_transmute_copy::<[f32; 4usize], v128>(&chunks[2]);
let v3: v128 = crate::transmute::checked_transmute_copy::<[f32; 4usize], v128>(&chunks[3]);
let v01_lower = u32x4_shuffle::<0, 4, 1, 5>(v0, v1);
let v23_lower = u32x4_shuffle::<0, 4, 1, 5>(v2, v3);
let v01_upper = u32x4_shuffle::<2, 6, 3, 7>(v0, v1);
Expand Down Expand Up @@ -6145,10 +6148,13 @@ impl Simd for WasmSimd128 {
}
#[inline(always)]
fn load_interleaved_128_u8x64(self, src: &[u8; 64usize]) -> u8x64<Self> {
let v0: v128 = unsafe { v128_load(src[0 * 16usize..].as_ptr() as *const v128) };
let v1: v128 = unsafe { v128_load(src[1 * 16usize..].as_ptr() as *const v128) };
let v2: v128 = unsafe { v128_load(src[2 * 16usize..].as_ptr() as *const v128) };
let v3: v128 = unsafe { v128_load(src[3 * 16usize..].as_ptr() as *const v128) };
let (chunks, []) = src.as_chunks::<16usize>() else {
unreachable!()
};
let v0: v128 = crate::transmute::checked_transmute_copy::<[u8; 16usize], v128>(&chunks[0]);
let v1: v128 = crate::transmute::checked_transmute_copy::<[u8; 16usize], v128>(&chunks[1]);
let v2: v128 = crate::transmute::checked_transmute_copy::<[u8; 16usize], v128>(&chunks[2]);
let v3: v128 = crate::transmute::checked_transmute_copy::<[u8; 16usize], v128>(&chunks[3]);
let v01_lower =
u8x16_shuffle::<0, 4, 8, 12, 16, 20, 24, 28, 1, 5, 9, 13, 17, 21, 25, 29>(v0, v1);
let v23_lower =
Expand Down Expand Up @@ -6886,10 +6892,13 @@ impl Simd for WasmSimd128 {
}
#[inline(always)]
fn load_interleaved_128_u16x32(self, src: &[u16; 32usize]) -> u16x32<Self> {
let v0: v128 = unsafe { v128_load(src[0 * 8usize..].as_ptr() as *const v128) };
let v1: v128 = unsafe { v128_load(src[1 * 8usize..].as_ptr() as *const v128) };
let v2: v128 = unsafe { v128_load(src[2 * 8usize..].as_ptr() as *const v128) };
let v3: v128 = unsafe { v128_load(src[3 * 8usize..].as_ptr() as *const v128) };
let (chunks, []) = src.as_chunks::<8usize>() else {
unreachable!()
};
let v0: v128 = crate::transmute::checked_transmute_copy::<[u16; 8usize], v128>(&chunks[0]);
let v1: v128 = crate::transmute::checked_transmute_copy::<[u16; 8usize], v128>(&chunks[1]);
let v2: v128 = crate::transmute::checked_transmute_copy::<[u16; 8usize], v128>(&chunks[2]);
let v3: v128 = crate::transmute::checked_transmute_copy::<[u16; 8usize], v128>(&chunks[3]);
let v01_lower = u16x8_shuffle::<0, 4, 8, 12, 1, 5, 9, 13>(v0, v1);
let v23_lower = u16x8_shuffle::<0, 4, 8, 12, 1, 5, 9, 13>(v2, v3);
let v01_upper = u16x8_shuffle::<2, 6, 10, 14, 3, 7, 11, 15>(v0, v1);
Expand Down Expand Up @@ -7609,10 +7618,13 @@ impl Simd for WasmSimd128 {
}
#[inline(always)]
fn load_interleaved_128_u32x16(self, src: &[u32; 16usize]) -> u32x16<Self> {
let v0: v128 = unsafe { v128_load(src[0 * 4usize..].as_ptr() as *const v128) };
let v1: v128 = unsafe { v128_load(src[1 * 4usize..].as_ptr() as *const v128) };
let v2: v128 = unsafe { v128_load(src[2 * 4usize..].as_ptr() as *const v128) };
let v3: v128 = unsafe { v128_load(src[3 * 4usize..].as_ptr() as *const v128) };
let (chunks, []) = src.as_chunks::<4usize>() else {
unreachable!()
};
let v0: v128 = crate::transmute::checked_transmute_copy::<[u32; 4usize], v128>(&chunks[0]);
let v1: v128 = crate::transmute::checked_transmute_copy::<[u32; 4usize], v128>(&chunks[1]);
let v2: v128 = crate::transmute::checked_transmute_copy::<[u32; 4usize], v128>(&chunks[2]);
let v3: v128 = crate::transmute::checked_transmute_copy::<[u32; 4usize], v128>(&chunks[3]);
let v01_lower = u32x4_shuffle::<0, 4, 1, 5>(v0, v1);
let v23_lower = u32x4_shuffle::<0, 4, 1, 5>(v2, v3);
let v01_upper = u32x4_shuffle::<2, 6, 3, 7>(v0, v1);
Expand Down
20 changes: 16 additions & 4 deletions fearless_simd_gen/src/mk_wasm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -600,6 +600,7 @@ impl Level for WasmSimd128 {
} => {
assert_eq!(block_count, 4, "only count of 4 is currently supported");
let elems_per_vec = block_size as usize / vec_ty.scalar_bits;
let scalar_ty = vec_ty.scalar.rust(vec_ty.scalar_bits);

// For WASM we need to simulate interleaving with shuffle, and we only have
// access to 2, 4 and 16 lanes. So, for 64 u8's, we need to split and recombine
Expand Down Expand Up @@ -644,10 +645,21 @@ impl Level for WasmSimd128 {

quote! {
#method_sig {
let v0: v128 = unsafe { v128_load(src[0 * #elems_per_vec..].as_ptr() as *const v128) };
let v1: v128 = unsafe { v128_load(src[1 * #elems_per_vec..].as_ptr() as *const v128) };
let v2: v128 = unsafe { v128_load(src[2 * #elems_per_vec..].as_ptr() as *const v128) };
let v3: v128 = unsafe { v128_load(src[3 * #elems_per_vec..].as_ptr() as *const v128) };
let (chunks, []) = src.as_chunks::<#elems_per_vec>() else {
unreachable!()
};
let v0: v128 = crate::transmute::checked_transmute_copy::<[#scalar_ty; #elems_per_vec], v128>(
&chunks[0],
);
let v1: v128 = crate::transmute::checked_transmute_copy::<[#scalar_ty; #elems_per_vec], v128>(
&chunks[1],
);
let v2: v128 = crate::transmute::checked_transmute_copy::<[#scalar_ty; #elems_per_vec], v128>(
&chunks[2],
);
let v3: v128 = crate::transmute::checked_transmute_copy::<[#scalar_ty; #elems_per_vec], v128>(
&chunks[3],
);

// InterleaveLowerLanes(v0, v2) and InterleaveLowerLanes(v1, v3)
let v01_lower = #shuffle_fn::<#i1>(v0, v1);
Expand Down
25 changes: 18 additions & 7 deletions fearless_simd_gen/src/mk_x86.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1933,8 +1933,8 @@ impl X86 {
32 | 16 | 8 => {
let block_ty =
VecType::new(vec_ty.scalar, vec_ty.scalar_bits, 128 / vec_ty.scalar_bits);
let load_unaligned =
intrinsic_ident("loadu", coarse_type(&block_ty), block_ty.n_bits());
let scalar_ty = block_ty.scalar.rust(block_ty.scalar_bits);
let native_ty = self.arch_ty(&block_ty);
let vec_32 = block_ty.reinterpret(block_ty.scalar, 32);
let unpacklo_32 = simple_sign_unaware_intrinsic("unpacklo", &vec_32);
let unpackhi_32 = simple_sign_unaware_intrinsic("unpackhi", &vec_32);
Expand Down Expand Up @@ -2015,12 +2015,23 @@ impl X86 {
};

quote! {
unsafe {
let v0 = #load_unaligned(src.as_ptr() as *const _);
let v1 = #load_unaligned(src.as_ptr().add(#block_len) as *const _);
let v2 = #load_unaligned(src.as_ptr().add(2 * #block_len) as *const _);
let v3 = #load_unaligned(src.as_ptr().add(3 * #block_len) as *const _);
let (chunks, []) = src.as_chunks::<#block_len>() else {
unreachable!()
};
let v0: #native_ty = crate::transmute::checked_transmute_copy::<[#scalar_ty; #block_len], #native_ty>(
&chunks[0],
);
let v1: #native_ty = crate::transmute::checked_transmute_copy::<[#scalar_ty; #block_len], #native_ty>(
&chunks[1],
);
let v2: #native_ty = crate::transmute::checked_transmute_copy::<[#scalar_ty; #block_len], #native_ty>(
&chunks[2],
);
let v3: #native_ty = crate::transmute::checked_transmute_copy::<[#scalar_ty; #block_len], #native_ty>(
&chunks[3],
);

unsafe {
#init_shuffle

let tmp0 = #unpacklo_32(v0, v1); // [0,4,1,5]
Expand Down
Loading