85 changes: 85 additions & 0 deletions .github/workflows/cross-rvv-arch.yml
@@ -0,0 +1,85 @@
# RISC-V RVV cross-compilation build using qemu 11 + gcc 15 (Arch Linux).
#
# Why this workflow exists alongside cross-rvv.yml:
#
# QEMU's RISC-V Vector emulation is dramatically slower than scalar in
# qemu < 11 (see QEMU issue #2137 for documented 100x+ slowdowns of
# auto-vectorised RVV loops under TCG). At vlen=128 the slowdown is large
# enough that gcc's RVV codegen for our test suite causes the qemu-user
# emulator to make no observable progress within the 6h GHA timeout —
# i.e. the apt-shipped qemu-user-static (8.2.x in noble, 9.x in plucky)
# can't run xsimd's full test_xsimd at vlen=128.
#
# Empirically:
# qemu 8.2.2 (Ubuntu 24.04 apt) : test_xsimd at vlen=128 times out
# qemu 9.2.1 (Ubuntu 25.04 plucky) : ditto
# qemu 10.0.8 (Debian trixie) : ditto
# qemu 11.0.0 (Arch) + gcc 15.1 : 367 cases / 5664 asserts in <10 min
#
# So vlen=128 RVV coverage lives in this workflow, which runs the build
# and test inside an `archlinux:latest` container (qemu 11 + gcc 15.1).
# The matching ubuntu-runner workflow `cross-rvv.yml` keeps multi-compiler
# matrix coverage (gcc-14, clang-17/18) for vlens >= 256, where the apt
# qemu is fast enough.

[Contributor] I guess there's no way to have all compilers running on archlinux?
#
# References:
# QEMU 11.0.0 release notes: https://www.qemu.org/2026/04/22/qemu-11-0-0/
# QEMU RVV slowdowns issue: https://gitlab.com/qemu-project/qemu/-/issues/2137
# Ubuntu RVV vstart bug: https://bugs.launchpad.net/ubuntu/+source/qemu/+bug/2095169
name: RISC-V RVV cross-compilation build (qemu 11)
on: [push, pull_request]
concurrency:
  group: ${{ github.workflow }}-${{ github.job }}-${{ github.ref }}
  cancel-in-progress: true
jobs:
  build:
    runs-on: ubuntu-latest
    container: archlinux:latest
    name: 'RISC-V RVV${{ matrix.vector_bits }} (qemu 11)'
    strategy:
      fail-fast: false
      matrix:
        vector_bits:
          - 128
          - 256
          - 512
    steps:
      - name: Setup toolchain and qemu
        run: |
          pacman -Sy --noconfirm
          pacman -S --noconfirm --needed \
            qemu-user-static riscv64-linux-gnu-gcc riscv64-linux-gnu-glibc \
            cmake ninja git ca-certificates
          qemu-riscv64-static --version
          riscv64-linux-gnu-gcc --version | head -1
      - name: Checkout xsimd
        uses: actions/checkout@v6
      - name: Setup
        run: >
          cmake -S . -B _build
          -GNinja
          -DBUILD_TESTS=ON
          -DDOWNLOAD_DOCTEST=ON
          -DCMAKE_BUILD_TYPE=Release
          -DTARGET_ARCH=generic
          -DCMAKE_C_FLAGS="-march=rv64gcv_zvl${{ matrix.vector_bits }}b_zba_zbb_zbs -mrvv-vector-bits=zvl"
          -DCMAKE_CXX_FLAGS="-march=rv64gcv_zvl${{ matrix.vector_bits }}b_zba_zbb_zbs -mrvv-vector-bits=zvl"
          -DCMAKE_TOOLCHAIN_FILE=.github/toolchains/gcc-riscv64-linux-gnu.cmake
      - name: Build
        run: cmake --build _build
      - name: Set CPU feature test expectations
        run: |
          echo "XSIMD_TEST_CPU_ASSUME_SSE4_2=0" >> "$GITHUB_ENV"
          echo "XSIMD_TEST_CPU_ASSUME_SVE=0" >> "$GITHUB_ENV"
          echo "XSIMD_TEST_CPU_ASSUME_RVV=1" >> "$GITHUB_ENV"
      - name: Testing xsimd
        timeout-minutes: 15
        # Invoke qemu-riscv64-static explicitly. Inside the archlinux:latest
        # container we don't have permission to register binfmt_misc with the
        # host kernel, so exec'ing the riscv64 ELF directly fails with
        # "Exec format error".
        run: >
          QEMU_CPU="rv64,zba=true,zbb=true,zbs=true,v=true,vlen=${{ matrix.vector_bits }},elen=64,vext_spec=v1.0"
          QEMU_LD_PREFIX="/usr/riscv64-linux-gnu"
          qemu-riscv64-static ./test/test_xsimd
        working-directory: _build
28 changes: 24 additions & 4 deletions .github/workflows/cross-rvv.yml
@@ -1,3 +1,17 @@
+# RISC-V RVV cross-compilation build (Ubuntu apt qemu, multi-compiler matrix).
+#
+# vlen=128 is intentionally NOT covered here. Ubuntu's qemu-user-static
+# (8.2.x in noble, 9.x in plucky) hangs on the xsimd test_xsimd binary at
+# vlen=128 — see QEMU issue #2137 (RVV TCG slowdowns) for the underlying
+# emulator behaviour. Until ubuntu-latest ships qemu 11+, vlen=128 coverage
+# lives in cross-rvv-arch.yml, which runs inside an archlinux:latest
+# container with qemu 11. Vlens >= 256 run fast enough under the apt qemu
+# to stay within the test step's timeout.
+#
+# References:
+# QEMU 11.0.0 release notes: https://www.qemu.org/2026/04/22/qemu-11-0-0/
+# QEMU RVV slowdowns issue: https://gitlab.com/qemu-project/qemu/-/issues/2137
+# Ubuntu RVV vstart bug: https://bugs.launchpad.net/ubuntu/+source/qemu/+bug/2095169
 name: RISC-V RVV cross-compilation build
 on: [push, pull_request]
 concurrency:
@@ -8,13 +22,13 @@ jobs:
     runs-on: ubuntu-latest
     name: 'RISC-V RVV${{ matrix.vector_bits }}'
     strategy:
+      fail-fast: false
       matrix:
         sys:
           - { compiler: 'gcc', gcc_runtime: '14'}
           - { compiler: 'clang', version: '17', gcc_runtime: '14'}
           - { compiler: 'clang', version: '18', gcc_runtime: '14'}
         vector_bits:
-          - 128
           - 256
           - 512
     steps:
@@ -35,9 +49,14 @@
           sudo ln -srf $(which clang++-${{ matrix.sys.version }}) /usr/bin/clang++
           rm llvm.sh
       - name: Setup QEMU
-        uses: docker/setup-qemu-action@v3.0.0
-        with:
-          platforms: riscv64
+        # Use the qemu-user-static package shipped by the runner image rather
+        # than docker/setup-qemu-action: tonistiigi/binfmt pins an even older
+        # qemu (~6.x/7.x) whose RVV implementation miscompiles vmulh* and is
+        # known to hang test_xsimd until the 6h GHA timeout.
+        run: |
+          sudo apt-get -y -qq update
+          sudo apt-get -y -qq --no-install-suggests --no-install-recommends install qemu-user-static
+          qemu-riscv64-static --version
       - name: Setup Ninja
         run: |
           sudo apt-get -y -qq install ninja-build
@@ -62,6 +81,7 @@
           echo "XSIMD_TEST_CPU_ASSUME_SVE=0" >> "$GITHUB_ENV"
           echo "XSIMD_TEST_CPU_ASSUME_RVV=1" >> "$GITHUB_ENV"
       - name: Testing xsimd
+        timeout-minutes: 15
         run: >
           QEMU_CPU="rv64,zba=true,zbb=true,zbs=true,v=true,vlen=${{ matrix.vector_bits }},elen=64,vext_spec=v1.0"
           QEMU_LD_PREFIX="/usr/riscv64-linux-gnu"
6 changes: 6 additions & 0 deletions docs/source/api/arithmetic_index.rst
@@ -40,6 +40,12 @@ Binary operations:
+---------------------------------------+----------------------------------------------------+
| :cpp:func:`mul`                       | per slot multiply                                  |
+---------------------------------------+----------------------------------------------------+
| :cpp:func:`mullo`                     | low N bits of the 2N-bit integer product           |
+---------------------------------------+----------------------------------------------------+
| :cpp:func:`mulhi`                     | high N bits of the 2N-bit integer product          |
+---------------------------------------+----------------------------------------------------+
| :cpp:func:`mulhilo`                   | pair {hi, lo} of the 2N-bit integer product        |
+---------------------------------------+----------------------------------------------------+
| :cpp:func:`div`                       | per slot division                                  |
+---------------------------------------+----------------------------------------------------+
| :cpp:func:`mod`                       | per slot modulo                                    |
+---------------------------------------+----------------------------------------------------+

[Contributor, on the `mullo` row] indentation seems odd.
[Contributor, on the `mulhi` row] same here
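For reference, a minimal usage sketch of the entry points documented above. This is an illustration, not part of the diff: it assumes the batch-level overloads this PR adds and uses batch::get for scalar extraction.

#include <cstdint>
#include <xsimd/xsimd.hpp>

int main()
{
    // For uint8_t, N = 8: 200 * 100 = 20000 = 0x4e20, so the high 8 bits
    // (mulhi) are 0x4e = 78 and the low 8 bits (mullo) are 0x20 = 32.
    xsimd::batch<uint8_t> a(200), b(100);
    auto hi = xsimd::mulhi(a, b); // every lane holds 78
    auto lo = xsimd::mullo(a, b); // every lane holds 32
    return (hi.get(0) == 78 && lo.get(0) == 32) ? 0 : 1;
}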
112 changes: 112 additions & 0 deletions include/xsimd/arch/common/xsimd_common_arithmetic.hpp
@@ -177,6 +177,118 @@ namespace xsimd
                                 self, other);
        }

        // mulhi
        namespace detail
        {
            template <class T>
            struct mulhi_helper
            {
                // default: use a wider native integer type
                using wider = typename std::conditional<
                    std::is_signed<T>::value,
                    typename std::conditional<sizeof(T) == 1, int16_t,
                                              typename std::conditional<sizeof(T) == 2, int32_t, int64_t>::type>::type,
                    typename std::conditional<sizeof(T) == 1, uint16_t,
                                              typename std::conditional<sizeof(T) == 2, uint32_t, uint64_t>::type>::type>::type;

                static XSIMD_INLINE T compute(T x, T y) noexcept
                {
                    constexpr int shift = 8 * sizeof(T);
                    return static_cast<T>((static_cast<wider>(x) * static_cast<wider>(y)) >> shift);
                }
            };

            // 64-bit unsigned mulhi: native 128-bit arithmetic when available,
            // otherwise a software fallback via 32-bit splits
            XSIMD_INLINE uint64_t mulhi_u64(uint64_t x, uint64_t y) noexcept
            {
#if defined(__SIZEOF_INT128__)
                return static_cast<uint64_t>((static_cast<unsigned __int128>(x) * static_cast<unsigned __int128>(y)) >> 64);
#else
                uint64_t xl = x & 0xffffffffULL;
                uint64_t xh = x >> 32;
                uint64_t yl = y & 0xffffffffULL;
                uint64_t yh = y >> 32;
                uint64_t ll = xl * yl;
                uint64_t lh = xl * yh;
                uint64_t hl = xh * yl;
                uint64_t hh = xh * yh;
                uint64_t mid = (ll >> 32) + (lh & 0xffffffffULL) + (hl & 0xffffffffULL);
                return hh + (lh >> 32) + (hl >> 32) + (mid >> 32);
#endif
            }

            XSIMD_INLINE int64_t mulhi_i64(int64_t x, int64_t y) noexcept
            {
#if defined(__SIZEOF_INT128__)
                return static_cast<int64_t>((static_cast<__int128>(x) * static_cast<__int128>(y)) >> 64);
#else
                uint64_t uhi = mulhi_u64(static_cast<uint64_t>(x), static_cast<uint64_t>(y));
                if (x < 0)
                    uhi -= static_cast<uint64_t>(y);
                if (y < 0)
                    uhi -= static_cast<uint64_t>(x);
                return static_cast<int64_t>(uhi);
#endif
            }

            template <>
            struct mulhi_helper<uint64_t>
            {
                static XSIMD_INLINE uint64_t compute(uint64_t x, uint64_t y) noexcept { return mulhi_u64(x, y); }
            };

            template <>
            struct mulhi_helper<int64_t>
            {
                static XSIMD_INLINE int64_t compute(int64_t x, int64_t y) noexcept { return mulhi_i64(x, y); }
            };

            // Compute the high 64 bits of each lane-wise 64x64 unsigned product,
            // given a "widening mul" functor WMul that takes two batch<uint64_t, A>
            // and returns a batch<uint64_t, A> containing the 64-bit product of the
            // low 32 bits of each 64-bit lane (i.e. _mm*_mul_epu32 wrapped).
            template <class A, class WMul>
            XSIMD_INLINE batch<uint64_t, A> mulhi_u64_core(batch<uint64_t, A> const& x,
                                                           batch<uint64_t, A> const& y,
                                                           WMul mul_epu32) noexcept
            {
                using B = batch<uint64_t, A>;
                const B mask(uint64_t(0xffffffffULL));
                B xl = x & mask;
                B xh = x >> 32;
                B yl = y & mask;
                B yh = y >> 32;
                B ll = mul_epu32(xl, yl);
                B lh = mul_epu32(xl, yh);
                B hl = mul_epu32(xh, yl);
                B hh = mul_epu32(xh, yh);
                B mid = (ll >> 32) + (lh & mask) + (hl & mask);
                return hh + (lh >> 32) + (hl >> 32) + (mid >> 32);
            }

            // Signed variant: unsigned core + sign fixup via arithmetic shift-by-63.
            template <class A, class WMul>
            XSIMD_INLINE batch<int64_t, A> mulhi_i64_core(batch<int64_t, A> const& x,
                                                          batch<int64_t, A> const& y,
                                                          WMul mul_epu32) noexcept
            {
                auto ux = ::xsimd::bitwise_cast<uint64_t>(x);
                auto uy = ::xsimd::bitwise_cast<uint64_t>(y);
                auto uhi = mulhi_u64_core<A>(ux, uy, mul_epu32);
                auto sa = ::xsimd::bitwise_cast<uint64_t>(x >> 63); // all-ones mask where x < 0
                auto sb = ::xsimd::bitwise_cast<uint64_t>(y >> 63); // all-ones mask where y < 0
                return ::xsimd::bitwise_cast<int64_t>(uhi - (uy & sa) - (ux & sb));
            }
        }

        template <class A, class T, class /*=std::enable_if_t<std::is_integral<T>::value>*/>
        XSIMD_INLINE batch<T, A> mulhi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<common>) noexcept
        {
            return detail::apply([](T x, T y) noexcept -> T
                                 { return detail::mulhi_helper<T>::compute(x, y); },
                                 self, other);
        }

        // rotl
        template <class A, class T, class STy>
        XSIMD_INLINE batch<T, A> rotl(batch<T, A> const& self, STy other, requires_arch<common>) noexcept
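To make the 32-bit split and the signed fixup above easy to verify in isolation, here is a standalone sanity-check sketch (not part of the diff; it assumes a gcc/clang-style __int128 only for the reference results):

// Mirrors detail::mulhi_u64 (the non-__int128 branch) and the signed
// correction used by detail::mulhi_i64 / mulhi_i64_core, checked against
// 128-bit reference multiplications.
#include <cassert>
#include <cstdint>

static uint64_t mulhi_u64_split(uint64_t x, uint64_t y)
{
    // x*y = hh*2^64 + (lh + hl)*2^32 + ll; `mid` accumulates the
    // 2^32-weighted terms so their carry (mid >> 32) reaches the high word.
    uint64_t xl = x & 0xffffffffULL, xh = x >> 32;
    uint64_t yl = y & 0xffffffffULL, yh = y >> 32;
    uint64_t ll = xl * yl, lh = xl * yh, hl = xh * yl, hh = xh * yh;
    uint64_t mid = (ll >> 32) + (lh & 0xffffffffULL) + (hl & 0xffffffffULL);
    return hh + (lh >> 32) + (hl >> 32) + (mid >> 32);
}

static int64_t mulhi_i64_fixup(int64_t x, int64_t y)
{
    // Reading x as unsigned adds 2^64 when x < 0, which inflates the high
    // word of the product by y (and symmetrically by x), hence:
    //   hi_signed = hi_unsigned - (x < 0 ? y : 0) - (y < 0 ? x : 0)
    uint64_t uhi = mulhi_u64_split(static_cast<uint64_t>(x), static_cast<uint64_t>(y));
    if (x < 0)
        uhi -= static_cast<uint64_t>(y);
    if (y < 0)
        uhi -= static_cast<uint64_t>(x);
    return static_cast<int64_t>(uhi);
}

int main()
{
    const int64_t samples[] = { 0, 1, -1, 42, -42, INT64_MAX, INT64_MIN,
                                0x0123456789abcdefLL, -0x0123456789abcdefLL };
    for (int64_t a : samples)
        for (int64_t b : samples)
        {
            uint64_t ua = static_cast<uint64_t>(a), ub = static_cast<uint64_t>(b);
            unsigned __int128 up = static_cast<unsigned __int128>(ua) * ub;
            assert(mulhi_u64_split(ua, ub) == static_cast<uint64_t>(up >> 64));
            __int128 sp = static_cast<__int128>(a) * b;
            assert(mulhi_i64_fixup(a, b) == static_cast<int64_t>(sp >> 64));
        }
    return 0;
}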
44 changes: 44 additions & 0 deletions include/xsimd/arch/xsimd_avx2.hpp
@@ -928,6 +928,50 @@ namespace xsimd
            }
        }

        // mulhi
        template <class A>
        XSIMD_INLINE batch<int16_t, A> mulhi(batch<int16_t, A> const& self, batch<int16_t, A> const& other, requires_arch<avx2>) noexcept
        {
            return _mm256_mulhi_epi16(self, other);
        }
        template <class A>
        XSIMD_INLINE batch<uint16_t, A> mulhi(batch<uint16_t, A> const& self, batch<uint16_t, A> const& other, requires_arch<avx2>) noexcept
        {
            return _mm256_mulhi_epu16(self, other);
        }
        template <class A>
        XSIMD_INLINE batch<int32_t, A> mulhi(batch<int32_t, A> const& self, batch<int32_t, A> const& other, requires_arch<avx2>) noexcept
        {
            // even lanes widen in place; odd lanes are first copied into the
            // even slots so _mm256_mul_epi32 sign-extends the right values
            __m256i even = _mm256_mul_epi32(self, other);
            __m256i odd = _mm256_mul_epi32(_mm256_shuffle_epi32(self, _MM_SHUFFLE(3, 3, 1, 1)),
                                           _mm256_shuffle_epi32(other, _MM_SHUFFLE(3, 3, 1, 1)));
            __m256i even_hi = _mm256_srli_epi64(even, 32);
            return _mm256_blend_epi16(even_hi, odd, 0xCC);
        }
        template <class A>
        XSIMD_INLINE batch<uint32_t, A> mulhi(batch<uint32_t, A> const& self, batch<uint32_t, A> const& other, requires_arch<avx2>) noexcept
        {
            __m256i even = _mm256_mul_epu32(self, other);
            __m256i odd = _mm256_mul_epu32(_mm256_srli_epi64(self, 32), _mm256_srli_epi64(other, 32));
            __m256i even_hi = _mm256_srli_epi64(even, 32);
            return _mm256_blend_epi16(even_hi, odd, 0xCC);
        }

        template <class A>
        XSIMD_INLINE batch<uint64_t, A> mulhi(batch<uint64_t, A> const& self, batch<uint64_t, A> const& other, requires_arch<avx2>) noexcept
        {
            return detail::mulhi_u64_core<A>(self, other,
                                             [](batch<uint64_t, A> a, batch<uint64_t, A> b)
                                             { return batch<uint64_t, A>(_mm256_mul_epu32(a, b)); });
        }
        template <class A>
        XSIMD_INLINE batch<int64_t, A> mulhi(batch<int64_t, A> const& self, batch<int64_t, A> const& other, requires_arch<avx2>) noexcept
        {
            return detail::mulhi_i64_core<A>(self, other,
                                             [](batch<uint64_t, A> a, batch<uint64_t, A> b)
                                             { return batch<uint64_t, A>(_mm256_mul_epu32(a, b)); });
        }

        // reduce_add
        template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
        XSIMD_INLINE T reduce_add(batch<T, A> const& self, requires_arch<avx2>) noexcept
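The even/odd widening used by the 32-bit paths above is easier to follow on scalar lanes. A sketch (an illustrative model with plain arrays, not xsimd code) of why shifting the even products right by 32 and keeping the odd products' high halves reproduces per-lane mulhi:

#include <cassert>
#include <cstdint>

int main()
{
    uint32_t x[8] = { 0xffffffffu, 2, 3, 0x80000000u, 5, 6, 7, 0xdeadbeefu };
    uint32_t y[8] = { 0xffffffffu, 3, 5, 2, 7, 11, 13, 0xcafebabeu };
    uint32_t hi[8];

    // Each pair of 32-bit lanes forms one 64-bit lane, as the intrinsics see it.
    for (int lane = 0; lane < 4; ++lane)
    {
        // "even" product: _mm256_mul_epu32(x, y) multiplies the low 32 bits
        // of each 64-bit lane, i.e. the even 32-bit lanes, widening to 64 bits.
        uint64_t even = uint64_t(x[2 * lane]) * y[2 * lane];
        // "odd" product: shifting both inputs right by 32 first moves the odd
        // lanes into the low halves, so the same intrinsic widens those.
        uint64_t odd = uint64_t(x[2 * lane + 1]) * y[2 * lane + 1];
        // even >> 32 leaves the even result in the low (even) 32-bit slot;
        // odd's high 32 bits already sit in the high (odd) slot, which is
        // exactly what the 0xCC / 0xAAAA blends select.
        hi[2 * lane] = uint32_t(even >> 32);
        hi[2 * lane + 1] = uint32_t(odd >> 32);
    }

    for (int i = 0; i < 8; ++i)
        assert(hi[i] == uint32_t((uint64_t(x[i]) * y[i]) >> 32));
    return 0;
}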
12 changes: 12 additions & 0 deletions include/xsimd/arch/xsimd_avx512bw.hpp
@@ -470,6 +470,18 @@
            }
        }

        // mulhi
        template <class A>
        XSIMD_INLINE batch<int16_t, A> mulhi(batch<int16_t, A> const& self, batch<int16_t, A> const& other, requires_arch<avx512bw>) noexcept
        {
            return _mm512_mulhi_epi16(self, other);
        }
        template <class A>
        XSIMD_INLINE batch<uint16_t, A> mulhi(batch<uint16_t, A> const& self, batch<uint16_t, A> const& other, requires_arch<avx512bw>) noexcept
        {
            return _mm512_mulhi_epu16(self, other);
        }

        // neq
        template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
        XSIMD_INLINE batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
35 changes: 35 additions & 0 deletions include/xsimd/arch/xsimd_avx512f.hpp
@@ -1772,6 +1772,41 @@
            }
        }

        // mulhi
        template <class A>
        XSIMD_INLINE batch<int32_t, A> mulhi(batch<int32_t, A> const& self, batch<int32_t, A> const& other, requires_arch<avx512f>) noexcept
        {
            __m512i even = _mm512_mul_epi32(self, other);
            __m512i odd = _mm512_mul_epi32(_mm512_shuffle_epi32(self, _MM_PERM_ENUM(_MM_SHUFFLE(3, 3, 1, 1))),
                                           _mm512_shuffle_epi32(other, _MM_PERM_ENUM(_MM_SHUFFLE(3, 3, 1, 1))));
            __m512i even_hi = _mm512_srli_epi64(even, 32);
            // merge: even_hi has hi in low-32 of each 64, odd has hi in high-32 of each 64
            return _mm512_mask_blend_epi32(static_cast<__mmask16>(0xAAAA), even_hi, odd);
        }
        template <class A>
        XSIMD_INLINE batch<uint32_t, A> mulhi(batch<uint32_t, A> const& self, batch<uint32_t, A> const& other, requires_arch<avx512f>) noexcept
        {
            __m512i even = _mm512_mul_epu32(self, other);
            __m512i odd = _mm512_mul_epu32(_mm512_srli_epi64(self, 32), _mm512_srli_epi64(other, 32));
            __m512i even_hi = _mm512_srli_epi64(even, 32);
            return _mm512_mask_blend_epi32(static_cast<__mmask16>(0xAAAA), even_hi, odd);
        }

        template <class A>
        XSIMD_INLINE batch<uint64_t, A> mulhi(batch<uint64_t, A> const& self, batch<uint64_t, A> const& other, requires_arch<avx512f>) noexcept
        {
            return detail::mulhi_u64_core<A>(self, other,
                                             [](batch<uint64_t, A> a, batch<uint64_t, A> b)
                                             { return batch<uint64_t, A>(_mm512_mul_epu32(a, b)); });
        }
        template <class A>
        XSIMD_INLINE batch<int64_t, A> mulhi(batch<int64_t, A> const& self, batch<int64_t, A> const& other, requires_arch<avx512f>) noexcept
        {
            return detail::mulhi_i64_core<A>(self, other,
                                             [](batch<uint64_t, A> a, batch<uint64_t, A> b)
                                             { return batch<uint64_t, A>(_mm512_mul_epu32(a, b)); });
        }

        // nearbyint
        template <class A>
        XSIMD_INLINE batch<float, A> nearbyint(batch<float, A> const& self, requires_arch<avx512f>) noexcept