85 changes: 85 additions & 0 deletions .github/workflows/cross-rvv-arch.yml
@@ -0,0 +1,85 @@
# RISC-V RVV cross-compilation build using qemu 11 + gcc 15 (Arch Linux).
#
# Why this workflow exists alongside cross-rvv.yml:
#
# QEMU's RISC-V Vector emulation is dramatically slower than scalar in
# qemu < 11 (see QEMU issue #2137 for documented 100x+ slowdowns of
# auto-vectorised RVV loops under TCG). At vlen=128 the slowdown is large
# enough that gcc's RVV codegen for our test suite causes the qemu-user
# emulator to make no observable progress within the 6h GHA timeout —
# i.e. the apt-shipped qemu-user-static (8.2.x in noble, 9.x in plucky)
# can't run xsimd's full test_xsimd at vlen=128.
#
# Empirically:
# qemu 8.2.2 (Ubuntu 24.04 apt) : test_xsimd at vlen=128 times out
# qemu 9.2.1 (Ubuntu 25.04 plucky) : ditto
# qemu 10.0.8 (Debian trixie) : ditto
# qemu 11.0.0 (Arch) + gcc 15.1 : 367 cases / 5664 asserts in <10 min
#
# So vlen=128 RVV coverage lives in this workflow, which runs the build
# and test inside an `archlinux:latest` container (qemu 11 + gcc 15.1).
# The matching ubuntu-runner workflow `cross-rvv.yml` keeps multi-compiler
# matrix coverage (gcc-14, clang-17/18) for vlens >= 256, where the apt
# qemu is fast enough.

[Contributor] I guess there's no way to have all compilers running on archlinux?
#
# References:
# QEMU 11.0.0 release notes: https://www.qemu.org/2026/04/22/qemu-11-0-0/
# QEMU RVV slowdowns issue: https://gitlab.com/qemu-project/qemu/-/issues/2137
# Ubuntu RVV vstart bug: https://bugs.launchpad.net/ubuntu/+source/qemu/+bug/2095169
name: RISC-V RVV cross-compilation build (qemu 11)
on: [push, pull_request]
concurrency:
  group: ${{ github.workflow }}-${{ github.job }}-${{ github.ref }}
  cancel-in-progress: true
jobs:
  build:
    runs-on: ubuntu-latest
    container: archlinux:latest
    name: 'RISC-V RVV${{ matrix.vector_bits }} (qemu 11)'
    strategy:
      fail-fast: false
      matrix:
        vector_bits:
          - 128
          - 256
          - 512
    steps:
      - name: Setup toolchain and qemu
        run: |
          pacman -Sy --noconfirm
          pacman -S --noconfirm --needed \
            qemu-user-static riscv64-linux-gnu-gcc riscv64-linux-gnu-glibc \
            cmake ninja git ca-certificates
          qemu-riscv64-static --version
          riscv64-linux-gnu-gcc --version | head -1
      - name: Checkout xsimd
        uses: actions/checkout@v6
      - name: Setup
        run: >
          cmake -S . -B _build
          -GNinja
          -DBUILD_TESTS=ON
          -DDOWNLOAD_DOCTEST=ON
          -DCMAKE_BUILD_TYPE=Release
          -DTARGET_ARCH=generic
          -DCMAKE_C_FLAGS="-march=rv64gcv_zvl${{ matrix.vector_bits }}b_zba_zbb_zbs -mrvv-vector-bits=zvl"
          -DCMAKE_CXX_FLAGS="-march=rv64gcv_zvl${{ matrix.vector_bits }}b_zba_zbb_zbs -mrvv-vector-bits=zvl"
          -DCMAKE_TOOLCHAIN_FILE=.github/toolchains/gcc-riscv64-linux-gnu.cmake
      - name: Build
        run: cmake --build _build
      - name: Set CPU feature test expectations
        run: |
          echo "XSIMD_TEST_CPU_ASSUME_SSE4_2=0" >> "$GITHUB_ENV"
          echo "XSIMD_TEST_CPU_ASSUME_SVE=0" >> "$GITHUB_ENV"
          echo "XSIMD_TEST_CPU_ASSUME_RVV=1" >> "$GITHUB_ENV"
      - name: Testing xsimd
        timeout-minutes: 15
        # Invoke qemu-riscv64-static explicitly. Inside the archlinux:latest
        # container we don't have permission to register binfmt_misc with the
        # host kernel, so exec'ing the riscv64 ELF directly fails with
        # "Exec format error".
        run: >
          QEMU_CPU="rv64,zba=true,zbb=true,zbs=true,v=true,vlen=${{ matrix.vector_bits }},elen=64,vext_spec=v1.0"
          QEMU_LD_PREFIX="/usr/riscv64-linux-gnu"
          qemu-riscv64-static ./test/test_xsimd
        working-directory: _build
28 changes: 24 additions & 4 deletions .github/workflows/cross-rvv.yml
@@ -1,3 +1,17 @@
+# RISC-V RVV cross-compilation build (Ubuntu apt qemu, multi-compiler matrix).
+#
+# vlen=128 is intentionally NOT covered here. Ubuntu's qemu-user-static
+# (8.2.x in noble, 9.x in plucky) hangs on the xsimd test_xsimd binary at
+# vlen=128 — see QEMU issue #2137 (RVV TCG slowdowns) for the underlying
+# emulator behaviour. Until ubuntu-latest ships qemu 11+, vlen=128 coverage
+# lives in cross-rvv-arch.yml, which runs inside an archlinux:latest
+# container with qemu 11. Vlens >= 256 run fast enough under the apt qemu
+# to stay within the test step's timeout.
+#
+# References:
+# QEMU 11.0.0 release notes: https://www.qemu.org/2026/04/22/qemu-11-0-0/
+# QEMU RVV slowdowns issue: https://gitlab.com/qemu-project/qemu/-/issues/2137
+# Ubuntu RVV vstart bug: https://bugs.launchpad.net/ubuntu/+source/qemu/+bug/2095169
 name: RISC-V RVV cross-compilation build
 on: [push, pull_request]
 concurrency:
@@ -8,13 +22,13 @@ jobs:
     runs-on: ubuntu-latest
     name: 'RISC-V RVV${{ matrix.vector_bits }}'
     strategy:
+      fail-fast: false
       matrix:
         sys:
           - { compiler: 'gcc', gcc_runtime: '14'}
           - { compiler: 'clang', version: '17', gcc_runtime: '14'}
           - { compiler: 'clang', version: '18', gcc_runtime: '14'}
         vector_bits:
-          - 128
           - 256
           - 512
     steps:
@@ -35,9 +49,14 @@
           sudo ln -srf $(which clang++-${{ matrix.sys.version }}) /usr/bin/clang++
           rm llvm.sh
       - name: Setup QEMU
-        uses: docker/setup-qemu-action@v3.0.0
-        with:
-          platforms: riscv64
+        # Use the qemu-user-static package shipped by the runner image rather
+        # than docker/setup-qemu-action: tonistiigi/binfmt pins an even older
+        # qemu (~6.x/7.x) whose RVV implementation miscompiles vmulh* and is
+        # known to hang test_xsimd until the 6h GHA timeout.
+        run: |
+          sudo apt-get -y -qq update
+          sudo apt-get -y -qq --no-install-suggests --no-install-recommends install qemu-user-static
+          qemu-riscv64-static --version
       - name: Setup Ninja
         run: |
           sudo apt-get -y -qq install ninja-build
@@ -62,6 +81,7 @@
           echo "XSIMD_TEST_CPU_ASSUME_SVE=0" >> "$GITHUB_ENV"
           echo "XSIMD_TEST_CPU_ASSUME_RVV=1" >> "$GITHUB_ENV"
       - name: Testing xsimd
+        timeout-minutes: 15
         run: >
           QEMU_CPU="rv64,zba=true,zbb=true,zbs=true,v=true,vlen=${{ matrix.vector_bits }},elen=64,vext_spec=v1.0"
           QEMU_LD_PREFIX="/usr/riscv64-linux-gnu"
6 changes: 6 additions & 0 deletions docs/source/api/arithmetic_index.rst
@@ -40,6 +40,12 @@ Binary operations:
+---------------------------------------+----------------------------------------------------+
| :cpp:func:`mul`                       | per slot multiply                                  |
+---------------------------------------+----------------------------------------------------+
| :cpp:func:`mullo`                     | low N bits of the 2N-bit integer product           |
+---------------------------------------+----------------------------------------------------+
| :cpp:func:`mulhi`                     | high N bits of the 2N-bit integer product          |
+---------------------------------------+----------------------------------------------------+
| :cpp:func:`mulhilo`                   | pair {hi, lo} of the 2N-bit integer product        |
+---------------------------------------+----------------------------------------------------+
| :cpp:func:`div`                       | per slot division                                  |
+---------------------------------------+----------------------------------------------------+
| :cpp:func:`mod`                       | per slot modulo                                    |
+---------------------------------------+----------------------------------------------------+

[Contributor, on the `mullo` row] indentation seems odd.
[Contributor, on the `mulhi` row] same here
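For reference, a minimal usage sketch of the entry points documented above. This is an illustration, not part of the diff: it assumes the batch-level overloads this PR adds and uses batch::get for scalar extraction.

#include <cstdint>
#include <xsimd/xsimd.hpp>

int main()
{
    // For uint8_t, N = 8: 200 * 100 = 20000 = 0x4e20, so the high 8 bits
    // (mulhi) are 0x4e = 78 and the low 8 bits (mullo) are 0x20 = 32.
    xsimd::batch<uint8_t> a(200), b(100);
    auto hi = xsimd::mulhi(a, b); // every lane holds 78
    auto lo = xsimd::mullo(a, b); // every lane holds 32
    return (hi.get(0) == 78 && lo.get(0) == 32) ? 0 : 1;
}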
112 changes: 112 additions & 0 deletions include/xsimd/arch/common/xsimd_common_arithmetic.hpp
@@ -177,6 +177,118 @@ namespace xsimd
                                 self, other);
        }

        // mulhi
        namespace detail
        {
            template <class T>
            struct mulhi_helper
            {
                // default: use a wider native integer type
                using wider = typename std::conditional<
                    std::is_signed<T>::value,
                    typename std::conditional<sizeof(T) == 1, int16_t,
                                              typename std::conditional<sizeof(T) == 2, int32_t, int64_t>::type>::type,
                    typename std::conditional<sizeof(T) == 1, uint16_t,
                                              typename std::conditional<sizeof(T) == 2, uint32_t, uint64_t>::type>::type>::type;

                static XSIMD_INLINE T compute(T x, T y) noexcept
                {
                    constexpr int shift = 8 * sizeof(T);
                    return static_cast<T>((static_cast<wider>(x) * static_cast<wider>(y)) >> shift);
                }
            };

            // 64-bit unsigned mulhi: native 128-bit arithmetic when available,
            // otherwise a software fallback via 32-bit splits
            XSIMD_INLINE uint64_t mulhi_u64(uint64_t x, uint64_t y) noexcept
            {
#if defined(__SIZEOF_INT128__)
                return static_cast<uint64_t>((static_cast<unsigned __int128>(x) * static_cast<unsigned __int128>(y)) >> 64);
#else
                uint64_t xl = x & 0xffffffffULL;
                uint64_t xh = x >> 32;
                uint64_t yl = y & 0xffffffffULL;
                uint64_t yh = y >> 32;
                uint64_t ll = xl * yl;
                uint64_t lh = xl * yh;
                uint64_t hl = xh * yl;
                uint64_t hh = xh * yh;
                uint64_t mid = (ll >> 32) + (lh & 0xffffffffULL) + (hl & 0xffffffffULL);
                return hh + (lh >> 32) + (hl >> 32) + (mid >> 32);
#endif
            }

            XSIMD_INLINE int64_t mulhi_i64(int64_t x, int64_t y) noexcept
            {
#if defined(__SIZEOF_INT128__)
                return static_cast<int64_t>((static_cast<__int128>(x) * static_cast<__int128>(y)) >> 64);
#else
                uint64_t uhi = mulhi_u64(static_cast<uint64_t>(x), static_cast<uint64_t>(y));
                if (x < 0)
                    uhi -= static_cast<uint64_t>(y);
                if (y < 0)
                    uhi -= static_cast<uint64_t>(x);
                return static_cast<int64_t>(uhi);
#endif
            }

            template <>
            struct mulhi_helper<uint64_t>
            {
                static XSIMD_INLINE uint64_t compute(uint64_t x, uint64_t y) noexcept { return mulhi_u64(x, y); }
            };

            template <>
            struct mulhi_helper<int64_t>
            {
                static XSIMD_INLINE int64_t compute(int64_t x, int64_t y) noexcept { return mulhi_i64(x, y); }
            };

            // Compute the high 64 bits of each lane-wise 64x64 unsigned product,
            // given a "widening mul" functor WMul that takes two batch<uint64_t, A>
            // and returns a batch<uint64_t, A> containing the 64-bit product of the
            // low 32 bits of each 64-bit lane (i.e. _mm*_mul_epu32 wrapped).
            template <class A, class WMul>
            XSIMD_INLINE batch<uint64_t, A> mulhi_u64_core(batch<uint64_t, A> const& x,
                                                           batch<uint64_t, A> const& y,
                                                           WMul mul_epu32) noexcept
            {
                using B = batch<uint64_t, A>;
                const B mask(uint64_t(0xffffffffULL));
                B xl = x & mask;
                B xh = x >> 32;
                B yl = y & mask;
                B yh = y >> 32;
                B ll = mul_epu32(xl, yl);
                B lh = mul_epu32(xl, yh);
                B hl = mul_epu32(xh, yl);
                B hh = mul_epu32(xh, yh);
                B mid = (ll >> 32) + (lh & mask) + (hl & mask);
                return hh + (lh >> 32) + (hl >> 32) + (mid >> 32);
            }

            // Signed variant: unsigned core + sign fixup via arithmetic shift-by-63.
            template <class A, class WMul>
            XSIMD_INLINE batch<int64_t, A> mulhi_i64_core(batch<int64_t, A> const& x,
                                                          batch<int64_t, A> const& y,
                                                          WMul mul_epu32) noexcept
            {
                auto ux = ::xsimd::bitwise_cast<uint64_t>(x);
                auto uy = ::xsimd::bitwise_cast<uint64_t>(y);
                auto uhi = mulhi_u64_core<A>(ux, uy, mul_epu32);
                auto sa = ::xsimd::bitwise_cast<uint64_t>(x >> 63); // all-ones mask where x < 0
                auto sb = ::xsimd::bitwise_cast<uint64_t>(y >> 63); // all-ones mask where y < 0
                return ::xsimd::bitwise_cast<int64_t>(uhi - (uy & sa) - (ux & sb));
            }
        }

        template <class A, class T, class /*=std::enable_if_t<std::is_integral<T>::value>*/>
        XSIMD_INLINE batch<T, A> mulhi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<common>) noexcept
        {
            return detail::apply([](T x, T y) noexcept -> T
                                 { return detail::mulhi_helper<T>::compute(x, y); },
                                 self, other);
        }

        // rotl
        template <class A, class T, class STy>
        XSIMD_INLINE batch<T, A> rotl(batch<T, A> const& self, STy other, requires_arch<common>) noexcept
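To make the 32-bit split and the signed fixup above easy to verify in isolation, here is a standalone sanity-check sketch (not part of the diff; it assumes a gcc/clang-style __int128 only for the reference results):

// Mirrors detail::mulhi_u64 (the non-__int128 branch) and the signed
// correction used by detail::mulhi_i64 / mulhi_i64_core, checked against
// 128-bit reference multiplications.
#include <cassert>
#include <cstdint>

static uint64_t mulhi_u64_split(uint64_t x, uint64_t y)
{
    // x*y = hh*2^64 + (lh + hl)*2^32 + ll; `mid` accumulates the
    // 2^32-weighted terms so their carry (mid >> 32) reaches the high word.
    uint64_t xl = x & 0xffffffffULL, xh = x >> 32;
    uint64_t yl = y & 0xffffffffULL, yh = y >> 32;
    uint64_t ll = xl * yl, lh = xl * yh, hl = xh * yl, hh = xh * yh;
    uint64_t mid = (ll >> 32) + (lh & 0xffffffffULL) + (hl & 0xffffffffULL);
    return hh + (lh >> 32) + (hl >> 32) + (mid >> 32);
}

static int64_t mulhi_i64_fixup(int64_t x, int64_t y)
{
    // Reading x as unsigned adds 2^64 when x < 0, which inflates the high
    // word of the product by y (and symmetrically by x), hence:
    //   hi_signed = hi_unsigned - (x < 0 ? y : 0) - (y < 0 ? x : 0)
    uint64_t uhi = mulhi_u64_split(static_cast<uint64_t>(x), static_cast<uint64_t>(y));
    if (x < 0)
        uhi -= static_cast<uint64_t>(y);
    if (y < 0)
        uhi -= static_cast<uint64_t>(x);
    return static_cast<int64_t>(uhi);
}

int main()
{
    const int64_t samples[] = { 0, 1, -1, 42, -42, INT64_MAX, INT64_MIN,
                                0x0123456789abcdefLL, -0x0123456789abcdefLL };
    for (int64_t a : samples)
        for (int64_t b : samples)
        {
            uint64_t ua = static_cast<uint64_t>(a), ub = static_cast<uint64_t>(b);
            unsigned __int128 up = static_cast<unsigned __int128>(ua) * ub;
            assert(mulhi_u64_split(ua, ub) == static_cast<uint64_t>(up >> 64));
            __int128 sp = static_cast<__int128>(a) * b;
            assert(mulhi_i64_fixup(a, b) == static_cast<int64_t>(sp >> 64));
        }
    return 0;
}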
44 changes: 44 additions & 0 deletions include/xsimd/arch/xsimd_avx2.hpp
@@ -928,6 +928,50 @@ namespace xsimd
            }
        }

        // mulhi
        template <class A>
        XSIMD_INLINE batch<int16_t, A> mulhi(batch<int16_t, A> const& self, batch<int16_t, A> const& other, requires_arch<avx2>) noexcept
        {
            return _mm256_mulhi_epi16(self, other);
        }
        template <class A>
        XSIMD_INLINE batch<uint16_t, A> mulhi(batch<uint16_t, A> const& self, batch<uint16_t, A> const& other, requires_arch<avx2>) noexcept
        {
            return _mm256_mulhi_epu16(self, other);
        }
        template <class A>
        XSIMD_INLINE batch<int32_t, A> mulhi(batch<int32_t, A> const& self, batch<int32_t, A> const& other, requires_arch<avx2>) noexcept
        {
            // even lanes widen in place; odd lanes are first copied into the
            // even slots so _mm256_mul_epi32 sign-extends the right values
            __m256i even = _mm256_mul_epi32(self, other);
            __m256i odd = _mm256_mul_epi32(_mm256_shuffle_epi32(self, _MM_SHUFFLE(3, 3, 1, 1)),
                                           _mm256_shuffle_epi32(other, _MM_SHUFFLE(3, 3, 1, 1)));
            __m256i even_hi = _mm256_srli_epi64(even, 32);
            return _mm256_blend_epi16(even_hi, odd, 0xCC);
        }
        template <class A>
        XSIMD_INLINE batch<uint32_t, A> mulhi(batch<uint32_t, A> const& self, batch<uint32_t, A> const& other, requires_arch<avx2>) noexcept
        {
            __m256i even = _mm256_mul_epu32(self, other);
            __m256i odd = _mm256_mul_epu32(_mm256_srli_epi64(self, 32), _mm256_srli_epi64(other, 32));
            __m256i even_hi = _mm256_srli_epi64(even, 32);
            return _mm256_blend_epi16(even_hi, odd, 0xCC);
        }

        template <class A>
        XSIMD_INLINE batch<uint64_t, A> mulhi(batch<uint64_t, A> const& self, batch<uint64_t, A> const& other, requires_arch<avx2>) noexcept
        {
            return detail::mulhi_u64_core<A>(self, other,
                                             [](batch<uint64_t, A> a, batch<uint64_t, A> b)
                                             { return batch<uint64_t, A>(_mm256_mul_epu32(a, b)); });
        }
        template <class A>
        XSIMD_INLINE batch<int64_t, A> mulhi(batch<int64_t, A> const& self, batch<int64_t, A> const& other, requires_arch<avx2>) noexcept
        {
            return detail::mulhi_i64_core<A>(self, other,
                                             [](batch<uint64_t, A> a, batch<uint64_t, A> b)
                                             { return batch<uint64_t, A>(_mm256_mul_epu32(a, b)); });
        }

        // reduce_add
        template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
        XSIMD_INLINE T reduce_add(batch<T, A> const& self, requires_arch<avx2>) noexcept
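The even/odd widening used by the 32-bit paths above is easier to follow on scalar lanes. A sketch (an illustrative model with plain arrays, not xsimd code) of why shifting the even products right by 32 and keeping the odd products' high halves reproduces per-lane mulhi:

#include <cassert>
#include <cstdint>

int main()
{
    uint32_t x[8] = { 0xffffffffu, 2, 3, 0x80000000u, 5, 6, 7, 0xdeadbeefu };
    uint32_t y[8] = { 0xffffffffu, 3, 5, 2, 7, 11, 13, 0xcafebabeu };
    uint32_t hi[8];

    // Each pair of 32-bit lanes forms one 64-bit lane, as the intrinsics see it.
    for (int lane = 0; lane < 4; ++lane)
    {
        // "even" product: _mm256_mul_epu32(x, y) multiplies the low 32 bits
        // of each 64-bit lane, i.e. the even 32-bit lanes, widening to 64 bits.
        uint64_t even = uint64_t(x[2 * lane]) * y[2 * lane];
        // "odd" product: shifting both inputs right by 32 first moves the odd
        // lanes into the low halves, so the same intrinsic widens those.
        uint64_t odd = uint64_t(x[2 * lane + 1]) * y[2 * lane + 1];
        // even >> 32 leaves the even result in the low (even) 32-bit slot;
        // odd's high 32 bits already sit in the high (odd) slot, which is
        // exactly what the 0xCC / 0xAAAA blends select.
        hi[2 * lane] = uint32_t(even >> 32);
        hi[2 * lane + 1] = uint32_t(odd >> 32);
    }

    for (int i = 0; i < 8; ++i)
        assert(hi[i] == uint32_t((uint64_t(x[i]) * y[i]) >> 32));
    return 0;
}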
12 changes: 12 additions & 0 deletions include/xsimd/arch/xsimd_avx512bw.hpp
@@ -470,6 +470,18 @@
            }
        }

        // mulhi
        template <class A>
        XSIMD_INLINE batch<int16_t, A> mulhi(batch<int16_t, A> const& self, batch<int16_t, A> const& other, requires_arch<avx512bw>) noexcept
        {
            return _mm512_mulhi_epi16(self, other);
        }
        template <class A>
        XSIMD_INLINE batch<uint16_t, A> mulhi(batch<uint16_t, A> const& self, batch<uint16_t, A> const& other, requires_arch<avx512bw>) noexcept
        {
            return _mm512_mulhi_epu16(self, other);
        }

        // neq
        template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
        XSIMD_INLINE batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
35 changes: 35 additions & 0 deletions include/xsimd/arch/xsimd_avx512f.hpp
@@ -1772,6 +1772,41 @@
            }
        }

        // mulhi
        template <class A>
        XSIMD_INLINE batch<int32_t, A> mulhi(batch<int32_t, A> const& self, batch<int32_t, A> const& other, requires_arch<avx512f>) noexcept
        {
            __m512i even = _mm512_mul_epi32(self, other);
            __m512i odd = _mm512_mul_epi32(_mm512_shuffle_epi32(self, _MM_PERM_ENUM(_MM_SHUFFLE(3, 3, 1, 1))),
                                           _mm512_shuffle_epi32(other, _MM_PERM_ENUM(_MM_SHUFFLE(3, 3, 1, 1))));
            __m512i even_hi = _mm512_srli_epi64(even, 32);
            // merge: even_hi has hi in low-32 of each 64, odd has hi in high-32 of each 64
            return _mm512_mask_blend_epi32(static_cast<__mmask16>(0xAAAA), even_hi, odd);
        }
        template <class A>
        XSIMD_INLINE batch<uint32_t, A> mulhi(batch<uint32_t, A> const& self, batch<uint32_t, A> const& other, requires_arch<avx512f>) noexcept
        {
            __m512i even = _mm512_mul_epu32(self, other);
            __m512i odd = _mm512_mul_epu32(_mm512_srli_epi64(self, 32), _mm512_srli_epi64(other, 32));
            __m512i even_hi = _mm512_srli_epi64(even, 32);
            return _mm512_mask_blend_epi32(static_cast<__mmask16>(0xAAAA), even_hi, odd);
        }

        template <class A>
        XSIMD_INLINE batch<uint64_t, A> mulhi(batch<uint64_t, A> const& self, batch<uint64_t, A> const& other, requires_arch<avx512f>) noexcept
        {
            return detail::mulhi_u64_core<A>(self, other,
                                             [](batch<uint64_t, A> a, batch<uint64_t, A> b)
                                             { return batch<uint64_t, A>(_mm512_mul_epu32(a, b)); });
        }
        template <class A>
        XSIMD_INLINE batch<int64_t, A> mulhi(batch<int64_t, A> const& self, batch<int64_t, A> const& other, requires_arch<avx512f>) noexcept
        {
            return detail::mulhi_i64_core<A>(self, other,
                                             [](batch<uint64_t, A> a, batch<uint64_t, A> b)
                                             { return batch<uint64_t, A>(_mm512_mul_epu32(a, b)); });
        }

        // nearbyint
        template <class A>
        XSIMD_INLINE batch<float, A> nearbyint(batch<float, A> const& self, requires_arch<avx512f>) noexcept