simd/02_sum.cpp at main · marcelcases/simd · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
#include "common.h"

using namespace simd_examples;

// ============================================================================
// Example 2: Sum Reduction
// ============================================================================
// This demonstrates horizontal reduction - summing all elements in a SIMD register.
// This is harder for compilers to auto-vectorize, so explicit SIMD shows clear gains.
//
// The key operation is stdx::reduce() which performs:
//   result = v[0] + v[1] + ... + v[W-1]

// Scalar sum: simple loop
#if defined(__INTEL_LLVM_COMPILER) || defined(__clang__)
__attribute__((noinline, optnone))
float sum_scalar(const float* a, std::size_t n) {
    float s = 0.f;
    for (std::size_t i = 0; i < n; ++i) s += a[i];
    return s;
}
#else
#pragma GCC push_options
#pragma GCC optimize("no-tree-vectorize", "no-tree-loop-distribute-patterns")
float sum_scalar(const float* a, std::size_t n) {
    float s = 0.f;
    for (std::size_t i = 0; i < n; ++i) s += a[i];
    return s;
}
#pragma GCC pop_options
#endif

// SIMD sum: accumulate in SIMD registers, then reduce
float sum_simd(const float* a, std::size_t n) {
    using V = native_simd<float>;
    constexpr std::size_t W = V::size();

    // Start with zero vector
    V acc = 0.f;
    std::size_t i = 0;

    // Main loop: accumulate W elements at a time
    for (; i + W <= n; i += W) {
        V v;
        v.copy_from(a + i, stdx::element_aligned);
        acc += v;  // Element-wise addition
    }

    // Horizontal reduction: sum all elements in the SIMD register
    float s = stdx::reduce(acc);

    // Tail: handle remaining elements
    for (; i < n; ++i) s += a[i];

    return s;
}

int main() {
    std::cout << "=== Example 2: Sum Reduction ===\n\n";

    const std::size_t N = 1ULL << 24;  // 16 million
    std::vector<float> a(N);

    std::mt19937 rng(42);
    std::uniform_real_distribution<float> dist(0.f, 1.f);
    for (auto& x : a) x = dist(rng);

    // Benchmark scalar
    double t_scalar = bench_ms([&]() -> float {
        return sum_scalar(a.data(), N);
    });

    // Benchmark SIMD
    double t_simd = bench_ms([&]() -> float {
        return sum_simd(a.data(), N);
    });

    float chk_scalar = sum_scalar(a.data(), N);
    float chk_simd = sum_simd(a.data(), N);

    std::cout << "Array size: " << N << " elements\n";
    std::cout << "SIMD width: " << native_simd<float>::size() << " floats\n\n";
    std::cout << "Scalar time: " << t_scalar << " ms\n";
    std::cout << "SIMD time:   " << t_simd << " ms\n";
    std::cout << "Speedup:     " << (t_scalar / t_simd) << "x\n\n";

    std::cout << "IMPORTANT: Note that sums match = NO\n";
    std::cout << "This is due to floating-point non-associativity:\n";
    std::cout << "  (a + b) + c  !=  a + (b + c) in floating point\n";
    std::cout << "The SIMD version adds elements in a different order,\n";
    std::cout << "causing different rounding errors.\n";
    std::cout << "For exact reproducibility, use Kahan summation or\n";
    std::cout << "enforce a specific order.\n\n";

    std::cout << "Scalar sum: " << chk_scalar << "\n";
    std::cout << "SIMD sum:   " << chk_simd << "\n";
    std::cout << "Difference: " << std::abs(chk_scalar - chk_simd) << "\n";

    return 0;
}