-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path06_fma.cpp
More file actions
164 lines (140 loc) · 5.58 KB
/
06_fma.cpp
File metadata and controls
164 lines (140 loc) · 5.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
#include "common.h"
using namespace simd_examples;
// ============================================================================
// Example 6: FMA (Fused Multiply-Add)
// ============================================================================
// FMA computes a*b + c in a single instruction with:
// - Single rounding (better precision than separate multiply + add)
// - Higher throughput (one instruction instead of two)
//
// Key insight: FMA shows speedup when the operation is COMPUTE-BOUND.
// If memory bandwidth is the bottleneck, SIMD won't help much.
//
// This example demonstrates BOTH cases:
// 1. Memory-bound: y[i] = a[i] * b[i] + c[i] (3 loads, 1 store per FMA)
// 2. Compute-bound: accumulate dot product (1 load, no store per iteration)
// ============ Memory-bound example: y = a*b + c ============
// Prevent auto-vectorization for fair comparison
#if defined(__INTEL_LLVM_COMPILER) || defined(__clang__)
__attribute__((noinline, optnone))
void fma_membound_scalar(const float* a, const float* b, const float* c,
float* y, std::size_t n) {
for (std::size_t i = 0; i < n; ++i) {
y[i] = a[i] * b[i] + c[i];
}
}
#else
#pragma GCC push_options
#pragma GCC optimize("no-tree-vectorize", "no-tree-loop-distribute-patterns")
void fma_membound_scalar(const float* a, const float* b, const float* c,
float* y, std::size_t n) {
for (std::size_t i = 0; i < n; ++i) {
y[i] = a[i] * b[i] + c[i];
}
}
#pragma GCC pop_options
#endif
void fma_membound_simd(const float* a, const float* b, const float* c,
float* y, std::size_t n) {
using V = native_simd<float>;
constexpr std::size_t W = V::size();
std::size_t i = 0;
for (; i + W <= n; i += W) {
V va, vb, vc;
va.copy_from(&a[i], stdx::element_aligned);
vb.copy_from(&b[i], stdx::element_aligned);
vc.copy_from(&c[i], stdx::element_aligned);
V r = va * vb + vc; // Compiler generates FMA
r.copy_to(&y[i], stdx::element_aligned);
}
for (; i < n; ++i) y[i] = a[i] * b[i] + c[i];
}
// ============ Compute-bound example: dot product ============
#if defined(__INTEL_LLVM_COMPILER) || defined(__clang__)
// Intel/Clang: use optnone to truly disable all optimizations for fair comparison
__attribute__((noinline, optnone))
float dot_scalar(const float* a, const float* b, std::size_t n) {
float sum = 0.f;
for (std::size_t i = 0; i < n; ++i) {
sum += a[i] * b[i];
}
return sum;
}
#else
#pragma GCC push_options
#pragma GCC optimize("no-tree-vectorize", "no-tree-loop-distribute-patterns")
float dot_scalar(const float* a, const float* b, std::size_t n) {
float sum = 0.f;
for (std::size_t i = 0; i < n; ++i) {
sum += a[i] * b[i]; // FMA: sum = sum + a[i]*b[i]
}
return sum;
}
#pragma GCC pop_options
#endif
float dot_simd(const float* a, const float* b, std::size_t n) {
using V = native_simd<float>;
constexpr std::size_t W = V::size();
V acc = 0.f;
std::size_t i = 0;
for (; i + W <= n; i += W) {
V va, vb;
va.copy_from(&a[i], stdx::element_aligned);
vb.copy_from(&b[i], stdx::element_aligned);
// Note: Intel compiler generates better code with stdx::fma(),
// while GCC generates better code with acc + va * vb.
// Here we use the form that works well with both compilers.
#if defined(__INTEL_LLVM_COMPILER)
acc = stdx::fma(va, vb, acc);
#else
acc = acc + va * vb;
#endif
}
float sum = stdx::reduce(acc);
for (; i < n; ++i) sum += a[i] * b[i];
return sum;
}
int main() {
std::cout << "=== Example 6: FMA (Fused Multiply-Add) ===\n\n";
const std::size_t N = 1ULL << 24;
std::vector<float> a(N), b(N), c(N), y(N);
std::mt19937 rng(42);
std::uniform_real_distribution<float> dist(-1.f, 1.f);
for (std::size_t i = 0; i < N; ++i) {
a[i] = dist(rng);
b[i] = dist(rng);
c[i] = dist(rng);
}
std::cout << "Array size: " << N << " elements\n";
std::cout << "SIMD width: " << native_simd<float>::size() << " floats\n\n";
// Test 1: Memory-bound (y = a*b + c)
std::cout << "TEST 1: Memory-bound (y = a*b + c)\n";
std::cout << " 3 loads + 1 store per FMA = memory bottleneck\n";
double t1_scalar = bench_ms([&]() {
fma_membound_scalar(a.data(), b.data(), c.data(), y.data(), N);
});
double t1_simd = bench_ms([&]() {
fma_membound_simd(a.data(), b.data(), c.data(), y.data(), N);
});
std::cout << " Scalar: " << t1_scalar << " ms\n";
std::cout << " SIMD: " << t1_simd << " ms\n";
std::cout << " Speedup: " << (t1_scalar / t1_simd) << "x\n\n";
// Test 2: Compute-bound (dot product)
std::cout << "TEST 2: Compute-bound (dot product)\n";
std::cout << " 2 loads per FMA, accumulating in registers\n";
double t2_scalar = bench_ms([&]() -> float {
return dot_scalar(a.data(), b.data(), N);
});
double t2_simd = bench_ms([&]() -> float {
return dot_simd(a.data(), b.data(), N);
});
std::cout << " Scalar: " << t2_scalar << " ms\n";
std::cout << " SIMD: " << t2_simd << " ms\n";
std::cout << " Speedup: " << (t2_scalar / t2_simd) << "x\n\n";
std::cout << "KEY LESSON:\n";
std::cout << "-----------\n";
std::cout << "SIMD shines when the operation is COMPUTE-BOUND.\n";
std::cout << "If you're waiting on memory, wider registers don't help.\n";
std::cout << "\nRule of thumb: more computation per byte loaded = better speedup.\n";
return 0;
}