egg_adaptive_normalize.h
#ifndef EGG_ADAPTIVE_NORMALIZE_H
#define EGG_ADAPTIVE_NORMALIZE_H
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
// Helper for Adaptive QKV Normalization
// Normalizes values layer-wise (across all warps/heads)
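// For each value v this computes q = clamp(round(127 * v / max|v|), -127, 127),
// where max|v| is the maximum absolute value across all threads in the block.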
template <int N_WARPS>
__device__ __forceinline__ int8_t adaptive_qkv_normalize(
    int32_t val,
    int tid,
    int32_t *warp_maxs  // Shared memory scratchpad (needs N_WARPS integers)
) {
    // 1. Calculate absolute value
    int32_t abs_v = abs(val);
    // 2. Warp-level shuffle reduction to find max(abs_v) in this warp.
    //    The full mask assumes all 32 lanes are active, i.e. the block
    //    launches N_WARPS * 32 threads.
    unsigned int mask = 0xFFFFFFFF;
    for (int offset = 16; offset > 0; offset /= 2) {
        int32_t other = __shfl_down_sync(mask, abs_v, offset);
        if (other > abs_v) abs_v = other;
    }
    int warp_id = tid / 32;
    int lane_id = tid % 32;
    // Lane 0 holds the warp maximum after the reduction; write it to shared memory
    if (lane_id == 0) {
        warp_maxs[warp_id] = abs_v;
    }
    // 3. Synchronize to ensure all warps have written their maxes
    __syncthreads();
    // 4. Layer-level max reduction (Per-Layer Scaling)
    int32_t layer_max = 0;
#pragma unroll
    for (int i = 0; i < N_WARPS; i++) {
        if (warp_maxs[i] > layer_max) layer_max = warp_maxs[i];
    }
    // 5. Scaling. Clamp the denominator to 1 so an all-zero block maps to 0
    //    (a 1e-9f epsilon added to an integer-valued float is absorbed anyway).
    float scale = 127.0f / fmaxf((float)layer_max, 1.0f);
    float scaled_f = (float)val * scale;
    int32_t scaled = (int32_t)roundf(scaled_f);
    // 6. Clip to the symmetric int8 range [-127, 127]
    if (scaled > 127) scaled = 127;
    if (scaled < -127) scaled = -127;
    // 7. Synchronize so the scratchpad can be safely reused by a later call
    __syncthreads();
    return (int8_t)scaled;
}
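// Example usage (a minimal sketch; the kernel name and indexing scheme below
// are illustrative assumptions, not part of this header). Each block launches
// exactly N_WARPS * 32 threads and quantizes one contiguous row of values:
template <int N_WARPS>
__global__ void example_qkv_quantize_kernel(const int32_t *in, int8_t *out) {
    __shared__ int32_t warp_maxs[N_WARPS];  // scratchpad required by the helper
    int tid = threadIdx.x;                  // 0 .. N_WARPS*32 - 1
    int idx = blockIdx.x * blockDim.x + tid;
    out[idx] = adaptive_qkv_normalize<N_WARPS>(in[idx], tid, warp_maxs);
}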
// Helper for Adaptive Layer Normalization
// Normalizes values across the entire layer dimension
template <int N_WARPS>
__device__ __forceinline__ int8_t adaptive_layer_normalize(
    int32_t val,
    int tid,
    int32_t *warp_maxs  // Shared memory scratchpad (needs N_WARPS integers)
) {
    // 1. Calculate absolute value
    int32_t abs_v = abs(val);
    // 2. Warp-level shuffle reduction to find max(abs_v) in this warp
    unsigned int mask = 0xFFFFFFFF;
    for (int offset = 16; offset > 0; offset /= 2) {
        int32_t other = __shfl_down_sync(mask, abs_v, offset);
        if (other > abs_v) abs_v = other;
    }
    int warp_id = tid / 32;
    int lane_id = tid % 32;
    // Lane 0 holds the warp maximum after the reduction; write it to shared memory
    if (lane_id == 0) {
        warp_maxs[warp_id] = abs_v;
    }
    // 3. Synchronize to ensure all warps have written their maxes
    __syncthreads();
    // 4. Layer-level max reduction
    int32_t layer_max = 0;
#pragma unroll
    for (int i = 0; i < N_WARPS; i++) {
        if (warp_maxs[i] > layer_max) layer_max = warp_maxs[i];
    }
    // 5. Scaling (denominator clamped to 1 so an all-zero block maps to 0)
    float scale = 127.0f / fmaxf((float)layer_max, 1.0f);
    float scaled_f = (float)val * scale;
    int32_t scaled = (int32_t)roundf(scaled_f);
    // 6. Clip to the symmetric int8 range [-127, 127]
    if (scaled > 127) scaled = 127;
    if (scaled < -127) scaled = -127;
    // 7. Synchronize so the scratchpad can be safely reused by a later call
    __syncthreads();
    return (int8_t)scaled;
}
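// Host-side launch sketch (illustrative values only; assumes the hypothetical
// example_qkv_quantize_kernel above, with 4 warps = 128 threads per block and
// one block per row of 128 values):
//
//   constexpr int N_WARPS = 4;
//   example_qkv_quantize_kernel<N_WARPS>
//       <<<num_rows, N_WARPS * 32>>>(d_in, d_out);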
#endif // EGG_ADAPTIVE_NORMALIZE_H