egg_adaptive_normalize.h
#ifndef EGG_ADAPTIVE_NORMALIZE_H
#define EGG_ADAPTIVE_NORMALIZE_H
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
// Helper for Adaptive QKV Normalization
// Normalizes values layer-wise (across all warps/heads)
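// For each value v this computes q = clamp(round(127 * v / max|v|), -127, 127),
// where max|v| is the maximum absolute value across all threads in the block.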
template <int N_WARPS>
__device__ __forceinline__ int8_t adaptive_qkv_normalize(
    int32_t val,
    int tid,
    int32_t *warp_maxs  // Shared memory scratchpad (needs N_WARPS integers)
) {
    // 1. Calculate absolute value
    int32_t abs_v = abs(val);
    // 2. Warp-level shuffle reduction to find max(abs_v) in this warp.
    //    The full mask assumes all 32 lanes are active, i.e. the block
    //    launches N_WARPS * 32 threads.
    unsigned int mask = 0xFFFFFFFF;
    for (int offset = 16; offset > 0; offset /= 2) {
        int32_t other = __shfl_down_sync(mask, abs_v, offset);
        if (other > abs_v) abs_v = other;
    }
    int warp_id = tid / 32;
    int lane_id = tid % 32;
    // Lane 0 holds the warp maximum after the reduction; write it to shared memory
    if (lane_id == 0) {
        warp_maxs[warp_id] = abs_v;
    }
    // 3. Synchronize to ensure all warps have written their maxes
    __syncthreads();
    // 4. Layer-level max reduction (Per-Layer Scaling)
    int32_t layer_max = 0;
#pragma unroll
    for (int i = 0; i < N_WARPS; i++) {
        if (warp_maxs[i] > layer_max) layer_max = warp_maxs[i];
    }
    // 5. Scaling. Clamp the denominator to 1 so an all-zero block maps to 0
    //    (a 1e-9f epsilon added to an integer-valued float is absorbed anyway).
    float scale = 127.0f / fmaxf((float)layer_max, 1.0f);
    float scaled_f = (float)val * scale;
    int32_t scaled = (int32_t)roundf(scaled_f);
    // 6. Clip to the symmetric int8 range [-127, 127]
    if (scaled > 127) scaled = 127;
    if (scaled < -127) scaled = -127;
    // 7. Synchronize so the scratchpad can be safely reused by a later call
    __syncthreads();
    return (int8_t)scaled;
}
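// Example usage (a minimal sketch; the kernel name and indexing scheme below
// are illustrative assumptions, not part of this header). Each block launches
// exactly N_WARPS * 32 threads and quantizes one contiguous row of values:
template <int N_WARPS>
__global__ void example_qkv_quantize_kernel(const int32_t *in, int8_t *out) {
    __shared__ int32_t warp_maxs[N_WARPS];  // scratchpad required by the helper
    int tid = threadIdx.x;                  // 0 .. N_WARPS*32 - 1
    int idx = blockIdx.x * blockDim.x + tid;
    out[idx] = adaptive_qkv_normalize<N_WARPS>(in[idx], tid, warp_maxs);
}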
// Helper for Adaptive Layer Normalization
// Normalizes values across the entire layer dimension
template <int N_WARPS>
__device__ __forceinline__ int8_t adaptive_layer_normalize(
    int32_t val,
    int tid,
    int32_t *warp_maxs  // Shared memory scratchpad (needs N_WARPS integers)
) {
    // 1. Calculate absolute value
    int32_t abs_v = abs(val);
    // 2. Warp-level shuffle reduction to find max(abs_v) in this warp
    unsigned int mask = 0xFFFFFFFF;
    for (int offset = 16; offset > 0; offset /= 2) {
        int32_t other = __shfl_down_sync(mask, abs_v, offset);
        if (other > abs_v) abs_v = other;
    }
    int warp_id = tid / 32;
    int lane_id = tid % 32;
    // Lane 0 holds the warp maximum after the reduction; write it to shared memory
    if (lane_id == 0) {
        warp_maxs[warp_id] = abs_v;
    }
    // 3. Synchronize to ensure all warps have written their maxes
    __syncthreads();
    // 4. Layer-level max reduction
    int32_t layer_max = 0;
#pragma unroll
    for (int i = 0; i < N_WARPS; i++) {
        if (warp_maxs[i] > layer_max) layer_max = warp_maxs[i];
    }
    // 5. Scaling (denominator clamped to 1 so an all-zero block maps to 0)
    float scale = 127.0f / fmaxf((float)layer_max, 1.0f);
    float scaled_f = (float)val * scale;
    int32_t scaled = (int32_t)roundf(scaled_f);
    // 6. Clip to the symmetric int8 range [-127, 127]
    if (scaled > 127) scaled = 127;
    if (scaled < -127) scaled = -127;
    // 7. Synchronize so the scratchpad can be safely reused by a later call
    __syncthreads();
    return (int8_t)scaled;
}
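// Host-side launch sketch (illustrative values only; assumes the hypothetical
// example_qkv_quantize_kernel above, with 4 warps = 128 threads per block and
// one block per row of 128 values):
//
//   constexpr int N_WARPS = 4;
//   example_qkv_quantize_kernel<N_WARPS>
//       <<<num_rows, N_WARPS * 32>>>(d_in, d_out);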
#endif // EGG_ADAPTIVE_NORMALIZE_H