Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions 03_nf4_dequant/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
data/qlora_test.bin
data/cpp_output.bin
data/py_output.bin
__pycache__/
Binary file added 03_nf4_dequant/NF4反量化算子优化.pdf
Binary file not shown.
78 changes: 69 additions & 9 deletions 03_nf4_dequant/README.md
Original file line number Diff line number Diff line change
@@ -1,16 +1,76 @@
## 简介
## 快速开始性能测试
`conda activate nf4`激活conda环境

凭借你在 InfiniTensor 训练营的经历和人工智能与并行计算方面的出色成绩,你受雇于一家领先的 AI 大模型公司,负责为下一代对话系统开发高效的推理引擎。用户抱怨现有的聊天机器人响应速度太慢,无法支持流畅的实时交互——尤其是在消费级显卡上,大模型的显存占用导致批处理规模受限,吞吐量始终提不上去。
`CUDA_VISIBLE_DEVICES=2 python main.py`运行测试程序,终端输出误差以及性能分析报告

你发现问题的症结在于模型参数量过大。公司采用了 QLoRA 技术,将基座模型量化为 4-bit NF4 格式,显存占用降至原来的 1/4,理论上可以在单卡上运行更大的模型或更大的批次。然而,现有实现的解量化过程成了新的瓶颈:每个矩阵乘法前都需要将 4-bit 权重实时解压回 16-bit,这个操作拖慢了整体计算速度。
`CUDA_VISIBLE_DEVICES=2`指定运行的GPU

## 任务内容
开发一个 CUDA 程序,实现单核的 NF4 解量化算子,将压缩后的 4-bit 权重实时解压为 16-bit 浮点(BF16 或 FP16)输出。
结果示例:
```
--- 正在执行: 编译 C++ CUDA 内核 ---
--- 正在执行: 生成随机量化权重 ---
维度: 32768x32768 | Block: 64 | Group: 256
数据已写入: ./data/qlora_test.bin
--- 运行反量化 ---
耗时: 24.4132 ms | 吞吐: 1106.48 GB/s
基准输出已保存: ./data/py_output.bin
--- 正在执行: 运行自定义 CUDA 内核 ---
原始元素: 1073741824
对齐元素: 1073741824
Launch Kernel...
Kernel 耗时: 21.6289 ms
有效带宽: 1248.91 GB/s
--- 正在执行: 对比精度与性能表现 ---
测试维度: 32768 x 32768
Python: ./data/py_output.bin
C++: ./data/cpp_output.bin

## 📬 有疑问?
==================================================
误差分析报告
==================================================
平均绝对误差 (MAE):6.5369757067e-05
均方误差 (MSE):2.4582243441e-07
最大误差 (Max Diff):1.5625000000e-02
--------------------------------------------------
完全一致元素数:1043509311 / 1073741824
一致率:97.1844%
==================================================

更多详细信息和要求请参考本季度项目文档。
随机数据采样对比
---------------------------------------------------------------------------
Index | Python (BF16) | C++ (BF16) | Diff
---------------------------------------------------------------------------
0 | 0.394531 | 0.394531 | 0.0000e+00
1 | 1.171875 | 1.171875 | 0.0000e+00
2 | -0.215820 | -0.215820 | 0.0000e+00
446162790 | -0.277344 | -0.277344 | 0.0000e+00
1066319814 | 0.000000 | 0.000000 | 0.0000e+00
230426597 | 0.205078 | 0.205078 | 0.0000e+00
951028163 | 0.322266 | 0.322266 | 0.0000e+00
857049811 | -0.566406 | -0.566406 | 0.0000e+00
527929503 | 0.718750 | 0.718750 | 0.0000e+00
23305680 | 0.000000 | 0.000000 | 0.0000e+00
168636035 | 0.134766 | 0.134766 | 0.0000e+00
660748068 | -0.625000 | -0.625000 | 0.0000e+00
801264382 | 0.098633 | 0.098633 | 0.0000e+00
---------------------------------------------------------------------------
bnb耗时:24.41320ms, 带宽:1106.47730GB/s
nf4 kernel耗时:21.62890ms, 带宽:1248.91000GB/s
```

可以在项目群里直接询问导师和助教!
## data目录
data目录存放测试文件以及反量化的输出结果。
在param.txt中可以修改测试用矩阵大小等参数。
data/log目录存放输出的性能日志。

## kernel_test目录
kernel_test目录中是kernel逐步优化的过程。

## src目录

generate_data.py 用于生成随机矩阵,并使用bnb库量化后输出文件qlora_test.bin供nf4_kernel.cu读取进行反量化。同时generate_data.py将量化后的矩阵进行反量化并计时,性能日志输出到data/log/log_py.txt,运算结果输出到data/py_output.bin

nf4_kernel.cu是CUDA实现的nf4反量化算子,从qlora_test.bin中读取量化后的矩阵进行反量化,并将结果输出到data/cpp_output.bin,性能日志输出到data/log/log_cpp.txt

benchmark.py 是测试程序。读取cpp_output.bin与py_output.bin,对二者反量化结果进行精度对比。

Good luck and happy coding! 🚀
1 change: 1 addition & 0 deletions 03_nf4_dequant/data/log/log_cpp.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
21.6248,1249.15
1 change: 1 addition & 0 deletions 03_nf4_dequant/data/log/log_py.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
21.2040,1273.9424
6 changes: 6 additions & 0 deletions 03_nf4_dequant/data/param.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
blocksize = 64
groupsize = 256
compute_type = "bf16"
target_gpu = "T4"
rows = 32768
cols = 32768
243 changes: 243 additions & 0 deletions 03_nf4_dequant/kernel_test/ALU_optim.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,243 @@
#include <iostream>
#include <fstream>
#include <vector>
#include <cstdint>
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>

#define CHECK_CUDA(call) \
{ \
cudaError_t err = call; \
if (err != cudaSuccess) { \
std::cerr << "CUDA Error: " << cudaGetErrorString(err) \
<< " at line " << __LINE__ << std::endl; \
exit(EXIT_FAILURE); \
} \
}

void read_file(
const std::string file_path, std::vector<uint8_t>& packed_weights,
std::vector<uint8_t>& absmax_q,
std::vector<uint16_t>& absmax2,
std::vector<uint16_t>& code2, float& offset,
int64_t& rows, int64_t& cols, int32_t& blocksize
)
{
std::ifstream file(file_path, std::ios::binary);
if (!file) { std::cerr << "无法打开文件" << std::endl; exit(1); }

file.read(reinterpret_cast<char*>(&rows), 8);
file.read(reinterpret_cast<char*>(&cols), 8);
file.read(reinterpret_cast<char*>(&blocksize), 4);

int64_t total_original = rows * cols;
int32_t groupsize = 256;
int32_t alignment = blocksize * groupsize;
int64_t total_aligned = ((total_original + alignment - 1) / alignment) * alignment;

size_t num_blocks = total_aligned / blocksize;
size_t num_groups = total_aligned / (blocksize * groupsize);

std::cout << "原始元素: " << total_original << std::endl;
std::cout << "对齐元素: " << total_aligned << std::endl;

packed_weights.resize(total_aligned / 2);
absmax_q.resize(num_blocks);
absmax2.resize(num_groups);
code2.resize(256);

file.read(reinterpret_cast<char*>(packed_weights.data()), packed_weights.size());
file.read(reinterpret_cast<char*>(absmax_q.data()), absmax_q.size());
file.read(reinterpret_cast<char*>(absmax2.data()), absmax2.size() * 2);
file.read(reinterpret_cast<char*>(code2.data()), 256 * 2);
file.read(reinterpret_cast<char*>(&offset), 4);
}


__constant__ float c_code2[256];
// NF4 table
__constant__ float c_nf4[16] =
{
-1.00000000f, -0.69619280f, -0.52507305f, -0.39491749f,
-0.28444138f, -0.18477343f, -0.09105004f, 0.00000000f,
0.07958030f, 0.16093020f, 0.24611230f, 0.33791524f,
0.44070983f, 0.56261700f, 0.72295684f, 1.00000000f
};

__global__ void dequantize_nf4_kernel_ALU_optim
(
const uint8_t* __restrict__ packed_weights,
const uint8_t* __restrict__ absmax_q,
const half* __restrict__ absmax2,
uint32_t* __restrict__ output_packed,
int num_bytes,
int block_shift,
int group_shift,
float offset
)
{
__shared__ float s_nf4[16];

int tx = threadIdx.x;
if (tx < 16)
{
s_nf4[tx] = c_nf4[tx];
}
__syncthreads();

// 每个线程处理 4 个输入字节(即 8 个 NF4 权重)
// 所以总线程数只需要是 num_bytes / 4
int tid = blockIdx.x * blockDim.x + threadIdx.x;

if (tid * 4 >= num_bytes) return;

// 向量化读取:一次吞下 4 个字节 (32-bit)
uint32_t packed_4bytes = reinterpret_cast<const uint32_t*>(packed_weights)[tid];

// 准备 128-bit 的输出容器 (4 个 uint32_t,每个 uint32_t 包含 2 个 bf16)
uint32_t out_vec[4];

#pragma unroll
for (int i = 0; i < 4; ++i)
{
uint8_t byte_val = (packed_4bytes >> (i * 8)) & 0xFF;

// 计算当前权重的全局索引
int element_idx = (tid * 4 + i) * 2;
int block_idx = element_idx >> block_shift;
int group_idx = block_idx >> group_shift;

float scale_1 = c_code2[absmax_q[block_idx]];

float scale_2 = __half2float(absmax2[group_idx]);
float final_scale = scale_1 * scale_2;

// 解码两个 NF4
uint8_t idx_0 = byte_val >> 4;
uint8_t idx_1 = byte_val & 0x0F;

float v0 = s_nf4[idx_0] * final_scale;
float v1 = s_nf4[idx_1] * final_scale;

__nv_bfloat16 b0 = __float2bfloat16(v0);
__nv_bfloat16 b1 = __float2bfloat16(v1);

uint16_t bits_0 = *reinterpret_cast<unsigned short*>(&b0);
uint16_t bits_1 = *reinterpret_cast<unsigned short*>(&b1);

// 打包结果存入临时数组
out_vec[i] = ((uint32_t)bits_1 << 16) | (uint32_t)bits_0;
}

// 向量化写入
reinterpret_cast<int4*>(output_packed)[tid] = *reinterpret_cast<int4*>(out_vec);
}

void nf4_dequantize_cuda
(
std::vector<uint8_t>& h_packed_weights,
std::vector<uint8_t>& h_absmax_q,
std::vector<uint16_t>& h_absmax2,
std::vector<uint16_t>& h_code2,
int64_t rows, int64_t cols, int32_t blocksize, int32_t groupsize,float offset
)
{
size_t num_bytes = h_packed_weights.size();
size_t out_size = num_bytes * sizeof(uint32_t);

float h_code2_f32[256];
for(int i = 0; i < 256; ++i)
{
__half h_val = *reinterpret_cast<__half*>(&h_code2[i]);
h_code2_f32[i] = (float)h_val;
}
CHECK_CUDA(cudaMemcpyToSymbol(c_code2, h_code2_f32, sizeof(h_code2_f32)));

uint8_t *d_packed, *d_absmax_q;
half *d_absmax2;
uint32_t *d_output;

CHECK_CUDA(cudaMalloc(&d_packed, h_packed_weights.size()));
CHECK_CUDA(cudaMalloc(&d_absmax_q, h_absmax_q.size()));
CHECK_CUDA(cudaMalloc(&d_absmax2, h_absmax2.size() * 2));
CHECK_CUDA(cudaMalloc(&d_output, out_size));

CHECK_CUDA(cudaMemcpy(d_packed, h_packed_weights.data(), h_packed_weights.size(), cudaMemcpyHostToDevice));
CHECK_CUDA(cudaMemcpy(d_absmax_q, h_absmax_q.data(), h_absmax_q.size(), cudaMemcpyHostToDevice));
CHECK_CUDA(cudaMemcpy(d_absmax2, h_absmax2.data(), h_absmax2.size() * 2, cudaMemcpyHostToDevice));

int threadsPerBlock = 256;
int num_elements_vec = (num_bytes + 3) / 4;
int blocksPerGrid = (num_elements_vec + threadsPerBlock - 1) / threadsPerBlock;

int block_shift = log2(blocksize);
int group_shift = log2(groupsize);

// warm up
for (int i = 0; i < 5; ++i)
{
dequantize_nf4_kernel_ALU_optim<<<blocksPerGrid, threadsPerBlock>>>
(d_packed, d_absmax_q, d_absmax2, d_output, num_bytes, block_shift, group_shift, offset);
}

cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start); // 开始记录
std::cout << "Launch Kernel...\n";
for (int i = 0; i < 10; ++i)
{
dequantize_nf4_kernel_ALU_optim<<<blocksPerGrid, threadsPerBlock>>>
(d_packed, d_absmax_q, d_absmax2, d_output, num_bytes, block_shift, group_shift, offset);
}
cudaEventRecord(stop); // 结束记录

CHECK_CUDA(cudaEventSynchronize(stop)); // 等待 Event 完成

float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);

// 计算带宽
// 读取: Packed(1) + Indices(1) + Scales(2) + Code2(忽略不计)
size_t total_read = h_packed_weights.size() + h_absmax_q.size() + h_absmax2.size() * 2;
// 写入: Output(4) (因为每个packed byte生成一个uint32)
size_t total_write = num_bytes * 4;

double total_bytes = (double)(total_read + total_write);
double gb_per_sec = (total_bytes * 10 / 1e9) / (milliseconds / 1000.0);

std::cout << "Kernel 耗时: " << milliseconds << " ms" << std::endl;
std::cout << "有效带宽: " << gb_per_sec << " GB/s" << std::endl;

// 保存时间与带宽
std::ofstream timefile("./data/log/log_cpp.txt");
timefile << milliseconds << "," << gb_per_sec;
timefile.close();

std::vector<uint32_t> h_output(num_bytes);
CHECK_CUDA(cudaMemcpy(h_output.data(), d_output, out_size, cudaMemcpyDeviceToHost));

const std::string output_path = "./data/cpp_output.bin";
std::ofstream outfile(output_path, std::ios::binary);
outfile.write(reinterpret_cast<char*>(h_output.data()), out_size);
outfile.close();

cudaFree(d_packed); cudaFree(d_absmax_q); cudaFree(d_absmax2); cudaFree(d_output);
cudaEventDestroy(start); cudaEventDestroy(stop);
}

int main()
{
const std::string file_path = "../data/qlora_test.bin";
std::vector<uint8_t> packed_weights, absmax_q;
std::vector<uint16_t> absmax2, code2;
float offset;
int64_t rows, cols;
int32_t blocksize;
int32_t groupsize = 256;
read_file(file_path, packed_weights, absmax_q, absmax2, code2, offset, rows, cols, blocksize);
nf4_dequantize_cuda(packed_weights, absmax_q, absmax2, code2, rows, cols, blocksize, groupsize, offset);

return 0;
}
Loading