InfiniTensor · Roundaboutt · Mar 1, 2026 · Mar 1, 2026 · Mar 7, 2026 · Mar 11, 2026
diff --git a/03_nf4_dequant/.gitignore b/03_nf4_dequant/.gitignore
@@ -0,0 +1,4 @@
+data/qlora_test.bin
+data/cpp_output.bin
+data/py_output.bin
+__pycache__/
diff --git a/03_nf4_dequant/NF4反量化算子优化.pdf b/03_nf4_dequant/NF4反量化算子优化.pdf
diff --git a/03_nf4_dequant/README.md b/03_nf4_dequant/README.md
@@ -1,16 +1,76 @@
-## 简介
+## 快速开始性能测试
+`conda activate nf4`激活conda环境
 
-凭借你在 InfiniTensor 训练营的经历和人工智能与并行计算方面的出色成绩，你受雇于一家领先的 AI 大模型公司，负责为下一代对话系统开发高效的推理引擎。用户抱怨现有的聊天机器人响应速度太慢，无法支持流畅的实时交互——尤其是在消费级显卡上，大模型的显存占用导致批处理规模受限，吞吐量始终提不上去。
+`CUDA_VISIBLE_DEVICES=2 python main.py`运行测试程序，终端输出误差以及性能分析报告
 
-你发现问题的症结在于模型参数量过大。公司采用了 QLoRA 技术，将基座模型量化为 4-bit NF4 格式，显存占用降至原来的 1/4，理论上可以在单卡上运行更大的模型或更大的批次。然而，现有实现的解量化过程成了新的瓶颈：每个矩阵乘法前都需要将 4-bit 权重实时解压回 16-bit，这个操作拖慢了整体计算速度。
+`CUDA_VISIBLE_DEVICES=2`指定运行的GPU
 
-## 任务内容
-开发一个 CUDA 程序，实现单核的 NF4 解量化算子，将压缩后的 4-bit 权重实时解压为 16-bit 浮点（BF16 或 FP16）输出。
+结果示例：
+```
+--- 正在执行: 编译 C++ CUDA 内核 ---
+--- 正在执行: 生成随机量化权重 ---
+维度: 32768x32768 | Block: 64 | Group: 256
+数据已写入: ./data/qlora_test.bin
+--- 运行反量化 ---
+耗时: 24.4132 ms | 吞吐: 1106.48 GB/s
+基准输出已保存: ./data/py_output.bin
+--- 正在执行: 运行自定义 CUDA 内核 ---
+原始元素: 1073741824
+对齐元素: 1073741824
+Launch Kernel...
+Kernel 耗时: 21.6289 ms
+有效带宽: 1248.91 GB/s
+--- 正在执行: 对比精度与性能表现 ---
+测试维度: 32768 x 32768
+Python: ./data/py_output.bin
+C++: ./data/cpp_output.bin
 
-## 📬 有疑问?
+==================================================
+误差分析报告
+==================================================
+平均绝对误差 (MAE):6.5369757067e-05
+均方误差 (MSE):2.4582243441e-07
+最大误差 (Max Diff):1.5625000000e-02
+--------------------------------------------------
+完全一致元素数:1043509311 / 1073741824
+一致率:97.1844%
+==================================================
 
-更多详细信息和要求请参考本季度项目文档。
+ 随机数据采样对比
+---------------------------------------------------------------------------
+Index      | Python (BF16)      | C++ (BF16)         | Diff           
+---------------------------------------------------------------------------
+0          | 0.394531           | 0.394531           | 0.0000e+00     
+1          | 1.171875           | 1.171875           | 0.0000e+00     
+2          | -0.215820          | -0.215820          | 0.0000e+00     
+446162790  | -0.277344          | -0.277344          | 0.0000e+00     
+1066319814 | 0.000000           | 0.000000           | 0.0000e+00     
+230426597  | 0.205078           | 0.205078           | 0.0000e+00     
+951028163  | 0.322266           | 0.322266           | 0.0000e+00     
+857049811  | -0.566406          | -0.566406          | 0.0000e+00     
+527929503  | 0.718750           | 0.718750           | 0.0000e+00     
+23305680   | 0.000000           | 0.000000           | 0.0000e+00     
+168636035  | 0.134766           | 0.134766           | 0.0000e+00     
+660748068  | -0.625000          | -0.625000          | 0.0000e+00     
+801264382  | 0.098633           | 0.098633           | 0.0000e+00     
+---------------------------------------------------------------------------
+bnb耗时:24.41320ms, 带宽:1106.47730GB/s
+nf4 kernel耗时:21.62890ms, 带宽:1248.91000GB/s
+```
 
-可以在项目群里直接询问导师和助教！
+## data目录
+data目录存放测试文件以及反量化的输出结果。
+在param.txt中可以修改测试用矩阵大小等参数。
+data/log目录存放输出的性能日志。
+
+## kernel_test目录
+kernel_test目录中是kernel逐步优化的过程。
+
+## src目录
+
+generate_data.py 用于生成随机矩阵，并使用bnb库量化后输出文件qlora_test.bin供nf4_kernel.cu读取进行反量化。同时generate_data.py将量化后的矩阵进行反量化并计时，性能日志输出到data/log/log_py.txt，运算结果输出到data/py_output.bin
+
+nf4_kernel.cu是CUDA实现的nf4反量化算子，从qlora_test.bin中读取量化后的矩阵进行反量化，并将结果输出到data/cpp_output.bin，性能日志输出到data/log/log_cpp.txt
+
+benchmark.py 是测试程序。读取cpp_output.bin与py_output.bin，对二者反量化结果进行精度对比。
 
-Good luck and happy coding! 🚀
diff --git a/03_nf4_dequant/data/log/log_cpp.txt b/03_nf4_dequant/data/log/log_cpp.txt
@@ -0,0 +1 @@
+21.6248,1249.15
diff --git a/03_nf4_dequant/data/log/log_py.txt b/03_nf4_dequant/data/log/log_py.txt
@@ -0,0 +1 @@
+21.2040,1273.9424
diff --git a/03_nf4_dequant/data/param.txt b/03_nf4_dequant/data/param.txt
@@ -0,0 +1,6 @@
+blocksize = 64
+groupsize = 256
+compute_type = "bf16"
+target_gpu = "T4"
+rows = 32768
+cols = 32768
diff --git a/03_nf4_dequant/kernel_test/ALU_optim.cu b/03_nf4_dequant/kernel_test/ALU_optim.cu
@@ -0,0 +1,243 @@
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <cstdint>
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+
+#define CHECK_CUDA(call)                                                 \
+    {                                                                    \
+        cudaError_t err = call;                                          \
+        if (err != cudaSuccess) {                                        \
+            std::cerr << "CUDA Error: " << cudaGetErrorString(err)       \
+                      << " at line " << __LINE__ << std::endl;           \
+            exit(EXIT_FAILURE);                                          \
+        }                                                                \
+    }
+
+void read_file(
+    const std::string file_path, std::vector<uint8_t>& packed_weights, 
+    std::vector<uint8_t>& absmax_q, 
+    std::vector<uint16_t>& absmax2, 
+    std::vector<uint16_t>& code2, float& offset, 
+    int64_t& rows, int64_t& cols, int32_t& blocksize
+) 
+{
+    std::ifstream file(file_path, std::ios::binary);
+    if (!file) { std::cerr << "无法打开文件" << std::endl; exit(1); }
+
+    file.read(reinterpret_cast<char*>(&rows), 8);
+    file.read(reinterpret_cast<char*>(&cols), 8);
+    file.read(reinterpret_cast<char*>(&blocksize), 4);
+
+    int64_t total_original = rows * cols;
+    int32_t groupsize = 256; 
+    int32_t alignment = blocksize * groupsize;
+    int64_t total_aligned = ((total_original + alignment - 1) / alignment) * alignment;
+
+    size_t num_blocks = total_aligned / blocksize;
+    size_t num_groups = total_aligned / (blocksize * groupsize);
+
+    std::cout << "原始元素: " << total_original << std::endl;
+    std::cout << "对齐元素: " << total_aligned << std::endl;
+
+    packed_weights.resize(total_aligned / 2);
+    absmax_q.resize(num_blocks);
+    absmax2.resize(num_groups);
+    code2.resize(256);
+
+    file.read(reinterpret_cast<char*>(packed_weights.data()), packed_weights.size());
+    file.read(reinterpret_cast<char*>(absmax_q.data()), absmax_q.size());
+    file.read(reinterpret_cast<char*>(absmax2.data()), absmax2.size() * 2);
+    file.read(reinterpret_cast<char*>(code2.data()), 256 * 2);
+    file.read(reinterpret_cast<char*>(&offset), 4);
+}
+
+
+__constant__ float c_code2[256];
+// NF4 table
+__constant__ float c_nf4[16] = 
+{
+    -1.00000000f, -0.69619280f, -0.52507305f, -0.39491749f,
+    -0.28444138f, -0.18477343f, -0.09105004f,  0.00000000f,
+    0.07958030f,  0.16093020f,  0.24611230f,  0.33791524f,
+    0.44070983f,  0.56261700f,  0.72295684f,  1.00000000f
+};
+
+__global__ void dequantize_nf4_kernel_ALU_optim
+(
+    const uint8_t*  __restrict__ packed_weights, 
+    const uint8_t*  __restrict__ absmax_q,       
+    const half*     __restrict__ absmax2,
+    uint32_t*       __restrict__ output_packed, 
+    int num_bytes, 
+    int block_shift,
+    int group_shift,
+    float offset
+) 
+{
+    __shared__ float s_nf4[16];
+
+    int tx = threadIdx.x;
+    if (tx < 16) 
+    {
+        s_nf4[tx] = c_nf4[tx];
+    }
+    __syncthreads();
+
+    // 每个线程处理 4 个输入字节（即 8 个 NF4 权重）
+    // 所以总线程数只需要是 num_bytes / 4
+    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (tid * 4 >= num_bytes) return;
+
+    // 向量化读取：一次吞下 4 个字节 (32-bit)
+    uint32_t packed_4bytes = reinterpret_cast<const uint32_t*>(packed_weights)[tid];
+
+    // 准备 128-bit 的输出容器 (4 个 uint32_t，每个 uint32_t 包含 2 个 bf16)
+    uint32_t out_vec[4];
+
+#pragma unroll
+    for (int i = 0; i < 4; ++i) 
+    {
+        uint8_t byte_val = (packed_4bytes >> (i * 8)) & 0xFF;
+
+        // 计算当前权重的全局索引
+        int element_idx = (tid * 4 + i) * 2;
+        int block_idx = element_idx >> block_shift;
+        int group_idx = block_idx >> group_shift;
+
+        float scale_1 = c_code2[absmax_q[block_idx]]; 
+
+        float scale_2 = __half2float(absmax2[group_idx]);
+        float final_scale = scale_1 * scale_2;
+
+        // 解码两个 NF4
+        uint8_t idx_0 = byte_val >> 4;
+        uint8_t idx_1 = byte_val & 0x0F;
+
+        float v0 = s_nf4[idx_0] * final_scale;
+        float v1 = s_nf4[idx_1] * final_scale;
+
+        __nv_bfloat16 b0 = __float2bfloat16(v0);
+        __nv_bfloat16 b1 = __float2bfloat16(v1);
+
+        uint16_t bits_0 = *reinterpret_cast<unsigned short*>(&b0);
+        uint16_t bits_1 = *reinterpret_cast<unsigned short*>(&b1);
+
+        // 打包结果存入临时数组
+        out_vec[i] = ((uint32_t)bits_1 << 16) | (uint32_t)bits_0;
+    }
+
+    // 向量化写入
+    reinterpret_cast<int4*>(output_packed)[tid] = *reinterpret_cast<int4*>(out_vec);
+}
+
+void nf4_dequantize_cuda
+(
+    std::vector<uint8_t>& h_packed_weights, 
+    std::vector<uint8_t>& h_absmax_q,
+    std::vector<uint16_t>& h_absmax2, 
+    std::vector<uint16_t>& h_code2,
+    int64_t rows, int64_t cols, int32_t blocksize, int32_t groupsize,float offset
+)
+{
+    size_t num_bytes = h_packed_weights.size();
+    size_t out_size = num_bytes * sizeof(uint32_t);
+
+    float h_code2_f32[256];
+    for(int i = 0; i < 256; ++i)
+    {
+        __half h_val = *reinterpret_cast<__half*>(&h_code2[i]);
+        h_code2_f32[i] = (float)h_val;        
+    } 
+    CHECK_CUDA(cudaMemcpyToSymbol(c_code2, h_code2_f32, sizeof(h_code2_f32)));
+
+    uint8_t *d_packed, *d_absmax_q;
+    half *d_absmax2;
+    uint32_t *d_output;
+
+    CHECK_CUDA(cudaMalloc(&d_packed, h_packed_weights.size()));
+    CHECK_CUDA(cudaMalloc(&d_absmax_q, h_absmax_q.size()));
+    CHECK_CUDA(cudaMalloc(&d_absmax2, h_absmax2.size() * 2));
+    CHECK_CUDA(cudaMalloc(&d_output, out_size));
+
+    CHECK_CUDA(cudaMemcpy(d_packed, h_packed_weights.data(), h_packed_weights.size(), cudaMemcpyHostToDevice));
+    CHECK_CUDA(cudaMemcpy(d_absmax_q, h_absmax_q.data(), h_absmax_q.size(), cudaMemcpyHostToDevice));
+    CHECK_CUDA(cudaMemcpy(d_absmax2, h_absmax2.data(), h_absmax2.size() * 2, cudaMemcpyHostToDevice));
+
+    int threadsPerBlock = 256;
+    int num_elements_vec = (num_bytes + 3) / 4;
+    int blocksPerGrid = (num_elements_vec + threadsPerBlock - 1) / threadsPerBlock;
+
+    int block_shift = log2(blocksize);
+    int group_shift = log2(groupsize);
+
+    // warm up
+    for (int i = 0; i < 5; ++i)
+    {
+        dequantize_nf4_kernel_ALU_optim<<<blocksPerGrid, threadsPerBlock>>>
+        (d_packed, d_absmax_q, d_absmax2, d_output, num_bytes, block_shift, group_shift, offset);        
+    }
+
+    cudaEvent_t start, stop;
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop);
+    cudaEventRecord(start); // 开始记录
+    std::cout << "Launch Kernel...\n";
+    for (int i = 0; i < 10; ++i)
+    {
+        dequantize_nf4_kernel_ALU_optim<<<blocksPerGrid, threadsPerBlock>>>
+        (d_packed, d_absmax_q, d_absmax2, d_output, num_bytes, block_shift, group_shift, offset);        
+    }
+    cudaEventRecord(stop);  // 结束记录
+
+    CHECK_CUDA(cudaEventSynchronize(stop)); // 等待 Event 完成
+
+    float milliseconds = 0;
+    cudaEventElapsedTime(&milliseconds, start, stop);
+
+    // 计算带宽
+    // 读取: Packed(1) + Indices(1) + Scales(2) + Code2(忽略不计)
+    size_t total_read = h_packed_weights.size() + h_absmax_q.size() + h_absmax2.size() * 2;
+    // 写入: Output(4) (因为每个packed byte生成一个uint32)
+    size_t total_write = num_bytes * 4;
+
+    double total_bytes = (double)(total_read + total_write);
+    double gb_per_sec = (total_bytes * 10 / 1e9) / (milliseconds / 1000.0);
+
+    std::cout << "Kernel 耗时: " << milliseconds << " ms" << std::endl;
+    std::cout << "有效带宽: " << gb_per_sec << " GB/s" << std::endl;
+
+    // 保存时间与带宽
+    std::ofstream timefile("./data/log/log_cpp.txt");
+    timefile << milliseconds << "," << gb_per_sec;
+    timefile.close();
+
+    std::vector<uint32_t> h_output(num_bytes);
+    CHECK_CUDA(cudaMemcpy(h_output.data(), d_output, out_size, cudaMemcpyDeviceToHost));
+
+    const std::string output_path = "./data/cpp_output.bin";
+    std::ofstream outfile(output_path, std::ios::binary);
+    outfile.write(reinterpret_cast<char*>(h_output.data()), out_size);
+    outfile.close();
+
+    cudaFree(d_packed); cudaFree(d_absmax_q); cudaFree(d_absmax2); cudaFree(d_output);
+    cudaEventDestroy(start); cudaEventDestroy(stop);
+}
+
+int main() 
+{
+    const std::string file_path = "../data/qlora_test.bin";
+    std::vector<uint8_t> packed_weights, absmax_q;
+    std::vector<uint16_t> absmax2, code2;
+    float offset;
+    int64_t rows, cols;
+    int32_t blocksize;
+    int32_t groupsize = 256;
+    read_file(file_path, packed_weights, absmax_q, absmax2, code2, offset, rows, cols, blocksize);
+    nf4_dequantize_cuda(packed_weights, absmax_q, absmax2, code2, rows, cols, blocksize, groupsize, offset);
+
+    return 0;
+}