Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions 03_nf4_dequant/ayepei/bnb_benchmark_results.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
Shape,Blocksize,BnB Time (ms),Total Elements,Weight File,BnB Output File
256x256,64,0.1144,65536,weight_data/weight_256x256_bs64.bin,bnb_results/bnb_256x256_bs64.fp16
256x256,128,0.1064,65536,weight_data/weight_256x256_bs128.bin,bnb_results/bnb_256x256_bs128.fp16
512x512,64,0.1010,262144,weight_data/weight_512x512_bs64.bin,bnb_results/bnb_512x512_bs64.fp16
512x512,128,0.0977,262144,weight_data/weight_512x512_bs128.bin,bnb_results/bnb_512x512_bs128.fp16
1024x1024,64,0.0960,1048576,weight_data/weight_1024x1024_bs64.bin,bnb_results/bnb_1024x1024_bs64.fp16
1024x1024,128,0.0912,1048576,weight_data/weight_1024x1024_bs128.bin,bnb_results/bnb_1024x1024_bs128.fp16
2048x2048,64,0.0910,4194304,weight_data/weight_2048x2048_bs64.bin,bnb_results/bnb_2048x2048_bs64.fp16
2048x2048,128,0.0925,4194304,weight_data/weight_2048x2048_bs128.bin,bnb_results/bnb_2048x2048_bs128.fp16
4096x4096,64,0.0891,16777216,weight_data/weight_4096x4096_bs64.bin,bnb_results/bnb_4096x4096_bs64.fp16
4096x4096,128,0.0905,16777216,weight_data/weight_4096x4096_bs128.bin,bnb_results/bnb_4096x4096_bs128.fp16
8192x8192,64,0.0918,67108864,weight_data/weight_8192x8192_bs64.bin,bnb_results/bnb_8192x8192_bs64.fp16
8192x8192,128,0.0912,67108864,weight_data/weight_8192x8192_bs128.bin,bnb_results/bnb_8192x8192_bs128.fp16
16384x16384,64,0.3074,268435456,weight_data/weight_16384x16384_bs64.bin,bnb_results/bnb_16384x16384_bs64.fp16
16384x16384,128,0.2973,268435456,weight_data/weight_16384x16384_bs128.bin,bnb_results/bnb_16384x16384_bs128.fp16
3421x3146,64,0.0925,10762466,weight_data/weight_3421x3146_bs64.bin,bnb_results/bnb_3421x3146_bs64.fp16
3421x3146,128,0.0904,10762466,weight_data/weight_3421x3146_bs128.bin,bnb_results/bnb_3421x3146_bs128.fp16
6578x1236,64,0.0895,8130408,weight_data/weight_6578x1236_bs64.bin,bnb_results/bnb_6578x1236_bs64.fp16
6578x1236,128,0.0876,8130408,weight_data/weight_6578x1236_bs128.bin,bnb_results/bnb_6578x1236_bs128.fp16
7000x7000,64,0.0909,49000000,weight_data/weight_7000x7000_bs64.bin,bnb_results/bnb_7000x7000_bs64.fp16
7000x7000,128,0.0939,49000000,weight_data/weight_7000x7000_bs128.bin,bnb_results/bnb_7000x7000_bs128.fp16
8 changes: 8 additions & 0 deletions 03_nf4_dequant/ayepei/cuda_results/perf_1024x1024_bs128.log
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
input_file=../weight_data/weight_1024x1024_bs128.bin
rows=1024
cols=1024
blocksize=128
total_elements=1048576
kernel_time_ms=0.0030
bandwidth_gbps=816.30
output_file=cuda_results/dequant_1024x1024_bs128.fp16
8 changes: 8 additions & 0 deletions 03_nf4_dequant/ayepei/cuda_results/perf_1024x1024_bs64.log
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
input_file=../weight_data/weight_1024x1024_bs64.bin
rows=1024
cols=1024
blocksize=64
total_elements=1048576
kernel_time_ms=0.0029
bandwidth_gbps=845.08
output_file=cuda_results/dequant_1024x1024_bs64.fp16
8 changes: 8 additions & 0 deletions 03_nf4_dequant/ayepei/cuda_results/perf_16384x16384_bs128.log
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
input_file=../weight_data/weight_16384x16384_bs128.bin
rows=16384
cols=16384
blocksize=128
total_elements=268435456
kernel_time_ms=0.2102
bandwidth_gbps=2991.95
output_file=cuda_results/dequant_16384x16384_bs128.fp16
8 changes: 8 additions & 0 deletions 03_nf4_dequant/ayepei/cuda_results/perf_16384x16384_bs64.log
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
input_file=../weight_data/weight_16384x16384_bs64.bin
rows=16384
cols=16384
blocksize=64
total_elements=268435456
kernel_time_ms=0.2105
bandwidth_gbps=2987.26
output_file=cuda_results/dequant_16384x16384_bs64.fp16
8 changes: 8 additions & 0 deletions 03_nf4_dequant/ayepei/cuda_results/perf_2048x2048_bs128.log
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
input_file=../weight_data/weight_2048x2048_bs128.bin
rows=2048
cols=2048
blocksize=128
total_elements=4194304
kernel_time_ms=0.0049
bandwidth_gbps=2017.74
output_file=cuda_results/dequant_2048x2048_bs128.fp16
8 changes: 8 additions & 0 deletions 03_nf4_dequant/ayepei/cuda_results/perf_2048x2048_bs64.log
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
input_file=../weight_data/weight_2048x2048_bs64.bin
rows=2048
cols=2048
blocksize=64
total_elements=4194304
kernel_time_ms=0.0047
bandwidth_gbps=2082.46
output_file=cuda_results/dequant_2048x2048_bs64.fp16
8 changes: 8 additions & 0 deletions 03_nf4_dequant/ayepei/cuda_results/perf_256x256_bs128.log
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
input_file=../weight_data/weight_256x256_bs128.bin
rows=256
cols=256
blocksize=128
total_elements=65536
kernel_time_ms=0.0026
bandwidth_gbps=58.80
output_file=cuda_results/dequant_256x256_bs128.fp16
8 changes: 8 additions & 0 deletions 03_nf4_dequant/ayepei/cuda_results/perf_256x256_bs64.log
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
input_file=../weight_data/weight_256x256_bs64.bin
rows=256
cols=256
blocksize=64
total_elements=65536
kernel_time_ms=0.0026
bandwidth_gbps=58.80
output_file=cuda_results/dequant_256x256_bs64.fp16
8 changes: 8 additions & 0 deletions 03_nf4_dequant/ayepei/cuda_results/perf_3421x3146_bs128.log
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
input_file=../weight_data/weight_3421x3146_bs128.bin
rows=3421
cols=3146
blocksize=128
total_elements=10762466
kernel_time_ms=0.0086
bandwidth_gbps=2939.31
output_file=cuda_results/dequant_3421x3146_bs128.fp16
8 changes: 8 additions & 0 deletions 03_nf4_dequant/ayepei/cuda_results/perf_3421x3146_bs64.log
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
input_file=../weight_data/weight_3421x3146_bs64.bin
rows=3421
cols=3146
blocksize=64
total_elements=10762466
kernel_time_ms=0.0086
bandwidth_gbps=2935.13
output_file=cuda_results/dequant_3421x3146_bs64.fp16
8 changes: 8 additions & 0 deletions 03_nf4_dequant/ayepei/cuda_results/perf_4096x4096_bs128.log
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
input_file=../weight_data/weight_4096x4096_bs128.bin
rows=4096
cols=4096
blocksize=128
total_elements=16777216
kernel_time_ms=0.0139
bandwidth_gbps=2832.19
output_file=cuda_results/dequant_4096x4096_bs128.fp16
8 changes: 8 additions & 0 deletions 03_nf4_dequant/ayepei/cuda_results/perf_4096x4096_bs64.log
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
input_file=../weight_data/weight_4096x4096_bs64.bin
rows=4096
cols=4096
blocksize=64
total_elements=16777216
kernel_time_ms=0.0138
bandwidth_gbps=2840.68
output_file=cuda_results/dequant_4096x4096_bs64.fp16
8 changes: 8 additions & 0 deletions 03_nf4_dequant/ayepei/cuda_results/perf_512x512_bs128.log
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
input_file=../weight_data/weight_512x512_bs128.bin
rows=512
cols=512
blocksize=128
total_elements=262144
kernel_time_ms=0.0028
bandwidth_gbps=221.24
output_file=cuda_results/dequant_512x512_bs128.fp16
8 changes: 8 additions & 0 deletions 03_nf4_dequant/ayepei/cuda_results/perf_512x512_bs64.log
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
input_file=../weight_data/weight_512x512_bs64.bin
rows=512
cols=512
blocksize=64
total_elements=262144
kernel_time_ms=0.0028
bandwidth_gbps=221.82
output_file=cuda_results/dequant_512x512_bs64.fp16
8 changes: 8 additions & 0 deletions 03_nf4_dequant/ayepei/cuda_results/perf_6578x1236_bs128.log
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
input_file=../weight_data/weight_6578x1236_bs128.bin
rows=6578
cols=1236
blocksize=128
total_elements=8130408
kernel_time_ms=0.0070
bandwidth_gbps=2717.80
output_file=cuda_results/dequant_6578x1236_bs128.fp16
8 changes: 8 additions & 0 deletions 03_nf4_dequant/ayepei/cuda_results/perf_6578x1236_bs64.log
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
input_file=../weight_data/weight_6578x1236_bs64.bin
rows=6578
cols=1236
blocksize=64
total_elements=8130408
kernel_time_ms=0.0072
bandwidth_gbps=2638.75
output_file=cuda_results/dequant_6578x1236_bs64.fp16
8 changes: 8 additions & 0 deletions 03_nf4_dequant/ayepei/cuda_results/perf_7000x7000_bs128.log
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
input_file=../weight_data/weight_7000x7000_bs128.bin
rows=7000
cols=7000
blocksize=128
total_elements=49000000
kernel_time_ms=0.0412
bandwidth_gbps=2787.30
output_file=cuda_results/dequant_7000x7000_bs128.fp16
8 changes: 8 additions & 0 deletions 03_nf4_dequant/ayepei/cuda_results/perf_7000x7000_bs64.log
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
input_file=../weight_data/weight_7000x7000_bs64.bin
rows=7000
cols=7000
blocksize=64
total_elements=49000000
kernel_time_ms=0.0414
bandwidth_gbps=2771.04
output_file=cuda_results/dequant_7000x7000_bs64.fp16
8 changes: 8 additions & 0 deletions 03_nf4_dequant/ayepei/cuda_results/perf_8192x8192_bs128.log
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
input_file=../weight_data/weight_8192x8192_bs128.bin
rows=8192
cols=8192
blocksize=128
total_elements=67108864
kernel_time_ms=0.0552
bandwidth_gbps=2850.08
output_file=cuda_results/dequant_8192x8192_bs128.fp16
8 changes: 8 additions & 0 deletions 03_nf4_dequant/ayepei/cuda_results/perf_8192x8192_bs64.log
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
input_file=../weight_data/weight_8192x8192_bs64.bin
rows=8192
cols=8192
blocksize=64
total_elements=67108864
kernel_time_ms=42.5944
bandwidth_gbps=3.69
output_file=cuda_results/dequant_8192x8192_bs64.fp16
Loading