Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions ggml/src/ggml-vulkan/ggml-vulkan.cpp

Large diffs are not rendered by default.

26 changes: 26 additions & 0 deletions ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,32 @@ void quantize(uint dst_idx, uint src_idx)
}
#endif

#if defined(DATA_A_Q2_0)
// Quantizes one Q2_0 block: reads QUANT_K_Q2_0 floats starting at
// data_s[src_idx] and writes the block scale plus packed 2-bit codes to
// data_q[dst_idx].
//
// Encoding: code = clamp(round(x / amax) + 1, 0, 3); four codes are packed per
// byte, lowest bits first. The matching decode is (code - 1) * d.
//
// NOTE(review): because |x / amax| <= 1, only codes 0..2 are ever produced
// here; the upper clamp bound (code 3) is never reached. Confirm this matches
// the reference quantizer for this type.
void quantize(uint dst_idx, uint src_idx)
{
    // Block scale is the largest magnitude in the block.
    float amax = 0.0;
    [[unroll]] for (int j = 0; j < QUANT_K_Q2_0; ++j) {
        amax = max(amax, abs(data_s[src_idx + j]));
    }

    const float d  = amax;
    // An all-zero block keeps id == 0 so every value maps to code 1 (i.e. 0.0).
    const float id = (d > 0.0) ? 1.0/d : 0.0;

    data_q[dst_idx].d = float16_t(d);

    // Each output byte is assembled in a local and stored exactly once, so the
    // destination buffer is never read back and needs no prior zero-fill.
    [[unroll]] for (int jb = 0; jb < QUANT_K_Q2_0 / 4; ++jb) {
        uint packed = 0u;
        [[unroll]] for (int k = 0; k < 4; ++k) {
            const int q = clamp(int(round(data_s[src_idx + 4 * jb + k] * id)) + 1, 0, 3);
            packed |= uint(q) << (k * 2);
        }
        data_q[dst_idx].qs[jb] = uint8_t(packed);
    }
}
#endif

#if defined(DATA_A_IQ4_NL)
uint best_index(float x) {
if (x <= kvalues_iq4nl[0]) return 0;
Expand Down
25 changes: 25 additions & 0 deletions ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,25 @@ vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
}
#endif

#if defined(DATA_A_Q2_0)
// Returns the two consecutive unscaled Q2_0 values at positions iqs and
// iqs + 1 of block (a_offset + ib). Each value is (2-bit code - 1); the block
// scale is not applied here.
// NOTE(review): both values are taken from the same packed byte, so this
// assumes iqs is even -- confirm against callers.
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
    // Four 2-bit codes per byte, lowest bits first.
    const uint packed   = uint(data_a[a_offset + ib].qs[iqs >> 2u]);
    const uint lo_shift = (iqs & 3u) << 1u;

    const int first  = int((packed >> lo_shift) & 3u) - 1;
    const int second = int((packed >> (lo_shift + 2u)) & 3u) - 1;

    return vec2(float(first), float(second));
}
// Returns four consecutive unscaled Q2_0 values starting at position iqs of
// block (a_offset + ib). Each value is (2-bit code - 1); the block scale is
// not applied here.
// NOTE(review): all four values are taken from one packed byte, so this
// assumes iqs is a multiple of 4 -- confirm against callers.
vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
    const uint packed = uint(data_a[a_offset + ib].qs[iqs >> 2u]);
    const uint base   = (iqs & 3u) << 1u;

    const int c0 = int((packed >>  base       ) & 3u) - 1;
    const int c1 = int((packed >> (base + 2u)) & 3u) - 1;
    const int c2 = int((packed >> (base + 4u)) & 3u) - 1;
    const int c3 = int((packed >> (base + 6u)) & 3u) - 1;

    return vec4(float(c0), float(c1), float(c2), float(c3));
}
#endif

#if defined(DATA_A_IQ1_S)
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
const uint ib32 = iqs / 32;
Expand Down Expand Up @@ -497,6 +516,12 @@ vec2 get_dm(uint ib, uint a_offset) {
}
#endif

#if defined(DATA_A_Q2_0)
// Returns (scale, 0) for Q2_0 block (a_offset + ib): x holds the block's
// fp16 scale widened to float, y is always zero.
vec2 get_dm(uint ib, uint a_offset) {
    const float scale = float(data_a[a_offset + ib].d);
    return vec2(scale, 0);
}
#endif

#if defined(DATA_A_MXFP4)
vec2 get_dm(uint ib, uint a_offset) {
return vec2(e8m0_to_fp32(data_a[a_offset + ib].e), 0);
Expand Down
15 changes: 15 additions & 0 deletions ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,19 @@ float16_t dequantFuncQ1_0(const in decodeBufQ1_0 bl, const in uint blockCoords[2
return bit != 0u ? d : -d;
}

// Buffer-reference view of a single block_q2_0; this is the `bl` argument
// type of dequantFuncQ2_0.
// NOTE(review): the 2-byte reference alignment presumably follows from the
// block starting with a float16_t scale -- confirm against the block layout.
layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ2_0 {
block_q2_0 block;
};

// Decodes one element of a Q2_0 block: picks the 2-bit code at index
// coordInBlock[1], maps it to (code - 1) and multiplies by the block scale.
// blockCoords is not used.
float16_t dequantFuncQ2_0(const in decodeBufQ2_0 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
{
    const uint elem   = coordInBlock[1];
    // Four codes per byte, lowest bits first.
    const uint packed = uint(bl.block.qs[elem / 4u]);
    const uint code   = (packed >> ((elem % 4u) * 2u)) & 3u;

    return float16_t(int(code) - 1) * bl.block.d;
}

// Buffer-reference view of a single block_q4_0_packed16.
// NOTE(review): presumably the argument type of the Q4_0 decode function
// (dequantFuncQ4_0), which is not visible in this hunk -- confirm.
layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ4_0 {
block_q4_0_packed16 block;
};
Expand Down Expand Up @@ -717,6 +730,8 @@ float16_t dequantFuncNVFP4(const in decodeBufNVFP4 bl, const in uint blockCoords

#if defined(DATA_A_Q1_0)
#define dequantFuncA dequantFuncQ1_0
#elif defined(DATA_A_Q2_0)
#define dequantFuncA dequantFuncQ2_0
#elif defined(DATA_A_Q4_0)
#define dequantFuncA dequantFuncQ4_0
#elif defined(DATA_A_Q4_1)
Expand Down
30 changes: 30 additions & 0 deletions ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_0.comp
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#version 450

#include "dequant_head.glsl"

layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;

layout (binding = 0) readonly buffer A {block_q2_0 data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};

// Dequantizes Q2_0 blocks from data_a into data_b.
//
// Work split: the 256 invocations of a workgroup form 4 groups of 64. Each
// group covers 4 consecutive blocks; within a group, 16 invocations share a
// block and each one expands 2 packed bytes (8 values).
void main() {
    const uint local_id = gl_LocalInvocationID.x;
    const uint group    = gl_WorkGroupID.x * 4 + (local_id >> 6);
    const uint lane     = local_id & 63u;

    const uint byte_pair = lane >> 2;   // 0..15: which pair of packed bytes
    const uint sub_block = lane & 3u;   // 0..3: which block within the group
    const uint block     = 4 * group + sub_block;

    // Threads past the last block (128 values per block) have nothing to do.
    if (block >= p.nel / 128) {
        return;
    }

    const uint out_base = 512 * group + 128 * sub_block + 8 * byte_pair;

    const float scale = float(data_a[block].d);
    const uint lo = uint(data_a[block].qs[2 * byte_pair    ]);
    const uint hi = uint(data_a[block].qs[2 * byte_pair + 1]);

    // Each byte holds four 2-bit codes, lowest bits first; value = (code - 1) * scale.
    [[unroll]] for (uint k = 0; k < 4; ++k) {
        const uint sh = k * 2u;
        const int code_lo = int((lo >> sh) & 3u) - 1;
        const int code_hi = int((hi >> sh) & 3u) - 1;
        data_b[out_base + k    ] = D_TYPE(float(code_lo) * scale);
        data_b[out_base + k + 4] = D_TYPE(float(code_hi) * scale);
    }
}
16 changes: 16 additions & 0 deletions ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,22 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
buf_a[buf_idx + 1] = FLOAT_TYPEV2((bits & 0x04u) != 0u ? d : -d, (bits & 0x08u) != 0u ? d : -d);
buf_a[buf_idx + 2] = FLOAT_TYPEV2((bits & 0x10u) != 0u ? d : -d, (bits & 0x20u) != 0u ? d : -d);
buf_a[buf_idx + 3] = FLOAT_TYPEV2((bits & 0x40u) != 0u ? d : -d, (bits & 0x80u) != 0u ? d : -d);
#elif defined(DATA_A_Q2_0)
// Each load handles one packed byte (four 2-bit codes), written to shared
// memory as two 2-component vectors. A block holds 32 such bytes.
// NOTE(review): this relies on LOAD_VEC_A being 4 for this type -- confirm
// against the shader generator.
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;

const uint ib = idx >> 5;
const uint packed = uint(data_a[ib].qs[idx % 32u]);
const float scale = float(data_a[ib].d);

// value = (code - 1) * scale, codes taken lowest bits first
const float w0 = float(int( packed        & 3u) - 1) * scale;
const float w1 = float(int((packed >> 2u) & 3u) - 1) * scale;
const float w2 = float(int((packed >> 4u) & 3u) - 1) * scale;
const float w3 = float(int((packed >> 6u) & 3u) - 1) * scale;

buf_a[buf_idx    ] = FLOAT_TYPEV2(w0, w1);
buf_a[buf_idx + 1] = FLOAT_TYPEV2(w2, w3);
#elif defined(DATA_A_Q2_K)
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;
Expand Down
17 changes: 17 additions & 0 deletions ggml/src/ggml-vulkan/vulkan-shaders/types.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,23 @@ struct block_q1_0
#define A_TYPE block_q1_0
#endif

// Q2_0: 128 values per block, stored as one fp16 scale plus 2-bit codes.
#define QUANT_K_Q2_0 128
#define QUANT_R_Q2_0 1

// One Q2_0 block. `d` is the block scale; `qs` packs four 2-bit codes per
// byte, giving QUANT_K_Q2_0 / 4 = 32 bytes.
struct block_q2_0
{
float16_t d;
uint8_t qs[QUANT_K_Q2_0 / 4];
};

// When a shader is compiled with DATA_A_Q2_0, select Q2_0 as the generic
// "A" operand type and expose its block constants under the generic names.
#if defined(DATA_A_Q2_0)
#define QUANT_K QUANT_K_Q2_0
#define QUANT_R QUANT_R_Q2_0
#define QUANT_AUXF 1
#define A_TYPE block_q2_0
#endif


#define QUANT_K_Q8_1 32
#define QUANT_R_Q8_1 1

Expand Down
7 changes: 4 additions & 3 deletions ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ const std::vector<std::string> type_names = {
"f32",
"f16",
"q1_0",
"q2_0",
"q4_0",
"q4_1",
"q5_0",
Expand Down Expand Up @@ -557,7 +558,7 @@ void matmul_shaders(bool fp16, MatMulIdType matmul_id_type, bool coopmat, bool c
std::string load_vec_quant = "2";
if ((tname == "q1_0") || (tname == "q4_0") || (tname == "q4_1") || (tname == "q5_1") || (tname == "iq1_s") || (tname == "iq1_m") || (tname == "iq2_xxs") || (tname == "iq2_xs") || (tname == "iq2_s"))
load_vec_quant = "8";
else if ((tname == "q5_0") || (tname == "q8_0") || (tname == "q2_k") || (tname == "q4_k") || (tname == "q5_k") || (tname == "iq3_xxs") || (tname == "iq3_s") || (tname == "iq4_xs") || (tname == "iq4_nl") || (tname == "mxfp4") || (tname == "nvfp4"))
else if ((tname == "q2_0") || (tname == "q5_0") || (tname == "q8_0") || (tname == "q2_k") || (tname == "q4_k") || (tname == "q5_k") || (tname == "iq3_xxs") || (tname == "iq3_s") || (tname == "iq4_xs") || (tname == "iq4_nl") || (tname == "mxfp4") || (tname == "nvfp4"))
load_vec_quant = "4";

if (tname == "bf16") {
Expand Down Expand Up @@ -767,12 +768,12 @@ void process_shaders() {
string_to_spv("cpy_transpose_16", "copy_transpose.comp", {{"A_TYPE", "uint16_t"}, {"D_TYPE", "uint16_t"}});
string_to_spv("cpy_transpose_32", "copy_transpose.comp", {{"A_TYPE", "uint"}, {"D_TYPE", "uint"}});

for (std::string t : {"q1_0", "q4_0", "q4_1", "q5_0", "q5_1", "q8_0", "iq4_nl"}) {
for (std::string t : {"q1_0", "q2_0", "q4_0", "q4_1", "q5_0", "q5_1", "q8_0", "iq4_nl"}) {
string_to_spv("cpy_f32_" + t, "copy_to_quant.comp", {{"DATA_A_" + to_uppercase(t), "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
string_to_spv("cpy_" + t + "_f32", "copy_from_quant.comp", {{"DATA_A_" + to_uppercase(t), "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
}

for (std::string t : {"f32", "f16", "bf16", "q1_0", "q4_0", "q4_1", "q5_0", "q5_1", "q8_0", "iq4_nl"}) {
for (std::string t : {"f32", "f16", "bf16", "q1_0", "q2_0", "q4_0", "q4_1", "q5_0", "q5_1", "q8_0", "iq4_nl"}) {
string_to_spv("set_rows_" + t + "_i32", "copy_to_quant.comp", {{"SET_ROWS", "1"}, {"DATA_A_" + to_uppercase(t), "1"}, {"B_TYPE", "uint"}, {"B_SIZE", "32"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
string_to_spv("set_rows_" + t + "_i64", "copy_to_quant.comp", {{"SET_ROWS", "1"}, {"DATA_A_" + to_uppercase(t), "1"}, {"B_TYPE", "uvec2"}, {"B_SIZE", "64"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
}
Expand Down
1 change: 1 addition & 0 deletions tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,7 @@ llama_build_and_test(
peg-parser/tests.h
)
llama_build_and_test(test-regex-partial.cpp)
llama_build_and_test(test-vulkan-q2_0-shader-sim.cpp)

if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x")
set(MODEL_NAME "tinyllamas/stories15M-q4_0.gguf")
Expand Down
3 changes: 3 additions & 0 deletions tests/test-backend-ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7252,6 +7252,7 @@ static const ggml_type all_types[] = {
GGML_TYPE_Q5_0, GGML_TYPE_Q5_1,
GGML_TYPE_Q8_0,
GGML_TYPE_Q1_0,
GGML_TYPE_Q2_0,
GGML_TYPE_MXFP4, GGML_TYPE_NVFP4,
GGML_TYPE_Q2_K, GGML_TYPE_Q3_K,
GGML_TYPE_Q4_K, GGML_TYPE_Q5_K,
Expand All @@ -7266,6 +7267,7 @@ static const ggml_type base_types[] = {
GGML_TYPE_F32, GGML_TYPE_F16,
GGML_TYPE_Q8_0, // for I8MM tests
GGML_TYPE_Q1_0,
GGML_TYPE_Q2_0,
GGML_TYPE_Q4_0,
GGML_TYPE_Q4_1, // for I8MM tests
GGML_TYPE_Q4_K,
Expand All @@ -7278,6 +7280,7 @@ static const ggml_type other_types[] = {
GGML_TYPE_Q5_0, GGML_TYPE_Q5_1,
GGML_TYPE_Q8_0,
GGML_TYPE_Q1_0,
GGML_TYPE_Q2_0,
GGML_TYPE_Q2_K, GGML_TYPE_Q3_K,
GGML_TYPE_Q5_K,
GGML_TYPE_Q6_K,
Expand Down
Loading