Sync ggml: add Q1_0 1-bit quantization support (CPU) (#21273)

JamePeng · JamePeng · commit 9241b0fafad5 · 2026-04-09T03:09:49.000+08:00
diff --git a/llama_cpp/_ggml.py b/llama_cpp/_ggml.py
@@ -121,7 +121,8 @@ class GGMLStatus(enum.IntEnum):
 #     // GGML_TYPE_IQ4_NL_8_8 = 38,
 #     GGML_TYPE_MXFP4   = 39, // MXFP4 (1 block)
 #     GGML_TYPE_NVFP4   = 40, // NVFP4 (4 blocks, E4M3 scale)
-#     GGML_TYPE_COUNT   = 41,
+#     GGML_TYPE_Q1_0    = 41,
+#     GGML_TYPE_COUNT   = 42,
 # };
 class GGMLType(enum.IntEnum):
     GGML_TYPE_F32  = 0
@@ -157,7 +158,8 @@ class GGMLType(enum.IntEnum):
     GGML_TYPE_TQ2_0 = 35
     GGML_TYPE_MXFP4 = 39
     GGML_TYPE_NVFP4 = 40
-    GGML_TYPE_COUNT = 41
+    GGML_TYPE_Q1_0 = 41
+    GGML_TYPE_COUNT = 42
 
 
 # // precision
@@ -198,6 +200,7 @@ class GGMLPrec(enum.IntEnum):
 #     GGML_FTYPE_MOSTLY_BF16    = 24, // except 1d tensors
 #     GGML_FTYPE_MOSTLY_MXFP4   = 25, // except 1d tensors
 #     GGML_FTYPE_MOSTLY_NVFP4   = 26, // except 1d tensors
+#     GGML_FTYPE_MOSTLY_Q1_0    = 27, // except 1d tensors
 # };
 class GGMLFType(enum.IntEnum):
     GGML_FTYPE_UNKNOWN        = -1
@@ -226,6 +229,7 @@ class GGMLFType(enum.IntEnum):
     GGML_FTYPE_MOSTLY_BF16    = 24
     GGML_FTYPE_MOSTLY_MXFP4   = 25
     GGML_FTYPE_MOSTLY_NVFP4   = 26
+    GGML_FTYPE_MOSTLY_Q1_0    = 27
 
 
 # // available tensor operations:
diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
@@ -351,6 +351,7 @@
 #     LLAMA_FTYPE_MOSTLY_TQ2_0         = 37, // except 1d tensors
 #     LLAMA_FTYPE_MOSTLY_MXFP4_MOE     = 38, // except 1d tensors
 #     LLAMA_FTYPE_MOSTLY_NVFP4         = 39, // except 1d tensors
+#     LLAMA_FTYPE_MOSTLY_Q1_0          = 40, // except 1d tensors
 #
 #     LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
 # };
@@ -391,6 +392,7 @@
 LLAMA_FTYPE_MOSTLY_TQ2_0 = 37
 LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38
 LLAMA_FTYPE_MOSTLY_NVFP4 = 39
+LLAMA_FTYPE_MOSTLY_Q1_0 = 40
 LLAMA_FTYPE_GUESSED = 1024
 
 # enum llama_rope_scaling_type {