ggml : SOTA 2-bit quants (add IQ2_XS) (#4856)

* iq2_xs: basics * iq2_xs: this should have been in the basics * iq2_xs: CUDA and scalar CPU works * iq2_xs: WIP Metal * iq2_xs: Metal now works * iq2_xs: working, but dog slow, ARM_NEON dot product * iq2_xs: better ARM_NEON dot product We are now at 19.5 t/s for TG-128 and 61 t/s for PP-512 when running on the CPU. * iq2_xs: AVX2 dot product - 19.5 t/s * iq2_xs: faster AVX2 dot product 21.4 t/s for TG-128, 59.2 t/s for PP-512. The latter is 2x compared to the previous version. * iq2_xs: had forgotten to delete iq2-data.h * Add llama enum for IQ2_XS --------- Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
author: Kawrakow <48489457+ikawrakow@users.noreply.github.com> 2024-01-11 20:39:39 +0100
committer: GitHub <noreply@github.com> 2024-01-11 21:39:39 +0200
commit: 49662cbed3e95f5976c070b85b9fd53fd577038d (patch)
tree: b70cd0956715bc11696f6e47d26788e24c5112c4 /ggml-quants.h
parent: 3ba5b8ca8e6181a5c712c5b77595a29f1d3e2b97 (diff)
1 files changed, 12 insertions, 0 deletions
diff --git a/ggml-quants.h b/ggml-quants.h
index 8dd911d4..df5e7ae8 100644
--- a/ggml-quants.h
+++ b/ggml-quants.h
@@ -174,6 +174,14 @@ typedef struct {
 } block_iq2_xxs;
 static_assert(sizeof(block_iq2_xxs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t), "wrong iq2_xxs block size/padding");
 
+// 2.3125 bpw (bits per weight) quants; the figure matches the asserted size if a block covers QK_K == 256 weights — TODO confirm
+typedef struct {
+    ggml_fp16_t d;             // single fp16 value per block; presumably the block-wide scale — confirm in the quantize code
+    uint16_t qs[QK_K/8];       // QK_K/8 16-bit entries; NOTE(review): bit encoding is not visible in this header
+    uint8_t  scales[QK_K/32];  // QK_K/32 bytes; presumably sub-block scales — TODO confirm against the implementation
+} block_iq2_xs;
+static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding"); // size must equal the sum of the fields, i.e. no padding
+
 // Quantization
 void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k);
 void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k);
@@ -189,6 +197,7 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict
 void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);
 void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);
 void quantize_row_iq2_xxs_reference(const float * restrict x, block_iq2_xxs * restrict y, int k);
+void quantize_row_iq2_xs_reference (const float * restrict x, block_iq2_xs  * restrict y, int k);
 
 void quantize_row_q4_0(const float * restrict x, void * restrict y, int k);
 void quantize_row_q4_1(const float * restrict x, void * restrict y, int k);
@@ -204,6 +213,7 @@ void quantize_row_q5_K(const float * restrict x, void * restrict y, int k);
 void quantize_row_q6_K(const float * restrict x, void * restrict y, int k);
 void quantize_row_q8_K(const float * restrict x, void * restrict y, int k);
 void quantize_row_iq2_xxs(const float * restrict x, void * restrict y, int k);
+void quantize_row_iq2_xs (const float * restrict x, void * restrict y, int k);
 
 // Dequantization
 void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k);
@@ -220,6 +230,7 @@ void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int
 void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k);
 void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k);
 void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k);
+void dequantize_row_iq2_xs (const block_iq2_xs  * restrict x, float * restrict y, int k);
 
 // Dot product
 void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
@@ -234,3 +245,4 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx,
 void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+void ggml_vec_dot_iq2_xs_q8_K (int n, float * restrict s, const void * restrict vx, const void * restrict vy);
author	Kawrakow <48489457+ikawrakow@users.noreply.github.com>	2024-01-11 20:39:39 +0100
committer	GitHub <noreply@github.com>	2024-01-11 21:39:39 +0200
commit	49662cbed3e95f5976c070b85b9fd53fd577038d (patch)
tree	b70cd0956715bc11696f6e47d26788e24c5112c4 /ggml-quants.h
parent	3ba5b8ca8e6181a5c712c5b77595a29f1d3e2b97 (diff)