diff options
author | Kawrakow <48489457+ikawrakow@users.noreply.github.com> | 2024-02-18 18:16:55 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-02-18 18:16:55 +0200 |
commit | bd2d4e393b2b7d2a1b2e201058e26017c9728ead (patch) | |
tree | 5c51109459cf1a25fc92fdb11d420895e16785ac /ggml-quants.h | |
parent | c8e0d7efeb7634ecc2e9832e879ab9fca4510e71 (diff) |
1.5 bit quantization (#5453)
* iq1_s: WIP basics
* iq1_s: CUDA is working
* iq1_s: scalar CPU dot product
* iq1_s: WIP AVX2 dot product - something is not right
* Fix tests
* Fix shadow warnings
* Fix after merge with latest master
* iq1_s: AVX2 finally works
* iq1_s: ARM_NEON dot product. Works, but not very fast
* iq1_s: better grid
* iq1_s: use IQ2_XXS for attn_output
At a cost of 0.04 extra bpw this gives a big improvement in PPL.
* iq1_s: Metal basics
Dequantize works, but not dot product
* iq1_s: Metal works, but quite slow
As usual, Apple Silicon does not like the code I write.
* iq1_s: Tests
* iq1_s: slightly faster dot product
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Diffstat (limited to 'ggml-quants.h')
-rw-r--r-- | ggml-quants.h | 14 |
1 files changed, 12 insertions, 2 deletions
diff --git a/ggml-quants.h b/ggml-quants.h index 68f09b1e..ad381cfa 100644 --- a/ggml-quants.h +++ b/ggml-quants.h @@ -191,6 +191,13 @@ typedef struct { } block_iq3_xxs; static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding"); +typedef struct { + ggml_fp16_t d; + uint8_t qs[QK_K/8]; + uint8_t scales[QK_K/16]; +} block_iq1_s; +static_assert(sizeof(block_iq1_s) == sizeof(ggml_fp16_t) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding"); + #ifdef __cplusplus extern "C" { #endif @@ -243,6 +250,7 @@ void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRI void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); +void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); // Dot product void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); @@ -259,6 +267,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); // // Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization") @@ -266,6 +275,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const size_t quantize_iq2_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); size_t quantize_iq2_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); size_t quantize_iq3_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); +size_t quantize_iq1_s (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); size_t quantize_q2_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); size_t quantize_q3_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); size_t quantize_q4_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); @@ -276,8 +286,8 @@ size_t quantize_q4_1 (const float * src, void * dst, int nrows, int n_per_row, size_t quantize_q5_0 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); size_t quantize_q5_1 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); -void iq2xs_init_impl(int grid_size); -void iq2xs_free_impl(int grid_size); +void iq2xs_init_impl(enum ggml_type type); +void iq2xs_free_impl(enum ggml_type type); void iq3xs_init_impl(int grid_size); void iq3xs_free_impl(int grid_size); |