diff options
author | Kawrakow <48489457+ikawrakow@users.noreply.github.com> | 2024-02-18 18:16:55 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-02-18 18:16:55 +0200 |
commit | bd2d4e393b2b7d2a1b2e201058e26017c9728ead (patch) | |
tree | 5c51109459cf1a25fc92fdb11d420895e16785ac /ggml.c | |
parent | c8e0d7efeb7634ecc2e9832e879ab9fca4510e71 (diff) |
1.5 bit quantization (#5453)
* iq1_s: WIP basics
* iq1_s: CUDA is working
* iq1_s: scalar CPU dot product
* iq1_s: WIP AVX2 dot product - something is not right
* Fix tests
* Fix shadow warnings
* Fix after merge with latest master
* iq1_s: AVX2 finally works
* iq1_s: ARM_NEON dot product. Works, but not very fast
* iq1_s: better grid
* iq1_s: use IQ2_XXS for attn_output
At a cost of 0.04 extra bpw this gives a big improvement in PPL.
* iq1_s: Metal basics
Dequantize works, but not dot product
* iq1_s: Metal works, but quite slow
As usual, Apple Silicon does not like the code I write.
* iq1_s: Tests
* iq1_s: slightly faster dot product
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Diffstat (limited to 'ggml.c')
-rw-r--r-- | ggml.c | 44 |
1 files changed, 39 insertions, 5 deletions
@@ -673,6 +673,18 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_K, .nrows = 1, }, + [GGML_TYPE_IQ1_S] = { + .type_name = "iq1_s", + .blck_size = QK_K, + .type_size = sizeof(block_iq1_s), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_iq1_s, + .from_float = NULL, + .from_float_reference = NULL, + .vec_dot = ggml_vec_dot_iq1_s_q8_K, + .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, + }, [GGML_TYPE_Q8_K] = { .type_name = "q8_K", .blck_size = QK_K, @@ -2267,6 +2279,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) { case GGML_FTYPE_MOSTLY_IQ2_XXS: wtype = GGML_TYPE_IQ2_XXS; break; case GGML_FTYPE_MOSTLY_IQ2_XS: wtype = GGML_TYPE_IQ2_XS; break; case GGML_FTYPE_MOSTLY_IQ3_XXS: wtype = GGML_TYPE_IQ3_XXS; break; + case GGML_FTYPE_MOSTLY_IQ1_S: wtype = GGML_TYPE_IQ1_S; break; case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break; case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break; } @@ -7677,6 +7690,7 @@ static void ggml_compute_forward_add( case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: + case GGML_TYPE_IQ1_S: { ggml_compute_forward_add_q_f32(params, src0, src1, dst); } break; @@ -7944,6 +7958,7 @@ static void ggml_compute_forward_add1( case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: + case GGML_TYPE_IQ1_S: { ggml_compute_forward_add1_q_f32(params, src0, src1, dst); } break; @@ -8064,6 +8079,7 @@ static void ggml_compute_forward_acc( case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: + case GGML_TYPE_IQ1_S: default: { GGML_ASSERT(false); @@ -10830,6 +10846,7 @@ static void ggml_compute_forward_out_prod( case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: + case GGML_TYPE_IQ1_S: { ggml_compute_forward_out_prod_q_f32(params, src0, src1, dst); } break; @@ -11010,6 +11027,7 @@ static void ggml_compute_forward_set( case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: + case GGML_TYPE_IQ1_S: default: { GGML_ASSERT(false); @@ -11207,6 +11225,7 @@ static void ggml_compute_forward_get_rows( case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: + case GGML_TYPE_IQ1_S: { ggml_compute_forward_get_rows_q(params, src0, src1, dst); } break; @@ -11880,6 +11899,7 @@ static void ggml_compute_forward_alibi( case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: + case GGML_TYPE_IQ1_S: case GGML_TYPE_Q8_K: case GGML_TYPE_I8: case GGML_TYPE_I16: @@ -11957,6 +11977,7 @@ static void ggml_compute_forward_clamp( case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: + case GGML_TYPE_IQ1_S: case GGML_TYPE_Q8_K: case GGML_TYPE_I8: case GGML_TYPE_I16: @@ -19136,8 +19157,9 @@ void ggml_quantize_init(enum ggml_type type) { ggml_critical_section_start(); switch (type) { - case GGML_TYPE_IQ2_XXS: iq2xs_init_impl(256); break; - case GGML_TYPE_IQ2_XS: iq2xs_init_impl(512); break; + case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: + case GGML_TYPE_IQ1_S: iq2xs_init_impl(type); break; case GGML_TYPE_IQ3_XXS: iq3xs_init_impl(256); break; default: // nothing break; @@ -19149,8 +19171,10 @@ void ggml_quantize_init(enum ggml_type type) { void ggml_quantize_free(void) { ggml_critical_section_start(); - iq2xs_free_impl(256); - iq2xs_free_impl(512); + iq2xs_free_impl(GGML_TYPE_IQ2_XXS); + iq2xs_free_impl(GGML_TYPE_IQ2_XS); + iq2xs_free_impl(GGML_TYPE_IQ1_S); + iq3xs_free_impl(256); ggml_critical_section_end(); } @@ -19285,7 +19309,8 @@ size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * bool ggml_quantize_requires_imatrix(enum ggml_type type) { return type == GGML_TYPE_IQ2_XXS || - type == GGML_TYPE_IQ2_XS; + type == GGML_TYPE_IQ2_XS || + type == GGML_TYPE_IQ1_S; } size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, @@ -19410,6 +19435,15 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i result = quantize_iq3_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); GGML_ASSERT(result == row_size * nrows); } break; + case GGML_TYPE_IQ1_S: + { + GGML_ASSERT(start % QK_K == 0); + GGML_ASSERT(start % n_per_row == 0); + size_t start_row = start / n_per_row; + size_t row_size = ggml_row_size(type, n_per_row); + result = quantize_iq1_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); + GGML_ASSERT(result == row_size * nrows); + } break; case GGML_TYPE_F16: { size_t elemsize = sizeof(ggml_fp16_t); |