diff options
author | Kawrakow <48489457+ikawrakow@users.noreply.github.com> | 2024-01-14 09:45:56 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-01-14 09:45:56 +0200 |
commit | 147b17ac94a24d524e367cda26a9ff6245689f34 (patch) | |
tree | 6bae34826f82aa28a60ccb26de8eda0464774110 /ggml.c | |
parent | 807179ec583dcb882f97d9704577c06beb2c5ec9 (diff) |
2-bit quantizations (#4897)
* imatrix: load
* imatrix: WIP
* imatrix: Add Q2_K quantization
* imatrix: also guard against Q2_K_S quantization without importance matrix
* imatrix: guard even more against low-bit quantization misuse
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Diffstat (limited to 'ggml.c')
-rw-r--r-- | ggml.c | 36 |
1 files changed, 25 insertions, 11 deletions
@@ -585,8 +585,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .type_size = sizeof(block_iq2_xxs), .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_iq2_xxs, - .from_float = quantize_row_iq2_xxs, - .from_float_reference = (ggml_from_float_t) quantize_row_iq2_xxs_reference, + .from_float = NULL, + .from_float_reference = NULL, .vec_dot = ggml_vec_dot_iq2_xxs_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, }, @@ -596,8 +596,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .type_size = sizeof(block_iq2_xs), .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_iq2_xs, - .from_float = quantize_row_iq2_xs, - .from_float_reference = (ggml_from_float_t) quantize_row_iq2_xs_reference, + .from_float = NULL, + .from_float_reference = NULL, .vec_dot = ggml_vec_dot_iq2_xs_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, }, @@ -18665,8 +18665,11 @@ size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * return (n/QK8_0*sizeof(block_q8_0)); } -size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist) { +size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, + int nrows, int n_per_row, int64_t * hist, const float * imatrix) { + (void)imatrix; size_t result = 0; + int n = nrows * n_per_row; switch (type) { case GGML_TYPE_Q4_0: { @@ -18701,8 +18704,11 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i case GGML_TYPE_Q2_K: { GGML_ASSERT(start % QK_K == 0); - block_q2_K * block = (block_q2_K*)dst + start / QK_K; - result = ggml_quantize_q2_K(src + start, block, n, n, hist); + GGML_ASSERT(start % n_per_row == 0); + size_t start_row = start / n_per_row; + size_t row_size = ggml_row_size(type, n_per_row); + result = quantize_q2_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); + GGML_ASSERT(result == row_size * nrows); } break; case GGML_TYPE_Q3_K: { @@ -18731,14 +18737,22 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i case GGML_TYPE_IQ2_XXS: { GGML_ASSERT(start % QK_K == 0); - block_iq2_xxs * block = (block_iq2_xxs*)dst + start / QK_K; - result = ggml_quantize_iq2_xxs(src + start, block, n, n, hist); + GGML_ASSERT(start % n_per_row == 0); + GGML_ASSERT(imatrix); + size_t start_row = start / n_per_row; + size_t row_size = ggml_row_size(type, n_per_row); + result = quantize_iq2_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); + GGML_ASSERT(result == row_size * nrows); } break; case GGML_TYPE_IQ2_XS: { GGML_ASSERT(start % QK_K == 0); - block_iq2_xs * block = (block_iq2_xs*)dst + start / QK_K; - result = ggml_quantize_iq2_xs(src + start, block, n, n, hist); + GGML_ASSERT(start % n_per_row == 0); + GGML_ASSERT(imatrix); + size_t start_row = start / n_per_row; + size_t row_size = ggml_row_size(type, n_per_row); + result = quantize_iq2_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); + GGML_ASSERT(result == row_size * nrows); } break; case GGML_TYPE_F16: { |