summary | refs | log | tree | commit | diff
path: root/ggml.c
diff options
context:
space:
mode:
author Kawrakow <48489457+ikawrakow@users.noreply.github.com> 2024-01-14 09:45:56 +0200
committer GitHub <noreply@github.com> 2024-01-14 09:45:56 +0200
commit 147b17ac94a24d524e367cda26a9ff6245689f34 (patch)
tree 6bae34826f82aa28a60ccb26de8eda0464774110 /ggml.c
parent 807179ec583dcb882f97d9704577c06beb2c5ec9 (diff)
2-bit quantizations (#4897)
* imatrix: load
* imatrix: WIP
* imatrix: Add Q2_K quantization
* imatrix: also guard against Q2_K_S quantization without importance matrix
* imatrix: guard even more against low-bit quantization misuse

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Diffstat (limited to 'ggml.c')
-rw-r--r-- ggml.c 36
1 files changed, 25 insertions, 11 deletions
diff --git a/ggml.c b/ggml.c
index bcfb6652..52467475 100644
--- a/ggml.c
+++ b/ggml.c
@@ -585,8 +585,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.type_size = sizeof(block_iq2_xxs),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_iq2_xxs,
- .from_float = quantize_row_iq2_xxs,
- .from_float_reference = (ggml_from_float_t) quantize_row_iq2_xxs_reference,
+ .from_float = NULL,
+ .from_float_reference = NULL,
.vec_dot = ggml_vec_dot_iq2_xxs_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K,
},
@@ -596,8 +596,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.type_size = sizeof(block_iq2_xs),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_iq2_xs,
- .from_float = quantize_row_iq2_xs,
- .from_float_reference = (ggml_from_float_t) quantize_row_iq2_xs_reference,
+ .from_float = NULL,
+ .from_float_reference = NULL,
.vec_dot = ggml_vec_dot_iq2_xs_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K,
},
@@ -18665,8 +18665,11 @@ size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t *
return (n/QK8_0*sizeof(block_q8_0));
}
-size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist) {
+size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start,
+ int nrows, int n_per_row, int64_t * hist, const float * imatrix) {
+ (void)imatrix;
size_t result = 0;
+ int n = nrows * n_per_row;
switch (type) {
case GGML_TYPE_Q4_0:
{
@@ -18701,8 +18704,11 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
case GGML_TYPE_Q2_K:
{
GGML_ASSERT(start % QK_K == 0);
- block_q2_K * block = (block_q2_K*)dst + start / QK_K;
- result = ggml_quantize_q2_K(src + start, block, n, n, hist);
+ GGML_ASSERT(start % n_per_row == 0);
+ size_t start_row = start / n_per_row;
+ size_t row_size = ggml_row_size(type, n_per_row);
+ result = quantize_q2_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+ GGML_ASSERT(result == row_size * nrows);
} break;
case GGML_TYPE_Q3_K:
{
@@ -18731,14 +18737,22 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
case GGML_TYPE_IQ2_XXS:
{
GGML_ASSERT(start % QK_K == 0);
- block_iq2_xxs * block = (block_iq2_xxs*)dst + start / QK_K;
- result = ggml_quantize_iq2_xxs(src + start, block, n, n, hist);
+ GGML_ASSERT(start % n_per_row == 0);
+ GGML_ASSERT(imatrix);
+ size_t start_row = start / n_per_row;
+ size_t row_size = ggml_row_size(type, n_per_row);
+ result = quantize_iq2_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+ GGML_ASSERT(result == row_size * nrows);
} break;
case GGML_TYPE_IQ2_XS:
{
GGML_ASSERT(start % QK_K == 0);
- block_iq2_xs * block = (block_iq2_xs*)dst + start / QK_K;
- result = ggml_quantize_iq2_xs(src + start, block, n, n, hist);
+ GGML_ASSERT(start % n_per_row == 0);
+ GGML_ASSERT(imatrix);
+ size_t start_row = start / n_per_row;
+ size_t row_size = ggml_row_size(type, n_per_row);
+ result = quantize_iq2_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+ GGML_ASSERT(result == row_size * nrows);
} break;
case GGML_TYPE_F16:
{