summaryrefslogtreecommitdiff
path: root/ggml.c
diff options
context:
space:
mode:
Diffstat (limited to 'ggml.c')
-rw-r--r--ggml.c339
1 files changed, 45 insertions, 294 deletions
diff --git a/ggml.c b/ggml.c
index 6eff98ab..80efa6f2 100644
--- a/ggml.c
+++ b/ggml.c
@@ -20159,133 +20159,6 @@ void ggml_quantize_free(void) {
ggml_critical_section_end();
}
-size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist) {
- assert(k % QK4_0 == 0);
- const int nb = k / QK4_0;
-
- for (int b = 0; b < n; b += k) {
- block_q4_0 * restrict y = (block_q4_0 *) dst + b/QK4_0;
-
- quantize_row_q4_0_reference(src + b, y, k);
-
- for (int i = 0; i < nb; i++) {
- for (int j = 0; j < QK4_0; j += 2) {
- const uint8_t vi0 = y[i].qs[j/2] & 0x0F;
- const uint8_t vi1 = y[i].qs[j/2] >> 4;
-
- hist[vi0]++;
- hist[vi1]++;
- }
- }
- }
-
- return (n/QK4_0*sizeof(block_q4_0));
-}
-
-size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist) {
- assert(k % QK4_1 == 0);
- const int nb = k / QK4_1;
-
- for (int b = 0; b < n; b += k) {
- block_q4_1 * restrict y = (block_q4_1 *) dst + b/QK4_1;
-
- quantize_row_q4_1_reference(src + b, y, k);
-
- for (int i = 0; i < nb; i++) {
- for (int j = 0; j < QK4_1; j += 2) {
- const uint8_t vi0 = y[i].qs[j/2] & 0x0F;
- const uint8_t vi1 = y[i].qs[j/2] >> 4;
-
- hist[vi0]++;
- hist[vi1]++;
- }
- }
- }
-
- return (n/QK4_1*sizeof(block_q4_1));
-}
-
-size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist) {
- assert(k % QK5_0 == 0);
- const int nb = k / QK5_0;
-
- for (int b = 0; b < n; b += k) {
- block_q5_0 * restrict y = (block_q5_0 *)dst + b/QK5_0;
-
- quantize_row_q5_0_reference(src + b, y, k);
-
- for (int i = 0; i < nb; i++) {
- uint32_t qh;
- memcpy(&qh, &y[i].qh, sizeof(qh));
-
- for (int j = 0; j < QK5_0; j += 2) {
- const uint8_t vh0 = ((qh & (1u << (j/2 + 0 ))) >> (j/2 + 0 )) << 4;
- const uint8_t vh1 = ((qh & (1u << (j/2 + 16))) >> (j/2 + 12));
-
- // cast to 16 bins
- const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2;
- const uint8_t vi1 = ((y[i].qs[j/2] >> 4) | vh1) / 2;
-
- hist[vi0]++;
- hist[vi1]++;
- }
- }
- }
-
- return (n/QK5_0*sizeof(block_q5_0));
-}
-
-size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist) {
- assert(k % QK5_1 == 0);
- const int nb = k / QK5_1;
-
- for (int b = 0; b < n; b += k) {
- block_q5_1 * restrict y = (block_q5_1 *)dst + b/QK5_1;
-
- quantize_row_q5_1_reference(src + b, y, k);
-
- for (int i = 0; i < nb; i++) {
- uint32_t qh;
- memcpy(&qh, &y[i].qh, sizeof(qh));
-
- for (int j = 0; j < QK5_1; j += 2) {
- const uint8_t vh0 = ((qh & (1u << (j/2 + 0 ))) >> (j/2 + 0 )) << 4;
- const uint8_t vh1 = ((qh & (1u << (j/2 + 16))) >> (j/2 + 12));
-
- // cast to 16 bins
- const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2;
- const uint8_t vi1 = ((y[i].qs[j/2] >> 4) | vh1) / 2;
-
- hist[vi0]++;
- hist[vi1]++;
- }
- }
- }
-
- return (n/QK5_1*sizeof(block_q5_1));
-}
-
-size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist) {
- assert(k % QK8_0 == 0);
- const int nb = k / QK8_0;
-
- for (int b = 0; b < n; b += k) {
- block_q8_0 * restrict y = (block_q8_0 *)dst + b/QK8_0;
-
- quantize_row_q8_0_reference(src + b, y, k);
-
- for (int i = 0; i < nb; i++) {
- for (int j = 0; j < QK8_0; ++j) {
- const int8_t vi = y[i].qs[j];
-
- hist[vi/16 + 8]++;
- }
- }
- }
-
- return (n/QK8_0*sizeof(block_q8_0));
-}
-
bool ggml_quantize_requires_imatrix(enum ggml_type type) {
return
type == GGML_TYPE_IQ2_XXS ||
@@ -20293,177 +20166,52 @@ bool ggml_quantize_requires_imatrix(enum ggml_type type) {
type == GGML_TYPE_IQ1_S;
}
-size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start,
- int nrows, int n_per_row, int64_t * hist, const float * imatrix) {
+size_t ggml_quantize_chunk(
+ enum ggml_type type,
+ const float * src,
+ void * dst,
+ int start,
+ int nrows,
+ int n_per_row,
+ const float * imatrix) {
+ const int n = nrows * n_per_row;
+
+ if (ggml_quantize_requires_imatrix(type)) {
+ GGML_ASSERT(imatrix != NULL);
+ }
+
+ GGML_ASSERT(start % type_traits[type].blck_size == 0);
+ GGML_ASSERT(start % n_per_row == 0);
+
ggml_quantize_init(type); // this is noop if already initialized
+
+ const size_t start_row = start / n_per_row;
+ const size_t row_size = ggml_row_size(type, n_per_row);
+
size_t result = 0;
- int n = nrows * n_per_row;
+
switch (type) {
- case GGML_TYPE_Q4_0:
- {
- GGML_ASSERT(start % QK4_0 == 0);
- GGML_ASSERT(start % n_per_row == 0);
- size_t start_row = start / n_per_row;
- size_t row_size = ggml_row_size(type, n_per_row);
- result = quantize_q4_0(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
- GGML_ASSERT(result == row_size * nrows);
- } break;
- case GGML_TYPE_Q4_1:
- {
- GGML_ASSERT(start % QK4_1 == 0);
- GGML_ASSERT(start % n_per_row == 0);
- size_t start_row = start / n_per_row;
- size_t row_size = ggml_row_size(type, n_per_row);
- result = quantize_q4_1(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
- GGML_ASSERT(result == row_size * nrows);
- } break;
- case GGML_TYPE_Q5_0:
- {
- GGML_ASSERT(start % QK5_0 == 0);
- GGML_ASSERT(start % n_per_row == 0);
- size_t start_row = start / n_per_row;
- size_t row_size = ggml_row_size(type, n_per_row);
- result = quantize_q5_0(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
- GGML_ASSERT(result == row_size * nrows);
- } break;
- case GGML_TYPE_Q5_1:
- {
- GGML_ASSERT(start % QK5_1 == 0);
- GGML_ASSERT(start % n_per_row == 0);
- size_t start_row = start / n_per_row;
- size_t row_size = ggml_row_size(type, n_per_row);
- result = quantize_q5_1(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
- GGML_ASSERT(result == row_size * nrows);
- } break;
- case GGML_TYPE_Q8_0:
- {
- GGML_ASSERT(start % QK8_0 == 0);
- block_q8_0 * block = (block_q8_0*)dst + start / QK8_0;
- result = ggml_quantize_q8_0(src + start, block, n, n, hist);
- } break;
- case GGML_TYPE_Q2_K:
- {
- GGML_ASSERT(start % QK_K == 0);
- GGML_ASSERT(start % n_per_row == 0);
- size_t start_row = start / n_per_row;
- size_t row_size = ggml_row_size(type, n_per_row);
- result = quantize_q2_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
- GGML_ASSERT(result == row_size * nrows);
- } break;
- case GGML_TYPE_Q3_K:
- {
- GGML_ASSERT(start % QK_K == 0);
- GGML_ASSERT(start % n_per_row == 0);
- size_t start_row = start / n_per_row;
- size_t row_size = ggml_row_size(type, n_per_row);
- result = quantize_q3_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
- GGML_ASSERT(result == row_size * nrows);
- } break;
- case GGML_TYPE_Q4_K:
- {
- GGML_ASSERT(start % QK_K == 0);
- GGML_ASSERT(start % n_per_row == 0);
- size_t start_row = start / n_per_row;
- size_t row_size = ggml_row_size(type, n_per_row);
- result = quantize_q4_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
- GGML_ASSERT(result == row_size * nrows);
- } break;
- case GGML_TYPE_Q5_K:
- {
- GGML_ASSERT(start % QK_K == 0);
- GGML_ASSERT(start % n_per_row == 0);
- size_t start_row = start / n_per_row;
- size_t row_size = ggml_row_size(type, n_per_row);
- result = quantize_q5_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
- GGML_ASSERT(result == row_size * nrows);
- } break;
- case GGML_TYPE_Q6_K:
- {
- GGML_ASSERT(start % QK_K == 0);
- GGML_ASSERT(start % n_per_row == 0);
- size_t start_row = start / n_per_row;
- size_t row_size = ggml_row_size(type, n_per_row);
- result = quantize_q6_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
- GGML_ASSERT(result == row_size * nrows);
- } break;
- case GGML_TYPE_IQ2_XXS:
- {
- GGML_ASSERT(start % QK_K == 0);
- GGML_ASSERT(start % n_per_row == 0);
- GGML_ASSERT(imatrix);
- size_t start_row = start / n_per_row;
- size_t row_size = ggml_row_size(type, n_per_row);
- result = quantize_iq2_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
- GGML_ASSERT(result == row_size * nrows);
- } break;
- case GGML_TYPE_IQ2_XS:
- {
- GGML_ASSERT(start % QK_K == 0);
- GGML_ASSERT(start % n_per_row == 0);
- GGML_ASSERT(imatrix);
- size_t start_row = start / n_per_row;
- size_t row_size = ggml_row_size(type, n_per_row);
- result = quantize_iq2_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
- GGML_ASSERT(result == row_size * nrows);
- } break;
- case GGML_TYPE_IQ3_XXS:
- {
- GGML_ASSERT(start % QK_K == 0);
- GGML_ASSERT(start % n_per_row == 0);
- size_t start_row = start / n_per_row;
- size_t row_size = ggml_row_size(type, n_per_row);
- result = quantize_iq3_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
- GGML_ASSERT(result == row_size * nrows);
- } break;
- case GGML_TYPE_IQ3_S:
- {
- GGML_ASSERT(start % QK_K == 0);
- GGML_ASSERT(start % n_per_row == 0);
- size_t start_row = start / n_per_row;
- size_t row_size = ggml_row_size(type, n_per_row);
- result = quantize_iq3_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
- GGML_ASSERT(result == row_size * nrows);
- } break;
- case GGML_TYPE_IQ2_S:
- {
- GGML_ASSERT(start % QK_K == 0);
- GGML_ASSERT(start % n_per_row == 0);
- size_t start_row = start / n_per_row;
- size_t row_size = ggml_row_size(type, n_per_row);
- result = quantize_iq2_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
- GGML_ASSERT(result == row_size * nrows);
- } break;
- case GGML_TYPE_IQ1_S:
- {
- GGML_ASSERT(start % QK_K == 0);
- GGML_ASSERT(start % n_per_row == 0);
- size_t start_row = start / n_per_row;
- size_t row_size = ggml_row_size(type, n_per_row);
- result = quantize_iq1_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
- GGML_ASSERT(result == row_size * nrows);
- } break;
- case GGML_TYPE_IQ4_NL:
+ case GGML_TYPE_Q4_0: result = quantize_q4_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+ case GGML_TYPE_Q4_1: result = quantize_q4_1(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+ case GGML_TYPE_Q5_0: result = quantize_q5_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+ case GGML_TYPE_Q5_1: result = quantize_q5_1(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+ case GGML_TYPE_Q8_0: result = quantize_q8_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+ case GGML_TYPE_Q2_K: result = quantize_q2_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+ case GGML_TYPE_Q3_K: result = quantize_q3_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+ case GGML_TYPE_Q4_K: result = quantize_q4_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+ case GGML_TYPE_Q5_K: result = quantize_q5_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+ case GGML_TYPE_Q6_K: result = quantize_q6_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+ case GGML_TYPE_IQ2_XXS: result = quantize_iq2_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+ case GGML_TYPE_IQ2_XS: result = quantize_iq2_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+ case GGML_TYPE_IQ3_XXS: result = quantize_iq3_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+ case GGML_TYPE_IQ3_S: result = quantize_iq3_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+ case GGML_TYPE_IQ2_S: result = quantize_iq2_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+ case GGML_TYPE_IQ1_S: result = quantize_iq1_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+ case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
#if QK_K == 64
- case GGML_TYPE_IQ4_XS:
-#endif
- {
- GGML_ASSERT(start % QK4_NL == 0);
- GGML_ASSERT(start % n_per_row == 0);
- size_t start_row = start / n_per_row;
- size_t row_size = ggml_row_size(type, n_per_row);
- result = quantize_iq4_nl(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
- GGML_ASSERT(result == row_size * nrows);
- } break;
-#if QK_K != 64
- case GGML_TYPE_IQ4_XS:
- {
- GGML_ASSERT(start % QK_K == 0);
- GGML_ASSERT(start % n_per_row == 0);
- size_t start_row = start / n_per_row;
- size_t row_size = ggml_row_size(type, n_per_row);
- result = quantize_iq4_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
- GGML_ASSERT(result == row_size * nrows);
- } break;
+ case GGML_TYPE_IQ4_XS: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+#else
+ case GGML_TYPE_IQ4_XS: result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
#endif
case GGML_TYPE_F16:
{
@@ -20480,6 +20228,9 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
default:
assert(false);
}
+
+ GGML_ASSERT(result == nrows * row_size);
+
return result;
}