author:     Kawrakow <iwankawrakow@gmail.com>   2024-10-02 15:22:13 +0300
committer:  GitHub <noreply@github.com>         2024-10-02 15:22:13 +0300
commit:     cce49832c1b81b4e535e78ff308417ef3a386b18 (patch)
tree:       33b10f9344f4656d58cd3ea068233ba75888498d /ggml/src/ggml-quants.c
parent:     d6909ed6f00f91f20c9ef628085a1a1a6a55c453 (diff)
Adding Q6_0 (#77)
* Adding q6_0 - basics + AVX2/Zen4 working
* Adding q6_0: CUDA dequantize works, but not mmvq
* Adding q6_0: CUDA mmvq works
* Adding q6_0: CUDA cpy, so Q6_0 can be used for KV-cache
* Add q6_0 to CPU flash attention
  Disappointing result: for Llama-3.2-1B, a q6_0 K- and V-cache gives about
  the same PPL as a q8_0 K-cache combined with a q4_0 V-cache, while needing
  exactly the same RAM. I.e., what was the point?
* q6_0: slightly better kv-cache result
  Better than q8_0 + q4_0, but not as good as q8_0 + iq4_nl.
* q6_0: works on ARM_NEON
* q6_0: dequantize works on Metal, but not vector dot product
* q6_0: it now works on Metal
  Outperforms q5_0 by a significant margin, e.g.:
| model | size | params | backend | ngl | threads | test | t/s |
| ------------------------------ | ---------: | ---------: | ---------- | --: | ------: | ------------: | ---------------: |
| llama 8B Q6_0 | 6.08 GiB | 8.03 B | Metal | 100 | 4 | tg128 | 44.02 ± 0.08 |
| llama 8B Q5_0 | 5.21 GiB | 8.03 B | Metal | 100 | 4 | tg128 | 40.13 ± 0.12 |
| llama 8B Q6_0 | 6.08 GiB | 8.03 B | Metal | 100 | 4 | pp512 | 500.55 ± 0.32 |
| llama 8B Q5_0 | 5.21 GiB | 8.03 B | Metal | 100 | 4 | pp512 | 448.02 ± 0.27 |
* q6_0: can now be used for kv-cache on Metal
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Diffstat (limited to 'ggml/src/ggml-quants.c')
-rw-r--r--  ggml/src/ggml-quants.c | 139
1 file changed, 139 insertions(+), 0 deletions(-)
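For reading the diff below: the new functions address a `block_q6_0` type whose definition lives in the headers, not in this file. Judging from the `memset(y[i].qh, 0, qk/4)` calls and the `qs`/`qh` indexing, the layout presumably follows the `q5_0` pattern, roughly as sketched here (an assumption inferred from usage, not the committed header):

```c
#define QK6_0 32
typedef struct {
    ggml_half d;           // fp16 scale (delta)
    uint8_t   qh[QK6_0/4]; // top 2 bits of each quant, 4 quants per byte
    uint8_t   qs[QK6_0/2]; // low 4 bits of each quant, 2 quants per byte
} block_q6_0;
```

That works out to 2 + 8 + 16 = 26 bytes per 32 weights, i.e. 6.5 bits per weight, consistent with the 6.08 GiB reported above for the 8.03B-parameter model.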
```diff
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index bef2f73e..f5fff22e 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -848,6 +848,59 @@ void quantize_row_q5_1(const float * restrict x, void * restrict y, int64_t k) {
     quantize_row_q5_1_ref(x, y, k);
 }
 
+void quantize_row_q6_0_ref(const float * restrict x, block_q6_0 * restrict y, int64_t k) {
+    static const int qk = QK6_0;
+
+    assert(k % qk == 0);
+
+    const int nb = k / qk;
+
+    for (int i = 0; i < nb; i++) {
+        float amax = 0.0f; // absolute max
+        float max  = 0.0f;
+
+        for (int j = 0; j < qk; j++) {
+            const float v = x[i*qk + j];
+            if (amax < fabsf(v)) {
+                amax = fabsf(v);
+                max  = v;
+            }
+        }
+
+        const float d  = max / -32;
+        const float id = d ? 1.0f/d : 0.0f;
+
+        //y[i].d = GGML_FP32_TO_FP16(d);
+        memset(y[i].qh, 0, qk/4);
+
+        float sumqx = 0, sumq2 = 0;
+        for (int j = 0; j < qk/2; ++j) {
+            const float x0 = x[i*qk + 0    + j]*id;
+            const float x1 = x[i*qk + qk/2 + j]*id;
+            const float w0 = x0*x0;
+            const float w1 = x1*x1;
+
+            const uint8_t xi0 = MIN(63, (int8_t)(x0 + 32.5f));
+            const uint8_t xi1 = MIN(63, (int8_t)(x1 + 32.5f));
+
+            y[i].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4);
+
+            const uint8_t h = (xi0 >> 4) | ((xi1 >> 4) << 2);
+            y[i].qh[j%(qk/4)] |= (h << 4*(j/(qk/4)));
+
+            const float q0 = (float)xi0 - 32.f;
+            const float q1 = (float)xi1 - 32.f;
+            sumqx += w0*x[i*qk + j]*q0 + w1*x[i*qk + qk/2 + j]*q1;
+            sumq2 += w0*q0*q0 + w1*q1*q1;
+        }
+        y[i].d = sumq2 > 0 ? GGML_FP32_TO_FP16(sumqx/sumq2) : GGML_FP32_TO_FP16(d);
+    }
+}
+
+void quantize_row_q6_0(const float * restrict x, void * restrict y, int64_t k) {
+    quantize_row_q6_0_ref(x, y, k);
+}
+
 // reference implementation for deterministic creation of model files
 void quantize_row_q8_0_ref(const float * restrict x, block_q8_0 * restrict y, int64_t k) {
     assert(k % QK8_0 == 0);
@@ -1691,6 +1744,28 @@ void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int6
     }
 }
 
+void dequantize_row_q6_0(const block_q6_0 * restrict x, float * restrict y, int64_t k) {
+    static const int qk = QK6_0;
+
+    assert(k % qk == 0);
+
+    const int nb = k / qk;
+
+    for (int i = 0; i < nb; i++) {
+        const float d = GGML_FP16_TO_FP32(x[i].d);
+
+        for (int j = 0; j < qk/2; ++j) {
+            const uint8_t h = x[i].qh[j%(qk/4)] >> 4*(j/(qk/4));
+
+            const int32_t x0 = ((x[i].qs[j] & 0x0F) | ((h << 4) & 0x30)) - 32;
+            const int32_t x1 = ((x[i].qs[j] >>   4) | ((h << 2) & 0x30)) - 32;
+
+            y[i*qk + j + 0   ] = x0*d;
+            y[i*qk + j + qk/2] = x1*d;
+        }
+    }
+}
+
 void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int64_t k) {
     static const int qk = QK8_0;
 
@@ -3429,6 +3504,54 @@ size_t quantize_q5_1(const float * restrict src, void * restrict dst, int64_t nr
     return nrow * row_size;
 }
 
+static void quantize_row_q6_0_impl(const float * restrict x, block_q6_0 * restrict y, int64_t n_per_row, const float * quant_weights) {
+    static_assert(QK6_0 == 32, "QK6_0 must be 32");
+
+    float weight[QK6_0];
+    int8_t L[QK6_0];
+
+    float sigma2 = 0;
+    if (quant_weights) {
+        float sum_x2 = 0;
+        for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
+        sigma2 = sum_x2/n_per_row;
+    }
+
+    const int64_t nb = n_per_row/QK6_0;
+    for (int ib = 0; ib < nb; ++ib) {
+        const float * xb = x + QK6_0 * ib;
+        if (quant_weights) {
+            const float * qw = quant_weights + QK6_0 * ib;
+            for (int j = 0; j < QK6_0; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
+        } else {
+            for (int j = 0; j < QK6_0; ++j) weight[j] = xb[j]*xb[j];
+        }
+        float d = make_qx_quants(QK6_0, 32, xb, L, 1, weight);
+        y[ib].d = GGML_FP32_TO_FP16(d);
+
+        memset(y[ib].qh, 0, QK6_0/4);
+
+        for (int j = 0; j < 16; ++j) {
+            const uint8_t xi0 = L[j];
+            const uint8_t xi1 = L[j+16];
+            y[ib].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4);
+            const uint8_t h = (xi0 >> 4) | ((xi1 >> 4) << 2);
+            y[ib].qh[j%8] |= (h << 4*(j/8));
+        }
+    }
+}
+
+size_t quantize_q6_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+    size_t row_size = ggml_row_size(GGML_TYPE_Q6_0, n_per_row);
+    char * qrow = (char *)dst;
+    for (int64_t row = 0; row < nrow; ++row) {
+        quantize_row_q6_0_impl(src, (block_q6_0*)qrow, n_per_row, quant_weights);
+        src += n_per_row;
+        qrow += row_size;
+    }
+    return nrow * row_size;
+}
+
 size_t quantize_q8_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
     (void)quant_weights; // not used
     const size_t row_size = ggml_row_size(GGML_TYPE_Q8_0, n_per_row);
@@ -5383,6 +5506,21 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
     *s = sumf;
 }
 
+void ggml_vec_dot_q6_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+#if GGML_USE_IQK_MULMAT
+#ifdef __AVX2__
+    const enum ggml_type vec_dot_type = GGML_TYPE_Q8_1;
+#else
+    const enum ggml_type vec_dot_type = GGML_TYPE_Q8_0;
+#endif
+    if (iqk_mul_mat(nrc, nrc, n, GGML_TYPE_Q6_0, vx, bx, vec_dot_type, vy, by, s, bs, 0, 1)) {
+        return;
+    }
+#endif
+    // TODO
+    *s = 0;
+}
+
 void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
 #if GGML_USE_IQK_MULMAT
     if (iqk_mul_mat(nrc, nrc, n, GGML_TYPE_Q8_0, vx, bx, GGML_TYPE_Q8_0, vy, by, s, bs, 0, 1)) {
@@ -15020,6 +15158,7 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
         {
             VALIDATE_ROW_DATA_D_F16_IMPL(block_iq4_nl, data, nb);
         } break;
+        case GGML_TYPE_Q6_0: break;
         case GGML_TYPE_IQ2_K: break;
         case GGML_TYPE_IQ3_K: break;
         case GGML_TYPE_IQ4_K: break;
```
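The least obvious part of the new code is the bit layout: each 6-bit quant is split into a low nibble stored in `qs` and a top-two-bit field stored in `qh`; elements `j` and `j+16` of a block share `qs[j]`, and the high bits of four quants share each `qh` byte. Below is a minimal, self-contained sketch of just this packing and its inverse, mirroring the index math of `quantize_row_q6_0_ref` and `dequantize_row_q6_0` (the `pack_q6_0`/`unpack_q6_0` names are illustrative, not from the commit, and the fp16 scale handling is omitted):

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define QK6_0 32

// Pack 32 unsigned 6-bit values (0..63) as the diff does: low nibbles of
// elements j and j+16 share qs[j]; their top two bits go into qh, where
// each qh byte collects the high bits of four quants.
static void pack_q6_0(const uint8_t v[QK6_0], uint8_t qs[QK6_0/2], uint8_t qh[QK6_0/4]) {
    memset(qh, 0, QK6_0/4);
    for (int j = 0; j < QK6_0/2; ++j) {
        const uint8_t xi0 = v[j];            // first half of the block
        const uint8_t xi1 = v[j + QK6_0/2];  // second half of the block
        qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4);
        const uint8_t h = (xi0 >> 4) | ((xi1 >> 4) << 2); // 2+2 high bits
        qh[j % (QK6_0/4)] |= (h << 4*(j/(QK6_0/4)));      // low or high nibble of qh[j%8]
    }
}

// Inverse, mirroring dequantize_row_q6_0.
static void unpack_q6_0(const uint8_t qs[QK6_0/2], const uint8_t qh[QK6_0/4], uint8_t v[QK6_0]) {
    for (int j = 0; j < QK6_0/2; ++j) {
        const uint8_t h = qh[j % (QK6_0/4)] >> 4*(j/(QK6_0/4));
        v[j]           = (qs[j] & 0x0F) | ((h << 4) & 0x30);
        v[j + QK6_0/2] = (qs[j] >>   4) | ((h << 2) & 0x30);
    }
}

int main(void) {
    uint8_t v[QK6_0], qs[QK6_0/2], qh[QK6_0/4], out[QK6_0];
    for (int j = 0; j < QK6_0; ++j) v[j] = (uint8_t)((j*7 + 3) & 63); // arbitrary 6-bit data
    pack_q6_0(v, qs, qh);
    unpack_q6_0(qs, qh, out);
    for (int j = 0; j < QK6_0; ++j) assert(out[j] == v[j]);
    printf("round-trip OK: 32 six-bit values in %zu packed bytes\n", sizeof qs + sizeof qh);
    return 0;
}
```

Two other details of the reference quantizer are worth noting: the initial scale is `d = max / -32`, so the signed range -32..31 maps onto 0..63 after the +32 offset; and `d` is then refined to the weighted least-squares optimum d = sum(w_i * x_i * q_i) / sum(w_i * q_i^2) over the block, with w_i = (x_i/d)^2, before being stored as fp16.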