summary | refs | log | tree | commit | diff
path: root/ggml/src/ggml-quants.c
diff options
context:
space:
mode:
author: Kawrakow <iwankawrakow@gmail.com> 2024-10-13 13:34:30 +0300
committer: GitHub <noreply@github.com> 2024-10-13 13:34:30 +0300
commit: 910a13409463f7aedb0a92be013a1b9bb50f4859 (patch)
tree: 16e13e1fd3010549877408a0a62706b2bc5d5f0c /ggml/src/ggml-quants.c
parent: c15de3654e0002537c8052fd6d52d879e778e88c (diff)
IQ2_KS: 2.1875 bpw non-linear quantization (#85)
* Experimenting
* iq2k: Try make_qx_quants for the scale
  Slightly better for LLaMA-3.1, Gemma-2, slightly worse for Qwen2.5
* iq2k with make_qx_quants: adjust scale
* iq2ks: basics
* iq2_ks: CUDA works
* iq2_ks: WIP
* iq2_ks: WIP
* iq2_ks: Zen4
* iq2_ks: AVX2
* iq2_ks: scalar dot product
* iq2_ks: ARM_NEON
* iq2_ks: Metal
* iq2_ks: faster Metal
  LLaMA-3.1-8B: PP-512 = 475.22 ± 0.37 t/s
                TG-128 = 45.32 ± 0.03 t/s
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Diffstat (limited to 'ggml/src/ggml-quants.c')
-rw-r--r-- ggml/src/ggml-quants.c 50
1 files changed, 40 insertions, 10 deletions
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 40978ac0..a845eaf5 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -12873,7 +12873,6 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
const int * kmap_q2xs = iq2_data[gindex].map;
const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
- GGML_ASSERT(quant_weights && "missing quantization weights");
GGML_ASSERT(kgrid_q2xs && "forgot to call ggml_quantize_init()?");
GGML_ASSERT(kmap_q2xs && "forgot to call ggml_quantize_init()?");
GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
@@ -12908,8 +12907,12 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
for (int ib = 0; ib < QK_K/32; ++ib) {
const float * xb = xbl + 32*ib;
- const float * qw = quant_weights + QK_K*ibl + 32*ib;
- for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
+ if (quant_weights) {
+ const float * qw = quant_weights + QK_K*ibl + 32*ib;
+ for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
+ } else {
+ for (int i = 0; i < 32; ++i) weight[i] = 0.25f*sigma2 + xb[i]*xb[i];
+ }
for (int i = 0; i < 32; ++i) waux[i] = sqrtf(weight[i]);
for (int k = 0; k < 4; ++k) {
int nflip = 0;
@@ -13046,7 +13049,6 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
const int * kmap_q2xs = iq2_data[gindex].map;
const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
- GGML_ASSERT(quant_weights && "missing quantization weights");
GGML_ASSERT(kmap_q2xs && "forgot to call ggml_quantize_init()?");
GGML_ASSERT(kgrid_q2xs && "forgot to call ggml_quantize_init()?");
GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
@@ -13084,8 +13086,12 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
for (int ib = 0; ib < QK_K/16; ++ib) {
const float * xb = xbl + 16*ib;
- const float * qw = quant_weights + QK_K*ibl + 16*ib;
- for (int i = 0; i < 16; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
+ if (quant_weights) {
+ const float * qw = quant_weights + QK_K*ibl + 16*ib;
+ for (int i = 0; i < 16; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
+ } else {
+ for (int i = 0; i < 16; ++i) weight[i] = 0.25f*sigma2 + xb[i]*xb[i];
+ }
for (int i = 0; i < 16; ++i) waux[i] = sqrtf(weight[i]);
for (int k = 0; k < 2; ++k) {
int nflip = 0;
@@ -13230,6 +13236,17 @@ size_t quantize_iq2_xxs(const float * restrict src, void * restrict dst, int64_t
return nrow * nblock * sizeof(block_iq2_xxs);
}
+void quantize_row_iq2_xxs(const float * restrict x, void * restrict vy, int64_t k) {
+ assert(k % QK_K == 0);
+ block_iq2_xxs * restrict y = vy;
+ quantize_row_iq2_xxs_ref(x, y, k);
+}
+
+void quantize_row_iq2_xxs_ref(const float * restrict x, block_iq2_xxs * restrict y, int64_t k) {
+ assert(k % QK_K == 0);
+ quantize_iq2_xxs(x, y, 1, k, NULL);
+}
+
size_t quantize_iq2_xs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
GGML_ASSERT(n_per_row%QK_K == 0);
int64_t nblock = n_per_row/QK_K;
@@ -13242,6 +13259,17 @@ size_t quantize_iq2_xs(const float * restrict src, void * restrict dst, int64_t
return nrow * nblock * sizeof(block_iq2_xs);
}
+void quantize_row_iq2_xs(const float * restrict x, void * restrict vy, int64_t k) {
+ assert(k % QK_K == 0);
+ block_iq2_xs * restrict y = vy;
+ quantize_row_iq2_xs_ref(x, y, k);
+}
+
+void quantize_row_iq2_xs_ref(const float * restrict x, block_iq2_xs * restrict y, int64_t k) {
+ assert(k % QK_K == 0);
+ quantize_iq2_xs(x, y, 1, k, NULL);
+}
+
//
// ============================================= 3-bit using D4 lattice
//
@@ -14947,10 +14975,11 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
return false;
}
- if (type != GGML_TYPE_IQ2_TN && type != GGML_TYPE_IQ1_TN && type != GGML_TYPE_IQ4_KS && nbytes % ggml_type_size(type) != 0) {
- fprintf(stderr, "%s: invalid size %zu for type %s (type size = %zu)\n", __func__, nbytes, ggml_type_name(type), ggml_type_size(type));
- return false;
- }
+ // Who needs this?
+ //if (type != GGML_TYPE_IQ2_TN && type != GGML_TYPE_IQ1_TN && type != GGML_TYPE_IQ4_KS && nbytes % ggml_type_size(type) != 0) {
+ // fprintf(stderr, "%s: invalid size %zu for type %s (type size = %zu)\n", __func__, nbytes, ggml_type_name(type), ggml_type_size(type));
+ // return false;
+ //}
const size_t nb = nbytes/ggml_type_size(type);
@@ -15160,6 +15189,7 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
} break;
case GGML_TYPE_Q6_0: break;
case GGML_TYPE_IQ2_K: break;
+ case GGML_TYPE_IQ2_KS: break;
case GGML_TYPE_IQ3_K: break;
case GGML_TYPE_IQ4_K: break;
case GGML_TYPE_IQ5_K: break;