diff options
author | Kawrakow <iwankawrakow@gmail.com> | 2024-09-27 08:16:06 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-09-27 08:16:06 +0300 |
commit | 6dec4af4b6e65eb72e646a6f8b10d77c9d306281 (patch) | |
tree | b69a6dfdd024ccf6a4d7490666664cbac4bc65ce /ggml/src/ggml.c | |
parent | 546f3ef349a7082fbc349897c3c7246baed2a6c6 (diff) |
Adding ability to have meta data per tensor row (#61)
* POC: per row scale
This is a POC showing how to work around opinionated ggml to
have scales per row rather than per block.
Only implemented for Zen4 and only for iq2_tn.
* POC per row scale: iq2_tn on NEON
* POC per row scale: iq2_tn on Metal
* Per row scale Metal templates
* iq1_tn: shrink to 1.625 bpw (NEON and Metal)
* POC per row scale: CUDA
* POC per row scale: add CUDA TODOs
There are two places in ggml-cuda.cu left where it is assumed
that type_size * n_per_row / block_size is the way to compute
and handle row sizes. This does not affect simple usage,
but will lead to issues when tensors are split between GPUs.
* Per row scales - CUDA
The only place left where there are unnecessary assumptions being made
is in the Flash Attention code. As we are not using any quants that
use per row scales for quantized KV cache, it should be OK for now.
* Update IQ1_TN and IQ2_TN bpw shown to user
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Diffstat (limited to 'ggml/src/ggml.c')
-rw-r--r-- | ggml/src/ggml.c | 84 |
1 file changed, 64 insertions, 20 deletions
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 08b292b7..2804accd 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -651,24 +651,28 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .blck_size = 1, .type_size = sizeof(int8_t), .is_quantized = false, + .row_meta_size = 0, }, [GGML_TYPE_I16] = { .type_name = "i16", .blck_size = 1, .type_size = sizeof(int16_t), .is_quantized = false, + .row_meta_size = 0, }, [GGML_TYPE_I32] = { .type_name = "i32", .blck_size = 1, .type_size = sizeof(int32_t), .is_quantized = false, + .row_meta_size = 0, }, [GGML_TYPE_I64] = { .type_name = "i64", .blck_size = 1, .type_size = sizeof(int64_t), .is_quantized = false, + .row_meta_size = 0, }, [GGML_TYPE_F64] = { .type_name = "f64", @@ -676,6 +680,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .type_size = sizeof(double), .is_quantized = false, .nrows = 1, + .row_meta_size = 0, }, [GGML_TYPE_F32] = { .type_name = "f32", @@ -685,6 +690,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32, .vec_dot_type = GGML_TYPE_F32, .nrows = 1, + .row_meta_size = 0, }, [GGML_TYPE_F16] = { .type_name = "f16", @@ -697,6 +703,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16, .vec_dot_type = GGML_TYPE_F16, .nrows = 1, + .row_meta_size = 0, }, [GGML_TYPE_Q4_0] = { .type_name = "q4_0", @@ -717,6 +724,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { #else .nrows = 1, #endif + .row_meta_size = 0, }, [GGML_TYPE_Q4_1] = { .type_name = "q4_1", @@ -733,6 +741,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { #else .nrows = 1, #endif + .row_meta_size = 0, }, [4] = { // GGML_TYPE_Q4_2 .type_name = "DEPRECATED", @@ -745,6 +754,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot = NULL, .vec_dot_type = GGML_TYPE_COUNT, .nrows = 1, + .row_meta_size = 0, }, [5] = { // 
GGML_TYPE_Q4_3 .type_name = "DEPRECATED", @@ -757,6 +767,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot = NULL, .vec_dot_type = GGML_TYPE_COUNT, .nrows = 1, + .row_meta_size = 0, }, [GGML_TYPE_Q5_0] = { .type_name = "q5_0", @@ -773,6 +784,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_0, #endif .nrows = 1, + .row_meta_size = 0, }, [GGML_TYPE_Q5_1] = { .type_name = "q5_1", @@ -785,6 +797,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot = ggml_vec_dot_q5_1_q8_1, .vec_dot_type = GGML_TYPE_Q8_1, .nrows = 1, + .row_meta_size = 0, }, [GGML_TYPE_Q8_0] = { .type_name = "q8_0", @@ -806,6 +819,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { #else .nrows = 1, #endif + .row_meta_size = 0, }, [GGML_TYPE_Q8_1] = { .type_name = "q8_1", @@ -816,6 +830,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_ref = (ggml_from_float_t) quantize_row_q8_1_ref, .vec_dot_type = GGML_TYPE_Q8_1, .nrows = 1, + .row_meta_size = 0, }, [GGML_TYPE_Q2_K] = { .type_name = "q2_K", @@ -828,6 +843,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot = ggml_vec_dot_q2_K_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, .nrows = 1, + .row_meta_size = 0, }, [GGML_TYPE_Q3_K] = { .type_name = "q3_K", @@ -840,6 +856,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot = ggml_vec_dot_q3_K_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, .nrows = 1, + .row_meta_size = 0, }, [GGML_TYPE_Q4_K] = { .type_name = "q4_K", @@ -852,6 +869,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot = ggml_vec_dot_q4_K_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, .nrows = 1, + .row_meta_size = 0, }, [GGML_TYPE_Q5_K] = { .type_name = "q5_K", @@ -864,6 +882,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot = ggml_vec_dot_q5_K_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, .nrows = 1, + .row_meta_size = 
0, }, [GGML_TYPE_Q6_K] = { .type_name = "q6_K", @@ -876,6 +895,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot = ggml_vec_dot_q6_K_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, .nrows = 1, + .row_meta_size = 0, }, [GGML_TYPE_IQ2_XXS] = { .type_name = "iq2_xxs", @@ -888,6 +908,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot = ggml_vec_dot_iq2_xxs_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, .nrows = 1, + .row_meta_size = 0, }, [GGML_TYPE_IQ2_XS] = { .type_name = "iq2_xs", @@ -900,6 +921,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot = ggml_vec_dot_iq2_xs_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, .nrows = 1, + .row_meta_size = 0, }, [GGML_TYPE_IQ3_XXS] = { .type_name = "iq3_xxs", @@ -912,6 +934,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot = ggml_vec_dot_iq3_xxs_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, .nrows = 1, + .row_meta_size = 0, }, [GGML_TYPE_IQ3_S] = { .type_name = "iq3_s", @@ -924,6 +947,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot = ggml_vec_dot_iq3_s_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, .nrows = 1, + .row_meta_size = 0, }, [GGML_TYPE_IQ2_S] = { .type_name = "iq2_s", @@ -936,6 +960,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot = ggml_vec_dot_iq2_s_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, .nrows = 1, + .row_meta_size = 0, }, [GGML_TYPE_IQ1_S] = { .type_name = "iq1_s", @@ -948,6 +973,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot = ggml_vec_dot_iq1_s_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, .nrows = 1, + .row_meta_size = 0, }, [GGML_TYPE_IQ1_M] = { .type_name = "iq1_m", @@ -960,6 +986,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot = ggml_vec_dot_iq1_m_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, .nrows = 1, + .row_meta_size = 0, }, [GGML_TYPE_IQ1_BN] = { .type_name = "iq1_bn", @@ -972,6 +999,7 @@ static const ggml_type_traits_t 
type_traits[GGML_TYPE_COUNT] = { .vec_dot = ggml_vec_dot_iq1_bn_q8_K64, .vec_dot_type = GGML_TYPE_Q8_K64, .nrows = 1, + .row_meta_size = 0, }, [GGML_TYPE_IQ2_BN] = { .type_name = "iq2_bn", @@ -984,6 +1012,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot = ggml_vec_dot_iq2_bn_q8_K64, .vec_dot_type = GGML_TYPE_Q8_K64, .nrows = 1, + .row_meta_size = 0, }, [GGML_TYPE_IQ2_TN] = { .type_name = "iq2_tn", @@ -996,6 +1025,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot = vec_dot_iq2_tn_q8_k, .vec_dot_type = GGML_TYPE_Q8_K, .nrows = 1, + .row_meta_size = 4, }, [GGML_TYPE_IQ1_TN] = { .type_name = "iq1_tn", @@ -1008,6 +1038,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot = vec_dot_iq1_tn_q8_k, .vec_dot_type = GGML_TYPE_Q8_K64, .nrows = 1, + .row_meta_size = 2, }, [GGML_TYPE_IQ4_NL] = { .type_name = "iq4_nl", @@ -1020,6 +1051,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot = ggml_vec_dot_iq4_nl_q8_0, .vec_dot_type = GGML_TYPE_Q8_0, .nrows = 1, + .row_meta_size = 0, }, [GGML_TYPE_IQ4_XS] = { .type_name = "iq4_xs", @@ -1032,6 +1064,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot = ggml_vec_dot_iq4_xs_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, .nrows = 1, + .row_meta_size = 0, }, [GGML_TYPE_Q8_K] = { .type_name = "q8_K", @@ -1039,6 +1072,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .type_size = sizeof(block_q8_K), .is_quantized = true, .from_float = quantize_row_q8_K, + .row_meta_size = 0, }, [GGML_TYPE_Q8_K64] = { .type_name = "q8_K64", @@ -1046,6 +1080,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .type_size = sizeof(block_q8_K64), .is_quantized = true, .from_float = quantize_row_q8_K64, + .row_meta_size = 0, }, [GGML_TYPE_BF16] = { .type_name = "bf16", @@ -1058,6 +1093,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_bf16, 
.vec_dot_type = GGML_TYPE_BF16, .nrows = 1, + .row_meta_size = 0, }, [GGML_TYPE_Q4_0_4_4] = { .type_name = "q4_0_4x4", @@ -1074,6 +1110,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .ncols = 4, .gemv = ggml_gemv_q4_0_4x4_q8_0, .gemm = ggml_gemm_q4_0_4x4_q8_0, + .row_meta_size = 0, }, [GGML_TYPE_Q4_0_4_8] = { .type_name = "q4_0_4x8", @@ -1090,6 +1127,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .ncols = 4, .gemv = ggml_gemv_q4_0_4x8_q8_0, .gemm = ggml_gemm_q4_0_4x8_q8_0, + .row_meta_size = 0, }, [GGML_TYPE_Q4_0_8_8] = { .type_name = "q4_0_8x8", @@ -1106,6 +1144,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .ncols = 8, .gemv = ggml_gemv_q4_0_8x8_q8_0, .gemm = ggml_gemm_q4_0_8x8_q8_0, + .row_meta_size = 0, }, [GGML_TYPE_IQ2_K] = { .type_name = "iq2_k", @@ -1118,6 +1157,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot = vec_dot_iq2_k_q8_k, .vec_dot_type = GGML_TYPE_Q8_K, .nrows = 1, + .row_meta_size = 0, }, [GGML_TYPE_IQ3_K] = { .type_name = "iq3_k", @@ -1130,6 +1170,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot = vec_dot_iq3_k_q8_k, .vec_dot_type = GGML_TYPE_Q8_K, .nrows = 1, + .row_meta_size = 0, }, [GGML_TYPE_IQ4_K] = { .type_name = "iq4_k", @@ -1142,6 +1183,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot = vec_dot_iq4_k_q8_k, .vec_dot_type = GGML_TYPE_Q8_K, .nrows = 1, + .row_meta_size = 0, }, [GGML_TYPE_IQ5_K] = { .type_name = "iq5_k", @@ -1154,6 +1196,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot = vec_dot_iq5_k_q8_k, .vec_dot_type = GGML_TYPE_Q8_K, .nrows = 1, + .row_meta_size = 0, }, [GGML_TYPE_IQ6_K] = { .type_name = "iq6_k", @@ -1166,6 +1209,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot = vec_dot_iq6_k_q8_k, .vec_dot_type = GGML_TYPE_Q8_K, .nrows = 1, + .row_meta_size = 0, }, }; @@ -3585,6 +3629,10 @@ GGML_CALL int64_t 
ggml_nrows(const struct ggml_tensor * tensor) { return tensor->ne[1]*tensor->ne[2]*tensor->ne[3]; } +GGML_CALL int64_t ggml_blck_size(enum ggml_type type) { + return type_traits[type].blck_size; +} + GGML_CALL size_t ggml_nbytes(const struct ggml_tensor * tensor) { size_t nbytes; size_t blck_size = ggml_blck_size(tensor->type); @@ -3595,7 +3643,7 @@ GGML_CALL size_t ggml_nbytes(const struct ggml_tensor * tensor) { } } else { - nbytes = tensor->ne[0]*tensor->nb[0]/blck_size; + nbytes = tensor->nb[1]; //tensor->ne[0]*tensor->nb[0]/blck_size; for (int i = 1; i < GGML_MAX_DIMS; ++i) { nbytes += (tensor->ne[i] - 1)*tensor->nb[i]; } @@ -3608,17 +3656,13 @@ size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) { return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN); } -GGML_CALL int64_t ggml_blck_size(enum ggml_type type) { - return type_traits[type].blck_size; -} - GGML_CALL size_t ggml_type_size(enum ggml_type type) { return type_traits[type].type_size; } GGML_CALL size_t ggml_row_size(enum ggml_type type, int64_t ne) { assert(ne % ggml_blck_size(type) == 0); - return ggml_type_size(type)*ne/ggml_blck_size(type); + return type_traits[type].row_meta_size + ggml_type_size(type)*ne/ggml_blck_size(type); } double ggml_type_sizef(enum ggml_type type) { @@ -3764,7 +3808,7 @@ static bool ggml_is_contiguous_n(const struct ggml_tensor * tensor, int n) { if (tensor->ne[0] != ggml_blck_size(tensor->type) && tensor->nb[0] != next_nb) { return false; } - next_nb *= tensor->ne[0]/ggml_blck_size(tensor->type); + next_nb = ggml_row_size(tensor->type, tensor->ne[0]); //next_nb*tensor->ne[0]/ggml_blck_size(tensor->type) + type_traits[tensor->type].row_meta_size; for (int i = 1; i < GGML_MAX_DIMS; i++) { if (tensor->ne[i] != 1) { if (i > n) { @@ -4227,7 +4271,7 @@ static struct ggml_tensor * ggml_new_tensor_impl( } result->nb[0] = ggml_type_size(type); - result->nb[1] = result->nb[0]*(result->ne[0]/ggml_blck_size(type)); + result->nb[1] = ggml_row_size(type, ne[0]); for (int i = 2; i < 
GGML_MAX_DIMS; i++) { result->nb[i] = result->nb[i - 1]*result->ne[i - 1]; } @@ -13023,8 +13067,8 @@ static void ggml_compute_forward_mul_mat( for (int64_t i12 = 0; i12 < ne12; i12++) { if (counter++ % nth == ith) { if (!iqk_mul_mat(ne01, ne11, ne00, - src0->type, (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03, nb01/ggml_type_size(src0->type), - src1->type, (const char *)src1->data + i12*nb12 + i13*nb13, nb11/ggml_type_size(src1->type), + src0->type, (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03, nb01, ///ggml_type_size(src0->type), + src1->type, (const char *)src1->data + i12*nb12 + i13*nb13, nb11, ///ggml_type_size(src1->type), (float *)((char *)dst->data + i12*nb2 + i13*nb3), nb1/ggml_type_size(dst->type), 0, 1)) goto IQK_MulMat_Not_Available1; } @@ -13036,8 +13080,8 @@ static void ggml_compute_forward_mul_mat( for (int64_t i13 = 0; i13 < ne13; i13++) for (int64_t i12 = 0; i12 < ne12; i12++) if (!iqk_mul_mat(ne01, ne11, ne00, - src0->type, (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03, nb01/ggml_type_size(src0->type), - src1->type, (const char *)src1->data + i12*nb12 + i13*nb13, nb11/ggml_type_size(src1->type), + src0->type, (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03, nb01, ///ggml_type_size(src0->type), + src1->type, (const char *)src1->data + i12*nb12 + i13*nb13, nb11, ///ggml_type_size(src1->type), (float *)((char *)dst->data + i12*nb2 + i13*nb3), nb1/ggml_type_size(dst->type), ith, nth)) goto IQK_MulMat_Not_Available1; return; @@ -13123,8 +13167,8 @@ UseGgmlGemm1:; for (int64_t i13 = 0; i13 < ne13; i13++) for (int64_t i12 = 0; i12 < ne12; i12++) if (!iqk_mul_mat(ne01, ne11, ne00, - src0->type, (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03, nb01/ggml_type_size(src0->type), - vec_dot_type, (const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size, row_size/ggml_type_size(vec_dot_type), + src0->type, (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03, nb01, ///ggml_type_size(src0->type), + vec_dot_type, (const char *)wdata 
+ (i12*ne11 + i13*ne12*ne11)*row_size, row_size, ///ggml_type_size(vec_dot_type), (float *)((char *)dst->data + i12*nb2 + i13*nb3), nb1/ggml_type_size(dst->type), ith, nth)) goto IQK_MulMat_Not_Available2; return; @@ -13353,8 +13397,8 @@ static void ggml_compute_forward_mul_mat_id( #if GGML_USE_IQK_MULMAT if (ne13 == 1 && dst->type == GGML_TYPE_F32) { if (!iqk_mul_mat_moe(nr0, nr1, ne00, ne11, - src0->type, (const char *)src0_cur, nb01/ggml_type_size(src0->type), - vec_dot_type, (const char *)wdata, row_size/ggml_type_size(vec_dot_type), + src0->type, (const char *)src0_cur, nb01, ///ggml_type_size(src0->type), + vec_dot_type, (const char *)wdata, row_size, ///ggml_type_size(vec_dot_type), (float *)dst->data, nb1, nb2, matrix_rows + cur_a*ne12, ith, nth)) goto IQK_MulMat_Not_Available; continue; @@ -13870,7 +13914,7 @@ static void ggml_compute_forward_softcap( default: { GGML_ASSERT(false); - } break; + } } } @@ -13986,7 +14030,7 @@ static void ggml_compute_forward_softcap_max( default: { GGML_ASSERT(false); - } break; + } } } @@ -18652,11 +18696,11 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor case GGML_OP_SOFTCAP: { GGML_ASSERT(false); // TODO: not implemented - } break; + } case GGML_OP_SOFT_CAP_MAX: { GGML_ASSERT(false); // TODO: not implemented - } break; + } case GGML_OP_SET: { const size_t nb1 = ((int32_t *) tensor->op_params)[0]; |