diff options
author | Kawrakow <iwankawrakow@gmail.com> | 2025-03-18 15:41:05 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2025-03-18 15:41:05 +0100 |
commit | 8e549b42346f039be01a06495c6469d66d1c8926 (patch) | |
tree | 091738b442808081a6a5c9d565ea8339303dfb9b /ggml/src/ggml.c | |
parent | 68a5b60408b1085d2b2ed5de75e004ee23f8ddb9 (diff) |
Allow q8_0 cache on the CPU for FlashMLA-2 (#265)
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Diffstat (limited to 'ggml/src/ggml.c')
-rw-r--r-- | ggml/src/ggml.c | 64 |
1 file changed, 56 insertions, 8 deletions
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index a904464e..1552d91b 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -10569,6 +10569,58 @@ static void ggml_compute_forward_dup_q( const struct ggml_compute_params * params, struct ggml_tensor * dst) { + int64_t nrows = ggml_nrows(dst); + int ith = params->ith; + int nth = params->nth; + + if (dst->type == GGML_TYPE_Q8_0 && dst->src[0]->type == GGML_TYPE_Q8_0 && + ggml_are_same_shape(dst, dst->src[0])) { + + // we assume src is transposed and that's why we are here + + GGML_ASSERT(dst->ne[0] % QK8_0 == 0); + + struct ggml_tensor * const src = dst->src[0]; + GGML_ASSERT(src->nb[1] == sizeof(block_q8_0)); + + float aux[QK8_0]; + + int64_t n_per_thread = (nrows + nth - 1)/nth; + int64_t first_row = ith*n_per_thread; + if (first_row >= nrows) return; + int64_t last_row = MIN(first_row + n_per_thread, nrows); + + int64_t nblock = dst->ne[0] / QK8_0; + for (int64_t ir = first_row; ir < last_row; ++ir) { + int64_t i3 = ir/(dst->ne[1]*dst->ne[2]); + int64_t i2 = (ir - i3*dst->ne[1]*dst->ne[2])/dst->ne[1]; + int64_t i1 = ir - i3*dst->ne[1]*dst->ne[2] - i2*dst->ne[1]; + int ib0 = i1/QK8_0; + int iq0 = i1%QK8_0; + for (int ib = 0; ib < nblock; ++ib) { + block_q8_0 * dst_q8 = (block_q8_0 *)((char *)dst->data + i1*dst->nb[1] + i2*dst->nb[2] + i3*dst->nb[3]); + float amax = 0; + for (int j = 0; j < QK8_0; ++j) { + int64_t i0 = ib*QK8_0 + j; + const block_q8_0 * src_q8 = (const block_q8_0 *)((const char *)src->data + i0*src->nb[0] + i2*src->nb[2] + i3*src->nb[3]); + float xi = GGML_FP16_TO_FP32(src_q8[ib0].d) * src_q8[ib0].qs[iq0]; + aux[j] = xi; + xi = fabsf(xi); + amax = MAX(amax, xi); + } + float d = amax/127; + dst_q8[ib].d = GGML_FP32_TO_FP16(d); + if (d > 0) { + float id = 1/d; + for (int j = 0; j < QK8_0; ++j) dst_q8[ib].qs[j] = roundf(id*aux[j]); + } else { + memset(dst_q8[ib].qs, 0, QK8_0); + } + } + } + return; + } + GGML_ASSERT(dst->type == GGML_TYPE_F32); struct ggml_tensor * src0 = dst->src[0]; 
GGML_ASSERT(src0->ne[0] == dst->ne[0] && src0->nb[0] == ggml_type_size(src0->type)); @@ -10576,10 +10628,6 @@ static void ggml_compute_forward_dup_q( ggml_to_float_t to_float = type_traits[src0->type].to_float; GGML_ASSERT(to_float != NULL); - int64_t nrows = ggml_nrows(dst); - int ith = params->ith; - int nth = params->nth; - int64_t n_per_thread = (nrows + nth - 1)/nth; int64_t first_row = ith*n_per_thread; if (first_row >= nrows) return; @@ -10607,13 +10655,13 @@ static void ggml_compute_forward_dup( const struct ggml_tensor * src0 = dst->src[0]; - if (src0->type == dst->type) { - ggml_compute_forward_dup_bytes(params, dst); + if (ggml_is_quantized(src0->type)) { + ggml_compute_forward_dup_q(params, dst); return; } - if (ggml_is_quantized(src0->type)) { - ggml_compute_forward_dup_q(params, dst); + if (src0->type == dst->type) { + ggml_compute_forward_dup_bytes(params, dst); return; } |