iqk_mul_mat: attention matrix multiplications

K*Q and KQ*V are n_kv_embed x n_token x n_head matrix multiplications. Before this PR, this meant n_head calls to iqk_mul_mat to perform n_kv_embed x n_token 2D multiplications, each using nth threads. With this PR, if n_head is a multiple of nth, each thread instead performs n_head/nth of the n_kv_embed x n_token 2D matrix multiplications on its own. This improves PP-512 (32 threads) for Bitnet-3B to 433 t/s, up from 409 t/s. It is beneficial in other cases too. E.g., for LLaMA-7B, we go to 201 t/s, up from 193 t/s, for q4_K_S, and to 144 t/s, up from 139 t/s, for fp16. All these numbers are for the Ryzen-7950X CPU.
author: Iwan Kawrakow <iwan.kawrakow@gmail.com> 2024-07-18 14:00:56 +0300
committer: Iwan Kawrakow <iwan.kawrakow@gmail.com> 2024-07-18 14:00:56 +0300
commit: 8db01c0804b603cb76bbee82ebb1a144c8d3592e (patch)
tree: c668a7fbf539881c1f2508829973f914f1f8f5a1
parent: 744eb9ffa955fa3557cc835995e45448c3c06bcb (diff)
1 files changed, 16 insertions, 1 deletions
diff --git a/ggml.c b/ggml.c
index a15d41ad..9a83059d 100644
--- a/ggml.c
+++ b/ggml.c
@@ -12368,7 +12368,22 @@ static void ggml_compute_forward_mul_mat(
     //   compute by src0 rows
 
 #if GGML_USE_IQK_MULMAT
-    if (ggml_is_contiguous(src1) && dst->type == GGML_TYPE_F32) {
+    if (dst->type == GGML_TYPE_F32 && params->type == GGML_TASK_TYPE_COMPUTE && (ne12*ne13)%nth == 0) {
+        int counter = 0;
+        for (int64_t i13 = 0; i13 < ne13; i13++) {
+            for (int64_t i12 = 0; i12 < ne12; i12++) {
+                if (counter++ % nth == ith) {
+                    if (!iqk_mul_mat(params->type, ne01, ne11, ne00,
+                                src0->type, (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03, nb01/ggml_type_size(src0->type),
+                                src1->type, (const char *)src1->data + i12*nb12 + i13*nb13, nb11/ggml_type_size(src1->type),
+                                (float *)((char *)dst->data + i12*nb2 + i13*nb3), nb1/ggml_type_size(dst->type),
+                                0, 1)) goto IQK_MulMat_Not_Available1;
+                }
+            }
+        }
+        return;
+    }
+    if (dst->type == GGML_TYPE_F32) {
         for (int64_t i13 = 0; i13 < ne13; i13++)
             for (int64_t i12 = 0; i12 < ne12; i12++)
                 if (!iqk_mul_mat(params->type, ne01, ne11, ne00,
author	Iwan Kawrakow <iwan.kawrakow@gmail.com>	2024-07-18 14:00:56 +0300
committer	Iwan Kawrakow <iwan.kawrakow@gmail.com>	2024-07-18 14:00:56 +0300
commit	8db01c0804b603cb76bbee82ebb1a144c8d3592e (patch)
tree	c668a7fbf539881c1f2508829973f914f1f8f5a1
parent	744eb9ffa955fa3557cc835995e45448c3c06bcb (diff)