Fix Zen4 Flash Attention (#35)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
author: Kawrakow <48489457+ikawrakow@users.noreply.github.com> 2024-09-02 15:54:24 +0300
committer: GitHub <noreply@github.com> 2024-09-02 15:54:24 +0300
commit: 9b53c2533fb8c236f319b874c5ff592de8fcd3b4 (patch)
tree: 0a66009e83a426cca3b9bcad0053ea46010f77a9
parent: 5518e24be82b278506154a1a1bf871d10e47821e (diff)
2 files changed, 12 insertions, 5 deletions
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 4546eac3..771bc8ca 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -16154,11 +16154,12 @@ static void ggml_compute_forward_flash_attn_ext_f16(
         mask && mask->type == GGML_TYPE_F16) {
         int64_t work_per_slice = D*nek1*neq1;
         int ntg = 1;
-        if      (nth%8 == 0 && work_per_slice >= (1 << 23)) ntg = 8;
-        else if (nth%4 == 0 && work_per_slice >= (1 << 21)) ntg = 4;
-        else if (nth%2 == 0 && work_per_slice >= (1 << 19)) ntg = 2;
+        if      (nth%8 == 0 && neq1%8 == 0 && work_per_slice >= (1 << 23)) ntg = 8;
+        else if (nth%4 == 0 && neq1%4 == 0 && work_per_slice >= (1 << 21)) ntg = 4;
+        else if (nth%2 == 0 && neq1%2 == 0 && work_per_slice >= (1 << 19)) ntg = 2;
         if ((neq2*neq3)%(nth/ntg) == 0) {
-            //if (ith == 0) printf("%s: D = %d, neq2 = %d, neq1 = %d, nek1 = %d\n", __func__, (int)D, (int)neq2, (int)neq1, (int)nek1);
+            //if (ith == 0) printf("%s: D = %d, neq2 = %d, neq1 = %d, nek1 = %d, ntg = %d, neq1/ntg = %d\n", __func__,
+            //        (int)D, (int)neq2, (int)neq1, (int)nek1, ntg, (int)(neq1/ntg));
             int counter = 0;
             for (int64_t iq3 = 0; iq3 < neq3; iq3++) {
                 for (int64_t iq2 = 0; iq2 < neq2; iq2++) {
diff --git a/ggml/src/iqk/iqk_mul_mat.cpp b/ggml/src/iqk/iqk_mul_mat.cpp
index 55dd016c..7dd817b9 100644
--- a/ggml/src/iqk/iqk_mul_mat.cpp
+++ b/ggml/src/iqk/iqk_mul_mat.cpp
@@ -6120,7 +6120,7 @@ struct FlashAttn {
         }
         auto qr = q + m1*stride_q;
         auto vsum = _mm512_mul_ps(vk[0], _mm512_loadu_ps(qr));
-        for (int i = 0; i < D/16; ++i) {
+        for (int i = 1; i < D/16; ++i) {
             vsum = _mm512_fmadd_ps(vk[i], _mm512_loadu_ps(qr + 16*i), vsum);
         }
         cache[k_step*m1 + l1] = _mm512_reduce_add_ps(vsum);
@@ -6138,6 +6138,11 @@ struct FlashAttn {
         }
 
         float smax = reduce_T<_mm512_reduce_max_ps, _mm512_max_ps>(vk);
+        if (smax == -INFINITY) {
+            std::memset(cache + k_step*j, 0, k_step*sizeof(float));
+            need_scaling[j] = M[j] == -INFINITY ? 2 : 0;
+            return;
+        }
         need_scaling[j] = 0;
         if (smax > M[j]) {
             if (M[j] > -INFINITY) {
@@ -6404,6 +6409,7 @@ template <int D, int q_step, int k_step>
 inline void iqk_flash_helper_T(int nq1, int nk1, int stride_q, int stride_k, int stride_v, int stride_m, int stride_qkv,
                         const float * q, const char * k, const char * v, const char * mask,
                         float scale, float softcap, float * qkv) {
+
     if (nq1 >= q_step) {
         FlashAttn<D, q_step, k_step> fa(scale, softcap);
         fa.compute(nq1, nk1, stride_k, stride_q, stride_m, stride_v, stride_qkv,
author	Kawrakow <48489457+ikawrakow@users.noreply.github.com>	2024-09-02 15:54:24 +0300
committer	GitHub <noreply@github.com>	2024-09-02 15:54:24 +0300
commit	9b53c2533fb8c236f319b874c5ff592de8fcd3b4 (patch)
tree	0a66009e83a426cca3b9bcad0053ea46010f77a9
parent	5518e24be82b278506154a1a1bf871d10e47821e (diff)