ggml : full ALiBi support (#7192)

* ggml : full ALiBi support
* ggml : update ggml_soft_max_ext() CUDA, SYCL
* ggml : ggml_flash_attn_ext() support ALiBi (CPU)
* ggml : ggml_flash_attn_ext() support ALiBi (Metal)
* ggml : fix warning
* ggml : ggml_flash_attn_ext() support ALiBi (CUDA)

ggml-ci

* ggml : fix assert message
* vulkan : add dev notes
* ggml : require mask when using ALiBi

ggml-ci

* convert : fix convert for refact models
author: Georgi Gerganov <ggerganov@gmail.com> 2024-05-11 10:32:41 +0300
committer: GitHub <noreply@github.com> 2024-05-11 10:32:41 +0300
commit: 9cb317f77e53067f7a138cc89ef7657148eae8e6 (patch)
tree: 3ba1d2d80d1d7c8b4ab01f6396a3febaae26e91b /ggml.h
parent: e849648888a11de13aaaa4cb2eda3f5a9c7b444d (diff)
1 files changed, 3 insertions, 15 deletions
diff --git a/ggml.h b/ggml.h
index fe605382..76c33283 100644
--- a/ggml.h
+++ b/ggml.h
@@ -468,7 +468,6 @@ extern "C" {
         GGML_OP_SOFT_MAX_BACK,
         GGML_OP_ROPE,
         GGML_OP_ROPE_BACK,
-        GGML_OP_ALIBI,
         GGML_OP_CLAMP,
         GGML_OP_CONV_TRANSPOSE_1D,
         GGML_OP_IM2COL,
@@ -1428,15 +1427,13 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
 
-    // fused soft_max(a*scale + mask + pos[i]*(ALiBi slope))
+    // fused soft_max(a*scale + mask*(ALiBi slope))
     // mask is optional
-    // pos is required when max_bias > 0.0f
     // max_bias = 0.0f for no ALiBi
     GGML_API struct ggml_tensor * ggml_soft_max_ext(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             struct ggml_tensor  * mask,
-            struct ggml_tensor  * pos,
             float                 scale,
             float                 max_bias);
 
@@ -1538,16 +1535,6 @@ extern "C" {
             float                 xpos_base,
             bool                  xpos_down);
 
-    // alibi position embedding
-    // in-place, returns view(a)
-    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_alibi(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int                   n_past,
-            int                   n_head,
-            float                 bias_max),
-        "use ggml_soft_max_ext instead (will be removed in Mar 2024)");
-
     // clamp
     // in-place, returns view(a)
     GGML_API struct ggml_tensor * ggml_clamp(
@@ -1744,7 +1731,8 @@ extern "C" {
             struct ggml_tensor  * k,
             struct ggml_tensor  * v,
             struct ggml_tensor  * mask,
-            float                 scale);
+            float                 scale,
+            float                 max_bias);
 
     GGML_API void ggml_flash_attn_ext_set_prec(
             struct ggml_tensor * a,
author	Georgi Gerganov <ggerganov@gmail.com>	2024-05-11 10:32:41 +0300
committer	GitHub <noreply@github.com>	2024-05-11 10:32:41 +0300
commit	9cb317f77e53067f7a138cc89ef7657148eae8e6 (patch)
tree	3ba1d2d80d1d7c8b4ab01f6396a3febaae26e91b /ggml.h
parent	e849648888a11de13aaaa4cb2eda3f5a9c7b444d (diff)