| author | Georgi Gerganov <ggerganov@gmail.com> | 2023-08-27 16:40:48 +0300 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2023-08-27 16:40:48 +0300 |
| commit | eaa13a48ff4136f01c1cdb79cacd61b67ec53095 (patch) | |
| tree | 1e22d465164eb73b72dd6dab345987ea5691e6f2 /llama.cpp | |
| parent | da7455d0467b5f5cc2e45d0dcffaf098df13db63 (diff) | |
falcon : fix CUDA inference by making K and Q contiguous (#2830)
* falcon : fix CUDA inference by making K and Q contiguous
ggml-ci
* cuda : add assert to guard from non-cont ropes
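The CUDA-side assert mentioned above lives in the CUDA backend, not in llama.cpp, so it does not appear in the diff below (the diffstat is limited to 'llama.cpp'). A minimal sketch of such a guard, assuming it sits at the entry of the CUDA rope path (the wrapper function and its placement are assumptions here; `GGML_ASSERT` and `ggml_is_contiguous` are existing ggml helpers):

```c
#include "ggml.h"

// Sketch of the guard (placement assumed): the CUDA rope kernel indexes
// its input as densely packed memory, so a strided view produced by
// ggml_view_3d would be read with the wrong offsets. Asserting up front
// turns silently wrong output into an immediate, debuggable failure.
static void rope_cuda_check_input(const struct ggml_tensor * src0) {
    GGML_ASSERT(ggml_is_contiguous(src0)); // reject non-contiguous rope inputs
}
```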
Diffstat (limited to 'llama.cpp')
| -rw-r--r-- | llama.cpp | 10 |
|---|---|---|

1 file changed, 6 insertions, 4 deletions
```diff
@@ -2642,18 +2642,20 @@ static struct ggml_cgraph * llm_build_falcon(
 
                 const size_t wsize = ggml_type_size(cur->type);
 
-                struct ggml_tensor * tmpq = ggml_view_3d(
+                // TODO: these 2 ggml_conts are technically not needed, but we add them until CUDA support for
+                //       non-contiguous views is added for the rope operator
+                struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_3d(
                         ctx0, cur, n_embd_head, n_head, N,
                         wsize * n_embd_head,
                         wsize * n_embd_head * (n_head + 2 * n_head_kv),
-                        0);
+                        0));
                 offload_func_kq(tmpq);
 
-                struct ggml_tensor * tmpk = ggml_view_3d(
+                struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_3d(
                         ctx0, cur, n_embd_head, n_head_kv, N,
                         wsize * n_embd_head,
                         wsize * n_embd_head * (n_head + 2 * n_head_kv),
-                        wsize * n_embd_head * n_head);
+                        wsize * n_embd_head * n_head));
                 offload_func_kq(tmpk);
 
                 struct ggml_tensor * tmpv = ggml_view_3d(
```
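For context on why the `ggml_cont` calls fix the bug: `ggml_view_3d` returns a tensor that aliases its parent's data with caller-supplied byte strides (`nb1`, `nb2`). In Falcon's fused QKV tensor, each position's row holds the Q heads, then the K heads, then the V heads, so the Q and K views above step over the K/V slots and are not contiguous; `ggml_cont` inserts a copy node whose output is densely packed, which the CUDA rope kernel can then index safely. A standalone sketch of the same pattern, with invented dimensions and scratch-buffer size (all ggml calls are the real API):

```c
#include "ggml.h"

int main(void) {
    // small scratch context; the size is arbitrary for this sketch
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // stand-in for the fused QKV projection output: one row per position,
    // laid out as [Q heads | K heads | V heads] (dimensions invented)
    const int64_t n_embd_head = 64, n_head = 8, n_head_kv = 1, N = 4;
    struct ggml_tensor * cur = ggml_new_tensor_2d(ctx, GGML_TYPE_F32,
            n_embd_head * (n_head + 2*n_head_kv), N);

    const size_t wsize = ggml_type_size(cur->type);

    // same view as in the Falcon graph: Q occupies the first n_head slots
    struct ggml_tensor * tmpq = ggml_view_3d(ctx, cur,
            n_embd_head, n_head, N,
            wsize * n_embd_head,
            wsize * n_embd_head * (n_head + 2*n_head_kv),
            0);

    // the stride between positions (nb2) jumps over the K and V slots,
    // so the view is not contiguous; this is what the CUDA assert rejects
    GGML_ASSERT(!ggml_is_contiguous(tmpq));

    // ggml_cont adds a copy op; its result tensor is densely packed,
    // so rope can run on it even with the stricter CUDA backend
    struct ggml_tensor * q_cont = ggml_cont(ctx, tmpq);
    GGML_ASSERT(ggml_is_contiguous(q_cont));

    ggml_free(ctx);
    return 0;
}
```

As the TODO in the diff notes, the extra copies are a workaround rather than the end state: once the CUDA rope operator accepts non-contiguous views, the `ggml_cont` wrappers can be dropped again.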