| author | Georgi Gerganov <ggerganov@gmail.com> | 2023-08-27 16:40:48 +0300 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2023-08-27 16:40:48 +0300 |
| commit | eaa13a48ff4136f01c1cdb79cacd61b67ec53095 (patch) | |
| tree | 1e22d465164eb73b72dd6dab345987ea5691e6f2 /llama.cpp | |
| parent | da7455d0467b5f5cc2e45d0dcffaf098df13db63 (diff) | |
falcon : fix CUDA inference by making K and Q contiguous (#2830)
* falcon : fix CUDA inference by making K and Q contiguous
ggml-ci
* cuda : add assert to guard from non-cont ropes
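The CUDA-side assert mentioned above lives in the CUDA backend, not in llama.cpp, so it does not appear in the diff below (the diffstat is limited to 'llama.cpp'). A minimal sketch of such a guard, assuming it sits at the entry of the CUDA rope path (the wrapper function and its placement are assumptions here; `GGML_ASSERT` and `ggml_is_contiguous` are existing ggml helpers):

```c
#include "ggml.h"

// Sketch of the guard (placement assumed): the CUDA rope kernel indexes
// its input as densely packed memory, so a strided view produced by
// ggml_view_3d would be read with the wrong offsets. Asserting up front
// turns silently wrong output into an immediate, debuggable failure.
static void rope_cuda_check_input(const struct ggml_tensor * src0) {
    GGML_ASSERT(ggml_is_contiguous(src0)); // reject non-contiguous rope inputs
}
```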
Diffstat (limited to 'llama.cpp')
| -rw-r--r-- | llama.cpp | 10 |
|---|---|---|

1 file changed, 6 insertions, 4 deletions
```diff
@@ -2642,18 +2642,20 @@ static struct ggml_cgraph * llm_build_falcon(
 
                 const size_t wsize = ggml_type_size(cur->type);
 
-                struct ggml_tensor * tmpq = ggml_view_3d(
+                // TODO: these 2 ggml_conts are technically not needed, but we add them until CUDA support for
+                //       non-contiguous views is added for the rope operator
+                struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_3d(
                         ctx0, cur, n_embd_head, n_head, N,
                         wsize * n_embd_head,
                         wsize * n_embd_head * (n_head + 2 * n_head_kv),
-                        0);
+                        0));
                 offload_func_kq(tmpq);
 
-                struct ggml_tensor * tmpk = ggml_view_3d(
+                struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_3d(
                         ctx0, cur, n_embd_head, n_head_kv, N,
                         wsize * n_embd_head,
                         wsize * n_embd_head * (n_head + 2 * n_head_kv),
-                        wsize * n_embd_head * n_head);
+                        wsize * n_embd_head * n_head));
                 offload_func_kq(tmpk);
 
                 struct ggml_tensor * tmpv = ggml_view_3d(
```
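For context on why the `ggml_cont` calls fix the bug: `ggml_view_3d` returns a tensor that aliases its parent's data with caller-supplied byte strides (`nb1`, `nb2`). In Falcon's fused QKV tensor, each position's row holds the Q heads, then the K heads, then the V heads, so the Q and K views above step over the K/V slots and are not contiguous; `ggml_cont` inserts a copy node whose output is densely packed, which the CUDA rope kernel can then index safely. A standalone sketch of the same pattern, with invented dimensions and scratch-buffer size (all ggml calls are the real API):

```c
#include "ggml.h"

int main(void) {
    // small scratch context; the size is arbitrary for this sketch
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // stand-in for the fused QKV projection output: one row per position,
    // laid out as [Q heads | K heads | V heads] (dimensions invented)
    const int64_t n_embd_head = 64, n_head = 8, n_head_kv = 1, N = 4;
    struct ggml_tensor * cur = ggml_new_tensor_2d(ctx, GGML_TYPE_F32,
            n_embd_head * (n_head + 2*n_head_kv), N);

    const size_t wsize = ggml_type_size(cur->type);

    // same view as in the Falcon graph: Q occupies the first n_head slots
    struct ggml_tensor * tmpq = ggml_view_3d(ctx, cur,
            n_embd_head, n_head, N,
            wsize * n_embd_head,
            wsize * n_embd_head * (n_head + 2*n_head_kv),
            0);

    // the stride between positions (nb2) jumps over the K and V slots,
    // so the view is not contiguous; this is what the CUDA assert rejects
    GGML_ASSERT(!ggml_is_contiguous(tmpq));

    // ggml_cont adds a copy op; its result tensor is densely packed,
    // so rope can run on it even with the stricter CUDA backend
    struct ggml_tensor * q_cont = ggml_cont(ctx, tmpq);
    GGML_ASSERT(ggml_is_contiguous(q_cont));

    ggml_free(ctx);
    return 0;
}
```

As the TODO in the diff notes, the extra copies are a workaround rather than the end state: once the CUDA rope operator accepts non-contiguous views, the `ggml_cont` wrappers can be dropped again.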