author    Georgi Gerganov <ggerganov@gmail.com>  2023-08-27 16:40:48 +0300
committer GitHub <noreply@github.com>            2023-08-27 16:40:48 +0300
commit    eaa13a48ff4136f01c1cdb79cacd61b67ec53095 (patch)
tree      1e22d465164eb73b72dd6dab345987ea5691e6f2 /ggml-cuda.cu
parent    da7455d0467b5f5cc2e45d0dcffaf098df13db63 (diff)
falcon : fix CUDA inference by making K and Q contiguous (#2830)
* falcon : fix CUDA inference by making K and Q contiguous

ggml-ci

* cuda : add assert to guard from non-cont ropes
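The commit message is terse, so here is the idea in code: a minimal sketch of the model-graph side of the fix, assuming hypothetical names (ctx0, Qcur_view, Kcur_view, n_past, head_dim are illustrative, and the ggml_rope signature is the one from this era of the tree). The fix wraps the K and Q views that come out of Falcon's fused QKV projection in ggml_cont() so the CUDA rope kernel sees a flat layout:

    // Falcon slices K and Q out of one fused QKV matmul via views/permutes,
    // which leaves them non-contiguous; ggml_cont() copies them into a dense
    // layout before rope (variable names are hypothetical).
    struct ggml_tensor * Qcur = ggml_cont(ctx0, Qcur_view);
    struct ggml_tensor * Kcur = ggml_cont(ctx0, Kcur_view);
    Qcur = ggml_rope(ctx0, Qcur, n_past, head_dim, 2, 0); // mode 2 = NeoX-style rope
    Kcur = ggml_rope(ctx0, Kcur, n_past, head_dim, 2, 0);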
Diffstat (limited to 'ggml-cuda.cu')
-rw-r--r--  ggml-cuda.cu  |  2 ++
1 file changed, 2 insertions(+), 0 deletions(-)
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index d83aefc9..d76a25dc 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -6337,9 +6337,11 @@ void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml
 }
 
 void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented
 
     const int mode = ((int32_t *) dst->op_params)[2];
     const bool is_glm = mode & 4;
+
     ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, !is_glm); // flatten support not implemented for glm
 }
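For readers wondering what the new assert rejects: a hedged sketch of the invariant, written to mirror the spirit of ggml_is_contiguous under ggml's stride convention (ne[i] = elements in dimension i, nb[i] = byte stride of dimension i), not copied from the source:

    // a tensor is contiguous when every stride equals the byte size of the
    // dimensions below it, i.e. views/permutes left no gaps or reordering
    static bool is_contiguous_sketch(const struct ggml_tensor * t) {
        return t->nb[0] == ggml_type_size(t->type) &&
               t->nb[1] == t->nb[0] * (t->ne[0] / ggml_blck_size(t->type)) &&
               t->nb[2] == t->nb[1] * t->ne[1] &&
               t->nb[3] == t->nb[2] * t->ne[2];
    }

The K and Q tensors that Falcon's graph fed to rope were views with permuted strides, so they fail such a check; hence the companion change that runs them through ggml_cont() first, with this assert left behind to fail fast if any other path hands the CUDA rope a non-contiguous tensor.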