author     Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>  2023-11-13 01:58:15 -0700
committer  GitHub <noreply@github.com>  2023-11-13 01:58:15 -0700
commit     bb50a792ec2a49944470c82694fa364345e95170 (patch)
tree       1ad53a7f00d4cc76a91943a51729806db16988db /llama.cpp
parent     21fd874c8d2a14dea2d56724e4357c0824aee6a8 (diff)
Add ReLU and SQR CUDA ops to (partially) fix Persimmon offloading (#4041)
* Add ReLU and SQR CUDA ops to fix Persimmon offloading

* Persimmon loader: More helpful error on CUDA/ROCM when offloading too many layers
Diffstat (limited to 'llama.cpp')
-rw-r--r--  llama.cpp  7
1 file changed, 7 insertions, 0 deletions
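
The ReLU and SQR kernels themselves are added in ggml-cuda.cu, which this diffstat (limited to 'llama.cpp') does not show. Purely as an illustration of what such element-wise CUDA ops typically look like, and not the actual ggml-cuda code from this commit, here is a minimal sketch; the kernel names, launch helpers, and the 256-thread block size are assumptions:

// Illustrative sketch only: generic one-thread-per-element ReLU and SQR kernels,
// not the ggml-cuda.cu code added by this commit.
#include <cuda_runtime.h>

static __global__ void relu_f32(const float * x, float * dst, const int k) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= k) {
        return;
    }
    dst[i] = fmaxf(x[i], 0.0f); // ReLU: max(x, 0)
}

static __global__ void sqr_f32(const float * x, float * dst, const int k) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= k) {
        return;
    }
    dst[i] = x[i] * x[i]; // SQR: element-wise square
}

// Host-side launch helpers; the 256-thread block size is an arbitrary choice here.
static void relu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
    const int num_blocks = (k + 255) / 256;
    relu_f32<<<num_blocks, 256, 0, stream>>>(x, dst, k);
}

static void sqr_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
    const int num_blocks = (k + 255) / 256;
    sqr_f32<<<num_blocks, 256, 0, stream>>>(x, dst, k);
}
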
diff --git a/llama.cpp b/llama.cpp
index d682d286..a5f3876c 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2877,6 +2877,13 @@ static void llm_load_tensors(
                     ggml_backend_type backend_output;
 
                     if (n_gpu_layers > int(n_layer)) {
+#ifdef GGML_USE_CUBLAS
+                        if (n_gpu_layers > int(n_layer + 1)) {
+                            LLAMA_LOG_ERROR("%s: CUDA backend missing Persimmon CUDA ops, can offload at most %ld layers. See: https://github.com/ggerganov/llama.cpp/issues/4038\n",
+                                __func__, n_layer + 1);
+                            throw std::runtime_error("Persimmon CUDA offload failed");
+                        }
+#endif
                         // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
                         // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
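
When this guard fires, the thrown std::runtime_error propagates out of llm_load_tensors and, as with other load errors, surfaces to the caller as a failed model load. A minimal sketch of how an application would observe it, assuming a hypothetical persimmon-8b-q4_0.gguf file and the llama.h API of this period (llama_model_default_params, llama_load_model_from_file):

// Sketch only: the model path is hypothetical and the layer count is deliberately
// too large; with a cuBLAS build this should now fail cleanly instead of crashing.
#include "llama.h"
#include <cstdio>

int main() {
    llama_backend_init(false /* numa */);

    llama_model_params mparams = llama_model_default_params();
    // Requesting more than n_layer + 1 GPU layers for a Persimmon model trips the
    // new guard; 99 simply means "offload everything".
    mparams.n_gpu_layers = 99;

    llama_model * model = llama_load_model_from_file("persimmon-8b-q4_0.gguf", mparams);
    if (model == nullptr) {
        // The loader logs the LLAMA_LOG_ERROR message above and returns a null model.
        fprintf(stderr, "model load failed (too many offloaded layers?)\n");
        return 1;
    }

    llama_free_model(model);
    llama_backend_free();
    return 0;
}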