From bb50a792ec2a49944470c82694fa364345e95170 Mon Sep 17 00:00:00 2001
From: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Date: Mon, 13 Nov 2023 01:58:15 -0700
Subject: Add ReLU and SQR CUDA ops to (partially) fix Persimmon offloading
 (#4041)

* Add ReLU and SQR CUDA ops to fix Persimmon offloading

* Persimmon loader: More helpful error on CUDA/ROCM when offloading too many layers
---
 llama.cpp | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'llama.cpp')

diff --git a/llama.cpp b/llama.cpp
index d682d286..a5f3876c 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2877,6 +2877,13 @@ static void llm_load_tensors(
                     ggml_backend_type backend_output;
 
                     if (n_gpu_layers > int(n_layer)) {
+#ifdef GGML_USE_CUBLAS
+                        if (n_gpu_layers > int(n_layer + 1)) {
+                            LLAMA_LOG_ERROR("%s: CUDA backend missing Persimmon CUDA ops, can offload at most %ld layers. See: https://github.com/ggerganov/llama.cpp/issues/4038\n",
+                                __func__, n_layer + 1);
+                            throw std::runtime_error("Persimmon CUDA offload failed");
+                        }
+#endif
                         // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
                         // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
--
cgit v1.2.3
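
Editor's note: the ggml-cuda.cu changes that actually add the ReLU and SQR ops are outside this view, which is limited to llama.cpp; only the new offload-limit error check is shown above. For orientation, below is a minimal sketch of what element-wise CUDA kernels of this kind typically look like. The names (relu_f32, sqr_f32, SKETCH_BLOCK_SIZE) and launch parameters are illustrative assumptions, not the code from this commit.

// Sketch only: illustrative element-wise ReLU and SQR CUDA kernels,
// not the actual ggml-cuda.cu change referenced in the subject line.
#include <cuda_runtime.h>
#include <math.h>

#define SKETCH_BLOCK_SIZE 256   // assumed block size, chosen for illustration

static __global__ void relu_f32(const float * x, float * dst, const int k) {
    const int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i >= k) {
        return;                  // guard the tail of the last block
    }
    dst[i] = fmaxf(x[i], 0.0f);  // ReLU: max(x, 0)
}

static __global__ void sqr_f32(const float * x, float * dst, const int k) {
    const int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i >= k) {
        return;
    }
    dst[i] = x[i] * x[i];        // SQR: x^2
}

// Host-side launchers: one thread per element, element count rounded up
// to whole blocks, launched on the given stream.
static void relu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
    const int num_blocks = (k + SKETCH_BLOCK_SIZE - 1) / SKETCH_BLOCK_SIZE;
    relu_f32<<<num_blocks, SKETCH_BLOCK_SIZE, 0, stream>>>(x, dst, k);
}

static void sqr_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
    const int num_blocks = (k + SKETCH_BLOCK_SIZE - 1) / SKETCH_BLOCK_SIZE;
    sqr_f32<<<num_blocks, SKETCH_BLOCK_SIZE, 0, stream>>>(x, dst, k);
}

Each kernel maps one thread to one tensor element; until such ops exist for every operator in a model's graph, layers using the missing ops cannot be offloaded, which is why the llama.cpp hunk above caps Persimmon offloading at n_layer + 1 layers under GGML_USE_CUBLAS.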