author    | Meng Zhang <meng@tabbyml.com> | 2023-11-05 04:40:08 -0800
committer | GitHub <noreply@github.com>   | 2023-11-05 14:40:08 +0200
commit    | 3d48f42efcd05381221654376e9f6f69d76af739
tree      | 56ea096ffb23eb86e66a97ac04faccefb4e63b3b
parent    | c41ea36eaa3548776de4cb3d5d49b925cd3fc0f2
llama : mark LLM_ARCH_STARCODER as full offload supported (#3945)
as done in https://github.com/ggerganov/llama.cpp/pull/3827
-rw-r--r-- | llama.cpp | 11
1 file changed, 6 insertions(+), 5 deletions(-)
@@ -5164,11 +5164,12 @@ static int llama_decode_internal(
     // If all tensors can be run on the GPU then using more than 1 thread is detrimental.
     const bool full_offload_supported =
-        model.arch == LLM_ARCH_LLAMA ||
-        model.arch == LLM_ARCH_BAICHUAN ||
-        model.arch == LLM_ARCH_FALCON ||
-        model.arch == LLM_ARCH_REFACT ||
-        model.arch == LLM_ARCH_MPT;
+        model.arch == LLM_ARCH_LLAMA    ||
+        model.arch == LLM_ARCH_BAICHUAN ||
+        model.arch == LLM_ARCH_FALCON   ||
+        model.arch == LLM_ARCH_REFACT   ||
+        model.arch == LLM_ARCH_MPT      ||
+        model.arch == LLM_ARCH_STARCODER;
 
     const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
 
     if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
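For illustration, the sketch below mimics the decision this hunk changes: decoding drops to a single thread only when the model architecture is on the full-offload whitelist (which now includes StarCoder) and enough layers are offloaded to cover the whole model plus three extra tensors. The enum, struct, and sample values are simplified stand-ins, not the real llama.cpp types, and the cuBLAS availability check from the actual code is omitted here.

```cpp
#include <cstdio>

// Hypothetical, simplified stand-ins for the real llama.cpp types (illustration only).
enum llm_arch {
    LLM_ARCH_LLAMA, LLM_ARCH_BAICHUAN, LLM_ARCH_FALCON,
    LLM_ARCH_REFACT, LLM_ARCH_MPT, LLM_ARCH_STARCODER, LLM_ARCH_OTHER
};

struct model_info {
    llm_arch arch;
    int      n_gpu_layers; // layers requested on the GPU (e.g. via -ngl)
    int      n_layer;      // transformer layers in the model
};

// Mirrors the post-patch condition: STARCODER is now in the arch whitelist,
// and "fully offloaded" means n_gpu_layers covers all layers plus 3 extra tensors.
// (The real code additionally requires ggml_cpu_has_cublas(); omitted here.)
static bool use_single_thread(const model_info & m) {
    const bool full_offload_supported =
        m.arch == LLM_ARCH_LLAMA    ||
        m.arch == LLM_ARCH_BAICHUAN ||
        m.arch == LLM_ARCH_FALCON   ||
        m.arch == LLM_ARCH_REFACT   ||
        m.arch == LLM_ARCH_MPT      ||
        m.arch == LLM_ARCH_STARCODER;

    const bool fully_offloaded = m.n_gpu_layers >= m.n_layer + 3;

    return full_offload_supported && fully_offloaded;
}

int main() {
    // Example values only; a StarCoder model fully offloaded to the GPU.
    model_info starcoder{LLM_ARCH_STARCODER, /*n_gpu_layers=*/43, /*n_layer=*/40};
    std::printf("single-thread decode: %s\n",
                use_single_thread(starcoder) ? "yes" : "no");
    return 0;
}
```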