author     Johannes Gäßler <johannesg@5d6.de>       2023-05-13 15:38:36 +0200
committer  GitHub <noreply@github.com>              2023-05-13 16:38:36 +0300
commit     905d87b70aa189623d500a28602d7a3a755a4769 (patch)
tree       11f0d435ecb7555734b14b7a8994e88772bf8190 /llama.cpp
parent     f954edda935a70a14cf0cc45ecc7fe7d60cf3e4b (diff)
ggml : GPU-accelerated token generation (#1412)
* CUDA kernel for q4_0 dequant. + mat. vec. mult.
* Added q4_1 via template
* Added missing __syncthreads();
* --gpu_layers -> --gpu-layers
* Shorter dequantize_mul_mat_vec line
* q5_0 dequantize_mul_mat kernel
* More readable dequantize_mul_mat_vec logic
* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16
* llama : offload "output" tensor to GPU too + coding style fixes

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
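The core of the change is a fused "dequantize + matrix-vector multiply" CUDA kernel: rather than dequantizing a whole quantized weight matrix to floats and then running a GEMV, each thread block dequantizes one row's quantized blocks on the fly and accumulates its dot product with the input vector. The sketch below illustrates the idea for a q4_0-style block (one float scale plus 32 packed 4-bit values, two consecutive values per byte); the block layout, kernel name, and launch configuration are assumptions for illustration, not the actual ggml-cuda implementation.

#include <stdint.h>

#define QK 32  // number of weights per quantized block

// Illustrative q4_0-style block: one scale plus 32 packed 4-bit values,
// two consecutive values per byte (low nibble first).
typedef struct {
    float   d;           // scale
    uint8_t qs[QK / 2];  // 16 bytes of packed nibbles
} block_q4;

// One row of the weight matrix per thread block. Each thread walks a strided
// range of quantized blocks, dequantizes them in registers and accumulates the
// dot product with the input vector y; a shared-memory reduction then combines
// the per-thread partial sums into dst[row].
__global__ void dequantize_mul_mat_vec_q4(const block_q4 * x, const float * y,
                                          float * dst, int ncols) {
    const int row     = blockIdx.x;
    const int tid     = threadIdx.x;
    const int nblocks = ncols / QK;

    float sum = 0.0f;
    for (int ib = tid; ib < nblocks; ib += blockDim.x) {
        const block_q4 b = x[row * nblocks + ib];
        for (int j = 0; j < QK / 2; ++j) {
            const int v0 = (b.qs[j] & 0x0F) - 8;  // low nibble
            const int v1 = (b.qs[j] >>   4) - 8;  // high nibble
            sum += b.d * (v0 * y[ib * QK + 2*j] + v1 * y[ib * QK + 2*j + 1]);
        }
    }

    // block-wide reduction; assumes blockDim.x is a power of two <= 256
    __shared__ float tmp[256];
    tmp[tid] = sum;
    __syncthreads();
    for (int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s) {
            tmp[tid] += tmp[tid + s];
        }
        __syncthreads();
    }
    if (tid == 0) {
        dst[row] = tmp[0];
    }
}

A launch such as dequantize_mul_mat_vec_q4<<<nrows, 32>>>(x, y, dst, ncols) would produce one output element per thread block; the q4_1, q5_0, q5_1, q8_0 and f16 variants listed in the commit message differ only in how each block is decoded, which is why the commit factors them through a template.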
Diffstat (limited to 'llama.cpp')
-rw-r--r--  llama.cpp  37
1 file changed, 35 insertions, 2 deletions
diff --git a/llama.cpp b/llama.cpp
index 08c73523..73b932a7 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -9,6 +9,9 @@
#include "llama.h"
#include "ggml.h"
+#ifdef GGML_USE_CUBLAS
+#include "ggml-cuda.h"
+#endif
#include <array>
#include <ctime>
@@ -810,6 +813,7 @@ struct llama_context_params llama_context_default_params() {
struct llama_context_params result = {
/*.n_ctx =*/ 512,
/*.n_parts =*/ -1,
+ /*.gpu_layers =*/ 0,
/*.seed =*/ -1,
/*.f16_kv =*/ false,
/*.logits_all =*/ false,
@@ -876,6 +880,7 @@ static void llama_model_load_internal(
const std::string & fname,
llama_context & lctx,
int n_ctx,
+ int n_gpu_layers,
ggml_type memory_type,
bool use_mmap,
bool use_mlock,
@@ -1022,6 +1027,33 @@ static void llama_model_load_internal(
ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
model.mapping = std::move(ml->mapping);
+#ifdef GGML_USE_CUBLAS
+ {
+ const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
+
+ fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
+
+ size_t vram_total = 0;
+
+ for (int i = 0; i < n_gpu; ++i) {
+ const auto & layer = model.layers[i];
+
+ ggml_cuda_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
+ ggml_cuda_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
+ ggml_cuda_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
+ ggml_cuda_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
+ ggml_cuda_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
+ ggml_cuda_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
+ ggml_cuda_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
+ }
+ if (n_gpu_layers > (int) hparams.n_layer) {
+ fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
+ ggml_cuda_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
+ }
+
+ fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+ }
+#endif
// loading time will be recalculate after the first eval, so
// we take page faults deferred by mmap() into consideration
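The hunk above is the load-time half of the feature: for the first n_gpu layers every attention and feed-forward weight matrix is handed to ggml_cuda_transform_tensor, and the running vram_total is reported so the user can see what the offload costs in VRAM. The function itself is not part of this diff; conceptually it copies the tensor's data into device memory once so that the mat-vec kernels can read the weights without a per-token host-to-device transfer. A hypothetical sketch of that step (illustrative only, not the real ggml-cuda bookkeeping):

#include <cuda_runtime.h>
#include <stddef.h>

// Hypothetical helper: upload a tensor's weights to VRAM once at load time and
// remember the device pointer for later kernel launches. The real
// ggml_cuda_transform_tensor lives in ggml-cuda.cu and also tracks layout and
// type information; this only shows the basic host-to-device copy.
struct gpu_weights {
    void * device_data;  // weights resident in VRAM
    size_t nbytes;
};

static gpu_weights upload_weights(const void * host_data, size_t nbytes) {
    gpu_weights w = { /*device_data=*/ NULL, nbytes };
    cudaMalloc(&w.device_data, nbytes);
    cudaMemcpy(w.device_data, host_data, nbytes, cudaMemcpyHostToDevice);
    return w;
}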
@@ -1032,6 +1064,7 @@ static bool llama_model_load(
const std::string & fname,
llama_context & lctx,
int n_ctx,
+ int n_gpu_layers,
ggml_type memory_type,
bool use_mmap,
bool use_mlock,
@@ -1039,7 +1072,7 @@ static bool llama_model_load(
llama_progress_callback progress_callback,
void *progress_callback_user_data) {
try {
- llama_model_load_internal(fname, lctx, n_ctx, memory_type, use_mmap, use_mlock,
+ llama_model_load_internal(fname, lctx, n_ctx, n_gpu_layers, memory_type, use_mmap, use_mlock,
vocab_only, progress_callback, progress_callback_user_data);
return true;
} catch (const std::string & err) {
@@ -2111,7 +2144,7 @@ struct llama_context * llama_init_from_file(
ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
- if (!llama_model_load(path_model, *ctx, params.n_ctx, memory_type,
+ if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_gpu_layers, memory_type,
params.use_mmap, params.use_mlock, params.vocab_only,
params.progress_callback, params.progress_callback_user_data)) {
fprintf(stderr, "%s: failed to load model\n", __func__);
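On the API side the change is a single new knob: llama_context_params gains an n_gpu_layers field (default 0, i.e. CPU-only inference), which llama_init_from_file forwards through llama_model_load down to the loader shown above. A minimal usage sketch, assuming a cuBLAS-enabled build of llama.cpp from this era; the model path is a placeholder.

#include "llama.h"

int main(void) {
    llama_context_params params = llama_context_default_params();
    // Offload the first 20 transformer layers; per the loader above, a value
    // greater than n_layer additionally offloads the output tensor.
    params.n_gpu_layers = 20;

    llama_context * ctx = llama_init_from_file("models/7B/ggml-model-q4_0.bin", params);
    if (ctx == NULL) {
        return 1;
    }

    // ... tokenize and evaluate as usual; the offloaded layers now run their
    // dequantize + mat-vec multiplications on the GPU ...

    llama_free(ctx);
    return 0;
}

The same value is exposed on the command line as --gpu-layers (renamed from --gpu_layers in this PR).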