Diffstat (limited to 'llama.cpp')
-rw-r--r--  llama.cpp  37
1 file changed, 35 insertions(+), 2 deletions(-)
diff --git a/llama.cpp b/llama.cpp
index 08c73523..73b932a7 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -9,6 +9,9 @@
#include "llama.h"
#include "ggml.h"
+#ifdef GGML_USE_CUBLAS
+#include "ggml-cuda.h"
+#endif
#include <array>
#include <ctime>
@@ -810,6 +813,7 @@ struct llama_context_params llama_context_default_params() {
     struct llama_context_params result = {
         /*.n_ctx =*/ 512,
         /*.n_parts =*/ -1,
+        /*.n_gpu_layers =*/ 0,
         /*.seed =*/ -1,
         /*.f16_kv =*/ false,
         /*.logits_all =*/ false,
@@ -876,6 +880,7 @@ static void llama_model_load_internal(
const std::string & fname,
llama_context & lctx,
int n_ctx,
+ int n_gpu_layers,
ggml_type memory_type,
bool use_mmap,
bool use_mlock,
@@ -1022,6 +1027,33 @@ static void llama_model_load_internal(
     ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
 
     model.mapping = std::move(ml->mapping);
+#ifdef GGML_USE_CUBLAS
+    {
+        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
+
+        fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
+
+        size_t vram_total = 0;
+
+        for (int i = 0; i < n_gpu; ++i) {
+            const auto & layer = model.layers[i];
+
+            ggml_cuda_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
+            ggml_cuda_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
+            ggml_cuda_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
+            ggml_cuda_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
+            ggml_cuda_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
+            ggml_cuda_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
+            ggml_cuda_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
+        }
+        if (n_gpu_layers > (int) hparams.n_layer) {
+            fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
+            ggml_cuda_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
+        }
+
+        fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+    }
+#endif
 
     // loading time will be recalculated after the first eval, so
     // we take page faults deferred by mmap() into consideration
@@ -1032,6 +1064,7 @@ static bool llama_model_load(
const std::string & fname,
llama_context & lctx,
int n_ctx,
+ int n_gpu_layers,
ggml_type memory_type,
bool use_mmap,
bool use_mlock,
@@ -1039,7 +1072,7 @@ static bool llama_model_load(
         llama_progress_callback progress_callback,
         void *progress_callback_user_data) {
     try {
-        llama_model_load_internal(fname, lctx, n_ctx, memory_type, use_mmap, use_mlock,
+        llama_model_load_internal(fname, lctx, n_ctx, n_gpu_layers, memory_type, use_mmap, use_mlock,
                                   vocab_only, progress_callback, progress_callback_user_data);
         return true;
     } catch (const std::string & err) {
@@ -2111,7 +2144,7 @@ struct llama_context * llama_init_from_file(
 
     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
 
-    if (!llama_model_load(path_model, *ctx, params.n_ctx, memory_type,
+    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_gpu_layers, memory_type,
                           params.use_mmap, params.use_mlock, params.vocab_only,
                           params.progress_callback, params.progress_callback_user_data)) {
         fprintf(stderr, "%s: failed to load model\n", __func__);