Diffstat (limited to 'llama.cpp')
-rw-r--r--  llama.cpp  37  +++++++++++++++++++++++++++++++++++--
1 file changed, 35 insertions(+), 2 deletions(-)
diff --git a/llama.cpp b/llama.cpp
--- a/llama.cpp
+++ b/llama.cpp
@@ -9,6 +9,9 @@
 #include "llama.h"
 
 #include "ggml.h"
+#ifdef GGML_USE_CUBLAS
+#include "ggml-cuda.h"
+#endif
 
 #include <array>
 #include <ctime>
@@ -810,6 +813,7 @@ struct llama_context_params llama_context_default_params() {
     struct llama_context_params result = {
         /*.n_ctx      =*/ 512,
         /*.n_parts    =*/ -1,
+        /*.gpu_layers =*/ 0,
         /*.seed       =*/ -1,
         /*.f16_kv     =*/ false,
         /*.logits_all =*/ false,
@@ -876,6 +880,7 @@ static void llama_model_load_internal(
         const std::string & fname,
         llama_context & lctx,
         int n_ctx,
+        int n_gpu_layers,
         ggml_type memory_type,
         bool use_mmap,
         bool use_mlock,
@@ -1022,6 +1027,33 @@ static void llama_model_load_internal(
     ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
 
     model.mapping = std::move(ml->mapping);
+#ifdef GGML_USE_CUBLAS
+    {
+        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
+
+        fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
+
+        size_t vram_total = 0;
+
+        for (int i = 0; i < n_gpu; ++i) {
+            const auto & layer = model.layers[i];
+
+            ggml_cuda_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
+            ggml_cuda_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
+            ggml_cuda_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
+            ggml_cuda_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
+            ggml_cuda_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
+            ggml_cuda_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
+            ggml_cuda_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
+        }
+        if (n_gpu_layers > (int) hparams.n_layer) {
+            fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
+            ggml_cuda_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
+        }
+
+        fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+    }
+#endif
 
     // loading time will be recalculate after the first eval, so
     // we take page faults deferred by mmap() into consideration
@@ -1032,6 +1064,7 @@ static bool llama_model_load(
         const std::string & fname,
         llama_context & lctx,
         int n_ctx,
+        int n_gpu_layers,
         ggml_type memory_type,
         bool use_mmap,
         bool use_mlock,
@@ -1039,7 +1072,7 @@ static bool llama_model_load(
         llama_progress_callback progress_callback,
         void *progress_callback_user_data) {
     try {
-        llama_model_load_internal(fname, lctx, n_ctx, memory_type, use_mmap, use_mlock,
+        llama_model_load_internal(fname, lctx, n_ctx, n_gpu_layers, memory_type, use_mmap, use_mlock,
                                   vocab_only, progress_callback, progress_callback_user_data);
         return true;
     } catch (const std::string & err) {
@@ -2111,7 +2144,7 @@ struct llama_context * llama_init_from_file(
 
     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
 
-    if (!llama_model_load(path_model, *ctx, params.n_ctx, memory_type,
+    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_gpu_layers, memory_type,
                           params.use_mmap, params.use_mlock, params.vocab_only,
                           params.progress_callback, params.progress_callback_user_data)) {
         fprintf(stderr, "%s: failed to load model\n", __func__);
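
The patch above threads the new n_gpu_layers parameter from llama_context_params down into model loading; the offload path only takes effect in builds compiled with GGML_USE_CUBLAS, and a value larger than the model's layer count additionally offloads the output tensor. For context, a minimal caller-side sketch (assuming the llama.h API at this revision; the model path and layer count are illustrative, not part of the patch):

// Request cuBLAS offloading of the first 32 transformer layers at load time.
#include "llama.h"

#include <cstdio>

int main() {
    llama_context_params params = llama_context_default_params();
    params.n_gpu_layers = 32; // 0 keeps everything on the CPU; > n_layer also offloads the output tensor

    // Path is a placeholder for any ggml model file.
    llama_context * ctx = llama_init_from_file("models/7B/ggml-model-q4_0.bin", params);
    if (ctx == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    // ... evaluate / sample as usual ...

    llama_free(ctx);
    return 0;
}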