diff options
author | Johannes Gäßler <johannesg@5d6.de> | 2023-05-20 14:19:28 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-05-20 15:19:28 +0300 |
commit | affc76edfdefa7b326f526e463cc65ff13fcfb92 (patch) | |
tree | 6f197652f2d8cba9d585fc0d1baab3733421c623 /llama.cpp | |
parent | ea600071cb005267e9e8f2629c1e406dd5fde083 (diff) |
cuda : loading models directly into VRAM, norm calculation on GPU, broadcasting for ggml_mul (#1483)
* Broadcasting for ggml_mul
* CUDA kernel for ggml_mul, norms in VRAM
* GPU weights not in RAM, direct loading with cuFile
* fixup! GPU weights not in RAM, direct loading with cuFile
* fixup! GPU weights not in RAM, direct loading with cuFile
* define default model path once, sync path with readme (#1366)
* ~7% faster Q5_1 AVX2 code (#1477)
* convert.py: Support models which are stored in a single pytorch_model.bin (#1469)
* Support models in a single pytorch_model.bin
* Remove spurious line with typo
* benchmark-matmul: Print the average of the test results (#1490)
* Remove unused n_parts parameter (#1509)
* Fixes #1511 lambda issue for w64devkit (mingw) (#1513)
* Fix for w64devkit and mingw
* make kv_f16 the default for api users (#1517)
* minor : fix compile warnings
* readme : adds WizardLM to the list of supported models (#1485)
* main : make reverse prompt option act as a stop token in non-interactive mode (#1032)
* Make reverse prompt option act as a stop token in non-interactive scenarios
* Making requested review changes
* Update gpt_params_parse and fix a merge error
* Revert "Update gpt_params_parse and fix a merge error"
This reverts commit 2bb2ff1748513591ad45b175a75ed1d8089d84c8.
* Update gpt_params_parse and fix a merge error take 2
* examples : add persistent chat (#1495)
* examples : add persistent chat
* examples : fix whitespace
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* tests : add missing header
* ggml : use F16 instead of F32 in Q4_0, Q4_1, Q8_0 (#1508)
* ggml : use F16 instead of F32 in Q4_0, Q4_1 and Q8_0
* llama : bump LLAMA_FILE_VERSION to 3
* cuda : update Q4 and Q8 dequantize kernels
* ggml : fix AVX dot products
* readme : update performance table + hot topics
* ggml : fix scalar implementation of Q4_1 dot
* llama : fix compile warnings in llama_set_state_data()
* llama : fix name shadowing and C4146 (#1526)
* Fix name shadowing and C4146
* Fix if macros not using defined when required
* Update llama-util.h
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
* Update llama-util.h
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
* Code style
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---------
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Fix for mingw (#1462)
* llama : add llama_init_backend() API (close #1527)
* feature : add blis and other BLAS implementation support (#1502)
* feature: add blis support
* feature: allow all BLA_VENDOR to be assigned in cmake arguments. align with whisper.cpp pr 927
* fix: version detection for BLA_SIZEOF_INTEGER, recover min version of cmake
* Fix typo in INTEGER
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Revert "feature : add blis and other BLAS implementation support (#1502)"
This reverts commit 07e9ace0f9da424d82e75df969642522880feb92.
* GPU weights not in RAM, direct loading with cuFile
* llama : code style fixes + progress print fix
* ggml : ggml_mul better broadcast support
* cmake : workarounds for cufile when CMake version < 3.25
* gg rebase fixup
* Loop in llama.cpp, fixed progress callback
* Attempt clang-tidy fix
* llama : fix vram size computation
* Add forgotten fclose()
---------
Co-authored-by: András Salamon <ott2@users.noreply.github.com>
Co-authored-by: Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com>
Co-authored-by: Tom Jobbins <784313+TheBloke@users.noreply.github.com>
Co-authored-by: rankaiyx <rankaiyx@rankaiyx.com>
Co-authored-by: Stephan Walter <stephan@walter.name>
Co-authored-by: DannyDaemonic <DannyDaemonic@gmail.com>
Co-authored-by: Erik Scholz <Green-Sky@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: David Kennedy <dakennedyd@gmail.com>
Co-authored-by: Jason McCartney <jmac@theroot.org>
Co-authored-by: Evan Jones <evan.q.jones@gmail.com>
Co-authored-by: Maxime <672982+maximegmd@users.noreply.github.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Zenix <zenixls2@gmail.com>
Diffstat (limited to 'llama.cpp')
-rw-r--r-- | llama.cpp | 197 |
1 files changed, 118 insertions, 79 deletions
@@ -1,6 +1,7 @@ // Defines fileno on msys: #ifndef _GNU_SOURCE #define _GNU_SOURCE +#include <cstddef> #include <cstdint> #include <cstdio> #endif @@ -645,7 +646,7 @@ struct llama_model_loader { } } - struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne) { + struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) { auto it = tensors_map.name_to_idx.find(name); if (it == tensors_map.name_to_idx.end()) { throw format("llama.cpp: tensor '%s' is missing from model", name.c_str()); @@ -656,10 +657,10 @@ struct llama_model_loader { name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str()); } - return get_tensor_for(lt); + return get_tensor_for(lt, backend); } - struct ggml_tensor * get_tensor_for(llama_load_tensor & lt) { + struct ggml_tensor * get_tensor_for(llama_load_tensor & lt, ggml_backend backend) { struct ggml_tensor * tensor; if (lt.ne.size() == 2) { tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1)); @@ -669,6 +670,7 @@ struct llama_model_loader { } ggml_set_name(tensor, lt.name.c_str()); LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor + tensor->backend = backend; lt.ggml_tensor = tensor; num_ggml_tensors_created++; return tensor; @@ -682,12 +684,16 @@ struct llama_model_loader { void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) { size_t data_size = 0; + size_t prefetch_size = 0; for (const llama_load_tensor & lt : tensors_map.tensors) { data_size += lt.size; + if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) { + prefetch_size += lt.size; + } } if (use_mmap) { - mapping.reset(new llama_mmap(&file_loaders.at(0)->file)); + mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size)); if (!lmlock) { // Don't call the callback since the actual loading will be lazy // and we can't measure it. @@ -700,6 +706,9 @@ struct llama_model_loader { size_t done_size = 0; for (llama_load_tensor & lt : tensors_map.tensors) { + if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) { + continue; + } if (progress_callback) { progress_callback((float) done_size / data_size, progress_callback_user_data); } @@ -712,9 +721,6 @@ struct llama_model_loader { lmlock->grow_to(done_size); } } - if (progress_callback) { - progress_callback(1.0f, progress_callback_user_data); - } } void load_data_for(llama_load_tensor & lt) { @@ -969,27 +975,7 @@ static void llama_model_load_internal( size_t ctx_size; size_t mmapped_size; ml->calc_sizes(&ctx_size, &mmapped_size); - fprintf(stderr, "%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/1024.0/1024.0); - - // print memory requirements - { - const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1; - - // this is the total memory required to run the inference - const size_t mem_required = - ctx_size + - mmapped_size + - MEM_REQ_SCRATCH0().at(model.type) + - MEM_REQ_SCRATCH1().at(model.type) + - MEM_REQ_EVAL().at(model.type); - - // this is the memory required by one llama_state - const size_t mem_required_state = - scale*MEM_REQ_KV_SELF().at(model.type); - - fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__, - mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0); - } + fprintf(stderr, "%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0); // create the ggml context { @@ -1011,7 +997,14 @@ static void llama_model_load_internal( } } +#ifdef GGML_USE_CUBLAS +#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CUDA +#else +#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU +#endif + // prepare memory for the weights + size_t vram_total = 0; { const uint32_t n_embd = hparams.n_embd; const uint32_t n_layer = hparams.n_layer; @@ -1019,70 +1012,122 @@ static void llama_model_load_internal( ml->ggml_ctx = ctx; - model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}); - model.norm = ml->get_tensor("norm.weight", {n_embd}); - model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}); + model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.norm = ml->get_tensor("norm.weight", {n_embd}, GGML_BACKEND_CPU); + + // "output" tensor + { + ggml_backend backend_output; + if (n_gpu_layers > int(n_layer)) { // NOLINT + backend_output = LLAMA_BACKEND_OFFLOAD; + } else { + backend_output = GGML_BACKEND_CPU; + } + + model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, backend_output); + } + + const int i_gpu_start = n_layer - n_gpu_layers; model.layers.resize(n_layer); for (uint32_t i = 0; i < n_layer; ++i) { + const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; + auto & layer = model.layers[i]; std::string layers_i = "layers." + std::to_string(i); - layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}); + layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend); - layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}); - layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}); - layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}); - layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}); + layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend); + layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend); + layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend); + layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend); - layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}); + layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend); - layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}); - layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}); - layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}); + layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend); + layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend); + layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend); + + if (backend == GGML_BACKEND_CUDA) { + vram_total += + ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) + + ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.attention_norm) + + ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3); + } } } ml->done_getting_tensors(); - // populate `tensors_by_name` - for (llama_load_tensor & lt : ml->tensors_map.tensors) { - model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor); - } + // print memory requirements + { + const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1; - ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL); + // this is the total memory required to run the inference + const size_t mem_required = + ctx_size + + mmapped_size - vram_total + // weights in VRAM not in memory + MEM_REQ_SCRATCH0().at(model.type) + + MEM_REQ_SCRATCH1().at(model.type) + + MEM_REQ_EVAL().at(model.type); + + // this is the memory required by one llama_state + const size_t mem_required_state = + scale*MEM_REQ_KV_SELF().at(model.type); + + fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__, + mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0); - model.mapping = std::move(ml->mapping); #ifdef GGML_USE_CUBLAS - { const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer)); fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu); + if (n_gpu_layers > (int) hparams.n_layer) { + fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__); + } + fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024); +#else + (void) n_gpu_layers; +#endif + } - size_t vram_total = 0; + // populate `tensors_by_name` + for (llama_load_tensor & lt : ml->tensors_map.tensors) { + model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor); + } - for (int i = 0; i < n_gpu; ++i) { - const auto & layer = model.layers[i]; + ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL); - ggml_cuda_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq); - ggml_cuda_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk); - ggml_cuda_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv); - ggml_cuda_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo); - ggml_cuda_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1); - ggml_cuda_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2); - ggml_cuda_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3); +#ifdef GGML_USE_CUBLAS + { + size_t done_size = 0; + size_t data_size = 0; + for (llama_load_tensor & lt : ml->tensors_map.tensors) { + data_size += lt.size; + if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) { + done_size += lt.size; + } } - if (n_gpu_layers > (int) hparams.n_layer) { - fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__); - ggml_cuda_transform_tensor(model.output); vram_total += ggml_nbytes(model.output); + for (llama_load_tensor & lt : ml->tensors_map.tensors) { + if (lt.ggml_tensor->backend != GGML_BACKEND_CUDA) { + continue; + } + if (progress_callback) { + progress_callback((float) done_size / data_size, progress_callback_user_data); + } + ggml_cuda_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off); + done_size += lt.size; } + } +#endif // GGML_USE_CUBLAS - fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024); + if (progress_callback) { + progress_callback(1.0f, progress_callback_user_data); } -#else - (void) n_gpu_layers; -#endif + + model.mapping = std::move(ml->mapping); // loading time will be recalculate after the first eval, so // we take page faults deferred by mmap() into consideration @@ -1181,10 +1226,8 @@ static bool llama_eval_internal( { cur = ggml_rms_norm(ctx0, inpL); - // cur = attention_norm*cur - cur = ggml_mul(ctx0, - ggml_repeat(ctx0, model.layers[il].attention_norm, cur), - cur); + // cur = cur*attention_norm(broadcasted) + cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm); } // self-attention @@ -1291,10 +1334,8 @@ static bool llama_eval_internal( { cur = ggml_rms_norm(ctx0, inpFF); - // cur = ffn_norm*cur - cur = ggml_mul(ctx0, - ggml_repeat(ctx0, model.layers[il].ffn_norm, cur), - cur); + // cur = cur*ffn_norm(broadcasted) + cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); } struct ggml_tensor * tmp = ggml_mul_mat(ctx0, @@ -1331,10 +1372,8 @@ static bool llama_eval_internal( inpL = ggml_rms_norm(ctx0, inpL); - // inpL = norm*inpL - inpL = ggml_mul(ctx0, - ggml_repeat(ctx0, model.norm, inpL), - inpL); + // inpL = inpL*norm(broadcasted) + inpL = ggml_mul(ctx0, inpL, model.norm); embeddings = inpL; } @@ -2158,7 +2197,7 @@ struct llama_context * llama_init_from_file( unsigned * cur_percentage_p = (unsigned *) ctx; unsigned percentage = (unsigned) (100 * progress); while (percentage > *cur_percentage_p) { - ++*cur_percentage_p; + *cur_percentage_p = percentage; fprintf(stderr, "."); fflush(stderr); if (percentage >= 100) { @@ -2315,7 +2354,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * // maybe this should in llama_model_loader if (model_loader->use_mmap) { - model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ false)); + model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0)); } } @@ -2408,7 +2447,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * } size_t idx = model_loader->tensors_map.name_to_idx[base_name]; llama_load_tensor & lt = model_loader->tensors_map.tensors[idx]; - base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }); + base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU); lt.data = (uint8_t *) lt.ggml_tensor->data; model_loader->load_data_for(lt); lt.ggml_tensor->data = lt.data; |