summary | refs | log | tree | commit | diff
path: root/llama.cpp
diff options
context:
space:
mode:
author: Georgi Gerganov <ggerganov@gmail.com> 2023-08-25 11:55:59 +0300
committer: GitHub <noreply@github.com> 2023-08-25 11:55:59 +0300
commit: 3f460a2b723c8b936ac29ecfd02f244b3adeba55 (patch)
tree: 3159656f14a6646d745d2900452f83f9bc9ebed0 /llama.cpp
parent: 87e3733f24a85d894cc16e1cbdfa1ea1e81a76f3 (diff)
cuda : add RoPE kernel for mode == 2 (NeoX) (#2760)
* cuda : add RoPE kernel for mode == 2 (NeoX)
* falcon : do not offload the embeddings layer
Diffstat (limited to 'llama.cpp')
-rw-r--r-- llama.cpp 22
1 file changed, 21 insertions, 1 deletion
diff --git a/llama.cpp b/llama.cpp
index 67319396..52ba31d7 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1958,6 +1958,14 @@ static void llm_load_tensors(
model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+
+ if (backend_norm == GGML_BACKEND_GPU) {
+ vram_weights += ggml_nbytes(model.output_norm);
+ vram_weights += ggml_nbytes(model.output_norm_b);
+ }
+ if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+ vram_weights += ggml_nbytes(model.output);
+ }
}
const uint32_t n_ff = hparams.n_ff;
@@ -1967,7 +1975,7 @@ static void llm_load_tensors(
model.layers.resize(n_layer);
for (uint32_t i = 0; i < n_layer; ++i) {
- const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+ const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
auto & layer = model.layers[i];
@@ -1978,6 +1986,11 @@ static void llm_load_tensors(
if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i).c_str()) >= 0) {
layer.attn_norm_2 = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, backend);
layer.attn_norm_2_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, backend);
+
+ if (backend == GGML_BACKEND_GPU) {
+ vram_weights += ggml_nbytes(layer.attn_norm_2);
+ vram_weights += ggml_nbytes(layer.attn_norm_2_b);
+ }
}
layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
@@ -1985,6 +1998,13 @@ static void llm_load_tensors(
layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+
+ if (backend == GGML_BACKEND_GPU) {
+ vram_weights +=
+ ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
+ ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.wo) +
+ ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
+ }
}
} break;
default: