gemma : use more bits for the token_embd.weight tensor (#5650)

* gemma : use Q8_0 for the token_embd.weight tensor
* llama : quantize token_embd.weight using output type
author: Georgi Gerganov <ggerganov@gmail.com> 2024-02-22 23:23:46 +0200
committer: GitHub <noreply@github.com> 2024-02-22 23:23:46 +0200
commit: 96633eeca1265ed03e57230de54032041c58f9cd (patch)
tree: f3e0370d7f304666030968a4f0fb8a36f693b605 /llama.cpp
parent: 847eedbdb2d1ebf14ef56eb507d4b4b975510908 (diff)
1 file changed, 4 insertions, 1 deletion
diff --git a/llama.cpp b/llama.cpp
index 7770fa0e..2ebd40df 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -10498,7 +10498,10 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         return std::make_pair(i_layer, n_layer);
     };
 
-    if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
+    // for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
+    // with the quantization of the output tensor
+    if (name == tn(LLM_TENSOR_OUTPUT, "weight") ||
+        (LLM_TENSOR_NAMES.at(arch).find(LLM_TENSOR_OUTPUT) == LLM_TENSOR_NAMES.at(arch).end() && name == "token_embd.weight")) {
         int nx = tensor->ne[0];
         if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
             new_type = GGML_TYPE_Q8_0;
author	Georgi Gerganov <ggerganov@gmail.com>	2024-02-22 23:23:46 +0200
committer	GitHub <noreply@github.com>	2024-02-22 23:23:46 +0200
commit	96633eeca1265ed03e57230de54032041c58f9cd (patch)
tree	f3e0370d7f304666030968a4f0fb8a36f693b605 /llama.cpp
parent	847eedbdb2d1ebf14ef56eb507d4b4b975510908 (diff)