summary refs log tree commit diff
path: root/llama.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'llama.cpp')
-rw-r--r-- llama.cpp 5
1 files changed, 4 insertions, 1 deletions
diff --git a/llama.cpp b/llama.cpp
index 7770fa0e..2ebd40df 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -10498,7 +10498,10 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
return std::make_pair(i_layer, n_layer);
};
- if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
+ // for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
+ // with the quantization of the output tensor
+ if (name == tn(LLM_TENSOR_OUTPUT, "weight") ||
+ (LLM_TENSOR_NAMES.at(arch).find(LLM_TENSOR_OUTPUT) == LLM_TENSOR_NAMES.at(arch).end() && name == "token_embd.weight")) {
int nx = tensor->ne[0];
if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
new_type = GGML_TYPE_Q8_0;