diff options
author | Nexes the Elder <124105151+Nexesenex@users.noreply.github.com> | 2024-10-19 17:24:43 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-10-19 17:24:43 +0200 |
commit | a077f09bcb33a07c33408e7eb078529aa4fa6b4a (patch) | |
tree | 00ee6794d80a098b294ee09fcb285498398138bd /src/llama.cpp | |
parent | 7b886ae3d876dfb569cdd02cca688066315a0667 (diff) |
Quant strategies: attn_q Q4 & attn_v Q6 for Llama 3.1 Q5_K_S (#96)
* attn_q Q4 & attn_v Q6 for Llama 3.1 Q5_K_S
Pattern worth testing on more quants and on L3 8B.
PPL 512 = -0.024 for 70b ; -0.005 for 8b
Size = -640MiB for 70b ; -64MiB for 8b
70b Q5_K_S now beats Q5_K_M by -0.012 ppl
I suspect that it goes for L3 as well, which was quite insensitive to attn_q quantization.
* indent
Diffstat (limited to 'src/llama.cpp')
-rw-r--r-- | src/llama.cpp | 8 |
1 files changed, 8 insertions, 0 deletions
diff --git a/src/llama.cpp b/src/llama.cpp index cae91619..0aaedb96 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -15756,6 +15756,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) && use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) { + if (qs.model.hparams.n_vocab >= 127999 && (qs.model.type == MODEL_8B || qs.model.type == MODEL_70B)) + new_type = GGML_TYPE_Q6_K; + } if (qs.model.type == MODEL_70B) { // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with @@ -15796,6 +15800,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { new_type = GGML_TYPE_IQ2_S; } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) { + if (qs.model.hparams.n_vocab >= 127999 && (qs.model.type == MODEL_8B || qs.model.type == MODEL_70B)) + new_type = GGML_TYPE_Q4_K; + } } else if (name.find("ffn_down") != std::string::npos) { auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str()); int i_layer = info.first, n_layer = info.second; |