author | Nexes the Elder <124105151+Nexesenex@users.noreply.github.com> | 2024-10-18 09:48:15 +0200
---|---|---
committer | GitHub <noreply@github.com> | 2024-10-18 09:48:15 +0200
commit | 03cabe15401adba617c672ce8708a3501be8f15c (patch) |
tree | eda129c338de73c9c8e21a9b70eb9dc02480b3ad /src/llama.cpp |
parent | 76b97c80645362ac65a2e33043fd8d46bdaf8c56 (diff) |
CLI - Specify GGML_TYPE to quantize for the main tensors. (#91)
To complement token_embd.weight and output.weight:
attn_v.weight
attn_k.weight
attn_q.weight
attn_output.weight
attn_qkv.weight
ffn_gate
ffn_down
ffn_up
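For callers going through the C API rather than the CLI, the same overrides are exposed as new fields on llama_model_quantize_params (see the diff below); GGML_TYPE_COUNT is the "unset" default, and any concrete ggml_type takes precedence over the ftype heuristics. A minimal sketch, assuming this branch's llama.h; the chosen types and file names are illustrative only:

#include "llama.h"

int main() {
    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype         = LLAMA_FTYPE_MOSTLY_Q4_K_M;  // base quantization recipe
    // Per-tensor overrides added by this commit; anything left at GGML_TYPE_COUNT keeps the heuristic.
    params.attn_v_type   = GGML_TYPE_Q6_K;   // hold attn_v.weight at higher precision
    params.attn_k_type   = GGML_TYPE_Q4_K;
    params.ffn_down_type = GGML_TYPE_Q5_K;
    // Paths are placeholders for an f16 GGUF input and the quantized output.
    return llama_model_quantize("model-f16.gguf", "model-custom.gguf", &params);
}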
Diffstat (limited to 'src/llama.cpp')
-rw-r--r-- | src/llama.cpp | 56 |
1 file changed, 48 insertions(+), 8 deletions(-)
diff --git a/src/llama.cpp b/src/llama.cpp
index d9eec461..cae91619 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -15716,7 +15716,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             }
         }
     } else if (name.find("attn_v.weight") != std::string::npos) {
-        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
+        if (qs.params->attn_v_type < GGML_TYPE_COUNT) new_type = qs.params->attn_v_type;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
             new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_K) {
@@ -15775,7 +15776,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         }
         ++qs.i_attention_wv;
     } else if (name.find("attn_k.weight") != std::string::npos) {
-        if (qs.model.hparams.n_expert == 8) {
+        if (qs.params->attn_k_type < GGML_TYPE_COUNT) new_type = qs.params->attn_k_type;
+        else if (qs.model.hparams.n_expert == 8) {
             // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
             // TODO: explore better strategies
             new_type = GGML_TYPE_Q8_0;
@@ -15787,7 +15789,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             new_type = GGML_TYPE_IQ2_S;
         }
     } else if (name.find("attn_q.weight") != std::string::npos) {
-        if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
+        if (qs.params->attn_q_type < GGML_TYPE_COUNT) new_type = qs.params->attn_q_type;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
             new_type = GGML_TYPE_IQ3_XXS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
@@ -15796,7 +15799,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     } else if (name.find("ffn_down") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
         int i_layer = info.first, n_layer = info.second;
-        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+        if (qs.params->ffn_down_type < GGML_TYPE_COUNT) new_type = qs.params->ffn_down_type;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
             if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
         }
@@ -15843,7 +15847,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         }
         ++qs.i_ffn_down;
     } else if (name.find("attn_output.weight") != std::string::npos) {
-        if (arch != LLM_ARCH_FALCON) {
+        if (qs.params->attn_output_type < GGML_TYPE_COUNT) new_type = qs.params->attn_output_type;
+        else if (arch != LLM_ARCH_FALCON) {
             if (qs.model.hparams.n_expert >= 8) {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K   || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
                     ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
@@ -15866,7 +15871,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         }
     }
     else if (name.find("attn_qkv.weight") != std::string::npos) {
-        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
+        if (qs.params->attn_qkv_type < GGML_TYPE_COUNT) new_type = qs.params->attn_qkv_type;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
             new_type = GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M ) new_type = GGML_TYPE_IQ4_K;
@@ -15876,7 +15882,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     else if (name.find("ffn_gate") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
         int i_layer = info.first, n_layer = info.second;
-        if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
+        if (qs.params->ffn_gate_type < GGML_TYPE_COUNT) new_type = qs.params->ffn_gate_type;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
             new_type = GGML_TYPE_IQ3_XXS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_KL && use_more_bits(i_layer, n_layer)) {
@@ -15887,7 +15894,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     else if (name.find("ffn_up") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
         int i_layer = info.first, n_layer = info.second;
-        if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
+        if (qs.params->ffn_up_type < GGML_TYPE_COUNT) new_type = qs.params->ffn_up_type;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
             new_type = GGML_TYPE_IQ3_XXS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_KL && use_more_bits(i_layer, n_layer)) {
@@ -16323,6 +16331,30 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) {
                 new_type = params->output_tensor_type;
             }
+            if (params->attn_q_type < GGML_TYPE_COUNT && strcmp(tensor->name, "attn_q.weight") == 0) {
+                new_type = params->attn_q_type;
+            }
+            if (params->attn_k_type < GGML_TYPE_COUNT && strcmp(tensor->name, "attn_k.weight") == 0) {
+                new_type = params->attn_k_type;
+            }
+            if (params->attn_v_type < GGML_TYPE_COUNT && strcmp(tensor->name, "attn_v.weight") == 0) {
+                new_type = params->attn_v_type;
+            }
+            if (params->attn_qkv_type < GGML_TYPE_COUNT && strcmp(tensor->name, "attn_qkv.weight") == 0) {
+                new_type = params->attn_qkv_type;
+            }
+            if (params->attn_output_type < GGML_TYPE_COUNT && strcmp(tensor->name, "attn_output.weight") == 0) {
+                new_type = params->attn_output_type;
+            }
+            if (params->ffn_gate_type < GGML_TYPE_COUNT && strcmp(tensor->name, "ffn_gate") == 0) {
+                new_type = params->ffn_gate_type;
+            }
+            if (params->ffn_down_type < GGML_TYPE_COUNT && strcmp(tensor->name, "ffn_down") == 0) {
+                new_type = params->ffn_down_type;
+            }
+            if (params->ffn_up_type < GGML_TYPE_COUNT && strcmp(tensor->name, "ffn_up") == 0) {
+                new_type = params->ffn_up_type;
+            }

             // If we've decided to quantize to the same type the tensor is already
             // in then there's nothing to do.
@@ -16726,6 +16758,14 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
         /*.ftype                =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
         /*.output_tensor_type   =*/ GGML_TYPE_COUNT,
         /*.token_embedding_type =*/ GGML_TYPE_COUNT,
+        /*.attn_q_type          =*/ GGML_TYPE_COUNT,
+        /*.attn_k_type          =*/ GGML_TYPE_COUNT,
+        /*.attn_v_type          =*/ GGML_TYPE_COUNT,
+        /*.attn_qkv_type        =*/ GGML_TYPE_COUNT,
+        /*.attn_output_type     =*/ GGML_TYPE_COUNT,
+        /*.ffn_gate_type        =*/ GGML_TYPE_COUNT,
+        /*.ffn_down_type        =*/ GGML_TYPE_COUNT,
+        /*.ffn_up_type          =*/ GGML_TYPE_COUNT,
         /*.allow_requantize     =*/ false,
         /*.quantize_output_tensor =*/ true,
         /*.only_copy            =*/ false,
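Every branch in llama_tensor_get_type follows the same override pattern: the user-supplied type wins whenever it is a real ggml_type (anything below the GGML_TYPE_COUNT sentinel), and only otherwise does the existing ftype-based heuristic run. Schematically (a paraphrase of the change above, not verbatim source; heuristic_for_ftype is a hypothetical stand-in for the pre-existing rules):

// attn_v.weight shown as the example; the other seven tensor families follow the same shape.
if (qs.params->attn_v_type < GGML_TYPE_COUNT) {
    new_type = qs.params->attn_v_type;            // explicit per-tensor choice from the CLI/API
} else {
    new_type = heuristic_for_ftype(ftype, qs);    // placeholder for the old ftype-driven selection
}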