diff options
author | Kawrakow <iwankawrakow@gmail.com> | 2024-12-04 15:20:07 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-12-04 15:20:07 +0100 |
commit | f64de08203aaee95ca755336de3e1db85d990198 (patch) | |
tree | 9af01056e0b304ee5df5792f25d82066931eb4d6 /src/llama.cpp | |
parent | f1f4eb988fe5ee969100cd0d3782fd7460d13949 (diff) |
IQ4_XS_R4 (#123)
* Adding iq4_xs_r4
This is a 1st working version on Zen4.
We get PP-512(LLaMA-3.1-8B) = 226 t/s, so 16% slower
than iq4_nl_x4.
* iq4_xs_r4: WIP
* iq4_xs_r4: Use AVX2 version for matrix x vector on Zen4
* iq4_xs_r4: NEON
We get PP-512(LLaMA-3.1-8B) = 115.6 t/s on M2-Max,
up from 68.2 t/s for iq4_xs!
* DRY
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Diffstat (limited to 'src/llama.cpp')
-rw-r--r-- | src/llama.cpp | 20 |
1 files changed, 17 insertions, 3 deletions
diff --git a/src/llama.cpp b/src/llama.cpp index f307fd89..e2abc235 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -3850,6 +3850,7 @@ struct llama_model_loader { case GGML_TYPE_IQ2_BN: ftype = LLAMA_FTYPE_MOSTLY_IQ2_BN; break; case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break; case GGML_TYPE_IQ4_NL_X4:ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL_X4;break; + case GGML_TYPE_IQ4_XS_R4:ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS_R4;break; case GGML_TYPE_Q4_0_R4: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_R4; break; case GGML_TYPE_Q5_0_R4: ftype = LLAMA_FTYPE_MOSTLY_Q5_0_R4; break; case GGML_TYPE_Q6_0_R4: ftype = LLAMA_FTYPE_MOSTLY_Q6_0_R4; break; @@ -4559,6 +4560,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_IQ1_M: return "IQ1_M - 1.75 bpw"; case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw"; case LLAMA_FTYPE_MOSTLY_IQ4_NL_X4:return "IQ4_NL_X4 - 4.5 bpw"; + case LLAMA_FTYPE_MOSTLY_IQ4_XS_R4:return "IQ4_XS_R4 - 4.25 bpw"; case LLAMA_FTYPE_MOSTLY_Q4_0_R4: return "Q4_0_R4 - 4.5 bpw"; case LLAMA_FTYPE_MOSTLY_Q5_0_R4: return "Q5_0_R4 - 5.5 bpw"; case LLAMA_FTYPE_MOSTLY_Q6_0_R4: return "Q6_0_R4 - 6.5 bpw"; @@ -15779,6 +15781,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (new_type == GGML_TYPE_IQ4_NL_X4) { new_type = GGML_TYPE_IQ4_NL; } + else if (new_type == GGML_TYPE_IQ4_XS_R4) { + new_type = GGML_TYPE_IQ4_XS; + } else if (new_type == GGML_TYPE_Q4_0_R4) { new_type = GGML_TYPE_Q4_0; } @@ -15852,7 +15857,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; - else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL_X4 || + else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS || + ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL_X4 || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ4_KS || ftype == LLAMA_FTYPE_MOSTLY_IQ4_KSS) && qs.model.hparams.n_gqa() >= 2) { new_type = GGML_TYPE_IQ5_K; } @@ -15883,6 +15889,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (new_type == GGML_TYPE_Q4_K || new_type == GGML_TYPE_IQ4_XS) new_type = GGML_TYPE_Q5_K; else if (new_type == GGML_TYPE_IQ4_NL) new_type = GGML_TYPE_Q5_K; else if (new_type == GGML_TYPE_IQ4_NL_X4) new_type = GGML_TYPE_Q5_K; + else if (new_type == GGML_TYPE_IQ4_XS_R4) new_type = GGML_TYPE_Q5_K; else if (new_type == GGML_TYPE_Q5_K) new_type = GGML_TYPE_Q6_K; } ++qs.i_attention_wv; @@ -15947,7 +15954,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n } else if (i_layer < n_layer/8 && !qs.has_imatrix && (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS || - ftype == LLAMA_FTYPE_MOSTLY_IQ4_KS || ftype == LLAMA_FTYPE_MOSTLY_IQ4_KSS || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL_X4)) { + ftype == LLAMA_FTYPE_MOSTLY_IQ4_KS || ftype == LLAMA_FTYPE_MOSTLY_IQ4_KSS || + ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL_X4 || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS_R4)) { new_type = GGML_TYPE_Q5_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K; @@ -15973,7 +15981,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ4_K || - ftype == LLAMA_FTYPE_MOSTLY_IQ2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_K || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL_X4) { + ftype == LLAMA_FTYPE_MOSTLY_IQ2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_K || + ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL_X4 || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS_R4) { new_type = GGML_TYPE_Q5_K; } } else { @@ -16183,6 +16192,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s case LLAMA_FTYPE_MOSTLY_IQ2_BN: default_type = GGML_TYPE_IQ2_BN; break; case LLAMA_FTYPE_MOSTLY_IQ4_NL: default_type = GGML_TYPE_IQ4_NL; break; case LLAMA_FTYPE_MOSTLY_IQ4_NL_X4:default_type = GGML_TYPE_IQ4_NL_X4;break; + case LLAMA_FTYPE_MOSTLY_IQ4_XS_R4:default_type = GGML_TYPE_IQ4_XS_R4;break; case LLAMA_FTYPE_MOSTLY_Q4_0_R4: default_type = GGML_TYPE_Q4_0_R4; break; case LLAMA_FTYPE_MOSTLY_Q5_0_R4: default_type = GGML_TYPE_Q5_0_R4; break; case LLAMA_FTYPE_MOSTLY_Q6_0_R4: default_type = GGML_TYPE_Q6_0_R4; break; @@ -16548,6 +16558,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ4_NL; else chunk_size_multiplier = 4; } + else if (new_type == GGML_TYPE_IQ4_XS_R4) { + if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ4_XS; + else chunk_size_multiplier = 4; + } else if (new_type == GGML_TYPE_Q4_0_R4) { if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q4_0; else chunk_size_multiplier = 4; |