summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorKawrakow <iwankawrakow@gmail.com>2024-12-08 09:13:10 +0100
committerGitHub <noreply@github.com>2024-12-08 09:13:10 +0100
commitef95b81733599429fdd63e4c2fb32c74645046be (patch)
tree7b01c0969ccb342edb155bce41a47edb343d8ea2 /src
parent3682e4700db6b8cb2ca8e3da365578078f21ab0c (diff)
R4 improvements on ARM_NEON (#125)
* q4_0_r4: 6% faster PP on NEON

* qx_0_r4_q8_0 template

Applied to q4_0_r4 and q5_0_r4. It makes q5_0_r4 PP ~7% faster.

* Apply qx_0_r4_q8_0 template also to q6_0_r4 and iq4_nl_x4

* Simplify

* Minor iq4_xs_r4 improvement on NEON

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Diffstat (limited to 'src')
-rw-r--r--src/llama.cpp4
1 files changed, 4 insertions, 0 deletions
diff --git a/src/llama.cpp b/src/llama.cpp
index ad76a7b8..0e1aadbd 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -16569,6 +16569,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q4_0;
else chunk_size_multiplier = 4;
}
+ else if (new_type == GGML_TYPE_Q5_0_R4) {
+ if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q5_0;
+ else chunk_size_multiplier = 4;
+ }
else if (new_type == GGML_TYPE_Q6_0_R4) {
if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q6_0;
else chunk_size_multiplier = 4;