From ef95b81733599429fdd63e4c2fb32c74645046be Mon Sep 17 00:00:00 2001
From: Kawrakow
Date: Sun, 8 Dec 2024 09:13:10 +0100
Subject: R4 improvements on ARM_NEON (#125)

* q4_0_r4: 6% faster PP on NEON

* qx_0_r4_q8_0 template

Applied to q4_0_r4 and q5_0_r4. It makes q5_0_r4 PP ~7% faster.

* Apply qx_0_r4_q8_0 template also to q6_0_r4 and iq4_nl_x4

* Simplify

* Minor iq4_xs_r4 improvement on NEON

---------

Co-authored-by: Iwan Kawrakow
---
 src/llama.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/llama.cpp b/src/llama.cpp
index ad76a7b8..0e1aadbd 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -16569,6 +16569,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q4_0;
         else chunk_size_multiplier = 4;
     }
+    else if (new_type == GGML_TYPE_Q5_0_R4) {
+        if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q5_0;
+        else chunk_size_multiplier = 4;
+    }
     else if (new_type == GGML_TYPE_Q6_0_R4) {
         if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q6_0;
         else chunk_size_multiplier = 4;
-- 
cgit v1.2.3
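
For context, the hunk extends an existing pattern rather than introducing a new one: row-interleaved ("_R4") quant types pack 4 tensor rows together, so they are only usable when the row count tensor->ne[1] is a multiple of 4. Otherwise quantization falls back to the plain variant; when interleaving is possible, the chunk size is scaled by 4. The sketch below restates that selection rule in isolation. It is a minimal illustration, not part of the patch: pick_r4_type and the trimmed-down qtype enum are hypothetical stand-ins for the inline code in llama_model_quantize_internal and ggml's GGML_TYPE_* constants.

    // Minimal sketch of the _R4 fallback rule shared by all branches in the
    // hunk above. Names here are hypothetical; the real logic is written
    // inline in llama_model_quantize_internal.
    #include <cstdint>

    enum class qtype { Q4_0, Q4_0_R4, Q5_0, Q5_0_R4, Q6_0, Q6_0_R4 };

    static qtype pick_r4_type(qtype t, int64_t nrows, int & chunk_size_multiplier) {
        chunk_size_multiplier = 1;
        qtype fallback;
        switch (t) {
            case qtype::Q4_0_R4: fallback = qtype::Q4_0; break;
            case qtype::Q5_0_R4: fallback = qtype::Q5_0; break;
            case qtype::Q6_0_R4: fallback = qtype::Q6_0; break;
            default: return t;                 // not a row-interleaved type
        }
        if (nrows % 4 != 0) return fallback;   // rows cannot be interleaved in groups of 4
        chunk_size_multiplier = 4;             // 4 interleaved rows per quantization chunk
        return t;
    }

Before this patch, Q5_0_R4 was the one row-interleaved type missing from this chain, so a tensor with ne[1] % 4 != 0 had no Q5_0 fallback; the four added lines close that gap.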