From 2e6b523853a8659c63283a6deca805051ecd713a Mon Sep 17 00:00:00 2001
From: Kawrakow
Date: Thu, 30 Jan 2025 09:28:53 +0200
Subject: Faster Q4_K_R4 and Q5_K_R4 on AVX2/Zen4 (#182)

* Slightly faster AVX2 implementation for q4_k_r4

* Even better AVX2 implementation for q4_k_r4

  We now arrive at PP-512 = 328 t/s for LLaMA-3.1-8B on a Ryzen-5975WX CPU,
  up from 291 t/s when I last measured on 3c5f8722.
  With FA and Q8_0 K-cache we get to 339.5 t/s.

* Fix llama-bench labels that I broke with #181

* Faster AVX2 implementation for q5_k_r4

  We arrive at 302 t/s for LLaMA-3.1-8B on a Ryzen-5975WX CPU,
  up from 273 t/s.

* Use AVX2 implementation of q4_k_r4 and q5_k_r4 also on Zen4

  After the changes I made to AVX2, it ends up being slightly faster
  compared to what I had for Zen4.

* Minor tweak

* Cleanup

---------

Co-authored-by: Iwan Kawrakow
---
 examples/llama-bench/llama-bench.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index b46bd855..42320da8 100644
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -756,7 +756,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
             continue;
         }
         cmd_params_instance instance = {
-            /* .test_kind   = */ TEST_KIND_PP,
+            /* .test_kind   = */ TEST_KIND_TG,
             /* .model       = */ m,
             /* .n_prompt    = */ 0,
             /* .n_gen       = */ n_gen,
@@ -784,7 +784,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
             continue;
         }
         cmd_params_instance instance = {
-            /* .test_kind   = */ TEST_KIND_PP,
+            /* .test_kind   = */ TEST_KIND_PG,
             /* .model       = */ m,
             /* .n_prompt    = */ n_pg.first,
             /* .n_gen       = */ n_pg.second,
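
Context for the label fix above: both construction sites previously used TEST_KIND_PP, so generation-only and prompt+generation benchmark instances would have been reported under the prompt-processing label; the patch assigns TEST_KIND_TG and TEST_KIND_PG instead. The following is a minimal, self-contained C++ sketch of how such a per-instance test kind could map to the labels llama-bench prints. Only the names TEST_KIND_PP, TEST_KIND_TG, and TEST_KIND_PG come from the diff; the test_kind_label() helper and the label strings are illustrative assumptions, not the fork's actual implementation.

    #include <cstdio>

    // The three benchmark kinds seen in the diff above (names from the patch):
    // PP = prompt processing only, TG = token generation only, PG = prompt + generation.
    enum test_kind { TEST_KIND_PP, TEST_KIND_TG, TEST_KIND_PG };

    // Hypothetical helper: maps a test kind to the label shown in the results table.
    static const char * test_kind_label(test_kind kind) {
        switch (kind) {
            case TEST_KIND_PP: return "pp";
            case TEST_KIND_TG: return "tg";
            case TEST_KIND_PG: return "pp+tg";
        }
        return "?";
    }

    int main() {
        // With the bug, all three instances carried TEST_KIND_PP and would have
        // printed the same "pp" label; after the fix each kind labels itself.
        printf("%s %s %s\n",
               test_kind_label(TEST_KIND_PP),
               test_kind_label(TEST_KIND_TG),
               test_kind_label(TEST_KIND_PG));
        return 0;
    }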