From 2e6b523853a8659c63283a6deca805051ecd713a Mon Sep 17 00:00:00 2001
From: Kawrakow
Date: Thu, 30 Jan 2025 09:28:53 +0200
Subject: Faster Q4_K_R4 and Q5_K_R4 on AVX2/Zen4 (#182)

* Slightly faster AVX2 implementation for q4_k_r4

* Even better AVX2 implementation for q4_k_r4

  We now arrive at PP-512 = 328 t/s for LLaMA-3.1-8B on a Ryzen-5975WX CPU,
  up from 291 t/s when I last measured on 3c5f8722.
  With FA and Q8_0 K-cache we get to 339.5 t/s.

* Fix llama-bench labels that I broke with #181

* Faster AVX2 implementation for q5_k_r4

  We arrive at 302 t/s for LLaMA-3.1-8B on a Ryzen-5975WX CPU,
  up from 273 t/s.

* Use AVX2 implementation of q4_k_r4 and q5_k_r4 also on Zen4

  After the changes I made to AVX2, it ends up being slightly faster
  compared to what I had for Zen4.

* Minor tweak

* Cleanup

---------

Co-authored-by: Iwan Kawrakow
---
 examples/llama-bench/llama-bench.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index b46bd855..42320da8 100644
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -756,7 +756,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
             continue;
         }
         cmd_params_instance instance = {
-            /* .test_kind   = */ TEST_KIND_PP,
+            /* .test_kind   = */ TEST_KIND_TG,
             /* .model       = */ m,
             /* .n_prompt    = */ 0,
             /* .n_gen       = */ n_gen,
@@ -784,7 +784,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
             continue;
         }
         cmd_params_instance instance = {
-            /* .test_kind   = */ TEST_KIND_PP,
+            /* .test_kind   = */ TEST_KIND_PG,
             /* .model       = */ m,
             /* .n_prompt    = */ n_pg.first,
             /* .n_gen       = */ n_pg.second,
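
Context for the label fix above: both construction sites previously used TEST_KIND_PP, so generation-only and prompt+generation benchmark instances would have been reported under the prompt-processing label; the patch assigns TEST_KIND_TG and TEST_KIND_PG instead. The following is a minimal, self-contained C++ sketch of how such a per-instance test kind could map to the labels llama-bench prints. Only the names TEST_KIND_PP, TEST_KIND_TG, and TEST_KIND_PG come from the diff; the test_kind_label() helper and the label strings are illustrative assumptions, not the fork's actual implementation.

    #include <cstdio>

    // The three benchmark kinds seen in the diff above (names from the patch):
    // PP = prompt processing only, TG = token generation only, PG = prompt + generation.
    enum test_kind { TEST_KIND_PP, TEST_KIND_TG, TEST_KIND_PG };

    // Hypothetical helper: maps a test kind to the label shown in the results table.
    static const char * test_kind_label(test_kind kind) {
        switch (kind) {
            case TEST_KIND_PP: return "pp";
            case TEST_KIND_TG: return "tg";
            case TEST_KIND_PG: return "pp+tg";
        }
        return "?";
    }

    int main() {
        // With the bug, all three instances carried TEST_KIND_PP and would have
        // printed the same "pp" label; after the fix each kind labels itself.
        printf("%s %s %s\n",
               test_kind_label(TEST_KIND_PP),
               test_kind_label(TEST_KIND_TG),
               test_kind_label(TEST_KIND_PG));
        return 0;
    }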