Option to enable/disable the IQK CPU FA kernels (#429)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
author: Kawrakow <iwankawrakow@gmail.com> 2025-05-17 11:21:58 +0300
committer: GitHub <noreply@github.com> 2025-05-17 11:21:58 +0300
commit: b3036a872f474beadf2df72d452ca7016db72aac (patch)
tree: a5d99baa610d8efa9489ed47ca263cb4802d6143
parent: c35a383bcd8e4bd334ba2b8d2eb96103e69f75d4 (diff)
4 files changed, 13 insertions, 4 deletions
diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
index 70e3bbf3..314a38fb 100644
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -131,6 +131,7 @@ option(GGML_CUDA_NO_VMM                     "ggml: do not try to use CUDA VMM"
 option(GGML_CUDA_FA_ALL_QUANTS              "ggml: compile all quants for FlashAttention"     OFF)
 option(GGML_CUDA_USE_GRAPHS                 "ggml: use CUDA graphs (llama.cpp only)"          OFF)
 
+option(GGML_IQK_FLASH_ATTENTION             "ggml: enable the IQK FlashAttention CPU kernels" ON)
 option(GGML_IQK_FA_ALL_QUANTS               "ggml: compile all quants for IQK FlashAttention" OFF)
 
 option(GGML_CURL                            "ggml: use libcurl to download model from an URL" OFF)
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 4f4337c2..14650d03 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -260,9 +260,15 @@ if (GGML_IQK_MUL_MAT)
     add_compile_definitions(GGML_USE_IQK_MULMAT)
     set(GGML_SOURCES_IQK_MM iqk/iqk_mul_mat.cpp iqk/iqk_flash_attn.cpp)
     set(GGML_HEADERS_IQK_MM iqk/iqk_mul_mat.h iqk/iqk_flash_impl.h)
-    if (GGML_IQK_FA_ALL_QUANTS)
-        message(STATUS "Including all IQK FA kernels")
-        add_compile_definitions(GGML_IQK_FA_ALL_QUANTS)
+    if (GGML_IQK_FLASH_ATTENTION)
+        message(STATUS "Enabling IQK Flash Attention kernels")
+        add_compile_definitions(GGML_IQK_FLASH_ATTENTION)
+        if (GGML_IQK_FA_ALL_QUANTS)
+            message(STATUS "Including all IQK FA kernels")
+            add_compile_definitions(GGML_IQK_FA_ALL_QUANTS)
+        endif()
+    else()
+        message(STATUS "Disabling IQK Flash Attention kernels")
     endif()
 endif()
 
diff --git a/ggml/src/iqk/iqk_flash_attn.cpp b/ggml/src/iqk/iqk_flash_attn.cpp
index 610f18b7..9a974ae7 100644
--- a/ggml/src/iqk/iqk_flash_attn.cpp
+++ b/ggml/src/iqk/iqk_flash_attn.cpp
@@ -8,7 +8,7 @@
 #include "iqk_mul_mat.h"
 #include "iqk_flash_impl.h"
 
-#ifdef IQK_IMPLEMENT
+#if defined IQK_IMPLEMENT && defined GGML_IQK_FLASH_ATTENTION
 
 #include <algorithm>
 #include <cstdio>
diff --git a/ggml/src/iqk/iqk_mul_mat.cpp b/ggml/src/iqk/iqk_mul_mat.cpp
index 654cc706..311554f4 100644
--- a/ggml/src/iqk/iqk_mul_mat.cpp
+++ b/ggml/src/iqk/iqk_mul_mat.cpp
@@ -15875,6 +15875,7 @@ void MulMat::relu(int n, const float * x, float * y) {
 #endif
 } // namespace
 
+#ifdef GGML_IQK_FLASH_ATTENTION
 namespace {
 
 template <int k_step>
@@ -18663,6 +18664,7 @@ bool iqk_flash_attn_impl(int int_type_k,         // type of k
 
     return true;
 }
+#endif
 
 #else  // IQK_IMPLEMENT
author	Kawrakow <iwankawrakow@gmail.com>	2025-05-17 11:21:58 +0300
committer	GitHub <noreply@github.com>	2025-05-17 11:21:58 +0300
commit	b3036a872f474beadf2df72d452ca7016db72aac (patch)
tree	a5d99baa610d8efa9489ed47ca263cb4802d6143
parent	c35a383bcd8e4bd334ba2b8d2eb96103e69f75d4 (diff)