summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author    Iwan Kawrakow <iwan.kawrakow@gmail.com>  2024-06-10 18:30:33 +0300
committer Iwan Kawrakow <iwan.kawrakow@gmail.com>  2024-06-22 12:02:50 +0300
commit    9593e163db41f60b8d6598a443fec1740e97eb67 (patch)
tree      dd55925e2b5fc0960d77c5f8cc2c4d402118d9d0
parent    81cf6990f512e82c2c89ba7f89a15c3d98172f84 (diff)
iqk_mul_mat: add ability to disable it
-rw-r--r--  CMakeLists.txt  |  9
-rw-r--r--  Makefile        |  7
-rw-r--r--  ggml-quants.c   | 13
-rw-r--r--  ggml.c          |  7
-rw-r--r--  sgemm.cpp       | 21
5 files changed, 34 insertions(+), 23 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index f2298a62..fdb4e216 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -99,6 +99,8 @@ option(LLAMA_BLAS "llama: use BLAS"
set(LLAMA_BLAS_VENDOR ${LLAMA_BLAS_VENDOR_DEFAULT} CACHE STRING
"llama: BLAS library vendor")
option(LLAMA_LLAMAFILE "llama: use llamafile SGEMM" ${LLAMA_LLAMAFILE_DEFAULT})
+option(LLAMA_IQK_MULMAT "llama: use optimized iqk matrix multiplications" ON)
+set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
option(LLAMA_CUDA "llama: use CUDA" OFF)
option(LLAMA_CUBLAS "llama: use CUDA (deprecated, use LLAMA_CUDA)" OFF)
option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
@@ -399,8 +401,11 @@ if (LLAMA_BLAS)
endif()
endif()
-set(GGML_SOURCES_IQK_MM iqk_mul_mat.cpp)
-set(GGML_HEADERS_IQK_MM iqk_mul_mat.h)
+if (LLAMA_IQK_MULMAT)
+ add_compile_definitions(GGML_USE_IQK_MULMAT)
+ set(GGML_SOURCES_IQK_MM iqk_mul_mat.cpp)
+ set(GGML_HEADERS_IQK_MM iqk_mul_mat.h)
+endif()
if (LLAMA_LLAMAFILE)
add_compile_definitions(GGML_USE_LLAMAFILE)
diff --git a/Makefile b/Makefile
index af9f72cb..720f8661 100644
--- a/Makefile
+++ b/Makefile
@@ -475,7 +475,10 @@ ifdef LLAMA_BLIS
OBJS += ggml-blas.o
endif # LLAMA_BLIS
-OBJS += iqk_mul_mat.o
+ifndef LLAMA_NO_IQK_MULMAT
+ MK_CPPFLAGS += -DGGML_USE_IQK_MULMAT
+ OBJS += iqk_mul_mat.o
+endif
ifndef LLAMA_NO_LLAMAFILE
MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
@@ -704,8 +707,10 @@ OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
+ifndef LLAMA_NO_IQK_MULMAT
iqk_mul_mat.o: iqk_mul_mat.cpp ggml-impl.h ggml.h ggml-quants.h ggml-common.h iqk_mul_mat.h
$(CXX) $(CXXFLAGS) -c $< -o $@
+endif
ifndef LLAMA_NO_LLAMAFILE
sgemm.o: sgemm.cpp sgemm.h ggml.h
diff --git a/ggml-quants.c b/ggml-quants.c
index 061edddc..684fdf7d 100644
--- a/ggml-quants.c
+++ b/ggml-quants.c
@@ -872,7 +872,11 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
#if defined(__ARM_NEON)
block_q8_0_x4 * y4 = (block_q8_0_x4 *)vy;
- int nb4 = 4*(nb/4);
+#if GGML_USE_IQK_MULMAT
+ const int nb4 = 4*(nb/4);
+#else
+ const int nb4 = -1;
+#endif
for (int i = 0; i < nb; i++) {
int i4 = i/4, ir = i%4;
float32x4_t srcv [8];
@@ -1220,9 +1224,13 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
block_q8_1 * restrict y = vy;
+#if GGML_USE_IQK_MULMAT
+ const int nb4 = 4*(nb/4);
+#else
+ const int nb4 = -1;
+#endif
#if defined(__ARM_NEON)
block_q8_1_x4 * restrict y4 = vy;
- int nb4 = 4*(nb/4);
for (int i = 0; i < nb; i++) {
int i4 = i/4, ir = i%4;
float32x4_t srcv [8];
@@ -1319,7 +1327,6 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
}
#elif defined(__AVX2__) || defined(__AVX__)
block_q8_1_x4 * restrict y4 = vy;
- int nb4 = 4*(nb/4);
#ifdef __AVX2__
const bool pack = true;
#else
diff --git a/ggml.c b/ggml.c
index ccab8a57..bcf16222 100644
--- a/ggml.c
+++ b/ggml.c
@@ -4,7 +4,9 @@
#include "ggml-impl.h"
#include "ggml-quants.h"
#include "ggml.h"
+#if GGML_USE_IQK_MULMAT
#include "iqk_mul_mat.h"
+#endif
#if defined(_MSC_VER) || defined(__MINGW32__)
#include <malloc.h> // using malloc.h with MSC/MINGW
@@ -12371,6 +12373,7 @@ UseGgmlGemm1:;
const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
+#if GGML_USE_IQK_MULMAT
if ((vec_dot_type == GGML_TYPE_Q8_K || vec_dot_type == GGML_TYPE_Q8_0 ||
vec_dot_type == GGML_TYPE_Q8_1) && dst->type == GGML_TYPE_F32) {
for (int64_t i13 = 0; i13 < ne13; i13++)
@@ -12384,6 +12387,7 @@ UseGgmlGemm1:;
return;
}
IQK_MulMat_Not_Available:;
+#endif
#if GGML_USE_LLAMAFILE
@@ -12607,6 +12611,7 @@ static void ggml_compute_forward_mul_mat_id(
const int64_t nr0 = ne01; // src0 rows
const int64_t nr1 = cne1; // src1 rows
//
+#if GGML_USE_IQK_MULMAT
if (ne13 == 1 && dst->type == GGML_TYPE_F32 &&
(vec_dot_type == GGML_TYPE_Q8_K || vec_dot_type == GGML_TYPE_Q8_0 || vec_dot_type == GGML_TYPE_Q8_1)) {
if (!iqk_mul_mat_moe(nr0, nr1, ne00, ne11, src0->type,
@@ -12618,6 +12623,8 @@ static void ggml_compute_forward_mul_mat_id(
continue;
}
IQK_MulMat_Not_Available:;
+#endif
+
// distribute the thread work across the inner or outer loop based on which one is larger
const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
diff --git a/sgemm.cpp b/sgemm.cpp
index b6c00c4e..409a9a67 100644
--- a/sgemm.cpp
+++ b/sgemm.cpp
@@ -51,7 +51,9 @@
#include "sgemm.h"
#include "ggml-impl.h"
#include "ggml-quants.h"
+#if GGML_USE_IQK_MULMAT
#include "iqk_mul_mat.h"
+#endif
#ifdef _MSC_VER
#define NOINLINE __declspec(noinline)
@@ -866,8 +868,8 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
if (Ctype != GGML_TYPE_F32)
return false;
+#if GGML_USE_IQK_MULMAT
#if defined __AVX2__ && defined __FMA__
- //bool is_accepted_float_type = k >= 32 && Atype == GGML_TYPE_F16 && Btype == GGML_TYPE_F32;
bool is_accepted_float_type = k >= 32 &&
((Atype == GGML_TYPE_F16 && Btype == GGML_TYPE_F32) || (Atype == GGML_TYPE_F32 && Btype == GGML_TYPE_F16));
#elif defined __ARM_FEATURE_FP16_VECTOR_ARITHMETIC && defined __ARM_FEATURE_FMA
@@ -884,22 +886,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
return true;
}
}
-
-// if (task == GGML_TASK_TYPE_COMPUTE && k >= 32 && Atype == GGML_TYPE_F16) {
-//#if defined __AVX2__ && defined __FMA__
-// if (Btype == GGML_TYPE_F32) {
-// if (iqk_mul_mat(m, n, k, Atype, A, B, (float *)C, ldc, ith, nth)) {
-// return true;
-// }
-// }
-//#elif defined __ARM_FEATURE_FP16_VECTOR_ARITHMETIC && defined __ARM_FEATURE_FMA
-// if (Btype == GGML_TYPE_F16) {
-// if (iqk_mul_mat(m, n, k, Atype, A, B, (float *)C, ldc, ith, nth)) {
-// return true;
-// }
-// }
-//#endif
-// }
+#endif
switch (Atype) {