summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author    Iwan Kawrakow <iwan.kawrakow@gmail.com>  2024-06-10 18:30:33 +0300
committer Iwan Kawrakow <iwan.kawrakow@gmail.com>  2024-06-22 12:02:50 +0300
commit    9593e163db41f60b8d6598a443fec1740e97eb67 (patch)
tree      dd55925e2b5fc0960d77c5f8cc2c4d402118d9d0
parent    81cf6990f512e82c2c89ba7f89a15c3d98172f84 (diff)
iqk_mul_mat: add ability to disable it
-rw-r--r--  CMakeLists.txt  |  9
-rw-r--r--  Makefile        |  7
-rw-r--r--  ggml-quants.c   | 13
-rw-r--r--  ggml.c          |  7
-rw-r--r--  sgemm.cpp       | 21
5 files changed, 34 insertions(+), 23 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index f2298a62..fdb4e216 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -99,6 +99,8 @@ option(LLAMA_BLAS "llama: use BLAS"
set(LLAMA_BLAS_VENDOR ${LLAMA_BLAS_VENDOR_DEFAULT} CACHE STRING
"llama: BLAS library vendor")
option(LLAMA_LLAMAFILE "llama: use llamafile SGEMM" ${LLAMA_LLAMAFILE_DEFAULT})
+option(LLAMA_IQK_MULMAT "llama: use optimized iqk matrix multiplications" ON)
+set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
option(LLAMA_CUDA "llama: use CUDA" OFF)
option(LLAMA_CUBLAS "llama: use CUDA (deprecated, use LLAMA_CUDA)" OFF)
option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
@@ -399,8 +401,11 @@ if (LLAMA_BLAS)
endif()
endif()
-set(GGML_SOURCES_IQK_MM iqk_mul_mat.cpp)
-set(GGML_HEADERS_IQK_MM iqk_mul_mat.h)
+if (LLAMA_IQK_MULMAT)
+ add_compile_definitions(GGML_USE_IQK_MULMAT)
+ set(GGML_SOURCES_IQK_MM iqk_mul_mat.cpp)
+ set(GGML_HEADERS_IQK_MM iqk_mul_mat.h)
+endif()
if (LLAMA_LLAMAFILE)
add_compile_definitions(GGML_USE_LLAMAFILE)
diff --git a/Makefile b/Makefile
index af9f72cb..720f8661 100644
--- a/Makefile
+++ b/Makefile
@@ -475,7 +475,10 @@ ifdef LLAMA_BLIS
OBJS += ggml-blas.o
endif # LLAMA_BLIS
-OBJS += iqk_mul_mat.o
+ifndef LLAMA_NO_IQK_MULMAT
+ MK_CPPFLAGS += -DGGML_USE_IQK_MULMAT
+ OBJS += iqk_mul_mat.o
+endif
ifndef LLAMA_NO_LLAMAFILE
MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
@@ -704,8 +707,10 @@ OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
+ifndef LLAMA_NO_IQK_MULMAT
iqk_mul_mat.o: iqk_mul_mat.cpp ggml-impl.h ggml.h ggml-quants.h ggml-common.h iqk_mul_mat.h
$(CXX) $(CXXFLAGS) -c $< -o $@
+endif
ifndef LLAMA_NO_LLAMAFILE
sgemm.o: sgemm.cpp sgemm.h ggml.h
diff --git a/ggml-quants.c b/ggml-quants.c
index 061edddc..684fdf7d 100644
--- a/ggml-quants.c
+++ b/ggml-quants.c
@@ -872,7 +872,11 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
#if defined(__ARM_NEON)
block_q8_0_x4 * y4 = (block_q8_0_x4 *)vy;
- int nb4 = 4*(nb/4);
+#if GGML_USE_IQK_MULMAT
+ const int nb4 = 4*(nb/4);
+#else
+ const int nb4 = -1;
+#endif
for (int i = 0; i < nb; i++) {
int i4 = i/4, ir = i%4;
float32x4_t srcv [8];
@@ -1220,9 +1224,13 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
block_q8_1 * restrict y = vy;
+#if GGML_USE_IQK_MULMAT
+ const int nb4 = 4*(nb/4);
+#else
+ const int nb4 = -1;
+#endif
#if defined(__ARM_NEON)
block_q8_1_x4 * restrict y4 = vy;
- int nb4 = 4*(nb/4);
for (int i = 0; i < nb; i++) {
int i4 = i/4, ir = i%4;
float32x4_t srcv [8];
@@ -1319,7 +1327,6 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
}
#elif defined(__AVX2__) || defined(__AVX__)
block_q8_1_x4 * restrict y4 = vy;
- int nb4 = 4*(nb/4);
#ifdef __AVX2__
const bool pack = true;
#else
diff --git a/ggml.c b/ggml.c
index ccab8a57..bcf16222 100644
--- a/ggml.c
+++ b/ggml.c
@@ -4,7 +4,9 @@
#include "ggml-impl.h"
#include "ggml-quants.h"
#include "ggml.h"
+#if GGML_USE_IQK_MULMAT
#include "iqk_mul_mat.h"
+#endif
#if defined(_MSC_VER) || defined(__MINGW32__)
#include <malloc.h> // using malloc.h with MSC/MINGW
@@ -12371,6 +12373,7 @@ UseGgmlGemm1:;
const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
+#if GGML_USE_IQK_MULMAT
if ((vec_dot_type == GGML_TYPE_Q8_K || vec_dot_type == GGML_TYPE_Q8_0 ||
vec_dot_type == GGML_TYPE_Q8_1) && dst->type == GGML_TYPE_F32) {
for (int64_t i13 = 0; i13 < ne13; i13++)
@@ -12384,6 +12387,7 @@ UseGgmlGemm1:;
return;
}
IQK_MulMat_Not_Available:;
+#endif
#if GGML_USE_LLAMAFILE
@@ -12607,6 +12611,7 @@ static void ggml_compute_forward_mul_mat_id(
const int64_t nr0 = ne01; // src0 rows
const int64_t nr1 = cne1; // src1 rows
//
+#if GGML_USE_IQK_MULMAT
if (ne13 == 1 && dst->type == GGML_TYPE_F32 &&
(vec_dot_type == GGML_TYPE_Q8_K || vec_dot_type == GGML_TYPE_Q8_0 || vec_dot_type == GGML_TYPE_Q8_1)) {
if (!iqk_mul_mat_moe(nr0, nr1, ne00, ne11, src0->type,
@@ -12618,6 +12623,8 @@ static void ggml_compute_forward_mul_mat_id(
continue;
}
IQK_MulMat_Not_Available:;
+#endif
+
// distribute the thread work across the inner or outer loop based on which one is larger
const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
diff --git a/sgemm.cpp b/sgemm.cpp
index b6c00c4e..409a9a67 100644
--- a/sgemm.cpp
+++ b/sgemm.cpp
@@ -51,7 +51,9 @@
#include "sgemm.h"
#include "ggml-impl.h"
#include "ggml-quants.h"
+#if GGML_USE_IQK_MULMAT
#include "iqk_mul_mat.h"
+#endif
#ifdef _MSC_VER
#define NOINLINE __declspec(noinline)
@@ -866,8 +868,8 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
if (Ctype != GGML_TYPE_F32)
return false;
+#if GGML_USE_IQK_MULMAT
#if defined __AVX2__ && defined __FMA__
- //bool is_accepted_float_type = k >= 32 && Atype == GGML_TYPE_F16 && Btype == GGML_TYPE_F32;
bool is_accepted_float_type = k >= 32 &&
((Atype == GGML_TYPE_F16 && Btype == GGML_TYPE_F32) || (Atype == GGML_TYPE_F32 && Btype == GGML_TYPE_F16));
#elif defined __ARM_FEATURE_FP16_VECTOR_ARITHMETIC && defined __ARM_FEATURE_FMA
@@ -884,22 +886,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
return true;
}
}
-
-// if (task == GGML_TASK_TYPE_COMPUTE && k >= 32 && Atype == GGML_TYPE_F16) {
-//#if defined __AVX2__ && defined __FMA__
-// if (Btype == GGML_TYPE_F32) {
-// if (iqk_mul_mat(m, n, k, Atype, A, B, (float *)C, ldc, ith, nth)) {
-// return true;
-// }
-// }
-//#elif defined __ARM_FEATURE_FP16_VECTOR_ARITHMETIC && defined __ARM_FEATURE_FMA
-// if (Btype == GGML_TYPE_F16) {
-// if (iqk_mul_mat(m, n, k, Atype, A, B, (float *)C, ldc, ith, nth)) {
-// return true;
-// }
-// }
-//#endif
-// }
+#endif
switch (Atype) {