diff options
-rw-r--r--  CMakeLists.txt |  9
-rw-r--r--  Makefile       |  7
-rw-r--r--  ggml-quants.c  | 13
-rw-r--r--  ggml.c         |  7
-rw-r--r--  sgemm.cpp      | 21
5 files changed, 34 insertions(+), 23 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt index f2298a62..fdb4e216 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -99,6 +99,8 @@ option(LLAMA_BLAS "llama: use BLAS" set(LLAMA_BLAS_VENDOR ${LLAMA_BLAS_VENDOR_DEFAULT} CACHE STRING "llama: BLAS library vendor") option(LLAMA_LLAMAFILE "llama: use llamafile SGEMM" ${LLAMA_LLAMAFILE_DEFAULT}) +option(LLAMA_IQK_MULMAT "llama: use optimized iqk matrix multiplications" ON) +set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor") option(LLAMA_CUDA "llama: use CUDA" OFF) option(LLAMA_CUBLAS "llama: use CUDA (deprecated, use LLAMA_CUDA)" OFF) option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF) @@ -399,8 +401,11 @@ if (LLAMA_BLAS) endif() endif() -set(GGML_SOURCES_IQK_MM iqk_mul_mat.cpp) -set(GGML_HEADERS_IQK_MM iqk_mul_mat.h) +if (LLAMA_IQK_MULMAT) + add_compile_definitions(GGML_USE_IQK_MULMAT) + set(GGML_SOURCES_IQK_MM iqk_mul_mat.cpp) + set(GGML_HEADERS_IQK_MM iqk_mul_mat.h) +endif() if (LLAMA_LLAMAFILE) add_compile_definitions(GGML_USE_LLAMAFILE) @@ -475,7 +475,10 @@ ifdef LLAMA_BLIS OBJS += ggml-blas.o endif # LLAMA_BLIS -OBJS += iqk_mul_mat.o +ifndef LLAMA_NO_IQK_MULMAT + MK_CPPFLAGS += -DGGML_USE_IQK_MULMAT + OBJS += iqk_mul_mat.o +endif ifndef LLAMA_NO_LLAMAFILE MK_CPPFLAGS += -DGGML_USE_LLAMAFILE @@ -704,8 +707,10 @@ OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o +ifndef LLAMA_NO_IQK_MULMAT iqk_mul_mat.o: iqk_mul_mat.cpp ggml-impl.h ggml.h ggml-quants.h ggml-common.h iqk_mul_mat.h $(CXX) $(CXXFLAGS) -c $< -o $@ +endif ifndef LLAMA_NO_LLAMAFILE sgemm.o: sgemm.cpp sgemm.h ggml.h diff --git a/ggml-quants.c b/ggml-quants.c index 061edddc..684fdf7d 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -872,7 +872,11 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, 
int64_t k) #if defined(__ARM_NEON) block_q8_0_x4 * y4 = (block_q8_0_x4 *)vy; - int nb4 = 4*(nb/4); +#if GGML_USE_IQK_MULMAT + const int nb4 = 4*(nb/4); +#else + const int nb4 = -1; +#endif for (int i = 0; i < nb; i++) { int i4 = i/4, ir = i%4; float32x4_t srcv [8]; @@ -1220,9 +1224,13 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k) block_q8_1 * restrict y = vy; +#if GGML_USE_IQK_MULMAT + const int nb4 = 4*(nb/4); +#else + const int nb4 = -1; +#endif #if defined(__ARM_NEON) block_q8_1_x4 * restrict y4 = vy; - int nb4 = 4*(nb/4); for (int i = 0; i < nb; i++) { int i4 = i/4, ir = i%4; float32x4_t srcv [8]; @@ -1319,7 +1327,6 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k) } #elif defined(__AVX2__) || defined(__AVX__) block_q8_1_x4 * restrict y4 = vy; - int nb4 = 4*(nb/4); #ifdef __AVX2__ const bool pack = true; #else @@ -4,7 +4,9 @@ #include "ggml-impl.h" #include "ggml-quants.h" #include "ggml.h" +#if GGML_USE_IQK_MULMAT #include "iqk_mul_mat.h" +#endif #if defined(_MSC_VER) || defined(__MINGW32__) #include <malloc.h> // using malloc.h with MSC/MINGW @@ -12371,6 +12373,7 @@ UseGgmlGemm1:; const void * wdata = (src1->type == vec_dot_type) ? 
src1->data : params->wdata; +#if GGML_USE_IQK_MULMAT if ((vec_dot_type == GGML_TYPE_Q8_K || vec_dot_type == GGML_TYPE_Q8_0 || vec_dot_type == GGML_TYPE_Q8_1) && dst->type == GGML_TYPE_F32) { for (int64_t i13 = 0; i13 < ne13; i13++) @@ -12384,6 +12387,7 @@ UseGgmlGemm1:; return; } IQK_MulMat_Not_Available:; +#endif #if GGML_USE_LLAMAFILE @@ -12607,6 +12611,7 @@ static void ggml_compute_forward_mul_mat_id( const int64_t nr0 = ne01; // src0 rows const int64_t nr1 = cne1; // src1 rows // +#if GGML_USE_IQK_MULMAT if (ne13 == 1 && dst->type == GGML_TYPE_F32 && (vec_dot_type == GGML_TYPE_Q8_K || vec_dot_type == GGML_TYPE_Q8_0 || vec_dot_type == GGML_TYPE_Q8_1)) { if (!iqk_mul_mat_moe(nr0, nr1, ne00, ne11, src0->type, @@ -12618,6 +12623,8 @@ static void ggml_compute_forward_mul_mat_id( continue; } IQK_MulMat_Not_Available:; +#endif + // distribute the thread work across the inner or outer loop based on which one is larger const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows @@ -51,7 +51,9 @@ #include "sgemm.h" #include "ggml-impl.h" #include "ggml-quants.h" +#if GGML_USE_IQK_MULMAT #include "iqk_mul_mat.h" +#endif #ifdef _MSC_VER #define NOINLINE __declspec(noinline) @@ -866,8 +868,8 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda if (Ctype != GGML_TYPE_F32) return false; +#if GGML_USE_IQK_MULMAT #if defined __AVX2__ && defined __FMA__ - //bool is_accepted_float_type = k >= 32 && Atype == GGML_TYPE_F16 && Btype == GGML_TYPE_F32; bool is_accepted_float_type = k >= 32 && ((Atype == GGML_TYPE_F16 && Btype == GGML_TYPE_F32) || (Atype == GGML_TYPE_F32 && Btype == GGML_TYPE_F16)); #elif defined __ARM_FEATURE_FP16_VECTOR_ARITHMETIC && defined __ARM_FEATURE_FMA @@ -884,22 +886,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda return true; } } - -// if (task == GGML_TASK_TYPE_COMPUTE && k >= 32 && Atype == GGML_TYPE_F16) { -//#if defined __AVX2__ && defined __FMA__ -// if (Btype == 
GGML_TYPE_F32) { -// if (iqk_mul_mat(m, n, k, Atype, A, B, (float *)C, ldc, ith, nth)) { -// return true; -// } -// } -//#elif defined __ARM_FEATURE_FP16_VECTOR_ARITHMETIC && defined __ARM_FEATURE_FMA -// if (Btype == GGML_TYPE_F16) { -// if (iqk_mul_mat(m, n, k, Atype, A, B, (float *)C, ldc, ith, nth)) { -// return true; -// } -// } -//#endif -// } +#endif switch (Atype) { |