diff options
author | Iwan Kawrakow <iwan.kawrakow@gmail.com> | 2024-06-07 09:43:33 +0300 |
---|---|---|
committer | Iwan Kawrakow <iwan.kawrakow@gmail.com> | 2024-06-22 12:02:50 +0300 |
commit | 667bd4759c8680f528bf31dcc41bdf22a6533f22 (patch) | |
tree | c1d21870068c6120cf68d0f644a1065f2d4ca89b | |
parent | 2ee56b4f0d079b4a1bd58347b13cb85ac5bd1445 (diff) |
iqk_mul_mat: make it independent of sgemm
-rw-r--r-- | ggml.c | 51 | ||||
-rw-r--r-- | iqk_mul_mat.cpp | 2 | ||||
-rw-r--r-- | iqk_mul_mat.h | 17 | ||||
-rw-r--r-- | sgemm.cpp | 16 |
4 files changed, 48 insertions(+), 38 deletions(-)
@@ -4,7 +4,7 @@ #include "ggml-impl.h" #include "ggml-quants.h" #include "ggml.h" - +#include "iqk_mul_mat.h" #if defined(_MSC_VER) || defined(__MINGW32__) #include <malloc.h> // using malloc.h with MSC/MINGW @@ -12362,25 +12362,6 @@ UseGgmlGemm1:; atomic_store(&state->shared->current_chunk, nth); } - //// Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start. - //atomic_store(&state->shared->current_chunk, nth); - //if (src1->type != vec_dot_type) { - // char * wdata = params->wdata; - // const size_t row_size = ggml_row_size(vec_dot_type, ne10); - - // assert(params->wsize >= ne11*ne12*ne13*row_size); - // GGML_ASSERT(src1->type == GGML_TYPE_F32); - - // for (int64_t i13 = 0; i13 < ne13; ++i13) { - // for (int64_t i12 = 0; i12 < ne12; ++i12) { - // for (int64_t i11 = 0; i11 < ne11; ++i11) { - // from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10); - // wdata += row_size; - // } - // } - // } - //} - return; } @@ -12388,6 +12369,23 @@ UseGgmlGemm1:; return; } + const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; + + if ((vec_dot_type == GGML_TYPE_Q8_K || vec_dot_type == GGML_TYPE_Q8_0 || + vec_dot_type == GGML_TYPE_Q8_1) && dst->type == GGML_TYPE_F32) { + for (int64_t i13 = 0; i13 < ne13; i13++) + for (int64_t i12 = 0; i12 < ne12; i12++) + if (!iqk_mul_mat(ne01, ne11, ne00, src0->type, + (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03, + (const char *)wdata + ggml_row_size(vec_dot_type, ne10)*(i13*ne12 + i12), + (float *)((char *)dst->data + i12*nb2 + i13*nb3), + nb1/ggml_type_size(dst->type), + ith, nth)) goto IQK_MulMat_Not_Available; + return; + } +IQK_MulMat_Not_Available:; + + #if GGML_USE_LLAMAFILE if (src1->type != vec_dot_type) { const void* wdata = (src1->type == vec_dot_type) ? 
src1->data : params->wdata; @@ -12605,7 +12603,18 @@ static void ggml_compute_forward_mul_mat_id( const int64_t nr0 = ne01; // src0 rows const int64_t nr1 = cne1; // src1 rows - + // + if (ne13 == 1 && dst->type == GGML_TYPE_F32 && + (vec_dot_type == GGML_TYPE_Q8_K || vec_dot_type == GGML_TYPE_Q8_0 || vec_dot_type == GGML_TYPE_Q8_1)) { + if (!iqk_mul_mat_moe(nr0, nr1, ne00, ne11, src0->type, + (const char *)src0_cur, + (const char *)wdata, + (float *)dst->data, nb1, nb2, + matrix_rows + cur_a*ne12, + ith, nth)) goto IQK_MulMat_Not_Available; + continue; + } +IQK_MulMat_Not_Available:; // distribute the thread work across the inner or outer loop based on which one is larger const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows diff --git a/iqk_mul_mat.cpp b/iqk_mul_mat.cpp index 4e7d27b9..73ac2f42 100644 --- a/iqk_mul_mat.cpp +++ b/iqk_mul_mat.cpp @@ -21,7 +21,7 @@ #include "ggml-impl.h" #include "ggml-quants.h" -#include "sgemm.h" +#include "iqk_mul_mat.h" #define GGML_COMMON_IMPL_C #include "ggml-common.h" diff --git a/iqk_mul_mat.h b/iqk_mul_mat.h new file mode 100644 index 00000000..4706714b --- /dev/null +++ b/iqk_mul_mat.h @@ -0,0 +1,17 @@ +#pragma once +#include <stdint.h> +#include <stdbool.h> +#ifdef __cplusplus +extern "C" { +#endif + +bool iqk_mul_mat(long Nx, long Ny, long ne00, int typeA, const void * A, const void * B, + float * C, long stride_C, int ith, int nth); + +bool iqk_mul_mat_moe(long, long, long, int, int, const void *, const void *, + float *, long, long, const void *, int, int); + + +#ifdef __cplusplus +} +#endif @@ -850,10 +850,6 @@ class tinyBLAS_Q0_AVX { * @return true if this function was able to service the matmul request */ -bool iqk_mul_mat(long Nx, long Ny, long ne00, int typeA, const void * A, const void * B, - float * C, long stride_C, int ith, int nth); - - bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda, const void *B, int64_t ldb, void *C, int64_t ldc, int ith, int nth, int task, 
int Atype, int Btype, int Ctype) { @@ -866,18 +862,6 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda assert(nth > 0); assert(ith < nth); - if (Btype == GGML_TYPE_Q8_K && Ctype == GGML_TYPE_F32) { - if (iqk_mul_mat(m, n, k * QK_K, Atype, A, B, (float *)C, ldc, ith, nth)) { - return true; - } - } - if ((Btype == GGML_TYPE_Q8_0 || Btype == GGML_TYPE_Q8_1) && Ctype == GGML_TYPE_F32) { - assert(QK8_0 == QK8_1 == QK4_0 == QK4_1 == QK5_0 == QK5_1 == 32); - if (iqk_mul_mat(m, n, k * QK8_0, Atype, A, B, (float *)C, ldc, ith, nth)) { - return true; - } - } - if (Ctype != GGML_TYPE_F32) return false; |