author     Iwan Kawrakow <iwan.kawrakow@gmail.com>  2024-06-07 09:43:33 +0300
committer  Iwan Kawrakow <iwan.kawrakow@gmail.com>  2024-06-22 12:02:50 +0300
commit     667bd4759c8680f528bf31dcc41bdf22a6533f22 (patch)
tree       c1d21870068c6120cf68d0f644a1065f2d4ca89b /ggml.c
parent     2ee56b4f0d079b4a1bd58347b13cb85ac5bd1445 (diff)
iqk_mul_mat: make it independent of sgemm
Diffstat (limited to 'ggml.c')
-rw-r--r--  ggml.c  51
1 file changed, 30 insertions, 21 deletions
diff --git a/ggml.c b/ggml.c
index 55daa330..6781a5f3 100644
--- a/ggml.c
+++ b/ggml.c
@@ -4,7 +4,7 @@
 #include "ggml-impl.h"
 #include "ggml-quants.h"
 #include "ggml.h"
-
+#include "iqk_mul_mat.h"
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
@@ -12362,25 +12362,6 @@ UseGgmlGemm1:;
         atomic_store(&state->shared->current_chunk, nth);
     }

-    //// Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
-    //atomic_store(&state->shared->current_chunk, nth);
-    //if (src1->type != vec_dot_type) {
-    //    char * wdata = params->wdata;
-    //    const size_t row_size = ggml_row_size(vec_dot_type, ne10);
-
-    //    assert(params->wsize >= ne11*ne12*ne13*row_size);
-    //    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
-    //    for (int64_t i13 = 0; i13 < ne13; ++i13) {
-    //        for (int64_t i12 = 0; i12 < ne12; ++i12) {
-    //            for (int64_t i11 = 0; i11 < ne11; ++i11) {
-    //                from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10);
-    //                wdata += row_size;
-    //            }
-    //        }
-    //    }
-    //}
-

         return;
     }
@@ -12388,6 +12369,23 @@ UseGgmlGemm1:;
         return;
     }

+    const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
+
+    if ((vec_dot_type == GGML_TYPE_Q8_K || vec_dot_type == GGML_TYPE_Q8_0 ||
+         vec_dot_type == GGML_TYPE_Q8_1) && dst->type == GGML_TYPE_F32) {
+        for (int64_t i13 = 0; i13 < ne13; i13++)
+            for (int64_t i12 = 0; i12 < ne12; i12++)
+                if (!iqk_mul_mat(ne01, ne11, ne00, src0->type,
+                        (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
+                        (const char *)wdata + ggml_row_size(vec_dot_type, ne10)*(i13*ne12 + i12),
+                        (float *)((char *)dst->data + i12*nb2 + i13*nb3),
+                        nb1/ggml_type_size(dst->type),
+                        ith, nth)) goto IQK_MulMat_Not_Available;
+        return;
+    }
+IQK_MulMat_Not_Available:;
+
+
 #if GGML_USE_LLAMAFILE
     if (src1->type != vec_dot_type) {
         const void* wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
@@ -12605,7 +12603,18 @@ static void ggml_compute_forward_mul_mat_id(
     const int64_t nr0 = ne01; // src0 rows
     const int64_t nr1 = cne1; // src1 rows
-
+    //
+    if (ne13 == 1 && dst->type == GGML_TYPE_F32 &&
+        (vec_dot_type == GGML_TYPE_Q8_K || vec_dot_type == GGML_TYPE_Q8_0 || vec_dot_type == GGML_TYPE_Q8_1)) {
+        if (!iqk_mul_mat_moe(nr0, nr1, ne00, ne11, src0->type,
+                (const char *)src0_cur,
+                (const char *)wdata,
+                (float *)dst->data, nb1, nb2,
+                matrix_rows + cur_a*ne12,
+                ith, nth)) goto IQK_MulMat_Not_Available;
+        continue;
+    }
+IQK_MulMat_Not_Available:;

     // distribute the thread work across the inner or outer loop based on which one is larger

     const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
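
The new blocks call into iqk_mul_mat.h, which is not part of this diff, so the prototypes are not visible above. As a rough sketch only, here is what the two call sites imply the declarations look like; every parameter name below is an assumption made for illustration, and the true type of the row-mapping argument (the matrix_rows array) is not recoverable from this hunk:

#include <stdbool.h>

// Hypothetical prototypes reconstructed from the call sites above; the
// actual iqk_mul_mat.h in this tree is authoritative.

// Dense case: multiply Nx rows of quantized type typeA (matrix A) by Ny
// rows of vec_dot_type data (Q8_K/Q8_0/Q8_1, matrix B), writing floats
// to C with a row stride of stride_C elements. Returns false when the
// type combination is unsupported, so the caller falls back to the
// generic ggml path.
bool iqk_mul_mat(long Nx, long Ny, long ne00, int typeA,
                 const void * A, const void * B,
                 float * C, long stride_C, int ith, int nth);

// MoE case: row_mapping (matrix_rows + cur_a*ne12 at the call site)
// tells the kernel which dst rows belong to the current expert.
bool iqk_mul_mat_moe(long nr0, long nr1, long ne00, long ne11, int typeA,
                     const void * A, const void * B,
                     float * C, long nb1, long nb2,
                     const void * row_mapping, int ith, int nth);

In both call sites the boolean return is the dispatch mechanism: the iqk path is tried first and, when it declines, the goto to IQK_MulMat_Not_Available resumes the pre-existing llamafile/ggml code. That try-then-fall-back structure is what makes the routine independent of the sgemm entry point, as the commit title says.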