author    Iwan Kawrakow <iwan.kawrakow@gmail.com>  2024-05-27 09:51:08 +0200
committer Iwan Kawrakow <iwan.kawrakow@gmail.com>  2024-06-22 12:02:49 +0300
commit    19c578b413fb53f432319a99fbd658ed30faf966 (patch)
tree      68aa0ec1ecaf75ebd1340b930f2e5e2ec1951a31 /ggml.c
parent    c5a8d4b749352645afd4c024f85d6eca2ca72c6d (diff)
iqk_mul_mat for llama.cpp
Diffstat (limited to 'ggml.c')
-rw-r--r--  ggml.c | 49 +++++++++++++++++++++++++++++++++++++------------
1 file changed, 37 insertions(+), 12 deletions(-)
diff --git a/ggml.c b/ggml.c
index 778ca3fd..55daa330 100644
--- a/ggml.c
+++ b/ggml.c
@@ -12334,11 +12334,7 @@ UseGgmlGemm1:;
#endif
if (params->type == GGML_TASK_TYPE_INIT) {
- if (ith != 0) {
- return;
- }
- // Every thread starts at ith, so the first unprocessed chunk is nth. This saves a bit of coordination right at the start.
- atomic_store(&state->shared->current_chunk, nth);
+
if (src1->type != vec_dot_type) {
char * wdata = params->wdata;
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
@@ -12346,16 +12342,45 @@ UseGgmlGemm1:;
assert(params->wsize >= ne11*ne12*ne13*row_size);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
- for (int64_t i13 = 0; i13 < ne13; ++i13) {
- for (int64_t i12 = 0; i12 < ne12; ++i12) {
- for (int64_t i11 = 0; i11 < ne11; ++i11) {
- from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10);
- wdata += row_size;
- }
- }
+ int64_t work_size = ne13*ne12*ne11;
+ int64_t work_per_thread = (work_size + nth - 1)/nth;
+ int64_t work_start = work_per_thread * ith;
+ if (work_start >= work_size) {
+ return;
+ }
+ int64_t work_end = MIN(work_size, work_start + work_per_thread);
+ for (int64_t i_work = work_start; i_work < work_end; ++i_work) {
+ int64_t i13 = i_work / (ne11*ne12);
+ int64_t i12 = (i_work - i13*ne11*ne12)/ne11;
+ int64_t i11 = i_work - i13*ne11*ne12 - i12*ne11;
+ from_float_to_vec_dot((const float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
+ (void *)(wdata + i_work*row_size), ne10);
}
}
+ if (ith == 0) {
+ atomic_store(&state->shared->current_chunk, nth);
+ }
+
+ //// Every thread starts at ith, so the first unprocessed chunk is nth. This saves a bit of coordination right at the start.
+ //atomic_store(&state->shared->current_chunk, nth);
+ //if (src1->type != vec_dot_type) {
+ // char * wdata = params->wdata;
+ // const size_t row_size = ggml_row_size(vec_dot_type, ne10);
+
+ // assert(params->wsize >= ne11*ne12*ne13*row_size);
+ // GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+ // for (int64_t i13 = 0; i13 < ne13; ++i13) {
+ // for (int64_t i12 = 0; i12 < ne12; ++i12) {
+ // for (int64_t i11 = 0; i11 < ne11; ++i11) {
+ // from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10);
+ // wdata += row_size;
+ // }
+ // }
+ // }
+ //}
+
return;
}
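
In short: before this patch, only thread 0 did the INIT-stage conversion of src1 to vec_dot_type while every other thread returned immediately; after it, the three nested loops over (i13, i12, i11) are flattened into a single linear index i_work = i13*ne12*ne11 + i12*ne11 + i11 that is split evenly across all nth threads, and only thread 0 still publishes the initial chunk counter. Below is a minimal standalone sketch of the same partitioning pattern, assuming plain C11 with pthreads; the toy dimensions, ctx_t, and worker are illustrative and do not come from ggml.c.

/* Sketch of the flat work split introduced by this patch.
 * Assumptions: C11 + pthreads; names and dimensions are toy values. */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

enum { N13 = 2, N12 = 3, N11 = 5, NTH = 4 };  /* toy tensor dims, thread count */

typedef struct { int ith; } ctx_t;            /* per-thread context: thread index */

static void *worker(void *arg) {
    const int ith = ((ctx_t *)arg)->ith;
    const int64_t work_size       = (int64_t)N13 * N12 * N11;
    const int64_t work_per_thread = (work_size + NTH - 1) / NTH;  /* ceiling division */
    const int64_t work_start      = work_per_thread * ith;
    if (work_start >= work_size) return NULL;                     /* no rows left for this thread */
    const int64_t work_end = MIN(work_size, work_start + work_per_thread);
    for (int64_t i_work = work_start; i_work < work_end; ++i_work) {
        /* recover the 3D indices from the flat index, exactly as in the patch */
        const int64_t i13 = i_work / (N11 * N12);
        const int64_t i12 = (i_work - i13 * N11 * N12) / N11;
        const int64_t i11 = i_work - i13 * N11 * N12 - i12 * N11;
        printf("thread %d handles row (%lld, %lld, %lld)\n",
               ith, (long long)i13, (long long)i12, (long long)i11);
    }
    return NULL;
}

int main(void) {
    pthread_t th[NTH];
    ctx_t ctx[NTH];
    for (int i = 0; i < NTH; ++i) { ctx[i].ith = i; pthread_create(&th[i], NULL, worker, &ctx[i]); }
    for (int i = 0; i < NTH; ++i) pthread_join(th[i], NULL);
    return 0;
}

Because each flat index maps to exactly one (i13, i12, i11) row and the per-thread ranges are disjoint, every thread can write its rows of wdata at wdata + i_work*row_size without locking; the only remaining coordination in the patched INIT path is the ith == 0 store of current_chunk.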