author     Kawrakow <iwankawrakow@gmail.com>    2025-02-08 09:48:59 +0200
committer  GitHub <noreply@github.com>          2025-02-08 09:48:59 +0200
commit     6d7b58eade37e45e3d8286a2353658047539d2b2 (patch)
tree       8de22e931d64ce7567a9e5c7c7eb28fd2e6ec04e /ggml/src/ggml.c
parent     4601a8c3735d8e47c46e0927712d77c4f422be6c (diff)
Revert #79 (#192)
* Revert "Do not quantize activations if not necessary (#79)"
This reverts commit 0bf4d99774aa3b6d00ef564acbc4dc211e45db33.
* Fixed compilation after revert
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
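For context, the reverted #79 kept the quantized copy of the activations (src1) in a dedicated qsize region at the end of the work buffer and tagged that region with the source tensor's name; a later matmul seeing the same name skipped re-quantization (the strncmp/goto AlreadyQuantized logic removed below). What follows is a minimal standalone sketch of that name-tagging idea, assuming a fixed scratch layout and a hypothetical quantize_cached() helper; it is not the actual ggml code:

#include <stdio.h>
#include <string.h>

#define GGML_MAX_NAME 64   /* size of ggml's tensor-name buffer */

/* scratch layout assumed by the sketch: [name tag | quantized data] */
static char scratch[GGML_MAX_NAME + 1024];

/* Hypothetical helper mirroring the reverted logic: skip the work when
 * the stored tag already matches the producer tensor's name.
 * Returns 1 if it (re)quantized, 0 on a cache hit. */
static int quantize_cached(const char * src_name) {
    if (strncmp(src_name, scratch, GGML_MAX_NAME) == 0) {
        return 0;                          /* already quantized earlier */
    }
    /* ... quantize activations into scratch + GGML_MAX_NAME here ... */
    snprintf(scratch, GGML_MAX_NAME, "%s", src_name);   /* store the tag */
    return 1;
}

int main(void) {
    /* "ffn_inp-0" is a made-up tensor name for illustration */
    printf("%d\n", quantize_cached("ffn_inp-0"));  /* 1: does the work */
    printf("%d\n", quantize_cached("ffn_inp-0"));  /* 0: cache hit */
    return 0;
}

The revert removes the tag check entirely, so src1 is quantized into the plain work buffer on every call; the hunks below restore exactly that behavior.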
Diffstat (limited to 'ggml/src/ggml.c')
-rw-r--r--  ggml/src/ggml.c  63
1 file changed, 16 insertions, 47 deletions
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 68525906..b19fb006 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -2586,7 +2586,6 @@ struct ggml_compute_params {
 
     // work buffer for all threads
     size_t wsize;
-    size_t qsize;
     void * wdata;
 
     struct ggml_compute_state_shared * shared;
@@ -13940,7 +13939,7 @@ static void ggml_compute_forward_mul_mat_one_chunk(
         return;
     }
 
-    const void * wdata = (src1->type == vec_dot_type) ? src1->data : (char *)params->wdata + params->wsize - params->qsize + GGML_MAX_NAME;
+    const void * wdata = (src1->type == vec_dot_type) ? src1->data : (char *)params->wdata;
     const size_t row_size = ggml_row_size(vec_dot_type, ne10);
 
     assert(ne12 % ne02 == 0);
@@ -14110,12 +14109,7 @@ UseGgmlGemm1:;
 #endif
 
     if (src1->type != vec_dot_type) {
-        char * wdata = (char *)params->wdata + params->wsize - params->qsize;
-
-        if (strncmp(src1->name, wdata - GGML_MAX_NAME, GGML_MAX_NAME) == 0) {
-            goto AlreadyQuantized;
-        }
-        wdata += GGML_MAX_NAME;
+        char * wdata = params->wdata;
 
 #if IK_PRINT_TIMING
         int64_t t1 = ggml_time_us();
@@ -14125,7 +14119,7 @@ UseGgmlGemm1:;
         const size_t nbw2 = nbw1*ne11;
         const size_t nbw3 = nbw2*ne12;
 
-        assert(params->qsize >= ne13*nbw3);
+        assert(params->wsize >= ne13*nbw3);
         GGML_ASSERT(src1->type == GGML_TYPE_F32);
 
         for (int64_t i13 = 0; i13 < ne13; ++i13) {
@@ -14157,17 +14151,14 @@ UseGgmlGemm1:;
 #endif
 
         if (ith == 0) {
-            wdata -= GGML_MAX_NAME;
-            memcpy(wdata, src1->name, GGML_MAX_NAME);
             // Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
             //atomic_store(&params->shared->current_chunk, nth);
         }
-AlreadyQuantized:;
+
         ggml_barrier(params->shared);
     }
 
-    const void * wdata = (src1->type == vec_dot_type) ? src1->data
-                       : (const void *)((const char *)params->wdata + params->wsize - params->qsize + GGML_MAX_NAME);
+    const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
 
 #if GGML_USE_IQK_MULMAT
     if (src1->type != vec_dot_type && dst->type == GGML_TYPE_F32) {
@@ -14354,10 +14345,9 @@ static void ggml_compute_forward_mul_mat_id(
     const int n_ids = ids->ne[0]; // n_expert_used
     const int n_as  = ne02;       // n_expert
 
-    char * qdata = (char *)params->wdata + params->wsize - params->qsize;
-
-    char * wdata_src1_end = (src1->type == vec_dot_type) ? qdata :
-            qdata + GGML_PAD(GGML_MAX_NAME + ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t));
+    char * wdata_src1_end = (src1->type == vec_dot_type) ?
+            (char *) params->wdata :
+            (char *) params->wdata + GGML_PAD(ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t));
 
     struct mmid_row_mapping {
         int32_t i1;
@@ -14367,19 +14357,14 @@ static void ggml_compute_forward_mul_mat_id(
     int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
     struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *)(matrix_row_counts + n_as); // [n_as][ne11]
 
-    bool store_name = false;
     if (src1->type != vec_dot_type) {
-        if (strncmp(src1->name, qdata, GGML_MAX_NAME) == 0) {
-            goto QuantizationAlreadyDone;
-        }
-        store_name = true;
-        char * wdata = qdata + GGML_MAX_NAME;
+        char * wdata = params->wdata;
 
         const size_t nbw1 = ggml_row_size(vec_dot_type, ne10);
         const size_t nbw2 = nbw1*ne11;
         const size_t nbw3 = nbw2*ne12;
 
-        assert(params->qsize >= ne13*nbw3);
+        assert(params->wsize >= ne13*nbw3);
         GGML_ASSERT(src1->type == GGML_TYPE_F32);
 
         for (int64_t i13 = 0; i13 < ne13; ++i13) {
@@ -14395,12 +14380,7 @@ static void ggml_compute_forward_mul_mat_id(
 
 #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne12 + (i1)]
 
-QuantizationAlreadyDone:;
     if (ith == 0) {
-        if (store_name) {
-            memcpy(qdata, src1->name, GGML_MAX_NAME);
-        }
-
         // initialize matrix_row_counts
         memset(matrix_row_counts, 0, n_as*sizeof(int64_t));
 
@@ -14429,7 +14409,7 @@ QuantizationAlreadyDone:;
 
         const char * src0_cur = (const char *) src0->data + cur_a*nb02;
 
-        const void * wdata = (src1->type == vec_dot_type) ? src1->data : qdata + GGML_MAX_NAME;
+        const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
         const size_t row_size = ggml_row_size(vec_dot_type, ne10);
 
         const int64_t nr0 = ne01; // src0 rows
@@ -21017,7 +20997,6 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
     }
 
     size_t work_size = 0;
-    size_t q_size = 0;
 
     struct ggml_cplan cplan;
     memset(&cplan, 0, sizeof(struct ggml_cplan));
@@ -21033,7 +21012,6 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
         max_tasks = MAX(max_tasks, n_tasks);
 
         size_t cur = 0;
-        size_t cur_q = 0;
 
         switch (node->op) {
             case GGML_OP_CPY:
@@ -21064,8 +21042,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
                     const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type;
 
                     if (node->src[1]->type != vec_dot_type) {
-                        cur_q = ggml_row_size(vec_dot_type, node->src[1]->ne[0]) * ggml_nrows(node->src[1]);
-                        //cur_q = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
+                        cur = ggml_row_size(vec_dot_type, node->src[1]->ne[0]) * ggml_nrows(node->src[1]);
                    }
                } break;
            case GGML_OP_MUL_MAT_ID:
@@ -21075,13 +21052,12 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
                    const struct ggml_tensor * src1 = node->src[1];
                    const enum ggml_type vec_dot_type = type_traits[src0->type].vec_dot_type;
                    if (src1->type != vec_dot_type) {
-                        cur_q += ggml_row_size(vec_dot_type, node->src[1]->ne[0]) * ggml_nrows(node->src[1]);
-                        //cur_q += ggml_row_size(vec_dot_type, ggml_nelements(src1));
+                        cur += ggml_row_size(vec_dot_type, node->src[1]->ne[0]) * ggml_nrows(node->src[1]);
                    }
                    const int n_as = src0->ne[2];
-                    cur_q += GGML_PAD(cur, sizeof(int64_t)); // align
-                    cur_q += n_as * sizeof(int64_t); // matrix_row_counts
-                    cur_q += n_as * src1->ne[2] * sizeof(int64_t); // matrix_rows
+                    cur += GGML_PAD(cur, sizeof(int64_t)); // align
+                    cur += n_as * sizeof(int64_t); // matrix_row_counts
+                    cur += n_as * src1->ne[2] * sizeof(int64_t); // matrix_rows
                } break;
            case GGML_OP_OUT_PROD:
                {
@@ -21170,20 +21146,14 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
        }
 
         work_size = MAX(work_size, cur);
-        q_size = MAX(q_size, cur_q);
     }
 
     if (work_size > 0) {
         work_size += CACHE_LINE_SIZE*(n_threads - 1);
     }
-    if (q_size > 0) {
-        q_size += GGML_MAX_NAME;
-    }
-    work_size += q_size;
 
     cplan.n_threads = MIN(max_tasks, n_threads);
     cplan.work_size = work_size;
-    cplan.q_size    = q_size;
     cplan.work_data = NULL;
 
     return cplan;
@@ -21201,7 +21171,6 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
            /*.ith   =*/ state->ith,
            /*.nth   =*/ state->shared->n_threads,
            /*.wsize =*/ cplan->work_size,
-            /*.qsize =*/ cplan->q_size,
            /*.wdata =*/ cplan->work_data,
            /*.shared=*/ state->shared,
        };
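With q_size gone, ggml_graph_plan again accounts for the quantized activations inside the single work_size value. Below is a self-contained sketch of the MUL_MAT_ID sizing arithmetic from the planner hunks above, using made-up shapes and GGML_PAD as defined in ggml.h; the shape values are assumptions, not taken from the commit:

#include <stdio.h>
#include <stdint.h>

#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))   /* as in ggml.h */

int main(void) {
    /* illustrative values only; in the real planner these come from
     * ggml_row_size(vec_dot_type, ne10), ggml_nrows(src1),
     * src0->ne[2] and src1->ne[2] */
    const size_t  row_size = 4608;  /* bytes per quantized src1 row */
    const int64_t nrows    = 32;    /* number of src1 rows */
    const int     n_as     = 8;     /* number of experts */
    const int64_t ne12     = 4;     /* src1 batch dimension */

    size_t cur = row_size * (size_t) nrows;    /* quantized src1 copy */
    cur += GGML_PAD(cur, sizeof(int64_t));     /* align */
    cur += n_as * sizeof(int64_t);             /* matrix_row_counts */
    cur += n_as * ne12 * sizeof(int64_t);      /* matrix_rows */

    printf("work_size contribution: %zu bytes\n", cur);
    return 0;
}

Note that the planner adds the padded total on top of cur rather than rounding cur up, so the buffer is over-reserved slightly; as the diff shows, that quirk is carried over unchanged from the pre-#79 code.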