-rw-r--r-- | examples/imatrix/imatrix.cpp |  2
-rw-r--r-- | ggml/src/ggml-cuda/cpy.cu    |  2
-rw-r--r-- | ggml/src/ggml.c              |  2
-rw-r--r-- | src/llama.cpp                | 18
4 files changed, 21 insertions, 3 deletions
diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp
index 8006988c..d8a43049 100644
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -195,7 +195,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
             if (m_params.verbosity > 1) {
                 printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
             }
-            for (int row = 0; row < (int)src1->ne[1]; ++row) {
+            for (int row = 0; row < (int)(src1->ne[1]*src1->ne[2]); ++row) {
                 const float * x = data + row * src1->ne[0];
                 for (int j = 0; j < (int)src1->ne[0]; ++j) {
                     e.values[j] += x[j]*x[j];
diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu
index 0b269a86..fabe8843 100644
--- a/ggml/src/ggml-cuda/cpy.cu
+++ b/ggml/src/ggml-cuda/cpy.cu
@@ -556,7 +556,7 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q6_0) {
         return (void*) cpy_f32_q<cpy_blck_f32_q6_0, QK6_0>;
     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
-        return (void*) cpy_f32_f16<cpy_1_f32_f16>;
+        return (void*) cpy_f32_f16<cpy_1_f16_f16>;
     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
         return (void*) cpy_f32_f16<cpy_1_f16_f32>;
     } else {
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 4089e9b7..88820438 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -10468,7 +10468,7 @@ static void ggml_compute_forward_dup_bytes(
     if (ggml_is_contiguous(dst)) {
         size_t id = 0;
         char * dst_ptr = (char *) dst->data;
-        const size_t rs = ne00 * type_size;
+        const size_t rs = ggml_row_size(src0->type, ne00); //ne00 * type_size;

         if (nb00 == type_size) {
             // src0 is contigous on first dimension, copy by rows
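The ggml.c hunk above replaces ne00 * type_size with ggml_row_size() because, for block-quantized tensor types, a row of ne00 elements is stored as ne00/block_size blocks and therefore does not occupy ne00 * type_size bytes. A minimal standalone sketch of that arithmetic follows; the type_traits_t/row_size names are illustrative stand-ins rather than the actual ggml API, and the Q4_0 figures (32 elements per 18-byte block) are given only as an example:

// Row size for "plain" vs block-quantized types: the naive ne00 * type_size
// formula is only valid when each element is stored individually.
#include <cassert>
#include <cstdint>
#include <cstdio>

struct type_traits_t { size_t type_size; int64_t blck_size; };

static size_t row_size(const type_traits_t & t, int64_t ne00) {
    assert(ne00 % t.blck_size == 0);              // a row must be a whole number of blocks
    return t.type_size * (ne00 / t.blck_size);    // what ggml_row_size(type, ne00) evaluates to
}

int main() {
    const type_traits_t f32  = { 4, 1 };          // one 4-byte element per "block"
    const type_traits_t q4_0 = { 18, 32 };        // e.g. 32 elements packed into an 18-byte block
    std::printf("f32  row of 4096 elements: %zu bytes\n", row_size(f32,  4096));  // 16384
    std::printf("q4_0 row of 4096 elements: %zu bytes\n", row_size(q4_0, 4096));  // 2304, not 4096*18
    return 0;
}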
diff --git a/src/llama.cpp b/src/llama.cpp
index bad8d33d..ba5c5052 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -13787,6 +13787,7 @@ struct llm_build_context {
                         ggml_row_size(model.layers[il].wv_b->type, kv_lora_rank),
                         ggml_row_size(model.layers[il].wv_b->type, kv_lora_rank)*n_embd_head_v, 0);
                 cb(wv_b, "wv_b", il);
+                std::memcpy(wv_b->name, model.layers[il].wv_b->name, GGML_MAX_NAME);

                 kqv = ggml_mul_mat(ctx0, wv_b, kqv_compressed);
                 cb(kqv, "kqv", il);
@@ -17348,6 +17349,23 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         if (imatrix_data) {
             auto it = imatrix_data->find(tensor->name);
             if (it == imatrix_data->end()) {
+                // MLA hack: most imatrix files floating around the Internet have been computed with standard attention.
+                // This means that the imatrix file does not contain data for the *.attn_k_b.weight and *.attn_v_b.weight
+                // required by MLA. But the *.attn_v_b.weight tensors "see" the exact same activations as the
+                // *.attn_kv_b.weight tensors used in standard attention. Hence, if we find imatrix data for
+                // *.attn_kv_b.weight we can use it for *.attn_v_b.weight and vice versa.
+                std::string name{tensor->name};
+                static std::array<std::string, 2> alternatives{".attn_v_b.weight", ".attn_kv_b.weight"};
+                for (int j = 0; j < int(alternatives.size()); ++j) {
+                    if (auto pos = name.find(alternatives[j]); pos != std::string::npos) {
+                        int j1 = (j + 1) % alternatives.size();
+                        auto alternative_name = name.substr(0, pos) + alternatives[j1];
+                        it = imatrix_data->find(alternative_name);
+                        break;
+                    }
+                }
+            }
+            if (it == imatrix_data->end()) {
                 LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name);
             } else {
                 if (it->second.size() == (size_t)tensor->ne[0]*tensor->ne[2]) {
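The quantization hunk above retries the imatrix lookup under the sibling tensor name when the exact name is missing. A self-contained sketch of that fallback is shown below, with a plain std::unordered_map standing in for the loaded imatrix data; the tensor names and values are made up for illustration:

#include <array>
#include <cstdio>
#include <string>
#include <unordered_map>
#include <vector>

// Look up imatrix data for tensor_name; if absent, retry under the "sibling" name,
// since *.attn_v_b.weight and *.attn_kv_b.weight see the same activations.
static const std::vector<float> * find_imatrix(
        const std::unordered_map<std::string, std::vector<float>> & imatrix_data,
        const std::string & tensor_name) {
    auto it = imatrix_data.find(tensor_name);
    if (it == imatrix_data.end()) {
        static const std::array<std::string, 2> alternatives{".attn_v_b.weight", ".attn_kv_b.weight"};
        for (size_t j = 0; j < alternatives.size(); ++j) {
            if (auto pos = tensor_name.find(alternatives[j]); pos != std::string::npos) {
                auto alternative_name = tensor_name.substr(0, pos) + alternatives[(j + 1) % alternatives.size()];
                it = imatrix_data.find(alternative_name);
                break;
            }
        }
    }
    return it == imatrix_data.end() ? nullptr : &it->second;
}

int main() {
    // Hypothetical imatrix computed with standard attention: only *.attn_kv_b.weight is present.
    const std::unordered_map<std::string, std::vector<float>> imatrix_data = {
        { "blk.0.attn_kv_b.weight", { 1.0f, 2.0f, 3.0f } },
    };
    const auto * w = find_imatrix(imatrix_data, "blk.0.attn_v_b.weight");
    if (w) {
        std::printf("found %zu values via the kv_b entry\n", w->size());
    } else {
        std::printf("not found\n");
    }
    return 0;
}

Under these assumptions the lookup behaves like the added block: a *.attn_v_b.weight tensor quantized against an imatrix produced with standard attention picks up the *.attn_kv_b.weight statistics, and vice versa.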