author     Iwan Kawrakow <iwan.kawrakow@gmail.com>                   2024-08-05 11:59:36 +0300
committer  Kawrakow <48489457+ikawrakow@users.noreply.github.com>    2024-08-05 11:39:10 +0200
commit     b409c153636d27473970abd3a9c9400b6287d400 (patch)
tree       7bdba4859b8a66fa39ec237b87db56399edacebb
parent     c11c7c8cae5ab1abf41c16b7bb27439bb0983c54 (diff)
q2_K: allow it to detect ternary nets and quantize accordingly
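The heuristic in the ggml-quants.c hunk below treats a tensor row as ternary when snapping every weight to the nearest of {-max, 0, +max} leaves less than 10% of the signal energy as residual. Here is a minimal standalone sketch of that check; the looks_ternary helper and the two sample rows are my own illustration, not part of the patch, while the 0.5*max decision boundaries and the 10% energy threshold are taken verbatim from the hunk:

#include <math.h>
#include <stdio.h>

#define MAX(a, b) ((a) > (b) ? (a) : (b))

// Returns 1 when snapping every weight to the nearest of {-max, 0, +max}
// leaves less than 10% of the signal energy as residual -- the same test
// the patch runs before falling through to the regular Q2_K path.
static int looks_ternary(const float * x, int n) {
    float max = 0;
    for (int j = 0; j < n; ++j) max = MAX(max, fabsf(x[j]));
    float mse0 = 0, mse = 0;   // mse0: total energy, mse: residual of the ternary fit
    for (int j = 0; j < n; ++j) {
        int l = x[j] < -0.5f*max ? -1 : x[j] < 0.5f*max ? 0 : 1;  // nearest level
        mse0 += x[j]*x[j];
        float diff = x[j] - max*l;
        mse  += diff*diff;
    }
    return mse < 0.1f*mse0;
}

int main(void) {
    float ternary[8] = { -1.f, 0.f, 1.f, 1.f, 0.f, -1.f, 0.f, 1.f };
    float generic[8] = { -0.7f, 0.2f, 0.9f, 0.4f, -0.1f, -0.5f, 0.8f, 0.3f };
    printf("ternary row -> %d\n", looks_ternary(ternary, 8));  // 1: take the lossless path
    printf("generic row -> %d\n", looks_ternary(generic, 8));  // 0: regular Q2_K quantization
    return 0;
}

Why the encoder can then return early: it stores d = dmin = max and sets every 16-weight sub-block scale byte to 1 | (1 << 4), so the standard Q2_K dequantization d*sc*q - dmin*m collapses to max*(q - 1), i.e. exactly -max, 0, +max for the stored 2-bit values q in {0, 1, 2}. A detected ternary net therefore round-trips losslessly through the 2-bit format.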
-rw-r--r--  examples/quantize/quantize.cpp |  7
-rw-r--r--  ggml/src/ggml-quants.c         | 45
-rw-r--r--  include/llama.h                |  1
-rw-r--r--  src/llama.cpp                  |  6
4 files changed, 55 insertions(+), 4 deletions(-)
diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index 0b4c3444..bae071ce 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -255,6 +255,8 @@ int main(int argc, char ** argv) {
     for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
         if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) {
             params.quantize_output_tensor = false;
+        } else if (strcmp(argv[arg_idx], "--ignore-imatrix-rules") == 0) {
+            params.ignore_imatrix_rules = true;
         } else if (strcmp(argv[arg_idx], "--output-tensor-type") == 0) {
             if (arg_idx < argc-1) {
                 params.output_tensor_type = parse_ggml_type(argv[++arg_idx]);
@@ -409,11 +411,12 @@ int main(int argc, char ** argv) {
         }
     }
 
-    if ((params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS ||
+    if (!params.ignore_imatrix_rules && imatrix_data.empty() &&
+        (params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS ||
         params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_S ||
         params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S ||
         params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
-        params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) && imatrix_data.empty()) {
+        params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_M)) {
         fprintf(stderr, "\n==========================================================================================================\n");
         fprintf(stderr, "Please do not use IQ1_S, IQ1_M, IQ2_S, IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n");
         fprintf(stderr, "==========================================================================================================\n\n\n");
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index c2c66f38..415249fb 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -1995,7 +1995,52 @@ void quantize_row_q2_K_ref(const float * restrict x, block_q2_K * restrict y, int64_t k)
 
     const float q4scale = 15.f;
 
+    // Detect TriNet
+    {
+        int n = k;
+        float max = 0;
+        for (int j = 0; j < n; ++j) {
+            float ax = fabsf(x[j]);
+            max = MAX(max, ax);
+        }
+        float mse0 = 0, mse = 0;
+        for (int j = 0; j < n; ++j) {
+            int l = x[j] < -0.5f*max ? -1 : x[j] < 0.5f*max ? 0 : 1;
+            mse0 += x[j]*x[j];
+            float diff = x[j] - max*l;
+            mse += diff*diff;
+        }
+        if (mse < 0.1f*mse0) {
+            // yes, most likely trinet
+            for (int ibl = 0; ibl < nb; ++ibl) {
+                y[ibl].d = GGML_FP32_TO_FP16(max);
+                y[ibl].dmin = GGML_FP32_TO_FP16(max);
+                for (int ib = 0; ib < QK_K/16; ++ib) y[ibl].scales[ib] = 1 | (1 << 4);
+                const float * xb = x + QK_K * ibl;
+                for (int j = 0; j < QK_K; ++j) {
+                    L[j] = xb[j] < -0.5f*max ? 0 : xb[j] < 0.5f*max ? 1 : 2;
+                }
+                uint8_t * qs = y[ibl].qs;
+                for (int j = 0; j < QK_K; j += 128) {
+                    for (int l = 0; l < 32; ++l) {
+                        qs[l] = L[j + l] | (L[j + l + 32] << 2) | (L[j + l + 64] << 4) | (L[j + l + 96] << 6);
+                    }
+                    qs += 32;
+                }
+            }
+            return;
+        }
+    }
+
     for (int i = 0; i < nb; i++) {
+        //{
+        //    float max = x[0], min = x[0];
+        //    for (int j = 1; j < 256; ++j) {
+        //        max = MAX(x[j], max);
+        //        min = MIN(x[j], min);
+        //    }
+        //    printf("%s: max = %g, min = %g\n", __func__, (double)max, (double)min);
+        //}
         float max_scale = 0; // as we are deducting the min, scales are always positive
         float max_min = 0;
         for (int j = 0; j < QK_K/16; ++j) {
diff --git a/include/llama.h b/include/llama.h
index 88d82958..15ff915b 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -359,6 +359,7 @@ extern "C" {
         bool only_copy;             // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
         bool pure;                  // quantize all tensors to the default type
         bool keep_split;            // quantize to the same number of shards
+        bool ignore_imatrix_rules;  // If set to true, the built-in rules for refusing to quantize into certain quants without imatrix are ignored
         void * imatrix;             // pointer to importance matrix data
         void * kv_overrides;        // pointer to vector containing overrides
     } llama_model_quantize_params;
diff --git a/src/llama.cpp b/src/llama.cpp
index 2caaf7d0..e530f528 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -16071,12 +16071,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params)
                 }
             }
         }
-        if ((new_type == GGML_TYPE_IQ2_XXS ||
+        if (!params->ignore_imatrix_rules && !imatrix &&
+            (new_type == GGML_TYPE_IQ2_XXS ||
             new_type == GGML_TYPE_IQ2_XS ||
             new_type == GGML_TYPE_IQ2_S ||
             new_type == GGML_TYPE_IQ1_S ||
            (new_type == GGML_TYPE_IQ1_M && strcmp(tensor->name, "token_embd.weight") && strcmp(tensor->name, "output.weight")) ||
-           (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
+           (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0))) {
             LLAMA_LOG_ERROR("\n\n============================================================\n");
             LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
             LLAMA_LOG_ERROR("The result will be garbage, so bailing out\n");
@@ -16441,6 +16442,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
         /*.only_copy                   =*/ false,
         /*.pure                        =*/ false,
         /*.keep_split                  =*/ false,
+        /*.ignore_imatrix_rules        =*/ false,
         /*.imatrix                     =*/ nullptr,
        /*.kv_overrides                =*/ nullptr,
     };
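For completeness, a sketch of reaching the new field through the public API instead of the quantize example's new --ignore-imatrix-rules switch. llama_model_quantize() and llama_model_quantize_default_params() are the existing entry points declared in include/llama.h; the file names and the Q2_K_S target are placeholders:

#include "llama.h"

int main(void) {
    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype = LLAMA_FTYPE_MOSTLY_Q2_K_S;  // one of the types normally refused without an imatrix
    params.ignore_imatrix_rules = true;        // the field added by this commit

    // Placeholder file names; llama_model_quantize() returns 0 on success.
    return llama_model_quantize("model-f16.gguf", "model-q2_k_s.gguf", &params) ? 1 : 0;
}

Without the flag set and without an imatrix, this call would abort with the "Missing importance matrix" error from the src/llama.cpp hunk above.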