diff options
author | Kawrakow <48489457+ikawrakow@users.noreply.github.com> | 2024-02-26 18:28:38 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-02-26 18:28:38 +0200 |
commit | a33e6a0d2a66104ea9a906bdbf8a94d050189d91 (patch) | |
tree | 30478b4a0b1792d1af66c5d64e2c3c4fa1af74ab /tests/test-quantize-fns.cpp | |
parent | 47bb7b48c7cec9d8f57d56812ce811ec130b89a3 (diff) |
Adding IQ2_S and IQ2_M to complete coverage of the 2-3 bit quantization range (#5721)
* Adding IQ2_S and IQ2_M as a single cumulative commit
* Update examples/quantize/quantize.cpp
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Diffstat (limited to 'tests/test-quantize-fns.cpp')
-rw-r--r-- | tests/test-quantize-fns.cpp | 4 |
1 files changed, 3 insertions, 1 deletions
diff --git a/tests/test-quantize-fns.cpp b/tests/test-quantize-fns.cpp index 04656bb9..f615b612 100644 --- a/tests/test-quantize-fns.cpp +++ b/tests/test-quantize-fns.cpp @@ -150,6 +150,7 @@ int main(int argc, char * argv[]) { const float total_error = total_quantization_error(qfns, test_size, test_data.data()); const float max_quantization_error = type == GGML_TYPE_Q2_K ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS : + type == GGML_TYPE_IQ2_S ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS : type == GGML_TYPE_Q3_K ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS : type == GGML_TYPE_IQ3_S ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS : type == GGML_TYPE_IQ3_XXS ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS_XXS : MAX_QUANTIZATION_TOTAL_ERROR; @@ -168,7 +169,8 @@ int main(int argc, char * argv[]) { const float vec_dot_error = dot_product_error(qfns, test_size, test_data.data(), test_data2.data()); const float max_allowed_error = type == GGML_TYPE_Q2_K || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ2_XXS || - type == GGML_TYPE_IQ3_XXS || type == GGML_TYPE_IQ3_S ? MAX_DOT_PRODUCT_ERROR_LOWBIT + type == GGML_TYPE_IQ3_XXS || type == GGML_TYPE_IQ3_S || type == GGML_TYPE_IQ2_S + ? MAX_DOT_PRODUCT_ERROR_LOWBIT : MAX_DOT_PRODUCT_ERROR; failed = !(vec_dot_error < max_allowed_error); num_failed += failed; |