Adding IQ2_S and IQ2_M to complete coverage of the 2-3 bit quantization range (#5721)

* Adding IQ2_S and IQ2_M as a single cumulative commit * Update examples/quantize/quantize.cpp Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> --------- Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com> Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
author: Kawrakow <48489457+ikawrakow@users.noreply.github.com> 2024-02-26 18:28:38 +0200
committer: GitHub <noreply@github.com> 2024-02-26 18:28:38 +0200
commit: a33e6a0d2a66104ea9a906bdbf8a94d050189d91 (patch)
tree: 30478b4a0b1792d1af66c5d64e2c3c4fa1af74ab /tests/test-quantize-fns.cpp
parent: 47bb7b48c7cec9d8f57d56812ce811ec130b89a3 (diff)
1 files changed, 3 insertions, 1 deletions
diff --git a/tests/test-quantize-fns.cpp b/tests/test-quantize-fns.cpp
index 04656bb9..f615b612 100644
--- a/tests/test-quantize-fns.cpp
+++ b/tests/test-quantize-fns.cpp
@@ -150,6 +150,7 @@ int main(int argc, char * argv[]) {
             const float total_error = total_quantization_error(qfns, test_size, test_data.data());
             const float max_quantization_error =
                 type == GGML_TYPE_Q2_K    ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS :
+                type == GGML_TYPE_IQ2_S   ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS :
                 type == GGML_TYPE_Q3_K    ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS :
                 type == GGML_TYPE_IQ3_S   ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS :
                 type == GGML_TYPE_IQ3_XXS ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS_XXS : MAX_QUANTIZATION_TOTAL_ERROR;
@@ -168,7 +169,8 @@ int main(int argc, char * argv[]) {
 
             const float vec_dot_error = dot_product_error(qfns, test_size, test_data.data(), test_data2.data());
             const float max_allowed_error = type == GGML_TYPE_Q2_K || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ2_XXS ||
-                                            type == GGML_TYPE_IQ3_XXS || type == GGML_TYPE_IQ3_S ? MAX_DOT_PRODUCT_ERROR_LOWBIT
+                                            type == GGML_TYPE_IQ3_XXS || type == GGML_TYPE_IQ3_S || type == GGML_TYPE_IQ2_S
+                                          ? MAX_DOT_PRODUCT_ERROR_LOWBIT
                                           : MAX_DOT_PRODUCT_ERROR;
             failed = !(vec_dot_error < max_allowed_error);
             num_failed += failed;
author	Kawrakow <48489457+ikawrakow@users.noreply.github.com>	2024-02-26 18:28:38 +0200
committer	GitHub <noreply@github.com>	2024-02-26 18:28:38 +0200
commit	a33e6a0d2a66104ea9a906bdbf8a94d050189d91 (patch)
tree	30478b4a0b1792d1af66c5d64e2c3c4fa1af74ab /tests/test-quantize-fns.cpp
parent	47bb7b48c7cec9d8f57d56812ce811ec130b89a3 (diff)