summaryrefslogtreecommitdiff
path: root/tests
diff options
context:
space:
mode:
authorKawrakow <48489457+ikawrakow@users.noreply.github.com>2024-01-30 15:14:12 +0200
committerGitHub <noreply@github.com>2024-01-30 15:14:12 +0200
commitf4d7e5497485ce6ce0e322533930b7da4657dd2d (patch)
tree78b30048cb4a9c78d5cf3e231a1ac3e9ed190577 /tests
parent2256f36b79a932a478d4dcdf02c1e5a60056e5f3 (diff)
SOTA 3-bit quants (#5196)
* iq3_xxs: quantize/dequantize RMSE seems a bit high-ish at about half-way between q2_K and q3_K, so need to check more. * iq3_xxs: CUDA dequantize works * iq2_xxs: tuning quantization * iq3_xxs: starting to look better PPL on wiki.test.raw LLaMA-v1-7B: 6.4218 LLaMA-v2-7B: 6.3560 Mistral-7B : 6.0717 This is better than Q3_K_XS, with a 5% reduction in quantized model size. * iq3_xxs: CUDA dot product We have PP-512: 5891 t/s TG-128: 143.9 t/s * iq3_xxs: scalar and AVX2 dot products * iq3_xxs: ARM_NEON and Metal Metal performance is decent, ARM_NEON is pathetic * iq3_xxs: slightly better grid points * Faster iq3_xxs and iq2_xs dot products on CUDA * iq3_xxs: add some quant mix * iq3_xxs: fix failing quantization test Dot product still fails. Is this real? * iq3_xxs: hopefully fix ROCm * iq3_xxs: failing tests This time the dot product accuracy did find an actual bug in the AVX2 implementation. * Add IQ3_XXS to test-backend-ops --------- Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Diffstat (limited to 'tests')
-rw-r--r--tests/test-backend-ops.cpp1
-rw-r--r--tests/test-quantize-fns.cpp13
-rw-r--r--tests/test-quantize-perf.cpp2
3 files changed, 13 insertions, 3 deletions
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 775147d4..b328b19b 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -1890,6 +1890,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
GGML_TYPE_Q4_K, GGML_TYPE_Q5_K,
GGML_TYPE_Q6_K,
GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS,
+ GGML_TYPE_IQ3_XXS,
};
// unary ops
diff --git a/tests/test-quantize-fns.cpp b/tests/test-quantize-fns.cpp
index 31a78c63..43df8022 100644
--- a/tests/test-quantize-fns.cpp
+++ b/tests/test-quantize-fns.cpp
@@ -17,7 +17,9 @@ constexpr float MAX_QUANTIZATION_REFERENCE_ERROR = 0.0001f;
constexpr float MAX_QUANTIZATION_TOTAL_ERROR = 0.002f;
constexpr float MAX_QUANTIZATION_TOTAL_ERROR_2BITS = 0.0075f;
constexpr float MAX_QUANTIZATION_TOTAL_ERROR_3BITS = 0.0040f;
+constexpr float MAX_QUANTIZATION_TOTAL_ERROR_3BITS_XXS = 0.0050f;
constexpr float MAX_DOT_PRODUCT_ERROR = 0.02f;
+constexpr float MAX_DOT_PRODUCT_ERROR_LOWBIT = 0.04f;
static const char* RESULT_STR[] = {"ok", "FAILED"};
@@ -135,18 +137,21 @@ int main(int argc, char * argv[]) {
}
const ggml_type ei = (ggml_type)i;
+
if (ei == GGML_TYPE_IQ2_XXS || ei == GGML_TYPE_IQ2_XS) {
printf("Skip %s due to missing quantization functionality\n", ggml_type_name(ei));
continue;
}
printf("Testing %s\n", ggml_type_name((ggml_type) i));
+ ggml_quantize_init(ei);
if (qfns.from_float && qfns.to_float) {
const float total_error = total_quantization_error(qfns, test_size, test_data.data());
const float max_quantization_error =
- type == GGML_TYPE_Q2_K ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS :
- type == GGML_TYPE_Q3_K ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS : MAX_QUANTIZATION_TOTAL_ERROR;
+ type == GGML_TYPE_Q2_K ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS :
+ type == GGML_TYPE_Q3_K ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS :
+ type == GGML_TYPE_IQ3_XXS ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS_XXS : MAX_QUANTIZATION_TOTAL_ERROR;
failed = !(total_error < max_quantization_error);
num_failed += failed;
if (failed || verbose) {
@@ -161,7 +166,9 @@ int main(int argc, char * argv[]) {
}
const float vec_dot_error = dot_product_error(qfns, test_size, test_data.data(), test_data2.data());
- failed = !(vec_dot_error < MAX_DOT_PRODUCT_ERROR);
+ const float max_allowed_error = type == GGML_TYPE_Q2_K || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ2_XXS ||
+ type == GGML_TYPE_IQ3_XXS ? MAX_DOT_PRODUCT_ERROR_LOWBIT : MAX_DOT_PRODUCT_ERROR;
+ failed = !(vec_dot_error < max_allowed_error);
num_failed += failed;
if (failed || verbose) {
printf("%5s dot product error: %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], vec_dot_error);
diff --git a/tests/test-quantize-perf.cpp b/tests/test-quantize-perf.cpp
index 09d410b7..8ec81734 100644
--- a/tests/test-quantize-perf.cpp
+++ b/tests/test-quantize-perf.cpp
@@ -278,6 +278,8 @@ int main(int argc, char * argv[]) {
if (qfns.from_float && qfns.to_float) {
printf("%s\n", ggml_type_name(type));
+ ggml_quantize_init(type);
+
if (params.op_quantize_row_q_reference) {
printf(" quantize_row_q_reference\n");
for (size_t size : params.test_sizes) {