diff options
Diffstat (limited to 'ggml.c')
-rw-r--r-- | ggml.c | 107 |
1 files changed, 59 insertions, 48 deletions
@@ -2,7 +2,10 @@ #define _GNU_SOURCE #include "ggml.h" -#include "ggml-quants-k.h" + +#ifdef GGML_USE_K_QUANTS +#include "k_quants.h" +#endif #if defined(_MSC_VER) || defined(__MINGW32__) #include <malloc.h> // using malloc.h with MSC/MINGW @@ -1580,46 +1583,48 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = { .vec_dot_q = NULL, // TODO .vec_dot_type = GGML_TYPE_Q8_1, }, +#ifdef GGML_USE_K_QUANTS [GGML_TYPE_Q2_K] = { - .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q2_k, - .quantize_row_q = quantize_row_q2_k, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q2_k_reference, - .quantize_row_q_dot = quantize_row_q8_k, - .vec_dot_q = ggml_vec_dot_q2_k_q8_k, + .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q2_K, + .quantize_row_q = quantize_row_q2_K, + .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q2_K_reference, + .quantize_row_q_dot = quantize_row_q8_K, + .vec_dot_q = ggml_vec_dot_q2_K_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, }, [GGML_TYPE_Q3_K] = { - .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q3_k, - .quantize_row_q = quantize_row_q3_k, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q3_k_reference, - .quantize_row_q_dot = quantize_row_q8_k, - .vec_dot_q = ggml_vec_dot_q3_k_q8_k, + .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q3_K, + .quantize_row_q = quantize_row_q3_K, + .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q3_K_reference, + .quantize_row_q_dot = quantize_row_q8_K, + .vec_dot_q = ggml_vec_dot_q3_K_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, }, [GGML_TYPE_Q4_K] = { - .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q4_k, - .quantize_row_q = quantize_row_q4_k, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_k_reference, - .quantize_row_q_dot = quantize_row_q8_k, - .vec_dot_q = ggml_vec_dot_q4_k_q8_k, + .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q4_K, + .quantize_row_q = quantize_row_q4_K, + .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_K_reference, + .quantize_row_q_dot = quantize_row_q8_K, + .vec_dot_q = ggml_vec_dot_q4_K_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, }, [GGML_TYPE_Q5_K] = { - .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q5_k, - .quantize_row_q = quantize_row_q5_k, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q5_k_reference, - .quantize_row_q_dot = quantize_row_q8_k, - .vec_dot_q = ggml_vec_dot_q5_k_q8_k, + .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q5_K, + .quantize_row_q = quantize_row_q5_K, + .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q5_K_reference, + .quantize_row_q_dot = quantize_row_q8_K, + .vec_dot_q = ggml_vec_dot_q5_K_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, }, [GGML_TYPE_Q6_K] = { - .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q6_k, - .quantize_row_q = quantize_row_q6_k, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q6_k_reference, - .quantize_row_q_dot = quantize_row_q8_k, - .vec_dot_q = ggml_vec_dot_q6_k_q8_k, + .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q6_K, + .quantize_row_q = quantize_row_q6_K, + .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q6_K_reference, + .quantize_row_q_dot = quantize_row_q8_K, + .vec_dot_q = ggml_vec_dot_q6_K_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, }, +#endif }; // For internal test use @@ -3499,12 +3504,14 @@ static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = { [GGML_TYPE_Q5_1] = QK5_1, [GGML_TYPE_Q8_0] = QK8_0, [GGML_TYPE_Q8_1] = QK8_1, +#ifdef GGML_USE_K_QUANTS [GGML_TYPE_Q2_K] = QK_K, [GGML_TYPE_Q3_K] = QK_K, [GGML_TYPE_Q4_K] = QK_K, [GGML_TYPE_Q5_K] = QK_K, [GGML_TYPE_Q6_K] = QK_K, [GGML_TYPE_Q8_K] = QK_K, +#endif [GGML_TYPE_I8] = 1, [GGML_TYPE_I16] = 1, [GGML_TYPE_I32] = 1, @@ -3520,12 +3527,14 @@ static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = { [GGML_TYPE_Q5_1] = sizeof(block_q5_1), [GGML_TYPE_Q8_0] = sizeof(block_q8_0), [GGML_TYPE_Q8_1] = sizeof(block_q8_1), - [GGML_TYPE_Q2_K] = sizeof(block_q2_k), - [GGML_TYPE_Q3_K] = sizeof(block_q3_k), - [GGML_TYPE_Q4_K] = sizeof(block_q4_k), - [GGML_TYPE_Q5_K] = sizeof(block_q5_k), - [GGML_TYPE_Q6_K] = sizeof(block_q6_k), - [GGML_TYPE_Q8_K] = sizeof(block_q8_k), +#ifdef GGML_USE_K_QUANTS + [GGML_TYPE_Q2_K] = sizeof(block_q2_K), + [GGML_TYPE_Q3_K] = sizeof(block_q3_K), + [GGML_TYPE_Q4_K] = sizeof(block_q4_K), + [GGML_TYPE_Q5_K] = sizeof(block_q5_K), + [GGML_TYPE_Q6_K] = sizeof(block_q6_K), + [GGML_TYPE_Q8_K] = sizeof(block_q8_K), +#endif [GGML_TYPE_I8] = sizeof(int8_t), [GGML_TYPE_I16] = sizeof(int16_t), [GGML_TYPE_I32] = sizeof(int32_t), @@ -3542,12 +3551,12 @@ static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = { [GGML_TYPE_Q5_1] = "q5_1", [GGML_TYPE_Q8_0] = "q8_0", [GGML_TYPE_Q8_1] = "q8_1", - [GGML_TYPE_Q2_K] = "q2_k", - [GGML_TYPE_Q3_K] = "q3_k", - [GGML_TYPE_Q4_K] = "q4_k", - [GGML_TYPE_Q5_K] = "q5_k", - [GGML_TYPE_Q6_K] = "q6_k", - [GGML_TYPE_Q8_K] = "q8_k", + [GGML_TYPE_Q2_K] = "q2_K", + [GGML_TYPE_Q3_K] = "q3_K", + [GGML_TYPE_Q4_K] = "q4_K", + [GGML_TYPE_Q5_K] = "q5_K", + [GGML_TYPE_Q6_K] = "q6_K", + [GGML_TYPE_Q8_K] = "q8_K", [GGML_TYPE_I8] = "i8", [GGML_TYPE_I16] = "i16", [GGML_TYPE_I32] = "i32", @@ -16249,36 +16258,38 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i block_q8_0 * block = (block_q8_0*)dst + start / QK8_0; result = ggml_quantize_q8_0(src + start, block, n, n, hist); } break; +#ifdef GGML_USE_K_QUANTS case GGML_TYPE_Q2_K: { GGML_ASSERT(start % QK_K == 0); - block_q2_k * block = (block_q2_k*)dst + start / QK_K; - result = ggml_quantize_q2_k(src + start, block, n, n, hist); + block_q2_K * block = (block_q2_K*)dst + start / QK_K; + result = ggml_quantize_q2_K(src + start, block, n, n, hist); } break; case GGML_TYPE_Q3_K: { GGML_ASSERT(start % QK_K == 0); - block_q3_k * block = (block_q3_k*)dst + start / QK_K; - result = ggml_quantize_q3_k(src + start, block, n, n, hist); + block_q3_K * block = (block_q3_K*)dst + start / QK_K; + result = ggml_quantize_q3_K(src + start, block, n, n, hist); } break; case GGML_TYPE_Q4_K: { GGML_ASSERT(start % QK_K == 0); - block_q4_k * block = (block_q4_k*)dst + start / QK_K; - result = ggml_quantize_q4_k(src + start, block, n, n, hist); + block_q4_K * block = (block_q4_K*)dst + start / QK_K; + result = ggml_quantize_q4_K(src + start, block, n, n, hist); } break; case GGML_TYPE_Q5_K: { GGML_ASSERT(start % QK_K == 0); - block_q5_k * block = (block_q5_k*)dst + start / QK_K; - result = ggml_quantize_q5_k(src + start, block, n, n, hist); + block_q5_K * block = (block_q5_K*)dst + start / QK_K; + result = ggml_quantize_q5_K(src + start, block, n, n, hist); } break; case GGML_TYPE_Q6_K: { GGML_ASSERT(start % QK_K == 0); - block_q6_k * block = (block_q6_k*)dst + start / QK_K; - result = ggml_quantize_q6_k(src + start, block, n, n, hist); + block_q6_K * block = (block_q6_K*)dst + start / QK_K; + result = ggml_quantize_q6_K(src + start, block, n, n, hist); } break; +#endif default: assert(false); } |