diff options
author | Kawrakow <iwankawrakow@gmail.com> | 2024-10-04 16:16:01 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-10-04 16:16:01 +0300 |
commit | fe36930c8b7fdf7a6710f7363a9a9f94c2fef9c0 (patch) | |
tree | 3d98d1f34823bd843c0340aa8e3932a120a24ad2 /ggml/src | |
parent | bc79091b0e8602a8d292c22fba0d4072456d52d0 (diff) |
Move scale fudge factors to quantization (#81)
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Diffstat (limited to 'ggml/src')
-rw-r--r-- | ggml/src/ggml-cuda/convert.cu | 4 | ||||
-rw-r--r-- | ggml/src/iqk/iqk_quantize.cpp | 6 |
2 files changed, 4 insertions, 6 deletions
diff --git a/ggml/src/ggml-cuda/convert.cu b/ggml/src/ggml-cuda/convert.cu
index 7089a6df..28b2415b 100644
--- a/ggml/src/ggml-cuda/convert.cu
+++ b/ggml/src/ggml-cuda/convert.cu
@@ -705,7 +705,7 @@ static __global__ void dequantize_block_iq2_k(const void * __restrict__ vx, dst_
  int ib128 = tid/16; // 0 or 1
  int il = tid%16; // 0...15
  dst_t * y = yy + i*QK_K + 128*ib128 + 2*il;
- const float d = (float)x[i].d * 1.025f; //1.0325f;
+ const float d = (float)x[i].d;
  const float dl1 = d * (2*((x[i].scales[4*ib128+0] >> 4*(il/8)) & 0xf) - 15);
  const float dl2 = d * (2*((x[i].scales[4*ib128+1] >> 4*(il/8)) & 0xf) - 15);
  const float dl3 = d * (2*((x[i].scales[4*ib128+2] >> 4*(il/8)) & 0xf) - 15);
@@ -730,7 +730,7 @@ static __global__ void dequantize_block_iq3_k(const void * __restrict__ vx, dst_
  int ib128 = tid/16; // 0 or 1
  int il = tid%16; // 0...15
  dst_t * y = yy + i*QK_K + 128*ib128 + 2*il;
- const float d = (float)x[i].d * 1.01f; //1.0125f;
+ const float d = (float)x[i].d;
  const uint16_t sh = x[i].scales_h >> (8*ib128 + (il/8));
  const float dl1 = d * ((2*((x[i].scales_l[4*ib128+0] >> 4*(il/8)) & 0xf) + 1) * ((sh & 0x01) ? -1 : 1));
  const float dl2 = d * ((2*((x[i].scales_l[4*ib128+1] >> 4*(il/8)) & 0xf) + 1) * ((sh & 0x04) ? -1 : 1));
diff --git a/ggml/src/iqk/iqk_quantize.cpp b/ggml/src/iqk/iqk_quantize.cpp
index 28bad18e..3ff6b4da 100644
--- a/ggml/src/iqk/iqk_quantize.cpp
+++ b/ggml/src/iqk/iqk_quantize.cpp
@@ -589,7 +589,6 @@ void quantize_row_iq2_k_impl(const float * x, void * vy, int n_per_row, const fl
  if (!max_abs_scale) continue;
  float d = max_abs_scale/15;
- y[ibl].d = GGML_FP32_TO_FP16(d);
  y[ibl].extra = extra;
  float id = 1/d;
@@ -624,7 +623,7 @@ void quantize_row_iq2_k_impl(const float * x, void * vy, int n_per_row, const fl
  }
  }
  }
- if (sumq2 > 0) y[ibl].d = GGML_FP32_TO_FP16(sumqx/sumq2);
+ y[ibl].d = GGML_FP32_TO_FP16(1.025f*(sumq2 > 0 ? sumqx/sumq2 : d));
  }
  }
@@ -854,7 +853,6 @@ static void quantize_row_iq3_k_impl(const float * x, void * vy, int n_per_row, c
  if (!max_abs_scale) continue;
  float d = max_abs_scale/31;
- y[ibl].d = GGML_FP32_TO_FP16(d);
  y[ibl].extra = extra;
  float id = 1/d;
@@ -892,7 +890,7 @@ static void quantize_row_iq3_k_impl(const float * x, void * vy, int n_per_row, c
  }
  }
  }
- if (sumq2 > 0) y[ibl].d = GGML_FP32_TO_FP16(sumqx/sumq2);
+ y[ibl].d = GGML_FP32_TO_FP16(1.01f*(sumq2 > 0 ? sumqx/sumq2 : d));
  }
  }