summaryrefslogtreecommitdiff
path: root/ggml-quants.c
diff options
context:
space:
mode:
Diffstat (limited to 'ggml-quants.c')
-rw-r--r--ggml-quants.c234
1 file changed, 233 insertions(+), 1 deletion(-)
diff --git a/ggml-quants.c b/ggml-quants.c
index 3319d2cc..6336538f 100644
--- a/ggml-quants.c
+++ b/ggml-quants.c
@@ -3754,6 +3754,26 @@ void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, in
}
}
// Non-linear 4-bit codebook for IQ4_NL: maps a 4-bit index (0..15) to its
// signed dequantized value. Values are monotonically increasing and denser
// near zero, where weight magnitudes cluster.
static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
+
+void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y, int k) {
+ assert(k % QK4_NL == 0);
+ const int nb = k / QK4_NL;
+
+ for (int i = 0; i < nb; i++) {
+
+ const uint8_t * qs = x[i].qs;
+
+ const float d = GGML_FP16_TO_FP32(x[i].d);
+ for (int j = 0; j < QK4_NL/2; ++j) {
+ y[j+ 0] = d * kvalues_iq4nl[qs[j] & 0xf];
+ y[j+QK4_NL/2] = d * kvalues_iq4nl[qs[j] >> 4];
+ }
+ y += QK4_NL;
+ qs += QK4_NL/2;
+ }
+}
+
//===================================== Q8_K ==============================================
void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k) {
@@ -9148,7 +9168,6 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
#endif
}
-// TODO
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
assert(n % QK_K == 0);
assert(nrc == 1);
@@ -9452,7 +9471,100 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
*s = sumf;
#endif
+}
+
// Dot product of n elements: one row of IQ4_NL blocks (vx) against one row of
// Q8_0 blocks (vy); the scalar result is written to *s. The bs/bx/by strides
// and nrc are part of the common vec_dot signature and are unused here — only
// nrc == 1 is supported. Dispatches to NEON, AVX2, or a scalar reference path.
void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    assert(n % QK4_NL == 0);
    // Both block types cover the same number of elements, so x and y blocks
    // can be walked in lockstep.
    static_assert(QK4_NL == QK8_0, "QK4_NL and QK8_0 must be the same");

    const block_iq4_nl * restrict x = vx;
    const block_q8_0 * restrict y = vy;

    const int nb = n / QK4_NL;

#if defined __ARM_NEON
    // NEON: decode nibbles to int8 via a 16-entry table lookup (vqtbl1q_s8 on
    // the codebook), then accumulate int8 dot products with sdot/ggml_vdotq_s32.
    // NOTE(review): this loop and the AVX2 loop below process two blocks per
    // iteration and appear to assume nb is even — confirm callers guarantee it.
    const int8x16_t values = vld1q_s8(kvalues_iq4nl);
    const uint8x16_t m4b = vdupq_n_u8(0x0f);
    uint8x16x2_t q4bits;
    int8x16x4_t q4b;
    int8x16x4_t q8b;
    int32x4_t prod_1, prod_2;
    float sumf = 0;

    for (int ib = 0; ib < nb; ib += 2) {

        q4bits.val[0] = vld1q_u8(x[ib+0].qs);
        q4bits.val[1] = vld1q_u8(x[ib+1].qs);
        q8b.val[0]    = vld1q_s8(y[ib+0].qs);
        q8b.val[1]    = vld1q_s8(y[ib+0].qs + 16);
        q8b.val[2]    = vld1q_s8(y[ib+1].qs);
        q8b.val[3]    = vld1q_s8(y[ib+1].qs + 16);

        // Low nibbles -> first half of the block, high nibbles -> second half.
        q4b.val[0] = vqtbl1q_s8(values, vandq_u8(q4bits.val[0], m4b));
        q4b.val[1] = vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[0], 4));
        q4b.val[2] = vqtbl1q_s8(values, vandq_u8(q4bits.val[1], m4b));
        q4b.val[3] = vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[1], 4));

        prod_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[0], q8b.val[0]), q4b.val[1], q8b.val[1]);
        prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]);

        // Per-block result is (x.d * y.d) * integer dot; accumulate in float.
        sumf += (float)x[ib+0].d * (float)y[ib+0].d * vaddvq_s32(prod_1) + (float)x[ib+1].d * (float)y[ib+1].d * vaddvq_s32(prod_2);

    }

    *s = sumf;

#elif defined __AVX2__

    // AVX2: decode nibbles with pshufb against the codebook, multiply with
    // the int8 q8 values (mul_add_epi8 -> int16 pairs), then madd against 1s
    // to widen the pairwise sums to int32 before the float fma accumulate.
    const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
    const __m128i m4b  = _mm_set1_epi8(0x0f);
    const __m256i mone = _mm256_set1_epi16(1);

    __m256 accum1 = _mm256_setzero_ps();
    __m256 accum2 = _mm256_setzero_ps();
    for (int ib = 0; ib < nb; ib += 2) {
        // x/y are advanced by 2 at the bottom of the loop, so [0]/[1] always
        // address the current pair of blocks.
        const __m128i q4bits_1 = _mm_loadu_si128((const __m128i*)x[0].qs);
        const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)x[1].qs);
        const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)y[0].qs);
        const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)y[1].qs);
        const __m256i q4b_1 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
                                               _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
        const __m256i q4b_2 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
                                               _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
        const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
        const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
        const __m256i p_1 = _mm256_madd_epi16(p16_1, mone);
        const __m256i p_2 = _mm256_madd_epi16(p16_2, mone);
        accum1 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[0].d)*GGML_FP16_TO_FP32(x[0].d)),
                _mm256_cvtepi32_ps(p_1), accum1);
        accum2 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[1].d)*GGML_FP16_TO_FP32(x[1].d)),
                _mm256_cvtepi32_ps(p_2), accum2);

        y += 2;
        x += 2;
    }

    *s = hsum_float_8(_mm256_add_ps(accum1, accum2));

#else
    // Scalar reference path: one block at a time, low nibbles for the first
    // half of the block, high nibbles for the second half.
    float sumf = 0;
    for (int ib = 0; ib < nb; ++ib) {
        const float d = GGML_FP16_TO_FP32(y[ib].d)*GGML_FP16_TO_FP32(x[ib].d);
        int sumi1 = 0, sumi2 = 0;
        for (int j = 0; j < QK4_NL/2; ++j) {
            sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf];
            sumi2 += y[ib].qs[j+QK4_NL/2] * kvalues_iq4nl[x[ib].qs[j] >> 4];
        }
        sumf += d * (sumi1 + sumi2);
    }
    *s = sumf;
#endif
}
// ================================ IQ2 quantization =============================================
@@ -10729,3 +10841,123 @@ size_t quantize_iq1_s(const float * src, void * dst, int nrow, int n_per_row, in
}
return nrow * nblock * sizeof(block_iq1_s);
}
+
+// ============================ 4-bit non-linear quants
+
// Return the index of the entry in the sorted table val[0..n-1] that is
// closest to x. Out-of-range inputs clamp to the first/last index; exact
// midpoints between two entries resolve to the upper index.
static inline int best_index_int8(int n, const int8_t * val, float x) {
    if (x <= val[0]) {
        return 0;
    }
    if (x >= val[n-1]) {
        return n-1;
    }
    // Binary search for the bracketing pair val[lo] <= x < val[hi].
    int lo = 0;
    int hi = n - 1;
    while (hi - lo > 1) {
        const int mid = (lo + hi)/2;
        if (x < val[mid]) {
            hi = mid;
        } else {
            lo = mid;
        }
    }
    // Pick whichever neighbor is nearer (ties go to hi).
    return (x - val[lo] < val[hi] - x) ? lo : hi;
}
+
// Quantize one QK4_NL-wide super-block of floats x into IQ4_NL form, as
// QK4_NL/block_size sub-blocks of block_size elements each.
//
// Outputs:
//   dh[ib] - fp16 scale per sub-block
//   q4     - QK4_NL/2 bytes of packed 4-bit codebook indices
// Scratch:
//   weight - block_size floats (per-element least-squares weights)
//   L      - QK4_NL bytes (unpacked indices before nibble packing)
// Inputs:
//   values        - the 16-entry codebook (kvalues_iq4nl)
//   quant_weights - optional importance weights; NULL selects x^2 weighting
static void quantize_row_iq4_nl_impl(const int block_size, const float * GGML_RESTRICT x,
        ggml_fp16_t * dh, uint8_t * q4,
        float * weight, uint8_t * L,
        const int8_t * values,
        const float * quant_weights) {

    // Number of candidate scales tried on each side of the initial guess.
    const int ntry = 7;

    // Mean-square magnitude of the super-block (x2), used to soften the
    // importance weights when quant_weights are provided.
    float sigma2 = 0;
    for (int j = 0; j < QK4_NL; ++j) sigma2 += x[j]*x[j];
    sigma2 *= 2.f/QK4_NL;

    const int nb = QK4_NL/block_size;

    memset(q4, 0, QK4_NL/2);
    for (int ib = 0; ib < nb; ++ib) {
        dh[ib] = GGML_FP32_TO_FP16(0.f);
        const float * xb = x + ib*block_size;
        if (quant_weights) {
            const float * qw = quant_weights + ib*block_size;
            for (int j = 0; j < block_size; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
        } else {
            for (int j = 0; j < block_size; ++j) weight[j] = xb[j]*xb[j];
        }
        // Find the (signed) element of largest magnitude.
        float amax = 0, max = 0;
        for (int j = 0; j < block_size; ++j) {
            float ax = fabsf(xb[j]);
            if (ax > amax) {
                amax = ax; max = xb[j];
            }
        }
        if (!amax) {
            // All-zero sub-block: scale stays 0, indices stay 0.
            continue;
        }
        // Initial scale maps the max-magnitude element onto the codebook
        // endpoint values[0] (= -127); values[0] is negative, so d has the
        // sign of max.
        float d = -max/values[0];
        float id = 1/d;
        // Weighted least-squares fit of the scale for this quantization:
        // the optimal d is sumqx/sumq2, with residual score d*sumqx.
        float sumqx = 0, sumq2 = 0;
        for (int j = 0; j < block_size; ++j) {
            float al = id*xb[j];
            int l = best_index_int8(16, values, al);
            float q = values[l];
            float w = weight[j];
            sumqx += w*q*xb[j];
            sumq2 += w*q*q;
        }
        float best_id = id;
        d = sumqx/sumq2;
        float best = d*sumqx;
        // Try 2*ntry+1 perturbed inverse scales around the initial guess and
        // keep whichever quantization gives the best weighted fit
        // (sumqx^2/sumq2 maximal, compared cross-multiplied to avoid division).
        for (int itry = -ntry; itry <= ntry; ++itry) {
            id = (itry + values[0])/max;
            sumqx = sumq2 = 0;
            for (int j = 0; j < block_size; ++j) {
                float al = id*xb[j];
                int l = best_index_int8(16, values, al);
                float q = values[l];
                float w = weight[j];
                sumqx += w*q*xb[j];
                sumq2 += w*q*q;
            }
            if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
                d = sumqx/sumq2; best = d * sumqx;
                best_id = id;
            }
        }
        dh[ib] = GGML_FP32_TO_FP16(d);
        // Re-quantize with the winning inverse scale to get the final indices.
        for (int j = 0; j < block_size; ++j) {
            L[ib*block_size + j] = best_index_int8(16, values, best_id*xb[j]);
        }
    }
    // Pack indices two per byte: within each group of 32, element j goes to
    // the low nibble and element j+16 to the high nibble of byte j.
    for (int i = 0; i < QK4_NL/32; ++i) {
        for (int j = 0; j < 16; ++j) {
            q4[16*i + j] = L[32*i + j] | (L[32*i + 16 + j] << 4);
        }
    }
}
+
+size_t quantize_iq4_nl(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
+ (void)hist;
+ GGML_ASSERT(n_per_row%QK4_NL == 0);
+ int nblock = n_per_row/QK4_NL;
+ char * qrow = (char *)dst;
+ uint8_t L[QK4_NL];
+ float weight[32];
+ for (int row = 0; row < nrow; ++row) {
+ block_iq4_nl * iq4 = (block_iq4_nl *)qrow;
+ for (int ibl = 0; ibl < nblock; ++ibl) {
+ const float * qw = quant_weights ? quant_weights + QK4_NL*ibl : NULL;
+ quantize_row_iq4_nl_impl(32, src + QK4_NL*ibl, &iq4[ibl].d, iq4[ibl].qs, weight, L, kvalues_iq4nl, qw);
+ }
+ src += n_per_row;
+ qrow += nblock*sizeof(block_iq4_nl);
+ }
+ return nrow * nblock * sizeof(block_iq4_nl);
+}
+
// Quantize k floats from x into IQ4_NL blocks at vy (void* variant of the
// row-quantization entry point); delegates to the reference implementation.
void quantize_row_iq4_nl(const float * restrict x, void * restrict vy, int k) {
    assert(k % QK4_NL == 0);
    block_iq4_nl * restrict y = vy;
    quantize_row_iq4_nl_reference(x, y, k);
}
+
// Reference row quantization: a single row of k floats, no importance
// weights (quant_weights = NULL).
void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * restrict y, int k) {
    assert(k % QK4_NL == 0);
    quantize_iq4_nl(x, y, 1, k, NULL, NULL);
}
+