summaryrefslogtreecommitdiff
path: root/ggml/src/iqk/iqk_quantize.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'ggml/src/iqk/iqk_quantize.cpp')
-rw-r--r--ggml/src/iqk/iqk_quantize.cpp74
1 files changed, 71 insertions, 3 deletions
diff --git a/ggml/src/iqk/iqk_quantize.cpp b/ggml/src/iqk/iqk_quantize.cpp
index 0968becf..9b39a490 100644
--- a/ggml/src/iqk/iqk_quantize.cpp
+++ b/ggml/src/iqk/iqk_quantize.cpp
@@ -119,6 +119,54 @@ void quantize_row_iq1_bn(const float * x, void * y, int64_t k) {
quantize_iq1_bn(x, y, 1, k, nullptr);
}
+void quantize_row_iq1_tn_ref(const float * x, block_iq1_tn * y, int64_t k) {
+ quantize_iq1_tn(x, (void *)y, 1, k, nullptr);
+}
+
+void quantize_row_iq1_tn(const float * x, void * y, int64_t k) {
+ quantize_iq1_tn(x, y, 1, k, nullptr);
+}
+
+size_t quantize_iq1_tn(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
+ GGML_ASSERT(n_per_row >= 2*QK_K); // so we have space for the scale
+ int nblock = n_per_row/QK_IQ1BN;
+ float tmp[QK_IQ1BN];
+ char * qrow = (char *)dst;
+ auto row_size = ggml_row_size(GGML_TYPE_IQ1_TN, n_per_row);
+ IQ1BNQuantizer iq1bn;
+ for (int row = 0; row < nrows; ++row) {
+ float max = fabsf(src[0]);
+ for (int j = 1; j < n_per_row; ++j) max = std::max(max, fabsf(src[j]));
+ if (!(max > 0)) printf("%s: found max = %g?\n", __func__, max);
+ //GGML_ASSERT(max > 0);
+ *(ggml_half *)qrow = GGML_FP32_TO_FP16(max);
+ block_iq1_bn * y = (block_iq1_bn *)(qrow + sizeof(ggml_half));
+ const float * xb = src;
+ for (int ib = 0; ib < nblock; ++ib) {
+ for (int j = 0; j < QK_IQ1BN; ++j) tmp[j] = xb[j] < -0.5f*max ? -1 : xb[j] <= 0.5f*max ? 0 : 1;
+ iq1bn.quantize_one_row_1bn(tmp, y, QK_IQ1BN, imatrix);
+ ++y;
+ xb += QK_IQ1BN;
+ }
+ src += n_per_row;
+ qrow += row_size;
+ }
+ return nrows*row_size;
+}
+
+void dequantize_row_iq1_tn(const block_iq1_tn * x, float * y, int64_t k) {
+ float scale = GGML_FP16_TO_FP32(*(const ggml_half *)x);
+ const block_iq1_bn * iq1bn = (const block_iq1_bn *)((const char *)x + sizeof(ggml_half));
+ dequantize_row_iq1_bn(iq1bn, y, k);
+ for (int j = 0; j < int(k); ++j) y[j] *= scale;
+}
+
+void vec_dot_iq1_tn_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) {
+ float scale = GGML_FP16_TO_FP32(*(const ggml_half *)vx);
+ ggml_vec_dot_iq1_bn_q8_K64(n, s, bs, (const void *)((const char *)vx + sizeof(ggml_half)), bx, vy, by, nrc);
+ *s *= scale;
+}
+
void dequantize_row_iq1_bn(const block_iq1_bn * x, float * y, int64_t k) {
assert(k%QK_IQ1BN == 0);
int nblock = k / QK_IQ1BN;
@@ -331,8 +379,10 @@ void ggml_vec_dot_iq2_bn_q8_K64(int n, float * s, size_t bs, const void * vx, si
void quantize_row_q8_K64_ref(const float * x, block_q8_K64 * y, int64_t k) {
+ GGML_ASSERT(k >= 8*QK_IQ1BN);
+
float * dptr = (float *)y;
- auto qs = (int8_t *)(dptr + 4);
+ auto qs = (int8_t *)(dptr + 8);
#ifdef __ARM_NEON
static const uint8_t k_shuffle[16] = {0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60};
auto shuffle = vld1q_u8(k_shuffle);
@@ -351,16 +401,22 @@ void quantize_row_q8_K64_ref(const float * x, block_q8_K64 * y, int64_t k) {
vid[i] = vdupq_n_f32(id);
}
int8x16x4_t q;
+ int32x4_t qsum = {};
+ const int8x16_t m1 = vdupq_n_s8(1);
for (int j = 0; j < k; j += 16) {
for (int i = 0; i < 4; ++i) {
auto val = vld1q_f32(x + j + 4*i);
val = vmulq_f32(vid[i], val);
- q.val[i] = vreinterpretq_s8_s32(vcvtnq_s32_f32(val));
+ auto ival = vcvtnq_s32_f32(val);
+ q.val[i] = vreinterpretq_s8_s32(ival);
}
auto qi = vqtbl4q_s8(q, shuffle);
+ qsum = ggml_vdotq_s32(qsum, qi, m1);
vst1q_s8(qs, qi);
qs += 16;
}
+ auto sumf = vmulq_f32(vld1q_f32(dptr), vcvtq_f32_s32(qsum));
+ vst1q_f32(dptr + 4, sumf);
#elif defined __AVX__
__m128 max[4] = {};
__m128 sign_bit = _mm_set1_ps(-0.f);
@@ -381,6 +437,9 @@ void quantize_row_q8_K64_ref(const float * x, block_q8_K64 * y, int64_t k) {
vid[i] = _mm_set1_ps(id);
}
__m128i q[4];
+ __m128i sums = _mm_setzero_si128();
+ __m128i m1_8 = _mm_set1_epi8(1);
+ __m128i m1_16 = _mm_set1_epi16(1);
for (int j = 0; j < k; j += 16) {
for (int i = 0; i < 4; ++i) {
auto val = _mm_loadu_ps(x + j + 4*i);
@@ -390,9 +449,13 @@ void quantize_row_q8_K64_ref(const float * x, block_q8_K64 * y, int64_t k) {
auto q1 = _mm_packs_epi32(q[0], q[1]);
auto q2 = _mm_packs_epi32(q[2], q[3]);
auto qi = _mm_packs_epi16(q1, q2);
+ auto aux = _mm_maddubs_epi16(m1_8, qi);
+ sums = _mm_add_epi32(sums, _mm_madd_epi16(m1_16, aux));
_mm_storeu_si128((__m128i *)qs, qi);
qs += 16;
}
+ auto minus = _mm_mul_ps(_mm_loadu_ps(dptr), _mm_cvtepi32_ps(sums));
+ _mm_storeu_ps(dptr + 4, minus);
#else
float aux[4] = {0.f, 0.f, 0.f, 0.f};
for (int j = 0; j < k; j += 16) {
@@ -407,11 +470,16 @@ void quantize_row_q8_K64_ref(const float * x, block_q8_K64 * y, int64_t k) {
dptr[i] = aux[i]/127;
aux[i] = dptr[i] > 0 ? 1/dptr[i] : 0.f;
}
+ int32_t sum[4] = {};
for (int j = 0; j < k; j += 16) {
for (int i = 0; i < 4; ++i) {
- for (int l = 0; l < 4; ++l) qs[j+4*i+l] = nearest_int(aux[i]*x[j+4*i+l]);
+ for (int l = 0; l < 4; ++l) {
+ qs[j+4*i+l] = nearest_int(aux[i]*x[j+4*i+l]);
+ sum[i] += qs[j+4*i+l];
+ }
}
}
+ for (int i = 0; i < 4; ++i) dptr[4+i] = dptr[i]*sum[i];
#endif
}