summaryrefslogtreecommitdiff
path: root/ggml/src/iqk/iqk_quantize.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'ggml/src/iqk/iqk_quantize.cpp')
-rw-r--r--ggml/src/iqk/iqk_quantize.cpp161
1 files changed, 101 insertions, 60 deletions
diff --git a/ggml/src/iqk/iqk_quantize.cpp b/ggml/src/iqk/iqk_quantize.cpp
index abd4be61..0384e49a 100644
--- a/ggml/src/iqk/iqk_quantize.cpp
+++ b/ggml/src/iqk/iqk_quantize.cpp
@@ -7397,7 +7397,7 @@ void dequantize_row_ms_i2s(const void * vx, float * y, int64_t k) {
}
namespace {
-template <int block_size, int group_size, int num_bits, bool is_abs = false>
+template <int block_size, int group_size, int num_bits, bool is_abs = false, bool is_int = false>
class QuantizerIQKT {
static_assert(group_size == 8 || group_size == 4);
static_assert(block_size >= 8 && block_size%8 == 0);
@@ -7408,7 +7408,7 @@ public:
constexpr static int kNg = kBlockSize/kGroupSize;
constexpr static int kNblock = kSuperBlockSize/kBlockSize;
constexpr static int kNumVal = 1 << num_bits; // i.e, 16 bits per group of 8
- constexpr static float kScale = 31.75f;
+ constexpr static float kScale = is_int ? 1.f : 31.75f;
constexpr static bool kVerbose = false;
QuantizerIQKT(int num_clusters, int num_neighbours, int offset = 4096);
@@ -7419,17 +7419,32 @@ public:
inline float find_best_inverse_scale(const float * xb, const float * weight, const int * best_idx) const;
static inline void set_values(uint32_t i, float * result, float scale, int offset = 4096) {
- constexpr uint32_t ka = 89226354;
- constexpr uint32_t kb = 64248484;
- constexpr uint32_t kmask = 0x8fff8fff;
- constexpr uint32_t km32 = 0x3b603b60;
uint32_t x = i + offset;
- for (int k = 0; k < kGroupSize; ++k) {
- x = ka*x + kb;
- uint32_t s = (x & kmask) ^ km32;
- float val = GGML_FP16_TO_FP32(s & 65535) + GGML_FP16_TO_FP32(s >> 16);
- if constexpr (is_abs) result[k] = scale*std::abs(val);
- else result[k] = scale*val;
+ if constexpr (is_int) {
+ constexpr uint32_t ka = 0xCBAC1FED;
+ uint32_t s;
+ auto i8 = (const int8_t *)&s;
+ for (int k = 0; k < kGroupSize; ++k) {
+ x = ka*x;
+ s = x & 0x3f3f3f3f;
+ if constexpr (is_abs) {
+ result[k] = scale*std::abs(i8[0] + i8[1] + i8[2] + i8[3] - 126.f);
+ } else {
+ result[k] = scale*(i8[0] + i8[1] + i8[2] + i8[3] - 126.f);
+ }
+ }
+ } else {
+ constexpr uint32_t ka = 89226354;
+ constexpr uint32_t kb = 64248484;
+ constexpr uint32_t kmask = 0x8fff8fff;
+ constexpr uint32_t km32 = 0x3b603b60;
+ for (int k = 0; k < kGroupSize; ++k) {
+ x = ka*x + kb;
+ uint32_t s = (x & kmask) ^ km32;
+ float val = GGML_FP16_TO_FP32(s & 65535) + GGML_FP16_TO_FP32(s >> 16);
+ if constexpr (is_abs) result[k] = scale*std::abs(val);
+ else result[k] = scale*val;
+ }
}
}
@@ -7478,14 +7493,15 @@ private:
float m_mid[4*kGroupSize];
};
-template <int block_size, int group_size, int num_bits, bool is_abs>
-QuantizerIQKT<block_size, group_size, num_bits, is_abs>::QuantizerIQKT(int num_clusters, int num_neighbours, int offset) {
+template <int block_size, int group_size, int num_bits, bool is_abs, bool is_int>
+QuantizerIQKT<block_size, group_size, num_bits, is_abs, is_int>::QuantizerIQKT(int num_clusters, int num_neighbours, int offset) {
m_values.resize(kNumVal*kGroupSize);
float * data = m_values.data();
for (int i = 0; i < kNumVal; ++i) {
set_values(i, data, kScale, offset);
data += kGroupSize;
}
+ if (num_clusters == 0) return;
// Make 128 clusters.
// Note: we get a slightly better result by using 64 clusters
// at the expense of almost doubling the quantization time.
@@ -7494,8 +7510,8 @@ QuantizerIQKT<block_size, group_size, num_bits, is_abs>::QuantizerIQKT(int num_c
m_in_cluster = finalize_clusters(num_neighbours, m_values, m_clusters, m_c_values);
}
-template <int block_size, int group_size, int num_bits, bool is_abs>
-std::pair<float, float> QuantizerIQKT<block_size, group_size, num_bits, is_abs>::find_best_scale(
+template <int block_size, int group_size, int num_bits, bool is_abs, bool is_int>
+std::pair<float, float> QuantizerIQKT<block_size, group_size, num_bits, is_abs, is_int>::find_best_scale(
const float * xb, const float * weight, const int * best_idx) const {
float sumqx = 0, sumq2 = 0;
#ifdef __AVX2__
@@ -7527,8 +7543,8 @@ std::pair<float, float> QuantizerIQKT<block_size, group_size, num_bits, is_abs>:
return sumq2 > 0 ? std::make_pair(sumqx/sumq2, sumqx*sumqx/sumq2) : std::make_pair(0.f, 0.f);
}
-template <int block_size, int group_size, int num_bits, bool is_abs>
-float QuantizerIQKT<block_size, group_size, num_bits, is_abs>::find_best_inverse_scale(
+template <int block_size, int group_size, int num_bits, bool is_abs, bool is_int>
+float QuantizerIQKT<block_size, group_size, num_bits, is_abs, is_int>::find_best_inverse_scale(
const float * xb, const float * weight, const int * best_idx) const {
float sumqx = 0, sumx2 = 0;
#ifdef __AVX2__
@@ -7560,8 +7576,8 @@ float QuantizerIQKT<block_size, group_size, num_bits, is_abs>::find_best_inverse
return sumx2 > 0 ? sumqx/sumx2 : 0.f;
}
-template <int block_size, int group_size, int num_bits, bool is_abs>
-void QuantizerIQKT<block_size, group_size, num_bits, is_abs>::find_best_match(float d, const float * xb, const float * weight, int * best_idx) const {
+template <int block_size, int group_size, int num_bits, bool is_abs, bool is_int>
+void QuantizerIQKT<block_size, group_size, num_bits, is_abs, is_int>::find_best_match(float d, const float * xb, const float * weight, int * best_idx) const {
if (!d) {
std::memset(best_idx, 0, kNg*sizeof(int));
return;
@@ -7739,8 +7755,8 @@ void QuantizerIQKT<block_size, group_size, num_bits, is_abs>::find_best_match(fl
#endif
}
-template <int block_size, int group_size, int num_bits, bool is_abs>
-std::vector<std::vector<int>> QuantizerIQKT<block_size, group_size, num_bits, is_abs>::finalize_clusters(int num_neighbours,
+template <int block_size, int group_size, int num_bits, bool is_abs, bool is_int>
+std::vector<std::vector<int>> QuantizerIQKT<block_size, group_size, num_bits, is_abs, is_int>::finalize_clusters(int num_neighbours,
const std::vector<float>& values, const std::vector<float>& clusters, std::vector<std::vector<float>>& c_values) {
int ncluster = clusters.size()/kGroupSize;
std::vector<std::vector<int>> p_in_cluster(ncluster);
@@ -7826,8 +7842,8 @@ std::vector<std::vector<int>> QuantizerIQKT<block_size, group_size, num_bits, is
return p_in_cluster;
}
-template <int block_size, int group_size, int num_bits, bool is_abs>
-std::vector<float> QuantizerIQKT<block_size, group_size, num_bits, is_abs>::cluster_points(const std::vector<float>& points, int ncluster, int niter, float * mid) {
+template <int block_size, int group_size, int num_bits, bool is_abs, bool is_int>
+std::vector<float> QuantizerIQKT<block_size, group_size, num_bits, is_abs, is_int>::cluster_points(const std::vector<float>& points, int ncluster, int niter, float * mid) {
constexpr int ndim = kGroupSize;
GGML_ASSERT(points.size() % ndim == 0);
int npoint = points.size() / ndim;
@@ -7995,7 +8011,7 @@ std::vector<float> QuantizerIQKT<block_size, group_size, num_bits, is_abs>::clus
// ========================================== iq2_kt ====================================================
-using QuantizerIQ2KT = QuantizerIQKT<32, 8, 16>;
+using QuantizerIQ2KT = QuantizerIQKT<32, 8, 16, false, true>;
const QuantizerIQ2KT& iq2kt_quantizer() {
static std::mutex mutex;
@@ -8006,7 +8022,7 @@ const QuantizerIQ2KT& iq2kt_quantizer() {
}
void quantize_row_iq2_kt_impl(const float * x, void * vy, int n_per_row, const float * quant_weights, float * all_scales, float * all_weights,
- float * qtmp) {
+ int * all_idx) {
constexpr float kSigmaScale = 2.0f;
using Q = QuantizerIQ2KT;
@@ -8025,6 +8041,11 @@ void quantize_row_iq2_kt_impl(const float * x, void * vy, int n_per_row, const f
Q::set_weights(kSigmaScale, nblock, x, quant_weights, all_weights);
+ float amax_row = 0;
+ for (int j = 0; j < n_per_row; ++j) {
+ amax_row = std::max(amax_row, std::abs(x[j]));
+ }
+
float amax_scale = 0, max_scale = 0;
for (int ibl = 0; ibl < nblock; ++ibl) {
@@ -8042,9 +8063,10 @@ void quantize_row_iq2_kt_impl(const float * x, void * vy, int n_per_row, const f
float ax = std::abs(xb[j]);
amax = std::max(amax, ax);
}
- quantizer.find_best_match( amax/96.f, xb, weight, best_idx);
+ float scale_0 = std::max(90.f, 124.f*amax/amax_row);
+ quantizer.find_best_match( amax/scale_0, xb, weight, best_idx);
auto [dp, score_p] = quantizer.find_best_scale(xb, weight, best_idx);
- quantizer.find_best_match(-amax/96.f, xb, weight, best_idx + Q::kNg);
+ quantizer.find_best_match(-amax/scale_0, xb, weight, best_idx + Q::kNg);
auto [dm, score_m] = quantizer.find_best_scale(xb, weight, best_idx + Q::kNg);
auto idx = best_idx;
@@ -8052,12 +8074,7 @@ void quantize_row_iq2_kt_impl(const float * x, void * vy, int n_per_row, const f
else {
scales[ib] = dm; idx += Q::kNg;
}
- auto qt = qtmp + ibl*Q::kSuperBlockSize + ib*Q::kBlockSize;
- for (int ig = 0; ig < Q::kNg; ++ig) {
- auto q = quantizer.values() + idx[ig]*Q::kGroupSize;
- for (int j = 0; j < Q::kGroupSize; ++j) qt[j] = q[j];
- qt += Q::kGroupSize;
- }
+ for (int ig = 0; ig < Q::kNg; ++ig) all_idx[(ibl*Q::kSuperBlockSize + ib*Q::kBlockSize)/Q::kGroupSize + ig] = idx[ig];
float abs_scale = std::abs(scales[ib]);
if (abs_scale > amax_scale) {
@@ -8080,20 +8097,22 @@ void quantize_row_iq2_kt_impl(const float * x, void * vy, int n_per_row, const f
float sumqx = 0, sumq2 = 0;
for (int ibl = 0; ibl < nblock; ++ibl) {
const float * xb = x + ibl*Q::kSuperBlockSize;
- const float * qb = qtmp + ibl*Q::kSuperBlockSize;
const float * wb = all_weights + ibl*Q::kSuperBlockSize;
auto scales = all_scales + ibl*Q::kNblock;
for (int ib = 0; ib < Q::kNblock; ++ib) {
int ls = best_index_iq4nl(iq4k_values, id*scales[ib]);
float dl = iq4k_values[ls];
- for (int j = 0; j < Q::kBlockSize; ++j) {
- float q = dl*qb[j];
- sumqx += wb[j]*xb[j]*q;
- sumq2 += wb[j]*q*q;
+ for (int ig = 0; ig < Q::kNg; ++ig) {
+ auto qb = quantizer.values() + Q::kGroupSize*all_idx[(ibl*Q::kSuperBlockSize + ib*Q::kBlockSize)/Q::kGroupSize + ig];
+ for (int j = 0; j < Q::kGroupSize; ++j) {
+ int jj = ig*Q::kGroupSize + j;
+ float q = dl*qb[j];
+ sumqx += wb[jj]*xb[jj]*q;
+ sumq2 += wb[jj]*q*q;
+ }
}
xb += Q::kBlockSize;
wb += Q::kBlockSize;
- qb += Q::kBlockSize;
}
}
if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
@@ -8129,6 +8148,26 @@ void quantize_row_iq2_kt_impl(const float * x, void * vy, int n_per_row, const f
float dl = d*ls;
quantizer.find_best_match(dl, xb, weight, best_idx);
+ auto prev_idx = all_idx + (ibl*Q::kSuperBlockSize + ib*Q::kBlockSize)/Q::kGroupSize;
+
+ float mse1 = 0, mse2 = 0;
+ for (int ig = 0; ig < Q::kNg; ++ig) {
+ auto q1 = quantizer.values() + Q::kGroupSize*prev_idx[ig];
+ auto q2 = quantizer.values() + Q::kGroupSize*best_idx[ig];
+ for (int j = 0; j < Q::kGroupSize; ++j) {
+ int jj = ig*Q::kGroupSize + j;
+ float diff1 = xb[jj] - dl*q1[j];
+ float diff2 = xb[jj] - dl*q2[j];
+ mse1 += weight[jj]*diff1*diff1;
+ mse2 += weight[jj]*diff2*diff2;
+ }
+ }
+ if (mse1 < mse2) {
+ for (int ig = 0; ig < Q::kNg; ++ig) best_idx[ig] = prev_idx[ig];
+ } else {
+ for (int ig = 0; ig < Q::kNg; ++ig) prev_idx[ig] = best_idx[ig];
+ }
+
for (int j = 0; j < Q::kNg; ++j) {
qs[j] = best_idx[j];
auto xl = xb + Q::kGroupSize*j;
@@ -8196,10 +8235,10 @@ size_t quantize_iq2_kt(const float * src, void * dst, int64_t nrows, int64_t n_p
auto row_size = ggml_row_size(GGML_TYPE_IQ2_KT, n_per_row);
std::vector<float> scales(n_per_row/QuantizerIQ2KT::kBlockSize);
std::vector<float> weights(n_per_row);
- std::vector<float> xtmp(n_per_row);
+ std::vector<int> idx(n_per_row/QuantizerIQ2KT::kGroupSize);
char * qrow = (char *)dst;
for (int64_t row = 0; row < nrows; ++row) {
- quantize_row_iq2_kt_impl(src, (void *)qrow, n_per_row, imatrix, scales.data(), weights.data(), xtmp.data());
+ quantize_row_iq2_kt_impl(src, (void *)qrow, n_per_row, imatrix, scales.data(), weights.data(), idx.data());
src += n_per_row;
qrow += row_size;
}
@@ -8209,7 +8248,7 @@ size_t quantize_iq2_kt(const float * src, void * dst, int64_t nrows, int64_t n_p
void dequantize_row_iq2_kt(const block_iq2_kt * x, float * y, int64_t k) {
assert(k % QuantizerIQ2KT::kSuperBlockSize == 0);
#ifdef __AVX2__
- if (iqk_dequantize_ktquants(GGML_TYPE_IQ2_KT, k, x, 0, y, 0, 1)) return;
+ //if (iqk_dequantize_ktquants(GGML_TYPE_IQ2_KT, k, x, 0, y, 0, 1)) return;
#endif
const int nb = k / QuantizerIQ2KT::kSuperBlockSize;
const float * dptr = (const float *)x;
@@ -8254,7 +8293,7 @@ void vec_dot_iq2_kt_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx
namespace {
-using QuantizerIQ3KT = QuantizerIQKT<32, 8, 16, true>;
+using QuantizerIQ3KT = QuantizerIQKT<32, 8, 16, true, true>;
const QuantizerIQ3KT& iq3kt_quantizer() {
static std::mutex mutex;
std::lock_guard<std::mutex> lock(mutex);
@@ -8465,7 +8504,7 @@ size_t quantize_iq3_kt(const float * src, void * dst, int64_t nrows, int64_t n_p
void dequantize_row_iq3_kt(const block_iq3_kt * x, float * y, int64_t k) {
#ifdef __AVX2__
- if (iqk_dequantize_ktquants(GGML_TYPE_IQ3_KT, k, x, 0, y, 0, 1)) return;
+ //if (iqk_dequantize_ktquants(GGML_TYPE_IQ3_KT, k, x, 0, y, 0, 1)) return;
#endif
using Q = QuantizerIQ3KT;
constexpr int kNumGroups = Q::kSuperBlockSize/Q::kGroupSize;
@@ -8521,7 +8560,7 @@ void vec_dot_iq3_kt_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx
namespace{
-using QuantizerIQ4KT = QuantizerIQKT<32, 4, 15>;
+using QuantizerIQ4KT = QuantizerIQKT<32, 4, 15, false, true>;
const QuantizerIQ4KT& iq4kt_quantizer(bool with_offset = false) {
static std::mutex mutex;
@@ -8536,6 +8575,14 @@ const QuantizerIQ4KT& iq4kt_quantizer(bool with_offset = false) {
return *quantizer1;
}
+const QuantizerIQ4KT& iq4kt_dequantizer() {
+ static std::mutex mutex;
+ std::lock_guard<std::mutex> lock(mutex);
+ static std::unique_ptr<QuantizerIQ4KT> dequantizer;
+ if (!dequantizer) dequantizer = std::make_unique<QuantizerIQ4KT>(0, 0, 4096);
+ return *dequantizer;
+}
+
void quantize_row_iq4_kt_impl(const float * x, void * vy, int n_per_row, const float * quant_weights, float * all_scales, float * all_weights) {
constexpr float kSigmaScale = 2.0f;
@@ -8546,7 +8593,7 @@ void quantize_row_iq4_kt_impl(const float * x, void * vy, int n_per_row, const f
float * dptr = (float *)vy;
- block_iq4_kt * y = (block_iq4_kt *)(dptr + 2);
+ block_iq4_kt * y = (block_iq4_kt *)(dptr + 1);
auto& quantizer1 = iq4kt_quantizer();
auto& quantizer2 = iq4kt_quantizer(true);
@@ -8555,13 +8602,10 @@ void quantize_row_iq4_kt_impl(const float * x, void * vy, int n_per_row, const f
Q::set_weights(kSigmaScale, nblock, x, quant_weights, all_weights);
- float amax_row = 0, row_av = 0;
+ float amax_row = 0;
for (int j = 0; j < n_per_row; ++j) {
- row_av += x[j];
amax_row = std::max(amax_row, std::abs(x[j]));
}
- row_av /= n_per_row;
- dptr[1] = row_av;
if (!amax_row) {
dptr[0] = 0.f;
std::memset(y, 0, nblock*sizeof(block_iq4_kt));
@@ -8584,7 +8628,7 @@ void quantize_row_iq4_kt_impl(const float * x, void * vy, int n_per_row, const f
const float * weight = all_weights + ibl*Q::kSuperBlockSize + ib*Q::kBlockSize;
float amax = 0;
for (int j = 0; j < Q::kBlockSize; ++j) {
- xaux[j] = xbl[ib*Q::kBlockSize+j] - row_av;
+ xaux[j] = xbl[ib*Q::kBlockSize+j];
float ax = std::abs(xaux[j]);
amax = std::max(amax, ax);
}
@@ -8593,7 +8637,7 @@ void quantize_row_iq4_kt_impl(const float * x, void * vy, int n_per_row, const f
continue;
}
float best = 0;
- float scale_0 = std::max(92.f, 127.f*amax/amax_row);
+ float scale_0 = std::max(90.f, 124.f*amax/amax_row);
for (int itry = -kNtry; itry <= kNtry; ++itry) {
quantizer1.find_best_match( amax/(8.f*itry + scale_0), xaux, weight, best_idx);
auto [dp, score_p] = quantizer1.find_best_scale(xaux, weight, best_idx);
@@ -8664,7 +8708,7 @@ void quantize_row_iq4_kt_impl(const float * x, void * vy, int n_per_row, const f
for (int ib = 0; ib < Q::kNblock; ++ib) {
auto& quantizer = y[ibl].qs[ib] & 1 ? quantizer2 : quantizer1;
const float * weight = all_weights + ibl*Q::kSuperBlockSize + ib*Q::kBlockSize;
- for (int j = 0; j < Q::kBlockSize; ++j) xaux[j] = xbl[ib*Q::kBlockSize+j] - row_av;
+ for (int j = 0; j < Q::kBlockSize; ++j) xaux[j] = xbl[ib*Q::kBlockSize+j];
int ls = nearest_int(id*scales[ib]);
ls = std::min(ls, 63);
*(uint8_t *)(shb + ib) = ((ls + 64) << 1) | (shb[ib] & 1);
@@ -8724,7 +8768,7 @@ size_t quantize_iq4_kt(const float * src, void * dst, int64_t nrows, int64_t n_p
void dequantize_row_iq4_kt(const block_iq4_kt * x, float * y, int64_t k) {
#ifdef __AVX2__
- if (iqk_dequantize_ktquants(GGML_TYPE_IQ4_KT, k, x, 0, y, 0, 1)) return;
+ //if (iqk_dequantize_ktquants(GGML_TYPE_IQ4_KT, k, x, 0, y, 0, 1)) return;
#endif
using Q = QuantizerIQ4KT;
assert(k % Q::kSuperBlockSize == 0);
@@ -8732,23 +8776,20 @@ void dequantize_row_iq4_kt(const block_iq4_kt * x, float * y, int64_t k) {
const int nb = k / Q::kSuperBlockSize;
const float * dptr = (const float *)x;
const float d = dptr[0] * Q::kScale;
- const float row_av = dptr[1];
- x = (const block_iq4_kt *)(dptr + 2);
- auto& deq = iq4kt_quantizer();
+ x = (const block_iq4_kt *)(dptr + 1);
+ auto& deq = iq4kt_dequantizer();
for (int ibl = 0; ibl < nb; ++ibl) {
auto shb = x[ibl].qs;
auto ql = (const uint8_t *)(shb + Q::kNblock);
auto qh = ql + kNumGroups;
for (int ib = 0; ib < Q::kNblock; ++ib) {
int offset = shb[ib] & 1 ? 32768 + 4096 : 4096;
- //auto& deq = shb[ib] & 1 ? deq2 : deq1;
int ls = int((shb[ib] & 0xff) >> 1) - 64;
float sl = d * ls;
for (int ig = 0; ig < Q::kNg; ++ig) {
int jj = ib*Q::kNg+ig;
uint16_t idx = ql[jj] | ((qh[jj%(kNumGroups/2)] << (8 - 4*(jj/(kNumGroups/2)))) & 0xf00) | (((shb[ib] >> (8 + 3*ig)) & 7) << 12);
deq.set_values(idx, y, sl, offset);
- for (int j = 0; j < Q::kGroupSize; ++j) y[j] += row_av;
y += Q::kGroupSize;
}
}