summary | refs | log | tree | commit | diff
path: root/ggml/src
diff options
context:
space:
mode:
Diffstat (limited to 'ggml/src')
-rw-r--r-- ggml/src/iqk/iqk_quantize.cpp 79
-rw-r--r-- ggml/src/iqk/iqk_quantize.h 2
2 files changed, 81 insertions, 0 deletions
diff --git a/ggml/src/iqk/iqk_quantize.cpp b/ggml/src/iqk/iqk_quantize.cpp
index 3077fe21..3408d054 100644
--- a/ggml/src/iqk/iqk_quantize.cpp
+++ b/ggml/src/iqk/iqk_quantize.cpp
@@ -21,6 +21,9 @@
#include <algorithm>
#include <cstring>
#include <mutex>
+#include <thread>
+#include <atomic>
+#include <unordered_map>
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
@@ -5054,3 +5057,79 @@ void vec_dot_iq2_k_r4_q8_k(int n, float * s, size_t bs, const void * vx, size_t
GGML_UNUSED(by);
}
+namespace {
+// One entry of the repacking table used by iqk_repack_tensor(): describes how
+// tensors of a given source type are converted.
+// Field order matters: the table entries are brace-initialised positionally.
+struct Repack {
+ // Signature through which iqk_repack_tensor() invokes a repack routine:
+ // repack(nrows, n_per_row, src, dst).
+ // NOTE(review): the table casts the type-specific repack_* functions to this
+ // pointer type; this presumes their real signatures are call-compatible - confirm.
+ using repack_func = void (*) (int nrows, int n_per_row, const char * src, char * dst);
+ // Type written to tensor->type once all rows have been repacked.
+ ggml_type new_type;
+ // Number of rows handed to `repack` per call; the tensor's row count (ne[1])
+ // must be a multiple of this or the tensor is skipped.
+ int num_rows;
+ // Routine that converts `num_rows` source rows into the repacked layout.
+ repack_func repack;
+};
+}
+
+// Repack the data of `tensor` in place into the `_R4` / `_R8` / `_R16` variant of
+// its type (see the table below) and set tensor->type to that variant.
+//
+// The tensor is left completely untouched when any of the following holds:
+//   - `tensor` is null, has no data buffer, or has no rows (ne[1] <= 0);
+//   - it is not contiguous;
+//   - it is named "token_embd.weight";
+//   - ne[1] is not a multiple of 4, or ne[2]*ne[3] > 1;
+//   - its type has no entry in the table;
+//   - ne[1] is not a multiple of the entry's `num_rows`.
+//
+// Rows are processed in groups of `num_rows`; kChunk such groups form one chunk.
+// Chunks are handed out through an atomic counter to at most
+// hardware_concurrency()/2 threads (the calling thread included). All worker
+// threads are joined before the function returns.
+void iqk_repack_tensor(struct ggml_tensor * tensor) {
+    constexpr int kChunk = 8;
+    if (!tensor) return;
+    // Nothing to repack. Without this guard a zero-row tensor yields
+    // num_chunks == 0 -> nthread == 0 -> std::vector<std::thread>(-1), which throws,
+    // and a tensor without data would be passed to memcpy as a null pointer.
+    if (!tensor->data || tensor->ne[1] <= 0) return;
+    if (!ggml_is_contiguous(tensor)) return;
+    if (strncmp(tensor->name, "token_embd.weight", GGML_MAX_NAME) == 0) return;
+    if (tensor->ne[1] % 4 || tensor->ne[2]*tensor->ne[3] > 1) return;
+    // Source type -> { repacked type, rows per repack call, repack routine }.
+    static const std::unordered_map<ggml_type, Repack> k_map = {
+        { GGML_TYPE_IQ2_K,  { GGML_TYPE_IQ2_K_R4,  4,  (Repack::repack_func)repack_iq2_k}  },
+        { GGML_TYPE_IQ3_K,  { GGML_TYPE_IQ3_K_R4,  4,  (Repack::repack_func)repack_iq3_k}  },
+        { GGML_TYPE_IQ4_K,  { GGML_TYPE_IQ4_K_R4,  4,  (Repack::repack_func)repack_iq4_k}  },
+        { GGML_TYPE_IQ4_XS, { GGML_TYPE_IQ4_XS_R4, 4,  (Repack::repack_func)repack_iq4_xs} },
+        { GGML_TYPE_IQ4_NL, { GGML_TYPE_IQ4_NL_R4, 4,  (Repack::repack_func)repack_iq4_nl} },
+        { GGML_TYPE_IQ2_BN, { GGML_TYPE_IQ2_BN_R4, 4,  (Repack::repack_func)repack_iq2_bn} },
+        { GGML_TYPE_Q2_K,   { GGML_TYPE_Q2_K_R4,   4,  (Repack::repack_func)repack_q2_k}   },
+        { GGML_TYPE_Q3_K,   { GGML_TYPE_Q3_K_R4,   4,  (Repack::repack_func)repack_q3_k}   },
+        { GGML_TYPE_Q4_K,   { GGML_TYPE_Q4_K_R4,   4,  (Repack::repack_func)repack_q4_k}   },
+        { GGML_TYPE_Q5_K,   { GGML_TYPE_Q5_K_R4,   4,  (Repack::repack_func)repack_q5_k}   },
+        { GGML_TYPE_Q6_K,   { GGML_TYPE_Q6_K_R4,   4,  (Repack::repack_func)repack_q6_k}   },
+        { GGML_TYPE_Q4_0,   { GGML_TYPE_Q4_0_R4,   4,  (Repack::repack_func)repack_q4_0}   },
+        { GGML_TYPE_Q5_0,   { GGML_TYPE_Q5_0_R4,   4,  (Repack::repack_func)repack_q5_0}   },
+        { GGML_TYPE_Q6_0,   { GGML_TYPE_Q6_0_R4,   4,  (Repack::repack_func)repack_q6_0}   },
+        { GGML_TYPE_Q8_0,   { GGML_TYPE_Q8_0_R4,   4,  (Repack::repack_func)repack_q8_0}   },
+        { GGML_TYPE_Q8_K,   { GGML_TYPE_Q8_K_R8,   8,  (Repack::repack_func)repack_q8_k}   },
+#ifdef __AVX512BF16__
+        { GGML_TYPE_BF16,   { GGML_TYPE_BF16_R16, 16,  (Repack::repack_func)repack_bf16<ggml_bf16_t>} },
+#endif
+    };
+
+    auto it = k_map.find(tensor->type);
+    if (it == k_map.end()) return;
+    if (tensor->ne[1] % it->second.num_rows) return;
+
+    auto& r = it->second;
+
+    // Rows covered by one unit of work handed out by the atomic counter.
+    const int rows_per_chunk = kChunk*r.num_rows;
+
+    // hardware_concurrency() may return 0, hence the clamp to at least one thread.
+    int max_thread = std::max(1, int(std::thread::hardware_concurrency()/2));
+    int num_chunks = (tensor->ne[1] + rows_per_chunk - 1)/rows_per_chunk;
+    int nthread    = std::min(num_chunks, max_thread);   // >= 1 because ne[1] > 0
+
+    //printf("%s(%s): %s -> %s. %d rows, %d chunks, %d threads\n", __func__, tensor->name, ggml_type_name(tensor->type), ggml_type_name(r.new_type),
+    //        int(tensor->ne[1]), num_chunks, nthread);
+
+    std::atomic<int> counter(0);
+    auto compute = [&counter, &r, tensor, num_chunks, rows_per_chunk] () {
+        int nrows = tensor->ne[1];
+        int n_per_row = tensor->ne[0];
+        auto row_size = ggml_row_size(tensor->type, n_per_row);
+        // Per-thread scratch copy of one row group: the repack routine reads from
+        // the copy and writes its result over the very same rows of the tensor.
+        std::vector<char> qtmp(r.num_rows*row_size);
+        auto data = (char *)tensor->data;
+        while (true) {
+            // Claim the next unprocessed chunk; chunks never overlap between threads.
+            int chunk = counter.fetch_add(1);
+            if (chunk >= num_chunks) break;
+            int first_row = chunk*rows_per_chunk;
+            int last_row = std::min(first_row + rows_per_chunk, nrows);
+            for (int row = first_row; row < last_row; row += r.num_rows) {
+                std::memcpy(qtmp.data(), data + row*row_size, r.num_rows*row_size);
+                r.repack(r.num_rows, n_per_row, qtmp.data(), data + row*row_size);
+            }
+        }
+    };
+    // The calling thread takes part in the work, so only nthread-1 extra threads are started.
+    std::vector<std::thread> workers(nthread-1);
+    for (auto& w : workers) w = std::thread(compute);
+    compute();
+    for (auto& w : workers) w.join();
+
+    // Only retag the tensor after every row has been converted.
+    tensor->type = r.new_type;
+}
+
diff --git a/ggml/src/iqk/iqk_quantize.h b/ggml/src/iqk/iqk_quantize.h
index 8640b59b..7c568ded 100644
--- a/ggml/src/iqk/iqk_quantize.h
+++ b/ggml/src/iqk/iqk_quantize.h
@@ -173,6 +173,8 @@ void quantize_row_q8_KR8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y,
void repack_f32_bf16_r16 (const void * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row);
void repack_bf16_bf16_r16(const void * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row);
+// Repacks the data of `tensor` in place into the `_R4`/`_R8`/`_R16` variant of its
+// type and updates tensor->type. Tensors that do not qualify (null, non-contiguous,
+// unsupported type, unsuitable row count, "token_embd.weight") are left unchanged.
+void iqk_repack_tensor(struct ggml_tensor * tensor);
+
#ifdef __cplusplus
}
#endif