diff options
author | Kawrakow <iwankawrakow@gmail.com> | 2025-03-21 07:23:36 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2025-03-21 07:23:36 +0100 |
commit | b8d1fac97b756968b86b470d44bb1026ded7157a (patch) | |
tree | 5a5893796293475185e833a787648830a7189450 /ggml/src/iqk/iqk_quantize.cpp | |
parent | 127c6ee6493a3084995d754d987f0240ffdffe6a (diff) |
Convert models to row-interleaved quants using the quantize tool (#272)
* Repack a model with the quantize tool
* WIP
* Fixed various issues
As we don't have a way to tell if a repacked quant has been modified,
I had to remove the modification at the expense of a slight decrease
in performance. This affects q8_0_r8, q8_KV_r8, q8_k_r8 on Zen4, and
q4_0_r8 on ARM.
* Create wk_b and wv_b as Q8_0_R8 if the wkv_b type is interleaved
* Fix GCC 13.3 compilation error
* Another one
* Add missing include
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Diffstat (limited to 'ggml/src/iqk/iqk_quantize.cpp')
-rw-r--r-- | ggml/src/iqk/iqk_quantize.cpp | 69 |
1 file changed, 52 insertions, 17 deletions
diff --git a/ggml/src/iqk/iqk_quantize.cpp b/ggml/src/iqk/iqk_quantize.cpp index fb6a5db4..5e657f4a 100644 --- a/ggml/src/iqk/iqk_quantize.cpp +++ b/ggml/src/iqk/iqk_quantize.cpp @@ -25,6 +25,7 @@ #include <thread> #include <atomic> #include <unordered_map> +#include <string> #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data @@ -6766,9 +6767,7 @@ struct Modify { modify_func_t mod_func; int nrows; }; -} - -bool iqk_modify_tensor(struct ggml_tensor * tensor) { +const Modify * get_modify_info(ggml_type type) { static const std::unordered_map<ggml_type, Modify> k_mod_map = { #ifdef __ARM_NEON { GGML_TYPE_Q4_0_R8, {modify_q4_0_r8, 8} }, @@ -6779,10 +6778,31 @@ bool iqk_modify_tensor(struct ggml_tensor * tensor) { { GGML_TYPE_Q8_KV_R8, {modify_q8_KV_r8, 8} }, #endif }; - auto it = k_mod_map.find(tensor->type); - if (it == k_mod_map.end()) return false; + auto it = k_mod_map.find(type); + return it != k_mod_map.end() ? &it->second : nullptr; +} +bool is_forbidden_tensor(const std::string& name) { + static const std::string kTokenEmbd{"token_embd.weight"}; + if (name == kTokenEmbd) return true; + //if (auto pos = name.find("attn_kv_b.weight"); pos != std::string::npos) return true; + return false; +} +} - auto& m = it->second; +bool iqk_should_modify_tensor([[maybe_unused]] const struct ggml_tensor * tensor) { + return false; + //if (is_forbidden_tensor(tensor->name)) return false; + //auto mptr = get_modify_info(tensor->type); + //return mptr ? 
true : false; +} + +bool iqk_modify_tensor(struct ggml_tensor * tensor) { + return false; + auto mptr = get_modify_info(tensor->type); + if (!mptr) return false; + if (is_forbidden_tensor(std::string{tensor->name})) return false; + + auto& m = *mptr; int nrows = ggml_nrows(tensor); int nchunks = nrows/m.nrows; int max_thread = std::max(1, int(std::thread::hardware_concurrency()/2)); @@ -6805,12 +6825,8 @@ bool iqk_modify_tensor(struct ggml_tensor * tensor) { return true; } -void iqk_repack_tensor(struct ggml_tensor * tensor) { - constexpr int kChunk = 8; - if (!tensor) return; - if (!ggml_is_contiguous(tensor)) return; - if (strncmp(tensor->name, "token_embd.weight", GGML_MAX_NAME) == 0) return; - if (tensor->ne[1] % 4) return; +namespace { +const Repack * get_repack_info(ggml_type type) { static const std::unordered_map<ggml_type, Repack> k_map = { { GGML_TYPE_IQ2_K, { GGML_TYPE_IQ2_K_R4, 4, (Repack::repack_func)repack_iq2_k} }, { GGML_TYPE_IQ3_K, { GGML_TYPE_IQ3_K_R4, 4, (Repack::repack_func)repack_iq3_k} }, @@ -6841,12 +6857,30 @@ void iqk_repack_tensor(struct ggml_tensor * tensor) { { GGML_TYPE_F16, { GGML_TYPE_BF16_R16, 16, (Repack::repack_func)repack_bf16<ggml_half>} }, #endif }; + auto it = k_map.find(type); + return it != k_map.end() ? &it->second : nullptr; +} +} + +int iqk_repacked_type(const struct ggml_tensor * tensor) { + if (!ggml_is_contiguous(tensor)) return (int)tensor->type; + if (is_forbidden_tensor(tensor->name)) return (int)tensor->type; + auto rptr = get_repack_info(tensor->type); + return rptr && tensor->ne[1] % rptr->num_rows == 0 ? 
(int)rptr->new_type : (int)tensor->type; +} + +void iqk_repack_tensor(struct ggml_tensor * tensor) { + constexpr int kChunk = 8; + if (!tensor) return; + if (!ggml_is_contiguous(tensor)) return; + if (is_forbidden_tensor(tensor->name)) return; + if (tensor->ne[1] % 4) return; - auto it = k_map.find(tensor->type); - if (it == k_map.end()) return; - if (tensor->ne[1] % it->second.num_rows) return; + auto rptr = get_repack_info(tensor->type); + if (!rptr) return; + if (tensor->ne[1] % rptr->num_rows) return; - auto& r = it->second; + auto& r = *rptr; auto nrows = ggml_nrows(tensor); @@ -6871,7 +6905,8 @@ void iqk_repack_tensor(struct ggml_tensor * tensor) { int last_row = std::min(first_row + chunkSize*r.num_rows, nrows); for (int row = first_row; row < last_row; row += r.num_rows) { std::memcpy(qtmp.data(), data + row*row_size, r.num_rows*row_size); - r.repack(r.num_rows, n_per_row, qtmp.data(), data + row*row_size, true); + //r.repack(r.num_rows, n_per_row, qtmp.data(), data + row*row_size, true); + r.repack(r.num_rows, n_per_row, qtmp.data(), data + row*row_size, false); } } }; |