diff options
author | Georgi Gerganov <ggerganov@gmail.com> | 2024-02-13 13:01:29 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-02-13 13:01:29 +0200 |
commit | 49cc1f7d67de2da99f3ac185f9ff1319b7bf35f8 (patch) | |
tree | 46e731e1f972752260cf470cc226a05ad9cfea6a /llama.cpp | |
parent | 99b8b43d7b185a6483f28cf798a2d968b2e16ca7 (diff) |
bert : add tests + fix quantization (#5475)
* llama : do not quantize pos embd and token type tensors
* ci : add BERT tests
ggml-ci
* ci : do not do BERT tests on low-perf nodes
ggml-ci
Diffstat (limited to 'llama.cpp')
-rw-r--r-- | llama.cpp | 6 |
1 file changed, 5 insertions(+), 1 deletion(-)
@@ -10444,7 +10444,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        quantize &= !params->only_copy;

        // do not quantize expert gating tensors
-       quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
+       quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_FFN_GATE_INP, "weight");
+
+       // do not quantize positional embeddings and token types (BERT)
+       quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight");
+       quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight");

        enum ggml_type new_type;
        void * new_data;