Diffstat (limited to 'src/llama.cpp')
-rw-r--r--  src/llama.cpp  16
1 file changed, 13 insertions, 3 deletions
diff --git a/src/llama.cpp b/src/llama.cpp
index 836fd97a..b6a4a06d 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -7650,7 +7650,7 @@ static bool llm_load_tensors(
                     layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
                 } else {
                     layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
-                    layer.ffn_exp_probs_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert} );
+                    layer.ffn_exp_probs_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, 1);

                     GGML_ASSERT(n_expert > 0);
                     GGML_ASSERT(n_expert_used > 0);
@@ -8014,6 +8014,16 @@ static bool llm_load_tensors(
         }
     }

+    if (!ml.use_mmap) {
+        int n_modified = 0;
+        for (auto& it : model.tensors_by_name) {
+            if (ggml_backend_buffer_is_host(it.second->buffer)) {
+                if (iqk_modify_tensor(it.second)) ++n_modified;
+            }
+        }
+        if (n_modified > 0) printf("============ Modified %d tensors\n", n_modified);
+    }
+
     if (!ml.use_mmap && ml.repack_tensors) {
         int n_repacked = 0;
         for (auto& it : model.tensors_by_name) {
@@ -16910,8 +16920,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             else chunk_size_multiplier = 8;
         }
         else if (new_type == GGML_TYPE_Q4_0_R4) {
-            if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q4_0;
-            else chunk_size_multiplier = 4;
+            if (tensor->ne[1] % 8 != 0) new_type = GGML_TYPE_Q4_0;
+            else chunk_size_multiplier = 8;
         }
         else if (new_type == GGML_TYPE_Q5_0_R4) {
             if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q5_0;
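The `, 1)` added in the first hunk is a flags argument to create_tensor. Reading it the way mainline llama.cpp spells the loader flags (an assumption; this tree may define them differently), the value corresponds to TENSOR_NOT_REQUIRED, meaning a GGUF file that lacks the per-expert probability bias still loads, with the member left null instead of aborting:

    // Assumed values, mirroring mainline llama_model_loader; hypothetical for this tree.
    static const int TENSOR_NOT_REQUIRED = 1; // absent tensor -> nullptr, not a load error
    static const int TENSOR_DUPLICATED   = 2; // tensor may be materialized in two buffers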
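The second hunk adds a post-load pass over model.tensors_by_name that gives iqk_modify_tensor a chance to rewrite eligible tensors in place; it is skipped under mmap, presumably to leave the file-backed mapping untouched. Below is a minimal self-contained sketch of the same pattern: the stub struct and the modify_host_tensors wrapper are hypothetical, while the two guard conditions come straight from the hunk.

    #include <cstdio>
    #include <string>
    #include <utility>
    #include <vector>

    // Minimal stand-ins for the real ggml types, just enough for the sketch.
    struct ggml_backend_buffer;
    struct ggml_tensor { ggml_backend_buffer * buffer; };

    bool ggml_backend_buffer_is_host(ggml_backend_buffer * buffer); // ggml API
    bool iqk_modify_tensor(ggml_tensor * tensor);                   // fork-specific hook

    // Hypothetical wrapper around the loop the hunk adds to llm_load_tensors().
    static int modify_host_tensors(
            const std::vector<std::pair<std::string, ggml_tensor *>> & tensors_by_name,
            bool use_mmap) {
        // Weights loaded via mmap alias the file on disk, so they are not
        // rewritten; the pass only runs when weights sit in private memory.
        if (use_mmap) return 0;
        int n_modified = 0;
        for (auto & it : tensors_by_name) {
            // Only tensors living in ordinary host memory are candidates;
            // tensors offloaded to device buffers are skipped.
            if (ggml_backend_buffer_is_host(it.second->buffer) && iqk_modify_tensor(it.second)) {
                ++n_modified;
            }
        }
        if (n_modified > 0) printf("============ Modified %d tensors\n", n_modified);
        return n_modified;
    }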
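The last hunk doubles the row-alignment requirement for GGML_TYPE_Q4_0_R4 from 4 to 8, presumably because the repacked kernels now consume eight rows at a time. During quantization, a tensor whose row count (ne[1]) is not a multiple of 8 falls back to plain Q4_0; otherwise the quantization chunk size is scaled by the interleave factor. A sketch of that selection rule, with pick_quant_type as a hypothetical helper name:

    #include <cstdint>

    // Only the two types the hunk touches; enum values are illustrative.
    enum ggml_type { GGML_TYPE_Q4_0, GGML_TYPE_Q4_0_R4 };

    struct quant_choice {
        ggml_type type;
        int       chunk_size_multiplier;
    };

    // Mirrors the fallback in llama_model_quantize_internal(): a row-interleaved
    // type is usable only when the row count divides evenly by the interleave
    // factor, and the quantization chunk size scales by that same factor.
    static quant_choice pick_quant_type(ggml_type wanted, int64_t n_rows) {
        if (wanted == GGML_TYPE_Q4_0_R4) {
            if (n_rows % 8 != 0) return { GGML_TYPE_Q4_0, 1 }; // was % 4 before this commit
            return { GGML_TYPE_Q4_0_R4, 8 };                   // multiplier was 4 before
        }
        return { wanted, 1 };
    }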