diff options
Diffstat (limited to 'llama.cpp')
-rw-r--r-- | llama.cpp | 10 |
1 file changed, 9 insertions, 1 deletion
@@ -479,6 +479,7 @@ struct llama_file_loader { case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q4_2: + case GGML_TYPE_Q4_3: break; default: { throw format("unrecognized tensor type %u\n", shard.type); @@ -552,6 +553,7 @@ struct llama_file_saver { case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q4_2: + case GGML_TYPE_Q4_3: break; default: LLAMA_ASSERT(false); } @@ -841,6 +843,7 @@ static const char *llama_ftype_name(enum llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: return "mostly Q4_1, some F16"; case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2"; + case LLAMA_FTYPE_MOSTLY_Q4_3: return "mostly Q4_3"; default: return "unknown, may not work"; } } @@ -1575,6 +1578,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break; case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break; case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break; + case LLAMA_FTYPE_MOSTLY_Q4_3: quantized_type = GGML_TYPE_Q4_3; break; default: throw format("invalid output file type %d\n", ftype); }; @@ -1652,6 +1656,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s { new_size = ggml_quantize_q4_2(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data()); } break; + case GGML_TYPE_Q4_3: + { + new_size = ggml_quantize_q4_3(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data()); + } break; default: LLAMA_ASSERT(false); } @@ -1963,7 +1971,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * base_t = dest_t; } - if (base_t->type == GGML_TYPE_Q4_0 || base_t->type == GGML_TYPE_Q4_1 || base_t->type == GGML_TYPE_Q4_2) { + if (ggml_is_quantized(base_t->type)) { if (!warned) { fprintf(stderr, "%s: warning: using a lora adapter with a quantized model may result in poor quality, " "use a f16 or f32 base model with --lora-base\n", __func__); |