Diffstat (limited to 'llama.cpp')
-rw-r--r-- | llama.cpp | 20
1 file changed, 18 insertions, 2 deletions
@@ -3175,6 +3175,7 @@ struct llama_model_loader {
         switch (type_max) {
             case GGML_TYPE_F32:     ftype = LLAMA_FTYPE_ALL_F32;        break;
             case GGML_TYPE_F16:     ftype = LLAMA_FTYPE_MOSTLY_F16;     break;
+            case GGML_TYPE_BF16:    ftype = LLAMA_FTYPE_MOSTLY_BF16;    break;
             case GGML_TYPE_Q4_0:    ftype = LLAMA_FTYPE_MOSTLY_Q4_0;    break;
             case GGML_TYPE_Q4_1:    ftype = LLAMA_FTYPE_MOSTLY_Q4_1;    break;
             case GGML_TYPE_Q5_0:    ftype = LLAMA_FTYPE_MOSTLY_Q5_0;    break;
@@ -3666,6 +3667,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
     switch (ftype) {
         case LLAMA_FTYPE_ALL_F32:     return "all F32";
         case LLAMA_FTYPE_MOSTLY_F16:  return "F16";
+        case LLAMA_FTYPE_MOSTLY_BF16: return "BF16";
         case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
         case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
         case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
@@ -6129,6 +6131,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
             || !(
                 model.ftype == LLAMA_FTYPE_ALL_F32 ||
                 model.ftype == LLAMA_FTYPE_MOSTLY_F16 ||
+                model.ftype == LLAMA_FTYPE_MOSTLY_BF16 ||
                 model.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
                 model.ftype == LLAMA_FTYPE_MOSTLY_Q4_1
             )
@@ -14158,13 +14161,16 @@ static void llama_tensor_dequantize_internal(
         if (qtype.to_float == NULL) {
             throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor->type)));
         }
-    } else if (tensor->type != GGML_TYPE_F16) {
+    } else if (tensor->type != GGML_TYPE_F16 &&
+               tensor->type != GGML_TYPE_BF16) {
         throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor->type)));
     }

     if (nthread < 2) {
         if (tensor->type == GGML_TYPE_F16) {
             ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor->data, f32_output, nelements);
+        } else if (tensor->type == GGML_TYPE_BF16) {
+            ggml_bf16_to_fp32_row((ggml_bf16_t *)tensor->data, f32_output, nelements);
         } else if (ggml_is_quantized(tensor->type)) {
             qtype.to_float(tensor->data, f32_output, nelements);
         } else {
@@ -14173,7 +14179,14 @@ static void llama_tensor_dequantize_internal(
         return;
     }

-    size_t block_size = tensor->type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor->type);
+    size_t block_size;
+    if (tensor->type == GGML_TYPE_F16 ||
+        tensor->type == GGML_TYPE_BF16) {
+        block_size = 1;
+    } else {
+        block_size = (size_t)ggml_blck_size(tensor->type);
+    }
+
     size_t block_size_bytes = ggml_type_size(tensor->type);

     GGML_ASSERT(nelements % block_size == 0);
@@ -14192,6 +14205,8 @@ static void llama_tensor_dequantize_internal(
     auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
         if (typ == GGML_TYPE_F16) {
             ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
+        } else if (typ == GGML_TYPE_BF16) {
+            ggml_bf16_to_fp32_row((ggml_bf16_t *)inbuf, outbuf, nels);
         } else {
             qtype.to_float(inbuf, outbuf, nels);
         }
@@ -14552,6 +14567,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_Q5_1: default_type = GGML_TYPE_Q5_1; break;
         case LLAMA_FTYPE_MOSTLY_Q8_0: default_type = GGML_TYPE_Q8_0; break;
         case LLAMA_FTYPE_MOSTLY_F16:  default_type = GGML_TYPE_F16;  break;
+        case LLAMA_FTYPE_MOSTLY_BF16: default_type = GGML_TYPE_BF16; break;
         case LLAMA_FTYPE_ALL_F32:     default_type = GGML_TYPE_F32;  break;

         // K-quants
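For context, the BF16 path above leans on ggml_bf16_to_fp32_row, and F16/BF16 get a block_size of 1 because they are plain per-element 2-byte types rather than block-quantized formats, so the multi-threaded dequantize can slice the tensor at any element boundary. The sketch below is illustrative only, not ggml's actual implementation; bf16_to_f32 and bf16_to_f32_row are hypothetical helper names. It shows why the conversion is cheap: a bfloat16 value keeps the sign, the 8 exponent bits, and the top 7 mantissa bits of an IEEE-754 float32, i.e. its upper 16 bits, so widening is a 16-bit shift.

// Illustrative sketch (assumed names, not ggml's implementation) of a
// BF16 -> F32 row conversion analogous in shape to ggml_fp16_to_fp32_row.
#include <cstdint>
#include <cstdio>
#include <cstring>

static float bf16_to_f32(uint16_t h) {
    uint32_t bits = (uint32_t)h << 16;  // restore the full float32 bit pattern
    float f;
    std::memcpy(&f, &bits, sizeof(f));
    return f;
}

// Convert one row of n bfloat16 values to float32.
static void bf16_to_f32_row(const uint16_t * x, float * y, int64_t n) {
    for (int64_t i = 0; i < n; ++i) {
        y[i] = bf16_to_f32(x[i]);
    }
}

int main() {
    // 0x3F80 is the top half of 1.0f (0x3F800000); 0xC000 of -2.0f (0xC0000000).
    const uint16_t row[2] = { 0x3F80, 0xC000 };
    float out[2];
    bf16_to_f32_row(row, out, 2);
    std::printf("%f %f\n", out[0], out[1]);  // prints 1.000000 -2.000000
    return 0;
}

Because the conversion is a pure bit widening, BF16 round-trips the dynamic range of F32 exactly (at reduced precision), which is why the patch can treat it alongside F16 everywhere a plain 16-bit float type is accepted.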