diff options
author | Georgi Gerganov <ggerganov@gmail.com> | 2023-05-19 22:17:18 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-05-19 22:17:18 +0300 |
commit | 2d5db48371052087a83974abda3767d1aedec598 (patch) | |
tree | ca7e6ad4b2be21d96272aece6489b2f39c444ecb /llama.cpp | |
parent | 6986c7835adc13ba3f9d933b95671bb1f3984dc6 (diff) |
ggml : use F16 instead of F32 in Q4_0, Q4_1, Q8_0 (#1508)
* ggml : use F16 instead of F32 in Q4_0, Q4_1 and Q8_0
* llama : bump LLAMA_FILE_VERSION to 3
* cuda : update Q4 and Q8 dequantize kernels
* ggml : fix AVX dot products
* readme : update performance table + hot topics
Diffstat (limited to 'llama.cpp')
-rw-r--r-- | llama.cpp | 18 |
1 file changed, 15 insertions, 3 deletions
@@ -406,6 +406,7 @@ enum llama_file_version {
     LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab
     LLAMA_FILE_VERSION_GGJT_V1, // added padding
     LLAMA_FILE_VERSION_GGJT_V2, // changed quantization format
+    LLAMA_FILE_VERSION_GGJT_V3, // changed Q4 and Q8 quantization format
 };

 struct llama_file_loader {
@@ -438,6 +439,8 @@ struct llama_file_loader {
             file_version = LLAMA_FILE_VERSION_GGJT_V1;
         } else if (magic == 'ggjt' && version == 2) {
             file_version = LLAMA_FILE_VERSION_GGJT_V2;
+        } else if (magic == 'ggjt' && version == 3) {
+            file_version = LLAMA_FILE_VERSION_GGJT_V3;
         } else {
             throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
                          magic, version);
@@ -844,7 +847,8 @@ static const char *llama_file_version_name(llama_file_version version) {
         case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)";
         case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
         case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1405)";
-        case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (latest)";
+        case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (pre #1508)";
+        case LLAMA_FILE_VERSION_GGJT_V3: return "ggjt v3 (latest)";
     }

     return "unknown";
@@ -924,11 +928,19 @@ static void llama_model_load_internal(
         fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
     }

-    if (file_version != LLAMA_FILE_VERSION_GGJT_V2) {
+    if (file_version < LLAMA_FILE_VERSION_GGJT_V2) {
         if (hparams.ftype != LLAMA_FTYPE_ALL_F32 &&
             hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 &&
             hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) {
-            throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1305)");
+            throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)");
+        }
+    }
+
+    if (file_version < LLAMA_FILE_VERSION_GGJT_V3) {
+        if (hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
+            hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_1 ||
+            hparams.ftype == LLAMA_FTYPE_MOSTLY_Q8_0) {
+            throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1508)");
         }
     }
