author     Georgi Gerganov <ggerganov@gmail.com>    2023-05-19 22:17:18 +0300
committer  GitHub <noreply@github.com>              2023-05-19 22:17:18 +0300
commit     2d5db48371052087a83974abda3767d1aedec598 (patch)
tree       ca7e6ad4b2be21d96272aece6489b2f39c444ecb /llama.cpp
parent     6986c7835adc13ba3f9d933b95671bb1f3984dc6 (diff)
ggml : use F16 instead of F32 in Q4_0, Q4_1, Q8_0 (#1508)
* ggml : use F16 instead of F32 in Q4_0, Q4_1 and Q8_0
* llama : bump LLAMA_FILE_VERSION to 3
* cuda : update Q4 and Q8 dequantize kernels
* ggml : fix AVX dot products
* readme : update performance table + hot topics
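For context on the first bullet: the change halves the per-block scale storage by switching the scalar fields in the Q4_0/Q4_1/Q8_0 block structs from float to half precision. A minimal sketch of the Q4_0 layout before and after, assuming the block definitions ggml used at the time (the names mirror ggml's, but this is an illustration, not the verbatim patch):

    #include <stdint.h>

    typedef uint16_t ggml_fp16_t;   // raw bits of an IEEE 754 half float

    #define QK4_0 32                // weights per quantization block

    // Pre-#1508 layout (sketch): F32 scale, 20 bytes per 32 weights.
    typedef struct {
        float   d;                  // delta (scale)
        uint8_t qs[QK4_0 / 2];      // 32 x 4-bit quants, two per byte
    } block_q4_0_v2;

    // Post-#1508 layout (sketch): F16 scale, 18 bytes per 32 weights.
    typedef struct {
        ggml_fp16_t d;              // delta (scale), now half precision
        uint8_t     qs[QK4_0 / 2];  // quants unchanged
    } block_q4_0_v3;

Q4_1 (which also carries a min value m) and Q8_0 get the same treatment, which is why all three formats need the file-version bump gated in the diff below.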
Diffstat (limited to 'llama.cpp')
-rw-r--r--  llama.cpp | 18
1 file changed, 15 insertions(+), 3 deletions(-)
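The 18 lines touched here are only the loader-side gating; the size win itself lives in ggml. A quick back-of-the-envelope check of what F32 -> F16 scales buy per block (the byte counts are assumptions derived from the struct sketch above, not measured from the repository):

    #include <stdio.h>

    int main(void) {
        // {format, bytes/block before, bytes/block after, weights/block}
        struct { const char *name; int before, after, weights; } fmt[] = {
            { "Q4_0", 4 + 16, 2 + 16, 32 },  // d    + 16 quant bytes
            { "Q4_1", 8 + 16, 4 + 16, 32 },  // d, m + 16 quant bytes
            { "Q8_0", 4 + 32, 2 + 32, 32 },  // d    + 32 quant bytes
        };
        for (int i = 0; i < 3; i++) {
            printf("%s: %2d -> %2d bytes/block, %.2f -> %.2f bits/weight\n",
                   fmt[i].name, fmt[i].before, fmt[i].after,
                   8.0 * fmt[i].before / fmt[i].weights,
                   8.0 * fmt[i].after  / fmt[i].weights);
        }
        return 0;
    }

For Q4_0 that is 5.00 -> 4.50 bits per weight, roughly a 10% smaller file for the same quants.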
diff --git a/llama.cpp b/llama.cpp
index 1802d231..6ebe85d0 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -406,6 +406,7 @@ enum llama_file_version {
     LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab
     LLAMA_FILE_VERSION_GGJT_V1, // added padding
     LLAMA_FILE_VERSION_GGJT_V2, // changed quantization format
+    LLAMA_FILE_VERSION_GGJT_V3, // changed Q4 and Q8 quantization format
 };
 
 struct llama_file_loader {
@@ -438,6 +439,8 @@ struct llama_file_loader {
             file_version = LLAMA_FILE_VERSION_GGJT_V1;
         } else if (magic == 'ggjt' && version == 2) {
             file_version = LLAMA_FILE_VERSION_GGJT_V2;
+        } else if (magic == 'ggjt' && version == 3) {
+            file_version = LLAMA_FILE_VERSION_GGJT_V3;
         } else {
             throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
                          magic, version);
@@ -844,7 +847,8 @@ static const char *llama_file_version_name(llama_file_version version) {
         case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)";
         case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
         case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1405)";
-        case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (latest)";
+        case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (pre #1508)";
+        case LLAMA_FILE_VERSION_GGJT_V3: return "ggjt v3 (latest)";
     }
 
     return "unknown";
@@ -924,11 +928,19 @@ static void llama_model_load_internal(
         fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
     }
 
-    if (file_version != LLAMA_FILE_VERSION_GGJT_V2) {
+    if (file_version < LLAMA_FILE_VERSION_GGJT_V2) {
         if (hparams.ftype != LLAMA_FTYPE_ALL_F32 &&
             hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 &&
             hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) {
-            throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1305)");
+            throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)");
+        }
+    }
+
+    if (file_version < LLAMA_FILE_VERSION_GGJT_V3) {
+        if (hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
+            hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_1 ||
+            hparams.ftype == LLAMA_FTYPE_MOSTLY_Q8_0) {
+            throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1508)");
         }
     }
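One detail worth noting in the last hunk: the gate changes from != LLAMA_FILE_VERSION_GGJT_V2 to <, because once v3 exists an equality test would trip on perfectly valid v3 files. The < works since enum constants take values in declaration order, so the comparison reads as "older than". A standalone illustration (the enum is copied from the first hunk; main is just a demo harness, not part of llama.cpp):

    #include <assert.h>

    // Declaration order gives GGML=0 .. GGJT_V3=4, so "<" means "older than".
    enum llama_file_version {
        LLAMA_FILE_VERSION_GGML,
        LLAMA_FILE_VERSION_GGMF_V1,
        LLAMA_FILE_VERSION_GGJT_V1,
        LLAMA_FILE_VERSION_GGJT_V2,
        LLAMA_FILE_VERSION_GGJT_V3,
    };

    int main(void) {
        enum llama_file_version v = LLAMA_FILE_VERSION_GGJT_V3;
        assert(v != LLAMA_FILE_VERSION_GGJT_V2);    // old gate: wrongly fires on v3
        assert(!(v < LLAMA_FILE_VERSION_GGJT_V2));  // new gate: v3 passes
        return 0;
    }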