author | Georgi Gerganov <ggerganov@gmail.com> | 2023-08-22 20:05:59 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-08-22 20:05:59 +0300 |
commit | deb7dfca4b9725cd295d1426db75fe8e0a6d5312 | |
tree | f36daf023af86b6005325cbb4ee80a7966255e59 /llama.cpp | |
parent | bac66994cf356cf488078c056831396eb4ce31d5 | |
gguf : add ftype meta info to the model (#2710)
* llama : add ftype meta info to the model
ggml-ci
* convert.py : add ftype when converting (does not work)
* convert.py : fix Enum to IntEnum
ggml-ci
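
For context (not part of the commit): once `general.file_type` is written, a consumer can read it back with the same gguf C API the loader uses in the diff below. A minimal sketch, assuming the gguf functions are declared in `ggml.h` at this point in the tree and using a placeholder model path:

```cpp
// Sketch only: query "general.file_type" from a GGUF file's metadata.
// gguf_find_key / gguf_get_val_u32 are the calls the loader uses in this commit;
// the file name below is a placeholder.
#include <cstdio>
#include "ggml.h"

int main() {
    struct gguf_init_params params = { /*.no_alloc =*/ true, /*.ctx =*/ NULL };
    struct gguf_context * ctx = gguf_init_from_file("model.gguf", params);
    if (ctx == NULL) {
        fprintf(stderr, "failed to load model.gguf\n");
        return 1;
    }

    const int kid = gguf_find_key(ctx, "general.file_type");
    if (kid >= 0) {
        printf("general.file_type = %u\n", (unsigned) gguf_get_val_u32(ctx, kid));
    } else {
        // older files: the key is absent and the loader falls back to a guessed ftype
        printf("general.file_type not present\n");
    }

    gguf_free(ctx);
    return 0;
}
```

Files converted or quantized before this change simply lack the key, which is exactly the case the `LLAMA_FTYPE_GUESSED` fallback covers.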
Diffstat (limited to 'llama.cpp')
-rw-r--r-- | llama.cpp | 21 |
1 file changed, 18 insertions, 3 deletions
```diff
@@ -995,6 +995,16 @@ struct llama_model_loader {
                 } break;
         }
 
+        // this is a way to mark that we have "guessed" the file type
+        ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED);
+
+        {
+            const int kid = gguf_find_key(ctx_gguf, "general.file_type");
+            if (kid >= 0) {
+                ftype = (llama_ftype) gguf_get_val_u32(ctx_gguf, kid);
+            }
+        }
+
         for (int i = 0; i < n_kv; i++) {
             const char * name           = gguf_get_key(ctx_gguf, i);
             const enum gguf_type type   = gguf_get_kv_type(ctx_gguf, i);
@@ -1197,7 +1207,11 @@ struct llama_model_loader {
 // load LLaMA models
 //
 
-const char * llama_model_ftype_name(enum llama_ftype ftype) {
+std::string llama_model_ftype_name(enum llama_ftype ftype) {
+    if (ftype & LLAMA_FTYPE_GUESSED) {
+        return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
+    }
+
     switch (ftype) {
         case LLAMA_FTYPE_ALL_F32:     return "all F32";
         case LLAMA_FTYPE_MOSTLY_F16:  return "mostly F16";
@@ -1426,7 +1440,7 @@ static void llama_model_load_internal(
     LLAMA_LOG_INFO("%s: freq_base  = %.1f\n", __func__, hparams.rope_freq_base);
     LLAMA_LOG_INFO("%s: freq_scale = %g\n",   __func__, hparams.rope_freq_scale);
     LLAMA_LOG_INFO("%s: model type = %s\n",   __func__, llama_model_type_name(model.type));
-    LLAMA_LOG_INFO("%s: model ftype = %s\n",  __func__, llama_model_ftype_name(model.ftype));
+    LLAMA_LOG_INFO("%s: model ftype = %s\n",  __func__, llama_model_ftype_name(model.ftype).c_str());
     LLAMA_LOG_INFO("%s: model size = %.2f B\n", __func__, ml->n_elements*1e-9);
 
     // general kv
@@ -3450,6 +3464,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     // copy the KV pairs from the input file
     gguf_set_kv     (ctx_out, model_loader->ctx_gguf);
     gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
+    gguf_set_val_u32(ctx_out, "general.file_type", ftype);
 
 #ifdef GGML_USE_K_QUANTS
     int n_attention_wv = 0;
@@ -4310,7 +4325,7 @@ int llama_model_n_embd(const struct llama_model * model) {
 }
 
 int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size) {
-    return snprintf(buf, buf_size, "LLaMA %s %s", llama_model_type_name(model->type), llama_model_ftype_name(model->ftype));
+    return snprintf(buf, buf_size, "LLaMA %s %s", llama_model_type_name(model->type), llama_model_ftype_name(model->ftype).c_str());
 }
 
 int llama_model_quantize(
```
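
A note on the mechanism (a sketch under assumptions, not the llama.cpp source): the "guessed" marker is a single flag bit OR-ed into the ftype, which is why `llama_model_ftype_name` now returns `std::string` (the annotated name is built at runtime) and its call sites gained `.c_str()`. The flag value below is assumed from `llama.h` at this commit, and the enum is trimmed for brevity:

```cpp
#include <cstdio>
#include <string>

// Trimmed-down illustration of the flag-bit pattern.
// LLAMA_FTYPE_GUESSED sits above every real ftype value, so it can be OR-ed in
// as a marker and masked out again without losing information.
enum llama_ftype {
    LLAMA_FTYPE_ALL_F32    = 0,
    LLAMA_FTYPE_MOSTLY_F16 = 1,
    LLAMA_FTYPE_GUESSED    = 1024, // flag: the file did not store an ftype
};

static std::string ftype_name(enum llama_ftype ftype) {
    if (ftype & LLAMA_FTYPE_GUESSED) {
        // strip the flag, name the underlying type, then annotate it
        return ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
    }
    switch (ftype) {
        case LLAMA_FTYPE_ALL_F32:    return "all F32";
        case LLAMA_FTYPE_MOSTLY_F16: return "mostly F16";
        default:                     return "unknown";
    }
}

int main() {
    // loader side: assume F16 until told otherwise, and mark it as a guess
    llama_ftype ftype = (llama_ftype) (LLAMA_FTYPE_MOSTLY_F16 | LLAMA_FTYPE_GUESSED);

    // if "general.file_type" had been present, it would overwrite the guess here

    printf("model ftype = %s\n", ftype_name(ftype).c_str()); // "mostly F16 (guessed)"
    return 0;
}
```

Using a dedicated bit keeps a guessed ftype distinguishable from an explicit `general.file_type` value without adding a second field to the model state.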