| author | Iwan Kawrakow <iwan.kawrakow@gmail.com> | 2024-07-19 12:29:01 +0300 |
|---|---|---|
| committer | Iwan Kawrakow <iwan.kawrakow@gmail.com> | 2024-07-19 12:29:01 +0300 |
| commit | 99119ec29c785908165946eeee55aa0c750e0f37 (patch) | |
| tree | 798ee2e6c71a05f3233d6a38ba9d17aa37bffa9c | |
| parent | 30b8bcf1a3bf232aabcbb826c7a2769dda6eafa0 (diff) | |
When tokenizer info is missing in the model, use llama3 by default
-rw-r--r-- | llama.cpp | 13
1 file changed, 10 insertions, 3 deletions
```diff
@@ -4882,14 +4882,21 @@ static void llm_load_vocab(
         // for now, only BPE models have pre-tokenizers
         if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
             if (tokenizer_pre.empty()) {
-                LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
+                //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+                // OK - I don't feel like recreating the LLaMA-v3 models. Considering that, at least for now,
+                // LLaMA-v3 is the only model where we end up here, let's just force the pre-tokenizer to be
+                // llama3.
+                tokenizer_pre = "llama3";
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
+                LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'llama3'\n", __func__);
                 LLAMA_LOG_WARN("%s:                                      \n", __func__);
                 LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
-                LLAMA_LOG_WARN("%s: GENERATION QUALITY WILL BE DEGRADED! \n", __func__);
+                LLAMA_LOG_WARN("%s: GENERATION QUALITY MAY BE DEGRADED!  \n", __func__);
                 LLAMA_LOG_WARN("%s: CONSIDER REGENERATING THE MODEL      \n", __func__);
                 LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
                 LLAMA_LOG_WARN("%s:                                      \n", __func__);
-                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+                //vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+                //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
             } else if (tokenizer_pre == "default") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
             } else if (
```
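For illustration, a minimal standalone sketch of the new fallback behavior. The enum values mirror a tiny subset of llama.cpp's `llama_vocab_pre_type`, but `pick_pre_type` is a hypothetical helper condensed from `llm_load_vocab` for this sketch, not the actual function:

```cpp
#include <cstdio>
#include <string>

// Mirrors (a tiny subset of) llama.cpp's pre-tokenizer enum.
enum llama_vocab_pre_type {
    LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0,
    LLAMA_VOCAB_PRE_TYPE_LLAMA3  = 1,
};

// Hypothetical helper condensing the patched logic: an empty
// "tokenizer.ggml.pre" metadata value now falls back to llama3
// instead of the default pre-tokenizer.
static llama_vocab_pre_type pick_pre_type(const std::string & tokenizer_pre) {
    if (tokenizer_pre.empty()) {
        fprintf(stderr, "missing pre-tokenizer type, using: 'llama3'\n");
        return LLAMA_VOCAB_PRE_TYPE_LLAMA3;
    }
    if (tokenizer_pre == "default") {
        return LLAMA_VOCAB_PRE_TYPE_DEFAULT;
    }
    // ... the real code matches many more named pre-tokenizers here ...
    return LLAMA_VOCAB_PRE_TYPE_DEFAULT;
}

int main() {
    // A GGUF file without tokenizer.ggml.pre metadata -> llama3 fallback.
    printf("empty   -> %d\n", pick_pre_type(""));
    // An explicit "default" entry still maps to the default pre-tokenizer.
    printf("default -> %d\n", pick_pre_type("default"));
    return 0;
}
```

The practical effect: older LLaMA-v3 GGUF conversions that lack the `tokenizer.ggml.pre` metadata key now get the correct pre-tokenizer without being regenerated, which is why the warning is softened from "WILL BE DEGRADED" to "MAY BE DEGRADED".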