Add StableLM2 pre-tokenizer (#7349)

* Add StableLM2 pre-tokenizer
* Fix space
* Fix trailing whitespace
author: Anas Ahouzi <112881240+aahouzi@users.noreply.github.com> 2024-05-19 14:46:46 +0200
committer: GitHub <noreply@github.com> 2024-05-19 22:46:46 +1000
commit: 6aade19ee74b896c59929676629340b36be3e22c (patch)
tree: 3c9a5a15ada2e5f87c801a27552598e31d7f8add /llama.cpp
parent: ab33f7a338593f6cf1ae98b10b6f8684f63bd72c (diff)
1 file changed, 4 insertions, 0 deletions
diff --git a/llama.cpp b/llama.cpp
index 1409a05d..06ff4da6 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4464,6 +4464,9 @@ static void llm_load_vocab(
                 tokenizer_pre == "qwen2") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
             } else if (
+                tokenizer_pre == "stablelm2") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STABLELM2;
+            } else if (
                 tokenizer_pre == "olmo") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_OLMO;
             } else if (
@@ -12363,6 +12366,7 @@ struct llm_tokenizer_bpe {
                             "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                         });
                         break;
+                    case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
                     case LLAMA_VOCAB_PRE_TYPE_QWEN2:
                         word_collection = unicode_regex_split(text, {
                             // original regex from tokenizer.json
author	Anas Ahouzi <112881240+aahouzi@users.noreply.github.com>	2024-05-19 14:46:46 +0200
committer	GitHub <noreply@github.com>	2024-05-19 22:46:46 +1000
commit	6aade19ee74b896c59929676629340b36be3e22c (patch)
tree	3c9a5a15ada2e5f87c801a27552598e31d7f8add /llama.cpp
parent	ab33f7a338593f6cf1ae98b10b6f8684f63bd72c (diff)