summary | refs | log | tree | commit | diff
path: root/llama.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'llama.cpp')
-rw-r--r-- llama.cpp 10
1 files changed, 10 insertions, 0 deletions
diff --git a/llama.cpp b/llama.cpp
index 331c9d47..9c72d118 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4392,6 +4392,9 @@ static void llm_load_vocab(
tokenizer_pre == "command-r") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
} else if (
+ tokenizer_pre == "qwen2") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
+ } else if (
tokenizer_pre == "olmo") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_OLMO;
} else if (
@@ -12263,6 +12266,13 @@ struct llm_tokenizer_bpe {
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
});
break;
+ case LLAMA_VOCAB_PRE_TYPE_QWEN2:
+ word_collection = unicode_regex_split(text, {
+ // original regex from tokenizer.json
+ // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
+ "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+ });
+ break;
default:
// default regex for BPE tokenization pre-processing
word_collection = unicode_regex_split(text, {