From 229ffff872f8ad0d21c997d18ee7a23692ae60a0 Mon Sep 17 00:00:00 2001 From: Ren Xuancheng Date: Wed, 8 May 2024 20:06:43 +0800 Subject: llama : add BPE pre-tokenization for Qwen2 (#7114) * Add BPE pre-tokenization for Qwen2. * minor : fixes --------- Co-authored-by: Ren Xuancheng <17811943+jklj077@users.noreply.github.com> Co-authored-by: Georgi Gerganov --- llama.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'llama.cpp') diff --git a/llama.cpp b/llama.cpp index 331c9d47..9c72d118 100644 --- a/llama.cpp +++ b/llama.cpp @@ -4391,6 +4391,9 @@ static void llm_load_vocab( } else if ( tokenizer_pre == "command-r") { vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_COMMAND_R; + } else if ( + tokenizer_pre == "qwen2") { + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2; } else if ( tokenizer_pre == "olmo") { vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_OLMO; @@ -12263,6 +12266,13 @@ struct llm_tokenizer_bpe { "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", }); break; + case LLAMA_VOCAB_PRE_TYPE_QWEN2: + word_collection = unicode_regex_split(text, { + // original regex from tokenizer.json + // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" + "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", + }); + break; default: // default regex for BPE tokenization pre-processing word_collection = unicode_regex_split(text, { -- cgit v1.2.3