From 229ffff872f8ad0d21c997d18ee7a23692ae60a0 Mon Sep 17 00:00:00 2001
From: Ren Xuancheng <jklj077@users.noreply.github.com>
Date: Wed, 8 May 2024 20:06:43 +0800
Subject: llama : add BPE pre-tokenization for Qwen2 (#7114)

* Add BPE pre-tokenization for Qwen2.

* minor : fixes

---------

Co-authored-by: Ren Xuancheng <17811943+jklj077@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---
 llama.cpp | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'llama.cpp')

diff --git a/llama.cpp b/llama.cpp
index 331c9d47..9c72d118 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4391,6 +4391,9 @@ static void llm_load_vocab(
             } else if (
                 tokenizer_pre == "command-r") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
+            } else if (
+                tokenizer_pre == "qwen2") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
             } else if (
                 tokenizer_pre == "olmo") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_OLMO;
@@ -12263,6 +12266,13 @@ struct llm_tokenizer_bpe {
                             "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                         });
                         break;
+                    case LLAMA_VOCAB_PRE_TYPE_QWEN2:
+                        word_collection = unicode_regex_split(text, {
+                            // original regex from tokenizer.json
+                            // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
+                            "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                        });
+                        break;
                     default:
                         // default regex for BPE tokenization pre-processing
                         word_collection = unicode_regex_split(text, {
-- 
cgit v1.2.3