From f99e1e456eaf69cc38c1982a2693ce41c0f897ef Mon Sep 17 00:00:00 2001 From: Haoxiang Fei Date: Sat, 11 May 2024 16:12:06 +0800 Subject: llama : lookup word in vocab before doing BPE merges (#7193) * fix: llama-3 ignore_merges * test: add test for llama-3 bpe ignore_merges * fix: set ignore_merges only for llama-3 * fix: test-tokenizer-1-bpe --ingore-merges detection * fix: copy to fix fallthrough * fix: change ignore_merges to bool * fix: add ignore merges tests to cmake * llama : alternative merge ignore logic --------- Co-authored-by: Haoxiang Fei Co-authored-by: Georgi Gerganov --- llama.cpp | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'llama.cpp') diff --git a/llama.cpp b/llama.cpp index cdff28cd..e91ad728 100644 --- a/llama.cpp +++ b/llama.cpp @@ -12253,13 +12253,14 @@ struct llm_tokenizer_bpe { void tokenize(const std::string & text, std::vector & output) { int final_prev_index = -1; + bool ignore_merges = false; std::vector word_collection; switch (vocab.type) { case LLAMA_VOCAB_TYPE_BPE: switch (vocab.type_pre) { case LLAMA_VOCAB_PRE_TYPE_LLAMA3: - case LLAMA_VOCAB_PRE_TYPE_DBRX: + ignore_merges = true; word_collection = unicode_regex_split(text, { // original regex from tokenizer.json //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", @@ -12268,6 +12269,12 @@ struct llm_tokenizer_bpe { "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", }); break; + case LLAMA_VOCAB_PRE_TYPE_DBRX: + word_collection = unicode_regex_split(text, { + // same as llama3 + "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", + }); + break; case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM: word_collection = unicode_regex_split(text, { "[\r\n]", @@ -12351,6 +12358,11 @@ struct llm_tokenizer_bpe { int index = 0; size_t offset = 0; + if (ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) { + symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()}); + offset = word.size(); + } + while (offset < word.size()) { llm_symbol sym; size_t char_len = std::min(word.size() - offset, (size_t) ::utf8_len(word[offset])); -- cgit v1.2.3