summaryrefslogtreecommitdiff
path: root/llama.cpp
diff options
context:
space:
mode:
author Haoxiang Fei <tonyfettes@tonyfettes.com> 2024-05-11 16:12:06 +0800
committer GitHub <noreply@github.com> 2024-05-11 11:12:06 +0300
commit f99e1e456eaf69cc38c1982a2693ce41c0f897ef (patch)
tree f6bb7dd98afdc852fa428c77c53bf8e72fb69b5e /llama.cpp
parent 5ae3426b0b64672991563d4c28b2018b9f961467 (diff)
llama : lookup word in vocab before doing BPE merges (#7193)
* fix: llama-3 ignore_merges * test: add test for llama-3 bpe ignore_merges * fix: set ignore_merges only for llama-3 * fix: test-tokenizer-1-bpe --ignore-merges detection * fix: copy to fix fallthrough * fix: change ignore_merges to bool * fix: add ignore merges tests to cmake * llama : alternative merge ignore logic --------- Co-authored-by: Haoxiang Fei <feihaoxiang@idea.edu.cn> Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Diffstat (limited to 'llama.cpp')
-rw-r--r-- llama.cpp 14
1 files changed, 13 insertions, 1 deletions
diff --git a/llama.cpp b/llama.cpp
index cdff28cd..e91ad728 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -12253,13 +12253,14 @@ struct llm_tokenizer_bpe {
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
int final_prev_index = -1;
+ bool ignore_merges = false;
std::vector<std::string> word_collection;
switch (vocab.type) {
case LLAMA_VOCAB_TYPE_BPE:
switch (vocab.type_pre) {
case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
- case LLAMA_VOCAB_PRE_TYPE_DBRX:
+ ignore_merges = true;
word_collection = unicode_regex_split(text, {
// original regex from tokenizer.json
//"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
@@ -12268,6 +12269,12 @@ struct llm_tokenizer_bpe {
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
});
break;
+ case LLAMA_VOCAB_PRE_TYPE_DBRX:
+ word_collection = unicode_regex_split(text, {
+ // same as llama3
+ "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+ });
+ break;
case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
word_collection = unicode_regex_split(text, {
"[\r\n]",
@@ -12351,6 +12358,11 @@ struct llm_tokenizer_bpe {
int index = 0;
size_t offset = 0;
+ if (ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
+ symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
+ offset = word.size();
+ }
+
while (offset < word.size()) {
llm_symbol sym;
size_t char_len = std::min(word.size() - offset, (size_t) ::utf8_len(word[offset]));