diff options
author | jaime-m-p <167997752+jaime-m-p@users.noreply.github.com> | 2024-05-20 20:15:57 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-05-20 20:15:57 +0200 |
commit | 917dc8cfa67a72fb7c8bf7392270da3bf4833af4 (patch) | |
tree | 170c39ba79ae72036105970d92548da420699ba1 /llama.cpp | |
parent | fabf30b4c4fca32e116009527180c252919ca922 (diff) |
Tokenizer SPM fixes for phi-3 and llama-spm (#7375)
* Update brute force test: special tokens
* Fix added tokens
- Try to read 'added_tokens.json'.
- Try to read 'tokenizer_config.json'.
- Try to read 'tokenizer.json'.
* Fix special tokens rtrim
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* server : fix test regexes
Diffstat (limited to 'llama.cpp')
-rw-r--r-- | llama.cpp | 31 |
1 file changed, 27 insertions, 4 deletions
@@ -4553,7 +4553,8 @@ static void llm_load_vocab(
                         (t.first == "<|eot_id|>" ||
                          t.first == "<|im_end|>" ||
                          t.first == "<|end|>" ||
-                         t.first == "<end_of_turn>"
+                         t.first == "<end_of_turn>" ||
+                         t.first == "<|endoftext|>"
                         )
                    ) {
                     vocab.special_eot_id = t.second;
@@ -12502,6 +12503,10 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                     output.push_back(vocab.special_bos_id);
                 }
 
+                static const bool rtrim = true; //TODO: as param
+                bool is_prev_special = false;
+                bool special_token_rtrim = false;
+
                 for (const auto & fragment : fragment_buffer) {
                     if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                         // without adding this leading whitespace, we do not get the same results as the original tokenizer
@@ -12511,9 +12516,21 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                         // and passing 'add space prefix' as bool argument
                         //
                         auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
-                        if (&fragment == &fragment_buffer.front()) {
-                            if (vocab.add_space_prefix) {
-                                raw_text = " " + raw_text; // prefix with space if the first token is not special
+
+                        if (special_token_rtrim) {
+                            size_t num_whitespaces = 0;
+                            while (isspace(raw_text[num_whitespaces])) {
+                                num_whitespaces++;
+                            }
+                            if (num_whitespaces == raw_text.size()) {
+                                continue; // skip if all whitespaces
+                            }
+                            raw_text = raw_text.substr(num_whitespaces);
+                        }
+
+                        if (vocab.add_space_prefix) {
+                            if (!output.size() || is_prev_special) { // prefix with space if first token
+                                raw_text = " " + raw_text;
                             }
                         }
 
@@ -12525,6 +12542,12 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                         tokenizer.tokenize(raw_text, output);
                     } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                         output.push_back(fragment.token);
+                        is_prev_special = true;
+                        // phi-3 special tokens without rtrim, works fine for llama-spm too
+                        special_token_rtrim = rtrim
+                            && fragment.token != vocab.special_bos_id
+                            && fragment.token != vocab.special_unk_id
+                            && fragment.token != vocab.special_eos_id;
                     }
                 }