From d7e852c1bc8e85bf62a6f1aede08cd2de723404a Mon Sep 17 00:00:00 2001
From: jaime-m-p <167997752+jaime-m-p@users.noreply.github.com>
Date: Tue, 21 May 2024 14:39:48 +0200
Subject: Tokenizer SPM fixes for phi-3 and llama-spm (bugfix) (#7425)

* Update brute force test: add_special
* Update brute force test: default values for add_bos_token and add_eos_token
* Enable rtrim when pre-inserting BOS

Co-authored-by: Georgi Gerganov

* Revert "server : fix test regexes"
---
 llama.cpp | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'llama.cpp')

diff --git a/llama.cpp b/llama.cpp
index e2ebe175..d26fe559 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -12498,15 +12498,16 @@ static std::vector llama_tokenize_internal(const llama_vocab &
     // tokenizer.encode('', add_special_tokens=True) returns [1]
     // tokenizer.encode('', add_special_tokens=False) returns []
 
+    static const bool rtrim = true; //TODO: as param
+    bool is_prev_special = false;
+    bool special_token_rtrim = false;
+
     if (add_special && vocab.special_add_bos != 0) {
         GGML_ASSERT(vocab.special_bos_id != -1);
         output.push_back(vocab.special_bos_id);
+        is_prev_special = true;
     }
 
-    static const bool rtrim = true; //TODO: as param
-    bool is_prev_special = false;
-    bool special_token_rtrim = false;
-
     for (const auto & fragment : fragment_buffer) {
         if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
             // without adding this leading whitespace, we do not get the same results as the original tokenizer
-- 
cgit v1.2.3