From d7e852c1bc8e85bf62a6f1aede08cd2de723404a Mon Sep 17 00:00:00 2001
From: jaime-m-p <167997752+jaime-m-p@users.noreply.github.com>
Date: Tue, 21 May 2024 14:39:48 +0200
Subject: Tokenizer SPM fixes for phi-3 and llama-spm (bugfix) (#7425)

* Update brute force test: add_special
* Update brute force test: default values for add_bos_token and add_eos_token
* Enable rtrim when pre-inserting BOS

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Revert "server : fix test regexes"
---
 llama.cpp | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'llama.cpp')

diff --git a/llama.cpp b/llama.cpp
index e2ebe175..d26fe559 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -12498,15 +12498,16 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                 // tokenizer.encode('', add_special_tokens=True)  returns [1]
                 // tokenizer.encode('', add_special_tokens=False) returns []
 
+                static const bool rtrim = true;  //TODO: as param
+                bool is_prev_special = false;
+                bool special_token_rtrim = false;
+
                 if (add_special && vocab.special_add_bos != 0) {
                     GGML_ASSERT(vocab.special_bos_id != -1);
                     output.push_back(vocab.special_bos_id);
+                    is_prev_special = true;
                 }
 
-                static const bool rtrim = true;  //TODO: as param
-                bool is_prev_special = false;
-                bool special_token_rtrim = false;
-
                 for (const auto & fragment : fragment_buffer) {
                     if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                         // without adding this leading whitespace, we do not get the same results as the original tokenizer
-- 
cgit v1.2.3