summary | refs | log | tree | commit | diff
path: root/llama.cpp
diff options
context:
space:
mode:
author: jaime-m-p <167997752+jaime-m-p@users.noreply.github.com> 2024-05-21 14:39:48 +0200
committer: GitHub <noreply@github.com> 2024-05-21 14:39:48 +0200
commit: d7e852c1bc8e85bf62a6f1aede08cd2de723404a (patch)
tree: 46323a83d73f66727459aee88a995e946a78e005 /llama.cpp
parent: 917dc8cfa67a72fb7c8bf7392270da3bf4833af4 (diff)
Tokenizer SPM fixes for phi-3 and llama-spm (bugfix) (#7425)
* Update brute force test: add_special
* Update brute force test: default values for add_bos_token and add_eos_token
* Enable rtrim when pre-inserting BOS

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Revert "server : fix test regexes"
Diffstat (limited to 'llama.cpp')
-rw-r--r-- llama.cpp 9
1 files changed, 5 insertions, 4 deletions
diff --git a/llama.cpp b/llama.cpp
index e2ebe175..d26fe559 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -12498,15 +12498,16 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
// tokenizer.encode('', add_special_tokens=True) returns [1]
// tokenizer.encode('', add_special_tokens=False) returns []
+ static const bool rtrim = true; //TODO: as param
+ bool is_prev_special = false;
+ bool special_token_rtrim = false;
+
if (add_special && vocab.special_add_bos != 0) {
GGML_ASSERT(vocab.special_bos_id != -1);
output.push_back(vocab.special_bos_id);
+ is_prev_special = true;
}
- static const bool rtrim = true; //TODO: as param
- bool is_prev_special = false;
- bool special_token_rtrim = false;
-
for (const auto & fragment : fragment_buffer) {
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
// without adding this leading whitespace, we do not get the same results as the original tokenizer