summaryrefslogtreecommitdiff
path: root/common/common.cpp
diff options
context:
space:
mode:
authorKerfuffle <44031344+KerfuffleV2@users.noreply.github.com>2023-11-16 19:14:37 -0700
committerGitHub <noreply@github.com>2023-11-16 19:14:37 -0700
commit91f6499393d2d999331fbfdba47a7f8b9f913f0d (patch)
tree27caf3ad0b9cec979bb5ed3317b5334bdcd9470c /common/common.cpp
parent8da46278e1a57107591653275f8e03a281de94f0 (diff)
Respect tokenizer.ggml.add_bos_token value when tokenizing (#4040)
* gguf-py: gguf-dump: Respect --no-tensor flag in JSON mode. * Respect add_bos_token GGUF metadata value * gguf-py: Try to fix SpecialVocab giving up too easily for the Nth time
Diffstat (limited to 'common/common.cpp')
-rw-r--r--common/common.cpp6
1 files changed, 6 insertions, 0 deletions
diff --git a/common/common.cpp b/common/common.cpp
index 6a711420..e119317d 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1072,6 +1072,12 @@ std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_to
return result;
}
+// Decide whether a BOS token should be added when tokenizing for `model`.
+//
+// The value returned by llama_add_bos_token() takes precedence: any result
+// other than -1 is converted to bool and returned as-is. A result of -1 falls
+// back to checking the vocab type, returning true only for
+// LLAMA_VOCAB_TYPE_SPM.
+// NOTE(review): -1 presumably means the model's metadata does not specify
+// add_bos_token -- confirm against the llama_add_bos_token() declaration.
+bool llama_should_add_bos_token(const llama_model * model) {
+    const int bos_setting = llama_add_bos_token(model);
+
+    if (bos_setting == -1) {
+        // no explicit setting: decide from the vocab type alone
+        return llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM;
+    }
+
+    return bos_setting != 0;
+}
+
//
// YAML utils
//