From f9a6364912fd0463fddfdbc9ef9f79fdc281570d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 8 May 2023 17:41:54 +0300 Subject: llama : require first token to be BOS (#1303) * llama : require first token to be BOS * scripts : add ppl-run-all.sh * perplexity : add BOS for each chunk * readme : update perplexity values after BOS fix * perplexity : add clarifying comments --- examples/common.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'examples/common.cpp') diff --git a/examples/common.cpp b/examples/common.cpp index f1c3bae1..6af44027 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -438,8 +438,8 @@ std::string gpt_random_prompt(std::mt19937 & rng) { // TODO: not great allocating this every time std::vector llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) { // initialize to prompt numer of chars, since n_tokens <= n_prompt_chars - std::vector res(text.size() + (int)add_bos); - int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos); + std::vector res(text.size() + (int) add_bos); + const int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos); assert(n >= 0); res.resize(n); -- cgit v1.2.3