From 2ba83c8685177faea3399db9564f9c52df75c366 Mon Sep 17 00:00:00 2001
From: klosax <131523366+klosax@users.noreply.github.com>
Date: Sat, 26 Aug 2023 13:45:53 +0200
Subject: Fix spm whitespaces (#2806)

* llama.cpp : fix spm whitespace escaping + clean up
* main.cpp : spm - add whitespace in front of prompt
* test-tokenizer-0.cpp : spm - add whitespace in front of prompt
---
 tests/test-tokenizer-0.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'tests/test-tokenizer-0.cpp')

diff --git a/tests/test-tokenizer-0.cpp b/tests/test-tokenizer-0.cpp
index f3ee851a..7e9ac918 100644
--- a/tests/test-tokenizer-0.cpp
+++ b/tests/test-tokenizer-0.cpp
@@ -100,7 +100,8 @@ int main(int argc, char **argv) {
     bool success = true;
     for (const auto & test_kv : k_tests()) {
-        std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first, true);
+        // Add a space in front of the first character to match OG llama tokenizer behavior
+        std::vector<llama_token> res = llama_tokenize(ctx, " " + test_kv.first, true);
         fprintf(stderr, "%s : '%s' tokenized to '%s'\n", __func__, test_kv.first.c_str(), unescape_whitespace(ctx, res).c_str());
-- 
cgit v1.2.3