From 83796e62bc9f6caae6228168e359890f51e60fee Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Mon, 11 Mar 2024 17:47:47 +0200
Subject: llama : refactor unicode stuff (#5992)

* llama : refactor unicode stuff

ggml-ci

* unicode : names

* make : fix c++ compiler

* unicode : names

* unicode : straighten tables

* zig : fix build

* unicode : put nfd normalization behind API

ggml-ci

* swift : fix build

* unicode : add BOM

* unicode : add

ggml-ci

* unicode : pass as cpts as const ref
---
 tests/test-tokenizer-1-bpe.cpp   | 4 ++--
 tests/test-tokenizer-1-llama.cpp | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'tests')

diff --git a/tests/test-tokenizer-1-bpe.cpp b/tests/test-tokenizer-1-bpe.cpp
index 3596ce55..a0e2caf9 100644
--- a/tests/test-tokenizer-1-bpe.cpp
+++ b/tests/test-tokenizer-1-bpe.cpp
@@ -64,7 +64,7 @@ int main(int argc, char **argv) {
     for (int i = 0; i < n_vocab; ++i) {
         std::string str = llama_detokenize_bpe(ctx, std::vector(1, i));
         try {
-            auto cps = codepoints_from_utf8(str);
+            auto cps = unicode_cpts_from_utf8(str);
             std::vector tokens = llama_tokenize(ctx, str, false);
             std::string check = llama_detokenize_bpe(ctx, tokens);
             if (check != str) {
@@ -97,7 +97,7 @@ int main(int argc, char **argv) {
                 continue;
             }
 
-            std::string str = codepoint_to_utf8(cp);
+            std::string str = unicode_cpt_to_utf8(cp);
             std::vector tokens = llama_tokenize(ctx, str, false);
             std::string check = llama_detokenize_bpe(ctx, tokens);
             if (cp != 9601 && str != check) {
diff --git a/tests/test-tokenizer-1-llama.cpp b/tests/test-tokenizer-1-llama.cpp
index 9333f868..8caf0b24 100644
--- a/tests/test-tokenizer-1-llama.cpp
+++ b/tests/test-tokenizer-1-llama.cpp
@@ -85,7 +85,7 @@ int main(int argc, char **argv) {
                 continue;
             }
 
-            std::string str = codepoint_to_utf8(cp);
+            std::string str = unicode_cpt_to_utf8(cp);
             std::vector tokens = llama_tokenize(ctx, str, false);
             std::string check = llama_detokenize_spm(ctx, tokens);
             if (cp != 9601 && str != check) {
--
cgit v1.2.3