llama : refactor unicode stuff (#5992)

* llama : refactor unicode stuff (ggml-ci)
* unicode : names
* make : fix c++ compiler
* unicode : names
* unicode : straighten tables
* zig : fix build
* unicode : put nfd normalization behind API (ggml-ci)
* swift : fix build
* unicode : add BOM
* unicode : add <cstdint> (ggml-ci)
* unicode : pass cpts as const ref
author: Georgi Gerganov <ggerganov@gmail.com> 2024-03-11 17:47:47 +0200
committer: GitHub <noreply@github.com> 2024-03-11 17:47:47 +0200
commit: 83796e62bc9f6caae6228168e359890f51e60fee (patch)
tree: 30ea49ae9b8fbde216ca164200bea1df44e18df3 /tests/test-tokenizer-1-bpe.cpp
parent: 828defefb66fc8a25404f5de845897145bf34061 (diff)
1 file changed, 2 insertions, 2 deletions
diff --git a/tests/test-tokenizer-1-bpe.cpp b/tests/test-tokenizer-1-bpe.cpp
index 3596ce55..a0e2caf9 100644
--- a/tests/test-tokenizer-1-bpe.cpp
+++ b/tests/test-tokenizer-1-bpe.cpp
@@ -64,7 +64,7 @@ int main(int argc, char **argv) {
     for (int i = 0; i < n_vocab; ++i) {
         std::string str = llama_detokenize_bpe(ctx, std::vector<int>(1, i));
         try {
-            auto cps = codepoints_from_utf8(str);
+            auto cps = unicode_cpts_from_utf8(str);
             std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
             std::string check = llama_detokenize_bpe(ctx, tokens);
             if (check != str) {
@@ -97,7 +97,7 @@ int main(int argc, char **argv) {
                         continue;
                     }
 
-                    std::string str = codepoint_to_utf8(cp);
+                    std::string str = unicode_cpt_to_utf8(cp);
                     std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
                     std::string check = llama_detokenize_bpe(ctx, tokens);
                     if (cp != 9601 && str != check) {
author	Georgi Gerganov <ggerganov@gmail.com>	2024-03-11 17:47:47 +0200
committer	GitHub <noreply@github.com>	2024-03-11 17:47:47 +0200
commit	83796e62bc9f6caae6228168e359890f51e60fee (patch)
tree	30ea49ae9b8fbde216ca164200bea1df44e18df3 /tests/test-tokenizer-1-bpe.cpp
parent	828defefb66fc8a25404f5de845897145bf34061 (diff)