From 83796e62bc9f6caae6228168e359890f51e60fee Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 11 Mar 2024 17:47:47 +0200
Subject: llama : refactor unicode stuff (#5992)

* llama : refactor unicode stuff

ggml-ci

* unicode : names

* make : fix c++ compiler

* unicode : names

* unicode : straighten tables

* zig : fix build

* unicode : put nfd normalization behind API

ggml-ci

* swift : fix build

* unicode : add BOM

* unicode : add <cstdint>

ggml-ci

* unicode : pass as cpts as const ref
---
 tests/test-tokenizer-1-bpe.cpp   | 4 ++--
 tests/test-tokenizer-1-llama.cpp | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'tests')
diff --git a/tests/test-tokenizer-1-bpe.cpp b/tests/test-tokenizer-1-bpe.cpp
index 3596ce55..a0e2caf9 100644
--- a/tests/test-tokenizer-1-bpe.cpp
+++ b/tests/test-tokenizer-1-bpe.cpp
@@ -64,7 +64,7 @@ int main(int argc, char **argv) {
     for (int i = 0; i < n_vocab; ++i) {
         std::string str = llama_detokenize_bpe(ctx, std::vector<int>(1, i));
         try {
-            auto cps = codepoints_from_utf8(str);
+            auto cps = unicode_cpts_from_utf8(str);
             std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
             std::string check = llama_detokenize_bpe(ctx, tokens);
             if (check != str) {
@@ -97,7 +97,7 @@ int main(int argc, char **argv) {
                         continue;
                     }
 
-                    std::string str = codepoint_to_utf8(cp);
+                    std::string str = unicode_cpt_to_utf8(cp);
                     std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
                     std::string check = llama_detokenize_bpe(ctx, tokens);
                     if (cp != 9601 && str != check) {
diff --git a/tests/test-tokenizer-1-llama.cpp b/tests/test-tokenizer-1-llama.cpp
index 9333f868..8caf0b24 100644
--- a/tests/test-tokenizer-1-llama.cpp
+++ b/tests/test-tokenizer-1-llama.cpp
@@ -85,7 +85,7 @@ int main(int argc, char **argv) {
                         continue;
                     }
 
-                    std::string str = codepoint_to_utf8(cp);
+                    std::string str = unicode_cpt_to_utf8(cp);
                     std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
                     std::string check = llama_detokenize_spm(ctx, tokens);
                     if (cp != 9601 && str != check) {
-- 
cgit v1.2.3