summary | refs | log | tree | commit | diff
path: root/tests/test-tokenizer-1-bpe.cpp
diff options
context:
space:
mode:
author: Georgi Gerganov <ggerganov@gmail.com> 2024-03-11 17:47:47 +0200
committer: GitHub <noreply@github.com> 2024-03-11 17:47:47 +0200
commit: 83796e62bc9f6caae6228168e359890f51e60fee (patch)
tree: 30ea49ae9b8fbde216ca164200bea1df44e18df3 /tests/test-tokenizer-1-bpe.cpp
parent: 828defefb66fc8a25404f5de845897145bf34061 (diff)
llama : refactor unicode stuff (#5992)
* llama : refactor unicode stuff
  ggml-ci
* unicode : names
* make : fix c++ compiler
* unicode : names
* unicode : straighten tables
* zig : fix build
* unicode : put nfd normalization behind API
  ggml-ci
* swift : fix build
* unicode : add BOM
* unicode : add <cstdint>
  ggml-ci
* unicode : pass as cpts as const ref
Diffstat (limited to 'tests/test-tokenizer-1-bpe.cpp')
-rw-r--r-- tests/test-tokenizer-1-bpe.cpp 4
1 files changed, 2 insertions, 2 deletions
diff --git a/tests/test-tokenizer-1-bpe.cpp b/tests/test-tokenizer-1-bpe.cpp
index 3596ce55..a0e2caf9 100644
--- a/tests/test-tokenizer-1-bpe.cpp
+++ b/tests/test-tokenizer-1-bpe.cpp
@@ -64,7 +64,7 @@ int main(int argc, char **argv) {
for (int i = 0; i < n_vocab; ++i) {
std::string str = llama_detokenize_bpe(ctx, std::vector<int>(1, i));
try {
- auto cps = codepoints_from_utf8(str);
+ auto cps = unicode_cpts_from_utf8(str);
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
std::string check = llama_detokenize_bpe(ctx, tokens);
if (check != str) {
@@ -97,7 +97,7 @@ int main(int argc, char **argv) {
continue;
}
- std::string str = codepoint_to_utf8(cp);
+ std::string str = unicode_cpt_to_utf8(cp);
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
std::string check = llama_detokenize_bpe(ctx, tokens);
if (cp != 9601 && str != check) {