summary | refs | log | tree | commit | diff
path: root/tests/test-tokenizer-1-llama.cpp
diff options
context:
space:
mode:
author: Georgi Gerganov <ggerganov@gmail.com>, 2024-03-11 17:47:47 +0200
committer: GitHub <noreply@github.com>, 2024-03-11 17:47:47 +0200
commit: 83796e62bc9f6caae6228168e359890f51e60fee (patch)
tree: 30ea49ae9b8fbde216ca164200bea1df44e18df3 /tests/test-tokenizer-1-llama.cpp
parent: 828defefb66fc8a25404f5de845897145bf34061 (diff)
llama : refactor unicode stuff (#5992)
* llama : refactor unicode stuff
  ggml-ci
* unicode : names
* make : fix c++ compiler
* unicode : names
* unicode : straighten tables
* zig : fix build
* unicode : put nfd normalization behind API
  ggml-ci
* swift : fix build
* unicode : add BOM
* unicode : add <cstdint>
  ggml-ci
* unicode : pass cpts as const ref
Diffstat (limited to 'tests/test-tokenizer-1-llama.cpp')
-rw-r--r-- tests/test-tokenizer-1-llama.cpp | 2
1 file changed, 1 insertion, 1 deletion
diff --git a/tests/test-tokenizer-1-llama.cpp b/tests/test-tokenizer-1-llama.cpp
index 9333f868..8caf0b24 100644
--- a/tests/test-tokenizer-1-llama.cpp
+++ b/tests/test-tokenizer-1-llama.cpp
@@ -85,7 +85,7 @@ int main(int argc, char **argv) {
continue;
}
- std::string str = codepoint_to_utf8(cp);
+ std::string str = unicode_cpt_to_utf8(cp);
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
std::string check = llama_detokenize_spm(ctx, tokens);
if (cp != 9601 && str != check) {