summary | refs | log | tree | commit | diff
path: root/tests
diff options
context:
space:
mode:
Diffstat (limited to 'tests')
-rw-r--r-- tests/CMakeLists.txt 6
-rw-r--r-- tests/test-tokenizer-0-falcon.cpp 178
-rw-r--r-- tests/test-tokenizer-0-falcon.py 83
-rw-r--r-- tests/test-tokenizer-0-llama.cpp 182
-rw-r--r-- tests/test-tokenizer-0-llama.py 95
-rw-r--r-- tests/test-tokenizer-0.cpp 141
-rw-r--r-- tests/test-tokenizer-1.cpp 14
7 files changed, 545 insertions, 154 deletions
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 2afaf86b..ca1f39d3 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -25,8 +25,10 @@ endfunction()
llama_build_and_test_executable(test-quantize-fns.cpp)
llama_build_and_test_executable(test-quantize-perf.cpp)
llama_build_and_test_executable(test-sampling.cpp)
-llama_build_executable(test-tokenizer-0.cpp)
-llama_test_executable (test-tokenizer-0.llama test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
+llama_build_executable(test-tokenizer-0-llama.cpp) # SPM tokenizer test binary
+llama_test_executable (test-tokenizer-0-llama test-tokenizer-0-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf) # registered as a test, run against the llama vocab file
+llama_build_executable(test-tokenizer-0-falcon.cpp) # BPE tokenizer test binary: built only, its test registration below is commented out
+#llama_test_executable (test-tokenizer-0-falcon test-tokenizer-0-falcon.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf) # NOTE(review): presumably disabled until models/ggml-vocab-falcon.gguf is available - confirm
llama_build_executable(test-tokenizer-1.cpp)
# test-tokenizer-1 requires a BPE vocab. re-enable when we have one.
#llama_test_executable (test-tokenizer-1.llama test-tokenizer-1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
diff --git a/tests/test-tokenizer-0-falcon.cpp b/tests/test-tokenizer-0-falcon.cpp
new file mode 100644
index 00000000..836fb8ad
--- /dev/null
+++ b/tests/test-tokenizer-0-falcon.cpp
@@ -0,0 +1,178 @@
+#include "llama.h"
+#include "common.h"
+
+#include <cstdio>
+#include <string>
+#include <map>
+#include <vector>
+#include <fstream>
+
+// generate using test-tokenizer-0-falcon.py
+// Reference tokenizations for the BPE (falcon) vocab: maps an input text to the
+// token ids the tokenizer is expected to produce for it (no BOS token).
+//
+// The table lives in a function-local static so it is built on first use.
+//
+// NOTE: every key must be unique - std::map silently drops later duplicates.
+// The whitespace-only and "...Hello" keys differ only in the NUMBER of spaces
+// (1, 2, 3 or 4), so those runs of spaces must be preserved exactly.
+// The non-ASCII keys are UTF-8 text; the emoji entry contains U+200D (zero
+// width joiner) and U+FE0F (variation selector) between/after the emojis.
+static const std::map<std::string, std::vector<llama_token>> & k_tests() {
+    static std::map<std::string, std::vector<llama_token>> _k_tests = {
+        { ""                    , { }, },
+        { " "                   , { 204, }, },
+        { "  "                  , { 258, }, },
+        { "   "                 , { 466, }, },
+        { "\t"                  , { 192, }, },
+        { "\n"                  , { 193, }, },
+        { "\t\n"                , { 19125, }, },
+        { "Hello world"         , { 9856, 1079, }, },
+        { " Hello world"        , { 23090, 1079, }, },
+        { "Hello World"         , { 9856, 2889, }, },
+        { " Hello World"        , { 23090, 2889, }, },
+        { " Hello World!"       , { 23090, 2889, 12, }, },
+        { "Hello, world!"       , { 9856, 23, 1079, 12, }, },
+        { " Hello, world!"      , { 23090, 23, 1079, 12, }, },
+        { " this is 🦙.cpp"      , { 414, 304, 3346, 111, 231, 25, 29247, }, },
+        { "w048 7tuijk dsdfhu"  , { 98, 55866, 204, 34, 16682, 7149, 36190, 6869, 11481, }, },
+        { "нещо на Български"   , { 150, 133, 6207, 151, 215, 150, 134, 5052, 133, 6279, 5052, 223, 151, 216, 49679, 123, 53110, 47043, 7795, }, },
+        { "កាន់តែពិសេសអាចខលចេញ" , { 38154, 206, 38154, 126, 38154, 225, 167, 237, 217, 38154, 221, 167, 237, 208, 38154, 228, 38154, 127, 38154, 237, 167, 237, 207, 38154, 237, 38154, 107, 38154, 126, 38154, 211, 38154, 207, 38154, 233, 38154, 211, 167, 237, 207, 38154, 215, }, },
+        { "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)", { 2571, 232, 206, 204, 19, 11003, 20, 8196, 126, 283, 219, 48778, 116, 13392, 204, 19, 51831, 732, 63209, 1741, 7955, 522, 20, 22438, 211, 204, 19, 7927, 53360, 325, 504, 701, 946, 10930, 20, }, },
+        { "Hello"               , { 9856, }, },
+        { " Hello"              , { 23090, }, },
+        { "  Hello"             , { 204, 23090, }, },
+        { "   Hello"            , { 258, 23090, }, },
+        { "    Hello"           , { 466, 23090, }, },
+        { "    Hello\n    Hello", { 466, 23090, 742, 23090, }, },
+    };
+
+    return _k_tests;
+}
+
+// Tokenizer regression test for BPE vocabularies.
+//
+// usage: <prog> vocab-file [text-file]
+//
+// Loads the vocab, tokenizes every key of k_tests() and compares the resulting
+// ids with the reference values. If text-file is given, its whole content is
+// tokenized as well and the ids are written to "<text-file>.tokcpp".
+//
+// exit codes: 0 - all reference tests passed
+//             1 - usage error, vocab load failure or file I/O failure
+//             2 - the loaded vocab is not a BPE vocab
+//             3 - at least one reference test failed
+int main(int argc, char **argv) {
+    if (argc < 2) {
+        fprintf(stderr, "Usage: %s vocab-file [text-file]\n", argv[0]);
+        return 1;
+    }
+
+    const std::string fname = argv[1];
+
+    std::string fname_text;
+    if (argc > 2) {
+        fname_text = argv[2];
+    }
+
+    fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
+
+    llama_model * model;
+    llama_context * ctx;
+
+    llama_backend_init(false);
+
+    // load the vocab
+    {
+        auto lparams = llama_context_default_params();
+
+        lparams.vocab_only = true;
+
+        model = llama_load_model_from_file(fname.c_str(), lparams);
+
+        if (model == NULL) {
+            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
+            llama_backend_free();
+            return 1;
+        }
+
+        ctx = llama_new_context_with_model(model, lparams);
+
+        if (ctx == NULL) {
+            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
+            llama_free_model(model);
+            llama_backend_free();
+            return 1;
+        }
+    }
+
+    // releases everything acquired above; called on every exit path from here on
+    // (the context is freed before the model it was created from)
+    const auto free_all = [&]() {
+        llama_free(ctx);
+        llama_free_model(model);
+        llama_backend_free();
+    };
+
+    if (llama_vocab_type(ctx) != LLAMA_VOCAB_TYPE_BPE) {
+        fprintf(stderr, "%s : error: vocab type is not BPE\n", __func__);
+        free_all();
+        return 2;
+    }
+
+    bool success = true;
+
+    for (const auto & test_kv : k_tests()) {
+        const std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first, false);
+        const std::string res_str = llama_detokenize_bpe(ctx, res);
+
+        printf("\n");
+        printf("src: '%s'\n", test_kv.first.c_str());
+        printf("res: '%s'\n", res_str.c_str());
+        printf("tok: ");
+        for (const auto & tok : res) {
+            printf("%d ", tok);
+        }
+        printf("\n");
+
+        // vector equality checks both the length and every element
+        const bool correct = res == test_kv.second;
+
+        if (!correct) {
+            fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str());
+            fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__,
+                res_str.c_str(),
+                llama_detokenize_bpe(ctx, test_kv.second).c_str());
+            fprintf(stderr, "%s : expected tokens: ", __func__);
+            for (const auto & t : test_kv.second) {
+                fprintf(stderr, "%6d, ", t);
+            }
+            fprintf(stderr, "\n");
+            fprintf(stderr, "%s : got tokens: ", __func__);
+            for (const auto & t : res) {
+                fprintf(stderr, "%6d, ", t);
+            }
+            fprintf(stderr, "\n");
+
+            // keep going so that every failing entry is reported
+            success = false;
+        }
+    }
+
+    if (!fname_text.empty()) {
+        fprintf(stderr, "%s : tokenizing: '%s'\n", __func__, fname_text.c_str());
+
+        std::string text;
+        {
+            std::ifstream ifs(fname_text);
+            if (!ifs) {
+                fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_text.c_str());
+                free_all();
+                return 1;
+            }
+            // slurp the whole file into memory
+            text = std::string(std::istreambuf_iterator<char>(ifs), std::istreambuf_iterator<char>());
+        }
+
+        fprintf(stderr, "%s : text size: %zu\n", __func__, text.size());
+
+        const std::vector<llama_token> res = llama_tokenize(ctx, text, true);
+
+        fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size());
+
+        const std::string fname_out = fname_text + ".tokcpp";
+
+        {
+            std::ofstream ofs(fname_out);
+            if (!ofs) {
+                fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str());
+                free_all();
+                return 1;
+            }
+
+            // one line of space-separated ids
+            for (const auto & tok : res) {
+                ofs << tok << " ";
+            }
+
+            ofs << "\n";
+        }
+
+        fprintf(stderr, "%s : tokens written to '%s'\n", __func__, fname_out.c_str());
+    }
+
+    free_all();
+
+    return success ? 0 : 3;
+}
diff --git a/tests/test-tokenizer-0-falcon.py b/tests/test-tokenizer-0-falcon.py
new file mode 100644
index 00000000..9c8c1c7d
--- /dev/null
+++ b/tests/test-tokenizer-0-falcon.py
@@ -0,0 +1,83 @@
+# tests with BPE tokenizer
+
+import os
+import sys
+import argparse
+
+from transformers import AutoTokenizer
+
+parser = argparse.ArgumentParser()
+parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
+parser.add_argument("--fname-tok", help="path to a text file to tokenize")
+args = parser.parse_args()
+
+dir_tokenizer = args.dir_tokenizer
+
+tokenizer = AutoTokenizer.from_pretrained(dir_tokenizer)
+
+tests = [
+ "",
+ " ",
+ " ",
+ " ",
+ "\t",
+ "\n",
+ "\t\n",
+ "Hello world",
+ " Hello world",
+ "Hello World",
+ " Hello World",
+ " Hello World!",
+ "Hello, world!",
+ " Hello, world!",
+ " this is πŸ¦™.cpp",
+ "w048 7tuijk dsdfhu",
+ "Π½Π΅Ρ‰ΠΎ Π½Π° Π‘ΡŠΠ»Π³Π°Ρ€ΡΠΊΠΈ",
+ "αž€αžΆαž“αŸ‹αžαŸ‚αž–αž·αžŸαŸαžŸαž’αžΆαž…αžαž›αž…αŸαž‰",
+ "πŸš€ (normal) πŸ˜Άβ€πŸŒ«οΈ (multiple emojis concatenated) βœ… (only emoji that has its own token)",
+ "Hello",
+ " Hello",
+ " Hello",
+ " Hello",
+ " Hello",
+ " Hello\n Hello",
+ ]
+
+for text in tests:
+ print('text: ', text)
+ print(tokenizer.encode(text))
+ print(tokenizer.decode(tokenizer.encode(text)))
+
+print("\n\ntests for C++:\n")
+for text in tests:
+ res = tokenizer.encode(text)
+
+ k = text.replace('\n', '\\n')
+ k = k.replace('\t', '\\t')
+ k = '"' + k + '"'
+ print("{ %-24s, { " % k, end='')
+ for x in res:
+ print("%7d," % x, end='')
+ print(" }, },")
+
+print(tokenizer.encode('hello'))
+print(tokenizer.encode('world'))
+print(tokenizer.encode(' world'))
+print(tokenizer.encode('hello world'))
+
+fname_tok = args.fname_tok
+if fname_tok:
+ print('tokenizing file: ', fname_tok)
+ fname_out = fname_tok + '.tok'
+ with open(fname_tok, 'r') as f:
+ lines = f.readlines()
+ s = ''.join(lines)
+ res = tokenizer.encode(s)
+ # write to file
+ with open(fname_out, 'w') as f:
+ for x in res:
+ f.write(str(x) + ' ')
+ f.write('\n')
+ print('len(res): ', len(res))
+ print('len(lines): ', len(lines))
+ print('results written to: ', fname_out)
diff --git a/tests/test-tokenizer-0-llama.cpp b/tests/test-tokenizer-0-llama.cpp
new file mode 100644
index 00000000..8630742c
--- /dev/null
+++ b/tests/test-tokenizer-0-llama.cpp
@@ -0,0 +1,182 @@
+#include "llama.h"
+#include "common.h"
+
+#include <cstdio>
+#include <string>
+#include <map>
+#include <vector>
+#include <fstream>
+
+// generate using test-tokenizer-0-llama.py
+// Reference tokenizations for the SPM (llama) vocab: maps an input text to the
+// token ids the tokenizer is expected to produce for it WITHOUT a BOS token.
+//
+// The table lives in a function-local static so it is built on first use.
+//
+// NOTE: every key must be unique - std::map silently drops later duplicates.
+// The whitespace-only and "...Hello" keys differ only in the NUMBER of spaces
+// (1, 2, 3 or 4), so those runs of spaces must be preserved exactly.
+// The non-ASCII keys are UTF-8 text; the emoji entry contains U+200D (zero
+// width joiner) and U+FE0F (variation selector) between/after the emojis.
+static const std::map<std::string, std::vector<llama_token>> & k_tests() {
+    static std::map<std::string, std::vector<llama_token>> _k_tests = {
+        { ""                    , { }, },
+        { " "                   , { 259, }, },
+        { "  "                  , { 1678, }, },
+        { "   "                 , { 268, }, },
+        { "\t"                  , { 29871, 12, }, },
+        { "\n"                  , { 29871, 13, }, },
+        { "\t\n"                , { 29871, 12, 13, }, },
+        { "Hello world"         , { 15043, 3186, }, },
+        { " Hello world"        , { 29871, 15043, 3186, }, },
+        { "Hello World"         , { 15043, 2787, }, },
+        { " Hello World"        , { 29871, 15043, 2787, }, },
+        { " Hello World!"       , { 29871, 15043, 2787, 29991, }, },
+        { "Hello, world!"       , { 15043, 29892, 3186, 29991, }, },
+        { " Hello, world!"      , { 29871, 15043, 29892, 3186, 29991, }, },
+        { " this is 🦙.cpp"      , { 29871, 445, 338, 29871, 243, 162, 169, 156, 29889, 8223, }, },
+        { "w048 7tuijk dsdfhu"  , { 281, 29900, 29946, 29947, 29871, 29955, 9161, 13535, 18031, 2176, 6905, }, },
+        { "нещо на Български"   , { 1538, 4851, 665, 1386, 29713, 1305, }, },
+        { "កាន់តែពិសេសអាចខលចេញ" , { 29871, 31849, 31324, 31934, 228, 162, 142, 228, 161, 146, 228, 162, 133, 228, 161, 153, 228, 161, 186, 31708, 228, 162, 132, 31708, 228, 161, 165, 31324, 228, 161, 136, 228, 161, 132, 228, 161, 158, 228, 161, 136, 228, 162, 132, 228, 161, 140, }, },
+        { "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)", { 29871, 243, 162, 157, 131, 313, 8945, 29897, 29871, 243, 162, 155, 185, 30722, 243, 162, 143, 174, 30598, 313, 20787, 953, 3848, 275, 16125, 630, 29897, 29871, 31681, 313, 6194, 953, 29877, 2397, 393, 756, 967, 1914, 5993, 29897, }, },
+        { "Hello"               , { 15043, }, },
+        { " Hello"              , { 29871, 15043, }, },
+        { "  Hello"             , { 259, 15043, }, },
+        { "   Hello"            , { 1678, 15043, }, },
+        { "    Hello"           , { 268, 15043, }, },
+        { "    Hello\n    Hello", { 268, 15043, 13, 1678, 15043, }, },
+    };
+
+    return _k_tests;
+}
+
+// Tokenizer regression test for SPM vocabularies.
+//
+// usage: <prog> vocab-file [text-file]
+//
+// Loads the vocab and tokenizes every key of k_tests() twice, with and without
+// BOS. A test passes when the no-BOS result equals the reference ids and the
+// BOS result is the same sequence preceded by token id 1. If text-file is given,
+// its whole content is tokenized as well and the ids are written to
+// "<text-file>.tokcpp".
+//
+// exit codes: 0 - all reference tests passed
+//             1 - usage error, vocab load failure or file I/O failure
+//             2 - the loaded vocab is not an SPM vocab
+//             3 - at least one reference test failed
+int main(int argc, char **argv) {
+    if (argc < 2) {
+        fprintf(stderr, "Usage: %s vocab-file [text-file]\n", argv[0]);
+        return 1;
+    }
+
+    const std::string fname = argv[1];
+
+    std::string fname_text;
+    if (argc > 2) {
+        fname_text = argv[2];
+    }
+
+    fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
+
+    llama_model * model;
+    llama_context * ctx;
+
+    llama_backend_init(false);
+
+    // load the vocab
+    {
+        auto lparams = llama_context_default_params();
+
+        lparams.vocab_only = true;
+
+        model = llama_load_model_from_file(fname.c_str(), lparams);
+
+        if (model == NULL) {
+            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
+            llama_backend_free();
+            return 1;
+        }
+
+        ctx = llama_new_context_with_model(model, lparams);
+
+        if (ctx == NULL) {
+            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
+            llama_free_model(model);
+            llama_backend_free();
+            return 1;
+        }
+    }
+
+    // releases everything acquired above; called on every exit path from here on
+    // (the context is freed before the model it was created from)
+    const auto free_all = [&]() {
+        llama_free(ctx);
+        llama_free_model(model);
+        llama_backend_free();
+    };
+
+    if (llama_vocab_type(ctx) != LLAMA_VOCAB_TYPE_SPM) {
+        fprintf(stderr, "%s : error: vocab type is not SPM\n", __func__);
+        free_all();
+        return 2;
+    }
+
+    bool success = true;
+
+    for (const auto & test_kv : k_tests()) {
+        const std::vector<llama_token> res_bos   = llama_tokenize(ctx, test_kv.first, true);
+        const std::vector<llama_token> res_nobos = llama_tokenize(ctx, test_kv.first, false);
+
+        printf("\n");
+        printf("src: '%s'\n", test_kv.first.c_str());
+        printf("res: '%s'\n", llama_detokenize_spm(ctx, res_bos).c_str());
+        printf("tok: ");
+        for (const auto & tok : res_bos) {
+            printf("%d ", tok);
+        }
+        printf("\n");
+
+        // the no-BOS result must equal the reference; the BOS result must be
+        // exactly one token longer and start with id 1 ...
+        bool correct = res_nobos == test_kv.second
+                    && res_bos.size() == res_nobos.size() + 1
+                    && res_bos[0] == 1;
+
+        // ... and must continue with the reference ids
+        for (size_t i = 0; correct && i < res_nobos.size(); ++i) {
+            correct = res_bos[i + 1] == test_kv.second[i];
+        }
+
+        if (!correct) {
+            fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str());
+            fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__,
+                llama_detokenize_spm(ctx, res_nobos).c_str(),
+                llama_detokenize_spm(ctx, test_kv.second).c_str());
+            fprintf(stderr, "%s : expected tokens: ", __func__);
+            for (const auto & t : test_kv.second) {
+                fprintf(stderr, "%6d, ", t);
+            }
+            fprintf(stderr, "\n");
+            fprintf(stderr, "%s : got tokens: ", __func__);
+            for (const auto & t : res_nobos) {
+                fprintf(stderr, "%6d, ", t);
+            }
+            fprintf(stderr, "\n");
+
+            // keep going so that every failing entry is reported
+            success = false;
+        }
+    }
+
+    if (!fname_text.empty()) {
+        fprintf(stderr, "%s : tokenizing: '%s'\n", __func__, fname_text.c_str());
+
+        std::string text;
+        {
+            std::ifstream ifs(fname_text);
+            if (!ifs) {
+                fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_text.c_str());
+                free_all();
+                return 1;
+            }
+            // slurp the whole file into memory
+            text = std::string(std::istreambuf_iterator<char>(ifs), std::istreambuf_iterator<char>());
+        }
+
+        fprintf(stderr, "%s : text size: %zu\n", __func__, text.size());
+
+        const std::vector<llama_token> res = llama_tokenize(ctx, text, true);
+
+        fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size());
+
+        const std::string fname_out = fname_text + ".tokcpp";
+
+        {
+            std::ofstream ofs(fname_out);
+            if (!ofs) {
+                fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str());
+                free_all();
+                return 1;
+            }
+
+            // one line of space-separated ids
+            for (const auto & tok : res) {
+                ofs << tok << " ";
+            }
+
+            ofs << "\n";
+        }
+
+        fprintf(stderr, "%s : tokens written to '%s'\n", __func__, fname_out.c_str());
+    }
+
+    free_all();
+
+    return success ? 0 : 3;
+}
diff --git a/tests/test-tokenizer-0-llama.py b/tests/test-tokenizer-0-llama.py
new file mode 100644
index 00000000..bc164ee2
--- /dev/null
+++ b/tests/test-tokenizer-0-llama.py
@@ -0,0 +1,95 @@
+# tests with SPM tokenizer
+
+import os
+import sys
+import argparse
+
+from sentencepiece import SentencePieceProcessor
+
+parser = argparse.ArgumentParser()
+parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
+parser.add_argument("--fname-tok", help="path to a text file to tokenize")
+args = parser.parse_args()
+
+dir_tokenizer = args.dir_tokenizer
+
+tokenizer = SentencePieceProcessor(dir_tokenizer + '/tokenizer.model')
+
+tests = [
+ "",
+ " ",
+ " ",
+ " ",
+ "\t",
+ "\n",
+ "\t\n",
+ "Hello world",
+ " Hello world",
+ "Hello World",
+ " Hello World",
+ " Hello World!",
+ "Hello, world!",
+ " Hello, world!",
+ " this is πŸ¦™.cpp",
+ "w048 7tuijk dsdfhu",
+ "Π½Π΅Ρ‰ΠΎ Π½Π° Π‘ΡŠΠ»Π³Π°Ρ€ΡΠΊΠΈ",
+ "αž€αžΆαž“αŸ‹αžαŸ‚αž–αž·αžŸαŸαžŸαž’αžΆαž…αžαž›αž…αŸαž‰",
+ "πŸš€ (normal) πŸ˜Άβ€πŸŒ«οΈ (multiple emojis concatenated) βœ… (only emoji that has its own token)",
+ "Hello",
+ " Hello",
+ " Hello",
+ " Hello",
+ " Hello",
+ " Hello\n Hello",
+ ]
+
+
+for text in tests:
+ print('text: ', text)
+ print('\nwith bos:')
+ print(tokenizer.encode(text, add_bos=True))
+ print(tokenizer.decode(tokenizer.encode(text, add_bos=True)))
+ print('\nwithout bos:')
+ print(tokenizer.encode(text, add_bos=False))
+ print(tokenizer.decode(tokenizer.encode(text, add_bos=False)))
+
+print("'" + tokenizer.id_to_piece(15043) + "'") # '_Hello'
+print("'" + tokenizer.id_to_piece(29871) + "'") # '_'
+print("'" + tokenizer.decode([15043]) + "'") # 'Hello'
+print("'" + tokenizer.decode([15043, 15043]) + "'") # 'Hello Hello'
+print("'" + tokenizer.decode([29871, 15043]) + "'") # ' Hello'
+print("'" + tokenizer.decode([29871, 15043, 29871, 15043]) + "'") # ' Hello Hello'
+
+print("\n\ntests for C++:\n")
+for text in tests:
+ res = tokenizer.encode(text, add_bos=False)
+
+ k = text.replace('\n', '\\n')
+ k = k.replace('\t', '\\t')
+ k = '"' + k + '"'
+ print("{ %-24s, { " % k, end='')
+ for x in res:
+ print("%7d," % x, end='')
+ print(" }, },")
+
+print(tokenizer.encode('hello'))
+print(tokenizer.encode('world'))
+print(tokenizer.encode(' world'))
+print(tokenizer.encode('hello world'))
+
+fname_tok = args.fname_tok
+if fname_tok:
+ print('tokenizing file: ', fname_tok)
+ fname_out = fname_tok + '.tok'
+ with open(fname_tok, 'r') as f:
+ lines = f.readlines()
+ s = ''.join(lines)
+ res = tokenizer.encode(s, add_bos=True)
+ # write to file
+ with open(fname_out, 'w') as f:
+ for x in res:
+ f.write(str(x) + ' ')
+ f.write('\n')
+ print('len(res): ', len(res))
+ print('len(lines): ', len(lines))
+ print('results written to: ', fname_out)
diff --git a/tests/test-tokenizer-0.cpp b/tests/test-tokenizer-0.cpp
deleted file mode 100644
index 7e9ac918..00000000
--- a/tests/test-tokenizer-0.cpp
+++ /dev/null
@@ -1,141 +0,0 @@
-#include "llama.h"
-#include "common.h"
-
-#include <cstdio>
-#include <string>
-#include <map>
-#include <vector>
-
-static std::string unescape_whitespace(llama_context* ctx, const std::vector<llama_token>& tokens) {
- std::string result;
- for (size_t i = 0; i < tokens.size(); ++i) {
- result += llama_token_to_str(ctx, tokens[i]);
- }
- return result;
-}
-
-static const std::map<std::string, std::vector<llama_token>> & k_tests() {
- static std::map<std::string, std::vector<llama_token>> _k_tests = {
- { " ", {1, 259, }, },
- { " ", { 1, 1678, }, },
- { " ", { 1, 268, }, },
- { "\t", { 1, 29871, 12, }, },
- { "\n", { 1, 29871, 13, }, },
- { "\t\n", { 1, 29871, 12, 13, }, },
- { "Hello world", { 1, 15043, 3186, }, },
- { " Hello world", { 1, 29871, 15043, 3186, }, },
- { "Hello World", { 1, 15043, 2787, }, },
- { " Hello World", { 1, 29871, 15043, 2787, }, },
- { " Hello World!", { 1, 29871, 15043, 2787, 29991, }, },
- { " this is πŸ¦™.cpp", { 1, 29871, 445, 338, 29871, 243, 162, 169, 156, 29889, 8223, }, },
- { "w048 7tuijk dsdfhu", { 1, 281, 29900, 29946, 29947, 29871, 29955, 9161, 13535, 18031, 2176, 6905, }, },
- { "Π½Π΅Ρ‰ΠΎ Π½Π° Π‘ΡŠΠ»Π³Π°Ρ€ΡΠΊΠΈ", { 1, 1538, 4851, 665, 1386, 29713, 1305, }, },
- { "αž€αžΆαž“αŸ‹αžαŸ‚αž–αž·αžŸαŸαžŸαž’αžΆαž…αžαž›αž…αŸαž‰", { 1, 29871, 31849, 31324, 31934, 228, 162, 142, 228, 161,
- 146, 228, 162, 133, 228, 161, 153, 228, 161, 186,
- 31708, 228, 162, 132, 31708, 228, 161, 165, 31324, 228,
- 161, 136, 228, 161, 132, 228, 161, 158, 228, 161,
- 136, 228, 162, 132, 228, 161, 140, }, },
- { "πŸš€ (normal) πŸ˜Άβ€πŸŒ«οΈ (multiple emojis concatenated) βœ… (only emoji that has its own token)",
- { 1, 29871, 243, 162, 157, 131, 313, 8945, 29897, 29871,
- 243, 162, 155, 185, 30722, 243, 162, 143, 174, 30598,
- 313, 20787, 953, 3848, 275, 16125, 630, 29897, 29871, 31681,
- 313, 6194, 953, 29877, 2397, 393, 756, 967, 1914, 5993, 29897, }, },
- { "Hello", { 1, 15043 }, },
- { " Hello", { 1, 29871, 15043 }, },
- { " Hello", { 1, 259, 15043 }, },
- { " Hello", { 1, 1678, 15043 }, },
- { " Hello", { 1, 268, 15043 }, },
- { " Hello\n Hello", { 1, 268, 15043, 13, 1678, 15043 }, },
- };
-
- return _k_tests;
-}
-
-int main(int argc, char **argv) {
- if (argc < 2) {
- fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
- return 1;
- }
-
- const std::string fname = argv[1];
-
- fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
-
- llama_model * model;
- llama_context * ctx;
-
- llama_backend_init(false);
-
- // load the vocab
- {
- auto lparams = llama_context_default_params();
-
- lparams.vocab_only = true;
-
- model = llama_load_model_from_file(fname.c_str(), lparams);
-
- if (model == NULL) {
- fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
- return 1;
- }
-
- ctx = llama_new_context_with_model(model, lparams);
-
- if (ctx == NULL) {
- fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
- llama_free_model(model);
- return 1;
- }
- }
-
- const int n_vocab = llama_n_vocab(ctx);
-
- if (n_vocab != 32000) {
- fprintf(stderr, "%s : expected 32000 tokens, got %d\n", __func__, n_vocab);
- llama_free_model(model);
- llama_free(ctx);
- return 2;
- }
-
- bool success = true;
-
- for (const auto & test_kv : k_tests()) {
- // Add a space in front of the first character to match OG llama tokenizer behavior
- std::vector<llama_token> res = llama_tokenize(ctx, " " + test_kv.first, true);
- fprintf(stderr, "%s : '%s' tokenized to '%s'\n",
- __func__, test_kv.first.c_str(), unescape_whitespace(ctx, res).c_str());
-
- bool correct = res.size() == test_kv.second.size();
-
- for (int i = 0; i < (int) res.size() && correct; ++i) {
- if (res[i] != test_kv.second[i]) {
- correct = false;
- }
- }
-
- if (!correct) {
- fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str());
- fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__,
- unescape_whitespace(ctx, res).c_str(), unescape_whitespace(ctx, test_kv.second).c_str());
- fprintf(stderr, "%s : expected tokens: ", __func__);
- for (const auto & t : test_kv.second) {
- fprintf(stderr, "%6d, ", t);
- }
- fprintf(stderr, "\n");
- fprintf(stderr, "%s : got tokens: ", __func__);
- for (const auto & t : res) {
- fprintf(stderr, "%6d, ", t);
- }
- fprintf(stderr, "\n");
-
- success = false;
- }
- }
-
- llama_free_model(model);
- llama_free(ctx);
-
- llama_backend_free();
-
- return success ? 0 : 3;
-}
diff --git a/tests/test-tokenizer-1.cpp b/tests/test-tokenizer-1.cpp
index bd607d12..ce4f2898 100644
--- a/tests/test-tokenizer-1.cpp
+++ b/tests/test-tokenizer-1.cpp
@@ -22,14 +22,6 @@ static std::string escape_whitespace(const std::string& text) {
return result;
}
-static std::string unescape_whitespace(llama_context * ctx, const std::vector<llama_token> & tokens) {
- std::string result;
- for (size_t i = 0; i < tokens.size(); ++i) {
- result += llama_token_to_str(ctx, tokens[i]);
- }
- return result;
-}
-
int main(int argc, char **argv) {
if (argc < 2) {
fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
@@ -72,13 +64,13 @@ int main(int argc, char **argv) {
const int n_vocab = llama_n_vocab(ctx);
for (int i = 0; i < n_vocab; ++i) {
- std::string forward = llama_token_to_str(ctx, i);
+ std::string forward = llama_token_to_piece(ctx, i);
std::vector<llama_token> tokens = llama_tokenize(ctx, forward, false);
if (tokens.size() == 1) {
if (i != tokens[0]) {
- std::string backward = llama_token_to_str(ctx, tokens[0]);
+ std::string backward = llama_token_to_piece(ctx, tokens[0]);
fprintf(stderr, "%s : error: token %d is string %s but bpe returns token %d %s\n",
- __func__, i, llama_token_to_str(ctx, i).c_str(), tokens[0], backward.c_str());
+ __func__, i, llama_token_to_piece(ctx, i).c_str(), tokens[0], backward.c_str());
return 2;
}
}