diff options
author | Georgi Gerganov <ggerganov@gmail.com> | 2024-05-04 08:32:32 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-05-04 08:32:32 +0300 |
commit | 92139b90af4841d7fd060b526bdd443b621770ff (patch) | |
tree | 9679c3de1b39970ca73b5bd988c63ddac0359ca6 /tests/test-tokenizer-0.cpp | |
parent | a2ac89d6efb41b535778bfeaecaae8fe295b6ed3 (diff) |
tests : add test-tokenizer-0.sh + fix some tokenizers (#7036)
* tests : add test-tokenizer-0.sh
* unicode : add all unicode number ranges
* starcoder : fix pre-tokenizer
* tests : add test that fails with DeepSeek tokenizers
* falcon : fix regex
* unicode : regenerate unicode tables
* refact : add tokenizer model
* lint : fix
* tests : disable failing tests
ggml-ci
* refact : add tests files
ggml-ci
* convert : print -> logging
ggml-ci
* lint : fix
* unicode : digit -> number
* phi-3 : update
Diffstat (limited to 'tests/test-tokenizer-0.cpp')
-rw-r--r-- | tests/test-tokenizer-0.cpp | 39 |
1 files changed, 30 insertions, 9 deletions
```diff
diff --git a/tests/test-tokenizer-0.cpp b/tests/test-tokenizer-0.cpp
index 5122757c..d478f104 100644
--- a/tests/test-tokenizer-0.cpp
+++ b/tests/test-tokenizer-0.cpp
@@ -55,8 +55,10 @@
 //    return _k_tests;
 //}
 
-static std::map<std::string, std::vector<llama_token>> read_tests(const std::string & fname_inp, const std::string & fname_out) {
-    std::map<std::string, std::vector<llama_token>> tests;
+using llama_tests = std::map<std::string, std::vector<llama_token>>;
+
+static llama_tests read_tests(const std::string & fname_inp, const std::string & fname_out) {
+    llama_tests tests;
 
     std::ifstream ifs_inp(fname_inp);
     if (!ifs_inp) {
@@ -175,12 +177,20 @@ int main(int argc, char **argv) {
 
     bool success = true;
 
-    const auto k_tests = read_tests(fname_inp, fname_out);
+    const auto k_tests = [&]() -> llama_tests {
+        if (!fname_text.empty()) {
+            return {};
+        }
 
-    if (k_tests.empty()) {
-        fprintf(stderr, "%s : error: no tests found\n", __func__);
-        return 1;
-    }
+        const auto res = read_tests(fname_inp, fname_out);
+
+        if (res.empty()) {
+            fprintf(stderr, "%s : error: no tests found\n", __func__);
+            exit(1);
+        }
+
+        return res;
+    }();
 
     const bool add_special = false;
 
@@ -238,7 +248,17 @@ int main(int argc, char **argv) {
 
         fprintf(stderr, "%s : text size: %zu\n", __func__, text.size());
 
-        const std::vector<llama_token> res = llama_tokenize(ctx, text, add_special);
+        std::vector<llama_token> res;
+
+        {
+            const auto t_start = ggml_time_us();
+
+            res = llama_tokenize(ctx, text, add_special);
+
+            const auto t_end = ggml_time_us();
+
+            fprintf(stderr, "%s : tokenized in %.3f ms (cpp)\n", __func__, (t_end - t_start) / 1000.0);
+        }
 
         fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size());
 
@@ -252,7 +272,8 @@ int main(int argc, char **argv) {
             }
 
             for (const auto & tok : res) {
-                ofs << tok << " '" << string_strip(llama_detokenize_bpe(ctx, std::vector<int>{tok})) << "'" << std::endl;
+                //ofs << tok << " '" << string_strip(llama_detokenize_bpe(ctx, std::vector<int>{tok})) << "'" << std::endl;
+                ofs << tok << "\n";
             }
         }
 
```