Diffstat (limited to 'tests')
-rw-r--r-- | tests/CMakeLists.txt          |   6
-rw-r--r-- | tests/test-tokenizer-0-bpe.py | 126
-rw-r--r-- | tests/test-tokenizer-0-spm.py | 126
-rw-r--r-- | tests/test-tokenizer-0.cpp    |  39
-rw-r--r-- | tests/test-tokenizer-0.py     |  46
-rwxr-xr-x | tests/test-tokenizer-0.sh     |  34
6 files changed, 114 insertions, 263 deletions
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index d23e7f77..cad703fc 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -74,13 +74,15 @@ llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-spm ARGS ${CMAKE
 llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf)
 llama_test(test-tokenizer-0 NAME test-tokenizer-0-phi-3 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-phi-3.gguf)
 llama_test(test-tokenizer-0 NAME test-tokenizer-0-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
-llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-llm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-llm.gguf)
-llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-coder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf)
 llama_test(test-tokenizer-0 NAME test-tokenizer-0-bert-bge ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bert-bge.gguf)
 # TODO: enable when fixed
+#       https://github.com/ggerganov/llama.cpp/pull/7036
 #llama_test(test-tokenizer-0 NAME test-tokenizer-0-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
+#llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-llm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-llm.gguf)
+#llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-coder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf)
 llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
 llama_test(test-tokenizer-0 NAME test-tokenizer-0-gpt-2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
 
 # build test-tokenizer-1-bpe target once and add many tests
 add_executable(test-tokenizer-1-bpe test-tokenizer-1-bpe.cpp)
diff --git a/tests/test-tokenizer-0-bpe.py b/tests/test-tokenizer-0-bpe.py
deleted file mode 100644
index 6b70ad03..00000000
--- a/tests/test-tokenizer-0-bpe.py
+++ /dev/null
@@ -1,126 +0,0 @@
-# tests with BPE tokenizer
-#
-# sample usage:
-#
-#   python3 tests/test-tokenizer-0-bpe.py ~/Data/huggingface/Meta-Llama-3-8B-Instruct/
-#   python3 tests/test-tokenizer-0-bpe.py ~/Data/huggingface/falcon-7b/
-#   python3 tests/test-tokenizer-0-bpe.py ~/Data/huggingface/deepseek-coder-6.7b-instruct/
-#
-
-import logging
-import argparse
-
-from transformers import AutoTokenizer
-
-logger = logging.getLogger("test-tokenizer-0-bpe")
-
-parser = argparse.ArgumentParser()
-parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
-parser.add_argument("--fname-tok", help="path to a text file to tokenize")
-parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
-
-args = parser.parse_args()
-logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
-dir_tokenizer = args.dir_tokenizer
-
-tokenizer = AutoTokenizer.from_pretrained(dir_tokenizer)
-
-tests = [
-    "",
-    " ",
-    "  ",
-    "   ",
-    "\t",
-    "\n",
-    "\n\n",
-    "\n\n\n",
-    "\t\n",
-    "Hello world",
-    " Hello world",
-    "Hello World",
-    " Hello World",
-    " Hello World!",
-    "Hello, world!",
-    " Hello, world!",
-    " this is π¦.cpp",
-    "w048 7tuijk dsdfhu",
-    "Π½Π΅ΡΠΎ Π½Π° ΠΡΠ»Π³Π°ΡΡΠΊΠΈ",
-    "ααΆαααααα·αααα’αΆαααααα",
-    "π (normal) πΆβπ«οΈ (multiple emojis concatenated) β
 (only emoji that has its own token)",
-    "Hello",
-    " Hello",
-    "  Hello",
-    "   Hello",
-    "    Hello",
-    "    Hello\n    Hello",
-    " (",
-    "\n =",
-    "' era",
-    "Hello, y'all! How are you π ?ζζ³ε¨appleε·₯δ½1314151倩ο½",
-    "3",
-    "33",
-    "333",
-    "3333",
-    "33333",
-    "333333",
-    "3333333",
-    "33333333",
-    "333333333",
-]
-
-for text in tests:
-    logger.info(f"text: {text}")
-    logger.info(tokenizer.encode(text))
-    logger.info(tokenizer.decode(tokenizer.encode(text)))
-
-logger.info("tests for C++:")
-for text in tests:
-    res = tokenizer.encode(text)
-
-    # Modify text representation for logging
-    k = text.replace('\n', '\\n')
-    k = k.replace('\t', '\\t')
-    k = '"' + k + '"'
-
-    # Log the modified text and its encoding
-    log_message = "{ %-24s, { " % k
-    for x in res:
-        log_message += "%7d," % x
-    log_message += " }, },"
-    logger.info(log_message)
-
-logger.info(tokenizer.encode('hello'))
-logger.info(tokenizer.encode('world'))
-logger.info(tokenizer.encode(' world'))
-logger.info(tokenizer.encode('hello world'))
-
-fname_tok = args.fname_tok
-if fname_tok:
-    logger.info(f"tokenizing file: {fname_tok}")
-    fname_out = fname_tok + '.tok'
-    with open(fname_tok, 'r', encoding='utf-8') as f:
-        lines = f.readlines()
-        s = ''.join(lines)
-        res = tokenizer.encode(s)
-        # write to file
-        with open(fname_out, 'w', encoding='utf-8') as f:
-            for x in res:
-                # LLaMA v3 for some reason strips the space for these tokens (and others)
-                # if x == 662:
-                #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
-                # elif x == 1174:
-                #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
-                # elif x == 2564:
-                #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
-                # elif x == 758:
-                #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
-                # elif x == 949:
-                #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
-                # elif x == 5354:
-                #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
-                # else:
-                #     f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
-                f.write(str(x) + ' \'' + tokenizer.decode(x).strip() + '\'\n')
-        logger.info(f"len(res): {len(res)}")
-        logger.info(f"len(lines): {len(lines)}")
-    logger.info(f"results written to: {fname_out}")
diff --git a/tests/test-tokenizer-0-spm.py b/tests/test-tokenizer-0-spm.py
deleted file mode 100644
index 4b80a438..00000000
--- a/tests/test-tokenizer-0-spm.py
+++ /dev/null
@@ -1,126 +0,0 @@
-# tests with SPM tokenizer
-#
-# sample usage:
-#
-#   python3 tests/test-tokenizer-0-spm.py ~/Data/huggingface/Llama-2-7b-hf/
-#   python3 tests/test-tokenizer-0-spm.py ~/Data/huggingface/CodeLlama-34b-Instruct-hf/
-#
-
-
-import logging
-import argparse
-
-from sentencepiece import SentencePieceProcessor
-
-logger = logging.getLogger("test-tokenizer-0-spm")
-
-parser = argparse.ArgumentParser()
-parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
-parser.add_argument("--fname-tok", help="path to a text file to tokenize")
-parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
-
-args = parser.parse_args()
-
-logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
-
-dir_tokenizer = args.dir_tokenizer
-
-tokenizer = SentencePieceProcessor(dir_tokenizer + '/tokenizer.model')
-
-tests = [
-    "",
-    " ",
-    "  ",
-    "   ",
-    "\t",
-    "\n",
-    "\n\n",
-    "\n\n\n",
-    "\t\n",
-    "Hello world",
-    " Hello world",
-    "Hello World",
-    " Hello World",
-    " Hello World!",
-    "Hello, world!",
-    " Hello, world!",
-    " this is π¦.cpp",
-    "w048 7tuijk dsdfhu",
-    "Π½Π΅ΡΠΎ Π½Π° ΠΡΠ»Π³Π°ΡΡΠΊΠΈ",
-    "ααΆαααααα·αααα’αΆαααααα",
-    "π (normal) πΆβπ«οΈ (multiple emojis concatenated) β
 (only emoji that has its own token)",
-    "Hello",
-    " Hello",
-    "  Hello",
-    "   Hello",
-    "    Hello",
-    "    Hello\n    Hello",
-    " (",
-    "\n =",
-    "' era",
-    "Hello, y'all! How are you π ?ζζ³ε¨appleε·₯δ½1314151倩ο½",
-    "3",
-    "33",
-    "333",
-    "3333",
-    "33333",
-    "333333",
-    "3333333",
-    "33333333",
-    "333333333",
-]
-
-
-for text in tests:
-    message_log = (f"text: {text}\n"
-                   "with bos:\n"
-                   f"{tokenizer.encode(text, add_bos=True)}\n"
-                   f"{tokenizer.decode(tokenizer.encode(text, add_bos=True))}\n"
-                   "without bos:\n"
-                   f"{tokenizer.encode(text, add_bos=False)}\n"
-                   f"{tokenizer.decode(tokenizer.encode(text, add_bos=False))}\n")
-    logger.info(message_log)
-
-logger.info(f"'{tokenizer.id_to_piece(15043)}'")  # '▁Hello'
-logger.info(f"'{tokenizer.id_to_piece(29871)}'")  # '▁'
-logger.info(f"'{tokenizer.decode([15043])}'")  # 'Hello'
-logger.info(f"'{tokenizer.decode([15043, 15043])}'")  # 'Hello Hello'
-logger.info(f"'{tokenizer.decode([29871, 15043])}'")  # ' Hello'
-logger.info(f"'{tokenizer.decode([29871, 15043, 29871, 15043])}'")  # ' Hello Hello'
-
-logger.info("\n\ntests for C++:\n")
-for text in tests:
-    res = tokenizer.encode(text, add_bos=False)
-
-    # Modify text representation for logging
-    k = text.replace('\n', '\\n')
-    k = k.replace('\t', '\\t')
-    k = '"' + k + '"'
-
-    # Log the modified text and its encoding
-    log_message = "{ %-24s, { " % k
-    for x in res:
-        log_message += "%7d," % x
-    log_message += " }, },"
-    logger.info(log_message)
-
-logger.info(tokenizer.encode('hello'))
-logger.info(tokenizer.encode('world'))
-logger.info(tokenizer.encode(' world'))
-logger.info(tokenizer.encode('hello world'))
-
-fname_tok = args.fname_tok
-if fname_tok:
-    logger.info(f"tokenizing file: {fname_tok}")
-    fname_out = fname_tok + '.tok'
-    with open(fname_tok, 'r', encoding='utf-8') as f:
-        lines = f.readlines()
-        s = ''.join(lines)
-        res = tokenizer.encode(s, add_bos=True)
-        # write to file
-        with open(fname_out, 'w', encoding='utf-8') as f:
-            for x in res:
-                f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
-        logger.info(f"len(res): {len(res)}")
-        logger.info(f"len(lines): {len(lines)}")
-    logger.info(f"results written to: {fname_out}")
diff --git a/tests/test-tokenizer-0.cpp b/tests/test-tokenizer-0.cpp
index 5122757c..d478f104 100644
--- a/tests/test-tokenizer-0.cpp
+++ b/tests/test-tokenizer-0.cpp
@@ -55,8 +55,10 @@
 //    return _k_tests;
 //}
 
-static std::map<std::string, std::vector<llama_token>> read_tests(const std::string & fname_inp, const std::string & fname_out) {
-    std::map<std::string, std::vector<llama_token>> tests;
+using llama_tests = std::map<std::string, std::vector<llama_token>>;
+
+static llama_tests read_tests(const std::string & fname_inp, const std::string & fname_out) {
+    llama_tests tests;
 
     std::ifstream ifs_inp(fname_inp);
     if (!ifs_inp) {
@@ -175,12 +177,20 @@ int main(int argc, char **argv) {
 
     bool success = true;
 
-    const auto k_tests = read_tests(fname_inp, fname_out);
+    const auto k_tests = [&]() -> llama_tests {
+        if (!fname_text.empty()) {
+            return {};
+        }
 
-    if (k_tests.empty()) {
-        fprintf(stderr, "%s : error: no tests found\n", __func__);
-        return 1;
-    }
+        const auto res = read_tests(fname_inp, fname_out);
+
+        if (res.empty()) {
+            fprintf(stderr, "%s : error: no tests found\n", __func__);
+            exit(1);
+        }
+
+        return res;
+    }();
 
     const bool add_special = false;
 
@@ -238,7 +248,17 @@ int main(int argc, char **argv) {
 
         fprintf(stderr, "%s : text size: %zu\n", __func__, text.size());
 
-        const std::vector<llama_token> res = llama_tokenize(ctx, text, add_special);
+        std::vector<llama_token> res;
+
+        {
+            const auto t_start = ggml_time_us();
+
+            res = llama_tokenize(ctx, text, add_special);
+
+            const auto t_end = ggml_time_us();
+
+            fprintf(stderr, "%s : tokenized in %.3f ms (cpp)\n", __func__, (t_end - t_start) / 1000.0);
+        }
 
         fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size());
 
@@ -252,7 +272,8 @@ int main(int argc, char **argv) {
         }
 
         for (const auto & tok : res) {
-            ofs << tok << " '" << string_strip(llama_detokenize_bpe(ctx, std::vector<int>{tok})) << "'" << std::endl;
+            //ofs << tok << " '" << string_strip(llama_detokenize_bpe(ctx, std::vector<int>{tok})) << "'" << std::endl;
+            ofs << tok << "\n";
         }
     }
 
diff --git a/tests/test-tokenizer-0.py b/tests/test-tokenizer-0.py
new file mode 100644
index 00000000..8e7638e4
--- /dev/null
+++ b/tests/test-tokenizer-0.py
@@ -0,0 +1,46 @@
+import time
+import argparse
+
+from transformers import AutoTokenizer
+
+parser = argparse.ArgumentParser()
+parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
+parser.add_argument("--fname-tok", help="path to a text file to tokenize", required=True)
+args = parser.parse_args()
+
+dir_tokenizer = args.dir_tokenizer
+fname_tok = args.fname_tok
+
+tokenizer = AutoTokenizer.from_pretrained(dir_tokenizer)
+
+print('tokenizing file: ', fname_tok)
+fname_out = fname_tok + '.tok'
+with open(fname_tok, 'r', encoding='utf-8') as f:
+    lines = f.readlines()
+    s = ''.join(lines)
+    t_start = time.time()
+    res = tokenizer.encode(s, add_special_tokens=False)
+    t_end = time.time()
+    print('\nmain : tokenized in', "{:.3f}".format(1000.0 * (t_end - t_start)), 'ms (py)')
+    with open(fname_out, 'w', encoding='utf-8') as f:
+        for x in res:
+            # LLaMA v3 for some reason strips the space for these tokens (and others)
+            # if x == 662:
+            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
+            # elif x == 1174:
+            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
+            # elif x == 2564:
+            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
+            # elif x == 758:
+            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
+            # elif x == 949:
+            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
+            # elif x == 5354:
+            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
+            # else:
+            #     f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
+            # f.write(str(x) + ' \'' + tokenizer.decode(x).strip() + '\'\n')
+            f.write(str(x) + '\n')
+    print('len(res): ', len(res))
+    print('len(lines): ', len(lines))
+print('results written to: ', fname_out)
diff --git a/tests/test-tokenizer-0.sh b/tests/test-tokenizer-0.sh
new file mode 100755
index 00000000..2fb8632d
--- /dev/null
+++ b/tests/test-tokenizer-0.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+#
+# Usage:
+#
+#   test-tokenizer-0.sh <name> <input>
+#
+
+if [ $# -ne 2 ]; then
+    printf "Usage: $0 <name> <input>\n"
+    exit 1
+fi
+
+name=$1
+input=$2
+
+make -j tests/test-tokenizer-0
+
+printf "Testing %s on %s ...\n" $name $input
+
+python3 ./tests/test-tokenizer-0.py ./models/tokenizers/$name --fname-tok $input > /tmp/test-tokenizer-0-$name-py.log 2>&1
+cat /tmp/test-tokenizer-0-$name-py.log | grep "tokenized in"
+
+./tests/test-tokenizer-0 ./models/ggml-vocab-$name.gguf $input > /tmp/test-tokenizer-0-$name-cpp.log 2>&1
+cat /tmp/test-tokenizer-0-$name-cpp.log | grep "tokenized in"
+
+diff $input.tok $input.tokcpp > /dev/null 2>&1
+
+if [ $? -eq 0 ]; then
+    printf "Tokenization is correct!\n"
+else
+    diff $input.tok $input.tokcpp | head -n 32
+
+    printf "Tokenization differs!\n"
+fi
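
For reference, a minimal sketch of how the new helper script might be driven end to end. The vocab name "llama-bpe" and the input file "wiki.train.raw" below are illustrative assumptions, not part of this change; the script only requires that ./models/tokenizers/<name> holds the HF tokenizer files and that ./models/ggml-vocab-<name>.gguf exists:

    # hypothetical invocation of the new script (names are placeholders)
    ./tests/test-tokenizer-0.sh llama-bpe ./wiki.train.raw
    # prints the "tokenized in ... ms (py)" and "... ms (cpp)" timings, then
    # diffs wiki.train.raw.tok against wiki.train.raw.tokcpp and reports either
    # "Tokenization is correct!" or the first mismatching lines

This replaces the old per-tokenizer-family scripts: both sides now emit one bare token id per line, so a plain diff of the Python and C++ outputs decides correctness.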