From 71ca2fad7d6c0ef95ef9944fb3a1a843e481f314 Mon Sep 17 00:00:00 2001
From: goerch
Date: Wed, 13 Sep 2023 15:19:44 +0200
Subject: whisper : tokenizer fix + re-enable tokenizer test for LLaMa (#3096)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Fix for #2721

* Re-enable tokenizer test for LLaMa

* Add `console.cpp` dependency

* Fix dependency to `common`

* Fixing wrong fix.

* Make console usage platform specific

Work on compiler warnings.

* Adapting makefile

* Remove trailing whitespace

* Adapting the other parts of the makefile

* Fix typo.
---
 tests/test-tokenizer-1.cpp | 108 ---------------------------------------------
 1 file changed, 108 deletions(-)
 delete mode 100644 tests/test-tokenizer-1.cpp

diff --git a/tests/test-tokenizer-1.cpp b/tests/test-tokenizer-1.cpp
deleted file mode 100644
index ce4f2898..00000000
--- a/tests/test-tokenizer-1.cpp
+++ /dev/null
@@ -1,108 +0,0 @@
-#include "llama.h"
-#include "common.h"
-
-#include <cassert>
-#include <cstdio>
-#include <cstring>
-#include <string>
-#include <codecvt>
-#include <map>
-#include <vector>
-#include <locale>
-
-static std::string escape_whitespace(const std::string& text) {
-    std::string result = "\xe2\x96\x81";
-    for (size_t offs = 0; offs < text.length(); ++offs) {
-        if (text[offs] == ' ') {
-            result += "\xe2\x96\x81";
-        } else {
-            result += text[offs];
-        }
-    }
-    return result;
-}
-
-int main(int argc, char **argv) {
-    if (argc < 2) {
-        fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
-        return 1;
-    }
-
-    const std::string fname = argv[1];
-
-    fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
-
-    llama_model * model;
-    llama_context * ctx;
-
-    llama_backend_init(false);
-
-    // load the vocab
-    {
-        auto lparams = llama_context_default_params();
-
-        lparams.vocab_only = true;
-
-        model = llama_load_model_from_file(fname.c_str(), lparams);
-
-        if (model == NULL) {
-            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
-            return 1;
-        }
-
-        ctx = llama_new_context_with_model(model, lparams);
-
-        if (ctx == NULL) {
-            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
-            llama_free_model(model);
-            return 1;
-        }
-    }
-
-    GGML_ASSERT(llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_BPE);
-
-    const int n_vocab = llama_n_vocab(ctx);
-
-    for (int i = 0; i < n_vocab; ++i) {
-        std::string forward = llama_token_to_piece(ctx, i);
-        std::vector<llama_token> tokens = llama_tokenize(ctx, forward, false);
-        if (tokens.size() == 1) {
-            if (i != tokens[0]) {
-                std::string backward = llama_token_to_piece(ctx, tokens[0]);
-                fprintf(stderr, "%s : error: token %d is string %s but bpe returns token %d %s\n",
-                    __func__, i, llama_token_to_piece(ctx, i).c_str(), tokens[0], backward.c_str());
-                return 2;
-            }
-        }
-    }
-
-#ifdef _WIN32
-    std::wstring_convert<std::codecvt_utf8<char16_t>, char16_t> u16converter;
-    for (char16_t ch = 0x0000; ch < 0xffff; ++ch) {
-        std::u16string u16str(1, ch);
-        std::string str = u16converter.to_bytes(u16str);
-        std::vector<llama_token> tokens = llama_tokenize(ctx, escape_whitespace(str).c_str(), false);
-        if (tokens.size() == 1) {
-            fprintf(stderr, "%s : info: %s tokenized to %d \n",
-                __func__, str.c_str(), tokens[0]);
-        }
-    }
-
-    std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> u32converter;
-    for (char32_t ch = 0x0000; ch < 0x0010ffff; ++ch) {
-        std::u32string u32str(1, ch);
-        std::string str = u32converter.to_bytes(u32str);
-        std::vector<llama_token> tokens = llama_tokenize(ctx, escape_whitespace(str).c_str(), false);
-        if (tokens.size() == 1) {
-            fprintf(stderr, "%s : info: %s tokenized to %d \n", __func__, str.c_str(), tokens[0]);
-        }
-    }
-#endif
-
-    llama_free_model(model);
-    llama_free(ctx);
-
-    llama_backend_free();
-
-    return 0;
-}
--
cgit v1.2.3