From 2fa02b4b3d86182381311c98b75065ee1b7c2930 Mon Sep 17 00:00:00 2001
From: zakkor
Date: Fri, 17 Nov 2023 17:36:44 +0200
Subject: examples : add tokenize (#4039)

---
 examples/tokenize/tokenize.cpp | 44 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)
 create mode 100644 examples/tokenize/tokenize.cpp

diff --git a/examples/tokenize/tokenize.cpp b/examples/tokenize/tokenize.cpp
new file mode 100644
index 00000000..72b60f9a
--- /dev/null
+++ b/examples/tokenize/tokenize.cpp
@@ -0,0 +1,44 @@
+#include "common.h"
+#include "llama.h"
+
+#include <cmath>
+#include <cstdio>
+#include <string>
+#include <vector>
+
+int main(int argc, char ** argv) {
+    if (argc < 3 || argv[1][0] == '-') {
+        printf("usage: %s MODEL_PATH PROMPT [--ids]\n" , argv[0]);
+        return 1;
+    }
+
+    auto model_path = argv[1];
+    auto prompt = argv[2];
+
+    const bool printing_ids = argc > 3 && std::string(argv[3]) == "--ids";
+
+    llama_backend_init(false);
+
+    llama_model_params model_params = llama_model_default_params();
+    model_params.vocab_only = true;
+    llama_model * model = llama_load_model_from_file(model_path, model_params);
+
+    llama_context_params ctx_params = llama_context_default_params();
+    llama_context * ctx = llama_new_context_with_model(model, ctx_params);
+
+    const bool add_bos = true;
+
+    std::vector<llama_token> tokens;
+
+    tokens = ::llama_tokenize(model, prompt, add_bos, true);
+
+    for (int i = 0; i < (int) tokens.size(); i++) {
+        if (printing_ids) {
+            printf("%d\n", tokens[i]);
+        } else {
+            printf("%6d -> '%s'\n", tokens[i], llama_token_to_piece(ctx, tokens[i]).c_str());
+        }
+    }
+
+    return 0;
+}
--
cgit v1.2.3