author    Kawrakow <48489457+ikawrakow@users.noreply.github.com>  2024-07-27 07:55:01 +0200
committer GitHub <noreply@github.com>                             2024-07-27 07:55:01 +0200
commit    154e0d75fccf1784fe9ff6fd76a630b66563da3d
tree      81ce6dbb5b1900c1aa78a879f0593c694cab9d27 /examples/tokenize
parent    0684c3e9c70d49323b4fc517128cbe222cab7f96
Merge mainline llama.cpp (#3)
* Merging mainline - WIP
* Merging mainline - WIP
AVX2 and CUDA appear to work.
CUDA performance seems slightly (~1-2%) lower, as is so often
the case with llama.cpp/ggml after some "improvements" have been made.
* Merging mainline - fix Metal
* Remove check
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Diffstat (limited to 'examples/tokenize')
-rw-r--r-- | examples/tokenize/tokenize.cpp | 16 |
1 file changed, 15 insertions(+), 1 deletion(-)
```diff
diff --git a/examples/tokenize/tokenize.cpp b/examples/tokenize/tokenize.cpp
index 54c9834a..2afb6024 100644
--- a/examples/tokenize/tokenize.cpp
+++ b/examples/tokenize/tokenize.cpp
@@ -29,7 +29,9 @@ static void print_usage_information(const char * argv0, FILE * stream) {
     fprintf(stream, "    -p PROMPT, --prompt PROMPT  read prompt from the argument.\n");
     fprintf(stream, "    --stdin                     read prompt from standard input.\n");
     fprintf(stream, "    --no-bos                    do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n");
+    fprintf(stream, "    --no-parse-special          do not parse control tokens.\n");
     fprintf(stream, "    --log-disable               disable logs. Makes stderr quiet when loading the model.\n");
+    fprintf(stream, "    --show-count                print the total number of tokens.\n");
 }
 
 static void llama_log_callback_null(ggml_log_level level, const char * text, void * user_data) {
@@ -194,7 +196,9 @@ int main(int raw_argc, char ** raw_argv) {
     // variables where to put any arguments we see.
     bool printing_ids = false;
     bool no_bos = false;
+    bool no_parse_special = false;
     bool disable_logging = false;
+    bool show_token_count = false;
     const char * model_path = NULL;
     const char * prompt_path = NULL;
     const char * prompt_arg = NULL;
@@ -227,6 +231,9 @@ int main(int raw_argc, char ** raw_argv) {
         else if (arg == "--no-bos") {
             no_bos = true;
         }
+        else if (arg == "--no-parse-special") {
+            no_parse_special = true;
+        }
         else if (arg == "-p" || arg == "--prompt") {
             if (prompt_set) {
                 fprintf(stderr, "Error: -p or --prompt specified multiple times.\n");
@@ -249,6 +256,9 @@ int main(int raw_argc, char ** raw_argv) {
         else if (arg == "--log-disable") {
             disable_logging = true;
         }
+        else if (arg == "--show-count") {
+            show_token_count = true;
+        }
         else {
             fprintf(stderr, "Error: unknown option '%s'\n", argv[iarg].c_str());
             return 1;
@@ -354,9 +364,10 @@ int main(int raw_argc, char ** raw_argv) {
     const bool model_wants_add_bos = llama_should_add_bos_token(model);
     const bool add_bos = model_wants_add_bos && !no_bos;
+    const bool parse_special = !no_parse_special;
 
     std::vector<llama_token> tokens;
 
-    tokens = ::llama_tokenize(model, prompt, add_bos, true);
+    tokens = ::llama_tokenize(model, prompt, add_bos, parse_special);
 
     if (printing_ids) {
         printf("[");
@@ -384,6 +395,9 @@ int main(int raw_argc, char ** raw_argv) {
         printf("]\n");
     }
 
+    if (show_token_count) {
+        printf("Total number of tokens: %ld\n", tokens.size());
+    }
     // silence valgrind
     llama_free(ctx);
     llama_free_model(model);
```
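For context on what the two new flags do: `--no-parse-special` disables parsing of control tokens, so a sequence such as `<|user|>` in the prompt is tokenized as plain text instead of being matched to a single special-token ID, and `--show-count` prints the total number of tokens produced. The sketch below (not part of the commit) contrasts the two `parse_special` modes using the same `::llama_tokenize` helper from `common.h` that the diff calls; the model path `model.gguf` and the prompt string are placeholders.

```cpp
// Minimal sketch (not from this commit): contrasts parse_special on/off
// via the ::llama_tokenize helper from common.h used in the diff above.
// "model.gguf" and the prompt are placeholder values.
#include "common.h"
#include "llama.h"

#include <cstdio>
#include <string>
#include <vector>

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    mparams.vocab_only = true; // tokenizing needs only the vocabulary, not the weights

    llama_model * model = llama_load_model_from_file("model.gguf", mparams);
    if (model == NULL) {
        return 1;
    }

    const std::string prompt = "<|user|>Hello";

    // parse_special = true (the previous hard-coded behavior): a control
    // token like "<|user|>" can be emitted as a single special-token ID.
    std::vector<llama_token> parsed   = ::llama_tokenize(model, prompt, /*add_special*/ false, /*parse_special*/ true);
    // parse_special = false (what --no-parse-special selects): the same
    // text is tokenized as ordinary characters.
    std::vector<llama_token> verbatim = ::llama_tokenize(model, prompt, /*add_special*/ false, /*parse_special*/ false);

    // The counts usually differ when the prompt contains control tokens,
    // which is what --show-count makes easy to inspect from the CLI.
    printf("parse_special=true : %zu tokens\n", parsed.size());
    printf("parse_special=false: %zu tokens\n", verbatim.size());

    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```

From the command line the same comparison can be made by running the tokenize example (built as `llama-tokenize` or `tokenize`, depending on the build) once with `--show-count` and once with `--no-parse-special --show-count`, and comparing the reported totals.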