Diffstat (limited to 'examples/tokenize/tokenize.cpp'):
 examples/tokenize/tokenize.cpp | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)
diff --git a/examples/tokenize/tokenize.cpp b/examples/tokenize/tokenize.cpp
index 54c9834a..2afb6024 100644
--- a/examples/tokenize/tokenize.cpp
+++ b/examples/tokenize/tokenize.cpp
@@ -29,7 +29,9 @@ static void print_usage_information(const char * argv0, FILE * stream) {
fprintf(stream, " -p PROMPT, --prompt PROMPT read prompt from the argument.\n");
fprintf(stream, " --stdin read prompt from standard input.\n");
fprintf(stream, " --no-bos do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n");
+ fprintf(stream, " --no-parse-special do not parse control tokens.\n");
fprintf(stream, " --log-disable disable logs. Makes stderr quiet when loading the model.\n");
+ fprintf(stream, " --show-count print the total number of tokens.\n");
}

static void llama_log_callback_null(ggml_log_level level, const char * text, void * user_data) {
@@ -194,7 +196,9 @@ int main(int raw_argc, char ** raw_argv) {
// variables where to put any arguments we see.
bool printing_ids = false;
bool no_bos = false;
+ bool no_parse_special = false;
bool disable_logging = false;
+ bool show_token_count = false;
const char * model_path = NULL;
const char * prompt_path = NULL;
const char * prompt_arg = NULL;
@@ -227,6 +231,9 @@ int main(int raw_argc, char ** raw_argv) {
else if (arg == "--no-bos") {
no_bos = true;
}
+ else if (arg == "--no-parse-special") {
+ no_parse_special = true;
+ }
else if (arg == "-p" || arg == "--prompt") {
if (prompt_set) {
fprintf(stderr, "Error: -p or --prompt specified multiple times.\n");
@@ -249,6 +256,9 @@ int main(int raw_argc, char ** raw_argv) {
else if (arg == "--log-disable") {
disable_logging = true;
}
+ else if (arg == "--show-count") {
+ show_token_count = true;
+ }
else {
fprintf(stderr, "Error: unknown option '%s'\n", argv[iarg].c_str());
return 1;
@@ -354,9 +364,10 @@ int main(int raw_argc, char ** raw_argv) {
const bool model_wants_add_bos = llama_should_add_bos_token(model);
const bool add_bos = model_wants_add_bos && !no_bos;
+ const bool parse_special = !no_parse_special;

std::vector<llama_token> tokens;
- tokens = ::llama_tokenize(model, prompt, add_bos, true);
+ tokens = ::llama_tokenize(model, prompt, add_bos, parse_special);

if (printing_ids) {
printf("[");
@@ -384,6 +395,9 @@ int main(int raw_argc, char ** raw_argv) {
printf("]\n");
}
+ if (show_token_count) {
+ printf("Total number of tokens: %ld\n", tokens.size());
+ }
// silence valgrind
llama_free(ctx);
llama_free_model(model);
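
For reference, a minimal sketch (not part of the commit) of what the new parse_special argument changes. It reuses the four-argument ::llama_tokenize helper from llama.cpp's common library, called exactly as in the diff above; the compare_tokenizations function, the "<s>" sample prompt, and the omitted model-loading code are illustrative assumptions, not upstream code.

#include "common.h" // assumed: provides the ::llama_tokenize helper used in the diff
#include "llama.h"

#include <cstdio>
#include <string>
#include <vector>

// Hypothetical helper: tokenize the same prompt with and without control-token
// parsing, mirroring the default behavior vs. --no-parse-special.
static void compare_tokenizations(llama_model * model) {
    const std::string prompt = "<s>hello"; // "<s>" spelled out as plain text

    // parse_special = true (the default path): "<s>" is matched as a single
    // control token if the model's vocabulary defines one.
    std::vector<llama_token> parsed = ::llama_tokenize(model, prompt, /*add_bos=*/false, /*parse_special=*/true);

    // parse_special = false (--no-parse-special): "<s>" is tokenized as
    // ordinary text, typically producing more tokens.
    std::vector<llama_token> unparsed = ::llama_tokenize(model, prompt, /*add_bos=*/false, /*parse_special=*/false);

    // Mirrors the --show-count output added by this commit; %zu matches size_t.
    printf("parsed:   %zu tokens\n", parsed.size());
    printf("unparsed: %zu tokens\n", unparsed.size());
}

Keeping parse_special on by default preserves the example's previous behavior (the old call hard-coded true), while --no-parse-special lets users see how a prompt tokenizes when control-token spellings are treated as literal text.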