common : revert showing control tokens by default for server (#6860)

* fix: revert showing control tokens by default
* feat: revert changes to default behavior of llama_token_to_piece; provide overloaded declaration that receives a "bool special" param to toggle showing control tokens
* feat: use the overloaded declaration of llama_token_to_piece from common/common.cpp to specify "false" so that control tokens are not shown in chat completion responses
* common : simplify

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
author: Kyle Mistele <kyle@mistele.com> 2024-04-24 05:15:29 -0500
committer: GitHub <noreply@github.com> 2024-04-24 13:15:29 +0300
commit: 37246b1031b1680c0dcaf20aef736d6b446203fa (patch)
tree: e058ba6529eb7436367f35d0d69e42684a8f431b
parent: 28103f4832e301a9c84d44ff0df9d75d46ab6c76 (diff)
3 files changed, 7 insertions, 6 deletions
diff --git a/common/common.cpp b/common/common.cpp
index a0d1f8d5..97f55b05 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -2328,12 +2328,12 @@ std::vector<llama_token> llama_tokenize(
     return result;
 }
 
-std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
+std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
     std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), true);
+    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
     if (n_tokens < 0) {
         result.resize(-n_tokens);
-        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), true);
+        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
         GGML_ASSERT(check == -n_tokens);
     } else {
         result.resize(n_tokens);
diff --git a/common/common.h b/common/common.h
index cca44268..157b54a3 100644
--- a/common/common.h
+++ b/common/common.h
@@ -237,11 +237,12 @@ std::vector<llama_token> llama_tokenize(
                         bool   add_special,
                         bool   parse_special = false);
 
-// tokenizes a token into a piece
+// tokenizes a token into a piece, optionally renders special/control tokens
 // should work similar to Python's `tokenizer.id_to_piece`
 std::string llama_token_to_piece(
         const struct llama_context * ctx,
-                       llama_token   token);
+                       llama_token   token,
+                       bool          special = true);
 
 // TODO: these should be moved in llama.h C-style API under single `llama_detokenize` function
 //       that takes into account the tokenizer type and decides how to handle the leading space
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 68c63f9f..3acbd17d 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1117,7 +1117,7 @@ struct server_context {
 
     bool process_token(completion_token_output & result, server_slot & slot) {
         // remember which tokens were sampled - used for repetition penalties during sampling
-        const std::string token_str = llama_token_to_piece(ctx, result.tok);
+        const std::string token_str = llama_token_to_piece(ctx, result.tok, false);
         slot.sampled = result.tok;
 
         // search stop word and delete it
author	Kyle Mistele <kyle@mistele.com>	2024-04-24 05:15:29 -0500
committer	GitHub <noreply@github.com>	2024-04-24 13:15:29 +0300
commit	37246b1031b1680c0dcaf20aef736d6b446203fa (patch)
tree	e058ba6529eb7436367f35d0d69e42684a8f431b
parent	28103f4832e301a9c84d44ff0df9d75d46ab6c76 (diff)