llama : remove token functions with `context` args in favor of `model` (#3720)

* added `llama_model_token_*` variants to all the `llama_token_*` functions.
* added `LLAMA_API`
* formatting
  Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* removed old `llama_token` functions
* changed 3 more functions to take in model
  - `llama_token_get_text`
  - `llama_token_get_score`
  - `llama_token_get_type`
* added back docs
* fixed main.cpp
* changed token functions to use new model variants
* changed token functions to use new model variants
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
author: Marcus Dunn <51931484+MarcusDunn@users.noreply.github.com> 2023-10-23 12:40:03 -0700
committer: GitHub <noreply@github.com> 2023-10-23 22:40:03 +0300
commit: 5be6c803fa5378f62a1590f3ad8c6b64c7c0c2ce (patch)
tree: 190868e0431070686d797c3c2d86da857b8ba55f /examples/server/server.cpp
parent: 6336701c9378c23c85d1c0e464b663ca2bbb8e60 (diff)
1 files changed, 7 insertions, 7 deletions
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index c3279dbc..693f9b77 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -726,7 +726,7 @@ struct llama_server_context
 
         if (json_value(data, "ignore_eos", false))
         {
-            slot->sparams.logit_bias[llama_token_eos(ctx)] = -INFINITY;
+            slot->sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
         }
 
         const auto &logit_bias = data.find("logit_bias");
@@ -1056,7 +1056,7 @@ struct llama_server_context
             slot.has_next_token = false;
         }
 
-        if (!slot.cache_tokens.empty() && result.tok == llama_token_eos(ctx))
+        if (!slot.cache_tokens.empty() && result.tok == llama_token_eos(model))
         {
             slot.stopped_eos = true;
             slot.has_next_token = false;
@@ -1130,7 +1130,7 @@ struct llama_server_context
 
     json get_formated_generation(llama_client_slot &slot)
     {
-        const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(ctx));
+        const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model));
         const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() &&
                                 eos_bias->second < 0.0f && std::isinf(eos_bias->second);
         return json {
@@ -1555,11 +1555,11 @@ struct llama_server_context
                             suffix_tokens.erase(suffix_tokens.begin());
                         }
 
-                        prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(ctx));
-                        prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(ctx)); // always add BOS
-                        prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(ctx));
+                        prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model));
+                        prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(model)); // always add BOS
+                        prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(model));
                         prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end());
-                        prefix_tokens.push_back(llama_token_middle(ctx));
+                        prefix_tokens.push_back(llama_token_middle(model));
                         prompt_tokens = prefix_tokens;
                     }
                     else
author	Marcus Dunn <51931484+MarcusDunn@users.noreply.github.com>	2023-10-23 12:40:03 -0700
committer	GitHub <noreply@github.com>	2023-10-23 22:40:03 +0300
commit	5be6c803fa5378f62a1590f3ad8c6b64c7c0c2ce (patch)
tree	190868e0431070686d797c3c2d86da857b8ba55f /examples/server/server.cpp
parent	6336701c9378c23c85d1c0e464b663ca2bbb8e60 (diff)