embedding : add EOS token if not present (#899)

author: Georgi Gerganov <ggerganov@gmail.com> 2024-03-14 15:14:14 +0200
committer: Georgi Gerganov <ggerganov@gmail.com> 2024-03-14 15:14:14 +0200
commit: 044ec4b2a567f649459ccd20af2f387c784faa51 (patch)
tree: a99402e1fa0ec74c8bd3a477cc7f62d1f5e1a8be /examples/embedding/embedding.cpp
parent: 77178eedc83d49f31bf757d8e12315d76460be78 (diff)
1 files changed, 9 insertions, 2 deletions
diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
index 895469a3..cbf9aa2b 100644
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -112,13 +112,20 @@ int main(int argc, char ** argv) {
     // tokenize the prompts and trim
     std::vector<std::vector<int32_t>> inputs;
     for (const auto & prompt : prompts) {
-        auto inp = ::llama_tokenize(ctx, prompt, true);
+        auto inp = ::llama_tokenize(ctx, prompt, true, false);
         if (inp.size() > n_batch) {
             inp.resize(n_batch);
         }
         inputs.push_back(inp);
     }
 
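+    // append the EOS token to any input that does not already end with it (runs after the n_batch trim above, so a trimmed input may end up with n_batch + 1 tokens)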
+    for (auto & inp : inputs) {
+        if (inp.empty() || inp.back() != llama_token_eos(model)) {
+            inp.push_back(llama_token_eos(model));
+        }
+    }
+
     // tokenization stats
     if (params.verbose_prompt) {
         for (int i = 0; i < (int) inputs.size(); i++) {
@@ -172,7 +179,7 @@ int main(int argc, char ** argv) {
     for (int j = 0; j < n_prompts; j++) {
         fprintf(stdout, "embedding %d: ", j);
         for (int i = 0; i < std::min(16, n_embd); i++) {
-            fprintf(stdout, "%f ", emb[j * n_embd + i]);
+            fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
         }
         fprintf(stdout, "\n");
     }
author	Georgi Gerganov <ggerganov@gmail.com>	2024-03-14 15:14:14 +0200
committer	Georgi Gerganov <ggerganov@gmail.com>	2024-03-14 15:14:14 +0200
commit	044ec4b2a567f649459ccd20af2f387c784faa51 (patch)
tree	a99402e1fa0ec74c8bd3a477cc7f62d1f5e1a8be /examples/embedding/embedding.cpp
parent	77178eedc83d49f31bf757d8e12315d76460be78 (diff)