diff options
author | Georgi Gerganov <ggerganov@gmail.com> | 2024-03-14 15:14:14 +0200 |
---|---|---|
committer | Georgi Gerganov <ggerganov@gmail.com> | 2024-03-14 15:14:14 +0200 |
commit | 044ec4b2a567f649459ccd20af2f387c784faa51 (patch) | |
tree | a99402e1fa0ec74c8bd3a477cc7f62d1f5e1a8be /examples/embedding/embedding.cpp | |
parent | 77178eedc83d49f31bf757d8e12315d76460be78 (diff) |
embedding : add EOS token if not present (#899)
Diffstat (limited to 'examples/embedding/embedding.cpp')
-rw-r--r-- | examples/embedding/embedding.cpp | 11 |
1 files changed, 9 insertions, 2 deletions
```diff
diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
index 895469a3..cbf9aa2b 100644
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -112,13 +112,20 @@ int main(int argc, char ** argv) {
     // tokenize the prompts and trim
     std::vector<std::vector<int32_t>> inputs;
     for (const auto & prompt : prompts) {
-        auto inp = ::llama_tokenize(ctx, prompt, true);
+        auto inp = ::llama_tokenize(ctx, prompt, true, false);
         if (inp.size() > n_batch) {
             inp.resize(n_batch);
         }
         inputs.push_back(inp);
     }
 
+    // add eos if not present
+    for (auto & inp : inputs) {
+        if (inp.empty() || inp.back() != llama_token_eos(model)) {
+            inp.push_back(llama_token_eos(model));
+        }
+    }
+
     // tokenization stats
     if (params.verbose_prompt) {
         for (int i = 0; i < (int) inputs.size(); i++) {
@@ -172,7 +179,7 @@ int main(int argc, char ** argv) {
     for (int j = 0; j < n_prompts; j++) {
         fprintf(stdout, "embedding %d: ", j);
         for (int i = 0; i < std::min(16, n_embd); i++) {
-            fprintf(stdout, "%f ", emb[j * n_embd + i]);
+            fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
         }
         fprintf(stdout, "\n");
     }
```