From 1b67731e184e27a465b8c5476061294a4af668ea Mon Sep 17 00:00:00 2001
From: Jared Van Bortel
Date: Tue, 9 Apr 2024 13:44:08 -0400
Subject: BERT tokenizer fixes (#6498)

Key changes:
* BERT conversion: fix abuse of LlamaHfVocab, do not set BOS or EOS
* Nomic Embed conversion: pad vocab instead of slicing embedding tensor
* llama_tokenize: handle added special tokens like HF does
---
 examples/embedding/embedding.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'examples/embedding/embedding.cpp')

diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
index 53665752..6a93147d 100644
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -123,10 +123,10 @@ int main(int argc, char ** argv) {
         inputs.push_back(inp);
     }
 
-    // add eos if not present
+    // add SEP if not present
     for (auto & inp : inputs) {
-        if (inp.empty() || inp.back() != llama_token_eos(model)) {
-            inp.push_back(llama_token_eos(model));
+        if (inp.empty() || inp.back() != llama_token_sep(model)) {
+            inp.push_back(llama_token_sep(model));
         }
     }
 
-- 
cgit v1.2.3