From 1b67731e184e27a465b8c5476061294a4af668ea Mon Sep 17 00:00:00 2001
From: Jared Van Bortel
Date: Tue, 9 Apr 2024 13:44:08 -0400
Subject: BERT tokenizer fixes (#6498)

Key changes:
* BERT conversion: fix abuse of LlamaHfVocab, do not set BOS or EOS
* Nomic Embed conversion: pad vocab instead of slicing embedding tensor
* llama_tokenize: handle added special tokens like HF does
---
 examples/tokenize/tokenize.cpp | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'examples/tokenize/tokenize.cpp')

diff --git a/examples/tokenize/tokenize.cpp b/examples/tokenize/tokenize.cpp
index d95a9247..8b1baea8 100644
--- a/examples/tokenize/tokenize.cpp
+++ b/examples/tokenize/tokenize.cpp
@@ -26,11 +26,9 @@ int main(int argc, char ** argv) {
     llama_context_params ctx_params = llama_context_default_params();
     llama_context * ctx = llama_new_context_with_model(model, ctx_params);
 
-    const bool add_bos = llama_should_add_bos_token(model);
-
     std::vector<llama_token> tokens;
 
-    tokens = ::llama_tokenize(model, prompt, add_bos, true);
+    tokens = ::llama_tokenize(model, prompt, true, true);
 
     for (int i = 0; i < (int) tokens.size(); i++) {
         if (printing_ids) {
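For reference, the call pattern introduced by this change can be exercised outside the example as well. The sketch below is hypothetical and not part of the patch: it assumes the common-helper overload ::llama_tokenize(model, text, add_special, parse_special) from common.h at this revision (flag names taken from the commit description), plus the usual vocab-only model loading. Hardcoding the first flag to true delegates the "should BOS/EOS be added" decision to the model metadata instead of the removed llama_should_add_bos_token() check in the example.

// Hypothetical sketch (not part of the patch): tokenize a prompt the same way
// the updated example does. Assumes the common-helper overload
//   ::llama_tokenize(model, text, /*add_special*/, /*parse_special*/)
// from common.h at this revision.
#include "common.h"
#include "llama.h"

#include <cstdio>
#include <string>
#include <vector>

int main(int argc, char ** argv) {
    if (argc < 3) {
        fprintf(stderr, "usage: %s MODEL_PATH PROMPT\n", argv[0]);
        return 1;
    }

    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    mparams.vocab_only = true; // only the vocab is needed for tokenization
    llama_model * model = llama_load_model_from_file(argv[1], mparams);
    if (model == NULL) {
        return 1;
    }

    const std::string prompt = argv[2];

    // First bool:  add the special tokens the model is configured to add
    //              (replaces the removed llama_should_add_bos_token() check).
    // Second bool: parse added special tokens in the text, so a literal special
    //              token string maps to its id instead of being split as plain text.
    std::vector<llama_token> tokens = ::llama_tokenize(model, prompt, true, true);

    for (const llama_token id : tokens) {
        printf("%d\n", id);
    }

    llama_free_model(model);
    llama_backend_free();
    return 0;
}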