From 1b67731e184e27a465b8c5476061294a4af668ea Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Tue, 9 Apr 2024 13:44:08 -0400 Subject: BERT tokenizer fixes (#6498) Key changes: * BERT conversion: fix abuse of LlamaHfVocab, do not set BOS or EOS * Nomic Embed conversion: pad vocab instead of slicing embedding tensor * llama_tokenize: handle added special tokens like HF does --- common/common.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'common/common.h') diff --git a/common/common.h b/common/common.h index 4635e05d..a7f476c1 100644 --- a/common/common.h +++ b/common/common.h @@ -223,14 +223,14 @@ void llama_batch_add( std::vector<llama_token> llama_tokenize( const struct llama_context * ctx, const std::string & text, - bool add_bos, - bool special = false); + bool add_special, + bool parse_special = false); std::vector<llama_token> llama_tokenize( const struct llama_model * model, const std::string & text, - bool add_bos, - bool special = false); + bool add_special, + bool parse_special = false); // tokenizes a token into a piece // should work similar to Python's `tokenizer.id_to_piece` -- cgit v1.2.3