From 73122473ffd73030146276dbb85da7c8021a3ee4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C3=ABl=20de=20Vries?= Date: Thu, 15 Feb 2024 14:14:37 +0100 Subject: fix(gguf-py): special tokens are no longer skipped when add_<name>_token is set to false (#5487) * fix(gguf-py): special tokens are no longer skipped when add_<name>_token is set to false * fix(gguf-py): added missing cls and mask token ids to the gguf metadata --- gguf-py/gguf/constants.py | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'gguf-py/gguf/constants.py') diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 5fba0171..9986ce9d 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -73,6 +73,8 @@ class Keys: UNK_ID = "tokenizer.ggml.unknown_token_id" SEP_ID = "tokenizer.ggml.seperator_token_id" PAD_ID = "tokenizer.ggml.padding_token_id" + CLS_ID = "tokenizer.ggml.cls_token_id" + MASK_ID = "tokenizer.ggml.mask_token_id" ADD_BOS = "tokenizer.ggml.add_bos_token" ADD_EOS = "tokenizer.ggml.add_eos_token" ADD_PREFIX = "tokenizer.ggml.add_space_prefix" @@ -685,5 +687,7 @@ KEY_TOKENIZER_EOS_ID = Keys.Tokenizer.EOS_ID KEY_TOKENIZER_UNK_ID = Keys.Tokenizer.UNK_ID KEY_TOKENIZER_SEP_ID = Keys.Tokenizer.SEP_ID KEY_TOKENIZER_PAD_ID = Keys.Tokenizer.PAD_ID +KEY_TOKENIZER_CLS_ID = Keys.Tokenizer.CLS_ID +KEY_TOKENIZER_MASK_ID = Keys.Tokenizer.MASK_ID KEY_TOKENIZER_HF_JSON = Keys.Tokenizer.HF_JSON KEY_TOKENIZER_RWKV = Keys.Tokenizer.RWKV -- cgit v1.2.3