summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMichaël de Vries <vriesdemichael@gmail.com>2024-02-15 14:14:37 +0100
committerGitHub <noreply@github.com>2024-02-15 14:14:37 +0100
commit73122473ffd73030146276dbb85da7c8021a3ee4 (patch)
tree51544fef2e4d934553a5569eab83e385622cfd46
parent0d4177126b0556e202efb85bf3f768be81076400 (diff)
fix(gguf-py): special tokens are no longer skipped when add_<token>_token is set to false (#5487)
* fix(gguf-py): special tokens are no longer skipped when add_<token>_token is set to false
* fix(gguf-py): added missing cls and mask token ids to the gguf metadata
-rw-r--r--gguf-py/gguf/constants.py4
-rw-r--r--gguf-py/gguf/gguf_writer.py6
-rw-r--r--gguf-py/gguf/vocab.py6
3 files changed, 11 insertions, 5 deletions
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 5fba0171..9986ce9d 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -73,6 +73,8 @@ class Keys:
UNK_ID = "tokenizer.ggml.unknown_token_id"
SEP_ID = "tokenizer.ggml.seperator_token_id"
PAD_ID = "tokenizer.ggml.padding_token_id"
+ CLS_ID = "tokenizer.ggml.cls_token_id"
+ MASK_ID = "tokenizer.ggml.mask_token_id"
ADD_BOS = "tokenizer.ggml.add_bos_token"
ADD_EOS = "tokenizer.ggml.add_eos_token"
ADD_PREFIX = "tokenizer.ggml.add_space_prefix"
@@ -685,5 +687,7 @@ KEY_TOKENIZER_EOS_ID = Keys.Tokenizer.EOS_ID
KEY_TOKENIZER_UNK_ID = Keys.Tokenizer.UNK_ID
KEY_TOKENIZER_SEP_ID = Keys.Tokenizer.SEP_ID
KEY_TOKENIZER_PAD_ID = Keys.Tokenizer.PAD_ID
+KEY_TOKENIZER_CLS_ID = Keys.Tokenizer.CLS_ID
+KEY_TOKENIZER_MASK_ID = Keys.Tokenizer.MASK_ID
KEY_TOKENIZER_HF_JSON = Keys.Tokenizer.HF_JSON
KEY_TOKENIZER_RWKV = Keys.Tokenizer.RWKV
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index d87bd8e8..26724bf9 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -414,6 +414,12 @@ class GGUFWriter:
def add_pad_token_id(self, id: int) -> None:
self.add_uint32(Keys.Tokenizer.PAD_ID, id)
+ def add_cls_token_id(self, id: int) -> None:
+ self.add_uint32(Keys.Tokenizer.CLS_ID, id)
+
+ def add_mask_token_id(self, id: int) -> None:
+ self.add_uint32(Keys.Tokenizer.MASK_ID, id)
+
def add_add_bos_token(self, value: bool) -> None:
self.add_bool(Keys.Tokenizer.ADD_BOS, value)
diff --git a/gguf-py/gguf/vocab.py b/gguf-py/gguf/vocab.py
index cd194297..a23136b1 100644
--- a/gguf-py/gguf/vocab.py
+++ b/gguf-py/gguf/vocab.py
@@ -29,7 +29,7 @@ class SpecialVocab:
if special_token_types is not None:
self.special_token_types = special_token_types
else:
- self.special_token_types = ('bos', 'eos', 'unk', 'sep', 'pad')
+ self.special_token_types = ('bos', 'eos', 'unk', 'sep', 'pad', 'cls', 'mask')
self._load(Path(path))
def __repr__(self) -> str:
@@ -152,10 +152,6 @@ class SpecialVocab:
add_entry = tokenizer_config.get(f'add_{typ}_token')
if isinstance(add_entry, bool):
self.add_special_token[typ] = add_entry
- if not added_tokens:
- # We will need this to get the content for the token, so if it's empty
- # may as well just give up.
- continue
entry = tokenizer_config.get(f'{typ}_token')
if isinstance(entry, str):
tc_content = entry