diff options
author | Michaël de Vries <vriesdemichael@gmail.com> | 2024-02-15 14:14:37 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-02-15 14:14:37 +0100 |
commit | 73122473ffd73030146276dbb85da7c8021a3ee4 (patch) | |
tree | 51544fef2e4d934553a5569eab83e385622cfd46 /gguf-py/gguf/vocab.py | |
parent | 0d4177126b0556e202efb85bf3f768be81076400 (diff) |
fix(gguf-py): special tokens are no longer skipped when add_<token>_token is set to false (#5487)
* fix(gguf-py): special tokens are no longer skipped when add_<token>_token is set to false
* fix(gguf-py): added missing cls and mask token ids to the gguf metadata
Diffstat (limited to 'gguf-py/gguf/vocab.py')
-rw-r--r-- | gguf-py/gguf/vocab.py | 6 |
1 files changed, 1 insertions, 5 deletions
```diff
diff --git a/gguf-py/gguf/vocab.py b/gguf-py/gguf/vocab.py
index cd194297..a23136b1 100644
--- a/gguf-py/gguf/vocab.py
+++ b/gguf-py/gguf/vocab.py
@@ -29,7 +29,7 @@ class SpecialVocab:
         if special_token_types is not None:
             self.special_token_types = special_token_types
         else:
-            self.special_token_types = ('bos', 'eos', 'unk', 'sep', 'pad')
+            self.special_token_types = ('bos', 'eos', 'unk', 'sep', 'pad', 'cls', 'mask')
         self._load(Path(path))
 
     def __repr__(self) -> str:
@@ -152,10 +152,6 @@ class SpecialVocab:
             add_entry = tokenizer_config.get(f'add_{typ}_token')
             if isinstance(add_entry, bool):
                 self.add_special_token[typ] = add_entry
-            if not added_tokens:
-                # We will need this to get the content for the token, so if it's empty
-                # may as well just give up.
-                continue
             entry = tokenizer_config.get(f'{typ}_token')
             if isinstance(entry, str):
                 tc_content = entry
```