From 91f6499393d2d999331fbfdba47a7f8b9f913f0d Mon Sep 17 00:00:00 2001
From: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Date: Thu, 16 Nov 2023 19:14:37 -0700
Subject: Respect tokenizer.ggml.add_bos_token value when tokenizing (#4040)

* gguf-py: gguf-dump: Respect --no-tensor flag in JSON mode.

* Respect add_bos_token GGUF metadata value

* gguf-py: Try to fix SpecialVocab giving up too easily for the Nth time
---
 gguf-py/gguf/vocab.py | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

(limited to 'gguf-py/gguf/vocab.py')

diff --git a/gguf-py/gguf/vocab.py b/gguf-py/gguf/vocab.py
index 71192a92..b9f50a0a 100644
--- a/gguf-py/gguf/vocab.py
+++ b/gguf-py/gguf/vocab.py
@@ -117,17 +117,18 @@ class SpecialVocab:
 
     def _try_load_from_tokenizer_json(self, path: Path) -> bool:
         tokenizer_file = path / 'tokenizer.json'
-        if not tokenizer_file.is_file():
-            return False
-        with open(tokenizer_file, encoding = 'utf-8') as f:
-            tokenizer = json.load(f)
-        if self.load_merges:
-            merges = tokenizer.get('model', {}).get('merges')
-            if isinstance(merges, list) and merges and isinstance(merges[0], str):
-                self.merges = merges
+        if tokenizer_file.is_file():
+            with open(tokenizer_file, encoding = 'utf-8') as f:
+                tokenizer = json.load(f)
+            if self.load_merges:
+                merges = tokenizer.get('model', {}).get('merges')
+                if isinstance(merges, list) and merges and isinstance(merges[0], str):
+                    self.merges = merges
+            added_tokens = tokenizer.get('added_tokens', {})
+        else:
+            added_tokens = {}
         tokenizer_config_file = path / 'tokenizer_config.json'
-        added_tokens = tokenizer.get('added_tokens')
-        if added_tokens is None or not tokenizer_config_file.is_file():
+        if not tokenizer_config_file.is_file():
             return True
         with open(tokenizer_config_file, encoding = 'utf-8') as f:
             tokenizer_config = json.load(f)
@@ -135,6 +136,10 @@ class SpecialVocab:
             add_entry = tokenizer_config.get(f'add_{typ}_token')
             if isinstance(add_entry, bool):
                 self.add_special_token[typ] = add_entry
+            if not added_tokens:
+                # We will need this to get the content for the token, so if it's empty
+                # may as well just give up.
+                continue
             entry = tokenizer_config.get(f'{typ}_token')
             if isinstance(entry, str):
                 tc_content = entry
-- 
cgit v1.2.3
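
Note: the sketch below is not part of the patch. It illustrates the flag-loading
behavior after this commit: the add_{typ}_token flags from tokenizer_config.json
are captured even when tokenizer.json is missing or has no added_tokens, so
converters can still emit tokenizer.ggml.add_bos_token. The helper name
load_add_special_flags and the hard-coded token-type tuple are illustrative
assumptions, not gguf-py API.

    import json
    from pathlib import Path

    # Hypothetical helper (not gguf-py API) mirroring the patched
    # SpecialVocab logic for just the add_{typ}_token flags.
    def load_add_special_flags(path: Path) -> dict:
        flags = {}
        config_file = path / 'tokenizer_config.json'
        if not config_file.is_file():
            return flags
        with open(config_file, encoding='utf-8') as f:
            tokenizer_config = json.load(f)
        # Token types assumed for illustration; SpecialVocab takes its own list.
        for typ in ('bos', 'eos', 'unk', 'sep', 'pad'):
            add_entry = tokenizer_config.get(f'add_{typ}_token')
            if isinstance(add_entry, bool):
                flags[typ] = add_entry
        return flags

Given a tokenizer_config.json containing {"add_bos_token": true,
"add_eos_token": false}, this returns {'bos': True, 'eos': False}; the convert
scripts can then write those values as tokenizer.ggml.add_bos_token and
tokenizer.ggml.add_eos_token, which llama.cpp respects when tokenizing.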