summary | refs | log | tree | commit | diff
path: root/gguf-py/gguf/vocab.py
diff options
context:
space:
mode:
Diffstat (limited to 'gguf-py/gguf/vocab.py')
-rw-r--r-- gguf-py/gguf/vocab.py 33
1 files changed, 30 insertions, 3 deletions
diff --git a/gguf-py/gguf/vocab.py b/gguf-py/gguf/vocab.py
index dc574991..cca09798 100644
--- a/gguf-py/gguf/vocab.py
+++ b/gguf-py/gguf/vocab.py
@@ -122,8 +122,30 @@ class SpecialVocab:
tokenizer = json.load(f)
if self.load_merges:
merges = tokenizer.get('model', {}).get('merges')
- if isinstance(merges, list) and merges and isinstance(merges[0], str):
- self.merges = merges
+ if isinstance(merges, list) and merges:
+ if isinstance(merges[0], str):
+ self.merges = merges
+ elif isinstance(merges[0], list) and len(merges[0]) == 2 and isinstance(merges[0][0], str):
+ # New format since transformers 4.45 to support spaces in merges
+ # ref: https://github.com/ggml-org/llama.cpp/issues/9692
+ # TODO: internally store as the new format instead of converting to old
+ if any(' ' in s for pair in merges for s in pair):
+ logger.warning(f'Spaces in merges detected, encoding as {chr(ord(" ") + 256)!r}')
+ self.merges = [
+ ' '.join(
+ [
+ # ensure the spaces are properly encoded
+ ''.join(
+ chr(ord(c) + 256) if c == ' ' else c
+ for c in part
+ )
+ for part in pair
+ ]
+ )
+ for pair in merges
+ ]
+ else:
+ raise ValueError("Unknown tokenizer merges format")
added_tokens = tokenizer.get('added_tokens', {})
else:
added_tokens = {}
@@ -132,7 +154,12 @@ class SpecialVocab:
return True
with open(tokenizer_config_file, encoding = 'utf-8') as f:
tokenizer_config = json.load(f)
- chat_template = tokenizer_config.get('chat_template')
+ chat_template_alt = None
+ chat_template_file = path / 'chat_template.json'
+ if chat_template_file.is_file():
+ with open(chat_template_file, encoding = 'utf-8') as f:
+ chat_template_alt = json.load(f).get('chat_template')
+ chat_template = tokenizer_config.get('chat_template', chat_template_alt)
if chat_template is None or isinstance(chat_template, (str, list)):
self.chat_template = chat_template
else: