diff options
Diffstat (limited to 'gguf-py/gguf')
-rw-r--r-- | gguf-py/gguf/constants.py | 2 | ||||
-rw-r--r-- | gguf-py/gguf/gguf_writer.py | 31 | ||||
-rw-r--r-- | gguf-py/gguf/vocab.py | 2 |
3 files changed, 32 insertions, 3 deletions
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 4b0b6c4c..feae03e1 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -90,6 +90,8 @@ class Keys: HF_JSON = "tokenizer.huggingface.json" RWKV = "tokenizer.rwkv.world" CHAT_TEMPLATE = "tokenizer.chat_template" + CHAT_TEMPLATE_N = "tokenizer.chat_template.{name}" + CHAT_TEMPLATES = "tokenizer.chat_templates" # FIM/Infill special tokens constants PREFIX_ID = "tokenizer.ggml.prefix_token_id" SUFFIX_ID = "tokenizer.ggml.suffix_token_id" diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index ff9326d5..e3dbca45 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -6,7 +6,8 @@ import struct import tempfile from enum import Enum, auto from io import BufferedWriter -from typing import IO, Any, Sequence +from typing import IO, Any, Sequence, Mapping +from string import ascii_letters, digits import numpy as np @@ -466,7 +467,33 @@ class GGUFWriter: def add_add_space_prefix(self, value: bool) -> None: self.add_bool(Keys.Tokenizer.ADD_PREFIX, value) - def add_chat_template(self, value: str) -> None: + def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None: + if isinstance(value, list): + template_default = None + template_names = set() + + for choice in value: + name = choice.get('name', '') + template = choice.get('template') + + # Allowing non-alphanumerical characters in template name is probably not a good idea, so filter it + name = ''.join((c if c in ascii_letters + digits else '_' for c in name)) + + if name and template is not None: + if name == 'default': + template_default = template + else: + template_names.add(name) + self.add_string(Keys.Tokenizer.CHAT_TEMPLATE_N.format(name=name), template) + + if template_names: + self.add_array(Keys.Tokenizer.CHAT_TEMPLATES, list(template_names)) + + if template_default is None: + return + + value = template_default + self.add_string(Keys.Tokenizer.CHAT_TEMPLATE, value) def add_prefix_token_id(self, id: int) -> None: diff --git a/gguf-py/gguf/vocab.py b/gguf-py/gguf/vocab.py index a23136b1..378eaeca 100644 --- a/gguf-py/gguf/vocab.py +++ b/gguf-py/gguf/vocab.py @@ -141,7 +141,7 @@ class SpecialVocab: with open(tokenizer_config_file, encoding = 'utf-8') as f: tokenizer_config = json.load(f) chat_template = tokenizer_config.get('chat_template') - if chat_template is None or isinstance(chat_template, str): + if chat_template is None or isinstance(chat_template, (str, list)): self.chat_template = chat_template else: print( |