convert : support models with multiple chat templates (#6588)

* Support converting models with multiple chat templates

  Adds the following metadata:
  * tokenizer.chat_templates
  * tokenizer.chat_template.<name1>
  * tokenizer.chat_template.<name2>
  * tokenizer.chat_template.<...>

  Where `tokenizer.chat_templates` is an array of the template names (except `default`); `default` is added to the regular `tokenizer.chat_template`.

* replace filtered characters with underscore

* New script to add/modify/remove metadata

  This script creates a copy of a GGUF file and allows you to add/modify/remove metadata in the process.

  Most importantly this allows you to update chat templates, either as a string or directly from an updated tokenizer_config.json file.

* Add files via upload

  add new script to project/readme

* flake--
author: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> 2024-04-18 13:49:01 +0200
committer: GitHub <noreply@github.com> 2024-04-18 14:49:01 +0300
commit: 03c0946d73c63ea73e1d85015b7088298443d438 (patch)
tree: 3add06ebc5f14022771887ca0bce636b39ff8633 /gguf-py/gguf/gguf_writer.py
parent: e11b2e6e1e18522ca7cf129600875a0f6fb9307d (diff)
1 files changed, 29 insertions, 2 deletions
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index ff9326d5..e3dbca45 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -6,7 +6,8 @@ import struct
 import tempfile
 from enum import Enum, auto
 from io import BufferedWriter
-from typing import IO, Any, Sequence
+from typing import IO, Any, Sequence, Mapping
+from string import ascii_letters, digits
 
 import numpy as np
 
@@ -466,7 +467,33 @@ class GGUFWriter:
     def add_add_space_prefix(self, value: bool) -> None:
         self.add_bool(Keys.Tokenizer.ADD_PREFIX, value)
 
-    def add_chat_template(self, value: str) -> None:
+    def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None:  # value: a single template string, or a list of {'name': ..., 'template': ...} mappings
+        if isinstance(value, list):  # only a `list` takes this branch; any other value is passed straight to add_string at the bottom
+            template_default = None
+            template_names = set()
+
+            for choice in value:
+                name = choice.get('name', '')
+                template = choice.get('template')
+
+                # Non-alphanumeric characters in a template name are probably not a good idea, so replace each one with '_'
+                name = ''.join((c if c in ascii_letters + digits else '_' for c in name))  # NOTE(review): distinct names can collide after this step (e.g. 'a-b' and 'a_b')
+
+                if name and template is not None:  # entries with an empty name or no template are skipped silently
+                    if name == 'default':
+                        template_default = template  # written under Keys.Tokenizer.CHAT_TEMPLATE at the end of the method
+                    else:
+                        template_names.add(name)
+                        self.add_string(Keys.Tokenizer.CHAT_TEMPLATE_N.format(name=name), template)  # one string entry per named template
+
+            if template_names:
+                self.add_array(Keys.Tokenizer.CHAT_TEMPLATES, list(template_names))  # NOTE(review): built from a set, so the order of names is not fixed
+
+            if template_default is None:
+                return  # no 'default' entry in the list: Keys.Tokenizer.CHAT_TEMPLATE is not written
+
+            value = template_default
+
+        self.add_string(Keys.Tokenizer.CHAT_TEMPLATE, value)  # plain-string input, or the 'default' template taken from the list
 
     def add_prefix_token_id(self, id: int) -> None:
author	Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>	2024-04-18 13:49:01 +0200
committer	GitHub <noreply@github.com>	2024-04-18 14:49:01 +0300
commit	03c0946d73c63ea73e1d85015b7088298443d438 (patch)
tree	3add06ebc5f14022771887ca0bce636b39ff8633 /gguf-py/gguf/gguf_writer.py
parent	e11b2e6e1e18522ca7cf129600875a0f6fb9307d (diff)