From a5e7dbd6141128bfa3c40a19c2945a181df625d3 Mon Sep 17 00:00:00 2001
From: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Date: Sun, 22 Oct 2023 12:14:56 -0600
Subject: llama : validate special token ids are in range when loading GGUF model (#3635)

* Add validation for special token ids to llama.cpp

Small optimization for llama_byte_to_token SPM mode

* Fix BPE newline check, only I could break something so simple

* Killll meeeeee

* Account for GGUF_KEY_KEY only setting when the key exists

* Minor code cleanups.

* Fix convert.py error msg when added tokens are out of range

* Make gguf SpecialVocab vocab size-aware

Update conversion scripts accordingly

* Avoid a string copy

Co-authored-by: Georgi Gerganov

---------

Co-authored-by: Georgi Gerganov
---
 gguf-py/gguf/gguf.py | 36 +++++++++++++++++++++++++-----------
 1 file changed, 25 insertions(+), 11 deletions(-)

(limited to 'gguf-py/gguf/gguf.py')

diff --git a/gguf-py/gguf/gguf.py b/gguf-py/gguf/gguf.py
index 072c839c..6b7d6542 100644
--- a/gguf-py/gguf/gguf.py
+++ b/gguf-py/gguf/gguf.py
@@ -987,12 +987,15 @@ class SpecialVocab:
     merges: list[str] = []
     special_token_types: tuple[str, ...] = ('bos', 'eos', 'unk', 'sep', 'pad')
     special_token_ids: dict[str, int] = {}
+    n_vocab: int | None = None
 
     def __init__(
         self, path: str | os.PathLike[str], load_merges: bool = False,
         special_token_types: tuple[str, ...] | None = None,
+        n_vocab: int | None = None,
     ):
         self.special_token_ids = {}
+        self.n_vocab = n_vocab
         self.load_merges = load_merges
         if special_token_types is not None:
             self.special_token_types = special_token_types
@@ -1002,6 +1005,16 @@ class SpecialVocab:
             if not self._try_load_from_tokenizer_json(path):
                 self._try_load_from_config_json(path)
 
+    def _set_special_token(self, typ: str, tid: Any):
+        if not isinstance(tid, int) or tid < 0:
+            return
+        if self.n_vocab is None or tid < self.n_vocab:
+            self.special_token_ids[typ] = tid
+            return
+        print(f'gguf: WARNING: Special token type {typ}, id {tid} out of range, must be under {self.n_vocab} - skipping',
+              file = sys.stderr)
+
+
     def _try_load_from_tokenizer_json(self, path: Path) -> bool:
         tokenizer_file = path / 'tokenizer.json'
         if not tokenizer_file.is_file():
@@ -1029,10 +1042,11 @@ class SpecialVocab:
                     tc_content = entry_content
                 else:
                     continue
-                for maybe_token_id in (atok.get('id') for atok in added_tokens if atok.get('content') == tc_content):
-                    if isinstance(maybe_token_id, int) and maybe_token_id >= 0:
-                        self.special_token_ids[typ] = maybe_token_id
-                    break
+                # We only need the first match here.
+                maybe_token_id = next((
+                    atok.get('id') for atok in added_tokens
+                    if atok.get('content') == tc_content), None)
+                self._set_special_token(typ, maybe_token_id)
         return True
 
     def _try_load_from_config_json(self, path: Path) -> bool:
@@ -1042,21 +1056,21 @@ class SpecialVocab:
         with open(config_file, encoding = 'utf-8') as f:
             config = json.load(f)
         for typ in self.special_token_types:
-            maybe_token_id = config.get(f'{typ}_token_id')
-            if isinstance(maybe_token_id, int) and maybe_token_id >= 0:
-                self.special_token_ids[typ] = maybe_token_id
+            self._set_special_token(typ, config.get(f'{typ}_token_id'))
         return True
 
-    def add_to_gguf(self, gw: GGUFWriter) -> None:
+    def add_to_gguf(self, gw: GGUFWriter, quiet: bool = False) -> None:
         if len(self.merges) > 0:
-            print(f'gguf: Adding {len(self.merges)} merge(s).')
+            if not quiet:
+                print(f'gguf: Adding {len(self.merges)} merge(s).')
             gw.add_token_merges(self.merges)
         for typ, tokid in self.special_token_ids.items():
             handler: Callable[[int], None] | None = getattr(gw, f'add_{typ}_token_id', None)
             if handler is None:
-                print(f'gguf: WARNING: No handler for special token type {typ} with id {tokid} - skipping')
+                print(f'gguf: WARNING: No handler for special token type {typ} with id {tokid} - skipping', file = sys.stderr)
                 continue
-            print(f'gguf: Setting special token type {typ} to {tokid}')
+            if not quiet:
+                print(f'gguf: Setting special token type {typ} to {tokid}')
             handler(tokid)
 
     def __repr__(self) -> str:
-- 
cgit v1.2.3
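
A minimal usage sketch (not part of the patch) of how a conversion script might drive the size-aware SpecialVocab this change introduces; the model directory, output file name, arch string, and vocab size below are illustrative assumptions:

    # Sketch only: assumes the gguf-py package from this commit is importable.
    from pathlib import Path
    from gguf import GGUFWriter, SpecialVocab

    model_dir = Path('models/example')   # hypothetical HF model directory
    n_vocab   = 32000                    # vocab size the converter already knows

    gw = GGUFWriter('example.gguf', 'llama')
    # With n_vocab set, any special token id >= n_vocab is skipped with a
    # warning on stderr instead of being written into the GGUF file.
    special_vocab = SpecialVocab(model_dir, load_merges = True, n_vocab = n_vocab)
    special_vocab.add_to_gguf(gw)

Per the diff, calling add_to_gguf(gw, quiet = True) suppresses the informational prints while still emitting the warnings on stderr.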