author    Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>  2023-10-22 12:14:56 -0600
committer GitHub <noreply@github.com>  2023-10-22 21:14:56 +0300
commit    a5e7dbd6141128bfa3c40a19c2945a181df625d3 (patch)
tree      14cb15291418d4f591d7a58d8239eb02b966b595 /gguf-py
parent    d3956aea53369455008159cc405ed4c496976692 (diff)
llama : validate special token ids are in range when loading GGUF model (#3635)
* Add validation for special token ids to llama.cpp
  Small optimization for llama_byte_to_token SPM mode
* Fix BPE newline check, only I could break something so simple
* Killll meeeeee
* Account for GGUF_KEY_KEY only setting when the key exists
* Minor code cleanups.
* Fix convert.py error msg when added tokens are out of range
* Make gguf SpecialVocab vocab size-aware
  Update conversion scripts accordingly
* Avoid a string copy

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
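The heart of the change is the range check itself: ids that are not non-negative integers are ignored, ids below the vocab size (or any id when no size is known) are accepted, and everything else is skipped with a warning. A minimal standalone sketch of that logic (the function and variable names here are illustrative, not the actual class API):

    import sys

    def set_special_token(ids: dict[str, int], typ: str, tid: object,
                          n_vocab: int | None) -> None:
        # Ignore anything that is not a non-negative integer id.
        if not isinstance(tid, int) or tid < 0:
            return
        # With no known vocab size there is nothing to validate against.
        if n_vocab is None or tid < n_vocab:
            ids[typ] = tid
            return
        print(f'WARNING: special token {typ} id {tid} out of range '
              f'(n_vocab = {n_vocab}) - skipping', file=sys.stderr)

    ids: dict[str, int] = {}
    set_special_token(ids, 'bos', 1, n_vocab=32000)      # accepted
    set_special_token(ids, 'eos', 50000, n_vocab=32000)  # warned and skipped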
Diffstat (limited to 'gguf-py')
-rw-r--r--  gguf-py/gguf/gguf.py  36
1 file changed, 25 insertions(+), 11 deletions(-)
diff --git a/gguf-py/gguf/gguf.py b/gguf-py/gguf/gguf.py
index 072c839c..6b7d6542 100644
--- a/gguf-py/gguf/gguf.py
+++ b/gguf-py/gguf/gguf.py
@@ -987,12 +987,15 @@ class SpecialVocab:
     merges: list[str] = []
     special_token_types: tuple[str, ...] = ('bos', 'eos', 'unk', 'sep', 'pad')
     special_token_ids: dict[str, int] = {}
+    n_vocab: int | None = None
 
     def __init__(
         self, path: str | os.PathLike[str], load_merges: bool = False,
         special_token_types: tuple[str, ...] | None = None,
+        n_vocab: int | None = None,
     ):
         self.special_token_ids = {}
+        self.n_vocab = n_vocab
         self.load_merges = load_merges
         if special_token_types is not None:
             self.special_token_types = special_token_types
@@ -1002,6 +1005,16 @@ class SpecialVocab:
         if not self._try_load_from_tokenizer_json(path):
             self._try_load_from_config_json(path)
 
+    def _set_special_token(self, typ: str, tid: Any):
+        if not isinstance(tid, int) or tid < 0:
+            return
+        if self.n_vocab is None or tid < self.n_vocab:
+            self.special_token_ids[typ] = tid
+            return
+        print(f'gguf: WARNING: Special token type {typ}, id {tid} out of range, must be under {self.n_vocab} - skipping',
+              file = sys.stderr)
+
+
     def _try_load_from_tokenizer_json(self, path: Path) -> bool:
         tokenizer_file = path / 'tokenizer.json'
         if not tokenizer_file.is_file():
@@ -1029,10 +1042,11 @@ class SpecialVocab:
                 tc_content = entry_content
             else:
                 continue
-            for maybe_token_id in (atok.get('id') for atok in added_tokens if atok.get('content') == tc_content):
-                if isinstance(maybe_token_id, int) and maybe_token_id >= 0:
-                    self.special_token_ids[typ] = maybe_token_id
-                break
+            # We only need the first match here.
+            maybe_token_id = next((
+                atok.get('id') for atok in added_tokens
+                if atok.get('content') == tc_content), None)
+            self._set_special_token(typ, maybe_token_id)
         return True
 
     def _try_load_from_config_json(self, path: Path) -> bool:
@@ -1042,21 +1056,21 @@ class SpecialVocab:
         with open(config_file, encoding = 'utf-8') as f:
             config = json.load(f)
         for typ in self.special_token_types:
-            maybe_token_id = config.get(f'{typ}_token_id')
-            if isinstance(maybe_token_id, int) and maybe_token_id >= 0:
-                self.special_token_ids[typ] = maybe_token_id
+            self._set_special_token(typ, config.get(f'{typ}_token_id'))
         return True
 
-    def add_to_gguf(self, gw: GGUFWriter) -> None:
+    def add_to_gguf(self, gw: GGUFWriter, quiet: bool = False) -> None:
         if len(self.merges) > 0:
-            print(f'gguf: Adding {len(self.merges)} merge(s).')
+            if not quiet:
+                print(f'gguf: Adding {len(self.merges)} merge(s).')
             gw.add_token_merges(self.merges)
         for typ, tokid in self.special_token_ids.items():
             handler: Callable[[int], None] | None = getattr(gw, f'add_{typ}_token_id', None)
             if handler is None:
-                print(f'gguf: WARNING: No handler for special token type {typ} with id {tokid} - skipping')
+                print(f'gguf: WARNING: No handler for special token type {typ} with id {tokid} - skipping', file = sys.stderr)
                 continue
-            print(f'gguf: Setting special token type {typ} to {tokid}')
+            if not quiet:
+                print(f'gguf: Setting special token type {typ} to {tokid}')
             handler(tokid)
 
     def __repr__(self) -> str:
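With the diff applied, a conversion script opts in by passing the model's vocabulary size to the constructor. A hypothetical caller (the import path reflects the single-module layout of gguf-py at this commit; the directory and the n_vocab value of 32000 are placeholders):

    from gguf.gguf import SpecialVocab

    # n_vocab comes from the model being converted; 32000 is a placeholder.
    special_vocab = SpecialVocab('path/to/model-dir', load_merges=True, n_vocab=32000)

    # Any special token id >= n_vocab was already dropped with a warning during
    # loading, so only validated ids would reach the writer:
    # special_vocab.add_to_gguf(gguf_writer)  # gguf_writer: a GGUFWriter instance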