From a5e7dbd6141128bfa3c40a19c2945a181df625d3 Mon Sep 17 00:00:00 2001
From: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Date: Sun, 22 Oct 2023 12:14:56 -0600
Subject: llama : validate special token ids are in range when loading GGUF model (#3635)

* Add validation for special token ids to llama.cpp

Small optimization for llama_byte_to_token SPM mode

* Fix BPE newline check, only I could break something so simple

* Killll meeeeee

* Account for GGUF_KEY_KEY only setting when the key exists

* Minor code cleanups.

* Fix convert.py error msg when added tokens are out of range

* Make gguf SpecialVocab vocab size-aware

Update conversion scripts accordingly

* Avoid a string copy

Co-authored-by: Georgi Gerganov

---------

Co-authored-by: Georgi Gerganov
---
 gguf-py/gguf/gguf.py | 36 +++++++++++++++++++++++++-----------
 1 file changed, 25 insertions(+), 11 deletions(-)

(limited to 'gguf-py/gguf/gguf.py')

diff --git a/gguf-py/gguf/gguf.py b/gguf-py/gguf/gguf.py
index 072c839c..6b7d6542 100644
--- a/gguf-py/gguf/gguf.py
+++ b/gguf-py/gguf/gguf.py
@@ -987,12 +987,15 @@ class SpecialVocab:
     merges: list[str] = []
     special_token_types: tuple[str, ...] = ('bos', 'eos', 'unk', 'sep', 'pad')
     special_token_ids: dict[str, int] = {}
+    n_vocab: int | None = None
 
     def __init__(
         self, path: str | os.PathLike[str], load_merges: bool = False,
         special_token_types: tuple[str, ...] | None = None,
+        n_vocab: int | None = None,
     ):
         self.special_token_ids = {}
+        self.n_vocab = n_vocab
         self.load_merges = load_merges
         if special_token_types is not None:
             self.special_token_types = special_token_types
@@ -1002,6 +1005,16 @@ class SpecialVocab:
             if not self._try_load_from_tokenizer_json(path):
                 self._try_load_from_config_json(path)
 
+    def _set_special_token(self, typ: str, tid: Any):
+        if not isinstance(tid, int) or tid < 0:
+            return
+        if self.n_vocab is None or tid < self.n_vocab:
+            self.special_token_ids[typ] = tid
+            return
+        print(f'gguf: WARNING: Special token type {typ}, id {tid} out of range, must be under {self.n_vocab} - skipping',
+              file = sys.stderr)
+
+
     def _try_load_from_tokenizer_json(self, path: Path) -> bool:
         tokenizer_file = path / 'tokenizer.json'
         if not tokenizer_file.is_file():
@@ -1029,10 +1042,11 @@ class SpecialVocab:
                     tc_content = entry_content
                 else:
                     continue
-                for maybe_token_id in (atok.get('id') for atok in added_tokens if atok.get('content') == tc_content):
-                    if isinstance(maybe_token_id, int) and maybe_token_id >= 0:
-                        self.special_token_ids[typ] = maybe_token_id
-                    break
+                # We only need the first match here.
+                maybe_token_id = next((
+                    atok.get('id') for atok in added_tokens
+                    if atok.get('content') == tc_content), None)
+                self._set_special_token(typ, maybe_token_id)
         return True
 
     def _try_load_from_config_json(self, path: Path) -> bool:
@@ -1042,21 +1056,21 @@ class SpecialVocab:
         with open(config_file, encoding = 'utf-8') as f:
             config = json.load(f)
         for typ in self.special_token_types:
-            maybe_token_id = config.get(f'{typ}_token_id')
-            if isinstance(maybe_token_id, int) and maybe_token_id >= 0:
-                self.special_token_ids[typ] = maybe_token_id
+            self._set_special_token(typ, config.get(f'{typ}_token_id'))
         return True
 
-    def add_to_gguf(self, gw: GGUFWriter) -> None:
+    def add_to_gguf(self, gw: GGUFWriter, quiet: bool = False) -> None:
         if len(self.merges) > 0:
-            print(f'gguf: Adding {len(self.merges)} merge(s).')
+            if not quiet:
+                print(f'gguf: Adding {len(self.merges)} merge(s).')
             gw.add_token_merges(self.merges)
         for typ, tokid in self.special_token_ids.items():
             handler: Callable[[int], None] | None = getattr(gw, f'add_{typ}_token_id', None)
             if handler is None:
-                print(f'gguf: WARNING: No handler for special token type {typ} with id {tokid} - skipping')
+                print(f'gguf: WARNING: No handler for special token type {typ} with id {tokid} - skipping', file = sys.stderr)
                 continue
-            print(f'gguf: Setting special token type {typ} to {tokid}')
+            if not quiet:
+                print(f'gguf: Setting special token type {typ} to {tokid}')
             handler(tokid)
 
     def __repr__(self) -> str:
-- 
cgit v1.2.3
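
A minimal usage sketch (not part of the patch) of how a conversion script might drive the size-aware SpecialVocab this change introduces; the model directory, output file name, arch string, and vocab size below are illustrative assumptions:

    # Sketch only: assumes the gguf-py package from this commit is importable.
    from pathlib import Path
    from gguf import GGUFWriter, SpecialVocab

    model_dir = Path('models/example')   # hypothetical HF model directory
    n_vocab   = 32000                    # vocab size the converter already knows

    gw = GGUFWriter('example.gguf', 'llama')
    # With n_vocab set, any special token id >= n_vocab is skipped with a
    # warning on stderr instead of being written into the GGUF file.
    special_vocab = SpecialVocab(model_dir, load_merges = True, n_vocab = n_vocab)
    special_vocab.add_to_gguf(gw)

Per the diff, calling add_to_gguf(gw, quiet = True) suppresses the informational prints while still emitting the warnings on stderr.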