summary | refs | log | tree | commit | diff
path: root/gguf-py
diff options
context:
space:
mode:
authorKerfuffle <44031344+KerfuffleV2@users.noreply.github.com>2023-11-16 19:14:37 -0700
committerGitHub <noreply@github.com>2023-11-16 19:14:37 -0700
commit91f6499393d2d999331fbfdba47a7f8b9f913f0d (patch)
tree27caf3ad0b9cec979bb5ed3317b5334bdcd9470c /gguf-py
parent8da46278e1a57107591653275f8e03a281de94f0 (diff)
Respect tokenizer.ggml.add_bos_token value when tokenizing (#4040)
* gguf-py: gguf-dump: Respect --no-tensor flag in JSON mode.
* Respect add_bos_token GGUF metadata value
* gguf-py: Try to fix SpecialVocab giving up too easily for the Nth time
Diffstat (limited to 'gguf-py')
-rw-r--r--gguf-py/gguf/vocab.py25
-rw-r--r--gguf-py/pyproject.toml2
-rwxr-xr-xgguf-py/scripts/gguf-dump.py15
3 files changed, 24 insertions, 18 deletions
diff --git a/gguf-py/gguf/vocab.py b/gguf-py/gguf/vocab.py
index 71192a92..b9f50a0a 100644
--- a/gguf-py/gguf/vocab.py
+++ b/gguf-py/gguf/vocab.py
@@ -117,17 +117,18 @@ class SpecialVocab:
def _try_load_from_tokenizer_json(self, path: Path) -> bool:
tokenizer_file = path / 'tokenizer.json'
- if not tokenizer_file.is_file():
- return False
- with open(tokenizer_file, encoding = 'utf-8') as f:
- tokenizer = json.load(f)
- if self.load_merges:
- merges = tokenizer.get('model', {}).get('merges')
- if isinstance(merges, list) and merges and isinstance(merges[0], str):
- self.merges = merges
+ if tokenizer_file.is_file():
+ with open(tokenizer_file, encoding = 'utf-8') as f:
+ tokenizer = json.load(f)
+ if self.load_merges:
+ merges = tokenizer.get('model', {}).get('merges')
+ if isinstance(merges, list) and merges and isinstance(merges[0], str):
+ self.merges = merges
+ added_tokens = tokenizer.get('added_tokens', {})
+ else:
+ added_tokens = {}
tokenizer_config_file = path / 'tokenizer_config.json'
- added_tokens = tokenizer.get('added_tokens')
- if added_tokens is None or not tokenizer_config_file.is_file():
+ if not tokenizer_config_file.is_file():
return True
with open(tokenizer_config_file, encoding = 'utf-8') as f:
tokenizer_config = json.load(f)
@@ -135,6 +136,10 @@ class SpecialVocab:
add_entry = tokenizer_config.get(f'add_{typ}_token')
if isinstance(add_entry, bool):
self.add_special_token[typ] = add_entry
+ if not added_tokens:
+ # We will need this to get the content for the token, so if it's empty
+ # may as well just give up.
+ continue
entry = tokenizer_config.get(f'{typ}_token')
if isinstance(entry, str):
tc_content = entry
diff --git a/gguf-py/pyproject.toml b/gguf-py/pyproject.toml
index af777c3e..6e3f9e85 100644
--- a/gguf-py/pyproject.toml
+++ b/gguf-py/pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "gguf"
-version = "0.5.2"
+version = "0.5.3"
description = "Read and write ML models in GGUF for GGML"
authors = ["GGML <ggml@ggml.ai>"]
packages = [
diff --git a/gguf-py/scripts/gguf-dump.py b/gguf-py/scripts/gguf-dump.py
index 5141873d..dbf89150 100755
--- a/gguf-py/scripts/gguf-dump.py
+++ b/gguf-py/scripts/gguf-dump.py
@@ -86,13 +86,14 @@ def dump_metadata_json(reader: GGUFReader, args: argparse.Namespace) -> None:
curr["value"] = str(bytes(field.parts[-1]), encoding="utf-8")
else:
curr["value"] = field.parts[-1].tolist()[0]
- for idx, tensor in enumerate(reader.tensors):
- tensors[tensor.name] = {
- "index": idx,
- "shape": tensor.shape.tolist(),
- "type": tensor.tensor_type.name,
- "offset": tensor.field.offset,
- }
+ if not args.no_tensors:
+ for idx, tensor in enumerate(reader.tensors):
+ tensors[tensor.name] = {
+ "index": idx,
+ "shape": tensor.shape.tolist(),
+ "type": tensor.tensor_type.name,
+ "offset": tensor.field.offset,
+ }
json.dump(result, sys.stdout)