summary | refs | log | tree | commit | diff
path: root/gguf-py/scripts
diff options
context:
space:
mode:
author Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com> 2023-11-16 19:14:37 -0700
committer GitHub <noreply@github.com> 2023-11-16 19:14:37 -0700
commit 91f6499393d2d999331fbfdba47a7f8b9f913f0d (patch)
tree 27caf3ad0b9cec979bb5ed3317b5334bdcd9470c /gguf-py/scripts
parent 8da46278e1a57107591653275f8e03a281de94f0 (diff)
Respect tokenizer.ggml.add_bos_token value when tokenizing (#4040)
* gguf-py: gguf-dump: Respect --no-tensor flag in JSON mode.
* Respect add_bos_token GGUF metadata value
* gguf-py: Try to fix SpecialVocab giving up too easily for the Nth time
Diffstat (limited to 'gguf-py/scripts')
-rwxr-xr-x gguf-py/scripts/gguf-dump.py 15
1 file changed, 8 insertions, 7 deletions
diff --git a/gguf-py/scripts/gguf-dump.py b/gguf-py/scripts/gguf-dump.py
index 5141873d..dbf89150 100755
--- a/gguf-py/scripts/gguf-dump.py
+++ b/gguf-py/scripts/gguf-dump.py
@@ -86,13 +86,14 @@ def dump_metadata_json(reader: GGUFReader, args: argparse.Namespace) -> None:
curr["value"] = str(bytes(field.parts[-1]), encoding="utf-8")
else:
curr["value"] = field.parts[-1].tolist()[0]
- for idx, tensor in enumerate(reader.tensors):
- tensors[tensor.name] = {
- "index": idx,
- "shape": tensor.shape.tolist(),
- "type": tensor.tensor_type.name,
- "offset": tensor.field.offset,
- }
+ if not args.no_tensors:
+ for idx, tensor in enumerate(reader.tensors):
+ tensors[tensor.name] = {
+ "index": idx,
+ "shape": tensor.shape.tolist(),
+ "type": tensor.tensor_type.name,
+ "offset": tensor.field.offset,
+ }
json.dump(result, sys.stdout)