diff options
| | | |
|---|---|---|
| author | Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com> | 2023-11-16 19:14:37 -0700 |
| committer | GitHub <noreply@github.com> | 2023-11-16 19:14:37 -0700 |
| commit | 91f6499393d2d999331fbfdba47a7f8b9f913f0d (patch) | |
| tree | 27caf3ad0b9cec979bb5ed3317b5334bdcd9470c /gguf-py/scripts | |
| parent | 8da46278e1a57107591653275f8e03a281de94f0 (diff) | |
Respect tokenizer.ggml.add_bos_token value when tokenizing (#4040)
* gguf-py: gguf-dump: Respect --no-tensor flag in JSON mode.
* Respect add_bos_token GGUF metadata value
* gguf-py: Try to fix SpecialVocab giving up too easily for the Nth time
Diffstat (limited to 'gguf-py/scripts')
-rwxr-xr-x | gguf-py/scripts/gguf-dump.py | 15 |
1 files changed, 8 insertions, 7 deletions
```diff
diff --git a/gguf-py/scripts/gguf-dump.py b/gguf-py/scripts/gguf-dump.py
index 5141873d..dbf89150 100755
--- a/gguf-py/scripts/gguf-dump.py
+++ b/gguf-py/scripts/gguf-dump.py
@@ -86,13 +86,14 @@ def dump_metadata_json(reader: GGUFReader, args: argparse.Namespace) -> None:
             curr["value"] = str(bytes(field.parts[-1]), encoding="utf-8")
         else:
             curr["value"] = field.parts[-1].tolist()[0]
-    for idx, tensor in enumerate(reader.tensors):
-        tensors[tensor.name] = {
-            "index": idx,
-            "shape": tensor.shape.tolist(),
-            "type": tensor.tensor_type.name,
-            "offset": tensor.field.offset,
-        }
+    if not args.no_tensors:
+        for idx, tensor in enumerate(reader.tensors):
+            tensors[tensor.name] = {
+                "index": idx,
+                "shape": tensor.shape.tolist(),
+                "type": tensor.tensor_type.name,
+                "offset": tensor.field.offset,
+            }
     json.dump(result, sys.stdout)
 
 
```