summary | refs | log | tree | commit | diff
path: root/gguf-py
diff options
context:
space:
mode:
author: compilade <git@compilade.net> 2024-05-24 21:11:48 -0400
committer: GitHub <noreply@github.com> 2024-05-25 11:11:48 +1000
commit: b83bab15a5d2a1e7807d09613a9b34309d86cfaa (patch)
tree: 449b4201f8b8929f674fc2ad7654406ba2c50a4b /gguf-py
parent: d041d2ceaaf50e058622d92921b3e680ffa4e9e7 (diff)
gguf-py : fix and simplify quantized shape round-trip (#7483)
* gguf-py : fix and simplify quantized shape round-trip
* gguf-py : remove unused import
Diffstat (limited to 'gguf-py')
-rw-r--r-- gguf-py/gguf/gguf_reader.py 6
-rw-r--r-- gguf-py/gguf/gguf_writer.py 8
-rw-r--r-- gguf-py/gguf/quants.py 16
-rwxr-xr-x gguf-py/scripts/gguf-new-metadata.py 4
4 files changed, 24 insertions, 10 deletions
diff --git a/gguf-py/gguf/gguf_reader.py b/gguf-py/gguf/gguf_reader.py
index 21b089f8..e48bc00c 100644
--- a/gguf-py/gguf/gguf_reader.py
+++ b/gguf-py/gguf/gguf_reader.py
@@ -12,6 +12,8 @@ from typing import Any, Literal, NamedTuple, TypeVar, Union
import numpy as np
import numpy.typing as npt
+from .quants import quant_shape_to_byte_shape
+
if __name__ == "__main__":
import sys
from pathlib import Path
@@ -251,6 +253,7 @@ class GGUFReader:
tensor_names.add(tensor_name)
ggml_type = GGMLQuantizationType(raw_dtype[0])
n_elems = int(np.prod(dims))
+ np_dims = tuple(reversed(dims.tolist()))
block_size, type_size = GGML_QUANT_SIZES[ggml_type]
n_bytes = n_elems * type_size // block_size
data_offs = int(start_offs + offset_tensor[0])
@@ -279,6 +282,7 @@ class GGUFReader:
else:
item_count = n_bytes
item_type = np.uint8
+ np_dims = quant_shape_to_byte_shape(np_dims, ggml_type)
tensors.append(ReaderTensor(
name = tensor_name,
tensor_type = ggml_type,
@@ -286,7 +290,7 @@ class GGUFReader:
n_elements = n_elems,
n_bytes = n_bytes,
data_offset = data_offs,
- data = self._get(data_offs, item_type, item_count),
+ data = self._get(data_offs, item_type, item_count).reshape(np_dims),
field = field,
))
self.tensors = tensors
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index 8b41b54e..c194dd5d 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -13,7 +13,6 @@ from string import ascii_letters, digits
import numpy as np
from .constants import (
- GGML_QUANT_SIZES,
GGUF_DEFAULT_ALIGNMENT,
GGUF_MAGIC,
GGUF_VERSION,
@@ -26,6 +25,8 @@ from .constants import (
TokenType,
)
+from .quants import quant_shape_from_byte_shape
+
logger = logging.getLogger(__name__)
@@ -229,10 +230,7 @@ class GGUFWriter:
else:
dtype = raw_dtype
if tensor_dtype == np.uint8:
- block_size, type_size = GGML_QUANT_SIZES[raw_dtype]
- if tensor_shape[-1] % type_size != 0:
- raise ValueError(f"Quantized tensor row size ({tensor_shape[-1]}) is not a multiple of {dtype.name} type size ({type_size})")
- tensor_shape = tuple(tensor_shape[:-1]) + (tensor_shape[-1] // type_size * block_size,)
+ tensor_shape = quant_shape_from_byte_shape(tensor_shape, raw_dtype)
n_dims = len(tensor_shape)
self.ti_data += self._pack("I", n_dims)
for i in range(n_dims):
diff --git a/gguf-py/gguf/quants.py b/gguf-py/gguf/quants.py
index e7fc0eae..b22eec16 100644
--- a/gguf-py/gguf/quants.py
+++ b/gguf-py/gguf/quants.py
@@ -1,5 +1,5 @@
from __future__ import annotations
-from typing import Callable
+from typing import Callable, Sequence
from numpy.typing import DTypeLike
@@ -9,6 +9,20 @@ from .lazy import LazyNumpyTensor
import numpy as np
+def quant_shape_to_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType):
+ block_size, type_size = GGML_QUANT_SIZES[quant_type]
+ if shape[-1] % block_size != 0:
+ raise ValueError(f"Quantized tensor row size ({shape[-1]}) is not a multiple of {quant_type.name} block size ({block_size})")
+ return (*shape[:-1], shape[-1] // block_size * type_size)
+
+
+def quant_shape_from_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType):
+ block_size, type_size = GGML_QUANT_SIZES[quant_type]
+ if shape[-1] % type_size != 0:
+ raise ValueError(f"Quantized tensor bytes per row ({shape[-1]}) is not a multiple of {quant_type.name} type size ({type_size})")
+ return (*shape[:-1], shape[-1] // type_size * block_size)
+
+
# same as ggml_compute_fp32_to_bf16 in ggml-impl.h
def __compute_fp32_to_bf16(n: np.ndarray) -> np.ndarray:
n = n.astype(np.float32, copy=False).view(np.int32)
diff --git a/gguf-py/scripts/gguf-new-metadata.py b/gguf-py/scripts/gguf-new-metadata.py
index 63d3c5d8..c9f1927f 100755
--- a/gguf-py/scripts/gguf-new-metadata.py
+++ b/gguf-py/scripts/gguf-new-metadata.py
@@ -118,9 +118,7 @@ def copy_with_new_metadata(reader: gguf.GGUFReader, writer: gguf.GGUFWriter, new
for tensor in reader.tensors:
total_bytes += tensor.n_bytes
- # Dimensions are written in reverse order, so flip them first
- shape = np.flipud(tensor.shape).tolist()
- writer.add_tensor_info(tensor.name, shape, tensor.data.dtype, tensor.data.nbytes, tensor.tensor_type)
+ writer.add_tensor_info(tensor.name, tensor.data.shape, tensor.data.dtype, tensor.data.nbytes, tensor.tensor_type)
bar = tqdm(desc="Writing", total=total_bytes, unit="byte", unit_scale=True)