diff options
author | compilade <git@compilade.net> | 2024-05-13 14:10:51 -0400 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-05-13 14:10:51 -0400 |
commit | ee52225067622babc277371511b8124884e1c797 (patch) | |
tree | 8150564487416fc038952c8b85f1462f3b1c98cf /gguf-py | |
parent | 614d3b914e1c3e02596f869649eb4f1d3b68614d (diff) |
convert-hf : support direct Q8_0 conversion (#7234)
* convert-hf : support q8_0 conversion
* convert-hf : add missing ftype
This was messing with the checksums otherwise.
* convert-hf : add missing ftype to Baichuan and Xverse
I didn't notice these on my first pass.
Diffstat (limited to 'gguf-py')
-rw-r--r-- | gguf-py/gguf/__init__.py | 1 | ||||
-rw-r--r-- | gguf-py/gguf/gguf_writer.py | 16 | ||||
-rw-r--r-- | gguf-py/gguf/lazy.py | 29 | ||||
-rw-r--r-- | gguf-py/gguf/quants.py | 109 |
4 files changed, 141 insertions, 14 deletions
diff --git a/gguf-py/gguf/__init__.py b/gguf-py/gguf/__init__.py index e5d5806c..ea5146b1 100644 --- a/gguf-py/gguf/__init__.py +++ b/gguf-py/gguf/__init__.py @@ -2,5 +2,6 @@ from .constants import * from .lazy import * from .gguf_reader import * from .gguf_writer import * +from .quants import * from .tensor_mapping import * from .vocab import * diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 96574358..d5e323a5 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -13,6 +13,7 @@ from string import ascii_letters, digits import numpy as np from .constants import ( + GGML_QUANT_SIZES, GGUF_DEFAULT_ALIGNMENT, GGUF_MAGIC, GGUF_VERSION, @@ -195,7 +196,7 @@ class GGUFWriter: return ((x + n - 1) // n) * n def add_tensor_info( - self, name: str, tensor_shape: Sequence[int], tensor_dtype: np.dtype[np.float16] | np.dtype[np.float32], + self, name: str, tensor_shape: Sequence[int], tensor_dtype: np.dtype, tensor_nbytes: int, raw_dtype: GGMLQuantizationType | None = None, ) -> None: if self.state is not WriterState.EMPTY: @@ -208,10 +209,6 @@ class GGUFWriter: encoded_name = name.encode("utf-8") self.ti_data += self._pack("Q", len(encoded_name)) self.ti_data += encoded_name - n_dims = len(tensor_shape) - self.ti_data += self._pack("I", n_dims) - for i in range(n_dims): - self.ti_data += self._pack("Q", tensor_shape[n_dims - 1 - i]) if raw_dtype is None: if tensor_dtype == np.float16: dtype = GGMLQuantizationType.F16 @@ -231,6 +228,15 @@ class GGUFWriter: raise ValueError("Only F16, F32, F64, I8, I16, I32, I64 tensors are supported for now") else: dtype = raw_dtype + if tensor_dtype == np.uint8: + block_size, type_size = GGML_QUANT_SIZES[raw_dtype] + if tensor_shape[-1] % type_size != 0: + raise ValueError(f"Quantized tensor row size ({tensor_shape[-1]}) is not a multiple of {dtype.name} type size ({type_size})") + tensor_shape = tuple(tensor_shape[:-1]) + (tensor_shape[-1] // type_size * block_size,) + n_dims = len(tensor_shape) + self.ti_data += self._pack("I", n_dims) + for i in range(n_dims): + self.ti_data += self._pack("Q", tensor_shape[n_dims - 1 - i]) self.ti_data += self._pack("I", dtype) self.ti_data += self._pack("Q", self.offset_tensor) self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment) diff --git a/gguf-py/gguf/lazy.py b/gguf-py/gguf/lazy.py index 650bea11..1167335b 100644 --- a/gguf-py/gguf/lazy.py +++ b/gguf-py/gguf/lazy.py @@ -6,6 +6,7 @@ from typing import Any, Callable from collections import deque import numpy as np +from numpy._typing import _Shape from numpy.typing import DTypeLike @@ -110,7 +111,7 @@ class LazyBase(ABC, metaclass=LazyMeta): return o @classmethod - def _wrap_fn(cls, fn: Callable, *, use_self: LazyBase | None = None, meta_noop: bool | DTypeLike = False) -> Callable[[Any], Any]: + def _wrap_fn(cls, fn: Callable, *, use_self: LazyBase | None = None, meta_noop: bool | DTypeLike | tuple[DTypeLike, Callable[[tuple[int, ...]], tuple[int, ...]]] = False) -> Callable[[Any], Any]: def wrapped_fn(*args, **kwargs): if kwargs is None: kwargs = {} @@ -130,9 +131,14 @@ class LazyBase(ABC, metaclass=LazyMeta): res = args[0] assert isinstance(res, cls) res = res._meta - # allow operations to override the dtype + # allow operations to override the dtype and shape if meta_noop is not True: - res = cls.meta_with_dtype(res, meta_noop) + if isinstance(meta_noop, tuple): + dtype, shape = meta_noop + assert callable(shape) + res = cls.meta_with_dtype_and_shape(dtype, shape(res.shape)) + else: + res = cls.meta_with_dtype_and_shape(meta_noop, res.shape) if isinstance(res, cls._tensor_type): def collect_replace(t: LazyBase): @@ -168,7 +174,12 @@ class LazyBase(ABC, metaclass=LazyMeta): while _t._data is None: lt = _t._lazy.popleft() if lt._data is not None: - raise ValueError(f"{lt} did not belong in the lazy queue") + # Lazy tensor did not belong in the lazy queue. + # Weirdly only happens with Bloom models... + # likely because tensors aren't unique in the queue. + # The final output is still the same as in eager mode, + # so it's safe to ignore this. + continue assert lt._func is not None lt._args = cls._recurse_apply(lt._args, already_eager_to_eager) lt._data = lt._func(lt._args) @@ -183,12 +194,12 @@ class LazyBase(ABC, metaclass=LazyMeta): @classmethod def eager_to_meta(cls, t: Any) -> Any: - return cls.meta_with_dtype(t, t.dtype) + return cls.meta_with_dtype_and_shape(t.dtype, t.shape) # must be overridden, meta tensor init is backend-specific @classmethod @abstractmethod - def meta_with_dtype(cls, m: Any, dtype: Any) -> Any: pass + def meta_with_dtype_and_shape(cls, dtype: Any, shape: Any) -> Any: pass @classmethod def from_eager(cls, t: Any) -> Any: @@ -205,15 +216,15 @@ class LazyNumpyTensor(LazyBase): _tensor_type = np.ndarray @classmethod - def meta_with_dtype(cls, m: np.ndarray[Any, Any], dtype: DTypeLike) -> np.ndarray[Any, Any]: + def meta_with_dtype_and_shape(cls, dtype: DTypeLike, shape: _Shape) -> np.ndarray[Any, Any]: # The initial idea was to use np.nan as the fill value, # but non-float types like np.int16 can't use that. # So zero it is. cheat = np.zeros(1, dtype) - return np.lib.stride_tricks.as_strided(cheat, m.shape, (0 for _ in m.shape)) + return np.lib.stride_tricks.as_strided(cheat, shape, (0 for _ in shape)) def astype(self, dtype, *args, **kwargs): - meta = type(self).meta_with_dtype(self._meta, dtype) + meta = type(self).meta_with_dtype_and_shape(dtype, self._meta.shape) full_args = (self, dtype,) + args # very important to pass the shared _lazy deque, or else there's an infinite loop somewhere. return type(self)(meta=meta, args=full_args, lazy=self._lazy, func=(lambda a: a[0].astype(*a[1:], **kwargs))) diff --git a/gguf-py/gguf/quants.py b/gguf-py/gguf/quants.py new file mode 100644 index 00000000..e7fc0eae --- /dev/null +++ b/gguf-py/gguf/quants.py @@ -0,0 +1,109 @@ +from __future__ import annotations +from typing import Callable + +from numpy.typing import DTypeLike + +from .constants import GGML_QUANT_SIZES, GGMLQuantizationType +from .lazy import LazyNumpyTensor + +import numpy as np + + +# same as ggml_compute_fp32_to_bf16 in ggml-impl.h +def __compute_fp32_to_bf16(n: np.ndarray) -> np.ndarray: + n = n.astype(np.float32, copy=False).view(np.int32) + # force nan to quiet + n = np.where((n & 0x7fffffff) > 0x7f800000, (n & 0xffff0000) | (64 << 16), n) + # flush subnormals to zero + n = np.where((n & 0x7f800000) == 0, n & 0x80000000, n) + # round to nearest even + n = (n + (0x7fff + ((n >> 16) & 1))) >> 16 + return n.astype(np.int16) + + +# This is faster than np.vectorize and np.apply_along_axis because it works on more than one row at a time +def __apply_over_grouped_rows(func: Callable[[np.ndarray], np.ndarray], arr: np.ndarray, otype: DTypeLike, oshape: tuple[int, ...]) -> np.ndarray: + rows = arr.reshape((-1, arr.shape[-1])) + osize = 1 + for dim in oshape: + osize *= dim + out = np.empty(shape=osize, dtype=otype) + # compute over groups of 16 rows (arbitrary, but seems good for performance) + n_groups = rows.shape[0] // 16 + np.concatenate([func(group).ravel() for group in np.array_split(rows, n_groups)], axis=0, out=out) + return out.reshape(oshape) + + +def __quantize_bf16_array(n: np.ndarray) -> np.ndarray: + return __apply_over_grouped_rows(__compute_fp32_to_bf16, arr=n, otype=np.int16, oshape=n.shape) + + +__quantize_bf16_lazy = LazyNumpyTensor._wrap_fn(__quantize_bf16_array, meta_noop=np.int16) + + +def quantize_bf16(n: np.ndarray): + if type(n) is LazyNumpyTensor: + return __quantize_bf16_lazy(n) + else: + return __quantize_bf16_array(n) + + +__q8_block_size, __q8_type_size = GGML_QUANT_SIZES[GGMLQuantizationType.Q8_0] + + +def can_quantize_to_q8_0(n: np.ndarray) -> bool: + return n.shape[-1] % __q8_block_size == 0 + + +# round away from zero +# ref: https://stackoverflow.com/a/59143326/22827863 +def np_roundf(n: np.ndarray) -> np.ndarray: + a = abs(n) + floored = np.floor(a) + b = floored + np.floor(2 * (a - floored)) + return np.sign(n) * b + + +def __quantize_q8_0_shape_change(s: tuple[int, ...]) -> tuple[int, ...]: + return (*s[:-1], s[-1] // __q8_block_size * __q8_type_size) + + +# Implementation of Q8_0 with bit-exact same results as reference implementation in ggml-quants.c +def __quantize_q8_0_rows(n: np.ndarray) -> np.ndarray: + shape = n.shape + assert shape[-1] % __q8_block_size == 0 + + n_blocks = n.size // __q8_block_size + + blocks = n.reshape((n_blocks, __q8_block_size)).astype(np.float32, copy=False) + + d = abs(blocks).max(axis=1, keepdims=True) / 127 + with np.errstate(divide="ignore"): + id = np.where(d == 0, 0, 1 / d) + qs = np_roundf(blocks * id) + + # (n_blocks, 2) + d = d.astype(np.float16).view(np.uint8) + # (n_blocks, block_size) + qs = qs.astype(np.int8).view(np.uint8) + + assert d.shape[1] + qs.shape[1] == __q8_type_size + + return np.concatenate([d, qs], axis=1).reshape(__quantize_q8_0_shape_change(shape)) + + +def __quantize_q8_0_array(n: np.ndarray) -> np.ndarray: + return __apply_over_grouped_rows(__quantize_q8_0_rows, arr=n, otype=np.uint8, oshape=__quantize_q8_0_shape_change(n.shape)) + + +__quantize_q8_0_lazy = LazyNumpyTensor._wrap_fn( + __quantize_q8_0_array, + meta_noop=(np.uint8, __quantize_q8_0_shape_change), +) + + +def quantize_q8_0(data: np.ndarray): + if type(data) is LazyNumpyTensor: + return __quantize_q8_0_lazy(data) + else: + return __quantize_q8_0_array(data) |