author    Qin Yue Chen <71813199+chenqiny@users.noreply.github.com>  2023-10-20 06:19:40 -0500
committer GitHub <noreply@github.com>  2023-10-20 14:19:40 +0300
commit    8cf19d60dc93809db8e51fedc811595eed9134c5 (patch)
tree      879c1861fb50748c02ec031a1dcc3f6e732ca366
parent    a0edf73bda31c7c4e649e6f07c6fd30a729929cd (diff)
gguf : support big endian platform (#3552)
* check whether the platform is s390x; if yes, do not import immintrin.h
* support s390x big endian
* support --bigendian option for s390x
  1. verified with baichuan7b-chat with float 16 on s390x
  2. verified with baichuan7b-chat
  3. verified with chinese-alpaca-2-13b-f16
* update format based on editor-config checker result
* Update convert-baichuan-hf-to-gguf.py
* 1. check in ggml.c whether the endianness matches
  2. update GGUF version
  3. change get_pack_prefix to a property
  4. update information log
* always use "GGUF" as the beginning of a GGUF file
* Compare "GGUF" with the file header char by char
  1. Set GGUF_MAGIC to the string "GGUF" instead of an int value
  2. Compare "GGUF" char by char to ensure its byte order
  3. Move the byte-swap code from convert.py to gguf.py write_tensor_data

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
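For orientation, a minimal sketch (illustration only, not part of the diff below) of how the first eight header bytes differ between the two variants. The 4-byte magic is always the literal bytes "GGUF"; only the fields after it follow the file's byte order:

    import struct

    magic = b"GGUF"                    # identical bytes on every platform
    version_le = struct.pack("<I", 3)  # b'\x03\x00\x00\x00' in a little-endian file
    version_be = struct.pack(">I", 3)  # b'\x00\x00\x00\x03' in a big-endian (s390x) file
    assert version_le != version_be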
-rwxr-xr-x  convert-baichuan-hf-to-gguf.py                                 8
-rwxr-xr-x  convert.py                                                    20
-rw-r--r--  examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp   2
-rw-r--r--  ggml.c                                                        19
-rw-r--r--  ggml.h                                                         5
-rw-r--r--  gguf-py/gguf/gguf.py                                          73
-rw-r--r--  gguf-py/pyproject.toml                                         2
-rw-r--r--  k_quants.c                                                     2
-rw-r--r--  tests/test-double-float.cpp                                    2
9 files changed, 84 insertions, 49 deletions
diff --git a/convert-baichuan-hf-to-gguf.py b/convert-baichuan-hf-to-gguf.py
index 513a7516..a1783f71 100755
--- a/convert-baichuan-hf-to-gguf.py
+++ b/convert-baichuan-hf-to-gguf.py
@@ -76,6 +76,7 @@ def parse_args() -> argparse.Namespace:
"ftype", type=int, choices=[0, 1], default=1, nargs='?',
help="output format - use 0 for float32, 1 for float16",
)
+    parser.add_argument("--bigendian", action="store_true", help="model is executed on a big-endian machine")
return parser.parse_args()
args = parse_args()
@@ -86,6 +87,11 @@ if not dir_model.is_dir():
print(f'Error: {args.model} is not a directory', file = sys.stderr)
sys.exit(1)
+endianess = gguf.GGUFEndian.LITTLE
+if args.bigendian:
+ endianess = gguf.GGUFEndian.BIG
+endianess_str = "Big Endian" if args.bigendian else "Little Endian"
+print(f"gguf: Conversion Endianness: {endianess_str}")
# possible tensor data types
# ftype == 0 -> float32
# ftype == 1 -> float16
@@ -113,7 +119,7 @@ if hparams["architectures"][0] != "BaichuanForCausalLM":
num_parts = count_model_parts(dir_model)
print(f"num_parts:{num_parts}\n")
ARCH=gguf.MODEL_ARCH.BAICHUAN
-gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
+gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
print("gguf: get model metadata")
diff --git a/convert.py b/convert.py
index e9b08d34..24da25ef 100755
--- a/convert.py
+++ b/convert.py
@@ -803,8 +803,8 @@ def check_vocab_size(params: Params, vocab: Vocab) -> None:
class OutputFile:
- def __init__(self, fname_out: Path) -> None:
- self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
+    def __init__(self, fname_out: Path, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
+ self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
def add_meta_arch(self, params: Params) -> None:
name = "LLaMA"
@@ -875,10 +875,10 @@ class OutputFile:
self.gguf.close()
@staticmethod
- def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab) -> None:
+    def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
check_vocab_size(params, vocab)
- of = OutputFile(fname_out)
+ of = OutputFile(fname_out, endianess=endianess)
# meta data
of.add_meta_arch(params)
@@ -903,10 +903,10 @@ class OutputFile:
return dt.quantize(arr)
@staticmethod
- def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY) -> None:
+    def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
check_vocab_size(params, vocab)
- of = OutputFile(fname_out)
+ of = OutputFile(fname_out, endianess=endianess)
# meta data
of.add_meta_arch(params)
@@ -1123,8 +1123,9 @@ def main(args_in: list[str] | None = None) -> None:
parser.add_argument("--vocabtype", choices=["spm", "bpe"], help="vocab format (default: spm)", default="spm")
parser.add_argument("--ctx", type=int, help="model training context (default: based on input)")
parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default = DEFAULT_CONCURRENCY)
- args = parser.parse_args(args_in)
+    parser.add_argument("--bigendian", action="store_true", help="model is executed on a big-endian machine")
+ args = parser.parse_args(args_in)
if args.dump_single:
model_plus = lazy_load_file(args.model)
do_dump_model(model_plus)
@@ -1138,6 +1139,9 @@ def main(args_in: list[str] | None = None) -> None:
if args.dump:
do_dump_model(model_plus)
return
+ endianess = gguf.GGUFEndian.LITTLE
+ if args.bigendian:
+ endianess = gguf.GGUFEndian.BIG
params = Params.load(model_plus)
if params.n_ctx == -1:
@@ -1185,7 +1189,7 @@ def main(args_in: list[str] | None = None) -> None:
params.ftype = ftype
print(f"Writing {outfile}, format {ftype}")
- OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = args.concurrency)
+ OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = args.concurrency, endianess=endianess)
print(f"Wrote {outfile}")
diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
index c291f0ad..cae3bf3c 100644
--- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@@ -536,7 +536,7 @@ static bool is_ggml_file(const char * filename) {
if (file.size < 4) {
return false;
}
- uint32_t magic = file.read_u32();
+ std::string magic = file.read_string(4);
return magic == GGUF_MAGIC;
}
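The same check can be sketched in Python (hypothetical helper, not part of this change): comparing the magic as raw bytes keeps the test independent of the host's byte order, which is exactly why the C++ code above switched from a u32 read to a 4-byte string read.

    def is_gguf_file(path: str) -> bool:
        # Compare the first four bytes byte-for-byte, never as an integer.
        with open(path, "rb") as f:
            return f.read(4) == b"GGUF"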
diff --git a/ggml.c b/ggml.c
index ed157aab..49f3b7ab 100644
--- a/ggml.c
+++ b/ggml.c
@@ -20845,7 +20845,7 @@ struct gguf_kv {
};
struct gguf_header {
- uint32_t magic;
+ char magic[4];
uint32_t version;
uint64_t n_tensors; // GGUFv2
uint64_t n_kv; // GGUFv2
@@ -20915,7 +20915,7 @@ static bool gguf_fread_str_v1(FILE * file, struct gguf_str * p, size_t * offset)
struct gguf_context * gguf_init_empty(void) {
struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
- ctx->header.magic = GGUF_MAGIC;
+ memcpy(ctx->header.magic, GGUF_MAGIC, sizeof(ctx->header.magic));
ctx->header.version = GGUF_VERSION;
ctx->header.n_tensors = 0;
ctx->header.n_kv = 0;
@@ -20941,16 +20941,18 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
// offset from start of file
size_t offset = 0;
- uint32_t magic = 0;
+ char magic[4];
// check the magic before making allocations
{
gguf_fread_el(file, &magic, sizeof(magic), &offset);
- if (magic != GGUF_MAGIC) {
- fprintf(stderr, "%s: invalid magic number %08x\n", __func__, magic);
- fclose(file);
- return NULL;
+ for (uint32_t i = 0; i < sizeof(magic); i++) {
+ if (magic[i] != GGUF_MAGIC[i]) {
+                fprintf(stderr, "%s: invalid magic characters '%c%c%c%c'\n", __func__, magic[0], magic[1], magic[2], magic[3]);
+ fclose(file);
+ return NULL;
+ }
}
}
@@ -20960,7 +20962,8 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
// read the header
{
- ctx->header.magic = magic;
+ strncpy(ctx->header.magic, magic, 4);
+
ctx->kv = NULL;
ctx->infos = NULL;
diff --git a/ggml.h b/ggml.h
index 6e35888e..16aaf169 100644
--- a/ggml.h
+++ b/ggml.h
@@ -231,8 +231,9 @@
#define GGML_EXIT_SUCCESS 0
#define GGML_EXIT_ABORTED 1
-#define GGUF_MAGIC 0x46554747 // "GGUF"
-#define GGUF_VERSION 2
+#define GGUF_MAGIC "GGUF"
+
+#define GGUF_VERSION 3
#define GGUF_DEFAULT_ALIGNMENT 32
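Because the magic is now byte-order independent while the fields after it are not, a reader can probe which byte order a version-3 file was written in. A minimal sketch under that assumption (hypothetical helper; ggml.c itself reads in native order and rejects a mismatched file through its version check):

    import struct

    def probe_endianness(path: str) -> str:
        with open(path, "rb") as f:
            if f.read(4) != b"GGUF":
                raise ValueError("not a GGUF file")
            raw = f.read(4)
        if struct.unpack("<I", raw)[0] == 3:
            return "little"
        if struct.unpack(">I", raw)[0] == 3:
            return "big"
        raise ValueError("unrecognized GGUF version")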
diff --git a/gguf-py/gguf/gguf.py b/gguf-py/gguf/gguf.py
index 557ce7ac..072c839c 100644
--- a/gguf-py/gguf/gguf.py
+++ b/gguf-py/gguf/gguf.py
@@ -19,9 +19,10 @@ import numpy as np
#
GGUF_MAGIC = 0x46554747
-GGUF_VERSION = 2
+GGUF_VERSION = 3
GGUF_DEFAULT_ALIGNMENT = 32
+
# general
KEY_GENERAL_ARCHITECTURE = "general.architecture"
KEY_GENERAL_QUANTIZATION_VERSION = "general.quantization_version"
@@ -597,6 +598,10 @@ class GGMLQuantizationType(IntEnum):
Q6_K = 14
Q8_K = 15
+class GGUFEndian(IntEnum):
+ LITTLE = 0
+ BIG = 1
+
class GGUFValueType(IntEnum):
UINT8 = 0
@@ -644,18 +649,41 @@ class GGUFWriter:
temp_file: tempfile.SpooledTemporaryFile[bytes] | None = None
tensors: list[tuple[np.ndarray[Any, Any], int]]
- def __init__(self, path: os.PathLike[str] | str, arch: str, use_temp_file = True):
+ @property
+ def pack_prefix(self):
+        if self.endianess == GGUFEndian.LITTLE:
+ return "<"
+ else:
+ return ">"
+
+ def __init__(self, path: os.PathLike[str] | str, arch: str, use_temp_file = True, endianess=GGUFEndian.LITTLE):
self.fout = open(path, "wb")
self.arch = arch
+ self.endianess = endianess
+ self._simple_value_packing = {
+ GGUFValueType.UINT8: f"{self.pack_prefix}B",
+ GGUFValueType.INT8: f"{self.pack_prefix}b",
+ GGUFValueType.UINT16: f"{self.pack_prefix}H",
+ GGUFValueType.INT16: f"{self.pack_prefix}h",
+ GGUFValueType.UINT32: f"{self.pack_prefix}I",
+ GGUFValueType.INT32: f"{self.pack_prefix}i",
+ GGUFValueType.FLOAT32: f"{self.pack_prefix}f",
+ GGUFValueType.UINT64: f"{self.pack_prefix}Q",
+ GGUFValueType.INT64: f"{self.pack_prefix}q",
+ GGUFValueType.FLOAT64: f"{self.pack_prefix}d",
+            GGUFValueType.BOOL: "?",
+ }
self.add_architecture()
self.use_temp_file = use_temp_file
self.tensors = []
+ endianess_str = "Big Endian" if self.endianess == GGUFEndian.BIG else "Little Endian"
+ print(f"This gguf file is for {endianess_str} only")
def write_header_to_file(self):
self.fout.write(struct.pack("<I", GGUF_MAGIC))
- self.fout.write(struct.pack("<I", GGUF_VERSION))
- self.fout.write(struct.pack("<Q", self.ti_data_count))
- self.fout.write(struct.pack("<Q", self.kv_data_count))
+ self.fout.write(struct.pack(f"{self.pack_prefix}I", GGUF_VERSION))
+ self.fout.write(struct.pack(f"{self.pack_prefix}Q", self.ti_data_count))
+ self.fout.write(struct.pack(f"{self.pack_prefix}Q", self.kv_data_count))
self.flush()
# print("tensors " + str(self.ti_data_count) + " kv " + str(self.kv_data_count))
@@ -727,25 +755,12 @@ class GGUFWriter:
self.add_key(key)
self.add_val(val, GGUFValueType.ARRAY)
- _simple_value_packing = {
- GGUFValueType.UINT8: "<B",
- GGUFValueType.INT8: "<b",
- GGUFValueType.UINT16: "<H",
- GGUFValueType.INT16: "<h",
- GGUFValueType.UINT32: "<I",
- GGUFValueType.INT32: "<i",
- GGUFValueType.FLOAT32: "<f",
- GGUFValueType.UINT64: "<Q",
- GGUFValueType.INT64: "<q",
- GGUFValueType.FLOAT64: "<d",
- GGUFValueType.BOOL: "?" ,
- }
def add_val(self, val: Any, vtype: GGUFValueType | None = None, add_vtype: bool = True):
if vtype is None:
vtype = GGUFValueType.get_type(val)
if add_vtype:
- self.kv_data += struct.pack("<I", vtype)
+ self.kv_data += struct.pack(f"{self.pack_prefix}I", vtype)
self.kv_data_count += 1
pack_fmt = self._simple_value_packing.get(vtype)
@@ -753,14 +768,14 @@ class GGUFWriter:
self.kv_data += struct.pack(pack_fmt, val)
elif vtype == GGUFValueType.STRING:
encoded_val = val.encode("utf8") if isinstance(val, str) else val
- self.kv_data += struct.pack("<Q", len(encoded_val))
+ self.kv_data += struct.pack(f"{self.pack_prefix}Q", len(encoded_val))
self.kv_data += encoded_val
elif vtype == GGUFValueType.ARRAY and isinstance(val, Sequence) and len(val) > 0:
ltype = GGUFValueType.get_type(val[0])
if not all(GGUFValueType.get_type(i) is ltype for i in val[1:]):
raise ValueError("All items in a GGUF array should be of the same type")
- self.kv_data += struct.pack("<I", ltype)
- self.kv_data += struct.pack("<Q", len(val))
+ self.kv_data += struct.pack(f"{self.pack_prefix}I", ltype)
+ self.kv_data += struct.pack(f"{self.pack_prefix}Q", len(val))
for item in val:
self.add_val(item, add_vtype=False)
else:
@@ -774,22 +789,24 @@ class GGUFWriter:
assert raw_dtype is not None or tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now"
encoded_name = name.encode("utf8")
- self.ti_data += struct.pack("<Q", len(encoded_name))
+ self.ti_data += struct.pack(f"{self.pack_prefix}Q", len(encoded_name))
self.ti_data += encoded_name
n_dims = len(tensor_shape)
- self.ti_data += struct.pack("<I", n_dims)
+ self.ti_data += struct.pack(f"{self.pack_prefix}I", n_dims)
for i in range(n_dims):
- self.ti_data += struct.pack("<Q", tensor_shape[n_dims - 1 - i])
+ self.ti_data += struct.pack(f"{self.pack_prefix}Q", tensor_shape[n_dims - 1 - i])
if raw_dtype is None:
dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16
else:
dtype = raw_dtype
- self.ti_data += struct.pack("<I", dtype)
- self.ti_data += struct.pack("<Q", self.offset_tensor)
+ self.ti_data += struct.pack(f"{self.pack_prefix}I", dtype)
+ self.ti_data += struct.pack(f"{self.pack_prefix}Q", self.offset_tensor)
self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment)
self.ti_data_count += 1
def add_tensor(self, name: str, tensor: np.ndarray[Any, Any], raw_shape: Sequence[int] | None = None, raw_dtype: GGMLQuantizationType | None = None):
+ if self.endianess == GGUFEndian.BIG:
+ tensor.byteswap(inplace=True)
if self.use_temp_file and self.temp_file is None:
fp = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256*1024*1024)
fp.seek(0)
@@ -815,6 +832,8 @@ class GGUFWriter:
fp.write(bytes([0] * pad))
def write_tensor_data(self, tensor: np.ndarray[Any, Any]):
+        if self.endianess == GGUFEndian.BIG:
+ tensor.byteswap(inplace=True)
self.write_padding(self.fout, self.fout.tell())
tensor.tofile(self.fout)
self.write_padding(self.fout, tensor.nbytes)
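A hypothetical usage sketch of the new endianess parameter (file name and tensor contents are made up; the call sequence mirrors the convert scripts in this repo). Tensors are byte-swapped on the way out, so the caller always passes native-order NumPy data:

    import numpy as np
    import gguf

    writer = gguf.GGUFWriter("tiny-be.gguf", "llama", endianess=gguf.GGUFEndian.BIG)
    writer.add_uint32("general.quantization_version", 2)
    writer.add_tensor("tok_embeddings.weight", np.zeros((4, 4), dtype=np.float32))
    writer.write_header_to_file()
    writer.write_kv_data_to_file()
    writer.write_tensors_to_file()
    writer.close()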
diff --git a/gguf-py/pyproject.toml b/gguf-py/pyproject.toml
index 07a7ab4d..f0741a7c 100644
--- a/gguf-py/pyproject.toml
+++ b/gguf-py/pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "gguf"
-version = "0.4.4"
+version = "0.4.5"
description = "Write ML models in GGUF for GGML"
authors = ["GGML <ggml@ggml.ai>"]
packages = [
diff --git a/k_quants.c b/k_quants.c
index e168a87b..801941fb 100644
--- a/k_quants.c
+++ b/k_quants.c
@@ -46,7 +46,7 @@ inline static int32_t vaddvq_s32(int32x4_t v) {
#if defined(_MSC_VER) || defined(__MINGW32__)
#include <intrin.h>
#else
-#if !defined(__riscv)
+#if !defined(__riscv) && !defined(__s390__)
#include <immintrin.h>
#endif
#endif
diff --git a/tests/test-double-float.cpp b/tests/test-double-float.cpp
index b506f273..afd7bf77 100644
--- a/tests/test-double-float.cpp
+++ b/tests/test-double-float.cpp
@@ -4,7 +4,9 @@
#undef NDEBUG
#include <cassert>
+#if !defined(__riscv) && !defined(__s390__)
#include <immintrin.h>
+#endif
#include <cmath>
#include <cstdint>
#include <cstring>