| author | Qin Yue Chen <71813199+chenqiny@users.noreply.github.com> | 2023-10-20 06:19:40 -0500 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2023-10-20 14:19:40 +0300 |
| commit | 8cf19d60dc93809db8e51fedc811595eed9134c5 (patch) | |
| tree | 879c1861fb50748c02ec031a1dcc3f6e732ca366 | |
| parent | a0edf73bda31c7c4e649e6f07c6fd30a729929cd (diff) | |
gguf : support big endian platform (#3552)
* check whether the platform is s390x; if so, do not include immintrin.h
* support s390x big endian
* support --bigendian option for s390x
1. verified with baichuan7b-chat with float 16 on s390x
2. verified with baichuan7b-chat
3. verified with chinese-alpaca-2-13b-f16
* update format based on editor-config checker result
* Update convert-baichuan-hf-to-gguf.py
* 1. check in ggml.c whether the file's endianness matches the host
2. update GGUF version
3. change get_pack_prefix to property
4. update information log
* always use "GGUF" as the beginning of a GGUF file
* Compare "GGUF" with file header char by char
1. Set GGUF_MAGIC to "GGUF" string instead of int value
2. Compare "GGUF" char by char so the check is independent of byte order
3. Move bytes swap code from convert.py to gguf.py write_tensor_data
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
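
The thrust of the change set below: every fixed-width GGUF header and key/value field is written through Python's `struct` module, so byte order reduces to the format-string prefix (`"<"` vs `">"`), while raw numpy tensor payloads carry no format string and must be byte-swapped explicitly. A minimal standalone sketch of those two mechanisms (not the patch itself; `GGUF_VERSION = 3` is the value introduced by this commit):

```python
import struct

import numpy as np

GGUF_VERSION = 3  # bumped from 2 by this commit

# The struct prefix selects byte order: "<" little endian, ">" big endian.
little = struct.pack("<I", GGUF_VERSION)   # b'\x03\x00\x00\x00'
big    = struct.pack(">I", GGUF_VERSION)   # b'\x00\x00\x00\x03'
assert little == big[::-1]

# Tensor payloads are raw numpy buffers, so a writer targeting the opposite
# byte order swaps them in place before writing them to the file.
t = np.arange(4, dtype=np.float32)
t.byteswap(inplace=True)
```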
| mode | file | changes |
|---|---|---|
| -rwxr-xr-x | convert-baichuan-hf-to-gguf.py | 8 |
| -rwxr-xr-x | convert.py | 20 |
| -rw-r--r-- | examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp | 2 |
| -rw-r--r-- | ggml.c | 19 |
| -rw-r--r-- | ggml.h | 5 |
| -rw-r--r-- | gguf-py/gguf/gguf.py | 73 |
| -rw-r--r-- | gguf-py/pyproject.toml | 2 |
| -rw-r--r-- | k_quants.c | 2 |
| -rw-r--r-- | tests/test-double-float.cpp | 2 |
9 files changed, 84 insertions, 49 deletions
```diff
diff --git a/convert-baichuan-hf-to-gguf.py b/convert-baichuan-hf-to-gguf.py
index 513a7516..a1783f71 100755
--- a/convert-baichuan-hf-to-gguf.py
+++ b/convert-baichuan-hf-to-gguf.py
@@ -76,6 +76,7 @@ def parse_args() -> argparse.Namespace:
         "ftype", type=int, choices=[0, 1], default=1, nargs='?',
         help="output format - use 0 for float32, 1 for float16",
     )
+    parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine")
     return parser.parse_args()
 
 args = parse_args()
@@ -86,6 +87,11 @@ if not dir_model.is_dir():
     print(f'Error: {args.model} is not a directory', file = sys.stderr)
     sys.exit(1)
 
+endianess = gguf.GGUFEndian.LITTLE
+if args.bigendian:
+    endianess = gguf.GGUFEndian.BIG
+endianess_str = "Big Endian" if args.bigendian else "Little Endian"
+print(f"gguf: Conversion Endianess {endianess}")
 # possible tensor data types
 #   ftype == 0 -> float32
 #   ftype == 1 -> float16
@@ -113,7 +119,7 @@ if hparams["architectures"][0] != "BaichuanForCausalLM":
 num_parts = count_model_parts(dir_model)
 print(f"num_parts:{num_parts}\n")
 
 ARCH=gguf.MODEL_ARCH.BAICHUAN
-gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
+gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
 
 print("gguf: get model metadata")
diff --git a/convert.py b/convert.py
--- a/convert.py
+++ b/convert.py
@@ -803,8 +803,8 @@ def check_vocab_size(params: Params, vocab: Vocab) -> None:
 
 class OutputFile:
-    def __init__(self, fname_out: Path) -> None:
-        self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
+    def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None:
+        self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
 
     def add_meta_arch(self, params: Params) -> None:
         name = "LLaMA"
@@ -875,10 +875,10 @@ class OutputFile:
         self.gguf.close()
 
     @staticmethod
-    def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab) -> None:
+    def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None:
         check_vocab_size(params, vocab)
 
-        of = OutputFile(fname_out)
+        of = OutputFile(fname_out, endianess=endianess)
 
         # meta data
         of.add_meta_arch(params)
@@ -903,10 +903,10 @@ class OutputFile:
         return dt.quantize(arr)
 
     @staticmethod
-    def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY) -> None:
+    def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY, endianess=gguf.GGUFEndian.LITTLE) -> None:
         check_vocab_size(params, vocab)
 
-        of = OutputFile(fname_out)
+        of = OutputFile(fname_out, endianess=endianess)
 
         # meta data
         of.add_meta_arch(params)
@@ -1123,8 +1123,9 @@ def main(args_in: list[str] | None = None) -> None:
     parser.add_argument("--vocabtype", choices=["spm", "bpe"], help="vocab format (default: spm)", default="spm")
     parser.add_argument("--ctx", type=int, help="model training context (default: based on input)")
     parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default = DEFAULT_CONCURRENCY)
-    args = parser.parse_args(args_in)
+    parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine")
+    args = parser.parse_args(args_in)
 
     if args.dump_single:
         model_plus = lazy_load_file(args.model)
         do_dump_model(model_plus)
@@ -1138,6 +1139,9 @@ def main(args_in: list[str] | None = None) -> None:
     if args.dump:
         do_dump_model(model_plus)
         return
+    endianess = gguf.GGUFEndian.LITTLE
+    if args.bigendian:
+        endianess = gguf.GGUFEndian.BIG
 
     params = Params.load(model_plus)
     if params.n_ctx == -1:
@@ -1185,7 +1189,7 @@ def main(args_in: list[str] | None = None) -> None:
     params.ftype = ftype
 
     print(f"Writing {outfile}, format {ftype}")
-    OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = args.concurrency)
+    OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = args.concurrency, endianess=endianess)
 
     print(f"Wrote {outfile}")
```
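
Both converter changes follow the same pattern: parse `--bigendian`, map it to a `GGUFEndian` value, and thread that value down to `GGUFWriter`. A condensed sketch of that plumbing (standalone; the output path and architecture name are placeholders, not values from the patch):

```python
import argparse

import gguf  # the gguf-py package modified further below

parser = argparse.ArgumentParser()
parser.add_argument("--bigendian", action="store_true",
                    help="model is executed on big endian machine")
args = parser.parse_args()

# Map the flag onto the enum and pass it through to the writer.
endianess = gguf.GGUFEndian.BIG if args.bigendian else gguf.GGUFEndian.LITTLE
writer = gguf.GGUFWriter("out.gguf", "llama", endianess=endianess)  # placeholder args
```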
```diff
diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
index c291f0ad..cae3bf3c 100644
--- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@@ -536,7 +536,7 @@ static bool is_ggml_file(const char * filename) {
     if (file.size < 4) {
         return false;
     }
-    uint32_t magic = file.read_u32();
+    std::string magic = file.read_string(4);
     return magic == GGUF_MAGIC;
 }
diff --git a/ggml.c b/ggml.c
--- a/ggml.c
+++ b/ggml.c
@@ -20845,7 +20845,7 @@ struct gguf_kv {
 };
 
 struct gguf_header {
-    uint32_t magic;
+    char magic[4];
     uint32_t version;
     uint64_t n_tensors; // GGUFv2
     uint64_t n_kv; // GGUFv2
@@ -20915,7 +20915,7 @@ static bool gguf_fread_str_v1(FILE * file, struct gguf_str * p, size_t * offset)
 struct gguf_context * gguf_init_empty(void) {
     struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
 
-    ctx->header.magic = GGUF_MAGIC;
+    memcpy(ctx->header.magic, GGUF_MAGIC, sizeof(ctx->header.magic));
     ctx->header.version = GGUF_VERSION;
     ctx->header.n_tensors = 0;
     ctx->header.n_kv = 0;
@@ -20941,16 +20941,18 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
     // offset from start of file
     size_t offset = 0;
 
-    uint32_t magic = 0;
+    char magic[4];
 
     // check the magic before making allocations
     {
         gguf_fread_el(file, &magic, sizeof(magic), &offset);
 
-        if (magic != GGUF_MAGIC) {
-            fprintf(stderr, "%s: invalid magic number %08x\n", __func__, magic);
-            fclose(file);
-            return NULL;
+        for (uint32_t i = 0; i < sizeof(magic); i++) {
+            if (magic[i] != GGUF_MAGIC[i]) {
+                fprintf(stderr, "%s: invalid magic characters %s.\n", __func__, magic);
+                fclose(file);
+                return NULL;
+            }
         }
     }
 
@@ -20960,7 +20962,8 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
 
     // read the header
     {
-        ctx->header.magic = magic;
+        strncpy(ctx->header.magic, magic, 4);
+
         ctx->kv = NULL;
         ctx->infos = NULL;
diff --git a/ggml.h b/ggml.h
--- a/ggml.h
+++ b/ggml.h
@@ -231,8 +231,9 @@
 #define GGML_EXIT_SUCCESS 0
 #define GGML_EXIT_ABORTED 1
 
-#define GGUF_MAGIC 0x46554747 // "GGUF"
-#define GGUF_VERSION 2
+#define GGUF_MAGIC "GGUF"
+
+#define GGUF_VERSION 3
 
 #define GGUF_DEFAULT_ALIGNMENT 32
```
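
Note how the C side validates the magic after this change: four bytes compared individually against the literal `"GGUF"`, so the check reads the same on either byte order, whereas the old `uint32_t` comparison against `0x46554747` only matched on little-endian hosts. A rough Python equivalent of the new check in `gguf_init_from_file` (an illustrative helper, not part of the patch):

```python
def is_gguf(path: str) -> bool:
    """Check a file's magic the way the patched gguf_init_from_file() does:
    byte by byte, with no integer interpretation and hence no endianness."""
    with open(path, "rb") as f:
        magic = f.read(4)
    return magic == b"GGUF"
```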
```diff
diff --git a/gguf-py/gguf/gguf.py b/gguf-py/gguf/gguf.py
index 557ce7ac..072c839c 100644
--- a/gguf-py/gguf/gguf.py
+++ b/gguf-py/gguf/gguf.py
@@ -19,9 +19,10 @@ import numpy as np
 #
 GGUF_MAGIC = 0x46554747
-GGUF_VERSION = 2
+GGUF_VERSION = 3
 GGUF_DEFAULT_ALIGNMENT = 32
 
+
 # general
 KEY_GENERAL_ARCHITECTURE = "general.architecture"
 KEY_GENERAL_QUANTIZATION_VERSION = "general.quantization_version"
@@ -597,6 +598,10 @@ class GGMLQuantizationType(IntEnum):
     Q6_K = 14
     Q8_K = 15
 
+class GGUFEndian(IntEnum):
+    LITTLE = 0
+    BIG = 1
+
 
 class GGUFValueType(IntEnum):
     UINT8 = 0
@@ -644,18 +649,41 @@ class GGUFWriter:
     temp_file: tempfile.SpooledTemporaryFile[bytes] | None = None
     tensors: list[tuple[np.ndarray[Any, Any], int]]
 
-    def __init__(self, path: os.PathLike[str] | str, arch: str, use_temp_file = True):
+    @property
+    def pack_prefix(self):
+        if self.endianess==GGUFEndian.LITTLE:
+            return "<"
+        else:
+            return ">"
+
+    def __init__(self, path: os.PathLike[str] | str, arch: str, use_temp_file = True, endianess=GGUFEndian.LITTLE):
         self.fout = open(path, "wb")
         self.arch = arch
+        self.endianess = endianess
+        self._simple_value_packing = {
+            GGUFValueType.UINT8: f"{self.pack_prefix}B",
+            GGUFValueType.INT8: f"{self.pack_prefix}b",
+            GGUFValueType.UINT16: f"{self.pack_prefix}H",
+            GGUFValueType.INT16: f"{self.pack_prefix}h",
+            GGUFValueType.UINT32: f"{self.pack_prefix}I",
+            GGUFValueType.INT32: f"{self.pack_prefix}i",
+            GGUFValueType.FLOAT32: f"{self.pack_prefix}f",
+            GGUFValueType.UINT64: f"{self.pack_prefix}Q",
+            GGUFValueType.INT64: f"{self.pack_prefix}q",
+            GGUFValueType.FLOAT64: f"{self.pack_prefix}d",
+            GGUFValueType.BOOL: "?" ,
+        }
         self.add_architecture()
         self.use_temp_file = use_temp_file
         self.tensors = []
+        endianess_str = "Big Endian" if self.endianess == GGUFEndian.BIG else "Little Endian"
+        print(f"This gguf file is for {endianess_str} only")
 
     def write_header_to_file(self):
         self.fout.write(struct.pack("<I", GGUF_MAGIC))
-        self.fout.write(struct.pack("<I", GGUF_VERSION))
-        self.fout.write(struct.pack("<Q", self.ti_data_count))
-        self.fout.write(struct.pack("<Q", self.kv_data_count))
+        self.fout.write(struct.pack(f"{self.pack_prefix}I", GGUF_VERSION))
+        self.fout.write(struct.pack(f"{self.pack_prefix}Q", self.ti_data_count))
+        self.fout.write(struct.pack(f"{self.pack_prefix}Q", self.kv_data_count))
         self.flush()
         # print("tensors " + str(self.ti_data_count) + " kv " + str(self.kv_data_count))
@@ -727,25 +755,12 @@ class GGUFWriter:
         self.add_key(key)
         self.add_val(val, GGUFValueType.ARRAY)
 
-    _simple_value_packing = {
-        GGUFValueType.UINT8: "<B",
-        GGUFValueType.INT8: "<b",
-        GGUFValueType.UINT16: "<H",
-        GGUFValueType.INT16: "<h",
-        GGUFValueType.UINT32: "<I",
-        GGUFValueType.INT32: "<i",
-        GGUFValueType.FLOAT32: "<f",
-        GGUFValueType.UINT64: "<Q",
-        GGUFValueType.INT64: "<q",
-        GGUFValueType.FLOAT64: "<d",
-        GGUFValueType.BOOL: "?" ,
-    }
-
     def add_val(self, val: Any, vtype: GGUFValueType | None = None, add_vtype: bool = True):
         if vtype is None:
             vtype = GGUFValueType.get_type(val)
 
         if add_vtype:
-            self.kv_data += struct.pack("<I", vtype)
+            self.kv_data += struct.pack(f"{self.pack_prefix}I", vtype)
             self.kv_data_count += 1
 
         pack_fmt = self._simple_value_packing.get(vtype)
@@ -753,14 +768,14 @@ class GGUFWriter:
             self.kv_data += struct.pack(pack_fmt, val)
         elif vtype == GGUFValueType.STRING:
             encoded_val = val.encode("utf8") if isinstance(val, str) else val
-            self.kv_data += struct.pack("<Q", len(encoded_val))
+            self.kv_data += struct.pack(f"{self.pack_prefix}Q", len(encoded_val))
             self.kv_data += encoded_val
         elif vtype == GGUFValueType.ARRAY and isinstance(val, Sequence) and len(val) > 0:
             ltype = GGUFValueType.get_type(val[0])
             if not all(GGUFValueType.get_type(i) is ltype for i in val[1:]):
                 raise ValueError("All items in a GGUF array should be of the same type")
-            self.kv_data += struct.pack("<I", ltype)
-            self.kv_data += struct.pack("<Q", len(val))
+            self.kv_data += struct.pack(f"{self.pack_prefix}I", ltype)
+            self.kv_data += struct.pack(f"{self.pack_prefix}Q", len(val))
             for item in val:
                 self.add_val(item, add_vtype=False)
         else:
@@ -774,22 +789,24 @@ class GGUFWriter:
         assert raw_dtype is not None or tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now"
 
         encoded_name = name.encode("utf8")
-        self.ti_data += struct.pack("<Q", len(encoded_name))
+        self.ti_data += struct.pack(f"{self.pack_prefix}Q", len(encoded_name))
         self.ti_data += encoded_name
         n_dims = len(tensor_shape)
-        self.ti_data += struct.pack("<I", n_dims)
+        self.ti_data += struct.pack(f"{self.pack_prefix}I", n_dims)
         for i in range(n_dims):
-            self.ti_data += struct.pack("<Q", tensor_shape[n_dims - 1 - i])
+            self.ti_data += struct.pack(f"{self.pack_prefix}Q", tensor_shape[n_dims - 1 - i])
         if raw_dtype is None:
             dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16
         else:
             dtype = raw_dtype
-        self.ti_data += struct.pack("<I", dtype)
-        self.ti_data += struct.pack("<Q", self.offset_tensor)
+        self.ti_data += struct.pack(f"{self.pack_prefix}I", dtype)
+        self.ti_data += struct.pack(f"{self.pack_prefix}Q", self.offset_tensor)
         self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment)
         self.ti_data_count += 1
 
     def add_tensor(self, name: str, tensor: np.ndarray[Any, Any], raw_shape: Sequence[int] | None = None, raw_dtype: GGMLQuantizationType | None = None):
+        if self.endianess == GGUFEndian.BIG:
+            tensor.byteswap(inplace=True)
         if self.use_temp_file and self.temp_file is None:
             fp = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256*1024*1024)
             fp.seek(0)
@@ -815,6 +832,8 @@ class GGUFWriter:
             fp.write(bytes([0] * pad))
 
     def write_tensor_data(self, tensor: np.ndarray[Any, Any]):
+        if self.endianess==GGUFEndian.BIG:
+            tensor.byteswap(inplace=True)
         self.write_padding(self.fout, self.fout.tell())
         tensor.tofile(self.fout)
         self.write_padding(self.fout, tensor.nbytes)
diff --git a/gguf-py/pyproject.toml b/gguf-py/pyproject.toml
index 07a7ab4d..f0741a7c 100644
--- a/gguf-py/pyproject.toml
+++ b/gguf-py/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "gguf"
-version = "0.4.4"
+version = "0.4.5"
 description = "Write ML models in GGUF for GGML"
 authors = ["GGML <ggml@ggml.ai>"]
 packages = [
```
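
The `pack_prefix` property is what makes the writer endian-aware: every format string in the header, key/value, and tensor-info packing paths is built from it, so one field selects the byte order for all packed values, while the magic alone stays `"<I"` so the file always begins with the literal bytes `GGUF`. A small demonstration of the effect, reusing the enum from the diff above (the tensor count is a made-up value):

```python
import struct
from enum import IntEnum

class GGUFEndian(IntEnum):
    LITTLE = 0
    BIG = 1

def pack_prefix(endianess: GGUFEndian) -> str:
    # Same logic as the GGUFWriter.pack_prefix property in the diff.
    return "<" if endianess == GGUFEndian.LITTLE else ">"

n_tensors = 291  # hypothetical value
le = struct.pack(f"{pack_prefix(GGUFEndian.LITTLE)}Q", n_tensors)
be = struct.pack(f"{pack_prefix(GGUFEndian.BIG)}Q", n_tensors)
assert le == be[::-1]  # same value, mirrored byte order
```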
```diff
diff --git a/k_quants.c b/k_quants.c
--- a/k_quants.c
+++ b/k_quants.c
@@ -46,7 +46,7 @@ inline static int32_t vaddvq_s32(int32x4_t v) {
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <intrin.h>
 #else
-#if !defined(__riscv)
+#if !defined(__riscv) && !defined(__s390__)
 #include <immintrin.h>
 #endif
 #endif
diff --git a/tests/test-double-float.cpp b/tests/test-double-float.cpp
index b506f273..afd7bf77 100644
--- a/tests/test-double-float.cpp
+++ b/tests/test-double-float.cpp
@@ -4,7 +4,9 @@
 #undef NDEBUG
 #include <cassert>
+#if !defined(__riscv) && !defined(__s390__)
 #include <immintrin.h>
+#endif
 #include <cmath>
 #include <cstdint>
 #include <cstring>
```