summaryrefslogtreecommitdiff
path: root/ggml.c
diff options
context:
space:
mode:
author Georgi Gerganov <ggerganov@gmail.com> 2023-08-21 23:07:43 +0300
committer GitHub <noreply@github.com> 2023-08-21 23:07:43 +0300
commit 6381d4e110bd0ec02843a60bbeb8b6fc37a9ace9 (patch)
tree 15f5b726f864ad0913bc8dcf6ea08b90ecc7ada9 /ggml.c
parent dadbed99e65252d79f81101a392d0d6497b86caa (diff)
gguf : new file format with flexible meta data (beta) (#2398)
* gguf : first API pass * gguf : read header + meta data * gguf : read tensor info * gguf : initial model loading - not tested * gguf : add gguf_get_tensor_name() * gguf : do not support passing existing ggml_context to gguf_init * gguf : simplify gguf_get_val * gguf : gguf.c is now part of ggml.c * gguf : read / write sample models * gguf : add comments * refactor : reduce code duplication and better API (#2415) * gguf : expose the gguf_type enum through the API for now * gguf : add array support * gguf.py : some code style changes * convert.py : start a new simplified implementation by removing old stuff * convert.py : remove GGML vocab + other obsolete stuff * GGUF : write tensor (#2426) * WIP: Write tensor * GGUF : Support writing tensors in Python * refactor : rm unused import and upd todos * fix : fix errors upd writing example * rm example.gguf * gitignore *.gguf * undo formatting * gguf : add gguf_find_key (#2438) * gguf.cpp : find key example * ggml.h : add gguf_find_key * ggml.c : add gguf_find_key * gguf : fix writing tensors * gguf : do not hardcode tensor names to read * gguf : write sample tensors to read * gguf : add tokenization constants * quick and dirty conversion example * gguf : fix writing gguf arrays * gguf : write tensors one by one and code reuse * gguf : fix writing gguf arrays * gguf : write tensors one by one * gguf : write tensors one by one * gguf : write tokenizer data * gguf : upd gguf conversion script * Update convert-llama-h5-to-gguf.py * gguf : handle already encoded string * ggml.h : get array str and f32 * ggml.c : get arr str and f32 * gguf.py : support any type * Update convert-llama-h5-to-gguf.py * gguf : fix set is not subscriptable * gguf : update convert-llama-h5-to-gguf.py * constants.py : add layer norm eps * gguf.py : add layer norm eps and merges * ggml.h : increase GGML_MAX_NAME to 64 * ggml.c : add gguf_get_arr_n * Update convert-llama-h5-to-gguf.py * add gptneox gguf example * Makefile : add gptneox gguf example * 
Update convert-llama-h5-to-gguf.py * add gptneox gguf example * Update convert-llama-h5-to-gguf.py * Update convert-gptneox-h5-to-gguf.py * Update convert-gptneox-h5-to-gguf.py * Update convert-llama-h5-to-gguf.py * gguf : support custom alignment value * gguf : fix typo in function call * gguf : mmap tensor data example * fix : update convert-llama-h5-to-gguf.py * Update convert-llama-h5-to-gguf.py * convert-gptneox-h5-to-gguf.py : Special tokens * gptneox-main.cpp : special tokens * Update gptneox-main.cpp * constants.py : special tokens * gguf.py : accumulate kv and tensor info data + special tokens * convert-gptneox-h5-to-gguf.py : accumulate kv and ti + special tokens * gguf : gguf counterpart of llama-util.h * gguf-util.h : update note * convert-llama-h5-to-gguf.py : accumulate kv / ti + special tokens * convert-llama-h5-to-gguf.py : special tokens * Delete gptneox-common.cpp * Delete gptneox-common.h * convert-gptneox-h5-to-gguf.py : gpt2bpe tokenizer * gptneox-main.cpp : gpt2 bpe tokenizer * gpt2 bpe tokenizer (handles merges and unicode) * Makefile : remove gptneox-common * gguf.py : bytesarray for gpt2bpe tokenizer * cmpnct_gpt2bpe.hpp : comments * gguf.py : use custom alignment if present * gguf : minor stuff * Update gptneox-main.cpp * map tensor names * convert-gptneox-h5-to-gguf.py : map tensor names * convert-llama-h5-to-gguf.py : map tensor names * gptneox-main.cpp : map tensor names * gguf : start implementing libllama in GGUF (WIP) * gguf : start implementing libllama in GGUF (WIP) * rm binary commited by mistake * upd .gitignore * gguf : calculate n_mult * gguf : inference with 7B model working (WIP) * gguf : rm deprecated function * gguf : start implementing gguf_file_saver (WIP) * gguf : start implementing gguf_file_saver (WIP) * gguf : start implementing gguf_file_saver (WIP) * gguf : add gguf_get_kv_type * gguf : add gguf_get_kv_type * gguf : write metadata in gguf_file_saver (WIP) * gguf : write metadata in gguf_file_saver (WIP) * gguf : 
write metadata in gguf_file_saver * gguf : rm references to old file formats * gguf : shorter name for member variable * gguf : rm redundant method * gguf : get rid of n_mult, read n_ff from file * Update gguf_tensor_map.py * Update gptneox-main.cpp * gguf : rm references to old file magics * gguf : start implementing quantization (WIP) * gguf : start implementing quantization (WIP) * gguf : start implementing quantization (WIP) * gguf : start implementing quantization (WIP) * gguf : start implementing quantization (WIP) * gguf : start implementing quantization (WIP) * gguf : quantization is working * gguf : roper closing of file * gguf.py : no need to convert tensors twice * convert-gptneox-h5-to-gguf.py : no need to convert tensors twice * convert-llama-h5-to-gguf.py : no need to convert tensors twice * convert-gptneox-h5-to-gguf.py : simplify nbytes * convert-llama-h5-to-gguf.py : simplify nbytes * gptneox-main.cpp : n_layer --> n_block * constants.py : n_layer --> n_block * gguf.py : n_layer --> n_block * convert-gptneox-h5-to-gguf.py : n_layer --> n_block * convert-llama-h5-to-gguf.py : n_layer --> n_block * gptneox-main.cpp : n_layer --> n_block * Update gguf_tensor_map.py * convert-gptneox-h5-to-gguf.py : load model in parts to save memory * convert-llama-h5-to-gguf.py : load model in parts to save memory * convert : write more metadata for LLaMA * convert : rm quantization version * convert-gptneox-h5-to-gguf.py : add file_type key * gptneox-main.cpp : add file_type key * fix conflicts * gguf : add todos and comments * convert-gptneox-h5-to-gguf.py : tensor name map changes * Create gguf_namemap.py : tensor name map changes * Delete gguf_tensor_map.py * gptneox-main.cpp : tensor name map changes * convert-llama-h5-to-gguf.py : fixes * gguf.py : dont add empty strings * simple : minor style changes * gguf : use UNIX line ending * Create convert-llama-7b-pth-to-gguf.py * llama : sync gguf-llama.cpp with latest llama.cpp (#2608) * llama : sync gguf-llama.cpp 
with latest llama.cpp * minor : indentation + assert * llama : refactor gguf_buffer and gguf_ctx_buffer * llama : minor * gitignore : add gptneox-main * llama : tokenizer fixes (#2549) * Merge tokenizer fixes into the gguf branch. * Add test vocabularies * convert : update convert-new.py with tokenizer fixes (#2614) * Merge tokenizer fixes into the gguf branch. * Add test vocabularies * Adapt convert-new.py (and fix a clang-cl compiler error on windows) * llama : sync gguf-llama with llama (#2613) * llama : sync gguf-llama with llama * tests : fix build + warnings (test-tokenizer-1 still fails) * tests : fix wstring_convert * convert : fix layer names * llama : sync gguf-llama.cpp * convert : update HF converter to new tokenizer voodoo magics * llama : update tokenizer style * convert-llama-h5-to-gguf.py : add token types * constants.py : add token types * gguf.py : add token types * convert-llama-7b-pth-to-gguf.py : add token types * gguf-llama.cpp : fix n_head_kv * convert-llama-h5-to-gguf.py : add 70b gqa support * gguf.py : add tensor data layout * convert-llama-h5-to-gguf.py : add tensor data layout * convert-llama-7b-pth-to-gguf.py : add tensor data layout * gptneox-main.cpp : add tensor data layout * convert-llama-h5-to-gguf.py : clarify the reverse permute * llama : refactor model loading code (#2620) * llama : style formatting + remove helper methods * llama : fix quantization using gguf tool * llama : simplify gguf_file_saver * llama : fix method names * llama : simplify write_header() * llama : no need to pass full file loader to the file saver just gguf_ctx * llama : gguf_file_saver write I32 * llama : refactor tensor names (#2622) * gguf: update tensor names searched in quantization * gguf : define tensor names as constants * gguf : initial write API (not tested yet) * gguf : write to file API (not tested) * gguf : initial write API ready + example * gguf : fix header write * gguf : fixes + simplify example + add ggml_nbytes_pad() * gguf : minor * 
llama : replace gguf_file_saver with new gguf write API * gguf : streaming support when writing files * gguf : remove oboslete write methods * gguf : remove obosolete gguf_get_arr_xxx API * llama : simplify gguf_file_loader * llama : move hparams and vocab from gguf_file_loader to llama_model_loader * llama : merge gguf-util.h in llama.cpp * llama : reorder definitions in .cpp to match .h * llama : minor simplifications * llama : refactor llama_model_loader (WIP) wip : remove ggml_ctx from llama_model_loader wip : merge gguf_file_loader in llama_model_loader * llama : fix shape prints * llama : fix Windows build + fix norm_rms_eps key * llama : throw error on missing KV paris in model meta data * llama : improve printing + log meta data * llama : switch print order of meta data --------- Co-authored-by: M. Yusuf Sarıgöz <yusufsarigoz@gmail.com> * gguf : deduplicate (#2629) * gguf : better type names * dedup : CPU + Metal is working * ggml : fix warnings about unused results * llama.cpp : fix line feed and compiler warning * llama : fix strncpy warning + note token_to_str does not write null * llama : restore the original load/save session implementation Will migrate this to GGUF in the future * convert-llama-h5-to-gguf.py : support alt ctx param name * ggml : assert when using ggml_mul with non-F32 src1 * examples : dedup simple --------- Co-authored-by: klosax <131523366+klosax@users.noreply.github.com> * gguf.py : merge all files in gguf.py * convert-new.py : pick #2427 for HF 70B support * examples/gguf : no need to keep q option for quantization any more * llama.cpp : print actual model size * llama.cpp : use ggml_elements() * convert-new.py : output gguf (#2635) * convert-new.py : output gguf (WIP) * convert-new.py : add gguf key-value pairs * llama : add hparams.ctx_train + no longer print ftype * convert-new.py : minor fixes * convert-new.py : vocab-only option should work now * llama : fix tokenizer to use llama_char_to_byte * tests : add new 
ggml-vocab-llama.gguf * convert-new.py : tensor name mapping * convert-new.py : add map for skipping tensor serialization * convert-new.py : convert script now works * gguf.py : pick some of the refactoring from #2644 * convert-new.py : minor fixes * convert.py : update to support GGUF output * Revert "ci : disable CI temporary to not waste energy" This reverts commit 7e82d25f40386540c2c15226300ad998ecd871ea. * convert.py : n_head_kv optional and .gguf file extension * convert.py : better always have n_head_kv and default it to n_head * llama : sync with recent PRs on master * editorconfig : ignore models folder ggml-ci * ci : update ".bin" to ".gguf" extension ggml-ci * llama : fix llama_model_loader memory leak * gptneox : move as a WIP example * llama : fix lambda capture ggml-ci * ggml : fix bug in gguf_set_kv ggml-ci * common.h : .bin --> .gguf * quantize-stats.cpp : .bin --> .gguf * convert.py : fix HF tensor permuting / unpacking ggml-ci * llama.cpp : typo * llama : throw error if gguf fails to init from file ggml-ci * llama : fix tensor name grepping during quantization ggml-ci * gguf.py : write tensors in a single pass (#2644) * gguf : single pass for writing tensors + refactoring writer * gguf : single pass for writing tensors + refactoring writer * gguf : single pass for writing tensors + refactoring writer * gguf : style fixes in simple conversion script * gguf : refactor gptneox conversion script * gguf : rename h5 to hf (for HuggingFace) * gguf : refactor pth to gguf conversion script * gguf : rm file_type key and method * gguf.py : fix vertical alignment * gguf.py : indentation --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * convert-gptneox-hf-to-gguf.py : fixes * gguf.py : gptneox mapping * convert-llama-hf-to-gguf.py : fixes * convert-llama-7b-pth-to-gguf.py : fixes * ggml.h : reverse GGUF_MAGIC * gguf.py : reverse GGUF_MAGIC * test-tokenizer-0.cpp : fix warning * llama.cpp : print kv general.name * llama.cpp : get special token 
kv and linefeed token id * llama : print number of tensors per type + print arch + style * tests : update vocab file with new magic * editorconfig : fix whitespaces * llama : re-order functions * llama : remove C++ API + reorganize common source in /common dir * llama : minor API updates * llama : avoid hardcoded special tokens * llama : fix MPI build ggml-ci * llama : introduce enum llama_vocab_type + remove hardcoded string constants * convert-falcon-hf-to-gguf.py : falcon HF --> gguf conversion, not tested * falcon-main.cpp : falcon inference example * convert-falcon-hf-to-gguf.py : remove extra kv * convert-gptneox-hf-to-gguf.py : remove extra kv * convert-llama-7b-pth-to-gguf.py : remove extra kv * convert-llama-hf-to-gguf.py : remove extra kv * gguf.py : fix for falcon 40b * falcon-main.cpp : fix for falcon 40b * convert-falcon-hf-to-gguf.py : update ref * convert-falcon-hf-to-gguf.py : add tensor data layout * cmpnct_gpt2bpe.hpp : fixes * falcon-main.cpp : fixes * gptneox-main.cpp : fixes * cmpnct_gpt2bpe.hpp : remove non-general stuff * Update examples/server/README.md Co-authored-by: slaren <slarengh@gmail.com> * cmpnct_gpt2bpe.hpp : cleanup * convert-llama-hf-to-gguf.py : special tokens * convert-llama-7b-pth-to-gguf.py : special tokens * convert-permute-debug.py : permute debug print * convert-permute-debug-master.py : permute debug for master * convert-permute-debug.py : change permute type of attn_q * convert.py : 70b model working (change attn_q permute) * Delete convert-permute-debug-master.py * Delete convert-permute-debug.py * convert-llama-hf-to-gguf.py : fix attn_q permute * gguf.py : fix rope scale kv * convert-llama-hf-to-gguf.py : rope scale and added tokens * convert-llama-7b-pth-to-gguf.py : rope scale and added tokens * llama.cpp : use rope scale kv * convert-llama-7b-pth-to-gguf.py : rope scale fix * convert-llama-hf-to-gguf.py : rope scale fix * py : fix whitespace * gguf : add Python script to convert GGMLv3 LLaMA models to GGUF (#2682) 
* First pass at converting GGMLv3 LLaMA models to GGUF * Cleanups, better output during conversion * Fix vocab space conversion logic * More vocab conversion fixes * Add description to converted GGUF files * Improve help text, expand warning * Allow specifying name and description for output GGUF * Allow overriding vocab and hyperparams from original model metadata * Use correct params override var name * Fix wrong type size for Q8_K Better handling of original style metadata * Set default value for gguf add_tensor raw_shape KW arg * llama : improve token type support (#2668) * Merge tokenizer fixes into the gguf branch. * Add test vocabularies * Adapt convert-new.py (and fix a clang-cl compiler error on windows) * Improved tokenizer test But does it work on MacOS? * Improve token type support - Added @klosax code to convert.py - Improved token type support in vocabulary * Exclude platform dependent tests * More sentencepiece compatibility by eliminating magic numbers * Restored accidentally removed comment * llama : add API for token type ggml-ci * tests : use new tokenizer type API (#2692) * Merge tokenizer fixes into the gguf branch. * Add test vocabularies * Adapt convert-new.py (and fix a clang-cl compiler error on windows) * Improved tokenizer test But does it work on MacOS? * Improve token type support - Added @klosax code to convert.py - Improved token type support in vocabulary * Exclude platform dependent tests * More sentencepiece compatibility by eliminating magic numbers * Restored accidentally removed comment * Improve commentary * Use token type API in test-tokenizer-1.cpp * py : cosmetics * readme : add notice about new file format ggml-ci --------- Co-authored-by: M. Yusuf Sarıgöz <yusufsarigoz@gmail.com> Co-authored-by: klosax <131523366+klosax@users.noreply.github.com> Co-authored-by: goerch <jhr.walter@t-online.de> Co-authored-by: slaren <slarengh@gmail.com> Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Diffstat (limited to 'ggml.c')
-rw-r--r-- ggml.c 1013
1 files changed, 1009 insertions, 4 deletions
diff --git a/ggml.c b/ggml.c
index 44c43b42..c917d73c 100644
--- a/ggml.c
+++ b/ggml.c
@@ -213,10 +213,10 @@ inline static void * ggml_aligned_malloc(size_t size) {
error_desc = "insufficient memory";
break;
}
- GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n",
- __func__, error_desc, size/(1024.0*1024.0));
+ GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n", __func__, error_desc, size/(1024.0*1024.0));
return NULL;
}
+
return aligned_memory;
}
#define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
@@ -4091,7 +4091,11 @@ size_t ggml_nbytes(const struct ggml_tensor * tensor) {
//
// is enough, but just in case, adding the second part
- return GGML_PAD(MAX(tensor->ne[3]*tensor->nb[3], ggml_nelements(tensor)*ggml_type_size(tensor->type))/ggml_blck_size(tensor->type), GGML_MEM_ALIGN);
+ return MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*ggml_type_size(tensor->type))/ggml_blck_size(tensor->type));
+}
+
+size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
+    // unpadded size first, then pad it with GGML_PAD to GGML_MEM_ALIGN
+    const size_t nbytes = ggml_nbytes(tensor);
+
+    return GGML_PAD(nbytes, GGML_MEM_ALIGN);
+}
size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
@@ -9118,6 +9122,8 @@ static void ggml_compute_forward_mul(
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
+ GGML_ASSERT(src1->type == GGML_TYPE_F32 && "only f32 src1 supported for now");
+
switch (src0->type) {
case GGML_TYPE_F32:
{
@@ -16881,7 +16887,7 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
// compute size of intermediate results
// TODO: does not take into account scratch buffers !!!!
for (int i = 0; i < cgraph->n_nodes; ++i) {
- size_eval += ggml_nbytes(cgraph->nodes[i]);
+ size_eval += ggml_nbytes_pad(cgraph->nodes[i]);
}
// print
@@ -18542,6 +18548,1005 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
////////////////////////////////////////////////////////////////////////////////
+struct gguf_str { // length-prefixed string, filled in by gguf_fread_str()
+ uint32_t n; // number of bytes read from the file into `data`
+ char * data; // heap buffer of n + 1 bytes (calloc'ed, so it ends with a zero byte); released with free()
+};
+
+static const size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = { // in-memory size in bytes of one value, indexed by enum gguf_type
+ [GGUF_TYPE_UINT8] = sizeof(uint8_t),
+ [GGUF_TYPE_INT8] = sizeof(int8_t),
+ [GGUF_TYPE_UINT16] = sizeof(uint16_t),
+ [GGUF_TYPE_INT16] = sizeof(int16_t),
+ [GGUF_TYPE_UINT32] = sizeof(uint32_t),
+ [GGUF_TYPE_INT32] = sizeof(int32_t),
+ [GGUF_TYPE_FLOAT32] = sizeof(float),
+ [GGUF_TYPE_BOOL] = sizeof(bool),
+ [GGUF_TYPE_STRING] = sizeof(struct gguf_str), // size of the struct, not of the string contents
+ [GGUF_TYPE_ARRAY] = 0, // undefined - array payloads are sized by their element type instead
+};
+static_assert(GGUF_TYPE_COUNT == 10, "GGUF_TYPE_COUNT != 10"); // breaks the build when the enum changes size without this table being revisited
+
+static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = { // short printable name of each enum gguf_type, returned by gguf_type_name()
+ [GGUF_TYPE_UINT8] = "u8",
+ [GGUF_TYPE_INT8] = "i8",
+ [GGUF_TYPE_UINT16] = "u16",
+ [GGUF_TYPE_INT16] = "i16",
+ [GGUF_TYPE_UINT32] = "u32",
+ [GGUF_TYPE_INT32] = "i32",
+ [GGUF_TYPE_FLOAT32] = "f32",
+ [GGUF_TYPE_BOOL] = "bool",
+ [GGUF_TYPE_STRING] = "str",
+ [GGUF_TYPE_ARRAY] = "arr",
+};
+static_assert(GGUF_TYPE_COUNT == 10, "GGUF_TYPE_COUNT != 10"); // breaks the build when the enum changes size without this table being revisited
+
+union gguf_value { // payload of one key-value pair; the active member is selected by gguf_kv.type
+ uint8_t uint8;
+ int8_t int8;
+ uint16_t uint16;
+ int16_t int16;
+ uint32_t uint32;
+ int32_t int32;
+ float float32;
+ bool bool_;
+
+ struct gguf_str str; // GGUF_TYPE_STRING: owns str.data
+
+ struct { // GGUF_TYPE_ARRAY
+ enum gguf_type type; // element type; the loader rejects nested arrays
+
+ uint32_t n; // number of elements
+ void * data; // malloc'ed buffer: n plain values, or n struct gguf_str when type is GGUF_TYPE_STRING
+ } arr;
+};
+
+struct gguf_kv { // one metadata entry: key string plus typed value
+ struct gguf_str key; // owns key.data; compared with strcmp() in gguf_find_key()
+
+ uint32_t n_bytes; // TODO: is this actually needed? (the loader's read of this field is commented out)
+
+ enum gguf_type type; // selects the active member of `value`
+ union gguf_value value;
+};
+
+struct gguf_header { // leading fields of a GGUF file; the loader reads them one by one in this order
+ uint32_t magic; // must equal GGUF_MAGIC for the file to be accepted
+ uint32_t version; // GGUF_VERSION for contexts made by gguf_init_empty(), otherwise whatever the file says
+ uint32_t n_tensors; // number of entries in gguf_context.infos
+ uint32_t n_kv; // number of entries in gguf_context.kv
+};
+
+struct gguf_tensor_info { // description of one tensor stored in the data section
+ struct gguf_str name; // owns name.data; matched with strcmp() in gguf_find_tensor()
+
+ uint32_t n_dims; // number of `ne` entries actually stored in the file
+ uint32_t ne[GGML_MAX_DIMS]; // elements per dimension; the loader presets every entry to 1 before reading n_dims of them
+
+ enum ggml_type type; // element type, used for the block size / type size computation
+
+ uint64_t offset; // offset from start of `data`, must be a multiple of `ALIGNMENT`
+
+ // for writing API
+ const void * data; // not set by gguf_init_from_file()
+ size_t size; // not set by gguf_init_from_file()
+};
+
+struct gguf_context { // in-memory view of a GGUF file; released with gguf_free()
+ struct gguf_header header;
+
+ struct gguf_kv * kv; // header.n_kv entries, or NULL
+ struct gguf_tensor_info * infos; // header.n_tensors entries, or NULL
+
+ size_t alignment; // GGUF_DEFAULT_ALIGNMENT unless the "general.alignment" key is present in the file
+ size_t offset; // offset of `data` from beginning of file
+ size_t size; // size of `data` in bytes
+
+ //uint8_t * padding;
+ void * data; // loaded tensor blob; stays NULL unless gguf_init_from_file() was asked to read the data
+};
+
+// read exactly `size` bytes from `file` into `dst`
+// *offset is advanced by the number of bytes that were actually read, also on a short read
+// returns true only if all `size` bytes were read
+static bool gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offset) {
+    // with an element size of 1, fread reports the byte count
+    const size_t n_read = fread(dst, 1, size, file);
+
+    *offset += n_read;
+
+    if (n_read != size) {
+        return false;
+    }
+
+    return true;
+}
+
+// read a length-prefixed string: a uint32_t byte count followed by that many bytes
+//
+// on success p->n holds the length and p->data points to a freshly allocated buffer of
+// p->n + 1 bytes whose last byte is zero; the caller releases it with free()
+// on failure false is returned and p->data is either NULL or a valid heap buffer,
+// so free(p->data) is safe in both cases
+// *offset is advanced by the number of bytes consumed from the file
+static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) {
+    p->n    = 0;
+    p->data = NULL;
+
+    if (!gguf_fread_el(file, &p->n, sizeof(p->n), offset)) {
+        // a short read may have written part of the length - do not leave a bogus value behind
+        p->n = 0;
+        return false;
+    }
+
+    // do the "+ 1" in size_t: in 32-bit arithmetic `p->n + 1` wraps to 0 for n == UINT32_MAX,
+    // which would yield an empty buffer followed by a 4 GiB read into it
+    const size_t n_alloc = (size_t) p->n + 1;
+    if (n_alloc <= p->n) {
+        // still wrapped - only possible where size_t is 32 bits wide
+        return false;
+    }
+
+    // TODO: how to avoid mallocs for strings?
+    p->data = calloc(n_alloc, 1);
+    if (p->data == NULL) {
+        return false;
+    }
+
+    return gguf_fread_el(file, p->data, p->n, offset);
+}
+
+// create a context with no key-value pairs, no tensor infos and no data
+// returns NULL if the context cannot be allocated; otherwise release it with gguf_free()
+struct gguf_context * gguf_init_empty(void) {
+    struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
+    if (ctx == NULL) {
+        // the allocation can fail - do not write through a NULL pointer
+        return NULL;
+    }
+
+    ctx->header.magic     = GGUF_MAGIC;
+    ctx->header.version   = GGUF_VERSION;
+    ctx->header.n_tensors = 0;
+    ctx->header.n_kv      = 0;
+
+    ctx->kv    = NULL;
+    ctx->infos = NULL;
+
+    ctx->alignment = GGUF_DEFAULT_ALIGNMENT;
+    ctx->offset    = 0;
+    ctx->size      = 0;
+
+    ctx->data = NULL;
+
+    return ctx;
+}
+
+// load a GGUF file: header, key-value pairs and tensor infos, optionally also the tensor data
+//
+// fname  - path of the file to read
+// params - if params.ctx is not NULL, a new ggml_context is created and stored in *params.ctx; it
+//          holds one ggml_tensor per tensor info. with params.no_alloc == false the whole data
+//          section is read into that context and every tensor points into it
+//
+// returns NULL on any failure: file cannot be opened, wrong magic, truncated or malformed content,
+// allocation failure. on failure everything allocated here is released again and *params.ctx,
+// if it was created, is freed and reset to NULL
+// on success the result must be released with gguf_free()
+//
+// the file content is treated as untrusted: all buffers handed to gguf_free() are zero-initialized
+// so that a partially loaded context can be freed, and counts / types / offsets coming from the
+// file are validated before they are used as array indices, divisors or pointer offsets
+struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
+    FILE * file = fopen(fname, "rb");
+    if (!file) {
+        return NULL;
+    }
+
+    // offset from start of file
+    size_t offset = 0;
+
+    uint32_t magic = 0;
+
+    // check the magic before making allocations
+    {
+        const bool magic_ok = gguf_fread_el(file, &magic, sizeof(magic), &offset);
+
+        if (!magic_ok || magic != GGUF_MAGIC) {
+            fprintf(stderr, "%s: invalid magic number %08x\n", __func__, magic);
+            fclose(file);
+            return NULL;
+        }
+    }
+
+    bool ok = true;
+
+    struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
+    if (ctx == NULL) {
+        fprintf(stderr, "%s: failed to allocate the gguf context\n", __func__);
+        fclose(file);
+        return NULL;
+    }
+
+    // start from an all-zero context so that gguf_free() is safe at every exit below
+    memset(ctx, 0, sizeof(struct gguf_context));
+
+    // read the header
+    {
+        ctx->header.magic = magic;
+
+        ctx->kv    = NULL;
+        ctx->infos = NULL;
+        ctx->data  = NULL;
+
+        ok = ok && gguf_fread_el(file, &ctx->header.version,   sizeof(ctx->header.version),   &offset);
+        ok = ok && gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset);
+        ok = ok && gguf_fread_el(file, &ctx->header.n_kv,      sizeof(ctx->header.n_kv),      &offset);
+
+        if (!ok) {
+            fprintf(stderr, "%s: failed to read header\n", __func__);
+            fclose(file);
+            gguf_free(ctx);
+            return NULL;
+        }
+    }
+
+    // read the kv pairs
+    {
+        if (ctx->header.n_kv > 0) {
+            const size_t kv_size = ctx->header.n_kv * sizeof(struct gguf_kv);
+
+            ctx->kv = GGML_ALIGNED_MALLOC(kv_size);
+
+            ok = ok && ctx->kv != NULL;
+
+            if (ok) {
+                // gguf_free() walks all n_kv entries and frees their pointers, so entries that
+                // are never reached because of a read error must hold NULL pointers
+                memset(ctx->kv, 0, kv_size);
+            }
+        }
+
+        for (uint32_t i = 0; ok && i < ctx->header.n_kv; ++i) {
+            struct gguf_kv * kv = &ctx->kv[i];
+
+            //fprintf(stderr, "%s: reading kv %d\n", __func__, i);
+
+            ok = ok && gguf_fread_str(file, &kv->key, &offset);
+            //ok = ok && gguf_fread_el (file, &kv->n_bytes, sizeof(kv->n_bytes), &offset);
+            ok = ok && gguf_fread_el (file, &kv->type, sizeof(kv->type), &offset);
+
+            if (!ok) {
+                break;
+            }
+
+            //fprintf(stderr, "%s: reading kv with key %s\n", __func__, kv->key.data);
+
+            switch (kv->type) {
+                case GGUF_TYPE_UINT8:   ok = gguf_fread_el (file, &kv->value.uint8,   sizeof(kv->value.uint8),   &offset); break;
+                case GGUF_TYPE_INT8:    ok = gguf_fread_el (file, &kv->value.int8,    sizeof(kv->value.int8),    &offset); break;
+                case GGUF_TYPE_UINT16:  ok = gguf_fread_el (file, &kv->value.uint16,  sizeof(kv->value.uint16),  &offset); break;
+                case GGUF_TYPE_INT16:   ok = gguf_fread_el (file, &kv->value.int16,   sizeof(kv->value.int16),   &offset); break;
+                case GGUF_TYPE_UINT32:  ok = gguf_fread_el (file, &kv->value.uint32,  sizeof(kv->value.uint32),  &offset); break;
+                case GGUF_TYPE_INT32:   ok = gguf_fread_el (file, &kv->value.int32,   sizeof(kv->value.int32),   &offset); break;
+                case GGUF_TYPE_FLOAT32: ok = gguf_fread_el (file, &kv->value.float32, sizeof(kv->value.float32), &offset); break;
+                case GGUF_TYPE_BOOL:    ok = gguf_fread_el (file, &kv->value.bool_,   sizeof(kv->value.bool_),   &offset); break;
+                case GGUF_TYPE_STRING:  ok = gguf_fread_str(file, &kv->value.str,                                &offset); break;
+                case GGUF_TYPE_ARRAY:
+                    {
+                        ok = ok && gguf_fread_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type), &offset);
+                        ok = ok && gguf_fread_el(file, &kv->value.arr.n,    sizeof(kv->value.arr.n),    &offset);
+
+                        if (!ok) {
+                            // arr.data is still NULL, nothing to allocate for a truncated entry
+                            break;
+                        }
+
+                        switch (kv->value.arr.type) {
+                            case GGUF_TYPE_UINT8:
+                            case GGUF_TYPE_INT8:
+                            case GGUF_TYPE_UINT16:
+                            case GGUF_TYPE_INT16:
+                            case GGUF_TYPE_UINT32:
+                            case GGUF_TYPE_INT32:
+                            case GGUF_TYPE_FLOAT32:
+                            case GGUF_TYPE_BOOL:
+                                {
+                                    const size_t el_size = GGUF_TYPE_SIZE[kv->value.arr.type];
+
+                                    kv->value.arr.data = calloc(kv->value.arr.n, el_size);
+
+                                    // an empty array may legitimately get a NULL buffer
+                                    ok = ok && (kv->value.arr.data != NULL || kv->value.arr.n == 0);
+                                    ok = ok && gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * el_size, &offset);
+                                } break;
+                            case GGUF_TYPE_STRING:
+                                {
+                                    // zero-filled, so that strings which are not reached because of a
+                                    // read error have data == NULL when gguf_free() visits them
+                                    kv->value.arr.data = calloc(kv->value.arr.n, sizeof(struct gguf_str));
+
+                                    ok = ok && (kv->value.arr.data != NULL || kv->value.arr.n == 0);
+
+                                    for (uint32_t j = 0; ok && j < kv->value.arr.n; ++j) {
+                                        ok = gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset);
+                                    }
+                                } break;
+                            case GGUF_TYPE_ARRAY:
+                            case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
+                            default:
+                                {
+                                    // element type not known to this code: corrupt file
+                                    ok = false;
+                                } break;
+                        };
+                    } break;
+                case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
+                default:
+                    {
+                        // value type not known to this code: the rest of the file cannot be parsed
+                        ok = false;
+                    } break;
+            };
+
+            if (!ok) {
+                break;
+            }
+        }
+
+        if (!ok) {
+            fprintf(stderr, "%s: failed to read key-value pairs\n", __func__);
+            fclose(file);
+            gguf_free(ctx);
+            return NULL;
+        }
+    }
+
+    // read the tensor infos
+    {
+        if (ctx->header.n_tensors > 0) {
+            const size_t infos_size = ctx->header.n_tensors * sizeof(struct gguf_tensor_info);
+
+            ctx->infos = GGML_ALIGNED_MALLOC(infos_size);
+
+            ok = ok && ctx->infos != NULL;
+
+            if (ok) {
+                // same reason as for the kv array: gguf_free() frees the name of every entry
+                memset(ctx->infos, 0, infos_size);
+            }
+        }
+
+        for (uint32_t i = 0; ok && i < ctx->header.n_tensors; ++i) {
+            struct gguf_tensor_info * info = &ctx->infos[i];
+
+            for (int j = 0; j < GGML_MAX_DIMS; ++j) {
+                info->ne[j] = 1;
+            }
+
+            ok = ok && gguf_fread_str(file, &info->name, &offset);
+            ok = ok && gguf_fread_el (file, &info->n_dims, sizeof(info->n_dims), &offset);
+
+            // n_dims comes from the file - a larger value would write past the end of info->ne
+            ok = ok && info->n_dims <= GGML_MAX_DIMS;
+
+            for (uint32_t j = 0; ok && j < info->n_dims; ++j) {
+                ok = gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset);
+            }
+
+            ok = ok && gguf_fread_el (file, &info->type,   sizeof(info->type),   &offset);
+            ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset), &offset);
+
+            // the type is used below as an index into the per-type tables
+            ok = ok && (uint32_t) info->type < GGML_TYPE_COUNT;
+        }
+
+        if (!ok) {
+            fprintf(stderr, "%s: failed to read tensor info\n", __func__);
+            fclose(file);
+            gguf_free(ctx);
+            return NULL;
+        }
+    }
+
+    ctx->alignment = GGUF_DEFAULT_ALIGNMENT;
+
+    const int alignment_idx = gguf_find_key(ctx, "general.alignment");
+    if (alignment_idx != -1) {
+        ctx->alignment = gguf_get_val_u32(ctx, alignment_idx);
+    }
+
+    // the alignment is used as a divisor below
+    if (ctx->alignment == 0) {
+        fprintf(stderr, "%s: invalid alignment 0\n", __func__);
+        fclose(file);
+        gguf_free(ctx);
+        return NULL;
+    }
+
+    // we require the data section to be aligned, so take into account any padding
+    {
+        const size_t offset_pad = offset % ctx->alignment;
+
+        if (offset_pad != 0) {
+            offset += ctx->alignment - offset_pad;
+
+            if (fseek(file, (long) offset, SEEK_SET) != 0) {
+                fprintf(stderr, "%s: failed to seek to the data section\n", __func__);
+                fclose(file);
+                gguf_free(ctx);
+                return NULL;
+            }
+        }
+    }
+
+    // store the current file offset - this is where the data section starts
+    ctx->offset = offset;
+
+    // compute the total size of the data section, taking into account the alignment
+    {
+        ctx->size = 0;
+        for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
+            struct gguf_tensor_info * info = &ctx->infos[i];
+
+            const int64_t ne =
+                (int64_t) info->ne[0] *
+                (int64_t) info->ne[1] *
+                (int64_t) info->ne[2] *
+                (int64_t) info->ne[3];
+
+            // a type id without a block size cannot be stored - also guards the modulo / division below
+            const int blck_size = ggml_blck_size(info->type);
+            if (blck_size <= 0) {
+                fprintf(stderr, "%s: tensor '%s' has an unsupported type (%d)\n",
+                        __func__, info->name.data, (int) info->type);
+                fclose(file);
+                gguf_free(ctx);
+                return NULL;
+            }
+
+            if (ne % blck_size != 0) {
+                fprintf(stderr, "%s: tensor '%s' number of elements (%" PRId64 ") is not a multiple of block size (%d)\n",
+                        __func__, info->name.data, ne, blck_size);
+                fclose(file);
+                gguf_free(ctx);
+                return NULL;
+            }
+
+            const size_t size_cur = (ne*ggml_type_size(info->type))/blck_size;
+
+            ctx->size += GGML_PAD(size_cur, ctx->alignment);
+        }
+    }
+
+    // load the tensor data only if requested
+    if (params.ctx != NULL) {
+        // if the provided gguf_context is no_alloc, then we create "empty" tensors and do not read the binary blob
+        // otherwise, we load the binary blob into the created ggml_context as well, and point the "data" members of
+        // the ggml_tensor structs to the appropriate locations in the binary blob
+
+        // compute the exact size needed for the new ggml_context
+        const size_t mem_size =
+            params.no_alloc ?
+            (ctx->header.n_tensors    )*ggml_tensor_overhead() :
+            (ctx->header.n_tensors + 1)*ggml_tensor_overhead() + ctx->size;
+
+        struct ggml_init_params pdata = {
+            .mem_size   = mem_size,
+            .mem_buffer = NULL,
+            .no_alloc   = params.no_alloc,
+        };
+
+        *params.ctx = ggml_init(pdata);
+
+        struct ggml_context * ctx_data = *params.ctx;
+
+        if (ctx_data == NULL) {
+            fprintf(stderr, "%s: failed to create the ggml context\n", __func__);
+            fclose(file);
+            gguf_free(ctx);
+            return NULL;
+        }
+
+        struct ggml_tensor * data = NULL;
+
+        if (params.no_alloc == false) {
+            data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size);
+
+            ok = ok && data != NULL;
+
+            // read the binary blob with the tensor data
+            ok = ok && gguf_fread_el(file, data->data, ctx->size, &offset);
+
+            if (!ok) {
+                fprintf(stderr, "%s: failed to read tensor data\n", __func__);
+                fclose(file);
+                ggml_free(ctx_data);
+                *params.ctx = NULL; // do not hand a freed context back to the caller
+                gguf_free(ctx);
+                return NULL;
+            }
+
+            ctx->data = data->data;
+        }
+
+        ggml_set_no_alloc(ctx_data, true);
+
+        // create the tensors
+        for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
+            const int64_t ne[GGML_MAX_DIMS] = {
+                ctx->infos[i].ne[0],
+                ctx->infos[i].ne[1],
+                ctx->infos[i].ne[2],
+                ctx->infos[i].ne[3],
+            };
+
+            struct ggml_tensor * cur = ggml_new_tensor(ctx_data, ctx->infos[i].type, ctx->infos[i].n_dims, ne);
+
+            // check before the tensor is touched
+            if (cur == NULL) {
+                ok = false;
+                break;
+            }
+
+            ggml_set_name(cur, ctx->infos[i].name.data);
+
+            // point the data member to the appropriate location in the binary blob using the tensor infos
+            if (params.no_alloc == false) {
+                // the offset comes from the file - the tensor must lie completely inside the loaded blob
+                if (ctx->infos[i].offset > ctx->size || ggml_nbytes(cur) > ctx->size - ctx->infos[i].offset) {
+                    fprintf(stderr, "%s: tensor '%s' data is outside of the data section\n", __func__, ctx->infos[i].name.data);
+                    ok = false;
+                    break;
+                }
+
+                //cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file
+                cur->data = (char *) data->data + ctx->infos[i].offset;               // offset from data
+            }
+        }
+
+        if (!ok) {
+            fprintf(stderr, "%s: failed to read the tensor data\n", __func__);
+            fclose(file);
+            ggml_free(ctx_data);
+            *params.ctx = NULL; // do not hand a freed context back to the caller
+            gguf_free(ctx);
+            return NULL;
+        }
+
+        ggml_set_no_alloc(ctx_data, params.no_alloc);
+    }
+
+    fclose(file);
+
+    return ctx;
+}
+
+// release a context together with every string and array buffer it owns
+// passing NULL is allowed and does nothing
+void gguf_free(struct gguf_context * ctx) {
+    if (ctx == NULL) {
+        return;
+    }
+
+    if (ctx->kv != NULL) {
+        // free string memory - not great..
+        for (uint32_t ikv = 0; ikv < ctx->header.n_kv; ++ikv) {
+            struct gguf_kv * cur = &ctx->kv[ikv];
+
+            // free(NULL) is a no-op, so the pointers need no guard
+            free(cur->key.data);
+
+            if (cur->type == GGUF_TYPE_STRING) {
+                free(cur->value.str.data);
+            } else if (cur->type == GGUF_TYPE_ARRAY && cur->value.arr.data != NULL) {
+                // string arrays own one buffer per element in addition to the array itself
+                if (cur->value.arr.type == GGUF_TYPE_STRING) {
+                    struct gguf_str * strs = (struct gguf_str *) cur->value.arr.data;
+
+                    for (uint32_t is = 0; is < cur->value.arr.n; ++is) {
+                        free(strs[is].data);
+                    }
+                }
+
+                free(cur->value.arr.data);
+            }
+        }
+
+        GGML_ALIGNED_FREE(ctx->kv);
+    }
+
+    if (ctx->infos != NULL) {
+        for (uint32_t it = 0; it < ctx->header.n_tensors; ++it) {
+            free(ctx->infos[it].name.data);
+        }
+
+        GGML_ALIGNED_FREE(ctx->infos);
+    }
+
+    GGML_ALIGNED_FREE(ctx);
+}
+
+// printable short name of a gguf type, looked up in GGUF_TYPE_NAME (no range check on `type`)
+const char * gguf_type_name(enum gguf_type type) {
+    const char * name = GGUF_TYPE_NAME[type];
+
+    return name;
+}
+
+// version field of the header, converted to int
+int gguf_get_version(struct gguf_context * ctx) {
+    const struct gguf_header * hdr = &ctx->header;
+
+    return hdr->version;
+}
+
+// alignment stored in the context
+size_t gguf_get_alignment(struct gguf_context * ctx) {
+    const size_t alignment = ctx->alignment;
+
+    return alignment;
+}
+
+// offset of the data section from the beginning of the file
+size_t gguf_get_data_offset(struct gguf_context * ctx) {
+    const size_t data_offset = ctx->offset;
+
+    return data_offset;
+}
+
+// pointer stored in ctx->data (NULL when no tensor data has been attached to the context)
+void * gguf_get_data(struct gguf_context * ctx) {
+    void * blob = ctx->data;
+
+    return blob;
+}
+
+// number of key-value pairs recorded in the header, converted to int
+int gguf_get_n_kv(struct gguf_context * ctx) {
+    const struct gguf_header * hdr = &ctx->header;
+
+    return hdr->n_kv;
+}
+
+// index of the first key-value pair whose key equals `key`, or -1 if there is none
+int gguf_find_key(struct gguf_context * ctx, const char * key) {
+    const int count = gguf_get_n_kv(ctx);
+
+    int idx = 0;
+    while (idx < count) {
+        if (strcmp(key, gguf_get_key(ctx, idx)) == 0) {
+            return idx;
+        }
+        ++idx;
+    }
+
+    // key not found
+    return -1;
+}
+
+// key string of pair `i` (no range check on `i`)
+const char * gguf_get_key(struct gguf_context * ctx, int i) {
+    const struct gguf_kv * kv = &ctx->kv[i];
+
+    return kv->key.data;
+}
+
+// value type of pair `i` (no range check on `i`)
+enum gguf_type gguf_get_kv_type(struct gguf_context * ctx, int i) {
+    const struct gguf_kv * kv = &ctx->kv[i];
+
+    return kv->type;
+}
+
+// element type of the array stored in pair `i`; the pair is not checked to actually be an array
+enum gguf_type gguf_get_arr_type(struct gguf_context * ctx, int i) {
+    const struct gguf_kv * kv = &ctx->kv[i];
+
+    return kv->value.arr.type;
+}
+
+// Pointer to the element storage of the i-th pair's array value (not a copy).
+// No check is made that the pair actually holds an array.
+const void * gguf_get_arr_data(struct gguf_context * ctx, int i) {
+    const struct gguf_kv * kv = &ctx->kv[i];
+    return kv->value.arr.data;
+}
+
+// i-th string of the string array stored under pair `key_id`.
+// The array storage is interpreted as an array of struct gguf_str;
+// neither index is range-checked and the element type is not verified.
+const char * gguf_get_arr_str(struct gguf_context * ctx, int key_id, int i) {
+    const struct gguf_str * strings = (const struct gguf_str *) ctx->kv[key_id].value.arr.data;
+    return strings[i].data;
+}
+
+// Element count of the i-th pair's array value, returned as int.
+int gguf_get_arr_n(struct gguf_context * ctx, int i) {
+    const struct gguf_kv * kv = &ctx->kv[i];
+    return kv->value.arr.n;
+}
+
+// Read the i-th pair's value as uint8. The stored type is not checked.
+uint8_t gguf_get_val_u8(struct gguf_context * ctx, int i) {
+    const struct gguf_kv * kv = &ctx->kv[i];
+    return kv->value.uint8;
+}
+
+// Read the i-th pair's value as int8. The stored type is not checked.
+int8_t gguf_get_val_i8(struct gguf_context * ctx, int i) {
+    const struct gguf_kv * kv = &ctx->kv[i];
+    return kv->value.int8;
+}
+
+// Read the i-th pair's value as uint16. The stored type is not checked.
+uint16_t gguf_get_val_u16(struct gguf_context * ctx, int i) {
+    const struct gguf_kv * kv = &ctx->kv[i];
+    return kv->value.uint16;
+}
+
+// Read the i-th pair's value as int16. The stored type is not checked.
+int16_t gguf_get_val_i16(struct gguf_context * ctx, int i) {
+    const struct gguf_kv * kv = &ctx->kv[i];
+    return kv->value.int16;
+}
+
+// Read the i-th pair's value as uint32. The stored type is not checked.
+uint32_t gguf_get_val_u32(struct gguf_context * ctx, int i) {
+    const struct gguf_kv * kv = &ctx->kv[i];
+    return kv->value.uint32;
+}
+
+// Read the i-th pair's value as int32. The stored type is not checked.
+int32_t gguf_get_val_i32(struct gguf_context * ctx, int i) {
+    const struct gguf_kv * kv = &ctx->kv[i];
+    return kv->value.int32;
+}
+
+// Read the i-th pair's value as float32. The stored type is not checked.
+float gguf_get_val_f32(struct gguf_context * ctx, int i) {
+    const struct gguf_kv * kv = &ctx->kv[i];
+    return kv->value.float32;
+}
+
+// Read the i-th pair's value as bool. The stored type is not checked.
+bool gguf_get_val_bool(struct gguf_context * ctx, int i) {
+    const struct gguf_kv * kv = &ctx->kv[i];
+    return kv->value.bool_;
+}
+
+// Read the i-th pair's value as a string (pointer into the context, not a copy).
+// The stored type is not checked.
+const char * gguf_get_val_str (struct gguf_context * ctx, int i) {
+    const struct gguf_kv * kv = &ctx->kv[i];
+    return kv->value.str.data;
+}
+
+// Number of tensors recorded in the header, returned as int.
+int gguf_get_n_tensors(struct gguf_context * ctx) {
+    const int count = ctx->header.n_tensors;
+    return count;
+}
+
+// Linear search for a tensor info entry by name.
+// Returns the index of the first entry whose name compares equal (strcmp),
+// or -1 when no entry matches.
+int gguf_find_tensor(struct gguf_context * ctx, const char * name) {
+    const int count = gguf_get_n_tensors(ctx);
+
+    for (int idx = 0; idx < count; ++idx) {
+        const char * candidate = gguf_get_tensor_name(ctx, idx);
+        if (strcmp(name, candidate) == 0) {
+            return idx;
+        }
+    }
+
+    return -1;
+}
+
+// Offset recorded for the i-th tensor info entry. `i` is not range-checked.
+size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i) {
+    const struct gguf_tensor_info * info = &ctx->infos[i];
+    return info->offset;
+}
+
+// Name string of the i-th tensor info entry (pointer into the context, not a copy).
+// `i` is not range-checked.
+char * gguf_get_tensor_name(struct gguf_context * ctx, int i) {
+    struct gguf_tensor_info * info = &ctx->infos[i];
+    return info->name.data;
+}
+
+// Return the index of the key-value pair named `key`, appending a new pair
+// when the key is not present yet.
+//
+// A newly appended pair gets a private copy of the key and a zeroed type/value;
+// the caller is expected to fill in the real type and value afterwards.
+// The call may move ctx->kv, so pointers into the old array must not be kept.
+//
+// NOTE(review): ctx->kv is released with GGML_ALIGNED_FREE when the context is
+// freed - confirm that growing it with realloc is valid on every platform.
+static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
+    const int idx = gguf_find_key(ctx, key);
+    if (idx >= 0) {
+        return idx;
+    }
+
+    const int n_kv = gguf_get_n_kv(ctx);
+
+    // grow through a temporary: a failed realloc returns NULL and must not be
+    // dereferenced, nor overwrite the only pointer to the existing array
+    struct gguf_kv * kv = realloc(ctx->kv, (n_kv + 1) * sizeof(struct gguf_kv));
+    GGML_ASSERT(kv != NULL && "failed to grow kv array");
+    ctx->kv = kv;
+
+    char * key_copy = strdup(key);
+    GGML_ASSERT(key_copy != NULL && "failed to copy key");
+
+    // realloc does not initialize the new slot - give it a defined state
+    memset(&ctx->kv[n_kv], 0, sizeof(struct gguf_kv));
+
+    ctx->kv[n_kv].key.n    = strlen(key) + 1;
+    ctx->kv[n_kv].key.data = key_copy;
+    ctx->header.n_kv++;
+
+    return n_kv;
+}
+
+// Store `val` under `key` as a GGUF_TYPE_UINT8 value, adding the key if needed.
+void gguf_set_val_u8(struct gguf_context * ctx, const char * key, uint8_t val) {
+    // resolve the index first: adding a key may move ctx->kv
+    const int pos = gguf_get_or_add_key(ctx, key);
+    struct gguf_kv * kv = &ctx->kv[pos];
+
+    kv->value.uint8 = val;
+    kv->type        = GGUF_TYPE_UINT8;
+}
+
+// Store `val` under `key` as a GGUF_TYPE_INT8 value, adding the key if needed.
+void gguf_set_val_i8(struct gguf_context * ctx, const char * key, int8_t val) {
+    // resolve the index first: adding a key may move ctx->kv
+    const int pos = gguf_get_or_add_key(ctx, key);
+    struct gguf_kv * kv = &ctx->kv[pos];
+
+    kv->value.int8 = val;
+    kv->type       = GGUF_TYPE_INT8;
+}
+
+// Store `val` under `key` as a GGUF_TYPE_UINT16 value, adding the key if needed.
+void gguf_set_val_u16(struct gguf_context * ctx, const char * key, uint16_t val) {
+    // resolve the index first: adding a key may move ctx->kv
+    const int pos = gguf_get_or_add_key(ctx, key);
+    struct gguf_kv * kv = &ctx->kv[pos];
+
+    kv->value.uint16 = val;
+    kv->type         = GGUF_TYPE_UINT16;
+}
+
+// Store `val` under `key` as a GGUF_TYPE_INT16 value, adding the key if needed.
+void gguf_set_val_i16(struct gguf_context * ctx, const char * key, int16_t val) {
+    // resolve the index first: adding a key may move ctx->kv
+    const int pos = gguf_get_or_add_key(ctx, key);
+    struct gguf_kv * kv = &ctx->kv[pos];
+
+    kv->value.int16 = val;
+    kv->type        = GGUF_TYPE_INT16;
+}
+
+// Store `val` under `key` as a GGUF_TYPE_UINT32 value, adding the key if needed.
+void gguf_set_val_u32(struct gguf_context * ctx, const char * key, uint32_t val) {
+    // resolve the index first: adding a key may move ctx->kv
+    const int pos = gguf_get_or_add_key(ctx, key);
+    struct gguf_kv * kv = &ctx->kv[pos];
+
+    kv->value.uint32 = val;
+    kv->type         = GGUF_TYPE_UINT32;
+}
+
+// Store `val` under `key` as a GGUF_TYPE_INT32 value, adding the key if needed.
+void gguf_set_val_i32(struct gguf_context * ctx, const char * key, int32_t val) {
+    // resolve the index first: adding a key may move ctx->kv
+    const int pos = gguf_get_or_add_key(ctx, key);
+    struct gguf_kv * kv = &ctx->kv[pos];
+
+    kv->value.int32 = val;
+    kv->type        = GGUF_TYPE_INT32;
+}
+
+// Store `val` under `key` as a GGUF_TYPE_FLOAT32 value, adding the key if needed.
+void gguf_set_val_f32(struct gguf_context * ctx, const char * key, float val) {
+    // resolve the index first: adding a key may move ctx->kv
+    const int pos = gguf_get_or_add_key(ctx, key);
+    struct gguf_kv * kv = &ctx->kv[pos];
+
+    kv->value.float32 = val;
+    kv->type          = GGUF_TYPE_FLOAT32;
+}
+
+// Store `val` under `key` as a GGUF_TYPE_BOOL value, adding the key if needed.
+void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val) {
+    // resolve the index first: adding a key may move ctx->kv
+    const int pos = gguf_get_or_add_key(ctx, key);
+    struct gguf_kv * kv = &ctx->kv[pos];
+
+    kv->value.bool_ = val;
+    kv->type        = GGUF_TYPE_BOOL;
+}
+
+// Store a private copy (strdup) of `val` under `key` as a GGUF_TYPE_STRING value,
+// adding the key if needed. The recorded length includes the terminating NUL.
+//
+// NOTE(review): when `key` already exists, its previous value is overwritten
+// without being freed here - looks like a leak for string/array values; confirm.
+void gguf_set_val_str(struct gguf_context * ctx, const char * key, const char * val) {
+    // resolve the index first: adding a key may move ctx->kv
+    const int pos = gguf_get_or_add_key(ctx, key);
+    struct gguf_kv * kv = &ctx->kv[pos];
+
+    kv->type           = GGUF_TYPE_STRING;
+    kv->value.str.n    = strlen(val) + 1;
+    kv->value.str.data = strdup(val);
+}
+
+// Store a copy of `n` fixed-size elements of type `type` under `key` as an array
+// value, adding the key if needed.
+//
+// ctx  - context to modify
+// key  - name of the key-value pair
+// type - element type; its byte size is taken from GGUF_TYPE_SIZE
+// data - source elements, n*GGUF_TYPE_SIZE[type] bytes are copied
+// n    - number of elements
+//
+// Aborts through GGML_ASSERT when the copy cannot be allocated.
+void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    const size_t nbytes = n*GGUF_TYPE_SIZE[type];
+
+    // malloc(0) may legitimately return NULL, so only a non-empty request can fail
+    void * copy = malloc(nbytes);
+    GGML_ASSERT((copy != NULL || nbytes == 0) && "failed to allocate array data");
+
+    if (nbytes > 0) {
+        memcpy(copy, data, nbytes);
+    }
+
+    ctx->kv[idx].type           = GGUF_TYPE_ARRAY;
+    ctx->kv[idx].value.arr.type = type;
+    ctx->kv[idx].value.arr.n    = n;
+    ctx->kv[idx].value.arr.data = copy;
+}
+
+// Store copies of the `n` C strings in `data` under `key` as an array of
+// GGUF_TYPE_STRING, adding the key if needed. Each string is duplicated with
+// strdup and its recorded length includes the terminating NUL.
+//
+// Aborts through GGML_ASSERT when an allocation fails.
+void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char ** data, int n) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    // malloc(0) may legitimately return NULL, so only a non-empty request can fail
+    struct gguf_str * strs = malloc(n*sizeof(struct gguf_str));
+    GGML_ASSERT((strs != NULL || n == 0) && "failed to allocate string array");
+
+    for (int i = 0; i < n; i++) {
+        strs[i].n    = strlen(data[i]) + 1;
+        strs[i].data = strdup(data[i]);
+        GGML_ASSERT(strs[i].data != NULL && "failed to copy string");
+    }
+
+    ctx->kv[idx].type           = GGUF_TYPE_ARRAY;
+    ctx->kv[idx].value.arr.type = GGUF_TYPE_STRING;
+    ctx->kv[idx].value.arr.n    = n;
+    ctx->kv[idx].value.arr.data = strs;
+}
+
+// Set or add every key-value pair of `src` in `ctx`.
+//
+// Each pair is forwarded to the typed setter matching its type, so values are
+// copied into `ctx`. String arrays go through a temporary array of pointers
+// that is freed again before moving on. Nested arrays and GGUF_TYPE_COUNT
+// trigger GGML_ASSERT.
+void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
+    for (uint32_t i = 0; i < src->header.n_kv; i++) {
+        const struct gguf_kv * kv  = &src->kv[i];
+        const char           * key = kv->key.data;
+
+        switch (kv->type) {
+            case GGUF_TYPE_UINT8:   gguf_set_val_u8  (ctx, key, kv->value.uint8);    break;
+            case GGUF_TYPE_INT8:    gguf_set_val_i8  (ctx, key, kv->value.int8);     break;
+            case GGUF_TYPE_UINT16:  gguf_set_val_u16 (ctx, key, kv->value.uint16);   break;
+            case GGUF_TYPE_INT16:   gguf_set_val_i16 (ctx, key, kv->value.int16);    break;
+            case GGUF_TYPE_UINT32:  gguf_set_val_u32 (ctx, key, kv->value.uint32);   break;
+            case GGUF_TYPE_INT32:   gguf_set_val_i32 (ctx, key, kv->value.int32);    break;
+            case GGUF_TYPE_FLOAT32: gguf_set_val_f32 (ctx, key, kv->value.float32);  break;
+            case GGUF_TYPE_BOOL:    gguf_set_val_bool(ctx, key, kv->value.bool_);    break;
+            case GGUF_TYPE_STRING:  gguf_set_val_str (ctx, key, kv->value.str.data); break;
+            case GGUF_TYPE_ARRAY:
+                {
+                    if (kv->value.arr.type == GGUF_TYPE_STRING) {
+                        // gguf_set_arr_str expects plain C string pointers, so collect them first
+                        const char ** data = malloc(kv->value.arr.n*sizeof(char *));
+                        // malloc(0) may legitimately return NULL for an empty array
+                        GGML_ASSERT((data != NULL || kv->value.arr.n == 0) && "failed to allocate string pointer array");
+
+                        for (uint32_t j = 0; j < kv->value.arr.n; j++) {
+                            data[j] = ((struct gguf_str *)kv->value.arr.data)[j].data;
+                        }
+                        gguf_set_arr_str(ctx, key, data, kv->value.arr.n);
+                        free(data);
+                    } else if (kv->value.arr.type == GGUF_TYPE_ARRAY) {
+                        GGML_ASSERT(false && "nested arrays not supported");
+                    } else {
+                        gguf_set_arr_data(ctx, key, kv->value.arr.type, kv->value.arr.data, kv->value.arr.n);
+                    }
+                } break;
+            case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
+        }
+    }
+}
+
+// Append a tensor info entry describing `tensor` to the context.
+//
+// The name is copied (strdup); the data pointer is stored as-is, not copied.
+// Extents beyond tensor->n_dims are set to 1. The entry's offset is 0 for the
+// first tensor, otherwise the previous entry's offset plus its size padded to
+// ctx->alignment.
+//
+// Aborts through GGML_ASSERT when an allocation fails.
+void gguf_add_tensor(
+        struct gguf_context * ctx,
+        const struct ggml_tensor * tensor) {
+    const int idx = ctx->header.n_tensors;
+
+    // grow through a temporary: a failed realloc returns NULL and must not be
+    // dereferenced, nor overwrite the only pointer to the existing array
+    struct gguf_tensor_info * infos = realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info));
+    GGML_ASSERT(infos != NULL && "failed to grow tensor info array");
+    ctx->infos = infos;
+
+    struct gguf_tensor_info * info = &ctx->infos[idx];
+
+    info->name.n    = strlen(tensor->name) + 1;
+    info->name.data = strdup(tensor->name);
+    GGML_ASSERT(info->name.data != NULL && "failed to copy tensor name");
+
+    for (int i = 0; i < GGML_MAX_DIMS; ++i) {
+        info->ne[i] = 1;
+    }
+
+    info->n_dims = tensor->n_dims;
+    for (int i = 0; i < tensor->n_dims; i++) {
+        info->ne[i] = tensor->ne[i];
+    }
+
+    info->type   = tensor->type;
+    info->offset = 0;
+    info->data   = tensor->data;
+    info->size   = ggml_nbytes(tensor);
+
+    if (idx > 0) {
+        const struct gguf_tensor_info * prev = &ctx->infos[idx - 1];
+        info->offset = prev->offset + GGML_PAD(prev->size, ctx->alignment);
+    }
+
+    ctx->header.n_tensors++;
+}
+
+// Overwrite the recorded type of the tensor info entry named `name`.
+// Triggers GGML_ASSERT when no entry has that name.
+void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type) {
+    const int pos = gguf_find_tensor(ctx, name);
+    if (pos < 0) {
+        GGML_ASSERT(false && "tensor not found");
+    }
+
+    struct gguf_tensor_info * info = &ctx->infos[pos];
+    info->type = type;
+}
+
+// Replace the data pointer and size of the tensor info entry named `name`,
+// then recompute the offsets of all entries that follow it.
+// Triggers GGML_ASSERT when no entry has that name.
+void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size) {
+    const int pos = gguf_find_tensor(ctx, name);
+    if (pos < 0) {
+        GGML_ASSERT(false && "tensor not found");
+    }
+
+    struct gguf_tensor_info * target = &ctx->infos[pos];
+    target->data = data;
+    target->size = size;
+
+    // every later tensor starts right after its (padded) predecessor
+    for (uint32_t i = pos + 1; i < ctx->header.n_tensors; ++i) {
+        const struct gguf_tensor_info * prev = &ctx->infos[i - 1];
+        ctx->infos[i].offset = prev->offset + GGML_PAD(prev->size, ctx->alignment);
+    }
+}
+
+//static void gguf_fwrite_str(FILE * file, const struct gguf_str * val) {
+// fwrite(&val->n, sizeof(val->n), 1, file);
+// fwrite(val->data, sizeof(char), val->n, file);
+//}
+//
+//static void gguf_fwrite_el(FILE * file, const void * val, size_t size) {
+// fwrite(val, sizeof(char), size, file);
+//}
+
+struct gguf_buf { // growable byte buffer used by the gguf_bwrite_* helpers
+ void * data; // storage; NULL when created with size 0, in which case writers only advance `offset`
+ size_t size; // capacity in bytes
+ size_t offset; // write position = number of bytes written (or counted) so far
+};
+
+// Create a buffer with `size` bytes of capacity and the write position at 0.
+// With size == 0 no memory is allocated and `data` stays NULL.
+static struct gguf_buf gguf_buf_init(size_t size) {
+    struct gguf_buf buf;
+
+    buf.data   = NULL;
+    buf.size   = size;
+    buf.offset = 0;
+
+    if (size != 0) {
+        buf.data = malloc(size);
+    }
+
+    return buf;
+}
+
+// Release the buffer's storage. A buffer without storage (data == NULL) is fine:
+// free(NULL) is a no-op.
+static void gguf_buf_free(struct gguf_buf buf) {
+    free(buf.data);
+}
+
+// Make sure `size` more bytes fit behind the current write position.
+//
+// When they do not fit, the capacity becomes 1.5x the required total. The
+// storage is only reallocated when the buffer has storage at all; a buffer
+// with data == NULL just tracks the capacity.
+//
+// Aborts through GGML_ASSERT when the reallocation fails.
+static void gguf_buf_grow(struct gguf_buf * buf, size_t size) {
+    const size_t needed = buf->offset + size;
+    if (needed <= buf->size) {
+        return;
+    }
+
+    // 1.5x growth in integer arithmetic: no round trip through double
+    const size_t new_size = needed + needed/2;
+
+    if (buf->data) {
+        // grow through a temporary so a failed realloc is caught instead of
+        // silently replacing the storage pointer with NULL
+        void * data = realloc(buf->data, new_size);
+        GGML_ASSERT(data != NULL && "failed to grow buffer");
+        buf->data = data;
+    }
+
+    buf->size = new_size;
+}
+
+// Append a string to the buffer: first the length field `n`, then `n` bytes of
+// string data. When the buffer has no storage only the write position advances.
+static void gguf_bwrite_str(struct gguf_buf * buf, const struct gguf_str * val) {
+    const size_t len_size = sizeof(val->n);
+
+    gguf_buf_grow(buf, len_size + val->n);
+
+    if (buf->data) {
+        char * dst = (char *) buf->data + buf->offset;
+
+        memcpy(dst,            &val->n,   len_size);
+        memcpy(dst + len_size, val->data, val->n);
+    }
+
+    buf->offset += len_size + val->n;
+}
+
+// Append `el_size` raw bytes from `val` to the buffer.
+// When the buffer has no storage only the write position advances.
+static void gguf_bwrite_el(struct gguf_buf * buf, const void * val, size_t el_size) {
+    gguf_buf_grow(buf, el_size);
+
+    char * storage = buf->data;
+    if (storage != NULL) {
+        memcpy(storage + buf->offset, val, el_size);
+    }
+
+    buf->offset += el_size;
+}
+
+static void gguf_write_to_buf(struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) { // serialize ctx into buf: header, KV pairs, tensor infos, padding and - unless only_meta - the tensor data
+ // write header: magic, version, tensor count, KV count (in this order)
+ gguf_bwrite_el(buf, &ctx->header.magic, sizeof(ctx->header.magic));
+ gguf_bwrite_el(buf, &ctx->header.version, sizeof(ctx->header.version));
+ gguf_bwrite_el(buf, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors));
+ gguf_bwrite_el(buf, &ctx->header.n_kv, sizeof(ctx->header.n_kv));
+
+ // write key-value pairs: key string, type tag, then the type-specific value
+ for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
+ struct gguf_kv * kv = &ctx->kv[i];
+
+ gguf_bwrite_str(buf, &kv->key);
+ gguf_bwrite_el (buf, &kv->type, sizeof(kv->type));
+
+ switch (kv->type) {
+ case GGUF_TYPE_UINT8: gguf_bwrite_el( buf, &kv->value.uint8, sizeof(kv->value.uint8) ); break;
+ case GGUF_TYPE_INT8: gguf_bwrite_el (buf, &kv->value.int8, sizeof(kv->value.int8) ); break;
+ case GGUF_TYPE_UINT16: gguf_bwrite_el (buf, &kv->value.uint16, sizeof(kv->value.uint16) ); break;
+ case GGUF_TYPE_INT16: gguf_bwrite_el (buf, &kv->value.int16, sizeof(kv->value.int16) ); break;
+ case GGUF_TYPE_UINT32: gguf_bwrite_el (buf, &kv->value.uint32, sizeof(kv->value.uint32) ); break;
+ case GGUF_TYPE_INT32: gguf_bwrite_el (buf, &kv->value.int32, sizeof(kv->value.int32) ); break;
+ case GGUF_TYPE_FLOAT32: gguf_bwrite_el (buf, &kv->value.float32, sizeof(kv->value.float32)); break;
+ case GGUF_TYPE_BOOL: gguf_bwrite_el (buf, &kv->value.bool_, sizeof(kv->value.bool_) ); break;
+ case GGUF_TYPE_STRING: gguf_bwrite_str(buf, &kv->value.str ); break;
+ case GGUF_TYPE_ARRAY:
+ {
+ gguf_bwrite_el(buf, &kv->value.arr.type, sizeof(kv->value.arr.type)); // array header: element type ...
+ gguf_bwrite_el(buf, &kv->value.arr.n, sizeof(kv->value.arr.n) ); // ... followed by the element count
+
+ switch (kv->value.arr.type) {
+ case GGUF_TYPE_UINT8:
+ case GGUF_TYPE_INT8:
+ case GGUF_TYPE_UINT16:
+ case GGUF_TYPE_INT16:
+ case GGUF_TYPE_UINT32:
+ case GGUF_TYPE_INT32:
+ case GGUF_TYPE_FLOAT32:
+ case GGUF_TYPE_BOOL:
+ {
+ gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]); // fixed-size elements are copied as one contiguous run
+ } break;
+ case GGUF_TYPE_STRING:
+ {
+ for (uint32_t j = 0; j < kv->value.arr.n; ++j) { // strings are written one by one, each with its own length field
+ gguf_bwrite_str(buf, &((struct gguf_str *) kv->value.arr.data)[j]);
+ }
+ } break;
+ case GGUF_TYPE_ARRAY: // nested arrays are rejected together with the COUNT sentinel below
+ case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
+ };
+ } break;
+ case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type");
+ };
+ }
+
+ // write tensor infos: name, n_dims, the first n_dims extents, type, data offset
+ for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
+ struct gguf_tensor_info * info = &ctx->infos[i];
+
+ gguf_bwrite_str(buf, &info->name);
+ gguf_bwrite_el (buf, &info->n_dims, sizeof(info->n_dims));
+ for (uint32_t j = 0; j < info->n_dims; ++j) {
+ gguf_bwrite_el(buf, &info->ne[j], sizeof(info->ne[j]));
+ }
+ gguf_bwrite_el(buf, &info->type, sizeof(info->type));
+ gguf_bwrite_el(buf, &info->offset, sizeof(info->offset));
+ }
+
+ // we require the data section to be aligned, so pad the meta data with zero bytes up to the next multiple of ctx->alignment
+ {
+ const size_t offset = buf->offset;
+ const size_t offset_pad = GGML_PAD(offset, ctx->alignment);
+
+ if (offset_pad != offset) {
+ uint8_t pad = 0;
+ for (size_t i = 0; i < offset_pad - offset; ++i) {
+ gguf_bwrite_el(buf, &pad, sizeof(pad));
+ }
+ }
+ }
+
+ if (only_meta) { // caller only wants header + meta data (incl. padding): stop before the tensor data
+ return;
+ }
+
+ size_t offset = 0; // running offset inside the data section, used to cross-check info->offset
+
+ // write tensor data, zero-padding each tensor to a multiple of ctx->alignment
+ for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
+ struct gguf_tensor_info * info = &ctx->infos[i];
+
+ const size_t size = info->size;
+ const size_t size_pad = GGML_PAD(size, ctx->alignment);
+
+ gguf_bwrite_el(buf, info->data, size);
+
+ if (size_pad != size) {
+ uint8_t pad = 0;
+ for (size_t j = 0; j < size_pad - size; ++j) {
+ gguf_bwrite_el(buf, &pad, sizeof(pad));
+ }
+ }
+
+ GGML_ASSERT(offset == info->offset); // the recorded offset must match where the data was actually written
+
+ offset += size_pad;
+ }
+}
+
+// Serialize the context into a memory buffer and write that buffer to `fname`.
+//
+// ctx       - context to serialize
+// fname     - path of the output file, opened in binary mode and truncated
+// only_meta - forwarded to gguf_write_to_buf; when true the tensor data is left out
+//
+// Aborts through GGML_ASSERT when the file cannot be opened, when fewer bytes
+// than expected are written, or when closing (and thereby flushing) fails -
+// otherwise a truncated file would go unnoticed.
+void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only_meta) {
+    FILE * file = fopen(fname, "wb");
+    if (!file) {
+        GGML_ASSERT(false && "failed to open file for writing");
+    }
+
+    struct gguf_buf buf = gguf_buf_init(16*1024);
+
+    gguf_write_to_buf(ctx, &buf, only_meta);
+
+    const size_t n_written = fwrite(buf.data, 1, buf.offset, file);
+    const bool   write_ok  = n_written == buf.offset;
+
+    // release resources before reporting, so a failure does not also leak them
+    gguf_buf_free(buf);
+
+    const bool close_ok = fclose(file) == 0;
+
+    GGML_ASSERT(write_ok && "failed to write data to file");
+    GGML_ASSERT(close_ok && "failed to close file");
+}
+
+// Number of bytes the meta data (everything gguf_write_to_buf emits with
+// only_meta == true) occupies. Uses a buffer without storage, so the writer
+// only counts bytes and nothing is allocated.
+size_t gguf_get_meta_size(struct gguf_context * ctx) {
+    struct gguf_buf counter = gguf_buf_init(0);
+
+    gguf_write_to_buf(ctx, &counter, true);
+
+    const size_t meta_size = counter.offset;
+    return meta_size;
+}
+
+// Serialize the meta data into a temporary buffer and copy it to `data`.
+// The destination size is not passed in, so `data` has to be large enough for
+// everything that gets written (see gguf_get_meta_size).
+void gguf_get_meta_data(struct gguf_context * ctx, void * data) {
+    struct gguf_buf scratch = gguf_buf_init(16*1024);
+
+    gguf_write_to_buf(ctx, &scratch, true);
+
+    const size_t n_bytes = scratch.offset;
+    memcpy(data, scratch.data, n_bytes);
+
+    gguf_buf_free(scratch);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
int ggml_cpu_has_avx(void) {
#if defined(__AVX__)
return 1;