-rwxr-xr-x  convert.py                                        37
-rw-r--r--  examples/llava/README.md                          12
-rw-r--r--  examples/llava/clip.cpp                          766
-rw-r--r--  examples/llava/clip.h                             47
-rw-r--r--  examples/llava/convert-image-encoder-to-gguf.py   66
-rw-r--r--  examples/llava/llava-cli.cpp                      26
-rw-r--r--  examples/llava/llava-surgery-v2.py               167
-rw-r--r--  examples/llava/llava.cpp                         296
-rw-r--r--  examples/llava/llava.h                             2
-rw-r--r--  examples/server/server.cpp                        15
10 files changed, 1229 insertions, 205 deletions
diff --git a/convert.py b/convert.py
index 323e8058..63a0a5d7 100755
--- a/convert.py
+++ b/convert.py
@@ -1173,7 +1173,7 @@ def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyM
for (name, tensor) in model.items()}
-def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
+def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) -> LazyModel:
tmap = gguf.TensorNameMap(ARCH, params.n_layer)
should_skip: set[gguf.MODEL_TENSOR] = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, []))
@@ -1199,7 +1199,11 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
for name, lazy_tensor in model.items():
tensor_type, name_new = tmap.get_type_and_name(name, try_suffixes = (".weight", ".bias")) or (None, None)
if name_new is None:
- raise Exception(f"Unexpected tensor name: {name}")
+ if skip_unknown:
+ print(f"Unexpected tensor name: {name} - skipping")
+ continue
+ else:
+ raise Exception(f"Unexpected tensor name: {name}. Use --skip-unknown to ignore it (e.g. LLaVA)")
if tensor_type in should_skip:
print(f"skipping tensor {name_new}")
@@ -1377,19 +1381,20 @@ def main(args_in: list[str] | None = None) -> None:
output_choices.append("q8_0")
vocab_types = ["spm", "bpe", "hfft"]
parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file")
- parser.add_argument("--awq-path", type=Path, help="Path to scale awq cache file", default=None)
- parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model")
- parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file")
- parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
- parser.add_argument("--outtype", choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
- parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
- parser.add_argument("--vocab-type", choices=vocab_types, help="The vocabulary format used to define the tokenizer model (default: spm)", default="spm")
- parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
- parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
- parser.add_argument("--ctx", type=int, help="model training context (default: based on input)")
- parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default=DEFAULT_CONCURRENCY)
- parser.add_argument("--big-endian", action="store_true", help="model is executed on big endian machine")
- parser.add_argument("--pad-vocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides")
+ parser.add_argument("--awq-path", type=Path, help="Path to scale awq cache file", default=None)
+ parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model")
+ parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file")
+ parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
+ parser.add_argument("--outtype", choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
+ parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
+ parser.add_argument("--vocab-type", choices=vocab_types, help="The vocabulary format used to define the tokenizer model (default: spm)", default="spm")
+ parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
+ parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
+ parser.add_argument("--ctx", type=int, help="model training context (default: based on input)")
+ parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default=DEFAULT_CONCURRENCY)
+ parser.add_argument("--big-endian", action="store_true", help="model is executed on big endian machine")
+ parser.add_argument("--pad-vocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides")
+ parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing")
args = parser.parse_args(args_in)
if args.awq_path:
@@ -1461,7 +1466,7 @@ def main(args_in: list[str] | None = None) -> None:
print(f"Special vocab info: {special_vocab}")
model = model_plus.model
- model = convert_model_names(model, params)
+ model = convert_model_names(model, params, args.skip_unknown)
ftype = pick_output_type(model, args.outtype)
model = convert_to_output_type(model, ftype)
outfile = args.outfile or default_outfile(model_plus.paths, ftype)
diff --git a/examples/llava/README.md b/examples/llava/README.md
index 19f1a50a..e2ef0eff 100644
--- a/examples/llava/README.md
+++ b/examples/llava/README.md
@@ -19,9 +19,9 @@ After building, run: `./llava-cli` to see the usage. For example:
**note**: A lower temperature like 0.1 is recommended for better quality. Add `--temp 0.1` to the command to do so.
-## Model conversion
+## LLaVA 1.5
-- Clone `llava-v15-7b` and `clip-vit-large-patch14-336` locally:
+- Clone a LLaVA and a CLIP model ([available options](https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md)). For example:
```sh
git clone https://huggingface.co/liuhaotian/llava-v1.5-7b
@@ -55,8 +55,14 @@ python ./convert.py ../llava-v1.5-7b
Now both the LLaMA part and the image encoder are in the `llava-v1.5-7b` directory.
+## LLaVA 1.6
+
+- Use `llava-surgery-v2.py`
+
+- TODO: add detailed instructions
+
## TODO
-- [ ] Support non-CPU backend for the image encoding part.
+- [x] Support non-CPU backend for the image encoding part.
- [ ] Support different sampling methods.
- [ ] Support more model variants.
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index ccd0d85a..9c5091e6 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -1,7 +1,7 @@
// NOTE: This is modified from clip.cpp only for LLaVA,
// so there might be still unnecessary artifacts hanging around
// I'll gradually clean and extend it
-
+// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
#include "clip.h"
#include "ggml.h"
#include "ggml-alloc.h"
@@ -30,6 +30,26 @@
#include <vector>
#include <sstream>
#include <cinttypes>
+#include <limits>
+
+//#define CLIP_DEBUG_FUNCTIONS
+
+// RGB uint8 image
+struct clip_image_u8 {
+ int nx;
+ int ny;
+
+ std::vector<uint8_t> buf;
+};
+
+// RGB float32 image (NHWC)
+// Memory layout: RGBRGBRGB...
+struct clip_image_f32 {
+ int nx;
+ int ny;
+
+ std::vector<float> buf;
+};
static std::string format(const char * fmt, ...) {
va_list ap;
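A minimal sketch of the interleaved RGBRGB... indexing the two image structs above use: channel `c` of pixel `(x, y)` lives at `buf[3 * (y * nx + x) + c]`. The `demo_image_u8` stand-in is illustrative only and not part of the patch:

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

struct demo_image_u8 { int nx; int ny; std::vector<uint8_t> buf; };

static uint8_t pixel_at(const demo_image_u8 & img, int x, int y, int c) {
    // channel c of pixel (x, y): 3 bytes per pixel, rows stored top to bottom
    return img.buf[3 * (y * img.nx + x) + c];
}

int main() {
    //                nx ny   R   G  B    R    G  B   R  G    B    R    G    B
    demo_image_u8 img{2, 2, {255, 0, 0,   0, 255, 0,  0, 0, 255, 255, 255, 255}};
    printf("G channel of pixel (1, 0) = %d\n", pixel_at(img, 1, 0, 1)); // 255
    return 0;
}
```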
@@ -50,50 +70,56 @@ static std::string format(const char * fmt, ...) {
// key constants
//
-#define KEY_FTYPE "general.file_type"
-#define KEY_NAME "general.name"
-#define KEY_DESCRIPTION "general.description"
-#define KEY_HAS_TEXT_ENC "clip.has_text_encoder"
-#define KEY_HAS_VIS_ENC "clip.has_vision_encoder"
+#define KEY_FTYPE "general.file_type"
+#define KEY_NAME "general.name"
+#define KEY_DESCRIPTION "general.description"
+#define KEY_HAS_TEXT_ENC "clip.has_text_encoder"
+#define KEY_HAS_VIS_ENC "clip.has_vision_encoder"
#define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector"
-#define KEY_USE_GELU "clip.use_gelu"
-#define KEY_N_EMBD "clip.%s.embedding_length"
-#define KEY_N_FF "clip.%s.feed_forward_length"
-#define KEY_N_BLOCK "clip.%s.block_count"
-#define KEY_N_HEAD "clip.%s.attention.head_count"
+#define KEY_USE_GELU "clip.use_gelu"
+#define KEY_N_EMBD "clip.%s.embedding_length"
+#define KEY_N_FF "clip.%s.feed_forward_length"
+#define KEY_N_BLOCK "clip.%s.block_count"
+#define KEY_N_HEAD "clip.%s.attention.head_count"
#define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon"
-#define KEY_PROJ_DIM "clip.%s.projection_dim"
-#define KEY_TOKENS "tokenizer.ggml.tokens"
-#define KEY_N_POSITIONS "clip.text.context_length"
-#define KEY_IMAGE_SIZE "clip.vision.image_size"
-#define KEY_PATCH_SIZE "clip.vision.patch_size"
-#define KEY_IMAGE_MEAN "clip.vision.image_mean"
-#define KEY_IMAGE_STD "clip.vision.image_std"
-#define KEY_PROJ_TYPE "clip.projector_type"
+#define KEY_PROJ_DIM "clip.%s.projection_dim"
+#define KEY_TOKENS "tokenizer.ggml.tokens"
+#define KEY_N_POSITIONS "clip.text.context_length"
+#define KEY_IMAGE_SIZE "clip.vision.image_size"
+#define KEY_PATCH_SIZE "clip.vision.patch_size"
+#define KEY_IMAGE_MEAN "clip.vision.image_mean"
+#define KEY_IMAGE_STD "clip.vision.image_std"
+#define KEY_PROJ_TYPE "clip.projector_type"
+
+#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
+#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
+#define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution"
+
//
// tensor name constants
//
-#define TN_TOKEN_EMBD "%s.token_embd.weight"
-#define TN_POS_EMBD "%s.position_embd.weight"
-#define TN_CLASS_EMBD "v.class_embd"
-#define TN_PATCH_EMBD "v.patch_embd.weight"
-#define TN_ATTN_K "%s.blk.%d.attn_k.%s"
-#define TN_ATTN_Q "%s.blk.%d.attn_q.%s"
-#define TN_ATTN_V "%s.blk.%d.attn_v.%s"
-#define TN_ATTN_OUTPUT "%s.blk.%d.attn_out.%s"
-#define TN_FFN_DOWN "%s.blk.%d.ffn_down.%s"
-#define TN_FFN_UP "%s.blk.%d.ffn_up.%s"
-#define TN_LN_1 "%s.blk.%d.ln1.%s"
-#define TN_LN_2 "%s.blk.%d.ln2.%s"
-#define TN_LN_PRE "%s.pre_ln.%s"
-#define TN_LN_POST "%s.post_ln.%s"
-#define TN_TEXT_PROJ "text_projection.weight"
-#define TN_VIS_PROJ "visual_projection.weight"
-#define TN_LLAVA_PROJ "mm.%d.%s"
-#define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s"
+#define TN_TOKEN_EMBD "%s.token_embd.weight"
+#define TN_POS_EMBD "%s.position_embd.weight"
+#define TN_CLASS_EMBD "v.class_embd"
+#define TN_PATCH_EMBD "v.patch_embd.weight"
+#define TN_ATTN_K "%s.blk.%d.attn_k.%s"
+#define TN_ATTN_Q "%s.blk.%d.attn_q.%s"
+#define TN_ATTN_V "%s.blk.%d.attn_v.%s"
+#define TN_ATTN_OUTPUT "%s.blk.%d.attn_out.%s"
+#define TN_FFN_DOWN "%s.blk.%d.ffn_down.%s"
+#define TN_FFN_UP "%s.blk.%d.ffn_up.%s"
+#define TN_LN_1 "%s.blk.%d.ln1.%s"
+#define TN_LN_2 "%s.blk.%d.ln2.%s"
+#define TN_LN_PRE "%s.pre_ln.%s"
+#define TN_LN_POST "%s.post_ln.%s"
+#define TN_TEXT_PROJ "text_projection.weight"
+#define TN_VIS_PROJ "visual_projection.weight"
+#define TN_LLAVA_PROJ "mm.%d.%s"
+#define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s"
#define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
+#define TN_IMAGE_NEWLINE "model.image_newline"
enum projector_type {
@@ -104,8 +130,8 @@ enum projector_type {
};
static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
- { PROJECTOR_TYPE_MLP, "mlp" },
- { PROJECTOR_TYPE_LDP, "ldp" },
+ { PROJECTOR_TYPE_MLP, "mlp" },
+ { PROJECTOR_TYPE_LDP, "ldp" },
};
@@ -165,7 +191,6 @@ static std::string gguf_data_to_str(enum gguf_type type, const void * data, int
}
}
-
static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
std::string result;
for (size_t pos = 0; ; pos += search.length()) {
@@ -217,7 +242,7 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
}
}
-static void print_tensor_info(const ggml_tensor* tensor, const char* prefix = "") {
+static void print_tensor_info(const ggml_tensor * tensor, const char * prefix = "") {
size_t tensor_size = ggml_nbytes(tensor);
printf("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n",
prefix, ggml_n_dims(tensor), tensor->name, tensor_size,
@@ -233,31 +258,136 @@ static projector_type clip_projector_type_from_string(const std::string & name)
return PROJECTOR_TYPE_UNKNOWN;
}
-//
-// image data
-//
+#ifdef CLIP_DEBUG_FUNCTIONS
+static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) {
+ std::ofstream file(filename, std::ios::binary);
+ if (!file.is_open()) {
+ std::cerr << "Failed to open file for writing: " << filename << std::endl;
+ return;
+ }
-// RGB uint8 image
-struct clip_image_u8 {
- int nx;
- int ny;
+ // PPM header: P6 format, width, height, and max color value
+ file << "P6\n" << img.nx << " " << img.ny << "\n255\n";
- std::vector<uint8_t> buf;
-};
+ // Write pixel data
+ for (size_t i = 0; i < img.buf.size(); i += 3) {
+ // PPM expects binary data in RGB format, which matches our image buffer
+ file.write(reinterpret_cast<const char*>(&img.buf[i]), 3);
+ }
-// RGB float32 image (NHWC)
-// Memory layout: RGBRGBRGB...
-struct clip_image_f32 {
- int nx;
- int ny;
+ file.close();
+}
+
+static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) {
+ std::ofstream file(filename, std::ios::binary);
+ if (!file.is_open()) {
+ std::cerr << "Failed to open file for writing: " << filename << std::endl;
+ return;
+ }
+
+ int fileSize = 54 + 3 * img.nx * img.ny; // File header + info header + pixel data
+ int bytesPerPixel = 3;
+ int widthInBytes = img.nx * bytesPerPixel;
+ int paddingAmount = (4 - (widthInBytes % 4)) % 4;
+ int stride = widthInBytes + paddingAmount;
+
+ // Bitmap file header
+ unsigned char fileHeader[14] = {
+ 'B','M', // Signature
+ 0,0,0,0, // Image file size in bytes
+ 0,0,0,0, // Reserved
+ 54,0,0,0 // Start of pixel array
+ };
+
+ // Total file size
+ fileSize = 54 + (stride * img.ny);
+ fileHeader[2] = (unsigned char)(fileSize);
+ fileHeader[3] = (unsigned char)(fileSize >> 8);
+ fileHeader[4] = (unsigned char)(fileSize >> 16);
+ fileHeader[5] = (unsigned char)(fileSize >> 24);
+
+ // Bitmap information header (BITMAPINFOHEADER)
+ unsigned char infoHeader[40] = {
+ 40,0,0,0, // Size of this header (40 bytes)
+ 0,0,0,0, // Image width
+ 0,0,0,0, // Image height
+ 1,0, // Number of color planes
+ 24,0, // Bits per pixel
+ 0,0,0,0, // No compression
+ 0,0,0,0, // Image size (can be 0 for no compression)
+ 0,0,0,0, // X pixels per meter (not specified)
+ 0,0,0,0, // Y pixels per meter (not specified)
+ 0,0,0,0, // Total colors (color table not used)
+ 0,0,0,0 // Important colors (all are important)
+ };
+
+ // Width and height in the information header
+ infoHeader[4] = (unsigned char)(img.nx);
+ infoHeader[5] = (unsigned char)(img.nx >> 8);
+ infoHeader[6] = (unsigned char)(img.nx >> 16);
+ infoHeader[7] = (unsigned char)(img.nx >> 24);
+ infoHeader[8] = (unsigned char)(img.ny);
+ infoHeader[9] = (unsigned char)(img.ny >> 8);
+ infoHeader[10] = (unsigned char)(img.ny >> 16);
+ infoHeader[11] = (unsigned char)(img.ny >> 24);
+
+ // Write file headers
+ file.write(reinterpret_cast<char*>(fileHeader), sizeof(fileHeader));
+ file.write(reinterpret_cast<char*>(infoHeader), sizeof(infoHeader));
+
+ // Pixel data
+ std::vector<unsigned char> padding(3, 0); // Max padding size to be added to each row
+ for (int y = img.ny - 1; y >= 0; --y) { // BMP files are stored bottom-to-top
+ for (int x = 0; x < img.nx; ++x) {
+ // Each pixel
+ size_t pixelIndex = (y * img.nx + x) * 3;
+ unsigned char pixel[3] = {
+ img.buf[pixelIndex + 2], // BMP stores pixels in BGR format
+ img.buf[pixelIndex + 1],
+ img.buf[pixelIndex]
+ };
+ file.write(reinterpret_cast<char*>(pixel), 3);
+ }
+ // Write padding for the row
+ file.write(reinterpret_cast<char*>(padding.data()), paddingAmount);
+ }
+
+ file.close();
+}
+
+// debug function to convert f32 to u8
+static void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst) {
+ dst.nx = src.nx;
+ dst.ny = src.ny;
+ dst.buf.resize(3 * src.nx * src.ny);
+ for (size_t i = 0; i < src.buf.size(); ++i) {
+ dst.buf[i] = static_cast<uint8_t>(std::min(std::max(int(src.buf[i] * 255.0f), 0), 255));
+ }
+}
+#endif
- std::vector<float> buf;
-};
//
// clip layers
//
+struct clip_hparams {
+ int32_t image_size;
+ int32_t patch_size;
+ int32_t hidden_size;
+ int32_t n_intermediate;
+ int32_t projection_dim;
+ int32_t n_head;
+ int32_t n_layer;
+
+ float eps;
+
+ char mm_patch_merge_type[32] = "flat"; // spatial_unpad or flat (default)
+
+ int32_t image_grid_pinpoints[32];
+ int32_t image_crop_resolution;
+};
+
struct clip_layer {
// attention
struct ggml_tensor * k_w;
@@ -287,7 +417,7 @@ struct clip_layer {
};
struct clip_vision_model {
- struct clip_vision_hparams hparams;
+ struct clip_hparams hparams;
// embeddings
struct ggml_tensor * class_embedding;
@@ -310,6 +440,8 @@ struct clip_vision_model {
struct ggml_tensor * mm_2_w = NULL;
struct ggml_tensor * mm_2_b = NULL;
+ struct ggml_tensor * image_newline = NULL;
+
// Yi type models with mlp+normalization projection
struct ggml_tensor * mm_1_w = NULL; // Yi type models have 0, 1, 3, 4
struct ggml_tensor * mm_1_b = NULL;
@@ -364,9 +496,10 @@ struct clip_ctx {
std::vector<uint8_t> buf_compute_meta;
// memory buffers to evaluate the model
- ggml_backend_buffer_t params_buffer = NULL;
+ ggml_backend_buffer_t params_buffer = NULL;
ggml_backend_buffer_t compute_buffer = NULL;
- ggml_backend_t backend = NULL;
+
+ ggml_backend_t backend = NULL;
ggml_gallocr_t compute_alloc = NULL;
};
@@ -379,18 +512,19 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
const auto & model = ctx->vision_model;
const auto & hparams = model.hparams;
- const int image_size = hparams.image_size;
- const int patch_size = hparams.patch_size;
- const int num_patches = ((image_size / patch_size) * (image_size / patch_size));
- const int num_positions = num_patches + 1;
- const int hidden_size = hparams.hidden_size;
- const int n_head = hparams.n_head;
- const int d_head = hidden_size / n_head;
- const int n_layer = hparams.n_layer;
- //const int n_intermediate = hparams.n_intermediate;
- //const int projection_dim = hparams.projection_dim;
- const float eps = hparams.eps;
- int batch_size = imgs->size;
+ const int image_size = hparams.image_size;
+ const int patch_size = hparams.patch_size;
+ const int num_patches = ((image_size / patch_size) * (image_size / patch_size));
+ const int num_patches_per_side = image_size / patch_size; GGML_UNUSED(num_patches_per_side);
+ const int num_positions = num_patches + 1;
+ const int hidden_size = hparams.hidden_size;
+ const int n_head = hparams.n_head;
+ const int d_head = hidden_size / n_head;
+ const int n_layer = hparams.n_layer;
+ const float eps = hparams.eps;
+
+ const int batch_size = imgs->size;
+
if (ctx->has_llava_projector) {
GGML_ASSERT(batch_size == 1);
}
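For reference, with the CLIP ViT-L/14-336 encoder that LLaVA ships with (image_size 336, patch_size 14, hidden_size 1024, n_head 16), the derived constants above work out as in this small sketch; the values are the commonly used ViT-L/14-336 configuration, not read from a model file:

```cpp
// Sketch of the derived sizes in clip_image_build_graph(), using illustrative
// ViT-L/14-336 hyperparameters.
#include <cstdio>

int main() {
    const int image_size  = 336;
    const int patch_size  = 14;
    const int hidden_size = 1024;
    const int n_head      = 16;

    const int num_patches_per_side = image_size / patch_size;            // 24
    const int num_patches          = num_patches_per_side
                                   * num_patches_per_side;               // 576
    const int num_positions        = num_patches + 1;                    // +1 class token -> 577
    const int d_head               = hidden_size / n_head;               // 64

    printf("patches=%d positions=%d d_head=%d\n", num_patches, num_positions, d_head);
    printf("patches after LDP projector: %d\n", num_patches / 4);        // 144, cf. clip_n_patches()
    return 0;
}
```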
@@ -540,7 +674,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
embeddings = ggml_gelu(ctx0, embeddings);
-
embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
@@ -791,10 +924,10 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
if (idx != -1) {
const std::string proj_type = gguf_get_val_str(ctx, idx);
new_clip->proj_type = clip_projector_type_from_string(proj_type);
- }
- else {
+ } else {
new_clip->proj_type = PROJECTOR_TYPE_MLP;
}
+
if (new_clip->proj_type == PROJECTOR_TYPE_MLP) {
if (gguf_find_tensor(ctx, format(TN_LLAVA_PROJ, 3, "weight").c_str()) != -1) {
new_clip->proj_type = PROJECTOR_TYPE_MLP_NORM;
@@ -920,11 +1053,41 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
hparams.projection_dim = get_u32(ctx, format(KEY_PROJ_DIM, "vision"));
hparams.eps = get_f32(ctx, format(KEY_LAYER_NORM_EPS, "vision"));
+ try {
+ int idx = get_key_idx(ctx, KEY_IMAGE_GRID_PINPOINTS);
+ int n = gguf_get_arr_n(ctx, idx);
+ const int32_t * pinpoints = (const int32_t *)gguf_get_arr_data(ctx, idx);
+ for (int i = 0; i < 32 && i < n && pinpoints[i] != 0; ++i) {
+ hparams.image_grid_pinpoints[i] = pinpoints[i];
+ }
+ if (n < 32)
+ hparams.image_grid_pinpoints[n] = 0;
+ } catch (std::runtime_error & e) {
+ hparams.image_grid_pinpoints[0]=0;
+ }
+
+ try {
+ int idx = get_key_idx(ctx, KEY_MM_PATCH_MERGE_TYPE);
+ strcpy(hparams.mm_patch_merge_type, gguf_get_val_str(ctx, idx));
+ } catch (std::runtime_error & e) {
+ strcpy(hparams.mm_patch_merge_type, "flat");
+ }
+
+ try {
+ hparams.image_crop_resolution = get_u32(ctx, KEY_IMAGE_CROP_RESOLUTION); // llava-1.6
+ } catch(const std::exception& e) {
+ hparams.image_crop_resolution = hparams.image_size;
+ }
+
int idx_mean = get_key_idx(ctx, KEY_IMAGE_MEAN);
int idx_std = get_key_idx(ctx, KEY_IMAGE_STD);
+
+ const float * mean_data = (const float *)gguf_get_arr_data(ctx, idx_mean);
+ const float * std_data = (const float *)gguf_get_arr_data(ctx, idx_std);
+
for (int i = 0; i < 3; ++i) {
- new_clip->image_mean[i] = *((const float *)gguf_get_arr_data(ctx, idx_mean));
- new_clip->image_std[i] = *((const float *)gguf_get_arr_data(ctx, idx_std));
+ new_clip->image_mean[i] = mean_data[i];
+ new_clip->image_std[i] = std_data[i];
}
if (verbosity >= 2) {
@@ -936,13 +1099,27 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
printf("v_projection_dim %d\n", hparams.projection_dim);
printf("v_n_head %d\n", hparams.n_head);
printf("v_n_layer %d\n", hparams.n_layer);
+ printf("v_eps %f\n", hparams.eps);
+ printf("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
+ printf("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
+ printf("v_image_grid_pinpoints: ");
+            for (int i = 0; i < 32 && hparams.image_grid_pinpoints[i] != 0; ++i) {
+ printf("%d ", hparams.image_grid_pinpoints[i]);
+ }
+ printf("\n");
+ printf("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type);
+
}
- vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
- vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
- vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
- vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
- vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
+ try {
+ vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
+ vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
+ vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
+ vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
+ vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
+ } catch(const std::exception& e) {
+ fprintf(stderr, "%s: failed to load vision model tensors\n", __func__);
+ }
// LLaVA projection
if (new_clip->proj_type == PROJECTOR_TYPE_MLP || new_clip->proj_type == PROJECTOR_TYPE_MLP_NORM) {
@@ -968,40 +1145,43 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
vision_model.mm_4_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "weight"));
vision_model.mm_4_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "bias"));
} catch (std::runtime_error & e) { }
- }
- else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
+ try {
+ vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE);
+ // fprintf(stderr, "%s: image_newline tensor (llava-1.6) found\n", __func__);
+ } catch (std::runtime_error & e) { }
+ } else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
// MobileVLM projection
- vision_model.mm_model_mlp_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "weight"));
- vision_model.mm_model_mlp_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "bias"));
- vision_model.mm_model_mlp_3_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "weight"));
- vision_model.mm_model_mlp_3_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "bias"));
- vision_model.mm_model_block_1_block_0_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "0.weight"));
- vision_model.mm_model_block_1_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.weight"));
- vision_model.mm_model_block_1_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.bias"));
+ vision_model.mm_model_mlp_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "weight"));
+ vision_model.mm_model_mlp_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "bias"));
+ vision_model.mm_model_mlp_3_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "weight"));
+ vision_model.mm_model_mlp_3_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "bias"));
+ vision_model.mm_model_block_1_block_0_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "0.weight"));
+ vision_model.mm_model_block_1_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.weight"));
+ vision_model.mm_model_block_1_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.bias"));
vision_model.mm_model_block_1_block_1_fc1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.weight"));
vision_model.mm_model_block_1_block_1_fc1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.bias"));
vision_model.mm_model_block_1_block_1_fc2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.weight"));
vision_model.mm_model_block_1_block_1_fc2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.bias"));
- vision_model.mm_model_block_1_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "0.weight"));
- vision_model.mm_model_block_1_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.weight"));
- vision_model.mm_model_block_1_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.bias"));
- vision_model.mm_model_block_2_block_0_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "0.weight"));
- vision_model.mm_model_block_2_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.weight"));
- vision_model.mm_model_block_2_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.bias"));
+ vision_model.mm_model_block_1_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "0.weight"));
+ vision_model.mm_model_block_1_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.weight"));
+ vision_model.mm_model_block_1_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.bias"));
+ vision_model.mm_model_block_2_block_0_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "0.weight"));
+ vision_model.mm_model_block_2_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.weight"));
+ vision_model.mm_model_block_2_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.bias"));
vision_model.mm_model_block_2_block_1_fc1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.weight"));
vision_model.mm_model_block_2_block_1_fc1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.bias"));
vision_model.mm_model_block_2_block_1_fc2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.weight"));
vision_model.mm_model_block_2_block_1_fc2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.bias"));
- vision_model.mm_model_block_2_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight"));
- vision_model.mm_model_block_2_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight"));
- vision_model.mm_model_block_2_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias"));
- }
- else {
+ vision_model.mm_model_block_2_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight"));
+ vision_model.mm_model_block_2_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight"));
+ vision_model.mm_model_block_2_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias"));
+ } else {
std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type];
throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
}
vision_model.layers.resize(hparams.n_layer);
+
for (int il = 0; il < hparams.n_layer; ++il) {
auto & layer = vision_model.layers[il];
layer.k_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_K, "v", il, "weight"));
@@ -1084,24 +1264,255 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length
return true;
}
-// normalize: x = (x - mean) / std
-// TODO: implement bicubic interpolation instead of linear.
-bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32 * res, const bool pad2square) {
+// Linear interpolation between two points
+inline float lerp(float s, float e, float t) {
+ return s + (e - s) * t;
+}
+// Bilinear resize function
+static void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int target_width, int target_height) {
+ dst.nx = target_width;
+ dst.ny = target_height;
+ dst.buf.resize(3 * target_width * target_height);
+
+ float x_ratio = static_cast<float>(src.nx - 1) / target_width;
+ float y_ratio = static_cast<float>(src.ny - 1) / target_height;
+
+ for (int y = 0; y < target_height; y++) {
+ for (int x = 0; x < target_width; x++) {
+ float px = x_ratio * x;
+ float py = y_ratio * y;
+ int x_floor = static_cast<int>(px);
+ int y_floor = static_cast<int>(py);
+ float x_lerp = px - x_floor;
+ float y_lerp = py - y_floor;
+
+ for (int c = 0; c < 3; c++) {
+ float top = lerp(
+ static_cast<float>(src.buf[3 * (y_floor * src.nx + x_floor) + c]),
+ static_cast<float>(src.buf[3 * (y_floor * src.nx + (x_floor + 1)) + c]),
+ x_lerp
+ );
+ float bottom = lerp(
+ static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + x_floor) + c]),
+ static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + (x_floor + 1)) + c]),
+ x_lerp
+ );
+ dst.buf[3 * (y * target_width + x) + c] = static_cast<uint8_t>(lerp(top, bottom, y_lerp));
+ }
+ }
+ }
+}
+
+// Normalize image to float32 - careful with pytorch .to(model.device, dtype=torch.float16) - this sometimes reduces precision (32>16>32), sometimes not
+static void normalize_image_u8_to_f32(const clip_image_u8* src, clip_image_f32* dst, const float mean[3], const float std[3]) {
+ dst->nx = src->nx;
+ dst->ny = src->ny;
+ dst->buf.resize(src->buf.size());
+
+ for (size_t i = 0; i < src->buf.size(); ++i) {
+ int c = i % 3; // rgb
+ dst->buf[i] = (static_cast<float>(src->buf[i]) / 255.0f - mean[c]) / std[c];
+ }
+}
+
+inline float clip(float x, float lower, float upper) {
+ return std::max(lower, std::min(x, upper));
+}
+
+static bool bicubic_resize(const clip_image_u8 &img, clip_image_u8 &dst, int target_width, int target_height) {
+ const int nx = img.nx;
+ const int ny = img.ny;
+
+ dst.nx = target_width;
+ dst.ny = target_height;
+ dst.buf.resize(3 * target_width * target_height);
+
+ float Cc;
+ float C[5];
+ float d0, d2, d3, a0, a1, a2, a3;
+ int i, j, k, jj;
+ int x, y;
+ float dx, dy;
+ float tx, ty;
+
+ tx = (float)nx / (float)target_width;
+ ty = (float)ny / (float)target_height;
+
+    // Bicubic interpolation; adapted from ViT.cpp, inspired by:
+ // -> https://github.com/yglukhov/bicubic-interpolation-image-processing/blob/master/libimage.c#L36
+ // -> https://en.wikipedia.org/wiki/Bicubic_interpolation
+
+ for (i = 0; i < target_height; i++) {
+ for (j = 0; j < target_width; j++) {
+ x = (int)(tx * j);
+ y = (int)(ty * i);
+
+ dx = tx * j - x;
+ dy = ty * i - y;
+
+ for (k = 0; k < 3; k++) {
+ for (jj = 0; jj <= 3; jj++) {
+ d0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x - 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
+ d2 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
+ d3 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 2, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
+ a0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
+
+ a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3;
+ a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2;
+ a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3;
+
+ C[jj] = a0 + a1 * dx + a2 * dx * dx + a3 * dx * dx * dx;
+
+ d0 = C[0] - C[1];
+ d2 = C[2] - C[1];
+ d3 = C[3] - C[1];
+ a0 = C[1];
+ a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3;
+ a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2;
+ a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3;
+ Cc = a0 + a1 * dy + a2 * dy * dy + a3 * dy * dy * dy;
+
+ const uint8_t Cc2 = std::min(std::max(std::round(Cc), 0.0f), 255.0f);
+ dst.buf[(i * target_width + j) * 3 + k] = float(Cc2);
+ }
+ }
+ }
+ }
+
+ return true;
+}
+
+// llava-1.6 type of resize_and_pad (black)
+static void resize_and_pad_image(const clip_image_u8& image, clip_image_u8 &image_output, const std::pair<int, int>& target_resolution) {
+ int target_width = target_resolution.first;
+ int target_height = target_resolution.second;
+
+ float scale_w = static_cast<float>(target_width) / image.nx;
+ float scale_h = static_cast<float>(target_height) / image.ny;
+
+ int new_width, new_height;
+
+ if (scale_w < scale_h) {
+ new_width = target_width;
+ new_height = std::min(static_cast<int>(std::ceil(image.ny * scale_w)), target_height);
+ } else {
+ new_height = target_height;
+ new_width = std::min(static_cast<int>(std::ceil(image.nx * scale_h)), target_width);
+ }
+
+ clip_image_u8 resized_image;
+ // bilinear_resize(image, resized_image, new_width, new_height);
+ bicubic_resize(image, resized_image, new_width, new_height);
+
+ clip_image_u8 padded_image;
+ padded_image.nx = target_width;
+ padded_image.ny = target_height;
+ padded_image.buf.resize(3 * target_width * target_height, 0); // Initialize with black
+
+ // Calculate padding offsets
+ int pad_x = (target_width - new_width) / 2;
+ int pad_y = (target_height - new_height) / 2;
+
+ // Copy the resized image into the center of the padded buffer
+ for (int y = 0; y < new_height; ++y) {
+ for (int x = 0; x < new_width; ++x) {
+ for (int c = 0; c < 3; ++c) {
+ padded_image.buf[3 * ((y + pad_y) * target_width + (x + pad_x)) + c] = resized_image.buf[3 * (y * new_width + x) + c];
+ }
+ }
+ }
+ image_output = std::move(padded_image);
+}
+
+/**
+ * Selects the best resolution from a list of possible resolutions based on the original size.
+ *
+ * @param original_size The original size of the image in the format (width, height).
+ * @param possible_resolutions A list of possible resolutions in the format [(width1, height1), (width2, height2), ...].
+ * @return The best fit resolution in the format (width, height).
+ */
+static std::pair<int, int> select_best_resolution(const std::pair<int, int> & original_size, const std::vector<std::pair<int, int>> & possible_resolutions) {
+ int original_width = original_size.first;
+ int original_height = original_size.second;
+ std::pair<int, int> best_fit;
+ int max_effective_resolution = 0;
+ int min_wasted_resolution = std::numeric_limits<int>::max();
+
+ for (const auto& resolution : possible_resolutions) {
+ int width = resolution.first;
+ int height = resolution.second;
+ float scale = std::min(static_cast<float>(width) / original_width, static_cast<float>(height) / original_height);
+ int downscaled_width = static_cast<int>(original_width * scale);
+ int downscaled_height = static_cast<int>(original_height * scale);
+ int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
+ int wasted_resolution = (width * height) - effective_resolution;
+ // fprintf(stderr, "resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
+ if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
+ max_effective_resolution = effective_resolution;
+ min_wasted_resolution = wasted_resolution;
+ best_fit = resolution;
+ }
+ }
+
+ return best_fit;
+}
+
+static std::vector<clip_image_u8*> divide_to_patches_u8(const clip_image_u8 & image, int patch_size) {
+ std::vector<clip_image_u8*> patches;
+ int width = image.nx;
+ int height = image.ny;
+ for (int i = 0; i < height; i += patch_size) {
+ for (int j = 0; j < width; j += patch_size) {
+ clip_image_u8 *patch = clip_image_u8_init();
+ patch->nx = std::min(patch_size, width - j);
+ patch->ny = std::min(patch_size, height - i);
+ patch->buf.resize(3 * patch->nx * patch->ny);
+ for (int y = 0; y < patch->ny; ++y) {
+ for (int x = 0; x < patch->nx; ++x) {
+ for (int c = 0; c < 3; ++c) {
+ patch->buf[3 * (y * patch->nx + x) + c] = image.buf[3 * ((i + y) * width + (j + x)) + c];
+ }
+ }
+ }
+ patches.push_back(patch);
+ }
+ }
+ return patches;
+}
+
+// returns the normalized float tensor for llava-1.5; for spatial_unpad with anyres processing (llava-1.6) it returns the normalized image patch tensors as a vector
+// res_imgs memory is allocated here; previous allocations will be freed if found
+bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch & res_imgs) {
+ bool pad_to_square = true;
if (!ctx->has_vision_encoder) {
printf("This gguf file seems to have no vision encoder\n");
return false;
}
+ auto & params = ctx->vision_model.hparams;
+    // The model config contains all we need to decide how to preprocess; here we automatically switch to the new llava-1.6 preprocessing
+ if (strcmp(params.mm_patch_merge_type, "spatial_unpad") == 0) {
+ pad_to_square = false;
+ }
+ // free the previous res_imgs if any set
+ if (res_imgs.size > 0 && res_imgs.size < 100) {
+ for (size_t i = 0; i < res_imgs.size; i++) {
+ clip_image_f32_free(&(res_imgs.data[i]));
+ }
+ delete[] res_imgs.data;
+ }
+ res_imgs.data = nullptr;
+ res_imgs.size = 0;
// the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
// see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
clip_image_u8 * temp = clip_image_u8_init(); // we will keep the input image data here temporarily
- if (pad2square && img->nx != img->ny) {
+ if (pad_to_square && img->nx != img->ny) {
int longer_side = std::max(img->nx, img->ny);
temp->nx = longer_side;
temp->ny = longer_side;
temp->buf.resize(3 * longer_side * longer_side);
- const uint8_t bc[3] = {122, 116, 104}; // background color in RGB from LLaVA
+ const uint8_t bc[3] = {122, 116, 104}; // background color in RGB from LLaVA (this is the mean rgb color * 255)
// fill with background color
for (size_t i = 0; i < temp->buf.size(); i++) {
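A standalone sketch of the `select_best_resolution()` rule above: maximize the effective (non-upscaled) resolution, then minimize wasted padding. The 640x480 input is illustrative; the pinpoint list matches the llava-1.6 grid shown in the converter comment further down, and the expected pick is 672x672:

```cpp
#include <algorithm>
#include <cstdio>
#include <limits>
#include <utility>
#include <vector>

static std::pair<int,int> best_fit(std::pair<int,int> orig,
                                   const std::vector<std::pair<int,int>> & grid) {
    std::pair<int,int> best{0, 0};
    int max_effective = 0;
    int min_wasted    = std::numeric_limits<int>::max();
    for (auto [w, h] : grid) {
        float scale = std::min((float)w / orig.first, (float)h / orig.second);
        int dw = (int)(orig.first  * scale);
        int dh = (int)(orig.second * scale);
        int effective = std::min(dw * dh, orig.first * orig.second); // never "gain" pixels
        int wasted    = w * h - effective;                           // padded area
        if (effective > max_effective ||
            (effective == max_effective && wasted < min_wasted)) {
            max_effective = effective;
            min_wasted    = wasted;
            best          = {w, h};
        }
    }
    return best;
}

int main() {
    const std::vector<std::pair<int,int>> grid = {
        {336, 672}, {672, 336}, {672, 672}, {1008, 336}, {336, 1008},
    };
    auto r = best_fit({640, 480}, grid);
    printf("best resolution: %d x %d\n", r.first, r.second); // 672 x 672
    return 0;
}
```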
@@ -1119,18 +1530,63 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
}
}
} else {
- temp->nx = img->nx;
- temp->ny = img->ny;
- temp->buf.resize(img->buf.size());
- memcpy(temp->buf.data(), img->buf.data(), temp->buf.size());
+ if (params.image_grid_pinpoints[0] != 0) {
+ // "spatial_unpad" with "anyres" processing for llava-1.6
+ std::vector<std::pair<int, int>> possible_resolutions;
+ for (int i = 0; i < 32 && params.image_grid_pinpoints[i] != 0; i+=2) {
+ possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]});
+ }
+ std::pair<int, int> best_resolution = select_best_resolution({img->nx, img->ny}, possible_resolutions);
+ // clip_image_save_to_bmp(*img, "input.bmp");
+ resize_and_pad_image(*img, *temp, best_resolution); // we do not pad with mean-bg color anymore in llava-1.6
+ // clip_image_save_to_bmp(*temp, "resized.bmp");
+ // visually verify normalized image:
+ // normalize_image_u8_to_f32(*temp, *res, ctx->image_mean, ctx->image_std);
+ // {
+ // clip_image_u8 * temp2 = clip_image_u8_init();
+ // clip_image_convert_f32_to_u8(*res, *temp2);
+ // clip_image_save_to_bmp(*temp2, "resized_normalized_f32.bmp");
+ // clip_image_u8_free(temp2);
+ // }
+
+ std::vector<clip_image_u8 *> patches = divide_to_patches_u8(*temp, params.image_size); // prepare spatial sorted main patches of image_size each (336 in llava-1.6)
+
+ clip_image_u8 *image_original_resize = clip_image_u8_init();
+ // bilinear_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square
+ bicubic_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square
+ patches.insert(patches.begin(), image_original_resize);
+ // clip_image_f32_batch_init(patches.size());
+ res_imgs.size = patches.size();
+ res_imgs.data = new clip_image_f32[res_imgs.size];
+ int num=0;
+ for (auto& patch : patches) {
+ normalize_image_u8_to_f32(patch, &res_imgs.data[num], ctx->image_mean, ctx->image_std);
+ num++;
+ }
+
+ for (size_t i = 0; i < patches.size(); i++) {
+ // printf("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny);
+ clip_image_u8_free(patches[i]);
+ }
+
+ clip_image_u8_free(temp);
+
+ return true;
+ } else {
+ temp->nx = img->nx;
+ temp->ny = img->ny;
+ temp->buf.resize(img->buf.size());
+ memcpy(temp->buf.data(), img->buf.data(), temp->buf.size());
+ }
}
const int nx = temp->nx;
const int ny = temp->ny;
+ // clip_image_save_to_bmp(*temp, "resized_vanilla.bmp");
const int nx2 = ctx->vision_model.hparams.image_size;
const int ny2 = ctx->vision_model.hparams.image_size;
-
+ clip_image_f32 * res = clip_image_f32_init();
res->nx = nx2;
res->ny = ny2;
res->buf.resize(3 * nx2 * ny2);
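Assuming the illustrative 672x672 best fit from the previous sketch and a 336-pixel `image_size`, the anyres branch above produces the grid tiles from `divide_to_patches_u8()` plus the square-resized original prepended in front, i.e. five normalized images in `res_imgs`:

```cpp
// Sketch of how many f32 images the anyres path yields (illustrative numbers).
#include <cstdio>

int main() {
    const int best_w = 672, best_h = 672; // from select_best_resolution()
    const int image_size = 336;           // hparams.image_size

    const int grid_cols = (best_w + image_size - 1) / image_size; // 2
    const int grid_rows = (best_h + image_size - 1) / image_size; // 2

    // divide_to_patches_u8() tiles + the square-resized original
    printf("res_imgs.size = %d\n", grid_rows * grid_cols + 1);    // 5
    return 0;
}
```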
@@ -1184,9 +1640,25 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
}
clip_image_u8_free(temp);
+ // {
+ // clip_image_u8 * temp2 = clip_image_u8_init();
+ // clip_image_convert_f32_to_u8(*res, *temp2);
+ // clip_image_save_to_bmp(*temp2, "resized_normalized_f32_vanilla.bmp");
+ // clip_image_u8_free(temp2);
+ // }
+ // res_imgs.push_back(res);
+
+ res_imgs.size = 1;
+ res_imgs.data = new clip_image_f32[res_imgs.size];
+ res_imgs.data[0] = std::move(*res);
+
return true;
}
+ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) {
+ return ctx->vision_model.image_newline;
+}
+
void clip_free(clip_ctx * ctx) {
ggml_free(ctx->ctx_data);
gguf_free(ctx->ctx_gguf);
@@ -1194,6 +1666,42 @@ void clip_free(clip_ctx * ctx) {
delete ctx;
}
+size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
+ return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float);
+}
+
+int32_t clip_image_size(const struct clip_ctx * ctx) {
+ return ctx->vision_model.hparams.image_size;
+}
+
+int32_t clip_patch_size(const struct clip_ctx * ctx) {
+ return ctx->vision_model.hparams.patch_size;
+}
+
+int32_t clip_hidden_size(const struct clip_ctx * ctx) {
+ return ctx->vision_model.hparams.hidden_size;
+}
+
+const char * clip_patch_merge_type(const struct clip_ctx * ctx) {
+ return ctx->vision_model.hparams.mm_patch_merge_type;
+}
+
+const int32_t * clip_image_grid(const struct clip_ctx * ctx) {
+ return ctx->vision_model.hparams.image_grid_pinpoints;
+}
+
+int clip_n_patches(const struct clip_ctx * ctx) {
+ const auto & params = ctx->vision_model.hparams;
+
+ int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
+
+ if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
+ n_patches /= 4;
+ }
+
+ return n_patches;
+}
+
bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
if (!ctx->has_vision_encoder) {
printf("This gguf file seems to have no vision encoder\n");
@@ -1213,7 +1721,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
}
int batch_size = imgs->size;
- if(ctx->has_llava_projector) {
+ if (ctx->has_llava_projector) {
GGML_ASSERT(batch_size == 1); // TODO: support multiple images
}
@@ -1224,9 +1732,10 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
// set inputs
const auto & model = ctx->vision_model;
const auto & hparams = model.hparams;
- const int image_size = hparams.image_size;
- const int patch_size = hparams.patch_size;
- const int num_patches = ((image_size / patch_size) * (image_size / patch_size));
+
+ const int image_size = hparams.image_size;
+ const int patch_size = hparams.patch_size;
+ const int num_patches = ((image_size / patch_size) * (image_size / patch_size));
const int num_positions = num_patches + 1;
{
@@ -1301,11 +1810,11 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
// copy the embeddings to the location passed by the user
ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
+
return true;
}
bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype) {
-
ggml_type type = GGML_TYPE_Q4_1;
assert(itype < GGML_TYPE_COUNT);
@@ -1494,26 +2003,13 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
return ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0];
}
- else if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
+ if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
return ctx->vision_model.mm_2_b->ne[0];
- } else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
- return ctx->vision_model.mm_3_b->ne[0];
- }
- else {
- std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
- throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
}
-}
-
-int clip_n_patches(const struct clip_ctx * ctx) {
- auto & params = ctx->vision_model.hparams;
- int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
- if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
- n_patches /= 4;
+ if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
+ return ctx->vision_model.mm_3_b->ne[0];
}
- return n_patches;
-}
-size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
- return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float);
+ std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
+ throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
}
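For a rough sense of scale, `clip_embd_nbytes()` above is `clip_n_patches * clip_n_mmproj_embd * sizeof(float)`. A back-of-the-envelope sketch with illustrative 7B llava-1.5 numbers (576 patches, 4096-dim LLM embeddings), not read from any model file:

```cpp
#include <cstdio>

int main() {
    const int    n_patches     = 576;   // (336 / 14)^2, MLP projector keeps all patches
    const int    n_mmproj_embd = 4096;  // LLM embedding size the projector maps into
    const size_t nbytes        = (size_t) n_patches * n_mmproj_embd * sizeof(float);
    printf("embedding buffer per image: %zu bytes (~%.1f MiB)\n",
           nbytes, nbytes / (1024.0 * 1024.0)); // ~9.0 MiB
    return 0;
}
```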
diff --git a/examples/llava/clip.h b/examples/llava/clip.h
index 458a256a..cd9a4022 100644
--- a/examples/llava/clip.h
+++ b/examples/llava/clip.h
@@ -24,25 +24,7 @@ struct clip_ctx;
extern "C" {
#endif
-struct clip_vision_hparams {
- int32_t image_size;
- int32_t patch_size;
- int32_t hidden_size;
- int32_t n_intermediate;
- int32_t projection_dim;
- int32_t n_head;
- int32_t n_layer;
- float eps;
-};
-
-CLIP_API struct clip_ctx * clip_model_load(const char * fname, int verbosity);
-
-CLIP_API void clip_free(struct clip_ctx * ctx);
-
-CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);
-
-CLIP_API int clip_n_patches (const struct clip_ctx * ctx);
-CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx);
+struct clip_ctx;
struct clip_image_u8_batch {
struct clip_image_u8 * data;
@@ -54,10 +36,29 @@ struct clip_image_f32_batch {
size_t size;
};
+CLIP_API struct clip_ctx * clip_model_load (const char * fname, int verbosity);
+CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity);
+
+CLIP_API void clip_free(struct clip_ctx * ctx);
+
+CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);
+
+CLIP_API int32_t clip_image_size (const struct clip_ctx * ctx);
+CLIP_API int32_t clip_patch_size (const struct clip_ctx * ctx);
+CLIP_API int32_t clip_hidden_size(const struct clip_ctx * ctx);
+
+// TODO: should be enum, not string
+CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx);
+
+CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx);
+
+CLIP_API int clip_n_patches (const struct clip_ctx * ctx);
+CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx);
+
CLIP_API struct clip_image_u8 * clip_image_u8_init ();
CLIP_API struct clip_image_f32 * clip_image_f32_init();
-CLIP_API void clip_image_u8_free (struct clip_image_u8 * img);
+CLIP_API void clip_image_u8_free (struct clip_image_u8 * img);
CLIP_API void clip_image_f32_free(struct clip_image_f32 * img);
CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
@@ -65,7 +66,11 @@ CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8
/** interpret bytes as an image file with length bytes_length, and use the result to populate img */
CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);
-CLIP_API bool clip_image_preprocess (struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32 * res, bool pad2square);
+/** preprocess img and store the result in res_imgs; pad_to_square may be overridden to false depending on model configuration */
+CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch & res_imgs);
+
+CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);
+
CLIP_API bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);
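A minimal usage sketch of the reworked header above, going from an image file to per-patch embeddings. `mmproj.gguf` and `image.jpg` are placeholder names; error handling and batch cleanup are kept to a bare minimum:

```cpp
#include "clip.h"

#include <cstdio>
#include <vector>

int main() {
    struct clip_ctx * ctx = clip_model_load("mmproj.gguf", /*verbosity*/ 1);
    if (!ctx) { return 1; }

    struct clip_image_u8 * img = clip_image_u8_init();
    if (!clip_image_load_from_file("image.jpg", img)) { return 1; }

    // llava-1.5 models yield a single normalized image here; llava-1.6
    // (spatial_unpad + anyres) yields one entry per grid patch plus the
    // square-resized original.
    clip_image_f32_batch batch = {};
    if (!clip_image_preprocess(ctx, img, batch)) { return 1; }

    const size_t n_floats_per_img = clip_embd_nbytes(ctx) / sizeof(float);
    std::vector<float> embd(batch.size * n_floats_per_img);
    for (size_t i = 0; i < batch.size; i++) {
        // one encoder pass per preprocessed patch
        clip_image_encode(ctx, /*n_threads*/ 4, &batch.data[i], embd.data() + i * n_floats_per_img);
    }
    printf("encoded %zu patch(es), %d patches x %d dims each\n",
           batch.size, clip_n_patches(ctx), clip_n_mmproj_embd(ctx));

    clip_image_u8_free(img);
    clip_free(ctx);
    // note: batch.data cleanup mirrors the allocation done in clip_image_preprocess()
    return 0;
}
```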
diff --git a/examples/llava/convert-image-encoder-to-gguf.py b/examples/llava/convert-image-encoder-to-gguf.py
index e204b56b..c69f89ac 100644
--- a/examples/llava/convert-image-encoder-to-gguf.py
+++ b/examples/llava/convert-image-encoder-to-gguf.py
@@ -78,18 +78,19 @@ ap.add_argument("--text-only", action="store_true", required=False,
help="Save a text-only model. It can't be used to encode images")
ap.add_argument("--vision-only", action="store_true", required=False,
help="Save a vision-only model. It can't be used to encode texts")
-ap.add_argument("--clip_model_is_vision", action="store_true", required=False,
+ap.add_argument("--clip-model-is-vision", action="store_true", required=False,
help="The clip model is a pure vision model (ShareGPT4V vision extract for example)")
+ap.add_argument("--clip-model-is-openclip", action="store_true", required=False,
+                help="The clip model is from openclip (for ViT-SO400M type)")
ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.")
ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp", choices=["mlp", "ldp"], default="mlp")
-ap.add_argument("--image-mean", nargs=3, type=float, required=False, help="Override image mean values")
-ap.add_argument("--image-std", nargs=3, type=float, required=False, help="Override image std values")
ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
# Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711
+# Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5
default_image_mean = [0.48145466, 0.4578275, 0.40821073]
default_image_std = [0.26862954, 0.26130258, 0.27577711]
-ap.add_argument('--image_mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None)
-ap.add_argument('--image_std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None)
+ap.add_argument('--image-mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None)
+ap.add_argument('--image-std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None)
# with proper
args = ap.parse_args()
@@ -105,7 +106,7 @@ if args.use_f32:
# output in the same directory as the model if output_dir is None
dir_model = args.model_dir
-if args.clip_model_is_vision:
+if args.clip_model_is_vision or not os.path.exists(dir_model + "/vocab.json") or args.clip_model_is_openclip:
vocab = None
tokens = None
else:
@@ -133,7 +134,7 @@ ftype = 1
if args.use_f32:
ftype = 0
-if args.clip_model_is_vision:
+if args.clip_model_is_vision or args.clip_model_is_openclip:
model = CLIPVisionModel.from_pretrained(dir_model)
processor = None
else:
@@ -202,6 +203,57 @@ if has_vision_encoder:
fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), v_hparams["layer_norm_eps"])
block_count = v_hparams["num_hidden_layers"] - 1 if has_llava_projector else v_hparams["num_hidden_layers"]
fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count)
+ # /**
+ # "image_grid_pinpoints": [
+ # [
+ # 336,
+ # 672
+ # ],
+ # [
+ # 672,
+ # 336
+ # ],
+ # [
+ # 672,
+ # 672
+ # ],
+ # [
+ # 1008,
+ # 336
+ # ],
+ # [
+ # 336,
+ # 1008
+ # ]
+ # ],
+ # Flattened:
+ # [
+ # 336, 672,
+ # 672, 336,
+ # 672, 672,
+ # 1008, 336,
+ # 336, 1008
+ # ]
+ # *
+ # */
+ if "image_grid_pinpoints" in v_hparams:
+ # flatten it
+ image_grid_pinpoints = []
+ for pinpoint in v_hparams["image_grid_pinpoints"]:
+ for p in pinpoint:
+ image_grid_pinpoints.append(p)
+ fout.add_array("clip.vision.image_grid_pinpoints", image_grid_pinpoints)
+ if "image_crop_resolution" in v_hparams:
+ fout.add_uint32("clip.vision.image_crop_resolution", v_hparams["image_crop_resolution"])
+ if "image_aspect_ratio" in v_hparams:
+ fout.add_string("clip.vision.image_aspect_ratio", v_hparams["image_aspect_ratio"])
+ if "image_split_resolution" in v_hparams:
+ fout.add_uint32("clip.vision.image_split_resolution", v_hparams["image_split_resolution"])
+ if "mm_patch_merge_type" in v_hparams:
+ fout.add_string("clip.vision.mm_patch_merge_type", v_hparams["mm_patch_merge_type"])
+ if "mm_projector_type" in v_hparams:
+ fout.add_string("clip.vision.mm_projector_type", v_hparams["mm_projector_type"])
+
if processor is not None:
image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean
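On the C++ side, the flattened `clip.vision.image_grid_pinpoints` array written above is consumed as consecutive (width, height) pairs with a zero terminator (cf. the loader in clip.cpp). A small sketch with illustrative values:

```cpp
#include <cstdio>
#include <utility>
#include <vector>

int main() {
    // as stored in the GGUF: [336,672, 672,336, 672,672, 1008,336, 336,1008, 0]
    const int pinpoints[] = {336, 672, 672, 336, 672, 672, 1008, 336, 336, 1008, 0};
    const int n = (int)(sizeof(pinpoints) / sizeof(pinpoints[0]));

    std::vector<std::pair<int, int>> resolutions;
    for (int i = 0; i + 1 < n && pinpoints[i] != 0; i += 2) {
        resolutions.push_back({pinpoints[i], pinpoints[i + 1]});
    }
    for (const auto & r : resolutions) {
        printf("%d x %d\n", r.first, r.second);
    }
    return 0;
}
```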
diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp
index 031e9806..bef7f7c9 100644
--- a/examples/llava/llava-cli.cpp
+++ b/examples/llava/llava-cli.cpp
@@ -155,11 +155,29 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
system_prompt = prompt.substr(0, image_pos);
user_prompt = prompt.substr(image_pos + std::string("<image>").length());
printf("system_prompt: %s\n", system_prompt.c_str());
+ if (params->verbose_prompt) {
+ auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true);
+ for (int i = 0; i < (int) tmp.size(); i++) {
+ printf("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
+ }
+ }
printf("user_prompt: %s\n", user_prompt.c_str());
+ if (params->verbose_prompt) {
+ auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
+ for (int i = 0; i < (int) tmp.size(); i++) {
+ printf("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
+ }
+ }
} else {
// llava-1.5 native mode
system_prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:";
user_prompt = prompt + "\nASSISTANT:";
+ if (params->verbose_prompt) {
+ auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
+ for (int i = 0; i < (int) tmp.size(); i++) {
+ printf("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
+ }
+ }
}
eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, add_bos);
@@ -171,13 +189,17 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
fprintf(stderr, "\n");
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);
-
+ std::string response = "";
for (int i = 0; i < max_tgt_len; i++) {
const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
+ response += tmp;
if (strcmp(tmp, "</s>") == 0) break;
if (strstr(tmp, "###")) break; // Yi-VL behavior
-
printf("%s", tmp);
+ if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works)
+ if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6
+ if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6
+
fflush(stdout);
}
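The generation loop in llava-cli now accumulates the sampled pieces into a response string and stops once a known end marker appears in it, because some models (Yi-34B llava-1.6, Mistral llava-1.6) emit markers that do not decode as a single stop token. A simplified Python sketch of that idea follows; sample_piece is a hypothetical stand-in for the C++ sample() call, and the sketch checks every marker against the accumulated text, whereas the C++ code checks "</s>" and "###" on the individual piece.

    STOP_MARKERS = ("</s>", "###", "<|im_end|>", "<|im_start|>", "USER:")

    def generate(sample_piece, max_tokens):
        response = ""
        for _ in range(max_tokens):
            piece = sample_piece()
            response += piece
            # stop as soon as any marker shows up anywhere in the accumulated text
            if any(marker in response for marker in STOP_MARKERS):
                break
            print(piece, end="", flush=True)
        return response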
diff --git a/examples/llava/llava-surgery-v2.py b/examples/llava/llava-surgery-v2.py
new file mode 100644
index 00000000..5bc5bc51
--- /dev/null
+++ b/examples/llava/llava-surgery-v2.py
@@ -0,0 +1,167 @@
+import argparse
+import glob
+import os
+import torch
+from safetensors.torch import safe_open, save_file
+
+# Function to determine if file is a SafeTensor file
+def is_safetensor_file(file_path):
+ return file_path.endswith('.safetensors')
+
+
+# Unified loading function
+def load_model(file_path):
+ if is_safetensor_file(file_path):
+ tensors = {}
+ with safe_open(file_path, framework="pt", device="cpu") as f:
+ for key in f.keys():
+ tensors[key] = f.get_tensor(key).clone()
+ # output shape
+ print(f"{key} : {tensors[key].shape}")
+ return tensors, 'safetensor'
+ else:
+ return torch.load(file_path, map_location=torch.device('cpu')), 'pytorch'
+
+
+# Unified saving function
+def save_model(model, file_path, file_type):
+ if file_type == 'safetensor':
+ # safe_save(model, file_path)
+ save_file(model, file_path)
+ else:
+ torch.save(model, file_path)
+
+
+# Adapted function to clean vision tower from checkpoint
+def clean_vision_tower_from_checkpoint(checkpoint_path):
+ checkpoint, file_type = load_model(checkpoint_path)
+ model_path = os.path.dirname(checkpoint_path)
+ print(f"Searching for vision tower tensors in {checkpoint_path}")
+ clip_tensors = [k for k, v in checkpoint.items() if (k.startswith("model.vision_tower") or k.startswith("vit."))]
+
+ if len(clip_tensors) > 0:
+ print(f"Found {len(clip_tensors)} tensors to extract from {checkpoint_path}")
+ # Adapted for file type
+ clip_path = os.path.join(model_path, "llava.clip")
+
+ if os.path.exists(clip_path):
+ print(f"Loading existing llava.clip from {clip_path}")
+ existing_clip, _ = load_model(clip_path)
+ else:
+ print(f"Creating new llava.clip at {clip_path}")
+ existing_clip = {}
+ # Update existing_clip with new tensors, avoid duplicates
+ for name in clip_tensors:
+ simple_name = name[name.index('vision_model.'):] if 'vision_model.' in name else name
+ print(f"Adding {simple_name} to llava.clip")
+ if simple_name not in existing_clip:
+ existing_clip[simple_name] = checkpoint[name]
+
+ # Save the updated clip tensors back to llava.clip
+ save_model(existing_clip, clip_path, 'pytorch')
+
+ # Remove the tensors from the original checkpoint
+ for name in clip_tensors:
+ del checkpoint[name]
+
+ # Save the updated checkpoint
+ save_model(checkpoint, checkpoint_path, file_type)
+ return True
+ return False
+
+def find_relevant_checkpoints(checkpoint_paths, newline_criteria, projector):
+ newline_checkpoint_path = None
+ projector_checkpoint_path = None
+
+ for path in checkpoint_paths:
+ checkpoint, _ = load_model(path)
+ if newline_criteria(checkpoint) and newline_checkpoint_path is None:
+ newline_checkpoint_path = path
+ if projector(checkpoint):
+ projector_checkpoint_path = path
+
+ return newline_checkpoint_path, projector_checkpoint_path
+
+def newline_criteria(checkpoint):
+ return any(k.startswith("model.image_newline") for k in checkpoint.keys())
+
+def proj_criteria(checkpoint):
+ return any(k.startswith("model.mm_projector") or k.startswith("vision_proj.") for k in checkpoint.keys())
+
+
+# Command-line interface setup
+ap = argparse.ArgumentParser()
+ap.add_argument("-m", "--model", required=True, help="Path to LLaVA v1.5+ model")
+ap.add_argument("-C", "--clean-vision-tower", action="store_true", help="Remove any vision tower from the model files")
+args = ap.parse_args()
+
+if args.clean_vision_tower:
+ # Generalized to handle both PyTorch and SafeTensors models
+ model_files = sorted(glob.glob(f"{args.model}/*"), key=os.path.getmtime, reverse=True)
+ # checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and path.startswith('pytorch')) or (path.endswith('.safetensors') and path.startswith('model'))]
+ checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and 'pytorch' in path.split('/')[-1].split('\\')[-1]) or (path.endswith('.safetensors') and 'model' in path.split('/')[-1].split('\\')[-1])]
+ for projector_checkpoint_path in checkpoint_paths:
+ print(f"Cleaning {projector_checkpoint_path}")
+ if not clean_vision_tower_from_checkpoint(projector_checkpoint_path):
+ print(f"No vision tower found in {projector_checkpoint_path}")
+ # we could break as soon as no vision tower is found (so far all models append these tensors at the end),
+ # but we keep scanning every file to be safe
+ print("Done! All vision tower tensors are removed from the model files and stored in llava.clip file.")
+
+# Now we look for the projector in the last checkpoint
+model_files = sorted(glob.glob(f"{args.model}/*"), key=os.path.getmtime, reverse=True)
+checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and 'pytorch' in path.split('/')[-1].split('\\')[-1]) or (path.endswith('.safetensors') and 'model' in path.split('/')[-1].split('\\')[-1])]
+# last_checkpoint_path = checkpoint_paths[0]
+# first_checkpoint_path = checkpoint_paths[-1]
+newline_checkpoint_path, projector_checkpoint_path = find_relevant_checkpoints(checkpoint_paths, newline_criteria, proj_criteria)
+
+print(f"Taking projector from {projector_checkpoint_path}")
+first_mm_tensors = []
+first_checkpoint = None
+if newline_checkpoint_path is not None:
+ print(f"Taking newline from {newline_checkpoint_path}")
+ first_checkpoint, file_type = load_model(newline_checkpoint_path)
+ first_mm_tensors = [k for k, v in first_checkpoint.items() if k.startswith("model.image_newline")]
+
+# Load the checkpoint
+mm_tensors = []
+last_checkpoint = None
+if projector_checkpoint_path is not None:
+ last_checkpoint, file_type = load_model(projector_checkpoint_path)
+ mm_tensors = [k for k, v in last_checkpoint.items() if k.startswith("model.mm_projector") or k.startswith("vision_proj.")]
+
+if len(mm_tensors) == 0:
+ if last_checkpoint is not None:
+ for k, v in last_checkpoint.items():
+ print(k)
+ print(f"Found {len(mm_tensors)} tensors to extract out of {len(last_checkpoint)} tensors.")
+ print("No tensors found. Is this a LLaVA model?")
+ exit()
+
+print(f"Found {len(mm_tensors)} tensors to extract.")
+print(f"Found additional {len(first_mm_tensors)} tensors to extract.")
+projector = {}
+for name in mm_tensors:
+ projector[name] = last_checkpoint[name].float()
+for name in first_mm_tensors:
+ projector[name] = first_checkpoint[name].float()
+
+if len(projector) > 0:
+ save_model(projector, f"{args.model}/llava.projector", 'pytorch')
+
+for name in mm_tensors:
+ del last_checkpoint[name]
+for name in first_mm_tensors:
+ del first_checkpoint[name]
+
+if len(mm_tensors) > 0:
+ save_model(last_checkpoint, projector_checkpoint_path, file_type)
+if len(first_mm_tensors) > 0:
+ save_model(first_checkpoint, newline_checkpoint_path, file_type)
+
+print("Done!")
+print(f"Now you can convert {args.model} to a a regular LLaMA GGUF file.")
+print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.")
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
index d42e7582..22953417 100644
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -2,32 +2,296 @@
#include "common.h"
#include "llama.h"
#include "llava.h"
+#include "base64.hpp"
#include <cstdio>
#include <cstdlib>
#include <vector>
+#include <numeric>
+
+// RGB uint8 image
+struct clip_image_u8 {
+ int nx;
+ int ny;
+
+ std::vector<uint8_t> buf;
+};
+
+// RGB float32 image (NHWC)
+// Memory layout: RGBRGBRGB...
+struct clip_image_f32 {
+ int nx;
+ int ny;
+
+ std::vector<float> buf;
+};
+
+struct clip_image_grid_shape {
+ int first;
+ int second;
+};
+
+/**
+ * Selects the best resolution from a list of possible resolutions based on the original size.
+ *
+ * @param original_size The original size of the image in the format (width, height).
+ * @param possible_resolutions A list of possible resolutions in the format [(width1, height1), (width2, height2), ...].
+ * @return The best fit resolution in the format (width, height).
+ */
+static std::pair<int, int> select_best_resolution(const std::pair<int, int>& original_size, const std::vector<std::pair<int, int>>& possible_resolutions) {
+ int original_width = original_size.first;
+ int original_height = original_size.second;
+
+ std::pair<int, int> best_fit;
+ int max_effective_resolution = 0;
+ int min_wasted_resolution = std::numeric_limits<int>::max();
+
+ for (const auto& resolution : possible_resolutions) {
+ int width = resolution.first;
+ int height = resolution.second;
+ float scale = std::min(static_cast<float>(width) / original_width, static_cast<float>(height) / original_height);
+ int downscaled_width = static_cast<int>(original_width * scale);
+ int downscaled_height = static_cast<int>(original_height * scale);
+ int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
+ int wasted_resolution = (width * height) - effective_resolution;
+ // fprintf(stderr, "resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
+ if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
+ max_effective_resolution = effective_resolution;
+ min_wasted_resolution = wasted_resolution;
+ best_fit = resolution;
+ }
+ }
+
+ return best_fit;
+}
+
+/**
+ * @brief Get the anyres image grid shape object
+ *
+ * @param image_size
+ * @param grid_pinpoints
+ * @param image_patch_size
+ * @return <int, int>
+ */
+static struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair<int, int> & image_size, const std::vector<std::pair<int, int>> & grid_pinpoints, int image_patch_size) {
+ /**
+ Conversion from gguf flat array to vector:
+ std::vector<std::pair<int, int>> possible_resolutions;
+ for (int i = 0; i < 32 && params.image_grid_pinpoints[i] != 0; i+=2) {
+ possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]});
+ }
+ */
+ auto best_resolution = select_best_resolution(image_size, grid_pinpoints);
+ return {best_resolution.first / image_patch_size, best_resolution.second / image_patch_size};
+}
+
+// Take the image segments in a grid configuration and return the embeddings and the number of embeddings into preallocated memory (image_embd_out)
+static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) {
+ struct {
+ struct ggml_tensor * newline;
+ struct ggml_context * ctx;
+ } model;
+
+ const int32_t image_size = clip_image_size(ctx_clip);
+ const int32_t patch_size = clip_patch_size(ctx_clip);
+
+ int32_t num_patches_per_side = image_size / patch_size; // 336 / 14 = 24 - used for embedding-patching boxes (24*24 = 576 patches)
+
+ int num_patches_width = grid_shape.first; // grid 1-4
+ int num_patches_height = grid_shape.second; // grid 1-4
+
+ const size_t num_images = num_patches_width * num_patches_height + 1; // base image plus one embedding per grid cell
+
+ // TODO: the context size is only roughly estimated here - in practice it is only tens of MB
+ size_t ctx_size = 0;
+
+ {
+ ctx_size += clip_embd_nbytes(ctx_clip) * num_images * 8; // image_features
+ ctx_size += 1024*1024 * ggml_type_size(GGML_TYPE_F32);
+ }
+
+ struct ggml_init_params params {
+ /*.mem_size =*/ ctx_size,
+ /*.mem_buffer =*/ NULL,
+ /*.no_alloc =*/ false, // NOTE: this should be false when using the legacy API
+ };
+
+ // Python reference code for full unpad:
+ /*
+ base_image_feature = image_feature[0]
+ image_feature = image_feature[1:]
+ image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous()
+ image_feature = image_feature.flatten(1, 2).flatten(2, 3)
+ image_feature = unpad_image(image_feature, image_sizes[image_idx])
+ image_feature = torch.cat((
+ image_feature,
+ self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1)
+ ), dim=-1)
+ image_feature = image_feature.flatten(1, 2).transpose(0, 1)
+ image_feature = torch.cat((base_image_feature, image_feature), dim=0)
+ */
+ // We now have two options: unpad or no unpad. Unpad removes tokens for faster llm eval.
+ // In terms of result quality it appears to make no difference, so we'll start with the easier approach given 5D tensors are not supported in ggml yet.
+ // Without unpad we have to split the sub-image embeddings into patches of 24 features each and permute them.
+ // Once all sub-images are processed, the base_image_features are prepended without any changes.
+
+ // Pytorch reference simplified, modified for ggml compatibility - confirmed identical output in python (for a 2x2 grid image (676x676 scaling))
+ /*
+ image_feature = image_feature.view(2, 2, 24, 24, 4096)
+ image_feature = image_feature.permute(0, 2, 1, 3, 4).contiguous()
+ image_feature = image_feature.view(2, 24, 2, 24, 4096)
+ image_feature = image_feature.flatten(0, 3)
+
+ // Reshape to 4D tensor by merging the last two dimensions
+ image_feature = image_feature.view(2, 2, 24, 24*4096)
+ image_feature = image_feature.permute(0, 2, 1, 3).contiguous()
+ image_feature = image_feature.view(-1, 4096)
+ */
+
+ model.ctx = ggml_init(params);
+
+ ggml_tensor * newline_tmp = clip_get_newline_tensor(ctx_clip);
+ model.newline = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, newline_tmp->ne[0]);
+ if (newline_tmp->backend != GGML_BACKEND_CPU) {
+ if (newline_tmp->buffer == NULL) {
+ printf("newline_tmp tensor buffer is NULL\n");
+ }
+ ggml_backend_tensor_get(newline_tmp, model.newline->data, 0, ggml_nbytes(newline_tmp));
+ } else {
+ model.newline->data = newline_tmp->data;
+ if (model.newline->data == NULL) {
+ printf("newline_tmp tensor data is NULL\n");
+ }
+ }
+
+ struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), num_images - 1); // example: 4096 x 576 x 4
+ // ggml_tensor_printf(image_features,"image_features",__LINE__,false,false);
+ // fill it with the image embeddings, ignoring the base
+ for (size_t i = 1; i < num_images; i++) {
+ size_t offset = (i-1) * clip_embd_nbytes(ctx_clip);
+ memcpy((uint8_t *)(image_features->data) + offset, image_embd_v[i], clip_embd_nbytes(ctx_clip));
+ }
+
+ struct ggml_cgraph * gf = ggml_new_graph(model.ctx);
+ size_t size_ele = ggml_type_size(GGML_TYPE_F32);
+
+ struct ggml_tensor *image_features_patchview = ggml_view_4d(model.ctx, image_features,
+ num_patches_per_side * clip_n_mmproj_embd(ctx_clip),
+ num_patches_per_side,
+ num_patches_width,
+ num_patches_height,
+ size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip),
+ size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip) * num_patches_per_side,
+ size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip) * num_patches_per_side * num_patches_width, 0);
+ // ggml_tensor_printf(image_features_patchview,"image_features_patchview",__LINE__,false,false);
+ struct ggml_tensor *permuted_cont = ggml_cont(model.ctx, ggml_permute(model.ctx, image_features_patchview, 0, 2, 1, 3));
+ /**
+ At the end of each row we have to add the row_end embeddings, which are the same as the newline embeddings
+ image_feature = torch.cat((
+ image_feature,
+ self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1).to(image_feature.device)
+ ), dim=-1)
+ *
+ */
+
+ // ggml_tensor_printf(permuted_cont,"permuted_cont",__LINE__,false,false);
+ struct ggml_tensor *flatten = ggml_view_2d(model.ctx, permuted_cont, clip_n_mmproj_embd(ctx_clip), num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side, size_ele * clip_n_mmproj_embd(ctx_clip), 0);
+ // ggml_tensor_printf(flatten,"flatten",__LINE__,false,false);
+ ggml_build_forward_expand(gf, flatten);
+ ggml_graph_compute_with_ctx(model.ctx, gf, 1);
+ struct ggml_tensor* result = gf->nodes[gf->n_nodes - 1];
+
+ memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
+ // append without newline tokens (default behavior in llava_arch when not using unpad ):
+ memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches
+ *n_img_pos_out = static_cast<int>(result->ne[1]+clip_n_patches(ctx_clip));
+
+ // Debug: Test single segments
+ // Current findings: sending base image, sending a segment embedding all works similar to python
+ // However, permuted embeddings do not work yet (stride issue?)
+ // memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as context
+ // memcpy(image_embd_out, (float*)prepared_cont->data, clip_embd_nbytes(ctx_clip)); // main image as context
+ // *n_img_pos_out=576;
+
+ ggml_free(model.ctx);
+ return true;
+}
-#include "base64.hpp"
static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) {
- clip_image_f32 * img_res = clip_image_f32_init();
- if (!clip_image_preprocess(ctx_clip, img, img_res, /*pad2square =*/ true)) {
+ // std::vector<clip_image_f32*> img_res_v; // format VectN x H x W x RGB (N x 336 x 336 x 3), so interleaved RGB - different to the python implementation which is N x 3 x 336 x 336
+ clip_image_f32_batch img_res_v;
+ img_res_v.size = 0;
+ img_res_v.data = nullptr;
+ if (!clip_image_preprocess(ctx_clip, img, img_res_v)) {
fprintf(stderr, "%s: unable to preprocess image\n", __func__);
- clip_image_f32_free(img_res);
+ delete[] img_res_v.data;
return false;
}
- *n_img_pos = clip_n_patches(ctx_clip);
-
const int64_t t_img_enc_start_us = ggml_time_us();
- bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd);
- clip_image_f32_free(img_res);
- if (!encoded) {
- fprintf(stderr, "Unable to encode image\n");
- return false;
+ const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip);
+
+ if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
+ // flat / default llava-1.5 type embedding
+ *n_img_pos = clip_n_patches(ctx_clip);
+ bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096
+ delete[] img_res_v.data;
+ if (!encoded) {
+ fprintf(stderr, "Unable to encode image\n");
+
+ return false;
+ }
+ } else {
+ // spatial_unpad llava-1.6 type embedding
+ // TODO: CLIP needs batching support - in HF the llm projection is separate after encoding, which might be a solution to quickly get batching working
+ std::vector<float *> image_embd_v;
+ image_embd_v.resize(img_res_v.size);
+ for (size_t i = 0; i < img_res_v.size; i++) {
+ image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184
+ const bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside
+ if (!encoded) {
+ fprintf(stderr, "Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
+ return false;
+ }
+ }
+ const int64_t t_img_enc_batch_us = ggml_time_us();
+ printf("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
+
+ const int32_t * image_grid = clip_image_grid(ctx_clip);
+
+ std::vector<std::pair<int, int>> grid_pinpoints;
+ for (int i = 0; i < 32 && image_grid[i] != 0; i += 2) {
+ grid_pinpoints.push_back({image_grid[i], image_grid[i+1]});
+ }
+
+ // free all img_res_v - not needed anymore
+ delete[] img_res_v.data;
+ img_res_v.size = 0;
+ img_res_v.data = nullptr;
+
+ const int32_t image_size = clip_image_size(ctx_clip);
+
+ struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, image_size);
+
+ int n_img_pos_out;
+ clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out);
+ *n_img_pos = n_img_pos_out;
+
+ for (size_t i = 0; i < image_embd_v.size(); i++) {
+ free(image_embd_v[i]);
+ }
+ image_embd_v.clear();
+
+ // debug image/segment/normalization content:
+ // clip_image_u8 * tmp = clip_image_u8_init();
+ // clip_image_convert_f32_to_u8(*image_feature, *tmp);
+ // clip_image_save_to_bmp(*tmp, "image_feature.bmp");
}
+ printf("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);
+
const int64_t t_img_enc_end_us = ggml_time_us();
float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;
@@ -48,7 +312,7 @@ bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx *
}
static bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) {
- float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip));
+ float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*6); // TODO: base on gridsize/llava model
if (!image_embd) {
fprintf(stderr, "Unable to allocate memory for image embeddings\n");
free(image_embd);
@@ -85,7 +349,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
return true;
}
-LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length) {
+struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length) {
clip_image_u8 * img = clip_image_u8_init();
if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) {
clip_image_u8_free(img);
@@ -142,7 +406,7 @@ static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long
return true;
}
-LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path) {
+struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path) {
unsigned char* image_bytes;
long image_bytes_length;
auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length);
@@ -151,13 +415,13 @@ LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct
return NULL;
}
- auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, image_bytes, image_bytes_length);
+ llava_image_embed *embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, image_bytes, image_bytes_length);
free(image_bytes);
return embed;
}
-LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed) {
+void llava_image_embed_free(struct llava_image_embed * embed) {
free(embed->embed);
free(embed);
}
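The anyres path above boils down to two small computations before the tensor shuffling: pick the pinpoint that preserves the most effective resolution with the least waste, then divide it by the CLIP input size (336 here, as passed by encode_image_with_clip) to get the tile grid. A Python sketch of just that part follows; the input size of (800, 600) is an arbitrary example, not taken from the code.

    def select_best_resolution(original_size, possible_resolutions):
        ow, oh = original_size
        best_fit = None
        max_effective = 0
        min_wasted = float("inf")
        for w, h in possible_resolutions:
            scale = min(w / ow, h / oh)
            downscaled = int(ow * scale) * int(oh * scale)
            effective = min(downscaled, ow * oh)   # cap at the original pixel count
            wasted = w * h - effective
            if effective > max_effective or (effective == max_effective and wasted < min_wasted):
                max_effective, min_wasted, best_fit = effective, wasted, (w, h)
        return best_fit

    pinpoints = [(336, 672), (672, 336), (672, 672), (1008, 336), (336, 1008)]
    best = select_best_resolution((800, 600), pinpoints)   # -> (672, 672)
    grid = (best[0] // 336, best[1] // 336)                # -> (2, 2) tiles
    print(best, grid)

For a (2, 2) grid, clip_llava_handle_patches then re-orders the 24x24 patch embeddings of the four tiles into one contiguous feature map and prepends the base image embedding, as described in the PyTorch reference comments inside that function.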
diff --git a/examples/llava/llava.h b/examples/llava/llava.h
index e08ce788..9e9466a5 100644
--- a/examples/llava/llava.h
+++ b/examples/llava/llava.h
@@ -3,7 +3,6 @@
#include "ggml.h"
-
#ifdef LLAMA_SHARED
# if defined(_WIN32) && !defined(__MINGW32__)
# ifdef LLAMA_BUILD
@@ -42,7 +41,6 @@ LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed);
/** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. */
LLAVA_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past);
-
#ifdef __cplusplus
}
#endif
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 1699eb76..6e343403 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -968,13 +968,20 @@ struct llama_server_context
{
continue;
}
- clip_image_f32 * img_res = clip_image_f32_init();
- if (!clip_image_preprocess(clp_ctx, img.img_data, img_res, /*pad2square =*/ true))
+ clip_image_f32_batch img_res_v;
+ img_res_v.size = 0;
+ img_res_v.data = nullptr;
+ if (!clip_image_preprocess(clp_ctx, img.img_data, img_res_v))
{
LOG_TEE("Error processing the given image");
clip_free(clp_ctx);
+ clip_image_f32_free(img_res_v.data);
return false;
}
+
+ // note: assumes only one image was returned by clip_image_preprocess
+ clip_image_f32 * img_res = img_res_v.data;
+
img.image_tokens = clip_n_patches(clp_ctx);
img.image_embedding = (float *)malloc(clip_embd_nbytes(clp_ctx));
if (!img.image_embedding)
@@ -989,7 +996,9 @@ struct llama_server_context
LOG_TEE("Unable to encode image\n");
return false;
}
- clip_image_f32_free(img_res);
+
+ clip_image_f32_free(img_res_v.data);
+
img.request_encode_image = false;
}