Diffstat (limited to 'examples/llava/llava.cpp')
-rw-r--r--  examples/llava/llava.cpp  75
1 file changed, 72 insertions, 3 deletions
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
index 63878d17..916d9dc4 100644
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -202,6 +202,33 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
return true;
}
+static clip_image_f32 * only_v2_5_reshape_by_patch(clip_image_f32 * image, int patch_size) {
+ int width = image->nx;
+ int height = image->ny;
+ int num_patches = (height / patch_size) * (width / patch_size);
+ clip_image_f32 * patch = clip_image_f32_init();
+ patch->nx = patch_size * num_patches;
+ patch->ny = patch_size;
+ patch->buf.resize(3 * patch->nx * patch->ny);
+
+ int patch_index = 0;
+
+ for (int i = 0; i < height; i += patch_size) {
+ for (int j = 0; j < width; j += patch_size) {
+ for (int pi = 0; pi < patch_size; ++pi) {
+ for (int pj = 0; pj < patch_size; ++pj) {
+ int input_index = ((i + pi) * width + (j + pj)) * 3;
+ int output_index = (pi * patch_size * num_patches + patch_index * patch_size + pj) * 3;
+ patch->buf[output_index] = image->buf[input_index];
+ patch->buf[output_index+1] = image->buf[input_index+1];
+ patch->buf[output_index+2] = image->buf[input_index+2];
+ }
+ }
+ patch_index++;
+ }
+ }
+ return patch;
+}
static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) {
// std::vector<clip_image_f32*> img_res_v; // format VectN x H x W x RGB (N x 336 x 336 x 3), so interleaved RGB - different to the python implementation which is N x 3 x 336 x 336
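The new only_v2_5_reshape_by_patch helper above rearranges an H x W interleaved-RGB image into a single strip that is patch_size pixels tall and patch_size * num_patches pixels wide, with the patch_size x patch_size tiles laid out left to right in row-major order. The following standalone sketch (not part of the patch) mirrors that index mapping on a plain std::vector<float>; the simplified signature with explicit width/height/buffer arguments, and the toy 4x4 image with 2x2 patches, are assumptions for illustration only.

// Standalone sketch of the reshape-by-patch index mapping; the simplified
// signature (vector + width/height) is an assumption, not the real clip API.
#include <cstdio>
#include <vector>

static std::vector<float> reshape_by_patch(const std::vector<float> & buf,
                                           int width, int height, int patch_size) {
    const int num_patches = (height / patch_size) * (width / patch_size);
    // Output strip: patch_size rows, patch_size * num_patches columns, interleaved RGB.
    std::vector<float> out(3 * (patch_size * num_patches) * patch_size);
    int patch_index = 0;
    for (int i = 0; i < height; i += patch_size) {
        for (int j = 0; j < width; j += patch_size) {
            for (int pi = 0; pi < patch_size; ++pi) {
                for (int pj = 0; pj < patch_size; ++pj) {
                    const int src = ((i + pi) * width + (j + pj)) * 3;
                    const int dst = (pi * patch_size * num_patches + patch_index * patch_size + pj) * 3;
                    out[dst + 0] = buf[src + 0];
                    out[dst + 1] = buf[src + 1];
                    out[dst + 2] = buf[src + 2];
                }
            }
            patch_index++;
        }
    }
    return out;
}

int main() {
    const int W = 4, H = 4, P = 2;               // 4x4 image split into four 2x2 patches
    std::vector<float> img(3 * W * H);
    for (size_t k = 0; k < img.size(); ++k) img[k] = (float) k;
    std::vector<float> strip = reshape_by_patch(img, W, H, P);
    // The result is a 2-pixel-tall, 8-pixel-wide strip: patch 0 | 1 | 2 | 3.
    printf("strip: %d x %d pixels, %zu floats\n", P, P * (W / P) * (H / P), strip.size());
    return 0;
}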
@@ -218,7 +245,44 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip);
- if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
+ if (clip_is_minicpmv(ctx_clip)) {
+ std::vector<float *> image_embd_v;
+ image_embd_v.resize(img_res_v.size);
+ struct clip_image_size * load_image_size = clip_image_size_init();
+ for (size_t i = 0; i < img_res_v.size; i++) {
+ const int64_t t_img_enc_step_start_us = ggml_time_us();
+ image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip));
+ int patch_size = 14;
+ load_image_size->width = img_res_v.data[i].nx;
+ load_image_size->height = img_res_v.data[i].ny;
+ clip_add_load_image_size(ctx_clip, load_image_size);
+ const bool encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
+ if (!encoded) {
+ LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
+ return false;
+ }
+ const int64_t t_img_enc_step_batch_us = ggml_time_us();
+ LOG_TEE("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)img_res_v.size, (t_img_enc_step_batch_us - t_img_enc_step_start_us) / 1000.0);
+ }
+ const int64_t t_img_enc_batch_us = ggml_time_us();
+ LOG_TEE("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
+
+ int n_img_pos_out = 0;
+ for (size_t i = 0; i < image_embd_v.size(); i++) {
+ std::memcpy(image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip), image_embd_v[i], clip_embd_nbytes(ctx_clip));
+ n_img_pos_out += clip_n_patches(ctx_clip);
+ }
+ *n_img_pos = n_img_pos_out;
+ for (size_t i = 0; i < image_embd_v.size(); i++) {
+ free(image_embd_v[i]);
+ }
+ image_embd_v.clear();
+ load_image_size->width = img->nx;
+ load_image_size->height = img->ny;
+ clip_add_load_image_size(ctx_clip, load_image_size);
+ LOG_TEE("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height);
+ }
+ else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
// flat / default llava-1.5 type embedding
*n_img_pos = clip_n_patches(ctx_clip);
bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096
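In the minicpmv branch added above, each preprocessed slice in img_res_v is encoded separately and the per-slice embeddings are then packed back to back into image_embd: every slice contributes clip_n_patches(ctx_clip) positions of clip_n_mmproj_embd(ctx_clip) floats, and n_img_pos_out tracks the running position offset. The standalone sketch below (not part of the patch) restates just that packing step; the dimensions are placeholders, not values taken from any particular model.

// Sketch of the per-slice embedding concatenation; n_embd, n_patches and
// n_slices are placeholder values, not real model parameters.
#include <cstring>
#include <vector>

int main() {
    const int n_embd    = 4096;  // stands in for clip_n_mmproj_embd(ctx_clip)
    const int n_patches = 96;    // stands in for clip_n_patches(ctx_clip)
    const int n_slices  = 5;     // stands in for img_res_v.size

    std::vector<std::vector<float>> slice_embd(n_slices, std::vector<float>((size_t) n_patches * n_embd));
    std::vector<float> image_embd((size_t) n_slices * n_patches * n_embd);

    int n_img_pos_out = 0;
    for (int i = 0; i < n_slices; i++) {
        // Same arithmetic as the patch: copy one slice at offset n_img_pos_out * n_embd floats.
        std::memcpy(image_embd.data() + (size_t) n_img_pos_out * n_embd,
                    slice_embd[i].data(),
                    slice_embd[i].size() * sizeof(float));
        n_img_pos_out += n_patches;
    }
    // n_img_pos_out (here 5 * 96 = 480) is what the patch stores into *n_img_pos.
    return 0;
}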
@@ -228,7 +292,8 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
return false;
}
- } else {
+ }
+ else {
// spatial_unpad llava-1.6 type embedding
// TODO: CLIP needs batching support - in HF the llm projection is separate after encoding, which might be a solution to quickly get batching working
std::vector<float *> image_embd_v;
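The final hunk below raises the worst-case number of encoded slices reserved up front from the hard-coded 6 to 10 when the model is minicpmv (the TODO about deriving it from the grid size remains). The sketch here (not part of the patch) restates that sizing rule with a hypothetical helper; slice_nbytes stands in for clip_embd_nbytes(ctx_clip) and is_minicpmv for the result of clip_is_minicpmv(ctx_clip).

// Hypothetical helper restating the buffer-sizing rule from the hunk below.
#include <cstdlib>

static float * alloc_image_embd(size_t slice_nbytes, bool is_minicpmv) {
    // minicpmv can produce more slices than the previous llava worst case of 6,
    // so reserve room for up to 10 of them.
    const int num_max_patches = is_minicpmv ? 10 : 6;
    return (float *) malloc(slice_nbytes * num_max_patches);
}

int main() {
    // Example: 576 patches x 4096-dim embeddings per slice (illustrative numbers only).
    float * image_embd = alloc_image_embd(576 * 4096 * sizeof(float), /*is_minicpmv=*/true);
    if (image_embd) {
        free(image_embd);
    }
    return 0;
}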
@@ -297,7 +362,11 @@ bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx *
}
bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) {
- float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*6); // TODO: base on gridsize/llava model
+ int num_max_patches = 6;
+ if (clip_is_minicpmv(ctx_clip)) {
+ num_max_patches = 10;
+ }
+ float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model
if (!image_embd) {
LOG_TEE("Unable to allocate memory for image embeddings\n");
return false;