path: root/examples/llava/clip.cpp
author    Ziang Wu <97337387+ZiangWu-77@users.noreply.github.com>  2024-03-20 23:02:32 +0800
committer GitHub <noreply@github.com>                              2024-03-20 17:02:32 +0200
commit    272935b281fee5c683e3d6d1eb580b84553cf503 (patch)
tree      bac74b4ffcc9ded2c6462926a011cefa6135f32b /examples/llava/clip.cpp
parent    ccf58aa3ec4d20b10162ba40898dc038ad4c3fad (diff)
llava : add MobileVLM_V2 backup (#6175)
* Add MobileVLM_V2 backup
* Update MobileVLM-README.md
* Update examples/llava/MobileVLM-README.md
  Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Update examples/llava/convert-image-encoder-to-gguf.py
  Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* clip : fix whitespace
* fix definition mistake in clip.cpp
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Diffstat (limited to 'examples/llava/clip.cpp')
-rw-r--r--   examples/llava/clip.cpp   50
1 file changed, 49 insertions(+), 1 deletion(-)
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 690bca2e..48caafa8 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -119,6 +119,7 @@ static std::string format(const char * fmt, ...) {
#define TN_LLAVA_PROJ "mm.%d.%s"
#define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s"
#define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
+#define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s"
#define TN_IMAGE_NEWLINE "model.image_newline"
@@ -126,12 +127,14 @@ enum projector_type {
PROJECTOR_TYPE_MLP,
PROJECTOR_TYPE_MLP_NORM,
PROJECTOR_TYPE_LDP,
+ PROJECTOR_TYPE_LDPV2,
PROJECTOR_TYPE_UNKNOWN,
};
static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_MLP, "mlp" },
{ PROJECTOR_TYPE_LDP, "ldp" },
+ { PROJECTOR_TYPE_LDPV2, "ldpv2"},
};
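The GGUF converter stores the projector type as a string (e.g. "ldpv2"), so at load time clip.cpp has to map that string back to one of the enum values above. A minimal stand-alone sketch of such a reverse lookup over the name table; the helper name below is illustrative and not necessarily the one clip.cpp uses:

// Illustrative reverse lookup from the projector-type string to the enum.
#include <cstdio>
#include <map>
#include <string>

enum projector_type {
    PROJECTOR_TYPE_MLP,
    PROJECTOR_TYPE_MLP_NORM,
    PROJECTOR_TYPE_LDP,
    PROJECTOR_TYPE_LDPV2,
    PROJECTOR_TYPE_UNKNOWN,
};

static const std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
    { PROJECTOR_TYPE_MLP,   "mlp"   },
    { PROJECTOR_TYPE_LDP,   "ldp"   },
    { PROJECTOR_TYPE_LDPV2, "ldpv2" },
};

static projector_type projector_type_from_name(const std::string & name) {
    for (const auto & kv : PROJECTOR_TYPE_NAMES) {
        if (kv.second == name) {
            return kv.first;
        }
    }
    return PROJECTOR_TYPE_UNKNOWN;
}

int main() {
    printf("%d\n", projector_type_from_name("ldpv2")); // prints 3 (PROJECTOR_TYPE_LDPV2)
    return 0;
}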
@@ -475,6 +478,14 @@ struct clip_vision_model {
struct ggml_tensor * mm_model_block_2_block_2_0_w;
struct ggml_tensor * mm_model_block_2_block_2_1_w;
struct ggml_tensor * mm_model_block_2_block_2_1_b;
+
+ // MobileVLM_V2 projection
+ struct ggml_tensor * mm_model_mlp_0_w;
+ struct ggml_tensor * mm_model_mlp_0_b;
+ struct ggml_tensor * mm_model_mlp_2_w;
+ struct ggml_tensor * mm_model_mlp_2_b;
+ struct ggml_tensor * mm_model_peg_0_w;
+ struct ggml_tensor * mm_model_peg_0_b;
};
struct clip_ctx {
@@ -807,6 +818,29 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
}
embeddings = block_1;
}
+ else if (ctx->proj_type == PROJECTOR_TYPE_LDPV2)
+ {
+ int n_patch = 24;
+ struct ggml_tensor * mlp_0 = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
+ mlp_0 = ggml_add(ctx0, mlp_0, model.mm_model_mlp_0_b);
+ mlp_0 = ggml_gelu(ctx0, mlp_0);
+ struct ggml_tensor * mlp_2 = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, mlp_0);
+ mlp_2 = ggml_add(ctx0, mlp_2, model.mm_model_mlp_2_b);
+ // mlp_2 ne = [2048, 576, 1, 1]
+ // average pool 2x2, stride 2
+ mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 0, 2, 3));
+ // mlp_2 ne = [576, 2048, 1, 1]
+ mlp_2 = ggml_reshape_4d(ctx0, mlp_2, n_patch, n_patch, mlp_2->ne[1], mlp_2->ne[2]);
+ // mlp_2 ne [24, 24, 2048, 1]
+ mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0);
+ // weight ne = [3, 3, 2048, 1]
+ struct ggml_tensor * peg_0 = ggml_conv_depthwise_2d(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1);
+ peg_0 = ggml_add(ctx0, peg_0, mlp_2);
+ peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3));
+ peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b);
+ peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]);
+ embeddings = peg_0;
+ }
else {
GGML_ASSERT(false);
}
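For orientation, the LDPv2 branch above turns the 24x24 = 576 patch embeddings into 12x12 = 144 output tokens: the two-layer MLP maps each patch to the LLM hidden width, the 2x2 average pool with stride 2 halves the spatial grid, and the depthwise 3x3 convolution acts as a position encoding generator (PEG) that is added back as a residual. Below is a small stand-alone sketch of that shape bookkeeping, independent of ggml; the 2048 hidden width is taken from the shape comments in the hunk and is an assumption for the default MobileVLM_V2 configuration.

#include <cstdio>

int main() {
    // Assumed defaults: a 24x24 patch grid from the ViT encoder and an
    // LLM hidden size of 2048 (values taken from the comments above).
    const int n_patch    = 24;                 // patches per side
    const int n_tokens   = n_patch * n_patch;  // 576 patch embeddings in
    const int n_embd     = 2048;               // width after mm.model.mlp.2

    // 2x2 average pooling with stride 2 halves each spatial dimension.
    const int pooled     = n_patch / 2;        // 12
    const int out_tokens = pooled * pooled;    // 144 tokens fed to the LLM

    // The depthwise 3x3 PEG convolution (stride 1, padding 1) preserves the
    // 12x12x2048 shape, so it adds positional information but no tokens.
    printf("input : %d tokens x %d dims\n", n_tokens, n_embd);
    printf("output: %d tokens x %d dims\n", out_tokens, n_embd);
    return 0;
}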
@@ -1177,7 +1211,18 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
vision_model.mm_model_block_2_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight"));
vision_model.mm_model_block_2_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight"));
vision_model.mm_model_block_2_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias"));
- } else {
+ }
+ else if (new_clip->proj_type == PROJECTOR_TYPE_LDPV2)
+ {
+ // MobileVLM_V2 projection
+ vision_model.mm_model_mlp_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 0, "weight"));
+ vision_model.mm_model_mlp_0_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 0, "bias"));
+ vision_model.mm_model_mlp_2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 2, "weight"));
+ vision_model.mm_model_mlp_2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 2, "bias"));
+ vision_model.mm_model_peg_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "weight"));
+ vision_model.mm_model_peg_0_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "bias"));
+ }
+ else {
std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type];
throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
}
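The format() calls above expand the TN_MVLM_PROJ_MLP and TN_MVLM_PROJ_PEG templates into the tensor names that the image-encoder converter writes into the GGUF file. A stand-alone sketch of that expansion using the same printf-style templates; plain snprintf stands in here for clip.cpp's format() helper:

#include <cstdio>

// Same templates as the #defines added earlier in this diff.
#define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s"
#define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s"

int main() {
    char name[128];

    // Names fetched for PROJECTOR_TYPE_LDPV2:
    snprintf(name, sizeof(name), TN_MVLM_PROJ_MLP, 0, "weight");
    printf("%s\n", name);  // mm.model.mlp.0.weight
    snprintf(name, sizeof(name), TN_MVLM_PROJ_MLP, 2, "bias");
    printf("%s\n", name);  // mm.model.mlp.2.bias
    snprintf(name, sizeof(name), TN_MVLM_PROJ_PEG, 0, "weight");
    printf("%s\n", name);  // mm.model.peg.0.weight
    return 0;
}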
@@ -1966,6 +2011,9 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
return ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0];
}
+ if (ctx->proj_type == PROJECTOR_TYPE_LDPV2) {
+ return ctx->vision_model.mm_model_peg_0_b->ne[0];
+ }
if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
return ctx->vision_model.mm_2_b->ne[0];
}
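With this addition, clip_n_mmproj_embd() reports the per-token width the LDPv2 projector emits by reading the PEG bias length, which equals the language model's hidden size. A hedged caller-side sketch of how that value is typically combined with the patch count to size an output embedding buffer; the call site below is illustrative rather than copied from the examples, and the exact signatures should be checked against clip.h.

#include "clip.h"   // clip_ctx, clip_n_patches, clip_n_mmproj_embd
#include <vector>

// Illustrative only: allocate a float buffer large enough for the image
// embeddings produced by whichever projector the loaded model uses.
static std::vector<float> alloc_image_embd(const struct clip_ctx * ctx) {
    const int n_pos  = clip_n_patches(ctx);      // output positions per image
    const int n_embd = clip_n_mmproj_embd(ctx);  // e.g. PEG bias length for LDPv2
    return std::vector<float>(static_cast<size_t>(n_pos) * n_embd, 0.0f);
}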