| author    | John <78893154+cmp-nct@users.noreply.github.com> | 2024-01-27 16:09:18 +0100 |
|-----------|---------------------------------------------------|---------------------------|
| committer | GitHub <noreply@github.com>                       | 2024-01-27 17:09:18 +0200 |
| commit    | 6db2b41a76ee78d5efdd5c3cddd5d7ad3f646855 (patch)   |                           |
| tree      | 5bbe07db083a41aff3f71ab72dc92242b3a80728 /examples/llava/clip.cpp | |
| parent    | 753eafed0ebd07af6903771327a1786a7c02cf98 (diff)    |                           |
llava : support for Yi-VL and fix for mobileVLM (#5093)
* Support for Yi-VL, templating fix for mobileVLM
* ws
* Update examples/llava/clip.cpp
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Update llava-cli.cpp
* Update clip.cpp
bugfix for new conversions
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Diffstat (limited to 'examples/llava/clip.cpp')
-rw-r--r-- | examples/llava/clip.cpp | 70
1 file changed, 63 insertions(+), 7 deletions(-)
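
As a rough illustration of what the new `PROJECTOR_TYPE_MLP_NORM` path in the diff below computes, here is a minimal, self-contained C++ sketch of the Yi-VL projector math (linear → LayerNorm → GELU → linear → LayerNorm), written with `std::vector` instead of ggml tensors. The helper names (`linear`, `layer_norm`, `gelu`, `mlp_norm_proj`) and the row-major weight layout are assumptions made for this sketch, not code from the patch.

```cpp
// Sketch only: the per-token math of the Yi-VL (MLP_NORM) projector.
// Weight/bias pairs correspond to mm_0_*, mm_1_*, mm_3_*, mm_4_* in clip.cpp.
#include <cmath>
#include <cstddef>
#include <vector>

using vec = std::vector<float>;
using mat = std::vector<vec>; // row-major: mat[out][in]

static vec linear(const mat & w, const vec & b, const vec & x) {
    vec y(w.size());
    for (size_t i = 0; i < w.size(); ++i) {
        float acc = b[i];
        for (size_t j = 0; j < x.size(); ++j) acc += w[i][j] * x[j];
        y[i] = acc;
    }
    return y;
}

static vec layer_norm(const vec & x, const vec & gamma, const vec & beta, float eps = 1e-5f) {
    float mean = 0.f, var = 0.f;
    for (float v : x) mean += v;
    mean /= x.size();
    for (float v : x) var += (v - mean) * (v - mean);
    var /= x.size();
    vec y(x.size());
    for (size_t i = 0; i < x.size(); ++i) {
        y[i] = (x[i] - mean) / std::sqrt(var + eps) * gamma[i] + beta[i];
    }
    return y;
}

static vec gelu(vec x) {
    // tanh approximation of GELU (the same approximation ggml_gelu uses)
    for (float & v : x) v = 0.5f * v * (1.f + std::tanh(0.7978845608f * (v + 0.044715f * v * v * v)));
    return x;
}

// Yi-VL projector: linear(0) -> LayerNorm(1) -> GELU -> linear(3) -> LayerNorm(4)
static vec mlp_norm_proj(const vec & x,
                         const mat & w0, const vec & b0,
                         const vec & g1, const vec & b1,
                         const mat & w3, const vec & b3,
                         const vec & g4, const vec & b4) {
    vec h = linear(w0, b0, x);
    h = layer_norm(h, g1, b1);
    h = gelu(h);
    h = linear(w3, b3, h);
    return layer_norm(h, g4, b4);
}
```

In the actual graph the same five steps are expressed with `ggml_mul_mat`, `ggml_norm`, `ggml_mul`/`ggml_add`, and `ggml_gelu`, as the `PROJECTOR_TYPE_MLP_NORM` branch in the diff shows.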
```diff
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index f2cd86af..9129052a 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -98,6 +98,7 @@ static std::string format(const char * fmt, ...) {
 
 enum projector_type {
     PROJECTOR_TYPE_MLP,
+    PROJECTOR_TYPE_MLP_NORM,
     PROJECTOR_TYPE_LDP,
     PROJECTOR_TYPE_UNKNOWN,
 };
@@ -304,10 +305,18 @@ struct clip_vision_model {
     struct ggml_tensor * projection;
 
     // LLaVA projection
-    struct ggml_tensor * mm_0_w;
-    struct ggml_tensor * mm_0_b;
-    struct ggml_tensor * mm_2_w;
-    struct ggml_tensor * mm_2_b;
+    struct ggml_tensor * mm_0_w = NULL;
+    struct ggml_tensor * mm_0_b = NULL;
+    struct ggml_tensor * mm_2_w = NULL;
+    struct ggml_tensor * mm_2_b = NULL;
+
+    // Yi type models with mlp+normalization projection
+    struct ggml_tensor * mm_1_w = NULL; // Yi type models have 0, 1, 3, 4
+    struct ggml_tensor * mm_1_b = NULL;
+    struct ggml_tensor * mm_3_w = NULL;
+    struct ggml_tensor * mm_3_b = NULL;
+    struct ggml_tensor * mm_4_w = NULL;
+    struct ggml_tensor * mm_4_b = NULL;
 
     // MobileVLM projection
     struct ggml_tensor * mm_model_mlp_1_w;
@@ -460,6 +469,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     // pre-layernorm
     {
         embeddings = ggml_norm(ctx0, embeddings, eps);
+        ggml_set_name(embeddings, "pre_ln");
 
         embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.pre_ln_w), model.pre_ln_b);
     }
@@ -575,6 +585,27 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
 
         embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
         embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
+
+    } else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
+        embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
+        embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
+        // ggml_tensor_printf(embeddings, "mm_0_w",0,true,false);
+        // First LayerNorm
+        embeddings = ggml_norm(ctx0, embeddings, eps);
+        embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_1_w),
+                    model.mm_1_b);
+
+        // GELU activation
+        embeddings = ggml_gelu(ctx0, embeddings);
+
+        // Second linear layer
+        embeddings = ggml_mul_mat(ctx0, model.mm_3_w, embeddings);
+        embeddings = ggml_add(ctx0, embeddings, model.mm_3_b);
+
+        // Second LayerNorm
+        embeddings = ggml_norm(ctx0, embeddings, eps);
+        embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_4_w),
+                    model.mm_4_b);
 
     } else if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
         // MobileVLM projector
@@ -808,6 +839,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         else {
            new_clip->proj_type = PROJECTOR_TYPE_MLP;
         }
+        if (new_clip->proj_type == PROJECTOR_TYPE_MLP) {
+            if (gguf_find_tensor(ctx, format(TN_LLAVA_PROJ, 3, "weight").c_str()) != -1) {
+                new_clip->proj_type = PROJECTOR_TYPE_MLP_NORM;
+            }
+        }
     }
 
 #ifdef GGML_USE_CUBLAS
@@ -956,11 +992,29 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
 
         // LLaVA projection
-        if (new_clip->proj_type == PROJECTOR_TYPE_MLP) {
+        if (new_clip->proj_type == PROJECTOR_TYPE_MLP || new_clip->proj_type == PROJECTOR_TYPE_MLP_NORM) {
             vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
             vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias"));
-            vision_model.mm_2_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
-            vision_model.mm_2_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
+            try {
+                // Yi-type llava
+                vision_model.mm_1_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "weight"));
+                vision_model.mm_1_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "bias"));
+            } catch (std::runtime_error & e) { }
+            try {
+                // missing in Yi-type llava
+                vision_model.mm_2_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
+                vision_model.mm_2_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
+            } catch (std::runtime_error & e) { }
+            try {
+                // Yi-type llava
+                vision_model.mm_3_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "weight"));
+                vision_model.mm_3_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "bias"));
+            } catch (std::runtime_error & e) { }
+            try {
+                // Yi-type llava
+                vision_model.mm_4_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "weight"));
+                vision_model.mm_4_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "bias"));
+            } catch (std::runtime_error & e) { }
         } else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
             // MobileVLM projection
@@ -1432,6 +1486,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
     }
     else if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
         return ctx->vision_model.mm_2_b->ne[0];
+    } else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
+        return ctx->vision_model.mm_3_b->ne[0];
     } else {
         std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
```
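
A note on the loading strategy above: classic LLaVA checkpoints ship the projector tensors at TN_LLAVA_PROJ indices 0 and 2, while Yi-VL checkpoints ship indices 0, 1, 3, and 4 (the extra LayerNorm and second linear), so each lookup is wrapped in a try/catch and tensors that are absent simply stay NULL; finding an index-3 weight is what promotes the projector type to `PROJECTOR_TYPE_MLP_NORM`. The sketch below restates that pattern with a hypothetical `try_get` helper, which does not exist in clip.cpp, where the try/catch is written inline.

```cpp
// Hypothetical helper illustrating the optional-tensor pattern from the patch;
// clip.cpp writes the equivalent try/catch inline around get_tensor().
#include <functional>
#include <stdexcept>

struct ggml_tensor; // opaque: only the pointer is handled here

static ggml_tensor * try_get(const std::function<ggml_tensor * ()> & load) {
    try {
        return load();   // e.g. get_tensor(ctx, format(TN_LLAVA_PROJ, 1, "weight"))
    } catch (const std::runtime_error &) {
        return nullptr;  // tensor not present in this GGUF -> leave the slot unset
    }
}
```

Leaving the missing slots as NULL is what lets a single MLP branch serve both layouts; `clip_n_mmproj_embd` then reads the output width from `mm_2_b` for the classic MLP projector and from `mm_3_b` for the MLP_NORM variant.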