llava : support for Yi-VL and fix for mobileVLM (#5093)

* Support for Yi-VL, templating fix for mobileVLM * ws * Update examples/llava/clip.cpp Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * Update llava-cli.cpp * Update clip.cpp bugfix for new conversions --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
author: John <78893154+cmp-nct@users.noreply.github.com> 2024-01-27 16:09:18 +0100
committer: GitHub <noreply@github.com> 2024-01-27 17:09:18 +0200
commit: 6db2b41a76ee78d5efdd5c3cddd5d7ad3f646855 (patch)
tree: 5bbe07db083a41aff3f71ab72dc92242b3a80728 /examples/llava/clip.cpp
parent: 753eafed0ebd07af6903771327a1786a7c02cf98 (diff)
1 files changed, 63 insertions, 7 deletions
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index f2cd86af..9129052a 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -98,6 +98,7 @@ static std::string format(const char * fmt, ...) {
 
 enum projector_type {
     PROJECTOR_TYPE_MLP,
+    PROJECTOR_TYPE_MLP_NORM,
     PROJECTOR_TYPE_LDP,
     PROJECTOR_TYPE_UNKNOWN,
 };
@@ -304,10 +305,18 @@ struct clip_vision_model {
     struct ggml_tensor * projection;
 
     // LLaVA projection
-    struct ggml_tensor * mm_0_w;
-    struct ggml_tensor * mm_0_b;
-    struct ggml_tensor * mm_2_w;
-    struct ggml_tensor * mm_2_b;
+    struct ggml_tensor * mm_0_w = NULL;
+    struct ggml_tensor * mm_0_b = NULL;
+    struct ggml_tensor * mm_2_w = NULL;
+    struct ggml_tensor * mm_2_b = NULL;
+
+    // Yi type models with mlp+normalization projection
+    struct ggml_tensor * mm_1_w = NULL; // Yi type models have 0, 1, 3, 4
+    struct ggml_tensor * mm_1_b = NULL;
+    struct ggml_tensor * mm_3_w = NULL;
+    struct ggml_tensor * mm_3_b = NULL;
+    struct ggml_tensor * mm_4_w = NULL;
+    struct ggml_tensor * mm_4_b = NULL;
 
     // MobileVLM projection
     struct ggml_tensor * mm_model_mlp_1_w;
@@ -460,6 +469,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     // pre-layernorm
     {
         embeddings = ggml_norm(ctx0, embeddings, eps);
+        ggml_set_name(embeddings, "pre_ln");
 
         embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.pre_ln_w), model.pre_ln_b);
     }
@@ -575,6 +585,27 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
 
             embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
             embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
+
+        } else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
+            embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
+            embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
+            // ggml_tensor_printf(embeddings, "mm_0_w",0,true,false);
+            // First LayerNorm
+            embeddings = ggml_norm(ctx0, embeddings, eps);
+            embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_1_w),
+                                model.mm_1_b);
+
+            // GELU activation
+            embeddings = ggml_gelu(ctx0, embeddings);
+
+            // Second linear layer
+            embeddings = ggml_mul_mat(ctx0, model.mm_3_w, embeddings);
+            embeddings = ggml_add(ctx0, embeddings, model.mm_3_b);
+
+            // Second LayerNorm
+            embeddings = ggml_norm(ctx0, embeddings, eps);
+            embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_4_w),
+                                model.mm_4_b);
         }
         else if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
             // MobileVLM projector
@@ -808,6 +839,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         else {
             new_clip->proj_type = PROJECTOR_TYPE_MLP;
         }
+        if (new_clip->proj_type == PROJECTOR_TYPE_MLP) {
+            if (gguf_find_tensor(ctx, format(TN_LLAVA_PROJ, 3, "weight").c_str()) != -1) {
+                new_clip->proj_type = PROJECTOR_TYPE_MLP_NORM;
+            }
+        }
     }
 
 #ifdef GGML_USE_CUBLAS
@@ -956,11 +992,29 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         vision_model.pre_ln_b            = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
 
         // LLaVA projection
-        if (new_clip->proj_type == PROJECTOR_TYPE_MLP) {
+        if (new_clip->proj_type == PROJECTOR_TYPE_MLP || new_clip->proj_type == PROJECTOR_TYPE_MLP_NORM) {
             vision_model.mm_0_w              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
             vision_model.mm_0_b              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias"));
-            vision_model.mm_2_w              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
-            vision_model.mm_2_b              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
+            try {
+                // Yi-type llava
+                vision_model.mm_1_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "weight"));
+                vision_model.mm_1_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "bias"));
+            } catch (std::runtime_error & e) {  }
+            try {
+                // missing in Yi-type llava
+                vision_model.mm_2_w              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
+                vision_model.mm_2_b              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
+            } catch (std::runtime_error & e) {  }
+            try {
+                // Yi-type llava
+                vision_model.mm_3_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "weight"));
+                vision_model.mm_3_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "bias"));
+            } catch (std::runtime_error & e) {  }
+            try {
+                // Yi-type llava
+                vision_model.mm_4_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "weight"));
+                vision_model.mm_4_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "bias"));
+            } catch (std::runtime_error & e) {  }
         }
         else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
             // MobileVLM projection
@@ -1432,6 +1486,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
     }
     else if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
         return ctx->vision_model.mm_2_b->ne[0];
+    } else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
+        return ctx->vision_model.mm_3_b->ne[0];
     }
     else {
         std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
author	John <78893154+cmp-nct@users.noreply.github.com>	2024-01-27 16:09:18 +0100
committer	GitHub <noreply@github.com>	2024-01-27 17:09:18 +0200
commit	6db2b41a76ee78d5efdd5c3cddd5d7ad3f646855 (patch)
tree	5bbe07db083a41aff3f71ab72dc92242b3a80728 /examples/llava/clip.cpp
parent	753eafed0ebd07af6903771327a1786a7c02cf98 (diff)