author | XiaotaoChen <chenxiaotao1234@gmail.com> | 2024-01-22 21:09:35 +0800
committer | GitHub <noreply@github.com> | 2024-01-22 15:09:35 +0200
commit | 3ce7e8f8e7ccfce07e5947ac5f1f3f4628cf68ea (patch)
tree | 75c5f7d2eb2e6df853fe1fa8cb7119f3178a59a3 /examples/llava/convert-image-encoder-to-gguf.py
parent | b2d80e105a59b54822edf7ce7f3ed5f317e96e21 (diff)
llava : MobileVLM support (#4954)
* MobileVLM native implementation
* delete the depthwise_conv_2d and permute_cpy related code and replace both with existing functions; optimize the ldp definition; support the LLAMA_PERF option for CMake
* move the Android script to the examples/llava directory
* Fix the editor config checks
---------
Co-authored-by: Chenxiaotao03 <chenxiaotao03@meituan.com>
Diffstat (limited to 'examples/llava/convert-image-encoder-to-gguf.py')
-rw-r--r-- | examples/llava/convert-image-encoder-to-gguf.py | 6
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/examples/llava/convert-image-encoder-to-gguf.py b/examples/llava/convert-image-encoder-to-gguf.py
index 03688e0e..f5a3c9b4 100644
--- a/examples/llava/convert-image-encoder-to-gguf.py
+++ b/examples/llava/convert-image-encoder-to-gguf.py
@@ -81,6 +81,7 @@ ap.add_argument("--vision-only", action="store_true", required=False,
 ap.add_argument("--clip_model_is_vision", action="store_true", required=False,
                 help="The clip model is a pure vision model (ShareGPT4V vision extract for example)")
 ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.")
+ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp", choices=["mlp", "ldp"], default="mlp")
 ap.add_argument("--image-mean", nargs=3, type=float, required=False, help="Override image mean values")
 ap.add_argument("--image-std", nargs=3, type=float, required=False, help="Override image std values")
 ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
@@ -174,6 +175,8 @@ elif args.vision_only and not has_llava_projector:
     fout.add_description("vision-only CLIP model")
 elif has_llava_projector:
     fout.add_description("image encoder for LLaVA")
+    # add projector type
+    fout.add_string("clip.projector_type", args.projector_type)
 else:
     fout.add_description("two-tower CLIP model")
 
@@ -218,7 +221,8 @@ if has_llava_projector:
     projector = torch.load(args.llava_projector)
     for name, data in projector.items():
         name = get_tensor_name(name)
-        if data.ndim == 2:
+        # pw and dw conv ndim==4
+        if data.ndim == 2 or data.ndim == 4:
             data = data.squeeze().numpy().astype(np.float16)
         else:
             data = data.squeeze().numpy().astype(np.float32)
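The first two hunks tie the new CLI flag to the GGUF metadata: `--projector-type` is restricted to `mlp` or `ldp`, and its value is written under the `clip.projector_type` string key, which is presumably how the loader tells a standard LLaVA MLP projector apart from MobileVLM's LDP projector. The sketch below shows that flag-to-key pattern in isolation, assuming the `gguf` Python package (gguf-py) is installed; the output filename is made up for illustration and the real converter builds its writer and other metadata differently.

```python
import argparse

from gguf import GGUFWriter  # gguf-py

ap = argparse.ArgumentParser()
# Same restricted flag the converter adds.
ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp",
                choices=["mlp", "ldp"], default="mlp")
args = ap.parse_args()

# Illustrative output path; the real script derives paths from the model directory.
fout = GGUFWriter("projector-meta-demo.gguf", arch="clip")
fout.add_string("clip.projector_type", args.projector_type)

fout.write_header_to_file()
fout.write_kv_data_to_file()
fout.close()
```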
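The last hunk is the part that actually unblocks MobileVLM: the LDP projector contains depthwise and pointwise convolution weights with `ndim == 4`, which previously fell through to the float32 branch. Below is a minimal, self-contained sketch of that selection rule; the helper name and the example tensor shapes are hypothetical, and only the 2-or-4-dims-to-fp16 logic mirrors the converter.

```python
import numpy as np
import torch

def projector_tensor_to_numpy(data: torch.Tensor) -> np.ndarray:
    # 2-D linear weights and 4-D depthwise/pointwise conv weights are
    # squeezed and stored as float16; everything else (e.g. 1-D biases
    # and norm parameters) stays float32.
    if data.ndim == 2 or data.ndim == 4:
        return data.squeeze().numpy().astype(np.float16)
    return data.squeeze().numpy().astype(np.float32)

# Hypothetical projector tensors, shapes chosen only to exercise the branches:
linear_w  = torch.randn(4096, 1024)      # MLP projector weight  -> float16
dw_conv_w = torch.randn(2048, 1, 3, 3)   # depthwise conv weight -> float16
bias      = torch.randn(2048)            # 1-D bias              -> float32

for t in (linear_w, dw_conv_w, bias):
    print(tuple(t.shape), "->", projector_tensor_to_numpy(t).dtype)
```

Note that `squeeze()` also drops the singleton dimension of the depthwise weight, (2048, 1, 3, 3) becoming (2048, 3, 3), so whatever consumes the resulting GGUF file has to know the expected layout of the LDP tensors.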