author | XiaotaoChen <chenxiaotao1234@gmail.com> | 2024-01-22 21:09:35 +0800
committer | GitHub <noreply@github.com> | 2024-01-22 15:09:35 +0200
commit | 3ce7e8f8e7ccfce07e5947ac5f1f3f4628cf68ea (patch)
tree | 75c5f7d2eb2e6df853fe1fa8cb7119f3178a59a3 /examples/llava/convert-image-encoder-to-gguf.py
parent | b2d80e105a59b54822edf7ce7f3ed5f317e96e21 (diff)
llava : MobileVLM support (#4954)
* MobileVLM native implementation
* delete the depthwise_conv_2d and permute_cpy related code and replace both with existing functions; optimize the ldp definition; support the LLAMA_PERF option for CMake
* move the Android script to the examples/llava directory
* Fix the editor config checks
---------
Co-authored-by: Chenxiaotao03 <chenxiaotao03@meituan.com>
Diffstat (limited to 'examples/llava/convert-image-encoder-to-gguf.py')
-rw-r--r-- | examples/llava/convert-image-encoder-to-gguf.py | 6
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/examples/llava/convert-image-encoder-to-gguf.py b/examples/llava/convert-image-encoder-to-gguf.py
index 03688e0e..f5a3c9b4 100644
--- a/examples/llava/convert-image-encoder-to-gguf.py
+++ b/examples/llava/convert-image-encoder-to-gguf.py
@@ -81,6 +81,7 @@ ap.add_argument("--vision-only", action="store_true", required=False,
 ap.add_argument("--clip_model_is_vision", action="store_true", required=False,
                 help="The clip model is a pure vision model (ShareGPT4V vision extract for example)")
 ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.")
+ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp", choices=["mlp", "ldp"], default="mlp")
 ap.add_argument("--image-mean", nargs=3, type=float, required=False, help="Override image mean values")
 ap.add_argument("--image-std", nargs=3, type=float, required=False, help="Override image std values")
 ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
@@ -174,6 +175,8 @@ elif args.vision_only and not has_llava_projector:
     fout.add_description("vision-only CLIP model")
 elif has_llava_projector:
     fout.add_description("image encoder for LLaVA")
+    # add projector type
+    fout.add_string("clip.projector_type", args.projector_type)
 else:
     fout.add_description("two-tower CLIP model")
 
@@ -218,7 +221,8 @@ if has_llava_projector:
     projector = torch.load(args.llava_projector)
     for name, data in projector.items():
         name = get_tensor_name(name)
-        if data.ndim == 2:
+        # pw and dw conv ndim==4
+        if data.ndim == 2 or data.ndim == 4:
             data = data.squeeze().numpy().astype(np.float16)
         else:
             data = data.squeeze().numpy().astype(np.float32)
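The first two hunks tie the new CLI flag to the GGUF metadata: `--projector-type` is restricted to `mlp` or `ldp`, and its value is written under the `clip.projector_type` string key, which is presumably how the loader tells a standard LLaVA MLP projector apart from MobileVLM's LDP projector. The sketch below shows that flag-to-key pattern in isolation, assuming the `gguf` Python package (gguf-py) is installed; the output filename is made up for illustration and the real converter builds its writer and other metadata differently.

```python
import argparse

from gguf import GGUFWriter  # gguf-py

ap = argparse.ArgumentParser()
# Same restricted flag the converter adds.
ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp",
                choices=["mlp", "ldp"], default="mlp")
args = ap.parse_args()

# Illustrative output path; the real script derives paths from the model directory.
fout = GGUFWriter("projector-meta-demo.gguf", arch="clip")
fout.add_string("clip.projector_type", args.projector_type)

fout.write_header_to_file()
fout.write_kv_data_to_file()
fout.close()
```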
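The last hunk is the part that actually unblocks MobileVLM: the LDP projector contains depthwise and pointwise convolution weights with `ndim == 4`, which previously fell through to the float32 branch. Below is a minimal, self-contained sketch of that selection rule; the helper name and the example tensor shapes are hypothetical, and only the 2-or-4-dims-to-fp16 logic mirrors the converter.

```python
import numpy as np
import torch

def projector_tensor_to_numpy(data: torch.Tensor) -> np.ndarray:
    # 2-D linear weights and 4-D depthwise/pointwise conv weights are
    # squeezed and stored as float16; everything else (e.g. 1-D biases
    # and norm parameters) stays float32.
    if data.ndim == 2 or data.ndim == 4:
        return data.squeeze().numpy().astype(np.float16)
    return data.squeeze().numpy().astype(np.float32)

# Hypothetical projector tensors, shapes chosen only to exercise the branches:
linear_w  = torch.randn(4096, 1024)      # MLP projector weight  -> float16
dw_conv_w = torch.randn(2048, 1, 3, 3)   # depthwise conv weight -> float16
bias      = torch.randn(2048)            # 1-D bias              -> float32

for t in (linear_w, dw_conv_w, bias):
    print(tuple(t.shape), "->", projector_tensor_to_numpy(t).dtype)
```

Note that `squeeze()` also drops the singleton dimension of the depthwise weight, (2048, 1, 3, 3) becoming (2048, 3, 3), so whatever consumes the resulting GGUF file has to know the expected layout of the LDP tensors.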