diff options
Diffstat (limited to 'examples/llava/convert-image-encoder-to-gguf.py')
-rw-r--r-- | examples/llava/convert-image-encoder-to-gguf.py | 66 |
1 files changed, 59 insertions, 7 deletions
diff --git a/examples/llava/convert-image-encoder-to-gguf.py b/examples/llava/convert-image-encoder-to-gguf.py index e204b56b..c69f89ac 100644 --- a/examples/llava/convert-image-encoder-to-gguf.py +++ b/examples/llava/convert-image-encoder-to-gguf.py @@ -78,18 +78,19 @@ ap.add_argument("--text-only", action="store_true", required=False, help="Save a text-only model. It can't be used to encode images") ap.add_argument("--vision-only", action="store_true", required=False, help="Save a vision-only model. It can't be used to encode texts") -ap.add_argument("--clip_model_is_vision", action="store_true", required=False, +ap.add_argument("--clip-model-is-vision", action="store_true", required=False, help="The clip model is a pure vision model (ShareGPT4V vision extract for example)") +ap.add_argument("--clip-model-is-openclip", action="store_true", required=False, + help="The clip model is from openclip (for ViT-SO400M type))") ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.") ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp", choices=["mlp", "ldp"], default="mlp") -ap.add_argument("--image-mean", nargs=3, type=float, required=False, help="Override image mean values") -ap.add_argument("--image-std", nargs=3, type=float, required=False, help="Override image std values") ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None) # Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711 +# Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5 default_image_mean = [0.48145466, 0.4578275, 0.40821073] default_image_std = [0.26862954, 0.26130258, 0.27577711] -ap.add_argument('--image_mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None) -ap.add_argument('--image_std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None) +ap.add_argument('--image-mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None) +ap.add_argument('--image-std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None) # with proper args = ap.parse_args() @@ -105,7 +106,7 @@ if args.use_f32: # output in the same directory as the model if output_dir is None dir_model = args.model_dir -if args.clip_model_is_vision: +if args.clip_model_is_vision or not os.path.exists(dir_model + "/vocab.json") or args.clip_model_is_openclip: vocab = None tokens = None else: @@ -133,7 +134,7 @@ ftype = 1 if args.use_f32: ftype = 0 -if args.clip_model_is_vision: +if args.clip_model_is_vision or args.clip_model_is_openclip: model = CLIPVisionModel.from_pretrained(dir_model) processor = None else: @@ -202,6 +203,57 @@ if has_vision_encoder: fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), v_hparams["layer_norm_eps"]) block_count = v_hparams["num_hidden_layers"] - 1 if has_llava_projector else v_hparams["num_hidden_layers"] fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count) + # /** + # "image_grid_pinpoints": [ + # [ + # 336, + # 672 + # ], + # [ + # 672, + # 336 + # ], + # [ + # 672, + # 672 + # ], + # [ + # 1008, + # 336 + # ], + # [ + # 336, + # 1008 + # ] + # ], + # Flattened: + # [ + # 336, 672, + # 672, 336, + # 672, 672, + # 1008, 336, + # 336, 1008 + # ] + # * + # */ + if "image_grid_pinpoints" in v_hparams: + # flatten it + image_grid_pinpoints = [] + for pinpoint in v_hparams["image_grid_pinpoints"]: + for p in pinpoint: + image_grid_pinpoints.append(p) + fout.add_array("clip.vision.image_grid_pinpoints", image_grid_pinpoints) + if "image_crop_resolution" in v_hparams: + fout.add_uint32("clip.vision.image_crop_resolution", v_hparams["image_crop_resolution"]) + if "image_aspect_ratio" in v_hparams: + fout.add_string("clip.vision.image_aspect_ratio", v_hparams["image_aspect_ratio"]) + if "image_split_resolution" in v_hparams: + fout.add_uint32("clip.vision.image_split_resolution", v_hparams["image_split_resolution"]) + if "mm_patch_merge_type" in v_hparams: + fout.add_string("clip.vision.mm_patch_merge_type", v_hparams["mm_patch_merge_type"]) + if "mm_projector_type" in v_hparams: + fout.add_string("clip.vision.mm_projector_type", v_hparams["mm_projector_type"]) + if processor is not None: image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean |