diff options
author | M. Yusuf Sarıgöz <yusufsarigoz@gmail.com> | 2023-10-12 18:23:18 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-10-12 18:23:18 +0300 |
commit | 370359e5baf619f3a8d461023143d1494b1e8fde (patch) | |
tree | acfd94911cdb83780f7afc3a703b8abb31aa00e2 /examples/llava/clip.h | |
parent | 9e24cc6e2e589d405bd1720c400f5b0b9d0ca3ee (diff) |
examples: support LLaVA v1.5 (multimodal model) (#3436)
* WIP: start implementing LLaVA
* rm scratch buf for now, will revert after cleanup
* LLaVA image encoder is working. will combine with llama
* Add llava inference code, but it's buggy. debugging
* LLaVA is working e2e, needs to optimize memory allocation + cleanup
* Use ggml_allocr + rm unnecessary code
* fix: crlf -> lf
* fix: new line at EoF
* fix: trailing whitespace
* Add readme
* Update readme
* Some cleanup
* Are you happy editorconfig?
* rm unused batch image preprocessing
* rm unused import
* fix: rm designated initializers
* introduce pad-to-square mode for non-square images
* are you happy editorconfig?
* gitignore /llava
* Handle cases where image file does not exist
* add llava target to Makefile
* add support for 13b model variant
* Maybe seed is unlucky?
* Check if apples are compared to apples
* are you happy editorconfig?
* Use temperature = 0.1 by default
* command line: use gpt_params_parse()
* minor
* handle default n_predict
* fix typo
* llava : code formatting, rename files, fix compile warnings
* do not use Wno-cast-qual for MSVC
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Diffstat (limited to 'examples/llava/clip.h')
-rw-r--r-- | examples/llava/clip.h | 73 |
1 files changed, 73 insertions, 0 deletions
diff --git a/examples/llava/clip.h b/examples/llava/clip.h new file mode 100644 index 00000000..3d7261e2 --- /dev/null +++ b/examples/llava/clip.h @@ -0,0 +1,73 @@ +#ifndef CLIP_H +#define CLIP_H + +#include "ggml.h" + +struct clip_ctx; + +#ifdef __cplusplus +extern "C" { +#endif + +struct clip_vision_hparams { + int32_t image_size; + int32_t patch_size; + int32_t hidden_size; + int32_t n_intermediate; + int32_t projection_dim; + int32_t n_head; + int32_t n_layer; + float eps; +}; + +struct clip_ctx * clip_model_load(const char * fname, const int verbosity); + +void clip_free(struct clip_ctx * ctx); + +size_t clip_embd_nbytes(struct clip_ctx * ctx); +int clip_n_patches(struct clip_ctx * ctx); +int clip_n_mmproj_embd(struct clip_ctx * ctx); + +// RGB uint8 image +struct clip_image_u8 { + int nx; + int ny; + uint8_t * data; + size_t size; +}; + +// RGB float32 image (NHWC) +// Memory layout: RGBRGBRGB... +struct clip_image_f32 { + int nx; + int ny; + float * data; + size_t size; +}; + +struct clip_image_u8_batch { + struct clip_image_u8 * data; + size_t size; +}; + +struct clip_image_f32_batch { + struct clip_image_f32 * data; + size_t size; +}; + +struct clip_image_u8 * make_clip_image_u8(); +struct clip_image_f32 * make_clip_image_f32(); +bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img); +bool clip_image_preprocess(const struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32 * res, const bool pad2square); +bool clip_image_encode(const struct clip_ctx * ctx, const int n_threads, struct clip_image_f32 * img, float * vec); + +bool clip_image_batch_encode(const struct clip_ctx * ctx, const int n_threads, const struct clip_image_f32_batch * imgs, + float * vec); + +bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype); + +#ifdef __cplusplus +} +#endif + +#endif // CLIP_H |