diff options
Diffstat (limited to 'examples/llava/llava.cpp')
-rw-r--r-- | examples/llava/llava.cpp | 230 |
1 files changed, 111 insertions, 119 deletions
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index f0974d5b..d10bcf2d 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -1,164 +1,156 @@ #include "clip.h" -#include "llava-utils.h" #include "common.h" #include "llama.h" +#include "llava.h" #include <cstdio> #include <cstdlib> #include <vector> -static void show_additional_info(int /*argc*/, char ** argv) { - printf("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]); - printf(" note: a lower temperature value like 0.1 is recommended for better quality.\n"); -} - -int main(int argc, char ** argv) { - ggml_time_init(); - - gpt_params params; +#include "base64.hpp" - if (!gpt_params_parse(argc, argv, params)) { - show_additional_info(argc, argv); - return 1; +static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) { + clip_image_f32 * img_res = make_clip_image_f32(); + if (!clip_image_preprocess(ctx_clip, img, img_res, /*pad2square =*/ true)) { + fprintf(stderr, "%s: unable to preprocess image\n", __func__); + clip_image_f32_free(img_res); + return false; } - if (params.mmproj.empty() || params.image.empty()) { - gpt_print_usage(argc, argv, params); - show_additional_info(argc, argv); - return 1; - } + *n_img_pos = clip_n_patches(ctx_clip); - const char * clip_path = params.mmproj.c_str(); - const char * img_path = params.image.c_str(); + const int64_t t_img_enc_start_us = ggml_time_us(); + bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd); + clip_image_f32_free(img_res); + if (!encoded) { + fprintf(stderr, "Unable to encode image\n"); - if (params.prompt.empty()) { - params.prompt = "describe the image in detail."; + return false; } - auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1); - - // load and preprocess the image - clip_image_u8 img; - clip_image_f32 img_res; + const int64_t t_img_enc_end_us = ggml_time_us(); + float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0; - if (!clip_image_load_from_file(img_path, &img)) { - fprintf(stderr, "%s: is %s really an image file?\n", __func__, img_path); + printf("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos); - clip_free(ctx_clip); - return 1; - } - - if (!clip_image_preprocess(ctx_clip, &img, &img_res, /*pad2square =*/ true)) { - fprintf(stderr, "%s: unable to preprocess %s\n", __func__, img_path); + return true; +} - clip_free(ctx_clip); - return 1; +bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip) { + // make sure that the correct mmproj was used, i.e., compare apples to apples + int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama)); + auto n_image_embd = clip_n_mmproj_embd(ctx_clip); + if (n_image_embd != n_llama_embd) { + printf("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd); + return false; } + return true; +} - int n_img_pos = clip_n_patches(ctx_clip); - int n_img_embd = clip_n_mmproj_embd(ctx_clip); - +static bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) { float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)); - if (!image_embd) { fprintf(stderr, "Unable to allocate memory for image embeddings\n"); - - return 1; + free(image_embd); + return false; } - const int64_t t_img_enc_start_us = ggml_time_us(); - if (!clip_image_encode(ctx_clip, params.n_threads, &img_res, image_embd)) { - fprintf(stderr, "Unable to encode image\n"); - - return 1; + int n_img_pos; + if (!encode_image_with_clip(ctx_clip, n_threads, img, image_embd, &n_img_pos)) { + fprintf(stderr, "%s: cannot encode image, aborting\n", __func__); + free(image_embd); + return false; } - const int64_t t_img_enc_end_us = ggml_time_us(); + *image_embd_out = image_embd; + *n_img_pos_out = n_img_pos; - // we get the embeddings, free up the memory required for CLIP - clip_free(ctx_clip); - - llama_backend_init(params.numa); - - llama_model_params model_params = llama_model_default_params(); - model_params.n_gpu_layers = params.n_gpu_layers; - model_params.main_gpu = params.main_gpu; - model_params.tensor_split = params.tensor_split; - model_params.use_mmap = params.use_mmap; - model_params.use_mlock = params.use_mlock; + return true; +} - llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); - if (model == NULL) { - fprintf(stderr , "%s: error: unable to load model\n" , __func__); - return 1; +bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed, int n_batch, int * n_past) { + int n_embd = llama_n_embd(llama_get_model(ctx_llama)); + + for (int i = 0; i < image_embed->n_image_pos; i += n_batch) { + int n_eval = image_embed->n_image_pos - i; + if (n_eval > n_batch) { + n_eval = n_batch; + } + llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, }; + if (llama_decode(ctx_llama, batch)) { + fprintf(stderr, "%s : failed to eval\n", __func__); + return false; + } + *n_past += n_eval; } + return true; +} - llama_context_params ctx_params = llama_context_default_params(); - - ctx_params.n_ctx = params.n_ctx < 2048 ? 2048 : params.n_ctx; // we need a longer context size to process image embeddings - ctx_params.n_threads = params.n_threads; - ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch; - ctx_params.seed = params.seed; - - llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params); - - if (ctx_llama == NULL) { - fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__); - return 1; +LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length) { + clip_image_u8 * img = make_clip_image_u8(); + if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) { + clip_image_u8_free(img); + fprintf(stderr, "%s: can't load image from bytes, is it a valid image?", __func__); + return NULL; } - // make sure that the correct mmproj was used, i.e., compare apples to apples - const int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama)); - - if (n_img_embd != n_llama_embd) { - printf("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_img_embd, n_llama_embd); - - llama_free(ctx_llama); - llama_free_model(model); - llama_backend_free(); - free(image_embd); - - return 1; + float* image_embed = NULL; + int n_image_pos = 0; + bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, img, &image_embed, &n_image_pos); + if (!image_embed_result) { + clip_image_u8_free(img); + fprintf(stderr, "%s: coulnd't embed the image\n", __func__); + return NULL; } - // process the prompt - // llava chat format is "<system_prompt>USER: <image_embeddings>\n<textual_prompt>\nASSISTANT:" - - int n_past = 0; - - const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict; - - eval_string(ctx_llama, "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:", params.n_batch, &n_past, true); - eval_image_embd(ctx_llama, image_embd, n_img_pos, params.n_batch, &n_past); - eval_string(ctx_llama, (params.prompt + "\nASSISTANT:").c_str(), params.n_batch, &n_past, false); - - // generate the response + clip_image_u8_free(img); + auto result = (llava_image_embed*)malloc(sizeof(llava_image_embed)); + result->embed = image_embed; + result->n_image_pos = n_image_pos; + return result; +} - printf("\n"); - printf("prompt: '%s'\n", params.prompt.c_str()); - printf("\n"); +static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long *sizeOut) { + auto file = fopen(path, "rb"); + if (file == NULL) { + fprintf(stderr, "%s: can't read file %s\n", __func__, path); + return false; + } - for (int i = 0; i < max_tgt_len; i++) { - const char * tmp = sample(ctx_llama, params, &n_past); - if (strcmp(tmp, "</s>") == 0) break; + fseek(file, 0, SEEK_END); + auto fileSize = ftell(file); + fseek(file, 0, SEEK_SET); - printf("%s", tmp); - fflush(stdout); + auto buffer = (unsigned char *)malloc(fileSize); // Allocate memory to hold the file data + if (buffer == NULL) { + fprintf(stderr, "%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path); + perror("Memory allocation error"); + fclose(file); + return false; } + fread(buffer, 1, fileSize, file); // Read the file into the buffer + fclose(file); // Close the file - printf("\n"); - - { - const float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0; + *bytesOut = buffer; + *sizeOut = fileSize; + return true; +} - printf("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / n_img_pos); +LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path) { + unsigned char* image_bytes; + long image_bytes_length; + auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length); + if (!loaded) { + fprintf(stderr, "%s: failed to load %s\n", __func__, image_path); + return NULL; } - llama_print_timings(ctx_llama); + auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, image_bytes, image_bytes_length); + free(image_bytes); - llama_free(ctx_llama); - llama_free_model(model); - llama_backend_free(); - free(image_embd); + return embed; +} - return 0; +LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed) { + free(embed->embed); + free(embed); } |