author | John <78893154+cmp-nct@users.noreply.github.com> | 2024-01-27 16:09:18 +0100
---|---|---
committer | GitHub <noreply@github.com> | 2024-01-27 17:09:18 +0200
commit | 6db2b41a76ee78d5efdd5c3cddd5d7ad3f646855 (patch) |
tree | 5bbe07db083a41aff3f71ab72dc92242b3a80728 /examples/llava/llava-cli.cpp |
parent | 753eafed0ebd07af6903771327a1786a7c02cf98 (diff) |
llava : support for Yi-VL and fix for mobileVLM (#5093)
* Support for Yi-VL, templating fix for mobileVLM
* ws
* Update examples/llava/clip.cpp
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Update llava-cli.cpp
* Update clip.cpp
bugfix for new conversions
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
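The templating mode added here lets the full conversation template be passed on the command line, with <image> marking where the image embeddings are inserted. A hypothetical invocation for a Yi-VL-style template (model paths and template wording are illustrative only, not taken from the commit; -e makes llava-cli process escape sequences in the prompt):

    ./llava-cli -m <model.gguf> --mmproj <mmproj.gguf> --image photo.jpg \
        -e -p "You are a helpful assistant.\n### Human: <image>\nWhat is in the photo?\n### Assistant:"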
Diffstat (limited to 'examples/llava/llava-cli.cpp')
-rw-r--r-- | examples/llava/llava-cli.cpp | 32
1 file changed, 29 insertions(+), 3 deletions(-)
diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp
index d94795fe..6ac70ba6 100644
--- a/examples/llava/llava-cli.cpp
+++ b/examples/llava/llava-cli.cpp
@@ -148,10 +148,35 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
     const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
     const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx_llava->ctx_llama));
 
-    // llava chat format is "<system_prompt>\nUSER:<image_embeddings>\n<textual_prompt>\nASSISTANT:"
-    eval_string(ctx_llava->ctx_llama, "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:", params->n_batch, &n_past, add_bos);
+    std::string system_prompt, user_prompt;
+    size_t image_pos = prompt.find("<image>");
+    if (image_pos != std::string::npos) {
+        // new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image
+
+        system_prompt = prompt.substr(0, image_pos);
+        user_prompt = prompt.substr(image_pos + std::string("<image>").length());
+        // We replace \n with actual newlines in user_prompt, just in case -e was not used in templating string
+        size_t pos = 0;
+        while ((pos = user_prompt.find("\\n", pos)) != std::string::npos) {
+            user_prompt.replace(pos, 2, "\n");
+            pos += 1; // Advance past the replaced newline
+        }
+        while ((pos = system_prompt.find("\\n", pos)) != std::string::npos) {
+            system_prompt.replace(pos, 2, "\n");
+            pos += 1; // Advance past the replaced newline
+        }
+
+        printf("system_prompt: %s\n", system_prompt.c_str());
+        printf("user_prompt: %s\n", user_prompt.c_str());
+    } else {
+        // llava-1.5 native mode
+        system_prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:";
+        user_prompt = prompt + "\nASSISTANT:";
+    }
+
+    eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, add_bos);
     llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past);
-    eval_string(ctx_llava->ctx_llama, (prompt + "\nASSISTANT:").c_str(), params->n_batch, &n_past, false);
+    eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
 
     // generate the response
 
@@ -162,6 +187,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
     for (int i = 0; i < max_tgt_len; i++) {
         const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
         if (strcmp(tmp, "</s>") == 0) break;
+        if (strstr(tmp, "###")) break; // Yi-VL behavior
 
         printf("%s", tmp);
         fflush(stdout);
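A note on the escape handling in the hunk above: the loop rewrites the two-character sequence \n into a real newline, in place. A minimal standalone sketch of that step, assuming only the standard library (the helper name unescape_newlines is made up for illustration; the search offset starts at 0 for each string it processes):

// Replace every literal "\n" escape (two characters) with a real newline.
#include <cstdio>
#include <string>

static void unescape_newlines(std::string & s) {
    size_t pos = 0; // search offset, starting at the beginning of this string
    while ((pos = s.find("\\n", pos)) != std::string::npos) {
        s.replace(pos, 2, "\n"); // 2 = length of the "\n" escape being replaced
        pos += 1;                // advance past the inserted newline
    }
}

int main() {
    std::string user_prompt = "### Human: describe the image\\n### Assistant:";
    unescape_newlines(user_prompt);
    printf("%s\n", user_prompt.c_str()); // escapes are now real newlines
    return 0;
}

The other Yi-VL-specific change is the extra stop condition in the generation loop: per the comment in the diff, Yi-VL ends turns with "###" rather than "</s>", so generation breaks as soon as a sampled piece contains "###".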