path: root/examples/llava/llava-cli.cpp
author    John <78893154+cmp-nct@users.noreply.github.com>  2024-01-27 16:09:18 +0100
committer GitHub <noreply@github.com>                        2024-01-27 17:09:18 +0200
commit    6db2b41a76ee78d5efdd5c3cddd5d7ad3f646855 (patch)
tree      5bbe07db083a41aff3f71ab72dc92242b3a80728 /examples/llava/llava-cli.cpp
parent    753eafed0ebd07af6903771327a1786a7c02cf98 (diff)
llava : support for Yi-VL and fix for mobileVLM (#5093)
* Support for Yi-VL, templating fix for mobileVLM
* ws
* Update examples/llava/clip.cpp
  Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Update llava-cli.cpp
* Update clip.cpp bugfix for new conversions
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
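
A minimal, self-contained sketch (not the patch itself) of the two prompting modes this change introduces: if the prompt contains an "<image>" placeholder, it is split into a system part and a user part around it; otherwise the legacy llava-1.5 wrapping is applied. The function and variable names below are illustrative only.

// sketch of the prompt-splitting behavior added by this commit
#include <cstdio>
#include <string>

static void split_prompt(const std::string & prompt,
                         std::string & system_prompt,
                         std::string & user_prompt) {
    const std::string placeholder = "<image>";
    const size_t image_pos = prompt.find(placeholder);
    if (image_pos != std::string::npos) {
        // new templating mode: the caller supplies the full template,
        // including the system message, with <image> marking the image slot
        system_prompt = prompt.substr(0, image_pos);
        user_prompt   = prompt.substr(image_pos + placeholder.length());
    } else {
        // llava-1.5 native mode: wrap the user prompt in the default template
        system_prompt = "A chat between a curious human and an artificial intelligence assistant. "
                        "The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:";
        user_prompt   = prompt + "\nASSISTANT:";
    }
}

int main() {
    std::string sys, usr;
    split_prompt("You are a helpful assistant.\nUSER: <image>\ndescribe the image\nASSISTANT:", sys, usr);
    std::printf("system: [%s]\nuser: [%s]\n", sys.c_str(), usr.c_str());
    return 0;
}
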
Diffstat (limited to 'examples/llava/llava-cli.cpp')
-rw-r--r--  examples/llava/llava-cli.cpp  |  32
1 file changed, 29 insertions, 3 deletions
diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp
index d94795fe..6ac70ba6 100644
--- a/examples/llava/llava-cli.cpp
+++ b/examples/llava/llava-cli.cpp
@@ -148,10 +148,35 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx_llava->ctx_llama));
- // llava chat format is "<system_prompt>\nUSER:<image_embeddings>\n<textual_prompt>\nASSISTANT:"
- eval_string(ctx_llava->ctx_llama, "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:", params->n_batch, &n_past, add_bos);
+ std::string system_prompt, user_prompt;
+ size_t image_pos = prompt.find("<image>");
+ if (image_pos != std::string::npos) {
+ // new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image
+
+ system_prompt = prompt.substr(0, image_pos);
+ user_prompt = prompt.substr(image_pos + std::string("<image>").length());
+ // We replace \n with actual newlines in user_prompt, just in case -e was not used in templating string
+ size_t pos = 0;
+ while ((pos = user_prompt.find("\\n", pos)) != std::string::npos) {
+ user_prompt.replace(pos, 2, "\n");
+ pos += 1; // Advance past the replaced newline
+ }
+ while ((pos = system_prompt.find("\\n", pos)) != std::string::npos) {
+ system_prompt.replace(pos, 2, "\n");
+ pos += 1; // Advance past the replaced newline
+ }
+
+ printf("system_prompt: %s\n", system_prompt.c_str());
+ printf("user_prompt: %s\n", user_prompt.c_str());
+ } else {
+ // llava-1.5 native mode
+ system_prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:";
+ user_prompt = prompt + "\nASSISTANT:";
+ }
+
+ eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, add_bos);
llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past);
- eval_string(ctx_llava->ctx_llama, (prompt + "\nASSISTANT:").c_str(), params->n_batch, &n_past, false);
+ eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
// generate the response
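
Between the two hunks, a standalone sketch of the escaped-newline normalization the first hunk performs: literal "\n" sequences left in the templated prompt (for example when -e was not passed on the command line) are turned into real newlines. This is a hedged illustration, not the committed code; note that the search offset here starts at 0 for each string it is applied to.

// normalize escaped newlines in a prompt string (illustrative helper)
#include <string>

static void unescape_newlines(std::string & s) {
    size_t pos = 0;
    while ((pos = s.find("\\n", pos)) != std::string::npos) {
        s.replace(pos, 2, "\n"); // replace the two characters '\' and 'n'
        pos += 1;                // advance past the inserted newline
    }
}
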
@@ -162,6 +187,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
for (int i = 0; i < max_tgt_len; i++) {
const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
if (strcmp(tmp, "</s>") == 0) break;
+ if (strstr(tmp, "###")) break; // Yi-VL behavior
printf("%s", tmp);
fflush(stdout);
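
For completeness, a sketch of the stop checks used in the generation loop above: an exact "</s>" end-of-sequence match, plus the new "###" substring match that this commit adds because Yi-VL emits "###" as a turn separator. The helper name is illustrative and not part of llava-cli.

// decide whether a sampled piece should terminate generation (illustrative)
#include <cstring>

static bool is_stop_piece(const char * piece) {
    if (std::strcmp(piece, "</s>") == 0)    return true; // end-of-sequence token
    if (std::strstr(piece, "###") != nullptr) return true; // Yi-VL turn separator
    return false;
}
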