diff options
Diffstat (limited to 'examples/llava/llava-cli.cpp')
-rw-r--r-- | examples/llava/llava-cli.cpp | 32 |
1 files changed, 29 insertions, 3 deletions
diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp index d94795fe..6ac70ba6 100644 --- a/examples/llava/llava-cli.cpp +++ b/examples/llava/llava-cli.cpp @@ -148,10 +148,35 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_ const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict; const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx_llava->ctx_llama)); - // llava chat format is "<system_prompt>\nUSER:<image_embeddings>\n<textual_prompt>\nASSISTANT:" - eval_string(ctx_llava->ctx_llama, "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:", params->n_batch, &n_past, add_bos); + std::string system_prompt, user_prompt; + size_t image_pos = prompt.find("<image>"); + if (image_pos != std::string::npos) { + // new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image + + system_prompt = prompt.substr(0, image_pos); + user_prompt = prompt.substr(image_pos + std::string("<image>").length()); + // We replace \n with actual newlines in user_prompt, just in case -e was not used in templating string + size_t pos = 0; + while ((pos = user_prompt.find("\\n", pos)) != std::string::npos) { + user_prompt.replace(pos, 2, "\n"); + pos += 1; // Advance past the replaced newline + } + while ((pos = system_prompt.find("\\n", pos)) != std::string::npos) { + system_prompt.replace(pos, 2, "\n"); + pos += 1; // Advance past the replaced newline + } + + printf("system_prompt: %s\n", system_prompt.c_str()); + printf("user_prompt: %s\n", user_prompt.c_str()); + } else { + // llava-1.5 native mode + system_prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:"; + user_prompt = prompt + "\nASSISTANT:"; + } + + eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, add_bos); llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past); - eval_string(ctx_llava->ctx_llama, (prompt + "\nASSISTANT:").c_str(), params->n_batch, &n_past, false); + eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false); // generate the response @@ -162,6 +187,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_ for (int i = 0; i < max_tgt_len; i++) { const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past); if (strcmp(tmp, "</s>") == 0) break; + if (strstr(tmp, "###")) break; // Yi-VL behavior printf("%s", tmp); fflush(stdout); |