summary | refs | log | tree | commit | diff
path: root/examples/llava/llava-cli.cpp
diff options
context:
space:
mode:
author John <78893154+cmp-nct@users.noreply.github.com> 2024-02-14 08:38:35 +0100
committer GitHub <noreply@github.com> 2024-02-14 09:38:35 +0200
commit aa2341298924ac89778252015efcb792f2df1e20 (patch)
tree 1b7702dd6cf16b25495b6acf87467106ab2b75e0 /examples/llava/llava-cli.cpp
parent f5ca054855dea83f424003162f26de376e5643f6 (diff)
llava : support v1.6 (#5267)
* Create llava-survery-v2.py * Update convert-image-encoder-to-gguf.py * Update convert-image-encoder-to-gguf.py * Rename llava-survery-v2.py to llava-surgery-v2.py * Update convert-image-encoder-to-gguf.py will now search for projector * Update convert-image-encoder-to-gguf.py whoops * Update llava-surgery-v2.py * Clip: Bugfix for normalization (it did not load the 3 std and mean values) Clip: bicubic resize function Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6) Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 are supported llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final convert-image-encoder: fixed image-grid flattening * whitespace corrections * ws * Tensors are now properly permuted. Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference. * ws * added verbose_prompt support into cli added stopwords for llava-1.6 into cli * moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed * ws * convert : skip unknown tensors (need for LLaVA) * llava : update readme * llava : fix compile warnings * llava : style * convert : add --skip-unknown CLI arg * server : remove clip structs * bugfix for non llava-1.6 It should now work with llava-1.5 as well * clip : minor code rearrange * llava : update readme a bit --------- Co-authored-by: John <cmt-nct@users.noreply.github.com> Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Diffstat (limited to 'examples/llava/llava-cli.cpp')
-rw-r--r-- examples/llava/llava-cli.cpp 26
1 files changed, 24 insertions, 2 deletions
diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp
index 031e9806..bef7f7c9 100644
--- a/examples/llava/llava-cli.cpp
+++ b/examples/llava/llava-cli.cpp
@@ -155,11 +155,29 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
system_prompt = prompt.substr(0, image_pos);
user_prompt = prompt.substr(image_pos + std::string("<image>").length());
printf("system_prompt: %s\n", system_prompt.c_str());
+ if (params->verbose_prompt) {
+ auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true);
+ for (int i = 0; i < (int) tmp.size(); i++) {
+ printf("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
+ }
+ }
printf("user_prompt: %s\n", user_prompt.c_str());
+ if (params->verbose_prompt) {
+ auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
+ for (int i = 0; i < (int) tmp.size(); i++) {
+ printf("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
+ }
+ }
} else {
// llava-1.5 native mode
system_prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:";
user_prompt = prompt + "\nASSISTANT:";
+ if (params->verbose_prompt) {
+ auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
+ for (int i = 0; i < (int) tmp.size(); i++) {
+ printf("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
+ }
+ }
}
eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, add_bos);
@@ -171,13 +189,17 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
fprintf(stderr, "\n");
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);
-
+ std::string response = "";
for (int i = 0; i < max_tgt_len; i++) {
const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
+ response += tmp;
if (strcmp(tmp, "</s>") == 0) break;
if (strstr(tmp, "###")) break; // Yi-VL behavior
-
printf("%s", tmp);
+ if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works)
+ if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6
+ if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6
+
fflush(stdout);
}