| author | Georgi Gerganov <ggerganov@gmail.com> | 2023-10-20 21:07:23 +0300 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2023-10-20 21:07:23 +0300 |
| commit | d1031cf49c3b958b915fd558e23453471c29ac33 (patch) | |
| tree | 14fa2bc6d54d5e27bd1e8bfd6fa4dbf894dbe6b9 /examples/embd-input | |
| parent | 8cf19d60dc93809db8e51fedc811595eed9134c5 (diff) | |
sampling : refactor init to use llama_sampling_params (#3696)
* sampling : refactor init to use llama_sampling_params
* llama : combine repetition, frequency and presence penalties in 1 call
* examples : remove embd-input and gptneox-wip
* sampling : rename penalty params + reduce size of "prev" vector
* sampling : add llama_sampling_print helper
* sampling : hide prev behind API and apply #3661
ggml-ci
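For reference on the second bullet above: after this refactor the repetition, frequency and presence penalties are meant to be applied through a single call driven by the renamed `penalty_*` fields of `llama_sampling_params`. The sketch below is illustrative only — the helper `apply_penalties` and its surrounding code are not part of this commit — and it assumes the combined `llama_sample_repetition_penalties()` signature that `llama.h` exposes around this change.

```cpp
// Illustrative sketch, not code from this commit: apply repetition,
// frequency and presence penalties in one call, as described in the
// commit message. Assumes llama.h declares llama_sample_repetition_penalties()
// with the combined signature and that llama_sampling_params carries the
// renamed penalty_last_n / penalty_repeat / penalty_freq / penalty_present fields.
#include "common.h"
#include "llama.h"

#include <algorithm>
#include <vector>

static void apply_penalties(llama_context * ctx,
                            llama_token_data_array * cur_p,
                            const std::vector<llama_token> & prev,
                            const llama_sampling_params & sparams) {
    // only the last penalty_last_n tokens of the history are penalized
    const int n = std::min((int) prev.size(), (int) sparams.penalty_last_n);
    if (n <= 0) {
        return;
    }
    llama_sample_repetition_penalties(ctx, cur_p,
            prev.data() + prev.size() - n, n,
            sparams.penalty_repeat,   // classic repetition penalty
            sparams.penalty_freq,     // frequency penalty
            sparams.penalty_present); // presence penalty
}
```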
Diffstat (limited to 'examples/embd-input')
| -rw-r--r-- | examples/embd-input/.gitignore | 4 |
| -rw-r--r-- | examples/embd-input/CMakeLists.txt | 17 |
| -rw-r--r-- | examples/embd-input/README.md | 63 |
| -rw-r--r-- | examples/embd-input/embd-input-lib.cpp | 221 |
| -rw-r--r-- | examples/embd-input/embd-input-test.cpp | 35 |
| -rw-r--r-- | examples/embd-input/embd-input.h | 27 |
| -rwxr-xr-x | examples/embd-input/embd_input.py | 72 |
| -rwxr-xr-x | examples/embd-input/llava.py | 71 |
| -rwxr-xr-x | examples/embd-input/minigpt4.py | 129 |
| -rwxr-xr-x | examples/embd-input/panda_gpt.py | 99 |
10 files changed, 0 insertions, 738 deletions
diff --git a/examples/embd-input/.gitignore b/examples/embd-input/.gitignore
deleted file mode 100644
index 87ef6877..00000000
--- a/examples/embd-input/.gitignore
+++ /dev/null
@@ -1,4 +0,0 @@
-PandaGPT
-MiniGPT-4
-*.pth
-
diff --git a/examples/embd-input/CMakeLists.txt b/examples/embd-input/CMakeLists.txt
deleted file mode 100644
index 5bbb1ea0..00000000
--- a/examples/embd-input/CMakeLists.txt
+++ /dev/null
@@ -1,17 +0,0 @@
-set(TARGET embdinput)
-add_library(${TARGET} embd-input-lib.cpp embd-input.h)
-install(TARGETS ${TARGET} LIBRARY)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
-if(TARGET BUILD_INFO)
-  add_dependencies(${TARGET} BUILD_INFO)
-endif()
-
-set(TARGET embd-input-test)
-add_executable(${TARGET} embd-input-test.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama embdinput ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
-if(TARGET BUILD_INFO)
-  add_dependencies(${TARGET} BUILD_INFO)
-endif()
diff --git a/examples/embd-input/README.md b/examples/embd-input/README.md
deleted file mode 100644
index 5c4c75ea..00000000
--- a/examples/embd-input/README.md
+++ /dev/null
@@ -1,63 +0,0 @@
-### Examples for input embedding directly
-
-## Requirement
-build `libembdinput.so`
-run the following comman in main dir (../../).
-```
-make
-```
-
-## [LLaVA](https://github.com/haotian-liu/LLaVA/) example (llava.py)
-
-1. Obtian LLaVA model (following https://github.com/haotian-liu/LLaVA/ , use https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/).
-2. Convert it to ggml format.
-3. `llava_projection.pth` is [pytorch_model-00003-of-00003.bin](https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/blob/main/pytorch_model-00003-of-00003.bin).
-
-```
-import torch
-
-bin_path = "../LLaVA-13b-delta-v1-1/pytorch_model-00003-of-00003.bin"
-pth_path = "./examples/embd-input/llava_projection.pth"
-
-dic = torch.load(bin_path)
-used_key = ["model.mm_projector.weight","model.mm_projector.bias"]
-torch.save({k: dic[k] for k in used_key}, pth_path)
-```
-4. Check the path of LLaVA model and `llava_projection.pth` in `llava.py`.
-
-
-## [PandaGPT](https://github.com/yxuansu/PandaGPT) example (panda_gpt.py)
-
-1. Obtian PandaGPT lora model from https://github.com/yxuansu/PandaGPT. Rename the file to `adapter_model.bin`. Use [convert-lora-to-ggml.py](../../convert-lora-to-ggml.py) to convert it to ggml format.
-The `adapter_config.json` is
-```
-{
-  "peft_type": "LORA",
-  "fan_in_fan_out": false,
-  "bias": null,
-  "modules_to_save": null,
-  "r": 32,
-  "lora_alpha": 32,
-  "lora_dropout": 0.1,
-  "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"]
-}
-```
-2. Papare the `vicuna` v0 model.
-3. Obtain the [ImageBind](https://dl.fbaipublicfiles.com/imagebind/imagebind_huge.pth) model.
-4. Clone the PandaGPT source.
-```
-git clone https://github.com/yxuansu/PandaGPT
-```
-5. Install the requirement of PandaGPT.
-6. Check the path of PandaGPT source, ImageBind model, lora model and vicuna model in panda_gpt.py.
-
-## [MiniGPT-4](https://github.com/Vision-CAIR/MiniGPT-4/) example (minigpt4.py)
-
-1. Obtain MiniGPT-4 model from https://github.com/Vision-CAIR/MiniGPT-4/ and put it in `embd-input`.
-2. Clone the MiniGPT-4 source.
-```
-git clone https://github.com/Vision-CAIR/MiniGPT-4/
-```
-3. Install the requirement of PandaGPT.
-4. Papare the `vicuna` v0 model.
-5. Check the path of MiniGPT-4 source, MiniGPT-4 model and vicuna model in `minigpt4.py`.
diff --git a/examples/embd-input/embd-input-lib.cpp b/examples/embd-input/embd-input-lib.cpp
deleted file mode 100644
index 3ce33842..00000000
--- a/examples/embd-input/embd-input-lib.cpp
+++ /dev/null
@@ -1,221 +0,0 @@
-#include "build-info.h"
-#include "common.h"
-#include "embd-input.h"
-
-#include <cassert>
-#include <cinttypes>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <ctime>
-#include <fstream>
-#include <iostream>
-#include <string>
-#include <vector>
-
-static llama_context ** g_ctx;
-
-extern "C" {
-
-struct MyModel* create_mymodel(int argc, char ** argv) {
-    gpt_params params;
-
-    if (!gpt_params_parse(argc, argv, params)) {
-        return nullptr;
-    }
-
-    print_build_info();
-
-    if (params.seed == LLAMA_DEFAULT_SEED) {
-        params.seed = uint32_t(time(NULL));
-    }
-    fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
-
-    llama_backend_init(params.numa);
-
-    llama_model * model;
-    llama_context * ctx;
-
-    g_ctx = &ctx;
-
-    // load the model and apply lora adapter, if any
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
-    if (model == NULL) {
-        fprintf(stderr, "%s: error: unable to load model\n", __func__);
-        return nullptr;
-    }
-
-    // print system information
-    {
-        fprintf(stderr, "\n");
-        fprintf(stderr, "%s\n", get_system_info(params).c_str());
-    }
-    struct MyModel * ret = new MyModel();
-    ret->ctx = ctx;
-    ret->params = params;
-    ret->n_past = 0;
-    // printf("ctx: %d\n", ret->ctx);
-    return ret;
-}
-
-void free_mymodel(struct MyModel * mymodel) {
-    llama_context * ctx = mymodel->ctx;
-    llama_print_timings(ctx);
-    llama_free(ctx);
-    delete mymodel;
-}
-
-
-bool eval_float(void * model, float * input, int N){
-    MyModel * mymodel = (MyModel*)model;
-    llama_context * ctx = mymodel->ctx;
-    gpt_params params = mymodel->params;
-    int n_emb = llama_n_embd(llama_get_model(ctx));
-    int n_past = mymodel->n_past;
-    int n_batch = N; // params.n_batch;
-
-    for (int i = 0; i < (int) N; i += n_batch) {
-        int n_eval = (int) N - i;
-        if (n_eval > n_batch) {
-            n_eval = n_batch;
-        }
-        llama_batch batch = { int32_t(n_eval), nullptr, (input+i*n_emb), nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, };
-        if (llama_decode(ctx, batch)) {
-            fprintf(stderr, "%s : failed to eval\n", __func__);
-            return false;
-        }
-        n_past += n_eval;
-    }
-    mymodel->n_past = n_past;
-    return true;
-}
-
-bool eval_tokens(void * model, std::vector<llama_token> tokens) {
-    MyModel * mymodel = (MyModel* )model;
-    llama_context * ctx;
-    ctx = mymodel->ctx;
-    gpt_params params = mymodel->params;
-    int n_past = mymodel->n_past;
-    for (int i = 0; i < (int) tokens.size(); i += params.n_batch) {
-        int n_eval = (int) tokens.size() - i;
-        if (n_eval > params.n_batch) {
-            n_eval = params.n_batch;
-        }
-        if (llama_decode(ctx, llama_batch_get_one(&tokens[i], n_eval, n_past, 0))) {
-            fprintf(stderr, "%s : failed to eval\n", __func__);
-            return false;
-        }
-        n_past += n_eval;
-    }
-    mymodel->n_past = n_past;
-    return true;
-}
-
-bool eval_id(struct MyModel* mymodel, int id) {
-    std::vector<llama_token> tokens;
-    tokens.push_back(id);
-    return eval_tokens(mymodel, tokens);
-}
-
-bool eval_string(struct MyModel * mymodel,const char* str){
-    llama_context * ctx = mymodel->ctx;
-    std::string str2 = str;
-    std::vector<llama_token> embd_inp = ::llama_tokenize(ctx, str2, true);
-    eval_tokens(mymodel, embd_inp);
-    return true;
-}
-
-llama_token sampling_id(struct MyModel* mymodel) {
-    llama_context* ctx = mymodel->ctx;
-    gpt_params params = mymodel->params;
-    llama_sampling_params & sparams = params.sampling_params;
-    // int n_ctx = llama_n_ctx(ctx);
-
-    // out of user input, sample next token
-    const float temp = sparams.temp;
-    const int32_t top_k = sparams.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx)) : sparams.top_k;
-    const float top_p = sparams.top_p;
-    const float tfs_z = sparams.tfs_z;
-    const float typical_p = sparams.typical_p;
-    // const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
-    // const float repeat_penalty = params.repeat_penalty;
-    // const float alpha_presence = params.presence_penalty;
-    // const float alpha_frequency = params.frequency_penalty;
-    const int mirostat = sparams.mirostat;
-    const float mirostat_tau = sparams.mirostat_tau;
-    const float mirostat_eta = sparams.mirostat_eta;
-    // const bool penalize_nl = params.penalize_nl;
-
-    llama_token id = 0;
-    {
-        auto logits = llama_get_logits(ctx);
-        auto n_vocab = llama_n_vocab(llama_get_model(ctx));
-
-        // Apply params.logit_bias map
-        for (auto it = sparams.logit_bias.begin(); it != sparams.logit_bias.end(); it++) {
-            logits[it->first] += it->second;
-        }
-
-        std::vector<llama_token_data> candidates;
-        candidates.reserve(n_vocab);
-        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-            candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
-        }
-
-        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-
-        // TODO: Apply penalties
-        // float nl_logit = logits[llama_token_nl(ctx)];
-        // auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
-        // llama_sample_repetition_penalty(ctx, &candidates_p,
-        //     last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
-        //     last_n_repeat, repeat_penalty);
-        // llama_sample_frequency_and_presence_penalties(ctx, &candidates_p,
-        //     last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
-        //     last_n_repeat, alpha_frequency, alpha_presence);
-        // if (!penalize_nl) {
-        //     logits[llama_token_nl(ctx)] = nl_logit;
-        // }
-
-        if (temp <= 0) {
-            // Greedy sampling
-            id = llama_sample_token_greedy(ctx, &candidates_p);
-        } else {
-            if (mirostat == 1) {
-                static float mirostat_mu = 2.0f * mirostat_tau;
-                const int mirostat_m = 100;
-                llama_sample_temp(ctx, &candidates_p, temp);
-                id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
-            } else if (mirostat == 2) {
-                static float mirostat_mu = 2.0f * mirostat_tau;
-                llama_sample_temp(ctx, &candidates_p, temp);
-                id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
-            } else {
-                // Temperature sampling
-                llama_sample_top_k(ctx, &candidates_p, top_k, 1);
-                llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1);
-                llama_sample_typical(ctx, &candidates_p, typical_p, 1);
-                llama_sample_top_p(ctx, &candidates_p, top_p, 1);
-                llama_sample_temp(ctx, &candidates_p, temp);
-                id = llama_sample_token(ctx, &candidates_p);
-            }
-        }
-    }
-
-    return id;
-}
-
-const char * sampling(struct MyModel * mymodel) {
-    llama_context * ctx = mymodel->ctx;
-    int id = sampling_id(mymodel);
-    static std::string ret;
-    if (id == llama_token_eos(ctx)) {
-        ret = "</s>";
-    } else {
-        ret = llama_token_to_piece(ctx, id);
-    }
-    eval_id(mymodel, id);
-    return ret.c_str();
-}
-
-}
diff --git a/examples/embd-input/embd-input-test.cpp b/examples/embd-input/embd-input-test.cpp
deleted file mode 100644
index dc4a0e48..00000000
--- a/examples/embd-input/embd-input-test.cpp
+++ /dev/null
@@ -1,35 +0,0 @@
-#include "embd-input.h"
-#include <stdlib.h>
-#include <random>
-#include <string.h>
-
-int main(int argc, char** argv) {
-
-    auto mymodel = create_mymodel(argc, argv);
-    int N = 10;
-    int max_tgt_len = 500;
-    int n_embd = llama_n_embd(llama_get_model(mymodel->ctx));
-
-    // add random float embd to test evaluation
-    float * data = new float[N*n_embd];
-    std::default_random_engine e;
-    std::uniform_real_distribution<float> u(0,1);
-    for (int i=0;i<N*n_embd;i++) {
-        data[i] = u(e);
-    }
-
-    eval_string(mymodel, "user: what is the color of the flag of UN?");
-    eval_float(mymodel, data, N);
-    eval_string(mymodel, "assistant:");
-    eval_string(mymodel, mymodel->params.prompt.c_str());
-    const char* tmp;
-    for (int i=0; i<max_tgt_len; i++) {
-        tmp = sampling(mymodel);
-        if (strcmp(tmp, "</s>")==0) break;
-        printf("%s", tmp);
-        fflush(stdout);
-    }
-    printf("\n");
-    free_mymodel(mymodel);
-    return 0;
-}
diff --git a/examples/embd-input/embd-input.h b/examples/embd-input/embd-input.h
deleted file mode 100644
index eff5e3b8..00000000
--- a/examples/embd-input/embd-input.h
+++ /dev/null
@@ -1,27 +0,0 @@
-#ifndef _EMBD_INPUT_H_
-#define _EMBD_INPUT_H_ 1
-
-#include "common.h"
-#include "llama.h"
-
-extern "C" {
-
-typedef struct MyModel {
-    llama_context* ctx;
-    gpt_params params;
-    int n_past = 0;
-} MyModel;
-
-struct MyModel* create_mymodel(int argc, char ** argv);
-
-bool eval_float(void* model, float* input, int N);
-bool eval_tokens(void* model, std::vector<llama_token> tokens);
-bool eval_id(struct MyModel* mymodel, int id);
-bool eval_string(struct MyModel* mymodel, const char* str);
-const char * sampling(struct MyModel* mymodel);
-llama_token sampling_id(struct MyModel* mymodel);
-void free_mymodel(struct MyModel* mymodel);
-
-}
-
-#endif
diff --git a/examples/embd-input/embd_input.py b/examples/embd-input/embd_input.py
deleted file mode 100755
index f146acdc..00000000
--- a/examples/embd-input/embd_input.py
+++ /dev/null
@@ -1,72 +0,0 @@
-#!/usr/bin/env python3
-import ctypes
-from ctypes import cdll, c_char_p, c_void_p, POINTER, c_float, c_int
-import numpy as np
-import os
-
-libc = cdll.LoadLibrary("./libembdinput.so")
-libc.sampling.restype=c_char_p
-libc.create_mymodel.restype=c_void_p
-libc.eval_string.argtypes=[c_void_p, c_char_p]
-libc.sampling.argtypes=[c_void_p]
-libc.eval_float.argtypes=[c_void_p, POINTER(c_float), c_int]
-
-
-class MyModel:
-    def __init__(self, args):
-        argc = len(args)
-        c_str = [c_char_p(i.encode()) for i in args]
-        args_c = (c_char_p * argc)(*c_str)
-        self.model = c_void_p(libc.create_mymodel(argc, args_c))
-        self.max_tgt_len = 512
-        self.print_string_eval = True
-
-    def __del__(self):
-        libc.free_mymodel(self.model)
-
-    def eval_float(self, x):
-        libc.eval_float(self.model, x.astype(np.float32).ctypes.data_as(POINTER(c_float)), x.shape[1])
-
-    def eval_string(self, x):
-        libc.eval_string(self.model, x.encode()) # c_char_p(x.encode()))
-        if self.print_string_eval:
-            print(x)
-
-    def eval_token(self, x):
-        libc.eval_id(self.model, x)
-
-    def sampling(self):
-        s = libc.sampling(self.model)
-        return s
-
-    def stream_generate(self, end="</s>"):
-        ret = b""
-        end = end.encode()
-        for _ in range(self.max_tgt_len):
-            tmp = self.sampling()
-            ret += tmp
-            yield tmp
-            if ret.endswith(end):
-                break
-
-    def generate_with_print(self, end="</s>"):
-        ret = b""
-        for i in self.stream_generate(end=end):
-            ret += i
-            print(i.decode(errors="replace"), end="", flush=True)
-        print("")
-        return ret.decode(errors="replace")
-
-
-    def generate(self, end="</s>"):
-        text = b"".join(self.stream_generate(end=end))
-        return text.decode(errors="replace")
-
-if __name__ == "__main__":
-    model = MyModel(["main", "--model", "../llama.cpp/models/ggml-vic13b-q4_1.bin", "-c", "2048"])
-    model.eval_string("""user: what is the color of the flag of UN?""")
-    x = np.random.random((5120,10))# , dtype=np.float32)
-    model.eval_float(x)
-    model.eval_string("""assistant:""")
-    for i in model.generate():
-        print(i.decode(errors="replace"), end="", flush=True)
diff --git a/examples/embd-input/llava.py b/examples/embd-input/llava.py
deleted file mode 100755
index 06fad55f..00000000
--- a/examples/embd-input/llava.py
+++ /dev/null
@@ -1,71 +0,0 @@
-#!/usr/bin/env python3
-import sys
-import os
-sys.path.insert(0, os.path.dirname(__file__))
-from embd_input import MyModel
-import numpy as np
-from torch import nn
-import torch
-from transformers import CLIPVisionModel, CLIPImageProcessor
-from PIL import Image
-
-# model parameters from 'liuhaotian/LLaVA-13b-delta-v1-1'
-vision_tower = "openai/clip-vit-large-patch14"
-select_hidden_state_layer = -2
-# (vision_config.image_size // vision_config.patch_size) ** 2
-image_token_len = (224//14)**2
-
-class Llava:
-    def __init__(self, args):
-        self.image_processor = CLIPImageProcessor.from_pretrained(vision_tower)
-        self.vision_tower = CLIPVisionModel.from_pretrained(vision_tower)
-        self.mm_projector = nn.Linear(1024, 5120)
-        self.model = MyModel(["main", *args])
-
-    def load_projection(self, path):
-        state = torch.load(path)
-        self.mm_projector.load_state_dict({
-            "weight": state["model.mm_projector.weight"],
-            "bias": state["model.mm_projector.bias"]})
-
-    def chat(self, question):
-        self.model.eval_string("user: ")
-        self.model.eval_string(question)
-        self.model.eval_string("\nassistant: ")
-        return self.model.generate_with_print()
-
-    def chat_with_image(self, image, question):
-        with torch.no_grad():
-            embd_image = self.image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
-            image_forward_out = self.vision_tower(embd_image.unsqueeze(0), output_hidden_states=True)
-            select_hidden_state = image_forward_out.hidden_states[select_hidden_state_layer]
-            image_feature = select_hidden_state[:, 1:]
-            embd_image = self.mm_projector(image_feature)
-            embd_image = embd_image.cpu().numpy()[0]
-        self.model.eval_string("user: ")
-        self.model.eval_token(32003-2) # im_start
-        self.model.eval_float(embd_image.T)
-        for i in range(image_token_len-embd_image.shape[0]):
-            self.model.eval_token(32003-3) # im_patch
-        self.model.eval_token(32003-1) # im_end
-        self.model.eval_string(question)
-        self.model.eval_string("\nassistant: ")
-        return self.model.generate_with_print()
-
-
-if __name__=="__main__":
-    # model form liuhaotian/LLaVA-13b-delta-v1-1
-    a = Llava(["--model", "./models/ggml-llava-13b-v1.1.bin", "-c", "2048"])
-    # Extract from https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/blob/main/pytorch_model-00003-of-00003.bin.
-    # Also here can use pytorch_model-00003-of-00003.bin directly.
-    a.load_projection(os.path.join(
-        os.path.dirname(__file__) ,
-        "llava_projection.pth"))
-    respose = a.chat_with_image(
-        Image.open("./media/llama1-logo.png").convert('RGB'),
-        "what is the text in the picture?")
-    respose
-    a.chat("what is the color of it?")
-
-
-
diff --git a/examples/embd-input/minigpt4.py b/examples/embd-input/minigpt4.py
deleted file mode 100755
index 7b13e4a5..00000000
--- a/examples/embd-input/minigpt4.py
+++ /dev/null
@@ -1,129 +0,0 @@
-#!/usr/bin/env python3
-import sys
-import os
-sys.path.insert(0, os.path.dirname(__file__))
-from embd_input import MyModel
-import numpy as np
-from torch import nn
-import torch
-from PIL import Image
-
-minigpt4_path = os.path.join(os.path.dirname(__file__), "MiniGPT-4")
-sys.path.insert(0, minigpt4_path)
-from minigpt4.models.blip2 import Blip2Base
-from minigpt4.processors.blip_processors import Blip2ImageEvalProcessor
-
-
-class MiniGPT4(Blip2Base):
-    """
-    MiniGPT4 model from https://github.com/Vision-CAIR/MiniGPT-4
-    """
-    def __init__(self,
-        args,
-        vit_model="eva_clip_g",
-        q_former_model="https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xxl.pth",
-        img_size=224,
-        drop_path_rate=0,
-        use_grad_checkpoint=False,
-        vit_precision="fp32",
-        freeze_vit=True,
-        freeze_qformer=True,
-        num_query_token=32,
-        llama_model="",
-        prompt_path="",
-        prompt_template="",
-        max_txt_len=32,
-        end_sym='\n',
-        low_resource=False, # use 8 bit and put vit in cpu
-        device_8bit=0
-    ):
-        super().__init__()
-        self.img_size = img_size
-        self.low_resource = low_resource
-        self.preprocessor = Blip2ImageEvalProcessor(img_size)
-
-        print('Loading VIT')
-        self.visual_encoder, self.ln_vision = self.init_vision_encoder(
-            vit_model, img_size, drop_path_rate, use_grad_checkpoint, vit_precision
-        )
-        print('Loading VIT Done')
-        print('Loading Q-Former')
-        self.Qformer, self.query_tokens = self.init_Qformer(
-            num_query_token, self.visual_encoder.num_features
-        )
-        self.Qformer.cls = None
-        self.Qformer.bert.embeddings.word_embeddings = None
-        self.Qformer.bert.embeddings.position_embeddings = None
-        for layer in self.Qformer.bert.encoder.layer:
-            layer.output = None
-            layer.intermediate = None
-        self.load_from_pretrained(url_or_filename=q_former_model)
-        print('Loading Q-Former Done')
-        self.llama_proj = nn.Linear(
-            self.Qformer.config.hidden_size, 5120 # self.llama_model.config.hidden_size
-        )
-        self.max_txt_len = max_txt_len
-        self.end_sym = end_sym
-        self.model = MyModel(["main", *args])
-        # system prompt
-        self.model.eval_string("Give the following image: <Img>ImageContent</Img>. "
-            "You will be able to see the image once I provide it to you. Please answer my questions."
-            "###")
-
-    def encode_img(self, image):
-        image = self.preprocessor(image)
-        image = image.unsqueeze(0)
-        device = image.device
-        if self.low_resource:
-            self.vit_to_cpu()
-            image = image.to("cpu")
-
-        with self.maybe_autocast():
-            image_embeds = self.ln_vision(self.visual_encoder(image)).to(device)
-            image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(device)
-
-            query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
-            query_output = self.Qformer.bert(
-                query_embeds=query_tokens,
-                encoder_hidden_states=image_embeds,
-                encoder_attention_mask=image_atts,
-                return_dict=True,
-            )
-
-            inputs_llama = self.llama_proj(query_output.last_hidden_state)
-            # atts_llama = torch.ones(inputs_llama.size()[:-1], dtype=torch.long).to(image.device)
-        return inputs_llama
-
-    def load_projection(self, path):
-        state = torch.load(path)["model"]
-        self.llama_proj.load_state_dict({
-            "weight": state["llama_proj.weight"],
-            "bias": state["llama_proj.bias"]})
-
-    def chat(self, question):
-        self.model.eval_string("Human: ")
-        self.model.eval_string(question)
-        self.model.eval_string("\n### Assistant:")
-        return self.model.generate_with_print(end="###")
-
-    def chat_with_image(self, image, question):
-        with torch.no_grad():
-            embd_image = self.encode_img(image)
-            embd_image = embd_image.cpu().numpy()[0]
-        self.model.eval_string("Human: <Img>")
-        self.model.eval_float(embd_image.T)
-        self.model.eval_string("</Img> ")
-        self.model.eval_string(question)
-        self.model.eval_string("\n### Assistant:")
-        return self.model.generate_with_print(end="###")
-
-
-if __name__=="__main__":
-    a = MiniGPT4(["--model", "./models/ggml-vicuna-13b-v0-q4_1.bin", "-c", "2048"])
-    a.load_projection(os.path.join(
-        os.path.dirname(__file__) ,
-        "pretrained_minigpt4.pth"))
-    respose = a.chat_with_image(
-        Image.open("./media/llama1-logo.png").convert('RGB'),
-        "what is the text in the picture?")
-    a.chat("what is the color of it?")
diff --git a/examples/embd-input/panda_gpt.py b/examples/embd-input/panda_gpt.py
deleted file mode 100755
index 891ad7cc..00000000
--- a/examples/embd-input/panda_gpt.py
+++ /dev/null
@@ -1,99 +0,0 @@
-#!/usr/bin/env python3
-import sys
-import os
-sys.path.insert(0, os.path.dirname(__file__))
-from embd_input import MyModel
-import numpy as np
-from torch import nn
-import torch
-
-# use PandaGPT path
-panda_gpt_path = os.path.join(os.path.dirname(__file__), "PandaGPT")
-imagebind_ckpt_path = "./models/panda_gpt/"
-
-sys.path.insert(0, os.path.join(panda_gpt_path,"code","model"))
-from ImageBind.models import imagebind_model
-from ImageBind import data
-
-ModalityType = imagebind_model.ModalityType
-max_tgt_len = 400
-
-class PandaGPT:
-    def __init__(self, args):
-        self.visual_encoder,_ = imagebind_model.imagebind_huge(pretrained=True, store_path=imagebind_ckpt_path)
-        self.visual_encoder.eval()
-        self.llama_proj = nn.Linear(1024, 5120) # self.visual_hidden_size, 5120)
-        self.max_tgt_len = max_tgt_len
-        self.model = MyModel(["main", *args])
-        self.generated_text = ""
-        self.device = "cpu"
-
-    def load_projection(self, path):
-        state = torch.load(path, map_location="cpu")
-        self.llama_proj.load_state_dict({
-            "weight": state["llama_proj.weight"],
-            "bias": state["llama_proj.bias"]})
-
-    def eval_inputs(self, inputs):
-        self.model.eval_string("<Img>")
-        embds = self.extract_multimoal_feature(inputs)
-        for i in embds:
-            self.model.eval_float(i.T)
-        self.model.eval_string("</Img> ")
-
-    def chat(self, question):
-        return self.chat_with_image(None, question)
-
-    def chat_with_image(self, inputs, question):
-        if self.generated_text == "":
-            self.model.eval_string("###")
-        self.model.eval_string(" Human: ")
-        if inputs:
-            self.eval_inputs(inputs)
-        self.model.eval_string(question)
-        self.model.eval_string("\n### Assistant:")
-        ret = self.model.generate_with_print(end="###")
-        self.generated_text += ret
-        return ret
-
-    def extract_multimoal_feature(self, inputs):
-        features = []
-        for key in ["image", "audio", "video", "thermal"]:
-            if key + "_paths" in inputs:
-                embeds = self.encode_data(key, inputs[key+"_paths"])
-                features.append(embeds)
-        return features
-
-    def encode_data(self, data_type, data_paths):
-
-        type_map = {
-            "image": ModalityType.VISION,
-            "audio": ModalityType.AUDIO,
-            "video": ModalityType.VISION,
-            "thermal": ModalityType.THERMAL,
-        }
-        load_map = {
-            "image": data.load_and_transform_vision_data,
-            "audio": data.load_and_transform_audio_data,
-            "video": data.load_and_transform_video_data,
-            "thermal": data.load_and_transform_thermal_data
-        }
-
-        load_function = load_map[data_type]
-        key = type_map[data_type]
-
-        inputs = {key: load_function(data_paths, self.device)}
-        with torch.no_grad():
-            embeddings = self.visual_encoder(inputs)
-            embeds = embeddings[key]
-            embeds = self.llama_proj(embeds).cpu().numpy()
-        return embeds
-
-
-if __name__=="__main__":
-    a = PandaGPT(["--model", "./models/ggml-vicuna-13b-v0-q4_1.bin", "-c", "2048", "--lora", "./models/panda_gpt/ggml-adapter-model.bin","--temp", "0"])
-    a.load_projection("./models/panda_gpt/adapter_model.bin")
-    a.chat_with_image(
-        {"image_paths": ["./media/llama1-logo.png"]},
-        "what is the text in the picture? 'llama' or 'lambda'?")
-    a.chat("what is the color of it?")