path: root/examples/embd-input
author     Georgi Gerganov <ggerganov@gmail.com>   2023-10-20 21:07:23 +0300
committer  GitHub <noreply@github.com>             2023-10-20 21:07:23 +0300
commit     d1031cf49c3b958b915fd558e23453471c29ac33 (patch)
tree       14fa2bc6d54d5e27bd1e8bfd6fa4dbf894dbe6b9 /examples/embd-input
parent     8cf19d60dc93809db8e51fedc811595eed9134c5 (diff)
sampling : refactor init to use llama_sampling_params (#3696)
* sampling : refactor init to use llama_sampling_params
* llama : combine repetition, frequency and presence penalties in 1 call
* examples : remove embd-input and gptneox-wip
* sampling : rename penalty params + reduce size of "prev" vector
* sampling : add llama_sampling_print helper
* sampling : hide prev behind API and apply #3661

ggml-ci
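The removed example below still configures sampling by reading each knob individually from `params.sampling_params` (see the deleted `sampling_id()` in `embd-input-lib.cpp`). As a rough illustration of the refactor described above, the sketch below shows what a generation loop looks like when all knobs are carried by `llama_sampling_params` and the "prev" history is hidden behind the sampling API. The `llama_sampling_*` calls approximate `common/sampling.h` around this change; treat the exact names and signatures as assumptions, not a verbatim copy of the new API.

```
// Sketch only (assumed API): a generation loop after the refactor, with the
// sampling state built once from llama_sampling_params instead of each knob
// being read separately as in the removed sampling_id() below.
#include "common.h"
#include "sampling.h"

static void generate_n(llama_context * ctx, const gpt_params & params, int n_predict) {
    // one init call captures temp/top_k/top_p/penalties/mirostat, etc.
    llama_sampling_context * ctx_sampling = llama_sampling_init(params.sampling_params);

    for (int i = 0; i < n_predict; ++i) {
        // sample the next token using the knobs captured at init time
        const llama_token id = llama_sampling_sample(ctx_sampling, ctx, /*ctx_cfg=*/nullptr);

        // record the token; the "prev" ring buffer is now hidden behind the API
        llama_sampling_accept(ctx_sampling, ctx, id);

        if (id == llama_token_eos(ctx)) {
            break;
        }
        // ... feed `id` back via llama_decode() and print the decoded piece ...
    }

    llama_sampling_free(ctx_sampling);
}
```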
Diffstat (limited to 'examples/embd-input')
-rw-r--r--  examples/embd-input/.gitignore             4
-rw-r--r--  examples/embd-input/CMakeLists.txt        17
-rw-r--r--  examples/embd-input/README.md             63
-rw-r--r--  examples/embd-input/embd-input-lib.cpp   221
-rw-r--r--  examples/embd-input/embd-input-test.cpp   35
-rw-r--r--  examples/embd-input/embd-input.h          27
-rwxr-xr-x  examples/embd-input/embd_input.py         72
-rwxr-xr-x  examples/embd-input/llava.py              71
-rwxr-xr-x  examples/embd-input/minigpt4.py          129
-rwxr-xr-x  examples/embd-input/panda_gpt.py          99
10 files changed, 0 insertions, 738 deletions
diff --git a/examples/embd-input/.gitignore b/examples/embd-input/.gitignore
deleted file mode 100644
index 87ef6877..00000000
--- a/examples/embd-input/.gitignore
+++ /dev/null
@@ -1,4 +0,0 @@
-PandaGPT
-MiniGPT-4
-*.pth
-
diff --git a/examples/embd-input/CMakeLists.txt b/examples/embd-input/CMakeLists.txt
deleted file mode 100644
index 5bbb1ea0..00000000
--- a/examples/embd-input/CMakeLists.txt
+++ /dev/null
@@ -1,17 +0,0 @@
-set(TARGET embdinput)
-add_library(${TARGET} embd-input-lib.cpp embd-input.h)
-install(TARGETS ${TARGET} LIBRARY)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
-if(TARGET BUILD_INFO)
- add_dependencies(${TARGET} BUILD_INFO)
-endif()
-
-set(TARGET embd-input-test)
-add_executable(${TARGET} embd-input-test.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama embdinput ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
-if(TARGET BUILD_INFO)
- add_dependencies(${TARGET} BUILD_INFO)
-endif()
diff --git a/examples/embd-input/README.md b/examples/embd-input/README.md
deleted file mode 100644
index 5c4c75ea..00000000
--- a/examples/embd-input/README.md
+++ /dev/null
@@ -1,63 +0,0 @@
-### Examples for passing input embeddings directly
-
-## Requirements
-To build `libembdinput.so`,
-run the following command in the main dir (../../):
-```
-make
-```
-
-## [LLaVA](https://github.com/haotian-liu/LLaVA/) example (llava.py)
-
-1. Obtain the LLaVA model (following https://github.com/haotian-liu/LLaVA/ , use https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/).
-2. Convert it to ggml format.
-3. Extract `llava_projection.pth` from [pytorch_model-00003-of-00003.bin](https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/blob/main/pytorch_model-00003-of-00003.bin):
-
-```
-import torch
-
-bin_path = "../LLaVA-13b-delta-v1-1/pytorch_model-00003-of-00003.bin"
-pth_path = "./examples/embd-input/llava_projection.pth"
-
-dic = torch.load(bin_path)
-used_key = ["model.mm_projector.weight","model.mm_projector.bias"]
-torch.save({k: dic[k] for k in used_key}, pth_path)
-```
-4. Check the paths of the LLaVA model and `llava_projection.pth` in `llava.py`.
-
-
-## [PandaGPT](https://github.com/yxuansu/PandaGPT) example (panda_gpt.py)
-
-1. Obtain the PandaGPT LoRA model from https://github.com/yxuansu/PandaGPT. Rename the file to `adapter_model.bin`. Use [convert-lora-to-ggml.py](../../convert-lora-to-ggml.py) to convert it to ggml format.
-The `adapter_config.json` is:
-```
-{
- "peft_type": "LORA",
- "fan_in_fan_out": false,
- "bias": null,
- "modules_to_save": null,
- "r": 32,
- "lora_alpha": 32,
- "lora_dropout": 0.1,
- "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"]
-}
-```
-2. Prepare the `vicuna` v0 model.
-3. Obtain the [ImageBind](https://dl.fbaipublicfiles.com/imagebind/imagebind_huge.pth) model.
-4. Clone the PandaGPT source.
-```
-git clone https://github.com/yxuansu/PandaGPT
-```
-5. Install the requirements of PandaGPT.
-6. Check the paths of the PandaGPT source, ImageBind model, LoRA model and vicuna model in `panda_gpt.py`.
-
-## [MiniGPT-4](https://github.com/Vision-CAIR/MiniGPT-4/) example (minigpt4.py)
-
-1. Obtain MiniGPT-4 model from https://github.com/Vision-CAIR/MiniGPT-4/ and put it in `embd-input`.
-2. Clone the MiniGPT-4 source.
-```
-git clone https://github.com/Vision-CAIR/MiniGPT-4/
-```
-3. Install the requirements of MiniGPT-4.
-4. Prepare the `vicuna` v0 model.
-5. Check the paths of the MiniGPT-4 source, MiniGPT-4 model and vicuna model in `minigpt4.py`.
diff --git a/examples/embd-input/embd-input-lib.cpp b/examples/embd-input/embd-input-lib.cpp
deleted file mode 100644
index 3ce33842..00000000
--- a/examples/embd-input/embd-input-lib.cpp
+++ /dev/null
@@ -1,221 +0,0 @@
-#include "build-info.h"
-#include "common.h"
-#include "embd-input.h"
-
-#include <cassert>
-#include <cinttypes>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <ctime>
-#include <fstream>
-#include <iostream>
-#include <string>
-#include <vector>
-
-static llama_context ** g_ctx;
-
-extern "C" {
-
-struct MyModel* create_mymodel(int argc, char ** argv) {
- gpt_params params;
-
- if (!gpt_params_parse(argc, argv, params)) {
- return nullptr;
- }
-
- print_build_info();
-
- if (params.seed == LLAMA_DEFAULT_SEED) {
- params.seed = uint32_t(time(NULL));
- }
- fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
-
- llama_backend_init(params.numa);
-
- llama_model * model;
- llama_context * ctx;
-
- g_ctx = &ctx;
-
- // load the model and apply lora adapter, if any
- std::tie(model, ctx) = llama_init_from_gpt_params(params);
- if (model == NULL) {
- fprintf(stderr, "%s: error: unable to load model\n", __func__);
- return nullptr;
- }
-
- // print system information
- {
- fprintf(stderr, "\n");
- fprintf(stderr, "%s\n", get_system_info(params).c_str());
- }
- struct MyModel * ret = new MyModel();
- ret->ctx = ctx;
- ret->params = params;
- ret->n_past = 0;
- // printf("ctx: %d\n", ret->ctx);
- return ret;
-}
-
-void free_mymodel(struct MyModel * mymodel) {
- llama_context * ctx = mymodel->ctx;
- llama_print_timings(ctx);
- llama_free(ctx);
- delete mymodel;
-}
-
-
-bool eval_float(void * model, float * input, int N){
- MyModel * mymodel = (MyModel*)model;
- llama_context * ctx = mymodel->ctx;
- gpt_params params = mymodel->params;
- int n_emb = llama_n_embd(llama_get_model(ctx));
- int n_past = mymodel->n_past;
- int n_batch = N; // params.n_batch;
-
- for (int i = 0; i < (int) N; i += n_batch) {
- int n_eval = (int) N - i;
- if (n_eval > n_batch) {
- n_eval = n_batch;
- }
- llama_batch batch = { int32_t(n_eval), nullptr, (input+i*n_emb), nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, };
- if (llama_decode(ctx, batch)) {
- fprintf(stderr, "%s : failed to eval\n", __func__);
- return false;
- }
- n_past += n_eval;
- }
- mymodel->n_past = n_past;
- return true;
-}
-
-bool eval_tokens(void * model, std::vector<llama_token> tokens) {
- MyModel * mymodel = (MyModel* )model;
- llama_context * ctx;
- ctx = mymodel->ctx;
- gpt_params params = mymodel->params;
- int n_past = mymodel->n_past;
- for (int i = 0; i < (int) tokens.size(); i += params.n_batch) {
- int n_eval = (int) tokens.size() - i;
- if (n_eval > params.n_batch) {
- n_eval = params.n_batch;
- }
- if (llama_decode(ctx, llama_batch_get_one(&tokens[i], n_eval, n_past, 0))) {
- fprintf(stderr, "%s : failed to eval\n", __func__);
- return false;
- }
- n_past += n_eval;
- }
- mymodel->n_past = n_past;
- return true;
-}
-
-bool eval_id(struct MyModel* mymodel, int id) {
- std::vector<llama_token> tokens;
- tokens.push_back(id);
- return eval_tokens(mymodel, tokens);
-}
-
-bool eval_string(struct MyModel * mymodel,const char* str){
- llama_context * ctx = mymodel->ctx;
- std::string str2 = str;
- std::vector<llama_token> embd_inp = ::llama_tokenize(ctx, str2, true);
- eval_tokens(mymodel, embd_inp);
- return true;
-}
-
-llama_token sampling_id(struct MyModel* mymodel) {
- llama_context* ctx = mymodel->ctx;
- gpt_params params = mymodel->params;
- llama_sampling_params & sparams = params.sampling_params;
- // int n_ctx = llama_n_ctx(ctx);
-
- // out of user input, sample next token
- const float temp = sparams.temp;
- const int32_t top_k = sparams.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx)) : sparams.top_k;
- const float top_p = sparams.top_p;
- const float tfs_z = sparams.tfs_z;
- const float typical_p = sparams.typical_p;
- // const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
- // const float repeat_penalty = params.repeat_penalty;
- // const float alpha_presence = params.presence_penalty;
- // const float alpha_frequency = params.frequency_penalty;
- const int mirostat = sparams.mirostat;
- const float mirostat_tau = sparams.mirostat_tau;
- const float mirostat_eta = sparams.mirostat_eta;
- // const bool penalize_nl = params.penalize_nl;
-
- llama_token id = 0;
- {
- auto logits = llama_get_logits(ctx);
- auto n_vocab = llama_n_vocab(llama_get_model(ctx));
-
- // Apply params.logit_bias map
- for (auto it = sparams.logit_bias.begin(); it != sparams.logit_bias.end(); it++) {
- logits[it->first] += it->second;
- }
-
- std::vector<llama_token_data> candidates;
- candidates.reserve(n_vocab);
- for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
- candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
- }
-
- llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-
- // TODO: Apply penalties
- // float nl_logit = logits[llama_token_nl(ctx)];
- // auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
- // llama_sample_repetition_penalty(ctx, &candidates_p,
- // last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
- // last_n_repeat, repeat_penalty);
- // llama_sample_frequency_and_presence_penalties(ctx, &candidates_p,
- // last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
- // last_n_repeat, alpha_frequency, alpha_presence);
- // if (!penalize_nl) {
- // logits[llama_token_nl(ctx)] = nl_logit;
- // }
-
- if (temp <= 0) {
- // Greedy sampling
- id = llama_sample_token_greedy(ctx, &candidates_p);
- } else {
- if (mirostat == 1) {
- static float mirostat_mu = 2.0f * mirostat_tau;
- const int mirostat_m = 100;
- llama_sample_temp(ctx, &candidates_p, temp);
- id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
- } else if (mirostat == 2) {
- static float mirostat_mu = 2.0f * mirostat_tau;
- llama_sample_temp(ctx, &candidates_p, temp);
- id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
- } else {
- // Temperature sampling
- llama_sample_top_k(ctx, &candidates_p, top_k, 1);
- llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1);
- llama_sample_typical(ctx, &candidates_p, typical_p, 1);
- llama_sample_top_p(ctx, &candidates_p, top_p, 1);
- llama_sample_temp(ctx, &candidates_p, temp);
- id = llama_sample_token(ctx, &candidates_p);
- }
- }
- }
-
- return id;
-}
-
-const char * sampling(struct MyModel * mymodel) {
- llama_context * ctx = mymodel->ctx;
- int id = sampling_id(mymodel);
- static std::string ret;
- if (id == llama_token_eos(ctx)) {
- ret = "</s>";
- } else {
- ret = llama_token_to_piece(ctx, id);
- }
- eval_id(mymodel, id);
- return ret.c_str();
-}
-
-}
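In the removed `sampling_id()` above, the repetition penalty and the frequency/presence penalties were two separate, commented-out calls (`llama_sample_repetition_penalty` and `llama_sample_frequency_and_presence_penalties`). Per the commit message they are now combined into a single call using the renamed penalty params. The sketch below shows the shape of that combined call; the function name and argument order are an approximation of `llama.h` after this refactor, and the hypothetical `apply_penalties()` helper plus the numeric values are only illustrative.

```
// Sketch only (assumed signature): one combined penalties call instead of the
// two separate calls commented out in the removed sampling_id() above.
#include "llama.h"

#include <algorithm>
#include <vector>

static void apply_penalties(llama_context * ctx,
                            llama_token_data_array * candidates,
                            const std::vector<llama_token> & prev) {
    // renamed penalty params mentioned in the commit message; example values
    const size_t penalty_last_n  = std::min<size_t>(prev.size(), 64);
    const float  penalty_repeat  = 1.10f; // was repeat_penalty
    const float  penalty_freq    = 0.00f; // was frequency_penalty
    const float  penalty_present = 0.00f; // was presence_penalty

    if (penalty_last_n == 0) {
        return; // nothing to penalize yet
    }

    // repetition + frequency + presence penalties applied in one call
    llama_sample_repetition_penalties(ctx, candidates,
            prev.data() + prev.size() - penalty_last_n,
            penalty_last_n, penalty_repeat, penalty_freq, penalty_present);
}
```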
diff --git a/examples/embd-input/embd-input-test.cpp b/examples/embd-input/embd-input-test.cpp
deleted file mode 100644
index dc4a0e48..00000000
--- a/examples/embd-input/embd-input-test.cpp
+++ /dev/null
@@ -1,35 +0,0 @@
-#include "embd-input.h"
-#include <stdlib.h>
-#include <random>
-#include <string.h>
-
-int main(int argc, char** argv) {
-
- auto mymodel = create_mymodel(argc, argv);
- int N = 10;
- int max_tgt_len = 500;
- int n_embd = llama_n_embd(llama_get_model(mymodel->ctx));
-
- // add random float embd to test evaluation
- float * data = new float[N*n_embd];
- std::default_random_engine e;
- std::uniform_real_distribution<float> u(0,1);
- for (int i=0;i<N*n_embd;i++) {
- data[i] = u(e);
- }
-
- eval_string(mymodel, "user: what is the color of the flag of UN?");
- eval_float(mymodel, data, N);
- eval_string(mymodel, "assistant:");
- eval_string(mymodel, mymodel->params.prompt.c_str());
- const char* tmp;
- for (int i=0; i<max_tgt_len; i++) {
- tmp = sampling(mymodel);
- if (strcmp(tmp, "</s>")==0) break;
- printf("%s", tmp);
- fflush(stdout);
- }
- printf("\n");
- free_mymodel(mymodel);
- return 0;
-}
diff --git a/examples/embd-input/embd-input.h b/examples/embd-input/embd-input.h
deleted file mode 100644
index eff5e3b8..00000000
--- a/examples/embd-input/embd-input.h
+++ /dev/null
@@ -1,27 +0,0 @@
-#ifndef _EMBD_INPUT_H_
-#define _EMBD_INPUT_H_ 1
-
-#include "common.h"
-#include "llama.h"
-
-extern "C" {
-
-typedef struct MyModel {
- llama_context* ctx;
- gpt_params params;
- int n_past = 0;
-} MyModel;
-
-struct MyModel* create_mymodel(int argc, char ** argv);
-
-bool eval_float(void* model, float* input, int N);
-bool eval_tokens(void* model, std::vector<llama_token> tokens);
-bool eval_id(struct MyModel* mymodel, int id);
-bool eval_string(struct MyModel* mymodel, const char* str);
-const char * sampling(struct MyModel* mymodel);
-llama_token sampling_id(struct MyModel* mymodel);
-void free_mymodel(struct MyModel* mymodel);
-
-}
-
-#endif
diff --git a/examples/embd-input/embd_input.py b/examples/embd-input/embd_input.py
deleted file mode 100755
index f146acdc..00000000
--- a/examples/embd-input/embd_input.py
+++ /dev/null
@@ -1,72 +0,0 @@
-#!/usr/bin/env python3
-import ctypes
-from ctypes import cdll, c_char_p, c_void_p, POINTER, c_float, c_int
-import numpy as np
-import os
-
-libc = cdll.LoadLibrary("./libembdinput.so")
-libc.sampling.restype=c_char_p
-libc.create_mymodel.restype=c_void_p
-libc.eval_string.argtypes=[c_void_p, c_char_p]
-libc.sampling.argtypes=[c_void_p]
-libc.eval_float.argtypes=[c_void_p, POINTER(c_float), c_int]
-
-
-class MyModel:
- def __init__(self, args):
- argc = len(args)
- c_str = [c_char_p(i.encode()) for i in args]
- args_c = (c_char_p * argc)(*c_str)
- self.model = c_void_p(libc.create_mymodel(argc, args_c))
- self.max_tgt_len = 512
- self.print_string_eval = True
-
- def __del__(self):
- libc.free_mymodel(self.model)
-
- def eval_float(self, x):
- libc.eval_float(self.model, x.astype(np.float32).ctypes.data_as(POINTER(c_float)), x.shape[1])
-
- def eval_string(self, x):
- libc.eval_string(self.model, x.encode()) # c_char_p(x.encode()))
- if self.print_string_eval:
- print(x)
-
- def eval_token(self, x):
- libc.eval_id(self.model, x)
-
- def sampling(self):
- s = libc.sampling(self.model)
- return s
-
- def stream_generate(self, end="</s>"):
- ret = b""
- end = end.encode()
- for _ in range(self.max_tgt_len):
- tmp = self.sampling()
- ret += tmp
- yield tmp
- if ret.endswith(end):
- break
-
- def generate_with_print(self, end="</s>"):
- ret = b""
- for i in self.stream_generate(end=end):
- ret += i
- print(i.decode(errors="replace"), end="", flush=True)
- print("")
- return ret.decode(errors="replace")
-
-
- def generate(self, end="</s>"):
- text = b"".join(self.stream_generate(end=end))
- return text.decode(errors="replace")
-
-if __name__ == "__main__":
- model = MyModel(["main", "--model", "../llama.cpp/models/ggml-vic13b-q4_1.bin", "-c", "2048"])
- model.eval_string("""user: what is the color of the flag of UN?""")
- x = np.random.random((5120,10))# , dtype=np.float32)
- model.eval_float(x)
- model.eval_string("""assistant:""")
- for i in model.generate():
- print(i.decode(errors="replace"), end="", flush=True)
diff --git a/examples/embd-input/llava.py b/examples/embd-input/llava.py
deleted file mode 100755
index 06fad55f..00000000
--- a/examples/embd-input/llava.py
+++ /dev/null
@@ -1,71 +0,0 @@
-#!/usr/bin/env python3
-import sys
-import os
-sys.path.insert(0, os.path.dirname(__file__))
-from embd_input import MyModel
-import numpy as np
-from torch import nn
-import torch
-from transformers import CLIPVisionModel, CLIPImageProcessor
-from PIL import Image
-
-# model parameters from 'liuhaotian/LLaVA-13b-delta-v1-1'
-vision_tower = "openai/clip-vit-large-patch14"
-select_hidden_state_layer = -2
-# (vision_config.image_size // vision_config.patch_size) ** 2
-image_token_len = (224//14)**2
-
-class Llava:
- def __init__(self, args):
- self.image_processor = CLIPImageProcessor.from_pretrained(vision_tower)
- self.vision_tower = CLIPVisionModel.from_pretrained(vision_tower)
- self.mm_projector = nn.Linear(1024, 5120)
- self.model = MyModel(["main", *args])
-
- def load_projection(self, path):
- state = torch.load(path)
- self.mm_projector.load_state_dict({
- "weight": state["model.mm_projector.weight"],
- "bias": state["model.mm_projector.bias"]})
-
- def chat(self, question):
- self.model.eval_string("user: ")
- self.model.eval_string(question)
- self.model.eval_string("\nassistant: ")
- return self.model.generate_with_print()
-
- def chat_with_image(self, image, question):
- with torch.no_grad():
- embd_image = self.image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
- image_forward_out = self.vision_tower(embd_image.unsqueeze(0), output_hidden_states=True)
- select_hidden_state = image_forward_out.hidden_states[select_hidden_state_layer]
- image_feature = select_hidden_state[:, 1:]
- embd_image = self.mm_projector(image_feature)
- embd_image = embd_image.cpu().numpy()[0]
- self.model.eval_string("user: ")
- self.model.eval_token(32003-2) # im_start
- self.model.eval_float(embd_image.T)
- for i in range(image_token_len-embd_image.shape[0]):
- self.model.eval_token(32003-3) # im_patch
- self.model.eval_token(32003-1) # im_end
- self.model.eval_string(question)
- self.model.eval_string("\nassistant: ")
- return self.model.generate_with_print()
-
-
-if __name__=="__main__":
-    # model from liuhaotian/LLaVA-13b-delta-v1-1
- a = Llava(["--model", "./models/ggml-llava-13b-v1.1.bin", "-c", "2048"])
- # Extract from https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/blob/main/pytorch_model-00003-of-00003.bin.
-    # Alternatively, pytorch_model-00003-of-00003.bin can be used directly.
- a.load_projection(os.path.join(
- os.path.dirname(__file__) ,
- "llava_projection.pth"))
-    response = a.chat_with_image(
- Image.open("./media/llama1-logo.png").convert('RGB'),
- "what is the text in the picture?")
-    response
- a.chat("what is the color of it?")
-
-
-
diff --git a/examples/embd-input/minigpt4.py b/examples/embd-input/minigpt4.py
deleted file mode 100755
index 7b13e4a5..00000000
--- a/examples/embd-input/minigpt4.py
+++ /dev/null
@@ -1,129 +0,0 @@
-#!/usr/bin/env python3
-import sys
-import os
-sys.path.insert(0, os.path.dirname(__file__))
-from embd_input import MyModel
-import numpy as np
-from torch import nn
-import torch
-from PIL import Image
-
-minigpt4_path = os.path.join(os.path.dirname(__file__), "MiniGPT-4")
-sys.path.insert(0, minigpt4_path)
-from minigpt4.models.blip2 import Blip2Base
-from minigpt4.processors.blip_processors import Blip2ImageEvalProcessor
-
-
-class MiniGPT4(Blip2Base):
- """
- MiniGPT4 model from https://github.com/Vision-CAIR/MiniGPT-4
- """
- def __init__(self,
- args,
- vit_model="eva_clip_g",
- q_former_model="https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xxl.pth",
- img_size=224,
- drop_path_rate=0,
- use_grad_checkpoint=False,
- vit_precision="fp32",
- freeze_vit=True,
- freeze_qformer=True,
- num_query_token=32,
- llama_model="",
- prompt_path="",
- prompt_template="",
- max_txt_len=32,
- end_sym='\n',
- low_resource=False, # use 8 bit and put vit in cpu
- device_8bit=0
- ):
- super().__init__()
- self.img_size = img_size
- self.low_resource = low_resource
- self.preprocessor = Blip2ImageEvalProcessor(img_size)
-
- print('Loading VIT')
- self.visual_encoder, self.ln_vision = self.init_vision_encoder(
- vit_model, img_size, drop_path_rate, use_grad_checkpoint, vit_precision
- )
- print('Loading VIT Done')
- print('Loading Q-Former')
- self.Qformer, self.query_tokens = self.init_Qformer(
- num_query_token, self.visual_encoder.num_features
- )
- self.Qformer.cls = None
- self.Qformer.bert.embeddings.word_embeddings = None
- self.Qformer.bert.embeddings.position_embeddings = None
- for layer in self.Qformer.bert.encoder.layer:
- layer.output = None
- layer.intermediate = None
- self.load_from_pretrained(url_or_filename=q_former_model)
- print('Loading Q-Former Done')
- self.llama_proj = nn.Linear(
- self.Qformer.config.hidden_size, 5120 # self.llama_model.config.hidden_size
- )
- self.max_txt_len = max_txt_len
- self.end_sym = end_sym
- self.model = MyModel(["main", *args])
- # system prompt
- self.model.eval_string("Give the following image: <Img>ImageContent</Img>. "
- "You will be able to see the image once I provide it to you. Please answer my questions."
- "###")
-
- def encode_img(self, image):
- image = self.preprocessor(image)
- image = image.unsqueeze(0)
- device = image.device
- if self.low_resource:
- self.vit_to_cpu()
- image = image.to("cpu")
-
- with self.maybe_autocast():
- image_embeds = self.ln_vision(self.visual_encoder(image)).to(device)
- image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(device)
-
- query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
- query_output = self.Qformer.bert(
- query_embeds=query_tokens,
- encoder_hidden_states=image_embeds,
- encoder_attention_mask=image_atts,
- return_dict=True,
- )
-
- inputs_llama = self.llama_proj(query_output.last_hidden_state)
- # atts_llama = torch.ones(inputs_llama.size()[:-1], dtype=torch.long).to(image.device)
- return inputs_llama
-
- def load_projection(self, path):
- state = torch.load(path)["model"]
- self.llama_proj.load_state_dict({
- "weight": state["llama_proj.weight"],
- "bias": state["llama_proj.bias"]})
-
- def chat(self, question):
- self.model.eval_string("Human: ")
- self.model.eval_string(question)
- self.model.eval_string("\n### Assistant:")
- return self.model.generate_with_print(end="###")
-
- def chat_with_image(self, image, question):
- with torch.no_grad():
- embd_image = self.encode_img(image)
- embd_image = embd_image.cpu().numpy()[0]
- self.model.eval_string("Human: <Img>")
- self.model.eval_float(embd_image.T)
- self.model.eval_string("</Img> ")
- self.model.eval_string(question)
- self.model.eval_string("\n### Assistant:")
- return self.model.generate_with_print(end="###")
-
-
-if __name__=="__main__":
- a = MiniGPT4(["--model", "./models/ggml-vicuna-13b-v0-q4_1.bin", "-c", "2048"])
- a.load_projection(os.path.join(
- os.path.dirname(__file__) ,
- "pretrained_minigpt4.pth"))
-    response = a.chat_with_image(
- Image.open("./media/llama1-logo.png").convert('RGB'),
- "what is the text in the picture?")
- a.chat("what is the color of it?")
diff --git a/examples/embd-input/panda_gpt.py b/examples/embd-input/panda_gpt.py
deleted file mode 100755
index 891ad7cc..00000000
--- a/examples/embd-input/panda_gpt.py
+++ /dev/null
@@ -1,99 +0,0 @@
-#!/usr/bin/env python3
-import sys
-import os
-sys.path.insert(0, os.path.dirname(__file__))
-from embd_input import MyModel
-import numpy as np
-from torch import nn
-import torch
-
-# use PandaGPT path
-panda_gpt_path = os.path.join(os.path.dirname(__file__), "PandaGPT")
-imagebind_ckpt_path = "./models/panda_gpt/"
-
-sys.path.insert(0, os.path.join(panda_gpt_path,"code","model"))
-from ImageBind.models import imagebind_model
-from ImageBind import data
-
-ModalityType = imagebind_model.ModalityType
-max_tgt_len = 400
-
-class PandaGPT:
- def __init__(self, args):
- self.visual_encoder,_ = imagebind_model.imagebind_huge(pretrained=True, store_path=imagebind_ckpt_path)
- self.visual_encoder.eval()
- self.llama_proj = nn.Linear(1024, 5120) # self.visual_hidden_size, 5120)
- self.max_tgt_len = max_tgt_len
- self.model = MyModel(["main", *args])
- self.generated_text = ""
- self.device = "cpu"
-
- def load_projection(self, path):
- state = torch.load(path, map_location="cpu")
- self.llama_proj.load_state_dict({
- "weight": state["llama_proj.weight"],
- "bias": state["llama_proj.bias"]})
-
- def eval_inputs(self, inputs):
- self.model.eval_string("<Img>")
- embds = self.extract_multimoal_feature(inputs)
- for i in embds:
- self.model.eval_float(i.T)
- self.model.eval_string("</Img> ")
-
- def chat(self, question):
- return self.chat_with_image(None, question)
-
- def chat_with_image(self, inputs, question):
- if self.generated_text == "":
- self.model.eval_string("###")
- self.model.eval_string(" Human: ")
- if inputs:
- self.eval_inputs(inputs)
- self.model.eval_string(question)
- self.model.eval_string("\n### Assistant:")
- ret = self.model.generate_with_print(end="###")
- self.generated_text += ret
- return ret
-
- def extract_multimoal_feature(self, inputs):
- features = []
- for key in ["image", "audio", "video", "thermal"]:
- if key + "_paths" in inputs:
- embeds = self.encode_data(key, inputs[key+"_paths"])
- features.append(embeds)
- return features
-
- def encode_data(self, data_type, data_paths):
-
- type_map = {
- "image": ModalityType.VISION,
- "audio": ModalityType.AUDIO,
- "video": ModalityType.VISION,
- "thermal": ModalityType.THERMAL,
- }
- load_map = {
- "image": data.load_and_transform_vision_data,
- "audio": data.load_and_transform_audio_data,
- "video": data.load_and_transform_video_data,
- "thermal": data.load_and_transform_thermal_data
- }
-
- load_function = load_map[data_type]
- key = type_map[data_type]
-
- inputs = {key: load_function(data_paths, self.device)}
- with torch.no_grad():
- embeddings = self.visual_encoder(inputs)
- embeds = embeddings[key]
- embeds = self.llama_proj(embeds).cpu().numpy()
- return embeds
-
-
-if __name__=="__main__":
- a = PandaGPT(["--model", "./models/ggml-vicuna-13b-v0-q4_1.bin", "-c", "2048", "--lora", "./models/panda_gpt/ggml-adapter-model.bin","--temp", "0"])
- a.load_projection("./models/panda_gpt/adapter_model.bin")
- a.chat_with_image(
- {"image_paths": ["./media/llama1-logo.png"]},
- "what is the text in the picture? 'llama' or 'lambda'?")
- a.chat("what is the color of it?")