summary refs log tree commit diff
path: root/examples
diff options
context:
space:
mode:
Diffstat (limited to 'examples')
-rwxr-xr-x examples/chat-13B.sh 4
-rw-r--r-- examples/common.cpp 7
-rw-r--r-- examples/common.h 1
-rw-r--r-- examples/main/main.cpp 89
4 files changed, 99 insertions, 2 deletions
diff --git a/examples/chat-13B.sh b/examples/chat-13B.sh
index 4265d7b6..2fac3778 100755
--- a/examples/chat-13B.sh
+++ b/examples/chat-13B.sh
@@ -31,8 +31,6 @@ The transcript only includes text, it does not include markup like HTML and Mark
$USER_NAME: Hello, $AI_NAME!
$AI_NAME: Hello $USER_NAME! How may I help you today?
-$USER_NAME: What time is it?
-$AI_NAME: It is $(date +%H:%M).
$USER_NAME: What year is it?
$AI_NAME: We are in $(date +%Y).
$USER_NAME: Please tell me the largest city in Europe.
@@ -50,4 +48,6 @@ $AI_NAME: The arguments are stored in process.argv.
argv[3] is the second argument passed to the script and so on.
$USER_NAME: Name a color.
$AI_NAME: Blue
+$USER_NAME: What time is it?
+$AI_NAME: It is $(date +%H:%M).
$USER_NAME:" "$@"
diff --git a/examples/common.cpp b/examples/common.cpp
index c0e87eb9..9f10dc26 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -61,6 +61,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
break;
}
params.prompt = argv[i];
+ } else if (arg == "--session") {
+ if (++i >= argc) {
+ invalid_param = true;
+ break;
+ }
+ params.path_session = argv[i];
} else if (arg == "-f" || arg == "--file") {
if (++i >= argc) {
invalid_param = true;
@@ -228,6 +234,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
fprintf(stderr, " -p PROMPT, --prompt PROMPT\n");
fprintf(stderr, " prompt to start generation with (default: empty)\n");
+ fprintf(stderr, " --session FNAME file to cache model state in (may be large!) (default: none)\n");
fprintf(stderr, " --random-prompt start with a randomized prompt.\n");
fprintf(stderr, " --in-prefix STRING string to prefix user inputs with (default: empty)\n");
fprintf(stderr, " -f FNAME, --file FNAME\n");
diff --git a/examples/common.h b/examples/common.h
index 6f26b514..9d3697d7 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -31,6 +31,7 @@ struct gpt_params {
std::string model = "models/lamma-7B/ggml-model.bin"; // model path
std::string prompt = "";
+ std::string path_session = ""; // path to file for saving/loading model eval state
std::string input_prefix = ""; // string to prefix user inputs with
std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index f9c9e9d9..fda65574 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -157,6 +157,32 @@ int main(int argc, char ** argv) {
// Add a space in front of the first character to match OG llama tokenizer behavior
params.prompt.insert(0, 1, ' ');
+ std::string path_session = params.path_session;
+ std::vector<llama_token> session_tokens;
+
+ if (!path_session.empty()) {
+ fprintf(stderr, "%s: attempting to load saved session from %s..\n", __func__, path_session.c_str());
+
+ // REVIEW - fopen to check for existing session
+ FILE * fp = std::fopen(path_session.c_str(), "rb");
+ if (fp != NULL) {
+ std::fclose(fp);
+
+ session_tokens.resize(params.n_ctx);
+ size_t n_token_count_out = 0;
+ const size_t n_session_bytes = llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out);
+ session_tokens.resize(n_token_count_out);
+
+ if (n_session_bytes > 0) {
+ fprintf(stderr, "%s: loaded %zu bytes of session data!\n", __func__, n_session_bytes);
+ } else {
+ fprintf(stderr, "%s: could not load session file, will recreate\n", __func__);
+ }
+ } else {
+ fprintf(stderr, "%s: session file does not exist, will create\n", __func__);
+ }
+ }
+
// tokenize the prompt
auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
@@ -167,6 +193,26 @@ int main(int argc, char ** argv) {
return 1;
}
+ // debug message about similarity of saved session, if applicable
+ size_t n_matching_session_tokens = 0;
+ if (session_tokens.size()) {
+ for (llama_token id : session_tokens) {
+ if (n_matching_session_tokens >= embd_inp.size() || id != embd_inp[n_matching_session_tokens]) {
+ break;
+ }
+ n_matching_session_tokens++;
+ }
+ if (n_matching_session_tokens >= embd_inp.size()) {
+ fprintf(stderr, "%s: session file has exact match for prompt!\n", __func__);
+ } else if (n_matching_session_tokens < (embd_inp.size() / 2)) {
+ fprintf(stderr, "%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
+ __func__, n_matching_session_tokens, embd_inp.size());
+ } else {
+ fprintf(stderr, "%s: session file matches %zu / %zu tokens of prompt\n",
+ __func__, n_matching_session_tokens, embd_inp.size());
+ }
+ }
+
// number of tokens to keep when resetting context
if (params.n_keep < 0 || params.n_keep > (int)embd_inp.size() || params.instruct) {
params.n_keep = (int)embd_inp.size();
@@ -252,9 +298,16 @@ int main(int argc, char ** argv) {
bool is_antiprompt = false;
bool input_noecho = false;
+ // HACK - because session saving incurs a non-negligible delay, for now skip re-saving session
+ // if we loaded a session with at least 75% similarity. It's currently just used to speed up the
+ // initial prompt so it doesn't need to be an exact match.
+ bool need_to_save_session = !path_session.empty() && n_matching_session_tokens < (embd_inp.size() * 3 / 4);
+
+
int n_past = 0;
int n_remain = params.n_predict;
int n_consumed = 0;
+ int n_session_consumed = 0;
// the first thing we will do is to output the prompt, so set color accordingly
set_console_color(con_st, CONSOLE_COLOR_PROMPT);
@@ -276,6 +329,9 @@ int main(int argc, char ** argv) {
// insert n_left/2 tokens at the start of embd from last_n_tokens
embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size());
+ // REVIEW - stop saving session if we run out of context
+ path_session = "";
+
//printf("\n---\n");
//printf("resetting: '");
//for (int i = 0; i < (int) embd.size(); i++) {
@@ -285,6 +341,28 @@ int main(int argc, char ** argv) {
//printf("\n---\n");
}
+ // try to reuse a matching prefix from the loaded session instead of re-eval (via n_past)
+ // REVIEW
+ if (n_session_consumed < (int) session_tokens.size()) {
+ size_t i = 0;
+ for ( ; i < embd.size(); i++) {
+ if (embd[i] != session_tokens[n_session_consumed]) {
+ session_tokens.resize(n_session_consumed);
+ break;
+ }
+
+ n_past++;
+ n_session_consumed++;
+
+ if (n_session_consumed >= (int) session_tokens.size()) {
+ break;
+ }
+ }
+ if (i > 0) {
+ embd.erase(embd.begin(), embd.begin() + i);
+ }
+ }
+
// evaluate tokens in batches
// embd is typically prepared beforehand to fit within a batch, but not always
for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
@@ -298,6 +376,11 @@ int main(int argc, char ** argv) {
}
n_past += n_eval;
}
+
+ if (embd.size() > 0 && !path_session.empty()) {
+ session_tokens.insert(session_tokens.end(), embd.begin(), embd.end());
+ n_session_consumed = session_tokens.size();
+ }
}
embd.clear();
@@ -309,6 +392,12 @@ int main(int argc, char ** argv) {
const float temp = params.temp;
const float repeat_penalty = params.repeat_penalty;
+ // optionally save the session on first sample (for faster prompt loading next time)
+ if (!path_session.empty() && need_to_save_session) {
+ need_to_save_session = false;
+ llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
+ }
+
llama_token id = 0;
{