author     slaren <slarengh@gmail.com>      2023-09-28 21:42:38 +0200
committer  GitHub <noreply@github.com>      2023-09-28 22:42:38 +0300
commit     16bc66d9479edd5ee12ec734973554d4493c5dfa
tree       4cca787ebd86dd55fd176d27112117c74e9b34c6 /examples/embd-input
parent     0512d66670de3f650c579519833c085014b0f200
llama.cpp : split llama_context_params into model and context params (#3301)
* llama.cpp : split llama_context_params into model and context params
ggml-ci
* fix metal build
* fix freq_base/scale default to model value
* llama-bench : keep the same model between tests when possible
* move n_threads to llama_context_params, add n_threads_batch
* fix mpi build
* remove kv_size(), cuda scratch fixes
* remove low-vram option
* add n_threads_batch to system info, refactor to get_system_info()
* add documentation about --threads-batch to the READMEs
* llama-bench fix
* main : fix rope freq/scale warning
* llama.cpp : add llama_get_model
common : add llama_tokenize from model
* remove duplicated ctx/model functions
ggml-ci
* cuda : print total VRAM used
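
The gist of the split, as a minimal sketch against the llama.h of this commit; the model path and the parameter values below are placeholders, not taken from the diff:

// model-wide settings now live in llama_model_params; per-context settings
// (n_ctx, n_batch, n_threads, n_threads_batch, ...) in llama_context_params
#include "llama.h"

int main(void) {
    llama_backend_init(false);

    llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = 0;                      // model-level: layer offloading

    llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx           = 2048;                // context-level: KV cache size
    cparams.n_threads       = 4;                   // threads for single-token generation
    cparams.n_threads_batch = 8;                   // threads for batch/prompt processing

    llama_model   * model = llama_load_model_from_file("model.gguf", mparams);
    llama_context * ctx   = llama_new_context_with_model(model, cparams);

    // the context no longer answers model-level questions directly;
    // query the model via llama_get_model (see the hunks below)
    const int n_embd  = llama_n_embd (llama_get_model(ctx));
    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
    (void) n_embd; (void) n_vocab;

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}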
Diffstat (limited to 'examples/embd-input')
 examples/embd-input/embd-input-lib.cpp  | 13 ++++++-------
 examples/embd-input/embd-input-test.cpp |  2 +-
 2 files changed, 7 insertions(+), 8 deletions(-)
diff --git a/examples/embd-input/embd-input-lib.cpp b/examples/embd-input/embd-input-lib.cpp
index 9bd4d347..99e6bdad 100644
--- a/examples/embd-input/embd-input-lib.cpp
+++ b/examples/embd-input/embd-input-lib.cpp
@@ -48,8 +48,7 @@ struct MyModel* create_mymodel(int argc, char ** argv) {
     // print system information
     {
         fprintf(stderr, "\n");
-        fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
-                params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
+        fprintf(stderr, "%s\n", get_system_info(params).c_str());
     }
     struct MyModel * ret = new MyModel();
     ret->ctx = ctx;
@@ -71,7 +70,7 @@ bool eval_float(void * model, float * input, int N){
     MyModel * mymodel = (MyModel*)model;
     llama_context * ctx = mymodel->ctx;
     gpt_params params = mymodel->params;
-    int n_emb = llama_n_embd(ctx);
+    int n_emb = llama_n_embd(llama_get_model(ctx));
     int n_past = mymodel->n_past;
     int n_batch = N; // params.n_batch;
 
@@ -81,7 +80,7 @@ bool eval_float(void * model, float * input, int N){
             n_eval = n_batch;
         }
         llama_batch batch = { int32_t(n_eval), nullptr, (input+i*n_emb), nullptr, nullptr, nullptr, n_past, 1, 0, };
-        if (llama_decode(ctx, batch, params.n_threads)) {
+        if (llama_decode(ctx, batch)) {
             fprintf(stderr, "%s : failed to eval\n", __func__);
             return false;
         }
@@ -102,7 +101,7 @@ bool eval_tokens(void * model, std::vector<llama_token> tokens) {
         if (n_eval > params.n_batch) {
             n_eval = params.n_batch;
         }
-        if (llama_decode(ctx, llama_batch_get_one(&tokens[i], n_eval, n_past, 0), params.n_threads)) {
+        if (llama_decode(ctx, llama_batch_get_one(&tokens[i], n_eval, n_past, 0))) {
             fprintf(stderr, "%s : failed to eval\n", __func__);
             return false;
         }
@@ -133,7 +132,7 @@ llama_token sampling_id(struct MyModel* mymodel) {
 
     // out of user input, sample next token
     const float   temp      = params.temp;
-    const int32_t top_k     = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
+    const int32_t top_k     = params.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx)) : params.top_k;
     const float   top_p     = params.top_p;
     const float   tfs_z     = params.tfs_z;
     const float   typical_p = params.typical_p;
@@ -149,7 +148,7 @@ llama_token sampling_id(struct MyModel* mymodel) {
     llama_token id = 0;
     {
         auto logits  = llama_get_logits(ctx);
-        auto n_vocab = llama_n_vocab(ctx);
+        auto n_vocab = llama_n_vocab(llama_get_model(ctx));
 
         // Apply params.logit_bias map
         for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
diff --git a/examples/embd-input/embd-input-test.cpp b/examples/embd-input/embd-input-test.cpp
index e5e040f6..dc4a0e48 100644
--- a/examples/embd-input/embd-input-test.cpp
+++ b/examples/embd-input/embd-input-test.cpp
@@ -8,7 +8,7 @@ int main(int argc, char** argv) {
     auto mymodel = create_mymodel(argc, argv);
     int N = 10;
     int max_tgt_len = 500;
-    int n_embd = llama_n_embd(mymodel->ctx);
+    int n_embd = llama_n_embd(llama_get_model(mymodel->ctx));
 
     // add random float embd to test evaluation
     float * data = new float[N*n_embd];
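
A side note on eval_float() above: because embd-input feeds raw embeddings instead of token ids, it builds the llama_batch by aggregate initialization, leaving token null and pointing embd at the float buffer. The same initializer spelled out with the field names from this commit's llama.h (a sketch; n_eval, n_emb, input, i and n_past are assumed to be set up as in the hunk above):

    llama_batch batch = {
        /* .n_tokens   = */ int32_t(n_eval),
        /* .token      = */ nullptr,          // no token ids: embedding input
        /* .embd       = */ input + i*n_emb,  // n_eval rows of n_emb floats
        /* .pos        = */ nullptr,          // derived from all_pos_0/all_pos_1
        /* .seq_id     = */ nullptr,          // derived from all_seq_id
        /* .logits     = */ nullptr,          // default: logits for the last token only
        /* .all_pos_0  = */ n_past,           // position of the first token
        /* .all_pos_1  = */ 1,                // position stride
        /* .all_seq_id = */ 0,                // single sequence
    };
    if (llama_decode(ctx, batch)) {           // thread counts now come from the context
        fprintf(stderr, "%s : failed to eval\n", __func__);
        return false;
    }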