author    Didzis Gosko <didzis@users.noreply.github.com>    2023-06-24 11:47:58 +0300
committer GitHub <noreply@github.com>                       2023-06-24 11:47:58 +0300
commit    527b6fba1d237befb324fd846bda7418c0fa394d (patch)
tree      360b44abac0c9a53739444b8ba9e4ccf903938cd /examples/quantize-stats
parent    d7b7484f74d486f77feb4c0b7af7e1718ed91651 (diff)
llama : make model stateless and context stateful (llama_state) (#1797)
* llama : make model stateless and context stateful

* llama : minor cleanup

* llama : update internal API declaration

* Apply suggestions from code review

fix style

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Missing model memory release

* Fix style

* Add deprecated warning for public API function llama_init_from_file

* Update public API use cases: move away from deprecated llama_init_from_file

* Deprecate public API function llama_apply_lora_from_file

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
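In short, initialization moves from a single llama_init_from_file call to a load-model step plus a create-context step. Below is a minimal sketch of the new lifecycle, assuming a llama.cpp checkout from around this commit; it uses only functions the diff itself exercises, but the argv handling and the placeholder comment are illustrative additions, not code from the commit:

// sketch: the two-step initialization this commit introduces
#include "llama.h"
#include <cstdio>

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s <model>\n", argv[0]);
        return 1;
    }

    llama_context_params lparams = llama_context_default_params();

    // step 1: load the stateless weights
    llama_model * model = llama_load_model_from_file(argv[1], lparams);
    if (model == NULL) {
        return 1;
    }

    // step 2: build a stateful context on top of the model
    llama_context * ctx = llama_new_context_with_model(model, lparams);
    if (ctx == NULL) {
        llama_free_model(model);
        return 1;
    }

    // ... use ctx ...

    llama_free(ctx);         // free the context first
    llama_free_model(model); // then release the weights
    return 0;
}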
Diffstat (limited to 'examples/quantize-stats')
-rw-r--r--  examples/quantize-stats/quantize-stats.cpp | 15
1 file changed, 13 insertions(+), 2 deletions(-)
diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp
index 6b8018ee..9cea472d 100644
--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@@ -320,6 +320,7 @@ int main(int argc, char ** argv) {
     fprintf(stderr, "Loading model\n");
 
     const int64_t t_main_start_us = ggml_time_us();
+    llama_model * model;
     llama_context * ctx;
 
     {
@@ -330,12 +331,20 @@ int main(int argc, char ** argv) {
         lparams.f16_kv = false;
         lparams.use_mlock = false;
 
-        ctx = llama_init_from_file(params.model.c_str(), lparams);
+        model = llama_load_model_from_file(params.model.c_str(), lparams);
 
-        if (ctx == NULL) {
+        if (model == NULL) {
             fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
             return 1;
         }
+
+        ctx = llama_new_context_with_model(model, lparams);
+
+        if (ctx == NULL) {
+            fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
+            llama_free_model(model);
+            return 1;
+        }
     }
 
     const auto &tensors = llama_internal_get_tensor_map(ctx);
@@ -357,6 +366,7 @@ int main(int argc, char ** argv) {
             fprintf(stderr, "%s: error: Quantization should be tested with a float model, "
                     "this model contains already quantized layers (%s is type %d)\n", __func__, kv_tensor.first.c_str(), kv_tensor.second->type);
             llama_free(ctx);
+            llama_free_model(model);
             return 1;
         }
         included_layers++;
@@ -415,6 +425,7 @@ int main(int argc, char ** argv) {
     llama_free(ctx);
+    llama_free_model(model);
 
     // report timing
     {
         const int64_t t_main_end_us = ggml_time_us();
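Note the teardown order the hunks establish: the context is freed before the model, and every early-return path releases both objects. As a follow-on sketch, a now-stateless model can back more than one context; this use case is an assumption enabled by the new API split, not something this commit itself exercises:

// sketch (assumed use case): two independent contexts sharing one stateless model
#include "llama.h"

static void two_sessions(const char * path) {
    llama_context_params lparams = llama_context_default_params();

    llama_model * model = llama_load_model_from_file(path, lparams);
    if (model == NULL) {
        return;
    }

    // each context carries its own per-session state (e.g. the KV cache),
    // while both read from the same shared weights
    llama_context * ctx_a = llama_new_context_with_model(model, lparams);
    llama_context * ctx_b = llama_new_context_with_model(model, lparams);

    if (ctx_a) llama_free(ctx_a);
    if (ctx_b) llama_free(ctx_b);
    llama_free_model(model); // release the weights only after all contexts are gone
}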