author | Didzis Gosko <didzis@users.noreply.github.com> | 2023-06-24 11:47:58 +0300
---|---|---
committer | GitHub <noreply@github.com> | 2023-06-24 11:47:58 +0300
commit | 527b6fba1d237befb324fd846bda7418c0fa394d (patch) |
tree | 360b44abac0c9a53739444b8ba9e4ccf903938cd /examples/quantize-stats/quantize-stats.cpp |
parent | d7b7484f74d486f77feb4c0b7af7e1718ed91651 (diff) |
llama : make model stateless and context stateful (llama_state) (#1797)
* llama : make model stateless and context stateful
* llama : minor cleanup
* llama : update internal API declaration
* Apply suggestions from code review
fix style
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Missing model memory release
* Fix style
* Add deprecated warning for public API function llama_init_from_file
* Update public API use cases: move away from deprecated llama_init_from_file
* Deprecate public API function llama_apply_lora_from_file
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
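The gist of the change: the old single call `llama_init_from_file` is split into a model-loading step and a context-creation step, so the weights become a stateless, shareable `llama_model` and all mutable state lives in the `llama_context`. A minimal sketch of the new pattern follows; the four API calls are the ones the diff below actually uses, while `llama_context_default_params()` is assumed from the contemporary llama.h:

```cpp
// Sketch of the post-#1797 two-step initialization.
// Assumption: llama_context_default_params() exists in this era's llama.h.
#include "llama.h"
#include <cstdio>

int main(int argc, char ** argv) {
    const char * path = argc > 1 ? argv[1] : "model.bin";

    llama_context_params lparams = llama_context_default_params();

    // step 1: load the (now stateless, shareable) model weights
    llama_model * model = llama_load_model_from_file(path, lparams);
    if (model == NULL) {
        fprintf(stderr, "failed to load model '%s'\n", path);
        return 1;
    }

    // step 2: create a (stateful) context on top of the model;
    // several contexts can share the same model
    llama_context * ctx = llama_new_context_with_model(model, lparams);
    if (ctx == NULL) {
        fprintf(stderr, "failed to create context\n");
        llama_free_model(model);
        return 1;
    }

    // ... evaluation / sampling ...

    // teardown in reverse order: context first, then the model
    llama_free(ctx);
    llama_free_model(model);
    return 0;
}
```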
Diffstat (limited to 'examples/quantize-stats/quantize-stats.cpp')
-rw-r--r-- | examples/quantize-stats/quantize-stats.cpp | 15
1 file changed, 13 insertions(+), 2 deletions(-)
```diff
diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp
index 6b8018ee..9cea472d 100644
--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@@ -320,6 +320,7 @@ int main(int argc, char ** argv) {
     fprintf(stderr, "Loading model\n");
 
     const int64_t t_main_start_us = ggml_time_us();
+    llama_model * model;
     llama_context * ctx;
 
     {
@@ -330,12 +331,20 @@ int main(int argc, char ** argv) {
         lparams.f16_kv    = false;
         lparams.use_mlock = false;
 
-        ctx = llama_init_from_file(params.model.c_str(), lparams);
+        model = llama_load_model_from_file(params.model.c_str(), lparams);
 
-        if (ctx == NULL) {
+        if (model == NULL) {
             fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
             return 1;
         }
+
+        ctx = llama_new_context_with_model(model, lparams);
+
+        if (ctx == NULL) {
+            fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
+            llama_free_model(model);
+            return 1;
+        }
     }
 
     const auto &tensors = llama_internal_get_tensor_map(ctx);
@@ -357,6 +366,7 @@ int main(int argc, char ** argv) {
             fprintf(stderr, "%s: error: Quantization should be tested with a float model, "
                 "this model contains already quantized layers (%s is type %d)\n", __func__, kv_tensor.first.c_str(), kv_tensor.second->type);
             llama_free(ctx);
+            llama_free_model(model);
             return 1;
         }
         included_layers++;
@@ -415,6 +425,7 @@ int main(int argc, char ** argv) {
     llama_free(ctx);
+    llama_free_model(model);
 
     // report timing
     {
         const int64_t t_main_end_us = ggml_time_us();
```
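Per the commit message, `llama_init_from_file` is kept but marked deprecated rather than removed. A hypothetical shim in that spirit would simply compose the two new calls behind the old signature; this is an illustrative sketch, not the patch's actual implementation:

```cpp
// Hypothetical compatibility shim (illustrative, not the patch's code):
// keep the old one-call entry point working via the two new calls.
llama_context * llama_init_from_file_compat(const char * path_model, llama_context_params params) {
    llama_model * model = llama_load_model_from_file(path_model, params);
    if (model == NULL) {
        return NULL;
    }

    llama_context * ctx = llama_new_context_with_model(model, params);
    if (ctx == NULL) {
        // context creation failed: release the weights before bailing out
        llama_free_model(model);
        return NULL;
    }

    return ctx;
}
```

One wrinkle such a shim creates: a caller holding only the returned `ctx` has no model handle, so the context would need to record that it owns the model and release it from `llama_free`; that bookkeeping is omitted above.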