author    Didzis Gosko <didzis@users.noreply.github.com>    2023-06-24 11:47:58 +0300
committer GitHub <noreply@github.com>                       2023-06-24 11:47:58 +0300
commit    527b6fba1d237befb324fd846bda7418c0fa394d (patch)
tree      360b44abac0c9a53739444b8ba9e4ccf903938cd /examples/quantize-stats
parent    d7b7484f74d486f77feb4c0b7af7e1718ed91651 (diff)
llama : make model stateless and context stateful (llama_state) (#1797)
* llama : make model stateless and context stateful

* llama : minor cleanup

* llama : update internal API declaration

* Apply suggestions from code review

fix style

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Missing model memory release

* Fix style

* Add deprecated warning for public API function llama_init_from_file

* Update public API use cases: move away from deprecated llama_init_from_file

* Deprecate public API function llama_apply_lora_from_file

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
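In short, initialization moves from a single llama_init_from_file call to a load-model step plus a create-context step. Below is a minimal sketch of the new lifecycle, assuming a llama.cpp checkout from around this commit; it uses only functions the diff itself exercises, but the argv handling and the placeholder comment are illustrative additions, not code from the commit:

// sketch: the two-step initialization this commit introduces
#include "llama.h"
#include <cstdio>

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s <model>\n", argv[0]);
        return 1;
    }

    llama_context_params lparams = llama_context_default_params();

    // step 1: load the stateless weights
    llama_model * model = llama_load_model_from_file(argv[1], lparams);
    if (model == NULL) {
        return 1;
    }

    // step 2: build a stateful context on top of the model
    llama_context * ctx = llama_new_context_with_model(model, lparams);
    if (ctx == NULL) {
        llama_free_model(model);
        return 1;
    }

    // ... use ctx ...

    llama_free(ctx);         // free the context first
    llama_free_model(model); // then release the weights
    return 0;
}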
Diffstat (limited to 'examples/quantize-stats')
-rw-r--r--  examples/quantize-stats/quantize-stats.cpp | 15
1 file changed, 13 insertions(+), 2 deletions(-)
diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp
index 6b8018ee..9cea472d 100644
--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@@ -320,6 +320,7 @@ int main(int argc, char ** argv) {
     fprintf(stderr, "Loading model\n");
 
     const int64_t t_main_start_us = ggml_time_us();
+    llama_model * model;
     llama_context * ctx;
 
     {
@@ -330,12 +331,20 @@ int main(int argc, char ** argv) {
         lparams.f16_kv = false;
         lparams.use_mlock = false;
 
-        ctx = llama_init_from_file(params.model.c_str(), lparams);
+        model = llama_load_model_from_file(params.model.c_str(), lparams);
 
-        if (ctx == NULL) {
+        if (model == NULL) {
             fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
             return 1;
         }
+
+        ctx = llama_new_context_with_model(model, lparams);
+
+        if (ctx == NULL) {
+            fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
+            llama_free_model(model);
+            return 1;
+        }
     }
 
     const auto &tensors = llama_internal_get_tensor_map(ctx);
@@ -357,6 +366,7 @@ int main(int argc, char ** argv) {
             fprintf(stderr, "%s: error: Quantization should be tested with a float model, "
                     "this model contains already quantized layers (%s is type %d)\n", __func__, kv_tensor.first.c_str(), kv_tensor.second->type);
             llama_free(ctx);
+            llama_free_model(model);
             return 1;
         }
         included_layers++;
@@ -415,6 +425,7 @@ int main(int argc, char ** argv) {
     llama_free(ctx);
+    llama_free_model(model);
 
     // report timing
     {
         const int64_t t_main_end_us = ggml_time_us();
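Note the teardown order the hunks establish: the context is freed before the model, and every early-return path releases both objects. As a follow-on sketch, a now-stateless model can back more than one context; this use case is an assumption enabled by the new API split, not something this commit itself exercises:

// sketch (assumed use case): two independent contexts sharing one stateless model
#include "llama.h"

static void two_sessions(const char * path) {
    llama_context_params lparams = llama_context_default_params();

    llama_model * model = llama_load_model_from_file(path, lparams);
    if (model == NULL) {
        return;
    }

    // each context carries its own per-session state (e.g. the KV cache),
    // while both read from the same shared weights
    llama_context * ctx_a = llama_new_context_with_model(model, lparams);
    llama_context * ctx_b = llama_new_context_with_model(model, lparams);

    if (ctx_a) llama_free(ctx_a);
    if (ctx_b) llama_free(ctx_b);
    llama_free_model(model); // release the weights only after all contexts are gone
}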