Diffstat (limited to 'examples')
-rw-r--r--   examples/common.cpp                          |  9
-rw-r--r--   examples/common.h                            |  1
-rw-r--r--   examples/embedding/embedding.cpp             |  1
-rw-r--r--   examples/main/main.cpp                       |  1
-rw-r--r--   examples/perplexity/perplexity.cpp           |  1
-rw-r--r--   examples/quantize-stats/quantize-stats.cpp   |  9
6 files changed, 14 insertions, 8 deletions
diff --git a/examples/common.cpp b/examples/common.cpp
index b27aa6cf..f909eed2 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -1,7 +1,5 @@
 #include "common.h"
 
-#include "ggml.h"
-
 #include <cassert>
 #include <cstring>
 #include <fstream>
@@ -161,6 +159,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             params.use_color = true;
         } else if (arg == "--mlock") {
             params.use_mlock = true;
+        } else if (arg == "--no-mmap") {
+            params.use_mmap = false;
         } else if (arg == "--mtest") {
             params.mem_test = true;
         } else if (arg == "--verbose-prompt") {
@@ -240,9 +240,12 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, "  -b N, --batch_size N  batch size for prompt processing (default: %d)\n", params.n_batch);
     fprintf(stderr, "  --perplexity          compute perplexity over the prompt\n");
     fprintf(stderr, "  --keep                number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
-    if (ggml_mlock_supported()) {
+    if (llama_mlock_supported()) {
         fprintf(stderr, "  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
     }
+    if (llama_mmap_supported()) {
+        fprintf(stderr, "  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
+    }
     fprintf(stderr, "  --mtest               compute maximum memory usage\n");
     fprintf(stderr, "  --verbose-prompt      print prompt before generation\n");
     fprintf(stderr, "  -m FNAME, --model FNAME\n");
diff --git a/examples/common.h b/examples/common.h
index 7a8848f9..1ea6f744 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -47,6 +47,7 @@ struct gpt_params {
     bool instruct       = false; // instruction mode (used for Alpaca models)
     bool ignore_eos     = false; // do not stop generating after eos
     bool perplexity     = false; // compute perplexity over the prompt
+    bool use_mmap       = true;  // use mmap for faster loads
     bool use_mlock      = false; // use mlock to keep model in memory
     bool mem_test       = false; // compute maximum memory usage
     bool verbose_prompt = false; // print prompt tokens before generation
diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
index d397f35f..2eda3ac0 100644
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -38,6 +38,7 @@ int main(int argc, char ** argv) {
         lparams.seed       = params.seed;
         lparams.f16_kv     = params.memory_f16;
         lparams.logits_all = params.perplexity;
+        lparams.use_mmap   = params.use_mmap;
         lparams.use_mlock  = params.use_mlock;
         lparams.embedding  = params.embedding;
 
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index d59eeb45..d333d0db 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -97,6 +97,7 @@ int main(int argc, char ** argv) {
         lparams.n_parts    = params.n_parts;
         lparams.seed       = params.seed;
         lparams.f16_kv     = params.memory_f16;
+        lparams.use_mmap   = params.use_mmap;
         lparams.use_mlock  = params.use_mlock;
 
         ctx = llama_init_from_file(params.model.c_str(), lparams);
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index 07ed0a82..b62f00d0 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -115,6 +115,7 @@ int main(int argc, char ** argv) {
         lparams.seed       = params.seed;
         lparams.f16_kv     = params.memory_f16;
         lparams.logits_all = params.perplexity;
+        lparams.use_mmap   = params.use_mmap;
         lparams.use_mlock  = params.use_mlock;
         lparams.embedding  = params.embedding;
 
diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp
index af1e6272..203bfe8c 100644
--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@@ -1,5 +1,6 @@
 #include "ggml.h"
 #include "llama.h"
+#include "llama_internal.h"
 
 #include <algorithm>
 #include <cassert>
@@ -266,15 +267,13 @@ int main(int argc, char ** argv) {
         }
     }
 
-    // Sort tensors for consistent output
-    const auto tensors = llama_internal_get_tensor_map(ctx);
-    std::map<std::string, struct ggml_tensor *> tensors_sorted { tensors.begin(), tensors.end() };
+    const auto &tensors = llama_internal_get_tensor_map(ctx);
 
     // check layer tensors
     int included_layers = 0;
     int64_t max_nelements = 0;
     bool is_f16 = false;
-    for (const auto& kv_tensor : tensors_sorted) {
+    for (const auto& kv_tensor : tensors) {
         if (!layer_included(params, kv_tensor.first)) {
             continue;
         }
@@ -315,7 +314,7 @@ int main(int argc, char ** argv) {
 
     error_stats global_stats {};
 
-    for (const auto& kv_tensor : tensors_sorted) {
+    for (const auto& kv_tensor : tensors) {
         if (!layer_included(params, kv_tensor.first)) {
             continue;
         }
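The pattern the examples share after this change is: parse --no-mmap / --mlock into gpt_params, copy them into llama_context_params, and hand that to llama_init_from_file. Below is a minimal standalone sketch of that flow; it is not part of the commit. The API names (llama_context_default_params, llama_mmap_supported, llama_init_from_file, llama_free) follow the diff above and the llama.cpp C API of this period, while the default model path and error handling are illustrative assumptions only.

// Minimal usage sketch (not part of the commit): wiring the new use_mmap
// option through to context creation. API names follow the diff above;
// the model path and error handling are illustrative assumptions.
#include "llama.h"

#include <cstdio>

int main(int argc, char ** argv) {
    const char * model = argc > 1 ? argv[1] : "models/7B/ggml-model-q4_0.bin";

    auto lparams = llama_context_default_params();

    // Memory-map the model when the platform supports it (the examples'
    // default); --no-mmap in the examples simply sets this to false.
    lparams.use_mmap  = llama_mmap_supported();
    lparams.use_mlock = false; // set true to pin the model in RAM (--mlock)

    llama_context * ctx = llama_init_from_file(model, lparams);
    if (ctx == NULL) {
        fprintf(stderr, "failed to load model '%s'\n", model);
        return 1;
    }

    // ... tokenize and evaluate with ctx here ...

    llama_free(ctx);
    return 0;
}

From the command line, the same behaviour is reached through the flags added in this commit, e.g. ./main -m models/7B/ggml-model-q4_0.bin --no-mmap to disable memory mapping, or --mlock to keep the model resident.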