Diffstat (limited to 'examples')
-rw-r--r--   examples/common.cpp                          9
-rw-r--r--   examples/common.h                            1
-rw-r--r--   examples/embedding/embedding.cpp             1
-rw-r--r--   examples/main/main.cpp                       1
-rw-r--r--   examples/perplexity/perplexity.cpp           1
-rw-r--r--   examples/quantize-stats/quantize-stats.cpp   9
6 files changed, 14 insertions, 8 deletions
diff --git a/examples/common.cpp b/examples/common.cpp
index b27aa6cf..f909eed2 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -1,7 +1,5 @@
#include "common.h"
-#include "ggml.h"
-
#include <cassert>
#include <cstring>
#include <fstream>
@@ -161,6 +159,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
params.use_color = true;
} else if (arg == "--mlock") {
params.use_mlock = true;
+ } else if (arg == "--no-mmap") {
+ params.use_mmap = false;
} else if (arg == "--mtest") {
params.mem_test = true;
} else if (arg == "--verbose-prompt") {
@@ -240,9 +240,12 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch);
fprintf(stderr, " --perplexity compute perplexity over the prompt\n");
fprintf(stderr, " --keep number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
- if (ggml_mlock_supported()) {
+ if (llama_mlock_supported()) {
fprintf(stderr, " --mlock force system to keep model in RAM rather than swapping or compressing\n");
}
+ if (llama_mmap_supported()) {
+ fprintf(stderr, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
+ }
fprintf(stderr, " --mtest compute maximum memory usage\n");
fprintf(stderr, " --verbose-prompt print prompt before generation\n");
fprintf(stderr, " -m FNAME, --model FNAME\n");
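
Sketch (not part of the diff): the hunks above wire a new --no-mmap switch into the same if/else-if chain that already handles --mlock, and print its usage line only when llama_mmap_supported() reports support. A minimal, self-contained illustration of that parsing pattern follows; gpt_params_sketch and parse_memory_flags are illustrative stand-ins, not the real gpt_params / gpt_params_parse, and only the two relevant booleans are reproduced.

#include <cstdio>
#include <cstring>

struct gpt_params_sketch {
    bool use_mmap  = true;   // mmap the model file for faster loads (default on)
    bool use_mlock = false;  // pin the model pages in RAM
};

static void parse_memory_flags(int argc, char ** argv, gpt_params_sketch & params) {
    for (int i = 1; i < argc; i++) {
        if (std::strcmp(argv[i], "--no-mmap") == 0) {
            params.use_mmap = false;     // disable memory-mapped loading
        } else if (std::strcmp(argv[i], "--mlock") == 0) {
            params.use_mlock = true;     // keep weights resident in RAM
        }
    }
}

int main(int argc, char ** argv) {
    gpt_params_sketch params;
    parse_memory_flags(argc, argv, params);
    std::printf("use_mmap=%d use_mlock=%d\n", params.use_mmap, params.use_mlock);
    return 0;
}
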
diff --git a/examples/common.h b/examples/common.h
index 7a8848f9..1ea6f744 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -47,6 +47,7 @@ struct gpt_params {
bool instruct = false; // instruction mode (used for Alpaca models)
bool ignore_eos = false; // do not stop generating after eos
bool perplexity = false; // compute perplexity over the prompt
+ bool use_mmap = true; // use mmap for faster loads
bool use_mlock = false; // use mlock to keep model in memory
bool mem_test = false; // compute maximum memory usage
bool verbose_prompt = false; // print prompt tokens before generation
diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
index d397f35f..2eda3ac0 100644
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -38,6 +38,7 @@ int main(int argc, char ** argv) {
lparams.seed = params.seed;
lparams.f16_kv = params.memory_f16;
lparams.logits_all = params.perplexity;
+ lparams.use_mmap = params.use_mmap;
lparams.use_mlock = params.use_mlock;
lparams.embedding = params.embedding;
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index d59eeb45..d333d0db 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -97,6 +97,7 @@ int main(int argc, char ** argv) {
lparams.n_parts = params.n_parts;
lparams.seed = params.seed;
lparams.f16_kv = params.memory_f16;
+ lparams.use_mmap = params.use_mmap;
lparams.use_mlock = params.use_mlock;
ctx = llama_init_from_file(params.model.c_str(), lparams);
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index 07ed0a82..b62f00d0 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -115,6 +115,7 @@ int main(int argc, char ** argv) {
lparams.seed = params.seed;
lparams.f16_kv = params.memory_f16;
lparams.logits_all = params.perplexity;
+ lparams.use_mmap = params.use_mmap;
lparams.use_mlock = params.use_mlock;
lparams.embedding = params.embedding;
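
Sketch (not part of the diff): embedding.cpp, main.cpp, and perplexity.cpp all gain the same one-line plumbing step, copying params.use_mmap into the llama_context_params passed to llama_init_from_file. The shared pattern is condensed below for illustration; init_ctx_sketch is a hypothetical helper, and the llama_context_default_params() call is assumed from the llama.h of this revision rather than shown in the diff.

#include "common.h"
#include "llama.h"

// Hypothetical helper showing the pattern repeated in the three hunks above.
static llama_context * init_ctx_sketch(const gpt_params & params) {
    auto lparams = llama_context_default_params();  // assumed llama.h helper (not shown in this diff)

    lparams.use_mmap  = params.use_mmap;   // new: honour --no-mmap from the examples
    lparams.use_mlock = params.use_mlock;  // unchanged: honour --mlock

    return llama_init_from_file(params.model.c_str(), lparams);
}
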
diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp
index af1e6272..203bfe8c 100644
--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@@ -1,5 +1,6 @@
#include "ggml.h"
#include "llama.h"
+#include "llama_internal.h"
#include <algorithm>
#include <cassert>
@@ -266,15 +267,13 @@ int main(int argc, char ** argv) {
}
}
- // Sort tensors for consistent output
- const auto tensors = llama_internal_get_tensor_map(ctx);
- std::map<std::string, struct ggml_tensor *> tensors_sorted { tensors.begin(), tensors.end() };
+ const auto &tensors = llama_internal_get_tensor_map(ctx);
// check layer tensors
int included_layers = 0;
int64_t max_nelements = 0;
bool is_f16 = false;
- for (const auto& kv_tensor : tensors_sorted) {
+ for (const auto& kv_tensor : tensors) {
if (!layer_included(params, kv_tensor.first)) {
continue;
}
@@ -315,7 +314,7 @@ int main(int argc, char ** argv) {
error_stats global_stats {};
- for (const auto& kv_tensor : tensors_sorted) {
+ for (const auto& kv_tensor : tensors) {
if (!layer_included(params, kv_tensor.first)) {
continue;
}
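
Sketch (not part of the diff): the quantize-stats change drops the copy into a separately sorted std::map and instead binds the result of llama_internal_get_tensor_map(ctx) by const reference, iterating it directly. A self-contained illustration of that iterate-by-reference pattern follows; the map type, fake_tensor, and layer_included_sketch are stand-ins chosen for the example, not the actual types behind llama_internal_get_tensor_map.

#include <cstdio>
#include <map>
#include <string>

struct fake_tensor { int n_elements; };   // stand-in for ggml_tensor

static bool layer_included_sketch(const std::string & name) {
    // e.g. only look at per-layer weight tensors
    return name.find("layers.") != std::string::npos;
}

int main() {
    std::map<std::string, fake_tensor> tensors = {
        {"layers.0.attention.wq", {4096 * 4096}},
        {"output_norm",           {4096}},
    };

    const auto & view = tensors;            // bind by reference: no copy, no extra sort pass
    for (const auto & kv : view) {          // std::map already iterates in key order
        if (!layer_included_sketch(kv.first)) {
            continue;
        }
        std::printf("%s: %d elements\n", kv.first.c_str(), kv.second.n_elements);
    }
    return 0;
}
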