summary | refs | log | tree | commit | diff
path: root/common
diff options
context:
space:
mode:
Diffstat (limited to 'common')
-rw-r--r-- common/common.cpp 56
-rw-r--r-- common/common.h 68
-rw-r--r-- common/train.cpp 12
3 files changed, 68 insertions, 68 deletions
diff --git a/common/common.cpp b/common/common.cpp
index 9d976c7c..ce739b15 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -583,20 +583,20 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
break;
}
params.n_gpu_layers = std::stoi(argv[i]);
-#ifndef LLAMA_SUPPORTS_GPU_OFFLOAD
- fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
- fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
-#endif
+ if (!llama_supports_gpu_offload()) {
+ fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
+ fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+ }
} else if (arg == "--gpu-layers-draft" || arg == "-ngld" || arg == "--n-gpu-layers-draft") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.n_gpu_layers_draft = std::stoi(argv[i]);
-#ifndef LLAMA_SUPPORTS_GPU_OFFLOAD
- fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n");
- fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
-#endif
+ if (!llama_supports_gpu_offload()) {
+ fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n");
+ fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+ }
} else if (arg == "--main-gpu" || arg == "-mg") {
if (++i >= argc) {
invalid_param = true;
@@ -637,11 +637,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
const std::regex regex{R"([,/]+)"};
std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
std::vector<std::string> split_arg{it, {}};
- if (split_arg.size() >= LLAMA_MAX_DEVICES) {
+ if (split_arg.size() >= llama_max_devices()) {
invalid_param = true;
break;
}
- for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) {
+ for (size_t i = 0; i < llama_max_devices(); ++i) {
if (i < split_arg.size()) {
params.tensor_split[i] = std::stof(split_arg[i]);
} else {
@@ -989,30 +989,30 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA. see examples/llava/README.md\n");
printf(" --image IMAGE_FILE path to an image file. use with multimodal models\n");
- if (llama_mlock_supported()) {
+ if (llama_supports_mlock()) {
printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
}
- if (llama_mmap_supported()) {
+ if (llama_supports_mmap()) {
printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
}
printf(" --numa attempt optimizations that help on some NUMA systems\n");
printf(" if run without this previously, it is recommended to drop the system page cache before using this\n");
printf(" see https://github.com/ggerganov/llama.cpp/issues/1437\n");
-#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
- printf(" -ngl N, --n-gpu-layers N\n");
- printf(" number of layers to store in VRAM\n");
- printf(" -ngld N, --n-gpu-layers-draft N\n");
- printf(" number of layers to store in VRAM for the draft model\n");
- printf(" -sm SPLIT_MODE, --split-mode SPLIT_MODE\n");
- printf(" how to split the model across multiple GPUs, one of:\n");
- printf(" - none: use one GPU only\n");
- printf(" - layer (default): split layers and KV across GPUs\n");
- printf(" - row: split rows across GPUs\n");
- printf(" -ts SPLIT, --tensor-split SPLIT\n");
- printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
- printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n");
- printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu);
-#endif // LLAMA_SUPPORTS_GPU_OFFLOAD
+ if (llama_supports_gpu_offload()) {
+ printf(" -ngl N, --n-gpu-layers N\n");
+ printf(" number of layers to store in VRAM\n");
+ printf(" -ngld N, --n-gpu-layers-draft N\n");
+ printf(" number of layers to store in VRAM for the draft model\n");
+ printf(" -sm SPLIT_MODE, --split-mode SPLIT_MODE\n");
+ printf(" how to split the model across multiple GPUs, one of:\n");
+ printf(" - none: use one GPU only\n");
+ printf(" - layer (default): split layers and KV across GPUs\n");
+ printf(" - row: split rows across GPUs\n");
+ printf(" -ts SPLIT, --tensor-split SPLIT\n");
+ printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
+ printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n");
+ printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu);
+ }
printf(" --verbose-prompt print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false");
printf(" --no-display-prompt don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false");
printf(" -gan N, --grp-attn-n N\n");
@@ -1651,7 +1651,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);
- const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + LLAMA_MAX_DEVICES);
+ const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices());
dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector);
fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
diff --git a/common/common.h b/common/common.h
index 214a379b..24a99d72 100644
--- a/common/common.h
+++ b/common/common.h
@@ -43,40 +43,40 @@ extern char const *LLAMA_BUILD_TARGET;
int32_t get_num_physical_cores();
struct gpt_params {
- uint32_t seed = -1; // RNG seed
-
- int32_t n_threads = get_num_physical_cores();
- int32_t n_threads_draft = -1;
- int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
- int32_t n_threads_batch_draft = -1;
- int32_t n_predict = -1; // new tokens to predict
- int32_t n_ctx = 512; // context size
- int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
- int32_t n_keep = 0; // number of tokens to keep from initial prompt
- int32_t n_draft = 8; // number of tokens to draft during speculative decoding
- int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
- int32_t n_parallel = 1; // number of parallel sequences to decode
- int32_t n_sequences = 1; // number of sequences to decode
- float p_accept = 0.5f; // speculative decoding accept probability
- float p_split = 0.1f; // speculative decoding split probability
- int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
- int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
- llama_split_mode split_mode = LLAMA_SPLIT_LAYER; // how to split the model across GPUs
- int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
- float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
- int32_t n_beams = 0; // if non-zero then use beam search of given width.
- int32_t grp_attn_n = 1; // group-attention factor
- int32_t grp_attn_w = 512; // group-attention width
- int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
- float rope_freq_base = 0.0f; // RoPE base frequency
- float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
- float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
- float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor
- float yarn_beta_fast = 32.0f; // YaRN low correction dim
- float yarn_beta_slow = 1.0f; // YaRN high correction dim
- int32_t yarn_orig_ctx = 0; // YaRN original context length
- int8_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED; // TODO: better to be int32_t for alignment
- // pinging @cebtenzzre
+ uint32_t seed = -1; // RNG seed
+
+ int32_t n_threads = get_num_physical_cores();
+ int32_t n_threads_draft = -1;
+ int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
+ int32_t n_threads_batch_draft = -1;
+ int32_t n_predict = -1; // new tokens to predict
+ int32_t n_ctx = 512; // context size
+ int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
+ int32_t n_keep = 0; // number of tokens to keep from initial prompt
+ int32_t n_draft = 8; // number of tokens to draft during speculative decoding
+ int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
+ int32_t n_parallel = 1; // number of parallel sequences to decode
+ int32_t n_sequences = 1; // number of sequences to decode
+ float p_accept = 0.5f; // speculative decoding accept probability
+ float p_split = 0.1f; // speculative decoding split probability
+ int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
+ int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
+ llama_split_mode split_mode = LLAMA_SPLIT_LAYER; // how to split the model across GPUs
+ int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
+ float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
+ int32_t n_beams = 0; // if non-zero then use beam search of given width.
+ int32_t grp_attn_n = 1; // group-attention factor
+ int32_t grp_attn_w = 512; // group-attention width
+ int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
+ float rope_freq_base = 0.0f; // RoPE base frequency
+ float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
+ float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
+ float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor
+ float yarn_beta_fast = 32.0f; // YaRN low correction dim
+ float yarn_beta_slow = 1.0f; // YaRN high correction dim
+ int32_t yarn_orig_ctx = 0; // YaRN original context length
+ int8_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED; // TODO: better to be int32_t for alignment
+ // pinging @cebtenzzre
// // sampling parameters
struct llama_sampling_params sparams;
diff --git a/common/train.cpp b/common/train.cpp
index e6f2f7a2..e4c3d5df 100644
--- a/common/train.cpp
+++ b/common/train.cpp
@@ -1363,12 +1363,12 @@ bool consume_common_train_arg(
*invalid_param = true;
return true;
}
-#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
- params->n_gpu_layers = std::stoi(argv[i]);
-#else
- fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
- fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
-#endif
+ if (llama_supports_gpu_offload()) {
+ params->n_gpu_layers = std::stoi(argv[i]);
+ } else {
+ fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
+ fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+ }
} else if (arg == "-h" || arg == "--help") {
params->print_usage = true;
return true;