path: root/examples/common.h
author    Johannes Gäßler <johannesg@5d6.de>  2023-06-06 21:33:23 +0200
committer GitHub <noreply@github.com>         2023-06-06 21:33:23 +0200
commit    17366df842e358768c0df7024484fffecfc7865b (patch)
tree      f042c8142311d45f8712db10debf89111b2c7e57 /examples/common.h
parent    44f906e8537fcec965e312d621c80556d6aa9bec (diff)
Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)
* CUDA multi GPU + scratch
* ggml_cuda_compute_forward
* Tensor parallelism
* ggml_cuda_add
* ggml_cuda_rms_norm
* ggml_cuda_silu
* CUDA scratch buffer
* --main-gpu CLI option

A sketch of the tensor-parallelism idea named above follows the diffstat below.
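For context, here is a minimal, hypothetical sketch of how a tensor_split ratio array could be normalized into per-GPU row ranges when a weight matrix is split across devices. split_rows, N_DEVICES, and all other names here are assumptions for illustration, not code from this commit.

// Hypothetical sketch (not this commit's code): turn tensor_split ratios
// into contiguous per-GPU row ranges for splitting a weight matrix.
#include <cstdio>

enum { N_DEVICES = 2 }; // assumption for this sketch

static void split_rows(const float * tensor_split, int n_devices,
                       int n_rows, int * row_begin, int * row_end) {
    float total = 0.0f;
    for (int i = 0; i < n_devices; ++i) {
        total += tensor_split[i];
    }
    int begin = 0;
    for (int i = 0; i < n_devices; ++i) {
        // the last device takes the remainder so every row is assigned
        const int rows = (i == n_devices - 1)
            ? n_rows - begin
            : (int)(n_rows * (tensor_split[i] / total));
        row_begin[i] = begin;
        row_end[i]   = begin + rows;
        begin       += rows;
    }
}

int main() {
    const float tensor_split[N_DEVICES] = {3.0f, 1.0f}; // 75%/25% split
    int begin[N_DEVICES], end[N_DEVICES];
    split_rows(tensor_split, N_DEVICES, 4096, begin, end);
    for (int i = 0; i < N_DEVICES; ++i) {
        printf("GPU %d: rows [%d, %d)\n", i, begin[i], end[i]);
    }
    return 0;
}

Because the ratios are normalized before use in this sketch, only their relative sizes matter: {3, 1} and {0.75, 0.25} describe the same split.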
Diffstat (limited to 'examples/common.h')
 examples/common.h | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)
diff --git a/examples/common.h b/examples/common.h
index 66bdeb5e..12b49734 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -21,13 +21,15 @@
 
 int32_t get_num_physical_cores();
 
 struct gpt_params {
-    int32_t seed         = -1;  // RNG seed
-    int32_t n_threads    = get_num_physical_cores();
-    int32_t n_predict    = -1;  // new tokens to predict
-    int32_t n_ctx        = 512; // context size
-    int32_t n_batch      = 512; // batch size for prompt processing (must be >=32 to use BLAS)
-    int32_t n_keep       = 0;   // number of tokens to keep from initial prompt
-    int32_t n_gpu_layers = 0;   // number of layers to store in VRAM
+    int32_t seed                            = -1;  // RNG seed
+    int32_t n_threads                       = get_num_physical_cores();
+    int32_t n_predict                       = -1;  // new tokens to predict
+    int32_t n_ctx                           = 512; // context size
+    int32_t n_batch                         = 512; // batch size for prompt processing (must be >=32 to use BLAS)
+    int32_t n_keep                          = 0;   // number of tokens to keep from initial prompt
+    int32_t n_gpu_layers                    = 0;   // number of layers to store in VRAM
+    int32_t main_gpu                        = 0;   // the GPU that is used for scratch and small tensors
+    float   tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
     // sampling parameters
     std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
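The diff above only adds the struct fields; how they are populated is outside this file. Below is a hedged sketch of how a --main-gpu flag and a comma-separated --tensor-split flag might be parsed into these fields. The --tensor-split flag name and format, the gpt_params_sketch struct, and the parse_gpu_args helper are assumptions for illustration, not taken from this diff.

// Hedged sketch: possible CLI parsing into the new fields. Flag names,
// value formats, and parse_gpu_args are assumptions, not this commit's code.
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <sstream>
#include <string>

#define LLAMA_MAX_DEVICES 8 // assumption; the real value is defined elsewhere

struct gpt_params_sketch {
    int32_t main_gpu = 0;                          // GPU for scratch and small tensors
    float   tensor_split[LLAMA_MAX_DEVICES] = {0}; // per-GPU split ratios
};

static void parse_gpu_args(int argc, char ** argv, gpt_params_sketch & params) {
    for (int i = 1; i < argc; ++i) {
        if (strcmp(argv[i], "--main-gpu") == 0 && i + 1 < argc) {
            params.main_gpu = atoi(argv[++i]);
        } else if (strcmp(argv[i], "--tensor-split") == 0 && i + 1 < argc) {
            // e.g. "3,1": weight GPU 0 three times as heavily as GPU 1
            std::stringstream ss(argv[++i]);
            std::string item;
            for (int d = 0; d < LLAMA_MAX_DEVICES && std::getline(ss, item, ','); ++d) {
                params.tensor_split[d] = strtof(item.c_str(), nullptr);
            }
        }
    }
}

int main(int argc, char ** argv) {
    gpt_params_sketch params;
    parse_gpu_args(argc, argv, params);
    printf("main_gpu = %d, tensor_split = {%.1f, %.1f, ...}\n",
           params.main_gpu, params.tensor_split[0], params.tensor_split[1]);
    return 0;
}

Invoked as, for example, `prog --main-gpu 1 --tensor-split 3,1`, this sketch would leave scratch and small tensors on GPU 1 while splitting large tensors 3:1 across the first two devices.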