author | Johannes Gäßler <johannesg@5d6.de> | 2023-06-06 21:33:23 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-06-06 21:33:23 +0200 |
commit | 17366df842e358768c0df7024484fffecfc7865b (patch) | |
tree | f042c8142311d45f8712db10debf89111b2c7e57 /examples/common.h | |
parent | 44f906e8537fcec965e312d621c80556d6aa9bec (diff) | |
Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)
* CUDA multi GPU + scratch
* ggml_cuda_compute_forward
* Tensor parallelism (see the row-splitting sketch below)
* ggml_cuda_add
* ggml_cuda_rms_norm
* ggml_cuda_silu
* CUDA scratch buffer
* --main-gpu CLI option
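The "Tensor parallelism" item refers to dividing the rows of each weight matrix across GPUs in proportion to per-device split values (the `tensor_split` array added in the diff below). A minimal sketch of that bookkeeping, assuming a proportional row split; this is hypothetical illustration code, not the actual ggml-cuda implementation, and all names are assumptions:

```cpp
// Hypothetical sketch: divide the rows of an nrows-tall tensor across GPUs
// in proportion to per-device split values (cf. the tensor_split array
// added in the diff below). Not the actual ggml-cuda code.
#include <cstdint>
#include <cstdio>

// Compute the half-open row range [row_low, row_high) owned by GPU `id`.
static void get_row_range(const float * split, int n_gpus, int64_t nrows,
                          int id, int64_t & row_low, int64_t & row_high) {
    float total = 0.0f;
    for (int i = 0; i < n_gpus; ++i) total += split[i];
    float before = 0.0f;
    for (int i = 0; i < id; ++i) before += split[i];
    row_low  = (int64_t)(nrows * (before / total));
    row_high = (int64_t)(nrows * ((before + split[id]) / total));
}

int main() {
    const float split[2] = {3.0f, 1.0f}; // e.g. a 3:1 split between two GPUs
    for (int id = 0; id < 2; ++id) {
        int64_t lo, hi;
        get_row_range(split, 2, 4096, id, lo, hi);
        std::printf("GPU %d: rows [%lld, %lld)\n", id, (long long)lo, (long long)hi);
    }
    return 0;
}
```

With a 3:1 split and 4096 rows, GPU 0 would own rows [0, 3072) and GPU 1 rows [3072, 4096).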
Diffstat (limited to 'examples/common.h')
-rw-r--r-- | examples/common.h | 16 |
1 file changed, 9 insertions, 7 deletions
diff --git a/examples/common.h b/examples/common.h
index 66bdeb5e..12b49734 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -21,13 +21,15 @@ int32_t get_num_physical_cores();
 struct gpt_params {
-    int32_t seed          = -1;  // RNG seed
-    int32_t n_threads     = get_num_physical_cores();
-    int32_t n_predict     = -1;  // new tokens to predict
-    int32_t n_ctx         = 512; // context size
-    int32_t n_batch       = 512; // batch size for prompt processing (must be >=32 to use BLAS)
-    int32_t n_keep        = 0;   // number of tokens to keep from initial prompt
-    int32_t n_gpu_layers  = 0;   // number of layers to store in VRAM
+    int32_t seed          = -1;  // RNG seed
+    int32_t n_threads     = get_num_physical_cores();
+    int32_t n_predict     = -1;  // new tokens to predict
+    int32_t n_ctx         = 512; // context size
+    int32_t n_batch       = 512; // batch size for prompt processing (must be >=32 to use BLAS)
+    int32_t n_keep        = 0;   // number of tokens to keep from initial prompt
+    int32_t n_gpu_layers  = 0;   // number of layers to store in VRAM
+    int32_t main_gpu      = 0;   // the GPU that is used for scratch and small tensors
+    float   tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs

     // sampling parameters
     std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
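For context, a hedged sketch of how the two new fields might be filled from the command line. "--main-gpu" appears in the commit message; the "--tensor-split" option name, the `parse_tensor_split` helper, and the trimmed-down struct below are assumptions for illustration (the real argument handling lives in examples/common.cpp):

```cpp
// Hypothetical usage sketch for the new gpt_params fields. "--main-gpu" is
// named in the commit message; "--tensor-split" and this helper are
// assumptions for illustration.
#include <cstdint>
#include <sstream>
#include <string>

#ifndef LLAMA_MAX_DEVICES
#define LLAMA_MAX_DEVICES 16 // placeholder; the real value comes from llama.h
#endif

struct gpt_params {
    int32_t main_gpu = 0;                          // GPU used for scratch and small tensors
    float   tensor_split[LLAMA_MAX_DEVICES] = {0}; // per-GPU share of split tensors
};

// Parse a comma-separated proportion list such as "3,1" into tensor_split:
// device 0 gets 3 parts of each split tensor, device 1 gets 1 part, and the
// remaining devices keep their default share of 0.
static void parse_tensor_split(const std::string & arg, gpt_params & params) {
    std::stringstream ss(arg);
    std::string item;
    int device = 0;
    while (std::getline(ss, item, ',') && device < LLAMA_MAX_DEVICES) {
        params.tensor_split[device++] = std::stof(item);
    }
}

int main() {
    gpt_params params;
    params.main_gpu = 1;               // e.g. --main-gpu 1
    parse_tensor_split("3,1", params); // e.g. --tensor-split 3,1
    return 0;
}
```

Under this reading, an uneven split lets a card with more VRAM hold a larger share of every split tensor, while scratch buffers and small tensors stay on the single device selected by `main_gpu`.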