author | Johannes Gäßler <johannesg@5d6.de> | 2023-06-06 21:33:23 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-06-06 21:33:23 +0200 |
commit | 17366df842e358768c0df7024484fffecfc7865b (patch) | |
tree | f042c8142311d45f8712db10debf89111b2c7e57 /examples/common.h | |
parent | 44f906e8537fcec965e312d621c80556d6aa9bec (diff) | |
Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)
* CUDA multi GPU + scratch
* ggml_cuda_compute_forward
* Tensor parallelism (see the row-splitting sketch below)
* ggml_cuda_add
* ggml_cuda_rms_norm
* ggml_cuda_silu
* CUDA scratch buffer
* --main-gpu CLI option
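The "Tensor parallelism" item refers to dividing the rows of each weight matrix across GPUs in proportion to per-device split values (the `tensor_split` array added in the diff below). A minimal sketch of that bookkeeping, assuming a proportional row split; this is hypothetical illustration code, not the actual ggml-cuda implementation, and all names are assumptions:

```cpp
// Hypothetical sketch: divide the rows of an nrows-tall tensor across GPUs
// in proportion to per-device split values (cf. the tensor_split array
// added in the diff below). Not the actual ggml-cuda code.
#include <cstdint>
#include <cstdio>

// Compute the half-open row range [row_low, row_high) owned by GPU `id`.
static void get_row_range(const float * split, int n_gpus, int64_t nrows,
                          int id, int64_t & row_low, int64_t & row_high) {
    float total = 0.0f;
    for (int i = 0; i < n_gpus; ++i) total += split[i];
    float before = 0.0f;
    for (int i = 0; i < id; ++i) before += split[i];
    row_low  = (int64_t)(nrows * (before / total));
    row_high = (int64_t)(nrows * ((before + split[id]) / total));
}

int main() {
    const float split[2] = {3.0f, 1.0f}; // e.g. a 3:1 split between two GPUs
    for (int id = 0; id < 2; ++id) {
        int64_t lo, hi;
        get_row_range(split, 2, 4096, id, lo, hi);
        std::printf("GPU %d: rows [%lld, %lld)\n", id, (long long)lo, (long long)hi);
    }
    return 0;
}
```

With a 3:1 split and 4096 rows, GPU 0 would own rows [0, 3072) and GPU 1 rows [3072, 4096).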
Diffstat (limited to 'examples/common.h')
-rw-r--r-- | examples/common.h | 16 |
1 file changed, 9 insertions, 7 deletions
diff --git a/examples/common.h b/examples/common.h
index 66bdeb5e..12b49734 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -21,13 +21,15 @@ int32_t get_num_physical_cores();
 struct gpt_params {
-    int32_t seed          = -1;  // RNG seed
-    int32_t n_threads     = get_num_physical_cores();
-    int32_t n_predict     = -1;  // new tokens to predict
-    int32_t n_ctx         = 512; // context size
-    int32_t n_batch       = 512; // batch size for prompt processing (must be >=32 to use BLAS)
-    int32_t n_keep        = 0;   // number of tokens to keep from initial prompt
-    int32_t n_gpu_layers  = 0;   // number of layers to store in VRAM
+    int32_t seed          = -1;  // RNG seed
+    int32_t n_threads     = get_num_physical_cores();
+    int32_t n_predict     = -1;  // new tokens to predict
+    int32_t n_ctx         = 512; // context size
+    int32_t n_batch       = 512; // batch size for prompt processing (must be >=32 to use BLAS)
+    int32_t n_keep        = 0;   // number of tokens to keep from initial prompt
+    int32_t n_gpu_layers  = 0;   // number of layers to store in VRAM
+    int32_t main_gpu      = 0;   // the GPU that is used for scratch and small tensors
+    float   tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs

     // sampling parameters
     std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
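For context, a hedged sketch of how the two new fields might be filled from the command line. "--main-gpu" appears in the commit message; the "--tensor-split" option name, the `parse_tensor_split` helper, and the trimmed-down struct below are assumptions for illustration (the real argument handling lives in examples/common.cpp):

```cpp
// Hypothetical usage sketch for the new gpt_params fields. "--main-gpu" is
// named in the commit message; "--tensor-split" and this helper are
// assumptions for illustration.
#include <cstdint>
#include <sstream>
#include <string>

#ifndef LLAMA_MAX_DEVICES
#define LLAMA_MAX_DEVICES 16 // placeholder; the real value comes from llama.h
#endif

struct gpt_params {
    int32_t main_gpu = 0;                          // GPU used for scratch and small tensors
    float   tensor_split[LLAMA_MAX_DEVICES] = {0}; // per-GPU share of split tensors
};

// Parse a comma-separated proportion list such as "3,1" into tensor_split:
// device 0 gets 3 parts of each split tensor, device 1 gets 1 part, and the
// remaining devices keep their default share of 0.
static void parse_tensor_split(const std::string & arg, gpt_params & params) {
    std::stringstream ss(arg);
    std::string item;
    int device = 0;
    while (std::getline(ss, item, ',') && device < LLAMA_MAX_DEVICES) {
        params.tensor_split[device++] = std::stof(item);
    }
}

int main() {
    gpt_params params;
    params.main_gpu = 1;               // e.g. --main-gpu 1
    parse_tensor_split("3,1", params); // e.g. --tensor-split 3,1
    return 0;
}
```

Under this reading, an uneven split lets a card with more VRAM hold a larger share of every split tensor, while scratch buffers and small tensors stay on the single device selected by `main_gpu`.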