Diffstat (limited to 'llama.h')
-rw-r--r--  llama.h | 18 ++++++++++++++++--
1 file changed, 16 insertions(+), 2 deletions(-)
diff --git a/llama.h b/llama.h
index 43d41b8f..689e12d7 100644
--- a/llama.h
+++ b/llama.h
@@ -118,6 +118,12 @@ extern "C" {
         LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN,
     };
 
+    enum llama_split_mode {
+        LLAMA_SPLIT_NONE  = 0, // single GPU
+        LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs
+        LLAMA_SPLIT_ROW   = 2, // split rows across GPUs
+    };
+
     typedef struct llama_token_data {
         llama_token id; // token id
         float logit;    // log-odds of the token
@@ -180,8 +186,16 @@ extern "C" {
 
     struct llama_model_params {
         int32_t n_gpu_layers; // number of layers to store in VRAM
-        int32_t main_gpu;     // the GPU that is used for scratch and small tensors
-        const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
+        enum llama_split_mode split_mode; // how to split the model across multiple GPUs
+
+        // main_gpu interpretation depends on split_mode:
+        // LLAMA_SPLIT_NONE: the GPU that is used for the entire model
+        // LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results
+        // LLAMA_SPLIT_LAYER: ignored
+        int32_t main_gpu;
+
+        // proportion of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES
+        const float * tensor_split;
 
         // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
         // If the provided progress_callback returns true, model loading continues.
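
Below is a minimal sketch of how a caller might drive the new fields. It assumes the llama_model_default_params(), llama_load_model_from_file(), llama_free_model(), and llama_backend_init()/llama_backend_free() entry points declared elsewhere in this header at this revision; "model.gguf" is a placeholder path.

#include "llama.h"

#include <stdio.h>

int main(void) {
    // One-time process init (at this revision the function takes a numa flag).
    llama_backend_init(false);

    struct llama_model_params mparams = llama_model_default_params();

    mparams.n_gpu_layers = 99;                // offload all layers to VRAM
    mparams.split_mode   = LLAMA_SPLIT_LAYER; // split layers and KV across GPUs

    // With LLAMA_SPLIT_LAYER, main_gpu is ignored. tensor_split, if non-NULL,
    // must point at LLAMA_MAX_DEVICES per-device proportions; NULL leaves the
    // split to the backend's default.
    mparams.main_gpu     = 0;
    mparams.tensor_split = NULL;

    struct llama_model * model = llama_load_model_from_file("model.gguf", mparams);
    if (model == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    llama_free_model(model);
    llama_backend_free();
    return 0;
}

Under LLAMA_SPLIT_NONE, main_gpu instead selects the single device that holds the whole model; under LLAMA_SPLIT_ROW it selects the device used for small tensors and intermediate results, as documented in the struct comments above.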