From 84e723653ca99d51a74b454984acf2c077468561 Mon Sep 17 00:00:00 2001
From: FK
Date: Wed, 13 Sep 2023 08:50:46 +0200
Subject: speculative: add --n-gpu-layers-draft option (#3063)

---
 common/common.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'common/common.h')

diff --git a/common/common.h b/common/common.h
index 012bf5e1..238635ae 100644
--- a/common/common.h
+++ b/common/common.h
@@ -38,6 +38,7 @@ struct gpt_params {
     int32_t n_draft = 16; // number of tokens to draft during speculative decoding
     int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
     int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
+    int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
     int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
     float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
     int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
-- 
cgit v1.2.3