From 254a7a7a5ff4c874ff8488f1f5cbdd7e9c89d682 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Wed, 14 Jun 2023 19:47:19 +0200 Subject: CUDA full GPU acceleration, KV cache in VRAM (#1827) * Fixed CUDA RoPE * ggml_cuda_mul_mat_vec_p021 * ggml_cuda_scale * ggml_cuda_diag_mask_inf * ggml_is_permuted * ggml_cuda_cpy * flatten rows for ggml_cuda_op * Added a --low-vram option * Fixed Windows performance * Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM --- examples/common.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'examples/common.h') diff --git a/examples/common.h b/examples/common.h index 6fedb414..6c2953cb 100644 --- a/examples/common.h @@ -21,15 +21,16 @@ int32_t get_num_physical_cores(); struct gpt_params { - int32_t seed = -1; // RNG seed - int32_t n_threads = get_num_physical_cores(); - int32_t n_predict = -1; // new tokens to predict - int32_t n_ctx = 512; // context size - int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS) - int32_t n_keep = 0; // number of tokens to keep from initial prompt - int32_t n_gpu_layers = 0; // number of layers to store in VRAM - int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors + int32_t seed = -1; // RNG seed + int32_t n_threads = get_num_physical_cores(); + int32_t n_predict = -1; // new tokens to predict + int32_t n_ctx = 512; // context size + int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS) + int32_t n_keep = 0; // number of tokens to keep from initial prompt + int32_t n_gpu_layers = 0; // number of layers to store in VRAM + int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs + bool low_vram = 0; // if true, reduce VRAM usage at the cost of performance // sampling parameters std::unordered_map<llama_token, float> logit_bias; // logit bias for 
specific tokens -- cgit v1.2.3