From 905d87b70aa189623d500a28602d7a3a755a4769 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?=
Date: Sat, 13 May 2023 15:38:36 +0200
Subject: ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov
---
 examples/common.h | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'examples/common.h')

diff --git a/examples/common.h b/examples/common.h
index 499671b2..717838f0 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -21,13 +21,14 @@ int32_t get_num_physical_cores();
 
 struct gpt_params {
-    int32_t seed          = -1;   // RNG seed
+    int32_t seed          = -1;  // RNG seed
     int32_t n_threads     = get_num_physical_cores();
     int32_t n_predict     = -1;  // new tokens to predict
-    int32_t n_parts       = -1;   // amount of model parts (-1 = determine from model dimensions)
-    int32_t n_ctx         = 512;  // context size
-    int32_t n_batch       = 512;  // batch size for prompt processing (must be >=32 to use BLAS)
-    int32_t n_keep        = 0;    // number of tokens to keep from initial prompt
+    int32_t n_parts       = -1;  // amount of model parts (-1 = determine from model dimensions)
+    int32_t n_ctx         = 512; // context size
+    int32_t n_batch       = 512; // batch size for prompt processing (must be >=32 to use BLAS)
+    int32_t n_keep        = 0;   // number of tokens to keep from initial prompt
+    int32_t n_gpu_layers  = 0;   // number of layers to store in VRAM
 
     // sampling parameters
     std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
 
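Note that the hunk above only adds the n_gpu_layers knob to gpt_params; the CUDA kernels named in the commit message live in other files of the commit (presumably ggml-cuda.cu, not shown in this per-file view). For orientation, the sketch below illustrates the general idea behind a dequantize_mul_mat_vec kernel for a Q4_0-style format: dequantize 4-bit blocks on the fly while accumulating a matrix-vector dot product. It is an assumption-laden illustration, not the code from this commit; the block struct, nibble ordering, kernel name, and launch scheme are all hypothetical.

// Illustrative sketch only, NOT the actual ggml-cuda.cu kernels from this commit.
// Assumptions (not taken from the patch): a Q4_0-style block of 32 weights with one
// float scale d and 16 bytes of packed 4-bit quants, byte j holding elements 2*j
// (low nibble) and 2*j+1 (high nibble), dequantized as d * (q - 8); ncols is a
// multiple of 32; blockDim.x is a power of two <= 256.
#include <stdint.h>

#define QK 32  // weights per quantization block (assumption matching Q4_0)

typedef struct {
    float   d;          // per-block scale
    uint8_t qs[QK / 2]; // 32 weights packed as 4-bit nibbles
} block_q4_0_sketch;

// One thread block per matrix row; each thread walks a strided subset of the
// row's quantization blocks, dequantizes on the fly, and accumulates a partial
// dot product with the input vector; the partials are then reduced in shared memory.
// Example launch: dequantize_mul_mat_vec_sketch<<<nrows, 128>>>(x, y, dst, ncols);
__global__ void dequantize_mul_mat_vec_sketch(
        const block_q4_0_sketch * __restrict__ x,   // quantized matrix, row-major
        const float             * __restrict__ y,   // input vector, length ncols
        float                   * __restrict__ dst, // output vector, length nrows
        const int ncols) {
    const int row = blockIdx.x;
    const int tid = threadIdx.x;
    const int blocks_per_row = ncols / QK;

    float partial = 0.0f;

    for (int ib = tid; ib < blocks_per_row; ib += blockDim.x) {
        const block_q4_0_sketch b = x[(size_t) row * blocks_per_row + ib];
        const float d = b.d;
        const float * yb = y + ib * QK;

        for (int j = 0; j < QK / 2; ++j) {
            const int q0 = (b.qs[j] & 0x0F) - 8; // element 2*j
            const int q1 = (b.qs[j] >>   4) - 8; // element 2*j + 1
            partial += d * (q0 * yb[2*j] + q1 * yb[2*j + 1]);
        }
    }

    // shared-memory reduction of the per-thread partial sums
    __shared__ float tmp[256];
    tmp[tid] = partial;
    __syncthreads();

    for (int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s) {
            tmp[tid] += tmp[tid + s];
        }
        __syncthreads();
    }

    if (tid == 0) {
        dst[row] = tmp[0];
    }
}

On the command-line side, the renamed --gpu-layers flag mentioned in the commit message is what is expected to populate n_gpu_layers, e.g. an invocation along the lines of ./main -m <model> -p "..." --gpu-layers 32 to keep 32 layers' weights in VRAM (the exact invocation here is illustrative, not taken from the patch).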