From 7fc50c051ae8a78e9643fdf172d12e20f2dd9b6c Mon Sep 17 00:00:00 2001
From: slaren <2141330+slaren@users.noreply.github.com>
Date: Sat, 29 Apr 2023 02:04:18 +0200
Subject: cuBLAS: use host pinned memory and dequantize while copying (#1207)

* cuBLAS: dequantize simultaneously while copying memory

* cuBLAS: use host pinned memory

* cuBLAS: improve ggml_compute_forward_mul_mat_f16_f32 with pinned memory

* cuBLAS: also pin kv cache

* fix rebase
---
 llama.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'llama.cpp')

diff --git a/llama.cpp b/llama.cpp
index 45f0d44a..4699e5cf 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -136,7 +136,7 @@ struct llama_kv_cache {
 
     struct ggml_context * ctx = NULL;
 
-    llama_buffer buf;
+    llama_ctx_buffer buf;
 
     int n; // number of tokens currently in the cache
 
@@ -167,7 +167,7 @@ struct llama_model {
     struct llama_kv_cache kv_self;
 
     // the model memory buffer
-    llama_buffer buf;
+    llama_ctx_buffer buf;
 
     // model memory mapped file
     std::unique_ptr<llama_mmap> mapping;
@@ -228,8 +228,8 @@ struct llama_context {
 
     // memory buffers used to evaluate the model
     // TODO: move in llama_state
-    llama_buffer buf_compute;
-    llama_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
+    llama_ctx_buffer buf_compute;
+    llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
 
     int buf_last = 0;
     size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
--
cgit v1.2.3
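
For context, the rename above points at a new buffer type whose host allocation is page-locked ("pinned") when cuBLAS is enabled. The sketch below is illustrative, not the verbatim patch contents: it calls the CUDA runtime's cudaMallocHost/cudaFreeHost directly (the actual change may route allocation through ggml's CUDA helpers), and it assumes the pre-existing malloc-backed llama_buffer type as the non-CUDA fallback.

    // Illustrative sketch of a pinned context buffer. Pinned pages can be
    // DMA'd by the GPU driver, so host->device cudaMemcpyAsync transfers
    // issued on a stream can overlap with dequantization kernels instead
    // of degrading into synchronous staged copies.
    #ifdef GGML_USE_CUBLAS
    #include <cuda_runtime.h>
    #include <cstdint>
    #include <cstdio>

    struct llama_ctx_buffer {
        uint8_t * addr = NULL;
        size_t size = 0;

        void resize(size_t new_size) {
            if (addr) {
                cudaFreeHost(addr);
            }
            // cudaMallocHost returns page-locked host memory
            if (cudaMallocHost((void **) &addr, new_size) != cudaSuccess) {
                fprintf(stderr, "WARNING: failed to allocate pinned memory\n");
                addr = NULL;
            }
            size = new_size;
        }

        ~llama_ctx_buffer() {
            if (addr) {
                cudaFreeHost(addr);
            }
        }
    };
    #else
    // without cuBLAS there is nothing to pin; reuse the plain buffer
    typedef llama_buffer llama_ctx_buffer;
    #endif

This is also why the kv cache, compute, and scratch buffers all switch type in the diff: every host buffer that feeds device copies during evaluation benefits from being pinned.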