author     slaren <2141330+slaren@users.noreply.github.com>  2023-04-29 02:04:18 +0200
committer  GitHub <noreply@github.com>  2023-04-29 02:04:18 +0200
commit     7fc50c051ae8a78e9643fdf172d12e20f2dd9b6c
tree       cc017db2f3443a39221ad319ab51df0925012e84 /llama.cpp
parent     b1ee8f59b4101b46999a0995d9a34506f7285466
cuBLAS: use host pinned memory and dequantize while copying (#1207)
* cuBLAS: dequantize simultaneously while copying memory
* cuBLAS: use host pinned memory
* cuBLAS: improve ggml_compute_forward_mul_mat_f16_f32 with pinned memory
* cuBLAS: also pin kv cache
* fix rebase
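The idea behind the first two bullets is to stage tensor data in page-locked (pinned) host memory so host-to-device copies can run asynchronously, and to launch the dequantization kernel on the same CUDA stream so each chunk is dequantized on the GPU while the next chunk is still in flight over PCIe. Below is a minimal sketch of that pattern with the CUDA runtime API; it is not the commit's code, and the chunk count and the commented-out dequantize_q4_0 kernel name are illustrative assumptions.

// Minimal sketch (not the commit's code) of overlapping a host->device
// copy with on-GPU dequantization, using the CUDA runtime API.
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

#define CUDA_CHECK(expr)                                                   \
    do {                                                                   \
        cudaError_t err_ = (expr);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err_)); \
            exit(1);                                                       \
        }                                                                  \
    } while (0)

int main() {
    const size_t n_bytes = 64u * 1024 * 1024; // example payload size

    // The host staging buffer must be *pinned* (page-locked):
    // cudaMemcpyAsync only overlaps with kernel execution when the
    // host side is pinned.
    void * host_buf = nullptr;
    CUDA_CHECK(cudaMallocHost(&host_buf, n_bytes));

    void * dev_quant = nullptr;
    CUDA_CHECK(cudaMalloc(&dev_quant, n_bytes));

    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    // Pipeline in chunks: while the dequantize kernel for chunk i runs,
    // the copy engine is already transferring chunk i+1.
    const int    n_chunks = 8;
    const size_t chunk    = n_bytes / n_chunks;
    for (int i = 0; i < n_chunks; ++i) {
        char * src = (char *) host_buf  + (size_t) i * chunk;
        char * dst = (char *) dev_quant + (size_t) i * chunk;
        CUDA_CHECK(cudaMemcpyAsync(dst, src, chunk, cudaMemcpyHostToDevice, stream));
        // The dequantize kernel for this chunk would be launched here on
        // the same stream; stream ordering guarantees it sees the fully
        // copied data (kernel name is hypothetical):
        // dequantize_q4_0<<<grid, block, 0, stream>>>(dst, dev_f32, ...);
    }
    CUDA_CHECK(cudaStreamSynchronize(stream));

    CUDA_CHECK(cudaFree(dev_quant));
    CUDA_CHECK(cudaFreeHost(host_buf));
    CUDA_CHECK(cudaStreamDestroy(stream));
    return 0;
}

Stream ordering is what makes the single-stream version safe: the kernel for chunk i cannot start before its copy completes, yet the copy engine can already begin on chunk i+1, which is where the overlap comes from.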
Diffstat (limited to 'llama.cpp')
-rw-r--r--  llama.cpp | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
@@ -136,7 +136,7 @@ struct llama_kv_cache {
 
     struct ggml_context * ctx = NULL;
 
-    llama_buffer buf;
+    llama_ctx_buffer buf;
 
     int n; // number of tokens currently in the cache
 
@@ -167,7 +167,7 @@ struct llama_model {
     struct llama_kv_cache kv_self;
 
     // the model memory buffer
-    llama_buffer buf;
+    llama_ctx_buffer buf;
 
     // model memory mapped file
     std::unique_ptr<llama_mmap> mapping;
@@ -228,8 +228,8 @@ struct llama_context {
 
     // memory buffers used to evaluate the model
    // TODO: move in llama_state
-    llama_buffer buf_compute;
-    llama_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
+    llama_ctx_buffer buf_compute;
+    llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
 
     int buf_last = 0;
     size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
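The llama.cpp side of the change only swaps llama_buffer for llama_ctx_buffer; the interesting behavior lives in that type, which is defined outside this file. Presumably it keeps a plain heap allocation in CPU-only builds and switches to pinned host memory when GGML_USE_CUBLAS is defined. A hedged sketch of that shape follows; the _sketch suffix marks it as illustrative, not the commit's actual definition, and the real code may route the allocation through a ggml helper rather than calling cudaMallocHost directly.

// Hedged sketch of the shape llama_ctx_buffer likely has.
#include <cstdint>
#include <cstdlib>
#ifdef GGML_USE_CUBLAS
#include <cuda_runtime.h>
#endif

struct llama_ctx_buffer_sketch {
    uint8_t * addr = nullptr;
    size_t    size = 0;

    void resize(size_t n) {
        free_mem();
#ifdef GGML_USE_CUBLAS
        // Page-locked host memory: enables fast, overlappable H2D copies.
        if (cudaMallocHost((void **) &addr, n) != cudaSuccess) {
            addr = nullptr;
        }
#else
        addr = (uint8_t *) malloc(n);
#endif
        size = addr ? n : 0;
    }

    void free_mem() {
#ifdef GGML_USE_CUBLAS
        if (addr) { cudaFreeHost(addr); }
#else
        free(addr);
#endif
        addr = nullptr;
        size = 0;
    }

    ~llama_ctx_buffer_sketch() { free_mem(); }
};

Because the kv cache, the model buffer, the compute buffer, and the scratch buffers all move to this type, every host-side buffer that feeds the GPU ends up pinned in cuBLAS builds, which is exactly what the "also pin kv cache" bullet in the commit message refers to.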