author     slaren <2141330+slaren@users.noreply.github.com>  2023-04-29 02:04:18 +0200
committer  GitHub <noreply@github.com>  2023-04-29 02:04:18 +0200
commit     7fc50c051ae8a78e9643fdf172d12e20f2dd9b6c
tree       cc017db2f3443a39221ad319ab51df0925012e84 /llama.cpp
parent     b1ee8f59b4101b46999a0995d9a34506f7285466
cuBLAS: use host pinned memory and dequantize while copying (#1207)
* cuBLAS: dequantize simultaneously while copying memory
* cuBLAS: use host pinned memory
* cuBLAS: improve ggml_compute_forward_mul_mat_f16_f32 with pinned memory
* cuBLAS: also pin kv cache
* fix rebase
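The idea behind the first two bullets is to stage tensor data in page-locked (pinned) host memory so host-to-device copies can run asynchronously, and to launch the dequantization kernel on the same CUDA stream so each chunk is dequantized on the GPU while the next chunk is still in flight over PCIe. Below is a minimal sketch of that pattern with the CUDA runtime API; it is not the commit's code, and the chunk count and the commented-out dequantize_q4_0 kernel name are illustrative assumptions.

// Minimal sketch (not the commit's code) of overlapping a host->device
// copy with on-GPU dequantization, using the CUDA runtime API.
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

#define CUDA_CHECK(expr)                                                   \
    do {                                                                   \
        cudaError_t err_ = (expr);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err_)); \
            exit(1);                                                       \
        }                                                                  \
    } while (0)

int main() {
    const size_t n_bytes = 64u * 1024 * 1024; // example payload size

    // The host staging buffer must be *pinned* (page-locked):
    // cudaMemcpyAsync only overlaps with kernel execution when the
    // host side is pinned.
    void * host_buf = nullptr;
    CUDA_CHECK(cudaMallocHost(&host_buf, n_bytes));

    void * dev_quant = nullptr;
    CUDA_CHECK(cudaMalloc(&dev_quant, n_bytes));

    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    // Pipeline in chunks: while the dequantize kernel for chunk i runs,
    // the copy engine is already transferring chunk i+1.
    const int    n_chunks = 8;
    const size_t chunk    = n_bytes / n_chunks;
    for (int i = 0; i < n_chunks; ++i) {
        char * src = (char *) host_buf  + (size_t) i * chunk;
        char * dst = (char *) dev_quant + (size_t) i * chunk;
        CUDA_CHECK(cudaMemcpyAsync(dst, src, chunk, cudaMemcpyHostToDevice, stream));
        // The dequantize kernel for this chunk would be launched here on
        // the same stream; stream ordering guarantees it sees the fully
        // copied data (kernel name is hypothetical):
        // dequantize_q4_0<<<grid, block, 0, stream>>>(dst, dev_f32, ...);
    }
    CUDA_CHECK(cudaStreamSynchronize(stream));

    CUDA_CHECK(cudaFree(dev_quant));
    CUDA_CHECK(cudaFreeHost(host_buf));
    CUDA_CHECK(cudaStreamDestroy(stream));
    return 0;
}

Stream ordering is what makes the single-stream version safe: the kernel for chunk i cannot start before its copy completes, yet the copy engine can already begin on chunk i+1, which is where the overlap comes from.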
Diffstat (limited to 'llama.cpp')
-rw-r--r--  llama.cpp | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
@@ -136,7 +136,7 @@ struct llama_kv_cache {
 
     struct ggml_context * ctx = NULL;
 
-    llama_buffer buf;
+    llama_ctx_buffer buf;
 
     int n; // number of tokens currently in the cache
 
@@ -167,7 +167,7 @@ struct llama_model {
     struct llama_kv_cache kv_self;
 
     // the model memory buffer
-    llama_buffer buf;
+    llama_ctx_buffer buf;
 
     // model memory mapped file
     std::unique_ptr<llama_mmap> mapping;
@@ -228,8 +228,8 @@ struct llama_context {
 
     // memory buffers used to evaluate the model
    // TODO: move in llama_state
-    llama_buffer buf_compute;
-    llama_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
+    llama_ctx_buffer buf_compute;
+    llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
 
     int buf_last = 0;
     size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
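The llama.cpp side of the change only swaps llama_buffer for llama_ctx_buffer; the interesting behavior lives in that type, which is defined outside this file. Presumably it keeps a plain heap allocation in CPU-only builds and switches to pinned host memory when GGML_USE_CUBLAS is defined. A hedged sketch of that shape follows; the _sketch suffix marks it as illustrative, not the commit's actual definition, and the real code may route the allocation through a ggml helper rather than calling cudaMallocHost directly.

// Hedged sketch of the shape llama_ctx_buffer likely has.
#include <cstdint>
#include <cstdlib>
#ifdef GGML_USE_CUBLAS
#include <cuda_runtime.h>
#endif

struct llama_ctx_buffer_sketch {
    uint8_t * addr = nullptr;
    size_t    size = 0;

    void resize(size_t n) {
        free_mem();
#ifdef GGML_USE_CUBLAS
        // Page-locked host memory: enables fast, overlappable H2D copies.
        if (cudaMallocHost((void **) &addr, n) != cudaSuccess) {
            addr = nullptr;
        }
#else
        addr = (uint8_t *) malloc(n);
#endif
        size = addr ? n : 0;
    }

    void free_mem() {
#ifdef GGML_USE_CUBLAS
        if (addr) { cudaFreeHost(addr); }
#else
        free(addr);
#endif
        addr = nullptr;
        size = 0;
    }

    ~llama_ctx_buffer_sketch() { free_mem(); }
};

Because the kv cache, the model buffer, the compute buffer, and the scratch buffers all move to this type, every host-side buffer that feeds the GPU ends up pinned in cuBLAS builds, which is exactly what the "also pin kv cache" bullet in the commit message refers to.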