summary | refs | log | tree | commit | diff
path: root/llama.cpp
diff options
context:
space:
mode:
author: slaren <2141330+slaren@users.noreply.github.com> 2023-04-29 02:04:18 +0200
committer: GitHub <noreply@github.com> 2023-04-29 02:04:18 +0200
commit: 7fc50c051ae8a78e9643fdf172d12e20f2dd9b6c (patch)
tree: cc017db2f3443a39221ad319ab51df0925012e84 /llama.cpp
parent: b1ee8f59b4101b46999a0995d9a34506f7285466 (diff)
cuBLAS: use host pinned memory and dequantize while copying (#1207)
* cuBLAS: dequantize simultaneously while copying memory
* cuBLAS: use host pinned memory
* cuBLAS: improve ggml_compute_forward_mul_mat_f16_f32 with pinned memory
* cuBLAS: also pin kv cache
* fix rebase
Diffstat (limited to 'llama.cpp')
-rw-r--r-- llama.cpp | 8
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/llama.cpp b/llama.cpp
index 45f0d44a..4699e5cf 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -136,7 +136,7 @@ struct llama_kv_cache {
struct ggml_context * ctx = NULL;
- llama_buffer buf;
+ llama_ctx_buffer buf;
int n; // number of tokens currently in the cache
@@ -167,7 +167,7 @@ struct llama_model {
struct llama_kv_cache kv_self;
// the model memory buffer
- llama_buffer buf;
+ llama_ctx_buffer buf;
// model memory mapped file
std::unique_ptr<llama_mmap> mapping;
@@ -228,8 +228,8 @@ struct llama_context {
// memory buffers used to evaluate the model
// TODO: move in llama_state
- llama_buffer buf_compute;
- llama_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
+ llama_ctx_buffer buf_compute;
+ llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
int buf_last = 0;
size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };