author    slaren <slarengh@gmail.com>    2023-09-28 21:42:38 +0200
committer GitHub <noreply@github.com>    2023-09-28 22:42:38 +0300
commit    16bc66d9479edd5ee12ec734973554d4493c5dfa (patch)
tree      4cca787ebd86dd55fd176d27112117c74e9b34c6 /ggml-cuda.cu
parent    0512d66670de3f650c579519833c085014b0f200 (diff)
llama.cpp : split llama_context_params into model and context params (#3301)
* llama.cpp : split llama_context_params into model and context params

ggml-ci

* fix metal build
* fix freq_base/scale default to model value
* llama-bench : keep the same model between tests when possible
* move n_threads to llama_context_params, add n_threads_batch
* fix mpi build
* remove kv_size(), cuda scratch fixes
* remove low-vram option
* add n_threads_batch to system info, refactor to get_system_info()
* add documentation about --threads-batch to the READMEs
* llama-bench fix
* main : fix rope freq/scale warning
* llama.cpp : add llama_get_model
* common : add llama_tokenize from model
* remove duplicated ctx/model functions

ggml-ci

* cuda : print total VRAM used
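The headline change splits the old llama_context_params into llama_model_params (settings fixed at load time) and llama_context_params (per-context settings, now including n_threads and the new n_threads_batch). A minimal sketch of the resulting call pattern against llama.h as of this commit; the model path and field values below are placeholders:

    #include "llama.h"

    int main(void) {
        // load-time settings now live in llama_model_params
        struct llama_model_params mparams = llama_model_default_params();
        mparams.n_gpu_layers = 32;                 // offload 32 layers to the GPU

        // per-context settings stay in llama_context_params
        struct llama_context_params cparams = llama_context_default_params();
        cparams.n_ctx           = 2048;            // context size
        cparams.n_threads       = 8;               // threads for single-token eval
        cparams.n_threads_batch = 8;               // new: threads for batch eval

        struct llama_model   * model = llama_load_model_from_file("model.gguf", mparams);
        struct llama_context * ctx   = llama_new_context_with_model(model, cparams);

        // ... evaluate tokens ...

        llama_free(ctx);
        llama_free_model(model);
        return 0;
    }

Keeping the model-level struct separate is what lets llama-bench reuse one loaded model across test configurations, per the "keep the same model between tests" bullet above.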
Diffstat (limited to 'ggml-cuda.cu')
-rw-r--r--  ggml-cuda.cu  24
1 file changed, 14 insertions(+), 10 deletions(-)
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 29fb7abd..86d1fe20 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -1,3 +1,4 @@
+#include <algorithm>
 #include <cstddef>
 #include <cstdint>
 #include <limits>
@@ -467,7 +468,7 @@ static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
 static bool g_mul_mat_q = true;
 
 static void * g_scratch_buffer = nullptr;
-static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
+static size_t g_scratch_size = 0; // disabled by default
 static size_t g_scratch_offset = 0;
 
 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
@@ -6738,14 +6739,10 @@ bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_te
     const int64_t ne1 = dst->ne[1];
 
     // TODO: find the optimal values for these
-    if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
-        src1->type == GGML_TYPE_F32 &&
-         dst->type == GGML_TYPE_F32 &&
-        (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
-        return true;
-    }
-
-    return false;
+    return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
+           src1->type == GGML_TYPE_F32 &&
+            dst->type == GGML_TYPE_F32 &&
+           (ne0 >= 32 && ne1 >= 32 && ne10 >= 32);
 }
 
 static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
@@ -6901,6 +6898,8 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
         ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
                               ne10, ne11, nb10, nb11, nb12, main_stream);
     } else {
+        fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
+                ggml_type_name(src0->type), ggml_type_name(src1->type));
         GGML_ASSERT(false);
     }
 
@@ -7198,7 +7197,12 @@ void ggml_cuda_set_mul_mat_q(const bool mul_mat_q) {
 }
 
 void ggml_cuda_set_scratch_size(const size_t scratch_size) {
-    g_scratch_size = scratch_size;
+    // this is a hack to not completely break llama.cpp when using multiple models or contexts simultaneously
+    // it still won't always work as expected, but it's better than nothing
+    if (scratch_size > g_scratch_size) {
+        ggml_cuda_free_scratch();
+    }
+    g_scratch_size = std::max(g_scratch_size, scratch_size);
 }
 
 void ggml_cuda_free_scratch() {
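Read on its own, the new ggml_cuda_set_scratch_size() makes the scratch size grow-only: a request larger than the current size frees the old buffer so it can be reallocated bigger on next use, while a smaller request changes nothing. A standalone sketch of that policy, reusing the global names from the diff but substituting plain host memory for the lazily allocated CUDA buffer:

    #include <algorithm>
    #include <cstdlib>

    static void * g_scratch_buffer = nullptr;
    static size_t g_scratch_size   = 0; // disabled by default, matching the diff

    static void free_scratch() {
        std::free(g_scratch_buffer); // stands in for cudaFree in the real code
        g_scratch_buffer = nullptr;
    }

    static void set_scratch_size(size_t scratch_size) {
        // grow-only policy: invalidate the buffer only when a larger size is
        // requested; never shrink, so every live context sees a buffer at
        // least as large as the biggest request so far
        if (scratch_size > g_scratch_size) {
            free_scratch();
        }
        g_scratch_size = std::max(g_scratch_size, scratch_size);
    }

With two contexts requesting, say, 512 MB and 1 GB in either order, both end up sharing a 1 GB buffer; as the in-code comment says, this is a stopgap for multiple simultaneous models rather than proper per-context scratch management.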