| author | slaren <slarengh@gmail.com> | 2023-09-28 21:42:38 +0200 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2023-09-28 22:42:38 +0300 |
| commit | 16bc66d9479edd5ee12ec734973554d4493c5dfa (patch) | |
| tree | 4cca787ebd86dd55fd176d27112117c74e9b34c6 /ggml-cuda.cu | |
| parent | 0512d66670de3f650c579519833c085014b0f200 (diff) | |
llama.cpp : split llama_context_params into model and context params (#3301)
* llama.cpp : split llama_context_params into model and context params
ggml-ci
* fix metal build
* fix freq_base/scale default to model value
* llama-bench : keep the same model between tests when possible
* move n_threads to llama_context_params, add n_threads_batch
* fix mpi build
* remove kv_size(), cuda scratch fixes
* remove low-vram option
* add n_threads_batch to system info, refactor to get_system_info()
* add documentation about --threads-batch to the READMEs
* llama-bench fix
* main : fix rope freq/scale warning
* llama.cpp : add llama_get_model
common : add llama_tokenize from model
* remove duplicated ctx/model functions
ggml-ci
* cuda : print total VRAM used
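The commit message above describes splitting the old monolithic `llama_context_params` into model-level and context-level parameter structs, moving `n_threads` to the context params, adding `n_threads_batch`, and adding `llama_get_model`. Below is a minimal sketch of how the split API is used from caller code; the function and field names follow `llama.h` as of this commit, but treat the sketch as illustrative rather than authoritative.

```cpp
// Minimal sketch (not from the diff): load a model and create a context with
// the split parameter structs introduced by this commit.
#include "llama.h"

#include <cstdio>

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s <model>\n", argv[0]);
        return 1;
    }

    llama_backend_init(false);

    // model-level settings: how the weights are loaded and placed
    llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = 99;          // offload as many layers as possible

    llama_model * model = llama_load_model_from_file(argv[1], mparams);
    if (model == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    // context-level settings: per-context runtime behavior
    llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx           = 2048;
    cparams.n_threads       = 8;        // threads for single-token (generation) evals
    cparams.n_threads_batch = 8;        // threads for batch (prompt) evals, added by this commit

    llama_context * ctx = llama_new_context_with_model(model, cparams);

    // llama_get_model (added by this commit) recovers the model behind a context
    const llama_model * same_model = llama_get_model(ctx);
    (void) same_model;

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```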
Diffstat (limited to 'ggml-cuda.cu')
-rw-r--r-- | ggml-cuda.cu | 24 |
1 file changed, 14 insertions, 10 deletions
```diff
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 29fb7abd..86d1fe20 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -1,3 +1,4 @@
+#include <algorithm>
 #include <cstddef>
 #include <cstdint>
 #include <limits>
@@ -467,7 +468,7 @@ static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
 static bool g_mul_mat_q = true;
 
 static void * g_scratch_buffer = nullptr;
-static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
+static size_t g_scratch_size = 0; // disabled by default
 static size_t g_scratch_offset = 0;
 
 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
@@ -6738,14 +6739,10 @@ bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_te
     const int64_t ne1 = dst->ne[1];
 
     // TODO: find the optimal values for these
-    if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
-        src1->type == GGML_TYPE_F32 &&
-        dst->type == GGML_TYPE_F32 &&
-        (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
-        return true;
-    }
-
-    return false;
+    return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
+           src1->type == GGML_TYPE_F32 &&
+           dst->type == GGML_TYPE_F32 &&
+           (ne0 >= 32 && ne1 >= 32 && ne10 >= 32);
 }
 
 static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
@@ -6901,6 +6898,8 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
         ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
                               ne10, ne11, nb10, nb11, nb12, main_stream);
     } else {
+        fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
+                ggml_type_name(src0->type), ggml_type_name(src1->type));
         GGML_ASSERT(false);
     }
 
@@ -7198,7 +7197,12 @@ void ggml_cuda_set_mul_mat_q(const bool mul_mat_q) {
 }
 
 void ggml_cuda_set_scratch_size(const size_t scratch_size) {
-    g_scratch_size = scratch_size;
+    // this is a hack to not completely break llama.cpp when using multiple models or contexts simultaneously
+    // it still won't always work as expected, but it's better than nothing
+    if (scratch_size > g_scratch_size) {
+        ggml_cuda_free_scratch();
+    }
+    g_scratch_size = std::max(g_scratch_size, scratch_size);
 }
 
 void ggml_cuda_free_scratch() {
```
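The last hunk replaces the blind assignment in `ggml_cuda_set_scratch_size` with a grow-only policy: a larger request frees the existing scratch buffer so it can be reallocated at the new size, while a smaller request is ignored. Below is a host-only sketch of that policy under stated assumptions: `scratch_set_size`/`scratch_free` are stand-ins for the CUDA functions in the diff, and plain `new`/`delete` replaces the device allocation done elsewhere in ggml-cuda.cu.

```cpp
// Standalone sketch (not part of the diff) of a grow-only shared scratch buffer.
#include <algorithm>
#include <cstddef>
#include <cstdio>

static void * g_scratch_buffer = nullptr;
static size_t g_scratch_size   = 0; // disabled until a size is requested

static void scratch_free() {
    // stands in for cudaFree(); deleting nullptr is a no-op
    delete[] static_cast<char *>(g_scratch_buffer);
    g_scratch_buffer = nullptr;
}

static void scratch_set_size(size_t scratch_size) {
    // growing invalidates the current buffer so it can be reallocated larger;
    // shrinking is ignored so contexts sized against the bigger buffer keep working
    if (scratch_size > g_scratch_size) {
        scratch_free();
    }
    g_scratch_size = std::max(g_scratch_size, scratch_size);
}

int main() {
    scratch_set_size(64u * 1024 * 1024);          // first context asks for 64 MB
    scratch_set_size(16u * 1024 * 1024);          // smaller request: size stays at 64 MB
    g_scratch_buffer = new char[g_scratch_size];  // lazy allocation on first use
    printf("scratch size: %zu bytes\n", g_scratch_size);
    scratch_free();
    return 0;
}
```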