diff options
author | slaren <slarengh@gmail.com> | 2023-12-24 14:34:22 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-12-24 14:34:22 +0100 |
commit | 5bf3953d7e9831ea22b0bc017ce97409b801ccf1 (patch) | |
tree | 48c0136d9943fb9cca22209894464970549c24b5 /llama.cpp | |
parent | 708e179e8562c2604240df95a2241dea17fd808b (diff) |
cuda : improve cuda pool efficiency using virtual memory (#4606)
* cuda : improve cuda pool efficiency using virtual memory
* fix mixtral
* fix cmake build
* check for vmm support, disable for hip
ggml-ci
* fix hip build
* clarify granularity
* move all caps to g_device_caps
* refactor error checking
* add cuda_pool_alloc, refactor most pool allocations
ggml-ci
* fix hip build
* CUBLAS_TF32_TENSOR_OP_MATH is not a macro
* more hip crap
* llama : fix msvc warnings
* ggml : fix msvc warnings
* minor
* minor
* cuda : fallback to CPU on host buffer alloc fail
* Update ggml-cuda.cu
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
* Update ggml-cuda.cu
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
* ensure allocations are always aligned
* act_size -> actual_size
---------
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
Diffstat (limited to 'llama.cpp')
-rw-r--r-- | llama.cpp | 6 |
1 files changed, 3 insertions, 3 deletions
@@ -1281,7 +1281,7 @@ struct llama_hparams { if (this->rope_finetuned != other.rope_finetuned) return true; if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true; - const float EPSILON = 1e-9; + const float EPSILON = 1e-9f; if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true; if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true; @@ -10300,7 +10300,7 @@ int llama_token_to_piece(const struct llama_model * model, llama_token token, ch std::string result = model->vocab.id_to_token[token].text; llama_unescape_whitespace(result); if (length < (int) result.length()) { - return -result.length(); + return -(int) result.length(); } memcpy(buf, result.c_str(), result.length()); return result.length(); @@ -10330,7 +10330,7 @@ int llama_token_to_piece(const struct llama_model * model, llama_token token, ch std::string result = model->vocab.id_to_token[token].text; result = llama_decode_text(result); if (length < (int) result.length()) { - return -result.length(); + return -(int) result.length(); } memcpy(buf, result.c_str(), result.length()); return result.length(); |