llama : fix defrag bugs + add parameter (#5735)

* llama : fix defrag bugs + enable by default
ggml-ci
* llama : add defrag_thold parameter
ggml-ci
* llama : cont
* llama : disable log message
ggml-ci
* llama : fix graph size check during defrag
author: Georgi Gerganov <ggerganov@gmail.com> 2024-02-27 14:35:51 +0200
committer: GitHub <noreply@github.com> 2024-02-27 14:35:51 +0200
commit: 9d533a77d0c3850ce09d736bc1baa67fd6ad27b3 (patch)
tree: 25adffcbb0f7c13a8578279456a4937ed73ae3f6 /common/common.h
parent: cbbd1efa06f8c09f9dff58ff9d9af509cc4c152b (diff)
1 file changed, 1 insertion, 0 deletions
diff --git a/common/common.h b/common/common.h
index 3e21579b..25003df2 100644
--- a/common/common.h
+++ b/common/common.h
@@ -75,6 +75,7 @@ struct gpt_params {
     float   yarn_beta_fast        = 32.0f; // YaRN low correction dim
     float   yarn_beta_slow        = 1.0f;  // YaRN high correction dim
     int32_t yarn_orig_ctx         = 0;     // YaRN original context length
+    float   defrag_thold          = -1.0f; // KV cache defragmentation threshold
     int32_t rope_scaling_type     = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
     ggml_numa_strategy numa       = GGML_NUMA_STRATEGY_DISABLED;
author	Georgi Gerganov <ggerganov@gmail.com>	2024-02-27 14:35:51 +0200
committer	GitHub <noreply@github.com>	2024-02-27 14:35:51 +0200
commit	9d533a77d0c3850ce09d736bc1baa67fd6ad27b3 (patch)
tree	25adffcbb0f7c13a8578279456a4937ed73ae3f6 /common/common.h
parent	cbbd1efa06f8c09f9dff58ff9d9af509cc4c152b (diff)