lookup : add prompt lookup decoding example (#4484)

* initial commit, going through initializations
* main loop finished, starting to debug
* BUG: generates gibberish/repeating tokens after a while
* kv_cache management
* Added colors to distinguish drafted tokens (--color). Updated README
* lookup : fix token positions in the draft batch
* lookup : use n_draft from CLI params
* lookup : final touches

---------

Co-authored-by: Leon Ericsson <leon.ericsson@icloud.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
author: LeonEricsson <70749762+LeonEricsson@users.noreply.github.com> 2023-12-22 17:05:56 +0100
committer: GitHub <noreply@github.com> 2023-12-22 18:05:56 +0200
commit: 7082d24cec35e9ce9147535a2224dfc67ee0a78c (patch)
tree: b87d0e65d71c8e2a5bdb889483c75d4429d2d566 /common
parent: ba661751322a7c201fd3bef71af077c5aebfaa2a (diff)
1 files changed, 2 insertions, 1 deletions
diff --git a/common/common.h b/common/common.h
index e87ce113..9659aa04 100644
--- a/common/common.h
+++ b/common/common.h
@@ -51,7 +51,7 @@ struct gpt_params {
     int32_t n_ctx                           = 512;   // context size
     int32_t n_batch                         = 512;   // batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_keep                          = 0;     // number of tokens to keep from initial prompt
-    int32_t n_draft                         = 16;    // number of tokens to draft during speculative decoding
+    int32_t n_draft                         = 8;     // number of tokens to draft during speculative decoding
     int32_t n_chunks                        = -1;    // max number of chunks to process (-1 = unlimited)
     int32_t n_parallel                      = 1;     // number of parallel sequences to decode
     int32_t n_sequences                     = 1;     // number of sequences to decode
@@ -240,3 +240,4 @@ void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80);
 
 // Dump the KV cache view showing individual sequences in each cell (long output).
 void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
+
author	LeonEricsson <70749762+LeonEricsson@users.noreply.github.com>	2023-12-22 17:05:56 +0100
committer	GitHub <noreply@github.com>	2023-12-22 18:05:56 +0200
commit	7082d24cec35e9ce9147535a2224dfc67ee0a78c (patch)
tree	b87d0e65d71c8e2a5bdb889483c75d4429d2d566 /common
parent	ba661751322a7c201fd3bef71af077c5aebfaa2a (diff)