1 files changed, 15 insertions, 6 deletions
diff --git a/examples/common.h b/examples/common.h
index 9d3697d7..14e6b1ba 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -8,6 +8,7 @@
 #include <vector>
 #include <random>
 #include <thread>
+#include <unordered_map>
 
 //
 // CLI argument parsing
@@ -17,17 +18,25 @@ struct gpt_params {
     int32_t seed          = -1;   // RNG seed
     int32_t n_threads     = std::min(4, (int32_t) std::thread::hardware_concurrency());
     int32_t n_predict     = 128;  // new tokens to predict
-    int32_t repeat_last_n = 64;   // last n tokens to penalize
     int32_t n_parts       = -1;   // amount of model parts (-1 = determine from model dimensions)
     int32_t n_ctx         = 512;  // context size
     int32_t n_batch       = 512;  // batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_keep        = 0;    // number of tokens to keep from initial prompt
 
     // sampling parameters
-    int32_t top_k = 40;
-    float   top_p = 0.95f;
-    float   temp  = 0.80f;
-    float   repeat_penalty  = 1.10f;
+    std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
+    int32_t top_k = 0;              // <= 0 to use vocab size
+    float   top_p = 1.0f;           // 1.0 = disabled
+    float   tfs_z = 1.0f;           // 1.0 = disabled
+    float   typical_p = 1.0f;       // 1.0 = disabled
+    float   temp = 1.0f;            // 1.0 = disabled
+    float   repeat_penalty  = 1.0f; // 1.0 = disabled
+    int32_t repeat_last_n = -1;     // last n tokens to penalize (0 = disable penalty, -1 = context size)
+    float   frequency_penalty = 0.0f; // 0.0 = disabled
+    float   presence_penalty = 0.0f;  // 0.0 = disabled
+    int     mirostat = 0;           // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
+    float   mirostat_tau = 5.0f;    // target entropy
+    float   mirostat_eta = 0.1f;    // learning rate
 
     std::string model  = "models/lamma-7B/ggml-model.bin"; // model path
     std::string prompt = "";
@@ -47,7 +56,7 @@ struct gpt_params {
     bool interactive_first = false; // wait for user input immediately
 
     bool instruct          = false; // instruction mode (used for Alpaca models)
-    bool ignore_eos        = false; // do not stop generating after eos
+    bool penalize_nl       = true;  // consider newlines as a repeatable token
     bool perplexity        = false; // compute perplexity over the prompt
     bool use_mmap          = true;  // use mmap for faster loads
     bool use_mlock         = false; // use mlock to keep model in memory