diff options
author | Johannes Gäßler <johannesg@5d6.de> | 2024-04-24 11:08:36 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-04-24 11:08:36 +0200 |
commit | 28103f4832e301a9c84d44ff0df9d75d46ab6c76 (patch) | |
tree | 8ba391e3a7e0ce9a20d4b41782ef133bd7e32738 /common/sampling.h | |
parent | c0d1b3e03e27634ac2871761f5033cf9324d472d (diff) |
Server: fix seed for multiple slots (#6835)
* Server: add tests for consistent results
* sampling: separate rng per sampling context
Diffstat (limited to 'common/sampling.h')
-rw-r--r-- | common/sampling.h | 47 |
1 files changed, 27 insertions, 20 deletions
```diff
diff --git a/common/sampling.h b/common/sampling.h
index 639b819a..cf7081e3 100644
--- a/common/sampling.h
+++ b/common/sampling.h
@@ -4,9 +4,10 @@
 
 #include "grammar-parser.h"
 
+#include <random>
 #include <string>
-#include <vector>
 #include <unordered_map>
+#include <vector>
 
 // sampler types
 enum class llama_sampler_type : char {
@@ -20,25 +21,26 @@ enum class llama_sampler_type : char {
 
 // sampling parameters
 typedef struct llama_sampling_params {
-    int32_t     n_prev                = 64;       // number of previous tokens to remember
-    int32_t     n_probs               = 0;        // if greater than 0, output the probabilities of top n_probs tokens.
-    int32_t     min_keep              = 0;        // 0 = disabled, otherwise samplers should return at least min_keep tokens
-    int32_t     top_k                 = 40;       // <= 0 to use vocab size
-    float       top_p                 = 0.95f;    // 1.0 = disabled
-    float       min_p                 = 0.05f;    // 0.0 = disabled
-    float       tfs_z                 = 1.00f;    // 1.0 = disabled
-    float       typical_p             = 1.00f;    // 1.0 = disabled
-    float       temp                  = 0.80f;    // <= 0.0 to sample greedily, 0.0 to not output probabilities
-    float       dynatemp_range        = 0.00f;    // 0.0 = disabled
-    float       dynatemp_exponent     = 1.00f;    // controls how entropy maps to temperature in dynamic temperature sampler
-    int32_t     penalty_last_n        = 64;       // last n tokens to penalize (0 = disable penalty, -1 = context size)
-    float       penalty_repeat        = 1.00f;    // 1.0 = disabled
-    float       penalty_freq          = 0.00f;    // 0.0 = disabled
-    float       penalty_present       = 0.00f;    // 0.0 = disabled
-    int32_t     mirostat              = 0;        // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
-    float       mirostat_tau          = 5.00f;    // target entropy
-    float       mirostat_eta          = 0.10f;    // learning rate
-    bool        penalize_nl           = false;    // consider newlines as a repeatable token
+    int32_t     n_prev                = 64;                 // number of previous tokens to remember
+    int32_t     n_probs               = 0;                  // if greater than 0, output the probabilities of top n_probs tokens.
+    int32_t     min_keep              = 0;                  // 0 = disabled, otherwise samplers should return at least min_keep tokens
+    int32_t     top_k                 = 40;                 // <= 0 to use vocab size
+    float       top_p                 = 0.95f;              // 1.0 = disabled
+    float       min_p                 = 0.05f;              // 0.0 = disabled
+    float       tfs_z                 = 1.00f;              // 1.0 = disabled
+    float       typical_p             = 1.00f;              // 1.0 = disabled
+    float       temp                  = 0.80f;              // <= 0.0 to sample greedily, 0.0 to not output probabilities
+    float       dynatemp_range        = 0.00f;              // 0.0 = disabled
+    float       dynatemp_exponent     = 1.00f;              // controls how entropy maps to temperature in dynamic temperature sampler
+    int32_t     penalty_last_n        = 64;                 // last n tokens to penalize (0 = disable penalty, -1 = context size)
+    float       penalty_repeat        = 1.00f;              // 1.0 = disabled
+    float       penalty_freq          = 0.00f;              // 0.0 = disabled
+    float       penalty_present       = 0.00f;              // 0.0 = disabled
+    int32_t     mirostat              = 0;                  // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
+    float       mirostat_tau          = 5.00f;              // target entropy
+    float       mirostat_eta          = 0.10f;              // learning rate
+    bool        penalize_nl           = false;              // consider newlines as a repeatable token
+    uint32_t    seed                  = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampling_context
 
     std::vector<llama_sampler_type> samplers_sequence = {
         llama_sampler_type::TOP_K,
@@ -79,6 +81,8 @@ struct llama_sampling_context {
     // TODO: replace with ring-buffer
     std::vector<llama_token>      prev;
     std::vector<llama_token_data> cur;
+
+    std::mt19937 rng;
 };
 
 #include "common.h"
@@ -93,6 +97,9 @@ void llama_sampling_free(struct llama_sampling_context * ctx);
 // - reset grammar
 void llama_sampling_reset(llama_sampling_context * ctx);
 
+// Set the sampler seed
+void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed);
+
 // Copy the sampler context
 void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst);
 
```