author    Georgi Gerganov <ggerganov@gmail.com>   2023-10-20 21:07:23 +0300
committer GitHub <noreply@github.com>             2023-10-20 21:07:23 +0300
commit    d1031cf49c3b958b915fd558e23453471c29ac33 (patch)
tree      14fa2bc6d54d5e27bd1e8bfd6fa4dbf894dbe6b9 /llama.cpp
parent    8cf19d60dc93809db8e51fedc811595eed9134c5 (diff)
sampling : refactor init to use llama_sampling_params (#3696)
* sampling : refactor init to use llama_sampling_params
* llama : combine repetition, frequency and presence penalties in 1 call
* examples : remove embd-input and gptneox-wip
* sampling : rename penalty params + reduce size of "prev" vector
* sampling : add llama_sampling_print helper
* sampling : hide prev behind API and apply #3661

ggml-ci
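The headline change for callers is the combined penalty API: the two removed functions are replaced by a single llama_sample_repetition_penalties call whose signature appears in the diff below. A minimal migration sketch, assuming a hypothetical caller-side `prev` vector of recent tokens and illustrative penalty values (neither is taken from this commit):

    // before: two separate calls
    //   llama_sample_repetition_penalty              (ctx, &candidates, prev.data(), prev.size(), repeat_penalty);
    //   llama_sample_frequency_and_presence_penalties(ctx, &candidates, prev.data(), prev.size(), alpha_frequency, alpha_presence);

    // after: one combined call (parameter names follow the new signature)
    llama_sample_repetition_penalties(
        ctx, &candidates,
        prev.data(),   // last_tokens: recent context, most recent last
        prev.size(),   // penalty_last_n
        1.10f,         // penalty_repeat  (1.0f disables it)
        0.00f,         // penalty_freq    (0.0f disables it)
        0.00f);        // penalty_present (0.0f disables it)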
Diffstat (limited to 'llama.cpp')
-rw-r--r--  llama.cpp  94
1 file changed, 39 insertions, 55 deletions
diff --git a/llama.cpp b/llama.cpp
index ec8ffad3..36534933 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1018,8 +1018,8 @@ enum e_model {
};
static const size_t kB = 1024;
-static const size_t MB = kB*kB;
-static const size_t GB = kB*kB*kB;
+static const size_t MB = 1024*kB;
+static const size_t GB = 1024*MB;
struct llama_hparams {
bool vocab_only;
@@ -1042,21 +1042,21 @@ struct llama_hparams {
float f_max_alibi_bias;
bool operator!=(const llama_hparams & other) const {
- if (this->vocab_only != other.vocab_only) return true;
- if (this->n_vocab != other.n_vocab) return true;
+ if (this->vocab_only != other.vocab_only) return true;
+ if (this->n_vocab != other.n_vocab) return true;
if (this->n_ctx_train != other.n_ctx_train) return true;
- if (this->n_embd != other.n_embd) return true;
- if (this->n_head != other.n_head) return true;
- if (this->n_head_kv != other.n_head_kv) return true;
- if (this->n_layer != other.n_layer) return true;
- if (this->n_rot != other.n_rot) return true;
- if (this->n_ff != other.n_ff) return true;
+ if (this->n_embd != other.n_embd) return true;
+ if (this->n_head != other.n_head) return true;
+ if (this->n_head_kv != other.n_head_kv) return true;
+ if (this->n_layer != other.n_layer) return true;
+ if (this->n_rot != other.n_rot) return true;
+ if (this->n_ff != other.n_ff) return true;
const float EPSILON = 1e-9;
- if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true;
- if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true;
- if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true;
+ if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true;
+ if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true;
+ if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true;
if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
return false;
@@ -1195,11 +1195,11 @@ struct llama_vocab {
id special_sep_id = -1;
id special_pad_id = -1;
- id linefeed_id = 13;
+ id linefeed_id = 13;
id special_prefix_id = 32007;
id special_middle_id = 32009;
id special_suffix_id = 32008;
- id special_eot_id = 32010;
+ id special_eot_id = 32010;
int find_bpe_rank(std::string token_left, std::string token_right) const {
replace_all(token_left, " ", "\u0120");
@@ -1359,10 +1359,7 @@ static bool llama_kv_cache_init(
cache.cells.clear();
cache.cells.resize(n_ctx);
- // TODO: this should be:
- // cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*ggml_tensor_overhead());
- // change it and test that it works
- cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
+ cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*ggml_tensor_overhead());
memset(cache.buf.data, 0, cache.buf.size);
struct ggml_init_params params;
@@ -7417,37 +7414,15 @@ void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array
llama_sample_temp(ctx, candidates_p, temp);
}
-void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty) {
- if (last_tokens_size == 0 || penalty == 1.0f) {
- return;
- }
-
- const int64_t t_start_sample_us = ggml_time_us();
-
- for (size_t i = 0; i < candidates->size; ++i) {
- const auto * token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id);
- if (token_iter == last_tokens + last_tokens_size) {
- continue;
- }
-
- // The academic publication that described this technique only divided by the penalty, but that would make tokens with negative logits more likely, which is clearly wrong.
- // A common fix, applied here, is to multiply by the penalty instead when the logit is negative.
- if (candidates->data[i].logit <= 0) {
- candidates->data[i].logit *= penalty;
- } else {
- candidates->data[i].logit /= penalty;
- }
- }
-
- candidates->sorted = false;
-
- if (ctx) {
- ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
- }
-}
-
-void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens_p, size_t last_tokens_size, float alpha_frequency, float alpha_presence) {
- if (last_tokens_size == 0 || (alpha_frequency == 0.0f && alpha_presence == 0.0f)) {
+void llama_sample_repetition_penalties(
+ struct llama_context * ctx,
+ llama_token_data_array * candidates,
+ const llama_token * last_tokens,
+ size_t penalty_last_n,
+ float penalty_repeat,
+ float penalty_freq,
+ float penalty_present) {
+ if (penalty_last_n == 0 || (penalty_repeat == 1.0f && penalty_freq == 0.0f && penalty_present == 0.0f)) {
return;
}
@@ -7455,19 +7430,28 @@ void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, l
// Create a frequency map to count occurrences of each token in last_tokens
std::unordered_map<llama_token, int> token_count;
- for (size_t i = 0; i < last_tokens_size; ++i) {
- token_count[last_tokens_p[i]]++;
+ for (size_t i = 0; i < penalty_last_n; ++i) {
+ token_count[last_tokens[i]]++;
}
// Apply frequency and presence penalties to the candidates
for (size_t i = 0; i < candidates->size; ++i) {
- auto token_iter = token_count.find(candidates->data[i].id);
+ const auto token_iter = token_count.find(candidates->data[i].id);
if (token_iter == token_count.end()) {
continue;
}
- int count = token_iter->second;
- candidates->data[i].logit -= float(count) * alpha_frequency + float(count > 0) * alpha_presence;
+ const int count = token_iter->second;
+
+ // The academic publication that described this technique only divided by the penalty, but that would make tokens with negative logits more likely, which is clearly wrong.
+ // A common fix, applied here, is to multiply by the penalty instead when the logit is negative.
+ if (candidates->data[i].logit <= 0) {
+ candidates->data[i].logit *= penalty_repeat;
+ } else {
+ candidates->data[i].logit /= penalty_repeat;
+ }
+
+ candidates->data[i].logit -= float(count) * penalty_freq + float(count > 0) * penalty_present;
}
candidates->sorted = false;
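For reference, a minimal standalone sketch of the penalty arithmetic the combined function applies, using plain vectors in place of llama.cpp's sampling types; the token ids, logits, and penalty values are illustrative only:

    // sketch only: mirrors the multiply/divide-by-sign repeat penalty followed by
    // the frequency/presence subtraction, outside of llama.cpp's types
    #include <cstdio>
    #include <unordered_map>
    #include <vector>

    int main() {
        std::vector<int>   ids    = { 7, 13, 42 };          // candidate token ids
        std::vector<float> logits = { 2.0f, -1.0f, 0.5f };  // candidate logits
        std::vector<int>   last   = { 13, 13, 42 };         // recent context window

        const float penalty_repeat  = 1.1f;
        const float penalty_freq    = 0.2f;
        const float penalty_present = 0.1f;

        std::unordered_map<int, int> count;                 // occurrences per token
        for (int t : last) count[t]++;

        for (size_t i = 0; i < ids.size(); ++i) {
            const auto it = count.find(ids[i]);
            if (it == count.end()) continue;                // token not in the window

            // repeat penalty: divide positive logits, multiply non-positive ones
            if (logits[i] <= 0) logits[i] *= penalty_repeat;
            else                logits[i] /= penalty_repeat;

            // frequency penalty scales with the count, presence penalty is flat
            logits[i] -= float(it->second) * penalty_freq + float(it->second > 0) * penalty_present;
        }

        for (size_t i = 0; i < ids.size(); ++i) {
            printf("token %d -> logit %.3f\n", ids[i], logits[i]);
        }
        return 0;
    }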