Diffstat (limited to 'llama.cpp')
-rw-r--r--   llama.cpp   52
1 file changed, 26 insertions(+), 26 deletions(-)
diff --git a/llama.cpp b/llama.cpp
index b0eab2e7..ee7eb8ea 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -779,8 +779,8 @@ static bool llama_model_load(
// progress
if (progress_callback) {
- double current_file_progress = double(size_t(fin.tellg()) - file_offset) / double(file_size - file_offset);
- double current_progress = (double(i) + current_file_progress) / double(n_parts);
+ float current_file_progress = float(size_t(fin.tellg()) - file_offset) / float(file_size - file_offset);
+ float current_progress = (float(i) + current_file_progress) / float(n_parts);
progress_callback(current_progress, progress_callback_user_data);
}
if (model.n_loaded % 8 == 0) {
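For reference, a minimal standalone sketch (not part of the commit; the helper name is hypothetical) of the progress fraction the hunk above computes: how far the read cursor is into the current part, folded into a single 0..1 value across all n_parts files.

#include <cstddef>

// Fraction of the current part already read, combined into overall progress over n_parts.
static float load_progress(std::size_t tell, std::size_t file_offset,
                           std::size_t file_size, int part, int n_parts) {
    const float current_file_progress = float(tell - file_offset) / float(file_size - file_offset);
    return (float(part) + current_file_progress) / float(n_parts);
}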
@@ -922,7 +922,7 @@ static bool llama_eval_internal(
struct ggml_tensor * KQ_scaled =
ggml_scale(ctx0,
KQ,
- ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head)));
+ ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));
// KQ_masked = mask_past(KQ_scaled)
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
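A small sketch (not from the commit) of the scale factor this hunk constructs: attention scores are divided by sqrt(d_head), with d_head = n_embd / n_head, and sqrtf keeps the expression in single precision.

#include <cmath>

// 1/sqrt(head dimension); e.g. n_embd = 4096, n_head = 32 gives d_head = 128.
static float attn_scale(int n_embd, int n_head) {
    const float d_head = float(n_embd) / float(n_head);
    return 1.0f / sqrtf(d_head);
}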
@@ -1240,12 +1240,12 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
// sampling
//
-static void sample_top_k(std::vector<std::pair<double, llama_vocab::id>> & logits_id, int top_k) {
+static void sample_top_k(std::vector<std::pair<float, llama_vocab::id>> & logits_id, int top_k) {
// find the top k tokens
std::partial_sort(
logits_id.begin(),
logits_id.begin() + top_k, logits_id.end(),
- [](const std::pair<double, llama_vocab::id> & a, const std::pair<double, llama_vocab::id> & b) {
+ [](const std::pair<float, llama_vocab::id> & a, const std::pair<float, llama_vocab::id> & b) {
return a.first > b.first;
});
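A self-contained sketch of the top-k selection above, with token_id standing in for llama_vocab::id: std::partial_sort moves the top_k largest logits to the front, and the trailing resize mirrors the truncation the full function performs just after this hunk.

#include <algorithm>
#include <utility>
#include <vector>

using token_id = int;   // stand-in for llama_vocab::id

static void top_k_sketch(std::vector<std::pair<float, token_id>> & logits_id, int top_k) {
    std::partial_sort(
        logits_id.begin(),
        logits_id.begin() + top_k, logits_id.end(),
        [](const std::pair<float, token_id> & a, const std::pair<float, token_id> & b) {
            return a.first > b.first;   // descending by logit
        });
    logits_id.resize(top_k);            // keep only the k best-scoring tokens
}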
@@ -1256,9 +1256,9 @@ static llama_vocab::id llama_sample_top_p_top_k(
llama_context & lctx,
const std::vector<llama_vocab::id> & last_n_tokens,
int top_k,
- double top_p,
- double temp,
- double repeat_penalty) {
+ float top_p,
+ float temp,
+ float repeat_penalty) {
auto & rng = lctx.rng;
const int n_logits = lctx.model.hparams.n_vocab;
@@ -1266,17 +1266,17 @@ static llama_vocab::id llama_sample_top_p_top_k(
const auto & logits = lctx.logits;
const auto * plogits = logits.data() + logits.size() - n_logits;
- std::vector<std::pair<double, llama_vocab::id>> logits_id;
+ std::vector<std::pair<float, llama_vocab::id>> logits_id;
logits_id.reserve(n_logits);
{
- const double scale = 1.0/temp;
+ const float scale = 1.0f/temp;
for (int i = 0; i < n_logits; ++i) {
// repetition penalty from ctrl paper (https://arxiv.org/abs/1909.05858)
// credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main
if (std::find(last_n_tokens.begin(), last_n_tokens.end(), i) != last_n_tokens.end()) {
// if score < 0 then repetition penalty has to multiplied to reduce the previous token probability
- if (plogits[i] < 0.0) {
+ if (plogits[i] < 0.0f) {
logits_id.push_back(std::make_pair(plogits[i]*scale*repeat_penalty, i));
} else {
logits_id.push_back(std::make_pair(plogits[i]*scale/repeat_penalty, i));
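The branch above applies the CTRL-paper repetition penalty; as a sketch (hypothetical helper, not commit code), the per-token rule is:

// For a token seen in the recent window, a negative logit is made more negative
// (multiplied by the penalty) and a positive one is shrunk (divided by it),
// so repeated tokens become less likely in either case.
static float penalized_logit(float logit, float scale, float repeat_penalty, bool recently_used) {
    if (!recently_used) {
        return logit * scale;
    }
    return logit < 0.0f ? logit * scale * repeat_penalty
                        : logit * scale / repeat_penalty;
}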
@@ -1289,18 +1289,18 @@ static llama_vocab::id llama_sample_top_p_top_k(
sample_top_k(logits_id, top_k);
- double maxl = -std::numeric_limits<double>::infinity();
+ float maxl = -std::numeric_limits<float>::infinity();
for (const auto & kv : logits_id) {
maxl = std::max(maxl, kv.first);
}
// compute probs for the top k tokens
- std::vector<double> probs;
+ std::vector<float> probs;
probs.reserve(logits_id.size());
double sum = 0.0;
for (const auto & kv : logits_id) {
- double p = exp(kv.first - maxl);
+ const float p = expf(kv.first - maxl);
probs.push_back(p);
sum += p;
}
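Taken together with the normalization loop in the next hunk, this is a numerically stable softmax over the top-k logits; a standalone sketch (not commit code):

#include <algorithm>
#include <cmath>
#include <limits>
#include <vector>

// Subtracting the running max before expf keeps the exponentials in range; the
// accumulator stays a double, as above, so many small float terms sum accurately.
static std::vector<float> softmax_sketch(const std::vector<float> & logits) {
    float maxl = -std::numeric_limits<float>::infinity();
    for (float l : logits) {
        maxl = std::max(maxl, l);
    }
    std::vector<float> probs;
    probs.reserve(logits.size());
    double sum = 0.0;
    for (float l : logits) {
        const float p = expf(l - maxl);
        probs.push_back(p);
        sum += p;
    }
    for (float & p : probs) {
        p /= sum;   // normalize so the probabilities sum to 1
    }
    return probs;
}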
@@ -1310,8 +1310,8 @@ static llama_vocab::id llama_sample_top_p_top_k(
p /= sum;
}
- if (top_p < 1.0f) {
- double cumsum = 0.0f;
+ if (top_p < 1.0) {
+ double cumsum = 0.0;
for (int i = 0; i < (int) probs.size(); i++) {
cumsum += probs[i];
if (cumsum >= top_p) {
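The cumulative-sum loop above is the nucleus (top-p) cutoff; a sketch of the whole step follows (the final renormalization is an assumption of this sketch, not something shown in the hunk):

#include <cstddef>
#include <vector>

// probs is assumed sorted in descending order (the top-k step above guarantees this);
// keep the smallest prefix whose cumulative mass reaches top_p.
static void top_p_sketch(std::vector<float> & probs, float top_p) {
    if (top_p >= 1.0f) {
        return;                  // nothing to truncate
    }
    double cumsum = 0.0;
    std::size_t keep = probs.size();
    for (std::size_t i = 0; i < probs.size(); ++i) {
        cumsum += probs[i];
        if (cumsum >= top_p) {
            keep = i + 1;        // keep tokens up to and including this one
            break;
        }
    }
    probs.resize(keep);
    for (float & p : probs) {
        p = float(p / cumsum);   // renormalize the kept mass to 1 (assumption)
    }
}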
@@ -1590,7 +1590,7 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
}
for (int i = 0; i < (int) hist_cur.size(); ++i) {
- printf("%5.3f ", hist_cur[i] / (float)nelements);
+ printf("%5.3f ", hist_cur[i] / float(nelements));
}
printf("\n");
} else {
@@ -1613,7 +1613,7 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
printf("%s: hist: ", __func__);
for (int i = 0; i < (int) hist_all.size(); ++i) {
- printf("%5.3f ", hist_all[i] / (float)sum_all);
+ printf("%5.3f ", hist_all[i] / float(sum_all));
}
printf("\n");
}
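Both printouts above report the same quantity: each histogram bucket divided by the total element count, i.e. the fraction of quantized weights that fell into that bin. A small sketch (hypothetical helper):

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

static void print_hist(const std::vector<int64_t> & hist, int64_t nelements) {
    for (std::size_t i = 0; i < hist.size(); ++i) {
        std::printf("%5.3f ", hist[i] / float(nelements));
    }
    std::printf("\n");
}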
@@ -1795,9 +1795,9 @@ llama_token llama_sample_top_p_top_k(
const llama_token * last_n_tokens_data,
int last_n_tokens_size,
int top_k,
- double top_p,
- double temp,
- double repeat_penalty) {
+ float top_p,
+ float temp,
+ float repeat_penalty) {
const int64_t t_start_sample_us = ggml_time_us();
llama_token result = 0;
@@ -1828,11 +1828,11 @@ void llama_print_timings(struct llama_context * ctx) {
const int32_t n_p_eval = std::max(1, ctx->n_p_eval);
fprintf(stderr, "\n");
- fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0f);
- fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->t_sample_us, n_sample, 1e-3f * ctx->t_sample_us / n_sample);
- fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token)\n", __func__, 1e-3f * ctx->t_p_eval_us, n_p_eval, 1e-3f * ctx->t_p_eval_us / n_p_eval);
- fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->t_eval_us, n_eval, 1e-3f * ctx->t_eval_us / n_eval);
- fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0f);
+ fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
+ fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample);
+ fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval);
+ fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval);
+ fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0);
}
void llama_reset_timings(struct llama_context * ctx) {
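The timing hunk switches the microsecond-to-millisecond conversion from float to double; the sketch below (not commit code) illustrates why: a float mantissa has 24 bits, so microsecond counts above about 16.7 million (roughly 17 seconds) are no longer represented exactly, while a double keeps them exact.

#include <cstdint>
#include <cstdio>

static double us_to_ms(int64_t t_us) {
    return 1e-3 * t_us;              // double-precision conversion, matching the change above
}

int main() {
    const int64_t t_us = 123456789;  // hypothetical elapsed time, about 123.5 s
    std::printf("%8.2f ms\n", us_to_ms(t_us));
    return 0;
}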