Diffstat (limited to 'examples/server/server.cpp')
-rw-r--r-- | examples/server/server.cpp | 50
1 file changed, 21 insertions(+), 29 deletions(-)
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 9b962483..fe9a4255 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -200,6 +200,7 @@ struct llama_server_context
     llama_model *model = nullptr;
     llama_context *ctx = nullptr;
     gpt_params params;
+    int n_ctx;
 
     grammar_parser::parse_state parsed_grammar;
     llama_grammar *grammar = nullptr;
@@ -239,7 +240,7 @@ struct llama_server_context
         num_prompt_tokens = 0;
         num_tokens_predicted = 0;
         generated_text = "";
-        generated_text.reserve(params.n_ctx);
+        generated_text.reserve(n_ctx);
         generated_token_probs.clear();
         truncated = false;
         stopped_eos = false;
@@ -265,8 +266,8 @@ struct llama_server_context
             LOG_ERROR("unable to load model", {{"model", params_.model}});
             return false;
         }
-
-        last_n_tokens.resize(params.n_ctx);
+        n_ctx = llama_n_ctx(ctx);
+        last_n_tokens.resize(n_ctx);
         std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
         return true;
     }
@@ -351,19 +352,19 @@ struct llama_server_context
         {
             params.n_keep = (int)num_prompt_tokens;
         }
-        params.n_keep = std::min(params.n_ctx - 4, params.n_keep);
+        params.n_keep = std::min(n_ctx - 4, params.n_keep);
 
         // if input prompt is too big, truncate like normal
-        if (num_prompt_tokens >= (size_t)params.n_ctx)
+        if (num_prompt_tokens >= (size_t)n_ctx)
         {
-            const int n_left = (params.n_ctx - params.n_keep) / 2;
+            const int n_left = (n_ctx - params.n_keep) / 2;
             std::vector<llama_token> new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + params.n_keep);
             const int erased_blocks = (num_prompt_tokens - params.n_keep - n_left - 1) / n_left;
             new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + params.n_keep + erased_blocks * n_left, prompt_tokens.end());
-            std::copy(prompt_tokens.end() - params.n_ctx, prompt_tokens.end(), last_n_tokens.begin());
+            std::copy(prompt_tokens.end() - n_ctx, prompt_tokens.end(), last_n_tokens.begin());
 
             LOG_VERBOSE("input truncated", {
-                                               {"n_ctx", params.n_ctx},
+                                               {"n_ctx", n_ctx},
                                                {"n_keep", params.n_keep},
                                                {"n_left", n_left},
                                                {"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())},
@@ -413,7 +414,7 @@ struct llama_server_context
         completion_token_output result;
         result.tok = -1;
 
-        if (embd.size() >= (size_t)params.n_ctx)
+        if (embd.size() >= (size_t)n_ctx)
         {
             // Shift context
 
@@ -433,7 +434,7 @@ struct llama_server_context
 
             truncated = true;
             LOG_VERBOSE("input truncated", {
-                                               {"n_ctx", params.n_ctx},
+                                               {"n_ctx", n_ctx},
                                                {"n_keep", params.n_keep},
                                                {"n_left", n_left},
                                            });
@@ -447,12 +448,11 @@ struct llama_server_context
                 n_eval = params.n_batch;
             }
 
-            if (llama_decode(ctx, llama_batch_get_one(&embd[n_past], n_eval, n_past, 0), params.n_threads))
+            if (llama_decode(ctx, llama_batch_get_one(&embd[n_past], n_eval, n_past, 0)))
             {
                 LOG_ERROR("failed to eval", {
                                                 {"n_eval", n_eval},
                                                 {"n_past", n_past},
-                                                {"n_threads", params.n_threads},
                                                 {"embd", tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend())},
                                             });
                 has_next_token = false;
@@ -470,11 +470,11 @@ struct llama_server_context
 
             // out of user input, sample next token
             const float temp = params.temp;
-            const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
+            const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(model) : params.top_k;
             const float top_p = params.top_p;
             const float tfs_z = params.tfs_z;
             const float typical_p = params.typical_p;
-            const int32_t repeat_last_n = params.repeat_last_n < 0 ? params.n_ctx : params.repeat_last_n;
+            const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
             const float repeat_penalty = params.repeat_penalty;
             const float alpha_presence = params.presence_penalty;
             const float alpha_frequency = params.frequency_penalty;
@@ -486,7 +486,7 @@ struct llama_server_context
 
             {
                 auto *logits = llama_get_logits(ctx);
-                auto n_vocab = llama_n_vocab(ctx);
+                auto n_vocab = llama_n_vocab(model);
 
                 // Apply params.logit_bias map
                 for (const auto &it : params.logit_bias)
@@ -505,7 +505,7 @@ struct llama_server_context
 
                 // Apply penalties
                 float nl_logit = logits[llama_token_nl(ctx)];
-                auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), params.n_ctx);
+                auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
                 llama_sample_repetition_penalty(ctx, &candidates_p,
                                                 last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
                                                 last_n_repeat, repeat_penalty);
@@ -690,7 +690,7 @@ struct llama_server_context
 
     std::vector<float> getEmbedding()
     {
-        static const int n_embd = llama_n_embd(ctx);
+        static const int n_embd = llama_n_embd(model);
         if (!params.embedding)
         {
             LOG_WARNING("embedding disabled", {
@@ -734,7 +734,6 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
    printf("  -ts SPLIT --tensor-split SPLIT\n");
    printf("                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
    printf("  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n");
-    printf("  -lv, --low-vram       don't allocate VRAM scratch buffer\n");
    printf("  -nommq, --no-mul-mat-q\n");
    printf("                        use cuBLAS instead of custom mul_mat_q CUDA kernels.\n");
    printf("                        Not recommended since this is both slower and uses more VRAM.\n");
@@ -920,14 +919,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
            LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n", {});
#endif // GGML_USE_CUBLAS
        }
-        else if (arg == "--low-vram" || arg == "-lv")
-        {
-#ifdef GGML_USE_CUBLAS
-            params.low_vram = true;
-#else
-            LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n", {});
-#endif // GGML_USE_CUBLAS
-        }
        else if (arg == "--no-mul-mat-q" || arg == "-nommq")
        {
#ifdef GGML_USE_CUBLAS
@@ -1031,7 +1022,7 @@ static json format_generation_settings(llama_server_context &llama)
                            eos_bias->second < 0.0f && std::isinf(eos_bias->second);
 
    return json{
-        {"n_ctx", llama.params.n_ctx},
+        {"n_ctx", llama.n_ctx},
        {"model", llama.params.model_alias},
        {"seed", llama.params.seed},
        {"temp", llama.params.temp},
@@ -1191,7 +1182,7 @@ static void parse_options_completion(const json &body, llama_server_context &lla
    const auto &logit_bias = body.find("logit_bias");
    if (logit_bias != body.end() && logit_bias->is_array())
    {
-        const int n_vocab = llama_n_vocab(llama.ctx);
+        const int n_vocab = llama_n_vocab(llama.model);
        for (const auto &el : *logit_bias)
        {
            if (el.is_array() && el.size() == 2 && el[0].is_number_integer())
@@ -1324,6 +1315,7 @@ int main(int argc, char **argv)
                            {"commit", BUILD_COMMIT}});
    LOG_INFO("system info", {
                                {"n_threads", params.n_threads},
+                                {"n_threads_batch", params.n_threads_batch},
                                {"total_threads", std::thread::hardware_concurrency()},
                                {"system_info", llama_print_system_info()},
                            });
@@ -1387,7 +1379,7 @@ int main(int argc, char **argv)
            if (llama.params.n_beams) {
                // Fill llama.generated_token_probs vector with final beam.
                llama_beam_search(llama.ctx, beam_search_callback, &llama, llama.params.n_beams,
-                                  llama.n_past, llama.n_remain, llama.params.n_threads);
+                                  llama.n_past, llama.n_remain);
                // Translate llama.generated_token_probs to llama.generated_text.
                append_to_generated_text_from_generated_token_probs(llama);
            } else {
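
Taken together, the patch tracks two llama.cpp API changes: per-run sizes are now read from the loaded objects (llama_n_ctx(ctx), llama_n_vocab(model), llama_n_embd(model)) instead of gpt_params, and llama_decode / llama_beam_search no longer take a thread count, since threading is configured on the context (hence the new n_threads_batch field in the system-info log). A minimal sketch of the resulting call pattern, assuming a model and ctx that were already created by the server's existing load path; evaluate_prompt and its parameters are illustrative names, not part of the patch:

// Sketch of the post-patch API usage (hypothetical helper, not in server.cpp).
#include "llama.h"

#include <algorithm>
#include <cstdio>
#include <vector>

static bool evaluate_prompt(llama_model * model, llama_context * ctx,
                            std::vector<llama_token> & prompt_tokens, int n_batch) {
    // Sizes are queried once from the loaded objects instead of gpt_params:
    // the context size belongs to the context, vocab/embedding sizes to the model.
    const int n_ctx   = llama_n_ctx(ctx);
    const int n_vocab = llama_n_vocab(model);
    const int n_embd  = llama_n_embd(model);
    printf("n_ctx = %d, n_vocab = %d, n_embd = %d\n", n_ctx, n_vocab, n_embd);

    if ((int) prompt_tokens.size() >= n_ctx) {
        return false; // caller is expected to truncate, as loadPrompt() does above
    }

    // llama_decode no longer takes an n_threads argument; decode in n_batch chunks.
    for (int n_past = 0; n_past < (int) prompt_tokens.size();) {
        const int n_eval = std::min(n_batch, (int) prompt_tokens.size() - n_past);
        if (llama_decode(ctx, llama_batch_get_one(&prompt_tokens[n_past], n_eval, n_past, 0))) {
            return false; // eval failed
        }
        n_past += n_eval;
    }
    return true;
}

This is the same shape nextToken() uses in the diff above: tokens are fed in batches of at most n_batch via llama_batch_get_one, and a non-zero return from llama_decode is treated as a hard eval failure.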