Diffstat (limited to 'examples/server')
-rw-r--r-- | examples/server/README.md  |  7
-rw-r--r-- | examples/server/server.cpp | 33
2 files changed, 10 insertions, 30 deletions
diff --git a/examples/server/README.md b/examples/server/README.md
index 1559dd3f..4d97db2e 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -5,7 +5,7 @@ This example demonstrates a simple HTTP API server and a simple web front end to
 Command line options:

 - `--threads N`, `-t N`: Set the number of threads to use during computation.
-- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
+- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
 - `-m ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
 - `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096.
 - `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
@@ -48,15 +48,14 @@ To get started right away, run the following command, making sure to use the cor

 ### Unix-based systems (Linux, macOS, etc.):

 ```bash
-./server -m models/7B/ggml-model.bin -c 2048
+./server -m models/7B/ggml-model.gguf -c 2048
 ```

 ### Windows:

 ```powershell
-server.exe -m models\7B\ggml-model.bin -c 2048
+server.exe -m models\7B\ggml-model.gguf -c 2048
 ```
-
 The above command will start a server that by default listens on `127.0.0.1:8080`. You can consume the endpoints with Postman or NodeJS with axios library. You can visit the web front end at the same url.
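As the README notes, the endpoints can be consumed with Postman or NodeJS with axios; a plain curl request works as well. The sketch below is illustrative only: the `/completion` path and the `prompt`/`n_predict` JSON field names are assumptions about this server's HTTP API and do not appear in the hunk above.

```bash
# Hypothetical request against a server started with the command above
# (default listen address 127.0.0.1:8080); path and field names are assumed.
curl --request POST \
     --url http://127.0.0.1:8080/completion \
     --header "Content-Type: application/json" \
     --data '{"prompt": "Building a website can be done in 10 simple steps:", "n_predict": 128}'
```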
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 99660455..a04f1910 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -279,7 +279,7 @@ struct llama_server_context
         grammar_parser::print_grammar(stderr, parsed_grammar);

         {
-            auto it = params.logit_bias.find(llama_token_eos());
+            auto it = params.logit_bias.find(llama_token_eos(ctx));
             if (it != params.logit_bias.end() && it->second == -INFINITY) {
                 LOG_WARNING("EOS token is disabled, which will cause most grammars to fail", {});
             }
@@ -402,7 +402,7 @@ struct llama_server_context
         if (params.n_predict == 0)
         {
             has_next_token = false;
-            result.tok = llama_token_eos();
+            result.tok = llama_token_eos(ctx);
             return result;
         }

@@ -442,7 +442,7 @@ struct llama_server_context
             llama_token_data_array candidates_p = {candidates.data(), candidates.size(), false};

             // Apply penalties
-            float nl_logit = logits[llama_token_nl()];
+            float nl_logit = logits[llama_token_nl(ctx)];
             auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), params.n_ctx);
             llama_sample_repetition_penalty(ctx, &candidates_p,
                 last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
@@ -452,7 +452,7 @@ struct llama_server_context
                 last_n_repeat, alpha_frequency, alpha_presence);
             if (!penalize_nl)
             {
-                logits[llama_token_nl()] = nl_logit;
+                logits[llama_token_nl(ctx)] = nl_logit;
             }

             if (grammar != nullptr) {
@@ -515,7 +515,7 @@ struct llama_server_context
         // decrement remaining sampling budget
        --n_remain;

-        if (!embd.empty() && embd.back() == llama_token_eos())
+        if (!embd.empty() && embd.back() == llama_token_eos(ctx))
         {
             // stopping_word = llama_token_to_str(ctx, embd.back());
             has_next_token = false;
@@ -652,8 +652,6 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     fprintf(stdout, "  -v, --verbose         verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
     fprintf(stdout, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
     fprintf(stdout, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
-    fprintf(stdout, "  -gqa N, --gqa N       grouped-query attention factor (TEMP!!! use 8 for LLaMAv2 70B) (default: %d)\n", params.n_gqa);
-    fprintf(stdout, "  -eps N, --rms-norm-eps N rms norm eps (TEMP!!! use 1e-5 for LLaMAv2) (default: %.1e)\n", params.rms_norm_eps);
     fprintf(stdout, "  --rope-freq-base N    RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
     fprintf(stdout, "  --rope-freq-scale N   RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
     fprintf(stdout, "  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
@@ -774,23 +772,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
             }
             params.n_ctx = std::stoi(argv[i]);
         }
-        else if (arg == "-gqa" || arg == "--gqa")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            params.n_gqa = std::stoi(argv[i]);
-        }
-        else if (arg == "-eps" || arg == "--rms-norm-eps") {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            params.rms_norm_eps = std::stof(argv[i]);
-        }
         else if (arg == "--rope-freq-base")
         {
             if (++i >= argc)
@@ -968,7 +949,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,

static json format_generation_settings(llama_server_context &llama)
{
-    const auto eos_bias = llama.params.logit_bias.find(llama_token_eos());
+    const auto eos_bias = llama.params.logit_bias.find(llama_token_eos(llama.ctx));
     const bool ignore_eos = eos_bias != llama.params.logit_bias.end() &&
                             eos_bias->second < 0.0f && std::isinf(eos_bias->second);

@@ -1103,7 +1084,7 @@ static void parse_options_completion(const json &body, llama_server_context &lla
     llama.params.logit_bias.clear();
     if (body.value("ignore_eos", false))
     {
-        llama.params.logit_bias[llama_token_eos()] = -INFINITY;
+        llama.params.logit_bias[llama_token_eos(llama.ctx)] = -INFINITY;
     }

     const auto &logit_bias = body.find("logit_bias");
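For context on the `parse_options_completion` hunk above: an `ignore_eos` field in the request body now biases the EOS token looked up via `llama_token_eos(llama.ctx)` to `-INFINITY`, and explicit biases can also be supplied through `logit_bias`. A hedged request sketch follows; the `/completion` path and the `prompt` field are assumptions, not shown in the diff.

```bash
# Sketch only: "ignore_eos": true exercises the code path changed above,
# i.e. logit_bias[llama_token_eos(ctx)] = -INFINITY, so generation is not
# ended early by an end-of-sequence token.
curl --request POST \
     --url http://127.0.0.1:8080/completion \
     --header "Content-Type: application/json" \
     --data '{"prompt": "Q: Name the planets in the solar system. A:", "n_predict": 64, "ignore_eos": true}'
```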