From b97bc3966e852adb626c90be64fd48282800f504 Mon Sep 17 00:00:00 2001
From: Pedro Cuenca
Date: Sun, 21 Apr 2024 13:50:41 +0200
Subject: llama : support Llama 3 HF conversion (#6745)

* Support Llama 3 conversion

The tokenizer is BPE.

* style

* Accept suggestion

Co-authored-by: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com>

* llama : add llama_token_is_eog()

ggml-ci

* llama : auto-detect more EOT tokens when missing in KV data

* convert : replacing EOS token is a hack

* llama : fix codegemma EOT token + add TODOs

* llama : fix model type string for 8B model

---------

Co-authored-by: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com>
Co-authored-by: Georgi Gerganov
---
 examples/batched.swift/Sources/main.swift                 |  2 +-
 examples/batched/batched.cpp                              |  4 ++--
 examples/beam-search/beam-search.cpp                      |  2 +-
 examples/infill/infill.cpp                                | 10 +++++-----
 examples/llama.android/app/src/main/cpp/llama-android.cpp |  2 +-
 examples/llama.swiftui/llama.cpp.swift/LibLlama.swift     |  2 +-
 examples/llava/llava-cli.cpp                              |  2 +-
 examples/lookahead/lookahead.cpp                          |  2 +-
 examples/lookup/lookup.cpp                                |  2 +-
 examples/main/main.cpp                                    |  8 ++++----
 examples/parallel/parallel.cpp                            |  2 +-
 examples/passkey/passkey.cpp                              |  4 ++--
 examples/server/server.cpp                                |  2 +-
 examples/server/utils.hpp                                 |  4 ----
 examples/simple/simple.cpp                                |  4 ++--
 examples/speculative/speculative.cpp                      |  2 +-
 16 files changed, 25 insertions(+), 29 deletions(-)

(limited to 'examples')

diff --git a/examples/batched.swift/Sources/main.swift b/examples/batched.swift/Sources/main.swift
index d75c503d..5764acb6 100644
--- a/examples/batched.swift/Sources/main.swift
+++ b/examples/batched.swift/Sources/main.swift
@@ -153,7 +153,7 @@ while n_cur <= n_len {
         // const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
 
         // is it an end of stream? -> mark the stream as finished
-        if new_token_id == llama_token_eos(model) || n_cur == n_len {
+        if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
             i_batch[i] = -1
             // print("")
             if n_parallel > 1 {
diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp
index 7aaf63ce..be30d20b 100644
--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@@ -191,8 +191,8 @@ int main(int argc, char ** argv) {
 
             //const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
 
-            // is it an end of stream? -> mark the stream as finished
-            if (new_token_id == llama_token_eos(model) || n_cur == n_len) {
+            // is it an end of generation? -> mark the stream as finished
+            if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
                 i_batch[i] = -1;
                 LOG_TEE("\n");
                 if (n_parallel > 1) {
diff --git a/examples/beam-search/beam-search.cpp b/examples/beam-search/beam-search.cpp
index 866c6d7a..3d34378a 100644
--- a/examples/beam-search/beam-search.cpp
+++ b/examples/beam-search/beam-search.cpp
@@ -47,7 +47,7 @@ struct beam_search_callback_data {
 // In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same.
 // For example, eob can be flagged due to maximum token length, stop words, etc.
 static bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, size_t n_tokens) {
-    return n_tokens && tokens[n_tokens-1] == llama_token_eos(llama_get_model(callback_data.ctx));
+    return n_tokens && llama_token_is_eog(llama_get_model(callback_data.ctx), tokens[n_tokens-1]);
 }
 
 // Function matching type llama_beam_search_callback_fn_t.
diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp
index c69dcd06..afac145f 100644
--- a/examples/infill/infill.cpp
+++ b/examples/infill/infill.cpp
@@ -586,7 +586,7 @@ int main(int argc, char ** argv) {
 
         // deal with eot token in infill mode
         if ((llama_sampling_last(ctx_sampling) == llama_token_eot(model) || is_interacting) && params.interactive){
-            if(is_interacting && !params.interactive_first) {
+            if (is_interacting && !params.interactive_first) {
                 // print an eot token
                 printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
             }
@@ -651,8 +651,8 @@ int main(int argc, char ** argv) {
                 //            LOG_TEE("took new input\n");
                 is_interacting = false;
             }
-            // deal with end of text token in interactive mode
-            else if (llama_sampling_last(ctx_sampling) == llama_token_eos(model)) {
+            // deal with end of generation tokens in interactive mode
+            else if (llama_token_is_eog(model, llama_sampling_last(ctx_sampling))) {
                 LOG("found EOS token\n");
 
                 if (params.interactive) {
@@ -731,8 +731,8 @@ int main(int argc, char ** argv) {
             }
         }
 
-        // end of text token
-        if (!embd.empty() && embd.back() == llama_token_eos(model) && !params.interactive) {
+        // end of generation
+        if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !params.interactive) {
             break;
         }
diff --git a/examples/llama.android/app/src/main/cpp/llama-android.cpp b/examples/llama.android/app/src/main/cpp/llama-android.cpp
index ce8ab3b7..4af9de30 100644
--- a/examples/llama.android/app/src/main/cpp/llama-android.cpp
+++ b/examples/llama.android/app/src/main/cpp/llama-android.cpp
@@ -408,7 +408,7 @@ Java_com_example_llama_Llm_completion_1loop(
     const auto new_token_id = llama_sample_token_greedy(context, &candidates_p);
 
     const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value);
-    if (new_token_id == llama_token_eos(model) || n_cur == n_len) {
+    if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
         return env->NewStringUTF("");
     }
diff --git a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
index c249291a..70c43a38 100644
--- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
+++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
@@ -158,7 +158,7 @@ actor LlamaContext {
             new_token_id = llama_sample_token_greedy(context, &candidates_p)
         }
 
-        if new_token_id == llama_token_eos(model) || n_cur == n_len {
+        if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
             print("\n")
             let new_token_str = String(cString: temporary_invalid_cchars + [0])
             temporary_invalid_cchars.removeAll()
diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp
index 75948806..50dac4ca 100644
--- a/examples/llava/llava-cli.cpp
+++ b/examples/llava/llava-cli.cpp
@@ -45,7 +45,7 @@ static const char * sample(struct llama_sampling_context * ctx_sampling,
     const llama_token id = llama_sampling_sample(ctx_sampling, ctx_llama, NULL);
     llama_sampling_accept(ctx_sampling, ctx_llama, id, true);
     static std::string ret;
-    if (id == llama_token_eos(llama_get_model(ctx_llama))) {
+    if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
         ret = "</s>";
     } else {
         ret = llama_token_to_piece(ctx_llama, id);
diff --git a/examples/lookahead/lookahead.cpp b/examples/lookahead/lookahead.cpp
index 5af6a8ab..9c3540b2 100644
--- a/examples/lookahead/lookahead.cpp
+++ b/examples/lookahead/lookahead.cpp
@@ -299,7 +299,7 @@ int main(int argc, char ** argv) {
             }
 
             fflush(stdout);
 
-            if (id == llama_token_eos(model)) {
+            if (llama_token_is_eog(model, id)) {
                 has_eos = true;
             }
diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp
index 65ed408a..9526e898 100644
--- a/examples/lookup/lookup.cpp
+++ b/examples/lookup/lookup.cpp
@@ -141,7 +141,7 @@ int main(int argc, char ** argv){
             printf("%s", token_str.c_str());
         }
 
-        if (id == llama_token_eos(model)) {
+        if (llama_token_is_eog(model, id)) {
             has_eos = true;
         }
 
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 249fc2bb..1180734b 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -795,8 +795,8 @@ int main(int argc, char ** argv) {
                 }
             }
 
-            // deal with end of text token in interactive mode
-            if (llama_sampling_last(ctx_sampling) == llama_token_eos(model)) {
+            // deal with end of generation tokens in interactive mode
+            if (llama_token_is_eog(model, llama_sampling_last(ctx_sampling))) {
                 LOG("found EOS token\n");
 
                 if (params.interactive) {
@@ -920,8 +920,8 @@ int main(int argc, char ** argv) {
             }
         }
 
-        // end of text token
-        if (!embd.empty() && embd.back() == llama_token_eos(model) && !(params.instruct || params.interactive || params.chatml)) {
+        // end of generation
+        if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !(params.instruct || params.interactive || params.chatml)) {
             LOG_TEE(" [end of text]\n");
             break;
         }
diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp
index f66c9101..7c5595d6 100644
--- a/examples/parallel/parallel.cpp
+++ b/examples/parallel/parallel.cpp
@@ -359,7 +359,7 @@ int main(int argc, char ** argv) {
            //        client.id, client.seq_id, id, client.n_decoded, client.i_batch, token_str.c_str());
 
             if (client.n_decoded > 2 &&
-                (id == llama_token_eos(model) ||
+                (llama_token_is_eog(model, id) ||
                  (params.n_predict > 0 && client.n_decoded + client.n_prompt >= params.n_predict) ||
                  client.response.find("User:") != std::string::npos ||
                  client.response.find('\n') != std::string::npos)) {
diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp
index 2cbc9e1f..f2ef9ca1 100644
--- a/examples/passkey/passkey.cpp
+++ b/examples/passkey/passkey.cpp
@@ -252,8 +252,8 @@ int main(int argc, char ** argv) {
             // sample the most likely token
             const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
 
-            // is it an end of stream?
-            if (new_token_id == llama_token_eos(model) || n_cur == n_len) {
+            // is it an end of generation?
+            if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
                 LOG_TEE("\n");
 
                 break;
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 634e653a..25bc2963 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1201,7 +1201,7 @@ struct server_context {
             });
         }
 
-        if (result.tok == llama_token_eos(model)) {
+        if (llama_token_is_eog(model, result.tok)) {
             slot.stopped_eos    = true;
             slot.has_next_token = false;
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index a8d43ac6..1a221250 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -381,10 +381,6 @@ static json oaicompat_completion_params_parse(
     } else {
         llama_params["stop"] = json_value(body, "stop", json::array());
     }
-    // Some chat templates don't use EOS token to stop generation
-    // We must add their end sequences to list of stop words
-    llama_params["stop"].push_back("<|im_end|>"); // chatml
-    llama_params["stop"].push_back("<end_of_turn>"); // gemma
 
     // Handle "response_format" field
     if (body.contains("response_format")) {
diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp
index 39e2d8ea..b0f8e0fd 100644
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@@ -133,8 +133,8 @@ int main(int argc, char ** argv) {
             // sample the most likely token
             const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
 
-            // is it an end of stream?
-            if (new_token_id == llama_token_eos(model) || n_cur == n_len) {
+            // is it an end of generation?
+            if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
                 LOG_TEE("\n");
 
                 break;
diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp
index 6a7367b0..12e46fbc 100644
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -360,7 +360,7 @@ int main(int argc, char ** argv) {
             }
         }
 
-        if (token_id == llama_token_eos(model_tgt)) {
+        if (llama_token_is_eog(model_tgt, token_id)) {
             has_eos = true;
         }
         ++n_predict;
--
cgit v1.2.3
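Note on the pattern: every hunk above applies the same substitution. Instead of comparing a sampled token against the single llama_token_eos() id, the caller asks the model whether the token is any end-of-generation token (EOS, EOT, and similar) via the llama_token_is_eog() API introduced by this commit; that is what lets templates such as ChatML or Gemma stop without the per-template stop-word hacks removed from examples/server/utils.hpp. A minimal sketch of how downstream code might adopt the new call is shown below. It assumes the llama.cpp C API at this commit (llama_token_is_eog, llama_token), and the should_stop helper name is purely illustrative, not part of the patch.

    #include "llama.h"

    // Sketch only: decide whether a decode loop for one stream should stop,
    // either because the model emitted an end-of-generation token (EOS, EOT, ...)
    // or because the token budget n_len is exhausted.
    static bool should_stop(const struct llama_model * model, llama_token new_token_id, int n_cur, int n_len) {
        return llama_token_is_eog(model, new_token_id) || n_cur >= n_len;
    }

Callers would invoke it right after sampling, e.g. if (should_stop(model, new_token_id, n_cur, n_len)) break; which mirrors the checks added throughout the examples.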