From 154e0d75fccf1784fe9ff6fd76a630b66563da3d Mon Sep 17 00:00:00 2001
From: Kawrakow <48489457+ikawrakow@users.noreply.github.com>
Date: Sat, 27 Jul 2024 07:55:01 +0200
Subject: Merge mainline llama.cpp (#3)

* Merging mainline - WIP

* Merging mainline - WIP

AVX2 and CUDA appear to work. CUDA performance seems slightly (~1-2%)
lower, as is so often the case with llama.cpp/ggml after some
"improvements" have been made.

* Merging mainline - fix Metal

* Remove check

---------

Co-authored-by: Iwan Kawrakow
---
 examples/batched/batched.cpp | 36 ++++++++++++++++++++++++++++--------
 1 file changed, 28 insertions(+), 8 deletions(-)

diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp
index 62d9b144..53fbfb0a 100644
--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@@ -31,7 +31,7 @@ int main(int argc, char ** argv) {
     int n_parallel = params.n_parallel;
 
     // total length of the sequences including the prompt
-    int n_predict = 32;
+    int n_predict = params.n_predict;
 
     // init LLM
 
@@ -93,14 +93,34 @@ int main(int argc, char ** argv) {
 
     // create a llama_batch
     // we use this object to submit token data for decoding
-    llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t)n_parallel), 0, 1);
+    llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t) n_parallel), 0, n_parallel);
+
+    std::vector<llama_seq_id> seq_ids(n_parallel, 0);
+    for (int32_t i = 0; i < n_parallel; ++i) {
+        seq_ids[i] = i;
+    }
 
     // evaluate the initial prompt
     for (size_t i = 0; i < tokens_list.size(); ++i) {
-        llama_batch_add(batch, tokens_list[i], i, { 0 }, false);
+        llama_batch_add(batch, tokens_list[i], i, seq_ids, false);
     }
     GGML_ASSERT(batch.n_tokens == (int) tokens_list.size());
 
+    if (llama_model_has_encoder(model)) {
+        if (llama_encode(ctx, batch)) {
+            LOG_TEE("%s : failed to eval\n", __func__);
+            return 1;
+        }
+
+        llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
+        if (decoder_start_token_id == -1) {
+            decoder_start_token_id = llama_token_bos(model);
+        }
+
+        llama_batch_clear(batch);
+        llama_batch_add(batch, decoder_start_token_id, 0, seq_ids, false);
+    }
+
     // llama_decode will output logits only for the last token of the prompt
     batch.logits[batch.n_tokens - 1] = true;
 
@@ -109,11 +129,11 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
-    // assign the system KV cache to all parallel sequences
-    // this way, the parallel sequences will "reuse" the prompt tokens without having to copy them
-    for (int32_t i = 1; i < n_parallel; ++i) {
-        llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
-    }
+    //// assign the system KV cache to all parallel sequences
+    //// this way, the parallel sequences will "reuse" the prompt tokens without having to copy them
+    //for (int32_t i = 1; i < n_parallel; ++i) {
+    //    llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
+    //}
 
     if (n_parallel > 1) {
         LOG_TEE("\n\n%s: generating %d sequences ...\n", __func__, n_parallel);
-- 
cgit v1.2.3
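
Editorial note on the batched.cpp changes above (not part of the patch): the key
behavioral change is that every prompt token is now added to all n_parallel
sequences via seq_ids, so the commented-out llama_kv_cache_seq_cp() broadcast of
sequence 0 is no longer needed; encoder-decoder models are additionally handled
by running llama_encode() on the prompt and reseeding the batch with the decoder
start token. Below is a minimal sketch of that seeding pattern, assuming the
llama.cpp common helpers used in the diff (llama_batch_add, llama_batch_clear);
the helper name seed_prompt_all_sequences is illustrative and not from the patch.

    #include <vector>
    #include "llama.h"
    #include "common.h"

    // Add every prompt token to all n_parallel sequences in one pass, so the
    // prompt's KV-cache entries are shared without a later llama_kv_cache_seq_cp().
    // (Hypothetical helper for illustration; the patch does this inline in main().)
    static void seed_prompt_all_sequences(llama_batch & batch,
                                          const std::vector<llama_token> & prompt,
                                          int32_t n_parallel) {
        // one sequence id per parallel completion: 0, 1, ..., n_parallel - 1
        std::vector<llama_seq_id> seq_ids(n_parallel);
        for (int32_t i = 0; i < n_parallel; ++i) {
            seq_ids[i] = i;
        }

        // each prompt token is attached to every sequence at position i
        for (size_t i = 0; i < prompt.size(); ++i) {
            // logits = false: logits are only needed for the last prompt token
            llama_batch_add(batch, prompt[i], (llama_pos) i, seq_ids, false);
        }

        // request logits for the last prompt token only
        batch.logits[batch.n_tokens - 1] = true;
    }

For this to work, the batch must have been created with
llama_batch_init(..., /*n_seq_max=*/ n_parallel), as in the patched call above,
so that each token slot can carry n_parallel sequence ids.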