diff options
author | Kawrakow <48489457+ikawrakow@users.noreply.github.com> | 2024-07-27 07:55:01 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-07-27 07:55:01 +0200 |
commit | 154e0d75fccf1784fe9ff6fd76a630b66563da3d (patch) | |
tree | 81ce6dbb5b1900c1aa78a879f0593c694cab9d27 /examples/batched/batched.cpp | |
parent | 0684c3e9c70d49323b4fc517128cbe222cab7f96 (diff) |
Merge mainline llama.cpp (#3)
* Merging mainline - WIP
* Merging mainline - WIP
AVX2 and CUDA appear to work.
CUDA performance seems slightly (~1-2%) lower as it is so often
the case with llama.cpp/ggml after some "improvements" have been made.
* Merging mainline - fix Metal
* Remove check
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Diffstat (limited to 'examples/batched/batched.cpp')
-rw-r--r-- | examples/batched/batched.cpp | 36 |
1 files changed, 28 insertions, 8 deletions
diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp index 62d9b144..53fbfb0a 100644 --- a/examples/batched/batched.cpp +++ b/examples/batched/batched.cpp @@ -31,7 +31,7 @@ int main(int argc, char ** argv) { int n_parallel = params.n_parallel; // total length of the sequences including the prompt - int n_predict = 32; + int n_predict = params.n_predict; // init LLM @@ -93,14 +93,34 @@ int main(int argc, char ** argv) { // create a llama_batch // we use this object to submit token data for decoding - llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t)n_parallel), 0, 1); + llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t) n_parallel), 0, n_parallel); + + std::vector<llama_seq_id> seq_ids(n_parallel, 0); + for (int32_t i = 0; i < n_parallel; ++i) { + seq_ids[i] = i; + } // evaluate the initial prompt for (size_t i = 0; i < tokens_list.size(); ++i) { - llama_batch_add(batch, tokens_list[i], i, { 0 }, false); + llama_batch_add(batch, tokens_list[i], i, seq_ids, false); } GGML_ASSERT(batch.n_tokens == (int) tokens_list.size()); + if (llama_model_has_encoder(model)) { + if (llama_encode(ctx, batch)) { + LOG_TEE("%s : failed to eval\n", __func__); + return 1; + } + + llama_token decoder_start_token_id = llama_model_decoder_start_token(model); + if (decoder_start_token_id == -1) { + decoder_start_token_id = llama_token_bos(model); + } + + llama_batch_clear(batch); + llama_batch_add(batch, decoder_start_token_id, 0, seq_ids, false); + } + // llama_decode will output logits only for the last token of the prompt batch.logits[batch.n_tokens - 1] = true; @@ -109,11 +129,11 @@ int main(int argc, char ** argv) { return 1; } - // assign the system KV cache to all parallel sequences - // this way, the parallel sequences will "reuse" the prompt tokens without having to copy them - for (int32_t i = 1; i < n_parallel; ++i) { - llama_kv_cache_seq_cp(ctx, 0, i, -1, -1); - } + //// assign the system KV cache to all parallel sequences + //// this way, the parallel sequences will "reuse" the prompt tokens without having to copy them + //for (int32_t i = 1; i < n_parallel; ++i) { + // llama_kv_cache_seq_cp(ctx, 0, i, -1, -1); + //} if (n_parallel > 1) { LOG_TEE("\n\n%s: generating %d sequences ...\n", __func__, n_parallel); |