author | slaren <slarengh@gmail.com> | 2024-03-09 19:55:54 +0100
---|---|---
committer | GitHub <noreply@github.com> | 2024-03-09 19:55:54 +0100
commit | d894f352bf433157232dc8dc54eacd50014e898e (patch) |
tree | 0ed89f222eb3f0e08a397c953d27e7e6846156ce /llama.cpp |
parent | 098dbaab449f5309a54871ba7e5acef72ae696de (diff) |
perplexity : support using multiple sequences to allow larger batch sizes (#5946)
* perplexity : support using multiple sequences to allow larger batch sizes
ggml-ci
* set cparams.n_parallel to the number of sequences
* print tested n_ctx, add assert
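The core of the llama.cpp change below coalesces contiguous runs of tokens whose logits were requested into a single backend read, instead of issuing one ggml_backend_tensor_get_async call per token. A minimal standalone sketch of that grouping logic, not the actual llama.cpp code: copy_rows is a hypothetical stand-in for the backend call, simplified to element offsets rather than byte offsets.

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Hypothetical stand-in for ggml_backend_tensor_get_async: copy rows
    // [first, last) of an (n_rows x n_cols) matrix from src to dst.
    static void copy_rows(const std::vector<float> & src, std::vector<float> & dst,
                          int first, int last, int n_cols) {
        std::printf("copy rows [%d, %d)\n", first, last);
        for (int i = first*n_cols; i < last*n_cols; i++) {
            dst[i] = src[i];
        }
    }

    // Coalesce contiguous requested rows (flags[i] != 0) into ranges so that
    // each range costs one copy call -- the same grouping as in the diff below.
    static void extract_requested(const std::vector<int8_t> & flags,
                                  const std::vector<float> & src,
                                  std::vector<float> & dst, int n_cols) {
        const int n = (int) flags.size();
        int i_first = -1;
        for (int i = 0; i < n; i++) {
            if (flags[i] && i_first == -1) {
                i_first = i;                   // a requested range starts here
            }
            if (flags[i] == 0 || i == n - 1) { // range ended (or input ended)
                if (i_first != -1) {
                    const int i_last = flags[i] == 0 ? i : i + 1; // range is [i_first, i_last)
                    copy_rows(src, dst, i_first, i_last, n_cols);
                    i_first = -1;
                }
            }
        }
    }

    int main() {
        const int n_tokens = 6, n_vocab = 4;
        std::vector<int8_t> flags = {0, 1, 1, 0, 1, 1};  // per-token logits requests
        std::vector<float>  src(n_tokens*n_vocab, 1.0f), dst(n_tokens*n_vocab, 0.0f);
        extract_requested(flags, src, dst, n_vocab);     // prints [1, 3) then [4, 6)
    }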
Diffstat (limited to 'llama.cpp')
-rw-r--r-- | llama.cpp | 22
1 file changed, 17 insertions(+), 5 deletions(-)
@@ -8925,17 +8925,29 @@ static int llama_decode_internal(

         if (batch.logits) {
             logits_out.resize(n_vocab * n_tokens);
+            int32_t i_first = -1;
             for (uint32_t i = 0; i < n_tokens; i++) {
-                if (batch.logits[i] == 0) {
-                    continue;
+                if (batch.logits[i] && i_first == -1) {
+                    i_first = (int32_t) i;
+                }
+                if (batch.logits[i] == 0 || i == n_tokens - 1) {
+                    if (i_first != -1) {
+                        int i_last = batch.logits[i] == 0 ? i : i + 1;
+                        // extract logits for the range [i_first, i_last)
+                        // group the requests to minimize the number of calls to the backend
+                        ggml_backend_tensor_get_async(backend_res, res,
+                                logits_out.data() + (n_vocab*i_first),
+                                (n_vocab*i_first)*sizeof(float),
+                                (i_last - i_first)*n_vocab*sizeof(float));
+                        i_first = -1;
+                    }
                 }
-                ggml_backend_tensor_get_async(backend_res, res, logits_out.data() + (n_vocab*i), (n_vocab*i)*sizeof(float), n_vocab*sizeof(float));
 #ifndef NDEBUG
-                logits_valid[i] = true;
+                logits_valid[i] = batch.logits[i] != 0;
 #endif
             }
         } else if (lctx.logits_all) {
-            logits_out.resize(n_vocab * n_tokens);
+            logits_out.resize(n_vocab*n_tokens);
             ggml_backend_tensor_get_async(backend_res, res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float));
 #ifndef NDEBUG
             std::fill(logits_valid.begin(), logits_valid.end(), true);
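On the caller side, the perplexity example (outside this diffstat) packs several sequences into one batch and requests logits for every token, which is what produces the long contiguous ranges the extraction loop above can coalesce. A rough sketch of such packing, assuming the llama_batch layout from llama.h at this commit; fill_batch is a hypothetical helper, not the actual perplexity.cpp code.

    #include "llama.h"

    // Hypothetical helper: place n_seq sequences of n_ctx tokens each into one
    // batch, assigning each sequence its own seq_id and requesting logits for
    // every token. Assumes batch came from llama_batch_init(n_seq*n_ctx, 0, n_seq).
    static void fill_batch(llama_batch & batch, const llama_token * tokens,
                           int n_seq, int n_ctx) {
        batch.n_tokens = 0;
        for (int s = 0; s < n_seq; s++) {
            for (int i = 0; i < n_ctx; i++) {
                const int idx = batch.n_tokens++;
                batch.token   [idx]    = tokens[s*n_ctx + i];
                batch.pos     [idx]    = i;
                batch.n_seq_id[idx]    = 1;
                batch.seq_id  [idx][0] = s;
                // requesting logits for every token yields one long contiguous
                // range per batch, so the decode path copies it in few calls
                batch.logits  [idx]    = 1;
            }
        }
    }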