diff options
Diffstat (limited to 'llama.cpp')
-rw-r--r-- | llama.cpp | 38 |
1 files changed, 29 insertions, 9 deletions
@@ -2177,7 +2177,7 @@ struct llama_context { std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers size_t output_size = 0; // capacity (of tokens positions) for the output buffers - int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch + int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch bool logits_all = false; @@ -10411,6 +10411,9 @@ static int llama_decode_internal( n_outputs_prev += lctx.n_outputs; } + // set to total number of outputs in the batch, for use in llama_get_logits_ith + lctx.n_outputs = n_outputs; + // wait for the computation to finish (automatically done when obtaining the model output) //llama_synchronize(&lctx); @@ -15944,23 +15947,31 @@ float * llama_get_logits(struct llama_context * ctx) { } float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) { + int32_t j = -1; llama_synchronize(ctx); try { if (ctx->logits == nullptr) { throw std::runtime_error("no logits"); } - if ((size_t) i >= ctx->output_ids.size()) { + + if (i < 0) { + j = ctx->n_outputs + i; + if (j < 0) { + throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs)); + } + } else if ((size_t) i >= ctx->output_ids.size()) { throw std::runtime_error(format("out of range [0, %lu)", ctx->output_ids.size())); + } else { + j = ctx->output_ids[i]; } - const int32_t j = ctx->output_ids[i]; if (j < 0) { throw std::runtime_error(format("batch.logits[%d] != true", i)); } - if ((size_t) j >= ctx->output_size) { + if (j >= ctx->n_outputs) { // This should not happen - throw std::runtime_error(format("corrupt output buffer (j=%d, output_size=%lu)", j, ctx->output_size)); + throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, ctx->n_outputs)); } return ctx->logits + j*ctx->model.hparams.n_vocab; @@ -15980,23 +15991,32 @@ float * llama_get_embeddings(struct llama_context * ctx) { } float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) { + int32_t j = -1; + llama_synchronize(ctx); try { if (ctx->embd == nullptr) { throw std::runtime_error("no embeddings"); } - if ((size_t) i >= ctx->output_ids.size()) { + + if (i < 0) { + j = ctx->n_outputs + i; + if (j < 0) { + throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs)); + } + } else if ((size_t) i >= ctx->output_ids.size()) { throw std::runtime_error(format("out of range [0, %lu)", ctx->output_ids.size())); + } else { + j = ctx->output_ids[i]; } - const int32_t j = ctx->output_ids[i]; if (j < 0) { throw std::runtime_error(format("batch.logits[%d] != true", i)); } - if ((size_t) j >= ctx->output_size) { + if (j >= ctx->n_outputs) { // This should not happen - throw std::runtime_error(format("corrupt output buffer (j=%d, output_size=%lu)", j, ctx->output_size)); + throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, ctx->n_outputs)); } return ctx->embd + j*ctx->model.hparams.n_embd; |