diff options
author | Georgi Gerganov <ggerganov@gmail.com> | 2024-03-04 22:31:20 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-03-04 22:31:20 +0200 |
commit | 29ae62d2ae163e2b68aa0ad3bf2ab4636de0c957 (patch) | |
tree | a65058dfddf1672f1d765e324dac9f66abf1a7c1 /llama.h | |
parent | e0843afe1b37890b631bc7d3d2da2ed36c862b91 (diff) |
llama : fix embeddings (#5796)
* llama : fix embeddings
ggml-ci
* llama : do not use KV cache for non-causal models
ggml-ci
* embeddings : fix llama_batch_init arg
* llama : add pooling switch
* llama : distinguish token vs sequence embeddings
ggml-ci
* llama : assert pooling tensor
* llama : simplify causal mask condition
ggml-ci
* llama : assert input batch with pooling enabled
* readme : update API changes list
Diffstat (limited to 'llama.h')
-rw-r--r-- | llama.h | 18 |
1 files changed, 12 insertions, 6 deletions
@@ -163,7 +163,7 @@ extern "C" {
     //  - embd   : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
     //  - pos    : the positions of the respective token in the sequence
     //  - seq_id : the sequence to which the respective token belongs
-    //  - logits : if zero, the logits for the respective token will not be output
+    //  - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output
     //
     typedef struct llama_batch {
         int32_t n_tokens;
@@ -173,7 +173,7 @@ extern "C" {
         llama_pos    *  pos;
         int32_t      *  n_seq_id;
         llama_seq_id ** seq_id;
-        int8_t       *  logits;
+        int8_t       *  logits; // TODO: rename this to "output"

         // NOTE: helpers for smooth API transition - can be deprecated in the future
         //       for future-proof code, use the above fields instead and ignore everything below
@@ -260,7 +260,7 @@ extern "C" {
         // Keep the booleans together to avoid misalignment during copy-by-value.
         bool logits_all;  // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
-        bool embedding;   // embedding mode only
+        bool embeddings;  // if true, extract embeddings (together with logits)
         bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU

         // Abort callback
@@ -655,14 +655,20 @@ extern "C" {
     // llama_get_logits(ctx) + i*n_vocab
     LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);

-    // Get the embeddings for the input
-    // shape: [n_embd] (1-dimensional)
+    // Get all output token embeddings
+    // shape: [n_tokens*n_embd] (1-dimensional)
     LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);

-    // Get the embeddings for the ith sequence
+    // Get the embeddings for the ith token
     // llama_get_embeddings(ctx) + i*n_embd
+    // shape: [n_embd] (1-dimensional)
     LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);

+    // Get the embeddings for a sequence id
+    // Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE
+    // shape: [n_embd] (1-dimensional)
+    LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);
+
     //
     // Vocab
     //