diff options
author | Douglas Hanley <thesecretaryofwar@gmail.com> | 2024-06-21 00:38:22 -0500 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-06-21 08:38:22 +0300 |
commit | 80ea089d771f0c2d97afa8bead80ded412f600d7 (patch) | |
tree | 25c04a967b5913ffdc00d1a851dcfbeb9ab37a37 /llama.h | |
parent | 0e64591e8290037db6412665a56354b789a0597e (diff) |
llama : allow pooled embeddings on any model (#7477)
* create append_pooling operation; allow specifying attention_type; add last-token pooling; update examples
* find result_norm/result_embd tensors properly; update output allocation logic
* only use embd output for pooling_type NONE
* get rid of old causal_attn accessor
* take out attention_type; add in llama_set_embeddings (see the usage sketch below)
* bypass logits when doing non-NONE pooling
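The net effect of the last two bullets is a runtime switch: a context can be flipped into embeddings output without being recreated. A minimal sketch of how that switch might be used, assuming the llama.cpp C API as of this commit; `ctx` and `batch` stand in for an already-created context and a tokenized prompt batch and are not taken from the PR itself:

```c
// Hypothetical toggle: flip an existing context into embeddings mode,
// decode once, read the pooled embedding, then return to logits output.
llama_set_embeddings(ctx, true);                 // new API: embeddings returned, logits bypassed
if (llama_decode(ctx, batch) != 0) {
    fprintf(stderr, "decode failed\n");
}
const float * embd = llama_get_embeddings_seq(ctx, 0);  // pooled result for sequence 0
                                                         // (requires pooling_type != NONE)
llama_set_embeddings(ctx, false);                // subsequent decodes produce logits again
```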
Diffstat (limited to 'llama.h')
-rw-r--r-- | llama.h | 6 |
1 file changed, 5 insertions(+), 1 deletion(-)
@@ -174,6 +174,7 @@ extern "C" {
         LLAMA_POOLING_TYPE_NONE = 0,
         LLAMA_POOLING_TYPE_MEAN = 1,
         LLAMA_POOLING_TYPE_CLS  = 2,
+        LLAMA_POOLING_TYPE_LAST = 3,
     };
 
     enum llama_split_mode {
@@ -293,7 +294,6 @@ extern "C" {
 
         enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
         enum llama_pooling_type      pooling_type;      // whether to pool (sum) embedding results by sequence id
-                                                        // (ignored if no pooling layer)
 
         // ref: https://github.com/ggerganov/llama.cpp/pull/2054
         float    rope_freq_base;   // RoPE base frequency, 0 = from model
@@ -786,6 +786,10 @@ extern "C" {
     // Get the number of threads used for prompt and batch processing (multiple token).
     LLAMA_API uint32_t llama_n_threads_batch(struct llama_context * ctx);
 
+    // Set whether the model is in embeddings mode or not
+    // If true, embeddings will be returned but logits will not
+    LLAMA_API void llama_set_embeddings(struct llama_context * ctx, bool embeddings);
+
     // Set whether to use causal attention or not
     // If set to true, the model will only attend to the past tokens
     LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);
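Taken together, the new `LLAMA_POOLING_TYPE_LAST` pooling type and `llama_set_embeddings()` make it possible to pull a pooled embedding out of an ordinary decoder-only model. The following is an illustrative end-to-end sketch, not code from the PR: the model path and prompt are placeholders, the token buffer size is a simplification, and the surrounding calls (`llama_load_model_from_file`, `llama_new_context_with_model`, `llama_tokenize`, `llama_batch_get_one`, `llama_get_embeddings_seq`) are assumed to match llama.h at the time of this commit.

```c
#include <stdio.h>
#include <string.h>
#include <stdbool.h>
#include "llama.h"

int main(void) {
    llama_backend_init();

    // hypothetical model path; after this PR any generative model should work for pooling
    struct llama_model_params mparams = llama_model_default_params();
    struct llama_model * model = llama_load_model_from_file("model.gguf", mparams);
    if (!model) { return 1; }

    struct llama_context_params cparams = llama_context_default_params();
    cparams.embeddings   = true;                    // start the context in embeddings mode
    cparams.pooling_type = LLAMA_POOLING_TYPE_LAST; // new pooling type added by this commit
    struct llama_context * ctx = llama_new_context_with_model(model, cparams);
    if (!ctx) { return 1; }

    // tokenize a placeholder prompt (fixed-size buffer is a simplification for the sketch)
    const char * text = "hello world";
    llama_token tokens[64];
    int n_tokens = llama_tokenize(model, text, (int) strlen(text), tokens, 64,
                                  /*add_special*/ true, /*parse_special*/ false);
    if (n_tokens < 0) { return 1; }

    // single-sequence batch starting at position 0
    struct llama_batch batch = llama_batch_get_one(tokens, n_tokens, 0, 0);
    if (llama_decode(ctx, batch) != 0) { return 1; }

    // pooled (last-token) embedding for sequence 0
    const float * embd   = llama_get_embeddings_seq(ctx, 0);
    const int     n_embd = llama_n_embd(model);
    printf("first component of %d-dim embedding: %f\n", n_embd, embd ? embd[0] : 0.0f);

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```

With `llama_set_embeddings(ctx, false)` the same context could afterwards be reused for normal generation, which is the point of exposing the switch at runtime rather than only through context params.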