From 80ea089d771f0c2d97afa8bead80ded412f600d7 Mon Sep 17 00:00:00 2001
From: Douglas Hanley
Date: Fri, 21 Jun 2024 00:38:22 -0500
Subject: llama : allow pooled embeddings on any model (#7477)

* create append_pooling operation; allow to specify attention_type; add last token pooling; update examples

* find result_norm/result_embd tensors properly; update output allocation logic

* only use embd output for pooling_type NONE

* get rid of old causal_attn accessor

* take out attention_type; add in llama_set_embeddings

* bypass logits when doing non-NONE pooling
---
 llama.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'llama.h')

diff --git a/llama.h b/llama.h
index da310ffa..05d8b092 100644
--- a/llama.h
+++ b/llama.h
@@ -174,6 +174,7 @@ extern "C" {
         LLAMA_POOLING_TYPE_NONE = 0,
         LLAMA_POOLING_TYPE_MEAN = 1,
         LLAMA_POOLING_TYPE_CLS  = 2,
+        LLAMA_POOLING_TYPE_LAST = 3,
     };
 
     enum llama_split_mode {
@@ -293,7 +294,6 @@ extern "C" {
 
         enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
         enum llama_pooling_type      pooling_type;      // whether to pool (sum) embedding results by sequence id
-                                                        // (ignored if no pooling layer)
 
         // ref: https://github.com/ggerganov/llama.cpp/pull/2054
         float rope_freq_base; // RoPE base frequency, 0 = from model
@@ -786,6 +786,10 @@ extern "C" {
     // Get the number of threads used for prompt and batch processing (multiple token).
     LLAMA_API uint32_t llama_n_threads_batch(struct llama_context * ctx);
 
+    // Set whether the model is in embeddings mode or not
+    // If true, embeddings will be returned but logits will not
+    LLAMA_API void llama_set_embeddings(struct llama_context * ctx, bool embeddings);
+
     // Set whether to use causal attention or not
     // If set to true, the model will only attend to the past tokens
     LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);
--
cgit v1.2.3
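
Taken together, the new LLAMA_POOLING_TYPE_LAST value and the llama_set_embeddings() setter let pooled embeddings be extracted from any model, including generative ones without a dedicated pooling layer. Below is a minimal sketch of how the two additions might be combined, based on the llama.h API as of this commit; the model path, prompt, and buffer sizes are placeholder assumptions, and error handling is omitted:

    #include <stdio.h>
    #include <string.h>

    #include "llama.h"

    int main(void) {
        llama_backend_init();

        // "model.gguf" is a placeholder path; with this change any model can
        // produce pooled embeddings, not just ones with a pooling layer
        struct llama_model_params mparams = llama_model_default_params();
        struct llama_model * model = llama_load_model_from_file("model.gguf", mparams);

        // ask for embedding output and the new last-token pooling
        struct llama_context_params cparams = llama_context_default_params();
        cparams.embeddings   = true;
        cparams.pooling_type = LLAMA_POOLING_TYPE_LAST;
        struct llama_context * ctx = llama_new_context_with_model(model, cparams);

        // the new setter can also flip the mode on a live context;
        // while true, embeddings are returned and logits are not
        llama_set_embeddings(ctx, true);

        // tokenize a toy prompt (the fixed-size buffer is a simplification)
        const char * prompt = "hello world";
        llama_token tokens[64];
        const int n_tokens = llama_tokenize(model, prompt, (int32_t) strlen(prompt),
                                            tokens, 64, /*add_special*/ true, /*parse_special*/ false);

        // single-sequence batch; mark every position as an output so the
        // whole sequence is available for pooling
        struct llama_batch batch = llama_batch_init(n_tokens, 0, 1);
        for (int i = 0; i < n_tokens; i++) {
            batch.token   [i]    = tokens[i];
            batch.pos     [i]    = i;
            batch.n_seq_id[i]    = 1;
            batch.seq_id  [i][0] = 0;
            batch.logits  [i]    = true;
        }
        batch.n_tokens = n_tokens;

        if (llama_decode(ctx, batch) == 0) {
            // one pooled vector per sequence: here, the last token's hidden state
            const float * embd = llama_get_embeddings_seq(ctx, 0);
            if (embd != NULL) {
                printf("embd[0] = %f (n_embd = %d)\n", embd[0], llama_n_embd(model));
            }
        }

        llama_batch_free(batch);
        llama_free(ctx);
        llama_free_model(model);
        llama_backend_free();
        return 0;
    }

The two knobs are complementary: pooling_type, fixed at context creation, picks how per-token states are reduced to one vector, while llama_set_embeddings() can toggle a live context between logits and embedding output (and, per the commit, logits are bypassed entirely for non-NONE pooling).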