From 80ea089d771f0c2d97afa8bead80ded412f600d7 Mon Sep 17 00:00:00 2001
From: Douglas Hanley <thesecretaryofwar@gmail.com>
Date: Fri, 21 Jun 2024 00:38:22 -0500
Subject: llama : allow pooled embeddings on any model (#7477)

* create append_pooling operation; allow specifying attention_type; add last token pooling; update examples

* find result_norm/result_embd tensors properly; update output allocation logic

* only use embd output for pooling_type NONE

* get rid of old causal_attn accessor

* take out attention_type; add in llama_set_embeddings

* bypass logits when doing non-NONE pooling
---
 common/common.cpp | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'common/common.cpp')

diff --git a/common/common.cpp b/common/common.cpp
index 9c23d001..64f160af 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -541,6 +541,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; }
         else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; }
         else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; }
+        else if (value == "last") { params.pooling_type = LLAMA_POOLING_TYPE_LAST; }
         else { invalid_param = true; }
         return true;
     }
@@ -1869,6 +1870,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
 
     options.push_back({ "backend" });
     options.push_back({ "*",           "       --rpc SERVERS",          "comma separated list of RPC servers" });
+
     if (llama_supports_mlock()) {
         options.push_back({ "*",           "       --mlock",                "force system to keep model in RAM rather than swapping or compressing" });
     }
-- 
cgit v1.2.3