From 1d0331c12a2f2a6296b471232bd4e66fbf06e6a1 Mon Sep 17 00:00:00 2001
From: Kawrakow <48489457+ikawrakow@users.noreply.github.com>
Date: Fri, 22 Mar 2024 19:47:14 +0100
Subject: quantize: options for output and token embedding tensors qtype (#6239)

* quantize: be able to specify the output tensor type

* quantize: be able to specify the token embedding tensor type

---------

Co-authored-by: Iwan Kawrakow
---
 llama.h | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

(limited to 'llama.h')

diff --git a/llama.h b/llama.h
index 7e8ac4b6..74f0e56d 100644
--- a/llama.h
+++ b/llama.h
@@ -275,13 +275,15 @@ extern "C" {
 
     // model quantization parameters
     typedef struct llama_model_quantize_params {
-        int32_t nthread;             // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
-        enum llama_ftype ftype;      // quantize to this llama_ftype
-        bool allow_requantize;       // allow quantizing non-f32/f16 tensors
-        bool quantize_output_tensor; // quantize output.weight
-        bool only_copy;              // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
-        bool pure;                   // quantize all tensors to the default type
-        void * imatrix;              // pointer to importance matrix data
+        int32_t nthread;                     // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
+        enum llama_ftype ftype;              // quantize to this llama_ftype
+        enum ggml_type output_tensor_type;   // output tensor type
+        enum ggml_type token_embedding_type; // token embeddings tensor type
+        bool allow_requantize;               // allow quantizing non-f32/f16 tensors
+        bool quantize_output_tensor;         // quantize output.weight
+        bool only_copy;                      // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
+        bool pure;                           // quantize all tensors to the default type
+        void * imatrix;                      // pointer to importance matrix data
     } llama_model_quantize_params;
 
     // grammar types
-- 
cgit v1.2.3
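
The two new enum ggml_type fields let a caller pin output.weight and the token
embeddings to a specific quantization type instead of the one implied by ftype.
A minimal caller sketch, assuming the rest of the llama.h API at this commit
(llama_model_quantize_default_params() and llama_model_quantize() as declared
there); the file names and the Q6_K/Q8_0 overrides below are illustrative
choices, not part of the patch:

    /* quantize_example.c -- hypothetical caller, not part of this commit */
    #include "llama.h"   /* also pulls in ggml.h for the GGML_TYPE_* enums */
    #include <stdio.h>

    int main(void) {
        llama_model_quantize_params params = llama_model_quantize_default_params();

        params.ftype                = LLAMA_FTYPE_MOSTLY_Q4_K_M; /* base type for most tensors */
        params.output_tensor_type   = GGML_TYPE_Q6_K;            /* override for output.weight */
        params.token_embedding_type = GGML_TYPE_Q8_0;            /* override for token embeddings */

        /* input/output paths are placeholders; returns 0 on success */
        if (llama_model_quantize("model-f16.gguf", "model-q4_k_m.gguf", &params) != 0) {
            fprintf(stderr, "quantization failed\n");
            return 1;
        }
        return 0;
    }

Assuming the default-params initializer leaves both new fields unset (the llama.h
hunk alone does not show it), omitting the two override lines should keep the
pre-patch behavior, with both tensors quantized according to ftype.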