author     Georgi Gerganov <ggerganov@gmail.com>  2024-01-17 18:54:56 +0200
committer  GitHub <noreply@github.com>            2024-01-17 18:54:56 +0200
commit     38566680cdfe982a495562332c25b9227de9cf8d (patch)
tree       3936732879d0a3146577745232feadb80e5917c9 /llama.cpp
parent     ba69bbc84ced580fe4fdb0713ca2d95634325b7a (diff)
ggml : add IQ2 to test-backend-ops + refactoring (#4990)
* ggml : add IQ2 to test-backend-ops + refactoring
  ggml-ci
* cuda : update supports_op for IQ2
  ggml-ci
* ci : enable LLAMA_CUBLAS=1 for CUDA nodes
  ggml-ci
* cuda : fix out-of-bounds-access in `mul_mat_vec_q`
  ggml-ci
* tests : avoid creating RNGs for each Q tensor
  ggml-ci
* tests : avoid creating RNGs for each tensor
  ggml-ci
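On the llama.cpp side, the refactoring removes the "initialize on first use, remember the type in a std::set, deinitialize at the end of the call" bookkeeping from the quantization loop and leaves teardown to llama_backend_free(), which now calls ggml_quantize_free(). The following is a self-contained sketch of that pattern change only; the qtype enum and the init_table/free_table/free_all_tables helpers are made-up stand-ins for the ggml calls, not llama.cpp code:

#include <cstdio>
#include <set>
#include <vector>

// Stand-ins for the ggml calls touched by this commit
// (ggml_init_iq2_quantization / ggml_deinit_iq2_quantization / ggml_quantize_free).
enum class qtype { iq2_xxs, iq2_xs, other };

static void init_table(qtype t)  { std::printf("init table %d\n", (int) t); }
static void free_table(qtype t)  { std::printf("free table %d\n", (int) t); }
static void free_all_tables()    { std::printf("free all tables\n"); }

// Before this commit: the quantization loop tracked which IQ2 types it had
// initialized in a std::set and paired every init with a deinit at the end.
static void quantize_old(const std::vector<qtype> & tensors) {
    std::set<qtype> used_iq2;
    for (qtype t : tensors) {
        if ((t == qtype::iq2_xxs || t == qtype::iq2_xs) && used_iq2.insert(t).second) {
            init_table(t); // init on first use of this type
        }
        // ... quantize the tensor ...
    }
    for (qtype t : used_iq2) {
        free_table(t);     // per-call teardown
    }
}

// After this commit: no bookkeeping in the loop; the lookup tables are
// released once, when the backend is torn down.
static void quantize_new(const std::vector<qtype> & tensors) {
    for (qtype t : tensors) {
        (void) t;
        // ... quantize the tensor; table setup is handled inside ggml ...
    }
}

int main() {
    quantize_old({qtype::iq2_xxs, qtype::other, qtype::iq2_xxs});
    quantize_new({qtype::iq2_xxs, qtype::other});
    free_all_tables(); // analogue of the new ggml_quantize_free() call
    return 0;
}

The upshot is that quantization-table lifetime is tied to the backend rather than to each individual quantize call.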
Diffstat (limited to 'llama.cpp')
-rw-r--r--  llama.cpp | 12
1 file changed, 1 insertion(+), 11 deletions(-)
diff --git a/llama.cpp b/llama.cpp
index 81829b13..d28382f7 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -8747,8 +8747,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     // placeholder for the meta data
     ::zeros(fout, meta_size);
 
-    std::set<ggml_type> used_iq2;
-
     for (int i = 0; i < ml.n_tensors; ++i) {
         struct ggml_tensor * tensor = ml.get_tensor_meta(i);
@@ -8801,11 +8799,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         } else {
             const size_t nelements = ggml_nelements(tensor);
 
-            if ((new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_XS) && used_iq2.find(new_type) == used_iq2.end()) {
-                ggml_init_iq2_quantization(new_type);
-                used_iq2.insert(new_type);
-            }
-
             const float * imatrix = nullptr;
             if (imatrix_data) {
                 auto it = imatrix_data->find(tensor->name);
@@ -8931,10 +8924,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     fout.close();
 
-    for (auto type : used_iq2) {
-        ggml_deinit_iq2_quantization(type);
-    }
-
     gguf_free(ctx_out);
 
     LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
@@ -9342,6 +9331,7 @@ void llama_backend_free(void) {
 #ifdef GGML_USE_MPI
     ggml_mpi_backend_free();
 #endif
+    ggml_quantize_free();
 }
 
 int64_t llama_time_us(void) {
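From the public API's point of view, the practical effect is that a caller quantizing to the IQ2 formats has no per-type setup or teardown to do: a single llama_backend_free() now also releases the quantization lookup tables via ggml_quantize_free(). Below is a minimal caller-side sketch under that assumption; the file paths are placeholders, the bool-numa form of llama_backend_init() reflects the llama.h API at the time of this commit, and IQ2 quantization is expected to need an importance matrix supplied through the quantize params, so treat this as an illustration of the call sequence rather than a complete working conversion:

// shutdown sketch (C++, using the C API from llama.h)
#include "llama.h"

int main() {
    llama_backend_init(false /* numa */);

    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS;
    // params.imatrix = ...;  // the IQ2 formats expect an importance matrix

    // No ggml_init_iq2_quantization()/ggml_deinit_iq2_quantization() pairing
    // is needed around the quantization call any more.
    llama_model_quantize("model-f16.gguf", "model-iq2_xxs.gguf", &params);

    // Releases backend state, now including any quantization lookup tables
    // (via the ggml_quantize_free() call added in this commit).
    llama_backend_free();
    return 0;
}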