author     Georgi Gerganov <ggerganov@gmail.com>  2024-01-17 18:54:56 +0200
committer  GitHub <noreply@github.com>            2024-01-17 18:54:56 +0200
commit     38566680cdfe982a495562332c25b9227de9cf8d (patch)
tree       3936732879d0a3146577745232feadb80e5917c9 /llama.cpp
parent     ba69bbc84ced580fe4fdb0713ca2d95634325b7a (diff)
ggml : add IQ2 to test-backend-ops + refactoring (#4990)
* ggml : add IQ2 to test-backend-ops + refactoring
  ggml-ci
* cuda : update supports_op for IQ2
  ggml-ci
* ci : enable LLAMA_CUBLAS=1 for CUDA nodes
  ggml-ci
* cuda : fix out-of-bounds-access in `mul_mat_vec_q`
  ggml-ci
* tests : avoid creating RNGs for each Q tensor
  ggml-ci
* tests : avoid creating RNGs for each tensor
  ggml-ci
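On the llama.cpp side, the refactoring removes the "initialize on first use, remember the type in a std::set, deinitialize at the end of the call" bookkeeping from the quantization loop and leaves teardown to llama_backend_free(), which now calls ggml_quantize_free(). The following is a self-contained sketch of that pattern change only; the qtype enum and the init_table/free_table/free_all_tables helpers are made-up stand-ins for the ggml calls, not llama.cpp code:

#include <cstdio>
#include <set>
#include <vector>

// Stand-ins for the ggml calls touched by this commit
// (ggml_init_iq2_quantization / ggml_deinit_iq2_quantization / ggml_quantize_free).
enum class qtype { iq2_xxs, iq2_xs, other };

static void init_table(qtype t)  { std::printf("init table %d\n", (int) t); }
static void free_table(qtype t)  { std::printf("free table %d\n", (int) t); }
static void free_all_tables()    { std::printf("free all tables\n"); }

// Before this commit: the quantization loop tracked which IQ2 types it had
// initialized in a std::set and paired every init with a deinit at the end.
static void quantize_old(const std::vector<qtype> & tensors) {
    std::set<qtype> used_iq2;
    for (qtype t : tensors) {
        if ((t == qtype::iq2_xxs || t == qtype::iq2_xs) && used_iq2.insert(t).second) {
            init_table(t); // init on first use of this type
        }
        // ... quantize the tensor ...
    }
    for (qtype t : used_iq2) {
        free_table(t);     // per-call teardown
    }
}

// After this commit: no bookkeeping in the loop; the lookup tables are
// released once, when the backend is torn down.
static void quantize_new(const std::vector<qtype> & tensors) {
    for (qtype t : tensors) {
        (void) t;
        // ... quantize the tensor; table setup is handled inside ggml ...
    }
}

int main() {
    quantize_old({qtype::iq2_xxs, qtype::other, qtype::iq2_xxs});
    quantize_new({qtype::iq2_xxs, qtype::other});
    free_all_tables(); // analogue of the new ggml_quantize_free() call
    return 0;
}

The upshot is that quantization-table lifetime is tied to the backend rather than to each individual quantize call.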
Diffstat (limited to 'llama.cpp')
-rw-r--r--  llama.cpp | 12
1 file changed, 1 insertion(+), 11 deletions(-)
diff --git a/llama.cpp b/llama.cpp
index 81829b13..d28382f7 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -8747,8 +8747,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     // placeholder for the meta data
     ::zeros(fout, meta_size);
 
-    std::set<ggml_type> used_iq2;
-
     for (int i = 0; i < ml.n_tensors; ++i) {
         struct ggml_tensor * tensor = ml.get_tensor_meta(i);
@@ -8801,11 +8799,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         } else {
             const size_t nelements = ggml_nelements(tensor);
 
-            if ((new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_XS) && used_iq2.find(new_type) == used_iq2.end()) {
-                ggml_init_iq2_quantization(new_type);
-                used_iq2.insert(new_type);
-            }
-
             const float * imatrix = nullptr;
             if (imatrix_data) {
                 auto it = imatrix_data->find(tensor->name);
@@ -8931,10 +8924,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     fout.close();
 
-    for (auto type : used_iq2) {
-        ggml_deinit_iq2_quantization(type);
-    }
-
     gguf_free(ctx_out);
 
     LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
@@ -9342,6 +9331,7 @@ void llama_backend_free(void) {
 #ifdef GGML_USE_MPI
     ggml_mpi_backend_free();
 #endif
+    ggml_quantize_free();
 }
 
 int64_t llama_time_us(void) {
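From the public API's point of view, the practical effect is that a caller quantizing to the IQ2 formats has no per-type setup or teardown to do: a single llama_backend_free() now also releases the quantization lookup tables via ggml_quantize_free(). Below is a minimal caller-side sketch under that assumption; the file paths are placeholders, the bool-numa form of llama_backend_init() reflects the llama.h API at the time of this commit, and IQ2 quantization is expected to need an importance matrix supplied through the quantize params, so treat this as an illustration of the call sequence rather than a complete working conversion:

// shutdown sketch (C++, using the C API from llama.h)
#include "llama.h"

int main() {
    llama_backend_init(false /* numa */);

    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS;
    // params.imatrix = ...;  // the IQ2 formats expect an importance matrix

    // No ggml_init_iq2_quantization()/ggml_deinit_iq2_quantization() pairing
    // is needed around the quantization call any more.
    llama_model_quantize("model-f16.gguf", "model-iq2_xxs.gguf", &params);

    // Releases backend state, now including any quantization lookup tables
    // (via the ggml_quantize_free() call added in this commit).
    llama_backend_free();
    return 0;
}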