 src/llama.cpp | 248
 1 file changed, 65 insertions(+), 183 deletions(-)
diff --git a/src/llama.cpp b/src/llama.cpp
index 779fe317..2ed50b20 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -16762,7 +16762,7 @@ static void llama_tensor_dequantize_internal(
workers.clear();
}
-static ggml_type change_type_if_necessar(ggml_type new_type, int nx, int ny) {
+static ggml_type change_type_if_necessary(ggml_type new_type, int nx, int ny) {
bool convert_incompatible_tensor = false;
if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K || new_type == GGML_TYPE_IQ4_XS ||
@@ -16834,6 +16834,43 @@ static ggml_type change_type_if_necessar(ggml_type new_type, int nx, int ny) {
return new_type;
}
+static std::pair<ggml_type, int> interleaved_properties(ggml_type type) {
+ static std::unordered_map<ggml_type, std::pair<ggml_type, int>> k_map = {
+ { GGML_TYPE_Q4_0_4_4, { GGML_TYPE_Q4_0, 4} },
+ { GGML_TYPE_Q4_0_4_8, { GGML_TYPE_Q4_0, 4} },
+ { GGML_TYPE_Q4_0_8_8, { GGML_TYPE_Q4_0, 8} },
+ { GGML_TYPE_Q4_0_R8, { GGML_TYPE_Q4_0, 8} },
+ { GGML_TYPE_Q5_0_R4, { GGML_TYPE_Q5_0, 4} },
+ { GGML_TYPE_Q6_0_R4, { GGML_TYPE_Q6_0, 4} },
+ { GGML_TYPE_Q8_0_R8, { GGML_TYPE_Q8_0, 8} },
+ { GGML_TYPE_Q2_K_R4, { GGML_TYPE_Q2_K, 4} },
+ { GGML_TYPE_Q3_K_R4, { GGML_TYPE_Q3_K, 4} },
+ { GGML_TYPE_Q4_K_R4, { GGML_TYPE_Q4_K, 4} },
+ { GGML_TYPE_Q5_K_R4, { GGML_TYPE_Q5_K, 4} },
+ { GGML_TYPE_Q6_K_R4, { GGML_TYPE_Q6_K, 4} },
+ { GGML_TYPE_IQ2_XXS_R4, { GGML_TYPE_IQ2_XXS, 4} },
+ { GGML_TYPE_IQ2_XS_R4, { GGML_TYPE_IQ2_XS, 4} },
+ { GGML_TYPE_IQ2_S_R4, { GGML_TYPE_IQ2_S, 4} },
+ { GGML_TYPE_IQ3_XXS_R4, { GGML_TYPE_IQ3_XXS, 4} },
+ { GGML_TYPE_IQ3_S_R4, { GGML_TYPE_IQ3_S, 4} },
+ { GGML_TYPE_IQ4_XS_R8, { GGML_TYPE_IQ4_XS, 8} },
+ { GGML_TYPE_IQ4_NL_R4, { GGML_TYPE_IQ4_NL, 4} },
+ { GGML_TYPE_IQ1_S_R4, { GGML_TYPE_IQ1_S, 4} },
+ { GGML_TYPE_IQ1_M_R4, { GGML_TYPE_IQ1_M, 4} },
+ { GGML_TYPE_IQ2_BN_R4, { GGML_TYPE_IQ2_BN, 4} },
+ { GGML_TYPE_IQ2_K_R4, { GGML_TYPE_IQ2_K, 4} },
+ { GGML_TYPE_IQ3_K_R4, { GGML_TYPE_IQ3_K, 4} },
+ { GGML_TYPE_IQ4_K_R4, { GGML_TYPE_IQ4_K, 4} },
+ { GGML_TYPE_IQ4_KS_R4, { GGML_TYPE_IQ4_KS, 4} },
+ { GGML_TYPE_IQ5_K_R4, { GGML_TYPE_IQ5_K, 4} },
+ { GGML_TYPE_Q8_KV_R8, { GGML_TYPE_Q8_KV, 8} },
+ { GGML_TYPE_Q8_K_R8, { GGML_TYPE_Q8_K, 8} },
+ { GGML_TYPE_BF16_R16, { GGML_TYPE_BF16, 16} },
+ };
+ if (auto it = k_map.find(type); it != k_map.end()) return it->second;
+ return {type, 1};
+}
+
static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
const std::string name = ggml_get_name(tensor);
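
The new helper replaces a family of hand-written branches with a single table lookup: a row-interleaved quant maps to its base type plus the number of interleaved rows, and every other type falls through to {type, 1}. A minimal usage sketch (assumes ggml.h and the patched helper are in scope; the variable names are illustrative, not from the patch):

    // Interleaved type: the pair carries the base type and the interleave factor.
    auto [base, rows] = interleaved_properties(GGML_TYPE_Q4_K_R4);
    // base == GGML_TYPE_Q4_K, rows == 4

    // Non-interleaved type: identity mapping with a row factor of 1.
    auto [same, one] = interleaved_properties(GGML_TYPE_Q4_0);
    // same == GGML_TYPE_Q4_0, one == 1
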
@@ -16939,70 +16976,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_BN || ftype == LLAMA_FTYPE_MOSTLY_IQ2_BN || ftype == LLAMA_FTYPE_MOSTLY_IQ2_BN_R4) {
new_type = GGML_TYPE_IQ4_NL;
}
- else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 ||
- new_type == GGML_TYPE_Q4_0_8_8) {
- new_type = GGML_TYPE_Q4_0;
- }
- else if (new_type == GGML_TYPE_IQ4_NL_R4) {
- new_type = GGML_TYPE_IQ4_NL;
- }
- else if (new_type == GGML_TYPE_IQ4_XS_R8) {
- new_type = GGML_TYPE_IQ4_XS;
- }
- else if (new_type == GGML_TYPE_Q2_K_R4) {
- new_type = GGML_TYPE_Q2_K;
- }
- else if (new_type == GGML_TYPE_Q3_K_R4) {
- new_type = GGML_TYPE_Q3_K;
- }
- else if (new_type == GGML_TYPE_Q4_K_R4) {
- new_type = GGML_TYPE_Q4_K;
- }
- else if (new_type == GGML_TYPE_Q5_K_R4) {
- new_type = GGML_TYPE_Q5_K;
- }
- else if (new_type == GGML_TYPE_Q6_K_R4) {
- new_type = GGML_TYPE_Q6_K;
- }
- else if (new_type == GGML_TYPE_Q8_K_R8) {
- new_type = GGML_TYPE_Q8_0;
- }
- else if (new_type == GGML_TYPE_Q8_KV_R8) {
- new_type = GGML_TYPE_Q8_0;
- }
- else if (new_type == GGML_TYPE_IQ2_K_R4) {
- new_type = GGML_TYPE_IQ2_K;
- }
- else if (new_type == GGML_TYPE_IQ3_K_R4) {
- new_type = GGML_TYPE_IQ3_K;
- }
- else if (new_type == GGML_TYPE_IQ3_S_R4) {
- new_type = GGML_TYPE_IQ3_S;
- }
- else if (new_type == GGML_TYPE_IQ4_K_R4) {
- new_type = GGML_TYPE_IQ4_K;
- }
- else if (new_type == GGML_TYPE_IQ5_K_R4) {
- new_type = GGML_TYPE_IQ5_K;
- }
- else if (new_type == GGML_TYPE_IQ4_KS_R4) {
- new_type = GGML_TYPE_IQ4_KS;
- }
- else if (new_type == GGML_TYPE_Q4_0_R8) {
- new_type = GGML_TYPE_Q4_0;
- }
- else if (new_type == GGML_TYPE_Q5_0_R4) {
- new_type = GGML_TYPE_Q5_0;
- }
- else if (new_type == GGML_TYPE_Q6_0_R4) {
- new_type = GGML_TYPE_Q6_0;
- }
- else if (new_type == GGML_TYPE_Q8_0_R8) {
- new_type = GGML_TYPE_Q8_0;
- }
- else if (new_type == GGML_TYPE_BF16_R16) {
- new_type = GGML_TYPE_BF16;
- }
}
} else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M_R4) {
if (name.find("attn_v.weight") != std::string::npos) {
@@ -17332,12 +17305,21 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
LLAMA_LOG_INFO("Using custom type %s for tensor %s\n", ggml_type_name(new_type), name.c_str());
}
- auto working_type = change_type_if_necessar(new_type, tensor->ne[0], tensor->ne[1]);
+ auto working_type = change_type_if_necessary(new_type, tensor->ne[0], tensor->ne[1]);
if (working_type != new_type) {
++qs.n_fallback;
new_type = working_type;
}
+ if (name == "token_embd.weight") {
+ auto working_type = interleaved_properties(new_type).first;
+ if (working_type != new_type) {
+ printf("\n============ Token embeddings cannot be quantized with row-interleaved quants\n");
+ printf("---> Changed %s to %s\n", ggml_type_name(new_type), ggml_type_name(working_type));
+ new_type = working_type;
+ }
+ }
+
return new_type;
}
@@ -17834,14 +17816,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
}
if (quantize) {
+
new_type = default_type;
- if (new_type == GGML_TYPE_BF16_R16 && strcmp(tensor->name, "token_embd.weight") == 0) {
- new_type = GGML_TYPE_BF16;
- }
// get more optimal quantization type based on the tensor shape, layer, etc.
if (params->pure) {
- auto working_type = change_type_if_necessar(new_type, tensor->ne[0], tensor->ne[1]);
+ auto working_type = change_type_if_necessary(new_type, tensor->ne[0], tensor->ne[1]);
if (working_type != new_type) {
++qs.n_fallback;
new_type = working_type;
@@ -17881,6 +17861,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
new_type = params->ffn_up_type;
}
+ if (strcmp(tensor->name, "token_embd.weight") == 0) {
+ // token embeddings cannot be quantized with row-interleaved quants
+ auto working_type = interleaved_properties(new_type).first;
+ if (working_type != new_type) {
+ printf("\n============ Token embeddings cannot be quantized with row-interleaved quants\n");
+ printf("---> Changed %s to %s\n", ggml_type_name(new_type), ggml_type_name(working_type));
+ new_type = working_type;
+ }
+ }
+
// If we've decided to quantize to the same type the tensor is already
// in then there's nothing to do.
quantize = tensor->type != new_type;
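
This guard mirrors the one added to llama_tensor_get_type above. The likely rationale (an assumption; the patch itself only states the restriction): token embeddings are gathered row by row, and row-interleaved layouts pack 4, 8, or 16 rows into one block, so a single embedding row cannot be addressed independently. The same check in isolation:

    // Illustrative sketch mirroring the added guard; .first is the
    // non-interleaved base type returned by interleaved_properties.
    if (strcmp(tensor->name, "token_embd.weight") == 0) {
        new_type = interleaved_properties(new_type).first;
    }
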
@@ -17965,119 +17955,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
}
int chunk_size_multiplier = 1;
- if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 || new_type == GGML_TYPE_Q4_0_8_8) {
- if ((new_type == GGML_TYPE_Q4_0_8_8) && (tensor->ne[1] % 8 != 0)) new_type = GGML_TYPE_Q4_0;
- else if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q4_0;
- if (new_type == GGML_TYPE_Q4_0_8_8) chunk_size_multiplier = 8;
- else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8) chunk_size_multiplier = 4;
- }
- else if (new_type == GGML_TYPE_IQ4_NL_R4) {
- if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ4_NL;
- else chunk_size_multiplier = 4;
- }
- else if (new_type == GGML_TYPE_IQ4_XS_R8) {
- if (tensor->ne[1] % 8 != 0) new_type = GGML_TYPE_IQ4_XS;
- else chunk_size_multiplier = 8;
- }
- else if (new_type == GGML_TYPE_Q4_0_R8) {
- if (tensor->ne[1] % 8 != 0) new_type = GGML_TYPE_Q4_0;
- else chunk_size_multiplier = 8;
- }
- else if (new_type == GGML_TYPE_Q5_0_R4) {
- if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q5_0;
- else chunk_size_multiplier = 4;
- }
- else if (new_type == GGML_TYPE_Q6_0_R4) {
- if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q6_0;
- else chunk_size_multiplier = 4;
- }
- else if (new_type == GGML_TYPE_Q8_0_R8) {
- if (tensor->ne[1] % 8 != 0) new_type = GGML_TYPE_Q8_0;
- else chunk_size_multiplier = 8;
- }
- else if (new_type == GGML_TYPE_Q2_K_R4) {
- if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q2_K;
- else chunk_size_multiplier = 4;
- }
- else if (new_type == GGML_TYPE_Q3_K_R4) {
- if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q3_K;
- else chunk_size_multiplier = 4;
- }
- else if (new_type == GGML_TYPE_Q4_K_R4) {
- if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q4_K;
- else chunk_size_multiplier = 4;
- }
- else if (new_type == GGML_TYPE_Q5_K_R4) {
- if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q5_K;
- else chunk_size_multiplier = 4;
- }
- else if (new_type == GGML_TYPE_Q6_K_R4) {
- if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q6_K;
- else chunk_size_multiplier = 4;
- }
- else if (new_type == GGML_TYPE_Q8_K_R8) {
- if (tensor->ne[1] % 8 != 0) new_type = GGML_TYPE_Q8_0;
- else chunk_size_multiplier = 8;
- }
- else if (new_type == GGML_TYPE_Q8_KV_R8) {
- if (tensor->ne[1] % 8 != 0) new_type = GGML_TYPE_Q8_0;
- else chunk_size_multiplier = 8;
- }
- else if (new_type == GGML_TYPE_IQ2_BN_R4) {
- if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ2_BN;
- else chunk_size_multiplier = 4;
- }
- else if (new_type == GGML_TYPE_IQ2_K_R4) {
- if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ2_K;
- else chunk_size_multiplier = 4;
- }
- else if (new_type == GGML_TYPE_IQ3_K_R4) {
- if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ3_K;
- else chunk_size_multiplier = 4;
- }
- else if (new_type == GGML_TYPE_IQ4_K_R4) {
- if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ4_K;
- else chunk_size_multiplier = 4;
- }
- else if (new_type == GGML_TYPE_IQ5_K_R4) {
- if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ5_K;
- else chunk_size_multiplier = 4;
- }
- else if (new_type == GGML_TYPE_IQ4_KS_R4) {
- if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ4_KS;
- else chunk_size_multiplier = 4;
- }
- else if (new_type == GGML_TYPE_IQ2_XXS_R4) {
- if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ2_XXS;
- else chunk_size_multiplier = 4;
- }
- else if (new_type == GGML_TYPE_IQ2_XS_R4) {
- if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ2_XS;
- else chunk_size_multiplier = 4;
- }
- else if (new_type == GGML_TYPE_IQ2_S_R4) {
- if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ2_S;
- else chunk_size_multiplier = 4;
- }
- else if (new_type == GGML_TYPE_IQ3_XXS_R4) {
- if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ3_XXS;
- else chunk_size_multiplier = 4;
- }
- else if (new_type == GGML_TYPE_IQ3_S_R4) {
- if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ3_S;
- else chunk_size_multiplier = 4;
- }
- else if (new_type == GGML_TYPE_IQ1_S_R4) {
- if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ1_S;
- else chunk_size_multiplier = 4;
- }
- else if (new_type == GGML_TYPE_IQ1_M_R4) {
- if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ1_M;
- else chunk_size_multiplier = 4;
- }
- else if (new_type == GGML_TYPE_BF16_R16) {
- if (tensor->ne[1] % 16 != 0) new_type = GGML_TYPE_BF16;
- else chunk_size_multiplier = 16;
+ auto [working_type, num_rows] = interleaved_properties(new_type);
+ if (tensor->ne[1] % num_rows != 0) {
+ new_type = working_type;
+ } else {
+ chunk_size_multiplier = num_rows;
}
LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
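
With the helper in place, each removed branch above becomes an instance of one rule: if the tensor's row count is not a multiple of the interleave factor, fall back to the base type; otherwise keep the interleaved type and scale the chunk size by that factor. Applied to one of the removed cases (IQ4_NL_R4, with an illustrative row count):

    // Old branch:
    //   if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ4_NL;
    //   else chunk_size_multiplier = 4;
    // New path, assuming tensor->ne[1] == 4096:
    auto [working_type, num_rows] = interleaved_properties(GGML_TYPE_IQ4_NL_R4);
    // working_type == GGML_TYPE_IQ4_NL, num_rows == 4;
    // 4096 % 4 == 0, so new_type stays IQ4_NL_R4 and chunk_size_multiplier becomes 4.

One behavioral difference carried by the lookup: the removed branches fell back from Q8_K_R8 and Q8_KV_R8 to GGML_TYPE_Q8_0, while interleaved_properties maps them to GGML_TYPE_Q8_K and GGML_TYPE_Q8_KV.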