summaryrefslogtreecommitdiff
path: root/llama.cpp
diff options
context:
space:
mode:
authorCarolinabanana <140120812+Carolinabanana@users.noreply.github.com>2024-04-09 09:16:13 +0100
committerGitHub <noreply@github.com>2024-04-09 11:16:13 +0300
commit5dc9dd7152dedc6046b646855585bd070c91e8c8 (patch)
treed2bae3652d91cdd9327e28fa85d167a67e050c53 /llama.cpp
parente11a8999b5690f810c2c99c14347f0834e68c524 (diff)
llama : add Command R Plus support (#6491)
* Add Command R Plus GGUF * Add Command R Plus GGUF * Loading works up to LayerNorm2D * Export new tensors in 1D so they are not quantized. * Fix embedding layer based on Noeda's example * Whitespace * Add line * Fix unexpected tokens on MPS. Re-add F16 fix. ((Noeda) * dranger003: Fix block index overflow in CUDA dequantizing. * Reverted blocked multiplication code as it still has issues and could affect other Llama arches * export norms as f32 * fix overflow issues during quant and other cleanup * Type convention Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * dranger003: Fix more int overflow during quant. --------- Co-authored-by: S <seast@Ss-Mac-Studio.local> Co-authored-by: S <s@example.com> Co-authored-by: slaren <slarengh@gmail.com> Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Diffstat (limited to 'llama.cpp')
-rw-r--r--llama.cpp64
1 files changed, 50 insertions, 14 deletions
diff --git a/llama.cpp b/llama.cpp
index b16ddc64..6a090d1b 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -926,6 +926,8 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+ { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
},
},
{
@@ -5406,6 +5408,11 @@ static bool llm_load_tensors(
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+ if (n_layer >= 64){
+ layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head});
+ layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv});
+ }
+
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
@@ -9454,6 +9461,31 @@ struct llm_build_context {
cb(Vcur, "Vcur", il);
}
+ if (model.layers[il].attn_q_norm) {
+ Qcur = ggml_view_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens,
+ ggml_element_size(Qcur) * n_embd_head,
+ ggml_element_size(Qcur) * n_embd_head * n_head,
+ 0);
+ cb(Qcur, "Qcur", il);
+ Kcur = ggml_view_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens,
+ ggml_element_size(Kcur) * n_embd_head,
+ ggml_element_size(Kcur) * n_embd_head * n_head_kv,
+ 0);
+ cb(Kcur, "Kcur", il);
+
+ Qcur = llm_build_norm(ctx0, Qcur, hparams,
+ model.layers[il].attn_q_norm,
+ NULL,
+ LLM_NORM, cb, il);
+ cb(Qcur, "Qcur", il);
+
+ Kcur = llm_build_norm(ctx0, Kcur, hparams,
+ model.layers[il].attn_k_norm,
+ NULL,
+ LLM_NORM, cb, il);
+ cb(Kcur, "Kcur", il);
+ }
+
Qcur = ggml_rope_custom(
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
@@ -13323,9 +13355,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
return new_type;
}
-static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int chunk_size, int nrows, int n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
+static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
std::mutex mutex;
- int counter = 0;
+ int64_t counter = 0;
size_t new_size = 0;
if (nthread < 2) {
// single-thread
@@ -13333,11 +13365,11 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
}
auto compute = [&mutex, &counter, &new_size, new_type, f32_data, new_data, chunk_size,
nrows, n_per_row, imatrix]() {
- const int nrows_per_chunk = chunk_size / n_per_row;
+ const int64_t nrows_per_chunk = chunk_size / n_per_row;
size_t local_size = 0;
while (true) {
std::unique_lock<std::mutex> lock(mutex);
- int first_row = counter; counter += nrows_per_chunk;
+ int64_t first_row = counter; counter += nrows_per_chunk;
if (first_row >= nrows) {
if (local_size > 0) {
new_size += local_size;
@@ -13345,7 +13377,7 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
break;
}
lock.unlock();
- const int this_nrow = std::min(nrows - first_row, nrows_per_chunk);
+ const int64_t this_nrow = std::min(nrows - first_row, nrows_per_chunk);
local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix);
}
};
@@ -13539,6 +13571,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
// quantize only 2D and 3D tensors (experts)
quantize &= (ggml_n_dims(tensor) >= 2);
+
+ // do not quantize norm tensors
+ quantize &= name.find("_norm.weight") == std::string::npos;
+
quantize &= params->quantize_output_tensor || name != "output.weight";
quantize &= !params->only_copy;
@@ -13585,7 +13621,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
new_size = ggml_nbytes(tensor);
LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
} else {
- const size_t nelements = ggml_nelements(tensor);
+ const int64_t nelements = ggml_nelements(tensor);
const float * imatrix = nullptr;
if (imatrix_data) {
@@ -13637,20 +13673,20 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
fflush(stdout);
- if (work.size() < nelements * 4) {
+ if (work.size() < (size_t)nelements * 4) {
work.resize(nelements * 4); // upper bound on size
}
new_data = work.data();
- const int n_per_row = tensor->ne[0];
- const int nrows = tensor->ne[1];
+ const int64_t n_per_row = tensor->ne[0];
+ const int64_t nrows = tensor->ne[1];
- static const int min_chunk_size = 32 * 512;
- const int chunk_size = n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row);
+ static const int64_t min_chunk_size = 32 * 512;
+ const int64_t chunk_size = n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row);
- const int nelements_matrix = tensor->ne[0] * tensor->ne[1];
- const int nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
- const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
+ const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
+ const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
+ const int64_t nthread_use = nthread > 1 ? std::max((int64_t)1, std::min((int64_t)nthread, nchunk)) : 1;
// quantize each expert separately since they have different importance matrices
new_size = 0;