-rw-r--r--  ggml-metal.m |   11
-rw-r--r--  ggml.h       |    2
-rw-r--r--  llama.cpp    | 3545
3 files changed, 1422 insertions, 2136 deletions
diff --git a/ggml-metal.m b/ggml-metal.m
index 2380c431..bc881395 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -238,14 +238,17 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
// load kernels
{
NSError * error = nil;
-#define GGML_METAL_ADD_KERNEL(name) \
- ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \
- ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \
+
+ /*
GGML_METAL_LOG_INFO("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \
(int) ctx->pipeline_##name.maxTotalThreadsPerThreadgroup, \
(int) ctx->pipeline_##name.threadExecutionWidth); \
+ */
+#define GGML_METAL_ADD_KERNEL(name) \
+ ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \
+ ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \
if (error) { \
- GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
+ GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
return NULL; \
}
diff --git a/ggml.h b/ggml.h
index 8c954904..9d16c5a7 100644
--- a/ggml.h
+++ b/ggml.h
@@ -709,7 +709,7 @@ extern "C" {
// Context tensor enumeration and lookup
GGML_API struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx);
GGML_API struct ggml_tensor * ggml_get_next_tensor (struct ggml_context * ctx, struct ggml_tensor * tensor);
- GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
+ GGML_API struct ggml_tensor * ggml_get_tensor (struct ggml_context * ctx, const char * name);
GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
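For illustration, the enumeration/lookup API touched above is typically used to walk or find tensors in a context. A minimal sketch (assumes a valid ggml_context * ctx; the tensor name is only an example):

    // iterate over every tensor allocated in the context
    for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
        printf("%s: %lld elements\n", t->name, (long long) ggml_nelements(t));
    }

    // look a tensor up by name
    struct ggml_tensor * tok_embd = ggml_get_tensor(ctx, "token_embd.weight");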
diff --git a/llama.cpp b/llama.cpp
index 7ee58929..ead1d421 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -60,7 +60,9 @@
#include <cstdio>
#include <cstring>
#include <ctime>
+#include <forward_list>
#include <fstream>
+#include <functional>
#include <initializer_list>
#include <map>
#include <memory>
@@ -69,11 +71,10 @@
#include <queue>
#include <random>
#include <regex>
+#include <set>
#include <sstream>
#include <thread>
#include <unordered_map>
-#include <set>
-#include <forward_list>
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
@@ -969,7 +970,7 @@ struct llama_mlock {
typedef void (*offload_func_t)(struct ggml_tensor * tensor);
-static void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
+static void ggml_offload_nop(struct ggml_tensor * tensor) {
(void) tensor;
}
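For context, offload_func_t is a per-tensor hook: the renamed ggml_offload_nop is the CPU default, and CUDA builds substitute ggml_cuda_assign_buffers_no_alloc for offloaded layers. A sketch of the pre-refactor selection pattern, as it appears further down in this diff:

    offload_func_t offload_func = ggml_offload_nop; // default: keep the tensor on the CPU backend
#ifdef GGML_USE_CUBLAS
    if (il >= i_gpu_start) {
        offload_func = ggml_cuda_assign_buffers_no_alloc;
    }
#endif // GGML_USE_CUBLAS
    offload_func(cur); // tag the tensor's output backend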
@@ -1113,13 +1114,13 @@ struct llama_layer {
struct ggml_tensor * ffn_norm_b;
// ff
- struct ggml_tensor * w1; // ffn_gate
- struct ggml_tensor * w2; // ffn_down
- struct ggml_tensor * w3; // ffn_up
+ struct ggml_tensor * ffn_gate; // w1
+ struct ggml_tensor * ffn_down; // w2
+ struct ggml_tensor * ffn_up; // w3
// ff bias
- struct ggml_tensor * b2; // ffn_down
- struct ggml_tensor * b3; // ffn_up
+ struct ggml_tensor * ffn_down_b; // b2
+ struct ggml_tensor * ffn_up_b; // b3
};
struct llama_kv_cell {
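For reference, the renamed FFN tensors above make up the gated feed-forward block. A minimal sketch of how they are applied, matching the LLM_FFN_SILU / LLM_FFN_PAR path of llm_build_ffn introduced later in this diff (the helper name is an assumption):

    static struct ggml_tensor * build_ffn_silu_gated(
            struct ggml_context * ctx,
            struct ggml_tensor  * x,
            const llama_layer   & layer) {
        struct ggml_tensor * up   = ggml_mul_mat(ctx, layer.ffn_up,   x);  // was w3
        struct ggml_tensor * gate = ggml_mul_mat(ctx, layer.ffn_gate, x);  // was w1
        gate = ggml_silu(ctx, gate);                                       // SILU activation
        struct ggml_tensor * out  = ggml_mul(ctx, gate, up);               // gate applied in parallel to up
        return ggml_mul_mat(ctx, layer.ffn_down, out);                     // was w2
    }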
@@ -1225,8 +1226,8 @@ struct llama_model {
llama_hparams hparams = {};
llama_vocab vocab;
- struct ggml_tensor * tok_embeddings;
- struct ggml_tensor * pos_embeddings;
+ struct ggml_tensor * tok_embd;
+ struct ggml_tensor * pos_embd;
struct ggml_tensor * tok_norm;
struct ggml_tensor * tok_norm_b;
@@ -2482,7 +2483,7 @@ static void llm_load_tensors(
case LLM_ARCH_LLAMA:
case LLM_ARCH_REFACT:
{
- model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+ model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
// output
{
@@ -2536,21 +2537,21 @@ static void llm_load_tensors(
layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
- layer.w1 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
- layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
- layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+ layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
+ layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
+ layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
if (backend == GGML_BACKEND_GPU) {
vram_weights +=
- ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
- ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
- ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
+ ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
+ ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
+ ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
}
}
} break;
case LLM_ARCH_BAICHUAN:
{
- model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+ model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
{
ggml_backend_type backend_norm;
ggml_backend_type backend_output;
@@ -2602,15 +2603,15 @@ static void llm_load_tensors(
layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
- layer.w1 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
- layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
- layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+ layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
+ layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
+ layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
if (backend == GGML_BACKEND_GPU) {
vram_weights +=
- ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
- ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
- ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
+ ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
+ ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
+ ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
}
}
} break;
@@ -2618,7 +2619,7 @@ static void llm_load_tensors(
{
// TODO: CPU-only for now
- model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+ model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
// output
{
@@ -2681,21 +2682,21 @@ static void llm_load_tensors(
layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
- layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
- layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+ layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
+ layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
if (backend == GGML_BACKEND_GPU) {
vram_weights +=
ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.wo) +
- ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
+ ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
}
}
} break;
case LLM_ARCH_STARCODER:
{
- model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
- model.pos_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU);
+ model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+ model.pos_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU);
// output
{
@@ -2754,11 +2755,11 @@ static void llm_load_tensors(
layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
- layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
- layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend);
+ layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
+ layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend);
- layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
- layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
+ layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+ layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
if (backend == GGML_BACKEND_GPU) {
vram_weights +=
@@ -2766,14 +2767,14 @@ static void llm_load_tensors(
ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) +
ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) +
ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm_b) +
- ggml_nbytes(layer.w2) + ggml_nbytes(layer.b2) +
- ggml_nbytes(layer.w3) + ggml_nbytes(layer.b3);
+ ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_down_b) +
+ ggml_nbytes(layer.ffn_up) + ggml_nbytes(layer.ffn_up_b);
}
}
} break;
case LLM_ARCH_PERSIMMON:
{
- model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+ model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
{
ggml_backend_type backend_norm;
@@ -2814,31 +2815,31 @@ static void llm_load_tensors(
const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT;
auto & layer = model.layers[i];
- layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
- layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
- layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
- layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split);
- layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
- layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split);
- layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
- layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split);
- layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
- layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split);
- layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
- layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
+ layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+ layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
+ layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
+ layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split);
+ layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+ layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split);
+ layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
+ layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split);
+ layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+ layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split);
+ layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
+ layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
layer.attn_q_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64}, backend);
- layer.attn_q_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {64}, backend);
+ layer.attn_q_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {64}, backend);
layer.attn_k_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {64}, backend);
- layer.attn_k_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64}, backend);
+ layer.attn_k_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64}, backend);
}
} break;
case LLM_ARCH_BLOOM:
{
// TODO: CPU-only for now
- model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
- model.tok_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, GGML_BACKEND_CPU);
- model.tok_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, GGML_BACKEND_CPU);
+ model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+ model.tok_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, GGML_BACKEND_CPU);
+ model.tok_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, GGML_BACKEND_CPU);
// output
{
@@ -2897,11 +2898,11 @@ static void llm_load_tensors(
layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
- layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
- layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split);
+ layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
+ layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split);
- layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
- layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split);
+ layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+ layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split);
if (backend == GGML_BACKEND_GPU) {
vram_weights +=
@@ -2909,14 +2910,14 @@ static void llm_load_tensors(
ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) +
ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) +
ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm_b) +
- ggml_nbytes(layer.w3) + ggml_nbytes(layer.b3) +
- ggml_nbytes(layer.w2) + ggml_nbytes(layer.b2);
+ ggml_nbytes(layer.ffn_up) + ggml_nbytes(layer.ffn_up_b) +
+ ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_down_b);
}
}
} break;
case LLM_ARCH_MPT:
{
- model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+ model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
// output
{
@@ -2967,8 +2968,8 @@ static void llm_load_tensors(
layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
- layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
- layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+ layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
+ layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
if (backend == GGML_BACKEND_GPU) {
vram_weights +=
@@ -2976,8 +2977,8 @@ static void llm_load_tensors(
ggml_nbytes(layer.wqkv) +
ggml_nbytes(layer.wo) +
ggml_nbytes(layer.ffn_norm) +
- ggml_nbytes(layer.w2) +
- ggml_nbytes(layer.w3);
+ ggml_nbytes(layer.ffn_down) +
+ ggml_nbytes(layer.ffn_up);
}
}
} break;
@@ -3007,10 +3008,10 @@ static void llm_load_tensors(
#ifdef GGML_USE_CUBLAS
const int max_backend_supported_layers = hparams.n_layer + 3;
- const int max_offloadable_layers = hparams.n_layer + 3;
-#elif defined(GGML_USE_CLBLAST)
+ const int max_offloadable_layers = hparams.n_layer + 3;
+#elif defined(GGML_USE_CLBLAST)
const int max_backend_supported_layers = hparams.n_layer + 1;
- const int max_offloadable_layers = hparams.n_layer + 1;
+ const int max_offloadable_layers = hparams.n_layer + 1;
#endif // GGML_USE_CUBLAS
LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
@@ -3089,397 +3090,359 @@ static bool llama_model_load(
return true;
}
-static struct ggml_cgraph * llm_build_llama(
- llama_context & lctx,
- const llama_batch & batch) {
- const auto & model = lctx.model;
- const auto & hparams = model.hparams;
- const auto & cparams = lctx.cparams;
+using llm_build_cb = std::function<void(struct ggml_tensor * cur, const char * name, int nl)>;
- const auto & kv_self = lctx.kv_self;
+enum llm_rope_type {
+ LLM_ROPE,
+ LLM_ROPE_NEOX,
+ LLM_ROPE_GLM,
+};
- GGML_ASSERT(!!kv_self.ctx);
+static struct ggml_tensor * llm_build_inp_embd(
+ struct ggml_context * ctx,
+ const llama_batch & batch,
+ struct ggml_tensor * tok_embd,
+ int64_t n_embd,
+ int32_t n_tokens,
+ const llm_build_cb & cb) {
+ struct ggml_tensor * inpL;
- const int64_t n_embd = hparams.n_embd;
- const int64_t n_layer = hparams.n_layer;
- const int64_t n_ctx = cparams.n_ctx;
- const int64_t n_head = hparams.n_head;
- const int64_t n_head_kv = hparams.n_head_kv;
- const int64_t n_embd_head = hparams.n_embd_head();
- const int64_t n_embd_gqa = hparams.n_embd_gqa();
+ if (batch.token) {
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_tokens);
+ cb(inp_tokens, "inp_tokens", -1);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ inpL = ggml_get_rows(ctx, tok_embd, inp_tokens);
+ } else {
+#ifdef GGML_USE_MPI
+ GGML_ASSERT(false && "not implemented");
+#endif
- const float freq_base = cparams.rope_freq_base;
- const float freq_scale = cparams.rope_freq_scale;
- const float norm_rms_eps = hparams.f_norm_rms_eps;
+ inpL = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);
+ }
- const int n_gpu_layers = model.n_gpu_layers;
+ return inpL;
+}
- const int32_t n_tokens = batch.n_tokens;
- const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
- const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
+// Persimmon: n_rot = n_embd_head/2
+// Other: n_rot = n_embd_head
+static void llm_build_k_shift(
+ const llama_context & lctx,
+ struct ggml_context * ctx,
+ struct ggml_cgraph * graph,
+ int64_t n_rot,
+ llm_rope_type type,
+ const llm_build_cb & cb) {
+ const auto & model = lctx.model;
+ const auto & kv_self = lctx.kv_self;
+ const auto & cparams = lctx.cparams;
- const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
+ const auto & hparams = model.hparams;
- //printf("n_kv = %d\n", n_kv);
+ const int64_t n_layer = hparams.n_layer;
+ const int64_t n_head_kv = hparams.n_head_kv;
+ const int64_t n_embd_gqa = hparams.n_embd_gqa();
+ const int64_t n_embd_head = hparams.n_embd_head();
- auto & buf_compute = lctx.buf_compute;
+ const int64_t n_ctx = lctx.cparams.n_ctx;
- struct ggml_init_params params = {
- /*.mem_size =*/ buf_compute.size,
- /*.mem_buffer =*/ buf_compute.data,
- /*.no_alloc =*/ true,
- };
+ const float freq_base = cparams.rope_freq_base;
+ const float freq_scale = cparams.rope_freq_scale;
- struct ggml_context * ctx0 = ggml_init(params);
+ GGML_ASSERT(n_embd_head % n_rot == 0);
- ggml_cgraph * gf = ggml_new_graph(ctx0);
+ struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_ctx);
+ cb(K_shift, "K_shift", -1);
- struct ggml_tensor * cur;
- struct ggml_tensor * inpL;
+ int rope_type = 0;
- if (batch.token) {
- struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+ switch (type) {
+ case LLM_ROPE: rope_type = 0; break;
+ case LLM_ROPE_NEOX: rope_type = 2; break;
+ case LLM_ROPE_GLM: rope_type = 4; break;
+ }
- ggml_allocr_alloc(lctx.alloc, inp_tokens);
- if (!ggml_allocr_is_measure(lctx.alloc)) {
- memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
- }
- ggml_set_name(inp_tokens, "inp_tokens");
+ for (int il = 0; il < n_layer; ++il) {
+ struct ggml_tensor * tmp =
+ // we rotate only the first n_rot dimensions
+ ggml_rope_custom_inplace(ctx,
+ ggml_view_3d(ctx, kv_self.k,
+ n_rot, n_head_kv, n_ctx,
+ ggml_element_size(kv_self.k)*n_embd_head,
+ ggml_element_size(kv_self.k)*n_embd_gqa,
+ ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il),
+ K_shift, n_rot, rope_type, 0, freq_base, freq_scale);
+ cb(tmp, "K_shifted", il);
+ ggml_build_forward_expand(graph, tmp);
+ }
+}
+
+static void llm_build_kv_store(
+ const llama_context & lctx,
+ struct ggml_context * ctx,
+ struct ggml_cgraph * graph,
+ struct ggml_tensor * k_cur,
+ struct ggml_tensor * v_cur,
+ int32_t n_tokens,
+ int32_t kv_head,
+ const llm_build_cb & cb,
+ int64_t il) {
+ const auto & model = lctx.model;
+ const auto & kv_self = lctx.kv_self;
+ const auto & cparams = lctx.cparams;
- inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
- } else {
-#ifdef GGML_USE_MPI
- GGML_ASSERT(false && "not implemented");
-#endif
+ const auto & hparams = model.hparams;
- inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
+ const int64_t n_ctx = cparams.n_ctx;
+ const int64_t n_embd_gqa = hparams.n_embd_gqa();
- ggml_allocr_alloc(lctx.alloc, inpL);
- if (!ggml_allocr_is_measure(lctx.alloc)) {
- memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
- }
- }
+ // compute the transposed [n_tokens, n_embd] V matrix
+ struct ggml_tensor * v_cur_t = ggml_transpose(ctx, ggml_reshape_2d(ctx, v_cur, n_embd_gqa, n_tokens));
+ //struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur); // TODO: reshape above is likely not needed
+ cb(v_cur_t, "v_cur_t", il);
- const int i_gpu_start = n_layer - n_gpu_layers;
- (void) i_gpu_start;
+ struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv_self.k, n_tokens*n_embd_gqa,
+ (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
+ cb(k_cache_view, "k_cache_view", il);
- // offload functions set the tensor output backend to GPU
- // tensors are GPU-accelerated if any input or the output has been offloaded
- offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
- offload_func_t offload_func_kq = llama_nop;
- offload_func_t offload_func_v = llama_nop;
+ struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv_self.v, n_tokens, n_embd_gqa,
+ ( n_ctx)*ggml_element_size(kv_self.v),
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
+ cb(v_cache_view, "v_cache_view", il);
-#ifdef GGML_USE_CUBLAS
- if (n_gpu_layers > n_layer) {
- offload_func_nr = ggml_cuda_assign_buffers_no_alloc;
- }
- if (n_gpu_layers > n_layer + 1) {
- offload_func_v = ggml_cuda_assign_buffers_no_alloc;
- }
- if (n_gpu_layers > n_layer + 2) {
- offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
- }
-#endif // GGML_USE_CUBLAS
+ // important: storing RoPE-ed version of K in the KV cache!
+ ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view));
+ ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur_t, v_cache_view));
+}
- // KQ_scale
- struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
- ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
- ggml_allocr_alloc(lctx.alloc, KQ_scale);
- if (!ggml_allocr_is_measure(lctx.alloc)) {
- ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd_head)));
- }
+enum llm_norm_type {
+ LLM_NORM,
+ LLM_NORM_RMS,
+};
- // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
- struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
- offload_func_kq(KQ_mask);
- ggml_set_name(KQ_mask, "KQ_mask");
- ggml_allocr_alloc(lctx.alloc, KQ_mask);
- if (!ggml_allocr_is_measure(lctx.alloc)) {
- float * data = (float *) KQ_mask->data;
- memset(data, 0, ggml_nbytes(KQ_mask));
-
- for (int h = 0; h < 1; ++h) {
- for (int j = 0; j < n_tokens; ++j) {
- const llama_pos pos = batch.pos[j];
- const llama_seq_id seq_id = batch.seq_id[j][0];
-
- for (int i = 0; i < n_kv; ++i) {
- if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
- data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
- }
- }
- }
- }
+static struct ggml_tensor * llm_build_norm(
+ struct ggml_context * ctx,
+ struct ggml_tensor * cur,
+ struct ggml_tensor * mw,
+ struct ggml_tensor * mb,
+ llm_norm_type type,
+ float eps,
+ const llm_build_cb & cb,
+ int il) {
+ switch (type) {
+ case LLM_NORM: cur = ggml_norm (ctx, cur, eps); break;
+ case LLM_NORM_RMS: cur = ggml_rms_norm(ctx, cur, eps); break;
}
- // KQ_pos - contains the positions
- struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
- offload_func_kq(KQ_pos);
- ggml_set_name(KQ_pos, "KQ_pos");
- ggml_allocr_alloc(lctx.alloc, KQ_pos);
- if (!ggml_allocr_is_measure(lctx.alloc)) {
- int * data = (int *) KQ_pos->data;
- for (int i = 0; i < n_tokens; ++i) {
- data[i] = batch.pos[i];
- }
+ if (mw || mb) {
+ cb(cur, "norm", il);
}
- // shift the entire K-cache if needed
- if (do_rope_shift) {
- struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
- offload_func_kq(K_shift);
- ggml_set_name(K_shift, "K_shift");
- ggml_allocr_alloc(lctx.alloc, K_shift);
- if (!ggml_allocr_is_measure(lctx.alloc)) {
- int * data = (int *) K_shift->data;
- for (int i = 0; i < n_ctx; ++i) {
- data[i] = kv_self.cells[i].delta;
- }
- }
-
- for (int il = 0; il < n_layer; ++il) {
- struct ggml_tensor * tmp =
- ggml_rope_custom_inplace(ctx0,
- ggml_view_3d(ctx0, kv_self.k,
- n_embd_head, n_head_kv, n_ctx,
- ggml_element_size(kv_self.k)*n_embd_head,
- ggml_element_size(kv_self.k)*n_embd_gqa,
- ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il),
- K_shift, n_embd_head, 0, 0, freq_base, freq_scale);
- offload_func_kq(tmp);
- ggml_build_forward_expand(gf, tmp);
+ if (mw) {
+ cur = ggml_mul(ctx, cur, mw);
+ if (mb) {
+ cb(cur, "norm_w", il);
}
}
- for (int il = 0; il < n_layer; ++il) {
- ggml_format_name(inpL, "layer_inp_%d", il);
-
- offload_func_t offload_func = llama_nop;
+ if (mb) {
+ cur = ggml_add(ctx, cur, mb);
+ }
-#ifdef GGML_USE_CUBLAS
- if (il >= i_gpu_start) {
- offload_func = ggml_cuda_assign_buffers_no_alloc;
- }
-#endif // GGML_USE_CUBLAS
+ return cur;
+}
- struct ggml_tensor * inpSA = inpL;
+enum llm_ffn_op_type {
+ LLM_FFN_SILU,
+ LLM_FFN_GELU,
+ LLM_FFN_RELU,
+ LLM_FFN_RELU_SQR,
+};
- // norm
- {
- cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps);
- offload_func(cur);
- ggml_set_name(cur, "rms_norm_0");
+enum llm_ffn_gate_type {
+ LLM_FFN_SEQ,
+ LLM_FFN_PAR, // ffn_gate is parallel to ffn_up
+};
- // cur = cur*attn_norm(broadcasted)
- cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm);
- offload_func(cur);
- ggml_set_name(cur, "attention_norm_0");
+static struct ggml_tensor * llm_build_ffn(
+ struct ggml_context * ctx,
+ struct ggml_tensor * cur,
+ struct ggml_tensor * up,
+ struct ggml_tensor * up_b,
+ struct ggml_tensor * gate,
+ struct ggml_tensor * gate_b,
+ struct ggml_tensor * down,
+ struct ggml_tensor * down_b,
+ llm_ffn_op_type type_op,
+ llm_ffn_gate_type type_gate,
+ const llm_build_cb & cb,
+ int il) {
+ struct ggml_tensor * tmp = ggml_mul_mat(ctx, up, cur);
+ cb(tmp, "ffn_up", il);
+
+ if (up_b) {
+ tmp = ggml_add(ctx, tmp, up_b);
+ cb(tmp, "ffn_up_b", il);
+ }
+
+ if (gate) {
+ switch (type_gate) {
+ case LLM_FFN_SEQ:
+ {
+ cur = ggml_mul_mat(ctx, gate, tmp);
+ cb(cur, "ffn_gate", il);
+ } break;
+ case LLM_FFN_PAR:
+ {
+ cur = ggml_mul_mat(ctx, gate, cur);
+ cb(cur, "ffn_gate", il);
+ } break;
}
- // self-attention
- {
- // compute Q and K and RoPE them
- struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
- offload_func_kq(tmpk);
- ggml_set_name(tmpk, "tmpk");
-
- struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
- offload_func_kq(tmpq);
- ggml_set_name(tmpq, "tmpq");
-
- struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale);
- offload_func_kq(Kcur);
- ggml_set_name(Kcur, "Kcur");
-
- struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale);
- offload_func_kq(Qcur);
- ggml_set_name(Qcur, "Qcur");
+ if (gate_b) {
+ cur = ggml_add(ctx, cur, gate_b);
+ cb(cur, "ffn_gate_b", il);
+ }
+ } else {
+ cur = tmp;
+ }
- // store key and value to memory
+ switch (type_op) {
+ case LLM_FFN_SILU:
{
- // compute the transposed [n_tokens, n_embd] V matrix
-
- struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
- offload_func_v(tmpv);
- ggml_set_name(tmpv, "tmpv");
-
- struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens));
- offload_func_v(Vcur);
- ggml_set_name(Vcur, "Vcur");
+ cur = ggml_silu(ctx, cur);
+ cb(cur, "ffn_silu", il);
+ } break;
+ case LLM_FFN_GELU:
+ {
+ cur = ggml_gelu(ctx, cur);
+ cb(cur, "ffn_gelu", il);
+ } break;
+ case LLM_FFN_RELU:
+ {
+ cur = ggml_relu(ctx, cur);
+ cb(cur, "ffn_relu", il);
+ } break;
+ case LLM_FFN_RELU_SQR:
+ {
+ cur = ggml_relu(ctx, cur);
+ cb(cur, "ffn_relu", il);
- struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
- offload_func_kq(k);
- ggml_set_name(k, "k");
+ cur = ggml_sqr(ctx, cur);
+ cb(cur, "ffn_sqr(relu)", il);
+ } break;
+ }
- struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
- ( n_ctx)*ggml_element_size(kv_self.v),
- (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
- offload_func_v(v);
- ggml_set_name(v, "v");
+ if (type_gate == LLM_FFN_PAR) {
+ cur = ggml_mul(ctx, cur, tmp);
+ cb(cur, "ffn_gate_par", il);
+ }
- // important: storing RoPE-ed version of K in the KV cache!
- ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
- ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
- }
+ cur = ggml_mul_mat(ctx, down, cur);
+ if (down_b) {
+ cb(cur, "ffn_down", il);
+ }
- struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
- offload_func_kq(Q);
- ggml_set_name(Q, "Q");
+ if (down_b) {
+ cur = ggml_add(ctx, cur, down_b);
+ }
- struct ggml_tensor * K =
- ggml_view_3d(ctx0, kv_self.k,
- n_embd_head, n_kv, n_head_kv,
- ggml_element_size(kv_self.k)*n_embd_gqa,
- ggml_element_size(kv_self.k)*n_embd_head,
- ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
- offload_func_kq(K);
- ggml_set_name(K, "K");
-
- // K * Q
- struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
- offload_func_kq(KQ);
- ggml_set_name(KQ, "KQ");
-
- // KQ_scaled = KQ / sqrt(n_embd_head)
- // KQ_scaled shape [n_kv, n_tokens, n_head, 1]
- struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
- offload_func_kq(KQ_scaled);
- ggml_set_name(KQ_scaled, "KQ_scaled");
-
- // KQ_masked = mask_past(KQ_scaled)
- struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
- offload_func_kq(KQ_masked);
- ggml_set_name(KQ_masked, "KQ_masked");
-
- // KQ = soft_max(KQ_masked)
- struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
- offload_func_v(KQ_soft_max);
- ggml_set_name(KQ_soft_max, "KQ_soft_max");
-
- // split cached V into n_head heads
- struct ggml_tensor * V =
- ggml_view_3d(ctx0, kv_self.v,
- n_kv, n_embd_head, n_head_kv,
- ggml_element_size(kv_self.v)*n_ctx,
- ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
- ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
- offload_func_v(V);
- ggml_set_name(V, "V");
-
-#if 1
- struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
- offload_func_v(KQV);
- ggml_set_name(KQV, "KQV");
-#else
- // make V contiguous in memory to speed up the matmul, however we waste time on the copy
- // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
- // is there a better way?
- struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_ctx, n_embd_head, n_head));
- struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
-#endif
+ return cur;
+}
- // KQV_merged = KQV.permute(0, 2, 1, 3)
- struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
- offload_func_v(KQV_merged);
- ggml_set_name(KQV_merged, "KQV_merged");
+// if max_alibi_bias > 0 then apply ALiBi
+static struct ggml_tensor * llm_build_kqv(
+ const llama_context & lctx,
+ struct ggml_context * ctx,
+ struct ggml_tensor * cur,
+ struct ggml_tensor * wo,
+ struct ggml_tensor * wo_b,
+ struct ggml_tensor * q_cur,
+ struct ggml_tensor * kq_scale,
+ struct ggml_tensor * kq_mask,
+ int32_t n_tokens,
+ int32_t n_kv,
+ float alibi_bias_max,
+ const llm_build_cb & cb,
+ int il) {
+ const auto & model = lctx.model;
+ const auto & kv_self = lctx.kv_self;
+ const auto & cparams = lctx.cparams;
- // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
- cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
- offload_func_v(cur);
- ggml_set_name(cur, "KQV_merged_contiguous");
+ const auto & hparams = model.hparams;
- // projection (no bias)
- cur = ggml_mul_mat(ctx0,
- model.layers[il].wo,
- cur);
- offload_func(cur);
- ggml_set_name(cur, "result_wo");
- }
+ const int64_t n_ctx = cparams.n_ctx;
+ const int64_t n_embd = hparams.n_embd;
+ const int64_t n_head = hparams.n_head;
+ const int64_t n_head_kv = hparams.n_head_kv;
+ const int64_t n_embd_head = hparams.n_embd_head();
+ const int64_t n_embd_gqa = hparams.n_embd_gqa();
- struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
- offload_func(inpFF);
- ggml_set_name(inpFF, "inpFF");
+ struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3);
+ cb(q, "q", il);
- // feed-forward network
- {
- // norm
- {
- cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps);
- offload_func(cur);
- ggml_set_name(cur, "rms_norm_1");
-
- // cur = cur*ffn_norm(broadcasted)
- cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
- offload_func(cur);
- ggml_set_name(cur, "ffn_norm");
- }
+ struct ggml_tensor * k =
+ ggml_view_3d(ctx, kv_self.k,
+ n_embd_head, n_kv, n_head_kv,
+ ggml_element_size(kv_self.k)*n_embd_gqa,
+ ggml_element_size(kv_self.k)*n_embd_head,
+ ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
+ cb(k, "k", il);
- struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
- model.layers[il].w3,
- cur);
- offload_func(tmp);
- ggml_set_name(tmp, "result_w3");
+ struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
+ cb(kq, "kq", il);
- cur = ggml_mul_mat(ctx0,
- model.layers[il].w1,
- cur);
- offload_func(cur);
- ggml_set_name(cur, "result_w1");
+ kq = ggml_scale(ctx, kq, kq_scale);
+ cb(kq, "kq_scaled", il);
- // SILU activation
- cur = ggml_silu(ctx0, cur);
- offload_func(cur);
- ggml_set_name(cur, "silu");
+ if (alibi_bias_max > 0.0f) {
+ // TODO: n_head or n_head_kv
+ // TODO: K-shift is likely not working
+ // TODO: change to ggml_add
+ kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, alibi_bias_max);
+ cb(kq, "kq_scaled_alibi", il);
+ }
- cur = ggml_mul(ctx0, cur, tmp);
- offload_func(cur);
- ggml_set_name(cur, "silu_x_result_w3");
+ kq = ggml_add(ctx, kq, kq_mask);
+ cb(kq, "kq_masked", il);
- cur = ggml_mul_mat(ctx0,
- model.layers[il].w2,
- cur);
- offload_func(cur);
- ggml_set_name(cur, "result_w2");
- }
+ kq = ggml_soft_max(ctx, kq);
+ cb(kq, "kq_soft_max", il);
- cur = ggml_add(ctx0, cur, inpFF);
- offload_func(cur);
- ggml_set_name(cur, "inpFF_+_result_w2");
+ // split cached v into n_head heads
+ struct ggml_tensor * v =
+ ggml_view_3d(ctx, kv_self.v,
+ n_kv, n_embd_head, n_head_kv,
+ ggml_element_size(kv_self.v)*n_ctx,
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
+ cb(v, "v", il);
- // input for next layer
- inpL = cur;
- }
+ struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq);
+ cb(kqv, "kqv", il);
- cur = inpL;
+ struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
+ cb(kqv_merged, "kqv_merged", il);
- // norm
- {
- cur = ggml_rms_norm(ctx0, cur, norm_rms_eps);
- offload_func_nr(cur);
- ggml_set_name(cur, "rms_norm_2");
+ cur = ggml_cont_2d(ctx, kqv_merged, n_embd, n_tokens);
+ cb(cur, "kqv_merged_cont", il);
- // cur = cur*norm(broadcasted)
- cur = ggml_mul(ctx0, cur, model.output_norm);
- // offload_func_nr(cur); // TODO CPU + GPU mirrored backend
- ggml_set_name(cur, "result_norm");
+ cur = ggml_mul_mat(ctx, wo, cur);
+ if (wo_b) {
+ cb(cur, "kqv_wo", il);
}
- // lm_head
- cur = ggml_mul_mat(ctx0, model.output, cur);
- ggml_set_name(cur, "result_output");
-
- ggml_build_forward_expand(gf, cur);
-
- ggml_free(ctx0);
+ if (wo_b) {
+ cur = ggml_add(ctx, cur, wo_b);
+ }
- return gf;
+ return cur;
}
-static struct ggml_cgraph * llm_build_baichaun(
- llama_context & lctx,
- const llama_batch & batch) {
+static struct ggml_cgraph * llm_build_llama(
+ llama_context & lctx,
+ const llama_batch & batch,
+ const llm_build_cb & cb,
+ bool worst_case) {
const auto & model = lctx.model;
const auto & hparams = model.hparams;
const auto & cparams = lctx.cparams;
@@ -3494,7 +3457,6 @@ static struct ggml_cgraph * llm_build_baichaun(
const int64_t n_head = hparams.n_head;
const int64_t n_head_kv = hparams.n_head_kv;
const int64_t n_embd_head = hparams.n_embd_head();
- const int64_t n_embd_gqa = hparams.n_embd_gqa();
GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -3502,13 +3464,13 @@ static struct ggml_cgraph * llm_build_baichaun(
const float freq_scale = cparams.rope_freq_scale;
const float norm_rms_eps = hparams.f_norm_rms_eps;
- const int n_gpu_layers = model.n_gpu_layers;
-
const int32_t n_tokens = batch.n_tokens;
- const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
- const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
+ const int32_t n_kv = worst_case ? n_ctx : kv_self.n;
+ const int32_t kv_head = worst_case ? n_ctx - n_tokens : kv_self.head;
- const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
+ const bool do_rope_shift = worst_case || kv_self.has_shift;
+
+ //printf("n_kv = %d\n", n_kv);
auto & buf_compute = lctx.buf_compute;
@@ -3525,331 +3487,81 @@ static struct ggml_cgraph * llm_build_baichaun(
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
- if (batch.token) {
- struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
-
- ggml_allocr_alloc(lctx.alloc, inp_tokens);
- if (!ggml_allocr_is_measure(lctx.alloc)) {
- memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
- }
- ggml_set_name(inp_tokens, "inp_tokens");
-
- inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
- } else {
-#ifdef GGML_USE_MPI
- GGML_ASSERT(false && "not implemented");
-#endif
-
- inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
-
- ggml_allocr_alloc(lctx.alloc, inpL);
- if (!ggml_allocr_is_measure(lctx.alloc)) {
- memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
- }
- }
+ inpL = llm_build_inp_embd(ctx0, batch, model.tok_embd, n_embd, n_tokens, cb);
+ cb(inpL, "inp_embd", -1);
- const int i_gpu_start = n_layer - n_gpu_layers;
- (void) i_gpu_start;
-
- // offload functions set the tensor output backend to GPU
- // tensors are GPU-accelerated if any input or the output has been offloaded
- offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
- offload_func_t offload_func_kq = llama_nop;
- offload_func_t offload_func_v = llama_nop;
-
-#ifdef GGML_USE_CUBLAS
- if (n_gpu_layers > n_layer) {
- offload_func_nr = ggml_cuda_assign_buffers_no_alloc;
- }
- if (n_gpu_layers > n_layer + 1) {
- offload_func_v = ggml_cuda_assign_buffers_no_alloc;
- }
- if (n_gpu_layers > n_layer + 2) {
- offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
- }
-#endif // GGML_USE_CUBLAS
+ // inp_pos - contains the positions
+ struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+ cb(inp_pos, "inp_pos", -1);
// KQ_scale
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
- ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
- ggml_allocr_alloc(lctx.alloc, KQ_scale);
- if (!ggml_allocr_is_measure(lctx.alloc)) {
- ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
- }
+ cb(KQ_scale, "KQ_scale", -1);
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
- offload_func_kq(KQ_mask);
- ggml_set_name(KQ_mask, "KQ_mask");
- ggml_allocr_alloc(lctx.alloc, KQ_mask);
- if (!ggml_allocr_is_measure(lctx.alloc)) {
- float * data = (float *) KQ_mask->data;
- memset(data, 0, ggml_nbytes(KQ_mask));
-
- for (int h = 0; h < 1; ++h) {
- for (int j = 0; j < n_tokens; ++j) {
- const llama_pos pos = batch.pos[j];
- const llama_seq_id seq_id = batch.seq_id[j][0];
-
- for (int i = 0; i < n_kv; ++i) {
- if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
- data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
- }
- }
- }
- }
- }
-
- // KQ_pos - contains the positions
- struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
- offload_func_kq(KQ_pos);
- ggml_set_name(KQ_pos, "KQ_pos");
- ggml_allocr_alloc(lctx.alloc, KQ_pos);
- if (!ggml_allocr_is_measure(lctx.alloc)) {
- int * data = (int *) KQ_pos->data;
- for (int i = 0; i < n_tokens; ++i) {
- data[i] = batch.pos[i];
- }
- }
+ cb(KQ_mask, "KQ_mask", -1);
// shift the entire K-cache if needed
if (do_rope_shift) {
- struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
- offload_func_kq(K_shift);
- ggml_set_name(K_shift, "K_shift");
- ggml_allocr_alloc(lctx.alloc, K_shift);
- if (!ggml_allocr_is_measure(lctx.alloc)) {
- int * data = (int *) K_shift->data;
- for (int i = 0; i < n_ctx; ++i) {
- data[i] = kv_self.cells[i].delta;
- }
- }
-
- for (int il = 0; il < n_layer; ++il) {
- struct ggml_tensor * tmp =
- ggml_rope_custom_inplace(ctx0,
- ggml_view_3d(ctx0, kv_self.k,
- n_embd_head, n_head_kv, n_ctx,
- ggml_element_size(kv_self.k)*n_embd_head,
- ggml_element_size(kv_self.k)*n_embd_gqa,
- ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il),
- K_shift, n_embd_head, 0, 0, freq_base, freq_scale);
- offload_func_kq(tmp);
- ggml_build_forward_expand(gf, tmp);
- }
+ llm_build_k_shift(lctx, ctx0, gf, n_embd_head, LLM_ROPE, cb);
}
for (int il = 0; il < n_layer; ++il) {
- ggml_format_name(inpL, "layer_inp_%d", il);
-
- offload_func_t offload_func = llama_nop;
-
-#ifdef GGML_USE_CUBLAS
- if (il >= i_gpu_start) {
- offload_func = ggml_cuda_assign_buffers_no_alloc;
- }
-#endif // GGML_USE_CUBLAS
-
struct ggml_tensor * inpSA = inpL;
// norm
- {
- cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps);
- offload_func(cur);
- ggml_set_name(cur, "rms_norm_0");
-
- // cur = cur*attn_norm(broadcasted)
- cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm);
- offload_func(cur);
- ggml_set_name(cur, "attention_norm_0");
- }
+ cur = llm_build_norm(ctx0, inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, norm_rms_eps, cb, il);
+ cb(cur, "attn_norm", il);
// self-attention
{
// compute Q and K and RoPE them
- struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
- offload_func_kq(tmpk);
- ggml_set_name(tmpk, "tmpk");
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
- struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
- offload_func_kq(tmpq);
- ggml_set_name(tmpq, "tmpq");
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
- struct ggml_tensor * Kcur;
- struct ggml_tensor * Qcur;
- switch (model.type) {
- case MODEL_7B:
- Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale);
- Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale);
- break;
- case MODEL_13B:
- Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, n_tokens);
- Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, n_tokens);
- break;
- default:
- GGML_ASSERT(false);
- }
-
- offload_func_kq(Kcur);
- ggml_set_name(Kcur, "Kcur");
-
- offload_func_kq(Qcur);
- ggml_set_name(Qcur, "Qcur");
-
- // store key and value to memory
- {
- // compute the transposed [n_tokens, n_embd] V matrix
-
- struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
- offload_func_v(tmpv);
- ggml_set_name(tmpv, "tmpv");
-
- struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens));
- offload_func_v(Vcur);
- ggml_set_name(Vcur, "Vcur");
-
- struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
- offload_func_kq(k);
- ggml_set_name(k, "k");
-
- struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
- ( n_ctx)*ggml_element_size(kv_self.v),
- (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
- offload_func_v(v);
- ggml_set_name(v, "v");
-
- // important: storing RoPE-ed version of K in the KV cache!
- ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
- ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
- }
-
- struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
- offload_func_kq(Q);
- ggml_set_name(Q, "Q");
-
- struct ggml_tensor * K =
- ggml_view_3d(ctx0, kv_self.k,
- n_embd_head, n_kv, n_head_kv,
- ggml_element_size(kv_self.k)*n_embd_gqa,
- ggml_element_size(kv_self.k)*n_embd_head,
- ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
- offload_func_kq(K);
- ggml_set_name(K, "K");
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
- // K * Q
- struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
- offload_func_kq(KQ);
- ggml_set_name(KQ, "KQ");
+ Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale);
+ cb(Qcur, "Qcur", il);
- // KQ_scaled = KQ / sqrt(n_embd_head)
- // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
- struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
- offload_func_kq(KQ_scaled);
- ggml_set_name(KQ_scaled, "KQ_scaled");
+ Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale);
+ cb(Kcur, "Kcur", il);
- struct ggml_tensor * KQ_masked;
- struct ggml_tensor * KQ_scaled_alibi;
+ llm_build_kv_store(lctx, ctx0, gf, Kcur, Vcur, n_tokens, kv_head, cb, il);
- switch (model.type) {
- case MODEL_7B:
- KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
- break;
- case MODEL_13B:
- // TODO: replace with ggml_add()
- KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ 0, n_head, 8);
- ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
- KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
- break;
- default:
- GGML_ASSERT(false);
- }
+ cur = llm_build_kqv(lctx, ctx0, cur,
+ model.layers[il].wo, NULL,
+ Qcur, KQ_scale, KQ_mask, n_tokens, n_kv, -1.0f, cb, il);
+ cb(cur, "kqv_out", il);
+ }
- // KQ = soft_max(KQ_masked)
- struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
- offload_func_v(KQ_soft_max);
- ggml_set_name(KQ_soft_max, "KQ_soft_max");
-
- // split cached V into n_head heads
- struct ggml_tensor * V =
- ggml_view_3d(ctx0, kv_self.v,
- n_kv, n_embd_head, n_head_kv,
- ggml_element_size(kv_self.v)*n_ctx,
- ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
- ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
- offload_func_v(V);
- ggml_set_name(V, "V");
-
- struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
- offload_func_v(KQV);
- ggml_set_name(KQV, "KQV");
-
- // KQV_merged = KQV.permute(0, 2, 1, 3)
- struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
- offload_func_v(KQV_merged);
- ggml_set_name(KQV_merged, "KQV_merged");
-
- // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
- cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
- offload_func_v(cur);
- ggml_set_name(cur, "KQV_merged_contiguous");
-
- // projection (no bias)
- cur = ggml_mul_mat(ctx0,
- model.layers[il].wo,
- cur);
- offload_func(cur);
- ggml_set_name(cur, "result_wo");
- }
-
- struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
- offload_func(inpFF);
- ggml_set_name(inpFF, "inpFF");
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
// feed-forward network
{
- // norm
- {
- cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps);
- offload_func(cur);
- ggml_set_name(cur, "rms_norm_1");
-
- // cur = cur*ffn_norm(broadcasted)
- cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
- offload_func(cur);
- ggml_set_name(cur, "ffn_norm");
- }
-
- struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
- model.layers[il].w3,
- cur);
- offload_func(tmp);
- ggml_set_name(tmp, "result_w3");
-
- cur = ggml_mul_mat(ctx0,
- model.layers[il].w1,
- cur);
- offload_func(cur);
- ggml_set_name(cur, "result_w1");
+ cur = llm_build_norm(ctx0, ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, norm_rms_eps, cb, il);
+ cb(cur, "ffn_norm", il);
- // SILU activation
- cur = ggml_silu(ctx0, cur);
- offload_func(cur);
- ggml_set_name(cur, "silu");
-
- cur = ggml_mul(ctx0, cur, tmp);
- offload_func(cur);
- ggml_set_name(cur, "silu_x_result_w3");
-
- cur = ggml_mul_mat(ctx0,
- model.layers[il].w2,
- cur);
- offload_func(cur);
- ggml_set_name(cur, "result_w2");
+ cur = llm_build_ffn(ctx0, cur,
+ model.layers[il].ffn_up, NULL,
+ model.layers[il].ffn_gate, NULL,
+ model.layers[il].ffn_down, NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+ cb(cur, "ffn_out", il);
}
- cur = ggml_add(ctx0, cur, inpFF);
- offload_func(cur);
- ggml_set_name(cur, "inpFF_+_result_w2");
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "l_out", il);
// input for next layer
inpL = cur;
@@ -3857,21 +3569,14 @@ static struct ggml_cgraph * llm_build_baichaun(
cur = inpL;
- // norm
- {
- cur = ggml_rms_norm(ctx0, cur, norm_rms_eps);
- offload_func_nr(cur);
- ggml_set_name(cur, "rms_norm_2");
-
- // cur = cur*norm(broadcasted)
- cur = ggml_mul(ctx0, cur, model.output_norm);
- // offload_func_nr(cur); // TODO CPU + GPU mirrored backend
- ggml_set_name(cur, "result_norm");
- }
+ cur = llm_build_norm(ctx0, cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, norm_rms_eps, cb, -1);
+ cb(cur, "result_norm", -1);
// lm_head
cur = ggml_mul_mat(ctx0, model.output, cur);
- ggml_set_name(cur, "result_output");
+ cb(cur, "result_output", -1);
ggml_build_forward_expand(gf, cur);
@@ -3880,9 +3585,11 @@ static struct ggml_cgraph * llm_build_baichaun(
return gf;
}
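Taken together, a graph builder is now invoked with an explicit per-tensor callback and a worst-case flag instead of probing the allocator internally. A hypothetical call site (the callback body and the measurement check are illustrative assumptions):

    llm_build_cb cb = [](struct ggml_tensor * cur, const char * name, int nl) {
        if (nl >= 0) {
            ggml_format_name(cur, "%s-%d", name, nl); // per-layer tensors, e.g. "attn_norm-7"
        } else {
            ggml_set_name(cur, name);                 // graph-wide tensors, e.g. "inp_embd"
        }
    };

    const bool worst_case = ggml_allocr_is_measure(lctx.alloc);

    struct ggml_cgraph * gf = llm_build_llama(lctx, batch, cb, worst_case);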
-static struct ggml_cgraph * llm_build_refact(
+static struct ggml_cgraph * llm_build_baichaun(
llama_context & lctx,
- const llama_batch & batch) {
+ const llama_batch & batch,
+ const llm_build_cb & cb,
+ bool worst_case) {
const auto & model = lctx.model;
const auto & hparams = model.hparams;
const auto & cparams = lctx.cparams;
@@ -3897,17 +3604,18 @@ static struct ggml_cgraph * llm_build_refact(
const int64_t n_head = hparams.n_head;
const int64_t n_head_kv = hparams.n_head_kv;
const int64_t n_embd_head = hparams.n_embd_head();
- const int64_t n_embd_gqa = hparams.n_embd_gqa();
- const float norm_rms_eps = hparams.f_norm_rms_eps;
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
- const int n_gpu_layers = model.n_gpu_layers;
+ const float freq_base = cparams.rope_freq_base;
+ const float freq_scale = cparams.rope_freq_scale;
+ const float norm_rms_eps = hparams.f_norm_rms_eps;
const int32_t n_tokens = batch.n_tokens;
- const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
- const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
+ const int32_t n_kv = worst_case ? n_ctx : kv_self.n;
+ const int32_t kv_head = worst_case ? n_ctx - n_tokens : kv_self.head;
- // printf("n_kv = %d\n", n_kv);
+ const bool do_rope_shift = worst_case || kv_self.has_shift;
auto & buf_compute = lctx.buf_compute;
@@ -3924,277 +3632,91 @@ static struct ggml_cgraph * llm_build_refact(
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
- if (batch.token) {
- struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
-
- ggml_allocr_alloc(lctx.alloc, inp_tokens);
- if (!ggml_allocr_is_measure(lctx.alloc)) {
- memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
- }
- ggml_set_name(inp_tokens, "inp_tokens");
-
- inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
- } else {
-#ifdef GGML_USE_MPI
- GGML_ASSERT(false && "not implemented");
-#endif
-
- inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
-
- ggml_allocr_alloc(lctx.alloc, inpL);
- if (!ggml_allocr_is_measure(lctx.alloc)) {
- memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
- }
- }
+ inpL = llm_build_inp_embd(ctx0, batch, model.tok_embd, n_embd, n_tokens, cb);
+ cb(inpL, "inp_embd", -1);
- const int i_gpu_start = n_layer - n_gpu_layers;
- (void) i_gpu_start;
-
- // offload functions set the tensor output backend to GPU
- // tensors are GPU-accelerated if any input or the output has been offloaded
- offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
- offload_func_t offload_func_kq = llama_nop;
- offload_func_t offload_func_v = llama_nop;
-
-#ifdef GGML_USE_CUBLAS
- if (n_gpu_layers > n_layer) {
- offload_func_nr = ggml_cuda_assign_buffers_no_alloc;
- }
- if (n_gpu_layers > n_layer + 1) {
- offload_func_v = ggml_cuda_assign_buffers_no_alloc;
- }
- if (n_gpu_layers > n_layer + 2) {
- offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
- }
-#endif // GGML_USE_CUBLAS
+ // inp_pos - contains the positions
+ struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+ cb(inp_pos, "inp_pos", -1);
// KQ_scale
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
- ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
- ggml_allocr_alloc(lctx.alloc, KQ_scale);
- if (!ggml_allocr_is_measure(lctx.alloc)) {
- ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd_head)));
- }
+ cb(KQ_scale, "KQ_scale", -1);
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
- offload_func_kq(KQ_mask);
- ggml_set_name(KQ_mask, "KQ_mask");
- ggml_allocr_alloc(lctx.alloc, KQ_mask);
- if (!ggml_allocr_is_measure(lctx.alloc)) {
- float * data = (float *) KQ_mask->data;
- memset(data, 0, ggml_nbytes(KQ_mask));
-
- for (int h = 0; h < 1; ++h) {
- for (int j = 0; j < n_tokens; ++j) {
- const llama_pos pos = batch.pos[j];
- const llama_seq_id seq_id = batch.seq_id[j][0];
-
- for (int i = 0; i < n_kv; ++i) {
- if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
- data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
- }
- }
- }
- }
+ cb(KQ_mask, "KQ_mask", -1);
+
+ // shift the entire K-cache if needed
+ if (do_rope_shift) {
+ llm_build_k_shift(lctx, ctx0, gf, n_embd_head, LLM_ROPE, cb);
}
for (int il = 0; il < n_layer; ++il) {
- ggml_format_name(inpL, "layer_inp_%d", il);
-
- offload_func_t offload_func = llama_nop;
-
-#ifdef GGML_USE_CUBLAS
- if (il >= i_gpu_start) {
- offload_func = ggml_cuda_assign_buffers_no_alloc;
- }
-#endif // GGML_USE_CUBLAS
-
struct ggml_tensor * inpSA = inpL;
- // norm
- {
- cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps);
- offload_func(cur);
- ggml_set_name(cur, "rms_norm_0");
-
- // cur = cur*attn_norm(broadcasted)
- cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm);
- offload_func(cur);
- ggml_set_name(cur, "attention_norm_0");
- }
+ cur = llm_build_norm(ctx0, inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, norm_rms_eps, cb, il);
+ cb(cur, "attn_norm", il);
// self-attention
{
- // compute Q and K
- struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
- offload_func_kq(tmpk);
- ggml_set_name(tmpk, "tmpk");
-
- struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
- offload_func_kq(tmpq);
- ggml_set_name(tmpq, "tmpq");
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
- struct ggml_tensor * Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens);
- offload_func_kq(Kcur);
- ggml_set_name(Kcur, "Kcur");
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
- struct ggml_tensor * Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens);
- offload_func_kq(Qcur);
- ggml_set_name(Qcur, "Qcur");
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
- // store key and value to memory
- {
- // compute the transposed [n_tokens, n_embd] V matrix
-
- struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
- offload_func_v(tmpv);
- ggml_set_name(tmpv, "tmpv");
-
- struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens));
- offload_func_v(Vcur);
- ggml_set_name(Vcur, "Vcur");
-
- struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
- offload_func_kq(k);
- ggml_set_name(k, "k");
-
- struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
- ( n_ctx)*ggml_element_size(kv_self.v),
- (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
- offload_func_v(v);
- ggml_set_name(v, "v");
-
- ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
- ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
+ switch (model.type) {
+ case MODEL_7B:
+ Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale);
+ Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale);
+ break;
+ case MODEL_13B:
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd/n_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd/n_head, n_head, n_tokens);
+ break;
+ default:
+ GGML_ASSERT(false);
}
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
- struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
- offload_func_kq(Q);
- ggml_set_name(Q, "Q");
+ llm_build_kv_store(lctx, ctx0, gf, Kcur, Vcur, n_tokens, kv_head, cb, il);
- struct ggml_tensor * K =
- ggml_view_3d(ctx0, kv_self.k,
- n_embd_head, n_kv, n_head_kv,
- ggml_element_size(kv_self.k)*n_embd_gqa,
- ggml_element_size(kv_self.k)*n_embd_head,
- ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
- offload_func_kq(K);
- ggml_set_name(K, "K");
-
- // K * Q
- struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
- offload_func_kq(KQ);
- ggml_set_name(KQ, "KQ");
-
- // KQ_scaled = KQ / sqrt(n_embd_head)
- // KQ_scaled shape [n_kv, n_tokens, n_head, 1]
- struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
- offload_func_kq(KQ_scaled);
- ggml_set_name(KQ_scaled, "KQ_scaled");
-
- // KQ_masked = mask_past(KQ_scaled)
- struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ 0, n_head, 8);
- ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
-
- struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
- offload_func_kq(KQ_masked);
- ggml_set_name(KQ_masked, "KQ_masked");
-
- // KQ = soft_max(KQ_masked)
- struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
- offload_func_v(KQ_soft_max);
- ggml_set_name(KQ_soft_max, "KQ_soft_max");
-
- // split cached V into n_head heads
- struct ggml_tensor * V =
- ggml_view_3d(ctx0, kv_self.v,
- n_kv, n_embd_head, n_head_kv,
- ggml_element_size(kv_self.v)*n_ctx,
- ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
- ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
- offload_func_v(V);
- ggml_set_name(V, "V");
-
-#if 1
- struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
- offload_func_v(KQV);
- ggml_set_name(KQV, "KQV");
-#else
- // make V contiguous in memory to speed up the matmul, however we waste time on the copy
- // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
- // is there a better way?
- struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_ctx, n_embd_head, n_head));
- struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
-#endif
-
- // KQV_merged = KQV.permute(0, 2, 1, 3)
- struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
- offload_func_v(KQV_merged);
- ggml_set_name(KQV_merged, "KQV_merged");
+ // apply ALiBi for 13B model
+ const float alibi_bias_max = model.type == MODEL_13B ? 8.0f : -1.0f;
- // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
- cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
- offload_func_v(cur);
- ggml_set_name(cur, "KQV_merged_contiguous");
-
- // projection (no bias)
- cur = ggml_mul_mat(ctx0,
- model.layers[il].wo,
- cur);
- offload_func(cur);
- ggml_set_name(cur, "result_wo");
+ cur = llm_build_kqv(lctx, ctx0, cur,
+ model.layers[il].wo, NULL,
+ Qcur, KQ_scale, KQ_mask, n_tokens, n_kv, alibi_bias_max, cb, il);
+ cb(cur, "kqv_out", il);
}
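The llm_build_kqv call consolidates the attention tail that the removed lines above wrote out by hand. An approximate expansion, pieced together from those removed lines (view strides copied verbatim; max_alibi_bias > 0 is assumed to select the ggml_alibi step, matching the alibi_bias_max value computed above):

    // sketch, not the helper's actual body: K*Q -> scale -> (ALiBi) -> mask -> softmax -> *V -> merge -> wo
    struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
    struct ggml_tensor * K = ggml_view_3d(ctx0, kv_self.k,
            n_embd_head, n_kv, n_head_kv,
            ggml_element_size(kv_self.k)*n_embd_gqa,
            ggml_element_size(kv_self.k)*n_embd_head,
            ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
    struct ggml_tensor * KQ = ggml_scale(ctx0, ggml_mul_mat(ctx0, K, Q), KQ_scale);
    if (alibi_bias_max > 0.0f) {
        KQ = ggml_alibi(ctx0, KQ, /*n_past*/ 0, n_head, alibi_bias_max);  // 8.0f on the 13B path
    }
    KQ = ggml_soft_max(ctx0, ggml_add(ctx0, KQ, KQ_mask));
    struct ggml_tensor * V = ggml_view_3d(ctx0, kv_self.v,
            n_kv, n_embd_head, n_head_kv,
            ggml_element_size(kv_self.v)*n_ctx,
            ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
            ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
    struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
    cur = ggml_cont_2d(ctx0, ggml_permute(ctx0, KQV, 0, 2, 1, 3), n_embd, n_tokens);
    cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);  // plus bias when a wo_b tensor is passed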
- struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
- offload_func(inpFF);
- ggml_set_name(inpFF, "inpFF");
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
// feed-forward network
{
- // norm
- {
- cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps);
- offload_func(cur);
- ggml_set_name(cur, "rms_norm_1");
-
- // cur = cur*ffn_norm(broadcasted)
- cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
- offload_func(cur);
- ggml_set_name(cur, "ffn_norm");
- }
+ cur = llm_build_norm(ctx0, ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, norm_rms_eps, cb, il);
+ cb(cur, "ffn_norm", il);
- struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
- model.layers[il].w3,
- cur);
- offload_func(tmp);
- ggml_set_name(tmp, "result_w3");
-
- cur = ggml_mul_mat(ctx0,
- model.layers[il].w1,
- cur);
- offload_func(cur);
- ggml_set_name(cur, "result_w1");
-
- // SILU activation
- cur = ggml_silu(ctx0, cur);
- offload_func(cur);
- ggml_set_name(cur, "silu");
-
- cur = ggml_mul(ctx0, cur, tmp);
- offload_func(cur);
- ggml_set_name(cur, "silu_x_result_w3");
-
- cur = ggml_mul_mat(ctx0,
- model.layers[il].w2,
- cur);
- offload_func(cur);
- ggml_set_name(cur, "result_w2");
+ cur = llm_build_ffn(ctx0, cur,
+ model.layers[il].ffn_up, NULL,
+ model.layers[il].ffn_gate, NULL,
+ model.layers[il].ffn_down, NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+ cb(cur, "ffn_out", il);
}
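Similarly, llm_build_ffn folds the removed w3/w1/w2 chain into one call. With the LLM_FFN_SILU / LLM_FFN_PAR arguments used here it expands, per the removed lines and the renamed fields (ffn_up=w3, ffn_gate=w1, ffn_down=w2), to roughly:

    // sketch of the gated (parallel) SiLU FFN; the bias tensors are NULL for this model
    struct ggml_tensor * tmp = ggml_mul_mat(ctx0, model.layers[il].ffn_up,   cur);  // w3
    cur = ggml_mul_mat(ctx0, model.layers[il].ffn_gate, cur);                       // w1
    cur = ggml_silu(ctx0, cur);                                                     // LLM_FFN_SILU
    cur = ggml_mul(ctx0, cur, tmp);                                                 // LLM_FFN_PAR: gate applied in parallel with up
    cur = ggml_mul_mat(ctx0, model.layers[il].ffn_down, cur);                       // w2

The LLM_FFN_GELU / LLM_FFN_SEQ combination used by the falcon, starcoder and bloom builders below is instead a plain up -> activation -> down chain, with the *_b bias tensors added after each matmul when they are non-NULL.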
- cur = ggml_add(ctx0, cur, inpFF);
- offload_func(cur);
- ggml_set_name(cur, "inpFF_+_result_w2");
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "l_out", il);
// input for next layer
inpL = cur;
@@ -4202,21 +3724,14 @@ static struct ggml_cgraph * llm_build_refact(
cur = inpL;
- // norm
- {
- cur = ggml_rms_norm(ctx0, cur, norm_rms_eps);
- offload_func_nr(cur);
- ggml_set_name(cur, "rms_norm_2");
-
- // cur = cur*norm(broadcasted)
- cur = ggml_mul(ctx0, cur, model.output_norm);
- // offload_func_nr(cur); // TODO CPU + GPU mirrored backend
- ggml_set_name(cur, "result_norm");
- }
+ cur = llm_build_norm(ctx0, cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, norm_rms_eps, cb, -1);
+ cb(cur, "result_norm", -1);
// lm_head
cur = ggml_mul_mat(ctx0, model.output, cur);
- ggml_set_name(cur, "result_output");
+ cb(cur, "result_output", -1);
ggml_build_forward_expand(gf, cur);
@@ -4227,7 +3742,9 @@ static struct ggml_cgraph * llm_build_refact(
static struct ggml_cgraph * llm_build_falcon(
llama_context & lctx,
- const llama_batch & batch) {
+ const llama_batch & batch,
+ const llm_build_cb & cb,
+ bool worst_case) {
const auto & model = lctx.model;
const auto & hparams = model.hparams;
const auto & cparams = lctx.cparams;
@@ -4250,13 +3767,11 @@ static struct ggml_cgraph * llm_build_falcon(
const float freq_scale = cparams.rope_freq_scale;
const float norm_eps = hparams.f_norm_eps;
- const int n_gpu_layers = model.n_gpu_layers;
-
const int32_t n_tokens = batch.n_tokens;
- const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
- const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
+ const int32_t n_kv = worst_case ? n_ctx : kv_self.n;
+ const int32_t kv_head = worst_case ? n_ctx - n_tokens : kv_self.head;
- const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
+ const bool do_rope_shift = worst_case || kv_self.has_shift;
//printf("kv_head = %d, n_kv = %d, n_tokens = %d, n_ctx = %d, is_measure = %d, has_shift = %d\n",
// kv_head, n_kv, n_tokens, n_ctx, ggml_allocr_is_measure(lctx.alloc), kv_self.has_shift);
@@ -4276,294 +3791,94 @@ static struct ggml_cgraph * llm_build_falcon(
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
- if (batch.token) {
- struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
-
- ggml_allocr_alloc(lctx.alloc, inp_tokens);
- if (!ggml_allocr_is_measure(lctx.alloc)) {
- memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
- }
- ggml_set_name(inp_tokens, "inp_tokens");
-
- inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
- } else {
-#ifdef GGML_USE_MPI
- GGML_ASSERT(false && "not implemented");
-#endif
-
- inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
-
- ggml_allocr_alloc(lctx.alloc, inpL);
- if (!ggml_allocr_is_measure(lctx.alloc)) {
- memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
- }
- }
-
- const int i_gpu_start = n_layer - n_gpu_layers;
- (void) i_gpu_start;
-
- // offload functions set the tensor output backend to GPU
- // tensors are GPU-accelerated if any input or the output has been offloaded
- offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
- offload_func_t offload_func_kq = llama_nop;
- offload_func_t offload_func_v = llama_nop;
+ inpL = llm_build_inp_embd(ctx0, batch, model.tok_embd, n_embd, n_tokens, cb);
+ cb(inpL, "inp_embd", -1);
-#ifdef GGML_USE_CUBLAS
- if (n_gpu_layers > n_layer) {
- offload_func_nr = ggml_cuda_assign_buffers_no_alloc;
- }
- if (n_gpu_layers > n_layer + 1) {
- offload_func_v = ggml_cuda_assign_buffers_no_alloc;
- }
- if (n_gpu_layers > n_layer + 2) {
- offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
- }
-#endif // GGML_USE_CUBLAS
+ // inp_pos - contains the positions
+ struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+ cb(inp_pos, "inp_pos", -1);
// KQ_scale
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
- ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
- ggml_allocr_alloc(lctx.alloc, KQ_scale);
- if (!ggml_allocr_is_measure(lctx.alloc)) {
- ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
- }
+ cb(KQ_scale, "KQ_scale", -1);
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
- offload_func_kq(KQ_mask);
- ggml_set_name(KQ_mask, "KQ_mask");
- ggml_allocr_alloc(lctx.alloc, KQ_mask);
- if (!ggml_allocr_is_measure(lctx.alloc)) {
- float * data = (float *) KQ_mask->data;
- memset(data, 0, ggml_nbytes(KQ_mask));
-
- for (int h = 0; h < 1; ++h) {
- for (int j = 0; j < n_tokens; ++j) {
- const llama_pos pos = batch.pos[j];
- const llama_seq_id seq_id = batch.seq_id[j][0];
-
- for (int i = 0; i < n_kv; ++i) {
- if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
- data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
- }
- }
- }
- }
- }
-
- // KQ_pos - contains the positions
- struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
- offload_func_kq(KQ_pos);
- ggml_set_name(KQ_pos, "KQ_pos");
- ggml_allocr_alloc(lctx.alloc, KQ_pos);
- if (!ggml_allocr_is_measure(lctx.alloc)) {
- int * data = (int *) KQ_pos->data;
- for (int i = 0; i < n_tokens; ++i) {
- data[i] = batch.pos[i];
- }
- }
+ cb(KQ_mask, "KQ_mask", -1);
// shift the entire K-cache if needed
if (do_rope_shift) {
- struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
- offload_func_kq(K_shift);
- ggml_set_name(K_shift, "K_shift");
- ggml_allocr_alloc(lctx.alloc, K_shift);
- if (!ggml_allocr_is_measure(lctx.alloc)) {
- int * data = (int *) K_shift->data;
- for (int i = 0; i < n_ctx; ++i) {
- data[i] = kv_self.cells[i].delta;
- }
- }
-
- for (int il = 0; il < n_layer; ++il) {
- struct ggml_tensor * tmp =
- ggml_rope_custom_inplace(ctx0,
- ggml_view_3d(ctx0, kv_self.k,
- n_embd_head, n_head_kv, n_ctx,
- ggml_element_size(kv_self.k)*n_embd_head,
- ggml_element_size(kv_self.k)*n_embd_gqa,
- ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il),
- K_shift, n_embd_head, 2, 0, freq_base, freq_scale);
- offload_func_kq(tmp);
- ggml_build_forward_expand(gf, tmp);
- }
+ llm_build_k_shift(lctx, ctx0, gf, n_embd_head, LLM_ROPE_NEOX, cb);
}
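llm_build_k_shift stands in for the removed per-layer loop directly above; from those lines it presumably builds the equivalent of:

    // sketch: rotate every layer's cached K by the accumulated cell deltas
    struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);  // filled with kv_self.cells[i].delta
    for (int il = 0; il < n_layer; ++il) {
        struct ggml_tensor * tmp = ggml_rope_custom_inplace(ctx0,
                ggml_view_3d(ctx0, kv_self.k,
                    n_embd_head, n_head_kv, n_ctx,
                    ggml_element_size(kv_self.k)*n_embd_head,
                    ggml_element_size(kv_self.k)*n_embd_gqa,
                    ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il),
                K_shift, n_embd_head, /*LLM_ROPE_NEOX ->*/ 2, 0, freq_base, freq_scale);
        ggml_build_forward_expand(gf, tmp);
    }

The enum argument (LLM_ROPE vs LLM_ROPE_NEOX) presumably just selects the rope mode (0 vs 2), and the persimmon builder passes n_rot instead of n_embd_head because it only rotates the first half of each head.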
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * attn_norm;
- offload_func_t offload_func = llama_nop;
-
-#ifdef GGML_USE_CUBLAS
- if (il >= i_gpu_start) {
- offload_func = ggml_cuda_assign_buffers_no_alloc;
- }
-#endif // GGML_USE_CUBLAS
+ attn_norm = llm_build_norm(ctx0, inpL,
+ model.layers[il].attn_norm,
+ model.layers[il].attn_norm_b,
+ LLM_NORM, norm_eps, cb, il);
+ cb(attn_norm, "attn_norm", il);
// self-attention
- // TODO: refactor into common function (shared with LLaMA)
{
- attn_norm = ggml_norm(ctx0, inpL, norm_eps);
- offload_func(attn_norm);
-
- attn_norm = ggml_add(ctx0,
- ggml_mul(ctx0, attn_norm, model.layers[il].attn_norm),
- model.layers[il].attn_norm_b);
- offload_func(attn_norm->src[0]);
- offload_func(attn_norm);
-
- if (model.layers[il].attn_norm_2) { // Falcon-40B
- cur = ggml_norm(ctx0, inpL, norm_eps);
- offload_func(cur);
-
- cur = ggml_add(ctx0,
- ggml_mul(ctx0, cur, model.layers[il].attn_norm_2),
- model.layers[il].attn_norm_2_b);
- offload_func(cur->src[0]);
- offload_func(cur);
- } else { // Falcon 7B
+ if (model.layers[il].attn_norm_2) {
+ // Falcon-40B
+ cur = llm_build_norm(ctx0, attn_norm,
+ model.layers[il].attn_norm_2,
+ model.layers[il].attn_norm_2_b,
+ LLM_NORM, norm_eps, cb, il);
+ cb(cur, "attn_norm_2", il);
+ } else {
cur = attn_norm;
}
- // compute QKV
-
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
- offload_func_kq(cur);
-
- // Note that the strides for Kcur, Vcur are set up so that the
- // resulting views are misaligned with the tensor's storage
- // (by applying the K/V offset we shift the tensor's original
- // view to stick out behind the viewed QKV tensor's allocated
- // memory, so to say). This is ok because no actual accesses
- // happen to that out-of-range memory, but it can require some
- // trickery when trying to accurately dump these views for
- // debugging.
-
- const size_t wsize = ggml_type_size(cur->type);
-
- // TODO: these 2 ggml_conts are technically not needed, but we add them until CUDA support for
- // non-contiguous views is added for the rope operator
- struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_3d(
- ctx0, cur, n_embd_head, n_head, n_tokens,
- wsize * n_embd_head,
- wsize * n_embd_head * (n_head + 2 * n_head_kv),
- 0));
- offload_func_kq(tmpq);
-
- struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_3d(
- ctx0, cur, n_embd_head, n_head_kv, n_tokens,
- wsize * n_embd_head,
- wsize * n_embd_head * (n_head + 2 * n_head_kv),
- wsize * n_embd_head * n_head));
- offload_func_kq(tmpk);
-
- struct ggml_tensor * tmpv = ggml_view_3d(
- ctx0, cur, n_embd_head, n_head_kv, n_tokens,
- wsize * n_embd_head,
- wsize * n_embd_head * (n_head + 2 * n_head_kv),
- wsize * n_embd_head * (n_head + n_head_kv));
- offload_func_v(tmpv);
-
- // using mode = 2 for neox mode
- struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, tmpq, KQ_pos, n_embd_head, 2, 0, freq_base, freq_scale);
- offload_func_kq(Qcur);
- struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, tmpk, KQ_pos, n_embd_head, 2, 0, freq_base, freq_scale);
- offload_func_kq(Kcur);
-
- {
- struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
- offload_func_v(Vcur);
- offload_func_v(Vcur->src[0]->src[0]);
- ggml_set_name(Vcur, "Vcur");
-
- struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
- offload_func_kq(k);
- ggml_set_name(k, "k");
-
- struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
- ( n_ctx)*ggml_element_size(kv_self.v),
- (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
- offload_func_v(v);
-
- ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
- ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
- }
-
- struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
- offload_func_kq(Q);
- ggml_set_name(Q, "Q");
+ cb(cur, "wqkv", il);
- struct ggml_tensor * K =
- ggml_view_3d(ctx0, kv_self.k,
- n_embd_head, n_kv, n_head_kv,
- ggml_element_size(kv_self.k)*n_embd_gqa,
- ggml_element_size(kv_self.k)*n_embd_head,
- ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
- offload_func_kq(K);
- ggml_set_name(K, "K");
-
- struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
- offload_func_kq(KQ);
- ggml_set_name(KQ, "KQ");
-
- struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
- offload_func_kq(KQ_scaled);
- ggml_set_name(KQ_scaled, "KQ_scaled");
+ struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+ struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+ struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
- struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
- offload_func_kq(KQ_masked);
- ggml_set_name(KQ_masked, "KQ_masked");
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
- struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
- offload_func_v(KQ_soft_max);
- ggml_set_name(KQ_soft_max, "KQ_soft_max");
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- struct ggml_tensor * V =
- ggml_view_3d(ctx0, kv_self.v,
- n_kv, n_embd_head, n_head_kv,
- ggml_element_size(kv_self.v)*n_ctx,
- ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
- ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
- offload_func_v(V);
- ggml_set_name(V, "V");
-
- struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
- offload_func_v(KQV);
- ggml_set_name(KQV, "KQV");
+ // using mode = 2 for neox mode
+ Qcur = ggml_rope_custom(ctx0, Qcur, inp_pos, n_embd_head, 2, 0, freq_base, freq_scale);
+ cb(Qcur, "Qcur", il);
- struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
- offload_func_v(KQV_merged);
- ggml_set_name(KQV_merged, "KQV_merged");
+ Kcur = ggml_rope_custom(ctx0, Kcur, inp_pos, n_embd_head, 2, 0, freq_base, freq_scale);
+ cb(Kcur, "Kcur", il);
- cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
- offload_func_v(cur);
- ggml_set_name(cur, "KQV_merged_contiguous");
+ llm_build_kv_store(lctx, ctx0, gf, Kcur, Vcur, n_tokens, kv_head, cb, il);
- cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
- offload_func(cur);
- ggml_set_name(cur, "result_wo");
+ cur = llm_build_kqv(lctx, ctx0, attn_norm,
+ model.layers[il].wo, NULL,
+ Qcur, KQ_scale, KQ_mask, n_tokens, n_kv, -1.0f, cb, il);
+ cb(cur, "kqv_out", il);
}
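llm_build_kv_store covers the removed "store key and value to memory" blocks (see the deleted k/v view + ggml_cpy code above); from those lines it amounts to:

    // sketch: append the current tokens' K and (transposed) V into this layer's cache slice at kv_head
    struct ggml_tensor * Vcur_t = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens));
    struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa,
            (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
    struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
            (     n_ctx)*ggml_element_size(kv_self.v),
            (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
    // note: it is the RoPE-ed K that ends up in the cache
    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur,   k));
    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur_t, v));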
- struct ggml_tensor * attn_out = cur;
+ struct ggml_tensor * ffn_inp = cur;
// feed forward
{
- struct ggml_tensor * inpFF = attn_norm;
-
- cur = ggml_mul_mat(ctx0, model.layers[il].w3, inpFF);
- offload_func(cur);
-
- cur = ggml_gelu(ctx0, cur);
- offload_func(cur);
- cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur);
- offload_func(cur);
+ cur = llm_build_ffn(ctx0, attn_norm, // !! use the attn norm, not the result
+ model.layers[il].ffn_up, NULL,
+ NULL, NULL,
+ model.layers[il].ffn_down, NULL,
+ LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
+ cb(cur, "ffn_out", il);
}
- cur = ggml_add(ctx0, cur, attn_out);
- offload_func(cur);
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "l_out", il);
+
cur = ggml_add(ctx0, cur, inpL);
- offload_func(cur);
+ cb(cur, "l_out", il);
// input for next layer
inpL = cur;
@@ -4572,18 +3887,14 @@ static struct ggml_cgraph * llm_build_falcon(
cur = inpL;
// norm
- {
- cur = ggml_norm(ctx0, cur, norm_eps);
- offload_func_nr(cur);
-
- cur = ggml_add(ctx0,
- ggml_mul(ctx0, cur, model.output_norm),
- model.output_norm_b);
- ggml_set_name(cur, "result_norm");
- }
+ cur = llm_build_norm(ctx0, cur,
+ model.output_norm,
+ model.output_norm_b,
+ LLM_NORM, norm_eps, cb, -1);
+ cb(cur, "result_norm", -1);
cur = ggml_mul_mat(ctx0, model.output, cur);
- ggml_set_name(cur, "result_output");
+ cb(cur, "result_output", -1);
ggml_build_forward_expand(gf, cur);
@@ -4594,7 +3905,9 @@ static struct ggml_cgraph * llm_build_falcon(
static struct ggml_cgraph * llm_build_starcoder(
llama_context & lctx,
- const llama_batch & batch) {
+ const llama_batch & batch,
+ const llm_build_cb & cb,
+ bool worst_case) {
const auto & model = lctx.model;
const auto & hparams = model.hparams;
const auto & cparams = lctx.cparams;
@@ -4607,7 +3920,6 @@ static struct ggml_cgraph * llm_build_starcoder(
const int64_t n_layer = hparams.n_layer;
const int64_t n_ctx = cparams.n_ctx;
const int64_t n_head = hparams.n_head;
- const int64_t n_head_kv = hparams.n_head_kv;
const int64_t n_embd_head = hparams.n_embd_head();
const int64_t n_embd_gqa = hparams.n_embd_gqa();
@@ -4615,11 +3927,9 @@ static struct ggml_cgraph * llm_build_starcoder(
const float norm_eps = hparams.f_norm_eps;
- const int n_gpu_layers = model.n_gpu_layers;
-
const int32_t n_tokens = batch.n_tokens;
- const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
- const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
+ const int32_t n_kv = worst_case ? n_ctx : kv_self.n;
+ const int32_t kv_head = worst_case ? n_ctx - n_tokens : kv_self.head;
auto & buf_compute = lctx.buf_compute;
@@ -4634,266 +3944,95 @@ static struct ggml_cgraph * llm_build_starcoder(
ggml_cgraph * gf = ggml_new_graph(ctx0);
struct ggml_tensor * cur;
- struct ggml_tensor * token;
- struct ggml_tensor * position;
+ struct ggml_tensor * pos;
struct ggml_tensor * inpL;
- if (batch.token) {
- struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+ inpL = llm_build_inp_embd(ctx0, batch, model.tok_embd, n_embd, n_tokens, cb);
+ cb(inpL, "inp_embd", -1);
- ggml_allocr_alloc(lctx.alloc, inp_tokens);
- if (!ggml_allocr_is_measure(lctx.alloc)) {
- memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
- }
- ggml_set_name(inp_tokens, "inp_tokens");
-
- token = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
- } else {
-#ifdef GGML_USE_MPI
- GGML_ASSERT(false && "not implemented");
-#endif
-
- token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
-
- ggml_allocr_alloc(lctx.alloc, token);
- if (!ggml_allocr_is_measure(lctx.alloc)) {
- memcpy(token->data, batch.embd, n_tokens * n_embd * ggml_element_size(token));
- }
- }
-
- const int i_gpu_start = n_layer - n_gpu_layers;
- (void) i_gpu_start;
-
- // offload functions set the tensor output backend to GPU
- // tensors are GPU-accelerated if any input or the output has been offloaded
- offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
- offload_func_t offload_func_kq = llama_nop;
- offload_func_t offload_func_v = llama_nop;
-
-#ifdef GGML_USE_CUBLAS
- if (n_gpu_layers > n_layer) {
- offload_func_nr = ggml_cuda_assign_buffers_no_alloc;
- }
- if (n_gpu_layers > n_layer + 1) {
- offload_func_v = ggml_cuda_assign_buffers_no_alloc;
- }
- if (n_gpu_layers > n_layer + 2) {
- offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
- }
-#endif // GGML_USE_CUBLAS
-
- {
- // Compute position embeddings.
- struct ggml_tensor * inp_positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
- ggml_allocr_alloc(lctx.alloc, inp_positions);
- if (!ggml_allocr_is_measure(lctx.alloc)) {
- for (int i = 0; i < n_tokens; ++i) {
- ((int32_t *) inp_positions->data)[i] = batch.pos[i];
- }
- }
- ggml_set_name(inp_positions, "inp_positions");
-
- position = ggml_get_rows(ctx0, model.pos_embeddings, inp_positions);
- }
+ // inp_pos - contains the positions
+ struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+ cb(inp_pos, "inp_pos", -1);
// KQ_scale
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
- ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
- ggml_allocr_alloc(lctx.alloc, KQ_scale);
- if (!ggml_allocr_is_measure(lctx.alloc)) {
- ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
- }
+ cb(KQ_scale, "KQ_scale", -1);
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
- ggml_set_name(KQ_mask, "KQ_mask");
- offload_func_kq(KQ_mask);
- ggml_allocr_alloc(lctx.alloc, KQ_mask);
- if (!ggml_allocr_is_measure(lctx.alloc)) {
- float * data = (float *) KQ_mask->data;
- memset(data, 0, ggml_nbytes(KQ_mask));
-
- for (int h = 0; h < 1; ++h) {
- for (int j = 0; j < n_tokens; ++j) {
- const llama_pos pos = batch.pos[j];
- const llama_seq_id seq_id = batch.seq_id[j][0];
-
- for (int i = 0; i < n_kv; ++i) {
- if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
- data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
- }
- }
- }
- }
- }
+ cb(KQ_mask, "KQ_mask", -1);
- inpL = ggml_add(ctx0, token, position);
- ggml_set_name(inpL, "inpL");
+ pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
+ cb(pos, "pos_embd", -1);
- for (int il = 0; il < n_layer; ++il) {
- offload_func_t offload_func = llama_nop;
-
-#ifdef GGML_USE_CUBLAS
- if (il >= i_gpu_start) {
- offload_func = ggml_cuda_assign_buffers_no_alloc;
- }
-#endif // GGML_USE_CUBLAS
-
- {
- // Norm
- cur = ggml_norm(ctx0, inpL, norm_eps);
- offload_func(cur);
+ inpL = ggml_add(ctx0, inpL, pos);
+ cb(inpL, "inpL", -1);
- cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b);
- offload_func(cur);
- }
+ for (int il = 0; il < n_layer; ++il) {
+ cur = llm_build_norm(ctx0, inpL,
+ model.layers[il].attn_norm,
+ model.layers[il].attn_norm_b,
+ LLM_NORM, norm_eps, cb, il);
+ cb(cur, "attn_norm", il);
+ // self-attention
{
- // Self Attention
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
- offload_func_kq(cur);
+ cb(cur, "wqkv", il);
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
- offload_func_kq(cur);
-
- struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
- struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
- struct ggml_tensor * tmpv = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+ cb(cur, "bqkv", il);
- ggml_set_name(tmpq, "tmpq");
- ggml_set_name(tmpk, "tmpk");
- ggml_set_name(tmpv, "tmpv");
+ struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+ struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+ struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
- offload_func_kq(tmpq);
- offload_func_kq(tmpk);
- offload_func_v (tmpv);
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
- struct ggml_tensor * Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens);
- struct ggml_tensor * Kcur = tmpk;
-
- {
- struct ggml_tensor * Vcur = ggml_transpose(ctx0, tmpv);
- offload_func_v(Vcur);
- ggml_set_name(Vcur, "Vcur");
-
- struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
- offload_func_kq(k);
- ggml_set_name(k, "k");
-
- struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
- ( n_ctx)*ggml_element_size(kv_self.v),
- (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
- offload_func_v(v);
- ggml_set_name(v, "v");
-
- ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
- ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
- }
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
- offload_func_kq(Q);
- ggml_set_name(Q, "Q");
+ llm_build_kv_store(lctx, ctx0, gf, Kcur, Vcur, n_tokens, kv_head, cb, il);
- struct ggml_tensor * K =
- ggml_view_3d(ctx0, kv_self.k,
- n_embd_head, n_kv, n_head_kv,
- ggml_element_size(kv_self.k)*n_embd_gqa,
- ggml_element_size(kv_self.k)*n_embd_head,
- ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
- offload_func_kq(K);
- ggml_set_name(K, "K");
-
- // K * Q
- struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
- offload_func_kq(KQ);
- ggml_set_name(KQ, "KQ");
-
- // KQ_scaled = KQ / sqrt(n_embd_head)
- // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
- struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
- offload_func_kq(KQ_scaled);
- ggml_set_name(KQ_scaled, "KQ_scaled");
-
- // KQ_masked = mask_past(KQ_scaled)
- struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
- offload_func_kq(KQ_masked);
- ggml_set_name(KQ_masked, "KQ_masked");
-
- // KQ = soft_max(KQ_masked)
- struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
- offload_func_v(KQ_soft_max);
- ggml_set_name(KQ_soft_max, "KQ_soft_max");
-
- // split cached V into n_head heads
- struct ggml_tensor * V =
- ggml_view_3d(ctx0, kv_self.v,
- n_kv, n_embd_head, n_head_kv,
- ggml_element_size(kv_self.v)*n_ctx,
- ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
- ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
- ggml_set_name(V, "V");
-
- struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
- offload_func_v(KQV);
- ggml_set_name(KQV, "KQV");
-
- struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
- offload_func_v(KQV_merged);
- ggml_set_name(KQV_merged, "KQV_merged");
-
- cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
- offload_func_v(cur);
- ggml_set_name(cur, "KQV_merged_contiguous");
- }
-
- // Projection
- cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo);
- offload_func(cur);
-
- // Add the input
- cur = ggml_add(ctx0, cur, inpL);
- offload_func(cur);
+ cur = llm_build_kqv(lctx, ctx0, cur,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, KQ_scale, KQ_mask, n_tokens, n_kv, -1.0f, cb, il);
+ cb(cur, "kqv_out", il);
+ }
- struct ggml_tensor * inpFF = cur;
+ // add the input
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
+ cb(ffn_inp, "ffn_inp", il);
// FF
{
- // Norm
- {
- cur = ggml_norm(ctx0, inpFF, norm_eps);
- offload_func_nr(cur);
+ cur = llm_build_norm(ctx0, ffn_inp,
+ model.layers[il].ffn_norm,
+ model.layers[il].ffn_norm_b,
+ LLM_NORM, norm_eps, cb, il);
+ cb(cur, "ffn_norm", il);
- cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b);
- offload_func_nr(cur);
- }
-
- cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3);
- offload_func(cur);
-
- // GELU activation
- cur = ggml_gelu(ctx0, cur);
- offload_func(cur);
-
- // Projection
- cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2);
- offload_func(cur);
+ cur = llm_build_ffn(ctx0, cur,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b,
+ NULL, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+ LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
+ cb(cur, "ffn_out", il);
}
- inpL = ggml_add(ctx0, cur, inpFF);
-
+ inpL = ggml_add(ctx0, cur, ffn_inp);
+ cb(inpL, "l_out", il);
}
- // Output Norm
- {
- cur = ggml_norm(ctx0, inpL, norm_eps);
- offload_func_nr(cur);
-
- cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b);
- ggml_set_name(cur, "result_norm");
- }
+ cur = llm_build_norm(ctx0, inpL,
+ model.output_norm,
+ model.output_norm_b,
+ LLM_NORM, norm_eps, cb, -1);
+ cb(cur, "result_norm", -1);
cur = ggml_mul_mat(ctx0, model.output, cur);
- ggml_set_name(cur, "result_output");
+ cb(cur, "result_output", -1);
ggml_build_forward_expand(gf, cur);
ggml_free(ctx0);
@@ -4903,7 +4042,9 @@ static struct ggml_cgraph * llm_build_starcoder(
static struct ggml_cgraph * llm_build_persimmon(
llama_context & lctx,
- const llama_batch & batch) {
+ const llama_batch & batch,
+ const llm_build_cb & cb,
+ bool worst_case) {
const auto & model = lctx.model;
const auto & hparams = model.hparams;
@@ -4912,29 +4053,27 @@ static struct ggml_cgraph * llm_build_persimmon(
GGML_ASSERT(!!kv_self.ctx);
const auto & cparams = lctx.cparams;
+
const int64_t n_embd = hparams.n_embd;
const int64_t n_layer = hparams.n_layer;
const int64_t n_ctx = cparams.n_ctx;
const int64_t n_head_kv = hparams.n_head_kv;
const int64_t n_head = hparams.n_head;
const int64_t n_embd_head = hparams.n_embd_head();
- const int64_t n_embd_gqa = hparams.n_embd_gqa();
- const size_t n_rot = n_embd_head / 2;
+ const int64_t n_rot = n_embd_head / 2;
const float freq_base = cparams.rope_freq_base;
const float freq_scale = cparams.rope_freq_scale;
- const float norm_eps = hparams.f_norm_eps;
-
- const int n_gpu_layers = model.n_gpu_layers;
-
+ const float norm_eps = hparams.f_norm_eps;
const int32_t n_tokens = batch.n_tokens;
- const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
- const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
+ const int32_t n_kv = worst_case ? n_ctx : kv_self.n;
+ const int32_t kv_head = worst_case ? n_ctx - n_tokens : kv_self.head;
- const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
+ const bool do_rope_shift = worst_case || kv_self.has_shift;
auto & buf_compute = lctx.buf_compute;
+
struct ggml_init_params params = {
/*.mem_size =*/ buf_compute.size,
/*.mem_buffer =*/ buf_compute.data,
@@ -4948,146 +4087,77 @@ static struct ggml_cgraph * llm_build_persimmon(
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
- if (batch.token) {
- struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+ inpL = llm_build_inp_embd(ctx0, batch, model.tok_embd, n_embd, n_tokens, cb);
+ cb(inpL, "imp_embd", -1);
+
+ struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+ cb(inp_pos, "inp_pos", -1);
- ggml_allocr_alloc(lctx.alloc, inp_tokens);
- if (!ggml_allocr_is_measure(lctx.alloc)) {
- memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
- }
- ggml_set_name(inp_tokens, "inp_tokens");
- inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
- } else {
- inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
- ggml_allocr_alloc(lctx.alloc, inpL);
- if (!ggml_allocr_is_measure(lctx.alloc)) {
- memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
- }
- }
- const int i_gpu_start = n_layer - n_gpu_layers;
- (void) i_gpu_start;
- offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
- offload_func_t offload_func_kq = llama_nop;
- offload_func_t offload_func_v = llama_nop;
// KQ_scale
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
- ggml_allocr_alloc(lctx.alloc, KQ_scale);
- if (!ggml_allocr_is_measure(lctx.alloc)) {
- ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd_head)));
- }
- ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
+ cb(KQ_scale, "KQ_scale", -1);
+
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
- offload_func_kq(KQ_mask);
- ggml_set_name(KQ_mask, "KQ_mask");
- ggml_allocr_alloc(lctx.alloc, KQ_mask);
-
- if (!ggml_allocr_is_measure(lctx.alloc)) {
- float * data = (float *) KQ_mask->data;
- memset(data, 0, ggml_nbytes(KQ_mask));
- for (int h = 0; h < 1; ++h) {
- for (int j = 0; j < n_tokens; ++j) {
- const llama_pos pos = batch.pos[j];
- const llama_seq_id seq_id = batch.seq_id[j][0];
- for (int i = 0; i < n_kv; ++i) {
- if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
- data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
- }
- }
- }
- }
- }
+ cb(KQ_mask, "KQ_mask", -1);
- struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
- offload_func_kq(KQ_pos);
- ggml_set_name(KQ_pos, "KQ_pos");
- ggml_allocr_alloc(lctx.alloc, KQ_pos);
- if (!ggml_allocr_is_measure(lctx.alloc)) {
- int * data = (int *) KQ_pos->data;
- for (int i = 0; i < n_tokens; ++i) {
- data[i] = batch.pos[i];
- }
- }
if (do_rope_shift) {
- struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
- offload_func_kq(K_shift);
- ggml_set_name(K_shift, "K_shift");
- ggml_allocr_alloc(lctx.alloc, K_shift);
- if (!ggml_allocr_is_measure(lctx.alloc)) {
- int * data = (int *) K_shift->data;
- for (int i = 0; i < n_ctx; ++i) {
- data[i] = kv_self.cells[i].delta;
- }
- }
- for (int il = 0; il < n_layer; ++il) {
- struct ggml_tensor * tmp =
- // we rotate only the first n_rot dimensions.
- ggml_rope_custom_inplace(ctx0,
- ggml_view_3d(ctx0, kv_self.k,
- n_rot, n_head, n_ctx,
- ggml_element_size(kv_self.k)*n_embd_gqa,
- ggml_element_size(kv_self.k)*n_embd_head,
- ggml_element_size(kv_self.k)*(n_embd_head*n_ctx*il)
- ),
- K_shift, n_rot, 2, 0, freq_base, freq_scale);
- offload_func_kq(tmp);
- ggml_build_forward_expand(gf, tmp);
- }
- }
- for (int il=0; il < n_layer; ++il) {
+ llm_build_k_shift(lctx, ctx0, gf, n_rot, LLM_ROPE_NEOX, cb);
+ }
+
+ for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * residual = inpL;
- offload_func_t offload_func = llama_nop;
- {
- cur = ggml_norm(ctx0, inpL, norm_eps);
- offload_func(cur);
- cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm);
- offload_func(cur);
- cur = ggml_add(ctx0, cur, model.layers[il].attn_norm_b);
- offload_func(cur);
- ggml_format_name(cur, "input_layernorm_%d", il);
- }
+
+ cur = llm_build_norm(ctx0, inpL,
+ model.layers[il].attn_norm,
+ model.layers[il].attn_norm_b,
+ LLM_NORM, norm_eps, cb, il);
+ cb(cur, "attn_norm", il);
+
// self attention
{
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
- offload_func_kq(cur);
+ cb(cur, "wqkv", il);
+
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
- offload_func_kq(cur);
+ cb(cur, "bqkv", il);
// split qkv
GGML_ASSERT(n_head_kv == n_head);
- ggml_set_name(cur, format("qkv_%d", il).c_str());
+
struct ggml_tensor * tmpqkv = ggml_reshape_4d(ctx0, cur, n_embd_head, 3, n_head, n_tokens);
- offload_func_kq(tmpqkv);
+ cb(tmpqkv, "tmpqkv", il);
+
struct ggml_tensor * tmpqkv_perm = ggml_cont(ctx0, ggml_permute(ctx0, tmpqkv, 0, 3, 1, 2));
- offload_func_kq(tmpqkv_perm);
- ggml_format_name(tmpqkv_perm, "tmpqkv_perm_%d", il);
+ cb(tmpqkv_perm, "tmpqkv", il);
+
struct ggml_tensor * tmpq = ggml_view_3d(
ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
ggml_element_size(tmpqkv_perm) * n_embd_head,
ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
0
);
- offload_func_kq(tmpq);
+ cb(tmpq, "tmpq", il);
+
struct ggml_tensor * tmpk = ggml_view_3d(
ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
ggml_element_size(tmpqkv_perm) * n_embd_head,
ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens
);
- offload_func_kq(tmpk);
+ cb(tmpk, "tmpk", il);
+
// Q/K Layernorm
- tmpq = ggml_norm(ctx0, tmpq, norm_eps);
- offload_func_kq(tmpq);
- tmpq = ggml_mul(ctx0, tmpq, model.layers[il].attn_q_norm);
- offload_func_kq(tmpq);
- tmpq = ggml_add(ctx0, tmpq, model.layers[il].attn_q_norm_b);
- offload_func_kq(tmpq);
-
- tmpk = ggml_norm(ctx0, tmpk, norm_eps);
- offload_func_v(tmpk);
- tmpk = ggml_mul(ctx0, tmpk, model.layers[il].attn_k_norm);
- offload_func_v(tmpk);
- tmpk = ggml_add(ctx0, tmpk, model.layers[il].attn_k_norm_b);
- offload_func_v(tmpk);
+ tmpq = llm_build_norm(ctx0, tmpq,
+ model.layers[il].attn_q_norm,
+ model.layers[il].attn_q_norm_b,
+ LLM_NORM, norm_eps, cb, il);
+ cb(tmpq, "tmpq", il);
+
+ tmpk = llm_build_norm(ctx0, tmpk,
+ model.layers[il].attn_k_norm,
+ model.layers[il].attn_k_norm_b,
+ LLM_NORM, norm_eps, cb, il);
+ cb(tmpk, "tmpk", il);
// RoPE the first n_rot of q/k, pass the other half, and concat.
struct ggml_tensor * qrot = ggml_view_3d(
@@ -5096,16 +4166,15 @@ static struct ggml_cgraph * llm_build_persimmon(
ggml_element_size(tmpq) * n_embd_head * n_head,
0
);
- offload_func_kq(qrot);
- ggml_format_name(qrot, "qrot_%d", il);
+ cb(qrot, "qrot", il);
+
struct ggml_tensor * krot = ggml_view_3d(
ctx0, tmpk, n_rot, n_head, n_tokens,
ggml_element_size(tmpk) * n_embd_head,
ggml_element_size(tmpk) * n_embd_head * n_head,
0
);
- offload_func_kq(krot);
- ggml_format_name(krot, "krot_%d", il);
+ cb(krot, "krot", il);
// get the second half of tmpq, e.g tmpq[n_rot:, :, :]
struct ggml_tensor * qpass = ggml_view_3d(
@@ -5114,193 +4183,117 @@ static struct ggml_cgraph * llm_build_persimmon(
ggml_element_size(tmpq) * n_embd_head * n_head,
ggml_element_size(tmpq) * n_rot
);
- offload_func_kq(qpass);
- ggml_format_name(qpass, "qpass_%d", il);
+ cb(qpass, "qpass", il);
+
struct ggml_tensor * kpass = ggml_view_3d(
ctx0, tmpk, n_rot, n_head, n_tokens,
ggml_element_size(tmpk) * n_embd_head,
ggml_element_size(tmpk) * n_embd_head * n_head,
ggml_element_size(tmpk) * n_rot
);
- offload_func_kq(kpass);
- ggml_format_name(kpass, "kpass_%d", il);
+ cb(kpass, "kpass", il);
- struct ggml_tensor * qrotated = ggml_rope_custom(
- ctx0, qrot, KQ_pos, n_rot, 2, 0, freq_base, freq_scale
+ struct ggml_tensor * qrotated = ggml_rope_custom(
+ ctx0, qrot, inp_pos, n_rot, 2, 0, freq_base, freq_scale
);
- offload_func_kq(qrotated);
+ cb(qrotated, "qrotated", il);
+
struct ggml_tensor * krotated = ggml_rope_custom(
- ctx0, krot, KQ_pos, n_rot, 2, 0, freq_base, freq_scale
+ ctx0, krot, inp_pos, n_rot, 2, 0, freq_base, freq_scale
);
- offload_func_kq(krotated);
+ cb(krotated, "krotated", il);
+
// ggml currently only supports concatenation on dim=2
// so we need to permute qrot, qpass, concat, then permute back.
qrotated = ggml_cont(ctx0, ggml_permute(ctx0, qrotated, 2, 1, 0, 3));
- offload_func_kq(qrotated);
+ cb(qrotated, "qrotated", il);
+
krotated = ggml_cont(ctx0, ggml_permute(ctx0, krotated, 2, 1, 0, 3));
- offload_func_kq(krotated);
+ cb(krotated, "krotated", il);
qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3));
- offload_func_kq(qpass);
+ cb(qpass, "qpass", il);
+
kpass = ggml_cont(ctx0, ggml_permute(ctx0, kpass, 2, 1, 0, 3));
- offload_func_kq(kpass);
+ cb(kpass, "kpass", il);
struct ggml_tensor * Qcur = ggml_concat(ctx0, qrotated, qpass);
- offload_func_kq(Qcur);
+ cb(Qcur, "Qcur", il);
+
struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass);
- offload_func_kq(Kcur);
+ cb(Kcur, "Kcur", il);
struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 1, 2, 0, 3));
- offload_func_kq(Q);
+ cb(Q, "Q", il);
Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
- offload_func_kq(Kcur);
- {
- struct ggml_tensor * tmpv = ggml_view_3d(
- ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
- ggml_element_size(tmpqkv_perm) * n_embd_head,
- ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
- ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens * 2
+ cb(Kcur, "Kcur", il);
+
+ struct ggml_tensor * Vcur = ggml_view_3d(
+ ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
+ ggml_element_size(tmpqkv_perm) * n_embd_head,
+ ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
+ ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens * 2
);
- offload_func_v(tmpv);
- // store K, V in cache
- struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens));
- offload_func_v(Vcur);
- ggml_set_name(Vcur, "Vcur");
-
- struct ggml_tensor * k = ggml_view_1d(
- ctx0, kv_self.k, n_tokens*n_embd_gqa,
- (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)
- );
- offload_func_kq(k);
- ggml_set_name(k, "k");
-
- struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
- ( n_ctx)*ggml_element_size(kv_self.v),
- (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
- offload_func_v(v);
- ggml_set_name(v, "v");
-
- // important: storing RoPE-ed version of K in the KV cache!
- ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
- ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
- }
- struct ggml_tensor * K = ggml_view_3d(ctx0, kv_self.k,
- n_embd_head, n_kv, n_head_kv,
- ggml_element_size(kv_self.k)*n_embd_gqa,
- ggml_element_size(kv_self.k)*n_embd_head,
- ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
-
- offload_func_kq(K);
- ggml_format_name(K, "K_%d", il);
-
- struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
- offload_func_kq(KQ);
- ggml_set_name(KQ, "KQ");
-
- struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
- offload_func_kq(KQ_scaled);
- ggml_set_name(KQ_scaled, "KQ_scaled");
-
- struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
- offload_func_kq(KQ_masked);
- ggml_set_name(KQ_masked, "KQ_masked");
-
- struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
- offload_func_kq(KQ_soft_max);
- ggml_set_name(KQ_soft_max, "KQ_soft_max");
-
- struct ggml_tensor * V =
- ggml_view_3d(ctx0, kv_self.v,
- n_kv, n_embd_head, n_head_kv,
- ggml_element_size(kv_self.v)*n_ctx,
- ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
- ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
- offload_func_v(V);
- ggml_set_name(V, "V");
-
- struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
- offload_func_v(KQV);
- ggml_set_name(KQV, "KQV");
-
- struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
- offload_func_v(KQV_merged);
- ggml_set_name(KQV_merged, "KQV_merged");
-
- cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
- offload_func_v(cur);
- ggml_set_name(cur, "KQV_merged_contiguous");
-
- cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
- offload_func(cur);
- cur = ggml_add(ctx0, cur, model.layers[il].bo);
- offload_func(cur);
- ggml_set_name(cur, "result_wo");
- }
-
- struct ggml_tensor * inpFF = ggml_add(ctx0, residual, cur);
- offload_func(inpFF);
- ggml_set_name(inpFF, "inpFF");
+ cb(Vcur, "Vcur", il);
+
+ llm_build_kv_store(lctx, ctx0, gf, Kcur, Vcur, n_tokens, kv_head, cb, il);
+
+ // TODO: not tested, could be broken
+ cur = llm_build_kqv(lctx, ctx0, Q,
+ model.layers[il].wo, model.layers[il].bo,
+ Q, KQ_scale, KQ_mask, n_tokens, n_kv, -1.0f, cb, il);
+ cb(cur, "kqv_out", il);
+ }
+
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
{
- // MLP
- {
- // Norm
- cur = ggml_norm(ctx0, inpFF, norm_eps);
- offload_func(cur);
- cur = ggml_add(ctx0,
- ggml_mul(ctx0, cur, model.layers[il].ffn_norm),
- model.layers[il].ffn_norm_b
- );
- ggml_set_name(cur, "ffn_norm");
- offload_func(cur);
- }
- cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur);
- offload_func(cur);
-
- cur = ggml_add(ctx0, cur, model.layers[il].b3);
- offload_func(cur);
- ggml_set_name(cur, "result_ffn_up");
-
- cur = ggml_sqr(ctx0, ggml_relu(ctx0, cur));
- ggml_set_name(cur, "result_ffn_act");
- offload_func(cur);
- offload_func(cur->src[0]);
-
- cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur);
- offload_func(cur);
- cur = ggml_add(ctx0,
- cur,
- model.layers[il].b2);
- offload_func(cur);
- ggml_set_name(cur, "outFF");
- }
- cur = ggml_add(ctx0, cur, inpFF);
- offload_func(cur);
- ggml_set_name(cur, "inpFF_+_outFF");
+ cur = llm_build_norm(ctx0, ffn_inp,
+ model.layers[il].ffn_norm,
+ model.layers[il].ffn_norm_b,
+ LLM_NORM, norm_eps, cb, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = llm_build_ffn(ctx0, cur,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b,
+ NULL, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+ LLM_FFN_RELU_SQR, LLM_FFN_SEQ, cb, il);
+ cb(cur, "ffn_out", il);
+ }
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "l_out", il);
+
inpL = cur;
}
+
cur = inpL;
- {
- cur = ggml_norm(ctx0, cur, norm_eps);
- offload_func_nr(cur);
- cur = ggml_mul(ctx0, cur, model.output_norm);
- offload_func_nr(cur);
- cur = ggml_add(ctx0, cur, model.output_norm_b);
- // offload_func_nr(cur);
+ cur = llm_build_norm(ctx0, cur,
+ model.output_norm,
+ model.output_norm_b,
+ LLM_NORM, norm_eps, cb, -1);
+ cb(cur, "result_norm", -1);
- ggml_set_name(cur, "result_norm");
- }
cur = ggml_mul_mat(ctx0, model.output, cur);
- ggml_set_name(cur, "result_output");
+ cb(cur, "result_output", -1);
+
ggml_build_forward_expand(gf, cur);
+
ggml_free(ctx0);
+
return gf;
}
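All of the rewritten builders now share the same (batch, cb, worst_case) signature: worst_case replaces the ggml_allocr_is_measure checks, and the per-tensor bookkeeping that used to be inlined (ggml_set_name / ggml_format_name, ggml_allocr_alloc, the offload_func_* selection, and the data population of inputs such as inp_pos, KQ_scale and KQ_mask) is pushed out to the caller, keyed off the names passed to cb. A minimal sketch of a compatible callback, assuming the std::function-style typedef implied by the call sites (the typedef itself is not shown in this hunk):

    // assumed typedef, consistent with cb(tensor, "name", il) used throughout:
    //   using llm_build_cb = std::function<void(struct ggml_tensor * cur, const char * name, int il)>;
    llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
        if (il >= 0) {
            ggml_format_name(cur, "%s-%d", name, il);   // per-layer tensors keep their layer index
        } else {
            ggml_set_name(cur, name);                   // graph-wide tensors, e.g. "result_norm", "result_output"
        }
        // the caller can also pick the offload target here (replacing the removed offload_func_* logic)
        // and later locate inputs by name to fill their data (the removed ggml_set_f32 / memset loops)
    };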
-static struct ggml_cgraph * llm_build_bloom(
+static struct ggml_cgraph * llm_build_refact(
llama_context & lctx,
- const llama_batch & batch) {
+ const llama_batch & batch,
+ const llm_build_cb & cb,
+ bool worst_case) {
const auto & model = lctx.model;
const auto & hparams = model.hparams;
const auto & cparams = lctx.cparams;
@@ -5315,223 +4308,242 @@ static struct ggml_cgraph * llm_build_bloom(
const int64_t n_head = hparams.n_head;
const int64_t n_head_kv = hparams.n_head_kv;
const int64_t n_embd_head = hparams.n_embd_head();
- const int64_t n_embd_gqa = hparams.n_embd_gqa();
-
- GGML_ASSERT(n_embd_head == hparams.n_rot);
- const float norm_eps = hparams.f_norm_eps;
+ const float norm_rms_eps = hparams.f_norm_rms_eps;
const int32_t n_tokens = batch.n_tokens;
- const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
- const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
+ const int32_t n_kv = worst_case ? n_ctx : kv_self.n;
+ const int32_t kv_head = worst_case ? n_ctx - n_tokens : kv_self.head;
auto & buf_compute = lctx.buf_compute;
struct ggml_init_params params = {
/*.mem_size =*/ buf_compute.size,
/*.mem_buffer =*/ buf_compute.data,
- /*.no_alloc =*/ false,
+ /*.no_alloc =*/ true,
};
- params.no_alloc = true;
-
struct ggml_context * ctx0 = ggml_init(params);
ggml_cgraph * gf = ggml_new_graph(ctx0);
struct ggml_tensor * cur;
- struct ggml_tensor * token;
struct ggml_tensor * inpL;
- if (batch.token) {
- struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
-
- ggml_allocr_alloc(lctx.alloc, inp_tokens);
- if (!ggml_allocr_is_measure(lctx.alloc)) {
- memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
- }
- ggml_set_name(inp_tokens, "inp_tokens");
-
- token = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
- } else {
-#ifdef GGML_USE_MPI
- GGML_ASSERT(false && "not implemented");
-#endif
-
- token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
-
- ggml_allocr_alloc(lctx.alloc, token);
- if (!ggml_allocr_is_measure(lctx.alloc)) {
- memcpy(token->data, batch.embd, n_tokens * n_embd * ggml_element_size(token));
- }
- }
+ inpL = llm_build_inp_embd(ctx0, batch, model.tok_embd, n_embd, n_tokens, cb);
+ cb(inpL, "inp_embd", -1);
// KQ_scale
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
- ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
- ggml_allocr_alloc(lctx.alloc, KQ_scale);
- if (!ggml_allocr_is_measure(lctx.alloc)) {
- ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
- }
+ cb(KQ_scale, "KQ_scale", -1);
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
- ggml_set_name(KQ_mask, "KQ_mask");
- ggml_allocr_alloc(lctx.alloc, KQ_mask);
- if (!ggml_allocr_is_measure(lctx.alloc)) {
- float * data = (float *) KQ_mask->data;
- memset(data, 0, ggml_nbytes(KQ_mask));
-
- for (int h = 0; h < 1; ++h) {
- for (int j = 0; j < n_tokens; ++j) {
- const llama_pos pos = batch.pos[j];
- const llama_seq_id seq_id = batch.seq_id[j][0];
-
- for (int i = 0; i < n_kv; ++i) {
- if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
- data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
- }
- }
- }
- }
- }
+ cb(KQ_mask, "KQ_mask", -1);
- // norm
- {
- inpL = ggml_norm(ctx0, token, norm_eps);
- inpL = ggml_add(ctx0, ggml_mul(ctx0, inpL, model.tok_norm), model.tok_norm_b);
- }
+ for (int il = 0; il < n_layer; ++il) {
+ struct ggml_tensor * inpSA = inpL;
- ggml_set_name(inpL, "inpL");
+ cur = llm_build_norm(ctx0, inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, norm_rms_eps, cb, il);
+ cb(cur, "attn_norm", il);
- for (int il = 0; il < n_layer; ++il) {
+ // self-attention
{
- // Norm
- cur = ggml_norm(ctx0, inpL, norm_eps);
- cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b);
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ cb(Kcur, "Kcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ cb(Qcur, "Qcur", il);
+
+ llm_build_kv_store(lctx, ctx0, gf, Kcur, Vcur, n_tokens, kv_head, cb, il);
+
+ cur = llm_build_kqv(lctx, ctx0, Qcur,
+ model.layers[il].wo, NULL,
+ Qcur, KQ_scale, KQ_mask, n_tokens, n_kv, 8.0f, cb, il);
+ cb(cur, "kqv_out", il);
}
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
{
- // Self Attention
- cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
+ cur = llm_build_norm(ctx0, ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, norm_rms_eps, cb, il);
+ cb(cur, "ffn_norm", il);
- struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd);
- struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*n_embd);
- struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
+ cur = llm_build_ffn(ctx0, cur,
+ model.layers[il].ffn_up, NULL,
+ model.layers[il].ffn_gate, NULL,
+ model.layers[il].ffn_down, NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+ cb(cur, "ffn_out", il);
+ }
- struct ggml_tensor * Qcur = tmpq;
- struct ggml_tensor * Kcur = tmpk;
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "l_out", il);
- // store key and value to memory
- {
- struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
- ggml_set_name(Vcur, "Vcur");
+ // input for next layer
+ inpL = cur;
+ }
- struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
- ggml_set_name(k, "k");
+ cur = inpL;
- struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
- ( n_ctx)*ggml_element_size(kv_self.v),
- (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
+ cur = llm_build_norm(ctx0, cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, norm_rms_eps, cb, -1);
+ cb(cur, "result_norm", -1);
- ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
- ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
- }
+ // lm_head
+ cur = ggml_mul_mat(ctx0, model.output, cur);
+ cb(cur, "result_output", -1);
- struct ggml_tensor * Q =
- ggml_permute(ctx0,
- ggml_cpy(ctx0,
- Qcur,
- ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)),
- 0, 2, 1, 3);
- ggml_set_name(Q, "Q");
-
- struct ggml_tensor * K =
- ggml_view_3d(ctx0, kv_self.k,
- n_embd_head, n_kv, n_head_kv,
- ggml_element_size(kv_self.k)*n_embd_gqa,
- ggml_element_size(kv_self.k)*n_embd_head,
- ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
- ggml_set_name(K, "K");
+ ggml_build_forward_expand(gf, cur);
+
+ ggml_free(ctx0);
+
+ return gf;
+}
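The deleted lines above show the K/V-cache write that every builder used to do inline; the new code delegates it to llm_build_kv_store, whose body lies outside this hunk. A sketch of what that helper plausibly does, reconstructed from the removed code (the _sketch name, parameter list and struct names are assumptions):

static void llm_build_kv_store_sketch(
        struct llama_kv_cache & kv_self,
        struct ggml_context  * ctx0,
        struct ggml_cgraph   * gf,
        struct ggml_tensor   * Kcur,   // [n_embd_gqa, n_tokens] (possibly reshaped to 3d)
        struct ggml_tensor   * Vcur,   // [n_embd_gqa, n_tokens]
        int64_t n_ctx, int64_t n_embd_gqa, int32_t n_tokens, int32_t kv_head, int il) {
    // V is stored transposed so that each embedding row is contiguous across the context
    struct ggml_tensor * Vcur_t = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, Vcur), n_embd_gqa, n_tokens));

    // destination views into the per-layer slice of the cache, starting at cell kv_head
    struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa,
            (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
    struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
            (   n_ctx)*ggml_element_size(kv_self.v),
            (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));

    // expand the copies into the graph explicitly - their results are not consumed by later nodes
    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur_t, v));
}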
- // K * Q
- struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
- ggml_set_name(KQ, "KQ");
+static struct ggml_cgraph * llm_build_bloom(
+ llama_context & lctx,
+ const llama_batch & batch,
+ const llm_build_cb & cb,
+ bool worst_case) {
+ const auto & model = lctx.model;
+ const auto & hparams = model.hparams;
+ const auto & cparams = lctx.cparams;
- // KQ_scaled = KQ / sqrt(n_embd_head)
- // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
- struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
- ggml_set_name(KQ_scaled, "KQ_scaled");
+ const auto & kv_self = lctx.kv_self;
- struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ kv_head, n_head, 8);
- ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
+ GGML_ASSERT(!!kv_self.ctx);
- // KQ_masked = mask_past(KQ_scaled)
- struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
- ggml_set_name(KQ_masked, "KQ_masked");
+ const int64_t n_embd = hparams.n_embd;
+ const int64_t n_layer = hparams.n_layer;
+ const int64_t n_ctx = cparams.n_ctx;
+ const int64_t n_head = hparams.n_head;
+ const int64_t n_embd_head = hparams.n_embd_head();
+ const int64_t n_embd_gqa = hparams.n_embd_gqa();
- // KQ = soft_max(KQ_masked)
- struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
- ggml_set_name(KQ_soft_max, "KQ_soft_max");
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
- // split cached V into n_head heads
- struct ggml_tensor * V =
- ggml_view_3d(ctx0, kv_self.v,
- n_kv, n_embd_head, n_head_kv,
- ggml_element_size(kv_self.v)*n_ctx,
- ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
- ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
- ggml_set_name(V, "V");
+ const float norm_eps = hparams.f_norm_eps;
- struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
- ggml_set_name(KQV, "KQV");
+ const int32_t n_tokens = batch.n_tokens;
+ const int32_t n_kv = worst_case ? n_ctx : kv_self.n;
+ const int32_t kv_head = worst_case ? n_ctx - n_tokens : kv_self.head;
- // KQV_merged = KQV.permute(0, 2, 1, 3)
- struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
- ggml_set_name(KQV_merged, "KQV_merged");
+ auto & buf_compute = lctx.buf_compute;
- // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
- cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
- ggml_set_name(cur, "KQV_merged_contiguous");
- }
+ struct ggml_init_params params = {
+ /*.mem_size =*/ buf_compute.size,
+ /*.mem_buffer =*/ buf_compute.data,
+ /*.no_alloc =*/ false,
+ };
- // Projection
- cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo);
+ params.no_alloc = true;
- // Add the input
- cur = ggml_add(ctx0, cur, inpL);
+ struct ggml_context * ctx0 = ggml_init(params);
- struct ggml_tensor * inpFF = cur;
+ ggml_cgraph * gf = ggml_new_graph(ctx0);
- // FF
+ struct ggml_tensor * cur;
+ struct ggml_tensor * inpL;
+
+ inpL = llm_build_inp_embd(ctx0, batch, model.tok_embd, n_embd, n_tokens, cb);
+ cb(inpL, "inp_embd", -1);
+
+ // KQ_scale
+ struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+ cb(KQ_scale, "KQ_scale", -1);
+
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+ cb(KQ_mask, "KQ_mask", -1);
+
+ inpL = llm_build_norm(ctx0, inpL,
+ model.tok_norm,
+ model.tok_norm_b,
+ LLM_NORM, norm_eps, cb, -1);
+ cb(inpL, "inp_norm", -1);
+
+ for (int il = 0; il < n_layer; ++il) {
+ cur = llm_build_norm(ctx0, inpL,
+ model.layers[il].attn_norm,
+ model.layers[il].attn_norm_b,
+ LLM_NORM, norm_eps, cb, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
{
- // Norm
- {
- cur = ggml_norm(ctx0, inpFF, norm_eps);
- cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b);
- }
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+ cb(cur, "wqkv", il);
+
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+ cb(cur, "bqkv", il);
- cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3);
+ struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+ struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+ struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
- // GELU activation
- cur = ggml_gelu(ctx0, cur);
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
- // Projection
- cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2);
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+
+ llm_build_kv_store(lctx, ctx0, gf, Kcur, Vcur, n_tokens, kv_head, cb, il);
+
+ cur = llm_build_kqv(lctx, ctx0, Qcur,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, KQ_scale, KQ_mask, n_tokens, n_kv, 8.0f, cb, il);
+ cb(cur, "kqv_out", il);
}
- inpL = ggml_add(ctx0, cur, inpFF);
- }
+ // Add the input
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
+ cb(ffn_inp, "ffn_inp", il);
- // Output Norm
- {
- cur = ggml_norm(ctx0, inpL, norm_eps);
- cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b);
+ // FF
+ {
+ cur = llm_build_norm(ctx0, ffn_inp,
+ model.layers[il].ffn_norm,
+ model.layers[il].ffn_norm_b,
+ LLM_NORM, norm_eps, cb, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = llm_build_ffn(ctx0, cur,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b,
+ NULL, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+ LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
+ cb(cur, "ffn_out", il);
+ }
+
+ inpL = ggml_add(ctx0, cur, ffn_inp);
+ cb(inpL, "l_out", il);
}
- ggml_set_name(cur, "result_norm");
+
+ cur = llm_build_norm(ctx0, inpL,
+ model.output_norm,
+ model.output_norm_b,
+ LLM_NORM, norm_eps, cb, -1);
+ cb(cur, "result_norm", -1);
cur = ggml_mul_mat(ctx0, model.output, cur);
- ggml_set_name(cur, "result_output");
+ cb(cur, "result_output", -1);
ggml_build_forward_expand(gf, cur);
@@ -5542,7 +4554,9 @@ static struct ggml_cgraph * llm_build_bloom(
static struct ggml_cgraph * llm_build_mpt(
llama_context & lctx,
- const llama_batch & batch) {
+ const llama_batch & batch,
+ const llm_build_cb & cb,
+ bool worst_case) {
const auto & model = lctx.model;
const auto & hparams = model.hparams;
const auto & cparams = lctx.cparams;
@@ -5555,7 +4569,6 @@ static struct ggml_cgraph * llm_build_mpt(
const int64_t n_layer = hparams.n_layer;
const int64_t n_ctx = cparams.n_ctx;
const int64_t n_head = hparams.n_head;
- const int64_t n_head_kv = hparams.n_head_kv;
const int64_t n_embd_head = hparams.n_embd_head();
const int64_t n_embd_gqa = hparams.n_embd_gqa();
@@ -5563,11 +4576,9 @@ static struct ggml_cgraph * llm_build_mpt(
const float clamp_kqv = hparams.f_clamp_kqv;
const float max_alibi_bias = hparams.f_max_alibi_bias;
- const int n_gpu_layers = model.n_gpu_layers;
-
const int32_t n_tokens = batch.n_tokens;
- const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
- const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
+ const int32_t n_kv = worst_case ? n_ctx : kv_self.n;
+ const int32_t kv_head = worst_case ? n_ctx - n_tokens : kv_self.head;
auto & buf_compute = lctx.buf_compute;
@@ -5586,321 +4597,589 @@ static struct ggml_cgraph * llm_build_mpt(
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
- //int warmup = 0;
- if (batch.token) {
- struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+ inpL = llm_build_inp_embd(ctx0, batch, model.tok_embd, n_embd, n_tokens, cb);
+ cb(inpL, "inp_embd", -1);
- ggml_allocr_alloc(lctx.alloc, inp_tokens);
- if (!ggml_allocr_is_measure(lctx.alloc)) {
- memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
- //warmup = ((uint32_t*) inp_tokens->data)[0] == 0;
- }
+ // KQ_scale
+ struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+ cb(KQ_scale, "KQ_scale", -1);
- ggml_set_name(inp_tokens, "inp_tokens");
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+ cb(KQ_mask, "KQ_mask", -1);
- inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
- } else {
-#ifdef GGML_USE_MPI
- GGML_ASSERT(false && "not implemented");
-#endif
+ for (int il = 0; il < n_layer; ++il) {
+ struct ggml_tensor * attn_norm;
+
+ attn_norm = llm_build_norm(ctx0, inpL,
+ model.layers[il].attn_norm,
+ NULL,
+ LLM_NORM, norm_eps, cb, il);
+ cb(attn_norm, "attn_norm", il);
+
+ // self-attention
+ {
+ cur = attn_norm;
+
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+ cb(cur, "wqkv", il);
+
+ if (clamp_kqv > 0.0f) {
+ cur = ggml_clamp(ctx0, cur, -clamp_kqv, clamp_kqv);
+ cb(cur, "wqkv_clamped", il);
+ }
+
+ struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+ struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+ struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+
+ llm_build_kv_store(lctx, ctx0, gf, Kcur, Vcur, n_tokens, kv_head, cb, il);
- inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
+ cur = llm_build_kqv(lctx, ctx0, Qcur,
+ model.layers[il].wo, NULL,
+ Qcur, KQ_scale, KQ_mask, n_tokens, n_kv, max_alibi_bias, cb, il);
+ cb(cur, "kqv_out", il);
+ }
+
+ // Add the input
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
+ cb(ffn_inp, "ffn_inp", il);
- ggml_allocr_alloc(lctx.alloc, inpL);
- if (!ggml_allocr_is_measure(lctx.alloc)) {
- memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
+ // feed forward
+ {
+ cur = llm_build_norm(ctx0, ffn_inp,
+ model.layers[il].ffn_norm,
+ NULL,
+ LLM_NORM, norm_eps, cb, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = llm_build_ffn(ctx0, cur,
+ model.layers[il].ffn_up, NULL,
+ NULL, NULL,
+ model.layers[il].ffn_down, NULL,
+ LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
+ cb(cur, "ffn_out", il);
}
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
}
- const int i_gpu_start = n_layer - n_gpu_layers;
- (void) i_gpu_start;
+ cur = inpL;
- // offload functions set the tensor output backend to GPU
- // tensors are GPU-accelerated if any input or the output has been offloaded
- offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
- offload_func_t offload_func_kq = llama_nop;
- offload_func_t offload_func_v = llama_nop;
+ cur = llm_build_norm(ctx0, cur,
+ model.output_norm,
+ NULL,
+ LLM_NORM, norm_eps, cb, -1);
+ cb(cur, "result_norm", -1);
-#ifdef GGML_USE_CUBLAS
- if (n_gpu_layers > n_layer) {
- offload_func_nr = ggml_cuda_assign_buffers_no_alloc;
+ cur = ggml_mul_mat(ctx0, model.output, cur);
+ cb(cur, "result_output", -1);
+
+ ggml_build_forward_expand(gf, cur);
+
+ ggml_free(ctx0);
+
+ return gf;
+}
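The per-architecture attention blocks deleted above are now funnelled through llm_build_kqv. The helper itself is outside this hunk, so the sketch below only restates the inline computation that was removed (the _sketch name and parameter list are illustrative, not the real signature):

static struct ggml_tensor * llm_build_kqv_sketch(
        struct ggml_context         * ctx0,
        const struct llama_kv_cache & kv_self,
        struct ggml_tensor          * q_cur,     // [n_embd_head, n_head, n_tokens]
        struct ggml_tensor          * wo,
        struct ggml_tensor          * wo_b,      // may be NULL
        struct ggml_tensor          * kq_scale,
        struct ggml_tensor          * kq_mask,
        int64_t n_ctx, int64_t n_embd_head, int64_t n_embd_gqa,
        int64_t n_head, int64_t n_head_kv,
        int32_t n_tokens, int32_t n_kv,
        float max_alibi_bias, int il) {
    struct ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3);

    // view of the cached keys for this layer
    struct ggml_tensor * k =
        ggml_view_3d(ctx0, kv_self.k,
                n_embd_head, n_kv, n_head_kv,
                ggml_element_size(kv_self.k)*n_embd_gqa,
                ggml_element_size(kv_self.k)*n_embd_head,
                ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);

    struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
    kq = ggml_scale(ctx0, kq, kq_scale);

    if (max_alibi_bias > 0.0f) {
        // ALiBi path (Refact, BLOOM, MPT); the original code carries a TODO to replace this with ggml_add
        kq = ggml_alibi(ctx0, kq, /*n_past*/ 0, n_head, max_alibi_bias);
    }

    kq = ggml_add(ctx0, kq, kq_mask);
    kq = ggml_soft_max(ctx0, kq);

    // split the cached V into n_head_kv heads
    struct ggml_tensor * v =
        ggml_view_3d(ctx0, kv_self.v,
                n_kv, n_embd_head, n_head_kv,
                ggml_element_size(kv_self.v)*n_ctx,
                ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
                ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);

    struct ggml_tensor * kqv        = ggml_mul_mat(ctx0, v, kq);
    struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);

    struct ggml_tensor * cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_head*n_head, n_tokens);

    // output projection (with optional bias, e.g. BLOOM)
    cur = ggml_mul_mat(ctx0, wo, cur);
    if (wo_b) {
        cur = ggml_add(ctx0, cur, wo_b);
    }

    return cur;
}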
+
+//
+// tensor offloading helpers
+//
+// TODO: will be removed with backend v2
+
+enum llm_offload_func_e {
+ OFFLOAD_FUNC_NOP,
+ OFFLOAD_FUNC,
+ OFFLOAD_FUNC_KQ,
+ OFFLOAD_FUNC_V,
+ OFFLOAD_FUNC_NR,
+ OFFLOAD_FUNC_EMB,
+ OFFLOAD_FUNC_OUT,
+};
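The gating switch further down gives these classes their meaning; summarizing it here (derived from that switch over n_gpu_layers and from the old per-tensor offload functions that were removed):

// OFFLOAD_FUNC      - per-layer tensors, offloaded once the layer index reaches i_gpu_start
// OFFLOAD_FUNC_KQ   - K/Q-side attention tensors, offloaded only when n_gpu_layers > n_layer + 2
// OFFLOAD_FUNC_V    - V-side attention tensors,   offloaded only when n_gpu_layers > n_layer + 1
// OFFLOAD_FUNC_NR   - non-repeating tensors (outside the layer loop), need n_gpu_layers > n_layer
// OFFLOAD_FUNC_EMB  - the final norm, offloaded only when embeddings are not being extracted
// OFFLOAD_FUNC_OUT, OFFLOAD_FUNC_NOP - always kept on the CPU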
+
+// TODO: will be removed with backend v2
+struct llm_offload_trie {
+ struct node {
+ ~node() {
+ for (int i = 0; i < 256; ++i) {
+ if (children[i]) {
+ delete children[i];
+ }
+ }
+ }
+
+ node * children[256] = { nullptr };
+ llm_offload_func_e func = OFFLOAD_FUNC_NOP;
+ };
+
+ llm_offload_trie() {
+ root = new node;
}
- if (n_gpu_layers > n_layer + 1) {
- offload_func_v = ggml_cuda_assign_buffers_no_alloc;
+
+ llm_offload_trie(const std::unordered_map<const char *, llm_offload_func_e> & map) {
+ root = new node;
+
+ for (const auto & kv : map) {
+ add(kv.first, kv.second);
+ }
}
- if (n_gpu_layers > n_layer + 2) {
- offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
+
+ ~llm_offload_trie() {
+ delete root;
}
-#endif // GGML_USE_CUBLAS
- // KQ_scale
- struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
- ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
- ggml_allocr_alloc(lctx.alloc, KQ_scale);
- if (!ggml_allocr_is_measure(lctx.alloc)) {
- ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
+ void add(const char * name, llm_offload_func_e func) {
+ node * cur = root;
+
+ for (int i = 0; ; ++i) {
+ const uint8_t c = name[i];
+
+ if (!c) {
+ break;
+ }
+
+ if (!cur->children[c]) {
+ cur->children[c] = new node;
+ }
+
+ cur = cur->children[c];
+ }
+
+ cur->func = func;
}
- // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
- struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
- offload_func_kq(KQ_mask);
- ggml_set_name(KQ_mask, "KQ_mask");
- ggml_allocr_alloc(lctx.alloc, KQ_mask);
- if (!ggml_allocr_is_measure(lctx.alloc)) {
- float * data = (float *) KQ_mask->data;
- memset(data, 0, ggml_nbytes(KQ_mask));
-
- for (int h = 0; h < 1; ++h) {
- for (int j = 0; j < n_tokens; ++j) {
- const llama_pos pos = batch.pos[j];
- const llama_seq_id seq_id = batch.seq_id[j][0];
-
- for (int i = 0; i < n_kv; ++i) {
- if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
- data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
- }
- }
+ llm_offload_func_e find(const char * name) const {
+ const node * cur = root;
+
+ for (int i = 0; ; ++i) {
+ const uint8_t c = name[i];
+
+ if (!c) {
+ break;
}
+
+ if (!cur->children[c]) {
+ return OFFLOAD_FUNC_NOP;
+ }
+
+ cur = cur->children[c];
}
+
+ return cur->func;
}
- for (int il = 0; il < n_layer; ++il) {
- struct ggml_tensor * attn_norm;
+ node * root = nullptr;
+};
- offload_func_t offload_func = llama_nop;
+// TODO: will be removed with backend v2
+static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map = {
+ //{ "inp_tokens", OFFLOAD_FUNC_NR }, // TODO: missing K-quants get_rows kernel
+ //{ "inp_embd", OFFLOAD_FUNC_NR }, // TODO: missing K-quants get_rows kernel
+ { "pos_embd", OFFLOAD_FUNC_NR },
+
+ { "inp_pos", OFFLOAD_FUNC_KQ }, // this is often used for KQ ops (e.g. rope)
+ { "KQ_scale", OFFLOAD_FUNC_KQ },
+ { "KQ_mask", OFFLOAD_FUNC_KQ },
+ { "K_shift", OFFLOAD_FUNC_KQ },
+ { "K_shifted", OFFLOAD_FUNC_KQ },
+
+ { "inp_norm", OFFLOAD_FUNC_NR },
+ { "inp_norm_w", OFFLOAD_FUNC_NR },
+ { "inp_norm_wb", OFFLOAD_FUNC_NR },
+
+ { "norm", OFFLOAD_FUNC },
+ { "norm_w", OFFLOAD_FUNC },
+ { "norm_wb", OFFLOAD_FUNC },
+
+ { "attn_norm", OFFLOAD_FUNC },
+ { "attn_norm_2", OFFLOAD_FUNC },
+
+ { "wqkv", OFFLOAD_FUNC_KQ },
+ { "bqkv", OFFLOAD_FUNC_KQ },
+ { "wqkv_clamped", OFFLOAD_FUNC_KQ },
+
+ { "tmpk", OFFLOAD_FUNC_KQ },
+ { "tmpq", OFFLOAD_FUNC_KQ },
+ { "tmpv", OFFLOAD_FUNC_V },
+ { "Kcur", OFFLOAD_FUNC_KQ },
+ { "Qcur", OFFLOAD_FUNC_KQ },
+ { "Vcur", OFFLOAD_FUNC_V },
+
+ { "krot", OFFLOAD_FUNC_KQ },
+ { "qrot", OFFLOAD_FUNC_KQ },
+ { "kpass", OFFLOAD_FUNC_KQ },
+ { "qpass", OFFLOAD_FUNC_KQ },
+ { "krotated", OFFLOAD_FUNC_KQ },
+ { "qrotated", OFFLOAD_FUNC_KQ },
+
+ { "q", OFFLOAD_FUNC_KQ },
+ { "k", OFFLOAD_FUNC_KQ },
+ { "kq", OFFLOAD_FUNC_KQ },
+ { "kq_scaled", OFFLOAD_FUNC_KQ },
+ { "kq_scaled_alibi", OFFLOAD_FUNC_KQ },
+ { "kq_masked", OFFLOAD_FUNC_KQ },
+ { "kq_soft_max", OFFLOAD_FUNC_V },
+ { "v", OFFLOAD_FUNC_V },
+ { "kqv", OFFLOAD_FUNC_V },
+ { "kqv_merged", OFFLOAD_FUNC_V },
+ { "kqv_merged_cont", OFFLOAD_FUNC_V },
+ { "kqv_wo", OFFLOAD_FUNC_V },
+ { "kqv_out", OFFLOAD_FUNC_V },
+
+ { "ffn_inp", OFFLOAD_FUNC },
+ { "ffn_norm", OFFLOAD_FUNC },
+
+ { "ffn_up", OFFLOAD_FUNC },
+ { "ffn_up_b", OFFLOAD_FUNC },
+ { "ffn_gate", OFFLOAD_FUNC },
+ { "ffn_gate_b", OFFLOAD_FUNC },
+ { "ffn_gate_par", OFFLOAD_FUNC },
+ { "ffn_down", OFFLOAD_FUNC },
+ { "ffn_down_b", OFFLOAD_FUNC },
+ { "ffn_out", OFFLOAD_FUNC },
+
+ { "ffn_silu", OFFLOAD_FUNC },
+ { "ffn_gelu", OFFLOAD_FUNC },
+ { "ffn_relu", OFFLOAD_FUNC },
+ { "ffn_sqr(relu)", OFFLOAD_FUNC },
+
+ { "l_out", OFFLOAD_FUNC },
+
+ { "result_norm", OFFLOAD_FUNC_EMB },
+ { "result_output", OFFLOAD_FUNC_OUT },
+};
+
+static llm_offload_trie k_offload_func_trie(k_offload_map);
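A short usage sketch of the lookup path (the f0..f2 locals are illustrative): the trie is queried with the base name passed to the callback, not with the "-il"-suffixed name written into the tensor by ggml_format_name.

llm_offload_func_e f0 = k_offload_func_trie.find("KQ_mask");    // -> OFFLOAD_FUNC_KQ
llm_offload_func_e f1 = k_offload_func_trie.find("ffn_out");    // -> OFFLOAD_FUNC
llm_offload_func_e f2 = k_offload_func_trie.find("inp_tokens"); // -> OFFLOAD_FUNC_NOP (entry is commented out above)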
+
+static struct ggml_cgraph * llama_build_graph(
+ llama_context & lctx,
+ const llama_batch & batch) {
+ const auto & model = lctx.model;
+
+ // check if we should build the worst-case graph (for memory measurement)
+ const bool worst_case = ggml_allocr_is_measure(lctx.alloc);
+
+    // keep track of the inputs that have already been allocated
+ bool alloc_inp_tokens = false;
+ bool alloc_inp_embd = false;
+ bool alloc_inp_pos = false;
+ bool alloc_inp_KQ_scale = false;
+ bool alloc_inp_KQ_mask = false;
+ bool alloc_inp_K_shift = false;
#ifdef GGML_USE_CUBLAS
- if (il >= i_gpu_start) {
- offload_func = ggml_cuda_assign_buffers_no_alloc;
+ const bool do_offload = true;
+#else
+ const bool do_offload = true; // TODO: set to false after finishing refactoring
+#endif
+
+ int n_non_view = 0; // number of non-view tensors that have been processed by the callback
+
+ // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
+ // TODO: will be removed with backend v2
+ llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
+ if (il >= 0) {
+ ggml_format_name(cur, "%s-%d", name, il);
+ } else {
+ ggml_set_name(cur, name);
}
-#endif // GGML_USE_CUBLAS
- // self-attention
- // TODO: refactor into common function (shared with LLaMA)
- {
- attn_norm = ggml_norm(ctx0, inpL, norm_eps);
- offload_func(attn_norm);
+ //
+ // allocate input tensors and set input data
+ //
+ // TODO: will be removed with backend v2
- attn_norm = ggml_mul(ctx0, attn_norm, model.layers[il].attn_norm);
- offload_func(attn_norm);
+ if (!alloc_inp_tokens && strcmp(name, "inp_tokens") == 0) {
+ ggml_allocr_alloc(lctx.alloc, cur);
- if (1) {
- cur = attn_norm;
+ if (!ggml_allocr_is_measure(lctx.alloc) && batch.token) {
+ const int64_t n_tokens = cur->ne[0];
+
+ memcpy(cur->data, batch.token, n_tokens*ggml_element_size(cur));
}
- // compute QKV
+ alloc_inp_tokens = true;
+ }
- cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
- offload_func_kq(cur);
+ if (!alloc_inp_embd && strcmp(name, "inp_embd") == 0) {
+ ggml_allocr_alloc(lctx.alloc, cur);
- if (clamp_kqv > 0.0f) {
- cur = ggml_clamp(ctx0, cur, -clamp_kqv, clamp_kqv);
- offload_func_kq(cur);
+ if (!ggml_allocr_is_measure(lctx.alloc) && batch.embd) {
+ const int64_t n_embd = cur->ne[0];
+ const int64_t n_tokens = cur->ne[1];
+
+ memcpy(cur->data, batch.embd, n_tokens*n_embd*ggml_element_size(cur));
}
- const size_t wsize = ggml_type_size(cur->type);
+ alloc_inp_embd = true;
+ }
- struct ggml_tensor * Qcur = ggml_view_3d(
- ctx0, cur, n_embd_head, n_head, n_tokens,
- wsize * n_embd_head,
- wsize * n_embd_head * (n_head + 2 * n_head_kv),
- 0);
- offload_func_kq(Qcur);
+ if (!alloc_inp_pos && strcmp(name, "inp_pos") == 0) {
+ ggml_allocr_alloc(lctx.alloc, cur);
- struct ggml_tensor * Kcur = ggml_view_3d(
- ctx0, cur, n_embd_head, n_head_kv, n_tokens,
- wsize * n_embd_head,
- wsize * n_embd_head * (n_head + 2 * n_head_kv),
- wsize * n_embd_head * n_head);
- offload_func_kq(Kcur);
+ if (!ggml_allocr_is_measure(lctx.alloc) && batch.pos) {
+ const int64_t n_tokens = cur->ne[0];
- struct ggml_tensor * tmpv = ggml_view_3d(
- ctx0, cur, n_embd_head, n_head_kv, n_tokens,
- wsize * n_embd_head,
- wsize * n_embd_head * (n_head + 2 * n_head_kv),
- wsize * n_embd_head * (n_head + n_head_kv));
- offload_func_kq(Kcur);
+ int32_t * data = (int32_t *) cur->data;
- ggml_set_name(Qcur, "Qcur");
- ggml_set_name(Kcur, "Kcur");
+ for (int i = 0; i < n_tokens; ++i) {
+ data[i] = batch.pos[i];
+ }
+ }
- {
- struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
- offload_func_v(Vcur);
- offload_func_v(Vcur->src[0]->src[0]);
- ggml_set_name(Vcur, "Vcur");
-
- struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
- offload_func_kq(k);
- ggml_set_name(k, "k");
-
- struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
- ( n_ctx)*ggml_element_size(kv_self.v),
- (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
- offload_func_v(v);
-
- ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
- ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
+ alloc_inp_pos = true;
+ }
+
+ if (!alloc_inp_KQ_scale && strcmp(name, "KQ_scale") == 0) {
+ ggml_allocr_alloc(lctx.alloc, cur);
+
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
+ const int64_t n_embd_head = model.hparams.n_embd_head();
+ ggml_set_f32(cur, 1.0f/sqrtf(float(n_embd_head)));
}
- struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
- offload_func_kq(Q);
- ggml_set_name(Q, "Q");
+ alloc_inp_KQ_scale = true;
+ }
- struct ggml_tensor * K =
- ggml_view_3d(ctx0, kv_self.k,
- n_embd_head, n_kv, n_head_kv,
- ggml_element_size(kv_self.k)*n_embd_gqa,
- ggml_element_size(kv_self.k)*n_embd_head,
- ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
- offload_func_kq(K);
- ggml_set_name(K, "K");
-
- struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
- offload_func_kq(KQ);
- ggml_set_name(KQ, "KQ");
-
- struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
- offload_func_kq(KQ_scaled);
- ggml_set_name(KQ_scaled, "KQ_scaled");
-
- // TODO: replace with ggml_add()
- struct ggml_tensor * KQ_scaled_alibi =
- ggml_alibi(ctx0, KQ_scaled, 0, n_head, max_alibi_bias);
- offload_func_kq(KQ_scaled_alibi);
- ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
-
- struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
- offload_func_kq(KQ_masked);
- ggml_set_name(KQ_masked, "KQ_masked");
-
- struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
- offload_func_v(KQ_soft_max);
- ggml_set_name(KQ_soft_max, "KQ_soft_max");
-
- struct ggml_tensor * V =
- ggml_view_3d(ctx0, kv_self.v,
- n_kv, n_embd_head, n_head_kv,
- ggml_element_size(kv_self.v)*n_ctx,
- ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
- ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
- offload_func_v(V);
- ggml_set_name(V, "V");
-
- struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
- offload_func_v(KQV);
- ggml_set_name(KQV, "KQV");
-
- struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
- offload_func_v(KQV_merged);
- ggml_set_name(KQV_merged, "KQV_merged");
-
- cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
- offload_func_v(cur);
- ggml_set_name(cur, "KQV_merged_contiguous");
-
- cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
- offload_func(cur);
- ggml_set_name(cur, "result_wo");
+ if (!alloc_inp_KQ_mask && strcmp(name, "KQ_mask") == 0) {
+ ggml_allocr_alloc(lctx.alloc, cur);
+
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
+ const int64_t n_kv = cur->ne[0];
+ const int64_t n_tokens = cur->ne[1];
+
+ float * data = (float *) cur->data;
+ memset(data, 0, ggml_nbytes(cur));
+
+ for (int h = 0; h < 1; ++h) {
+ for (int j = 0; j < n_tokens; ++j) {
+ const llama_pos pos = batch.pos[j];
+ const llama_seq_id seq_id = batch.seq_id[j][0];
+
+ for (int i = 0; i < n_kv; ++i) {
+ if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
+ }
+ }
+ }
+ }
+ }
+
+ alloc_inp_KQ_mask = true;
}
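A small worked example of the mask this block fills in (numbers chosen for illustration): with n_kv = 4, n_tokens = 2, a single sequence occupying cells 0..3 at positions 0..3, and the batch holding the tokens at positions 2 and 3, the two rows come out as:

// j = 0 (query pos 2):     0      0      0   -INF     // cell 3 holds pos 3 > 2
// j = 1 (query pos 3):     0      0      0      0
//
// cells belonging to a different sequence would be set to -INFINITY in both rows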
- // Add the input
- cur = ggml_add(ctx0, cur, inpL);
- offload_func(cur);
+ if (!alloc_inp_K_shift && strcmp(name, "K_shift") == 0) {
+ ggml_allocr_alloc(lctx.alloc, cur);
- struct ggml_tensor * attn_out = cur;
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
+ const int64_t n_ctx = cur->ne[0];
- // feed forward
- {
- // Norm
- {
- cur = ggml_norm(ctx0, attn_out, norm_eps);
- offload_func(cur);
+ int32_t * data = (int32_t *) cur->data;
- cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
- offload_func(cur);
+ for (int i = 0; i < n_ctx; ++i) {
+ data[i] = lctx.kv_self.cells[i].delta;
+ }
}
- cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur);
- offload_func(cur);
+ alloc_inp_K_shift = true;
+ }
- cur = ggml_gelu(ctx0, cur);
- offload_func(cur);
- cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur);
- offload_func(cur);
+ // view tensors are not processed further
+ if (cur->view_src != nullptr) {
+ return;
}
- cur = ggml_add(ctx0, cur, attn_out);
- offload_func(cur);
- // input for next layer
- inpL = cur;
- }
+ if (cur->op != GGML_OP_NONE) {
+ n_non_view++;
+ }
- cur = inpL;
+ //
+ // offload layers
+ //
+ // TODO: will be removed with backend v2
- // norm
- {
- cur = ggml_norm(ctx0, cur, norm_eps);
- offload_func_nr(cur);
+//#define LLAMA_OFFLOAD_DEBUG
- cur = ggml_mul(ctx0, cur, model.output_norm);
- ggml_set_name(cur, "result_norm");
- }
+ if (!do_offload) {
+ return;
+ }
- cur = ggml_mul_mat(ctx0, model.output, cur);
- ggml_set_name(cur, "result_output");
+ const int n_layer = model.hparams.n_layer;
- ggml_build_forward_expand(gf, cur);
+ const int n_gpu_layers = model.n_gpu_layers;
+ const int i_gpu_start = n_layer - n_gpu_layers;
- ggml_free(ctx0);
+ // should we offload the final norm? yes if we are not computing embeddings
+ const bool offload_emb = lctx.embedding.empty();
- return gf;
-}
+ static const std::unordered_map<llm_offload_func_e, std::string, std::hash<int>> k_offload_func_name = {
+ { OFFLOAD_FUNC_NOP, "CPU" },
+ { OFFLOAD_FUNC_OUT, "CPU" },
+#ifdef GGML_USE_CUBLAS
+ { OFFLOAD_FUNC, "GPU (CUDA)" },
+ { OFFLOAD_FUNC_KQ, "GPU (CUDA) KQ" },
+ { OFFLOAD_FUNC_V, "GPU (CUDA) V" },
+ { OFFLOAD_FUNC_NR, "GPU (CUDA) NR" },
+ { OFFLOAD_FUNC_EMB, "GPU (CUDA) EMB" },
+#else
+ { OFFLOAD_FUNC, "CPU" },
+ { OFFLOAD_FUNC_KQ, "CPU" },
+ { OFFLOAD_FUNC_V, "CPU" },
+ { OFFLOAD_FUNC_NR, "CPU" },
+ { OFFLOAD_FUNC_EMB, "CPU" },
+#endif // GGML_USE_CUBLAS
+ };
-static struct ggml_cgraph * llama_build_graph(
- llama_context & lctx,
- const llama_batch & batch) {
- const auto & model = lctx.model;
+ // check the global map for what offload function to use for this tensor
+ llm_offload_func_e func_e = k_offload_func_trie.find(name);
+
+ if (func_e == OFFLOAD_FUNC_NOP) {
+#ifdef LLAMA_OFFLOAD_DEBUG
+ // if a tensor hasn't been offloaded, we warn the user
+ if (worst_case) {
+ LLAMA_LOG_WARN("%s: %32s: not offloaded (ref: %s)\n", __func__,
+ cur->name, "https://github.com/ggerganov/llama.cpp/pull/3837");
+ }
+#endif
+
+ return;
+ }
+
+        // respect the provided n_gpu_layers: demote the offload class to NOP when not enough layers are offloaded
+ switch (func_e) {
+ case OFFLOAD_FUNC_NOP:
+ case OFFLOAD_FUNC_OUT:
+ break;
+ case OFFLOAD_FUNC:
+ if (n_gpu_layers < n_layer) {
+ if (il < i_gpu_start) {
+ func_e = OFFLOAD_FUNC_NOP;
+ }
+ }
+ break;
+ case OFFLOAD_FUNC_NR:
+ if (n_gpu_layers <= n_layer + 0) {
+ func_e = OFFLOAD_FUNC_NOP;
+ }
+ break;
+ case OFFLOAD_FUNC_V:
+ if (n_gpu_layers <= n_layer + 1) {
+ func_e = OFFLOAD_FUNC_NOP;
+ }
+ break;
+ case OFFLOAD_FUNC_KQ:
+ if (n_gpu_layers <= n_layer + 2) {
+ func_e = OFFLOAD_FUNC_NOP;
+ }
+ break;
+ case OFFLOAD_FUNC_EMB:
+ if (!offload_emb || n_gpu_layers < n_layer) {
+ func_e = OFFLOAD_FUNC_NOP;
+ }
+ break;
+ default: GGML_ASSERT(false);
+ }
+
+ offload_func_t func = ggml_offload_nop;
+
+ // this is needed for compatibility with Metal for example
+#ifdef GGML_USE_CUBLAS
+ static offload_func_t ggml_offload_gpu = ggml_cuda_assign_buffers_no_alloc;
+#else
+ static offload_func_t ggml_offload_gpu = ggml_offload_nop;
+#endif
+
+ switch (func_e) {
+ case OFFLOAD_FUNC_NOP:
+ case OFFLOAD_FUNC_OUT: func = ggml_offload_nop; break;
+ case OFFLOAD_FUNC:
+ case OFFLOAD_FUNC_KQ:
+ case OFFLOAD_FUNC_V:
+ case OFFLOAD_FUNC_NR:
+ case OFFLOAD_FUNC_EMB: func = ggml_offload_gpu; break;
+ default: GGML_ASSERT(false);
+ }
+
+ // apply offload function to the tensor
+ func(cur);
+
+#ifdef LLAMA_OFFLOAD_DEBUG
+ if (worst_case) {
+ LLAMA_LOG_INFO("%s: %32s: %s\n", __func__, cur->name, k_offload_func_name.at(func_e).c_str());
+ }
+#endif
+ };
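To tie the pieces together, an illustrative trace of a single callback invocation from the MPT builder above (there Kcur is the result of a ggml_cont, so it is not a view), assuming a CUDA build with n_gpu_layers > n_layer + 2:

// cb(Kcur, "Kcur", il):
//
//   ggml_format_name(Kcur, "%s-%d", "Kcur", il);   // tensor is now named "Kcur-<il>"
//   // "Kcur" is none of the inp_*/KQ_*/K_shift inputs -> no ggml_allocr_alloc here
//   // view_src == nullptr and op != GGML_OP_NONE      -> n_non_view++
//   k_offload_func_trie.find("Kcur") == OFFLOAD_FUNC_KQ;
//   // n_gpu_layers > n_layer + 2, so the class is not demoted to OFFLOAD_FUNC_NOP
//   func = ggml_cuda_assign_buffers_no_alloc;
//   func(Kcur);                                     // tensor output is assigned to the GPU backend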
struct ggml_cgraph * result = NULL;
switch (model.arch) {
case LLM_ARCH_LLAMA:
{
- result = llm_build_llama(lctx, batch);
+ result = llm_build_llama(lctx, batch, cb, worst_case);
} break;
case LLM_ARCH_BAICHUAN:
{
- result = llm_build_baichaun(lctx, batch);
+ result = llm_build_baichaun(lctx, batch, cb, worst_case);
} break;
case LLM_ARCH_FALCON:
{
- result = llm_build_falcon(lctx, batch);
+ result = llm_build_falcon(lctx, batch, cb, worst_case);
} break;
case LLM_ARCH_STARCODER:
{
- result = llm_build_starcoder(lctx, batch);
+ result = llm_build_starcoder(lctx, batch, cb, worst_case);
} break;
case LLM_ARCH_PERSIMMON:
{
- result = llm_build_persimmon(lctx, batch);
+ result = llm_build_persimmon(lctx, batch, cb, worst_case);
} break;
case LLM_ARCH_REFACT:
{
- result = llm_build_refact(lctx, batch);
+ result = llm_build_refact(lctx, batch, cb, worst_case);
} break;
case LLM_ARCH_BLOOM:
{
- result = llm_build_bloom(lctx, batch);
+ result = llm_build_bloom(lctx, batch, cb, worst_case);
} break;
case LLM_ARCH_MPT:
{
- result = llm_build_mpt(lctx, batch);
+ result = llm_build_mpt(lctx, batch, cb, worst_case);
} break;
default:
GGML_ASSERT(false);
}
+ if (worst_case) {
+ int n_non_view_total = 0;
+
+ for (int i = 0; i < result->n_nodes; ++i) {
+ if (result->nodes[i]->view_src == nullptr) {
+ n_non_view_total++;
+ }
+ }
+
+ LLAMA_LOG_INFO("%s: non-view tensors processed: %d/%d\n", __func__, n_non_view, n_non_view_total);
+
+ if (n_non_view != n_non_view_total) {
+ LLAMA_LOG_WARN("%s: ****************************************************************\n", __func__);
+ LLAMA_LOG_WARN("%s: not all non-view tensors have been processed with a callback\n", __func__);
+ LLAMA_LOG_WARN("%s: this can indicate an inefficiency in the graph implementation\n", __func__);
+ LLAMA_LOG_WARN("%s: build with LLAMA_OFFLOAD_DEBUG for more info\n", __func__);
+ LLAMA_LOG_WARN("%s: ref: https://github.com/ggerganov/llama.cpp/pull/3837\n", __func__);
+ LLAMA_LOG_WARN("%s: ****************************************************************\n", __func__);
+ }
+ }
+
return result;
}
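For context, a hedged sketch of how the rebuilt graph is consumed by llama_decode_internal (only part of that function is visible in this diff, so treat the exact call sequence as a paraphrase):

// during context setup lctx.alloc is a measuring allocator, so worst_case == true
// and the graph is built for the largest possible n_kv (= n_ctx);
// on every decode call the real graph is rebuilt and allocated:
ggml_allocr_reset(lctx.alloc);

struct ggml_cgraph * gf = llama_build_graph(lctx, batch);

ggml_allocr_alloc_graph(lctx.alloc, gf);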
@@ -6043,11 +5322,13 @@ static int llama_decode_internal(
}
// If all tensors can be run on the GPU then using more than 1 thread is detrimental.
- const bool full_offload_supported = model.arch == LLM_ARCH_LLAMA ||
+ const bool full_offload_supported =
+ model.arch == LLM_ARCH_LLAMA ||
model.arch == LLM_ARCH_BAICHUAN ||
- model.arch == LLM_ARCH_FALCON ||
- model.arch == LLM_ARCH_REFACT ||
+ model.arch == LLM_ARCH_FALCON ||
+ model.arch == LLM_ARCH_REFACT ||
model.arch == LLM_ARCH_MPT;
+
const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
n_threads = 1;
@@ -6102,6 +5383,8 @@ static int llama_decode_internal(
//}
// extract logits
+ // TODO: do not compute and extract logits if only embeddings are needed
+ // need to update the graphs to skip "result_output"
{
auto & logits_out = lctx.logits;
@@ -8713,8 +7996,8 @@ static int llama_apply_lora_from_file_internal(
ggml_tensor * dest_t = model_tensors[base_name];
- offload_func_t offload_func = llama_nop;
- offload_func_t offload_func_force_inplace = llama_nop;
+ offload_func_t offload_func = ggml_offload_nop;
+ offload_func_t offload_func_force_inplace = ggml_offload_nop;
#ifdef GGML_USE_CUBLAS
if (dest_t->backend == GGML_BACKEND_GPU || dest_t->backend == GGML_BACKEND_GPU_SPLIT) {