diff options
author | Georgi Gerganov <ggerganov@gmail.com> | 2023-12-21 23:20:49 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-12-21 23:20:49 +0200 |
commit | afefa319f1f59b002dfa0d1ef407a2c74bd9770b (patch) | |
tree | a6923e0a6214293d88957cd11e25943f2c0fb80a /examples | |
parent | 769a7bc85eaa44e3d7eadf39abfeff7bb0b9cc2f (diff) |
ggml : change ggml_scale to take a float instead of tensor (#4573)
* ggml : change ggml_scale to take a float instead of tensor
* ggml : fix CPU implementation
* tests : fix test-grad0
ggml-ci
Diffstat (limited to 'examples')
-rw-r--r-- | examples/baby-llama/baby-llama.cpp | 15 | ||||
-rw-r--r-- | examples/export-lora/export-lora.cpp | 2 | ||||
-rw-r--r-- | examples/finetune/finetune.cpp | 42 | ||||
-rw-r--r-- | examples/llava/clip.cpp | 8 | ||||
-rw-r--r-- | examples/train-text-from-scratch/train-text-from-scratch.cpp | 14 |
5 files changed, 30 insertions, 51 deletions
diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp index 2dc2988d..e7d2ad59 100644 --- a/examples/baby-llama/baby-llama.cpp +++ b/examples/baby-llama/baby-llama.cpp @@ -575,10 +575,7 @@ static struct ggml_tensor * forward( // KQ_scaled = KQ / sqrt(n_embd/n_head) // KQ_scaled shape [n_past + N, N, n_head, 1] - struct ggml_tensor * KQ_scaled = - ggml_scale(ctx0, - KQ, - ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head))); + struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, 1.0f/sqrtf(float(n_embd)/n_head)); // KQ_masked = mask_past(KQ_scaled) // KQ_masked shape [n_past + N, N, n_head, 1] @@ -844,10 +841,7 @@ static struct ggml_tensor * forward_batch( // KQ_scaled = KQ / sqrt(n_embd/n_head) // KQ_scaled shape [n_past + N, N, n_head, n_batch] - struct ggml_tensor * KQ_scaled = - ggml_scale(ctx0, - KQ, - ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head))); + struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, 1.0f/sqrtf(float(n_embd)/n_head)); assert_shape_4d(KQ_scaled, n_past + N, N, n_head, n_batch); // KQ_masked = mask_past(KQ_scaled) @@ -1131,10 +1125,7 @@ static struct ggml_tensor * forward_lora( // KQ_scaled = KQ / sqrt(n_embd/n_head) // KQ_scaled shape [n_past + N, N, n_head, 1] - struct ggml_tensor * KQ_scaled = - ggml_scale(ctx0, - KQ, - ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head))); + struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, 1.0f/sqrtf(float(n_embd)/n_head)); // KQ_masked = mask_past(KQ_scaled) // KQ_masked shape [n_past + N, N, n_head, 1] diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp index c8754ce7..58fbe204 100644 --- a/examples/export-lora/export-lora.cpp +++ b/examples/export-lora/export-lora.cpp @@ -309,7 +309,7 @@ static struct ggml_cgraph * build_graph_lora( ) { struct ggml_tensor * ab = ggml_mul_mat(ctx, lora_a, lora_b); if (scaling != 1.0f) { - ab = ggml_scale(ctx, ab, ggml_new_f32(ctx, scaling)); + ab = ggml_scale(ctx, ab, scaling); } struct ggml_tensor * res = ggml_add_inplace(ctx, tensor, ab); diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index 6a668d76..7b1333a9 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -269,7 +269,7 @@ static void load_model_hparams_gguf(struct gguf_context * ctx, struct my_llama_h float rope_freq_scale = 1.0f; GGUF_GET_KEY(ctx, hparams->f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS)); GGUF_GET_KEY(ctx, hparams->rope_freq_base, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE)); - GGUF_GET_KEY(ctx, rope_freq_scale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR)); + GGUF_GET_KEY(ctx, rope_freq_scale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR)); if (rope_freq_scale != 1.0f) { hparams->rope_freq_scale = 1.0f / rope_freq_scale; } @@ -612,6 +612,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs( const int n_rot = hparams.n_embd_head(); const int n_embd_head = hparams.n_embd_head(); const int n_embd_gqa = hparams.n_embd_gqa(); + const float rms_norm_eps = hparams.f_norm_rms_eps; const float rope_freq_base = hparams.rope_freq_base; const float rope_freq_scale = hparams.rope_freq_scale; @@ -680,10 +681,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs( checkpoints.push_back(t01); } - struct ggml_tensor * kv_scale = NULL; - if (!enable_flash_attn) { - kv_scale = ggml_new_f32(ctx, 1.0f/sqrtf(float(n_embd)/n_head)); - } + const float kv_scale = 1.0f/sqrtf(float(n_embd)/n_head); for (int il = 0; il < n_layer; ++il) { struct my_llama_layer & layer = model->layers[il]; @@ -781,32 +779,32 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs( // make sure some tensors are not reallocated by inserting new temporary nodes depending on them int n_leafs_before = gb->n_leafs; int n_nodes_before = gb->n_nodes; - struct ggml_tensor * one = ggml_new_f32(ctx, 1.0f); + // output tensors - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t35, one)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36, one)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t35, 1.0f)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36, 1.0f)); // input gradient - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, one)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, 1.0f)); GGML_ASSERT(t36->grad->data == NULL && t36->grad->view_src == NULL); ggml_allocr_alloc(alloc, t36->grad); // KQ_pos - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, one)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, 1.0f)); // make sure base model tensors data cannot be used in viewable operations - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->tok_embeddings, one)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->norm, one)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->output, one)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->tok_embeddings, 1.0f)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->norm, 1.0f)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->output, 1.0f)); for (int il = 0; il < n_layer; ++il) { struct my_llama_layer & layer = model->layers[il]; - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.attention_norm, one)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_norm, one)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wq, one)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wk, one)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wv, one)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wo, one)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w1, one)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w2, one)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w3, one)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.attention_norm, 1.0f)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_norm, 1.0f)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wq, 1.0f)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wk, 1.0f)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wv, 1.0f)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wo, 1.0f)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w1, 1.0f)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w2, 1.0f)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w3, 1.0f)); } // allocating checkpoints in one block to reduce memory fragmentation diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 11246596..f06ec400 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -330,12 +330,6 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima ggml_repeat(ctx0, model.pre_ln_b, embeddings)); } - struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - ggml_allocr_alloc(ctx->alloc, KQ_scale); - if (!ggml_allocr_is_measure(ctx->alloc)) { - ggml_set_f32(KQ_scale, 1.0f / sqrt((float)d_head)); - } - // loop over layers for (int il = 0; il < n_layer - 1; il++) { struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states @@ -356,7 +350,7 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima struct ggml_tensor * Q = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].q_b, cur), ggml_mul_mat(ctx0, model.layers[il].q_w, cur)); - Q = ggml_scale_inplace(ctx0, Q, KQ_scale); + Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head)); Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, batch_size); Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3)); Q = ggml_reshape_3d(ctx0, Q, d_head, num_positions, n_head * batch_size); diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index f7ed6336..4a9a2340 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -369,10 +369,7 @@ static struct ggml_tensor * llama_build_train_graphs( checkpoints.push_back(t00); checkpoints.push_back(t01); - struct ggml_tensor * kv_scale = NULL; - if (!enable_flash_attn) { - kv_scale = ggml_new_f32(ctx, 1.0f/sqrtf(float(n_embd)/n_head)); - } + const float kv_scale = 1.0f/sqrtf(float(n_embd)/n_head); for (int il = 0; il < n_layer; ++il) { struct my_llama_layer & layer = model->layers[il]; @@ -444,14 +441,13 @@ static struct ggml_tensor * llama_build_train_graphs( // make sure some tensors are not reallocated by inserting new temporary nodes depending on them int n_leafs_before = gb->n_leafs; int n_nodes_before = gb->n_nodes; - struct ggml_tensor * one = ggml_new_f32(ctx, 1.0f); // output tensors - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t35, one)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36, one)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t35, 1.0f)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36, 1.0f)); // input gradient - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, one)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, 1.0f)); // KQ_pos - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, one)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, 1.0f)); GGML_ASSERT(t36->grad->data == NULL && t36->grad->view_src == NULL); ggml_allocr_alloc(alloc, t36->grad); |