summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Andrew Godfrey <AndrewGodfrey@users.noreply.github.com> 2023-11-17 02:23:11 -0800
committer GitHub <noreply@github.com> 2023-11-17 11:23:11 +0100
commit 947f64f1630bb8b0b363a3bb5e29e11425312d57 (patch)
tree 9d8e8ecea7cdf6bd3b6cb192900f82dec82fb825
parent b83e149ec6264d078e6a47412e7347bf5c2bfcc9 (diff)
finetune : zero the loraB initial vectors (#4082)
* finetune : zero the loraB initial vectors

  Without this, the first iteration starts out far from the base model, instead of exactly on it. Zeroing loraB is what the LoRA paper recommends. loralib also zeroes at least one of the init vector pairs (though, in some cases, it departs from the paper by using a different distribution for the other vector).

* tabs to spaces

* Use ggml_set_zero instead of adding a new function
-rw-r--r-- examples/finetune/finetune.cpp 24
1 files changed, 12 insertions, 12 deletions
diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp
index 5a6cf22c..7fecce25 100644
--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@@ -548,35 +548,35 @@ static void randomize_lora(struct my_llama_lora * lora, int seed, float mean, fl
struct random_normal_distribution * rnd = init_random_normal_distribution(seed, mean, std, min, max);
randomize_tensor_normal(lora->tok_embeddings_a, rnd);
- randomize_tensor_normal(lora->tok_embeddings_b, rnd);
+ ggml_set_zero(lora->tok_embeddings_b);
randomize_tensor_normal(lora->norm_a, rnd);
- randomize_tensor_normal(lora->norm_b, rnd);
+ ggml_set_zero(lora->norm_b);
randomize_tensor_normal(lora->output_a, rnd);
- randomize_tensor_normal(lora->output_b, rnd);
+ ggml_set_zero(lora->output_b);
for (uint32_t i = 0; i < n_layer; ++i) {
auto & layer = lora->layers[i];
randomize_tensor_normal(layer.attention_norm_a, rnd);
- randomize_tensor_normal(layer.attention_norm_b, rnd);
+ ggml_set_zero(layer.attention_norm_b);
randomize_tensor_normal(layer.wq_a, rnd);
- randomize_tensor_normal(layer.wq_b, rnd);
+ ggml_set_zero(layer.wq_b);
randomize_tensor_normal(layer.wk_a, rnd);
- randomize_tensor_normal(layer.wk_b, rnd);
+ ggml_set_zero(layer.wk_b);
randomize_tensor_normal(layer.wv_a, rnd);
- randomize_tensor_normal(layer.wv_b, rnd);
+ ggml_set_zero(layer.wv_b);
randomize_tensor_normal(layer.wo_a, rnd);
- randomize_tensor_normal(layer.wo_b, rnd);
+ ggml_set_zero(layer.wo_b);
randomize_tensor_normal(layer.ffn_norm_a, rnd);
- randomize_tensor_normal(layer.ffn_norm_b, rnd);
+ ggml_set_zero(layer.ffn_norm_b);
randomize_tensor_normal(layer.w1_a, rnd);
- randomize_tensor_normal(layer.w1_b, rnd);
+ ggml_set_zero(layer.w1_b);
randomize_tensor_normal(layer.w2_a, rnd);
- randomize_tensor_normal(layer.w2_b, rnd);
+ ggml_set_zero(layer.w2_b);
randomize_tensor_normal(layer.w3_a, rnd);
- randomize_tensor_normal(layer.w3_b, rnd);
+ ggml_set_zero(layer.w3_b);
}
free_random_normal_distribution(rnd);