Diffstat (limited to 'examples/baby-llama/baby-llama.cpp')
-rw-r--r--  examples/baby-llama/baby-llama.cpp  176
1 file changed, 41 insertions(+), 135 deletions(-)
diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp
index b02a8086..fb1a15c4 100644
--- a/examples/baby-llama/baby-llama.cpp
+++ b/examples/baby-llama/baby-llama.cpp
@@ -1,4 +1,5 @@
#include "ggml.h"
+#include "train.h"
#include <vector>
#include <cassert>
#include <random>
@@ -14,31 +15,6 @@ constexpr float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
 #else
 constexpr float rms_norm_eps = 5e-6f;
 #endif
 
-static float frand() {
-    return (float)rand()/(float)RAND_MAX;
-}
-
-struct random_normal_distribution {
-    std::mt19937 gen;
-    std::normal_distribution<float> nd;
-    float min;
-    float max;
-};
-
-static void init_random_normal_distribution(
-    struct random_normal_distribution * rnd, int seed, float mean, float std, float min, float max
-) {
-    rnd->gen = std::mt19937(seed);
-    rnd->nd = std::normal_distribution<float>{mean, std};
-    rnd->min = min;
-    rnd->max = max;
-}
-
-static float frand_normal(struct random_normal_distribution * rnd) {
-    const float r = rnd->nd(rnd->gen);
-    return ((r < rnd->min) ? (rnd->min) : (r > rnd->max) ? (rnd->max) : r);
-}
-
 static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
     struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
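
The RNG helpers deleted above now live in the shared training code pulled in by the new "train.h" include. The header itself is not part of this diff, so the declarations below are inferred from the call sites in randomize_model() and randomize_model_lora() further down; treat them as a sketch of the presumed API, not the verbatim header:

// Presumed train.h interface (inferred from this patch, not shown in it).
// The distribution becomes an opaque handle, so the std::mt19937 state no
// longer has to be visible inside baby-llama.cpp.
struct random_normal_distribution;

struct random_normal_distribution * init_random_normal_distribution(
    int seed, float mean, float std, float min, float max);

// Fills the tensor in place; the shape now comes from the tensor itself.
// Return type carried over from the old helper; the call sites ignore it.
struct ggml_tensor * randomize_tensor_normal(
    struct ggml_tensor * tensor, struct random_normal_distribution * rnd);

void free_random_normal_distribution(struct random_normal_distribution * rnd);
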
@@ -93,54 +69,6 @@ static struct ggml_tensor * randomize_tensor(
     return tensor;
 }
 
-static struct ggml_tensor * randomize_tensor_normal(
-    struct ggml_tensor * tensor, int ndims, const int64_t ne[], struct random_normal_distribution * rnd
-) {
-    float scale = 1.0; // xavier
-    switch (ndims) {
-        case 1:
-            scale /= sqrtf(ne[0]);
-            for (int i0 = 0; i0 < ne[0]; i0++) {
-                ((float *)tensor->data)[i0] = scale * frand_normal(rnd);
-            }
-            break;
-        case 2:
-            scale /= sqrtf(ne[0]+ne[1]);
-            for (int i1 = 0; i1 < ne[1]; i1++) {
-                for (int i0 = 0; i0 < ne[0]; i0++) {
-                    ((float *)tensor->data)[i1*ne[0] + i0] = scale * frand_normal(rnd);
-                }
-            }
-            break;
-        case 3:
-            scale /= sqrtf(ne[0]+ne[1]);
-            for (int i2 = 0; i2 < ne[2]; i2++) {
-                for (int i1 = 0; i1 < ne[1]; i1++) {
-                    for (int i0 = 0; i0 < ne[0]; i0++) {
-                        ((float *)tensor->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = scale * frand_normal(rnd);
-                    }
-                }
-            }
-            break;
-        case 4:
-            scale /= sqrtf(ne[0]+ne[1]);
-            for (int i3 = 0; i3 < ne[3]; i3++) {
-                for (int i2 = 0; i2 < ne[2]; i2++) {
-                    for (int i1 = 0; i1 < ne[1]; i1++) {
-                        for (int i0 = 0; i0 < ne[0]; i0++) {
-                            ((float *)tensor->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = scale * frand_normal(rnd);
-                        }
-                    }
-                }
-            }
-            break;
-        default:
-            assert(false);
-    };
-
-    return tensor;
-}
-
 struct llama_hparams {
     uint32_t n_vocab = 32000;
     uint32_t n_ctx = 512; // this is provided as user input?
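
The deleted randomize_tensor_normal() implemented Xavier-style initialization: the `// xavier` comment marks the scale factor, 1/sqrtf(ne[0]) for vectors and 1/sqrtf(ne[0]+ne[1]) (fan-in plus fan-out) for rank 2 and above, with each sample drawn from the configured normal distribution and clamped to [min, max] by frand_normal(). A standalone sketch of the same scheme for a plain matrix, independent of ggml (assumption: N(0,1) samples, where the original used the caller's mean/std):

#include <cmath>
#include <cstdint>
#include <random>
#include <vector>

// Xavier/Glorot-style init sketch: scale N(0,1) samples by
// 1/sqrt(fan_in + fan_out), clamping like the removed frand_normal().
static std::vector<float> xavier_normal_matrix(int64_t fan_in, int64_t fan_out,
                                               int seed, float lo = -1.0f, float hi = 1.0f) {
    std::mt19937 gen(seed);
    std::normal_distribution<float> nd(0.0f, 1.0f);
    const float scale = 1.0f / sqrtf((float)(fan_in + fan_out));
    std::vector<float> w((size_t)(fan_in * fan_out));
    for (float & x : w) {
        float r = nd(gen);
        r = (r < lo) ? lo : (r > hi) ? hi : r;   // clamp, as frand_normal() did
        x = scale * r;
    }
    return w;
}
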
@@ -398,27 +326,29 @@ static void randomize_model(struct llama_model * model, int seed, float mean, fl
     const uint32_t n_layer = hparams.n_layer;
 
-    struct random_normal_distribution rnd;
-    init_random_normal_distribution(&rnd, seed, mean, std, min, max);
-    randomize_tensor_normal(model->tok_embeddings, model->tok_embeddings->n_dims, model->tok_embeddings->ne, &rnd);
-    randomize_tensor_normal(model->norm, model->norm->n_dims, model->norm->ne, &rnd);
-    randomize_tensor_normal(model->output, model->output->n_dims, model->output->ne, &rnd);
+    struct random_normal_distribution * rnd = init_random_normal_distribution(seed, mean, std, min, max);
+
+    randomize_tensor_normal(model->tok_embeddings , rnd);
+    randomize_tensor_normal(model->norm           , rnd);
+    randomize_tensor_normal(model->output         , rnd);
 
     for (uint32_t i = 0; i < n_layer; ++i) {
         auto & layer = model->layers[i];
-        randomize_tensor_normal(layer.attention_norm, layer.attention_norm->n_dims, layer.attention_norm->ne, &rnd);
+        randomize_tensor_normal(layer.attention_norm, rnd);
 
-        randomize_tensor_normal(layer.wq, layer.wq->n_dims, layer.wq->ne, &rnd);
-        randomize_tensor_normal(layer.wk, layer.wk->n_dims, layer.wk->ne, &rnd);
-        randomize_tensor_normal(layer.wv, layer.wv->n_dims, layer.wv->ne, &rnd);
-        randomize_tensor_normal(layer.wo, layer.wo->n_dims, layer.wo->ne, &rnd);
+        randomize_tensor_normal(layer.wq, rnd);
+        randomize_tensor_normal(layer.wk, rnd);
+        randomize_tensor_normal(layer.wv, rnd);
+        randomize_tensor_normal(layer.wo, rnd);
 
-        randomize_tensor_normal(layer.ffn_norm, layer.ffn_norm->n_dims, layer.ffn_norm->ne, &rnd);
+        randomize_tensor_normal(layer.ffn_norm, rnd);
 
-        randomize_tensor_normal(layer.w1, layer.w1->n_dims, layer.w1->ne, &rnd);
-        randomize_tensor_normal(layer.w2, layer.w2->n_dims, layer.w2->ne, &rnd);
-        randomize_tensor_normal(layer.w3, layer.w3->n_dims, layer.w3->ne, &rnd);
+        randomize_tensor_normal(layer.w1, rnd);
+        randomize_tensor_normal(layer.w2, rnd);
+        randomize_tensor_normal(layer.w3, rnd);
     }
+
+    free_random_normal_distribution(rnd);
 }
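
What makes the two-argument form possible: a ggml_tensor already records its rank in n_dims and its extents in ne[], so the old n_dims/ne parameters duplicated information the tensor carries anyway. A minimal illustration (assuming ggml.h for struct ggml_tensor; ggml also ships ggml_nelements() for exactly this):

// Recover the shape data the old signature passed explicitly.
static int64_t tensor_nelements(const struct ggml_tensor * t) {
    int64_t n = 1;
    for (int i = 0; i < t->n_dims; ++i) {   // rank, formerly a parameter
        n *= t->ne[i];                      // per-dimension extent, formerly a parameter
    }
    return n;
}
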
@@ -429,32 +359,34 @@ static void randomize_model_lora(
     const uint32_t n_layer = hparams.n_layer;
 
-    struct random_normal_distribution rnd;
-    init_random_normal_distribution(&rnd, seed, mean, std, min, max);
-    randomize_tensor_normal(model->tok_embeddings, model->tok_embeddings->n_dims, model->tok_embeddings->ne, &rnd);
-    randomize_tensor_normal(model->norm, model->norm->n_dims, model->norm->ne, &rnd);
-    randomize_tensor_normal(model->outputa, model->outputa->n_dims, model->outputa->ne, &rnd);
-    randomize_tensor_normal(model->outputb, model->outputb->n_dims, model->outputb->ne, &rnd);
+    struct random_normal_distribution * rnd = init_random_normal_distribution(seed, mean, std, min, max);
+
+    randomize_tensor_normal(model->tok_embeddings, rnd);
+    randomize_tensor_normal(model->norm          , rnd);
+    randomize_tensor_normal(model->outputa       , rnd);
+    randomize_tensor_normal(model->outputb       , rnd);
 
     for (uint32_t i = 0; i < n_layer; ++i) {
         auto & layer = model->layers[i];
-        randomize_tensor_normal(layer.attention_norm, layer.attention_norm->n_dims, layer.attention_norm->ne, &rnd);
-
-        randomize_tensor_normal(layer.wqa, layer.wqa->n_dims, layer.wqa->ne, &rnd);
-        randomize_tensor_normal(layer.wqb, layer.wqb->n_dims, layer.wqb->ne, &rnd);
-        randomize_tensor_normal(layer.wka, layer.wka->n_dims, layer.wka->ne, &rnd);
-        randomize_tensor_normal(layer.wkb, layer.wkb->n_dims, layer.wkb->ne, &rnd);
-        randomize_tensor_normal(layer.wva, layer.wva->n_dims, layer.wva->ne, &rnd);
-        randomize_tensor_normal(layer.wvb, layer.wvb->n_dims, layer.wvb->ne, &rnd);
-        randomize_tensor_normal(layer.woa, layer.woa->n_dims, layer.woa->ne, &rnd);
-        randomize_tensor_normal(layer.wob, layer.wob->n_dims, layer.wob->ne, &rnd);
-
-        randomize_tensor_normal(layer.ffn_norm, layer.ffn_norm->n_dims, layer.ffn_norm->ne, &rnd);
-
-        randomize_tensor_normal(layer.w1, layer.w1->n_dims, layer.w1->ne, &rnd);
-        randomize_tensor_normal(layer.w2, layer.w2->n_dims, layer.w2->ne, &rnd);
-        randomize_tensor_normal(layer.w3, layer.w3->n_dims, layer.w3->ne, &rnd);
+        randomize_tensor_normal(layer.attention_norm, rnd);
+
+        randomize_tensor_normal(layer.wqa, rnd);
+        randomize_tensor_normal(layer.wqb, rnd);
+        randomize_tensor_normal(layer.wka, rnd);
+        randomize_tensor_normal(layer.wkb, rnd);
+        randomize_tensor_normal(layer.wva, rnd);
+        randomize_tensor_normal(layer.wvb, rnd);
+        randomize_tensor_normal(layer.woa, rnd);
+        randomize_tensor_normal(layer.wob, rnd);
+
+        randomize_tensor_normal(layer.ffn_norm, rnd);
+
+        randomize_tensor_normal(layer.w1, rnd);
+        randomize_tensor_normal(layer.w2, rnd);
+        randomize_tensor_normal(layer.w3, rnd);
     }
+
+    free_random_normal_distribution(rnd);
 }
 static bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int n_batch) {
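
Both randomize_model() and randomize_model_lora() now follow the same init/use/free lifecycle, since the handle is heap-allocated behind the opaque pointer. A condensed usage sketch; `weight` is a placeholder for any model tensor and the numeric arguments are illustrative:

// Hedged usage sketch of the handle-based API.
static void randomize_one_tensor(struct ggml_tensor * weight, int seed) {
    struct random_normal_distribution * rnd =
        init_random_normal_distribution(seed, /*mean=*/0.0f, /*std=*/1.0f,
                                        /*min=*/-1.0f, /*max=*/+1.0f);
    randomize_tensor_normal(weight, rnd);  // fills `weight` in place
    free_random_normal_distribution(rnd);  // must be freed now, unlike the old stack struct
}
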
@@ -762,32 +694,6 @@ static struct ggml_tensor * forward(
     return inpL;
 }
 
-static void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) {
-    GGML_ASSERT(tensor->n_dims == 1);
-    GGML_ASSERT(tensor->ne[0] == ne0);
-}
-
-static void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) {
-    GGML_ASSERT(tensor->n_dims == 2);
-    GGML_ASSERT(tensor->ne[0] == ne0);
-    GGML_ASSERT(tensor->ne[1] == ne1);
-}
-
-static void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) {
-    GGML_ASSERT(tensor->n_dims == 3);
-    GGML_ASSERT(tensor->ne[0] == ne0);
-    GGML_ASSERT(tensor->ne[1] == ne1);
-    GGML_ASSERT(tensor->ne[2] == ne2);
-}
-
-static void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
-    GGML_ASSERT(tensor->n_dims == 4);
-    GGML_ASSERT(tensor->ne[0] == ne0);
-    GGML_ASSERT(tensor->ne[1] == ne1);
-    GGML_ASSERT(tensor->ne[2] == ne2);
-    GGML_ASSERT(tensor->ne[3] == ne3);
-}
-
 static struct ggml_tensor * forward_batch(
     struct llama_model * model,
     struct llama_kv_cache * cache,
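
The four assert_shape_* helpers removed in this last hunk presumably moved to the shared training code as well; forward_batch() keeps calling them unchanged to validate intermediate tensor shapes. The same checks also collapse naturally into one generic helper; a sketch, assuming ggml.h for struct ggml_tensor and GGML_ASSERT (this is not the train.h code):

#include <cstdint>
#include <initializer_list>

// Generic shape assertion: check the rank first, then every extent.
static void assert_shape(const struct ggml_tensor * t, std::initializer_list<int64_t> shape) {
    GGML_ASSERT(t->n_dims == (int) shape.size());
    int i = 0;
    for (int64_t ne : shape) {
        GGML_ASSERT(t->ne[i++] == ne);
    }
}

// Example call (tensor and size names hypothetical):
//     assert_shape(Kcur, {n_embd/n_head, n_head, N});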