Diffstat (limited to 'examples/train-text-from-scratch')
3 files changed, 532 insertions, 1497 deletions
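The diff below replaces the example's hand-rolled training utilities (random tensor init, gradient checkpointing, optimizer (de)serialization, shared CLI options) with the common helpers from the new `train.h`, and introduces `ITERATION`/`LATEST` placeholders in output filenames. As a minimal sketch of what that filename substitution amounts to (the helper name below is hypothetical; the real implementation lives in the new `train.h`, which is not shown in this diff):

```cpp
#include <string>

// Hypothetical sketch, not the train.h API: substitute every occurrence of
// `pattern` (e.g. "ITERATION") in an output filename with `repl`
// (e.g. an iteration number such as "42", or "LATEST").
static std::string replace_pattern(std::string fn, const std::string & pattern, const std::string & repl) {
    for (size_t pos = fn.find(pattern); pos != std::string::npos;
         pos = fn.find(pattern, pos + repl.size())) {
        fn.replace(pos, pattern.size(), repl);
    }
    return fn;
}

// replace_pattern("chk-shakespeare-256x16-ITERATION.gguf", "ITERATION", "42")
//   -> "chk-shakespeare-256x16-42.gguf"      (written every --save-every N iterations)
// replace_pattern("chk-shakespeare-256x16-ITERATION.gguf", "ITERATION", "LATEST")
//   -> "chk-shakespeare-256x16-LATEST.gguf"  (always the most recent state)
```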
diff --git a/examples/train-text-from-scratch/README.md b/examples/train-text-from-scratch/README.md
index f4ffcd98..1b345406 100644
--- a/examples/train-text-from-scratch/README.md
+++ b/examples/train-text-from-scratch/README.md
@@ -10,9 +10,9 @@ wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/s
 ./bin/train-text-from-scratch \
         --vocab-model ../models/ggml-vocab-llama.gguf \
         --ctx 64 --embd 256 --head 8 --layer 16 \
-        --checkpoint-in chk-shakespeare-256x16.gguf \
-        --checkpoint-out chk-shakespeare-256x16.gguf \
-        --model-out ggml-shakespeare-256x16-f32.gguf \
+        --checkpoint-in chk-shakespeare-256x16-LATEST.gguf \
+        --checkpoint-out chk-shakespeare-256x16-ITERATION.gguf \
+        --model-out ggml-shakespeare-256x16-f32-ITERATION.gguf \
         --train-data "shakespeare.txt" \
         -t 6 -b 16 --seed 1 --adam-iter 256 \
         --no-checkpointing
@@ -20,3 +20,8 @@ wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/s
 # predict
 ./bin/main -m ggml-shakespeare-256x16-f32.gguf
 ```
+
+Output files are saved every N iterations (configurable with `--save-every N`).
+The pattern "ITERATION" in the output filenames will be replaced with the iteration number, or with "LATEST" for the latest output.
+
+To train GGUF models, just pass them to `--checkpoint-in FN`.
diff --git a/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py b/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py
index a527d615..351e7bc2 100644
--- a/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py
+++ b/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py
@@ -47,10 +47,13 @@ LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS = "optimizer.lbfgs.memory_ys"
 LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S  = "optimizer.lbfgs.memory_s"
 LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y  = "optimizer.lbfgs.memory_y"
 
-LLM_KV_TRAINING_FILE_VERSION    = "training.file_version"
-LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count"
-LLM_KV_TRAINING_SAMPLE_COUNT    = "training.sample_count"
-LLM_KV_TRAINING_TOKEN_COUNT     = "training.token_count"
+LLM_KV_TRAINING_TYPE_TRAIN_MODEL   = "train_model"
+LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora"
+LLM_KV_TRAINING_TYPE               = "training.type"
+LLM_KV_TRAINING_FILE_VERSION       = "training.file_version"
+LLM_KV_TRAINING_ITERATION_COUNT    = "training.iteration_count"
+LLM_KV_TRAINING_SAMPLE_COUNT       = "training.sample_count"
+LLM_KV_TRAINING_TOKEN_COUNT        = "training.token_count"
 
 class Tensor:
     def __init__(self, dtype='f', ne=None):
@@ -460,6 +463,7 @@ class Checkpoint:
         gguf_writer.add_file_type(gguf.GGMLQuantizationType.F32)
         gguf_writer.add_layer_norm_rms_eps(1e-5)
         gguf_writer.add_uint32(LLM_KV_TRAINING_FILE_VERSION, 0)
+        gguf_writer.add_string(LLM_KV_TRAINING_TYPE, LLM_KV_TRAINING_TYPE_TRAIN_MODEL)
         gguf_writer.add_uint32(LLM_KV_TRAINING_ITERATION_COUNT, self.train_its)
         gguf_writer.add_uint32(LLM_KV_TRAINING_SAMPLE_COUNT, self.train_samples)
         gguf_writer.add_uint32(LLM_KV_TRAINING_TOKEN_COUNT, self.train_tokens)
diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp
index 5f541a14..d5205aff 100644
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -1,6 +1,7 @@
 #include "ggml.h"
 #include "ggml-alloc.h"
 #include "common.h"
+#include "train.h"
 #include "llama.h"
 #include <unordered_map>
 #include <vector>
@@ -18,142 +19,7 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
-struct
random_normal_distribution { - std::mt19937 gen; - std::normal_distribution<float> rd; - float min; - float max; -}; - -struct random_uniform_distribution { - std::mt19937 gen; - std::uniform_real_distribution<float> rd; -}; - -void init_random_normal_distribution(struct random_normal_distribution * rnd, int seed, float mean, float std, float min, float max) { - rnd->gen = std::mt19937(seed); - rnd->rd = std::normal_distribution<float>{mean, std}; - rnd->min = min; - rnd->max = max; -} - -void init_random_uniform_distribution(struct random_uniform_distribution * rnd, int seed, float min, float max) { - rnd->gen = std::mt19937(seed); - rnd->rd = std::uniform_real_distribution<float>{min, max}; -} - -int clamp(const int v, const int min, const int max) { - return ((v < min) ? (min) : (v > max) ? (max) : v); -} - -float fclamp(const float v, const float min, const float max) { - return ((v < min) ? (min) : (v > max) ? (max) : v); -} - -float frand() { - return (float)rand()/(float)RAND_MAX; -} - -float frand_normal(struct random_normal_distribution * rnd) { - return fclamp(rnd->rd(rnd->gen), rnd->min, rnd->max); -} - -float frand_uniform(struct random_uniform_distribution * rnd) { - return rnd->rd(rnd->gen); -} - -struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct random_normal_distribution * rnd) { - float scale = 1.0f; // xavier - switch (tensor->n_dims) { - case 1: - scale /= sqrtf(tensor->ne[0]); - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0]); - *dst = scale * frand_normal(rnd); - } - break; - case 2: - scale /= sqrtf(tensor->ne[0]+tensor->ne[1]); - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); - *dst = scale * frand_normal(rnd); - } - } - break; - case 3: - scale /= sqrtf(tensor->ne[0]+tensor->ne[1]); - for (int i2 = 0; i2 < tensor->ne[2]; i2++) { - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]); - *dst = scale * frand_normal(rnd); - } - } - } - break; - case 4: - scale /= sqrtf(tensor->ne[0]+tensor->ne[1]); - for (int i3 = 0; i3 < tensor->ne[3]; i3++) { - for (int i2 = 0; i2 < tensor->ne[2]; i2++) { - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]); - *dst = scale * frand_normal(rnd); - } - } - } - } - break; - default: - assert(false); - }; - return tensor; -} - -struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd) { - switch (tensor->n_dims) { - case 1: - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0]); - *dst = frand_uniform(rnd); - } - break; - case 2: - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); - *dst = frand_uniform(rnd); - } - } - break; - case 3: - for (int i2 = 0; i2 < tensor->ne[2]; i2++) { - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] 
+ i2*tensor->nb[2]); - *dst = frand_uniform(rnd); - } - } - } - break; - case 4: - for (int i3 = 0; i3 < tensor->ne[3]; i3++) { - for (int i2 = 0; i2 < tensor->ne[2]; i2++) { - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]); - *dst = frand_uniform(rnd); - } - } - } - } - break; - default: - assert(false); - }; - return tensor; -} +static const size_t tensor_alignment = 32; struct my_llama_hparams { uint32_t n_vocab = 32000; @@ -164,8 +30,8 @@ struct my_llama_hparams { uint32_t n_rot = 64; uint32_t n_ff = 11008; - // float f_norm_eps = 1e-5; // falcon - float f_norm_rms_eps = 1e-5; // llama + // float f_norm_eps = 1e-5f; // falcon + float f_norm_rms_eps = 1e-5f; // llama float rope_freq_base = 10000.0f; float rope_freq_scale = 1.0f; @@ -192,6 +58,7 @@ struct my_llama_layer { struct my_llama_model { struct ggml_context * ctx = NULL; + std::vector<uint8_t> data; my_llama_hparams hparams; @@ -201,92 +68,50 @@ struct my_llama_model { struct ggml_tensor * output; std::vector<my_llama_layer> layers; - - uint32_t train_its = 0; - uint32_t train_samples = 0; - uint32_t train_tokens = 0; }; -// gguf constants -const char * LLM_KV_OPTIMIZER_TYPE = "optimizer.type"; -const char * LLM_KV_OPTIMIZER_TYPE_ADAM = "adam"; -const char * LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs"; -const char * LLM_KV_OPTIMIZER_FILE_VERSION = "optimizer.file_version"; -const char * LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT = "optimizer.convergence_past_count"; -const char * LLM_KV_OPTIMIZER_PARAMETER_COUNT = "optimizer.parameter_count"; -const char * LLM_KV_OPTIMIZER_ITERATION_COUNT = "optimizer.iteration_count"; -const char * LLM_KV_OPTIMIZER_JUST_INITIALIZED = "optimizer.just_initialized"; -const char * LLM_KV_OPTIMIZER_ADAM_BEST_LOSS = "optimizer.adam.best_loss"; -const char * LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS = "optimizer.adam.previous_loss"; -const char * LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT = "optimizer.adam.no_improvement_count"; -const char * LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count"; -const char * LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS = "optimizer.lbfgs.best_loss"; -const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP = "optimizer.lbfgs.line_search_step"; -const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J = "optimizer.lbfgs.line_search_j"; -const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K = "optimizer.lbfgs.line_search_k"; -const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END = "optimizer.lbfgs.line_search_end"; -const char * LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count"; - -const char * LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS = "optimizer.adam.first_moments"; -const char * LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS = "optimizer.adam.second_moments"; -const char * LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values"; - -const char * LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS = "optimizer.lbfgs.current_parameters"; -const char * LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = "optimizer.lbfgs.previous_parameters"; -const char * LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS = "optimizer.lbfgs.current_gradients"; -const char * LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS = "optimizer.lbfgs.previous_gradients"; -const char * LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION = "optimizer.lbfgs.search_direction"; -const char * 
LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES = "optimizer.lbfgs.past_loss_values"; -const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA = "optimizer.lbfgs.memory_alpha"; -const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS = "optimizer.lbfgs.memory_ys"; -const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s"; -const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y"; - -const char * LLM_KV_TRAINING_FILE_VERSION = "training.file_version"; -const char * LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count"; -const char * LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count"; -const char * LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count"; - // gguf constants (sync with gguf.py) - -const char * LLM_KV_GENERAL_ARCHITECTURE = "general.architecture"; -const char * LLM_KV_GENERAL_FILE_TYPE = "general.file_type"; - -const char * LLM_KV_CONTEXT_LENGTH = "%s.context_length"; -const char * LLM_KV_EMBEDDING_LENGTH = "%s.embedding_length"; -const char * LLM_KV_BLOCK_COUNT = "%s.block_count"; -const char * LLM_KV_FEED_FORWARD_LENGTH = "%s.feed_forward_length"; -const char * LLM_KV_ATTENTION_HEAD_COUNT = "%s.attention.head_count"; -const char * LLM_KV_ATTENTION_LAYERNORM_RMS_EPS = "%s.attention.layer_norm_rms_epsilon"; -const char * LLM_KV_ROPE_DIMENSION_COUNT = "%s.rope.dimension_count"; -const char * LLM_KV_ROPE_FREQ_BASE = "%s.rope.freq_base"; // TODO load in llama.cpp -const char * LLM_KV_ROPE_SCALE_LINEAR = "%s.rope.scale_linear"; - -const char * LLM_KV_TOKENIZER_MODEL = "tokenizer.ggml.model"; -const char * LLM_KV_TOKENIZER_LIST = "tokenizer.ggml.tokens"; -const char * LLM_KV_TOKENIZER_TOKEN_TYPE = "tokenizer.ggml.token_type"; -const char * LLM_KV_TOKENIZER_SCORES = "tokenizer.ggml.scores"; -const char * LLM_KV_TOKENIZER_MERGES = "tokenizer.ggml.merges"; -const char * LLM_KV_TOKENIZER_BOS_ID = "tokenizer.ggml.bos_token_id"; -const char * LLM_KV_TOKENIZER_EOS_ID = "tokenizer.ggml.eos_token_id"; -const char * LLM_KV_TOKENIZER_UNK_ID = "tokenizer.ggml.unknown_token_id"; -const char * LLM_KV_TOKENIZER_SEP_ID = "tokenizer.ggml.seperator_token_id"; -const char * LLM_KV_TOKENIZER_PAD_ID = "tokenizer.ggml.padding_token_id"; - -const char * LLM_TENSOR_TOKEN_EMBD = "token_embd"; -const char * LLM_TENSOR_OUTPUT_NORM = "output_norm"; -const char * LLM_TENSOR_OUTPUT = "output"; -const char * LLM_TENSOR_ATTN_NORM = "blk.%d.attn_norm"; -const char * LLM_TENSOR_ATTN_Q = "blk.%d.attn_q"; -const char * LLM_TENSOR_ATTN_K = "blk.%d.attn_k"; -const char * LLM_TENSOR_ATTN_V = "blk.%d.attn_v"; -const char * LLM_TENSOR_ATTN_OUT = "blk.%d.attn_output"; -const char * LLM_TENSOR_FFN_NORM = "blk.%d.ffn_norm"; -const char * LLM_TENSOR_FFN_GATE = "blk.%d.ffn_gate"; -const char * LLM_TENSOR_FFN_DOWN = "blk.%d.ffn_down"; -const char * LLM_TENSOR_FFN_UP = "blk.%d.ffn_up"; - -void print_params(struct my_llama_hparams * params) { +static const char * LLM_KV_TRAINING_TYPE_TRAIN_MODEL = "train_model"; +static const char * LLM_KV_TRAINING_TYPE = "training.type"; + +static const char * LLM_KV_GENERAL_ARCHITECTURE = "general.architecture"; +static const char * LLM_KV_GENERAL_FILE_TYPE = "general.file_type"; + +static const char * LLM_KV_CONTEXT_LENGTH = "%s.context_length"; +static const char * LLM_KV_EMBEDDING_LENGTH = "%s.embedding_length"; +static const char * LLM_KV_BLOCK_COUNT = "%s.block_count"; +static const char * LLM_KV_FEED_FORWARD_LENGTH = "%s.feed_forward_length"; +static const char * LLM_KV_ATTENTION_HEAD_COUNT = "%s.attention.head_count"; +static const char * 
LLM_KV_ATTENTION_LAYERNORM_RMS_EPS = "%s.attention.layer_norm_rms_epsilon"; +static const char * LLM_KV_ROPE_DIMENSION_COUNT = "%s.rope.dimension_count"; +static const char * LLM_KV_ROPE_FREQ_BASE = "%s.rope.freq_base"; // TODO load in llama.cpp +static const char * LLM_KV_ROPE_SCALE_LINEAR = "%s.rope.scale_linear"; + +static const char * LLM_KV_TOKENIZER_MODEL = "tokenizer.ggml.model"; +static const char * LLM_KV_TOKENIZER_LIST = "tokenizer.ggml.tokens"; +static const char * LLM_KV_TOKENIZER_TOKEN_TYPE = "tokenizer.ggml.token_type"; +static const char * LLM_KV_TOKENIZER_SCORES = "tokenizer.ggml.scores"; +static const char * LLM_KV_TOKENIZER_MERGES = "tokenizer.ggml.merges"; +static const char * LLM_KV_TOKENIZER_BOS_ID = "tokenizer.ggml.bos_token_id"; +static const char * LLM_KV_TOKENIZER_EOS_ID = "tokenizer.ggml.eos_token_id"; +static const char * LLM_KV_TOKENIZER_UNK_ID = "tokenizer.ggml.unknown_token_id"; +static const char * LLM_KV_TOKENIZER_SEP_ID = "tokenizer.ggml.seperator_token_id"; +static const char * LLM_KV_TOKENIZER_PAD_ID = "tokenizer.ggml.padding_token_id"; + +static const char * LLM_TENSOR_TOKEN_EMBD = "token_embd"; +static const char * LLM_TENSOR_OUTPUT_NORM = "output_norm"; +static const char * LLM_TENSOR_OUTPUT = "output"; +static const char * LLM_TENSOR_ATTN_NORM = "blk.%d.attn_norm"; +static const char * LLM_TENSOR_ATTN_Q = "blk.%d.attn_q"; +static const char * LLM_TENSOR_ATTN_K = "blk.%d.attn_k"; +static const char * LLM_TENSOR_ATTN_V = "blk.%d.attn_v"; +static const char * LLM_TENSOR_ATTN_OUT = "blk.%d.attn_output"; +static const char * LLM_TENSOR_FFN_NORM = "blk.%d.ffn_norm"; +static const char * LLM_TENSOR_FFN_GATE = "blk.%d.ffn_gate"; +static const char * LLM_TENSOR_FFN_DOWN = "blk.%d.ffn_down"; +static const char * LLM_TENSOR_FFN_UP = "blk.%d.ffn_up"; + +static void print_params(struct my_llama_hparams * params) { printf("%s: n_vocab: %d\n", __func__, params->n_vocab); printf("%s: n_ctx: %d\n", __func__, params->n_ctx); printf("%s: n_embd: %d\n", __func__, params->n_embd); @@ -296,7 +121,66 @@ void print_params(struct my_llama_hparams * params) { printf("%s: n_rot: %d\n", __func__, params->n_rot); } -void init_model(struct my_llama_model * model) { +static void set_param_model(struct my_llama_model * model) { + const auto& hparams = model->hparams; + + const uint32_t n_layer = hparams.n_layer; + + struct ggml_context* ctx = model->ctx; + + ggml_set_param(ctx, model->tok_embeddings); + ggml_set_param(ctx, model->norm); + ggml_set_param(ctx, model->output); + + for (uint32_t i = 0; i < n_layer; ++i) { + auto & layer = model->layers[i]; + + ggml_set_param(ctx, layer.attention_norm); + ggml_set_param(ctx, layer.wq); + ggml_set_param(ctx, layer.wk); + ggml_set_param(ctx, layer.wv); + ggml_set_param(ctx, layer.wo); + ggml_set_param(ctx, layer.ffn_norm); + ggml_set_param(ctx, layer.w1); + ggml_set_param(ctx, layer.w2); + ggml_set_param(ctx, layer.w3); + } +} + +static void alloc_model(struct ggml_allocr * alloc, struct my_llama_model * model) { + ggml_allocr_alloc(alloc, model->tok_embeddings); + ggml_allocr_alloc(alloc, model->norm); + ggml_allocr_alloc(alloc, model->output); + for (uint32_t i = 0; i < model->layers.size(); ++i) { + auto & layer = model->layers[i]; + ggml_allocr_alloc(alloc, layer.attention_norm); + ggml_allocr_alloc(alloc, layer.wq); + ggml_allocr_alloc(alloc, layer.wk); + ggml_allocr_alloc(alloc, layer.wv); + ggml_allocr_alloc(alloc, layer.wo); + ggml_allocr_alloc(alloc, layer.ffn_norm); + ggml_allocr_alloc(alloc, layer.w1); + ggml_allocr_alloc(alloc, 
layer.w2); + ggml_allocr_alloc(alloc, layer.w3); + } + ggml_allocr_alloc(alloc, model->tok_embeddings->grad); + ggml_allocr_alloc(alloc, model->norm->grad); + ggml_allocr_alloc(alloc, model->output->grad); + for (uint32_t i = 0; i < model->layers.size(); ++i) { + auto & layer = model->layers[i]; + ggml_allocr_alloc(alloc, layer.attention_norm->grad); + ggml_allocr_alloc(alloc, layer.wq->grad); + ggml_allocr_alloc(alloc, layer.wk->grad); + ggml_allocr_alloc(alloc, layer.wv->grad); + ggml_allocr_alloc(alloc, layer.wo->grad); + ggml_allocr_alloc(alloc, layer.ffn_norm->grad); + ggml_allocr_alloc(alloc, layer.w1->grad); + ggml_allocr_alloc(alloc, layer.w2->grad); + ggml_allocr_alloc(alloc, layer.w3->grad); + } +} + +static void init_model(struct my_llama_model * model) { const auto & hparams = model->hparams; const uint32_t n_embd = hparams.n_embd; @@ -304,11 +188,6 @@ void init_model(struct my_llama_model * model) { const uint32_t n_vocab = hparams.n_vocab; const uint32_t n_ff = hparams.n_ff; - struct ggml_context * ctx = model->ctx; - - model->train_its = 0; - model->train_samples = 0; - model->train_tokens = 0; std::vector<char> tn_buf; tn_buf.resize(GGML_MAX_NAME); @@ -323,6 +202,15 @@ void init_model(struct my_llama_model * model) { return tn_buf.data(); }; + // context for model tensors without their data + struct ggml_init_params ctx_model_params; + ctx_model_params.mem_size = ggml_tensor_overhead()*2*(6 + n_layer*18); + ctx_model_params.mem_buffer = NULL; + ctx_model_params.no_alloc = true; + + struct ggml_context * ctx = ggml_init(ctx_model_params); + model->ctx = ctx; + model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); @@ -361,288 +249,53 @@ void init_model(struct my_llama_model * model) { ggml_set_name(layer.w2, tni(LLM_TENSOR_FFN_DOWN, i)); ggml_set_name(layer.w3, tni(LLM_TENSOR_FFN_UP, i)); } -} -void set_param_model(struct my_llama_model * model) { - const auto& hparams = model->hparams; + set_param_model(model); - const uint32_t n_layer = hparams.n_layer; - - struct ggml_context* ctx = model->ctx; + // measure data size + struct ggml_allocr * alloc = NULL; + alloc = ggml_allocr_new_measure(tensor_alignment); + alloc_model(alloc, model); - ggml_set_param(ctx, model->tok_embeddings); - ggml_set_param(ctx, model->norm); - ggml_set_param(ctx, model->output); - - for (uint32_t i = 0; i < n_layer; ++i) { - auto & layer = model->layers[i]; - - ggml_set_param(ctx, layer.attention_norm); - ggml_set_param(ctx, layer.wq); - ggml_set_param(ctx, layer.wk); - ggml_set_param(ctx, layer.wv); - ggml_set_param(ctx, layer.wo); - ggml_set_param(ctx, layer.ffn_norm); - ggml_set_param(ctx, layer.w1); - ggml_set_param(ctx, layer.w2); - ggml_set_param(ctx, layer.w3); - } + // allocate data + model->data.resize(ggml_allocr_max_size(alloc) + tensor_alignment); + ggml_allocr_free(alloc); + alloc = ggml_allocr_new(model->data.data(), model->data.size(), tensor_alignment); + alloc_model(alloc, model); + ggml_allocr_free(alloc); } -void randomize_model(struct my_llama_model * model, int seed, float mean, float std, float min, float max) { +static void randomize_model(struct my_llama_model * model, int seed, float mean, float std, float min, float max) { const auto & hparams = model->hparams; const uint32_t n_layer = hparams.n_layer; - struct random_normal_distribution rnd; - init_random_normal_distribution(&rnd, seed, mean, std, min, max); + 
struct random_normal_distribution * rnd = init_random_normal_distribution(seed, mean, std, min, max); - randomize_tensor_normal(model->tok_embeddings, &rnd); - randomize_tensor_normal(model->norm, &rnd); - randomize_tensor_normal(model->output, &rnd); + randomize_tensor_normal(model->tok_embeddings, rnd); + randomize_tensor_normal(model->norm, rnd); + randomize_tensor_normal(model->output, rnd); for (uint32_t i = 0; i < n_layer; ++i) { auto & layer = model->layers[i]; - randomize_tensor_normal(layer.attention_norm, &rnd); - - randomize_tensor_normal(layer.wq, &rnd); - randomize_tensor_normal(layer.wk, &rnd); - randomize_tensor_normal(layer.wv, &rnd); - randomize_tensor_normal(layer.wo, &rnd); - - randomize_tensor_normal(layer.ffn_norm, &rnd); - - randomize_tensor_normal(layer.w1, &rnd); - randomize_tensor_normal(layer.w2, &rnd); - randomize_tensor_normal(layer.w3, &rnd); - } -} - -void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) { - GGML_ASSERT(tensor->n_dims == 1); - GGML_ASSERT(tensor->ne[0] == ne0); -} - -void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) { - GGML_ASSERT(tensor->n_dims == 2); - GGML_ASSERT(tensor->ne[0] == ne0); - GGML_ASSERT(tensor->ne[1] == ne1); -} - -void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) { - GGML_ASSERT(tensor->n_dims == 3); - GGML_ASSERT(tensor->ne[0] == ne0); - GGML_ASSERT(tensor->ne[1] == ne1); - GGML_ASSERT(tensor->ne[2] == ne2); -} - -void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) { - GGML_ASSERT(tensor->n_dims == 4); - GGML_ASSERT(tensor->ne[0] == ne0); - GGML_ASSERT(tensor->ne[1] == ne1); - GGML_ASSERT(tensor->ne[2] == ne2); - GGML_ASSERT(tensor->ne[3] == ne3); -} - -static size_t hash(void * p) { - return (size_t)p % GGML_GRAPH_HASHTABLE_SIZE; -} - -static size_t hash_find(void * hash_table[], void * p) { - size_t h = hash(p); - - // linear probing - size_t i = h; - while (hash_table[i] != NULL && hash_table[i] != p) { - i = (i + 1) % GGML_GRAPH_HASHTABLE_SIZE; - if (i == h) { - // visited all hash table entries -> not found - return GGML_GRAPH_HASHTABLE_SIZE; - } - } - return i; -} - -static bool hash_insert(void * hash_table[], void * p) { - //size_t h = hash(p); - size_t i = hash_find(hash_table, p); - - GGML_ASSERT(i < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full - - if (hash_table[i] == p) { - return true; - } - - // insert - GGML_ASSERT(hash_table[i] == NULL); - hash_table[i] = p; - return false; -} - -static bool hash_contains(void * hash_table[], void * p) { - size_t i = hash_find(hash_table, p); - return (i < GGML_GRAPH_HASHTABLE_SIZE) && (hash_table[i] == p); -} - -struct hash_map { - void * keys[GGML_GRAPH_HASHTABLE_SIZE]; - void * vals[GGML_GRAPH_HASHTABLE_SIZE]; -}; -//static const size_t HASH_MAP_SIZE = sizeof(struct hash_map); - -struct hash_map * new_hash_map() { - struct hash_map * result = new struct hash_map; - for (int i=0; i<GGML_GRAPH_HASHTABLE_SIZE; ++i) { - result->keys[i] = NULL; - result->vals[i] = NULL; - } - return result; -}; - -void free_hash_map(struct hash_map * map) { - delete map; -} - -static bool ggml_is_view(struct ggml_tensor * t) { - return t->op == GGML_OP_RESHAPE || t->op == GGML_OP_VIEW || t->op == GGML_OP_TRANSPOSE || - t->op == GGML_OP_PERMUTE || t->op == GGML_OP_CPY; -} - -static struct ggml_tensor * get_view_parent(struct ggml_tensor * t) { - switch (t->op) { - case GGML_OP_PERMUTE: - case GGML_OP_RESHAPE: - case GGML_OP_TRANSPOSE: - case GGML_OP_VIEW: - return 
t->src[0]; - case GGML_OP_CPY: - return t->src[1]; - default: - return NULL; - } -} - -static struct ggml_tensor * get_view_source(struct ggml_tensor * t) { - struct ggml_tensor * parent = t; - do { - parent = get_view_parent(parent); - } while (ggml_is_view(parent)); - return parent; -} - -struct ggml_tensor * ggml_recompute_graph_node( - struct ggml_context * ctx, - struct ggml_cgraph * graph, - struct hash_map * replacements, - struct ggml_tensor * node) { - - if (node == NULL) { - return NULL; - } - - if (node->is_param) { - return node; - } - - if (!hash_contains(graph->visited_hash_table, node)) { - return node; - } - - int count_children = 0; - for (int k = 0; k < GGML_MAX_SRC; ++k) { - if (node->src[k]) { - ++count_children; - } - } - - if (count_children == 0) { - return node; - } + randomize_tensor_normal(layer.attention_norm, rnd); - size_t i = hash_find(replacements->keys, node); - GGML_ASSERT(i < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full - if (replacements->keys[i] == node) { - return (struct ggml_tensor *) replacements->vals[i]; - } - - struct ggml_tensor * clone = ggml_new_tensor(ctx, node->type, node->n_dims, node->ne); + randomize_tensor_normal(layer.wq, rnd); + randomize_tensor_normal(layer.wk, rnd); + randomize_tensor_normal(layer.wv, rnd); + randomize_tensor_normal(layer.wo, rnd); - // insert clone into replacements - GGML_ASSERT(replacements->keys[i] == NULL); // assert that we don't overwrite - replacements->keys[i] = node; - replacements->vals[i] = clone; + randomize_tensor_normal(layer.ffn_norm, rnd); - clone->op = node->op; - clone->grad = node->grad; - clone->is_param = node->is_param; - clone->extra = node->extra; - for (int k = 0; k < GGML_MAX_DIMS; ++k) { - clone->nb[k] = node->nb[k]; - } - for (int k = 0; k < GGML_MAX_SRC; ++k) { - clone->src[k] = ggml_recompute_graph_node(ctx, graph, replacements, node->src[k]); - } - if (ggml_is_view(clone)) { - struct ggml_tensor * source = get_view_source(clone); - GGML_ASSERT(source != NULL); - clone->data = source->data; + randomize_tensor_normal(layer.w1, rnd); + randomize_tensor_normal(layer.w2, rnd); + randomize_tensor_normal(layer.w3, rnd); } - GGML_ASSERT(sizeof(node->op_params) == sizeof(int32_t) * (GGML_MAX_OP_PARAMS / sizeof(int32_t))); - GGML_ASSERT(sizeof(node->name) == GGML_MAX_NAME); - memcpy(clone->op_params, node->op_params, sizeof(node->op_params)); - ggml_format_name(clone, "%s (clone)", ggml_get_name(node)); - - return clone; -}; - -void ggml_build_backward_gradient_checkpointing( - struct ggml_context * ctx, - struct ggml_cgraph * gf, - struct ggml_cgraph * gb, - struct ggml_cgraph * gb_tmp, - struct ggml_tensor * * checkpoints, - int n_checkpoints) { - *gb_tmp = *gf; - ggml_build_backward_expand(ctx, gf, gb_tmp, true); - - if (n_checkpoints <= 0) { - *gb = *gb_tmp; - return; - } - - struct hash_map * replacements = new_hash_map(); - - // insert checkpoints in replacements - for (int i = 0; i < n_checkpoints; ++i) { - size_t k = hash_find(replacements->keys, checkpoints[i]); - GGML_ASSERT(k < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full - GGML_ASSERT(replacements->keys[k] == NULL); // assert that we don't overwrite - replacements->keys[k] = checkpoints[i]; - replacements->vals[k] = checkpoints[i]; - } - - *gb = *gf; - // rewrite gb_tmp->nodes[gf->n_nodes:gb_tmp->n_nodes], - // replacing references to gb_tmp->nodes[0:gf->n_nodes] ( == gf->nodes[0:gf->n_nodes]), - // by recomputing them from checkpoints - for (int i = gf->n_nodes; i<gb_tmp->n_nodes; ++i) { - struct ggml_tensor * node = 
gb_tmp->nodes[i]; - for (int k = 0; k < GGML_MAX_SRC; ++k) { - // insert new tensors recomputing src, reusing already made replacements, - // remember replacements: remember new tensors with mapping from corresponding gf nodes - // recurse for input tensors, - // unless (i.e. terminating when) input tensors are checkpoints - node->src[k] = ggml_recompute_graph_node(ctx, gf, replacements, node->src[k]); - } - // insert rewritten backward node with replacements made into resulting backward graph gb - ggml_build_forward_expand(gb, node); - } - - free_hash_map(replacements); + free_random_normal_distribution(rnd); } -struct ggml_tensor * llama_build_train_graphs( +static struct ggml_tensor * llama_build_train_graphs( struct my_llama_model * model, struct ggml_allocr * alloc, struct ggml_context * ctx, @@ -714,7 +367,7 @@ struct ggml_tensor * llama_build_train_graphs( checkpoints.push_back(t00); checkpoints.push_back(t01); - struct ggml_tensor * kv_scale; + struct ggml_tensor * kv_scale = NULL; if (!enable_flash_attn) { kv_scale = ggml_new_f32(ctx, 1.0f/sqrtf(float(n_embd)/n_head)); } @@ -797,21 +450,14 @@ struct ggml_tensor * llama_build_train_graphs( ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, one)); // KQ_pos ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, one)); - GGML_ASSERT(t36->grad->data == NULL && !ggml_is_view(t36->grad)); + GGML_ASSERT(t36->grad->data == NULL && t36->grad->view_src == NULL); + ggml_allocr_alloc(alloc, t36->grad); - // gradient tensors (will be set to zero by ggml_graph_reset) - // pinning these produces large unnecessary memory overhead, which will be resolved by PR 2632 - for (int i = 0; i < gf->n_nodes; ++i) { - if (!gf->grads[i]) continue; - if (gf->grads[i]->data == NULL && !ggml_is_view(gf->grads[i])) { - ggml_allocr_alloc(alloc, gf->grads[i]); - } - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, gf->grads[i], one)); - } + // allocating checkpoints in one block to reduce memory fragmentation // note: they will be freed in reverse order for (int i = 0; i < (int) checkpoints.size(); ++i) { - if (checkpoints[i]->data == NULL && !ggml_is_view(checkpoints[i])) { + if (checkpoints[i]->data == NULL && checkpoints[i]->view_src == NULL) { ggml_allocr_alloc(alloc, checkpoints[i]); } } @@ -836,194 +482,6 @@ struct ggml_tensor * llama_build_train_graphs( return t36; } -void set_f32_3d(struct ggml_tensor * tensor, int64_t i0, int64_t i1, int64_t i2, float value) { - float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]); - *ptr = value; -} - -void set_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1, float value) { - float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); - *ptr = value; -} - -void set_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1, int32_t value) { - int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); - *ptr = value; -} - -float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) { - float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); - return *ptr; -} - -int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) { - int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); - return *ptr; -} - -void print_row(struct ggml_tensor * probs, int i) { - for (int k = 0; k < probs->ne[0]; ++k) { - float p = get_f32_2d(probs, k, i); - printf(" %.2f", p); - } - printf("\n"); -} - -void 
print_matrix(struct ggml_tensor * probs) { - assert(probs->n_dims == 2); - for (int i = 0; i < probs->ne[1]; ++i) { - for (int k = 0; k < probs->ne[0]; ++k) { - float p = get_f32_2d(probs, k, i); - printf(" %.2f", p); - } - printf("\n"); - } -} - -void get_example_targets(struct llama_context * lctx, const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) { - int n_tokens = tokens_input->ne[0]; - int n_vocab = target_logits->ne[0]; - - size_t sample = train_samples[example_id % n_train_samples]; - GGML_ASSERT(sample+n_tokens-1 < n_train_data); - - ggml_set_f32(target_logits, -1.0f/n_vocab); - ggml_set_f32(target_probs, 0.0f); - ggml_set_i32_1d(tokens_input, 0, llama_token_bos(lctx)); - for (int i=1; i<n_tokens+1; ++i) { - int token = clamp(train_data[sample+i-1], 0, n_vocab-1); - set_f32_2d(target_logits, token, i-1, +1.0f); - set_f32_2d(target_probs, token, i-1, +1.0f); - if (i<n_tokens) { - ggml_set_i32_1d(tokens_input, i, token); - } - } -} - -void get_example_targets_batch(struct llama_context * lctx, const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) { - GGML_ASSERT(tokens_input->n_dims == 2); - GGML_ASSERT(target_logits->n_dims == 3); - GGML_ASSERT(target_probs->n_dims == 3); - int n_vocab = target_logits->ne[0]; - int n_tokens = tokens_input->ne[0]; - int n_batch = tokens_input->ne[1]; - GGML_ASSERT(n_tokens == target_logits->ne[1]); - GGML_ASSERT(n_batch == target_logits->ne[2]); - GGML_ASSERT(n_vocab == target_probs->ne[0]); - GGML_ASSERT(n_tokens == target_probs->ne[1]); - GGML_ASSERT(n_batch == target_probs->ne[2]); - - ggml_set_f32(target_logits, -1.0f/n_vocab); - ggml_set_f32(target_probs, 0.0f); - // printf("%s: example_id=%d n_batch=%d n_train_samples=%zu\n", __func__, example_id, n_batch, n_train_samples); - for (int k=0; k<n_batch; ++k) { - // printf("%s: batch %d\n", __func__, k); - size_t sample_idx = (example_id*n_batch + k) % n_train_samples; - size_t sample = train_samples[sample_idx]; - // printf("%s: sample_idx=%zu sample=%zu\n", __func__, sample_idx, sample); - GGML_ASSERT(sample+n_tokens-1 < n_train_data); - - set_i32_2d(tokens_input, 0, k, llama_token_bos(lctx)); - for (int i=1; i<n_tokens+1; ++i) { - int token = clamp(train_data[sample+i-1], 0, n_vocab-1); - set_f32_3d(target_logits, token, i-1, k, +1.0f); - set_f32_3d(target_probs, token, i-1, k, +1.0f); - if (i<n_tokens) { - set_i32_2d(tokens_input, i, k, token); - } - } - } -} - -int tokenize_file(struct llama_context * lctx, const char * filename, std::vector<llama_token>& out) { - FILE * fp = std::fopen(filename, "rb"); - if (fp == NULL) { - return 0; - } - -#ifdef _WIN32 - GGML_ASSERT(_fseeki64(fp, (__int64) 0, SEEK_END) == 0); -#else - GGML_ASSERT(std::fseek(fp, (long) 0, SEEK_END) == 0); -#endif - - size_t size = 0; -#ifdef _WIN32 - __int64 ret = _ftelli64(fp); - size = ret; -#else - long ret = std::ftell(fp); - size = ret; -#endif - -#ifdef _WIN32 - GGML_ASSERT(_fseeki64(fp, (__int64) 0, SEEK_SET) == 0); -#else - GGML_ASSERT(std::fseek(fp, (long) 0, SEEK_SET) == 0); -#endif - - std::vector<char> buf; - buf.resize(size+1); - out.resize(size+1); - - if (std::fread(buf.data(), size, 1, fp) != 1) { - die("unexpectedly reached end of file"); - } - if (ferror(fp)) { - 
die_fmt("fread failed: %s", strerror(errno)); - } - - buf[size] = '\0'; - - int n_tokens = llama_tokenize(lctx, buf.data(), buf.size(), out.data(), out.size(), false); - if (n_tokens < 0) { - out.resize(-n_tokens); - n_tokens = llama_tokenize(lctx, buf.data(), buf.size(), out.data(), out.size(), false); - } - GGML_ASSERT(n_tokens >= 0); - out.resize(n_tokens); - - bool verify = false; - if (verify) { - const char * in = buf.data(); - const char * end = buf.data() + buf.size(); - for (int i = 0; i < (int) out.size(); ++i) { - std::string s = llama_token_to_piece(lctx, out[i]); - int len = s.length(); - if (in >= end) { - printf("%s: unexpected end of original text.\n", __func__); - break; - } - const bool matches = (strncmp(in, s.c_str(), len) == 0); - if (matches) { - in += len; - } else { - printf("%s: mismatch: expected '%s', but got '%s'\n", __func__, std::string(in, len).c_str(), s.c_str()); - } - } - } - - return n_tokens; -} - -void shuffle_ints(int * begin, int * end) { - if (end <= begin) return; - int max=begin[0]; - for (int i=1; i<end-begin; ++i) { - if (begin[i] > max) { - max = begin[i]; - } - } - std::vector<float> vals; - vals.resize(max+1); - for (int i=0; i<max+1; ++i) { - vals[i] = frand(); - } - std::sort(begin, end, [&vals](int a, int b){ - return vals.at(a) < vals.at(b); - }); -} - #define GGUF_GET_KEY(ctx, dst, func, type, req, key) \ { \ const std::string skey(key); \ @@ -1039,159 +497,7 @@ void shuffle_ints(int * begin, int * end) { } \ } - -bool are_same_layout(struct ggml_tensor * a, struct ggml_tensor * b) { - GGML_ASSERT(a != NULL); - GGML_ASSERT(b != NULL); - GGML_ASSERT(a->type == b->type); - GGML_ASSERT(ggml_are_same_shape(a, b)); - GGML_ASSERT(ggml_is_contiguous(a) && ggml_is_contiguous(b)); - - return true; -} - -void read_tensor_by_name(struct ggml_tensor * dst, struct ggml_context * ctx, const char * name) { - if (dst == NULL) { - return; - } - struct ggml_tensor * t = ggml_get_tensor(ctx, name); - GGML_ASSERT(are_same_layout(dst, t)); - memcpy(dst->data, t->data, ggml_nbytes(t)); - - if (strlen(ggml_get_name(dst)) == 0) { - ggml_set_name(dst, name); - } -} - -void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct ggml_opt_context * opt) { - // NOTE: gguf_context must be initialized with f_ggml_ctx and no_alloc=false, otherwise tensor data can not be read - - uint32_t file_version; - GGUF_GET_KEY(fctx, file_version, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_FILE_VERSION); - GGML_ASSERT(file_version == 0); - - GGUF_GET_KEY(fctx, opt->params.past, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT); - GGUF_GET_KEY(fctx, opt->iter, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_ITERATION_COUNT); - GGUF_GET_KEY(fctx, opt->just_initialized, gguf_get_val_bool, GGUF_TYPE_BOOL, true, LLM_KV_OPTIMIZER_JUST_INITIALIZED); - - uint64_t nx; - GGUF_GET_KEY(fctx, nx, gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_OPTIMIZER_PARAMETER_COUNT); - opt->nx = (size_t) nx; - - // don't call ggml_opt_init until optimizer type and optimizer specific parameters are know - - std::string opt_type; - GGUF_GET_KEY(fctx, opt_type, gguf_get_val_str, GGUF_TYPE_STRING, true, LLM_KV_OPTIMIZER_TYPE); - if (opt_type == LLM_KV_OPTIMIZER_TYPE_ADAM) { - opt->params.type = GGML_OPT_ADAM; - - GGUF_GET_KEY(fctx, opt->adam.fx_best, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_ADAM_BEST_LOSS); - GGUF_GET_KEY(fctx, opt->adam.fx_prev, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, 
LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS); - GGUF_GET_KEY(fctx, opt->adam.n_no_improvement, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT); - - GGML_ASSERT(opt->ctx != NULL); - ggml_opt_init(opt->ctx, opt, opt->params, opt->nx); - - read_tensor_by_name(opt->adam.m, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS); - read_tensor_by_name(opt->adam.v, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS); - read_tensor_by_name(opt->adam.pf, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES); - } else if (opt_type == LLM_KV_OPTIMIZER_TYPE_LBFGS) { - opt->params.type = GGML_OPT_LBFGS; - - GGUF_GET_KEY(fctx, opt->params.lbfgs.m, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT); - GGUF_GET_KEY(fctx, opt->lbfgs.fx_best, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS); - GGUF_GET_KEY(fctx, opt->lbfgs.step, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP); - GGUF_GET_KEY(fctx, opt->lbfgs.j, gguf_get_val_i32, GGUF_TYPE_INT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J); - GGUF_GET_KEY(fctx, opt->lbfgs.k, gguf_get_val_i32, GGUF_TYPE_INT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K); - GGUF_GET_KEY(fctx, opt->lbfgs.end, gguf_get_val_i32, GGUF_TYPE_INT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END); - GGUF_GET_KEY(fctx, opt->lbfgs.n_no_improvement, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT); - - GGML_ASSERT(opt->ctx != NULL); - ggml_opt_init(opt->ctx, opt, opt->params, opt->nx); - - read_tensor_by_name(opt->lbfgs.x, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS); - read_tensor_by_name(opt->lbfgs.xp, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS); - read_tensor_by_name(opt->lbfgs.g, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS); - read_tensor_by_name(opt->lbfgs.gp, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS); - read_tensor_by_name(opt->lbfgs.d, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION); - read_tensor_by_name(opt->lbfgs.pf, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES); - read_tensor_by_name(opt->lbfgs.lmal, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA); - read_tensor_by_name(opt->lbfgs.lmys, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS); - read_tensor_by_name(opt->lbfgs.lms, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S); - read_tensor_by_name(opt->lbfgs.lmy, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y); - } else { - die("unknown optimizer type"); - } -} - -void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * opt) { - gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_FILE_VERSION, 0); - gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT, opt->params.past); - gguf_set_val_u64(fctx, LLM_KV_OPTIMIZER_PARAMETER_COUNT, (uint64_t) opt->nx); - gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_ITERATION_COUNT, opt->iter); - gguf_set_val_bool(fctx, LLM_KV_OPTIMIZER_JUST_INITIALIZED, opt->just_initialized); - - switch (opt->params.type) { - case GGML_OPT_ADAM: - { - gguf_set_val_str(fctx, LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM); - gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_ADAM_BEST_LOSS, opt->adam.fx_best); - gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS, opt->adam.fx_prev); - gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT, opt->adam.n_no_improvement); - - ggml_set_name(opt->adam.m, LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS); - ggml_set_name(opt->adam.v, 
LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS); - if (opt->adam.pf) { - ggml_set_name(opt->adam.pf, LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES); - } - - gguf_add_tensor(fctx, opt->adam.m); - gguf_add_tensor(fctx, opt->adam.v); - if (opt->adam.pf) { - gguf_add_tensor(fctx, opt->adam.pf); - } - } break; - case GGML_OPT_LBFGS: - { - gguf_set_val_str(fctx, LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS); - gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, opt->params.lbfgs.m); - gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS, opt->lbfgs.fx_best); - gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP, opt->lbfgs.step); - gguf_set_val_i32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J, opt->lbfgs.j); - gguf_set_val_i32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K, opt->lbfgs.k); - gguf_set_val_i32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END, opt->lbfgs.end); - gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT, opt->lbfgs.n_no_improvement); - - ggml_set_name(opt->lbfgs.x, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS); - ggml_set_name(opt->lbfgs.xp, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS); - ggml_set_name(opt->lbfgs.g, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS); - ggml_set_name(opt->lbfgs.gp, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS); - ggml_set_name(opt->lbfgs.d, LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION); - if (opt->lbfgs.pf) { - ggml_set_name(opt->lbfgs.pf, LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES); - } - ggml_set_name(opt->lbfgs.lmal, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA); - ggml_set_name(opt->lbfgs.lmys, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS); - ggml_set_name(opt->lbfgs.lms, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S); - ggml_set_name(opt->lbfgs.lmy, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y); - - gguf_add_tensor(fctx, opt->lbfgs.x); - gguf_add_tensor(fctx, opt->lbfgs.xp); - gguf_add_tensor(fctx, opt->lbfgs.g); - gguf_add_tensor(fctx, opt->lbfgs.gp); - gguf_add_tensor(fctx, opt->lbfgs.d); - if (opt->lbfgs.pf) { - gguf_add_tensor(fctx, opt->lbfgs.pf); - } - gguf_add_tensor(fctx, opt->lbfgs.lmal); - gguf_add_tensor(fctx, opt->lbfgs.lmys); - gguf_add_tensor(fctx, opt->lbfgs.lms); - gguf_add_tensor(fctx, opt->lbfgs.lmy); - } break; - } -} - -void load_llama_model_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model) { +static void load_llama_model_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model) { // NOTE: gguf_context must be initialized with f_ggml_ctx and no_alloc=false, otherwise tensor data can not be read std::string arch; @@ -1243,26 +549,26 @@ void load_llama_model_gguf(struct gguf_context * fctx, struct ggml_context * f_g init_model(model); - read_tensor_by_name(model->tok_embeddings, f_ggml_ctx, tn(LLM_TENSOR_TOKEN_EMBD)); - read_tensor_by_name(model->norm, f_ggml_ctx, tn(LLM_TENSOR_OUTPUT_NORM)); - read_tensor_by_name(model->output, f_ggml_ctx, tn(LLM_TENSOR_OUTPUT)); + copy_tensor_by_name(model->tok_embeddings, f_ggml_ctx, tn(LLM_TENSOR_TOKEN_EMBD)); + copy_tensor_by_name(model->norm, f_ggml_ctx, tn(LLM_TENSOR_OUTPUT_NORM)); + copy_tensor_by_name(model->output, f_ggml_ctx, tn(LLM_TENSOR_OUTPUT)); for (uint32_t i = 0; i < model->hparams.n_layer; ++i) { auto & layer = model->layers[i]; - read_tensor_by_name(layer.attention_norm, f_ggml_ctx, tni(LLM_TENSOR_ATTN_NORM, i)); - read_tensor_by_name(layer.wq, f_ggml_ctx, tni(LLM_TENSOR_ATTN_Q, i)); - read_tensor_by_name(layer.wk, f_ggml_ctx, tni(LLM_TENSOR_ATTN_K, i)); - 
read_tensor_by_name(layer.wv, f_ggml_ctx, tni(LLM_TENSOR_ATTN_V, i)); - read_tensor_by_name(layer.wo, f_ggml_ctx, tni(LLM_TENSOR_ATTN_OUT, i)); - read_tensor_by_name(layer.ffn_norm, f_ggml_ctx, tni(LLM_TENSOR_FFN_NORM, i)); - read_tensor_by_name(layer.w1, f_ggml_ctx, tni(LLM_TENSOR_FFN_GATE, i)); - read_tensor_by_name(layer.w2, f_ggml_ctx, tni(LLM_TENSOR_FFN_DOWN, i)); - read_tensor_by_name(layer.w3, f_ggml_ctx, tni(LLM_TENSOR_FFN_UP, i)); + copy_tensor_by_name(layer.attention_norm, f_ggml_ctx, tni(LLM_TENSOR_ATTN_NORM, i)); + copy_tensor_by_name(layer.wq, f_ggml_ctx, tni(LLM_TENSOR_ATTN_Q, i)); + copy_tensor_by_name(layer.wk, f_ggml_ctx, tni(LLM_TENSOR_ATTN_K, i)); + copy_tensor_by_name(layer.wv, f_ggml_ctx, tni(LLM_TENSOR_ATTN_V, i)); + copy_tensor_by_name(layer.wo, f_ggml_ctx, tni(LLM_TENSOR_ATTN_OUT, i)); + copy_tensor_by_name(layer.ffn_norm, f_ggml_ctx, tni(LLM_TENSOR_FFN_NORM, i)); + copy_tensor_by_name(layer.w1, f_ggml_ctx, tni(LLM_TENSOR_FFN_GATE, i)); + copy_tensor_by_name(layer.w2, f_ggml_ctx, tni(LLM_TENSOR_FFN_DOWN, i)); + copy_tensor_by_name(layer.w3, f_ggml_ctx, tni(LLM_TENSOR_FFN_UP, i)); } } -void save_llama_model_gguf(struct gguf_context * fctx, const char * fn_vocab_model, struct my_llama_model * model) { +static void save_llama_model_gguf(struct gguf_context * fctx, const char * fn_vocab_model, struct my_llama_model * model) { const char * arch = "llama"; enum llama_ftype ftype = LLAMA_FTYPE_ALL_F32; @@ -1405,7 +711,8 @@ void save_llama_model_gguf(struct gguf_context * fctx, const char * fn_vocab_mod } } -void save_llama_model_file(const char * filename, const char * fn_vocab_model, struct my_llama_model * model) { +static void save_llama_model_file(const char * filename, const char * fn_vocab_model, struct my_llama_model * model) { + printf("%s: saving to %s\n", __func__, filename); struct gguf_context * fctx = gguf_init_empty(); save_llama_model_gguf(fctx, fn_vocab_model, model); @@ -1416,32 +723,24 @@ void save_llama_model_file(const char * filename, const char * fn_vocab_model, s gguf_free(fctx); } -void load_checkpoint_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model, struct ggml_opt_context * opt) { +static void load_checkpoint_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model, struct train_state * train) { load_llama_model_gguf(fctx, f_ggml_ctx, model); - - uint32_t file_version; - GGUF_GET_KEY(fctx, file_version, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_FILE_VERSION); - GGML_ASSERT(file_version == 0); - - GGUF_GET_KEY(fctx, model->train_its, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_ITERATION_COUNT); - GGUF_GET_KEY(fctx, model->train_samples, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_SAMPLE_COUNT); - GGUF_GET_KEY(fctx, model->train_tokens, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_TOKEN_COUNT); - - load_opt_context_gguf(fctx, f_ggml_ctx, opt); + if (load_train_state_gguf(fctx, f_ggml_ctx, train)) { + std::string train_type = LLM_KV_TRAINING_TYPE_TRAIN_MODEL; + GGUF_GET_KEY(fctx, train_type, gguf_get_val_str, GGUF_TYPE_STRING, false, LLM_KV_TRAINING_TYPE); + GGML_ASSERT(train_type == LLM_KV_TRAINING_TYPE_TRAIN_MODEL); + } else { + printf("%s: loaded llama model as checkpoint\n", __func__); + } } -void save_checkpoint_gguf(struct gguf_context * fctx, const char * fn_vocab_model, struct my_llama_model * model, struct ggml_opt_context * opt) { +static void save_checkpoint_gguf(struct gguf_context * fctx, const 
char * fn_vocab_model, struct my_llama_model * model, struct train_state * train) { + gguf_set_val_str(fctx, LLM_KV_TRAINING_TYPE, LLM_KV_TRAINING_TYPE_TRAIN_MODEL); save_llama_model_gguf(fctx, fn_vocab_model, model); - - gguf_set_val_u32(fctx, LLM_KV_TRAINING_FILE_VERSION, 0); - gguf_set_val_u32(fctx, LLM_KV_TRAINING_ITERATION_COUNT, model->train_its); - gguf_set_val_u32(fctx, LLM_KV_TRAINING_SAMPLE_COUNT, model->train_samples); - gguf_set_val_u32(fctx, LLM_KV_TRAINING_TOKEN_COUNT, model->train_tokens); - - save_opt_context_gguf(fctx, opt); + save_train_state_gguf(fctx, train); } -bool load_checkpoint_file(const char * filename, struct my_llama_model * model, struct ggml_opt_context * opt) { +static bool load_checkpoint_file(const char * filename, struct my_llama_model * model, struct train_state * train) { struct ggml_context * f_ggml_ctx; struct gguf_init_params params; params.no_alloc = false; @@ -1451,15 +750,16 @@ bool load_checkpoint_file(const char * filename, struct my_llama_model * model, return false; } - load_checkpoint_gguf(fctx, f_ggml_ctx, model, opt); + load_checkpoint_gguf(fctx, f_ggml_ctx, model, train); return true; } -void save_checkpoint_file(const char * filename, const char * fn_vocab_model, struct my_llama_model * model, struct ggml_opt_context * opt) { +static void save_checkpoint_file(const char * filename, const char * fn_vocab_model, struct my_llama_model * model, struct train_state * train) { + printf("%s: saving to %s\n", __func__, filename); struct gguf_context * fctx = gguf_init_empty(); - save_checkpoint_gguf(fctx, fn_vocab_model, model, opt); + save_checkpoint_gguf(fctx, fn_vocab_model, model, train); // write file const bool only_meta = false; @@ -1467,33 +767,13 @@ void save_checkpoint_file(const char * filename, const char * fn_vocab_model, st gguf_free(fctx); } -float cosine_decay(const int decay_steps, const float minimum, int step) { - if (step > decay_steps) { - step = decay_steps; - } - const float cosine_decay = 0.50f*(1.0f + cosf(3.14159265359f*step/decay_steps)); - const float decay = (1 - minimum)*cosine_decay + minimum; - return decay; -} - -float cosine_decay_restart(int decay_steps, const float minimum, int step, float restart_step_mult, bool enable_restart) { - if (enable_restart) { - while (step > decay_steps) { - step -= decay_steps; - decay_steps = (int) restart_step_mult * decay_steps; - } - } - return cosine_decay(decay_steps, minimum, step); -} - struct train_params { + struct train_params_common common; + const char * fn_vocab_model; - const char * fn_train_data; - const char * fn_checkpoint_in; - const char * fn_checkpoint_out; const char * fn_model_out; - uint32_t seed; + bool only_write_model; int n_ctx; int n_embd; @@ -1501,58 +781,18 @@ struct train_params { int n_layer; int n_ff; - int n_threads; - int n_batch; - int n_examples; - float f_norm_rms_eps; float rope_freq_base; float rope_freq_scale; - - int print_info_interval; - - bool samples_start_after_nl; - bool use_adam; - bool use_flash; - bool use_checkpointing; - bool use_alloc; - - // only adam - int warmup; - int cos_decay_steps; - float cos_decay_restart; - float cos_decay_min; - bool enable_restart; - - int opt_past; - float opt_delta; - int opt_max_no_improvement; - - int lbfgs_n_iter; - int adam_n_iter; - float adam_alpha; - float adam_min_alpha; - float adam_decay; - int adam_decay_min_ndim; - float adam_beta1; - float adam_beta2; - float adam_gclip; - float adam_eps_f; - - int mem_model_gb; - int mem_compute_gb; - int mem_compute0_gb; }; struct train_params 
get_default_train_params() { struct train_params params; + params.common = get_default_train_params_common(); params.fn_vocab_model = "ggml-vic7b-uncensored-q4_0.bin"; - params.fn_train_data = "shakespeare.txt"; - params.fn_checkpoint_in = "checkpoint.bin"; - params.fn_checkpoint_out = "checkpoint.bin"; params.fn_model_out = "ggml-checkpoint-f32.bin"; - params.seed = -1; + params.only_write_model = false; params.n_ctx = 128; params.n_embd = 256; @@ -1560,62 +800,22 @@ struct train_params get_default_train_params() { params.n_layer = 16; params.n_ff = 768; - params.n_threads = 6; - params.n_batch = 8; - params.n_examples = 1; - - params.f_norm_rms_eps = 1e-5; + params.f_norm_rms_eps = 1e-5f; params.rope_freq_base = 10000.0f; params.rope_freq_scale = 1.0f; - params.print_info_interval = 1; - - params.samples_start_after_nl = false; - params.use_adam = true; - params.use_flash = true; - params.use_checkpointing = true; - params.use_alloc = true; - - params.opt_past = 0; - params.opt_delta = 1e-5f; - params.opt_max_no_improvement = 0; - - // only adam - params.warmup = 100; - params.cos_decay_steps = 1000; - params.cos_decay_restart = 1.1f; - params.cos_decay_min = 0.1f; - params.enable_restart = false; - - params.lbfgs_n_iter = 256; - params.adam_n_iter = 256; - params.adam_alpha = 1e-3f; - params.adam_min_alpha = 0; - params.adam_decay = 1e-1f; - params.adam_decay_min_ndim = 2; - params.adam_beta1 = 0.9f; - params.adam_beta2 = 0.999f; - params.adam_gclip = 1.0f; - params.adam_eps_f = 0.0f; - - params.mem_model_gb = 2; - params.mem_compute_gb = 24; - params.mem_compute0_gb = 8; return params; } -void train_print_usage(int /*argc*/, char ** argv, const struct train_params * params) { +static void train_print_usage(int argc, char ** argv, const struct train_params * params) { fprintf(stderr, "usage: %s [options]\n", argv[0]); fprintf(stderr, "\n"); fprintf(stderr, "options:\n"); fprintf(stderr, " -h, --help show this help message and exit\n"); + fprintf(stderr, " --vocab-model FNAME model path from which to load vocab (default '%s')\n", params->fn_vocab_model); - fprintf(stderr, " --train-data FNAME path from which to load training data (default '%s')\n", params->fn_train_data); - fprintf(stderr, " --checkpoint-in FNAME path from which to load training checkpoint (default '%s')\n", params->fn_checkpoint_in); - fprintf(stderr, " --checkpoint-out FNAME path to save training checkpoint (default '%s')\n", params->fn_checkpoint_out); fprintf(stderr, " --model-out FNAME path to save ggml model (default '%s')\n", params->fn_model_out); - fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for -1)\n"); - fprintf(stderr, " -c N, --ctx N Context size used during training (default %d)\n", params->n_ctx); + fprintf(stderr, " --only-write-model only save llama model, don't do any training. use this if you only want to convert a checkpoint to a model.\n"); fprintf(stderr, " --embd N Embedding size used for new models (default %d)\n", params->n_embd); fprintf(stderr, " --ff N Feedforward size used for new models. 
-void train_print_usage(int /*argc*/, char ** argv, const struct train_params * params) {
+static void train_print_usage(int argc, char ** argv, const struct train_params * params) {
     fprintf(stderr, "usage: %s [options]\n", argv[0]);
     fprintf(stderr, "\n");
     fprintf(stderr, "options:\n");
     fprintf(stderr, "  -h, --help                 show this help message and exit\n");
+    fprintf(stderr, "  --vocab-model FNAME        model path from which to load vocab (default '%s')\n", params->fn_vocab_model);
-    fprintf(stderr, "  --train-data FNAME         path from which to load training data (default '%s')\n", params->fn_train_data);
-    fprintf(stderr, "  --checkpoint-in FNAME      path from which to load training checkpoint (default '%s')\n", params->fn_checkpoint_in);
-    fprintf(stderr, "  --checkpoint-out FNAME     path to save training checkpoint (default '%s')\n", params->fn_checkpoint_out);
     fprintf(stderr, "  --model-out FNAME          path to save ggml model (default '%s')\n", params->fn_model_out);
-    fprintf(stderr, "  -s SEED, --seed SEED       RNG seed (default: -1, use random seed for -1)\n");
-    fprintf(stderr, "  -c N, --ctx N              Context size used during training (default %d)\n", params->n_ctx);
+    fprintf(stderr, "  --only-write-model         only save llama model, don't do any training. use this if you only want to convert a checkpoint to a model.\n");
     fprintf(stderr, "  --embd N                   Embedding size used for new models (default %d)\n", params->n_embd);
     fprintf(stderr, "  --ff N                     Feedforward size used for new models. (default %d)\n", params->n_ff);
     fprintf(stderr, "  --head N                   Number of heads for new models (default %d)\n", params->n_head);
@@ -1623,45 +823,11 @@ void train_print_usage(int /*argc*/, char ** argv, const struct train_params * p
     fprintf(stderr, "  --norm-rms-eps F           RMS-Norm epsilon value (default %f)\n", params->f_norm_rms_eps);
     fprintf(stderr, "  --rope-freq-base F         Frequency base for ROPE (default %f)\n", params->rope_freq_base);
     fprintf(stderr, "  --rope-freq-scale F        Frequency scale for ROPE (default %f)\n", params->rope_freq_scale);
-    fprintf(stderr, "  -t N, --threads N          Number of threads (default %d)\n", params->n_threads);
-    fprintf(stderr, "  -b N, --batch N            Parallel batch size (default %d)\n", params->n_batch);
-    fprintf(stderr, "  -n N, --examples N         Number of examples to train (default %d)\n", params->n_examples);
-    fprintf(stderr, "  --print-info-interval N    Print infos during training each N examples (default %d)\n", params->print_info_interval);
-    fprintf(stderr, "  --samples-after-nl         Training samples start after newlines. (default %s)\n", params->samples_start_after_nl ? "on" : "off");
-    fprintf(stderr, "  --use-lbfgs                Use LBFGS optimizer instead of default Adam\n");
-    fprintf(stderr, "  --use-adam                 Use Adam optimizer (default)\n");
-    fprintf(stderr, "  --no-flash                 Don't use flash attention \n");
-    fprintf(stderr, "  --use-flash                Use flash attention (default)\n");
-    fprintf(stderr, "  --no-checkpointing         Don't use gradient checkpointing\n");
-    fprintf(stderr, "  --use-checkpointing        Use gradient checkpointing (default)\n");
-    fprintf(stderr, "  --no-alloc                 Don't use allocator\n");
-    fprintf(stderr, "  --use-alloc                Use allocator (default)\n");
-    fprintf(stderr, "  --warmup N                 Only for Adam optimizer. Number of warmup steps (default %d)\n", params->warmup);
-    fprintf(stderr, "  --cos-decay-steps N        Only for Adam optimizer. Number of cosine decay steps (default %d)\n", params->cos_decay_steps);
-    fprintf(stderr, "  --cos-decay-restart N      Only for Adam optimizer. Increase of cosine decay steps after restart (default %f)\n", params->cos_decay_restart);
-    fprintf(stderr, "  --cos-decay-min N          Only for Adam optimizer. Cosine decay minimum (default %f)\n", params->cos_decay_min);
-    fprintf(stderr, "  --enable-restart N         Only for Adam optimizer. Enable restarts of cos-decay %s\n", params->enable_restart ? "(default)" : "");
-    fprintf(stderr, "  --disable-restart N        Only for Adam optimizer. Disable restarts of cos-decay %s\n", !params->enable_restart ? "(default)" : "");
-    fprintf(stderr, "  --opt-past N               Number of optimization iterations to track for delta convergence test. Disabled when zero. (default %d)\n", params->opt_past);
-    fprintf(stderr, "  --opt-delta N              Maximum delta for delta convergence test. Disabled when <= zero. (default %f)\n", params->opt_delta);
-    fprintf(stderr, "  --opt-max-no-improvement N Maximum number of optimization iterations with no improvement. Disabled when <= zero. (default %d)\n", params->opt_max_no_improvement);
-    fprintf(stderr, "  --adam-epsf N              AdamW epsilon for convergence test. Disabled when <= zero. (default %f)\n", params->adam_eps_f);
-    fprintf(stderr, "  --adam-iter N              Maximum number of Adam optimization iterations for each batch (default %d)\n", params->adam_n_iter);
-    fprintf(stderr, "  --adam-alpha N             Adam learning rate alpha (default %f)\n", params->adam_alpha);
-    fprintf(stderr, "  --adam-min-alpha N         Adam minimum learning rate alpha - including warmup phase (default %f)\n", params->adam_min_alpha);
-    fprintf(stderr, "  --adam-decay N             AdamW weight decay. Values greater zero enable AdamW instead of regular Adam. (default %f)\n", params->adam_decay);
-    fprintf(stderr, "  --adam-decay-min-ndim N    Minimum number of tensor dimensions to apply AdamW weight decay. Weight decay is not applied to tensors with less n_dims. (default %d)\n", params->adam_decay_min_ndim);
-    fprintf(stderr, "  --adam-beta1 N             AdamW beta1 in interval [0,1). How much to smooth the first moment of gradients. (default %f)\n", params->adam_beta1);
-    fprintf(stderr, "  --adam-beta2 N             AdamW beta2 in interval [0,1). How much to smooth the second moment of gradients. (default %f)\n", params->adam_beta2);
-    fprintf(stderr, "  --adam-gclip N             AdamW gradient clipping. Disabled when zero. (default %f)\n", params->adam_gclip);
-    fprintf(stderr, "  --lbfgs-iter N             Maximum number of LBFGS optimization iterations for each batch (default %d)\n", params->lbfgs_n_iter);
-    fprintf(stderr, "  --mem-model N              Memory to allocate for model and cache in gigabytes. (default %d)\n", params->mem_model_gb);
-    fprintf(stderr, "  --mem-compute N            Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute_gb);
-    fprintf(stderr, "  --mem-compute0 N           Memory to allocate for automatic memory allocator in gigabytes. (default %d)\n", params->mem_compute0_gb);
-    fprintf(stderr, "\n");
+
+    print_common_train_usage(argc, argv, &params->common);
 }
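Before the rewrite, `train_params_parse()` below repeated one idiom dozens of times: advance `i`, and flag `invalid_param` if the value after a flag is missing. A self-contained illustration of that guard (the helper `next_arg` is hypothetical, not code from this patch):

```cpp
#include <cstdio>
#include <string>

// Hypothetical helper showing the "flag takes a value" guard that the
// deleted branches below all repeat inline.
static bool next_arg(int argc, char ** argv, int & i, std::string & out) {
    if (++i >= argc) {
        return false; // value missing: caller reports the error
    }
    out = argv[i];
    return true;
}

int main(int argc, char ** argv) {
    std::string model_out = "ggml-checkpoint-f32.bin";
    for (int i = 1; i < argc; ++i) {
        std::string arg = argv[i];
        if (arg == "--model-out" && !next_arg(argc, argv, i, model_out)) {
            fprintf(stderr, "error: missing value for %s\n", arg.c_str());
            return 1;
        }
    }
    printf("model-out: %s\n", model_out.c_str());
    return 0;
}
```

After the refactor, the shared flags never reach these branches at all: `consume_common_train_arg` gets first refusal on every argument.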
-bool train_params_parse(int argc, char ** argv, struct train_params * params) {
+static bool train_params_parse(int argc, char ** argv, struct train_params * params) {
     bool invalid_param = false;
     std::string arg;
     struct train_params default_params = get_default_train_params();
@@ -1673,48 +839,27 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) {
             std::replace(arg.begin(), arg.end(), '_', '-');
         }

-        if (arg == "--vocab-model") {
-            if (++i >= argc) {
-                invalid_param = true;
+        if (consume_common_train_arg(argc, argv, &i, &params->common, &invalid_param)) {
+            if (invalid_param) {
                 break;
+            } else if (params->common.print_usage) {
+                train_print_usage(argc, argv, &default_params);
+                exit(0);
             }
-            params->fn_vocab_model = argv[i];
-        } else if (arg == "--train-data") {
+        } else if (arg == "--vocab-model") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params->fn_train_data = argv[i];
-        } else if (arg == "--checkpoint-in") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->fn_checkpoint_in = argv[i];
-        } else if (arg == "--checkpoint-out") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->fn_checkpoint_out = argv[i];
+            params->fn_vocab_model = argv[i];
         } else if (arg == "--model-out") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             params->fn_model_out = argv[i];
-        } else if (arg == "-s" || arg == "--seed") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->seed = std::stoi(argv[i]);
-        } else if (arg == "-c" || arg == "--ctx") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_ctx = std::stoi(argv[i]);
+        } else if (arg == "--only-write-model") {
+            params->only_write_model = true;
         } else if (arg == "--embd") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -1757,175 +902,6 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) {
                 break;
             }
             params->rope_freq_scale = std::stof(argv[i]);
-        } else if (arg == "-t" || arg == "--threads") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_threads = std::stoi(argv[i]);
-        } else if (arg == "-b" || arg == "--batch") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_batch = std::stoi(argv[i]);
-        } else if (arg == "-n" || arg == "--examples") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_examples = std::stoi(argv[i]);
-        } else if (arg == "--print-info-interval") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->print_info_interval = std::stoi(argv[i]);
-        } else if (arg == "--samples-after-nl") {
-            params->samples_start_after_nl = true;
-        } else if (arg == "--use-lbfgs") {
-            params->use_adam = false;
-        } else if (arg == "--use-adam") {
-            params->use_adam = true;
-        } else if (arg == "--no-flash") {
-            params->use_flash = false;
-        } else if (arg == "--use-flash") {
-            params->use_flash = true;
-        } else if (arg == "--no-checkpointing") {
-            params->use_checkpointing = false;
-        } else if (arg == "--use-checkpointing") {
-            params->use_checkpointing = true;
-        } else if (arg == "--no-alloc") {
-            params->use_alloc = false;
-        } else if (arg == "--use-alloc") {
-            params->use_alloc = true;
-        } else if (arg == "--warmup") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->warmup = std::stoi(argv[i]);
-        } else if (arg == "--cos-decay-steps") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->cos_decay_steps = std::stof(argv[i]);
-        } else if (arg == "--cos-decay-restart") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->cos_decay_restart = std::stof(argv[i]);
-        } else if (arg == "--cos-decay-min") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->cos_decay_min = std::stof(argv[i]);
-        } else if (arg == "--enable-restart") {
-            params->enable_restart = true;
-        } else if (arg == "--disable-restart") {
-            params->enable_restart = false;
-        } else if (arg == "--opt-past") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->opt_past = std::stoi(argv[i]);
-        } else if (arg == "--opt-delta") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->opt_delta = std::stof(argv[i]);
-        } else if (arg == "--opt-max-no-improvement") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->opt_max_no_improvement = std::stoi(argv[i]);
-        } else if (arg == "--adam-epsf") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->adam_eps_f = std::stof(argv[i]);
-        } else if (arg == "--adam-iter") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->adam_n_iter = std::stoi(argv[i]);
-        } else if (arg == "--adam-alpha") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->adam_alpha = std::stof(argv[i]);
-        } else if (arg == "--adam-min-alpha") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->adam_min_alpha = std::stof(argv[i]);
-        } else if (arg == "--adam-decay") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->adam_decay = std::stof(argv[i]);
-        } else if (arg == "--adam-decay-min-ndim") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->adam_decay_min_ndim = std::stoi(argv[i]);
-        } else if (arg == "--adam-beta1") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->adam_beta1 = std::stof(argv[i]);
-        } else if (arg == "--adam-beta2") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->adam_beta2 = std::stof(argv[i]);
-        } else if (arg == "--adam-gclip") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->adam_gclip = std::stof(argv[i]);
-        } else if (arg == "--lbfgs-iter") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->lbfgs_n_iter = std::stoi(argv[i]);
-        } else if (arg == "--mem-model") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->mem_model_gb = std::stoi(argv[i]);
-        } else if (arg == "--mem-compute") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->mem_compute_gb = std::stoi(argv[i]);
-        } else if (arg == "--mem-compute0") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->mem_compute0_gb = std::stoi(argv[i]);
-        } else if (arg == "-h" || arg == "--help") {
-            train_print_usage(argc, argv, &default_params);
-            exit(0);
         } else {
             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
             train_print_usage(argc, argv, &default_params);
@@ -1937,65 +913,54 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) {
         train_print_usage(argc, argv, &default_params);
         exit(1);
     }
+    finish_processing_train_args(&params->common);
     return true;
 }
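In `save_train_files` a few lines below, every artifact is written twice: once under the current iteration number and once under a "latest" name (`iter` versus `-1`). The helper `get_train_filename` comes from the new common training code; a plausible stand-in for the observed behavior, for illustration only:

```cpp
#include <cstdint>
#include <cstdio>
#include <string>

// Stand-in for get_train_filename (assumption: simple placeholder
// substitution). A negative iteration selects the "latest" tag.
static std::string train_filename(const std::string & fn, const std::string & pattern_it,
                                  const std::string & latest, int64_t iteration) {
    const std::string sit = (iteration >= 0) ? std::to_string(iteration) : latest;
    std::string out = fn;
    const size_t pos = out.find(pattern_it);
    if (pos != std::string::npos) {
        out.replace(pos, pattern_it.size(), sit);
    }
    return out;
}

int main() {
    // example inputs only; prints chk-256.gguf, then chk-LATEST.gguf
    printf("%s\n", train_filename("chk-ITERATION.gguf", "ITERATION", "LATEST", 256).c_str());
    printf("%s\n", train_filename("chk-ITERATION.gguf", "ITERATION", "LATEST", -1).c_str());
    return 0;
}
```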
-struct opt_callback_data {
-    struct train_params * params;
-    struct ggml_opt_context * opt;
-    struct llama_context * lctx;
-    llama_token * tokens_data;
-    size_t tokens_size;
-    int * samples_data;
-    size_t samples_size;
-    int shuffle_countdown;
-    struct ggml_tensor * tokens_input;
-    struct ggml_tensor * target_logits;
-    struct ggml_tensor * target_probs;
+struct save_train_files_data {
+    const char * fn_checkpoint_out;
+    const char * fn_model_out;
+    const char * fn_vocab_model;
+    const char * pattern_fn_it;
+    const char * fn_latest;
+    struct my_llama_model * model;
 };

-void opt_callback(void * vdata, float * sched) {
-    struct opt_callback_data * data = (struct opt_callback_data *) vdata;
-    struct train_params * params = data->params;
-    struct ggml_opt_context * opt = data->opt;
-    int n_batch = params->n_batch;
-
-    *sched = (opt->iter < params->warmup)
-        ? (float) opt->iter / (float) params->warmup
-        : cosine_decay_restart(
-            params->cos_decay_steps,
-            params->cos_decay_min,
-            opt->iter - params->warmup,
-            params->cos_decay_restart,
-            params->enable_restart);
-    float min_sched = params->adam_min_alpha / params->adam_alpha;
-    *sched = min_sched + *sched * (1.0f - min_sched);
-
-    int impr_plot = std::isnan(opt->loss_after) ? 0 : -std::lround(1 + (opt->loss_before - opt->loss_after) * 10.0f);
-    printf("%s: iter=%*d, sched=%f loss0=%f loss=%f | improvement: %*d>\n", __func__, 6, opt->iter, *sched, opt->loss_before, opt->loss_after, impr_plot, (int)0);
-
-    if (data->shuffle_countdown < n_batch) {
-        printf("%s: reshuffle samples\n", __func__);
-        shuffle_ints(data->samples_data, data->samples_data + data->samples_size);
-        for (int i = 0; i < (int) data->samples_size; ++i) {
-            GGML_ASSERT(data->samples_data[i]+params->n_ctx-1 < (int) data->tokens_size);
-        }
-        data->shuffle_countdown = data->samples_size;
+static void save_train_files(void * vdata, struct train_state * train) {
+    struct save_train_files_data * data = (struct save_train_files_data *) vdata;
+    int64_t iter = train->opt->iter;
+
+    if (strlen(data->fn_checkpoint_out) > 0) {
+        save_checkpoint_file(get_train_filename(data->fn_checkpoint_out, data->pattern_fn_it, data->fn_latest, iter).c_str(), data->fn_vocab_model, data->model, train);
+        save_checkpoint_file(get_train_filename(data->fn_checkpoint_out, data->pattern_fn_it, data->fn_latest, -1  ).c_str(), data->fn_vocab_model, data->model, train);
+
+    }
+    if (strlen(data->fn_model_out) > 0) {
+        save_llama_model_file(get_train_filename(data->fn_model_out, data->pattern_fn_it, data->fn_latest, iter).c_str(), data->fn_vocab_model, data->model);
+        save_llama_model_file(get_train_filename(data->fn_model_out, data->pattern_fn_it, data->fn_latest, -1  ).c_str(), data->fn_vocab_model, data->model);
     }
+}
+
+static int64_t get_parameter_count(struct my_llama_model* model) {
+    int64_t nx = 0;
+    nx += ggml_nelements(model->tok_embeddings);
+    nx += ggml_nelements(model->norm);
+    nx += ggml_nelements(model->output);

-    get_example_targets_batch(
-        data->lctx,
-        data->samples_data,
-        data->samples_size,
-        data->tokens_data,
-        data->tokens_size,
-        opt->iter,
-        data->tokens_input,
-        data->target_logits,
-        data->target_probs);
-
-    data->shuffle_countdown -= n_batch;
+    for (uint32_t i = 0; i < model->layers.size(); ++i) {
+        auto & layer = model->layers[i];
+        nx += ggml_nelements(layer.attention_norm);
+        nx += ggml_nelements(layer.wq);
+        nx += ggml_nelements(layer.wk);
+        nx += ggml_nelements(layer.wv);
+        nx += ggml_nelements(layer.wo);
+        nx += ggml_nelements(layer.ffn_norm);
+        nx += ggml_nelements(layer.w1);
+        nx += ggml_nelements(layer.w2);
+        nx += ggml_nelements(layer.w3);
+    }
+    return nx;
 }
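A quick arithmetic check on `get_parameter_count()` above, using the defaults from this file (embd 256, layer 16, ff 768). The vocab size of 32000 is an assumption (the standard llama tokenizer); only the structure of the sum is taken from the function:

```cpp
#include <cstdint>
#include <cstdio>

// Mirrors the tensor enumeration in get_parameter_count(); n_vocab = 32000
// is assumed, everything else matches the defaults in this file.
int main() {
    const int64_t n_vocab = 32000, n_embd = 256, n_ff = 768, n_layer = 16;
    int64_t nx = 0;
    nx += n_vocab * n_embd;              // tok_embeddings
    nx += n_embd;                        // norm
    nx += n_embd * n_vocab;              // output
    nx += n_layer * (n_embd              //   attention_norm
                   + 4 * n_embd * n_embd //   wq, wk, wv, wo
                   + n_embd              //   ffn_norm
                   + 3 * n_embd * n_ff); //   w1, w2, w3
    printf("%lld parameters (~%.1fM)\n", (long long) nx, nx / 1e6); // ~30.0M
    return 0;
}
```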
 int main(int argc, char ** argv) {
@@ -2005,11 +970,11 @@
         return 1;
     }

-    if (params.seed == LLAMA_DEFAULT_SEED) {
-        params.seed = time(NULL);
+    if (params.common.seed == LLAMA_DEFAULT_SEED) {
+        params.common.seed = time(NULL);
     }
-    printf("%s: seed: %u\n", __func__, params.seed);
-    srand(params.seed);
+    printf("%s: seed: %u\n", __func__, params.common.seed);
+    srand(params.common.seed);

     struct llama_context_params llama_params = llama_context_default_params();
     llama_params.vocab_only = true;
@@ -2017,16 +982,9 @@
     struct llama_model * lmodel = llama_load_model_from_file(params.fn_vocab_model, llama_params);
     struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params);

-    printf("%s: tokenize training data\n", __func__);
-    std::vector<llama_token> train_tokens;
-    if (tokenize_file(lctx, params.fn_train_data, train_tokens) < 0) {
-        fprintf(stderr, "%s: failed to tokenize file '%s'\n", __func__, params.fn_train_data);
-    }
-    printf("%s: number of training tokens: %d\n", __func__, (int) train_tokens.size());
-
     struct my_llama_model model;
     model.hparams.n_vocab = llama_n_vocab(lctx);
-    model.hparams.n_ctx = params.n_ctx;
+    model.hparams.n_ctx = params.common.n_ctx;
     model.hparams.n_embd = params.n_embd;
     model.hparams.n_head = params.n_head;
     model.hparams.n_layer = params.n_layer;
@@ -2037,243 +995,311 @@
     model.hparams.rope_freq_base = params.rope_freq_base;
     model.hparams.rope_freq_scale = params.rope_freq_scale;

-    print_params(&model.hparams);
-
-    std::vector<size_t> token_noccurs;
-    std::vector<bool> token_notavail;
-    token_noccurs.resize(model.hparams.n_vocab, 0);
-    token_notavail.resize(model.hparams.n_vocab, true);
-    for (int i = 0; i < (int) train_tokens.size(); ++i) {
-        ++token_noccurs[train_tokens[i]];
-        token_notavail[train_tokens[i]] = false;
-    }
-
-    std::vector<float> token_freq;
-    token_freq.resize(model.hparams.n_vocab, 0);
-    int n_unique_tokens = 0;
-    for (int i = 0; i < (int) token_noccurs.size(); ++i) {
-        token_freq[i] = (float) token_noccurs[i] / (float) train_tokens.size();
-        n_unique_tokens += (token_noccurs[i] > 0) ? 1 : 0;
-    }
-    printf("%s: number of unique tokens: %d\n", __func__, n_unique_tokens);
+    struct train_state * train = init_train_state();
+    struct ggml_opt_context * opt = train->opt;
+
+    // set opt params from command line
+    opt->params = ggml_opt_default_params(GGML_OPT_ADAM);
+    opt->params.print_forward_graph = false;
+    opt->params.print_backward_graph = false;
+    opt->params.n_threads = params.common.n_threads;
+    opt->params.past = params.common.opt_past;
+    opt->params.delta = params.common.opt_delta;
+    opt->params.max_no_improvement = params.common.opt_max_no_improvement;
+    opt->params.n_gradient_accumulation = params.common.n_gradient_accumulation;
+    opt->params.adam.n_iter = params.common.adam_n_iter;
+    opt->params.adam.sched = 1.0f;
+    opt->params.adam.alpha = params.common.adam_alpha;
+    opt->params.adam.decay = params.common.adam_decay;
+    opt->params.adam.decay_min_ndim = params.common.adam_decay_min_ndim;
+    opt->params.adam.beta1 = params.common.adam_beta1;
+    opt->params.adam.beta2 = params.common.adam_beta2;
+    opt->params.adam.gclip = params.common.adam_gclip;
+    opt->params.adam.eps_f = params.common.adam_eps_f;

-    struct ggml_init_params lcparams;
-    lcparams.mem_size = 1024ll*1024ll*1024ll*((size_t) params.mem_model_gb);
-    lcparams.mem_buffer = NULL;
-    lcparams.no_alloc = false;
+    printf("%s: init model\n", __func__);
+    bool existed = load_checkpoint_file(params.common.fn_checkpoint_in, &model, train);
+    if (existed) {
+        // overwrite last n_ctx with user provided n_ctx
+        if (params.common.custom_n_ctx) {
+            model.hparams.n_ctx = params.common.n_ctx;
+        }

-    model.ctx = ggml_init(lcparams);
+        const bool opt_past_changed = opt->params.past != params.common.opt_past;

-    int n_tokens = model.hparams.n_ctx;
-    int n_vocab  = model.hparams.n_vocab;
-    int n_batch  = params.n_batch;
-
-    struct ggml_opt_context * opt = (struct ggml_opt_context *) alloca(sizeof(struct ggml_opt_context));
-    memset(opt, 0, sizeof(struct ggml_opt_context));
-
-    struct ggml_opt_params opt_params_adam  = ggml_opt_default_params(GGML_OPT_ADAM);
-    struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_LBFGS);
-    opt_params_adam.print_forward_graph = false;
-    opt_params_adam.print_backward_graph = false;
-    opt_params_adam.n_threads = params.n_threads;
-    opt_params_adam.past = params.opt_past;
-    opt_params_adam.delta = params.opt_delta;
-    opt_params_adam.max_no_improvement = params.opt_max_no_improvement;
-    opt_params_adam.adam.n_iter = params.adam_n_iter;
-    opt_params_adam.adam.sched = 1.0f;
-    opt_params_adam.adam.alpha = params.adam_alpha;
-    opt_params_adam.adam.decay = params.adam_decay;
-    opt_params_adam.adam.decay_min_ndim = params.adam_decay_min_ndim;
-    opt_params_adam.adam.beta1 = params.adam_beta1;
-    opt_params_adam.adam.beta2 = params.adam_beta2;
-    opt_params_adam.adam.gclip = params.adam_gclip;
-    opt_params_adam.adam.eps_f = params.adam_eps_f;
-
-    opt_params_lbfgs.print_forward_graph = false;
-    opt_params_lbfgs.print_backward_graph = false;
-    opt_params_lbfgs.n_threads = params.n_threads;
-    opt_params_adam.past = params.opt_past;
-    opt_params_adam.delta = params.opt_delta;
-    opt_params_adam.max_no_improvement = params.opt_max_no_improvement;
-    opt_params_lbfgs.lbfgs.n_iter = params.lbfgs_n_iter;
-
-    opt->ctx = model.ctx;
-    opt->params = params.use_adam ? opt_params_adam : opt_params_lbfgs;
-
-    printf("%s: init model\n", __func__);
-    bool existed = load_checkpoint_file(params.fn_checkpoint_in, &model, opt);
-    if (!existed) {
+        if (opt_past_changed) {
+            die("Optimizer parameter '--opt-past N' differs from checkpoint file. To use different value train from scratch with empty input checkpoint, e.g --checkpoint-in ''. Aborting");
+            // need to discard previous optimizer past function value statistics and opt_init with new shapes
+            // TODO
+        }
+    } else {
         init_model(&model);
+        randomize_model(&model, params.common.seed, 0.0f, 1.0f, -1.0f, +1.0f);
+        if (!params.only_write_model) {
+            ggml_opt_init(opt->ctx, opt, opt->params, get_parameter_count(&model));
+        }
     }
-    set_param_model(&model);
-
-    opt->params = params.use_adam ? opt_params_adam : opt_params_lbfgs;
+    opt->iter = train->train_its;

-    opt->iter = model.train_its;
-    printf("%s: opt iter %d\n", __func__, opt->iter);
-
-    bool from_scratch = !existed;
-    if (from_scratch) {
-        randomize_model(&model, params.seed, 0.0f, 1.0f, -1.0f, +1.0f);
+    print_params(&model.hparams);
+    printf("%s: total train_iterations %llu\n", __func__, (long long unsigned) train->train_its);
+    printf("%s: seen train_samples     %llu\n", __func__, (long long unsigned) train->train_samples);
+    printf("%s: seen train_tokens      %llu\n", __func__, (long long unsigned) train->train_tokens);
+    printf("%s: completed train_epochs %llu\n", __func__, (long long unsigned) train->train_epochs);
+    printf("%s: model_size = %zu bytes (%.1f MB)\n", __func__, (ggml_used_mem(model.ctx) + model.data.size()), (float) (ggml_used_mem(model.ctx) + model.data.size()) / (1024.0f*1024.0f));
+
+    if (params.only_write_model) {
+        save_train_files_data save_data;
+        save_data.fn_checkpoint_out = "";
+        save_data.fn_model_out      = params.fn_model_out;
+        save_data.fn_vocab_model    = params.fn_vocab_model;
+        save_data.pattern_fn_it     = params.common.pattern_fn_it;
+        save_data.fn_latest         = params.common.fn_latest;
+        save_data.model             = &model;
+
+        save_train_files(&save_data, train);
+
+        free_train_state(train);
+        ggml_free(model.ctx);
+        llama_free(lctx);
+        llama_free_model(lmodel);
+        return 0;
     }

-    printf("used_mem model: %zu bytes\n", ggml_used_mem(model.ctx));
-    // ggml_print_tensor_objects(model.ctx);
+    printf("%s: opt_size  = %zu bytes (%.1f MB)\n", __func__, ggml_get_mem_size(opt->ctx), (float) ggml_get_mem_size(opt->ctx) / (1024.0f*1024.0f));
+    printf("%s: opt iter %d\n", __func__, opt->iter);

-    // TODO: use std::vector<uint8_t> intead of "new"
-    size_t    compute_size = 1024ll*1024ll*1024ll*((size_t) params.mem_compute_gb);
-    uint8_t * compute_addr = new uint8_t[compute_size];
+    int n_tokens = model.hparams.n_ctx;
+    int n_vocab  = model.hparams.n_vocab;
+    int n_batch  = params.common.n_batch;

-    size_t    size_buf_0 = 1024ll*1024ll*1024ll*((size_t) params.mem_compute0_gb);
-    uint8_t * compute_buf_0 = new uint8_t[size_buf_0];
+    std::vector<uint8_t> mem_input_data;
+    std::vector<uint8_t> mem_compute_data;

     ggml_allocr * alloc = NULL;
-    if (params.use_alloc) {
-        static const size_t tensor_alignment = 32;
-        alloc = ggml_allocr_new(compute_buf_0, size_buf_0, tensor_alignment);
-    }
-
-    GGML_ASSERT(n_tokens < (int) train_tokens.size());
-    std::vector<int> train_samples;
-    train_samples.push_back(0);
-    for (int i = 1; i < (int) train_tokens.size() - n_tokens; ++i) {
-        if (!params.samples_start_after_nl || (train_tokens[i-1] == llama_token_nl(lctx))) {
-            train_samples.push_back(i);
-        }
-    }
-    shuffle_ints(train_samples.data(), train_samples.data() + train_samples.size());
-    for (int i = 0; i < (int) train_samples.size(); ++i) {
-        GGML_ASSERT(train_samples[i]+n_tokens-1 < (int) train_tokens.size());
-    }
-
-    printf("%s: begin training\n", __func__);
-
-    struct opt_callback_data opt_cb_data;
-    opt_cb_data.params = &params;
-    opt_cb_data.opt = opt;
-    opt_cb_data.lctx = lctx;
-    opt_cb_data.tokens_data = train_tokens.data();
-    opt_cb_data.tokens_size = train_tokens.size();
-    opt_cb_data.samples_data = train_samples.data();
-    opt_cb_data.samples_size = train_samples.size();
-    opt_cb_data.shuffle_countdown = train_samples.size();
-    opt_cb_data.tokens_input = NULL;
-    opt_cb_data.target_logits = NULL;
-    opt_cb_data.target_probs = NULL;
-
-    int64_t t0 = ggml_time_ms();
-
-    for (int ex = 0; ex < params.n_examples; ++ex) {
-        if (ex*n_batch >= (int) train_samples.size()) {
-            shuffle_ints(train_samples.data(), train_samples.data() + train_samples.size());
-            for (int i = 0; i < (int) train_samples.size(); ++i) {
-                GGML_ASSERT(train_samples[i]+n_tokens-1 < (int) train_tokens.size());
-            }
-        }
-
-        struct ggml_init_params cparams = {
-            compute_size, // mem_size
-            compute_addr, // mem_buffer
-            false,        // no_alloc
-        };
-        struct ggml_context * ctx0 = ggml_init(cparams);
-
-        ggml_set_no_alloc(ctx0, false);
-
-        // don't use alloc for input tensors, so we can safely fill them with data
-        //struct ggml_tensor * after_opt_best_samples = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_batch);
-        //struct ggml_tensor * after_opt_probs = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens, n_batch);
-        struct ggml_tensor * tokens_input  = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_batch);
-        struct ggml_tensor * target_logits = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens, n_batch);
-        struct ggml_tensor * target_probs  = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens, n_batch);
-
-        ggml_set_no_alloc(ctx0, (alloc != NULL));
-        if (alloc) {
-            ggml_allocr_reset(alloc);
-        }
-
-        opt_cb_data.tokens_input  = tokens_input;
-        opt_cb_data.target_logits = target_logits;
-        opt_cb_data.target_probs  = target_probs;
-
-        int n_past = 0;
-
-        struct ggml_cgraph * gf = ggml_new_graph(ctx0);
-        struct ggml_cgraph * gb = ggml_new_graph(ctx0);
-        struct ggml_cgraph * gb_tmp = params.use_checkpointing
-            ? ggml_new_graph(ctx0)
+    // context for input tensors without their data
+    struct ggml_init_params ctx_input_params = {
+        ggml_tensor_overhead() * 2, // mem_size
+        NULL,                       // mem_buffer
+        true,                       // no_alloc
+    };
+    struct ggml_context * ctx_input = ggml_init(ctx_input_params);
+
+    // the input tensors
+    struct ggml_tensor * tokens_input = ggml_new_tensor_2d(ctx_input, GGML_TYPE_I32, n_tokens, n_batch);
+    struct ggml_tensor * target_probs = ggml_new_tensor_3d(ctx_input, GGML_TYPE_F32, n_vocab, n_tokens, n_batch);
+
+    // measure required memory for input tensors
+    alloc = ggml_allocr_new_measure(tensor_alignment);
+    ggml_allocr_alloc(alloc, tokens_input);
+    ggml_allocr_alloc(alloc, target_probs);
+    size_t max_input_size = ggml_allocr_max_size(alloc) + tensor_alignment;
+    ggml_allocr_free(alloc);
+    printf("%s: input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f));
+
+    // allocate input tensors
+    mem_input_data.resize(max_input_size);
+    alloc = ggml_allocr_new(mem_input_data.data(), mem_input_data.size(), tensor_alignment);
+    ggml_allocr_alloc(alloc, tokens_input);
+    ggml_allocr_alloc(alloc, target_probs);
+    ggml_allocr_free(alloc);
+
+    // context for compute tensors without their data
+    size_t estimated_compute_size_wo_data = (
+        ggml_tensor_overhead()*GGML_MAX_NODES*2
+      + (GGML_OBJECT_SIZE+GGML_GRAPH_SIZE)*(
+            params.common.use_checkpointing ? 3 : 2
+        )
+    );
+    struct ggml_init_params ctx_compute_params = {
+        estimated_compute_size_wo_data, // mem_size
+        NULL,                           // mem_buffer
+        true,                           // no_alloc
+    };
+    struct ggml_context * ctx_compute = NULL;
+
+    struct ggml_tensor * loss   = NULL;
+    struct ggml_tensor * logits = NULL;
+
+    struct ggml_cgraph * gf     = NULL;
+    struct ggml_cgraph * gb     = NULL;
+    struct ggml_cgraph * gb_tmp = NULL;
+
+    // measure required memory for compute tensors
+    size_t best_compute_size = SIZE_MAX;
+    enum ggml_cgraph_eval_order best_order = GGML_CGRAPH_EVAL_ORDER_COUNT;
+    // find best evaluation order
+    for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) {
+        ctx_compute = ggml_init(ctx_compute_params);
+        alloc = ggml_allocr_new_measure(tensor_alignment);
+        gf = ggml_new_graph(ctx_compute);
+        gf->order = (enum ggml_cgraph_eval_order) order;
+        gb = ggml_new_graph(ctx_compute);
+        gb_tmp = params.common.use_checkpointing
+            ? ggml_new_graph(ctx_compute)
             : NULL;
-
-        GGML_ASSERT(n_past == 0);
-
-        struct ggml_tensor * loss   = NULL;
-        struct ggml_tensor * logits = NULL;
-
         loss = llama_build_train_graphs(
-            &model, alloc, ctx0,
+            &model, alloc, ctx_compute,
             gf, gb, gb_tmp,
             &logits, tokens_input, target_probs,
             n_tokens, n_batch,
-            params.use_flash,
-            params.use_checkpointing
+            params.common.use_flash,
+            params.common.use_checkpointing
         );
+        size_t max_compute_size = ggml_allocr_max_size(alloc) + tensor_alignment;
+        if (max_compute_size < best_compute_size) {
+            best_compute_size = max_compute_size;
+            best_order = gf->order;
+        }
+        ggml_allocr_free(alloc);
+        ggml_free(ctx_compute);
+    }
+    size_t max_compute_size = best_compute_size;
+    printf("%s: compute_size = %zu bytes (%.1f MB)\n", __func__, max_compute_size, (float) max_compute_size / (1024.0f*1024.0f));
+    printf("%s: evaluation order = %s\n", __func__,
+        (best_order == GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT) ? "LEFT_TO_RIGHT" :
+        (best_order == GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT) ? "RIGHT_TO_LEFT" :
+        "invalid");
+
+    // allocate compute tensors
+    mem_compute_data.resize(max_compute_size);
+    ctx_compute = ggml_init(ctx_compute_params);
+    alloc = ggml_allocr_new(mem_compute_data.data(), mem_compute_data.size(), tensor_alignment);
+    gf = ggml_new_graph(ctx_compute);
+    gf->order = best_order;
+    gb = ggml_new_graph(ctx_compute);
+    gb_tmp = params.common.use_checkpointing
+        ? ggml_new_graph(ctx_compute)
+        : NULL;
+    loss = llama_build_train_graphs(
+        &model, alloc, ctx_compute,
+        gf, gb, gb_tmp,
+        &logits, tokens_input, target_probs,
+        n_tokens, n_batch,
+        params.common.use_flash,
+        params.common.use_checkpointing
+    );
+    ggml_allocr_free(alloc);

-        size_t used_mem_before_opt = ggml_used_mem(ctx0);
-
-        opt->params.adam.sched = (opt->iter < params.warmup)
-            ? (float) opt->iter / (float) params.warmup
-            : cosine_decay_restart(
-                params.cos_decay_steps,
-                params.cos_decay_min,
-                opt->iter - params.warmup,
-                params.cos_decay_restart,
-                params.enable_restart);
-
-        float min_sched = params.adam_min_alpha / params.adam_alpha;
-        opt->params.adam.sched = min_sched + opt->params.adam.sched * (1.0f - min_sched);
-
-        printf("%s: opt->params.adam.sched %.5f\n", __func__, opt->params.adam.sched);
-
-        ggml_opt_resume_g(ctx0, opt, loss, gf, gb, &opt_callback, (void *) &opt_cb_data);
+    std::vector<llama_token> train_tokens;
+    std::vector<size_t> train_samples_begin;
+    std::vector<size_t> train_samples_size;
+    printf("%s: tokenize training data\n", __func__);
+    tokenize_file(lctx,
+            params.common.fn_train_data,
+            params.common.sample_start,
+            params.common.include_sample_start,
+            params.common.overlapping_samples,
+            n_tokens,
+            train_tokens,
+            train_samples_begin,
+            train_samples_size);
+    GGML_ASSERT(train_samples_begin.size() == train_samples_size.size());
+
+    printf("%s: number of training tokens: %zu\n", __func__, train_tokens.size());
+
+    size_t shuffle_samples_hash = compute_samples_hash(params.common.fn_train_data, train_samples_begin.data(), train_samples_size.data(), train_samples_size.size());
+    const bool changed_train_data = (shuffle_samples_hash != train->shuffle_samples_hash) || (train->shuffle_sample_count != train_samples_size.size());
+    if (changed_train_data) {
+        printf("%s: train data seems to have changed. restarting shuffled epoch.\n", __func__);
+    }
+    if (params.common.force_reshuffle) {
+        printf("%s: forced reshuffling of data. restarting with newly shuffled epoch.\n", __func__);
+    }
+    if ((train->shuffle_rng_state_current == "") || changed_train_data || params.common.force_reshuffle) {
+        train->shuffle_rng_state_current = mt19937_seed_to_state(params.common.seed);
+        train->shuffle_sample_count = train_samples_size.size();
+        train->shuffle_next_sample = 0;
+        train->shuffle_samples_hash = shuffle_samples_hash;
+    }
+    std::vector<size_t> train_shuffled_samples_offs;
+    std::vector<size_t> train_shuffled_samples_begin;
+    std::vector<size_t> train_shuffled_samples_size;
+    train_shuffled_samples_offs.resize(train_samples_begin.size());
+    train_shuffled_samples_begin.resize(train_samples_begin.size());
+    train_shuffled_samples_size.resize(train_samples_size.size());
+    train->shuffle_rng_state_next = shuffle_samples(
+        train->shuffle_rng_state_current,
+        train_shuffled_samples_offs.data(),
+        train_shuffled_samples_begin.data(),
+        train_shuffled_samples_size.data(),
+        train_samples_begin.data(),
+        train_samples_size.data(),
+        train_samples_size.size());

-        size_t used_mem_after_opt = ggml_used_mem(ctx0);
+    printf("%s: begin training\n", __func__);

-        int n_iter = params.use_adam ? params.adam_n_iter : params.lbfgs_n_iter;
-        model.train_its = opt->iter;
-        model.train_samples += n_batch * n_iter;
-        model.train_tokens += n_batch * n_tokens * n_iter;
+    save_train_files_data save_data;
+    save_data.fn_checkpoint_out = params.common.fn_checkpoint_out;
+    save_data.fn_model_out      = params.fn_model_out;
+    save_data.fn_vocab_model    = params.fn_vocab_model;
+    save_data.pattern_fn_it     = params.common.pattern_fn_it;
+    save_data.fn_latest         = params.common.fn_latest;
+    save_data.model             = &model;
+
+    struct train_opt_callback_data opt_cb_data;
+    opt_cb_data.params                 = &params.common;
+    opt_cb_data.train                  = train;
+    opt_cb_data.save_cb                = &save_train_files;
+    opt_cb_data.save_data              = &save_data;
+    opt_cb_data.lctx                   = lctx;
+    opt_cb_data.last_save_iter         = opt->iter;
+    opt_cb_data.tokens_data            = train_tokens.data();
+    opt_cb_data.tokens_size            = train_tokens.size();
+    opt_cb_data.samples_begin          = train_samples_begin.data();
+    opt_cb_data.samples_size           = train_samples_size.data();
+    opt_cb_data.shuffled_samples_offs  = train_shuffled_samples_offs.data();
+    opt_cb_data.shuffled_samples_begin = train_shuffled_samples_begin.data();
+    opt_cb_data.shuffled_samples_size  = train_shuffled_samples_size.data();
+    opt_cb_data.samples_count          = train_samples_size.size();
+    opt_cb_data.tokens_input           = tokens_input;
+    opt_cb_data.target_probs           = target_probs;
+    opt_cb_data.first_iter             = opt->iter;
+    opt_cb_data.first_epoch            = train->train_epochs;
+    opt_cb_data.iter_at_last_epoch     = -1;
+    opt_cb_data.last_time              = ggml_time_ms();
+    opt_cb_data.millis_per_iter        = 0.0;
+
+    // measure required memory for work buffer
+    size_t max_work_size = ggml_graph_plan(gb, params.common.n_threads).work_size + GGML_OBJECT_SIZE;
+    printf("%s: work_size = %zu bytes (%.1f MB)\n", __func__, max_work_size, (float) max_work_size / (1024.0f*1024.0f));
+
+    // context for work buffer
+    struct ggml_init_params ctx_work_params = {
+        max_work_size, // mem_size
+        NULL,          // mem_buffer
+        false,         // no_alloc
+    };
+    struct ggml_context * ctx_work = ggml_init(ctx_work_params);

-        if (params.print_info_interval > 0 && ex % params.print_info_interval == 0) {
-            printf("Example %d, opt iter %d\n", ex, opt->iter);
-            printf("error_before_opt: %.6f\n", opt->loss_before);
-            printf("error_after_opt: %.6f\n", opt->loss_after);
-            printf("used_mem_before_opt: %zu bytes\n", used_mem_before_opt);
-            printf("used_mem_after_opt: %zu bytes\n", used_mem_after_opt);
-        }
+    int64_t t0 = ggml_time_ms();

-        ggml_free(ctx0);
-    }
+    ggml_opt_resume_g(ctx_work, opt, loss, gf, gb, &train_opt_callback, (void *) &opt_cb_data);
+
+    ggml_free(ctx_work);
+    ggml_free(ctx_compute);
+    ggml_free(ctx_input);

     int64_t t1 = ggml_time_ms();
-    int64_t d = t1-t0;
-    double dd = (double) d * 1e-3;
-    printf("%s: total training time=%f seconds\n", __func__, dd);
+    printf("%s: total training time: ", __func__);
+    print_duration((double) (t1 - t0));
+    printf("\n");

-    if (params.n_examples > 0) {
-        save_checkpoint_file(params.fn_checkpoint_out, params.fn_vocab_model, &model, opt);
-    }
+    int new_iters = opt->iter - opt_cb_data.last_save_iter;
+    if (new_iters > 0) {
+        train->train_its    += new_iters;
+        train->train_tokens += new_iters * opt->params.n_gradient_accumulation * n_batch * n_tokens;

-    if (strlen(params.fn_model_out) > 0) {
-        save_llama_model_file(params.fn_model_out, params.fn_vocab_model, &model);
+        save_train_files(&save_data, train);
+        opt_cb_data.last_save_iter = opt->iter;
     }

     if (alloc) {
         ggml_allocr_free(alloc);
     }
-    delete[] compute_addr;
-    delete[] compute_buf_0;
+    ggml_free(opt->ctx);
+    free_train_state(train);
     ggml_free(model.ctx);
     llama_free(lctx);
     llama_free_model(lmodel);
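The rewritten `main()` above applies one idiom twice (input tensors, then compute graphs): dry-run the allocations against a measuring allocator to learn the peak size, then create a real allocator over a buffer of exactly that size and replay; the work buffer is sized separately via `ggml_graph_plan`. A condensed sketch of the measure-then-allocate idiom, using the same `ggml-alloc` calls as the patch (the two tensors and their sizes are arbitrary):

```cpp
#include "ggml.h"
#include "ggml-alloc.h"
#include <cstdint>
#include <cstdio>
#include <vector>

static const size_t tensor_alignment = 32;

int main() {
    // no_alloc context: tensor metadata only, no data buffers yet
    struct ggml_init_params ip = { ggml_tensor_overhead() * 2, NULL, /*no_alloc*/ true };
    struct ggml_context * ctx = ggml_init(ip);
    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);
    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 2048);

    // pass 1: measure the peak allocation size
    ggml_allocr * alloc = ggml_allocr_new_measure(tensor_alignment);
    ggml_allocr_alloc(alloc, a);
    ggml_allocr_alloc(alloc, b);
    size_t need = ggml_allocr_max_size(alloc) + tensor_alignment;
    ggml_allocr_free(alloc);
    printf("need %zu bytes\n", need);

    // pass 2: allocate a buffer of exactly that size and replay
    std::vector<uint8_t> buf(need);
    alloc = ggml_allocr_new(buf.data(), buf.size(), tensor_alignment);
    ggml_allocr_alloc(alloc, a);
    ggml_allocr_alloc(alloc, b);
    ggml_allocr_free(alloc);

    ggml_free(ctx);
    return 0;
}
```

This replaces the old fixed `--mem-compute N` gigabyte budgets: buffers are now sized to what the graph actually needs, which is also why the patch can afford to try both graph evaluation orders and keep the cheaper one.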