author | Cebtenzzre <cebtenzzre@gmail.com> | 2023-09-15 15:38:27 -0400
---|---|---
committer | GitHub <noreply@github.com> | 2023-09-15 15:38:27 -0400
commit | 3aefaab9e59335ebb07d5205dbc8633efd680e58 (patch) |
tree | 1249b50bd748a1bdcc85d010f44feda0d884fef7 /examples |
parent | 69eb67e28275cd2d57693405f768754a7b2245ad (diff) |
check C++ code with -Wmissing-declarations (#3184)
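The warning enabled here, GCC's `-Wmissing-declarations`, fires when a global function is defined without a previous declaration: the function either belongs in a header, or it is file-local and should be given internal linkage. The pattern applied throughout this diff is the latter — helpers private to one translation unit are marked `static` (and file-scope constants become `constexpr`). A minimal sketch of the warning and the fix (the `helper` function is illustrative, not taken from this commit):

```cpp
// Build sketch: g++ -Wmissing-declarations -c example.cpp

// With external linkage and no prior declaration, this definition would warn:
//   warning: no previous declaration for 'int helper(int)' [-Wmissing-declarations]
// int helper(int x) { return 2 * x; }

// Marking the helper 'static' gives it internal linkage: no other translation
// unit can call it, so the compiler no longer expects a separate declaration.
static int helper(int x) {
    return 2 * x;
}

int main() {
    return helper(21) == 42 ? 0 : 1;
}
```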
Diffstat (limited to 'examples')
-rw-r--r-- | examples/baby-llama/baby-llama.cpp | 148
-rw-r--r-- | examples/beam-search/beam-search.cpp | 7
-rw-r--r-- | examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp | 38
-rw-r--r-- | examples/gguf/gguf.cpp | 8
-rw-r--r-- | examples/main/main.cpp | 5
-rw-r--r-- | examples/perplexity/perplexity.cpp | 30
-rw-r--r-- | examples/quantize-stats/quantize-stats.cpp | 49
-rw-r--r-- | examples/quantize/quantize.cpp | 4
-rw-r--r-- | examples/server/server.cpp | 12
9 files changed, 150 insertions, 151 deletions
diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp
index a99ece9a..ed61125e 100644
--- a/examples/baby-llama/baby-llama.cpp
+++ b/examples/baby-llama/baby-llama.cpp
@@ -9,12 +9,12 @@
 #endif
 
 #ifdef LLAMA_DEFAULT_RMS_EPS
-static const float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
+constexpr float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
 #else
-static const float rms_norm_eps = 5e-6f;
+constexpr float rms_norm_eps = 5e-6f;
 #endif
 
-float frand() {
+static float frand() {
     return (float)rand()/(float)RAND_MAX;
 }
 
@@ -25,19 +25,21 @@ struct random_normal_distribution {
     float max;
 };
 
-void init_random_normal_distribution(struct random_normal_distribution * rnd, int seed, float mean, float std, float min, float max) {
+static void init_random_normal_distribution(
+    struct random_normal_distribution * rnd, int seed, float mean, float std, float min, float max
+) {
     rnd->gen = std::mt19937(seed);
     rnd->nd = std::normal_distribution<float>{mean, std};
     rnd->min = min;
     rnd->max = max;
 }
 
-float frand_normal(struct random_normal_distribution * rnd) {
+static float frand_normal(struct random_normal_distribution * rnd) {
     const float r = rnd->nd(rnd->gen);
     return ((r < rnd->min) ? (rnd->min) : (r > rnd->max) ? (rnd->max) : r);
 }
 
-void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
+static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
     struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
 
     if (plan.work_size > 0) {
@@ -48,13 +50,9 @@ void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph,
     ggml_graph_compute(graph, &plan);
 }
 
-struct ggml_tensor * randomize_tensor(
-    struct ggml_tensor * tensor,
-    int ndims,
-    const int64_t ne[],
-    float fmin,
-    float fmax) {
-
+static struct ggml_tensor * randomize_tensor(
+    struct ggml_tensor * tensor, int ndims, const int64_t ne[], float fmin, float fmax
+) {
     switch (ndims) {
         case 1:
             for (int i0 = 0; i0 < ne[0]; i0++) {
@@ -95,11 +93,9 @@ struct ggml_tensor * randomize_tensor(
     return tensor;
 }
 
-struct ggml_tensor * randomize_tensor_normal(
-    struct ggml_tensor * tensor,
-    int ndims,
-    const int64_t ne[],
-    struct random_normal_distribution * rnd) {
+static struct ggml_tensor * randomize_tensor_normal(
+    struct ggml_tensor * tensor, int ndims, const int64_t ne[], struct random_normal_distribution * rnd
+) {
     float scale = 1.0; // xavier
     switch (ndims) {
         case 1:
@@ -159,7 +155,7 @@ struct llama_hparams {
     }
 };
 
-uint32_t get_n_ff(const struct llama_hparams* hparams) {
+static uint32_t get_n_ff(const struct llama_hparams* hparams) {
     const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult;
     return n_ff;
 }
@@ -260,7 +256,7 @@ struct llama_model_lora {
     std::vector<llama_layer_lora> layers;
 };
 
-void init_model(struct llama_model * model) {
+static void init_model(struct llama_model * model) {
     const auto & hparams = model->hparams;
 
     const uint32_t n_embd = hparams.n_embd;
@@ -297,7 +293,7 @@ void init_model(struct llama_model * model) {
 }
 
 
-void init_model_lora(struct llama_model_lora * model) {
+static void init_model_lora(struct llama_model_lora * model) {
     const auto & hparams = model->hparams;
 
     const uint32_t n_embd = hparams.n_embd;
@@ -340,7 +336,7 @@ void init_model_lora(struct llama_model_lora * model) {
     }
 }
 
-void set_param_model(struct llama_model * model) {
+static void set_param_model(struct llama_model * model) {
     const auto& hparams = model->hparams;
 
     const uint32_t n_layer = hparams.n_layer;
@@ -366,7 +362,7 @@ void set_param_model(struct llama_model * model) {
     }
 }
 
-void set_param_model_lora(struct llama_model_lora * model) {
+static void set_param_model_lora(struct llama_model_lora * model) {
     const auto& hparams = model->hparams;
 
     const uint32_t n_layer = hparams.n_layer;
@@ -397,7 +393,7 @@ void set_param_model_lora(struct llama_model_lora * model) {
     }
 }
 
-void randomize_model(struct llama_model * model, int seed, float mean, float std, float min, float max) {
+static void randomize_model(struct llama_model * model, int seed, float mean, float std, float min, float max) {
     const auto & hparams = model->hparams;
 
     const uint32_t n_layer = hparams.n_layer;
@@ -426,7 +422,9 @@ void randomize_model(struct llama_model * model, int seed, float mean, float std
 }
 
 
-void randomize_model_lora(struct llama_model_lora * model, int seed, float mean, float std, float min, float max) {
+static void randomize_model_lora(
+    struct llama_model_lora * model, int seed, float mean, float std, float min, float max
+) {
     const auto & hparams = model->hparams;
 
     const uint32_t n_layer = hparams.n_layer;
@@ -459,7 +457,7 @@ void randomize_model_lora(struct llama_model_lora * model, int seed, float mean,
     }
 }
 
-bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int n_batch) {
+static bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int n_batch) {
     const auto & hparams = model->hparams;
 
     const uint32_t n_ctx = hparams.n_ctx;
@@ -495,7 +493,7 @@ bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int
     return true;
 }
 
-bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora * model, int n_batch) {
+static bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora * model, int n_batch) {
     const auto & hparams = model->hparams;
 
     const uint32_t n_ctx = hparams.n_ctx;
@@ -531,15 +529,15 @@ bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora *
     return true;
 }
 
-struct ggml_tensor * forward(
-    struct llama_model * model,
-    struct llama_kv_cache * cache,
-    struct ggml_context * ctx0,
-    struct ggml_cgraph * gf,
-    struct ggml_tensor * tokens_input,
-    const int n_tokens,
-    const int n_past) {
-
+static struct ggml_tensor * forward(
+    struct llama_model * model,
+    struct llama_kv_cache * cache,
+    struct ggml_context * ctx0,
+    struct ggml_cgraph * gf,
+    struct ggml_tensor * tokens_input,
+    const int n_tokens,
+    const int n_past
+) {
     const int N = n_tokens;
 
     struct llama_kv_cache& kv_self = *cache;
@@ -756,25 +754,25 @@ struct ggml_tensor * forward(
     return inpL;
 }
 
-void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) {
+static void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) {
     GGML_ASSERT(tensor->n_dims == 1);
     GGML_ASSERT(tensor->ne[0] == ne0);
 }
 
-void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) {
+static void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) {
     GGML_ASSERT(tensor->n_dims == 2);
     GGML_ASSERT(tensor->ne[0] == ne0);
     GGML_ASSERT(tensor->ne[1] == ne1);
 }
 
-void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) {
+static void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) {
     GGML_ASSERT(tensor->n_dims == 3);
     GGML_ASSERT(tensor->ne[0] == ne0);
     GGML_ASSERT(tensor->ne[1] == ne1);
     GGML_ASSERT(tensor->ne[2] == ne2);
 }
 
-void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
+static void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
     GGML_ASSERT(tensor->n_dims == 4);
     GGML_ASSERT(tensor->ne[0] == ne0);
     GGML_ASSERT(tensor->ne[1] == ne1);
@@ -782,16 +780,16 @@ void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int6
     GGML_ASSERT(tensor->ne[3] == ne3);
 }
 
-struct ggml_tensor * forward_batch(
-    struct llama_model * model,
-    struct llama_kv_cache * cache,
-    struct ggml_context * ctx0,
-    struct ggml_cgraph * gf,
-    struct ggml_tensor * tokens_input,
-    const int n_tokens,
-    const int n_past,
-    const int n_batch) {
-
+static struct ggml_tensor * forward_batch(
+    struct llama_model * model,
+    struct llama_kv_cache * cache,
+    struct ggml_context * ctx0,
+    struct ggml_cgraph * gf,
+    struct ggml_tensor * tokens_input,
+    const int n_tokens,
+    const int n_past,
+    const int n_batch
+) {
     const int N = n_tokens;
 
     struct llama_kv_cache& kv_self = *cache;
@@ -1073,16 +1071,15 @@ struct ggml_tensor * forward_batch(
     return inpL;
 }
 
-
-struct ggml_tensor * forward_lora(
-    struct llama_model_lora * model,
-    struct llama_kv_cache * cache,
-    struct ggml_context * ctx0,
-    struct ggml_cgraph * gf,
-    struct ggml_tensor * tokens_input,
-    const int n_tokens,
-    const int n_past) {
-
+static struct ggml_tensor * forward_lora(
+    struct llama_model_lora * model,
+    struct llama_kv_cache * cache,
+    struct ggml_context * ctx0,
+    struct ggml_cgraph * gf,
+    struct ggml_tensor * tokens_input,
+    const int n_tokens,
+    const int n_past
+) {
     const int N = n_tokens;
 
     struct llama_kv_cache& kv_self = *cache;
@@ -1328,7 +1325,7 @@ struct ggml_tensor * forward_lora(
     return inpL;
 }
 
-void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) {
+static void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) {
     assert(logits->n_dims == 2);
     assert(probs->n_dims == 2);
     assert(best_samples->n_dims == 1);
@@ -1359,7 +1356,10 @@ void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, str
     }
 }
 
-void sample_softmax_batch(struct ggml_context * ctx, struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) {
+static void sample_softmax_batch(
+    struct ggml_context * ctx, struct ggml_tensor * logits, struct ggml_tensor * probs,
+    struct ggml_tensor * best_samples
+) {
     GGML_ASSERT(best_samples->n_dims == 2);
     GGML_ASSERT(logits->n_dims == 3);
     GGML_ASSERT(probs->n_dims == 3);
@@ -1393,7 +1393,7 @@ void sample_softmax_batch(struct ggml_context * ctx, struct ggml_tensor * logits
     }
 }
 
-void print_row(struct ggml_tensor * probs, int i) {
+static void print_row(struct ggml_tensor * probs, int i) {
     for (int k = 0; k < probs->ne[0]; ++k) {
         float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k);
         printf(" %.2f", p);
@@ -1401,7 +1401,7 @@ void print_row(struct ggml_tensor * probs, int i) {
     printf("\n");
 }
 
-void print_matrix(struct ggml_tensor * probs) {
+static void print_matrix(struct ggml_tensor * probs) {
     assert(probs->n_dims == 2);
     for (int i = 0; i < probs->ne[1]; ++i) {
         for (int k = 0; k < probs->ne[0]; ++k) {
@@ -1412,7 +1412,7 @@ void print_matrix(struct ggml_tensor * probs) {
     }
 }
 
-void print_token(int token, int n_vocab) {
+static void print_token(int token, int n_vocab) {
     for (int k = 0; k < token; ++k) {
         printf(" ");
     }
@@ -1423,14 +1423,14 @@ void print_token(int token, int n_vocab) {
     printf("\n");
 }
 
-void print_tokens(struct ggml_tensor * tokens, int n_vocab) {
+static void print_tokens(struct ggml_tensor * tokens, int n_vocab) {
     for (int i=0; i<tokens->ne[0]; ++i) {
         int token = ggml_get_i32_1d(tokens, i);
         print_token(token, n_vocab);
     }
 }
 
-void get_example_targets(int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) {
+static void get_example_targets(int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) {
     int n_tokens = tokens_input->ne[0];
     int n_vocab = targets->ne[0];
     float randomness = 0.0f;
@@ -1451,7 +1451,9 @@ void get_example_targets(int example_id, struct ggml_tensor * tokens_input, stru
     }
 }
 
-void get_example_targets_batch(struct ggml_context * ctx, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) {
+static void get_example_targets_batch(
+    struct ggml_context * ctx, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets
+) {
     GGML_ASSERT(tokens_input->n_dims == 2);
     GGML_ASSERT( targets->n_dims == 3);
     int n_tokens = tokens_input->ne[0];
@@ -1474,7 +1476,7 @@ void get_example_targets_batch(struct ggml_context * ctx, int example_id, struct
     }
 }
 
-void lshift_examples(struct ggml_tensor * tokens_input, struct ggml_tensor * targets, int n_shift) {
+static void lshift_examples(struct ggml_tensor * tokens_input, struct ggml_tensor * targets, int n_shift) {
     int n_tokens = tokens_input->ne[0];
     int n_vocab = targets->ne[0];
     for (int i=0; i<n_tokens-n_shift; ++i) {
@@ -1485,12 +1487,16 @@ void lshift_examples(struct ggml_tensor * tokens_input, struct ggml_tensor * tar
     }
 }
 
-struct ggml_tensor * square_error_loss(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
+static struct ggml_tensor * square_error_loss(
+    struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b
+) {
     // todo: instead of a-b: a[1:]-b[:-1]
     return ggml_sum(ctx, ggml_sqr(ctx, ggml_sub(ctx, a, b)));
 }
 
-struct ggml_tensor * cross_entropy_loss(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
+static struct ggml_tensor * cross_entropy_loss(
+    struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b
+) {
     const float eps = 1e-3f;
     return ggml_sum(ctx,
diff --git a/examples/beam-search/beam-search.cpp b/examples/beam-search/beam-search.cpp
index 6b31aea7..805170c9 100644
--- a/examples/beam-search/beam-search.cpp
+++ b/examples/beam-search/beam-search.cpp
@@ -30,7 +30,8 @@ struct ostream_beam_view {
     llama_context * ctx;
     llama_beam_view beam_view;
 };
-std::ostream& operator<<(std::ostream& os, const ostream_beam_view & obv) {
+
+static std::ostream & operator<<(std::ostream & os, const ostream_beam_view & obv) {
     os << "p(" << obv.beam_view.p << ") eob(" << std::boolalpha << obv.beam_view.eob << ") tokens(";
     for (size_t i = 0 ; i < obv.beam_view.n_tokens ; ++i) {
         os << llama_token_to_piece(obv.ctx, obv.beam_view.tokens[i]);
@@ -46,7 +47,7 @@ struct beam_search_callback_data {
 
 // In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same.
 // For example, eob can be flagged due to maximum token length, stop words, etc.
-bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, const size_t n_tokens) {
+static bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, size_t n_tokens) {
     return n_tokens && tokens[n_tokens-1] == llama_token_eos(callback_data.ctx);
 }
 
@@ -56,7 +57,7 @@ bool is_at_eob(const beam_search_callback_data & callback_data, const llama_toke
 // * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
 //   This is also called when the stop condition is met.
 //   Collect tokens into std::vector<llama_token> response which is pointed to by callback_data.
-void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_state) {
+static void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_state) {
     auto& callback_data = *static_cast<beam_search_callback_data*>(callback_data_ptr);
     // Mark beams as EOS as needed.
     for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
index 293b455d..c291f0ad 100644
--- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@@ -115,7 +115,7 @@ struct TransformerWeights {
     }
 };
 
-void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) {
+static void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) {
     // we calloc instead of malloc to keep valgrind happy
     w->token_embedding_table = new float[p->vocab_size * p->dim]();
     printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
@@ -158,7 +158,7 @@ void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) {
     }
 }
 
-int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shared_weights) {
+static int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shared_weights) {
     if (fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
     if (fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
     if (fread(w->wq, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
@@ -189,7 +189,7 @@ int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shar
     return 0;
 }
 
-void print_sample_weights(TransformerWeights *w){
+static void print_sample_weights(TransformerWeights *w){
     printf("----- Quick print of first of the weight vales of all the variables\n");
     printf("%f\n", w->token_embedding_table[0]);
     printf("%f\n", w->rms_att_weight[0]);
@@ -324,7 +324,7 @@ struct train_params {
     int mem_compute1_gb;
 };
 
-void print_params(struct my_llama_hparams * params) {
+static void print_params(struct my_llama_hparams * params) {
     printf("%s: n_vocab: %d\n", __func__, params->n_vocab);
     printf("%s: n_ctx: %d\n", __func__, params->n_ctx);
     printf("%s: n_embd: %d\n", __func__, params->n_embd);
@@ -335,7 +335,7 @@ void print_params(struct my_llama_hparams * params) {
     printf("%s: n_rot: %d\n", __func__, params->n_rot);
 }
 
-void init_model(struct my_llama_model * model) {
+static void init_model(struct my_llama_model * model) {
     const auto & hparams = model->hparams;
 
     const uint32_t n_embd = hparams.n_embd;
@@ -408,17 +408,17 @@ void init_model(struct my_llama_model * model) {
     }
 }
 
-float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
+static float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
     float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
     return *ptr;
 }
 
-int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
+static int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
     int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
     return *ptr;
 }
 
-void print_row(struct ggml_tensor * probs, int i) {
+static void print_row(struct ggml_tensor * probs, int i) {
     for (int k = 0; k < probs->ne[0]; ++k) {
         float p = get_f32_2d(probs, k, i);
         printf(" %f", p);
@@ -426,7 +426,7 @@ void print_row(struct ggml_tensor * probs, int i) {
     printf("\n");
 }
 
-void print_matrix(struct ggml_tensor * probs) {
+static void print_matrix(struct ggml_tensor * probs) {
     assert(probs->n_dims == 2);
     for (int i = 0; i < probs->ne[1]; ++i) {
         for (int k = 0; k < probs->ne[0]; ++k) {
@@ -531,7 +531,7 @@ struct llama_file {
     }
 };
 
-bool is_ggml_file(const char *filename) {
+static bool is_ggml_file(const char * filename) {
     llama_file file(filename, "rb");
     if (file.size < 4) {
         return false;
@@ -540,7 +540,7 @@ bool is_ggml_file(const char *filename) {
     return magic == GGUF_MAGIC;
 }
 
-static std::string llama_escape_whitespaces(const std::string& text) {
+static std::string llama_escape_whitespaces(const std::string & text) {
     std::ostringstream out;
     for (char c : text) {
         if (c == ' ') out << "\xe2\x96\x81";
@@ -549,7 +549,7 @@ static std::string llama_escape_whitespaces(const std::string& text) {
     return out.str();
 }
 
-void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) {
+static void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) {
     if (is_ggml_file(filename)) {
         struct ggml_context * ctx_data = NULL;
 
@@ -637,7 +637,7 @@ void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab)
     }
 }
 
-void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) {
+static void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) {
     int ct;
     switch (gg_weights->n_dims){
         case 1:
@@ -673,7 +673,9 @@ void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * kar
     }
 }
 
-void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename) {
+static void save_as_llama_model(
+    struct llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename
+) {
     // convert AK weights into GG weights one by one.
     // w->token_embedding_table -> model->tok_embeddings
     // float* -> struct ggml_tensor
@@ -785,7 +787,7 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
     gguf_free(ctx);
 }
 
-struct train_params get_default_train_params() {
+static struct train_params get_default_train_params() {
     struct train_params params;
     params.fn_vocab_model = "models/7B/ggml-model-f16.gguf";
     params.fn_llama2c_output_model = "ak_llama_model.bin";
@@ -835,7 +837,7 @@ struct train_params get_default_train_params() {
     return params;
 }
 
-void print_usage(int /*argc*/, char ** argv, const struct train_params * params) {
+static void print_usage(int /*argc*/, char ** argv, const struct train_params * params) {
     fprintf(stderr, "usage: %s [options]\n", argv[0]);
     fprintf(stderr, "\n");
    fprintf(stderr, "options:\n");
@@ -846,7 +848,7 @@ void print_usage(int /*argc*/, char ** argv, const struct train_params * params)
     fprintf(stderr, "\n");
 }
 
-bool params_parse(int argc, char ** argv, struct train_params * params) {
+static bool params_parse(int argc, char ** argv, struct train_params * params) {
     bool invalid_param = false;
     bool reqd_param_found = false;
     std::string arg;
@@ -901,7 +903,7 @@ bool params_parse(int argc, char ** argv, struct train_params * params) {
     return true;
 }
 
-std::string basename(const std::string &path) {
+static std::string basename(const std::string &path) {
     size_t pos = path.find_last_of("/\\");
     if (pos == std::string::npos) {
         return path;
diff --git a/examples/gguf/gguf.cpp b/examples/gguf/gguf.cpp
index a34010f1..9ab63a29 100644
--- a/examples/gguf/gguf.cpp
+++ b/examples/gguf/gguf.cpp
@@ -13,14 +13,14 @@
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 
-template<typename T>
+template <typename T>
 static std::string to_string(const T & val) {
     std::stringstream ss;
     ss << val;
     return ss.str();
 }
 
-bool gguf_ex_write(const std::string & fname) {
+static bool gguf_ex_write(const std::string & fname) {
     struct gguf_context * ctx = gguf_init_empty();
 
     gguf_set_val_u8 (ctx, "some.parameter.uint8", 0x12);
@@ -85,7 +85,7 @@ bool gguf_ex_write(const std::string & fname) {
 }
 
 // just read tensor info
-bool gguf_ex_read_0(const std::string & fname) {
+static bool gguf_ex_read_0(const std::string & fname) {
     struct gguf_init_params params = {
         /*.no_alloc = */ false,
         /*.ctx      = */ NULL,
@@ -143,7 +143,7 @@ bool gguf_ex_read_0(const std::string & fname) {
 }
 
 // read and create ggml_context containing the tensors and their data
-bool gguf_ex_read_1(const std::string & fname) {
+static bool gguf_ex_read_1(const std::string & fname) {
     struct ggml_context * ctx_data = NULL;
 
     struct gguf_init_params params = {
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index a8179f1b..e3cc3d39 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -41,7 +41,8 @@ static std::ostringstream * g_output_ss;
 static std::vector<llama_token> * g_output_tokens;
 static bool is_interacting = false;
 
-void write_logfile(
+
+static void write_logfile(
     const llama_context * ctx, const gpt_params & params, const llama_model * model,
     const std::vector<llama_token> & input_tokens, const std::string & output,
     const std::vector<llama_token> & output_tokens
@@ -86,7 +87,7 @@ void write_logfile(
 }
 
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
-void sigint_handler(int signo) {
+static void sigint_handler(int signo) {
     if (signo == SIGINT) {
         if (!is_interacting) {
             is_interacting = true;
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index 3a1c8c28..4620c43a 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -28,9 +28,10 @@ struct results_log_softmax {
     float prob;
 };
 
-void write_logfile(const llama_context * ctx, const gpt_params & params,
-                   const llama_model * model, const struct results_perplexity & results) {
-
+static void write_logfile(
+    const llama_context * ctx, const gpt_params & params, const llama_model * model,
+    const struct results_perplexity & results
+) {
     if (params.logdir.empty()) {
         return;
     }
@@ -76,7 +77,7 @@ void write_logfile(const llama_context * ctx, const gpt_params & params,
     fclose(logfile);
 }
 
-std::vector<float> softmax(const std::vector<float>& logits) {
+static std::vector<float> softmax(const std::vector<float>& logits) {
     std::vector<float> probs(logits.size());
     float max_logit = logits[0];
     for (float v : logits) max_logit = std::max(max_logit, v);
@@ -92,7 +93,7 @@ std::vector<float> softmax(const std::vector<float>& logits) {
     return probs;
 }
 
-results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) {
+static results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) {
     float max_logit = logits[0];
     for (int i = 1; i < n_vocab; ++i) max_logit = std::max(max_logit, logits[i]);
     double sum_exp = 0.0;
@@ -100,9 +101,10 @@ results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) {
     return {logits[tok] - max_logit - log(sum_exp), logits[tok], expf(logits[tok] - max_logit) / (float) sum_exp};
 }
 
-void process_logits(int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers,
-                    double & nll, double & nll2, float * logit_history, float * prob_history) {
-
+static void process_logits(
+    int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers,
+    double & nll, double & nll2, float * logit_history, float * prob_history
+) {
     std::mutex mutex;
     int counter = 0;
     auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () {
@@ -130,7 +132,7 @@ void process_logits(int n_vocab, const float * logits, const int * tokens, int n
 
 }
 
-results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) {
+static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) {
     // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
     // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
     // Output: `perplexity: 13.5106 [114/114]`
@@ -260,8 +262,7 @@ results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params)
     return {tokens, std::exp(nll / count), logit_history, prob_history};
 }
 
-results_perplexity perplexity(llama_context * ctx, const gpt_params & params) {
-
+static results_perplexity perplexity(llama_context * ctx, const gpt_params & params) {
     if (params.ppl_stride > 0) {
         return perplexity_v2(ctx, params);
     }
@@ -400,8 +401,9 @@ results_perplexity perplexity(llama_context * ctx, const gpt_params & params) {
     return {tokens, ppl, logit_history, prob_history};
 }
 
-std::vector<float> hellaswag_evaluate_tokens(llama_context * ctx, const std::vector<int>& tokens, int n_past, int n_batch,
-                                             int n_vocab, int n_thread) {
+static std::vector<float> hellaswag_evaluate_tokens(
+    llama_context * ctx, const std::vector<int>& tokens, int n_past, int n_batch, int n_vocab, int n_thread
+) {
     std::vector<float> result;
     result.reserve(tokens.size() * n_vocab);
     size_t n_chunk = (tokens.size() + n_batch - 1)/n_batch;
@@ -421,7 +423,7 @@ std::vector<float> hellaswag_evaluate_tokens(llama_context * ctx, const std::vec
     return result;
 }
 
-void hellaswag_score(llama_context * ctx, const gpt_params & params) {
+static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
     // Calculates hellaswag score (acc_norm) from prompt
     //
     // Data extracted from the HellaSwag validation dataset (MIT license) https://github.com/rowanz/hellaswag/blob/master/data/hellaswag_val.jsonl
diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp
index 6ce03ba7..bfe70889 100644
--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@@ -34,8 +34,8 @@ struct quantize_stats_params {
     std::vector<enum ggml_type> include_types;
 };
 
-const size_t HISTOGRAM_BUCKETS = 150;
-const double HISTOGRAM_RANGE = 0.03;
+constexpr size_t HISTOGRAM_BUCKETS = 150;
+constexpr double HISTOGRAM_RANGE = 0.03;
 
 struct error_stats {
     size_t num_samples;
@@ -44,8 +44,7 @@ struct error_stats {
     uint64_t error_histogram[HISTOGRAM_BUCKETS];
 };
 
-
-void quantize_stats_print_usage(int /*argc*/, char ** argv) {
+static void quantize_stats_print_usage(int /*argc*/, char ** argv) {
     quantize_stats_params params;
     fprintf(stderr, "usage: %s [options]\n", argv[0]);
     fprintf(stderr, "\n");
@@ -71,7 +70,7 @@ void quantize_stats_print_usage(int /*argc*/, char ** argv) {
 }
 
 // Check if a layer is included/excluded by command line
-bool layer_included(const quantize_stats_params & params, const std::string & layer) {
+static bool layer_included(const quantize_stats_params & params, const std::string & layer) {
     for (const auto& excluded : params.exclude_layers) {
         if (std::regex_search(layer, std::regex(excluded))) {
             return false;
@@ -86,7 +85,7 @@ bool layer_included(const quantize_stats_params & params, const std::string & la
 }
 
 // Update error statistics given vectors with the before/after result of quantization
-void update_error_stats(int64_t nelements, const float * input, const float * output, error_stats & stats) {
+static void update_error_stats(int64_t nelements, const float * input, const float * output, error_stats & stats) {
     for (int64_t i = 0; i < nelements; i++) {
         double diff = input[i] - output[i];
         stats.total_error += diff * diff;
@@ -96,14 +95,14 @@ void update_error_stats(int64_t nelements, const float * input, const float * ou
     stats.num_samples += nelements;
 }
 
-void combine_error_stats(error_stats & into, const error_stats & from) {
+static void combine_error_stats(error_stats & into, const error_stats & from) {
     into.num_samples += from.num_samples;
     into.total_error += from.total_error;
     if (from.max_error > into.max_error) into.max_error = from.max_error;
     for (size_t i=0; i<HISTOGRAM_BUCKETS; ++i) into.error_histogram[i] += from.error_histogram[i];
 }
 
-double find_quantile(const error_stats & stats, double quantile) {
+static double find_quantile(const error_stats & stats, double quantile) {
     double sum = std::accumulate(std::begin(stats.error_histogram), std::end(stats.error_histogram), 0.0);
 
     double accum = 0;
@@ -116,7 +115,7 @@ double find_quantile(const error_stats & stats, double quantile) {
     return INFINITY;
 }
 
-void print_error_stats(const std::string & name, const error_stats & stats, bool print_histogram) {
+static void print_error_stats(const std::string & name, const error_stats & stats, bool print_histogram) {
     double rmse = sqrt(stats.total_error / (double) stats.num_samples);
     double median = find_quantile(stats, .5);
     double pct95 = find_quantile(stats, .95);
@@ -143,17 +142,10 @@ static bool tensor_is_contiguous(const struct ggml_tensor * tensor) {
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
 
-void test_roundtrip_on_chunk(
-    const ggml_tensor * layer,
-    int64_t offset,
-    int64_t chunk_size,
-    const ggml_type_traits_t & qfns,
-    bool use_reference,
-    float * input_scratch,
-    char * quantized_scratch,
-    float * output_scratch,
-    error_stats & stats) {
-
+static void test_roundtrip_on_chunk(
+    const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits_t & qfns, bool use_reference,
+    float * input_scratch, char * quantized_scratch, float * output_scratch, error_stats & stats
+) {
     if (layer->type == GGML_TYPE_F16) {
         for (int i = 0; i < chunk_size; i++) {
             input_scratch[i] = ggml_get_f32_1d(layer, i + offset);
@@ -174,18 +166,11 @@ void test_roundtrip_on_chunk(
 
 
 // Run quantization function for a single layer and update error stats
-void test_roundtrip_on_layer(
-    std::string & name,
-    bool print_layer_stats,
-    const ggml_type_traits_t & qfns,
-    bool use_reference,
-    const ggml_tensor * layer,
-    std::vector<float> & input_scratch,
-    std::vector<char> & quantized_scratch,
-    std::vector<float> & output_scratch,
-    error_stats & total_error,
-    int max_thread = 0) {
-
+static void test_roundtrip_on_layer(
+    std::string & name, bool print_layer_stats, const ggml_type_traits_t & qfns, bool use_reference,
+    const ggml_tensor * layer, std::vector<float> & input_scratch, std::vector<char> & quantized_scratch,
+    std::vector<float> & output_scratch, error_stats & total_error, int max_thread = 0
+) {
     assert(tensor_is_contiguous(layer));
     error_stats layer_error {};
     uint64_t nelements = ggml_nelements(layer);
diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index 1bf18248..300788c9 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -40,7 +40,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
 };
 
 
-bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) {
+static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) {
     std::string ftype_str;
 
     for (auto ch : ftype_str_in) {
@@ -72,7 +72,7 @@ bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std:
 // usage:
 //  ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
 //
-void usage(const char * executable) {
+static void usage(const char * executable) {
     printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
     printf("  --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
     printf("  --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 3f3c6465..1bb8e92c 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1083,8 +1083,9 @@ static json format_final_response(llama_server_context &llama, const std::string
     return res;
 }
 
-static json format_partial_response(llama_server_context &llama, const std::string &content, const std::vector<completion_token_output> &probs)
-{
+static json format_partial_response(
+    llama_server_context &llama, const std::string &content, const std::vector<completion_token_output> &probs
+) {
     json res = json{
         {"content", content},
         {"stop", false},
@@ -1215,7 +1216,7 @@ static void log_server_request(const Request &req, const Response &res)
     });
 }
 
-bool is_at_eob(llama_server_context & server_context, const llama_token * tokens, const size_t n_tokens) {
+static bool is_at_eob(llama_server_context &server_context, const llama_token *tokens, const size_t n_tokens) {
     return n_tokens && tokens[n_tokens-1] == llama_token_eos(server_context.ctx);
 }
 
@@ -1225,7 +1226,7 @@ bool is_at_eob(llama_server_context & server_context, const llama_token * tokens
 // * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
 //   This is also called when the stop condition is met.
 //   Collect tokens into std::vector<llama_token> response which is pointed to by callback_data.
-void beam_search_callback(void * callback_data, llama_beams_state beams_state) {
+static void beam_search_callback(void *callback_data, llama_beams_state beams_state) {
     auto & llama = *static_cast<llama_server_context*>(callback_data);
     // Mark beams as EOS as needed.
     for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
@@ -1258,7 +1259,8 @@ struct token_translator {
     std::string operator()(const completion_token_output & cto) const { return (*this)(cto.tok); }
 };
 
-void append_to_generated_text_from_generated_token_probs(llama_server_context & llama) {
+static void append_to_generated_text_from_generated_token_probs(llama_server_context &llama)
+{
     auto & gtps = llama.generated_token_probs;
     auto translator = token_translator{llama.ctx};
     auto add_strlen = [=](size_t sum, const completion_token_output & cto) { return sum + translator(cto).size(); };
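Note that an unnamed namespace is the other idiomatic C++ spelling of internal linkage and would silence the warning equally well; this commit uses `static` throughout, which keeps each function's diff to a one-word change. A sketch of the alternative (the helper below is hypothetical, not part of this diff):

```cpp
#include <cstdio>

namespace { // unnamed namespace: everything inside gets internal linkage,
            // with the same effect as marking each function 'static'

void print_token_count(int n_tokens) {
    std::printf("tokens: %d\n", n_tokens);
}

} // namespace

int main() {
    print_token_count(42);
    return 0;
}
```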