Diffstat (limited to 'examples')
-rw-r--r--  examples/baby-llama/baby-llama.cpp | 148
-rw-r--r--  examples/beam-search/beam-search.cpp | 7
-rw-r--r--  examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp | 38
-rw-r--r--  examples/gguf/gguf.cpp | 8
-rw-r--r--  examples/main/main.cpp | 5
-rw-r--r--  examples/perplexity/perplexity.cpp | 30
-rw-r--r--  examples/quantize-stats/quantize-stats.cpp | 49
-rw-r--r--  examples/quantize/quantize.cpp | 4
-rw-r--r--  examples/server/server.cpp | 12
9 files changed, 150 insertions, 151 deletions
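Note on the pattern applied throughout the hunks below: file-local helper functions in the examples gain internal linkage (`static`), file-scope constants become `constexpr`, and long parameter lists are rewrapped without behavioral change. A minimal before/after sketch of that idiom, reusing names that appear in the patch purely for illustration (assumes <cstdlib> for rand/RAND_MAX; not code taken verbatim from the diff):

    // before: external linkage -- the symbol is visible to every translation unit,
    // and builds with -Wmissing-declarations warn about the absent prototype
    float frand() {
        return (float)rand()/(float)RAND_MAX;
    }

    // after: internal linkage -- the helper stays private to this .cpp file
    static float frand() {
        return (float)rand()/(float)RAND_MAX;
    }

    // file-scope constants become compile-time constants
    // (constexpr variables have internal linkage by default in C++)
    constexpr float rms_norm_eps = 5e-6f;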
diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp
index a99ece9a..ed61125e 100644
--- a/examples/baby-llama/baby-llama.cpp
+++ b/examples/baby-llama/baby-llama.cpp
@@ -9,12 +9,12 @@
#endif
#ifdef LLAMA_DEFAULT_RMS_EPS
-static const float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
+constexpr float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
#else
-static const float rms_norm_eps = 5e-6f;
+constexpr float rms_norm_eps = 5e-6f;
#endif
-float frand() {
+static float frand() {
return (float)rand()/(float)RAND_MAX;
}
@@ -25,19 +25,21 @@ struct random_normal_distribution {
float max;
};
-void init_random_normal_distribution(struct random_normal_distribution * rnd, int seed, float mean, float std, float min, float max) {
+static void init_random_normal_distribution(
+ struct random_normal_distribution * rnd, int seed, float mean, float std, float min, float max
+) {
rnd->gen = std::mt19937(seed);
rnd->nd = std::normal_distribution<float>{mean, std};
rnd->min = min;
rnd->max = max;
}
-float frand_normal(struct random_normal_distribution * rnd) {
+static float frand_normal(struct random_normal_distribution * rnd) {
const float r = rnd->nd(rnd->gen);
return ((r < rnd->min) ? (rnd->min) : (r > rnd->max) ? (rnd->max) : r);
}
-void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
+static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
if (plan.work_size > 0) {
@@ -48,13 +50,9 @@ void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph,
ggml_graph_compute(graph, &plan);
}
-struct ggml_tensor * randomize_tensor(
- struct ggml_tensor * tensor,
- int ndims,
- const int64_t ne[],
- float fmin,
- float fmax) {
-
+static struct ggml_tensor * randomize_tensor(
+ struct ggml_tensor * tensor, int ndims, const int64_t ne[], float fmin, float fmax
+) {
switch (ndims) {
case 1:
for (int i0 = 0; i0 < ne[0]; i0++) {
@@ -95,11 +93,9 @@ struct ggml_tensor * randomize_tensor(
return tensor;
}
-struct ggml_tensor * randomize_tensor_normal(
- struct ggml_tensor * tensor,
- int ndims,
- const int64_t ne[],
- struct random_normal_distribution * rnd) {
+static struct ggml_tensor * randomize_tensor_normal(
+ struct ggml_tensor * tensor, int ndims, const int64_t ne[], struct random_normal_distribution * rnd
+) {
float scale = 1.0; // xavier
switch (ndims) {
case 1:
@@ -159,7 +155,7 @@ struct llama_hparams {
}
};
-uint32_t get_n_ff(const struct llama_hparams* hparams) {
+static uint32_t get_n_ff(const struct llama_hparams* hparams) {
const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult;
return n_ff;
}
@@ -260,7 +256,7 @@ struct llama_model_lora {
std::vector<llama_layer_lora> layers;
};
-void init_model(struct llama_model * model) {
+static void init_model(struct llama_model * model) {
const auto & hparams = model->hparams;
const uint32_t n_embd = hparams.n_embd;
@@ -297,7 +293,7 @@ void init_model(struct llama_model * model) {
}
-void init_model_lora(struct llama_model_lora * model) {
+static void init_model_lora(struct llama_model_lora * model) {
const auto & hparams = model->hparams;
const uint32_t n_embd = hparams.n_embd;
@@ -340,7 +336,7 @@ void init_model_lora(struct llama_model_lora * model) {
}
}
-void set_param_model(struct llama_model * model) {
+static void set_param_model(struct llama_model * model) {
const auto& hparams = model->hparams;
const uint32_t n_layer = hparams.n_layer;
@@ -366,7 +362,7 @@ void set_param_model(struct llama_model * model) {
}
}
-void set_param_model_lora(struct llama_model_lora * model) {
+static void set_param_model_lora(struct llama_model_lora * model) {
const auto& hparams = model->hparams;
const uint32_t n_layer = hparams.n_layer;
@@ -397,7 +393,7 @@ void set_param_model_lora(struct llama_model_lora * model) {
}
}
-void randomize_model(struct llama_model * model, int seed, float mean, float std, float min, float max) {
+static void randomize_model(struct llama_model * model, int seed, float mean, float std, float min, float max) {
const auto & hparams = model->hparams;
const uint32_t n_layer = hparams.n_layer;
@@ -426,7 +422,9 @@ void randomize_model(struct llama_model * model, int seed, float mean, float std
}
-void randomize_model_lora(struct llama_model_lora * model, int seed, float mean, float std, float min, float max) {
+static void randomize_model_lora(
+ struct llama_model_lora * model, int seed, float mean, float std, float min, float max
+) {
const auto & hparams = model->hparams;
const uint32_t n_layer = hparams.n_layer;
@@ -459,7 +457,7 @@ void randomize_model_lora(struct llama_model_lora * model, int seed, float mean,
}
}
-bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int n_batch) {
+static bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int n_batch) {
const auto & hparams = model->hparams;
const uint32_t n_ctx = hparams.n_ctx;
@@ -495,7 +493,7 @@ bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int
return true;
}
-bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora * model, int n_batch) {
+static bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora * model, int n_batch) {
const auto & hparams = model->hparams;
const uint32_t n_ctx = hparams.n_ctx;
@@ -531,15 +529,15 @@ bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora *
return true;
}
-struct ggml_tensor * forward(
- struct llama_model * model,
- struct llama_kv_cache * cache,
- struct ggml_context * ctx0,
- struct ggml_cgraph * gf,
- struct ggml_tensor * tokens_input,
- const int n_tokens,
- const int n_past) {
-
+static struct ggml_tensor * forward(
+ struct llama_model * model,
+ struct llama_kv_cache * cache,
+ struct ggml_context * ctx0,
+ struct ggml_cgraph * gf,
+ struct ggml_tensor * tokens_input,
+ const int n_tokens,
+ const int n_past
+) {
const int N = n_tokens;
struct llama_kv_cache& kv_self = *cache;
@@ -756,25 +754,25 @@ struct ggml_tensor * forward(
return inpL;
}
-void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) {
+static void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) {
GGML_ASSERT(tensor->n_dims == 1);
GGML_ASSERT(tensor->ne[0] == ne0);
}
-void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) {
+static void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) {
GGML_ASSERT(tensor->n_dims == 2);
GGML_ASSERT(tensor->ne[0] == ne0);
GGML_ASSERT(tensor->ne[1] == ne1);
}
-void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) {
+static void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) {
GGML_ASSERT(tensor->n_dims == 3);
GGML_ASSERT(tensor->ne[0] == ne0);
GGML_ASSERT(tensor->ne[1] == ne1);
GGML_ASSERT(tensor->ne[2] == ne2);
}
-void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
+static void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
GGML_ASSERT(tensor->n_dims == 4);
GGML_ASSERT(tensor->ne[0] == ne0);
GGML_ASSERT(tensor->ne[1] == ne1);
@@ -782,16 +780,16 @@ void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int6
GGML_ASSERT(tensor->ne[3] == ne3);
}
-struct ggml_tensor * forward_batch(
- struct llama_model * model,
- struct llama_kv_cache * cache,
- struct ggml_context * ctx0,
- struct ggml_cgraph * gf,
- struct ggml_tensor * tokens_input,
- const int n_tokens,
- const int n_past,
- const int n_batch) {
-
+static struct ggml_tensor * forward_batch(
+ struct llama_model * model,
+ struct llama_kv_cache * cache,
+ struct ggml_context * ctx0,
+ struct ggml_cgraph * gf,
+ struct ggml_tensor * tokens_input,
+ const int n_tokens,
+ const int n_past,
+ const int n_batch
+) {
const int N = n_tokens;
struct llama_kv_cache& kv_self = *cache;
@@ -1073,16 +1071,15 @@ struct ggml_tensor * forward_batch(
return inpL;
}
-
-struct ggml_tensor * forward_lora(
- struct llama_model_lora * model,
- struct llama_kv_cache * cache,
- struct ggml_context * ctx0,
- struct ggml_cgraph * gf,
- struct ggml_tensor * tokens_input,
- const int n_tokens,
- const int n_past) {
-
+static struct ggml_tensor * forward_lora(
+ struct llama_model_lora * model,
+ struct llama_kv_cache * cache,
+ struct ggml_context * ctx0,
+ struct ggml_cgraph * gf,
+ struct ggml_tensor * tokens_input,
+ const int n_tokens,
+ const int n_past
+) {
const int N = n_tokens;
struct llama_kv_cache& kv_self = *cache;
@@ -1328,7 +1325,7 @@ struct ggml_tensor * forward_lora(
return inpL;
}
-void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) {
+static void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) {
assert(logits->n_dims == 2);
assert(probs->n_dims == 2);
assert(best_samples->n_dims == 1);
@@ -1359,7 +1356,10 @@ void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, str
}
}
-void sample_softmax_batch(struct ggml_context * ctx, struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) {
+static void sample_softmax_batch(
+ struct ggml_context * ctx, struct ggml_tensor * logits, struct ggml_tensor * probs,
+ struct ggml_tensor * best_samples
+) {
GGML_ASSERT(best_samples->n_dims == 2);
GGML_ASSERT(logits->n_dims == 3);
GGML_ASSERT(probs->n_dims == 3);
@@ -1393,7 +1393,7 @@ void sample_softmax_batch(struct ggml_context * ctx, struct ggml_tensor * logits
}
}
-void print_row(struct ggml_tensor * probs, int i) {
+static void print_row(struct ggml_tensor * probs, int i) {
for (int k = 0; k < probs->ne[0]; ++k) {
float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k);
printf(" %.2f", p);
@@ -1401,7 +1401,7 @@ void print_row(struct ggml_tensor * probs, int i) {
printf("\n");
}
-void print_matrix(struct ggml_tensor * probs) {
+static void print_matrix(struct ggml_tensor * probs) {
assert(probs->n_dims == 2);
for (int i = 0; i < probs->ne[1]; ++i) {
for (int k = 0; k < probs->ne[0]; ++k) {
@@ -1412,7 +1412,7 @@ void print_matrix(struct ggml_tensor * probs) {
}
}
-void print_token(int token, int n_vocab) {
+static void print_token(int token, int n_vocab) {
for (int k = 0; k < token; ++k) {
printf(" ");
}
@@ -1423,14 +1423,14 @@ void print_token(int token, int n_vocab) {
printf("\n");
}
-void print_tokens(struct ggml_tensor * tokens, int n_vocab) {
+static void print_tokens(struct ggml_tensor * tokens, int n_vocab) {
for (int i=0; i<tokens->ne[0]; ++i) {
int token = ggml_get_i32_1d(tokens, i);
print_token(token, n_vocab);
}
}
-void get_example_targets(int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) {
+static void get_example_targets(int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) {
int n_tokens = tokens_input->ne[0];
int n_vocab = targets->ne[0];
float randomness = 0.0f;
@@ -1451,7 +1451,9 @@ void get_example_targets(int example_id, struct ggml_tensor * tokens_input, stru
}
}
-void get_example_targets_batch(struct ggml_context * ctx, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) {
+static void get_example_targets_batch(
+ struct ggml_context * ctx, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets
+) {
GGML_ASSERT(tokens_input->n_dims == 2);
GGML_ASSERT( targets->n_dims == 3);
int n_tokens = tokens_input->ne[0];
@@ -1474,7 +1476,7 @@ void get_example_targets_batch(struct ggml_context * ctx, int example_id, struct
}
}
-void lshift_examples(struct ggml_tensor * tokens_input, struct ggml_tensor * targets, int n_shift) {
+static void lshift_examples(struct ggml_tensor * tokens_input, struct ggml_tensor * targets, int n_shift) {
int n_tokens = tokens_input->ne[0];
int n_vocab = targets->ne[0];
for (int i=0; i<n_tokens-n_shift; ++i) {
@@ -1485,12 +1487,16 @@ void lshift_examples(struct ggml_tensor * tokens_input, struct ggml_tensor * tar
}
}
-struct ggml_tensor * square_error_loss(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
+static struct ggml_tensor * square_error_loss(
+ struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b
+) {
// todo: instead of a-b: a[1:]-b[:-1]
return ggml_sum(ctx, ggml_sqr(ctx, ggml_sub(ctx, a, b)));
}
-struct ggml_tensor * cross_entropy_loss(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
+static struct ggml_tensor * cross_entropy_loss(
+ struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b
+) {
const float eps = 1e-3f;
return
ggml_sum(ctx,
diff --git a/examples/beam-search/beam-search.cpp b/examples/beam-search/beam-search.cpp
index 6b31aea7..805170c9 100644
--- a/examples/beam-search/beam-search.cpp
+++ b/examples/beam-search/beam-search.cpp
@@ -30,7 +30,8 @@ struct ostream_beam_view {
llama_context * ctx;
llama_beam_view beam_view;
};
-std::ostream& operator<<(std::ostream& os, const ostream_beam_view & obv) {
+
+static std::ostream & operator<<(std::ostream & os, const ostream_beam_view & obv) {
os << "p(" << obv.beam_view.p << ") eob(" << std::boolalpha << obv.beam_view.eob << ") tokens(";
for (size_t i = 0 ; i < obv.beam_view.n_tokens ; ++i) {
os << llama_token_to_piece(obv.ctx, obv.beam_view.tokens[i]);
@@ -46,7 +47,7 @@ struct beam_search_callback_data {
// In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same.
// For example, eob can be flagged due to maximum token length, stop words, etc.
-bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, const size_t n_tokens) {
+static bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, size_t n_tokens) {
return n_tokens && tokens[n_tokens-1] == llama_token_eos(callback_data.ctx);
}
@@ -56,7 +57,7 @@ bool is_at_eob(const beam_search_callback_data & callback_data, const llama_toke
// * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
// This is also called when the stop condition is met.
// Collect tokens into std::vector<llama_token> response which is pointed to by callback_data.
-void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_state) {
+static void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_state) {
auto& callback_data = *static_cast<beam_search_callback_data*>(callback_data_ptr);
// Mark beams as EOS as needed.
for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
index 293b455d..c291f0ad 100644
--- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@@ -115,7 +115,7 @@ struct TransformerWeights {
}
};
-void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) {
+static void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) {
// we calloc instead of malloc to keep valgrind happy
w->token_embedding_table = new float[p->vocab_size * p->dim]();
printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
@@ -158,7 +158,7 @@ void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) {
}
}
-int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shared_weights) {
+static int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shared_weights) {
if (fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
if (fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
if (fread(w->wq, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
@@ -189,7 +189,7 @@ int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shar
return 0;
}
-void print_sample_weights(TransformerWeights *w){
+static void print_sample_weights(TransformerWeights *w){
printf("----- Quick print of first of the weight vales of all the variables\n");
printf("%f\n", w->token_embedding_table[0]);
printf("%f\n", w->rms_att_weight[0]);
@@ -324,7 +324,7 @@ struct train_params {
int mem_compute1_gb;
};
-void print_params(struct my_llama_hparams * params) {
+static void print_params(struct my_llama_hparams * params) {
printf("%s: n_vocab: %d\n", __func__, params->n_vocab);
printf("%s: n_ctx: %d\n", __func__, params->n_ctx);
printf("%s: n_embd: %d\n", __func__, params->n_embd);
@@ -335,7 +335,7 @@ void print_params(struct my_llama_hparams * params) {
printf("%s: n_rot: %d\n", __func__, params->n_rot);
}
-void init_model(struct my_llama_model * model) {
+static void init_model(struct my_llama_model * model) {
const auto & hparams = model->hparams;
const uint32_t n_embd = hparams.n_embd;
@@ -408,17 +408,17 @@ void init_model(struct my_llama_model * model) {
}
}
-float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
+static float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
return *ptr;
}
-int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
+static int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
return *ptr;
}
-void print_row(struct ggml_tensor * probs, int i) {
+static void print_row(struct ggml_tensor * probs, int i) {
for (int k = 0; k < probs->ne[0]; ++k) {
float p = get_f32_2d(probs, k, i);
printf(" %f", p);
@@ -426,7 +426,7 @@ void print_row(struct ggml_tensor * probs, int i) {
printf("\n");
}
-void print_matrix(struct ggml_tensor * probs) {
+static void print_matrix(struct ggml_tensor * probs) {
assert(probs->n_dims == 2);
for (int i = 0; i < probs->ne[1]; ++i) {
for (int k = 0; k < probs->ne[0]; ++k) {
@@ -531,7 +531,7 @@ struct llama_file {
}
};
-bool is_ggml_file(const char *filename) {
+static bool is_ggml_file(const char * filename) {
llama_file file(filename, "rb");
if (file.size < 4) {
return false;
@@ -540,7 +540,7 @@ bool is_ggml_file(const char *filename) {
return magic == GGUF_MAGIC;
}
-static std::string llama_escape_whitespaces(const std::string& text) {
+static std::string llama_escape_whitespaces(const std::string & text) {
std::ostringstream out;
for (char c : text) {
if (c == ' ') out << "\xe2\x96\x81";
@@ -549,7 +549,7 @@ static std::string llama_escape_whitespaces(const std::string& text) {
return out.str();
}
-void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) {
+static void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) {
if (is_ggml_file(filename)) {
struct ggml_context * ctx_data = NULL;
@@ -637,7 +637,7 @@ void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab)
}
}
-void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) {
+static void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) {
int ct;
switch (gg_weights->n_dims){
case 1:
@@ -673,7 +673,9 @@ void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * kar
}
}
-void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename) {
+static void save_as_llama_model(
+ struct llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename
+) {
// convert AK weights into GG weights one by one.
// w->token_embedding_table -> model->tok_embeddings
// float* -> struct ggml_tensor
@@ -785,7 +787,7 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
gguf_free(ctx);
}
-struct train_params get_default_train_params() {
+static struct train_params get_default_train_params() {
struct train_params params;
params.fn_vocab_model = "models/7B/ggml-model-f16.gguf";
params.fn_llama2c_output_model = "ak_llama_model.bin";
@@ -835,7 +837,7 @@ struct train_params get_default_train_params() {
return params;
}
-void print_usage(int /*argc*/, char ** argv, const struct train_params * params) {
+static void print_usage(int /*argc*/, char ** argv, const struct train_params * params) {
fprintf(stderr, "usage: %s [options]\n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "options:\n");
@@ -846,7 +848,7 @@ void print_usage(int /*argc*/, char ** argv, const struct train_params * params)
fprintf(stderr, "\n");
}
-bool params_parse(int argc, char ** argv, struct train_params * params) {
+static bool params_parse(int argc, char ** argv, struct train_params * params) {
bool invalid_param = false;
bool reqd_param_found = false;
std::string arg;
@@ -901,7 +903,7 @@ bool params_parse(int argc, char ** argv, struct train_params * params) {
return true;
}
-std::string basename(const std::string &path) {
+static std::string basename(const std::string &path) {
size_t pos = path.find_last_of("/\\");
if (pos == std::string::npos) {
return path;
diff --git a/examples/gguf/gguf.cpp b/examples/gguf/gguf.cpp
index a34010f1..9ab63a29 100644
--- a/examples/gguf/gguf.cpp
+++ b/examples/gguf/gguf.cpp
@@ -13,14 +13,14 @@
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))
-template<typename T>
+template <typename T>
static std::string to_string(const T & val) {
std::stringstream ss;
ss << val;
return ss.str();
}
-bool gguf_ex_write(const std::string & fname) {
+static bool gguf_ex_write(const std::string & fname) {
struct gguf_context * ctx = gguf_init_empty();
gguf_set_val_u8 (ctx, "some.parameter.uint8", 0x12);
@@ -85,7 +85,7 @@ bool gguf_ex_write(const std::string & fname) {
}
// just read tensor info
-bool gguf_ex_read_0(const std::string & fname) {
+static bool gguf_ex_read_0(const std::string & fname) {
struct gguf_init_params params = {
/*.no_alloc = */ false,
/*.ctx = */ NULL,
@@ -143,7 +143,7 @@ bool gguf_ex_read_0(const std::string & fname) {
}
// read and create ggml_context containing the tensors and their data
-bool gguf_ex_read_1(const std::string & fname) {
+static bool gguf_ex_read_1(const std::string & fname) {
struct ggml_context * ctx_data = NULL;
struct gguf_init_params params = {
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index a8179f1b..e3cc3d39 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -41,7 +41,8 @@ static std::ostringstream * g_output_ss;
static std::vector<llama_token> * g_output_tokens;
static bool is_interacting = false;
-void write_logfile(
+
+static void write_logfile(
const llama_context * ctx, const gpt_params & params, const llama_model * model,
const std::vector<llama_token> & input_tokens, const std::string & output,
const std::vector<llama_token> & output_tokens
@@ -86,7 +87,7 @@ void write_logfile(
}
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
-void sigint_handler(int signo) {
+static void sigint_handler(int signo) {
if (signo == SIGINT) {
if (!is_interacting) {
is_interacting = true;
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index 3a1c8c28..4620c43a 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -28,9 +28,10 @@ struct results_log_softmax {
float prob;
};
-void write_logfile(const llama_context * ctx, const gpt_params & params,
- const llama_model * model, const struct results_perplexity & results) {
-
+static void write_logfile(
+ const llama_context * ctx, const gpt_params & params, const llama_model * model,
+ const struct results_perplexity & results
+) {
if (params.logdir.empty()) {
return;
}
@@ -76,7 +77,7 @@ void write_logfile(const llama_context * ctx, const gpt_params & params,
fclose(logfile);
}
-std::vector<float> softmax(const std::vector<float>& logits) {
+static std::vector<float> softmax(const std::vector<float>& logits) {
std::vector<float> probs(logits.size());
float max_logit = logits[0];
for (float v : logits) max_logit = std::max(max_logit, v);
@@ -92,7 +93,7 @@ std::vector<float> softmax(const std::vector<float>& logits) {
return probs;
}
-results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) {
+static results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) {
float max_logit = logits[0];
for (int i = 1; i < n_vocab; ++i) max_logit = std::max(max_logit, logits[i]);
double sum_exp = 0.0;
@@ -100,9 +101,10 @@ results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) {
return {logits[tok] - max_logit - log(sum_exp), logits[tok], expf(logits[tok] - max_logit) / (float) sum_exp};
}
-void process_logits(int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers,
- double & nll, double & nll2, float * logit_history, float * prob_history) {
-
+static void process_logits(
+ int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers,
+ double & nll, double & nll2, float * logit_history, float * prob_history
+) {
std::mutex mutex;
int counter = 0;
auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () {
@@ -130,7 +132,7 @@ void process_logits(int n_vocab, const float * logits, const int * tokens, int n
}
-results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) {
+static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) {
// Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
// Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
// Output: `perplexity: 13.5106 [114/114]`
@@ -260,8 +262,7 @@ results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params)
return {tokens, std::exp(nll / count), logit_history, prob_history};
}
-results_perplexity perplexity(llama_context * ctx, const gpt_params & params) {
-
+static results_perplexity perplexity(llama_context * ctx, const gpt_params & params) {
if (params.ppl_stride > 0) {
return perplexity_v2(ctx, params);
}
@@ -400,8 +401,9 @@ results_perplexity perplexity(llama_context * ctx, const gpt_params & params) {
return {tokens, ppl, logit_history, prob_history};
}
-std::vector<float> hellaswag_evaluate_tokens(llama_context * ctx, const std::vector<int>& tokens, int n_past, int n_batch,
- int n_vocab, int n_thread) {
+static std::vector<float> hellaswag_evaluate_tokens(
+ llama_context * ctx, const std::vector<int>& tokens, int n_past, int n_batch, int n_vocab, int n_thread
+) {
std::vector<float> result;
result.reserve(tokens.size() * n_vocab);
size_t n_chunk = (tokens.size() + n_batch - 1)/n_batch;
@@ -421,7 +423,7 @@ std::vector<float> hellaswag_evaluate_tokens(llama_context * ctx, const std::vec
return result;
}
-void hellaswag_score(llama_context * ctx, const gpt_params & params) {
+static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
// Calculates hellaswag score (acc_norm) from prompt
//
// Data extracted from the HellaSwag validation dataset (MIT license) https://github.com/rowanz/hellaswag/blob/master/data/hellaswag_val.jsonl
diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp
index 6ce03ba7..bfe70889 100644
--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@@ -34,8 +34,8 @@ struct quantize_stats_params {
std::vector<enum ggml_type> include_types;
};
-const size_t HISTOGRAM_BUCKETS = 150;
-const double HISTOGRAM_RANGE = 0.03;
+constexpr size_t HISTOGRAM_BUCKETS = 150;
+constexpr double HISTOGRAM_RANGE = 0.03;
struct error_stats {
size_t num_samples;
@@ -44,8 +44,7 @@ struct error_stats {
uint64_t error_histogram[HISTOGRAM_BUCKETS];
};
-
-void quantize_stats_print_usage(int /*argc*/, char ** argv) {
+static void quantize_stats_print_usage(int /*argc*/, char ** argv) {
quantize_stats_params params;
fprintf(stderr, "usage: %s [options]\n", argv[0]);
fprintf(stderr, "\n");
@@ -71,7 +70,7 @@ void quantize_stats_print_usage(int /*argc*/, char ** argv) {
}
// Check if a layer is included/excluded by command line
-bool layer_included(const quantize_stats_params & params, const std::string & layer) {
+static bool layer_included(const quantize_stats_params & params, const std::string & layer) {
for (const auto& excluded : params.exclude_layers) {
if (std::regex_search(layer, std::regex(excluded))) {
return false;
@@ -86,7 +85,7 @@ bool layer_included(const quantize_stats_params & params, const std::string & la
}
// Update error statistics given vectors with the before/after result of quantization
-void update_error_stats(int64_t nelements, const float * input, const float * output, error_stats & stats) {
+static void update_error_stats(int64_t nelements, const float * input, const float * output, error_stats & stats) {
for (int64_t i = 0; i < nelements; i++) {
double diff = input[i] - output[i];
stats.total_error += diff * diff;
@@ -96,14 +95,14 @@ void update_error_stats(int64_t nelements, const float * input, const float * ou
stats.num_samples += nelements;
}
-void combine_error_stats(error_stats & into, const error_stats & from) {
+static void combine_error_stats(error_stats & into, const error_stats & from) {
into.num_samples += from.num_samples;
into.total_error += from.total_error;
if (from.max_error > into.max_error) into.max_error = from.max_error;
for (size_t i=0; i<HISTOGRAM_BUCKETS; ++i) into.error_histogram[i] += from.error_histogram[i];
}
-double find_quantile(const error_stats & stats, double quantile) {
+static double find_quantile(const error_stats & stats, double quantile) {
double sum = std::accumulate(std::begin(stats.error_histogram), std::end(stats.error_histogram), 0.0);
double accum = 0;
@@ -116,7 +115,7 @@ double find_quantile(const error_stats & stats, double quantile) {
return INFINITY;
}
-void print_error_stats(const std::string & name, const error_stats & stats, bool print_histogram) {
+static void print_error_stats(const std::string & name, const error_stats & stats, bool print_histogram) {
double rmse = sqrt(stats.total_error / (double) stats.num_samples);
double median = find_quantile(stats, .5);
double pct95 = find_quantile(stats, .95);
@@ -143,17 +142,10 @@ static bool tensor_is_contiguous(const struct ggml_tensor * tensor) {
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
}
-void test_roundtrip_on_chunk(
- const ggml_tensor * layer,
- int64_t offset,
- int64_t chunk_size,
- const ggml_type_traits_t & qfns,
- bool use_reference,
- float * input_scratch,
- char * quantized_scratch,
- float * output_scratch,
- error_stats & stats) {
-
+static void test_roundtrip_on_chunk(
+ const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits_t & qfns, bool use_reference,
+ float * input_scratch, char * quantized_scratch, float * output_scratch, error_stats & stats
+) {
if (layer->type == GGML_TYPE_F16) {
for (int i = 0; i < chunk_size; i++) {
input_scratch[i] = ggml_get_f32_1d(layer, i + offset);
@@ -174,18 +166,11 @@ void test_roundtrip_on_chunk(
// Run quantization function for a single layer and update error stats
-void test_roundtrip_on_layer(
- std::string & name,
- bool print_layer_stats,
- const ggml_type_traits_t & qfns,
- bool use_reference,
- const ggml_tensor * layer,
- std::vector<float> & input_scratch,
- std::vector<char> & quantized_scratch,
- std::vector<float> & output_scratch,
- error_stats & total_error,
- int max_thread = 0) {
-
+static void test_roundtrip_on_layer(
+ std::string & name, bool print_layer_stats, const ggml_type_traits_t & qfns, bool use_reference,
+ const ggml_tensor * layer, std::vector<float> & input_scratch, std::vector<char> & quantized_scratch,
+ std::vector<float> & output_scratch, error_stats & total_error, int max_thread = 0
+) {
assert(tensor_is_contiguous(layer));
error_stats layer_error {};
uint64_t nelements = ggml_nelements(layer);
diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index 1bf18248..300788c9 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -40,7 +40,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
};
-bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) {
+static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) {
std::string ftype_str;
for (auto ch : ftype_str_in) {
@@ -72,7 +72,7 @@ bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std:
// usage:
// ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
//
-void usage(const char * executable) {
+static void usage(const char * executable) {
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 3f3c6465..1bb8e92c 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1083,8 +1083,9 @@ static json format_final_response(llama_server_context &llama, const std::string
return res;
}
-static json format_partial_response(llama_server_context &llama, const std::string &content, const std::vector<completion_token_output> &probs)
-{
+static json format_partial_response(
+ llama_server_context &llama, const std::string &content, const std::vector<completion_token_output> &probs
+) {
json res = json{
{"content", content},
{"stop", false},
@@ -1215,7 +1216,7 @@ static void log_server_request(const Request &req, const Response &res)
});
}
-bool is_at_eob(llama_server_context & server_context, const llama_token * tokens, const size_t n_tokens) {
+static bool is_at_eob(llama_server_context &server_context, const llama_token *tokens, const size_t n_tokens) {
return n_tokens && tokens[n_tokens-1] == llama_token_eos(server_context.ctx);
}
@@ -1225,7 +1226,7 @@ bool is_at_eob(llama_server_context & server_context, const llama_token * tokens
// * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
// This is also called when the stop condition is met.
// Collect tokens into std::vector<llama_token> response which is pointed to by callback_data.
-void beam_search_callback(void * callback_data, llama_beams_state beams_state) {
+static void beam_search_callback(void *callback_data, llama_beams_state beams_state) {
auto & llama = *static_cast<llama_server_context*>(callback_data);
// Mark beams as EOS as needed.
for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
@@ -1258,7 +1259,8 @@ struct token_translator {
std::string operator()(const completion_token_output & cto) const { return (*this)(cto.tok); }
};
-void append_to_generated_text_from_generated_token_probs(llama_server_context & llama) {
+static void append_to_generated_text_from_generated_token_probs(llama_server_context &llama)
+{
auto & gtps = llama.generated_token_probs;
auto translator = token_translator{llama.ctx};
auto add_strlen = [=](size_t sum, const completion_token_output & cto) { return sum + translator(cto).size(); };