diff options
author | Georgi Gerganov <ggerganov@gmail.com> | 2024-03-11 17:49:47 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-03-11 17:49:47 +0200 |
commit | 05b06210c954491cf0f12034b0a62bd4d69ce78b (patch) | |
tree | 8cce0e429b8591dcbabd2d1e1ea6451a6a2f6894 /llama.cpp | |
parent | 83796e62bc9f6caae6228168e359890f51e60fee (diff) |
llama : more consistent names of count variables (#5994)
* llama : more consistent names of count variables
ggml-ci
* llama : n_parallel -> n_seq_max
* common : fix param name
* examples : fix param name
Diffstat (limited to 'llama.cpp')
-rw-r--r-- | llama.cpp | 24 |
1 files changed, 12 insertions, 12 deletions
@@ -12538,7 +12538,7 @@ struct llama_context_params llama_context_default_params() { /*.seed =*/ LLAMA_DEFAULT_SEED, /*.n_ctx =*/ 512, /*.n_batch =*/ 512, - /*.n_parallel =*/ 1, + /*.n_seq_max =*/ 1, /*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default /*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS, /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED, @@ -12700,7 +12700,7 @@ struct llama_context * llama_new_context_with_model( auto & cparams = ctx->cparams; cparams.n_batch = params.n_batch; - // TODO: maybe add n_parallel here too + // TODO: maybe add n_seq_max here too cparams.n_threads = params.n_threads; cparams.n_threads_batch = params.n_threads_batch; cparams.yarn_ext_factor = params.yarn_ext_factor; @@ -12767,7 +12767,7 @@ struct llama_context * llama_new_context_with_model( // Mamba only needs a constant number of KV cache cells per sequence if (model->arch == LLM_ARCH_MAMBA) { // Mamba needs at least as many KV cells as there are sequences kept at any time - kv_size = std::max((uint32_t) 1, params.n_parallel); + kv_size = std::max((uint32_t) 1, params.n_seq_max); // it's probably best to keep as much precision as possible for the states type_k = GGML_TYPE_F32; // required by ggml_ssm_conv for Mamba's conv_states type_v = GGML_TYPE_F32; // required by ggml_ssm_scan for Mamba's ssm_states @@ -13024,7 +13024,7 @@ uint32_t llama_n_batch(const struct llama_context * ctx) { return ctx->cparams.n_batch; } -uint32_t llama_n_max_seq(const struct llama_context * ctx) { +uint32_t llama_n_seq_max(const struct llama_context * ctx) { return ctx->kv_self.size; } @@ -13188,10 +13188,10 @@ int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const } } -struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq) { +struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max) { struct llama_kv_cache_view result = { /*.n_cells = */ 0, - /*.n_max_seq = */ n_max_seq, + /*.n_seq_max = */ n_seq_max, /*.token_count = */ 0, /*.used_cells = */ llama_get_kv_cache_used_cells(ctx), /*.max_contiguous = */ 0, @@ -13219,7 +13219,7 @@ void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_k void * p = realloc(view->cells, sizeof(struct llama_kv_cache_view_cell) * view->n_cells); GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells"); view->cells = (struct llama_kv_cache_view_cell *)p; - p = realloc(view->cells_sequences, sizeof(llama_seq_id) * view->n_max_seq * view->n_cells); + p = realloc(view->cells_sequences, sizeof(llama_seq_id) * view->n_seq_max * view->n_cells); GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells sequences"); view->cells_sequences = (llama_seq_id *)p; } @@ -13233,7 +13233,7 @@ void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_k uint32_t max_contig = 0; int32_t max_contig_idx = -1; - for (int32_t i = 0; i < int32_t(ctx->kv_self.size); i++, c_curr++, cs_curr += view->n_max_seq) { + for (int32_t i = 0; i < int32_t(ctx->kv_self.size); i++, c_curr++, cs_curr += view->n_seq_max) { const size_t curr_size = kv_cells[i].seq_id.size(); token_count += curr_size; c_curr->pos = kv_cells[i].pos + kv_cells[i].delta; @@ -13250,7 +13250,7 @@ void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_k int seq_idx = 0; for (const llama_seq_id it : kv_cells[i].seq_id) { - if (seq_idx >= view->n_max_seq) { + if (seq_idx >= view->n_seq_max) { break; } cs_curr[seq_idx] = it; @@ -13259,7 +13259,7 @@ void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_k if (seq_idx != 0) { used_cells++; } - for (; seq_idx < view->n_max_seq; seq_idx++) { + for (; seq_idx < view->n_seq_max; seq_idx++) { cs_curr[seq_idx] = -1; } } @@ -13921,12 +13921,12 @@ int32_t llama_tokenize( const char * text, int32_t text_len, llama_token * tokens, - int32_t n_max_tokens, + int32_t n_tokens_max, bool add_bos, bool special) { auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos, special); - if (n_max_tokens < (int) res.size()) { + if (n_tokens_max < (int) res.size()) { // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__); return -((int) res.size()); } |