| author | slaren <slarengh@gmail.com> | 2023-08-18 12:44:58 +0200 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2023-08-18 12:44:58 +0200 |
| commit | 097e121e2f17ed3541cf02c55ff7e9febc091b19 | |
| tree | f3bead40b2632be95479e3f9b31baffc6681f572 /llama.cpp | |
| parent | eaf98c2649d7da705de255712f0038ac7e47c610 | |
llama : add benchmark example (#2626)
* llama : add benchmark example
* add to examples CMakeLists.txt
* fix msvc build
* add missing include
* add Bessel's correction to stdev calculation (see the sketch after the commit message)
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
* improve markdown formatting
* add missing include
* print warning if NDEBUG is not defined
* remove n_prompt and n_gen from the matrix, use each value separately instead
* better checks for non-optimized builds
* llama.cpp : fix MEM_REQ_SCRATCH0 reusing the value of n_ctx of the first call
* fix json formatting
* add sql output
* add basic cpu and gpu info (linux/cuda only)
* markdown: also show values that differ from the default
* markdown: add build id
* cleanup
* improve formatting
* formatting
---------
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
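As noted in the Bessel's correction bullet, the benchmark reports the standard deviation of its timings using the sample form (dividing by n - 1) rather than the population form (dividing by n). The benchmark's own code is outside this page (the diff below is limited to llama.cpp); the sketch that follows only illustrates the formula and is not code from the commit.

```cpp
#include <cmath>
#include <vector>

// Illustrative only (not the commit's code): sample standard deviation with
// Bessel's correction, i.e. dividing the summed squared deviations by (n - 1)
// instead of n, which gives an unbiased variance estimate from a sample.
static double stdev_sample(const std::vector<double> & v) {
    if (v.size() < 2) {
        return 0.0;
    }
    double mean = 0.0;
    for (double x : v) {
        mean += x;
    }
    mean /= v.size();
    double sq_sum = 0.0;
    for (double x : v) {
        sq_sum += (x - mean) * (x - mean);
    }
    return std::sqrt(sq_sum / (v.size() - 1)); // Bessel's correction: n - 1
}
```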
Diffstat (limited to 'llama.cpp')
-rw-r--r-- | llama.cpp | 21
1 file changed, 16 insertions, 5 deletions
```diff
@@ -115,9 +115,9 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
 // memory sizes (calculated for n_batch == 512)
 //
 
-static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0(int n_ctx)
+static std::map<e_model, size_t> MEM_REQ_SCRATCH0(int n_ctx)
 {
-    static std::map<e_model, size_t> k_sizes = {
+    std::map<e_model, size_t> k_sizes = {
         { MODEL_3B,  ((size_t) n_ctx / 16ull +  92ull) * MB },
         { MODEL_7B,  ((size_t) n_ctx / 16ull + 100ull) * MB },
         { MODEL_13B, ((size_t) n_ctx / 12ull + 120ull) * MB },
@@ -984,7 +984,7 @@ int64_t llama_time_us() {
 // model loading
 //
 
-static const char *llama_file_version_name(llama_file_version version) {
+static const char * llama_file_version_name(llama_file_version version) {
     switch (version) {
         case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)";
         case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
@@ -996,7 +996,7 @@ static const char *llama_file_version_name(llama_file_version version) {
     return "unknown";
 }
 
-static const char *llama_ftype_name(enum llama_ftype ftype) {
+const char * llama_ftype_name(enum llama_ftype ftype) {
     switch (ftype) {
         case LLAMA_FTYPE_ALL_F32:    return "all F32";
         case LLAMA_FTYPE_MOSTLY_F16: return "mostly F16";
@@ -1021,7 +1021,7 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
     }
 }
 
-static const char *llama_model_type_name(e_model type) {
+static const char * llama_model_type_name(e_model type) {
     switch (type) {
         case MODEL_3B: return "3B";
         case MODEL_7B: return "7B";
@@ -1799,6 +1799,13 @@ static bool llama_eval_internal(
 
     LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
 
+    LLAMA_ASSERT(n_tokens > 0);
+    LLAMA_ASSERT(n_past >= 0);
+    LLAMA_ASSERT(n_threads > 0);
+    // TODO: keep the values of n_batch and n_ctx
+    // LLAMA_ASSERT(n_tokens <= n_batch);
+    // LLAMA_ASSERT(n_past + n_tokens <= n_ctx);
+
     const int64_t t_start_us = ggml_time_us();
 
 #ifdef GGML_USE_MPI
@@ -4274,6 +4281,10 @@ int llama_n_embd(const struct llama_context * ctx) {
     return ctx->model.hparams.n_embd;
 }
 
+int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size) {
+    return snprintf(buf, buf_size, "LLaMA %s %s", llama_model_type_name(model->type), llama_ftype_name(model->hparams.ftype));
+}
+
 int llama_get_vocab_from_model(
         const struct llama_model * model,
         const char * * strings,
```
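The first hunk is the MEM_REQ_SCRATCH0 fix called out in the commit message: a function-local `static` is initialized only once, so scratch sizes computed from the `n_ctx` of the first call were silently reused by every later call with a different `n_ctx`. The standalone sketch below (hypothetical names, not code from the repository) reproduces that pitfall in miniature:

```cpp
#include <cstddef>
#include <cstdio>

// Hypothetical reduction of the bug: the `static` local is initialized exactly
// once, so the size computed from the first call's n_ctx is reused afterwards.
static size_t scratch_size_buggy(int n_ctx) {
    static size_t sz = (size_t) n_ctx / 16 + 100; // evaluated only on the first call
    return sz;
}

// The fix mirrors the diff: drop `static` so the value is recomputed per call.
static size_t scratch_size_fixed(int n_ctx) {
    size_t sz = (size_t) n_ctx / 16 + 100;
    return sz;
}

int main() {
    // buggy: prints the same size twice, even though n_ctx differs
    printf("buggy: %zu %zu\n", scratch_size_buggy(512), scratch_size_buggy(4096));
    // fixed: the second size is larger, as expected for the bigger context
    printf("fixed: %zu %zu\n", scratch_size_fixed(512), scratch_size_fixed(4096));
    return 0;
}
```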
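The last hunk exposes a new `llama_model_type()` helper that snprintf-formats a human-readable description (e.g. "LLaMA 7B mostly Q4_0", built from `llama_model_type_name()` and `llama_ftype_name()`) into a caller-provided buffer and returns snprintf's result. A minimal usage sketch follows, assuming the llama.h API of this era; the loader and init calls are assumptions, only `llama_model_type()` itself comes from this diff:

```cpp
// Hypothetical caller of the new llama_model_type() API.
#include <cstdio>

#include "llama.h"

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s <model-file>\n", argv[0]);
        return 1;
    }

    llama_backend_init(false); // assumption: numa = false

    struct llama_context_params params = llama_context_default_params();
    struct llama_model * model = llama_load_model_from_file(argv[1], params);
    if (model == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    // llama_model_type() writes e.g. "LLaMA 7B mostly Q4_0" into buf,
    // truncating if the buffer is too small, and returns snprintf's result.
    char buf[128];
    llama_model_type(model, buf, sizeof(buf));
    printf("%s\n", buf);

    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```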