diff options
author | Georgi Gerganov <ggerganov@gmail.com> | 2023-11-17 10:00:15 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-11-17 10:00:15 +0200 |
commit | 4f447a48339977073a1af4f33ae873465ff64994 (patch) | |
tree | b35f0966175135c2836ebb54ca9c87d5e318caef /llama.cpp | |
parent | 91f6499393d2d999331fbfdba47a7f8b9f913f0d (diff) |
llama : fix data units (#4101)
* llama : fix data units
ggml-ci
* Revert "llama : fix data units"
This reverts commit f5feac831fe225ed7f3db938d115732a49dccfc4.
* llama : disambiguate data units
ggml-ci
Diffstat (limited to 'llama.cpp')
-rw-r--r-- | llama.cpp | 32 |
1 files changed, 16 insertions, 16 deletions
@@ -1087,9 +1087,9 @@ enum e_model { MODEL_70B, }; -static const size_t kB = 1024; -static const size_t MB = 1024*kB; -static const size_t GB = 1024*MB; +static const size_t kiB = 1024; +static const size_t MiB = 1024*kiB; +static const size_t GiB = 1024*MiB; struct llama_hparams { bool vocab_only; @@ -1488,7 +1488,7 @@ static bool llama_kv_cache_init( vram_kv_cache += ggml_nbytes(cache.k); } if (vram_kv_cache > 0) { - LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0); + LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MiB\n", __func__, vram_kv_cache / 1024.0 / 1024.0); } } #endif @@ -2543,8 +2543,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type)); LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str()); LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9); - if (ml.n_bytes < GB) { - LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements); + if (ml.n_bytes < GiB) { + LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements); } else { LLAMA_LOG_INFO("%s: model size = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements); } @@ -2582,7 +2582,7 @@ static void llm_load_tensors( ml.calc_sizes(ctx_size, mmapped_size); - LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0); + LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, ctx_size/1024.0/1024.0); // create the ggml context { @@ -3231,7 +3231,7 @@ static void llm_load_tensors( ctx_size + mmapped_size - vram_weights; // weights in VRAM not in memory - LLAMA_LOG_INFO("%s: mem required = %7.2f MB\n", __func__, mem_required / 1024.0 / 1024.0); + LLAMA_LOG_INFO("%s: mem required = %7.2f MiB\n", __func__, mem_required / 1024.0 / 1024.0); #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer)); @@ -3250,7 +3250,7 @@ static void llm_load_tensors( #endif // GGML_USE_CUBLAS LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers); - LLAMA_LOG_INFO("%s: VRAM used: %.2f MB\n", __func__, vram_weights / 1024.0 / 1024.0); + LLAMA_LOG_INFO("%s: VRAM used: %.2f MiB\n", __func__, vram_weights / 1024.0 / 1024.0); #else (void) n_gpu_layers; #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) @@ -7962,7 +7962,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s workers.clear(); } - LLAMA_LOG_INFO("size = %8.2f MB -> %8.2f MB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0); + LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0); int64_t tot_count = 0; for (size_t i = 0; i < hist_cur.size(); i++) { hist_all[i] += hist_cur[i]; @@ -8502,7 +8502,7 @@ struct llama_context * llama_new_context_with_model( { const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v); - LLAMA_LOG_INFO("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0); + LLAMA_LOG_INFO("%s: kv self size = %7.2f MiB\n", __func__, memory_size / 1024.0 / 1024.0); } // resized during inference @@ -8547,7 +8547,7 @@ struct llama_context * llama_new_context_with_model( // measure memory requirements for the graph size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment; - LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0); + LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MiB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0); // recreate allocator with exact memory requirements ggml_allocr_free(ctx->alloc); @@ -8561,7 +8561,7 @@ struct llama_context * llama_new_context_with_model( #endif #ifdef GGML_USE_CUBLAS ggml_cuda_set_scratch_size(alloc_size); - LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MB\n", __func__, alloc_size / 1024.0 / 1024.0); + LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MiB\n", __func__, alloc_size / 1024.0 / 1024.0); // calculate total VRAM usage auto add_tensor = [](const ggml_tensor * t, size_t & size) { @@ -8581,10 +8581,10 @@ struct llama_context * llama_new_context_with_model( size_t ctx_vram_size = alloc_size + kv_vram_size; size_t total_vram_size = model_vram_size + ctx_vram_size; - LLAMA_LOG_INFO("%s: total VRAM used: %.2f MB (model: %.2f MB, context: %.2f MB)\n", __func__, + LLAMA_LOG_INFO("%s: total VRAM used: %.2f MiB (model: %.2f MiB, context: %.2f MiB)\n", __func__, total_vram_size / 1024.0 / 1024.0, model_vram_size / 1024.0 / 1024.0, - ctx_vram_size / 1024.0 / 1024.0); + ctx_vram_size / 1024.0 / 1024.0); #endif } @@ -8605,7 +8605,7 @@ struct llama_context * llama_new_context_with_model( const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx); - LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0); + LLAMA_LOG_INFO("%s: max tensor size = %8.2f MiB\n", __func__, max_size/1024.0/1024.0); #define LLAMA_METAL_CHECK_BUF(result) \ if (!(result)) { \ |