author | Pierrick Hymbert <pierrick.hymbert@gmail.com> | 2024-04-11 14:51:07 +0200
committer | GitHub <noreply@github.com> | 2024-04-11 14:51:07 +0200
commit | b804b1ef77351d2a11be945462c6c251710476cb (patch)
tree | f963c03b90a54083ee67c22c882d20e388820897 /common
parent | 8228b66dbc16290c5cbd70e80ab47c068e2569d8 (diff)
eval-callback: Example how to use eval callback for debugging (#6576)
* gguf-debug: Example how to use ggml callback for debugging
* gguf-debug: no mutex, verify type, fix stride.
* llama: cv eval: move cb eval field in common gpt_params
* ggml_debug: use common gpt_params to pass cb eval.
Fix get tensor SIGV random.
* ggml_debug: ci: add tests
* ggml_debug: EOL in CMakeLists.txt
* ggml_debug: Remove unused param n_batch, no batching here
* ggml_debug: fix trailing spaces
* ggml_debug: fix trailing spaces
* common: fix cb_eval and user data not initialized
* ci: build revert label
* ggml_debug: add main test label
* doc: add a model: add a link to ggml-debug
* ggml-debug: add to make toolchain
* ggml-debug: tests add the main label
* ggml-debug: ci add test curl label
* common: allow the warmup to be disabled in llama_init_from_gpt_params
* ci: add curl test
* ggml-debug: better tensor type support
* gitignore : ggml-debug
* ggml-debug: printing also the sum of each tensor
* ggml-debug: remove block size
* eval-callback: renamed from ggml-debug
* eval-callback: fix make toolchain
---------
Co-authored-by: slaren <slarengh@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Diffstat (limited to 'common')
-rw-r--r-- | common/common.cpp | 4
-rw-r--r-- | common/common.h | 4
2 files changed, 7 insertions(+), 1 deletion(-)
diff --git a/common/common.cpp b/common/common.cpp
index 98fc8388..dda51478 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1745,6 +1745,8 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     cparams.yarn_orig_ctx = params.yarn_orig_ctx;
     cparams.pooling_type = params.pooling_type;
     cparams.defrag_thold = params.defrag_thold;
+    cparams.cb_eval = params.cb_eval;
+    cparams.cb_eval_user_data = params.cb_eval_user_data;
     cparams.offload_kqv = !params.no_kv_offload;

     cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
@@ -2192,7 +2194,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
         params.sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
     }

-    {
+    if (params.warmup) {
         LOG("warming up the model with an empty run\n");

         std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };
diff --git a/common/common.h b/common/common.h
index a7f476c1..65272b0b 100644
--- a/common/common.h
+++ b/common/common.h
@@ -80,6 +80,9 @@ struct gpt_params {
     int32_t yarn_orig_ctx = 0; // YaRN original context length
     float defrag_thold = -1.0f; // KV cache defragmentation threshold

+    ggml_backend_sched_eval_callback cb_eval = nullptr;
+    void * cb_eval_user_data = nullptr;
+
     ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;

     llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
@@ -156,6 +159,7 @@ struct gpt_params {
     bool infill = false; // use infill mode
     bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
     bool no_kv_offload = false; // disable KV offloading
+    bool warmup = true; // warmup run

     std::string cache_type_k = "f16"; // KV cache data type for the K
     std::string cache_type_v = "f16"; // KV cache data type for the V
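
Taken together, the two hunks expose the scheduler eval callback through gpt_params and make the warmup run optional, so a caller can trace only its real evaluation. Below is a minimal, hypothetical sketch (not part of this commit) of how a tool in the spirit of the eval-callback example could wire the new fields up. The callback type and its ask/observe protocol are the ones declared for ggml_backend_sched_eval_callback in ggml-backend.h; the file name, model path, prompt, and the callback body itself are illustrative only.

// minimal_cb_eval.cpp - illustrative sketch; file name and model path are made up
#include "common.h"
#include "llama.h"
#include "ggml.h"

#include <cstdio>
#include <tuple>

// Matches ggml_backend_sched_eval_callback: the scheduler first calls with
// ask == true to check whether we want to observe a node, then again with
// ask == false once the node has been computed.
static bool print_node_cb(struct ggml_tensor * t, bool ask, void * user_data) {
    int * n_observed = (int *) user_data;
    if (ask) {
        return true; // observe every node
    }
    printf("node %4d: op=%-12s name=%s\n", (*n_observed)++, ggml_op_desc(t), t->name);
    return true; // returning false would cancel the graph compute
}

int main() {
    int n_observed = 0;

    gpt_params params;
    params.model             = "model.gguf"; // hypothetical model path
    params.prompt            = "hello";
    params.cb_eval           = print_node_cb;
    params.cb_eval_user_data = &n_observed;
    params.warmup            = false; // skip the warmup run so only the real eval is traced

    llama_backend_init();

    llama_model   * model = nullptr;
    llama_context * ctx   = nullptr;
    std::tie(model, ctx) = llama_init_from_gpt_params(params);
    if (model == nullptr || ctx == nullptr) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    // ... tokenize the prompt and call llama_decode() as usual; the callback
    //     fires for every graph node the scheduler computes ...

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}

With params.warmup left at its default of true, llama_init_from_gpt_params still performs the empty warmup decode, so the callback would also fire for that throwaway graph; setting it to false keeps the trace limited to the caller's own evaluations.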