From 94b659a2f106e017e5eeb6f492dc9f290e136833 Mon Sep 17 00:00:00 2001
From: Kawrakow
Date: Tue, 25 Feb 2025 17:55:58 +0200
Subject: Give the user the option to override where model weights are stored
 (#232)

* Give the user the option to override where model weights are stored

* Fix ggml_nbytes() problem and cleanup

For a tensor with zero elements ggml_nbytes() was returning
uint64_t::max, and this was causing graph allocation failure.

* Add timing info to CUDA graph evaluation

* Add more timing info

---------

Co-authored-by: Iwan Kawrakow
---
 include/llama.h | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/include/llama.h b/include/llama.h
index 23e32642..beb6ecba 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -305,6 +305,11 @@ extern "C" {
         };
     };
 
+    struct llama_model_tensor_buft_override {
+        const char * pattern;
+        ggml_backend_buffer_type_t buft;
+    };
+
     struct llama_model_params {
         int32_t n_gpu_layers; // number of layers to store in VRAM
         enum llama_split_mode split_mode; // how to split the model across multiple GPUs
@@ -332,6 +337,8 @@ extern "C" {
         // override key-value pairs of the model meta data
         const struct llama_model_kv_override * kv_overrides;
 
+        const struct llama_model_tensor_buft_override * tensor_buft_overrides;
+
        // Keep the booleans together to avoid misalignment during copy-by-value.
         bool vocab_only; // only load the vocabulary, no weights
         bool use_mmap;   // use mmap if possible
-- 
cgit v1.2.3
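
A minimal sketch of how a caller might use the new tensor_buft_overrides
field from the C API. This header diff alone does not show how pattern is
matched or how the array is terminated, so the sketch assumes the pattern is
matched against tensor names and that an entry with a NULL pattern ends the
list; the pattern "exps" and the path "model.gguf" are illustrative only.

    // Keep tensors whose names match "exps" (typically MoE expert weights)
    // in host memory, while offloading the remaining layers to the GPU.
    #include "llama.h"
    #include "ggml-backend.h"
    #include <stddef.h>

    int main(void) {
        struct llama_model_tensor_buft_override overrides[] = {
            // assumed: pattern is matched against each tensor's name
            { /*pattern =*/ "exps", /*buft =*/ ggml_backend_cpu_buffer_type() },
            // assumed: a NULL pattern terminates the array
            { /*pattern =*/ NULL,   /*buft =*/ NULL },
        };

        struct llama_model_params params = llama_model_default_params();
        params.n_gpu_layers          = 99;        // offload everything else
        params.tensor_buft_overrides = overrides; // field added by this patch

        struct llama_model * model =
            llama_load_model_from_file("model.gguf", params); // placeholder path
        if (model == NULL) {
            return 1;
        }
        llama_free_model(model);
        return 0;
    }

Pinning large, rarely-offloaded tensors (such as MoE expert weights) to host
RAM while keeping the rest in VRAM is the kind of placement this override
mechanism makes possible; previously the buffer type for every weight was
chosen by the loader alone.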