From 94b659a2f106e017e5eeb6f492dc9f290e136833 Mon Sep 17 00:00:00 2001
From: Kawrakow
Date: Tue, 25 Feb 2025 17:55:58 +0200
Subject: Give the user the option to override where model weights are stored
 (#232)

* Give the user the option to override where model weights are stored

* Fix ggml_nbytes() problem and cleanup

For a tensor with zero elements ggml_nbytes() was returning
uint64_t::max, and this was causing graph allocation failure.

* Add timing info to CUDA graph evaluation

* Add more timing info

---------

Co-authored-by: Iwan Kawrakow
---
 include/llama.h | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/include/llama.h b/include/llama.h
index 23e32642..beb6ecba 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -305,6 +305,11 @@ extern "C" {
         };
     };
 
+    struct llama_model_tensor_buft_override {
+        const char * pattern;
+        ggml_backend_buffer_type_t buft;
+    };
+
     struct llama_model_params {
         int32_t n_gpu_layers; // number of layers to store in VRAM
         enum llama_split_mode split_mode; // how to split the model across multiple GPUs
@@ -332,6 +337,8 @@ extern "C" {
         // override key-value pairs of the model meta data
         const struct llama_model_kv_override * kv_overrides;
 
+        const struct llama_model_tensor_buft_override * tensor_buft_overrides;
+
        // Keep the booleans together to avoid misalignment during copy-by-value.
         bool vocab_only; // only load the vocabulary, no weights
         bool use_mmap;   // use mmap if possible
-- 
cgit v1.2.3
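
A minimal sketch of how a caller might use the new tensor_buft_overrides
field from the C API. This header diff alone does not show how pattern is
matched or how the array is terminated, so the sketch assumes the pattern is
matched against tensor names and that an entry with a NULL pattern ends the
list; the pattern "exps" and the path "model.gguf" are illustrative only.

    // Keep tensors whose names match "exps" (typically MoE expert weights)
    // in host memory, while offloading the remaining layers to the GPU.
    #include "llama.h"
    #include "ggml-backend.h"
    #include <stddef.h>

    int main(void) {
        struct llama_model_tensor_buft_override overrides[] = {
            // assumed: pattern is matched against each tensor's name
            { /*pattern =*/ "exps", /*buft =*/ ggml_backend_cpu_buffer_type() },
            // assumed: a NULL pattern terminates the array
            { /*pattern =*/ NULL,   /*buft =*/ NULL },
        };

        struct llama_model_params params = llama_model_default_params();
        params.n_gpu_layers          = 99;        // offload everything else
        params.tensor_buft_overrides = overrides; // field added by this patch

        struct llama_model * model =
            llama_load_model_from_file("model.gguf", params); // placeholder path
        if (model == NULL) {
            return 1;
        }
        llama_free_model(model);
        return 0;
    }

Pinning large, rarely-offloaded tensors (such as MoE expert weights) to host
RAM while keeping the rest in VRAM is the kind of placement this override
mechanism makes possible; previously the buffer type for every weight was
chosen by the loader alone.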