author     Kawrakow <iwankawrakow@gmail.com>          2025-02-25 17:55:58 +0200
committer  GitHub <noreply@github.com>                2025-02-25 17:55:58 +0200
commit     94b659a2f106e017e5eeb6f492dc9f290e136833 (patch)
tree       8b744ff19b476f7d08e9691def83ad3fbb27c763 /ggml/src
parent     547eee81d99a2676975a9768166b7d164473b8fa (diff)
Give the user the option to override where model weights are stored (#232)
* Give the user the option to override where model weights are stored

* Fix ggml_nbytes() problem and cleanup

  For a tensor with zero elements ggml_nbytes() was returning uint64_t::max,
  and this was causing graph allocation failure.

* Add timing info to CUDA graph evaluation

* Add more timing info

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
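For the ggml_nbytes() bullet above, a minimal standalone sketch of the underflow being described, assuming the pre-patch code computed the byte count from (ne - 1) times the stride in size_t arithmetic (the variable names below are illustrative, not taken from the patch):

#include <stdint.h>
#include <stdio.h>

int main(void) {
    int64_t ne = 0;   // number of elements in one dimension: zero
    size_t  nb = 4;   // stride in bytes, e.g. sizeof(float)
    // (ne - 1) is -1 as int64_t; converting it to size_t wraps to SIZE_MAX,
    // so the computed size becomes enormous instead of 0.
    size_t nbytes = (size_t)(ne - 1) * nb;
    printf("%zu\n", nbytes);  // prints 18446744073709551612 on 64-bit targets
    return 0;
}

The guard added to ggml_nbytes() in ggml/src/ggml.c below sidesteps this by returning 0 as soon as any dimension has a non-positive element count.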
Diffstat (limited to 'ggml/src')
-rw-r--r--  ggml/src/ggml-alloc.c     2
-rw-r--r--  ggml/src/ggml-backend.c  32
-rw-r--r--  ggml/src/ggml-cuda.cu    11
-rw-r--r--  ggml/src/ggml.c          10
4 files changed, 55 insertions, 0 deletions
diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
index e485326a..d811dee6 100644
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@@ -174,6 +174,8 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
// this should never happen
fprintf(stderr, "%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
__func__, size, max_avail);
+ fprintf(stderr, "%s: tensor was %s with %zu elements and %zu bytes\n", __func__, tensor->name,
+ ggml_nelements(tensor), ggml_nbytes(tensor));
GGML_ABORT("not enough space in the buffer");
}
}
diff --git a/ggml/src/ggml-backend.c b/ggml/src/ggml-backend.c
index e1651cc6..0458bd0c 100644
--- a/ggml/src/ggml-backend.c
+++ b/ggml/src/ggml-backend.c
@@ -9,6 +9,7 @@
#include <stdlib.h>
#include <string.h>
+#define IK_PRINT_TIMING 0
#define MAX(a, b) ((a) > (b) ? (a) : (b))
@@ -229,7 +230,17 @@ GGML_CALL void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void *
return;
}
+
+#if IK_PRINT_TIMING
+ int64_t tim1 = ggml_time_us();
+#endif
buf->iface.set_tensor(buf, tensor, data, offset, size);
+#if IK_PRINT_TIMING
+ int64_t tim2 = ggml_time_us();
+ //printf("%s(%s) %zu %d us\n", __func__, tensor->name, size, (int)(tim2-tim1));
+ printf("%s(%s): %d us\n", __func__, tensor->name, (int)(tim2-tim1));
+#endif
+
}
GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
@@ -243,7 +254,15 @@ GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void *
return;
}
+#if IK_PRINT_TIMING
+ int64_t tim1 = ggml_time_us();
+#endif
buf->iface.get_tensor(buf, tensor, data, offset, size);
+#if IK_PRINT_TIMING
+ int64_t tim2 = ggml_time_us();
+ //printf("%s(%s) %zu %d us\n", __func__, tensor->name, size, (int)(tim2-tim1));
+ printf("%s(%s): %d us\n", __func__, tensor->name, (int)(tim2-tim1));
+#endif
}
void ggml_backend_synchronize(ggml_backend_t backend) {
@@ -1751,7 +1770,11 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
struct ggml_backend_sched_split * splits = sched->splits;
+
for (int i = 0; i < sched->n_splits; i++) {
+#if IK_PRINT_TIMING
+ int64_t tim1 = ggml_time_us();
+#endif
struct ggml_backend_sched_split * split = &splits[i];
int split_backend_id = split->backend_id;
ggml_backend_t split_backend = sched->backends[split_backend_id];
@@ -1792,6 +1815,10 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
}
if (!sched->callback_eval) {
+#if IK_PRINT_TIMING
+ int64_t tim2 = ggml_time_us();
+ printf("%s(.1.): %d us\n", __func__, (int)(tim2-tim1));
+#endif
enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
if (ec != GGML_STATUS_SUCCESS) {
return ec;
@@ -1814,6 +1841,11 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);
+#if IK_PRINT_TIMING
+ int64_t tim2 = ggml_time_us();
+ printf("%s(.2.): %d us\n", __func__, (int)(tim2-tim1));
+#endif
+
enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv);
if (ec != GGML_STATUS_SUCCESS) {
return ec;
diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index 26d06d56..c305cd89 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -50,6 +50,8 @@
#include <string>
#include <vector>
+#define IK_PRINT_TIMING 0
+
static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
static void ggml_cuda_default_log_callback(enum ggml_log_level level, const char * msg, void * user_data) {
@@ -2446,6 +2448,10 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
ggml_cuda_set_peer_access(dst->src[1]->ne[1], ctx.device);
}
+#if IK_PRINT_TIMING
+ int64_t tim1 = ggml_time_us();
+#endif
+
switch (dst->op) {
case GGML_OP_REPEAT:
ggml_cuda_op_repeat(ctx, dst);
@@ -2618,6 +2624,11 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
CUDA_CHECK(err);
}
+#if IK_PRINT_TIMING
+ int64_t tim2 = ggml_time_us();
+ printf("%s(%s): %d us\n", ggml_op_name(dst->op), dst->name, (int)(tim2 - tim1));
+#endif
+
return true;
}
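All of this output is compiled out by default, since IK_PRINT_TIMING is defined as 0. To actually see the per-op and per-transfer timings, the switch at the top of ggml-backend.c and ggml-cuda.cu would be flipped and the project rebuilt:

#define IK_PRINT_TIMING 1   // enable the timing printfs added in this commit

Each enabled line then prints the op or function name, the tensor name, and the elapsed time in microseconds, following the format strings added above.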
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 8efe2653..80dd25ff 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -4267,6 +4267,9 @@ GGML_CALL int64_t ggml_blck_size(enum ggml_type type) {
}
GGML_CALL size_t ggml_nbytes(const struct ggml_tensor * tensor) {
+ for (int i = 0; i < GGML_MAX_DIMS; ++i) {
+ if (tensor->ne[i] <= 0) return 0;
+ }
size_t nbytes;
size_t blck_size = ggml_blck_size(tensor->type);
if (blck_size == 1) {
@@ -21480,6 +21483,9 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
#ifdef GGML_USE_OPENMP
if (n_threads > 1) {
+//#if IK_PRINT_TIMING
+// int64_t tim1 = ggml_time_us();
+//#endif
#pragma omp parallel num_threads(n_threads)
{
#pragma omp single
@@ -21496,6 +21502,10 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
};
ggml_graph_compute_thread(&worker);
}
+//#if IK_PRINT_TIMING
+// int64_t tim2 = ggml_time_us();
+// printf("%s(...): %d us\n", __func__, (int)(tim2-tim1));
+//#endif
} else {
struct ggml_compute_state worker = {
.thrd = 0,