author | Kawrakow <iwankawrakow@gmail.com> | 2024-10-25 13:08:43 +0200
---|---|---
committer | GitHub <noreply@github.com> | 2024-10-25 13:08:43 +0200
commit | 6b968f38946117552ffed300771c44ba9b39d3e4 |
tree | dc6b0df69f31ea77d9941d6798a4ef411c688080 /ggml/src/ggml-backend.c |
parent | 9114078959b404899fd67e1af45f0dcbee51b47f |
Bitnet changes (#106)
* Adapting iq2_bn to work without separate scale tensors
Why? It is becoming burdensome to maintain the special Bitnet
conversion in convert_hf_to_gguf.py, so I think it is better
to make iq1_bn and iq2_bn just work with the mainline
conversion script (which does not generate scales).
* Adapting iq1_bn to work without separate scale tensors
* Adapting iq2_bn: CUDA dequantize
* Adapting iq2_bn: CUDA works
* Adapting iq1_bn: CUDA works
* Adapting iq1_bn, iq2_bn: NEON
* Adapting iq1_bn, iq2_bn: Metal
Dequantize works, but there is still something wrong
with the dot products.
* WIP
I absolutely don't see what is wrong with the iq1_bn and iq2_bn
vector dot product kernels.
* Remove iq1_tn and iq2_tn - Part 1
Now that iq1_bn and iq2_bn have per-row scales, there is no
reason to also have iq1_tn and iq2_tn (a sketch of the per-row
scale idea follows the commit message).
* Remove iq1_tn and iq2_tn - Part 2
* Bitnet: use the standard llm_build_kv to build self attention
My main motivation was to enable FA. But FA does not work anyway
because the head size is 100 for the Bitnet ternary models
(and I had forgotten this little detail).
* Revert "Avoid rebuild of GGML graph for each token (#98)"
This reverts commit f2d315b46f7aacc7df4b86bd8acba387b30e11ca.
As far as I can tell, the commit breaks Metal TG.
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
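
The bullets above say that iq1_bn and iq2_bn now carry a per-row scale instead of relying on a separate scale tensor. Below is a minimal sketch of that idea only, with hypothetical struct and function names; the real iq1_bn/iq2_bn block formats live in the quantization sources and pack the weights far more densely than shown here.

```c
#include <stdint.h>

// Illustrative only: a quantized row that stores its own scale inline,
// followed by the packed ternary weights. This just shows where the scale
// lives once there is no separate scale tensor.
struct example_ternary_row {
    float  d;       // per-row scale, stored with the row data
    int8_t qs[256]; // ternary weights in {-1, 0, +1}; row length fixed for the sketch
};

// Dequantize one weight: y = d * q.
static inline float example_dequant(const struct example_ternary_row * row, int i) {
    return row->d * (float)row->qs[i];
}
```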
Diffstat (limited to 'ggml/src/ggml-backend.c')
-rw-r--r-- | ggml/src/ggml-backend.c | 45 |
1 file changed, 8 insertions, 37 deletions
```diff
diff --git a/ggml/src/ggml-backend.c b/ggml/src/ggml-backend.c
index 76d37f74..e1651cc6 100644
--- a/ggml/src/ggml-backend.c
+++ b/ggml/src/ggml-backend.c
@@ -1040,13 +1040,6 @@ struct ggml_backend_sched_split {
     struct ggml_cgraph graph;
 };
 
-// Object to facilitate GML graph caching
-struct ggml_cached_graph {
-    bool is_active;
-    ggml_backend_t input_backend;
-    struct ggml_tensor * input_cpy[GGML_SCHED_MAX_SPLIT_INPUTS];
-};
-
 struct ggml_backend_sched {
     bool is_reset; // true if the scheduler has been reset since the last graph split
     bool is_alloc;
@@ -1092,8 +1085,6 @@ struct ggml_backend_sched {
     size_t context_buffer_size;
 
     bool debug;
-
-    struct ggml_cached_graph cached_graph;
 };
 
 #define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
@@ -1771,14 +1762,6 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
             struct ggml_tensor * input = split->inputs[j];
             struct ggml_tensor * input_cpy = tensor_copy(input, split_backend_id, sched->cur_copy);
 
-            if (!sched->cached_graph.is_active) {
-                sched->cached_graph.input_backend = input_backend;
-                sched->cached_graph.input_cpy[j] = input_cpy;
-            } else {
-                input_backend = sched->cached_graph.input_backend;
-                input_cpy = sched->cached_graph.input_cpy[j];
-            }
-
             if (input->flags & GGML_TENSOR_FLAG_INPUT) {
                 // inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
                 if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
@@ -1910,8 +1893,6 @@ ggml_backend_sched_t ggml_backend_sched_new(
 
     ggml_backend_sched_reset(sched);
 
-    sched->cached_graph.is_active = false;
-
     return sched;
 }
 
@@ -1988,16 +1969,16 @@ enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, st
 }
 
 enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
-    if(!sched->cached_graph.is_active) {
-        if (!sched->is_reset && !sched->is_alloc) {
-            ggml_backend_sched_reset(sched);
-        }
-        if (!sched->is_alloc) {
-            if (!ggml_backend_sched_alloc_graph(sched, graph)) {
-                return GGML_STATUS_ALLOC_FAILED;
-            }
+    if (!sched->is_reset && !sched->is_alloc) {
+        ggml_backend_sched_reset(sched);
+    }
+
+    if (!sched->is_alloc) {
+        if (!ggml_backend_sched_alloc_graph(sched, graph)) {
+            return GGML_STATUS_ALLOC_FAILED;
         }
     }
+
     return ggml_backend_sched_compute_splits(sched);
 }
 
@@ -2262,13 +2243,3 @@ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t
 
     return true;
 }
-
-bool ggml_use_cached_graph(ggml_backend_sched_t sched) {
-    return sched->cached_graph.is_active;
-}
-
-void ggml_set_cached_graph(ggml_backend_sched_t sched, bool set_value) {
-    sched->cached_graph.is_active = set_value;
-}
-
-
```
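
With the cached-graph members and the ggml_use_cached_graph/ggml_set_cached_graph helpers removed, every call to ggml_backend_sched_graph_compute_async falls through the normal reset/alloc path again. Below is a minimal caller-side sketch of that flow, assuming a scheduler has already been created; build_graph is a hypothetical caller-provided callback, not part of ggml.

```c
#include "ggml.h"
#include "ggml-backend.h"

// Sketch of a per-token compute loop after this change: the scheduler is
// reset and the graph re-allocated as needed on every call, since the
// cached-graph shortcut no longer exists.
static enum ggml_status run_tokens(ggml_backend_sched_t sched,
                                   struct ggml_cgraph * (*build_graph)(int token),
                                   int n_tokens) {
    for (int token = 0; token < n_tokens; ++token) {
        struct ggml_cgraph * graph = build_graph(token);  // hypothetical helper

        // graph_compute_async resets/allocates internally and returns
        // GGML_STATUS_ALLOC_FAILED if graph allocation fails.
        enum ggml_status status = ggml_backend_sched_graph_compute_async(sched, graph);
        if (status != GGML_STATUS_SUCCESS) {
            return status;
        }

        ggml_backend_sched_synchronize(sched);  // wait for async backend work
        ggml_backend_sched_reset(sched);        // release this graph's allocations
    }
    return GGML_STATUS_SUCCESS;
}
```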