Reduce memory usage and allocate enough memory for largest context (#473)

* Reduce memory usage and allocate enough memory for large contexts
* Simpler scratch buffer usage
* Reenable BLAS for quantized mul_mat
* Fix number of layers in 30B and 65B
* Fix KV cache size for F32
author: Georgi Gerganov <ggerganov@gmail.com> 2023-03-24 23:17:37 +0200
committer: GitHub <noreply@github.com> 2023-03-24 23:17:37 +0200
commit: 7a9b6c3a8bdc1cb75fefc826dfaa7331eb63695d (patch)
tree: 339815189c912e9a759a0259613621f6a2adcbf4 /ggml.c
parent: 31572d966531f7d768eb773322016ab78eb6e835 (diff)
1 file changed, 10 insertions, 2 deletions
diff --git a/ggml.c b/ggml.c
index 92b857a0..cfdf427d 100644
--- a/ggml.c
+++ b/ggml.c
@@ -5846,7 +5846,8 @@ static bool ggml_compute_forward_mul_mat_use_blas(
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
               struct ggml_tensor * dst) {
-    UNUSED(src0);
+    const int ne00 = src0->ne[0];
+    const int ne01 = src0->ne[1];
 
     const int ne10 = src1->ne[0];
 
@@ -5856,7 +5857,14 @@ static bool ggml_compute_forward_mul_mat_use_blas(
     // TODO: find the optimal values for these
     if (ggml_is_contiguous(src0) &&
         ggml_is_contiguous(src1) && ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32))) {
-        //printf("BLAS: %d %d %d\n", ne0, ne1, ne10);
+
+        //// disable BLAS for Q4_0 and Q4_1
+        //// looks like there is no benefit and we only waste a lot of memory
+        //if (src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1) {
+        //    return false;
+        //}
+
+        //printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);
         return true;
     }
author	Georgi Gerganov <ggerganov@gmail.com>	2023-03-24 23:17:37 +0200
committer	GitHub <noreply@github.com>	2023-03-24 23:17:37 +0200
commit	7a9b6c3a8bdc1cb75fefc826dfaa7331eb63695d (patch)
tree	339815189c912e9a759a0259613621f6a2adcbf4 /ggml.c
parent	31572d966531f7d768eb773322016ab78eb6e835 (diff)