summaryrefslogtreecommitdiff
path: root/ggml/src/ggml-backend.c
diff options
context:
space:
mode:
authorKawrakow <iwankawrakow@gmail.com>2025-03-13 12:07:43 +0200
committerGitHub <noreply@github.com>2025-03-13 12:07:43 +0200
commit305fabfc3b694d603fdb05d671dd59e2d4c7d58e (patch)
tree645b23c154fa8af405f55138f38d264e05faa2ce /ggml/src/ggml-backend.c
parent3f23ed68f17583a8ee63afd0c214f5b39226226c (diff)
FlashMLA-2 (CPU): faster and smaller compute buffer size (#253)
* FlashMLA-2: eliminate intermediate f32 tensors This works on the CPU. PP performance is ~13% better for 16k tokens and compute buffer is quite a bit smaller. * FlashMLA-2: enable fast path only on the CPU for now I did implement the necessary ops on CUDA, but something is still wrong there, so for now we only use it when running CPU-only. * FlashMLA-2: slightly smaller compute buffer size --------- Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Diffstat (limited to 'ggml/src/ggml-backend.c')
-rw-r--r--ggml/src/ggml-backend.c3
1 file changed, 2 insertions, 1 deletion
diff --git a/ggml/src/ggml-backend.c b/ggml/src/ggml-backend.c
index 0458bd0c..fd538f50 100644
--- a/ggml/src/ggml-backend.c
+++ b/ggml/src/ggml-backend.c
@@ -843,7 +843,8 @@ GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const
op->type != GGML_TYPE_IQ1_S &&
op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
case GGML_OP_MUL_MAT:
- return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
+ return true;
+ //return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
default:
return true;
}