Diffstat (limited to 'ggml/src')
 ggml/src/CMakeLists.txt |  1 +
 ggml/src/ggml-cuda.cu   | 31 +++++++++++++++++++++++++++----
 2 files changed, 28 insertions(+), 4 deletions(-)
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index b0db417d..5ecdb4f1 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -382,6 +382,7 @@ if (GGML_CUDA)
         add_compile_definitions(GGML_CUDA_MMV_Y=${GGML_CUDA_MMV_Y})
         add_compile_definitions(K_QUANTS_PER_ITERATION=${GGML_CUDA_KQUANTS_ITER})
         add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE})
+        add_compile_definitions(GGML_CUDA_MIN_BATCH_OFFLOAD=${GGML_CUDA_MIN_BATCH_OFFLOAD})

         if (GGML_CUDA_USE_GRAPHS)
             add_compile_definitions(GGML_CUDA_USE_GRAPHS)
diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index 0961e2bd..da3dc334 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -3656,10 +3656,33 @@ GGML_CALL static bool ggml_backend_cuda_supports_buft(ggml_backend_t backend, gg
 }

 GGML_CALL static bool ggml_backend_cuda_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
-    const int min_batch_size = 32;
-
-    return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) ||
-           (op->ne[2] >= min_batch_size && (op->op == GGML_OP_MUL_MAT_ID || op->op == GGML_OP_MOE_FUSED_UP_GATE));
+    constexpr int min_batch_size = GGML_CUDA_MIN_BATCH_OFFLOAD;
+
+    // Why do we want to do this? The heuristic that a batch must have at least min_batch_size tokens for it to be worth
+    // offloading the required model weights comes from dense models. For MoE models, the average number of tokens
+    // each expert deals with in a batch is (active_experts / total_experts) * batch_size. Hence, according to the
+    // same heuristic, we need (active_experts / total_experts) * batch_size >= min_batch_size.
+    // Rearranging we get
+    //
+    //     batch_size * active_experts >= min_batch_size * total_experts
+    //
+    // as the condition for offloading model weights residing in RAM to the GPU.
+    // In this case, the number of tokens is not, as usual, in op->ne[1] but rather in op->ne[2].
+    if (op->op == GGML_OP_MUL_MAT_ID || op->op == GGML_OP_MOE_FUSED_UP_GATE) {
+        auto ids = op->op == GGML_OP_MUL_MAT_ID ? op->src[2] : op->src[3];
+        int64_t batch_size = op->ne[2];
+        if (batch_size < min_batch_size) return false;
+        int64_t n_experts_tot = op->src[0]->ne[2];
+        int64_t n_experts_active = ids->ne[0];
+        //printf("%s(%s): op->ne[2] = %ld, n_experts_tot = %ld, n_experts_active = %ld, ids: %s, %ld x %ld x %ld x %ld\n", __func__, op->name, op->ne[2], n_experts_tot, n_experts_active, ids->name, ids->ne[0], ids->ne[1], ids->ne[2], ids->ne[3]);
+        return batch_size*n_experts_active >= min_batch_size*n_experts_tot;
+    }
+
+    return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS;
+
+    // Original:
+    //return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) ||
+    //       (op->ne[2] >= min_batch_size && (op->op == GGML_OP_MUL_MAT_ID || op->op == GGML_OP_MOE_FUSED_UP_GATE));

     GGML_UNUSED(backend);
 }
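For illustration, a minimal standalone sketch of the rearranged MoE offload condition from the patch above. The helper name moe_offload_worth_it, the example expert counts, and the default threshold of 32 are assumptions made for this sketch only; they are not part of the ggml API or of this commit.

// Minimal sketch of the MoE offload heuristic: on average each expert sees
// (n_experts_active / n_experts_tot) * batch_size tokens per batch, and that
// average must reach min_batch_size for offloading the weights to pay off.
#include <cstdint>
#include <cstdio>

static bool moe_offload_worth_it(int64_t batch_size, int64_t n_experts_active,
                                 int64_t n_experts_tot, int64_t min_batch_size = 32) {
    if (batch_size < min_batch_size) return false;
    // Integer form of (active/total) * batch_size >= min_batch_size.
    return batch_size * n_experts_active >= min_batch_size * n_experts_tot;
}

int main() {
    // Hypothetical model with 8 active out of 64 total experts:
    // 128 tokens -> on average 16 tokens per expert, below 32, so no offload.
    printf("%d\n", moe_offload_worth_it(128, 8, 64)); // prints 0
    // 256 tokens -> on average 32 tokens per expert, so offload.
    printf("%d\n", moe_offload_worth_it(256, 8, 64)); // prints 1
    return 0;
}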