author     Georgi Gerganov <ggerganov@gmail.com>    2023-10-24 16:48:37 +0300
committer  GitHub <noreply@github.com>              2023-10-24 16:48:37 +0300
commit     2b4ea35e56792064598e922e46d081e02bc96b94 (patch)
tree       dea0a7b3e47c7d876cbce5d30b31c4c78d7bb030 /examples/batched/batched.cpp
parent     daab3d7f45832e10773c99f3484b0d5b14d86c0c (diff)
cuda : add batched cuBLAS GEMM for faster attention (#3749)
* cmake : add helper for faster CUDA builds
* batched : add NGL arg
* ggml : skip nops in compute_forward
* cuda : minor indentation
* cuda : batched cuBLAS GEMMs for src0 F16 and src1 F32 (attention ops)
* Apply suggestions from code review

  These changes plus:

  ```c++
  #define cublasGemmBatchedEx hipblasGemmBatchedEx
  ```

  are needed to compile with ROCm. I haven't done performance testing, but it seems to work. I couldn't figure out how to propose a change for lines outside what the pull changed; also, this is the first time trying to create a multi-part review, so please forgive me if I mess something up.

* cuda : add ROCm / hipBLAS cublasGemmBatchedEx define
* cuda : add cublasGemmStridedBatchedEx for non-broadcasted cases
* cuda : reduce mallocs in cublasGemmBatchedEx branch
* cuda : add TODO for calling cublas from kernel + using mem pool

Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
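The heart of the change lives in the CUDA backend rather than in this file: per-matrix GEMMs in the attention ops are replaced by a single batched cuBLAS call. The sketch below is only an illustration of the strided-batched pattern named in the commit message; the function name, shapes, leading dimensions, and compute type are assumptions for the example and not the actual ggml-cuda code (which additionally handles the F32 src1 conversion, broadcasting, and the ROCm/hipBLAS aliases mentioned above).

```c++
// Illustrative sketch only: one cuBLAS call computing `batch_count` independent
// GEMMs C_i = A_i^T * B_i on FP16 data already resident on the GPU.
// Layouts and the compute type are assumptions, not the ggml-cuda implementation.
#include <cublas_v2.h>
#include <cuda_fp16.h>

static void gemm_f16_strided_batched(
        cublasHandle_t handle,
        const half * A,          // k x m per batch, column-major
        const half * B,          // k x n per batch, column-major
        half       * C,          // m x n per batch, column-major
        int m, int n, int k, int batch_count) {
    const half alpha = __float2half(1.0f);
    const half beta  = __float2half(0.0f);

    // All batches are laid out contiguously, so a single strided-batched call
    // replaces one GEMM launch (or a device pointer array) per batch element.
    cublasGemmStridedBatchedEx(handle,
            CUBLAS_OP_T, CUBLAS_OP_N,
            m, n, k,
            &alpha,
            A, CUDA_R_16F, k, (long long) k * m,
            B, CUDA_R_16F, k, (long long) k * n,
            &beta,
            C, CUDA_R_16F, m, (long long) m * n,
            batch_count,
            CUBLAS_COMPUTE_16F,
            CUBLAS_GEMM_DEFAULT_TENSOR_OP);
}
```

Per the commit message, the strided variant covers the non-broadcasted cases, while the general path goes through cublasGemmBatchedEx (the pointer-array variant), which is the symbol the ROCm `#define` quoted above aliases to hipblasGemmBatchedEx.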
Diffstat (limited to 'examples/batched/batched.cpp')
-rw-r--r--  examples/batched/batched.cpp  11
1 file changed, 9 insertions(+), 2 deletions(-)
diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp
index 75856a81..22a4265d 100644
--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@@ -11,7 +11,7 @@ int main(int argc, char ** argv) {
     gpt_params params;

     if (argc == 1 || argv[1][0] == '-') {
-        printf("usage: %s MODEL_PATH [PROMPT] [PARALLEL] [LEN]\n" , argv[0]);
+        printf("usage: %s MODEL_PATH [PROMPT] [PARALLEL] [LEN] [NGL]\n" , argv[0]);
         return 1 ;
     }

@@ -21,6 +21,9 @@
     // total length of the sequences including the prompt
     int n_len = 32;

+    // number of layers to offload to the GPU
+    int n_gpu_layers = 0;
+
     if (argc >= 2) {
         params.model = argv[1];
     }
@@ -37,6 +40,10 @@
         n_len = std::atoi(argv[4]);
     }

+    if (argc >= 6) {
+        n_gpu_layers = std::atoi(argv[5]);
+    }
+
     if (params.prompt.empty()) {
         params.prompt = "Hello my name is";
     }
@@ -49,7 +56,7 @@

     llama_model_params model_params = llama_model_default_params();

-    // model_params.n_gpu_layers = 99; // offload all layers to the GPU
+    model_params.n_gpu_layers = n_gpu_layers;

     llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
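Taken together, the hunks make GPU offload controllable from the command line: the new fifth positional argument is parsed into n_gpu_layers and assigned to model_params.n_gpu_layers before llama_load_model_from_file is called. A hypothetical invocation (the model path and values are placeholders, not taken from the commit) might look like:

```
./batched ./models/7B/ggml-model-f16.gguf "Hello my name is" 4 32 99
```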