diff options
author | slaren <slarengh@gmail.com> | 2024-03-18 11:03:04 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-03-18 11:03:04 +0100 |
commit | 2bf8d0f7c4cc1235755ad06961ca761e458c5e55 (patch) | |
tree | d2a462deb3c0e34cfb26eab6881a65bfb9fc3b28 /examples | |
parent | 496bc79bc2b79bfd6124b8687a8dbd6a646e9b06 (diff) |
backend : offload large batches to GPU (#6083)
* backend : offload large batches to GPU
* fix hip
* code cleanup
* fix CUDA split buffers
* Update ggml-backend-impl.h
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
* cuda : fix memset without set_device
* imatrix : remove sched affix from weight names
* sched : add a new split if the current one has too many inputs
reduce max inputs per split
more cleanup
* update backends
ggml-ci
---------
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
Diffstat (limited to 'examples')
-rw-r--r-- | examples/imatrix/imatrix.cpp | 32 | ||||
-rw-r--r-- | examples/llama-bench/llama-bench.cpp | 4 |
2 files changed, 27 insertions, 9 deletions
diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index f21bc48f..ea79b906 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -56,13 +56,31 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * const struct ggml_tensor * src0 = t->src[0]; const struct ggml_tensor * src1 = t->src[1]; + std::string wname; + { + // remove any prefix and suffixes from the name + // CUDA0#blk.0.attn_k.weight#0 => blk.0.attn_k.weight + const char * p = strchr(src0->name, '#'); + if (p != NULL) { + p = p + 1; + const char * q = strchr(p, '#'); + if (q != NULL) { + wname = std::string(p, q - p); + } else { + wname = p; + } + } else { + wname = src0->name; + } + } + // when ask is true, the scheduler wants to know if we are interested in data from this tensor // if we return true, a follow-up call will be made with ask=false in which we can do the actual collection if (ask) { if (t->op == GGML_OP_MUL_MAT_ID) return true; // collect all indirect matrix multiplications if (t->op != GGML_OP_MUL_MAT) return false; if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false; - if (!(strncmp(src0->name, "blk.", 4) == 0 || (m_params.collect_output_weight && strcmp(src0->name, "output.weight") == 0))) return false; + if (!(wname.substr(0, 4) == "blk." || (m_params.collect_output_weight && wname == "output.weight"))) return false; return true; } @@ -94,12 +112,12 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * // this is necessary to guarantee equal number of "ncall" for each tensor for (int ex = 0; ex < n_as; ++ex) { src0 = t->src[2 + ex]; - auto& e = m_stats[src0->name]; + auto& e = m_stats[wname]; if (e.values.empty()) { e.values.resize(src1->ne[0], 0); } else if (e.values.size() != (size_t)src1->ne[0]) { - fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", src0->name, (int)e.values.size(), (int)src1->ne[0]); + fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]); exit(1); //GGML_ASSERT(false); } // NOTE: since we select top-k experts, the number of calls for the expert tensors will be k times larger @@ -107,7 +125,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * //if (idx == t->src[0]->ne[0] - 1) ++e.ncall; ++e.ncall; if (m_params.verbosity > 1) { - printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, src0->name, ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type); + printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type); } for (int row = 0; row < (int)src1->ne[1]; ++row) { const int excur = m_ids[row*n_as + idx]; @@ -129,17 +147,17 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * } } } else { - auto& e = m_stats[src0->name]; + auto& e = m_stats[wname]; if (e.values.empty()) { e.values.resize(src1->ne[0], 0); } else if (e.values.size() != (size_t)src1->ne[0]) { - fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", src0->name, (int)e.values.size(), (int)src1->ne[0]); + fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]); exit(1); //GGML_ASSERT(false); } ++e.ncall; if (m_params.verbosity > 1) { - printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, src0->name, ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type); + printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type); } for (int row = 0; row < (int)src1->ne[1]; ++row) { const float * x = data + row * src1->ne[0]; diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 32eea786..4cb23080 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -114,10 +114,10 @@ static std::string get_cpu_info() { static std::string get_gpu_info() { std::string id; #ifdef GGML_USE_CUBLAS - int count = ggml_cuda_get_device_count(); + int count = ggml_backend_cuda_get_device_count(); for (int i = 0; i < count; i++) { char buf[128]; - ggml_cuda_get_device_description(i, buf, sizeof(buf)); + ggml_backend_cuda_get_device_description(i, buf, sizeof(buf)); id += buf; if (i < count - 1) { id += "/"; |