Diffstat (limited to 'ggml-cuda.cu')
-rw-r--r--  ggml-cuda.cu  175
1 file changed, 146 insertions(+), 29 deletions(-)
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index b8834ed0..d1b5e52b 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -72,6 +72,7 @@
#define cudaEventCreateWithFlags hipEventCreateWithFlags
#define cudaEventDisableTiming hipEventDisableTiming
#define cudaEventRecord hipEventRecord
+#define cudaEventSynchronize hipEventSynchronize
#define cudaEvent_t hipEvent_t
#define cudaEventDestroy hipEventDestroy
#define cudaFree hipFree
@@ -81,6 +82,7 @@
#define cudaGetDeviceProperties hipGetDeviceProperties
#define cudaGetErrorString hipGetErrorString
#define cudaGetLastError hipGetLastError
+#define cudaLaunchHostFunc hipLaunchHostFunc
#ifdef GGML_HIP_UMA
#define cudaMalloc hipMallocManaged
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size)
@@ -104,6 +106,7 @@
#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
#define cudaStreamFireAndForget hipStreamFireAndForget
#define cudaStreamNonBlocking hipStreamNonBlocking
+#define cudaStreamPerThread hipStreamPerThread
#define cudaStreamSynchronize hipStreamSynchronize
#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
#define cudaStream_t hipStream_t
@@ -10641,8 +10644,20 @@ GGML_CALL void ggml_cuda_get_device_description(int device, char * description,
#define UNUSED GGML_UNUSED
struct ggml_backend_cuda_context {
+ explicit ggml_backend_cuda_context(int device) :
+ device(device),
+ name(GGML_CUDA_NAME + std::to_string(device)) {
+ }
+
+ ~ggml_backend_cuda_context() {
+ if (copy_event != nullptr) {
+ CUDA_CHECK(cudaEventDestroy(copy_event));
+ }
+ }
+
int device;
std::string name;
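+ // event for synchronizing copies across backends; created lazily, destroyed with the context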
+ cudaEvent_t copy_event = nullptr;
};
// cuda buffer
@@ -10732,9 +10747,8 @@ GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
ggml_cuda_set_device(ctx->device);
- CUDA_CHECK(cudaDeviceSynchronize());
- CUDA_CHECK(cudaMemcpy((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice));
- CUDA_CHECK(cudaDeviceSynchronize());
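+ // queue the copy on the per-thread default stream and wait for that stream only,
+ // rather than stalling every stream on the device as cudaDeviceSynchronize did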
+ CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cudaStreamPerThread));
+ CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
}
GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
@@ -10743,26 +10757,25 @@ GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
ggml_cuda_set_device(ctx->device);
- CUDA_CHECK(cudaDeviceSynchronize());
- CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost));
- CUDA_CHECK(cudaDeviceSynchronize());
+ CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, cudaStreamPerThread));
+ CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
}
GGML_CALL static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
if (ggml_backend_buffer_is_cuda(src->buffer)) {
ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context;
- ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
-
- ggml_cuda_set_device(src_ctx->device);
- CUDA_CHECK(cudaDeviceSynchronize());
- ggml_cuda_set_device(dst_ctx->device);
- CUDA_CHECK(cudaDeviceSynchronize());
- CUDA_CHECK(cudaMemcpy((char *)dst->data, (const char *)src->data, ggml_nbytes(src), cudaMemcpyDeviceToDevice));
- CUDA_CHECK(cudaDeviceSynchronize());
-
+ ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)dst->buffer->context;
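+ // a copy within one device is an ordinary device-to-device memcpy; across devices it must go through the peer copy API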
+ if (src_ctx->device == dst_ctx->device) {
+ CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(src), cudaMemcpyDeviceToDevice, cudaStreamPerThread));
+ } else {
+ CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, dst_ctx->device, src->data, src_ctx->device, ggml_nbytes(src), cudaStreamPerThread));
+ }
+ CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
return true;
}
return false;
+
+ UNUSED(buffer);
}
GGML_CALL static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
@@ -11007,7 +11020,11 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buf
}
const char * buf_host = (const char *)data + offset_split;
- CUDA_CHECK(cudaMemcpy(extra->data_device[id], buf_host, original_size, cudaMemcpyHostToDevice));
+ CUDA_CHECK(cudaMemcpyAsync(extra->data_device[id], buf_host, original_size, cudaMemcpyHostToDevice, cudaStreamPerThread));
+ }
+
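+ // the copies above were only queued; block until all of them have completed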
+ for (int id = 0; id < g_device_count; ++id) {
+ CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
}
}
@@ -11041,7 +11058,11 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buf
}
char * buf_host = (char *)data + offset_split;
- CUDA_CHECK(cudaMemcpy(buf_host, extra->data_device[id], original_size, cudaMemcpyDeviceToHost));
+ CUDA_CHECK(cudaMemcpyAsync(buf_host, extra->data_device[id], original_size, cudaMemcpyDeviceToHost, cudaStreamPerThread));
+ }
+
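+ // as in set_tensor, wait for the queued device-to-host copies before returning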
+ for (int id = 0; id < g_device_count; ++id) {
+ CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
}
}
@@ -11220,6 +11241,10 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
return &ggml_backend_cuda_buffer_type_host;
}
+//static bool ggml_backend_buffer_is_cuda_host(ggml_backend_buffer_t buffer) {
+// return buffer->buft->iface.get_name == ggml_backend_cuda_host_buffer_type_name;
+//}
+
// backend
GGML_CALL static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
@@ -11243,8 +11268,9 @@ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer
GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
- GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
+ ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
+ GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0]));
@@ -11252,22 +11278,61 @@ GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend,
GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
- GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
+ ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
+ GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
}
-GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
- ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
+GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) {
+ GGML_ASSERT(ggml_backend_is_cuda(backend_src) || ggml_backend_is_cuda(backend_dst));
- if (dst->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && ggml_backend_buffer_is_cuda(src->buffer)) {
- CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, g_cudaStreams[cuda_ctx->device][0]));
- return true;
+ ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
+ ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;
+
+ if (!ggml_backend_buffer_is_cuda(src->buffer)) {
+ return false;
}
- return false;
+ if (!ggml_backend_buffer_is_cuda(dst->buffer)) {
+ return false;
+ }
+
+ // device -> device
+ ggml_backend_cuda_context * cuda_ctx_src = (ggml_backend_cuda_context *)backend_src->context;
+ ggml_backend_cuda_context * cuda_ctx_dst = (ggml_backend_cuda_context *)backend_dst->context;
+
+ if (backend_src != backend_dst) {
+ ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *)buf_src->context;
+ ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *)buf_dst->context;
+
+ GGML_ASSERT(cuda_ctx_src->device == buf_ctx_src->device);
+ GGML_ASSERT(cuda_ctx_dst->device == buf_ctx_dst->device);
+
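+ // create the copy event once per context; it is reused for every later cross-backend copy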
+ if (!cuda_ctx_src->copy_event) {
+ ggml_cuda_set_device(cuda_ctx_src->device);
+ CUDA_CHECK(cudaEventCreateWithFlags(&cuda_ctx_src->copy_event, cudaEventDisableTiming));
+ }
+
+ // copy on src stream
+ if (cuda_ctx_src->device == cuda_ctx_dst->device) {
+ CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, g_cudaStreams[cuda_ctx_dst->device][0]));
+ } else {
+ CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), g_cudaStreams[cuda_ctx_src->device][0]));
+ }
+
+ // record event on src stream
+ CUDA_CHECK(cudaEventRecord(cuda_ctx_src->copy_event, g_cudaStreams[cuda_ctx_src->device][0]));
+
+ // wait on dst stream for the copy to complete
+ CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[cuda_ctx_dst->device][0], cuda_ctx_src->copy_event, 0));
+ } else {
+ // src and dst are on the same backend
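+ // the copy is queued on that backend's stream, so it is already ordered with the rest of its work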
+ CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, g_cudaStreams[cuda_ctx_dst->device][0]));
+ }
+ return true;
}
GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
@@ -11444,6 +11509,52 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
UNUSED(backend);
}
+static ggml_backend_event_t ggml_backend_cuda_event_new(ggml_backend_t backend) {
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
+
+ ggml_cuda_set_device(cuda_ctx->device);
+
+ cudaEvent_t event;
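+ // cudaEventDisableTiming skips timestamp recording, making record/wait cheaper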
+ CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
+
+ return new ggml_backend_event {
+ /* .backend = */ backend,
+ /* .context = */ event,
+ };
+}
+
+static void ggml_backend_cuda_event_free(ggml_backend_event_t event) {
+ CUDA_CHECK(cudaEventDestroy((cudaEvent_t)event->context));
+
+ delete event;
+}
+
+static void ggml_backend_cuda_event_record(ggml_backend_event_t event) {
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)event->backend->context;
+
+ CUDA_CHECK(cudaEventRecord((cudaEvent_t)event->context, g_cudaStreams[cuda_ctx->device][0]));
+}
+
+static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
+
+ if (ggml_backend_is_cuda(event->backend)) {
+ CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[cuda_ctx->device][0], (cudaEvent_t)event->context, 0));
+ } else {
+ // untested
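+ // fall back to a host callback: work queued after it on this stream will not run
+ // until the callback returns, i.e. until the other backend's event is signalled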
+ auto wait_fn = [](void * user_data) {
+ ggml_backend_event_t event = (ggml_backend_event_t)user_data;
+ ggml_backend_event_synchronize(event);
+ };
+
+ CUDA_CHECK(cudaLaunchHostFunc(g_cudaStreams[cuda_ctx->device][0], wait_fn, event));
+ }
+}
+
+static void ggml_backend_cuda_event_synchronize(ggml_backend_event_t event) {
+ CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context));
+}
+
static ggml_backend_i ggml_backend_cuda_interface = {
/* .get_name = */ ggml_backend_cuda_name,
/* .free = */ ggml_backend_cuda_free,
@@ -11457,6 +11568,11 @@ static ggml_backend_i ggml_backend_cuda_interface = {
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_cuda_graph_compute,
/* .supports_op = */ ggml_backend_cuda_supports_op,
+ /* .event_new = */ ggml_backend_cuda_event_new,
+ /* .event_free = */ ggml_backend_cuda_event_free,
+ /* .event_record = */ ggml_backend_cuda_event_record,
+ /* .event_wait = */ ggml_backend_cuda_event_wait,
+ /* .event_synchronize = */ ggml_backend_cuda_event_synchronize,
};
static ggml_guid_t ggml_backend_cuda_guid() {
@@ -11475,10 +11591,11 @@ GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
// not strictly necessary, but it may reduce the overhead of the first graph_compute
ggml_cuda_set_main_device(device);
- ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context {
- /* .device = */ device,
- /* .name = */ GGML_CUDA_NAME + std::to_string(device),
- };
+ ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context(device);
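+ // defensive: operator new throws std::bad_alloc on failure rather than returning nullptr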
+ if (ctx == nullptr) {
+ fprintf(stderr, "%s: error: failed to allocate context\n", __func__);
+ return nullptr;
+ }
ggml_backend_t cuda_backend = new ggml_backend {
/* .guid = */ ggml_backend_cuda_guid(),