diff options
author | Johannes Gäßler <johannesg@5d6.de> | 2024-06-14 18:41:49 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-06-14 18:41:49 +0200 |
commit | 76d66ee0be91e2bec93206e821ee1db8d023cff5 (patch) | |
tree | 9bf121667539f91b90b54b237e54bdbd9a16161c /ggml-cuda.cu | |
parent | 66ef1ceedf983773c8ceb4d925285d41d4e50e2a (diff) |
CUDA: faster q2_K, q3_K MMQ + int8 tensor cores (#7921)
* CUDA: faster q2_K, q3_K MMQ + int8 tensor cores
* try CI fix
* try CI fix
* try CI fix
* fix data race
* revert q2_K precision related changes
Diffstat (limited to 'ggml-cuda.cu')
-rw-r--r-- | ggml-cuda.cu | 6 |
1 files changed, 4 insertions, 2 deletions
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 64d3b674..593fa4cd 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -188,13 +188,15 @@ static ggml_cuda_device_info ggml_cuda_init() {
         info.default_tensor_split[id] = total_vram;
         total_vram += prop.totalGlobalMem;
 
+        info.devices[id].nsm = prop.multiProcessorCount;
+        info.devices[id].smpb = prop.sharedMemPerBlock;
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+        info.devices[id].smpbo = prop.sharedMemPerBlock;
         info.devices[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
 #else
+        info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
         info.devices[id].cc = 100*prop.major + 10*prop.minor;
 #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-        info.devices[id].smpb = prop.sharedMemPerBlock;
-        info.devices[id].nsm = prop.multiProcessorCount;
     }
 
     for (int id = 0; id < info.device_count; ++id) {