llama : remove LLAMA_MAX_DEVICES and LLAMA_SUPPORTS_GPU_OFFLOAD (#5240)

* llama : remove LLAMA_MAX_DEVICES from llama.h

  ggml-ci

* Update llama.cpp

  Co-authored-by: slaren <slarengh@gmail.com>

* server : remove LLAMA_MAX_DEVICES

  ggml-ci

* llama : remove LLAMA_SUPPORTS_GPU_OFFLOAD

  ggml-ci

* train : remove LLAMA_SUPPORTS_GPU_OFFLOAD

* readme : add deprecation notice

* readme : change deprecation notice to "remove" and fix url

* llama : remove gpu includes from llama.h

  ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
author: Georgi Gerganov <ggerganov@gmail.com> 2024-01-31 17:30:17 +0200
committer: GitHub <noreply@github.com> 2024-01-31 17:30:17 +0200
commit: 5cb04dbc16d1da38c8fdcc0111b40e67d00dd1c3 (patch)
tree: 3ef8dc640d5c08466309c09a8ac2963bb760af06 /llama.h
parent: efb7bdbbd061d087c788598b97992c653f992ddd (diff)
1 files changed, 10 insertions, 19 deletions
diff --git a/llama.h b/llama.h
index 17d43d03..9a60e9bf 100644
--- a/llama.h
+++ b/llama.h
@@ -3,15 +3,7 @@
 
 #include "ggml.h"
 #include "ggml-backend.h"
-#ifdef GGML_USE_CUBLAS
-#include "ggml-cuda.h"
-#define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
-#elif defined(GGML_USE_SYCL)
-#include "ggml-sycl.h"
-#define LLAMA_MAX_DEVICES GGML_SYCL_MAX_DEVICES
-#else
-#define LLAMA_MAX_DEVICES 1
-#endif // GGML_USE_CUBLAS
+
 #include <stddef.h>
 #include <stdint.h>
 #include <stdio.h>
@@ -49,12 +41,6 @@
 #define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
 #define LLAMA_SESSION_VERSION 4
 
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
-    defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
-// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
-#define LLAMA_SUPPORTS_GPU_OFFLOAD
-#endif
-
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -201,7 +187,7 @@ extern "C" {
         // LLAMA_SPLIT_LAYER: ignored
         int32_t main_gpu;
 
-        // proportion of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES
+        // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
         const float * tensor_split;
 
         // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
@@ -338,9 +324,14 @@ extern "C" {
 
     LLAMA_API int64_t llama_time_us(void);
 
-    LLAMA_API int32_t  llama_max_devices(void);
-    LLAMA_API bool llama_mmap_supported (void);
-    LLAMA_API bool llama_mlock_supported(void);
+    LLAMA_API size_t llama_max_devices(void);
+
+    LLAMA_API bool llama_supports_mmap       (void);
+    LLAMA_API bool llama_supports_mlock      (void);
+    LLAMA_API bool llama_supports_gpu_offload(void);
+
+    LLAMA_API DEPRECATED(bool llama_mmap_supported (void), "use llama_supports_mmap() instead");
+    LLAMA_API DEPRECATED(bool llama_mlock_supported(void), "use llama_supports_mlock() instead");
 
     LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
author	Georgi Gerganov <ggerganov@gmail.com>	2024-01-31 17:30:17 +0200
committer	GitHub <noreply@github.com>	2024-01-31 17:30:17 +0200
commit	5cb04dbc16d1da38c8fdcc0111b40e67d00dd1c3 (patch)
tree	3ef8dc640d5c08466309c09a8ac2963bb760af06 /llama.h
parent	efb7bdbbd061d087c788598b97992c653f992ddd (diff)