Diffstat (limited to 'common')
-rw-r--r--  common/CMakeLists.txt                  |    7
-rw-r--r--  common/cmake/build-info-gen-cpp.cmake  |   24
-rw-r--r--  common/common.cpp                      | 1190
-rw-r--r--  common/common.h                        |   81
-rw-r--r--  common/json-schema-to-grammar.cpp      |  351
-rw-r--r--  common/log.h                           |    2
-rw-r--r--  common/ngram-cache.h                   |   13
-rw-r--r--  common/sampling.cpp                    |   29
8 files changed, 929 insertions, 768 deletions
diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt
index 171530c9..761971d6 100644
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -1,5 +1,6 @@
# common
+find_package(Threads REQUIRED)
# Build info header
#
@@ -36,7 +37,7 @@ add_custom_command(
COMMENT "Generating build details from Git"
COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION}
-DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME}
- -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/../scripts/gen-build-info-cpp.cmake"
+ -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info-gen-cpp.cmake"
WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.."
DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX}
VERBATIM
@@ -83,5 +84,5 @@ if (LLAMA_CURL)
endif ()
target_include_directories(${TARGET} PUBLIC .)
-target_compile_features(${TARGET} PUBLIC cxx_std_11)
-target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
+target_compile_features (${TARGET} PUBLIC cxx_std_11)
+target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
diff --git a/common/cmake/build-info-gen-cpp.cmake b/common/cmake/build-info-gen-cpp.cmake
new file mode 100644
index 00000000..fbc92b52
--- /dev/null
+++ b/common/cmake/build-info-gen-cpp.cmake
@@ -0,0 +1,24 @@
+include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
+
+set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp.in")
+set(OUTPUT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp")
+
+# Only write the build info if it changed
+if(EXISTS ${OUTPUT_FILE})
+ file(READ ${OUTPUT_FILE} CONTENTS)
+ string(REGEX MATCH "LLAMA_COMMIT = \"([^\"]*)\";" _ ${CONTENTS})
+ set(OLD_COMMIT ${CMAKE_MATCH_1})
+ string(REGEX MATCH "LLAMA_COMPILER = \"([^\"]*)\";" _ ${CONTENTS})
+ set(OLD_COMPILER ${CMAKE_MATCH_1})
+ string(REGEX MATCH "LLAMA_BUILD_TARGET = \"([^\"]*)\";" _ ${CONTENTS})
+ set(OLD_TARGET ${CMAKE_MATCH_1})
+ if (
+ NOT OLD_COMMIT STREQUAL BUILD_COMMIT OR
+ NOT OLD_COMPILER STREQUAL BUILD_COMPILER OR
+ NOT OLD_TARGET STREQUAL BUILD_TARGET
+ )
+ configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
+ endif()
+else()
+ configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
+endif()
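For context, the three regexes above pull the previously generated values back out of build-info.cpp so that configure_file only rewrites it when something changed. A minimal sketch of what the configured output might look like, assuming the template simply defines one string constant per value (the template itself is not part of this diff, and the values shown are illustrative):

// build-info.cpp -- illustrative only, shape assumed from the regexes above
char const * LLAMA_COMMIT       = "4d1d88c6";          // matched by the LLAMA_COMMIT regex
char const * LLAMA_COMPILER     = "GNU 13.2.0";        // matched by the LLAMA_COMPILER regex
char const * LLAMA_BUILD_TARGET = "x86_64-linux-gnu";  // matched by the LLAMA_BUILD_TARGET regex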
diff --git a/common/common.cpp b/common/common.cpp
index 8eb23ade..4d1d88c6 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1,3 +1,7 @@
+#if defined(_MSC_VER)
+#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
+#endif
+
#include "common.h"
// Change JSON_ASSERT from assert() to GGML_ASSERT:
#define JSON_ASSERT GGML_ASSERT
@@ -190,6 +194,12 @@ int32_t cpu_get_num_math() {
// CLI argument parsing
//
+void gpt_params_handle_hf_token(gpt_params & params) {
+ if (params.hf_token.empty() && std::getenv("HF_TOKEN")) {
+ params.hf_token = std::getenv("HF_TOKEN");
+ }
+}
+
void gpt_params_handle_model_default(gpt_params & params) {
if (!params.hf_repo.empty()) {
// short-hand to avoid specifying --hf-file -> default it to --model
@@ -237,6 +247,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
gpt_params_handle_model_default(params);
+ gpt_params_handle_hf_token(params);
+
if (params.escape) {
string_process_escapes(params.prompt);
string_process_escapes(params.input_prefix);
@@ -273,26 +285,22 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
return true;
}
+#define CHECK_ARG if (++i >= argc) { invalid_param = true; return true; }
+
bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) {
const char split_delim = ',';
llama_sampling_params & sparams = params.sparams;
if (arg == "-s" || arg == "--seed") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
// TODO: this is temporary, in the future the sampling state will be moved fully to llama_sampling_context.
params.seed = std::stoul(argv[i]);
sparams.seed = std::stoul(argv[i]);
return true;
}
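Semantics are unchanged by the macro: after preprocessing, each converted branch still performs the same bounds check it spelled out by hand before. The seed branch above, for example, expands back to roughly:

if (arg == "-s" || arg == "--seed") {
    if (++i >= argc) { invalid_param = true; return true; }  // expansion of CHECK_ARG
    params.seed  = std::stoul(argv[i]);
    sparams.seed = std::stoul(argv[i]);
    return true;
}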
if (arg == "-t" || arg == "--threads") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.n_threads = std::stoi(argv[i]);
if (params.n_threads <= 0) {
params.n_threads = std::thread::hardware_concurrency();
@@ -300,10 +308,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "-tb" || arg == "--threads-batch") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.n_threads_batch = std::stoi(argv[i]);
if (params.n_threads_batch <= 0) {
params.n_threads_batch = std::thread::hardware_concurrency();
@@ -311,10 +316,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "-td" || arg == "--threads-draft") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.n_threads_draft = std::stoi(argv[i]);
if (params.n_threads_draft <= 0) {
params.n_threads_draft = std::thread::hardware_concurrency();
@@ -322,10 +324,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "-tbd" || arg == "--threads-batch-draft") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.n_threads_batch_draft = std::stoi(argv[i]);
if (params.n_threads_batch_draft <= 0) {
params.n_threads_batch_draft = std::thread::hardware_concurrency();
@@ -333,10 +332,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "-p" || arg == "--prompt") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.prompt = argv[i];
return true;
}
@@ -349,10 +345,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "--prompt-cache") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.path_prompt_cache = argv[i];
return true;
}
@@ -365,10 +358,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "-bf" || arg == "--binary-file") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
std::ifstream file(argv[i], std::ios::binary);
if (!file) {
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
@@ -384,10 +374,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "-f" || arg == "--file") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
std::ifstream file(argv[i]);
if (!file) {
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
@@ -403,10 +390,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "--in-file") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
std::ifstream file(argv[i]);
if (!file) {
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
@@ -417,66 +401,42 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "-n" || arg == "--predict" || arg == "--n-predict") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.n_predict = std::stoi(argv[i]);
return true;
}
if (arg == "--top-k") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
sparams.top_k = std::stoi(argv[i]);
return true;
}
if (arg == "-c" || arg == "--ctx-size") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.n_ctx = std::stoi(argv[i]);
return true;
}
if (arg == "--grp-attn-n" || arg == "-gan") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.grp_attn_n = std::stoi(argv[i]);
return true;
}
if (arg == "--grp-attn-w" || arg == "-gaw") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.grp_attn_w = std::stoi(argv[i]);
return true;
}
if (arg == "--rope-freq-base") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.rope_freq_base = std::stof(argv[i]);
return true;
}
if (arg == "--rope-freq-scale") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.rope_freq_scale = std::stof(argv[i]);
return true;
}
if (arg == "--rope-scaling") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
std::string value(argv[i]);
/**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; }
else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; }
@@ -485,58 +445,37 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "--rope-scale") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.rope_freq_scale = 1.0f / std::stof(argv[i]);
return true;
}
if (arg == "--yarn-orig-ctx") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.yarn_orig_ctx = std::stoi(argv[i]);
return true;
}
if (arg == "--yarn-ext-factor") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.yarn_ext_factor = std::stof(argv[i]);
return true;
}
if (arg == "--yarn-attn-factor") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.yarn_attn_factor = std::stof(argv[i]);
return true;
}
if (arg == "--yarn-beta-fast") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.yarn_beta_fast = std::stof(argv[i]);
return true;
}
if (arg == "--yarn-beta-slow") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.yarn_beta_slow = std::stof(argv[i]);
return true;
}
if (arg == "--pooling") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
std::string value(argv[i]);
/**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; }
else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; }
@@ -545,158 +484,109 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
else { invalid_param = true; }
return true;
}
+ if (arg == "--attention") {
+ CHECK_ARG
+ std::string value(argv[i]);
+ /**/ if (value == "causal") { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; }
+ else if (value == "non-causal") { params.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; }
+ else { invalid_param = true; }
+ return true;
+ }
if (arg == "--defrag-thold" || arg == "-dt") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.defrag_thold = std::stof(argv[i]);
return true;
}
if (arg == "--samplers") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
const auto sampler_names = string_split(argv[i], ';');
sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, true);
return true;
}
if (arg == "--sampling-seq") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
sparams.samplers_sequence = llama_sampling_types_from_chars(argv[i]);
return true;
}
if (arg == "--top-p") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
sparams.top_p = std::stof(argv[i]);
return true;
}
if (arg == "--min-p") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
sparams.min_p = std::stof(argv[i]);
return true;
}
if (arg == "--temp") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
sparams.temp = std::stof(argv[i]);
sparams.temp = std::max(sparams.temp, 0.0f);
return true;
}
if (arg == "--tfs") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
sparams.tfs_z = std::stof(argv[i]);
return true;
}
if (arg == "--typical") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
sparams.typical_p = std::stof(argv[i]);
return true;
}
if (arg == "--repeat-last-n") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
sparams.penalty_last_n = std::stoi(argv[i]);
sparams.n_prev = std::max(sparams.n_prev, sparams.penalty_last_n);
return true;
}
if (arg == "--repeat-penalty") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
sparams.penalty_repeat = std::stof(argv[i]);
return true;
}
if (arg == "--frequency-penalty") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
sparams.penalty_freq = std::stof(argv[i]);
return true;
}
if (arg == "--presence-penalty") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
sparams.penalty_present = std::stof(argv[i]);
return true;
}
if (arg == "--dynatemp-range") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
sparams.dynatemp_range = std::stof(argv[i]);
return true;
}
if (arg == "--dynatemp-exp") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
sparams.dynatemp_exponent = std::stof(argv[i]);
return true;
}
if (arg == "--mirostat") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
sparams.mirostat = std::stoi(argv[i]);
return true;
}
if (arg == "--mirostat-lr") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
sparams.mirostat_eta = std::stof(argv[i]);
return true;
}
if (arg == "--mirostat-ent") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
sparams.mirostat_tau = std::stof(argv[i]);
return true;
}
if (arg == "--cfg-negative-prompt") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
sparams.cfg_negative_prompt = argv[i];
return true;
}
if (arg == "--cfg-negative-prompt-file") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
std::ifstream file(argv[i]);
if (!file) {
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
@@ -710,203 +600,126 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "--cfg-scale") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
sparams.cfg_scale = std::stof(argv[i]);
return true;
}
if (arg == "-b" || arg == "--batch-size") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.n_batch = std::stoi(argv[i]);
return true;
}
if (arg == "-ub" || arg == "--ubatch-size") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.n_ubatch = std::stoi(argv[i]);
return true;
}
if (arg == "--keep") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.n_keep = std::stoi(argv[i]);
return true;
}
if (arg == "--draft") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.n_draft = std::stoi(argv[i]);
return true;
}
if (arg == "--chunks") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.n_chunks = std::stoi(argv[i]);
return true;
}
if (arg == "-np" || arg == "--parallel") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.n_parallel = std::stoi(argv[i]);
return true;
}
if (arg == "-ns" || arg == "--sequences") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.n_sequences = std::stoi(argv[i]);
return true;
}
if (arg == "--p-split" || arg == "-ps") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.p_split = std::stof(argv[i]);
return true;
}
if (arg == "-m" || arg == "--model") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.model = argv[i];
return true;
}
if (arg == "-md" || arg == "--model-draft") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.model_draft = argv[i];
return true;
}
if (arg == "-a" || arg == "--alias") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.model_alias = argv[i];
return true;
}
if (arg == "-mu" || arg == "--model-url") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.model_url = argv[i];
return true;
}
- if (arg == "-hfr" || arg == "--hf-repo") {
+ if (arg == "-hft" || arg == "--hf-token") {
if (++i >= argc) {
- invalid_param = true;
- return true;
+ invalid_param = true;
+ return true;
}
+ params.hf_token = argv[i];
+ return true;
+ }
+ if (arg == "-hfr" || arg == "--hf-repo") {
+ CHECK_ARG
params.hf_repo = argv[i];
return true;
}
if (arg == "-hff" || arg == "--hf-file") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.hf_file = argv[i];
return true;
}
if (arg == "--lora") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.lora_adapter.emplace_back(argv[i], 1.0f);
- params.use_mmap = false;
return true;
}
if (arg == "--lora-scaled") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
const char* lora_adapter = argv[i];
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
- params.use_mmap = false;
- return true;
- }
- if (arg == "--lora-base") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
- params.lora_base = argv[i];
return true;
}
if (arg == "--control-vector") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.control_vectors.push_back({ 1.0f, argv[i], });
return true;
}
if (arg == "--control-vector-scaled") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
const char* fname = argv[i];
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.control_vectors.push_back({ std::stof(argv[i]), fname, });
return true;
}
if (arg == "--control-vector-layer-range") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.control_vector_layer_start = std::stoi(argv[i]);
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.control_vector_layer_end = std::stoi(argv[i]);
return true;
}
if (arg == "--mmproj") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.mmproj = argv[i];
return true;
}
if (arg == "--image") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.image.emplace_back(argv[i]);
return true;
}
@@ -922,6 +735,21 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.embedding = true;
return true;
}
+ if (arg == "--embd-normalize") {
+ CHECK_ARG
+ params.embd_normalize = std::stoi(argv[i]);
+ return true;
+ }
+ if (arg == "--embd-output-format") {
+ CHECK_ARG
+ params.embd_out = argv[i];
+ return true;
+ }
+ if (arg == "--embd-separator") {
+ CHECK_ARG
+ params.embd_sep = argv[i];
+ return true;
+ }
if (arg == "-if" || arg == "--interactive-first") {
params.interactive_first = true;
return true;
@@ -950,7 +778,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.cache_type_v = argv[++i];
return true;
}
- if (arg == "--multiline-input") {
+ if (arg == "-mli" || arg == "--multiline-input") {
params.multiline_input = true;
return true;
}
@@ -962,6 +790,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.cont_batching = true;
return true;
}
+ if (arg == "-nocb" || arg == "--no-cont-batching") {
+ params.cont_batching = false;
+ return true;
+ }
if (arg == "-fa" || arg == "--flash-attn") {
params.flash_attn = true;
return true;
@@ -975,10 +807,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "-ngl" || arg == "--gpu-layers" || arg == "--n-gpu-layers") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.n_gpu_layers = std::stoi(argv[i]);
if (!llama_supports_gpu_offload()) {
fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n");
@@ -987,10 +816,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "-ngld" || arg == "--gpu-layers-draft" || arg == "--gpu-layers-draft") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.n_gpu_layers_draft = std::stoi(argv[i]);
if (!llama_supports_gpu_offload()) {
fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n");
@@ -999,10 +825,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "--main-gpu" || arg == "-mg") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.main_gpu = std::stoi(argv[i]);
#ifndef GGML_USE_CUDA_SYCL_VULKAN
fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the main GPU has no effect.\n");
@@ -1010,10 +833,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "--split-mode" || arg == "-sm") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
std::string arg_next = argv[i];
if (arg_next == "none") {
params.split_mode = LLAMA_SPLIT_MODE_NONE;
@@ -1038,10 +858,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "--tensor-split" || arg == "-ts") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
std::string arg_next = argv[i];
// split string by , and /
@@ -1066,10 +883,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "--rpc") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.rpc_servers = argv[i];
return true;
}
@@ -1078,10 +892,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "--numa") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
std::string value(argv[i]);
/**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
@@ -1094,10 +905,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "--verbosity") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.verbosity = std::stoi(argv[i]);
return true;
}
@@ -1110,18 +918,12 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "-r" || arg == "--reverse-prompt") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.antiprompt.emplace_back(argv[i]);
return true;
}
if (arg == "-ld" || arg == "--logdir") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.logdir = argv[i];
if (params.logdir.back() != DIRECTORY_SEPARATOR) {
@@ -1130,26 +932,17 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "-lcs" || arg == "--lookup-cache-static") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.lookup_cache_static = argv[i];
return true;
}
if (arg == "-lcd" || arg == "--lookup-cache-dynamic") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.lookup_cache_dynamic = argv[i];
return true;
}
if (arg == "--save-all-logits" || arg == "--kl-divergence-base") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.logits_file = argv[i];
return true;
}
@@ -1158,26 +951,17 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "--ppl-stride") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.ppl_stride = std::stoi(argv[i]);
return true;
}
if (arg == "--ppl-output-type") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.ppl_output_type = std::stoi(argv[i]);
return true;
}
if (arg == "-ptc" || arg == "--print-token-count") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.n_print = std::stoi(argv[i]);
return true;
}
@@ -1190,10 +974,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "--hellaswag-tasks") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.hellaswag_tasks = std::stoi(argv[i]);
return true;
}
@@ -1202,10 +983,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "--winogrande-tasks") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.winogrande_tasks = std::stoi(argv[i]);
return true;
}
@@ -1214,10 +992,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "--multiple-choice-tasks") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.multiple_choice_tasks = std::stoi(argv[i]);
return true;
}
@@ -1234,10 +1009,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "-l" || arg == "--logit-bias") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
std::stringstream ss(argv[i]);
llama_token key;
char sign;
@@ -1267,37 +1039,32 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
}
if (arg == "--in-prefix-bos") {
params.input_prefix_bos = true;
+ params.enable_chat_template = false;
return true;
}
if (arg == "--in-prefix") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.input_prefix = argv[i];
+ params.enable_chat_template = false;
return true;
}
if (arg == "--in-suffix") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.input_suffix = argv[i];
+ params.enable_chat_template = false;
+ return true;
+ }
+ if (arg == "--spm-infill") {
+ params.spm_infill = true;
return true;
}
if (arg == "--grammar") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
sparams.grammar = argv[i];
return true;
}
if (arg == "--grammar-file") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
std::ifstream file(argv[i]);
if (!file) {
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
@@ -1312,18 +1079,12 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "-j" || arg == "--json-schema") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
sparams.grammar = json_schema_to_grammar(json::parse(argv[i]));
return true;
}
if (arg == "--override-kv") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
if (!string_parse_kv_override(argv[i], params.kv_overrides)) {
fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
invalid_param = true;
@@ -1332,42 +1093,27 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "--host") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.hostname = argv[i];
return true;
}
if (arg == "--port") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.port = std::stoi(argv[i]);
return true;
}
if (arg == "--path") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.public_path = argv[i];
return true;
}
if (arg == "--api-key") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.api_keys.push_back(argv[i]);
return true;
}
if (arg == "--api-key-file") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
std::ifstream key_file(argv[i]);
if (!key_file) {
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
@@ -1384,43 +1130,28 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "--ssl-key-file") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.ssl_file_key = argv[i];
return true;
}
if (arg == "--ssl-cert-file") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.ssl_file_cert = argv[i];
return true;
}
if (arg == "--timeout" || arg == "-to") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.timeout_read = std::stoi(argv[i]);
params.timeout_write = std::stoi(argv[i]);
return true;
}
if (arg == "--threads-http") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.n_threads_http = std::stoi(argv[i]);
return true;
}
if (arg == "-spf" || arg == "--system-prompt-file") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
std::ifstream file(argv[i]);
if (!file) {
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
@@ -1437,10 +1168,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "--log-format") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
if (std::strcmp(argv[i], "json") == 0) {
params.log_json = true;
} else if (std::strcmp(argv[i], "text") == 0) {
@@ -1460,10 +1188,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "--slot-save-path") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.slot_save_path = argv[i];
// if doesn't end with DIRECTORY_SEPARATOR, add it
if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) {
@@ -1472,10 +1197,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "--chat-template") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
if (!llama_chat_verify_template(argv[i])) {
fprintf(stderr, "error: the supplied chat template is not supported: %s\n", argv[i]);
fprintf(stderr, "note: llama.cpp does not use jinja parser, we only support commonly used templates\n");
@@ -1486,10 +1208,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "--slot-prompt-similarity" || arg == "-sps") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.slot_prompt_similarity = std::stof(argv[i]);
return true;
}
@@ -1498,37 +1217,25 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "-npp") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
auto p = string_split<int>(argv[i], split_delim);
params.n_pp.insert(params.n_pp.end(), p.begin(), p.end());
return true;
}
if (arg == "-ntg") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
auto p = string_split<int>(argv[i], split_delim);
params.n_tg.insert(params.n_tg.end(), p.begin(), p.end());
return true;
}
if (arg == "-npl") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
auto p = string_split<int>(argv[i], split_delim);
params.n_pl.insert(params.n_pl.end(), p.begin(), p.end());
return true;
}
if (arg == "--context-file") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
std::ifstream file(argv[i], std::ios::binary);
if (!file) {
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
@@ -1539,59 +1246,39 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "--chunk-size") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.chunk_size = std::stoi(argv[i]);
return true;
}
if (arg == "--chunk-separator") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.chunk_separator = argv[i];
return true;
}
if (arg == "--junk") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.n_junk = std::stoi(argv[i]);
return true;
}
if (arg == "--pos") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.i_pos = std::stoi(argv[i]);
return true;
}
if (arg == "-o" || arg == "--output" || arg == "--output-file") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.out_file = argv[i];
params.cvector_outfile = argv[i];
+ params.lora_outfile = argv[i];
return true;
}
if (arg == "-ofreq" || arg == "--output-frequency") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.n_out_freq = std::stoi(argv[i]);
return true;
}
if (arg == "--save-frequency") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.n_save_freq = std::stoi(argv[i]);
return true;
}
@@ -1612,62 +1299,39 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "--chunk" || arg == "--from-chunk") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.i_chunk = std::stoi(argv[i]);
return true;
}
// cvector params
- if (arg == "--completions-file") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
- params.cvector_completions_file = argv[i];
- return true;
- }
if (arg == "--positive-file") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.cvector_positive_file = argv[i];
return true;
}
if (arg == "--negative-file") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.cvector_negative_file = argv[i];
return true;
}
- if (arg == "--completions") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
- params.n_completions = std::stoi(argv[i]);
- return true;
- }
if (arg == "--pca-batch") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.n_pca_batch = std::stoi(argv[i]);
return true;
}
if (arg == "--pca-iter") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.n_pca_iterations = std::stoi(argv[i]);
return true;
}
+ if (arg == "--method") {
+ CHECK_ARG
+ std::string value(argv[i]);
+ /**/ if (value == "pca") { params.cvector_dimre_method = DIMRE_METHOD_PCA; }
+ else if (value == "mean") { params.cvector_dimre_method = DIMRE_METHOD_MEAN; }
+ else { invalid_param = true; }
+ return true;
+ }
#ifndef LOG_DISABLE_LOGS
// Parse args for logging parameters
if (log_param_single_parse(argv[i])) {
@@ -1679,10 +1343,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
// We have a matching known parameter requiring an argument,
// now we need to check if there is anything after this argv
// and flag invalid_param or parse it.
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
if (!log_param_pair_parse( /*check_but_dont_parse*/ false, argv[i - 1], argv[i])) {
invalid_param = true;
return true;
@@ -1767,7 +1428,9 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "*", " --keep N", "number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep });
options.push_back({ "*", " --chunks N", "max number of chunks to process (default: %d, -1 = all)", params.n_chunks });
options.push_back({ "*", "-fa, --flash-attn", "enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled" });
- options.push_back({ "*", "-p, --prompt PROMPT", "prompt to start generation with (default: '%s')", params.prompt.c_str() });
+ options.push_back({ "*", "-p, --prompt PROMPT", "prompt to start generation with\n"
+ "in conversation mode, this will be used as system prompt\n"
+ "(default: '%s')", params.prompt.c_str() });
options.push_back({ "*", "-f, --file FNAME", "a file containing the prompt (default: none)" });
options.push_back({ "*", " --in-file FNAME", "an input file (repeat to specify multiple files)" });
options.push_back({ "*", "-bf, --binary-file FNAME", "binary file containing the prompt (default: none)" });
@@ -1782,13 +1445,17 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
"halt generation at PROMPT, return control in interactive mode\n"
"can be specified more than once for multiple prompts" });
options.push_back({ "main", "-sp, --special", "special tokens output enabled (default: %s)", params.special ? "true" : "false" });
- options.push_back({ "main", "-cnv, --conversation", "run in conversation mode (does not print special tokens and suffix/prefix) (default: %s)", params.conversation ? "true" : "false" });
+ options.push_back({ "main", "-cnv, --conversation", "run in conversation mode, does not print special tokens and suffix/prefix\n"
+ "if suffix/prefix are not specified, default chat template will be used\n"
+ "(default: %s)", params.conversation ? "true" : "false" });
options.push_back({ "main infill", "-i, --interactive", "run in interactive mode (default: %s)", params.interactive ? "true" : "false" });
options.push_back({ "main infill", "-if, --interactive-first", "run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false" });
options.push_back({ "main infill", "-mli, --multiline-input", "allows you to write or paste multiple lines without ending each in '\\'" });
options.push_back({ "main infill", " --in-prefix-bos", "prefix BOS to user inputs, preceding the `--in-prefix` string" });
options.push_back({ "main infill", " --in-prefix STRING", "string to prefix user inputs with (default: empty)" });
options.push_back({ "main infill", " --in-suffix STRING", "string to suffix after user inputs with (default: empty)" });
+ options.push_back({ "server infill",
+ " --spm-infill", "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", params.spm_infill ? "enabled" : "disabled" });
options.push_back({ "sampling" });
options.push_back({ "*", " --samplers SAMPLERS", "samplers that will be used for generation in the order, separated by \';\'\n"
@@ -1822,7 +1489,11 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "main", " --cfg-negative-prompt-file FNAME",
"negative prompt file to use for guidance" });
options.push_back({ "main", " --cfg-scale N", "strength of guidance (default: %.1f, 1.0 = disable)", (double)sparams.cfg_scale });
-
+ options.push_back({ "main", " --chat-template JINJA_TEMPLATE",
+ "set custom jinja chat template (default: template taken from model's metadata)\n"
+ "if suffix/prefix are specified, template will be disabled\n"
+ "only commonly used templates are accepted:\n"
+ "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" });
options.push_back({ "grammar" });
options.push_back({ "*", " --grammar GRAMMAR", "BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", sparams.grammar.c_str() });
options.push_back({ "*", " --grammar-file FNAME", "file to read grammar from" });
@@ -1831,8 +1502,10 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
"For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead" });
options.push_back({ "embedding" });
- options.push_back({ "embedding", " --pooling {none,mean,cls}",
+ options.push_back({ "embedding", " --pooling {none,mean,cls,last}",
"pooling type for embeddings, use model default if unspecified" });
+ options.push_back({ "embedding", " --attention {causal,non-causal}",
+ "attention type for embeddings, use model default if unspecified" });
options.push_back({ "context hacking" });
options.push_back({ "*", " --rope-scaling {none,linear,yarn}",
@@ -1871,6 +1544,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "*", "-np, --parallel N", "number of parallel sequences to decode (default: %d)", params.n_parallel });
options.push_back({ "*", "-ns, --sequences N", "number of sequences to decode (default: %d)", params.n_sequences });
options.push_back({ "*", "-cb, --cont-batching", "enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled" });
+ options.push_back({ "*", "-nocb, --no-cont-batching", "disable continuous batching" });
options.push_back({ "multi-modality" });
options.push_back({ "*", " --mmproj FILE", "path to a multimodal projector file for LLaVA. see examples/llava/README.md" });
@@ -1913,12 +1587,13 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "*", " --override-kv KEY=TYPE:VALUE",
"advanced option to override model metadata by key. may be specified multiple times.\n"
"types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false" });
- options.push_back({ "*", " --lora FNAME", "apply LoRA adapter (implies --no-mmap)" });
- options.push_back({ "*", " --lora-scaled FNAME S", "apply LoRA adapter with user defined scaling S (implies --no-mmap)" });
- options.push_back({ "*", " --lora-base FNAME", "optional model to use as a base for the layers modified by the LoRA adapter" });
- options.push_back({ "*", " --control-vector FNAME", "add a control vector" });
+ options.push_back({ "*", " --lora FNAME", "apply LoRA adapter (can be repeated to use multiple adapters)" });
+ options.push_back({ "*", " --lora-scaled FNAME S", "apply LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" });
+ options.push_back({ "*", " --control-vector FNAME", "add a control vector\n"
+ "note: this argument can be repeated to add multiple control vectors" });
options.push_back({ "*", " --control-vector-scaled FNAME SCALE",
- "add a control vector with user defined scaling SCALE" });
+ "add a control vector with user defined scaling SCALE\n"
+ "note: this argument can be repeated to add multiple scaled control vectors" });
options.push_back({ "*", " --control-vector-layer-range START END",
"layer range to apply the control vector(s) to, start and end inclusive" });
options.push_back({ "*", "-m, --model FNAME", "model path (default: models/$filename with filename from --hf-file\n"
@@ -1927,6 +1602,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "*", "-mu, --model-url MODEL_URL", "model download url (default: unused)" });
options.push_back({ "*", "-hfr, --hf-repo REPO", "Hugging Face model repository (default: unused)" });
options.push_back({ "*", "-hff, --hf-file FILE", "Hugging Face model file (default: unused)" });
+ options.push_back({ "*", "-hft, --hf-token TOKEN", "Hugging Face access token (default: value from HF_TOKEN environment variable)" });
options.push_back({ "retrieval" });
options.push_back({ "retrieval", " --context-file FNAME", "file to load context from (repeat to specify multiple files)" });
@@ -1952,6 +1628,11 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "bench", "-ntg n0,n1,...", "number of text generation tokens" });
options.push_back({ "bench", "-npl n0,n1,...", "number of parallel prompts" });
+ options.push_back({ "embedding" });
+ options.push_back({ "embedding", " --embd-normalize", "normalisation for embendings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize });
+ options.push_back({ "embedding", " --embd-output-format", "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix" });
+ options.push_back({ "embedding", " --embd-separator", "separator of embendings (default \\n) for example \"<#sep#>\"" });
+
options.push_back({ "server" });
options.push_back({ "server", " --host HOST", "ip address to listen (default: %s)", params.hostname.c_str() });
options.push_back({ "server", " --port PORT", "port to listen (default: %d)", params.port });
@@ -1994,11 +1675,16 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "cvector", "-o, --output FNAME", "output file (default: '%s')", params.cvector_outfile.c_str() });
options.push_back({ "cvector", " --positive-file FNAME", "positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str() });
options.push_back({ "cvector", " --negative-file FNAME", "negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str() });
- options.push_back({ "cvector", " --completions-file FNAME",
- "completions file (default: '%s')", params.cvector_completions_file.c_str() });
- options.push_back({ "cvector", " --completions N", "number of lines of completions file to use (default: %d)", params.n_completions });
- options.push_back({ "cvector", " --batch-pca N", "batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch });
- options.push_back({ "cvector", " --iter-pca N", "number of iterations used for PCA (default: %d)", params.n_pca_iterations });
+ options.push_back({ "cvector", " --pca-batch N", "batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch });
+ options.push_back({ "cvector", " --pca-iter N", "number of iterations used for PCA (default: %d)", params.n_pca_iterations });
+ options.push_back({ "cvector", " --method {pca,mean}", "dimensionality reduction method to be used (default: pca)" });
+
+ options.push_back({ "export-lora" });
+ options.push_back({ "export-lora", "-m, --model", "model path from which to load base model (default '%s')", params.model.c_str() });
+ options.push_back({ "export-lora", " --lora FNAME", "path to LoRA adapter (can be repeated to use multiple adapters)" });
+ options.push_back({ "export-lora", " --lora-scaled FNAME S", "path to LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" });
+ options.push_back({ "*", "-t, --threads N", "number of threads to use during computation (default: %d)", params.n_threads });
+ options.push_back({ "export-lora", "-o, --output FNAME", "output file (default: '%s')", params.lora_outfile.c_str() });
printf("usage: %s [options]\n", argv[0]);
@@ -2363,9 +2049,9 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
llama_model * model = nullptr;
if (!params.hf_repo.empty() && !params.hf_file.empty()) {
- model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), mparams);
+ model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
} else if (!params.model_url.empty()) {
- model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), mparams);
+ model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
} else {
model = llama_load_model_from_file(params.model.c_str(), mparams);
}
@@ -2411,19 +2097,14 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
float lora_scale = std::get<1>(params.lora_adapter[i]);
- int err = llama_model_apply_lora_from_file(model,
- lora_adapter.c_str(),
- lora_scale,
- ((i > 0) || params.lora_base.empty())
- ? NULL
- : params.lora_base.c_str(),
- params.n_threads);
- if (err != 0) {
+ auto adapter = llama_lora_adapter_init(model, lora_adapter.c_str());
+ if (adapter == nullptr) {
fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
llama_free(lctx);
llama_free_model(model);
return std::make_tuple(nullptr, nullptr);
}
+ llama_lora_adapter_set(lctx, adapter, lora_scale);
}
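The per-adapter error handling stays the same; only the call pattern changes from a single apply call to init-then-set. A rough standalone sketch under the new API (the file name and scale below are placeholders, not taken from this diff):

// sketch: attach one LoRA adapter to an existing model/context pair
auto adapter = llama_lora_adapter_init(model, "adapter.gguf");   // "adapter.gguf" is illustrative
if (adapter == nullptr) {
    fprintf(stderr, "failed to load lora adapter\n");
} else {
    llama_lora_adapter_set(lctx, adapter, 0.5f);                 // 0.5f: user-defined scale
}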
if (params.ignore_eos) {
@@ -2433,7 +2114,24 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
if (params.warmup) {
LOG("warming up the model with an empty run\n");
- std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };
+ std::vector<llama_token> tmp;
+ llama_token bos = llama_token_bos(model);
+ llama_token eos = llama_token_eos(model);
+ // some models (e.g. T5) don't have a BOS token
+ if (bos != -1) {
+ tmp.push_back(bos);
+ }
+ tmp.push_back(eos);
+
+ if (llama_model_has_encoder(model)) {
+ llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size(), 0, 0));
+ llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
+ if (decoder_start_token_id == -1) {
+ decoder_start_token_id = bos;
+ }
+ tmp.clear();
+ tmp.push_back(decoder_start_token_id);
+ }
llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
llama_kv_cache_clear(lctx);
llama_synchronize(lctx);
@@ -2516,6 +2214,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
cparams.yarn_beta_slow = params.yarn_beta_slow;
cparams.yarn_orig_ctx = params.yarn_orig_ctx;
cparams.pooling_type = params.pooling_type;
+ cparams.attention_type = params.attention_type;
cparams.defrag_thold = params.defrag_thold;
cparams.cb_eval = params.cb_eval;
cparams.cb_eval_user_data = params.cb_eval_user_data;
@@ -2535,7 +2234,7 @@ static bool starts_with(const std::string & str, const std::string & prefix) {
return str.rfind(prefix, 0) == 0;
}
-static bool llama_download_file(const std::string & url, const std::string & path) {
+static bool llama_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
// Initialize libcurl
std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
@@ -2550,6 +2249,15 @@ static bool llama_download_file(const std::string & url, const std::string & pat
curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
+ // Check if hf-token or bearer-token was specified
+ if (!hf_token.empty()) {
+ std::string auth_header = "Authorization: Bearer ";
+ auth_header += hf_token.c_str();
+ struct curl_slist *http_headers = NULL;
+ http_headers = curl_slist_append(http_headers, auth_header.c_str());
+ curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers);
+ }
+
#if defined(_WIN32)
// CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
// operating system. Currently implemented under MS-Windows.
@@ -2745,6 +2453,7 @@ static bool llama_download_file(const std::string & url, const std::string & pat
struct llama_model * llama_load_model_from_url(
const char * model_url,
const char * path_model,
+ const char * hf_token,
const struct llama_model_params & params) {
// Basic validation of the model_url
if (!model_url || strlen(model_url) == 0) {
@@ -2752,7 +2461,7 @@ struct llama_model * llama_load_model_from_url(
return NULL;
}
- if (!llama_download_file(model_url, path_model)) {
+ if (!llama_download_file(model_url, path_model, hf_token)) {
return NULL;
}
@@ -2800,14 +2509,14 @@ struct llama_model * llama_load_model_from_url(
// Prepare download in parallel
std::vector<std::future<bool>> futures_download;
for (int idx = 1; idx < n_split; idx++) {
- futures_download.push_back(std::async(std::launch::async, [&split_prefix, &split_url_prefix, &n_split](int download_idx) -> bool {
+ futures_download.push_back(std::async(std::launch::async, [&split_prefix, &split_url_prefix, &n_split, hf_token](int download_idx) -> bool {
char split_path[PATH_MAX] = {0};
llama_split_path(split_path, sizeof(split_path), split_prefix, download_idx, n_split);
char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split);
- return llama_download_file(split_url, split_path);
+ return llama_download_file(split_url, split_path, hf_token);
}, idx));
}
@@ -2826,6 +2535,7 @@ struct llama_model * llama_load_model_from_hf(
const char * repo,
const char * model,
const char * path_model,
+ const char * hf_token,
const struct llama_model_params & params) {
// construct hugging face model url:
//
@@ -2841,7 +2551,7 @@ struct llama_model * llama_load_model_from_hf(
model_url += "/resolve/main/";
model_url += model;
- return llama_load_model_from_url(model_url.c_str(), path_model, params);
+ return llama_load_model_from_url(model_url.c_str(), path_model, hf_token, params);
}
#else
@@ -2849,6 +2559,7 @@ struct llama_model * llama_load_model_from_hf(
struct llama_model * llama_load_model_from_url(
const char * /*model_url*/,
const char * /*path_model*/,
+ const char * /*hf_token*/,
const struct llama_model_params & /*params*/) {
fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
return nullptr;
@@ -2858,6 +2569,7 @@ struct llama_model * llama_load_model_from_hf(
const char * /*repo*/,
const char * /*model*/,
const char * /*path_model*/,
+ const char * /*hf_token*/,
const struct llama_model_params & /*params*/) {
fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
return nullptr;
@@ -2922,51 +2634,35 @@ std::vector<llama_token> llama_tokenize(
}
std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
- std::vector<char> result(8, 0);
- const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
- if (n_tokens < 0) {
- result.resize(-n_tokens);
- int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
- GGML_ASSERT(check == -n_tokens);
- } else {
- result.resize(n_tokens);
- }
-
- return std::string(result.data(), result.size());
-}
-
-std::string llama_detokenize_spm(llama_context * ctx, const std::vector<llama_token> & tokens) {
- const llama_token bos_id = llama_token_bos(llama_get_model(ctx));
-
std::string piece;
- std::string result;
-
- for (size_t i = 0; i < tokens.size(); ++i) {
- piece = llama_token_to_piece(ctx, tokens[i]);
-
- // remove the leading space of the first non-BOS token
- if (((tokens[0] == bos_id && i == 1) || (tokens[0] != bos_id && i == 0)) && piece[0] == ' ') {
- piece = piece.substr(1);
- }
-
- result += piece;
+ piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
+ const int n_chars = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
+ if (n_chars < 0) {
+ piece.resize(-n_chars);
+ int check = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
+ GGML_ASSERT(check == -n_chars);
+ }
+ else {
+ piece.resize(n_chars);
}
- return result;
+ return piece;
}
-std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_token> & tokens) {
- std::string piece;
- std::string result;
-
- for (size_t i = 0; i < tokens.size(); ++i) {
- piece = llama_token_to_piece(ctx, tokens[i]);
-
- result += piece;
+std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
+ std::string text;
+ text.resize(std::max(text.capacity(), tokens.size()));
+ int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+ if (n_chars < 0) {
+ text.resize(-n_chars);
+ n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+ GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization
}
+ text.resize(n_chars);
+
// NOTE: the original tokenizer decodes bytes after collecting the pieces.
- return result;
+ return text;
}
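With the SPM/BPE-specific helpers removed, callers go through the single llama_detokenize shown above. A hedged round-trip sketch, assuming the llama_tokenize overload declared in common.h:

// sketch: tokenize and detokenize a prompt with the unified helper
std::vector<llama_token> toks = llama_tokenize(ctx, "Hello world", /*add_special=*/true);
std::string text = llama_detokenize(ctx, toks, /*special=*/false);  // false: do not render special/control tokens
printf("%s\n", text.c_str());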
bool llama_should_add_bos_token(const llama_model * model) {
@@ -2975,12 +2671,91 @@ bool llama_should_add_bos_token(const llama_model * model) {
return add_bos != -1 ? bool(add_bos) : (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM);
}
+//
+// Chat template utils
+//
+
bool llama_chat_verify_template(const std::string & tmpl) {
llama_chat_message chat[] = {{"user", "test"}};
int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
return res >= 0;
}
+std::string llama_chat_apply_template(const struct llama_model * model,
+ const std::string & tmpl,
+ const std::vector<llama_chat_msg> & msgs,
+ bool add_ass) {
+ int alloc_size = 0;
+ bool fallback = false; // indicates whether we must fall back to default chatml
+ std::vector<llama_chat_message> chat;
+ for (auto & msg : msgs) {
+ chat.push_back({msg.role.c_str(), msg.content.c_str()});
+ alloc_size += (msg.role.size() + msg.content.size()) * 1.25;
+ }
+
+ const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str();
+ std::vector<char> buf(alloc_size);
+
+ // run the first time to get the total output length
+ int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+
+ // error: chat template is not supported
+ if (res < 0) {
+ if (ptr_tmpl != nullptr) {
+ // if the custom "tmpl" is not supported, we throw an error
+ // this check is intentionally redundant, since we're not sure whether the user validated the custom template with llama_chat_verify_template()
+ throw std::runtime_error("this custom template is not supported");
+ } else {
+ // If the built-in template is not supported, we default to chatml
+ res = llama_chat_apply_template(nullptr, "chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+ fallback = true;
+ }
+ }
+
+ // if it turns out that our buffer is too small, we resize it
+ if ((size_t) res > buf.size()) {
+ buf.resize(res);
+ res = llama_chat_apply_template(
+ fallback ? nullptr : model,
+ fallback ? "chatml" : ptr_tmpl,
+ chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+ }
+
+ std::string formatted_chat(buf.data(), res);
+ return formatted_chat;
+}
+
+std::string llama_chat_format_single(const struct llama_model * model,
+ const std::string & tmpl,
+ const std::vector<llama_chat_msg> & past_msg,
+ const llama_chat_msg & new_msg,
+ bool add_ass) {
+ std::ostringstream ss;
+ auto fmt_past_msg = past_msg.empty() ? "" : llama_chat_apply_template(model, tmpl, past_msg, false);
+ std::vector<llama_chat_msg> chat_new(past_msg);
+ // if the past_msg ends with a newline, we must preserve it in the formatted version
+ if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') {
+ ss << "\n";
+ }
+ // format chat with new_msg
+ chat_new.push_back(new_msg);
+ auto fmt_new_msg = llama_chat_apply_template(model, tmpl, chat_new, add_ass);
+ // get the diff part
+ ss << fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
+ return ss.str();
+}
+
+std::string llama_chat_format_example(const struct llama_model * model,
+ const std::string & tmpl) {
+ std::vector<llama_chat_msg> msgs = {
+ {"system", "You are a helpful assistant"},
+ {"user", "Hello"},
+ {"assistant", "Hi there"},
+ {"user", "How are you?"},
+ };
+ return llama_chat_apply_template(model, tmpl, msgs, true);
+}
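A hedged usage sketch of the new chat-template helpers (assuming a loaded llama_model * model and a gpt_params params; not part of this diff):

    std::vector<llama_chat_msg> history = { {"system", "You are a helpful assistant"} };
    llama_chat_msg user_msg = {"user", "Hello"};
    // llama_chat_format_single returns only the delta for the new message, so it can be appended to an existing prompt
    std::string delta = llama_chat_format_single(model, params.chat_template, history, user_msg, /* add_ass */ true);
    history.push_back(user_msg);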
+
//
// KV cache utils
//
@@ -3060,14 +2835,34 @@ void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_siz
// Embedding utils
//
-void llama_embd_normalize(const float * inp, float * out, int n) {
+void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm) {
double sum = 0.0;
- for (int i = 0; i < n; i++) {
- sum += inp[i] * inp[i];
+
+ switch (embd_norm) {
+ case -1: // no normalisation
+ sum = 1.0;
+ break;
+ case 0: // max absolute
+ for (int i = 0; i < n; i++) {
+ if (sum < std::abs(inp[i])) sum = std::abs(inp[i]);
+ }
+ sum /= 32760.0; // scale to (roughly) the int16 range
+ break;
+ case 2: // euclidean
+ for (int i = 0; i < n; i++) {
+ sum += inp[i] * inp[i];
+ }
+ sum = std::sqrt(sum);
+ break;
+ default: // p-norm (euclidean is p-norm p=2)
+ for (int i = 0; i < n; i++) {
+ sum += std::pow(std::abs(inp[i]), embd_norm);
+ }
+ sum = std::pow(sum, 1.0 / embd_norm);
+ break;
}
- sum = sqrt(sum);
- const float norm = sum > 0.0 ? 1.0f / sum : 0.0f;
+ const float norm = sum > 0.0 ? 1.0 / sum : 0.0f;
for (int i = 0; i < n; i++) {
out[i] = inp[i] * norm;
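A short sketch of the new embd_norm argument, with the values handled by the switch above (assuming n floats in embd; not part of this diff):

    std::vector<float> out(n);
    llama_embd_normalize(embd, out.data(), n, 2);   // euclidean (default)
    llama_embd_normalize(embd, out.data(), n, -1);  // no normalisation
    llama_embd_normalize(embd, out.data(), n, 0);   // max absolute, scaled to an int16 range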
@@ -3085,6 +2880,14 @@ float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n)
sum2 += embd2[i] * embd2[i];
}
+ // Handle the case where one or both vectors are zero vectors
+ if (sum1 == 0.0 || sum2 == 0.0) {
+ if (sum1 == 0.0 && sum2 == 0.0) {
+ return 1.0f; // two zero vectors are similar
+ }
+ return 0.0f;
+ }
+
return sum / (sqrt(sum1) * sqrt(sum2));
}
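The normalized buffers above pair with the similarity helper; a hedged sketch (hypothetical buffers out_a/out_b of size n; not part of this diff):

    float sim = llama_embd_similarity_cos(out_a.data(), out_b.data(), n); // 1.0f if both vectors are zero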
@@ -3093,125 +2896,87 @@ float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n)
//
static llama_control_vector_data llama_control_vector_load_one(const llama_control_vector_load_info & load_info) {
- int32_t n_tensors;
-
- size_t n_bytes = 0;
-
- uint32_t max_direction_layer = 0;
-
llama_control_vector_data result = { -1, {} };
- // calculate size of ctx needed for tensors, ensure tensors are f32, and find max layer
- {
- struct ggml_init_params meta_params = {
- /* .mem_size = */ ggml_tensor_overhead() * 128 + ggml_graph_overhead(),
- /* .mem_buffer = */ nullptr,
- /* .no_alloc = */ true,
- };
- ggml_context * meta_ctx = ggml_init(meta_params);
- struct gguf_init_params meta_gguf_params = {
- /* .no_alloc = */ true,
- /* .ctx = */ &meta_ctx,
- };
- struct gguf_context * meta_ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params);
- if (!meta_ctx_gguf) {
- fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, load_info.fname.c_str());
- ggml_free(meta_ctx);
- return result;
- }
-
- n_tensors = gguf_get_n_tensors(meta_ctx_gguf);
- for (int i = 0; i < n_tensors; i++) {
- std::string name = gguf_get_tensor_name(meta_ctx_gguf, i);
-
- // split on '.'
- size_t dotpos = name.find('.');
- if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") {
- try {
- uint32_t layer = std::stoi(name.substr(dotpos + 1));
- if (layer == 0) {
- fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
- ggml_free(meta_ctx);
- gguf_free(meta_ctx_gguf);
- return result;
- }
- if (layer > max_direction_layer) {
- max_direction_layer = layer;
- }
- } catch (...) {
- fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
- ggml_free(meta_ctx);
- gguf_free(meta_ctx_gguf);
- return result;
- }
- }
-
- struct ggml_tensor * tensor_meta = ggml_get_tensor(meta_ctx, name.c_str());
- if (tensor_meta->type != GGML_TYPE_F32 || ggml_n_dims(tensor_meta) != 1) {
- fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
- ggml_free(meta_ctx);
- gguf_free(meta_ctx_gguf);
- return result;
- }
- if (result.n_embd == -1) {
- result.n_embd = ggml_nelements(tensor_meta);
- } else if (ggml_nelements(tensor_meta) != result.n_embd) {
- fprintf(stderr, "%s: direction tensor sizes mismatched in %s\n", __func__, load_info.fname.c_str());
- ggml_free(meta_ctx);
- gguf_free(meta_ctx_gguf);
- return result;
- }
- n_bytes += ggml_nbytes(tensor_meta);
- }
- ggml_free(meta_ctx);
- gguf_free(meta_ctx_gguf);
+ ggml_context * ctx = nullptr;
+ struct gguf_init_params meta_gguf_params = {
+ /* .no_alloc = */ false,
+ /* .ctx = */ &ctx,
+ };
+ struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params);
+ if (!ctx_gguf) {
+ fprintf(stderr, "%s: failed to load control vector file from %s\n", __func__, load_info.fname.c_str());
+ return result;
}
+ int32_t n_tensors = gguf_get_n_tensors(ctx_gguf);
if (n_tensors == 0) {
fprintf(stderr, "%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str());
- return result;
}
- // load and scale tensors into final control vector context
- struct ggml_init_params ggml_params = {
- /* .mem_size = */ ggml_tensor_overhead() * n_tensors + n_bytes,
- /* .mem_buffer = */ nullptr,
- /* .no_alloc = */ false,
- };
- struct ggml_context * ctx = ggml_init(ggml_params);
+ for (int i = 0; i < n_tensors; i++) {
+ std::string name = gguf_get_tensor_name(ctx_gguf, i);
- struct gguf_init_params params = {
- /*.no_alloc = */ false,
- /*.ctx = */ &ctx,
- };
- struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), params);
- if (!ctx_gguf) {
- fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, load_info.fname.c_str());
- ggml_free(ctx);
- return result;
- }
+ int layer_idx = -1;
+
+ // split on '.'
+ size_t dotpos = name.find('.');
+ if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") {
+ try {
+ layer_idx = std::stoi(name.substr(dotpos + 1));
+ } catch (...) {
+ layer_idx = -1;
+ }
+ }
+ if (layer_idx < 0) {
+ fprintf(stderr, "%s: invalid/unparsable direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
+ result.n_embd = -1;
+ break;
+ } else if (layer_idx == 0) {
+ fprintf(stderr, "%s: invalid (zero) direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
+ result.n_embd = -1;
+ break;
+ }
- // do not store data for layer 0 (it's not used)
- result.data.resize(result.n_embd * max_direction_layer);
+ struct ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str());
+ if (tensor->type != GGML_TYPE_F32) {
+ fprintf(stderr, "%s: invalid (non-F32) direction tensor type in %s\n", __func__, load_info.fname.c_str());
+ result.n_embd = -1;
+ break;
+ }
+ if (ggml_n_dims(tensor) != 1) {
+ fprintf(stderr, "%s: invalid (non-1D) direction tensor shape in %s\n", __func__, load_info.fname.c_str());
+ result.n_embd = -1;
+ break;
+ }
- for (uint32_t il = 1; il <= max_direction_layer; il++) {
- const std::string name = "direction." + std::to_string(il);
- const ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str());
+ if (result.n_embd == -1) {
+ result.n_embd = ggml_nelements(tensor);
+ } else if (ggml_nelements(tensor) != result.n_embd) {
+ fprintf(stderr, "%s: direction tensor in %s does not match previous dimensions\n", __func__, load_info.fname.c_str());
+ result.n_embd = -1;
+ break;
+ }
- float * dst = result.data.data() + result.n_embd * (il - 1);
+ // extend if necessary - do not store data for layer 0 (it's not used)
+ result.data.resize(std::max(result.data.size(), static_cast<size_t>(result.n_embd * layer_idx)), 0.0f);
- if (tensor) {
- const float * src = (const float *) tensor->data;
- for (int j = 0; j < result.n_embd; j++) {
- dst[j] = src[j] * load_info.strength;
- }
- } else {
- for (int j = 0; j < result.n_embd; j++) {
- dst[j] = 0.0f;
- }
+ const float * src = (const float *) tensor->data;
+ float * dst = result.data.data() + result.n_embd * (layer_idx - 1); // layer 1 at [0]
+ for (int j = 0; j < result.n_embd; j++) {
+ dst[j] += src[j] * load_info.strength; // allows multiple directions for same layer in same file
}
+
}
+ if (result.n_embd == -1) {
+ fprintf(stderr, "%s: skipping %s due to invalid direction tensors\n", __func__, load_info.fname.c_str());
+ result.data.clear();
+ }
+
+ gguf_free(ctx_gguf);
+ ggml_free(ctx);
+
return result;
}
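A hedged usage sketch of the loader (field order of llama_control_vector_load_info assumed from common.h; not part of this diff):

    std::vector<llama_control_vector_load_info> infos = { { /* strength */ 0.8f, /* fname */ "control_vector.gguf" } };
    llama_control_vector_data cvec = llama_control_vector_load(infos);
    if (cvec.n_embd == -1) { /* no valid control vector files were loaded */ }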
@@ -3222,16 +2987,19 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
auto cur = llama_control_vector_load_one(info);
if (cur.n_embd == -1) {
- return result;
+ result.n_embd = -1;
+ break;
}
- if (result.n_embd != -1 && (result.n_embd != cur.n_embd || result.data.size() != cur.data.size())) {
- fprintf(stderr, "%s: control vector in %s does not match previous vector dimensions\n", __func__, info.fname.c_str());
- return result;
+ if (result.n_embd != -1 && result.n_embd != cur.n_embd) {
+ fprintf(stderr, "%s: control vectors in %s does not match previous dimensions\n", __func__, info.fname.c_str());
+ result.n_embd = -1;
+ break;
}
if (result.n_embd == -1) {
result = std::move(cur);
} else {
+ result.data.resize(std::max(result.data.size(), cur.data.size()), 0.0f); // extend if necessary
for (size_t i = 0; i < cur.data.size(); i++) {
result.data[i] += cur.data[i];
}
@@ -3239,7 +3007,8 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
}
if (result.n_embd == -1) {
- fprintf(stderr, "%s: no vectors passed\n", __func__);
+ fprintf(stderr, "%s: no valid control vector files passed\n", __func__);
+ result.data.clear();
}
return result;
@@ -3407,7 +3176,6 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
}
fprintf(stream, " - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
}
- fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
diff --git a/common/common.h b/common/common.h
index bb45b3b4..979762e1 100644
--- a/common/common.h
+++ b/common/common.h
@@ -52,6 +52,12 @@ int32_t cpu_get_num_math();
// CLI argument parsing
//
+// dimensionality reduction methods, used by cvector-generator
+enum dimre_method {
+ DIMRE_METHOD_PCA,
+ DIMRE_METHOD_MEAN,
+};
+
struct gpt_params {
uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
@@ -93,6 +99,7 @@ struct gpt_params {
enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
+ enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
// // sampling parameters
struct llama_sampling_params sparams;
@@ -101,6 +108,7 @@ struct gpt_params {
std::string model_draft = ""; // draft model for speculative decoding
std::string model_alias = "unknown"; // model alias
std::string model_url = ""; // model url to download
+ std::string hf_token = ""; // HF token
std::string hf_repo = ""; // HF repo
std::string hf_file = ""; // HF file
std::string prompt = "";
@@ -120,7 +128,6 @@ struct gpt_params {
// TODO: avoid tuple, use struct
std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
- std::string lora_base = ""; // base model path for the lora adapter
std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale
@@ -152,7 +159,6 @@ struct gpt_params {
bool prompt_cache_all = false; // save user input and generations to prompt cache
bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
- bool embedding = false; // get only sentence embedding
bool escape = true; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
bool multiline_input = false; // reverse the usage of `\`
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
@@ -179,6 +185,12 @@ struct gpt_params {
std::string mmproj = ""; // path to multimodal projector
std::vector<std::string> image; // path to image file(s)
+ // embedding
+ bool embedding = false; // get only sentence embedding
+ int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
+ std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same as "json" plus a cosine similarity matrix
+ std::string embd_sep = "\n"; // separator of embeddings
+
// server params
int32_t port = 8080; // server listens on this network port
int32_t timeout_read = 600; // http read timeout in seconds
@@ -189,6 +201,7 @@ struct gpt_params {
std::string public_path = "";
std::string chat_template = "";
std::string system_prompt = "";
+ bool enable_chat_template = true;
std::vector<std::string> api_keys;
@@ -234,15 +247,19 @@ struct gpt_params {
bool compute_ppl = true; // whether to compute perplexity
// cvector-generator params
- int n_completions = 64;
- int n_pca_batch = 20;
+ int n_pca_batch = 100;
int n_pca_iterations = 1000;
- std::string cvector_outfile = "control_vector.gguf";
- std::string cvector_completions_file = "examples/cvector-generator/completions.txt";
- std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
- std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
+ dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
+ std::string cvector_outfile = "control_vector.gguf";
+ std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
+ std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
+
+ bool spm_infill = false; // suffix/prefix/middle pattern for infill
+
+ std::string lora_outfile = "ggml-lora-merged-f16.gguf";
};
+void gpt_params_handle_hf_token(gpt_params & params);
void gpt_params_handle_model_default(gpt_params & params);
bool gpt_params_parse_ex (int argc, char ** argv, gpt_params & params);
@@ -298,8 +315,8 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
-struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const struct llama_model_params & params);
-struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const struct llama_model_params & params);
+struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
+struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
// Batch utils
@@ -337,21 +354,13 @@ std::string llama_token_to_piece(
llama_token token,
bool special = true);
-// TODO: these should be moved in llama.h C-style API under single `llama_detokenize` function
-// that takes into account the tokenizer type and decides how to handle the leading space
-//
-// detokenizes a vector of tokens into a string
-// should work similar to Python's `tokenizer.decode`
-// removes the leading space from the first non-BOS token
-std::string llama_detokenize_spm(
- llama_context * ctx,
- const std::vector<llama_token> & tokens);
-
// detokenizes a vector of tokens into a string
// should work similar to Python's `tokenizer.decode`
-std::string llama_detokenize_bpe(
+// optionally renders special/control tokens
+std::string llama_detokenize(
llama_context * ctx,
- const std::vector<llama_token> & tokens);
+ const std::vector<llama_token> & tokens,
+ bool special = true);
// Uses the value from the model metadata if possible, otherwise
// defaults to true when model type is SPM, otherwise false.
@@ -361,9 +370,34 @@ bool llama_should_add_bos_token(const llama_model * model);
// Chat template utils
//
+// same as llama_chat_message, but uses std::string
+struct llama_chat_msg {
+ std::string role;
+ std::string content;
+};
+
// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
bool llama_chat_verify_template(const std::string & tmpl);
+// CPP wrapper for llama_chat_apply_template
+// If the built-in template is not supported, we default to chatml
+// If the custom "tmpl" is not supported, we throw an error
+std::string llama_chat_apply_template(const struct llama_model * model,
+ const std::string & tmpl,
+ const std::vector<llama_chat_msg> & chat,
+ bool add_ass);
+
+// Format single message, while taking into account the position of that message in chat history
+std::string llama_chat_format_single(const struct llama_model * model,
+ const std::string & tmpl,
+ const std::vector<llama_chat_msg> & past_msg,
+ const llama_chat_msg & new_msg,
+ bool add_ass);
+
+// Returns an example of formatted chat
+std::string llama_chat_format_example(const struct llama_model * model,
+ const std::string & tmpl);
+
//
// KV cache utils
//
@@ -378,7 +412,7 @@ void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_siz
// Embedding utils
//
-void llama_embd_normalize(const float * inp, float * out, int n);
+void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);
float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n);
@@ -422,4 +456,3 @@ void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const cha
void yaml_dump_non_result_info(
FILE * stream, const gpt_params & params, const llama_context * lctx,
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
-
diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
index 10b9b3d1..881eb49e 100644
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@@ -40,6 +40,233 @@ static std::string build_repetition(const std::string & item_rule, int min_items
return result;
}
+/* Minimalistic replacement for std::string_view, which is only available from C++17 onwards */
+class string_view {
+ const std::string & _str;
+ const size_t _start;
+ const size_t _end;
+public:
+ string_view(const std::string & str, size_t start = 0, size_t end = std::string::npos) : _str(str), _start(start), _end(end == std::string::npos ? str.length() : end) {}
+
+ size_t size() const {
+ return _end - _start;
+ }
+
+ size_t length() const {
+ return size();
+ }
+
+ operator std::string() const {
+ return str();
+ }
+
+ std::string str() const {
+ return _str.substr(_start, _end - _start);
+ }
+
+ string_view substr(size_t pos, size_t len = std::string::npos) const {
+ return string_view(_str, _start + pos, len == std::string::npos ? _end : _start + pos + len);
+ }
+
+ char operator[](size_t pos) const {
+ auto index = _start + pos;
+ if (index >= _end) {
+ throw std::out_of_range("string_view index out of range");
+ }
+ return _str[_start + pos];
+ }
+
+ bool operator==(const string_view & other) const {
+ std::string this_str = *this;
+ std::string other_str = other;
+ return this_str == other_str;
+ }
+};
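A small sketch of the view semantics of this helper (not part of this diff):

    std::string s = "12345";
    string_view v(s, 1, 4);          // views "234"
    string_view w = v.substr(1, 2);  // views "34"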
+
+static void _build_min_max_int(int min_value, int max_value, std::stringstream & out, int decimals_left = 16, bool top_level = true) {
+ auto has_min = min_value != std::numeric_limits<int>::min();
+ auto has_max = max_value != std::numeric_limits<int>::max();
+
+ auto digit_range = [&](char from, char to) {
+ out << "[";
+ if (from == to) {
+ out << from;
+ } else {
+ out << from << "-" << to;
+ }
+ out << "]";
+ };
+ auto more_digits = [&](int min_digits, int max_digits) {
+ out << "[0-9]";
+ if (min_digits == max_digits && min_digits == 1) {
+ return;
+ }
+ out << "{";
+ out << min_digits;
+ if (max_digits != min_digits) {
+ out << ",";
+ if (max_digits != std::numeric_limits<int>::max()) {
+ out << max_digits;
+ }
+ }
+ out << "}";
+ };
+ std::function<void(const string_view &, const string_view &)> uniform_range =
+ [&](const string_view & from, const string_view & to) {
+ size_t i = 0;
+ while (i < from.length() && i < to.length() && from[i] == to[i]) {
+ i++;
+ }
+ if (i > 0) {
+ out << "\"" << from.substr(0, i).str() << "\"";
+ }
+ if (i < from.length() && i < to.length()) {
+ if (i > 0) {
+ out << " ";
+ }
+ auto sub_len = from.length() - i - 1;
+ if (sub_len > 0) {
+ auto from_sub = from.substr(i + 1);
+ auto to_sub = to.substr(i + 1);
+ auto sub_zeros = repeat("0", sub_len);
+ auto sub_nines = repeat("9", sub_len);
+
+ auto to_reached = false;
+ out << "(";
+ if (from_sub == sub_zeros) {
+ digit_range(from[i], to[i] - 1);
+ out << " ";
+ more_digits(sub_len, sub_len);
+ } else {
+ out << "[" << from[i] << "] ";
+ out << "(";
+ uniform_range(from_sub, sub_nines);
+ out << ")";
+ if (from[i] < to[i] - 1) {
+ out << " | ";
+ if (to_sub == sub_nines) {
+ digit_range(from[i] + 1, to[i]);
+ to_reached = true;
+ } else {
+ digit_range(from[i] + 1, to[i] - 1);
+ }
+ out << " ";
+ more_digits(sub_len, sub_len);
+ }
+ }
+ if (!to_reached) {
+ out << " | ";
+ digit_range(to[i], to[i]);
+ out << " ";
+ uniform_range(sub_zeros, to_sub);
+ }
+ out << ")";
+ } else {
+ out << "[" << from[i] << "-" << to[i] << "]";
+ }
+ }
+ };
+
+ if (has_min && has_max) {
+ if (min_value < 0 && max_value < 0) {
+ out << "\"-\" (";
+ _build_min_max_int(-max_value, -min_value, out, decimals_left, /* top_level= */ true);
+ out << ")";
+ return;
+ }
+
+ if (min_value < 0) {
+ out << "\"-\" (";
+ _build_min_max_int(0, -min_value, out, decimals_left, /* top_level= */ true);
+ out << ") | ";
+ min_value = 0;
+ }
+
+ auto min_s = std::to_string(min_value);
+ auto max_s = std::to_string(max_value);
+ auto min_digits = min_s.length();
+ auto max_digits = max_s.length();
+
+ for (auto digits = min_digits; digits < max_digits; digits++) {
+ uniform_range(min_s, repeat("9", digits));
+ min_s = "1" + repeat("0", digits);
+ out << " | ";
+ }
+ uniform_range(min_s, max_s);
+ return;
+ }
+
+ auto less_decimals = std::max(decimals_left - 1, 1);
+
+ if (has_min) {
+ if (min_value < 0) {
+ out << "\"-\" (";
+ _build_min_max_int(std::numeric_limits<int>::min(), -min_value, out, decimals_left, /* top_level= */ false);
+ out << ") | [0] | [1-9] ";
+ more_digits(0, decimals_left - 1);
+ } else if (min_value == 0) {
+ if (top_level) {
+ out << "[0] | [1-9] ";
+ more_digits(0, less_decimals);
+ } else {
+ more_digits(1, decimals_left);
+ }
+ } else if (min_value <= 9) {
+ char c = '0' + min_value;
+ auto range_start = top_level ? '1' : '0';
+ if (c > range_start) {
+ digit_range(range_start, c - 1);
+ out << " ";
+ more_digits(1, less_decimals);
+ out << " | ";
+ }
+ digit_range(c, '9');
+ out << " ";
+ more_digits(0, less_decimals);
+ } else {
+ auto min_s = std::to_string(min_value);
+ auto len = min_s.length();
+ auto c = min_s[0];
+
+ if (c > '1') {
+ digit_range(top_level ? '1' : '0', c - 1);
+ out << " ";
+ more_digits(len, less_decimals);
+ out << " | ";
+ }
+ digit_range(c, c);
+ out << " (";
+ _build_min_max_int(std::stoi(min_s.substr(1)), std::numeric_limits<int>::max(), out, less_decimals, /* top_level= */ false);
+ out << ")";
+ if (c < '9') {
+ out << " | ";
+ digit_range(c + 1, '9');
+ out << " ";
+ more_digits(len - 1, less_decimals);
+ }
+ }
+ return;
+ }
+
+ if (has_max) {
+ if (max_value >= 0) {
+ if (top_level) {
+ out << "\"-\" [1-9] ";
+ more_digits(0, less_decimals);
+ out << " | ";
+ }
+ _build_min_max_int(0, max_value, out, decimals_left, /* top_level= */ true);
+ } else {
+ out << "\"-\" (";
+ _build_min_max_int(-max_value, std::numeric_limits<int>::max(), out, decimals_left, /* top_level= */ false);
+ out << ")";
+ }
+ return;
+ }
+
+ throw std::runtime_error("At least one of min_value or max_value must be set");
+}
+
const std::string SPACE_RULE = "| \" \" | \"\\n\" [ \\t]{0,20}";
struct BuiltinRule {
@@ -89,7 +316,7 @@ std::unordered_map<char, std::string> GRAMMAR_LITERAL_ESCAPES = {
};
std::unordered_set<char> NON_LITERAL_SET = {'|', '.', '(', ')', '[', ']', '{', '}', '*', '+', '?'};
-std::unordered_set<char> ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'[', ']', '(', ')', '|', '{', '}', '*', '+', '?'};
+std::unordered_set<char> ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'^', '$', '.', '[', ']', '(', ')', '|', '{', '}', '*', '+', '?'};
template <typename Iterator>
std::string join(Iterator begin, Iterator end, const std::string & separator) {
@@ -160,7 +387,6 @@ static std::string format_literal(const std::string & literal) {
return "\"" + escaped + "\"";
}
-
class SchemaConverter {
private:
std::function<json(const std::string &)> _fetch_json;
@@ -388,6 +614,75 @@ private:
return _add_rule(name, "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space");
}
+ /*
+ Returns a rule that matches a JSON string that is none of the provided strings
+
+ not_strings({"a"})
+ -> ["] ( [a] char+ | [^"a] char* )? ["] space
+ not_strings({"and", "also"})
+ -> ["] ( [a] ([l] ([s] ([o] char+ | [^"o] char*) | [^"s] char*) | [n] ([d] char+ | [^"d] char*) | [^"ln] char*) | [^"a] char* )? ["] space
+ */
+ std::string _not_strings(const std::vector<std::string> & strings) {
+
+ struct TrieNode {
+ std::map<char, TrieNode> children;
+ bool is_end_of_string;
+
+ TrieNode() : is_end_of_string(false) {}
+
+ void insert(const std::string & string) {
+ auto node = this;
+ for (char c : string) {
+ node = &node->children[c];
+ }
+ node->is_end_of_string = true;
+ }
+ };
+
+ TrieNode trie;
+ for (const auto & s : strings) {
+ trie.insert(s);
+ }
+
+ std::string char_rule = _add_primitive("char", PRIMITIVE_RULES.at("char"));
+ std::ostringstream out;
+ out << "[\"] ( ";
+ std::function<void(const TrieNode &)> visit = [&](const TrieNode & node) {
+ std::ostringstream rejects;
+ auto first = true;
+ for (const auto & kv : node.children) {
+ rejects << kv.first;
+ if (first) {
+ first = false;
+ } else {
+ out << " | ";
+ }
+ out << "[" << kv.first << "]";
+ if (!kv.second.children.empty()) {
+ out << " (";
+ visit(kv.second);
+ out << ")";
+ } else if (kv.second.is_end_of_string) {
+ out << " " << char_rule << "+";
+ }
+ }
+ if (!node.children.empty()) {
+ if (!first) {
+ out << " | ";
+ }
+ out << "[^\"" << rejects.str() << "] " << char_rule << "*";
+ }
+ };
+ visit(trie);
+
+ out << " )";
+ if (!trie.is_end_of_string) {
+ out << "?";
+ }
+ out << " [\"] space";
+ return out.str();
+ }
+
std::string _resolve_ref(const std::string & ref) {
std::string ref_name = ref.substr(ref.find_last_of('/') + 1);
if (_rules.find(ref_name) == _rules.end() && _refs_being_resolved.find(ref) == _refs_being_resolved.end()) {
@@ -408,6 +703,7 @@ private:
std::vector<std::string> required_props;
std::vector<std::string> optional_props;
std::unordered_map<std::string, std::string> prop_kv_rule_names;
+ std::vector<std::string> prop_names;
for (const auto & kv : properties) {
const auto &prop_name = kv.first;
const auto &prop_schema = kv.second;
@@ -422,11 +718,18 @@ private:
} else {
optional_props.push_back(prop_name);
}
+ prop_names.push_back(prop_name);
}
- if (additional_properties.is_object() || (additional_properties.is_boolean() && additional_properties.get<bool>())) {
+ if ((additional_properties.is_boolean() && additional_properties.get<bool>()) || additional_properties.is_object()) {
std::string sub_name = name + (name.empty() ? "" : "-") + "additional";
- std::string value_rule = visit(additional_properties.is_object() ? additional_properties : json::object(), sub_name + "-value");
- std::string kv_rule = _add_rule(sub_name + "-kv", _add_primitive("string", PRIMITIVE_RULES.at("string")) + " \":\" space " + value_rule);
+ std::string value_rule =
+ additional_properties.is_object() ? visit(additional_properties, sub_name + "-value")
+ : _add_primitive("value", PRIMITIVE_RULES.at("value"));
+
+ auto key_rule =
+ prop_names.empty() ? _add_primitive("string", PRIMITIVE_RULES.at("string"))
+ : _add_rule(sub_name + "-k", _not_strings(prop_names));
+ std::string kv_rule = _add_rule(sub_name + "-kv", key_rule + " \":\" space " + value_rule);
prop_kv_rule_names["*"] = kv_rule;
optional_props.push_back("*");
}
@@ -452,15 +755,11 @@ private:
}
std::string k = ks[0];
std::string kv_rule_name = prop_kv_rule_names[k];
- if (k == "*") {
- res = _add_rule(
- name + (name.empty() ? "" : "-") + "additional-kvs",
- kv_rule_name + " ( \",\" space " + kv_rule_name + " )*"
- );
- } else if (first_is_optional) {
- res = "( \",\" space " + kv_rule_name + " )?";
+ std::string comma_ref = "( \",\" space " + kv_rule_name + " )";
+ if (first_is_optional) {
+ res = comma_ref + (k == "*" ? "*" : "?");
} else {
- res = kv_rule_name;
+ res = kv_rule_name + (k == "*" ? " " + comma_ref + "*" : "");
}
if (ks.size() > 1) {
res += " " + _add_rule(
@@ -594,17 +893,19 @@ public:
} else if (schema_type.is_array()) {
std::vector<json> schema_types;
for (const auto & t : schema_type) {
- schema_types.push_back({{"type", t}});
+ json schema_copy(schema);
+ schema_copy["type"] = t;
+ schema_types.push_back(schema_copy);
}
return _add_rule(rule_name, _generate_union_rule(name, schema_types));
} else if (schema.contains("const")) {
- return _add_rule(rule_name, _generate_constant_rule(schema["const"]));
+ return _add_rule(rule_name, _generate_constant_rule(schema["const"]) + " space");
} else if (schema.contains("enum")) {
std::vector<std::string> enum_values;
for (const auto & v : schema["enum"]) {
enum_values.push_back(_generate_constant_rule(v));
}
- return _add_rule(rule_name, join(enum_values.begin(), enum_values.end(), " | "));
+ return _add_rule(rule_name, "(" + join(enum_values.begin(), enum_values.end(), " | ") + ") space");
} else if ((schema_type.is_null() || schema_type == "object")
&& (schema.contains("properties") ||
(schema.contains("additionalProperties") && schema["additionalProperties"] != true))) {
@@ -686,6 +987,24 @@ public:
int min_len = schema.contains("minLength") ? schema["minLength"].get<int>() : 0;
int max_len = schema.contains("maxLength") ? schema["maxLength"].get<int>() : std::numeric_limits<int>::max();
return _add_rule(rule_name, "\"\\\"\" " + build_repetition(char_rule, min_len, max_len) + " \"\\\"\" space");
+ } else if (schema_type == "integer" && (schema.contains("minimum") || schema.contains("exclusiveMinimum") || schema.contains("maximum") || schema.contains("exclusiveMaximum"))) {
+ int min_value = std::numeric_limits<int>::min();
+ int max_value = std::numeric_limits<int>::max();
+ if (schema.contains("minimum")) {
+ min_value = schema["minimum"].get<int>();
+ } else if (schema.contains("exclusiveMinimum")) {
+ min_value = schema["exclusiveMinimum"].get<int>() + 1;
+ }
+ if (schema.contains("maximum")) {
+ max_value = schema["maximum"].get<int>();
+ } else if (schema.contains("exclusiveMaximum")) {
+ max_value = schema["exclusiveMaximum"].get<int>() - 1;
+ }
+ std::stringstream out;
+ out << "(";
+ _build_min_max_int(min_value, max_value, out);
+ out << ") space";
+ return _add_rule(rule_name, out.str());
} else if (schema.empty() || schema_type == "object") {
return _add_rule(rule_name, _add_primitive("object", PRIMITIVE_RULES.at("object")));
} else {
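A hedged example of a schema that exercises the new bounded-integer path (json_schema_to_grammar as declared in common/json-schema-to-grammar.h; not part of this diff):

    nlohmann::ordered_json schema = { {"type", "integer"}, {"minimum", 1}, {"maximum", 12} };
    std::string grammar = json_schema_to_grammar(schema); // the root rule now constrains the value to 1..12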
diff --git a/common/log.h b/common/log.h
index 09fa63c2..1bc5328c 100644
--- a/common/log.h
+++ b/common/log.h
@@ -630,7 +630,7 @@ inline std::string LOG_TOKENS_TOSTR_PRETTY(const C & ctx, const T & tokens)
buf << "[ ";
bool first = true;
- for (const auto &token : tokens)
+ for (const auto & token : tokens)
{
if (!first) {
buf << ", ";
diff --git a/common/ngram-cache.h b/common/ngram-cache.h
index e4fa4cbd..ab4c9b37 100644
--- a/common/ngram-cache.h
+++ b/common/ngram-cache.h
@@ -37,11 +37,18 @@ struct llama_ngram {
}
};
+struct llama_token_hash_function {
+ size_t operator()(const llama_token token) const {
+ // see https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
+ return token * 11400714819323198485llu;
+ }
+};
+
struct llama_ngram_hash_function {
size_t operator()(const llama_ngram & ngram) const {
- size_t hash = 0;
- for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
- hash ^= std::hash<llama_token>{}(ngram.tokens[i]);
+ size_t hash = llama_token_hash_function{}(ngram.tokens[0]);
+ for (int i = 1; i < LLAMA_NGRAM_MAX; ++i) {
+ hash ^= llama_token_hash_function{}(ngram.tokens[i]);
}
return hash;
}
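A hedged sketch of how the hasher plugs into a cache map (assuming llama_ngram defines operator==, as in the struct above; not part of this diff):

    std::unordered_map<llama_ngram, int, llama_ngram_hash_function> ngram_counts;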
diff --git a/common/sampling.cpp b/common/sampling.cpp
index f1f80351..079e4051 100644
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -28,9 +28,13 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_
std::vector<const llama_grammar_element *> grammar_rules(result->parsed_grammar.c_rules());
- result->grammar = llama_grammar_init(
+ struct llama_grammar * grammar = llama_grammar_init(
grammar_rules.data(),
grammar_rules.size(), result->parsed_grammar.symbol_ids.at("root"));
+ if (grammar == nullptr) {
+ throw std::runtime_error("Failed to initialize llama_grammar");
+ }
+ result->grammar = grammar;
}
result->prev.resize(params.n_prev);
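Since grammar initialization can now throw, a caller might guard it as in this hedged sketch (not part of this diff):

    struct llama_sampling_context * ctx_sampling = nullptr;
    try {
        ctx_sampling = llama_sampling_init(params.sparams);
    } catch (const std::exception & e) {
        fprintf(stderr, "%s\n", e.what());
    }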
@@ -59,9 +63,13 @@ void llama_sampling_reset(llama_sampling_context * ctx) {
if (!ctx->parsed_grammar.rules.empty()) {
std::vector<const llama_grammar_element *> grammar_rules(ctx->parsed_grammar.c_rules());
- ctx->grammar = llama_grammar_init(
+ struct llama_grammar * grammar = llama_grammar_init(
grammar_rules.data(),
grammar_rules.size(), ctx->parsed_grammar.symbol_ids.at("root"));
+ if (grammar == nullptr) {
+ throw std::runtime_error("Failed to initialize llama_grammar");
+ }
+ ctx->grammar = grammar;
}
std::fill(ctx->prev.begin(), ctx->prev.end(), 0);
@@ -274,8 +282,6 @@ static llama_token llama_sampling_sample_impl(
GGML_ASSERT(!original_logits.empty());
}
llama_token id = 0;
- // Get a pointer to the logits
- float * logits = llama_get_logits_ith(ctx_main, idx);
if (temp < 0.0) {
// greedy sampling, with probs
@@ -316,12 +322,15 @@ static llama_token llama_sampling_sample_impl(
}
if (ctx_sampling->grammar != NULL && !is_resampling) {
+ // Get a pointer to the logits
+ float * logits = llama_get_logits_ith(ctx_main, idx);
+
// Create an array with a single token data element for the sampled id
llama_token_data single_token_data = {id, logits[id], 0.0f};
llama_token_data_array single_token_data_array = { &single_token_data, 1, false };
// Apply grammar constraints to the single token
- llama_sample_grammar(ctx_main, &single_token_data_array, ctx_sampling->grammar);
+ llama_grammar_sample(ctx_sampling->grammar, ctx_main, &single_token_data_array);
// Check if the token is valid according to the grammar by seeing if its logit has been set to -INFINITY
bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
@@ -369,7 +378,7 @@ static llama_token_data_array llama_sampling_prepare_impl(
if (ctx_sampling->grammar != NULL && !apply_grammar) {
GGML_ASSERT(original_logits != NULL);
// Only make a copy of the original logits if we are not applying grammar checks, not sure if I actually have to do this.
- *original_logits = {logits, logits + llama_n_vocab(llama_get_model(ctx_main))};
+ *original_logits = {logits, logits + n_vocab};
}
// apply params.logit_bias map
@@ -382,10 +391,10 @@ static llama_token_data_array llama_sampling_prepare_impl(
llama_sample_apply_guidance(ctx_main, logits, logits_guidance, params.cfg_scale);
}
- cur.clear();
+ cur.resize(n_vocab);
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
- cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+ cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
}
llama_token_data_array cur_p = { cur.data(), cur.size(), false };
@@ -412,7 +421,7 @@ static llama_token_data_array llama_sampling_prepare_impl(
// apply grammar checks before sampling logic
if (apply_grammar && ctx_sampling->grammar != NULL) {
- llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar);
+ llama_grammar_sample(ctx_sampling->grammar, ctx_main, &cur_p);
}
return cur_p;
@@ -446,6 +455,6 @@ void llama_sampling_accept(
ctx_sampling->prev.push_back(id);
if (ctx_sampling->grammar != NULL && apply_grammar) {
- llama_grammar_accept_token(ctx_main, ctx_sampling->grammar, id);
+ llama_grammar_accept_token(ctx_sampling->grammar, ctx_main, id);
}
}