author     Olivier Chafik <ochafik@users.noreply.github.com>    2024-06-13 00:41:52 +0100
committer  GitHub <noreply@github.com>                          2024-06-13 00:41:52 +0100
commit     1c641e6aac5c18b964e7b32d9dbbb4bf5301d0d7 (patch)
tree       616348dac8e67d80a03a81847ce9ee4bb7e19d49 /examples
parent     963552903f51043ee947a8deeaaa7ec00bc3f1a4 (diff)
`build`: rename main → llama-cli, server → llama-server, llava-cli → llama-llava-cli, etc... (#7809)
* `main`/`server`: rename to `llama` / `llama-server` for consistency w/ homebrew
* server: update refs -> llama-server gitignore llama-server
* server: simplify nix package
* main: update refs -> llama fix examples/main ref
* main/server: fix targets
* update more names
* Update build.yml
* rm accidentally checked in bins
* update straggling refs
* Update .gitignore
* Update server-llm.sh
* main: target name -> llama-cli
* Prefix all example bins w/ llama-
* fix main refs
* rename {main->llama}-cmake-pkg binary
* prefix more cmake targets w/ llama-
* add/fix gbnf-validator subfolder to cmake
* sort cmake example subdirs
* rm bin files
* fix llama-lookup-* Makefile rules
* gitignore /llama-*
* rename Dockerfiles
* rename llama|main -> llama-cli; consistent RPM bin prefixes
* fix some missing -cli suffixes
* rename dockerfile w/ llama-cli
* rename(make): llama-baby-llama
* update dockerfile refs
* more llama-cli(.exe)
* fix test-eval-callback
* rename: llama-cli-cmake-pkg(.exe)
* address gbnf-validator unused fread warning (switched to C++ / ifstream)
* add two missing llama- prefixes
* Updating docs for eval-callback binary to use new `llama-` prefix.
* Updating a few lingering doc references for rename of main to llama-cli
* Updating `run-with-preset.py` to use new binary names. Updating docs around `perplexity` binary rename.
* Updating documentation references for lookup-merge and export-lora
* Updating two small `main` references missed earlier in the finetune docs.
* Update apps.nix
* update grammar/README.md w/ new llama-* names
* update llama-rpc-server bin name + doc
* Revert "update llama-rpc-server bin name + doc"
  This reverts commit e474ef1df481fd8936cd7d098e3065d7de378930.
* add hot topic notice to README.md
* Update README.md
* Update README.md
* rename gguf-split & quantize bins refs in **/tests.sh

---------

Co-authored-by: HanClinto <hanclinto@gmail.com>
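As a quick before/after illustration of the rename (a minimal sketch; the model path and prompt are placeholders, not taken from this commit), existing scripts only need the binary names updated:

    # old binary names (pre-#7809)
    ./main   -m models/7B/ggml-model.gguf -p "Once upon a time"
    ./server -m models/7B/ggml-model.gguf

    # new llama- prefixed names introduced by this commit
    ./llama-cli    -m models/7B/ggml-model.gguf -p "Once upon a time"
    ./llama-server -m models/7B/ggml-model.gguf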
Diffstat (limited to 'examples')
-rw-r--r--  examples/CMakeLists.txt | 39
-rwxr-xr-x  examples/Miku.sh | 2
-rw-r--r--  examples/baby-llama/CMakeLists.txt | 2
-rwxr-xr-x  examples/base-translate.sh | 2
-rw-r--r--  examples/batched-bench/CMakeLists.txt | 2
-rw-r--r--  examples/batched-bench/README.md | 8
-rwxr-xr-x  examples/batched.swift/Makefile | 6
-rw-r--r--  examples/batched.swift/Package.swift | 4
-rw-r--r--  examples/batched.swift/README.md | 2
-rw-r--r--  examples/batched/CMakeLists.txt | 2
-rw-r--r--  examples/batched/README.md | 2
-rw-r--r--  examples/benchmark/CMakeLists.txt | 2
-rwxr-xr-x  examples/chat-13B.sh | 2
-rwxr-xr-x  examples/chat-persistent.sh | 10
-rwxr-xr-x  examples/chat-vicuna.sh | 2
-rwxr-xr-x  examples/chat.sh | 2
-rw-r--r--  examples/convert-llama2c-to-ggml/CMakeLists.txt | 2
-rw-r--r--  examples/convert-llama2c-to-ggml/README.md | 6
-rw-r--r--  examples/embedding/CMakeLists.txt | 2
-rw-r--r--  examples/embedding/README.md | 4
-rw-r--r--  examples/eval-callback/CMakeLists.txt | 4
-rw-r--r--  examples/eval-callback/README.md | 2
-rw-r--r--  examples/export-lora/CMakeLists.txt | 2
-rw-r--r--  examples/export-lora/README.md | 4
-rw-r--r--  examples/finetune/CMakeLists.txt | 2
-rw-r--r--  examples/finetune/README.md | 12
-rw-r--r--  examples/finetune/finetune.sh | 2
-rw-r--r--  examples/gbnf-validator/CMakeLists.txt | 4
-rw-r--r--  examples/gbnf-validator/gbnf-validator.cpp | 36
-rw-r--r--  examples/gguf-split/CMakeLists.txt | 2
-rwxr-xr-x  examples/gguf-split/tests.sh | 4
-rw-r--r--  examples/gguf/CMakeLists.txt | 2
-rw-r--r--  examples/gritlm/CMakeLists.txt | 2
-rw-r--r--  examples/gritlm/README.md | 2
-rw-r--r--  examples/imatrix/CMakeLists.txt | 2
-rw-r--r--  examples/imatrix/README.md | 6
-rw-r--r--  examples/infill/CMakeLists.txt | 2
-rw-r--r--  examples/infill/README.md | 2
-rwxr-xr-x  examples/jeopardy/jeopardy.sh | 2
-rw-r--r--  examples/json-schema-pydantic-example.py | 2
-rwxr-xr-x  examples/json_schema_to_grammar.py | 2
-rw-r--r--  examples/llama-bench/README.md | 2
-rw-r--r--  examples/llava/CMakeLists.txt | 11
-rw-r--r--  examples/llava/MobileVLM-README.md | 18
-rw-r--r--  examples/llava/README.md | 10
-rwxr-xr-x  examples/llava/android/adb_run.sh | 2
-rw-r--r--  examples/lookahead/CMakeLists.txt | 2
-rw-r--r--  examples/lookup/CMakeLists.txt | 8
-rw-r--r--  examples/lookup/lookup-merge.cpp | 8
-rw-r--r--  examples/main-cmake-pkg/CMakeLists.txt | 8
-rw-r--r--  examples/main-cmake-pkg/README.md | 4
-rw-r--r--  examples/main/CMakeLists.txt | 2
-rw-r--r--  examples/main/README.md | 24
-rw-r--r--  examples/parallel/CMakeLists.txt | 2
-rw-r--r--  examples/passkey/CMakeLists.txt | 2
-rw-r--r--  examples/passkey/README.md | 2
-rw-r--r--  examples/perplexity/CMakeLists.txt | 2
-rw-r--r--  examples/perplexity/perplexity.cpp | 2
-rw-r--r--  examples/quantize-stats/CMakeLists.txt | 2
-rw-r--r--  examples/quantize/CMakeLists.txt | 2
-rw-r--r--  examples/quantize/tests.sh | 6
-rwxr-xr-x  examples/reason-act.sh | 2
-rw-r--r--  examples/retrieval/CMakeLists.txt | 2
-rw-r--r--  examples/retrieval/README.md | 2
-rw-r--r--  examples/rpc/README.md | 2
-rw-r--r--  examples/save-load-state/CMakeLists.txt | 2
-rwxr-xr-x  examples/server-llama2-13B.sh | 2
-rw-r--r--  examples/server/CMakeLists.txt | 2
-rw-r--r--  examples/server/README.md | 22
-rw-r--r--  examples/server/bench/README.md | 2
-rw-r--r--  examples/server/bench/bench.py | 2
-rw-r--r--  examples/server/public_simplechat/readme.md | 4
-rw-r--r--  examples/server/tests/README.md | 8
-rw-r--r--  examples/server/tests/features/steps/steps.py | 4
-rw-r--r--  examples/simple/CMakeLists.txt | 2
-rw-r--r--  examples/speculative/CMakeLists.txt | 2
-rw-r--r--  examples/sycl/CMakeLists.txt | 2
-rw-r--r--  examples/sycl/README.md | 6
-rwxr-xr-x  examples/sycl/run-llama2.sh | 8
-rw-r--r--  examples/tokenize/CMakeLists.txt | 2
-rw-r--r--  examples/train-text-from-scratch/CMakeLists.txt | 2
-rw-r--r--  examples/train-text-from-scratch/README.md | 4
82 files changed, 201 insertions, 203 deletions
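Rebuilding with the renamed CMake targets is enough to pick up the change; a minimal sketch, assuming a fresh `build` directory (the `llama-server` build commands match the examples/server/README.md hunk further below):

    cmake -B build
    cmake --build build --config Release -t llama-cli llama-server
    # binaries are emitted under build/bin/ with the new llama- prefixed names
    ./build/bin/llama-cli --help
    ./build/bin/llama-server --help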
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 53002f8e..d6ce35f4 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -13,42 +13,43 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
if (EMSCRIPTEN)
else()
add_subdirectory(baby-llama)
- add_subdirectory(batched)
add_subdirectory(batched-bench)
+ add_subdirectory(batched)
add_subdirectory(benchmark)
add_subdirectory(convert-llama2c-to-ggml)
add_subdirectory(embedding)
add_subdirectory(eval-callback)
+ add_subdirectory(export-lora)
add_subdirectory(finetune)
- add_subdirectory(gritlm)
+ add_subdirectory(gbnf-validator)
add_subdirectory(gguf-split)
+ add_subdirectory(gguf)
+ add_subdirectory(gritlm)
+ add_subdirectory(imatrix)
add_subdirectory(infill)
add_subdirectory(llama-bench)
add_subdirectory(llava)
- if (LLAMA_SYCL)
- add_subdirectory(sycl)
- endif()
+ add_subdirectory(lookahead)
+ add_subdirectory(lookup)
add_subdirectory(main)
- add_subdirectory(tokenize)
add_subdirectory(parallel)
+ add_subdirectory(passkey)
add_subdirectory(perplexity)
- add_subdirectory(quantize)
add_subdirectory(quantize-stats)
+ add_subdirectory(quantize)
add_subdirectory(retrieval)
+ if (LLAMA_RPC)
+ add_subdirectory(rpc)
+ endif()
+ if (LLAMA_BUILD_SERVER)
+ add_subdirectory(server)
+ endif()
+ if (LLAMA_SYCL)
+ add_subdirectory(sycl)
+ endif()
add_subdirectory(save-load-state)
add_subdirectory(simple)
- add_subdirectory(passkey)
add_subdirectory(speculative)
- add_subdirectory(lookahead)
- add_subdirectory(lookup)
- add_subdirectory(gguf)
+ add_subdirectory(tokenize)
add_subdirectory(train-text-from-scratch)
- add_subdirectory(imatrix)
- if (LLAMA_BUILD_SERVER)
- add_subdirectory(server)
- endif()
- add_subdirectory(export-lora)
- if (LLAMA_RPC)
- add_subdirectory(rpc)
- endif()
endif()
diff --git a/examples/Miku.sh b/examples/Miku.sh
index b9174b4e..0f6c8c87 100755
--- a/examples/Miku.sh
+++ b/examples/Miku.sh
@@ -22,7 +22,7 @@ if [ -n "$N_THREAD" ]; then
GEN_OPTIONS+=(--threads "$N_THREAD")
fi
-./main "${GEN_OPTIONS[@]}" \
+./llama-cli "${GEN_OPTIONS[@]}" \
--model "$MODEL" \
--in-prefix " " \
--in-suffix "${AI_NAME}:" \
diff --git a/examples/baby-llama/CMakeLists.txt b/examples/baby-llama/CMakeLists.txt
index 7b70227a..71b82105 100644
--- a/examples/baby-llama/CMakeLists.txt
+++ b/examples/baby-llama/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(TARGET baby-llama)
+set(TARGET llama-baby-llama)
add_executable(${TARGET} baby-llama.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
diff --git a/examples/base-translate.sh b/examples/base-translate.sh
index 00dedd0d..103a52f5 100755
--- a/examples/base-translate.sh
+++ b/examples/base-translate.sh
@@ -58,4 +58,4 @@ echo "$2
model=$1
# generate the most likely continuation until the string "===" is found
-./main -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs
+./llama-cli -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs
diff --git a/examples/batched-bench/CMakeLists.txt b/examples/batched-bench/CMakeLists.txt
index 40a032c5..959acaee 100644
--- a/examples/batched-bench/CMakeLists.txt
+++ b/examples/batched-bench/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(TARGET batched-bench)
+set(TARGET llama-batched-bench)
add_executable(${TARGET} batched-bench.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
diff --git a/examples/batched-bench/README.md b/examples/batched-bench/README.md
index fa4baf64..4a07fe6b 100644
--- a/examples/batched-bench/README.md
+++ b/examples/batched-bench/README.md
@@ -10,16 +10,16 @@ There are 2 modes of operation:
- `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. `N_KV = PP + B*TG`)
```bash
-./batched-bench -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]
+./llama-batched-bench -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]
# LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared
-./batched-bench -m ./models/llama-7b/ggml-model-f16.gguf -c 16384 -b 2048 -ub 512 -ngl 99
+./llama-batched-bench -m ./models/llama-7b/ggml-model-f16.gguf -c 16384 -b 2048 -ub 512 -ngl 99
# LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared
-./batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 16384 -b 2048 -ub 512 -ngl 99 -pps
+./llama-batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 16384 -b 2048 -ub 512 -ngl 99 -pps
# custom set of batches
-./batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 2048 -b 512 -ub 512 -ngl 999 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32
+./llama-batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 2048 -b 512 -ub 512 -ngl 999 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32
```
## Sample results
diff --git a/examples/batched.swift/Makefile b/examples/batched.swift/Makefile
index 2afb24fb..1f9156e5 100755
--- a/examples/batched.swift/Makefile
+++ b/examples/batched.swift/Makefile
@@ -1,6 +1,6 @@
.PHONY: build
build:
- xcodebuild -scheme batched_swift -destination "generic/platform=macOS" -derivedDataPath build
- rm -f ./batched_swift
- ln -s ./build/Build/Products/Debug/batched_swift ./batched_swift
+ xcodebuild -scheme llama-batched-swift -destination "generic/platform=macOS" -derivedDataPath build
+ rm -f ./llama-batched-swift
+ ln -s ./build/Build/Products/Debug/llama-batched-swift ./llama-batched-swift
diff --git a/examples/batched.swift/Package.swift b/examples/batched.swift/Package.swift
index 826491de..7e8afd08 100644
--- a/examples/batched.swift/Package.swift
+++ b/examples/batched.swift/Package.swift
@@ -4,7 +4,7 @@
import PackageDescription
let package = Package(
- name: "batched_swift",
+ name: "llama-batched-swift",
platforms: [.macOS(.v12)],
dependencies: [
.package(name: "llama", path: "../../"),
@@ -13,7 +13,7 @@ let package = Package(
// Targets are the basic building blocks of a package, defining a module or a test suite.
// Targets can depend on other targets in this package and products from dependencies.
.executableTarget(
- name: "batched_swift",
+ name: "llama-batched-swift",
dependencies: ["llama"],
path: "Sources",
linkerSettings: [.linkedFramework("Foundation"), .linkedFramework("AppKit")]
diff --git a/examples/batched.swift/README.md b/examples/batched.swift/README.md
index 4c2721fe..7f2e2fcd 100644
--- a/examples/batched.swift/README.md
+++ b/examples/batched.swift/README.md
@@ -1,4 +1,4 @@
This is a swift clone of `examples/batched`.
$ `make`
-$ `./batched_swift MODEL_PATH [PROMPT] [PARALLEL]`
+$ `./llama-batched-swift MODEL_PATH [PROMPT] [PARALLEL]`
diff --git a/examples/batched/CMakeLists.txt b/examples/batched/CMakeLists.txt
index 6aa178d4..77e33343 100644
--- a/examples/batched/CMakeLists.txt
+++ b/examples/batched/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(TARGET batched)
+set(TARGET llama-batched)
add_executable(${TARGET} batched.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
diff --git a/examples/batched/README.md b/examples/batched/README.md
index ed204c30..6013aab0 100644
--- a/examples/batched/README.md
+++ b/examples/batched/README.md
@@ -3,7 +3,7 @@
The example demonstrates batched generation from a given prompt
```bash
-./batched -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is" -np 4
+./llama-batched -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is" -np 4
...
diff --git a/examples/benchmark/CMakeLists.txt b/examples/benchmark/CMakeLists.txt
index 2bb47bab..34a58cc0 100644
--- a/examples/benchmark/CMakeLists.txt
+++ b/examples/benchmark/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(TARGET benchmark)
+set(TARGET llama-bench-matmult)
add_executable(${TARGET} benchmark-matmult.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
diff --git a/examples/chat-13B.sh b/examples/chat-13B.sh
index 35c089d5..1828903c 100755
--- a/examples/chat-13B.sh
+++ b/examples/chat-13B.sh
@@ -30,7 +30,7 @@ sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
$PROMPT_TEMPLATE > $PROMPT_FILE
# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
-./main $GEN_OPTIONS \
+./llama-cli $GEN_OPTIONS \
--model "$MODEL" \
--threads "$N_THREAD" \
--n_predict "$N_PREDICTS" \
diff --git a/examples/chat-persistent.sh b/examples/chat-persistent.sh
index 22f5b83d..d9cab983 100755
--- a/examples/chat-persistent.sh
+++ b/examples/chat-persistent.sh
@@ -62,7 +62,7 @@ fi
if [[ ! -e "$PROMPT_CACHE_FILE" ]]; then
echo 'Prompt cache does not exist, building...'
# Default batch_size to 64 here for better user feedback during initial prompt processing
- ./main 2>>"$LOG" \
+ ./llama-cli 2>>"$LOG" \
--batch_size 64 \
"${OPTS[@]}" \
--prompt-cache "$PROMPT_CACHE_FILE" \
@@ -109,13 +109,13 @@ while read -e line; do
printf '%s: ' "$AI_NAME" >>"$CUR_PROMPT_FILE"
- ./main 2>>"$LOG" "${OPTS[@]}" \
+ ./llama-cli 2>>"$LOG" "${OPTS[@]}" \
--prompt-cache "$CUR_PROMPT_CACHE" \
--prompt-cache-all \
--file "$CUR_PROMPT_FILE" \
--reverse-prompt "${USER_NAME}:" \
--n_predict "$n_predict" |
- skip_bytes 1 | # skip BOS token added by ./main
+ skip_bytes 1 | # skip BOS token added by ./llama-cli
tee "$CUR_PROMPT_FILE.tmp" | # save prompt + generation to tmp file
skip_bytes "$n_prompt_len_pre" # print generation
@@ -133,7 +133,7 @@ while read -e line; do
# TODO get both messages in one go
if ! session_size_msg="$(tail -n30 "$LOG" | grep -oE "$SESSION_SIZE_MSG_PATTERN")" ||
! sample_time_msg="$(tail -n10 "$LOG" | grep -oE "$SAMPLE_TIME_MSG_PATTERN")"; then
- echo >&2 "Couldn't get number of tokens from ./main output!"
+ echo >&2 "Couldn't get number of tokens from ./llama-cli output!"
exit 1
fi
@@ -144,7 +144,7 @@ while read -e line; do
fi
# Update cache for next prompt in background, ideally during user input
- ./main >>"$LOG_BG" 2>&1 "${OPTS[@]}" \
+ ./llama-cli >>"$LOG_BG" 2>&1 "${OPTS[@]}" \
--prompt-cache "$NEXT_PROMPT_CACHE" \
--file "$NEXT_PROMPT_FILE" \
--n_predict 1 &
diff --git a/examples/chat-vicuna.sh b/examples/chat-vicuna.sh
index 8c7b7bef..ffdd2008 100755
--- a/examples/chat-vicuna.sh
+++ b/examples/chat-vicuna.sh
@@ -30,7 +30,7 @@ sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
$PROMPT_TEMPLATE > $PROMPT_FILE
# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
-./bin/main $GEN_OPTIONS \
+./bin/llama-cli $GEN_OPTIONS \
--model "$MODEL" \
--threads "$N_THREAD" \
--n_predict "$N_PREDICTS" \
diff --git a/examples/chat.sh b/examples/chat.sh
index d567acec..9f85d1e2 100755
--- a/examples/chat.sh
+++ b/examples/chat.sh
@@ -11,6 +11,6 @@ cd ..
#
# "--keep 48" is based on the contents of prompts/chat-with-bob.txt
#
-./main -m ./models/llama-7b/ggml-model-q4_0.gguf -c 512 -b 1024 -n 256 --keep 48 \
+./llama-cli -m ./models/llama-7b/ggml-model-q4_0.gguf -c 512 -b 1024 -n 256 --keep 48 \
--repeat_penalty 1.0 --color -i \
-r "User:" -f prompts/chat-with-bob.txt
diff --git a/examples/convert-llama2c-to-ggml/CMakeLists.txt b/examples/convert-llama2c-to-ggml/CMakeLists.txt
index e262d44f..a6790e61 100644
--- a/examples/convert-llama2c-to-ggml/CMakeLists.txt
+++ b/examples/convert-llama2c-to-ggml/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(TARGET convert-llama2c-to-ggml)
+set(TARGET llama-convert-llama2c-to-ggml)
add_executable(${TARGET} convert-llama2c-to-ggml.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
diff --git a/examples/convert-llama2c-to-ggml/README.md b/examples/convert-llama2c-to-ggml/README.md
index 742dcf7a..5774ac83 100644
--- a/examples/convert-llama2c-to-ggml/README.md
+++ b/examples/convert-llama2c-to-ggml/README.md
@@ -8,7 +8,7 @@ To convert the model first download the models from the [llama2.c](https://githu
After successful compilation, following usage options are available:
```
-usage: ./convert-llama2c-to-ggml [options]
+usage: ./llama-convert-llama2c-to-ggml [options]
options:
-h, --help show this help message and exit
@@ -19,10 +19,10 @@ options:
An example command using a model from [karpathy/tinyllamas](https://huggingface.co/karpathy/tinyllamas) is as follows:
-`$ ./convert-llama2c-to-ggml --copy-vocab-from-model llama-2-7b-chat.gguf.q2_K.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.gguf.bin`
+`$ ./llama-convert-llama2c-to-ggml --copy-vocab-from-model llama-2-7b-chat.gguf.q2_K.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.gguf.bin`
Note: The vocabulary for `stories260K.bin` should be its own tokenizer `tok512.bin` found in [karpathy/tinyllamas/stories260K](https://huggingface.co/karpathy/tinyllamas/tree/main/stories260K).
Now you can use the model with a command like:
-`$ ./main -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256`
+`$ ./llama-cli -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256`
diff --git a/examples/embedding/CMakeLists.txt b/examples/embedding/CMakeLists.txt
index 8ffc3386..8256e789 100644
--- a/examples/embedding/CMakeLists.txt
+++ b/examples/embedding/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(TARGET embedding)
+set(TARGET llama-embedding)
add_executable(${TARGET} embedding.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
diff --git a/examples/embedding/README.md b/examples/embedding/README.md
index 6929454c..2298ec3e 100644
--- a/examples/embedding/README.md
+++ b/examples/embedding/README.md
@@ -9,13 +9,13 @@ To get started right away, run the following command, making sure to use the cor
### Unix-based systems (Linux, macOS, etc.):
```bash
-./embedding -m ./path/to/model --log-disable -p "Hello World!" 2>/dev/null
+./llama-embedding -m ./path/to/model --log-disable -p "Hello World!" 2>/dev/null
```
### Windows:
```powershell
-embedding.exe -m ./path/to/model --log-disable -p "Hello World!" 2>$null
+llama-embedding.exe -m ./path/to/model --log-disable -p "Hello World!" 2>$null
```
The above command will output space-separated float values.
diff --git a/examples/eval-callback/CMakeLists.txt b/examples/eval-callback/CMakeLists.txt
index c56ba780..a48753d3 100644
--- a/examples/eval-callback/CMakeLists.txt
+++ b/examples/eval-callback/CMakeLists.txt
@@ -1,9 +1,9 @@
-set(TARGET eval-callback)
+set(TARGET llama-eval-callback)
add_executable(${TARGET} eval-callback.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
set(TEST_TARGET test-eval-callback)
-add_test(NAME ${TEST_TARGET} COMMAND eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
+add_test(NAME ${TEST_TARGET} COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl)
diff --git a/examples/eval-callback/README.md b/examples/eval-callback/README.md
index 66a37e87..63a57ad6 100644
--- a/examples/eval-callback/README.md
+++ b/examples/eval-callback/README.md
@@ -6,7 +6,7 @@ It simply prints to the console all operations and tensor data.
Usage:
```shell
-eval-callback \
+llama-eval-callback \
--hf-repo ggml-org/models \
--hf-file phi-2/ggml-model-q4_0.gguf \
--model phi-2-q4_0.gguf \
diff --git a/examples/export-lora/CMakeLists.txt b/examples/export-lora/CMakeLists.txt
index cbbdaec6..1cef6e71 100644
--- a/examples/export-lora/CMakeLists.txt
+++ b/examples/export-lora/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(TARGET export-lora)
+set(TARGET llama-export-lora)
add_executable(${TARGET} export-lora.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
diff --git a/examples/export-lora/README.md b/examples/export-lora/README.md
index 0cf3e8e4..1fb17fee 100644
--- a/examples/export-lora/README.md
+++ b/examples/export-lora/README.md
@@ -3,7 +3,7 @@
Apply LORA adapters to base model and export the resulting model.
```
-usage: export-lora [options]
+usage: llama-export-lora [options]
options:
-h, --help show this help message and exit
@@ -17,7 +17,7 @@ options:
For example:
```bash
-./bin/export-lora \
+./bin/llama-export-lora \
-m open-llama-3b-v2-q8_0.gguf \
-o open-llama-3b-v2-q8_0-english2tokipona-chat.gguf \
-l lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.bin
diff --git a/examples/finetune/CMakeLists.txt b/examples/finetune/CMakeLists.txt
index 2b52d21c..64afe6dd 100644
--- a/examples/finetune/CMakeLists.txt
+++ b/examples/finetune/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(TARGET finetune)
+set(TARGET llama-finetune)
add_executable(${TARGET} finetune.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
diff --git a/examples/finetune/README.md b/examples/finetune/README.md
index 2fafd505..a6ae6498 100644
--- a/examples/finetune/README.md
+++ b/examples/finetune/README.md
@@ -7,7 +7,7 @@ Basic usage instructions:
wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/shakespeare.txt
# finetune LORA adapter
-./bin/finetune \
+./bin/llama-finetune \
--model-base open-llama-3b-v2-q8_0.gguf \
--checkpoint-in chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf \
--checkpoint-out chk-lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.gguf \
@@ -18,7 +18,7 @@ wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/s
--use-checkpointing
# predict
-./bin/main -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
+./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
```
**Only llama based models are supported!** The output files will be saved every N iterations (config with `--save-every N`).
@@ -38,14 +38,14 @@ After 10 more iterations:
Checkpoint files (`--checkpoint-in FN`, `--checkpoint-out FN`) store the training process. When the input checkpoint file does not exist, it will begin finetuning a new randomly initialized adapter.
llama.cpp compatible LORA adapters will be saved with filename specified by `--lora-out FN`.
-These LORA adapters can then be used by `main` together with the base model, like in the 'predict' example command above.
+These LORA adapters can then be used by `llama-cli` together with the base model, like in the 'predict' example command above.
-In `main` you can also load multiple LORA adapters, which will then be mixed together.
+In `llama-cli` you can also load multiple LORA adapters, which will then be mixed together.
For example if you have two LORA adapters `lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin` and `lora-open-llama-3b-v2-q8_0-bible-LATEST.bin`, you can mix them together like this:
```bash
-./bin/main -m open-llama-3b-v2-q8_0.gguf \
+./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf \
--lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin \
--lora lora-open-llama-3b-v2-q8_0-bible-LATEST.bin
```
@@ -55,7 +55,7 @@ You can change how strong each LORA adapter is applied to the base model by usin
For example to apply 40% of the 'shakespeare' LORA adapter, 80% of the 'bible' LORA adapter and 100% of yet another one:
```bash
-./bin/main -m open-llama-3b-v2-q8_0.gguf \
+./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf \
--lora-scaled lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin 0.4 \
--lora-scaled lora-open-llama-3b-v2-q8_0-bible-LATEST.bin 0.8 \
--lora lora-open-llama-3b-v2-q8_0-yet-another-one-LATEST.bin
diff --git a/examples/finetune/finetune.sh b/examples/finetune/finetune.sh
index 079bfa11..d7f2165e 100644
--- a/examples/finetune/finetune.sh
+++ b/examples/finetune/finetune.sh
@@ -2,7 +2,7 @@
cd `dirname $0`
cd ../..
-EXE="./finetune"
+EXE="./llama-finetune"
if [[ ! $LLAMA_MODEL_DIR ]]; then LLAMA_MODEL_DIR="./models"; fi
if [[ ! $LLAMA_TRAINING_DIR ]]; then LLAMA_TRAINING_DIR="."; fi
diff --git a/examples/gbnf-validator/CMakeLists.txt b/examples/gbnf-validator/CMakeLists.txt
index 166e3ad2..4edd6ec7 100644
--- a/examples/gbnf-validator/CMakeLists.txt
+++ b/examples/gbnf-validator/CMakeLists.txt
@@ -1,5 +1,5 @@
-set(TARGET gbnf-validator)
+set(TARGET llama-gbnf-validator)
add_executable(${TARGET} gbnf-validator.cpp)
install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common grammar-parser llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/gbnf-validator/gbnf-validator.cpp b/examples/gbnf-validator/gbnf-validator.cpp
index 091069ff..0406dc33 100644
--- a/examples/gbnf-validator/gbnf-validator.cpp
+++ b/examples/gbnf-validator/gbnf-validator.cpp
@@ -7,6 +7,8 @@
#include <cstdio>
#include <cstdlib>
+#include <sstream>
+#include <fstream>
#include <string>
#include <vector>
@@ -69,13 +71,14 @@ int main(int argc, char** argv) {
return 1;
}
- fseek(grammar_file, 0, SEEK_END);
- size_t grammar_size = ftell(grammar_file);
- fseek(grammar_file, 0, SEEK_SET);
-
- std::string grammar_str(grammar_size, ' ');
- fread(&grammar_str[0], 1, grammar_size, grammar_file);
- fclose(grammar_file);
+ std::string grammar_str;
+ {
+ std::ifstream grammar_file(grammar_filename);
+ GGML_ASSERT(grammar_file.is_open() && "Failed to open grammar file");
+ std::stringstream buffer;
+ buffer << grammar_file.rdbuf();
+ grammar_str = buffer.str();
+ }
// Parse the GBNF grammar
auto parsed_grammar = grammar_parser::parse(grammar_str.c_str());
@@ -100,20 +103,15 @@ int main(int argc, char** argv) {
grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
// Read the input file
- FILE* input_file = fopen(input_filename.c_str(), "r");
- if (!input_file) {
- fprintf(stdout, "Failed to open input file: %s\n", input_filename.c_str());
- return 1;
+ std::string input_str;
+ {
+ std::ifstream input_file(input_filename);
+ GGML_ASSERT(input_file.is_open() && "Failed to open input file");
+ std::stringstream buffer;
+ buffer << input_file.rdbuf();
+ input_str = buffer.str();
}
- fseek(input_file, 0, SEEK_END);
- size_t input_size = ftell(input_file);
- fseek(input_file, 0, SEEK_SET);
-
- std::string input_str(input_size, ' ');
- fread(&input_str[0], 1, input_size, input_file);
- fclose(input_file);
-
// Validate the input string against the grammar
size_t error_pos;
std::string error_msg;
diff --git a/examples/gguf-split/CMakeLists.txt b/examples/gguf-split/CMakeLists.txt
index 828e6243..f63887da 100644
--- a/examples/gguf-split/CMakeLists.txt
+++ b/examples/gguf-split/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(TARGET gguf-split)
+set(TARGET llama-gguf-split)
add_executable(${TARGET} gguf-split.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
diff --git a/examples/gguf-split/tests.sh b/examples/gguf-split/tests.sh
index 3bc0fa47..d5a92d60 100755
--- a/examples/gguf-split/tests.sh
+++ b/examples/gguf-split/tests.sh
@@ -18,8 +18,8 @@ fi
set -x
-SPLIT=$1/gguf-split
-MAIN=$1/main
+SPLIT=$1/llama-gguf-split
+MAIN=$1/llama-cli
WORK_PATH=$TMP_DIR/gguf-split
ROOT_DIR=$(realpath $(dirname $0)/../../)
diff --git a/examples/gguf/CMakeLists.txt b/examples/gguf/CMakeLists.txt
index 6481f087..a9569b41 100644
--- a/examples/gguf/CMakeLists.txt
+++ b/examples/gguf/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(TARGET gguf)
+set(TARGET llama-gguf)
add_executable(${TARGET} gguf.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT})
diff --git a/examples/gritlm/CMakeLists.txt b/examples/gritlm/CMakeLists.txt
index ac4a5ae7..86dfddca 100644
--- a/examples/gritlm/CMakeLists.txt
+++ b/examples/gritlm/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(TARGET gritlm)
+set(TARGET llama-gritlm)
add_executable(${TARGET} gritlm.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
diff --git a/examples/gritlm/README.md b/examples/gritlm/README.md
index a3a3c138..786ba573 100644
--- a/examples/gritlm/README.md
+++ b/examples/gritlm/README.md
@@ -26,7 +26,7 @@ $ scripts/hf.sh --repo cohesionet/GritLM-7B_gguf --file gritlm-7b_q4_1.gguf --ou
Run the example using the downloaded model:
```console
-$ ./gritlm -m models/gritlm-7b_q4_1.gguf
+$ ./llama-gritlm -m models/gritlm-7b_q4_1.gguf
Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "A purely peer-to-peer version of electronic cash w" is: 0.605
Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "All text-based language problems can be reduced to" is: 0.103
diff --git a/examples/imatrix/CMakeLists.txt b/examples/imatrix/CMakeLists.txt
index d688a162..d4c8265b 100644
--- a/examples/imatrix/CMakeLists.txt
+++ b/examples/imatrix/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(TARGET imatrix)
+set(TARGET llama-imatrix)
add_executable(${TARGET} imatrix.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
diff --git a/examples/imatrix/README.md b/examples/imatrix/README.md
index 866ca9f5..38b36ee5 100644
--- a/examples/imatrix/README.md
+++ b/examples/imatrix/README.md
@@ -6,7 +6,7 @@ More information is available here: https://github.com/ggerganov/llama.cpp/pull/
## Usage
```
-./imatrix \
+./llama-imatrix \
-m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \
[--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \
[--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]
@@ -28,8 +28,8 @@ For faster computation, make sure to use GPU offloading via the `-ngl` argument
LLAMA_CUDA=1 make -j
# generate importance matrix (imatrix.dat)
-./imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99
+./llama-imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99
# use the imatrix to perform a Q4_K_M quantization
-./quantize --imatrix imatrix.dat ggml-model-f16.gguf ./ggml-model-q4_k_m.gguf q4_k_m
+./llama-quantize --imatrix imatrix.dat ggml-model-f16.gguf ./ggml-model-q4_k_m.gguf q4_k_m
```
diff --git a/examples/infill/CMakeLists.txt b/examples/infill/CMakeLists.txt
index e4e8028d..9b1aa3b6 100644
--- a/examples/infill/CMakeLists.txt
+++ b/examples/infill/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(TARGET infill)
+set(TARGET llama-infill)
add_executable(${TARGET} infill.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
diff --git a/examples/infill/README.md b/examples/infill/README.md
index 6b076c83..74f42d2f 100644
--- a/examples/infill/README.md
+++ b/examples/infill/README.md
@@ -42,5 +42,5 @@ scripts/hf.sh --repo TheBloke/CodeLlama-13B-GGUF --file codellama-13b.Q5_K_S.ggu
```
```bash
-./infill -t 10 -ngl 0 -m models/codellama-13b.Q5_K_S.gguf -c 4096 --temp 0.7 --repeat_penalty 1.1 -n 20 --in-prefix "def helloworld():\n print(\"hell" --in-suffix "\n print(\"goodbye world\")\n "
+./llama-infill -t 10 -ngl 0 -m models/codellama-13b.Q5_K_S.gguf -c 4096 --temp 0.7 --repeat_penalty 1.1 -n 20 --in-prefix "def helloworld():\n print(\"hell" --in-suffix "\n print(\"goodbye world\")\n "
```
diff --git a/examples/jeopardy/jeopardy.sh b/examples/jeopardy/jeopardy.sh
index 9bdbc755..07bcb3b8 100755
--- a/examples/jeopardy/jeopardy.sh
+++ b/examples/jeopardy/jeopardy.sh
@@ -21,7 +21,7 @@ counter=1
echo 'Running'
while IFS= read -r question
do
- exe_cmd="./main -p "\"$prefix$introduction$nl$prefix$question\"" "$opts" -m ""\"$MODEL\""" >> ""\"$output_file\""
+ exe_cmd="./llama-cli -p "\"$prefix$introduction$nl$prefix$question\"" "$opts" -m ""\"$MODEL\""" >> ""\"$output_file\""
echo $counter
echo "Current Question: $question"
eval "$exe_cmd"
diff --git a/examples/json-schema-pydantic-example.py b/examples/json-schema-pydantic-example.py
index 69ebfd40..cc64e572 100644
--- a/examples/json-schema-pydantic-example.py
+++ b/examples/json-schema-pydantic-example.py
@@ -1,5 +1,5 @@
# Usage:
-#! ./server -m some-model.gguf &
+#! ./llama-server -m some-model.gguf &
#! pip install pydantic
#! python json-schema-pydantic-example.py
diff --git a/examples/json_schema_to_grammar.py b/examples/json_schema_to_grammar.py
index ab19e20d..b588497b 100755
--- a/examples/json_schema_to_grammar.py
+++ b/examples/json_schema_to_grammar.py
@@ -523,7 +523,7 @@ class SchemaConverter:
def main(args_in = None):
parser = argparse.ArgumentParser(
description='''
- Generates a grammar (suitable for use in ./main) that produces JSON conforming to a
+ Generates a grammar (suitable for use in ./llama-cli) that produces JSON conforming to a
given JSON schema. Only a subset of JSON schema features are supported; more may be
added in the future.
''',
diff --git a/examples/llama-bench/README.md b/examples/llama-bench/README.md
index fd95b35f..52b0e74d 100644
--- a/examples/llama-bench/README.md
+++ b/examples/llama-bench/README.md
@@ -1,4 +1,4 @@
-# llama.cpp/example/llama-bench
+# llama.cpp/examples/llama-bench
Performance testing tool for llama.cpp.
diff --git a/examples/llava/CMakeLists.txt b/examples/llava/CMakeLists.txt
index 2985caff..e9fa73ac 100644
--- a/examples/llava/CMakeLists.txt
+++ b/examples/llava/CMakeLists.txt
@@ -30,8 +30,9 @@ if(TARGET BUILD_INFO)
add_dependencies(llava BUILD_INFO)
endif()
-set(TARGET llava-cli)
-add_executable(llava-cli llava-cli.cpp)
-install(TARGETS llava-cli RUNTIME)
-target_link_libraries(llava-cli PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(llava PRIVATE cxx_std_11)
+set(TARGET llama-llava-cli)
+add_executable(${TARGET} llava-cli.cpp)
+set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava-cli)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/llava/MobileVLM-README.md b/examples/llava/MobileVLM-README.md
index 74f021de..05a8207e 100644
--- a/examples/llava/MobileVLM-README.md
+++ b/examples/llava/MobileVLM-README.md
@@ -9,12 +9,12 @@ The implementation is based on llava, and is compatible with llava and mobileVLM
Notice: The overall process of model inference for both **MobileVLM** and **MobileVLM_V2** models is the same, but the process of model conversion is a little different. Therefore, using **MobileVLM-1.7B** as an example, the different conversion step will be shown.
## Usage
-Build with cmake or run `make llava-cli` to build it.
+Build with cmake or run `make llama-llava-cli` to build it.
-After building, run: `./llava-cli` to see the usage. For example:
+After building, run: `./llama-llava-cli` to see the usage. For example:
```sh
-./llava-cli -m MobileVLM-1.7B/ggml-model-q4_k.gguf \
+./llama-llava-cli -m MobileVLM-1.7B/ggml-model-q4_k.gguf \
--mmproj MobileVLM-1.7B/mmproj-model-f16.gguf \
--image path/to/an/image.jpg \
-p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWho is the author of this book? Answer the question using a single word or phrase. ASSISTANT:"
@@ -62,7 +62,7 @@ python ./examples/convert-legacy-llama.py path/to/MobileVLM-1.7B
5. Use `quantize` to convert LLaMA part's DataType from `fp16` to `q4_k`
```sh
-./quantize path/to/MobileVLM-1.7B/ggml-model-f16.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s
+./llama-quantize path/to/MobileVLM-1.7B/ggml-model-f16.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s
```
Now both the LLaMA part and the image encoder is in the `MobileVLM-1.7B` directory.
@@ -82,7 +82,7 @@ refer to `android/adb_run.sh`, modify resources' `name` and `path`
### case 1
**input**
```sh
-/data/local/tmp/llava-cli \
+/data/local/tmp/llama-llava-cli \
-m /data/local/tmp/ggml-model-q4_k.gguf \
--mmproj /data/local/tmp/mmproj-model-f16.gguf \
-t 4 \
@@ -102,7 +102,7 @@ llama_print_timings: total time = 34731.93 ms
### case 2
**input**
```sh
-/data/local/tmp/llava-cli \
+/data/local/tmp/llama-llava-cli \
-m /data/local/tmp/ggml-model-q4_k.gguf \
--mmproj /data/local/tmp/mmproj-model-f16.gguf \
-t 4 \
@@ -126,7 +126,7 @@ llama_print_timings: total time = 34570.79 ms
#### llava-cli release-b2005
**input**
```sh
-/data/local/tmp/llava-cli \
+/data/local/tmp/llama-llava-cli \
-m /data/local/tmp/ggml-model-q4_k.gguf \
--mmproj /data/local/tmp/mmproj-model-f16.gguf \
-t 4 \
@@ -200,7 +200,7 @@ make LLAMA_CUDA=1 CUDA_DOCKER_ARCH=sm_87 LLAMA_CUDA_F16=1 -j 32
### case 1
**input**
```sh
-./llava-cli \
+./llama-llava-cli \
-m /data/local/tmp/ggml-model-q4_k.gguf \
--mmproj /data/local/tmp/mmproj-model-f16.gguf \
--image /data/local/tmp/demo.jpeg \
@@ -224,7 +224,7 @@ llama_print_timings: total time = 1352.63 ms / 252 tokens
### case 2
**input**
```sh
-./llava-cli \
+./llama-llava-cli \
-m /data/local/tmp/ggml-model-q4_k.gguf \
--mmproj /data/local/tmp/mmproj-model-f16.gguf \
-p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:" \
diff --git a/examples/llava/README.md b/examples/llava/README.md
index 8d1ae527..f4554de6 100644
--- a/examples/llava/README.md
+++ b/examples/llava/README.md
@@ -11,12 +11,12 @@ For llava-1.6 a variety of prepared gguf models are available as well [7b-34b](h
After API is confirmed, more models will be supported / uploaded.
## Usage
-Build with cmake or run `make llava-cli` to build it.
+Build with cmake or run `make llama-llava-cli` to build it.
-After building, run: `./llava-cli` to see the usage. For example:
+After building, run: `./llama-llava-cli` to see the usage. For example:
```sh
-./llava-cli -m ../llava-v1.5-7b/ggml-model-f16.gguf --mmproj ../llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg
+./llama-llava-cli -m ../llava-v1.5-7b/ggml-model-f16.gguf --mmproj ../llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg
```
**note**: A lower temperature like 0.1 is recommended for better quality. add `--temp 0.1` to the command to do so.
@@ -95,9 +95,9 @@ python ./examples/llava/convert-image-encoder-to-gguf.py -m vit --llava-projecto
python ./examples/convert-legacy-llama.py ../llava-v1.6-vicuna-7b/ --skip-unknown
```
-7) And finally we can run the llava-cli using the 1.6 model version:
+7) And finally we can run the llava cli using the 1.6 model version:
```console
-./llava-cli -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf --image some-image.jpg -c 4096
+./llama-llava-cli -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf --image some-image.jpg -c 4096
```
**note** llava-1.6 needs more context than llava-1.5, at least 3000 is needed (just run it at -c 4096)
diff --git a/examples/llava/android/adb_run.sh b/examples/llava/android/adb_run.sh
index f73623ae..45ccf8d7 100755
--- a/examples/llava/android/adb_run.sh
+++ b/examples/llava/android/adb_run.sh
@@ -10,7 +10,7 @@ prompt="A chat between a curious user and an artificial intelligence assistant.
# prompt="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:"
program_dir="build_64/bin"
-binName="llava-cli"
+binName="llama-llava-cli"
n_threads=4
diff --git a/examples/lookahead/CMakeLists.txt b/examples/lookahead/CMakeLists.txt
index 8827e3f1..f0ae5cd8 100644
--- a/examples/lookahead/CMakeLists.txt
+++ b/examples/lookahead/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(TARGET lookahead)
+set(TARGET llama-lookahead)
add_executable(${TARGET} lookahead.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
diff --git a/examples/lookup/CMakeLists.txt b/examples/lookup/CMakeLists.txt
index b91633f6..ef19fe25 100644
--- a/examples/lookup/CMakeLists.txt
+++ b/examples/lookup/CMakeLists.txt
@@ -1,22 +1,22 @@
-set(TARGET lookup)
+set(TARGET llama-lookup)
add_executable(${TARGET} lookup.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
-set(TARGET lookup-create)
+set(TARGET llama-lookup-create)
add_executable(${TARGET} lookup-create.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
-set(TARGET lookup-merge)
+set(TARGET llama-lookup-merge)
add_executable(${TARGET} lookup-merge.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
-set(TARGET lookup-stats)
+set(TARGET llama-lookup-stats)
add_executable(${TARGET} lookup-stats.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
diff --git a/examples/lookup/lookup-merge.cpp b/examples/lookup/lookup-merge.cpp
index 07c93eb8..81e2b043 100644
--- a/examples/lookup/lookup-merge.cpp
+++ b/examples/lookup/lookup-merge.cpp
@@ -11,14 +11,14 @@
#include <unordered_map>
#include <vector>
-static void print_usage() {
+static void print_usage(char* argv0) {
fprintf(stderr, "Merges multiple lookup cache files into a single one.\n");
- fprintf(stderr, "Usage: lookup-merge [--help] lookup_part_1.bin lookup_part_2.bin ... lookup_merged.bin\n");
+ fprintf(stderr, "Usage: %s [--help] lookup_part_1.bin lookup_part_2.bin ... lookup_merged.bin\n", argv0);
}
int main(int argc, char ** argv){
if (argc < 3) {
- print_usage();
+ print_usage(argv[0]);
exit(1);
}
@@ -27,7 +27,7 @@ int main(int argc, char ** argv){
for (int i = 0; i < argc-1; ++i) {
args[i] = argv[i+1];
if (args[i] == "-h" || args[i] == "--help") {
- print_usage();
+ print_usage(argv[0]);
exit(0);
}
}
diff --git a/examples/main-cmake-pkg/CMakeLists.txt b/examples/main-cmake-pkg/CMakeLists.txt
index deb77d58..a97ded36 100644
--- a/examples/main-cmake-pkg/CMakeLists.txt
+++ b/examples/main-cmake-pkg/CMakeLists.txt
@@ -1,12 +1,12 @@
cmake_minimum_required(VERSION 3.12)
-project("main-cmake-pkg" C CXX)
-set(TARGET main-cmake-pkg)
+project("llama-cli-cmake-pkg" C CXX)
+set(TARGET llama-cli-cmake-pkg)
find_package(Llama 0.0.1 REQUIRED)
# Bake common functionality in with target. Because applications
# using the relocatable Llama package should be outside of the
-# source tree, main-cmake-pkg pretends the dependencies are built-in.
+# source tree, llama-cli-cmake-pkg pretends the dependencies are built-in.
set(_common_path "${CMAKE_CURRENT_LIST_DIR}/../../common")
add_library(common OBJECT)
file(GLOB _common_files
@@ -15,7 +15,7 @@ file(GLOB _common_files
)
target_sources(common PRIVATE ${_common_files})
-# If the common project was part of "main-cmake-pkg" the transient
+# If the common project was part of "llama-cli-cmake-pkg" the transient
# defines would automatically be attached. Because the common func-
# tionality is separate, but dependent upon the defines, it must be
# explicitly extracted from the "llama" target.
diff --git a/examples/main-cmake-pkg/README.md b/examples/main-cmake-pkg/README.md
index a88e92f2..08d83dd0 100644
--- a/examples/main-cmake-pkg/README.md
+++ b/examples/main-cmake-pkg/README.md
@@ -1,6 +1,6 @@
# llama.cpp/example/main-cmake-pkg
-This program builds the [main](../main) application using a relocatable CMake package. It serves as an example of using the `find_package()` CMake command to conveniently include [llama.cpp](https://github.com/ggerganov/llama.cpp) in projects which live outside of the source tree.
+This program builds [llama-cli](../main) using a relocatable CMake package. It serves as an example of using the `find_package()` CMake command to conveniently include [llama.cpp](https://github.com/ggerganov/llama.cpp) in projects which live outside of the source tree.
## Building
@@ -20,7 +20,7 @@ cmake --build build --config Release
cmake --install build --prefix C:/LlamaCPP
```
-### Build main-cmake-pkg
+### Build llama-cli-cmake-pkg
```cmd
diff --git a/examples/main/CMakeLists.txt b/examples/main/CMakeLists.txt
index d532980b..5f6efaa9 100644
--- a/examples/main/CMakeLists.txt
+++ b/examples/main/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(TARGET main)
+set(TARGET llama-cli)
add_executable(${TARGET} main.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
diff --git a/examples/main/README.md b/examples/main/README.md
index cdc002f1..61e4a42f 100644
--- a/examples/main/README.md
+++ b/examples/main/README.md
@@ -1,4 +1,4 @@
-# llama.cpp/example/main
+# llama.cpp/examples/main
This example program allows you to use various LLaMA language models in an easy and efficient way. It is specifically designed to work with the [llama.cpp](https://github.com/ggerganov/llama.cpp) project, which provides a plain C/C++ implementation with optional 4-bit quantization support for faster, lower memory inference, and is optimized for desktop CPUs. This program can be used to perform various inference tasks with LLaMA models, including generating text based on user-provided prompts and chat-like interactions with reverse prompts.
@@ -20,13 +20,13 @@ To get started right away, run the following command, making sure to use the cor
#### Unix-based systems (Linux, macOS, etc.):
```bash
-./main -m models/7B/ggml-model.bin --prompt "Once upon a time"
+./llama-cli -m models/7B/ggml-model.bin --prompt "Once upon a time"
```
#### Windows:
```powershell
-main.exe -m models\7B\ggml-model.bin --prompt "Once upon a time"
+llama-cli.exe -m models\7B\ggml-model.bin --prompt "Once upon a time"
```
For an interactive experience, try this command:
@@ -34,7 +34,7 @@ For an interactive experience, try this command:
#### Unix-based systems (Linux, macOS, etc.):
```bash
-./main -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -i -p \
+./llama-cli -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -i -p \
'User: Hi
AI: Hello. I am an AI chatbot. Would you like to talk?
User: Sure!
@@ -45,7 +45,7 @@ User:'
#### Windows:
```powershell
-main.exe -m models\7B\ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -i -e -p "User: Hi\nAI: Hello. I am an AI chatbot. Would you like to talk?\nUser: Sure!\nAI: What would you like to talk about?\nUser:"
+llama-cli.exe -m models\7B\ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -i -e -p "User: Hi\nAI: Hello. I am an AI chatbot. Would you like to talk?\nUser: Sure!\nAI: What would you like to talk about?\nUser:"
```
The following command generates "infinite" text from a starting prompt (you can use `Ctrl-C` to stop it):
@@ -53,18 +53,18 @@ The following command generates "infinite" text from a starting prompt (you can
#### Unix-based systems (Linux, macOS, etc.):
```bash
-./main -m models/7B/ggml-model.bin --ignore-eos -n -1
+./llama-cli -m models/7B/ggml-model.bin --ignore-eos -n -1
```
#### Windows:
```powershell
-main.exe -m models\7B\ggml-model.bin --ignore-eos -n -1
+llama-cli.exe -m models\7B\ggml-model.bin --ignore-eos -n -1
```
## Common Options
-In this section, we cover the most commonly used options for running the `main` program with the LLaMA models:
+In this section, we cover the most commonly used options for running the `llama-cli` program with the LLaMA models:
- `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`; inferred from `--model-url` if set).
- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (e.g https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf).
@@ -74,7 +74,7 @@ In this section, we cover the most commonly used options for running the `main`
## Input Prompts
-The `main` program provides several ways to interact with the LLaMA models using input prompts:
+The `llama-cli` program provides several ways to interact with the LLaMA models using input prompts:
- `--prompt PROMPT`: Provide a prompt directly as a command-line option.
- `--file FNAME`: Provide a file containing a prompt or multiple prompts.
@@ -82,7 +82,7 @@ The `main` program provides several ways to interact with the LLaMA models using
## Interaction
-The `main` program offers a seamless way to interact with LLaMA models, allowing users to engage in real-time conversations or provide instructions for specific tasks. The interactive mode can be triggered using various options, including `--interactive` and `--interactive-first`.
+The `llama-cli` program offers a seamless way to interact with LLaMA models, allowing users to engage in real-time conversations or provide instructions for specific tasks. The interactive mode can be triggered using various options, including `--interactive` and `--interactive-first`.
In interactive mode, users can participate in text generation by injecting their input during the process. Users can press `Ctrl+C` at any time to interject and type their input, followed by pressing `Return` to submit it to the LLaMA model. To submit additional lines without finalizing input, users can end the current line with a backslash (`\`) and continue typing.
@@ -107,7 +107,7 @@ To overcome this limitation, you can use the `--in-prefix` flag to add a space o
The `--in-prefix` flag is used to add a prefix to your input, primarily, this is used to insert a space after the reverse prompt. Here's an example of how to use the `--in-prefix` flag in conjunction with the `--reverse-prompt` flag:
```sh
-./main -r "User:" --in-prefix " "
+./llama-cli -r "User:" --in-prefix " "
```
### In-Suffix
@@ -115,7 +115,7 @@ The `--in-prefix` flag is used to add a prefix to your input, primarily, this is
The `--in-suffix` flag is used to add a suffix after your input. This is useful for adding an "Assistant:" prompt after the user's input. It's added after the new-line character (`\n`) that's automatically added to the end of the user's input. Here's an example of how to use the `--in-suffix` flag in conjunction with the `--reverse-prompt` flag:
```sh
-./main -r "User:" --in-prefix " " --in-suffix "Assistant:"
+./llama-cli -r "User:" --in-prefix " " --in-suffix "Assistant:"
```
## Context Management
diff --git a/examples/parallel/CMakeLists.txt b/examples/parallel/CMakeLists.txt
index 319535a6..c13557ba 100644
--- a/examples/parallel/CMakeLists.txt
+++ b/examples/parallel/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(TARGET parallel)
+set(TARGET llama-parallel)
add_executable(${TARGET} parallel.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
diff --git a/examples/passkey/CMakeLists.txt b/examples/passkey/CMakeLists.txt
index 3161bf3e..dc467a5d 100644
--- a/examples/passkey/CMakeLists.txt
+++ b/examples/passkey/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(TARGET passkey)
+set(TARGET llama-passkey)
add_executable(${TARGET} passkey.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
diff --git a/examples/passkey/README.md b/examples/passkey/README.md
index 9e7a119b..a48a6283 100644
--- a/examples/passkey/README.md
+++ b/examples/passkey/README.md
@@ -8,5 +8,5 @@ See the following PRs for more info:
### Usage
```bash
-make -j && ./passkey -m ./models/llama-7b-v2/ggml-model-f16.gguf --junk 250
+make -j && ./llama-passkey -m ./models/llama-7b-v2/ggml-model-f16.gguf --junk 250
```
diff --git a/examples/perplexity/CMakeLists.txt b/examples/perplexity/CMakeLists.txt
index 3c76d322..be0f2fd0 100644
--- a/examples/perplexity/CMakeLists.txt
+++ b/examples/perplexity/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(TARGET perplexity)
+set(TARGET llama-perplexity)
add_executable(${TARGET} perplexity.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index 0bd78c21..efde8dfd 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -476,7 +476,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
}
// Download: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
- // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
+ // Run `./llama-perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
// Output: `perplexity: 13.5106 [114/114]`
// BOS tokens will be added for each chunk before eval
diff --git a/examples/quantize-stats/CMakeLists.txt b/examples/quantize-stats/CMakeLists.txt
index e31cf5e3..bb986a71 100644
--- a/examples/quantize-stats/CMakeLists.txt
+++ b/examples/quantize-stats/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(TARGET quantize-stats)
+set(TARGET llama-quantize-stats)
add_executable(${TARGET} quantize-stats.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
diff --git a/examples/quantize/CMakeLists.txt b/examples/quantize/CMakeLists.txt
index 6b977fde..3ee4eb97 100644
--- a/examples/quantize/CMakeLists.txt
+++ b/examples/quantize/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(TARGET quantize)
+set(TARGET llama-quantize)
add_executable(${TARGET} quantize.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
diff --git a/examples/quantize/tests.sh b/examples/quantize/tests.sh
index 38e28ffc..24bc970e 100644
--- a/examples/quantize/tests.sh
+++ b/examples/quantize/tests.sh
@@ -18,9 +18,9 @@ fi
set -x
-SPLIT=$1/gguf-split
-QUANTIZE=$1/quantize
-MAIN=$1/main
+SPLIT=$1/llama-gguf-split
+QUANTIZE=$1/llama-quantize
+MAIN=$1/llama-cli
WORK_PATH=$TMP_DIR/quantize
ROOT_DIR=$(realpath $(dirname $0)/../../)
diff --git a/examples/reason-act.sh b/examples/reason-act.sh
index 046c48db..06d59279 100755
--- a/examples/reason-act.sh
+++ b/examples/reason-act.sh
@@ -8,7 +8,7 @@ if [ "$1" == "-m" ]; then
MODEL="-m $2 "
fi
-./main $MODEL --color \
+./llama-cli $MODEL --color \
-f ./prompts/reason-act.txt \
-i --interactive-first \
--top_k 10000 --temp 0.2 --repeat_penalty 1 -t 7 -c 2048 \
diff --git a/examples/retrieval/CMakeLists.txt b/examples/retrieval/CMakeLists.txt
index eaabae08..66610f31 100644
--- a/examples/retrieval/CMakeLists.txt
+++ b/examples/retrieval/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(TARGET retrieval)
+set(TARGET llama-retrieval)
add_executable(${TARGET} retrieval.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
diff --git a/examples/retrieval/README.md b/examples/retrieval/README.md
index 2b2595c4..bc5f22e2 100644
--- a/examples/retrieval/README.md
+++ b/examples/retrieval/README.md
@@ -15,7 +15,7 @@ https://github.com/ggerganov/llama.cpp/pull/6193
`retrieval` example can be tested as follows:
```bash
-make -j && ./retrieval --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .
+make -j && ./llama-retrieval --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .
```
This chunks and embeds all given files and starts a loop requesting query inputs:
diff --git a/examples/rpc/README.md b/examples/rpc/README.md
index eeec71a8..86544e3f 100644
--- a/examples/rpc/README.md
+++ b/examples/rpc/README.md
@@ -70,5 +70,5 @@ cmake --build . --config Release
Finally, use the `--rpc` option to specify the host and port of each `rpc-server`:
```bash
-$ bin/main -m ../models/tinyllama-1b/ggml-model-f16.gguf -p "Hello, my name is" --repeat-penalty 1.0 -n 64 --rpc 192.168.88.10:50052,192.168.88.11:50052 -ngl 99
+$ bin/llama-cli -m ../models/tinyllama-1b/ggml-model-f16.gguf -p "Hello, my name is" --repeat-penalty 1.0 -n 64 --rpc 192.168.88.10:50052,192.168.88.11:50052 -ngl 99
```
diff --git a/examples/save-load-state/CMakeLists.txt b/examples/save-load-state/CMakeLists.txt
index cc6ed855..0fb5e359 100644
--- a/examples/save-load-state/CMakeLists.txt
+++ b/examples/save-load-state/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(TARGET save-load-state)
+set(TARGET llama-save-load-state)
add_executable(${TARGET} save-load-state.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
diff --git a/examples/server-llama2-13B.sh b/examples/server-llama2-13B.sh
index 17fedc2b..4ce79b7f 100755
--- a/examples/server-llama2-13B.sh
+++ b/examples/server-llama2-13B.sh
@@ -16,7 +16,7 @@ GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 4096 --batch-size 1024}"
# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
-./server $GEN_OPTIONS \
+./llama-server $GEN_OPTIONS \
--model "$MODEL" \
--threads "$N_THREAD" \
--rope-freq-scale 1.0 \
diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt
index dab70961..8365f951 100644
--- a/examples/server/CMakeLists.txt
+++ b/examples/server/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(TARGET server)
+set(TARGET llama-server)
option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
option(LLAMA_SERVER_SSL "Build SSL support for the server" OFF)
include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
diff --git a/examples/server/README.md b/examples/server/README.md
index ccbdcdbd..e7fb0bf6 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -80,26 +80,26 @@ The project is under active development, and we are [looking for feedback and co
## Build
-`server` is built alongside everything else from the root of the project
+`llama-server` is built alongside everything else from the root of the project
- Using `make`:
```bash
- make server
+ make llama-server
```
- Using `CMake`:
```bash
cmake -B build
- cmake --build build --config Release -t server
+ cmake --build build --config Release -t llama-server
```
- Binary is at `./build/bin/server`
+ Binary is at `./build/bin/llama-server`
## Build with SSL
-`server` can also be built with SSL support using OpenSSL 3
+`llama-server` can also be built with SSL support using OpenSSL 3
- Using `make`:
@@ -107,14 +107,14 @@ The project is under active development, and we are [looking for feedback and co
# NOTE: For non-system openssl, use the following:
# CXXFLAGS="-I /path/to/openssl/include"
# LDFLAGS="-L /path/to/openssl/lib"
- make LLAMA_SERVER_SSL=true server
+ make LLAMA_SERVER_SSL=true llama-server
```
- Using `CMake`:
```bash
cmake -B build -DLLAMA_SERVER_SSL=ON
- cmake --build build --config Release -t server
+ cmake --build build --config Release -t llama-server
```
## Quick Start
@@ -124,13 +124,13 @@ To get started right away, run the following command, making sure to use the cor
### Unix-based systems (Linux, macOS, etc.)
```bash
-./server -m models/7B/ggml-model.gguf -c 2048
+./llama-server -m models/7B/ggml-model.gguf -c 2048
```
### Windows
```powershell
-server.exe -m models\7B\ggml-model.gguf -c 2048
+llama-server.exe -m models\7B\ggml-model.gguf -c 2048
```
The above command will start a server that by default listens on `127.0.0.1:8080`.
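Once the renamed binary is listening on that default address, a minimal smoke test might look like the sketch below; the prompt and the `n_predict` value are arbitrary examples.
```bash
# Hit the default listen address started above; prompt and token budget are illustrative.
curl --request POST --url http://127.0.0.1:8080/completion \
    --header "Content-Type: application/json" \
    --data '{"prompt": "Building a website can be done in 10 simple steps:", "n_predict": 64}'
```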
@@ -629,11 +629,11 @@ bash chat.sh
### OAI-like API
-The HTTP `server` supports an OAI-like API: https://github.com/openai/openai-openapi
+The HTTP `llama-server` supports an OAI-like API: https://github.com/openai/openai-openapi
### API errors
-`server` returns errors in the same format as OAI: https://github.com/openai/openai-openapi
+`llama-server` returns errors in the same format as OAI: https://github.com/openai/openai-openapi
Example of an error:
diff --git a/examples/server/bench/README.md b/examples/server/bench/README.md
index 23a3ec97..0f18ca39 100644
--- a/examples/server/bench/README.md
+++ b/examples/server/bench/README.md
@@ -99,7 +99,7 @@ The `bench.py` script does several steps:
It aims to be used in the CI, but you can run it manually:
```shell
-LLAMA_SERVER_BIN_PATH=../../../cmake-build-release/bin/server python bench.py \
+LLAMA_SERVER_BIN_PATH=../../../cmake-build-release/bin/llama-server python bench.py \
--runner-label local \
--name local \
--branch `git rev-parse --abbrev-ref HEAD` \
diff --git a/examples/server/bench/bench.py b/examples/server/bench/bench.py
index 86c5de10..4fbbb203 100644
--- a/examples/server/bench/bench.py
+++ b/examples/server/bench/bench.py
@@ -245,7 +245,7 @@ def start_server(args):
def start_server_background(args):
# Start the server
- server_path = '../../../build/bin/server'
+ server_path = '../../../build/bin/llama-server'
if 'LLAMA_SERVER_BIN_PATH' in os.environ:
server_path = os.environ['LLAMA_SERVER_BIN_PATH']
server_args = [
diff --git a/examples/server/public_simplechat/readme.md b/examples/server/public_simplechat/readme.md
index 36a46885..2dc17782 100644
--- a/examples/server/public_simplechat/readme.md
+++ b/examples/server/public_simplechat/readme.md
@@ -44,12 +44,12 @@ http module.
### running using examples/server
-bin/server -m path/model.gguf --path ../examples/server/public_simplechat [--port PORT]
+./llama-server -m path/model.gguf --path examples/server/public_simplechat [--port PORT]
### running using python3's server module
first run examples/server
-* bin/server -m path/model.gguf
+* ./llama-server -m path/model.gguf
next run this web front end in examples/server/public_simplechat
* cd ../examples/server/public_simplechat
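For the python3 route mentioned above, the standard library's `http.server` module is sufficient; the port below is an arbitrary example.
```bash
# From examples/server/public_simplechat (see the cd step above); any free port works.
python3 -m http.server 8081
```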
diff --git a/examples/server/tests/README.md b/examples/server/tests/README.md
index 83c0208f..5e6cb277 100644
--- a/examples/server/tests/README.md
+++ b/examples/server/tests/README.md
@@ -27,10 +27,8 @@ To mitigate it, you can increase values in `n_predict`, `kv_size`.
```shell
cd ../../..
-mkdir build
-cd build
-cmake -DLLAMA_CURL=ON ../
-cmake --build . --target server
+cmake -B build -DLLAMA_CURL=ON
+cmake --build build --target llama-server
```
2. Start the test: `./tests.sh`
@@ -40,7 +38,7 @@ It's possible to override some scenario steps values with environment variables:
| variable | description |
|--------------------------|------------------------------------------------------------------------------------------------|
| `PORT` | `context.server_port` to set the listening port of the server during scenario, default: `8080` |
-| `LLAMA_SERVER_BIN_PATH` | to change the server binary path, default: `../../../build/bin/server` |
+| `LLAMA_SERVER_BIN_PATH` | to change the server binary path, default: `../../../build/bin/llama-server` |
| `DEBUG` | "ON" to enable steps and server verbose mode `--verbose` |
| `SERVER_LOG_FORMAT_JSON` | if set switch server logs to json format |
| `N_GPU_LAYERS` | number of model layers to offload to VRAM `-ngl --n-gpu-layers` |
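Combining the variables above with the renamed binary, an override-heavy invocation of the test suite might look like this sketch; the port and layer count are arbitrary values, not defaults.
```bash
# All variables come from the table above; values are illustrative.
PORT=8082 DEBUG=ON N_GPU_LAYERS=99 \
LLAMA_SERVER_BIN_PATH=../../../build/bin/llama-server \
./tests.sh
```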
diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py
index 26d9359d..7b5dabb0 100644
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@@ -1272,9 +1272,9 @@ def context_text(context):
def start_server_background(context):
if os.name == 'nt':
- context.server_path = '../../../build/bin/Release/server.exe'
+ context.server_path = '../../../build/bin/Release/llama-server.exe'
else:
- context.server_path = '../../../build/bin/server'
+ context.server_path = '../../../build/bin/llama-server'
if 'LLAMA_SERVER_BIN_PATH' in os.environ:
context.server_path = os.environ['LLAMA_SERVER_BIN_PATH']
server_listen_addr = context.server_fqdn
diff --git a/examples/simple/CMakeLists.txt b/examples/simple/CMakeLists.txt
index 7da5ff6f..070cfbe7 100644
--- a/examples/simple/CMakeLists.txt
+++ b/examples/simple/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(TARGET simple)
+set(TARGET llama-simple)
add_executable(${TARGET} simple.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
diff --git a/examples/speculative/CMakeLists.txt b/examples/speculative/CMakeLists.txt
index 810f3c46..aa208e7a 100644
--- a/examples/speculative/CMakeLists.txt
+++ b/examples/speculative/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(TARGET speculative)
+set(TARGET llama-speculative)
add_executable(${TARGET} speculative.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
diff --git a/examples/sycl/CMakeLists.txt b/examples/sycl/CMakeLists.txt
index 69cf8932..e4d5083e 100644
--- a/examples/sycl/CMakeLists.txt
+++ b/examples/sycl/CMakeLists.txt
@@ -2,7 +2,7 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: MIT
-set(TARGET ls-sycl-device)
+set(TARGET llama-ls-sycl-device)
add_executable(${TARGET} ls-sycl-device.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
diff --git a/examples/sycl/README.md b/examples/sycl/README.md
index c589c2d3..0e3acd35 100644
--- a/examples/sycl/README.md
+++ b/examples/sycl/README.md
@@ -6,9 +6,9 @@ This example program provides the tools for llama.cpp for SYCL on Intel GPU.
|Tool Name| Function|Status|
|-|-|-|
-|ls-sycl-device| List all SYCL devices with ID, compute capability, max work group size, ect.|Support|
+|llama-ls-sycl-device| List all SYCL devices with ID, compute capability, max work group size, etc.|Support|
-### ls-sycl-device
+### llama-ls-sycl-device
List all SYCL devices with ID, compute capability, max work group size, etc.
@@ -23,7 +23,7 @@ source /opt/intel/oneapi/setvars.sh
3. Execute
```
-./build/bin/ls-sycl-device
+./build/bin/llama-ls-sycl-device
```
Check the ID in the startup log, like:
diff --git a/examples/sycl/run-llama2.sh b/examples/sycl/run-llama2.sh
index 7b39a18c..da0e4aab 100755
--- a/examples/sycl/run-llama2.sh
+++ b/examples/sycl/run-llama2.sh
@@ -23,15 +23,15 @@ fi
if [ $GGML_SYCL_SINGLE_GPU -eq 1 ]; then
echo "use $GGML_SYCL_DEVICE as main GPU"
#use single GPU only
- ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
+ ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
else
#use multiple GPUs with same max compute units
- ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
+ ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
fi
#use main GPU only
-#ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
+#ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
#use multiple GPUs with same max compute units
-#ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
+#ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
diff --git a/examples/tokenize/CMakeLists.txt b/examples/tokenize/CMakeLists.txt
index 5e6654d7..b704dcae 100644
--- a/examples/tokenize/CMakeLists.txt
+++ b/examples/tokenize/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(TARGET tokenize)
+set(TARGET llama-tokenize)
add_executable(${TARGET} tokenize.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
diff --git a/examples/train-text-from-scratch/CMakeLists.txt b/examples/train-text-from-scratch/CMakeLists.txt
index 4459516d..9a1d2a35 100644
--- a/examples/train-text-from-scratch/CMakeLists.txt
+++ b/examples/train-text-from-scratch/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(TARGET train-text-from-scratch)
+set(TARGET llama-train-text-from-scratch)
add_executable(${TARGET} train-text-from-scratch.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
diff --git a/examples/train-text-from-scratch/README.md b/examples/train-text-from-scratch/README.md
index 1b345406..3abae238 100644
--- a/examples/train-text-from-scratch/README.md
+++ b/examples/train-text-from-scratch/README.md
@@ -7,7 +7,7 @@ Basic usage instructions:
wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/shakespeare.txt
# train
-./bin/train-text-from-scratch \
+./bin/llama-train-text-from-scratch \
--vocab-model ../models/ggml-vocab-llama.gguf \
--ctx 64 --embd 256 --head 8 --layer 16 \
--checkpoint-in chk-shakespeare-256x16-LATEST.gguf \
@@ -18,7 +18,7 @@ wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/s
--no-checkpointing
# predict
-./bin/main -m ggml-shakespeare-256x16-f32.gguf
+./bin/llama-cli -m ggml-shakespeare-256x16-f32.gguf
```
Output files will be saved every N iterations (config with `--save-every N`).