From f482bb2e4920e544651fb832f2e0bcb4d2ff69ab Mon Sep 17 00:00:00 2001
From: Pierrick Hymbert <pierrick.hymbert@gmail.com>
Date: Sat, 23 Mar 2024 18:07:00 +0100
Subject: common: llama_load_model_from_url split support  (#6192)

* llama: llama_split_prefix fix strncpy does not include string termination
common: llama_load_model_from_url:
 - fix header name case sensitive
 - support downloading additional split in parallel
 - hide password in url

* common: EOL EOF

* common: remove redundant LLAMA_CURL_MAX_PATH_LENGTH definition

* common: change max url max length

* common: minor comment

* server: support HF URL options

* llama: llama_model_loader fix log

* common: use a constant for max url length

* common: clean up curl if file cannot be loaded in gguf

* server: tests: add split tests, and HF options params

* common: move llama_download_hide_password_in_url inside llama_download_file as a lambda

* server: tests: enable back Release test on PR

* spacing

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* spacing

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* spacing

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---
 examples/gguf-split/gguf-split.cpp              |  4 ----
 examples/server/README.md                       |  4 +++-
 examples/server/server.cpp                      | 18 +++++++++++++++++-
 examples/server/tests/features/parallel.feature |  3 ++-
 examples/server/tests/features/server.feature   |  4 ++--
 examples/server/tests/features/steps/steps.py   | 13 +++++++++----
 6 files changed, 33 insertions(+), 13 deletions(-)

(limited to 'examples')

diff --git a/examples/gguf-split/gguf-split.cpp b/examples/gguf-split/gguf-split.cpp
index f703588e..b1af5999 100644
--- a/examples/gguf-split/gguf-split.cpp
+++ b/examples/gguf-split/gguf-split.cpp
@@ -26,10 +26,6 @@ enum split_operation : uint8_t {
     SPLIT_OP_MERGE,
 };
 
-static const char * const LLM_KV_SPLIT_NO            = "split.no";
-static const char * const LLM_KV_SPLIT_COUNT         = "split.count";
-static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
-
 struct split_params {
     split_operation operation = SPLIT_OP_SPLIT;
     int n_split_tensors = 128;
diff --git a/examples/server/README.md b/examples/server/README.md
index 8aa4eac9..dfea2b90 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -20,7 +20,9 @@ The project is under active development, and we are [looking for feedback and co
 - `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation. Not used if model layers are offloaded to GPU.
 - `--threads-http N`: number of threads in the http server pool to process requests (default: `max(std::thread::hardware_concurrency() - 1, --parallel N + 2)`)
 - `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
-- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (e.g https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf).
+- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (default: unused).
+- `-hfr REPO, --hf-repo REPO`: Hugging Face model repository (default: unused).
+- `-hff FILE, --hf-file FILE`: Hugging Face model file (default: unused).
 - `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
 - `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096.
 - `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 27bd2dd7..b02c2546 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -2208,7 +2208,11 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
     printf("  -m FNAME, --model FNAME\n");
     printf("                            model path (default: %s)\n", params.model.c_str());
     printf("  -mu MODEL_URL, --model-url MODEL_URL\n");
-    printf("                            model download url (default: %s)\n", params.model_url.c_str());
+    printf("                            model download url (default: unused)\n");
+    printf("  -hfr REPO, --hf-repo REPO\n");
+    printf("                            Hugging Face model repository (default: unused)\n");
+    printf("  -hff FILE, --hf-file FILE\n");
+    printf("                            Hugging Face model file (default: unused)\n");
     printf("  -a ALIAS, --alias ALIAS\n");
     printf("                            set an alias for the model, will be added as `model` field in completion response\n");
     printf("  --lora FNAME              apply LoRA adapter (implies --no-mmap)\n");
@@ -2337,6 +2341,18 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
                 break;
             }
             params.model_url = argv[i];
+        } else if (arg == "-hfr" || arg == "--hf-repo") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.hf_repo = argv[i];
+        } else if (arg == "-hff" || arg == "--hf-file") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.hf_file = argv[i];
         } else if (arg == "-a" || arg == "--alias") {
             if (++i >= argc) {
                 invalid_param = true;
diff --git a/examples/server/tests/features/parallel.feature b/examples/server/tests/features/parallel.feature
index a66fed62..6cd306a2 100644
--- a/examples/server/tests/features/parallel.feature
+++ b/examples/server/tests/features/parallel.feature
@@ -4,7 +4,8 @@ Feature: Parallel
 
   Background: Server startup
     Given a server listening on localhost:8080
-    And   a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
+    And   a model file tinyllamas/split/stories15M-00001-of-00003.gguf from HF repo ggml-org/models
+    And   a model file test-model-00001-of-00003.gguf
     And   42 as server seed
     And   128 as batch size
     And   256 KV cache size
diff --git a/examples/server/tests/features/server.feature b/examples/server/tests/features/server.feature
index a2e0e5b3..646a4e49 100644
--- a/examples/server/tests/features/server.feature
+++ b/examples/server/tests/features/server.feature
@@ -4,8 +4,8 @@ Feature: llama.cpp server
 
   Background: Server startup
     Given a server listening on localhost:8080
-    And   a model url https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories260K.gguf
-    And   a model file stories260K.gguf
+    And   a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
+    And   a model file test-model.gguf
     And   a model alias tinyllama-2
     And   42 as server seed
       # KV Cache corresponds to the total amount of tokens
diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py
index 03f55f65..86c3339d 100644
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@@ -16,7 +16,6 @@ import numpy as np
 import openai
 from behave import step
 from behave.api.async_step import async_run_until_complete
-from huggingface_hub import hf_hub_download
 from prometheus_client import parser
 
 
@@ -39,6 +38,8 @@ def step_server_config(context, server_fqdn, server_port):
 
     context.model_alias = None
     context.model_file = None
+    context.model_hf_repo = None
+    context.model_hf_file = None
     context.model_url = None
     context.n_batch = None
     context.n_ubatch = None
@@ -68,9 +69,9 @@ def step_server_config(context, server_fqdn, server_port):
 
 @step('a model file {hf_file} from HF repo {hf_repo}')
 def step_download_hf_model(context, hf_file, hf_repo):
-    context.model_file = hf_hub_download(repo_id=hf_repo, filename=hf_file)
-    if context.debug:
-        print(f"model file: {context.model_file}")
+    context.model_hf_repo = hf_repo
+    context.model_hf_file = hf_file
+    context.model_file = os.path.basename(hf_file)
 
 
 @step('a model file {model_file}')
@@ -1079,6 +1080,10 @@ def start_server_background(context):
         server_args.extend(['--model', context.model_file])
     if context.model_url:
         server_args.extend(['--model-url', context.model_url])
+    if context.model_hf_repo:
+        server_args.extend(['--hf-repo', context.model_hf_repo])
+    if context.model_hf_file:
+        server_args.extend(['--hf-file', context.model_hf_file])
     if context.n_batch:
         server_args.extend(['--batch-size', context.n_batch])
     if context.n_ubatch:
-- 
cgit v1.2.3