From 438c2ca83045a00ef244093d27e9ed41a8cb4ea9 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sun, 22 Oct 2023 22:53:08 +0300
Subject: server : parallel decoding and multimodal (#3677)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* implementing parallel decoding in server example

* crash fixed

* save dev progress

* refactored sampling function

* completion endpoint working

* multiple client support

* grammar + no stream completion

* cached prompt support

* chat.mjs support cached prompt + some fixes

* server ui now support multiple clients

* unused change reverted

* fixed timings per slot

* add context swap

* add changes to README.md

* llava multimodal integration

* fixed tokens probs

* add multimodal input - alfa

* refactor code + remove unused comments + improved README.md

* fix compilation errors with llvm

* notify the user from server ui that multimodality is unavialable

* some ci fixes

* fix ci make build undefined ref errors

* fix long prompt than ctx proposed in #3639

* fixed premature end due stop word

* context shift fixed

* fix llava implementation

* sync README.md changes

* readme change

* update api like OpenAI

* multimodal support enabled by default

* fix make bui;d errors

* fix multiple clients

* fix zig build

* new sampling API

* latest changes of sampling API

* server : coding-style normalization

* server : coding-style normalization (part 2)

* server : remove beam-search functionality

* server : bug fix in ingest_images

n_tokens is incremented internally by llama_batch_add

* server : use refs + use llama_batch_clear()

* server : snake case

* server : minor sync

* added thread safe pipeline

* server : bach has to be allocated for n_parallel sequences

* server : no need for atomic int - already using mutex

* server : logs + minor code style

* server : fix multibyte handle in partial response (#3706)

* fix image load + view image in chat

* make : silence stb warnings

* clip : link to ggml, not to llama

* server : fix switch fallthrough

* server : fix crash in Debug on macOS (I have no idea why this fixes it!?)

* server : refactor ctx_sampling init + n_ctx + names

* server : bug fix for prompt caching

* Do not save/load image_data to localStorage

* editorconfig : new line in index.html

* server : completion requests remember slot_id

* Update readme to document multimodal in server

* server : minor style

* Update readme to document multimodal in server

* server : hide ctx_sampling->prev behind API (#3696)

* server : apply fix from #3722

* server : fix slot reuse

* server : add comment about changing slot_state to bool

---------

Co-authored-by: FSSRepo <go778sgt@gmail.com>
Co-authored-by: Damian Stewart <d@damianstewart.com>
Co-authored-by: Steward Garcia <57494570+FSSRepo@users.noreply.github.com>
Co-authored-by: Jhen-Jie Hong <iainst0409@gmail.com>
Co-authored-by: M. Yusuf Sarıgöz <yusufsarigoz@gmail.com>
---
 examples/server/api_like_OAI.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'examples/server/api_like_OAI.py')

diff --git a/examples/server/api_like_OAI.py b/examples/server/api_like_OAI.py
index 14d2dcf6..313e1a96 100755
--- a/examples/server/api_like_OAI.py
+++ b/examples/server/api_like_OAI.py
@@ -8,6 +8,7 @@ import json
 
 
 app = Flask(__name__)
+slot_id = -1
 
 parser = argparse.ArgumentParser(description="An example of using server.cpp with a similar API to OAI. It must be used together with server.cpp.")
 parser.add_argument("--chat-prompt", type=str, help="the top prompt in chat completions(default: 'A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.\\n')", default='A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.\\n')
@@ -77,7 +78,8 @@ def make_postData(body, chat=False, stream=False):
     if(is_present(body, "stop")): postData["stop"] += body["stop"]
     postData["n_keep"] = -1
     postData["stream"] = stream
-
+    postData["cache_prompt"] = True
+    postData["slot_id"] = slot_id
     return postData
 
 def make_resData(data, chat=False, promptToken=[]):
@@ -128,6 +130,7 @@ def make_resData_stream(data, chat=False, time_now = 0, start=False):
             }
         ]
     }
+    slot_id = data["slot_id"]
     if (chat):
         if (start):
             resData["choices"][0]["delta"] =  {
-- 
cgit v1.2.3