diff options
author | Johannes Gäßler <johannesg@5d6.de> | 2024-04-24 11:08:36 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-04-24 11:08:36 +0200 |
commit | 28103f4832e301a9c84d44ff0df9d75d46ab6c76 (patch) | |
tree | 8ba391e3a7e0ce9a20d4b41782ef133bd7e32738 /llama.h | |
parent | c0d1b3e03e27634ac2871761f5033cf9324d472d (diff) |
Server: fix seed for multiple slots (#6835)
* Server: add tests for consistent results
* sampling: separate rng per sampling context
Diffstat (limited to 'llama.h')
-rw-r--r-- | llama.h | 9 |
1 file changed, 7 insertions, 2 deletions
@@ -987,7 +987,7 @@ extern "C" {
             struct llama_context * ctx,
           llama_token_data_array * candidates);

-    /// @details Randomly selects a token from the candidates based on their probabilities.
+    /// @details Randomly selects a token from the candidates based on their probabilities using the RNG of ctx.
     LLAMA_API llama_token llama_sample_token(
             struct llama_context * ctx,
           llama_token_data_array * candidates);
@@ -1074,8 +1074,9 @@ extern "C" {
 // Internal API to be implemented by llama.cpp and used by tests/benchmarks only
 #ifdef LLAMA_API_INTERNAL

-#include <vector>
+#include <random>
 #include <string>
+#include <vector>

 struct ggml_tensor;

@@ -1112,6 +1113,10 @@ std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
         const std::string & src,
         llama_partial_utf8   partial_start);

+// Randomly selects a token from the candidates based on their probabilities using given std::mt19937.
+// This is a temporary workaround in order to fix race conditions when sampling with multiple sequences.
+llama_token llama_sample_token_with_rng(struct llama_context * ctx, llama_token_data_array * candidates, std::mt19937 & rng);
+
 #endif // LLAMA_API_INTERNAL

 #endif // LLAMA_H