summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGeorgi Gerganov <ggerganov@gmail.com>2024-06-04 21:23:05 +0300
committerGitHub <noreply@github.com>2024-06-04 21:23:05 +0300
commit0cd6bd3483fa66124b76a8a8ac794d9ee18c70c1 (patch)
tree063feb702c456075281e875d835f96bf98087279
parent5ca0944a153b65724d51b2f484139aa25ccb7a8b (diff)
llama : remove beam search (#7736)
-rw-r--r--Makefile6
-rw-r--r--examples/CMakeLists.txt1
-rw-r--r--examples/beam-search/CMakeLists.txt5
-rw-r--r--examples/beam-search/beam-search.cpp188
-rw-r--r--llama.cpp254
-rw-r--r--llama.h42
6 files changed, 2 insertions, 494 deletions
diff --git a/Makefile b/Makefile
index b527f6f3..27eb6987 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
# Define the default target now so that it is always the first target
BUILD_TARGETS = \
main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
- simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama beam-search \
+ simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \
retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
# Binaries only useful for tests
@@ -914,10 +914,6 @@ baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) tra
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-beam-search: examples/beam-search/beam-search.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
- $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
- $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
finetune: examples/finetune/finetune.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index b40ee4cc..53002f8e 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -15,7 +15,6 @@ else()
add_subdirectory(baby-llama)
add_subdirectory(batched)
add_subdirectory(batched-bench)
- add_subdirectory(beam-search)
add_subdirectory(benchmark)
add_subdirectory(convert-llama2c-to-ggml)
add_subdirectory(embedding)
diff --git a/examples/beam-search/CMakeLists.txt b/examples/beam-search/CMakeLists.txt
deleted file mode 100644
index f0e37468..00000000
--- a/examples/beam-search/CMakeLists.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-set(TARGET beam-search)
-add_executable(${TARGET} beam-search.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/beam-search/beam-search.cpp b/examples/beam-search/beam-search.cpp
deleted file mode 100644
index 3d34378a..00000000
--- a/examples/beam-search/beam-search.cpp
+++ /dev/null
@@ -1,188 +0,0 @@
-#include "common.h"
-#include "llama.h"
-
-#include <cassert>
-#include <cinttypes>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <ctime>
-#include <fstream>
-#include <iostream>
-#include <string>
-#include <vector>
-
-#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
-#include <signal.h>
-#include <unistd.h>
-#elif defined (_WIN32)
-#define WIN32_LEAN_AND_MEAN
-#ifndef NOMINMAX
-# define NOMINMAX
-#endif
-#include <windows.h>
-#include <signal.h>
-#endif
-
-// Used for debugging to print out beam tokens.
-struct ostream_beam_view {
- llama_context * ctx;
- llama_beam_view beam_view;
-};
-
-static std::ostream & operator<<(std::ostream & os, const ostream_beam_view & obv) {
- os << "p(" << obv.beam_view.p << ") eob(" << std::boolalpha << obv.beam_view.eob << ") tokens(";
- for (size_t i = 0 ; i < obv.beam_view.n_tokens ; ++i) {
- os << llama_token_to_piece(obv.ctx, obv.beam_view.tokens[i]);
- }
- return os << ')';
-}
-
-// Put here anything you want back in beam_search_callback().
-struct beam_search_callback_data {
- llama_context * ctx;
- std::vector<llama_token> response;
-};
-
-// In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same.
-// For example, eob can be flagged due to maximum token length, stop words, etc.
-static bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, size_t n_tokens) {
- return n_tokens && llama_token_is_eog(llama_get_model(callback_data.ctx), tokens[n_tokens-1]);
-}
-
-// Function matching type llama_beam_search_callback_fn_t.
-// Custom callback example is called each time the beams lengths increase:
-// * Show progress by printing ',' following by number of convergent beam tokens if any.
-// * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
-// This is also called when the stop condition is met.
-// Collect tokens into std::vector<llama_token> response which is pointed to by callback_data.
-static void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_state) {
- auto& callback_data = *static_cast<beam_search_callback_data*>(callback_data_ptr);
- // Mark beams as EOS as needed.
- for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
- llama_beam_view& beam_view = beams_state.beam_views[i];
- if (!beam_view.eob && is_at_eob(callback_data, beam_view.tokens, beam_view.n_tokens)) {
- beam_view.eob = true;
- }
- }
- printf(","); // Show progress
- if (const size_t n = beams_state.common_prefix_length) {
- callback_data.response.resize(callback_data.response.size() + n);
- assert(0u < beams_state.n_beams);
- const llama_token * tokens = beams_state.beam_views[0].tokens;
- std::copy(tokens, tokens + n, callback_data.response.end() - n);
- printf("%zu", n);
- }
- fflush(stdout);
-#if 1 // DEBUG: print current beams for this iteration
- std::cout << "\n\nCurrent beams (last_call=" << beams_state.last_call << "):\n";
- for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
- std::cout << "beams["<<i<<"]: " << ostream_beam_view{callback_data.ctx,beams_state.beam_views[i]} << std::endl;
- }
-#endif
-}
-
-int main(int argc, char ** argv)
-{
- gpt_params params;
- //params.n_gpu_layers = 200;
-
- //---------------------------------
- // Print help :
- //---------------------------------
-
- if ( argc < 2 || argv[1][0] == '-' )
- {
- printf( "Usage: %s MODEL_PATH [BEAM_WIDTH=2] [PROMPT]\n" , argv[0] );
- return 1 ;
- }
-
- //---------------------------------
- // Load parameters :
- //---------------------------------
-
- params.model = argv[1];
-
- params.n_beams = 2 < argc ? std::stoi(argv[2]) : 2;
-
- if ( argc > 3 )
- {
- params.prompt = argv[3];
- }
-
- if ( params.prompt.empty() )
- {
- params.prompt = "### Request:\nHow many countries are there?\n\n### Response:\n";
- }
-
- //---------------------------------
- // Init LLM :
- //---------------------------------
-
- llama_backend_init();
- llama_numa_init(params.numa);
-
- llama_model * model;
- llama_context * ctx;
-
- std::tie(model, ctx) = llama_init_from_gpt_params( params );
-
- if ( model == NULL )
- {
- fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
- return 1;
- }
-
- //---------------------------------
- // Tokenize the prompt :
- //---------------------------------
-
- std::vector<llama_token> tokens_list = llama_tokenize(ctx, params.prompt, true);
-
- const size_t max_context_size = llama_n_ctx( ctx );
- const size_t max_tokens_list_size = max_context_size - 4 ;
-
- if (tokens_list.size() > max_tokens_list_size)
- {
- fprintf( stderr , "%s: error: prompt too long (%zu tokens, max %zu)\n" ,
- __func__ , tokens_list.size() , max_tokens_list_size );
- return 1;
- }
-
- fprintf( stderr, "\n\n" );
-
- // Print the tokens from the prompt :
-
- for( auto id : tokens_list )
- {
- std::cout << llama_token_to_piece(ctx, id);
- }
- std::cout << std::flush;
-
- int n_past = 0;
-
- if (llama_decode(ctx, llama_batch_get_one(tokens_list.data(), tokens_list.size(), n_past, 0)))
- {
- fprintf(stderr, "%s : failed to eval prompt.\n" , __func__ );
- return 1;
- }
- n_past += tokens_list.size();
-
- beam_search_callback_data callback_data{ctx, {}};
- size_t const beam_width = static_cast<size_t>(params.n_beams);
- int const n_predict = 256;
- llama_beam_search(ctx, beam_search_callback, &callback_data, beam_width, n_past, n_predict);
-
- std::cout << "\n\n";
- for (llama_token const token_id : callback_data.response) {
- std::cout << llama_token_to_piece(ctx,token_id);
- }
- std::cout << std::endl;
-
- llama_free( ctx );
- llama_free_model( model );
-
- llama_backend_free();
-
- return 0;
-}
diff --git a/llama.cpp b/llama.cpp
index a3e94487..92c33f53 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -14712,260 +14712,6 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
}
//
-// Beam search
-//
-
-struct llama_beam {
- std::vector<llama_token> tokens;
- float p; // Cumulative beam probability (renormalized relative to all beams)
- bool eob; // Initialize end-of-beam to false. Callback sets this to true.
- // Sort beams by probability. In case of ties, prefer beams at eob.
- bool operator<(const llama_beam & rhs) const {
- return std::make_pair(p, eob) < std::make_pair(rhs.p, rhs.eob);
- }
- // Shift off first n tokens and discard them.
- void shift_tokens(const size_t n) {
- if (n) {
- std::copy(tokens.begin() + n, tokens.end(), tokens.begin());
- tokens.resize(tokens.size() - n);
- }
- }
- llama_beam_view view() const { return {tokens.data(), tokens.size(), p, eob}; }
-};
-
-// A struct for calculating logit-related info.
-struct llama_logit_info {
- const float * const logits;
- const int n_vocab;
- const float max_l;
- const float normalizer;
- struct sum_exp {
- float max_l;
- float operator()(float sum, float l) const { return sum + std::exp(l - max_l); }
- };
- llama_logit_info(llama_context * ctx)
- : logits(llama_get_logits(ctx))
- , n_vocab(llama_n_vocab(llama_get_model(ctx)))
- , max_l(*std::max_element(logits, logits + n_vocab))
- , normalizer(1.0f / std::accumulate(logits, logits + n_vocab, 0.0f, sum_exp{max_l}))
- { }
- llama_token_data get_token_data(const llama_token token_id) const {
- constexpr auto p = std::numeric_limits<float>::quiet_NaN(); // never used
- return {token_id, logits[token_id], p};
- }
- // Return top k token_data by logit.
- std::vector<llama_token_data> top_k(size_t k) {
- std::vector<llama_token_data> min_heap; // min-heap by logit
- const llama_token k_min = std::min(static_cast<llama_token>(k), n_vocab);
- min_heap.reserve(k_min);
- for (llama_token token_id = 0 ; token_id < k_min ; ++token_id) {
- min_heap.push_back(get_token_data(token_id));
- }
- auto comp = [](const llama_token_data & a, const llama_token_data & b) { return a.logit > b.logit; };
- std::make_heap(min_heap.begin(), min_heap.end(), comp);
- for (llama_token token_id = k_min ; token_id < n_vocab ; ++token_id) {
- if (min_heap.front().logit < logits[token_id]) {
- std::pop_heap(min_heap.begin(), min_heap.end(), comp);
- min_heap.back().id = token_id;
- min_heap.back().logit = logits[token_id];
- std::push_heap(min_heap.begin(), min_heap.end(), comp);
- }
- }
- return min_heap;
- }
- float probability_from_logit(float logit) const {
- return normalizer * std::exp(logit - max_l);
- }
-};
-
-struct llama_beam_search_data {
- llama_context * ctx;
- size_t n_beams;
- int n_past;
- int n_predict;
- std::vector<llama_beam> beams;
- std::vector<llama_beam> next_beams;
-
- // Re-calculated on each loop iteration
- size_t common_prefix_length;
-
- // Used to communicate to/from callback on beams state.
- std::vector<llama_beam_view> beam_views;
-
- llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict)
- : ctx(ctx)
- , n_beams(n_beams)
- , n_past(n_past)
- , n_predict(n_predict)
- , beam_views(n_beams) {
- beams.reserve(n_beams);
- next_beams.reserve(n_beams);
- }
-
- // Collapse beams to a single beam given by index.
- void collapse_beams(const size_t beam_idx) {
- if (0u < beam_idx) {
- std::swap(beams[0], beams[beam_idx]);
- }
- beams.resize(1);
- }
-
- // Min-heaps are used to efficiently collect the top-k elements (k=n_beams).
- // The repetitive patterns below reflect the 2 stages of heaps:
- // * Gather elements until the vector is full, then call std::make_heap() on it.
- // * If the heap is full and a new element is found that should be included, pop the
- // least element to the back(), replace it with the new, then push it into the heap.
- void fill_next_beams_by_top_probabilities(llama_beam & beam) {
- // Min-heaps use a greater-than comparator.
- const auto comp = [](const llama_beam & a, const llama_beam & b) { return a.p > b.p; };
- if (beam.eob) {
- // beam is at end-of-sentence, so just copy it to next_beams if its probability is high enough.
- if (next_beams.size() < n_beams) {
- next_beams.push_back(std::move(beam));
- if (next_beams.size() == n_beams) {
- std::make_heap(next_beams.begin(), next_beams.end(), comp);
- }
- } else if (next_beams.front().p < beam.p) {
- std::pop_heap(next_beams.begin(), next_beams.end(), comp);
- next_beams.back() = std::move(beam);
- std::push_heap(next_beams.begin(), next_beams.end(), comp);
- }
- } else {
- // beam is not at end-of-sentence, so branch with next top_k tokens.
- if (!beam.tokens.empty()) {
- llama_decode(ctx, llama_batch_get_one(beam.tokens.data(), beam.tokens.size(), n_past, 0));
- }
- llama_logit_info logit_info(ctx);
- std::vector<llama_token_data> next_tokens = logit_info.top_k(n_beams);
-
- // Clear the kv slot so that other beams may try different tokens at this position. The llama_decode()
- // call in loop() will conclusively fill in the kv slot once the beams converge at this position.
- llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
-
- size_t i=0;
- if (next_beams.size() < n_beams) {
- for (; next_beams.size() < n_beams ; ++i) {
- llama_beam next_beam = beam;
- next_beam.tokens.push_back(next_tokens[i].id);
- next_beam.p *= logit_info.probability_from_logit(next_tokens[i].logit);
- next_beams.push_back(std::move(next_beam));
- }
- std::make_heap(next_beams.begin(), next_beams.end(), comp);
- } else {
- for (; next_beams.front().p == 0.0f ; ++i) {
- std::pop_heap(next_beams.begin(), next_beams.end(), comp);
- next_beams.back() = beam;
- next_beams.back().tokens.push_back(next_tokens[i].id);
- next_beams.back().p *= logit_info.probability_from_logit(next_tokens[i].logit);
- std::push_heap(next_beams.begin(), next_beams.end(), comp);
- }
- }
- for (; i < n_beams ; ++i) {
- const float next_p = beam.p * logit_info.probability_from_logit(next_tokens[i].logit);
- if (next_beams.front().p < next_p) {
- std::pop_heap(next_beams.begin(), next_beams.end(), comp);
- next_beams.back() = beam;
- next_beams.back().tokens.push_back(next_tokens[i].id);
- next_beams.back().p = next_p;
- std::push_heap(next_beams.begin(), next_beams.end(), comp);
- }
- }
- }
- }
-
- // Find common_prefix_length based on beams.
- // Requires beams is not empty.
- size_t find_common_prefix_length() {
- size_t common_prefix_length = beams[0].tokens.size();
- for (size_t i = 1 ; i < beams.size() ; ++i) {
- common_prefix_length = std::min(common_prefix_length, beams[i].tokens.size());
- for (size_t j = 0 ; j < common_prefix_length ; ++j) {
- if (beams[0].tokens[j] != beams[i].tokens[j]) {
- common_prefix_length = j;
- break;
- }
- }
- }
- return common_prefix_length;
- }
-
- // Construct beams_state to send back to caller via the callback function.
- // Side effect: set common_prefix_length = find_common_prefix_length();
- llama_beams_state get_beams_state(const bool last_call) {
- for (size_t i = 0 ; i < beams.size() ; ++i) {
- beam_views[i] = beams[i].view();
- }
- common_prefix_length = find_common_prefix_length();
- return {beam_views.data(), beams.size(), common_prefix_length, last_call};
- }
-
- // Loop:
- // * while i < n_predict, AND
- // * any of the beams have not yet reached end-of-beam (eob), AND
- // * the highest probability beam(s) (plural in case of ties) are not at end-of-sentence
- // (since all other beam probabilities can only decrease)
- void loop(const llama_beam_search_callback_fn_t callback, void * const callback_data) {
- beams.push_back({{}, 1.0f, false}); // Start with one empty beam w/ probability = 1.0 and !eob.
- const auto not_eob = [](const llama_beam & beam) { return !beam.eob; };
- for (int i = 0 ; i < n_predict && std::any_of(beams.begin(),beams.end(),not_eob) &&
- !beams[top_beam_index()].eob ; ++i) {
- callback(callback_data, get_beams_state(false)); // Sets common_prefix_length
- update_beams_from_beam_views(); // Update values (p,eob) that callback may have changed.
- if (common_prefix_length) {
- llama_decode(ctx, llama_batch_get_one(beams[0].tokens.data(), common_prefix_length, n_past, 0));
- n_past += common_prefix_length;
- }
- // Zero-out next_beam probabilities to place them last in following min-heap.
- std::for_each(next_beams.begin(), next_beams.end(), [](llama_beam & beam) { beam.p = 0.0f; });
- for (llama_beam & beam : beams) {
- beam.shift_tokens(common_prefix_length);
- fill_next_beams_by_top_probabilities(beam);
- }
- // next_beams become the beams of next/final iteration. Swap them to re-use memory.
- beams.swap(next_beams);
- renormalize_beam_probabilities(beams);
- }
- collapse_beams(top_beam_index());
- callback(callback_data, get_beams_state(true));
- }
-
- // As beams grow, the cumulative probabilities decrease.
- // Renormalize them to avoid floating point underflow.
- static void renormalize_beam_probabilities(std::vector<llama_beam> & beams) {
- const auto sum_p = [](float sum, llama_beam & beam) { return sum + beam.p; };
- const float inv_sum = 1.0f / std::accumulate(beams.begin(), beams.end(), 0.0f, sum_p);
- std::for_each(beams.begin(), beams.end(), [=](llama_beam & beam) { beam.p *= inv_sum; });
- }
-
- // Assumes beams is non-empty. Uses llama_beam::operator<() for ordering.
- size_t top_beam_index() {
- return std::max_element(beams.begin(), beams.end()) - beams.begin();
- }
-
- // Copy (p,eob) for each beam which may have been changed by the callback.
- void update_beams_from_beam_views() {
- for (size_t i = 0 ; i < beams.size() ; ++i) {
- beams[i].p = beam_views[i].p;
- beams[i].eob = beam_views[i].eob;
- }
- }
-};
-
-void llama_beam_search(llama_context * ctx,
- llama_beam_search_callback_fn_t callback, void * callback_data,
- size_t n_beams, int n_past, int n_predict) {
- assert(ctx);
- const int64_t t_start_sample_us = ggml_time_us();
-
- llama_beam_search_data beam_search_data(ctx, n_beams, n_past, n_predict);
-
- beam_search_data.loop(callback, callback_data);
-
- ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
- ctx->n_sample++;
-}
-
-//
// quantization
//
diff --git a/llama.h b/llama.h
index a78ccdaf..b2a302da 100644
--- a/llama.h
+++ b/llama.h
@@ -1056,49 +1056,9 @@ extern "C" {
llama_token token);
//
- // Beam search
+ // Model split
//
- struct llama_beam_view {
- const llama_token * tokens;
-
- size_t n_tokens;
- float p; // Cumulative beam probability (renormalized relative to all beams)
- bool eob; // Callback should set this to true when a beam is at end-of-beam.
- };
-
- // Passed to beam_search_callback function.
- // Whenever 0 < common_prefix_length, this number of tokens should be copied from any of the beams
- // (e.g. beams[0]) as they will be removed (shifted) from all beams in all subsequent callbacks.
- // These pointers are valid only during the synchronous callback, so should not be saved.
- struct llama_beams_state {
- struct llama_beam_view * beam_views;
-
- size_t n_beams; // Number of elements in beam_views[].
- size_t common_prefix_length; // Current max length of prefix tokens shared by all beams.
- bool last_call; // True iff this is the last callback invocation.
- };
-
- // Type of pointer to the beam_search_callback function.
- // void* callback_data is any custom data passed to llama_beam_search, that is subsequently
- // passed back to beam_search_callback. This avoids having to use global variables in the callback.
- typedef void (*llama_beam_search_callback_fn_t)(void * callback_data, struct llama_beams_state);
-
- /// @details Deterministically returns entire sentence constructed by a beam search.
- /// @param ctx Pointer to the llama_context.
- /// @param callback Invoked for each iteration of the beam_search loop, passing in beams_state.
- /// @param callback_data A pointer that is simply passed back to callback.
- /// @param n_beams Number of beams to use.
- /// @param n_past Number of tokens already evaluated.
- /// @param n_predict Maximum number of tokens to predict. EOS may occur earlier.
- LLAMA_API void llama_beam_search(
- struct llama_context * ctx,
- llama_beam_search_callback_fn_t callback,
- void * callback_data,
- size_t n_beams,
- int32_t n_past,
- int32_t n_predict);
-
/// @details Build a split GGUF final path for this chunk.
/// llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf"
// Returns the split_path length.