diff options
author | Georgi Gerganov <ggerganov@gmail.com> | 2024-06-04 21:23:05 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-06-04 21:23:05 +0300 |
commit | 0cd6bd3483fa66124b76a8a8ac794d9ee18c70c1 (patch) | |
tree | 063feb702c456075281e875d835f96bf98087279 /examples | |
parent | 5ca0944a153b65724d51b2f484139aa25ccb7a8b (diff) |
llama : remove beam search (#7736)
Diffstat (limited to 'examples')
-rw-r--r-- | examples/CMakeLists.txt | 1 | ||||
-rw-r--r-- | examples/beam-search/CMakeLists.txt | 5 | ||||
-rw-r--r-- | examples/beam-search/beam-search.cpp | 188 |
3 files changed, 0 insertions, 194 deletions
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index b40ee4cc..53002f8e 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -15,7 +15,6 @@ else() add_subdirectory(baby-llama) add_subdirectory(batched) add_subdirectory(batched-bench) - add_subdirectory(beam-search) add_subdirectory(benchmark) add_subdirectory(convert-llama2c-to-ggml) add_subdirectory(embedding) diff --git a/examples/beam-search/CMakeLists.txt b/examples/beam-search/CMakeLists.txt deleted file mode 100644 index f0e37468..00000000 --- a/examples/beam-search/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -set(TARGET beam-search) -add_executable(${TARGET} beam-search.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/beam-search/beam-search.cpp b/examples/beam-search/beam-search.cpp deleted file mode 100644 index 3d34378a..00000000 --- a/examples/beam-search/beam-search.cpp +++ /dev/null @@ -1,188 +0,0 @@ -#include "common.h" -#include "llama.h" - -#include <cassert> -#include <cinttypes> -#include <cmath> -#include <cstdio> -#include <cstring> -#include <ctime> -#include <fstream> -#include <iostream> -#include <string> -#include <vector> - -#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) -#include <signal.h> -#include <unistd.h> -#elif defined (_WIN32) -#define WIN32_LEAN_AND_MEAN -#ifndef NOMINMAX -# define NOMINMAX -#endif -#include <windows.h> -#include <signal.h> -#endif - -// Used for debugging to print out beam tokens. -struct ostream_beam_view { - llama_context * ctx; - llama_beam_view beam_view; -}; - -static std::ostream & operator<<(std::ostream & os, const ostream_beam_view & obv) { - os << "p(" << obv.beam_view.p << ") eob(" << std::boolalpha << obv.beam_view.eob << ") tokens("; - for (size_t i = 0 ; i < obv.beam_view.n_tokens ; ++i) { - os << llama_token_to_piece(obv.ctx, obv.beam_view.tokens[i]); - } - return os << ')'; -} - -// Put here anything you want back in beam_search_callback(). -struct beam_search_callback_data { - llama_context * ctx; - std::vector<llama_token> response; -}; - -// In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same. -// For example, eob can be flagged due to maximum token length, stop words, etc. -static bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, size_t n_tokens) { - return n_tokens && llama_token_is_eog(llama_get_model(callback_data.ctx), tokens[n_tokens-1]); -} - -// Function matching type llama_beam_search_callback_fn_t. -// Custom callback example is called each time the beams lengths increase: -// * Show progress by printing ',' following by number of convergent beam tokens if any. -// * When all beams converge to a common prefix, they are made available in beams_state.beams[0]. -// This is also called when the stop condition is met. -// Collect tokens into std::vector<llama_token> response which is pointed to by callback_data. -static void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_state) { - auto& callback_data = *static_cast<beam_search_callback_data*>(callback_data_ptr); - // Mark beams as EOS as needed. - for (size_t i = 0 ; i < beams_state.n_beams ; ++i) { - llama_beam_view& beam_view = beams_state.beam_views[i]; - if (!beam_view.eob && is_at_eob(callback_data, beam_view.tokens, beam_view.n_tokens)) { - beam_view.eob = true; - } - } - printf(","); // Show progress - if (const size_t n = beams_state.common_prefix_length) { - callback_data.response.resize(callback_data.response.size() + n); - assert(0u < beams_state.n_beams); - const llama_token * tokens = beams_state.beam_views[0].tokens; - std::copy(tokens, tokens + n, callback_data.response.end() - n); - printf("%zu", n); - } - fflush(stdout); -#if 1 // DEBUG: print current beams for this iteration - std::cout << "\n\nCurrent beams (last_call=" << beams_state.last_call << "):\n"; - for (size_t i = 0 ; i < beams_state.n_beams ; ++i) { - std::cout << "beams["<<i<<"]: " << ostream_beam_view{callback_data.ctx,beams_state.beam_views[i]} << std::endl; - } -#endif -} - -int main(int argc, char ** argv) -{ - gpt_params params; - //params.n_gpu_layers = 200; - - //--------------------------------- - // Print help : - //--------------------------------- - - if ( argc < 2 || argv[1][0] == '-' ) - { - printf( "Usage: %s MODEL_PATH [BEAM_WIDTH=2] [PROMPT]\n" , argv[0] ); - return 1 ; - } - - //--------------------------------- - // Load parameters : - //--------------------------------- - - params.model = argv[1]; - - params.n_beams = 2 < argc ? std::stoi(argv[2]) : 2; - - if ( argc > 3 ) - { - params.prompt = argv[3]; - } - - if ( params.prompt.empty() ) - { - params.prompt = "### Request:\nHow many countries are there?\n\n### Response:\n"; - } - - //--------------------------------- - // Init LLM : - //--------------------------------- - - llama_backend_init(); - llama_numa_init(params.numa); - - llama_model * model; - llama_context * ctx; - - std::tie(model, ctx) = llama_init_from_gpt_params( params ); - - if ( model == NULL ) - { - fprintf( stderr , "%s: error: unable to load model\n" , __func__ ); - return 1; - } - - //--------------------------------- - // Tokenize the prompt : - //--------------------------------- - - std::vector<llama_token> tokens_list = llama_tokenize(ctx, params.prompt, true); - - const size_t max_context_size = llama_n_ctx( ctx ); - const size_t max_tokens_list_size = max_context_size - 4 ; - - if (tokens_list.size() > max_tokens_list_size) - { - fprintf( stderr , "%s: error: prompt too long (%zu tokens, max %zu)\n" , - __func__ , tokens_list.size() , max_tokens_list_size ); - return 1; - } - - fprintf( stderr, "\n\n" ); - - // Print the tokens from the prompt : - - for( auto id : tokens_list ) - { - std::cout << llama_token_to_piece(ctx, id); - } - std::cout << std::flush; - - int n_past = 0; - - if (llama_decode(ctx, llama_batch_get_one(tokens_list.data(), tokens_list.size(), n_past, 0))) - { - fprintf(stderr, "%s : failed to eval prompt.\n" , __func__ ); - return 1; - } - n_past += tokens_list.size(); - - beam_search_callback_data callback_data{ctx, {}}; - size_t const beam_width = static_cast<size_t>(params.n_beams); - int const n_predict = 256; - llama_beam_search(ctx, beam_search_callback, &callback_data, beam_width, n_past, n_predict); - - std::cout << "\n\n"; - for (llama_token const token_id : callback_data.response) { - std::cout << llama_token_to_piece(ctx,token_id); - } - std::cout << std::endl; - - llama_free( ctx ); - llama_free_model( model ); - - llama_backend_free(); - - return 0; -} |