summaryrefslogtreecommitdiff
path: root/examples
diff options
context:
space:
mode:
authorGeorgi Gerganov <ggerganov@gmail.com>2024-06-04 21:23:05 +0300
committerGitHub <noreply@github.com>2024-06-04 21:23:05 +0300
commit0cd6bd3483fa66124b76a8a8ac794d9ee18c70c1 (patch)
tree063feb702c456075281e875d835f96bf98087279 /examples
parent5ca0944a153b65724d51b2f484139aa25ccb7a8b (diff)
llama : remove beam search (#7736)
Diffstat (limited to 'examples')
-rw-r--r--examples/CMakeLists.txt1
-rw-r--r--examples/beam-search/CMakeLists.txt5
-rw-r--r--examples/beam-search/beam-search.cpp188
3 files changed, 0 insertions, 194 deletions
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index b40ee4cc..53002f8e 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -15,7 +15,6 @@ else()
add_subdirectory(baby-llama)
add_subdirectory(batched)
add_subdirectory(batched-bench)
- add_subdirectory(beam-search)
add_subdirectory(benchmark)
add_subdirectory(convert-llama2c-to-ggml)
add_subdirectory(embedding)
diff --git a/examples/beam-search/CMakeLists.txt b/examples/beam-search/CMakeLists.txt
deleted file mode 100644
index f0e37468..00000000
--- a/examples/beam-search/CMakeLists.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-set(TARGET beam-search)
-add_executable(${TARGET} beam-search.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/beam-search/beam-search.cpp b/examples/beam-search/beam-search.cpp
deleted file mode 100644
index 3d34378a..00000000
--- a/examples/beam-search/beam-search.cpp
+++ /dev/null
@@ -1,188 +0,0 @@
-#include "common.h"
-#include "llama.h"
-
-#include <cassert>
-#include <cinttypes>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <ctime>
-#include <fstream>
-#include <iostream>
-#include <string>
-#include <vector>
-
-#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
-#include <signal.h>
-#include <unistd.h>
-#elif defined (_WIN32)
-#define WIN32_LEAN_AND_MEAN
-#ifndef NOMINMAX
-# define NOMINMAX
-#endif
-#include <windows.h>
-#include <signal.h>
-#endif
-
-// Used for debugging to print out beam tokens.
-struct ostream_beam_view {
- llama_context * ctx;
- llama_beam_view beam_view;
-};
-
-static std::ostream & operator<<(std::ostream & os, const ostream_beam_view & obv) {
- os << "p(" << obv.beam_view.p << ") eob(" << std::boolalpha << obv.beam_view.eob << ") tokens(";
- for (size_t i = 0 ; i < obv.beam_view.n_tokens ; ++i) {
- os << llama_token_to_piece(obv.ctx, obv.beam_view.tokens[i]);
- }
- return os << ')';
-}
-
-// Put here anything you want back in beam_search_callback().
-struct beam_search_callback_data {
- llama_context * ctx;
- std::vector<llama_token> response;
-};
-
-// In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same.
-// For example, eob can be flagged due to maximum token length, stop words, etc.
-static bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, size_t n_tokens) {
- return n_tokens && llama_token_is_eog(llama_get_model(callback_data.ctx), tokens[n_tokens-1]);
-}
-
-// Function matching type llama_beam_search_callback_fn_t.
-// Custom callback example is called each time the beams lengths increase:
-// * Show progress by printing ',' following by number of convergent beam tokens if any.
-// * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
-// This is also called when the stop condition is met.
-// Collect tokens into std::vector<llama_token> response which is pointed to by callback_data.
-static void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_state) {
- auto& callback_data = *static_cast<beam_search_callback_data*>(callback_data_ptr);
- // Mark beams as EOS as needed.
- for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
- llama_beam_view& beam_view = beams_state.beam_views[i];
- if (!beam_view.eob && is_at_eob(callback_data, beam_view.tokens, beam_view.n_tokens)) {
- beam_view.eob = true;
- }
- }
- printf(","); // Show progress
- if (const size_t n = beams_state.common_prefix_length) {
- callback_data.response.resize(callback_data.response.size() + n);
- assert(0u < beams_state.n_beams);
- const llama_token * tokens = beams_state.beam_views[0].tokens;
- std::copy(tokens, tokens + n, callback_data.response.end() - n);
- printf("%zu", n);
- }
- fflush(stdout);
-#if 1 // DEBUG: print current beams for this iteration
- std::cout << "\n\nCurrent beams (last_call=" << beams_state.last_call << "):\n";
- for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
- std::cout << "beams["<<i<<"]: " << ostream_beam_view{callback_data.ctx,beams_state.beam_views[i]} << std::endl;
- }
-#endif
-}
-
-int main(int argc, char ** argv)
-{
- gpt_params params;
- //params.n_gpu_layers = 200;
-
- //---------------------------------
- // Print help :
- //---------------------------------
-
- if ( argc < 2 || argv[1][0] == '-' )
- {
- printf( "Usage: %s MODEL_PATH [BEAM_WIDTH=2] [PROMPT]\n" , argv[0] );
- return 1 ;
- }
-
- //---------------------------------
- // Load parameters :
- //---------------------------------
-
- params.model = argv[1];
-
- params.n_beams = 2 < argc ? std::stoi(argv[2]) : 2;
-
- if ( argc > 3 )
- {
- params.prompt = argv[3];
- }
-
- if ( params.prompt.empty() )
- {
- params.prompt = "### Request:\nHow many countries are there?\n\n### Response:\n";
- }
-
- //---------------------------------
- // Init LLM :
- //---------------------------------
-
- llama_backend_init();
- llama_numa_init(params.numa);
-
- llama_model * model;
- llama_context * ctx;
-
- std::tie(model, ctx) = llama_init_from_gpt_params( params );
-
- if ( model == NULL )
- {
- fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
- return 1;
- }
-
- //---------------------------------
- // Tokenize the prompt :
- //---------------------------------
-
- std::vector<llama_token> tokens_list = llama_tokenize(ctx, params.prompt, true);
-
- const size_t max_context_size = llama_n_ctx( ctx );
- const size_t max_tokens_list_size = max_context_size - 4 ;
-
- if (tokens_list.size() > max_tokens_list_size)
- {
- fprintf( stderr , "%s: error: prompt too long (%zu tokens, max %zu)\n" ,
- __func__ , tokens_list.size() , max_tokens_list_size );
- return 1;
- }
-
- fprintf( stderr, "\n\n" );
-
- // Print the tokens from the prompt :
-
- for( auto id : tokens_list )
- {
- std::cout << llama_token_to_piece(ctx, id);
- }
- std::cout << std::flush;
-
- int n_past = 0;
-
- if (llama_decode(ctx, llama_batch_get_one(tokens_list.data(), tokens_list.size(), n_past, 0)))
- {
- fprintf(stderr, "%s : failed to eval prompt.\n" , __func__ );
- return 1;
- }
- n_past += tokens_list.size();
-
- beam_search_callback_data callback_data{ctx, {}};
- size_t const beam_width = static_cast<size_t>(params.n_beams);
- int const n_predict = 256;
- llama_beam_search(ctx, beam_search_callback, &callback_data, beam_width, n_past, n_predict);
-
- std::cout << "\n\n";
- for (llama_token const token_id : callback_data.response) {
- std::cout << llama_token_to_piece(ctx,token_id);
- }
- std::cout << std::endl;
-
- llama_free( ctx );
- llama_free_model( model );
-
- llama_backend_free();
-
- return 0;
-}