From f5a77a629bd0f37ae1696747633ab42a5530ec15 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Wed, 22 Mar 2023 07:32:36 +0200
Subject: Introduce C-style API (#370)

* Major refactoring - introduce C-style API

* Clean up

* Add

* Add

* Add ....

* Fix timing reporting and accumulation

* Measure eval time only for single-token calls

* Change llama_tokenize return meaning
---
 utils.h | 61 +++----------------------------------------------------------
 1 file changed, 3 insertions(+), 58 deletions(-)

(limited to 'utils.h')

diff --git a/utils.h b/utils.h
index 31290385..3f970eab 100644
--- a/utils.h
+++ b/utils.h
@@ -2,8 +2,9 @@
 
 #pragma once
 
+#include "llama.h"
+
 #include <string>
-#include <unordered_map>
 #include <vector>
 #include <random>
 #include <thread>
@@ -49,64 +50,8 @@ void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
 
 std::string gpt_random_prompt(std::mt19937 & rng);
 
-//
-// Model file parsing
-//
-
-#define FILE_MAGIC_UNVERSIONED 0x67676d6c // pre-versioned files
-#define FILE_MAGIC 0x67676d66 // 'ggmf' in hex
-#define FILE_VERSION 1
-
 //
 // Vocab utils
 //
 
-struct llama_vocab {
-    using id    = int32_t;
-    using token = std::string;
-
-    struct token_score {
-        token tok;
-        float score;
-    };
-
-    std::unordered_map<token, id> token_to_id;
-    std::vector<token_score> id_to_token;
-};
-
-void replace(std::string & str, const std::string & needle, const std::string & replacement);
-
-// poor-man's JSON parsing
-std::unordered_map<std::string, int32_t> json_parse(const std::string & fname);
-
-// TODO: temporary until #77 is merged, need this now for some tokenizer tests
-bool llama_vocab_load(const std::string & fname, llama_vocab & vocab);
-
-// TODO: this is probably wrong, but I cannot figure out how this tokenizer works ..
-// ref: https://github.com/google/sentencepiece
-std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, const std::string & text, bool bos);
-
-// sample next token given probabilities for each embedding
-//
-//   - consider only the top K tokens
-//   - from them, consider only the top tokens with cumulative probability > P
-//
-llama_vocab::id llama_sample_top_p_top_k(
-        const llama_vocab & vocab,
-        const float * logits,
-        std::vector<llama_vocab::id> & last_n_tokens,
-        double repeat_penalty,
-        int top_k,
-        double top_p,
-        double temp,
-        std::mt19937 & rng);
-
-// filer to top K tokens from list of logits
-void sample_top_k(std::vector<std::pair<double, llama_vocab::id>> & logits_id, int top_k);
-
-//
-// Quantization
-//
-
-size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist);
-size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist);
+std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos);
--
cgit v1.2.3
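
Usage note (not part of the patch): a minimal sketch of how the reworked utils.h helper might be called against the new C-style API. It assumes the llama_context_default_params / llama_init_from_file / llama_free entry points that llama.h is expected to declare as of this change; the model path and file name are purely illustrative.

    // sketch.cpp - hedged example, not shipped with this commit
    #include "llama.h"
    #include "utils.h"

    #include <cstdio>
    #include <vector>

    int main() {
        // Assumed C-style entry points from the new llama.h (names per this commit).
        llama_context_params lparams = llama_context_default_params();

        // Illustrative model path - substitute a real ggml model file.
        llama_context * ctx = llama_init_from_file("models/7B/ggml-model-q4_0.bin", lparams);
        if (ctx == nullptr) {
            fprintf(stderr, "failed to load model\n");
            return 1;
        }

        // The helper now tokenizes through the context instead of a standalone
        // llama_vocab, and returns the token ids directly.
        std::vector<llama_token> tokens = llama_tokenize(ctx, "Hello, world", /*add_bos=*/ true);

        for (llama_token id : tokens) {
            printf("%d\n", id);
        }

        llama_free(ctx);
        return 0;
    }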