From 074bea2eb1f1349a0118239c4152914aecaa1be4 Mon Sep 17 00:00:00 2001
From: Mack Straight
Date: Mon, 20 Mar 2023 03:17:23 -0700
Subject: sentencepiece bpe compatible tokenizer (#252)

* potential out of bounds read

* fix quantize

* style

* Update convert-pth-to-ggml.py

* mild cleanup

* don't need the space-prefixing here rn since main.cpp already does it

* new file magic + version header field

* readme notice

* missing newlines

Co-authored-by: slaren <2141330+slaren@users.noreply.github.com>
---
 utils.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'utils.h')

diff --git a/utils.h b/utils.h
index 49658f7d..b3a0f472 100644
--- a/utils.h
+++ b/utils.h
@@ -58,6 +58,7 @@ struct gpt_vocab {
 
     std::map<token, id> token_to_id;
     std::map<id, token> id_to_token;
+    std::map<id, float> score;
 };
 
 void replace(std::string & str, const std::string & needle, const std::string & replacement);
@@ -79,7 +80,7 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
 
 // TODO: this is probably wrong, but I cannot figure out how this tokenizer works ..
 // ref: https://github.com/google/sentencepiece
-std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos);
+std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, std::string_view text, bool bos);
 
 // load the tokens from encoder.json
 bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab);
--
cgit v1.2.3
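
Note: for context, below is a minimal sketch of how a per-id score table like the one added in this hunk can drive a sentencepiece-style greedy bigram merge. This is not the tokenizer from the commit's utils.cpp; the helper name sp_tokenize_sketch, the toy vocabulary, and the exact merge loop are assumptions made purely for illustration, and only the gpt_vocab layout is taken from the diff above.

// Sketch: greedy highest-score bigram merging over a gpt_vocab with scores.
// Assumes the struct layout shown in the hunk above; not the real utils.cpp code.
#include <cstdint>
#include <iostream>
#include <map>
#include <string>
#include <utility>
#include <vector>

struct gpt_vocab {
    using id    = int32_t;
    using token = std::string;

    std::map<token, id> token_to_id;
    std::map<id, token> id_to_token;
    std::map<id, float> score;
};

// Hypothetical helper: split the text into symbols, then repeatedly merge the
// adjacent pair whose concatenation exists in the vocab with the best score.
static std::vector<gpt_vocab::id> sp_tokenize_sketch(const gpt_vocab & vocab, const std::string & text) {
    // start from single-character symbols (real code would respect UTF-8 boundaries)
    std::vector<std::string> symbols;
    for (char c : text) {
        symbols.push_back(std::string(1, c));
    }

    while (symbols.size() > 1) {
        int   best_i     = -1;
        float best_score = -1e10f;

        // find the adjacent pair whose merged token has the highest score
        for (size_t i = 0; i + 1 < symbols.size(); ++i) {
            const std::string merged = symbols[i] + symbols[i + 1];
            auto it = vocab.token_to_id.find(merged);
            if (it == vocab.token_to_id.end()) {
                continue;
            }
            auto sc = vocab.score.find(it->second);
            const float s = (sc == vocab.score.end()) ? 0.0f : sc->second;
            if (s > best_score) {
                best_score = s;
                best_i     = (int) i;
            }
        }

        if (best_i < 0) {
            break; // no mergeable pair left
        }

        // apply the merge: fuse the pair into one symbol
        symbols[best_i] += symbols[best_i + 1];
        symbols.erase(symbols.begin() + best_i + 1);
    }

    // map the final symbols to ids (unknown symbols are skipped in this sketch)
    std::vector<gpt_vocab::id> output;
    for (const auto & sym : symbols) {
        auto it = vocab.token_to_id.find(sym);
        if (it != vocab.token_to_id.end()) {
            output.push_back(it->second);
        }
    }
    return output;
}

int main() {
    gpt_vocab vocab;
    // toy vocab: higher score means the merge is preferred earlier
    const std::vector<std::pair<std::string, float>> entries = {
        {"h", 0.0f}, {"e", 0.0f}, {"l", 0.0f}, {"o", 0.0f},
        {"he", -1.0f}, {"ll", -0.5f}, {"llo", -0.2f}, {"hello", -0.1f},
    };
    for (size_t i = 0; i < entries.size(); ++i) {
        const gpt_vocab::id id = (gpt_vocab::id) i;
        vocab.token_to_id[entries[i].first] = id;
        vocab.id_to_token[id] = entries[i].first;
        vocab.score[id]       = entries[i].second;
    }

    // "hello" merges as h e l l o -> h e ll o -> h e llo -> he llo -> hello
    for (gpt_vocab::id id : sp_tokenize_sketch(vocab, "hello")) {
        std::cout << vocab.id_to_token[id] << " ";
    }
    std::cout << "\n";
    return 0;
}

The actual llama_tokenize in this commit follows the sentencepiece approach more closely (a priority queue of candidate bigrams rather than a rescan each iteration), but the role of the new score map is the same: it decides which adjacent symbols get merged first.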