diff options
author | Georgi Gerganov <ggerganov@gmail.com> | 2024-04-29 16:58:41 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-04-29 16:58:41 +0300 |
commit | f4ab2a41476600a98067a9474ea8f9e6db41bcfa (patch) | |
tree | 4e840ec5b4243ed43906a576e396995e3d9dbc21 /unicode.cpp | |
parent | 3f167476b11efa7ab08f6cacdeb8cab0935c1249 (diff) |
llama : fix BPE pre-tokenization (#6920)
* merged the changes from deepseeker models to main branch
* Moved regex patterns to unicode.cpp and updated unicode.h
* Moved header files
* Resolved issues
* added and refactored unicode_regex_split and related functions
* Updated/merged the deepseek coder pr
* Refactored code
* Adding unicode regex mappings
* Adding unicode regex function
* Added needed functionality, testing remains
* Fixed issues
* Fixed issue with gpt2 regex custom preprocessor
* unicode : fix? unicode_wstring_to_utf8
* lint : fix whitespaces
* tests : add tokenizer tests for numbers
* unicode : remove redundant headers
* tests : remove and rename tokenizer test scripts
* tests : add sample usage
* gguf-py : reader prints warnings on duplicate keys
* llama : towards llama3 tokenization support (wip)
* unicode : shot in the dark to fix tests on Windows
* unicode : first try custom implementations
* convert : add "tokenizer.ggml.pre" GGUF KV (wip)
* llama : use new pre-tokenizer type
* convert : fix pre-tokenizer type writing
* lint : fix
* make : add test-tokenizer-0-llama-v3
* wip
* models : add llama v3 vocab file
* llama : adapt punctuation regex + add llama 3 regex
* minor
* unicode : set bomb
* unicode : set bomb
* unicode : always use std::wregex
* unicode : support \p{N}, \p{L} and \p{P} natively
* unicode : try fix windows
* unicode : category support via std::regex
* unicode : clean-up
* unicode : simplify
* convert : add convert-hf-to-gguf-update.py
ggml-ci
* lint : update
* convert : add falcon
ggml-ci
* unicode : normalize signatures
* lint : fix
* lint : fix
* convert : remove unused functions
* convert : add comments
* convert : exercise contractions
ggml-ci
* lint : fix
* cmake : refactor test targets
* tests : refactor vocab tests
ggml-ci
* tests : add more vocabs and tests
ggml-ci
* unicode : cleanup
* scripts : ignore new update script in check-requirements.sh
* models : add phi-3, mpt, gpt-2, starcoder
* tests : disable obsolete
ggml-ci
* tests : use faster bpe test
ggml-ci
* llama : more prominent warning for old BPE models
* tests : disable test-tokenizer-1-bpe due to slowness
ggml-ci
---------
Co-authored-by: Jaggzh <jaggz.h@gmail.com>
Co-authored-by: Kazim Abrar Mahi <kazimabrarmahi135@gmail.com>
Diffstat (limited to 'unicode.cpp')
-rw-r--r-- | unicode.cpp | 487 |
1 files changed, 448 insertions, 39 deletions
diff --git a/unicode.cpp b/unicode.cpp index df8c5f58..f2ccda05 100644 --- a/unicode.cpp +++ b/unicode.cpp @@ -5,11 +5,14 @@ #include <cstddef> #include <cstdint> #include <map> +#include <regex> #include <stdexcept> #include <string> #include <unordered_map> #include <utility> #include <vector> +#include <locale> +#include <codecvt> static std::string unicode_cpts_to_utf8(const std::vector<uint32_t> & cps) { std::string result; @@ -53,23 +56,22 @@ static uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) offset += 4; return result; } - throw std::invalid_argument("invalid string"); + throw std::invalid_argument("failed to convert utf8 to codepoint"); } -static std::vector<uint16_t> unicode_cpt_to_utf16(uint32_t cp) { - std::vector<uint16_t> result; - if (/* 0x0000 <= cp && */ cp <= 0xffff) { - result.emplace_back(cp); - } - else if (0x10000 <= cp && cp <= 0x10ffff) { - result.emplace_back(0xd800 | ((cp - 0x10000) >> 10)); - result.emplace_back(0xdc00 | ((cp - 0x10000) & 0x03ff)); - } - else { - throw std::invalid_argument("invalid cpt"); - } - return result; -} +//static std::vector<uint16_t> unicode_cpt_to_utf16(uint32_t cp) { +// std::vector<uint16_t> result; +// if (/* 0x0000 <= cp && */ cp <= 0xffff) { +// result.emplace_back(cp); +// return result; +// } +// if (0x10000 <= cp && cp <= 0x10ffff) { +// result.emplace_back(0xd800 | ((cp - 0x10000) >> 10)); +// result.emplace_back(0xdc00 | ((cp - 0x10000) & 0x03ff)); +// return result; +// } +// throw std::invalid_argument("failed to convert codepoint to utf16"); +//} //static std::vector<uint16_t> unicode_cpts_to_utf16(const std::vector<uint32_t> & cps) { // std::vector<uint16_t> result; @@ -80,28 +82,28 @@ static std::vector<uint16_t> unicode_cpt_to_utf16(uint32_t cp) { // return result; //} -static uint32_t cpt_from_utf16(const std::vector<uint16_t> & utf16, size_t & offset) { - assert(offset < utf16.size()); - if (((utf16[0] >> 10) << 10) != 0xd800) { - auto result = utf16[offset + 0]; - offset += 1; - return result; - } - - if (offset + 1 >= utf16.size() || !((utf16[1] & 0xdc00) == 0xdc00)) { - throw std::invalid_argument("invalid character"); - } - - auto result = 0x10000 + (((utf16[0] & 0x03ff) << 10) | (utf16[1] & 0x03ff)); - offset += 2; - return result; -} +//static uint32_t unicode_cpt_from_utf16(const std::vector<uint16_t> & utf16, size_t & offset) { +// assert(offset < utf16.size()); +// if (((utf16[0] >> 10) << 10) != 0xd800) { +// auto result = utf16[offset + 0]; +// offset += 1; +// return result; +// } +// +// if (offset + 1 >= utf16.size() || !((utf16[1] & 0xdc00) == 0xdc00)) { +// throw std::invalid_argument("invalid character"); +// } +// +// auto result = 0x10000 + (((utf16[0] & 0x03ff) << 10) | (utf16[1] & 0x03ff)); +// offset += 2; +// return result; +//} //static std::vector<uint32_t> unicode_cpts_from_utf16(const std::vector<uint16_t> & utf16) { // std::vector<uint32_t> result; // size_t offset = 0; // while (offset < utf16.size()) { -// result.push_back(cpt_from_utf16(utf16, offset)); +// result.push_back(unicode_cpt_from_utf16(utf16, offset)); // } // return result; //} @@ -194,34 +196,277 @@ static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() { return map; } +static inline std::wstring unicode_wstring_from_utf8(const std::string & s) { + std::wstring_convert<std::codecvt_utf8<wchar_t>> conv; + return conv.from_bytes(s); +} + +static std::vector<std::string> unicode_byte_encoding_process(const std::vector<std::string> & bpe_words) { + std::vector<std::string> bpe_encoded_words; + for (const auto & word : bpe_words) { + std::string text_utf; + auto utf_word = unicode_cpts_from_utf8(word); + for (size_t i = 0; i < utf_word.size(); ++i) { + text_utf += unicode_cpt_to_utf8(utf_word[i]); + } + + std::string encoded_token; + for (char & c : text_utf) { + encoded_token += unicode_byte_to_utf8(c); + } + bpe_encoded_words.emplace_back(encoded_token); + } + return bpe_encoded_words; +} + +// GPT2 system regex: 's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+ +static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & text, const std::vector<size_t> & offsets) { + std::vector<size_t> bpe_offsets; // store the offset of each word + bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size + + size_t start = 0; + + const auto cpts = unicode_cpts_from_utf8(text); + + for (auto offset : offsets) { + std::string token; + + bool collecting_numeric = false; + bool collecting_letter = false; + bool collecting_special = false; + bool collecting_whitespace_lookahead = false; + bool collecting = false; + + std::vector<std::string> text_utf; + text_utf.reserve(offset); + + for (size_t i = start; i < start + offset; ++i) { + text_utf.emplace_back(unicode_cpt_to_utf8(cpts[i])); + } + + for (int i = 0; i < (int)text_utf.size(); i++) { + const std::string & utf_char = text_utf[i]; + bool split_condition = false; + int bytes_remain = text_utf.size() - i; + + // forward backward lookups + const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : ""; + const std::string & utf_char_next_next = (i + 2 < (int)text_utf.size()) ? text_utf[i + 2] : ""; + + // handling contractions + if (!split_condition && bytes_remain >= 2) { + // 's|'t|'m|'d + if (utf_char == "\'" && (utf_char_next == "s" || utf_char_next == "t" || utf_char_next == "m" || utf_char_next == "d")) { + split_condition = true; + } + if (split_condition) { + if (token.size()) { + bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size()); + } + token = utf_char + utf_char_next; + bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size()); + token = ""; + i++; + continue; + } + } + if (!split_condition && bytes_remain >= 3) { + // 're|'ve|'ll + if (utf_char == "\'" && ( + (utf_char_next == "r" && utf_char_next_next == "e") || + (utf_char_next == "v" && utf_char_next_next == "e") || + (utf_char_next == "l" && utf_char_next_next == "l")) + ) { + split_condition = true; + } + if (split_condition) { + // current token + next token can be defined + if (token.size()) { + bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size()); + } + token = utf_char; + token += utf_char_next; + token += utf_char_next_next; + + bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size()); + token = ""; + i += 2; + continue; + } + } + + if (!split_condition && !collecting) { + if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || (token.empty() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER)) { + collecting_letter = true; + collecting = true; + } + else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_DIGIT || (token.empty() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) { + collecting_numeric = true; + collecting = true; + } + else if ( + ((unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_DIGIT) && (unicode_cpt_type(utf_char) != CODEPOINT_TYPE_WHITESPACE)) || + (token.empty() && utf_char == " " && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_DIGIT && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE) + ) { + collecting_special = true; + collecting = true; + } + else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_WHITESPACE) { + collecting_whitespace_lookahead = true; + collecting = true; + } + else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE) { + split_condition = true; + } + } + else if (!split_condition && collecting) { + if (collecting_letter && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER) { + split_condition = true; + } + else if (collecting_numeric && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_DIGIT) { + split_condition = true; + } + else if (collecting_special && (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_DIGIT || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) { + split_condition = true; + } + else if (collecting_whitespace_lookahead && (unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) { + split_condition = true; + } + } + + if (utf_char_next == "") { + split_condition = true; // final + token += utf_char; + } + + if (split_condition) { + if (token.size()) { + bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size()); + } + token = utf_char; + collecting = false; + collecting_letter = false; + collecting_numeric = false; + collecting_special = false; + collecting_whitespace_lookahead = false; + } + else { + token += utf_char; + } + } + + start += offset; + } + + return bpe_offsets; +} + +// use std::wregex to split the text +static std::vector<size_t> unicode_regex_split_stl(const std::wstring & wtext, const std::wstring & regex_expr, const std::vector<size_t> & offsets) { + std::wregex expr(regex_expr); + std::vector<size_t> bpe_offsets; // store the offset of each word + bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size + size_t start = 0; + for (auto offset : offsets) { + std::wcregex_iterator it(wtext.data() + start, wtext.data() + start + offset, expr); + std::wcregex_iterator end; + + int64_t start_idx = 0; + while (it != end) { + std::wcmatch match = *it; + if (match.position() > start_idx) { + bpe_offsets.emplace_back(match.position() - start_idx); + } + bpe_offsets.emplace_back(match.length()); + start_idx = match.position() + match.length(); + ++it; + } + + if (start_idx < (int64_t) offset) { + bpe_offsets.emplace_back(offset - start_idx); + } + start += offset; + } + + return bpe_offsets; +} + +// use std::regex to split the text +static std::vector<size_t> unicode_regex_split_stl(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) { + std::regex expr(regex_expr); + std::vector<size_t> bpe_offsets; // store the offset of each word + bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size + size_t start = 0; + for (auto offset : offsets) { + std::cregex_iterator it(text.data() + start, text.data() + start + offset, expr); + std::cregex_iterator end; + + int64_t start_idx = 0; + while (it != end) { + std::cmatch match = *it; + if (match.position() > start_idx) { + bpe_offsets.emplace_back(match.position() - start_idx); + } + bpe_offsets.emplace_back(match.length()); + start_idx = match.position() + match.length(); + ++it; + } + + if (start_idx < (int64_t) offset) { + bpe_offsets.emplace_back(offset - start_idx); + } + start += offset; + } + + return bpe_offsets; +} + +static std::vector<size_t> unicode_regex_split_custom(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) { + std::vector<size_t> bpe_offsets; + + (void)(text); + (void)(regex_expr); + (void)(offsets); + // TODO: this implementation is actually wrong, uncomment and run: + // make -j && ./bin/test-tokenizer-0 ../models/ggml-vocab-gpt-2.gguf + //if (regex_expr == "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)") { + // bpe_offsets = unicode_regex_split_custom_gpt2(text, offsets); + //} + + return bpe_offsets; +} + // // interface // std::string unicode_cpt_to_utf8(uint32_t cp) { std::string result; + if (/* 0x00 <= cp && */ cp <= 0x7f) { result.push_back(cp); + return result; } - else if (0x80 <= cp && cp <= 0x7ff) { + if (0x80 <= cp && cp <= 0x7ff) { result.push_back(0xc0 | ((cp >> 6) & 0x1f)); result.push_back(0x80 | (cp & 0x3f)); + return result; } - else if (0x800 <= cp && cp <= 0xffff) { + if (0x800 <= cp && cp <= 0xffff) { result.push_back(0xe0 | ((cp >> 12) & 0x0f)); result.push_back(0x80 | ((cp >> 6) & 0x3f)); result.push_back(0x80 | (cp & 0x3f)); + return result; } - else if (0x10000 <= cp && cp <= 0x10ffff) { + if (0x10000 <= cp && cp <= 0x10ffff) { result.push_back(0xf0 | ((cp >> 18) & 0x07)); result.push_back(0x80 | ((cp >> 12) & 0x3f)); result.push_back(0x80 | ((cp >> 6) & 0x3f)); result.push_back(0x80 | (cp & 0x3f)); + return result; } - else { - throw std::invalid_argument("invalid codepoint"); - } - return result; + + throw std::invalid_argument("invalid codepoint"); } std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts) { @@ -275,3 +520,167 @@ char32_t unicode_tolower(char32_t cp) { auto it = unicode_map_lowercase.find(cp); return it == unicode_map_lowercase.end() ? cp : it->second; } + +std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) { + // unicode categories + static const std::map<std::string, int> k_ucat_enum = { + { "\\p{N}", CODEPOINT_TYPE_DIGIT }, + { "\\p{L}", CODEPOINT_TYPE_LETTER }, + { "\\p{P}", CODEPOINT_TYPE_PUNCTUATION }, + }; + + static const std::map<int, int> k_ucat_cpt = { + { CODEPOINT_TYPE_DIGIT, 0xD1 }, + { CODEPOINT_TYPE_LETTER, 0xD2 }, + { CODEPOINT_TYPE_PUNCTUATION, 0xD3 }, + }; + + static const std::map<int, std::string> k_ucat_map = { + { CODEPOINT_TYPE_DIGIT, "\x30-\x39" }, // 0-9 + { CODEPOINT_TYPE_LETTER, "\x41-\x5A\x61-\x7A" }, // A-Za-z + { CODEPOINT_TYPE_PUNCTUATION, "\x21-\x23\x25-\x2A\x2C-\x2F\x3A-\x3B\x3F-\x40\\\x5B-\\\x5D\x5F\\\x7B\\\x7D" }, // !-#%-*,-/:-;?-@\[-\]_\{\} + }; + + // compute collapsed codepoints only if needed by at least one regex + bool need_collapse = false; + for (auto & regex_expr : regex_exprs) { + // search for unicode categories + for (const auto & ucat : k_ucat_enum) { + if (std::string::npos != regex_expr.find(ucat.first)) { + need_collapse = true; + break; + } + } + } + + const auto cpts = unicode_cpts_from_utf8(text); + + // generate a "collapsed" representation of the text, where all codepoints are replaced by a single byte + // ref: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2081479935 + std::string text_collapsed; + if (need_collapse) { + // collapse all unicode categories + text_collapsed.resize(cpts.size()); + + for (size_t i = 0; i < cpts.size(); ++i) { + // keep single-byte codepoints as is + if (cpts[i] < 128) { + text_collapsed[i] = cpts[i]; + continue; + } + + const int cpt_type = unicode_cpt_type(cpts[i]); + + if (k_ucat_cpt.find(cpt_type) != k_ucat_cpt.end()) { + text_collapsed[i] = k_ucat_cpt.at(cpt_type); + } else { + text_collapsed[i] = (char) 0xD0; // fallback + } + } + } + + std::vector<size_t> bpe_offsets = { cpts.size() }; + + for (auto & regex_expr : regex_exprs) { + // first, see if we have an efficient custom regex implementation + auto tmp = unicode_regex_split_custom(text, regex_expr, bpe_offsets); + + if (!tmp.empty()) { + bpe_offsets = std::move(tmp); + continue; + } + + // fallback to general-purpose std::regex / std::wregex + try { + // if a unicode category is used in the regex, we use the collapsed text and replace the unicode category + // with the corresponding collapsed representation + bool use_collapsed = false; + for (auto & ucat : k_ucat_enum) { + if (std::string::npos != regex_expr.find(ucat.first)) { + use_collapsed = true; + break; + } + } + + if (use_collapsed) { + // sanity-check that the original regex does not contain any non-ASCII characters + const auto cpts_regex = unicode_cpts_from_utf8(regex_expr); + for (size_t i = 0; i < cpts_regex.size(); ++i) { + if (cpts_regex[i] >= 128) { + throw std::runtime_error("Regex includes both unicode categories and non-ASCII characters - not supported"); + } + } + + // generate a collapsed representation of the regex + std::string regex_expr_collapsed; + + // track if we are inside [], because nested [] are not allowed + bool inside = false; + for (size_t i = 0; i < regex_expr.size(); ++i) { + if (regex_expr[i] == '[' && (i == 0 || regex_expr[i - 1] != '\\')) { + regex_expr_collapsed += '['; + inside = true; + continue; + } + + if (inside && regex_expr[i] == ']' && regex_expr[i - 1] != '\\') { + regex_expr_collapsed += ']'; + inside = false; + continue; + } + + if (regex_expr[i + 0] == '\\' && i + 4 < regex_expr.size() && + regex_expr[i + 1] == 'p' && + regex_expr[i + 2] == '{' && + regex_expr[i + 4] == '}') { + const std::string pat = regex_expr.substr(i, 5); + if (k_ucat_enum.find(pat) != k_ucat_enum.end()) { + if (!inside) { + regex_expr_collapsed += '['; + } + regex_expr_collapsed += k_ucat_cpt.at(k_ucat_enum.at(pat)); + regex_expr_collapsed += k_ucat_map.at(k_ucat_enum.at(pat)); + if (!inside) { + regex_expr_collapsed += ']'; + } + i += 4; + continue; + } + } + + regex_expr_collapsed += regex_expr[i]; + } + + //printf("text_collapsed: %s\n", text_collapsed.c_str()); + //printf("regex_expr_collapsed: %s\n", regex_expr_collapsed.c_str()); + bpe_offsets = unicode_regex_split_stl(text_collapsed, regex_expr_collapsed, bpe_offsets); + } else { + // no unicode category used, we can use std::wregex directly + const std::wstring wtext = unicode_wstring_from_utf8(text); + const std::wstring wregex_expr = unicode_wstring_from_utf8(regex_expr); + + //printf("text: %s\n", text.c_str()); + //printf("regex_expr: %s\n", regex_expr.c_str()); + bpe_offsets = unicode_regex_split_stl(wtext, wregex_expr, bpe_offsets); + } + } catch (std::regex_error & e) { + fprintf(stderr, "Failed to process regex: '%s'\n", regex_expr.c_str()); + fprintf(stderr, "Regex error: %s\n", e.what()); + throw std::runtime_error("Failed to process regex"); + } + } + + std::vector<std::string> bpe_words; + bpe_words.reserve(bpe_offsets.size()); // reserve memory for the approximate size + + size_t start = 0; + for (size_t & offset : bpe_offsets) { + bpe_words.emplace_back(); + for (size_t i = start; i < start + offset; ++i) { + bpe_words.back() += unicode_cpt_to_utf8(cpts[i]); + } + start += offset; + } + + return unicode_byte_encoding_process(bpe_words); +} |