diff options
author | jaime-m-p <167997752+jaime-m-p@users.noreply.github.com> | 2024-05-09 15:30:44 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-05-09 23:30:44 +1000 |
commit | 43248e559472556f368988575d9fba906b3eb139 (patch) | |
tree | 01ac2174500a50d7326e3d62a30dd469b946e60a /unicode.cpp | |
parent | a743d76a01f23038b2c85af1e9048ee836767b44 (diff) |
llama3 custom regex split (#6965)
* merged the changes from deepseeker models to main branch
* Moved regex patterns to unicode.cpp and updated unicode.h
* Moved header files
* Resolved issues
* added and refactored unicode_regex_split and related functions
* Updated/merged the deepseek coder pr
* Refactored code
* Adding unicode regex mappings
* Adding unicode regex function
* Added needed functionality, testing remains
* Fixed issues
* Fixed issue with gpt2 regex custom preprocessor
* unicode : fix? unicode_wstring_to_utf8
* lint : fix whitespaces
* tests : add tokenizer tests for numbers
* unicode : remove redundant headers
* tests : remove and rename tokenizer test scripts
* tests : add sample usage
* gguf-py : reader prints warnings on duplicate keys
* llama : towards llama3 tokenization support (wip)
* unicode : shot in the dark to fix tests on Windows
* unicode : first try custom implementations
* convert : add "tokenizer.ggml.pre" GGUF KV (wip)
* llama : use new pre-tokenizer type
* convert : fix pre-tokenizer type writing
* lint : fix
* make : add test-tokenizer-0-llama-v3
* wip
* models : add llama v3 vocab file
* llama : adapt punctuation regex + add llama 3 regex
* minor
* unicode : set bomb
* unicode : set bomb
* unicode : always use std::wregex
* unicode : support \p{N}, \p{L} and \p{P} natively
* unicode : try fix windows
* unicode : category support via std::regex
* unicode : clean-up
* unicode : simplify
* llama3 custom regex split
* convert : add convert-hf-to-gguf-update.py
ggml-ci
* lint : update
* convert : add falcon
ggml-ci
* unicode : normalize signatures
* lint : fix
* lint : fix
* convert : remove unused functions
* convert : add comments
* convert : exercise contractions
ggml-ci
* Using char32_t for codepoints
* lint : fix
* already exists unicode_tolower()
* Typing
* Restore BOM
* cmake : refactor test targets
* tests : refactor vocab tests
ggml-ci
* tests : add more vocabs and tests
ggml-ci
* unicode : cleanup
* scripts : ignore new update script in check-requirements.sh
* Fix merge
* models : add phi-3, mpt, gpt-2, starcoder
* tests : disable obsolete
ggml-ci
* tests : use faster bpe test
ggml-ci
* llama : more prominent warning for old BPE models
* tests : disable test-tokenizer-1-bpe due to slowness
ggml-ci
* Move unused variable value
* GPT2 custom regex split
* Add alternative regex for custom split llama3
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Style
* Add bruteforce random tests for token encoding
* wip: fixing unicode codepoint ranges
* Fix merge
* Unicode tables: separator, lowercase, uppercase and whitespace
* llama3 custom regex split: fix \s
* Restore BOM
* Style
* wip: generate NDF table
* Ignore special tokens for testing
* Clean gen-unicode-data.py
* Refactor random tokenizer test
* lint : fix
* tests : add fail test for llama-bpe
---------
Co-authored-by: Jaggzh <jaggz.h@gmail.com>
Co-authored-by: Kazim Abrar Mahi <kazimabrarmahi135@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: jaime-m-p <>
Diffstat (limited to 'unicode.cpp')
-rw-r--r-- | unicode.cpp | 366 |
1 files changed, 249 insertions, 117 deletions
diff --git a/unicode.cpp b/unicode.cpp index 955c5696..ca03c49d 100644 --- a/unicode.cpp +++ b/unicode.cpp @@ -9,6 +9,7 @@ #include <stdexcept> #include <string> #include <unordered_map> +#include <unordered_set> #include <utility> #include <vector> #include <locale> @@ -111,27 +112,27 @@ static uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) static std::unordered_map<uint32_t, int> unicode_cpt_type_map() { std::unordered_map<uint32_t, int> cpt_types; for (auto p : unicode_ranges_number) { - for (auto i = p.first; i <= p.second; ++ i) { + for (auto i = p.first; i <= p.second; ++i) { cpt_types[i] = CODEPOINT_TYPE_NUMBER; } } for (auto p : unicode_ranges_letter) { - for (auto i = p.first; i <= p.second; ++ i) { + for (auto i = p.first; i <= p.second; ++i) { cpt_types[i] = CODEPOINT_TYPE_LETTER; } } - for (auto p : unicode_ranges_whitespace) { - for (auto i = p.first; i <= p.second; ++ i) { - cpt_types[i] = CODEPOINT_TYPE_WHITESPACE; + for (auto p : unicode_ranges_separator) { + for (auto i = p.first; i <= p.second; ++i) { + cpt_types[i] = CODEPOINT_TYPE_SEPARATOR; } } for (auto p : unicode_ranges_accent_mark) { - for (auto i = p.first; i <= p.second; ++ i) { + for (auto i = p.first; i <= p.second; ++i) { cpt_types[i] = CODEPOINT_TYPE_ACCENT_MARK; } } for (auto p : unicode_ranges_punctuation) { - for (auto i = p.first; i <= p.second; ++ i) { + for (auto i = p.first; i <= p.second; ++i) { cpt_types[i] = CODEPOINT_TYPE_PUNCTUATION; } } @@ -141,7 +142,7 @@ static std::unordered_map<uint32_t, int> unicode_cpt_type_map() { } } for (auto p : unicode_ranges_control) { - for (auto i = p.first; i <= p.second; ++ i) { + for (auto i = p.first; i <= p.second; ++i) { cpt_types[i] = CODEPOINT_TYPE_CONTROL; } } @@ -224,138 +225,256 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t std::vector<size_t> bpe_offsets; // store the offset of each word bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size 
- size_t start = 0; - const auto cpts = unicode_cpts_from_utf8(text); + size_t start = 0; for (auto offset : offsets) { - std::string token; + const size_t offset_ini = start; + const size_t offset_end = start + offset; + assert(offset_end <= cpts.size()); + start = offset_end; + + auto _get_cpt = [&] (const size_t pos) -> char32_t { + return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0; + }; + + auto _get_cpt_type = [&] (const size_t pos) -> int { + return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_type(cpts[pos]) : CODEPOINT_TYPE_UNIDENTIFIED; + }; + + size_t _prev_end = offset_ini; + auto _add_token = [&] (const size_t end) -> size_t { + assert(_prev_end <= end && end <= offset_end); + size_t len = end - _prev_end; + if (len > 0) { + bpe_offsets.push_back(len); + } + _prev_end = end; + //if (len > 0) { + // std::string s = ""; + // for(size_t p = end-len; p < end; p++) + // s += unicode_cpt_to_utf8(cpts[p]); + // printf(">>> '%s'\n", s.c_str()); + //} + return len; + }; + + for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) { + const char32_t cpt = _get_cpt(pos); + const int cpt_type = _get_cpt_type(pos); + + // regex: 's|'t|'re|'ve|'m|'ll|'d + if (cpt == '\'' && pos+1 < offset_end) { + char32_t cpt_next = _get_cpt(pos+1); + if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || cpt_next == 'd') { + pos += _add_token(pos+2); + continue; + } + if (pos+2 < offset_end) { + char32_t cpt_next_next = _get_cpt(pos+2); + if ((cpt_next == 'r' && cpt_next_next == 'e') || + (cpt_next == 'v' && cpt_next_next == 'e') || + (cpt_next == 'l' && cpt_next_next == 'l')) { + pos += _add_token(pos+3); + continue; + } + } + } - bool collecting_numeric = false; - bool collecting_letter = false; - bool collecting_special = false; - bool collecting_whitespace_lookahead = false; - bool collecting = false; + char32_t cpt2 = (cpt == ' ' ? _get_cpt(pos+1) : cpt); + int cpt2_type = (cpt == ' ' ? 
_get_cpt_type(pos+1) : cpt_type); + // regex: <space>?\p{L}+ + if (cpt2_type == CODEPOINT_TYPE_LETTER) { + pos += (cpt == ' '); + while (cpt2_type == CODEPOINT_TYPE_LETTER) { + cpt2_type = _get_cpt_type(++pos); + } + _add_token(pos); + continue; + } + // regex: <space>?\p{N}+ + if (cpt2_type == CODEPOINT_TYPE_NUMBER) { + pos += (cpt == ' '); + while (cpt2_type == CODEPOINT_TYPE_NUMBER) { + cpt2_type = _get_cpt_type(++pos); + } + _add_token(pos); + continue; + } + // regex: <space>?[^\s\p{L}\p{N}]+ + if (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) { + pos += (cpt == ' '); + while (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) { + cpt2_type = _get_cpt_type(++pos); + cpt2 = _get_cpt(pos); + } + _add_token(pos); + continue; + } - std::vector<std::string> text_utf; - text_utf.reserve(offset); + size_t num_whitespaces = 0; + while (unicode_cpt_is_whitespace(_get_cpt(pos+num_whitespaces))) { + num_whitespaces++; + } - for (size_t i = start; i < start + offset; ++i) { - text_utf.emplace_back(unicode_cpt_to_utf8(cpts[i])); + // regex: \s+(?!\S) + if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != 0) { + pos += num_whitespaces - 1; + _add_token(pos); + continue; + } + + // regex: \s+ + if (num_whitespaces > 0) { + pos += num_whitespaces; + _add_token(pos); + continue; + } + + // no matches + _add_token(++pos); } + } + + return bpe_offsets; +} - for (int i = 0; i < (int)text_utf.size(); i++) { - const std::string & utf_char = text_utf[i]; - bool split_condition = false; - int bytes_remain = text_utf.size() - i; +// LLAMA3 system regex: "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+" +static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string & text, const 
std::vector<size_t> & offsets) { + std::vector<size_t> bpe_offsets; // store the offset of each word + bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size - // forward backward lookups - const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : ""; - const std::string & utf_char_next_next = (i + 2 < (int)text_utf.size()) ? text_utf[i + 2] : ""; + const auto cpts = unicode_cpts_from_utf8(text); - // handling contractions - if (!split_condition && bytes_remain >= 2) { - // 's|'t|'m|'d - if (utf_char == "\'" && (utf_char_next == "s" || utf_char_next == "t" || utf_char_next == "m" || utf_char_next == "d")) { - split_condition = true; + size_t start = 0; + for (auto offset : offsets) { + const size_t offset_ini = start; + const size_t offset_end = start + offset; + assert(offset_end <= cpts.size()); + start = offset_end; + + auto _get_cpt = [&] (const size_t pos) -> char32_t { + return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0; + }; + + auto _get_cpt_type = [&] (const size_t pos) -> int { + return (offset_ini <= pos && pos < offset_end) ? 
unicode_cpt_type(cpts[pos]) : CODEPOINT_TYPE_UNIDENTIFIED; + }; + + size_t _prev_end = offset_ini; + auto _add_token = [&] (const size_t end) -> size_t { + assert(_prev_end <= end && end <= offset_end); + size_t len = end - _prev_end; + if (len > 0) { + bpe_offsets.push_back(len); + } + _prev_end = end; + //if (len > 0) { + // std::string s = ""; + // for(size_t p = end-len; p < end; p++) + // s += unicode_cpt_to_utf8(cpts[p]); + // printf(">>> '%s'\n", s.c_str()); + //} + return len; + }; + + for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) { + const char32_t cpt = _get_cpt(pos); + const int cpt_type = _get_cpt_type(pos); + + // regex: (?i:'s|'t|'re|'ve|'m|'ll|'d) // case insensitive + if (cpt == '\'' && pos+1 < offset_end) { + char32_t cpt_next = unicode_tolower(_get_cpt(pos+1)); + if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || cpt_next == 'd') { + pos += _add_token(pos+2); + continue; } - if (split_condition) { - if (token.size()) { - bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size()); + if (pos+2 < offset_end) { + char32_t cpt_next_next = unicode_tolower(_get_cpt(pos+2)); + if ((cpt_next == 'r' && cpt_next_next == 'e') || + (cpt_next == 'v' && cpt_next_next == 'e') || + (cpt_next == 'l' && cpt_next_next == 'l')) { + pos += _add_token(pos+3); + continue; } - token = utf_char + utf_char_next; - bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size()); - token = ""; - i++; - continue; } } - if (!split_condition && bytes_remain >= 3) { - // 're|'ve|'ll - if (utf_char == "\'" && ( - (utf_char_next == "r" && utf_char_next_next == "e") || - (utf_char_next == "v" && utf_char_next_next == "e") || - (utf_char_next == "l" && utf_char_next_next == "l")) - ) { - split_condition = true; - } - if (split_condition) { - // current token + next token can be defined - if (token.size()) { - bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size()); - } - token = utf_char; - token += utf_char_next; - token += utf_char_next_next; - 
bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size()); - token = ""; - i += 2; + // regex: [^\r\n\p{L}\p{N}]?\p{L}+ //####FIXME: the first \p{L} is correct? + if (cpt != '\r' && cpt != '\n' && /*cpt_type != CODEPOINT_TYPE_LETTER &&*/ cpt_type != CODEPOINT_TYPE_NUMBER) { + if (cpt_type == CODEPOINT_TYPE_LETTER || _get_cpt_type(pos+1) == CODEPOINT_TYPE_LETTER) { // one or more letters + pos++; + while (_get_cpt_type(pos) == CODEPOINT_TYPE_LETTER) { + pos++; + } + _add_token(pos); continue; } } - if (!split_condition && !collecting) { - if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || (token.empty() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER)) { - collecting_letter = true; - collecting = true; - } - else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_NUMBER || (token.empty() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_NUMBER)) { - collecting_numeric = true; - collecting = true; - } - else if ( - ((unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_NUMBER) && (unicode_cpt_type(utf_char) != CODEPOINT_TYPE_WHITESPACE)) || - (token.empty() && utf_char == " " && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_NUMBER && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE) - ) { - collecting_special = true; - collecting = true; - } - else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_WHITESPACE) { - collecting_whitespace_lookahead = true; - collecting = true; - } - else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE) { - split_condition = true; + // regex: \p{N}{1,3} + if (cpt_type == CODEPOINT_TYPE_NUMBER) { + size_t ini = pos; + while (_get_cpt_type(pos) == CODEPOINT_TYPE_NUMBER) { + if (++pos - ini >= 3 ) { + _add_token(pos); + ini = pos; + } } + _add_token(pos); + continue; } - else if 
(!split_condition && collecting) { - if (collecting_letter && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER) { - split_condition = true; - } - else if (collecting_numeric && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_NUMBER) { - split_condition = true; + + // regex: <space>?[^\s\p{L}\p{N}]+[\r\n]* + char32_t cpt2 = (cpt == ' ' ? _get_cpt(pos+1) : cpt); + int cpt2_type = (cpt == ' ' ? _get_cpt_type(pos+1) : cpt_type); + if (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) { + pos += (cpt == ' '); + while (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) { + cpt2_type = _get_cpt_type(++pos); + cpt2 = _get_cpt(pos); } - else if (collecting_special && (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_NUMBER || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) { - split_condition = true; + while (cpt2 == '\r' || cpt2 == '\n') { + cpt2 = _get_cpt(++pos); } - else if (collecting_whitespace_lookahead && (unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_NUMBER)) { - split_condition = true; + _add_token(pos); + continue; + } + + size_t num_whitespaces = 0; + size_t last_end_r_or_n = 0; + while (unicode_cpt_is_whitespace(_get_cpt(pos+num_whitespaces))) { + char32_t cpt2 = _get_cpt(pos+num_whitespaces); + if (cpt2 == '\r' || cpt2 == '\n') { + last_end_r_or_n = pos + num_whitespaces + 1; } + num_whitespaces++; } - if (utf_char_next == "") { - split_condition = true; // final - token += utf_char; + // regex: \s*[\r\n]+ + if (last_end_r_or_n > 0) { + pos = last_end_r_or_n; + _add_token(pos); + continue; } - if (split_condition) { - if (token.size()) { - bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size()); - } - token = utf_char; - 
collecting = false; - collecting_letter = false; - collecting_numeric = false; - collecting_special = false; - collecting_whitespace_lookahead = false; + // regex: \s+(?!\S) + if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != 0) { + pos += num_whitespaces - 1; + _add_token(pos); + continue; } - else { - token += utf_char; + + // regex: \s+ + if (num_whitespaces > 0) { + pos += num_whitespaces; + _add_token(pos); + continue; } - } - start += offset; + // no matches + _add_token(++pos); + } } return bpe_offsets; @@ -424,14 +543,14 @@ static std::vector<size_t> unicode_regex_split_stl(const std::string & text, con static std::vector<size_t> unicode_regex_split_custom(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) { std::vector<size_t> bpe_offsets; - (void)(text); - (void)(regex_expr); - (void)(offsets); - // TODO: this implementation is actually wrong, uncomment and run: - // make -j && ./bin/test-tokenizer-0 ../models/ggml-vocab-gpt-2.gguf - //if (regex_expr == "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)") { - // bpe_offsets = unicode_regex_split_custom_gpt2(text, offsets); - //} + if (regex_expr == "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)") { + bpe_offsets = unicode_regex_split_custom_gpt2(text, offsets); + } else if ( + regex_expr == "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" || + regex_expr == "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+") { + + bpe_offsets = unicode_regex_split_custom_llama3(text, offsets); + } return bpe_offsets; } @@ -506,6 +625,19 @@ int unicode_cpt_type(const std::string & utf8) { return unicode_cpt_type(unicode_cpt_from_utf8(utf8, offset)); } +bool unicode_cpt_is_whitespace(uint32_t cp) { + static const 
std::unordered_set<uint32_t> is_whitespace = [] { + std::unordered_set<uint32_t> is_whitespace; + for (auto p : unicode_ranges_whitespace) { + for (auto i = p.first; i <= p.second; ++i) { + is_whitespace.insert(i); + } + } + return is_whitespace; + }(); + return (bool)is_whitespace.count(cp); +} + std::string unicode_byte_to_utf8(uint8_t byte) { static std::unordered_map<uint8_t, std::string> map = unicode_byte_to_utf8_map(); return map.at(byte); |