diff options
author | jaime-m-p <167997752+jaime-m-p@users.noreply.github.com> | 2024-06-18 18:40:52 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-06-18 18:40:52 +0200 |
commit | 37bef8943312d91183ff06d8f1214082a17344a5 (patch) | |
tree | 7713dc5aceb3b181568db3d21b1383762de41c4a /unicode.cpp | |
parent | 91c188d6c296bd3384f2a02a83b71187aa3d18b3 (diff) |
tokenizer : BPE fixes (#7530)
* Random test: add_bos_token, add_eos_token
* Random test: add BPE models for testing
* Custom regex split fails with codepoint 0
* Fix falcon punctuation regex
* Refactor llm_tokenizer_bpe: move code to constructor
* Move 'add_special_bos/eos' logic to llm_tokenizer_bpe
* Move tokenizer flags to vocab structure.
* Default values for special_add_bos/eos
* Build vocab.special_tokens_cache using vocab token types
* Generalize 'jina-v2' per token attributes
* Fix unicode whitespaces (deepseek-coder, deepseek-llm)
* Skip missing byte tokens (falcon)
* Better unicode data generation
* Replace char32_t with uint32_t
Diffstat (limited to 'unicode.cpp')
-rw-r--r-- | unicode.cpp | 29 |
1 files changed, 21 insertions, 8 deletions
diff --git a/unicode.cpp b/unicode.cpp index 2f8d7383..913c34b9 100644 --- a/unicode.cpp +++ b/unicode.cpp @@ -226,8 +226,9 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t assert(offset_end <= cpts.size()); start = offset_end; + static const uint32_t OUT_OF_RANGE = 0xFFFFFFFF; auto _get_cpt = [&] (const size_t pos) -> uint32_t { - return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0; + return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE; }; auto _get_flags = [&] (const size_t pos) -> codepoint_flags { @@ -309,7 +310,7 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t } // regex: \s+(?!\S) - if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != 0) { + if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != OUT_OF_RANGE) { pos += num_whitespaces - 1; _add_token(pos); continue; @@ -344,8 +345,9 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string & assert(offset_end <= cpts.size()); start = offset_end; + static const uint32_t OUT_OF_RANGE = 0xFFFFFFFF; auto _get_cpt = [&] (const size_t pos) -> uint32_t { - return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0; + return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE; }; auto _get_flags = [&] (const size_t pos) -> codepoint_flags { @@ -450,7 +452,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string & } // regex: \s+(?!\S) - if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != 0) { + if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != OUT_OF_RANGE) { pos += num_whitespaces - 1; _add_token(pos); continue; @@ -679,10 +681,14 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std continue; } - const int cpt_flag = unicode_cpt_flags(cpts[i]).category_flag(); + const auto flags = unicode_cpt_flags(cpts[i]); - if (k_ucat_cpt.find(cpt_flag) != k_ucat_cpt.end()) { - text_collapsed[i] = k_ucat_cpt.at(cpt_flag); + if (flags.is_whitespace) { + //NOTE: C++ std::regex \s does not mach 0x85, Rust and Python regex does. + //text_collapsed[i] = (char) 0x85; // <Next Line> as whitespace fallback + text_collapsed[i] = (char) 0x0B; // <vertical tab> as whitespace fallback + } else if (k_ucat_cpt.find(flags.category_flag()) != k_ucat_cpt.end()) { + text_collapsed[i] = k_ucat_cpt.at(flags.category_flag()); } else { text_collapsed[i] = (char) 0xD0; // fallback } @@ -766,9 +772,16 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std bpe_offsets = unicode_regex_split_stl(text_collapsed, regex_expr_collapsed, bpe_offsets); } else { // no unicode category used, we can use std::wregex directly - const std::wstring wtext = unicode_wstring_from_utf8(text); const std::wstring wregex_expr = unicode_wstring_from_utf8(regex_expr); + // std::wregex \s does not mach non-ASCII whitespaces, using 0x0B as fallback + std::wstring wtext(cpts.begin(), cpts.end()); + for (size_t i = 0; i < wtext.size(); ++i) { + if (wtext[i] > 0x7F && unicode_cpt_flags(wtext[i]).is_whitespace) { + wtext[i] = 0x0B; + } + } + //printf("text: %s\n", text.c_str()); //printf("regex_expr: %s\n", regex_expr.c_str()); bpe_offsets = unicode_regex_split_stl(wtext, wregex_expr, bpe_offsets); |