summary | refs | log | tree | commit | diff
path: root/unicode.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'unicode.cpp')
-rw-r--r-- unicode.cpp 366
1 files changed, 249 insertions, 117 deletions
diff --git a/unicode.cpp b/unicode.cpp
index 955c5696..ca03c49d 100644
--- a/unicode.cpp
+++ b/unicode.cpp
@@ -9,6 +9,7 @@
#include <stdexcept>
#include <string>
#include <unordered_map>
+#include <unordered_set>
#include <utility>
#include <vector>
#include <locale>
@@ -111,27 +112,27 @@ static uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset)
static std::unordered_map<uint32_t, int> unicode_cpt_type_map() {
std::unordered_map<uint32_t, int> cpt_types;
for (auto p : unicode_ranges_number) {
- for (auto i = p.first; i <= p.second; ++ i) {
+ for (auto i = p.first; i <= p.second; ++i) {
cpt_types[i] = CODEPOINT_TYPE_NUMBER;
}
}
for (auto p : unicode_ranges_letter) {
- for (auto i = p.first; i <= p.second; ++ i) {
+ for (auto i = p.first; i <= p.second; ++i) {
cpt_types[i] = CODEPOINT_TYPE_LETTER;
}
}
- for (auto p : unicode_ranges_whitespace) {
- for (auto i = p.first; i <= p.second; ++ i) {
- cpt_types[i] = CODEPOINT_TYPE_WHITESPACE;
+ for (auto p : unicode_ranges_separator) {
+ for (auto i = p.first; i <= p.second; ++i) {
+ cpt_types[i] = CODEPOINT_TYPE_SEPARATOR;
}
}
for (auto p : unicode_ranges_accent_mark) {
- for (auto i = p.first; i <= p.second; ++ i) {
+ for (auto i = p.first; i <= p.second; ++i) {
cpt_types[i] = CODEPOINT_TYPE_ACCENT_MARK;
}
}
for (auto p : unicode_ranges_punctuation) {
- for (auto i = p.first; i <= p.second; ++ i) {
+ for (auto i = p.first; i <= p.second; ++i) {
cpt_types[i] = CODEPOINT_TYPE_PUNCTUATION;
}
}
@@ -141,7 +142,7 @@ static std::unordered_map<uint32_t, int> unicode_cpt_type_map() {
}
}
for (auto p : unicode_ranges_control) {
- for (auto i = p.first; i <= p.second; ++ i) {
+ for (auto i = p.first; i <= p.second; ++i) {
cpt_types[i] = CODEPOINT_TYPE_CONTROL;
}
}
@@ -224,138 +225,256 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
std::vector<size_t> bpe_offsets; // store the offset of each word
bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
- size_t start = 0;
-
const auto cpts = unicode_cpts_from_utf8(text);
+ size_t start = 0;
for (auto offset : offsets) {
- std::string token;
+ const size_t offset_ini = start;
+ const size_t offset_end = start + offset;
+ assert(offset_end <= cpts.size());
+ start = offset_end;
+
+ auto _get_cpt = [&] (const size_t pos) -> char32_t {
+ return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0;
+ };
+
+ auto _get_cpt_type = [&] (const size_t pos) -> int {
+ return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_type(cpts[pos]) : CODEPOINT_TYPE_UNIDENTIFIED;
+ };
+
+ size_t _prev_end = offset_ini;
+ auto _add_token = [&] (const size_t end) -> size_t {
+ assert(_prev_end <= end && end <= offset_end);
+ size_t len = end - _prev_end;
+ if (len > 0) {
+ bpe_offsets.push_back(len);
+ }
+ _prev_end = end;
+ //if (len > 0) {
+ // std::string s = "";
+ // for(size_t p = end-len; p < end; p++)
+ // s += unicode_cpt_to_utf8(cpts[p]);
+ // printf(">>> '%s'\n", s.c_str());
+ //}
+ return len;
+ };
+
+ for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
+ const char32_t cpt = _get_cpt(pos);
+ const int cpt_type = _get_cpt_type(pos);
+
+ // regex: 's|'t|'re|'ve|'m|'ll|'d
+ if (cpt == '\'' && pos+1 < offset_end) {
+ char32_t cpt_next = _get_cpt(pos+1);
+ if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || cpt_next == 'd') {
+ pos += _add_token(pos+2);
+ continue;
+ }
+ if (pos+2 < offset_end) {
+ char32_t cpt_next_next = _get_cpt(pos+2);
+ if ((cpt_next == 'r' && cpt_next_next == 'e') ||
+ (cpt_next == 'v' && cpt_next_next == 'e') ||
+ (cpt_next == 'l' && cpt_next_next == 'l')) {
+ pos += _add_token(pos+3);
+ continue;
+ }
+ }
+ }
- bool collecting_numeric = false;
- bool collecting_letter = false;
- bool collecting_special = false;
- bool collecting_whitespace_lookahead = false;
- bool collecting = false;
+ char32_t cpt2 = (cpt == ' ' ? _get_cpt(pos+1) : cpt);
+ int cpt2_type = (cpt == ' ' ? _get_cpt_type(pos+1) : cpt_type);
+ // regex: <space>?\p{L}+
+ if (cpt2_type == CODEPOINT_TYPE_LETTER) {
+ pos += (cpt == ' ');
+ while (cpt2_type == CODEPOINT_TYPE_LETTER) {
+ cpt2_type = _get_cpt_type(++pos);
+ }
+ _add_token(pos);
+ continue;
+ }
+ // regex: <space>?\p{N}+
+ if (cpt2_type == CODEPOINT_TYPE_NUMBER) {
+ pos += (cpt == ' ');
+ while (cpt2_type == CODEPOINT_TYPE_NUMBER) {
+ cpt2_type = _get_cpt_type(++pos);
+ }
+ _add_token(pos);
+ continue;
+ }
+ // regex: <space>?[^\s\p{L}\p{N}]+
+ if (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
+ pos += (cpt == ' ');
+ while (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
+ cpt2_type = _get_cpt_type(++pos);
+ cpt2 = _get_cpt(pos);
+ }
+ _add_token(pos);
+ continue;
+ }
- std::vector<std::string> text_utf;
- text_utf.reserve(offset);
+ size_t num_whitespaces = 0;
+ while (unicode_cpt_is_whitespace(_get_cpt(pos+num_whitespaces))) {
+ num_whitespaces++;
+ }
- for (size_t i = start; i < start + offset; ++i) {
- text_utf.emplace_back(unicode_cpt_to_utf8(cpts[i]));
+ // regex: \s+(?!\S)
+ if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != 0) {
+ pos += num_whitespaces - 1;
+ _add_token(pos);
+ continue;
+ }
+
+ // regex: \s+
+ if (num_whitespaces > 0) {
+ pos += num_whitespaces;
+ _add_token(pos);
+ continue;
+ }
+
+ // no matches
+ _add_token(++pos);
}
+ }
+
+ return bpe_offsets;
+}
- for (int i = 0; i < (int)text_utf.size(); i++) {
- const std::string & utf_char = text_utf[i];
- bool split_condition = false;
- int bytes_remain = text_utf.size() - i;
+// LLAMA3 system regex: "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"
+static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string & text, const std::vector<size_t> & offsets) {
+ std::vector<size_t> bpe_offsets; // store the offset of each word
+ bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
- // forward backward lookups
- const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
- const std::string & utf_char_next_next = (i + 2 < (int)text_utf.size()) ? text_utf[i + 2] : "";
+ const auto cpts = unicode_cpts_from_utf8(text);
- // handling contractions
- if (!split_condition && bytes_remain >= 2) {
- // 's|'t|'m|'d
- if (utf_char == "\'" && (utf_char_next == "s" || utf_char_next == "t" || utf_char_next == "m" || utf_char_next == "d")) {
- split_condition = true;
+ size_t start = 0;
+ for (auto offset : offsets) {
+ const size_t offset_ini = start;
+ const size_t offset_end = start + offset;
+ assert(offset_end <= cpts.size());
+ start = offset_end;
+
+ auto _get_cpt = [&] (const size_t pos) -> char32_t {
+ return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0;
+ };
+
+ auto _get_cpt_type = [&] (const size_t pos) -> int {
+ return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_type(cpts[pos]) : CODEPOINT_TYPE_UNIDENTIFIED;
+ };
+
+ size_t _prev_end = offset_ini;
+ auto _add_token = [&] (const size_t end) -> size_t {
+ assert(_prev_end <= end && end <= offset_end);
+ size_t len = end - _prev_end;
+ if (len > 0) {
+ bpe_offsets.push_back(len);
+ }
+ _prev_end = end;
+ //if (len > 0) {
+ // std::string s = "";
+ // for(size_t p = end-len; p < end; p++)
+ // s += unicode_cpt_to_utf8(cpts[p]);
+ // printf(">>> '%s'\n", s.c_str());
+ //}
+ return len;
+ };
+
+ for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
+ const char32_t cpt = _get_cpt(pos);
+ const int cpt_type = _get_cpt_type(pos);
+
+ // regex: (?i:'s|'t|'re|'ve|'m|'ll|'d) // case insensitive
+ if (cpt == '\'' && pos+1 < offset_end) {
+ char32_t cpt_next = unicode_tolower(_get_cpt(pos+1));
+ if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || cpt_next == 'd') {
+ pos += _add_token(pos+2);
+ continue;
}
- if (split_condition) {
- if (token.size()) {
- bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size());
+ if (pos+2 < offset_end) {
+ char32_t cpt_next_next = unicode_tolower(_get_cpt(pos+2));
+ if ((cpt_next == 'r' && cpt_next_next == 'e') ||
+ (cpt_next == 'v' && cpt_next_next == 'e') ||
+ (cpt_next == 'l' && cpt_next_next == 'l')) {
+ pos += _add_token(pos+3);
+ continue;
}
- token = utf_char + utf_char_next;
- bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size());
- token = "";
- i++;
- continue;
}
}
- if (!split_condition && bytes_remain >= 3) {
- // 're|'ve|'ll
- if (utf_char == "\'" && (
- (utf_char_next == "r" && utf_char_next_next == "e") ||
- (utf_char_next == "v" && utf_char_next_next == "e") ||
- (utf_char_next == "l" && utf_char_next_next == "l"))
- ) {
- split_condition = true;
- }
- if (split_condition) {
- // current token + next token can be defined
- if (token.size()) {
- bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size());
- }
- token = utf_char;
- token += utf_char_next;
- token += utf_char_next_next;
- bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size());
- token = "";
- i += 2;
+ // regex: [^\r\n\p{L}\p{N}]?\p{L}+ //####FIXME: the first \p{L} is correct?
+ if (cpt != '\r' && cpt != '\n' && /*cpt_type != CODEPOINT_TYPE_LETTER &&*/ cpt_type != CODEPOINT_TYPE_NUMBER) {
+ if (cpt_type == CODEPOINT_TYPE_LETTER || _get_cpt_type(pos+1) == CODEPOINT_TYPE_LETTER) { // one or more letters
+ pos++;
+ while (_get_cpt_type(pos) == CODEPOINT_TYPE_LETTER) {
+ pos++;
+ }
+ _add_token(pos);
continue;
}
}
- if (!split_condition && !collecting) {
- if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || (token.empty() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER)) {
- collecting_letter = true;
- collecting = true;
- }
- else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_NUMBER || (token.empty() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_NUMBER)) {
- collecting_numeric = true;
- collecting = true;
- }
- else if (
- ((unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_NUMBER) && (unicode_cpt_type(utf_char) != CODEPOINT_TYPE_WHITESPACE)) ||
- (token.empty() && utf_char == " " && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_NUMBER && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE)
- ) {
- collecting_special = true;
- collecting = true;
- }
- else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_WHITESPACE) {
- collecting_whitespace_lookahead = true;
- collecting = true;
- }
- else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE) {
- split_condition = true;
+ // regex: \p{N}{1,3}
+ if (cpt_type == CODEPOINT_TYPE_NUMBER) {
+ size_t ini = pos;
+ while (_get_cpt_type(pos) == CODEPOINT_TYPE_NUMBER) {
+ if (++pos - ini >= 3 ) {
+ _add_token(pos);
+ ini = pos;
+ }
}
+ _add_token(pos);
+ continue;
}
- else if (!split_condition && collecting) {
- if (collecting_letter && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER) {
- split_condition = true;
- }
- else if (collecting_numeric && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_NUMBER) {
- split_condition = true;
+
+ // regex: <space>?[^\s\p{L}\p{N}]+[\r\n]*
+ char32_t cpt2 = (cpt == ' ' ? _get_cpt(pos+1) : cpt);
+ int cpt2_type = (cpt == ' ' ? _get_cpt_type(pos+1) : cpt_type);
+ if (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
+ pos += (cpt == ' ');
+ while (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
+ cpt2_type = _get_cpt_type(++pos);
+ cpt2 = _get_cpt(pos);
}
- else if (collecting_special && (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_NUMBER || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
- split_condition = true;
+ while (cpt2 == '\r' || cpt2 == '\n') {
+ cpt2 = _get_cpt(++pos);
}
- else if (collecting_whitespace_lookahead && (unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_NUMBER)) {
- split_condition = true;
+ _add_token(pos);
+ continue;
+ }
+
+ size_t num_whitespaces = 0;
+ size_t last_end_r_or_n = 0;
+ while (unicode_cpt_is_whitespace(_get_cpt(pos+num_whitespaces))) {
+ char32_t cpt2 = _get_cpt(pos+num_whitespaces);
+ if (cpt2 == '\r' || cpt2 == '\n') {
+ last_end_r_or_n = pos + num_whitespaces + 1;
}
+ num_whitespaces++;
}
- if (utf_char_next == "") {
- split_condition = true; // final
- token += utf_char;
+ // regex: \s*[\r\n]+
+ if (last_end_r_or_n > 0) {
+ pos = last_end_r_or_n;
+ _add_token(pos);
+ continue;
}
- if (split_condition) {
- if (token.size()) {
- bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size());
- }
- token = utf_char;
- collecting = false;
- collecting_letter = false;
- collecting_numeric = false;
- collecting_special = false;
- collecting_whitespace_lookahead = false;
+ // regex: \s+(?!\S)
+ if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != 0) {
+ pos += num_whitespaces - 1;
+ _add_token(pos);
+ continue;
}
- else {
- token += utf_char;
+
+ // regex: \s+
+ if (num_whitespaces > 0) {
+ pos += num_whitespaces;
+ _add_token(pos);
+ continue;
}
- }
- start += offset;
+ // no matches
+ _add_token(++pos);
+ }
}
return bpe_offsets;
@@ -424,14 +543,14 @@ static std::vector<size_t> unicode_regex_split_stl(const std::string & text, con
static std::vector<size_t> unicode_regex_split_custom(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
std::vector<size_t> bpe_offsets;
- (void)(text);
- (void)(regex_expr);
- (void)(offsets);
- // TODO: this implementation is actually wrong, uncomment and run:
- // make -j && ./bin/test-tokenizer-0 ../models/ggml-vocab-gpt-2.gguf
- //if (regex_expr == "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)") {
- // bpe_offsets = unicode_regex_split_custom_gpt2(text, offsets);
- //}
+ if (regex_expr == "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)") { // dispatch is by exact string equality on the regex text
+ bpe_offsets = unicode_regex_split_custom_gpt2(text, offsets); // hand-written splitter for the GPT-2 pattern
+ } else if ( // both spellings below (inline (?i:...) and explicit [sS]-style classes) select the same splitter
+ regex_expr == "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" ||
+ regex_expr == "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+") {
+
+ bpe_offsets = unicode_regex_split_custom_llama3(text, offsets); // hand-written splitter for the LLAMA3 pattern
+ } // any other regex_expr falls through with bpe_offsets still empty
return bpe_offsets;
}
@@ -506,6 +625,19 @@ int unicode_cpt_type(const std::string & utf8) {
return unicode_cpt_type(unicode_cpt_from_utf8(utf8, offset));
}
+bool unicode_cpt_is_whitespace(uint32_t cp) { // true when cp falls inside any range of unicode_ranges_whitespace
+ static const std::unordered_set<uint32_t> is_whitespace = [] { // function-local static: the lambda runs once, on the first call
+ std::unordered_set<uint32_t> is_whitespace; // lambda-local set (same name as the static it initializes)
+ for (auto p : unicode_ranges_whitespace) { // p.first and p.second are used as the inclusive bounds of one range
+ for (auto i = p.first; i <= p.second; ++i) {
+ is_whitespace.insert(i); // expand the range into individual codepoints
+ }
+ }
+ return is_whitespace;
+ }();
+ return (bool)is_whitespace.count(cp); // count() on a set is 0 or 1
+}
+
std::string unicode_byte_to_utf8(uint8_t byte) {
static std::unordered_map<uint8_t, std::string> map = unicode_byte_to_utf8_map();
return map.at(byte);