diff options
| author | George Hazan <george.hazan@gmail.com> | 2024-10-09 18:13:40 +0300 |
|---|---|---|
| committer | George Hazan <george.hazan@gmail.com> | 2024-10-09 18:13:40 +0300 |
| commit | 0e86b853be3b5f809ed1decbf636221c1144a386 (patch) | |
| tree | 4ce84d9849646a559d5afece33fc6e64d39e4a50 /libs/litehtml/src/utf8_strings.cpp | |
| parent | 834cbb58d74215980165eab257538ba918a378cd (diff) | |
litehtml update
Diffstat (limited to 'libs/litehtml/src/utf8_strings.cpp')
| -rw-r--r-- | libs/litehtml/src/utf8_strings.cpp | 72 |
1 files changed, 36 insertions, 36 deletions
diff --git a/libs/litehtml/src/utf8_strings.cpp b/libs/litehtml/src/utf8_strings.cpp index 864a4d0147..f720f6d26c 100644 --- a/libs/litehtml/src/utf8_strings.cpp +++ b/libs/litehtml/src/utf8_strings.cpp @@ -4,27 +4,17 @@ namespace litehtml { -utf8_to_utf32::utf8_to_utf32(const char* val) +// consume one utf-8 char and increment index accordingly +// if str[index] == 0 index is not incremented +char32_t read_utf8_char(const string& str, int& index) { - m_utf8 = (const byte*) val; - if (!m_utf8) return; - - while (true) - { - char32_t wch = get_char(); - if (!wch) break; - m_str += wch; - } -} - -char32_t utf8_to_utf32::get_char() -{ - char32_t b1 = getb(); - - if (!b1) + auto getb = [&]() -> byte { - return 0; - } + if (!str[index]) return 0; + return str[index++]; + }; + + byte b1 = getb(); // Determine whether we are dealing // with a one-, two-, three-, or four- @@ -38,31 +28,34 @@ char32_t utf8_to_utf32::get_char() { // 2-byte sequence: 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx char32_t r = (b1 & 0x1f) << 6; - r |= get_next_utf8(getb()); + r |= getb() & 0x3f; return r; } else if ((b1 & 0xf0) == 0xe0) { // 3-byte sequence: zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx char32_t r = (b1 & 0x0f) << 12; - r |= get_next_utf8(getb()) << 6; - r |= get_next_utf8(getb()); + r |= (getb() & 0x3f) << 6; + r |= getb() & 0x3f; return r; } else if ((b1 & 0xf8) == 0xf0) { - // 4-byte sequence: 11101110wwwwzzzzyy + 110111yyyyxxxxxx - // = 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx - // (uuuuu = wwww + 1) - char32_t b2 = get_next_utf8(getb()); - char32_t b3 = get_next_utf8(getb()); - char32_t b4 = get_next_utf8(getb()); - return ((b1 & 7) << 18) | ((b2 & 0x3f) << 12) | - ((b3 & 0x3f) << 6) | (b4 & 0x3f); + // 4-byte sequence: uuuzzzzzzyyyyyyxxxxxx = 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx + byte b2 = getb() & 0x3f; + byte b3 = getb() & 0x3f; + byte b4 = getb() & 0x3f; + return ((b1 & 7) << 18) | (b2 << 12) | (b3 << 6) | b4; } - //bad start for UTF-8 multi-byte sequence - return '?'; + return 0xFFFD; +} + +// No error handling, str must be valid UTF-8 (it is ensured by document::parse_html and css_parser::parse_stylesheet). +// Currently used only in css parser, where actual char value is not needed, so it returns void. +void prev_utf8_char(const string& str, int& index) +{ + while (index && ((byte)str[--index] >> 6) == 0b10); // skip continuation bytes } void append_char(string& str, char32_t code) @@ -78,7 +71,7 @@ void append_char(string& str, char32_t code) } else if (0xd800 <= code && code <= 0xdfff) { - // error: surrogate + // error: unexpected surrogate (code is UTF-32, not UTF-16) } else if (code <= 0xFFFF) { @@ -95,10 +88,17 @@ void append_char(string& str, char32_t code) } } -utf32_to_utf8::utf32_to_utf8(const std::u32string& wstr) +utf8_to_utf32::utf8_to_utf32(const string& val) +{ + int index = 0; + while (char32_t ch = read_utf8_char(val, index)) + m_str += ch; +} + +utf32_to_utf8::utf32_to_utf8(const std::u32string& val) { - for (auto ch: wstr) + for (auto ch : val) append_char(m_str, ch); } -} // namespace litehtml
\ No newline at end of file +} // namespace litehtml |
