summaryrefslogtreecommitdiff
path: root/libs/litehtml/src/utf8_strings.cpp
diff options
context:
space:
mode:
authorGeorge Hazan <george.hazan@gmail.com>2024-10-09 18:13:40 +0300
committerGeorge Hazan <george.hazan@gmail.com>2024-10-09 18:13:40 +0300
commit0e86b853be3b5f809ed1decbf636221c1144a386 (patch)
tree4ce84d9849646a559d5afece33fc6e64d39e4a50 /libs/litehtml/src/utf8_strings.cpp
parent834cbb58d74215980165eab257538ba918a378cd (diff)
litehtml update
Diffstat (limited to 'libs/litehtml/src/utf8_strings.cpp')
-rw-r--r--libs/litehtml/src/utf8_strings.cpp72
1 files changed, 36 insertions, 36 deletions
diff --git a/libs/litehtml/src/utf8_strings.cpp b/libs/litehtml/src/utf8_strings.cpp
index 864a4d0147..f720f6d26c 100644
--- a/libs/litehtml/src/utf8_strings.cpp
+++ b/libs/litehtml/src/utf8_strings.cpp
@@ -4,27 +4,17 @@
namespace litehtml
{
-utf8_to_utf32::utf8_to_utf32(const char* val)
+// consume one utf-8 char and increment index accordingly
+// if str[index] == 0 index is not incremented
+char32_t read_utf8_char(const string& str, int& index)
{
- m_utf8 = (const byte*) val;
- if (!m_utf8) return;
-
- while (true)
- {
- char32_t wch = get_char();
- if (!wch) break;
- m_str += wch;
- }
-}
-
-char32_t utf8_to_utf32::get_char()
-{
- char32_t b1 = getb();
-
- if (!b1)
+ auto getb = [&]() -> byte
{
- return 0;
- }
+ if (!str[index]) return 0;
+ return str[index++];
+ };
+
+ byte b1 = getb();
// Determine whether we are dealing
// with a one-, two-, three-, or four-
@@ -38,31 +28,34 @@ char32_t utf8_to_utf32::get_char()
{
// 2-byte sequence: 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
char32_t r = (b1 & 0x1f) << 6;
- r |= get_next_utf8(getb());
+ r |= getb() & 0x3f;
return r;
}
else if ((b1 & 0xf0) == 0xe0)
{
// 3-byte sequence: zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
char32_t r = (b1 & 0x0f) << 12;
- r |= get_next_utf8(getb()) << 6;
- r |= get_next_utf8(getb());
+ r |= (getb() & 0x3f) << 6;
+ r |= getb() & 0x3f;
return r;
}
else if ((b1 & 0xf8) == 0xf0)
{
- // 4-byte sequence: 11101110wwwwzzzzyy + 110111yyyyxxxxxx
- // = 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx
- // (uuuuu = wwww + 1)
- char32_t b2 = get_next_utf8(getb());
- char32_t b3 = get_next_utf8(getb());
- char32_t b4 = get_next_utf8(getb());
- return ((b1 & 7) << 18) | ((b2 & 0x3f) << 12) |
- ((b3 & 0x3f) << 6) | (b4 & 0x3f);
+ // 4-byte sequence: uuuzzzzzzyyyyyyxxxxxx = 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx
+ byte b2 = getb() & 0x3f;
+ byte b3 = getb() & 0x3f;
+ byte b4 = getb() & 0x3f;
+ return ((b1 & 7) << 18) | (b2 << 12) | (b3 << 6) | b4;
}
- //bad start for UTF-8 multi-byte sequence
- return '?';
+ return 0xFFFD;
+}
+
+// No error handling, str must be valid UTF-8 (it is ensured by document::parse_html and css_parser::parse_stylesheet).
+// Currently used only in css parser, where actual char value is not needed, so it returns void.
+void prev_utf8_char(const string& str, int& index)
+{
+ while (index && ((byte)str[--index] >> 6) == 0b10); // skip continuation bytes
}
void append_char(string& str, char32_t code)
@@ -78,7 +71,7 @@ void append_char(string& str, char32_t code)
}
else if (0xd800 <= code && code <= 0xdfff)
{
- // error: surrogate
+ // error: unexpected surrogate (code is UTF-32, not UTF-16)
}
else if (code <= 0xFFFF)
{
@@ -95,10 +88,17 @@ void append_char(string& str, char32_t code)
}
}
-utf32_to_utf8::utf32_to_utf8(const std::u32string& wstr)
+utf8_to_utf32::utf8_to_utf32(const string& val)
+{
+ int index = 0;
+ while (char32_t ch = read_utf8_char(val, index))
+ m_str += ch;
+}
+
+utf32_to_utf8::utf32_to_utf8(const std::u32string& val)
{
- for (auto ch: wstr)
+ for (auto ch : val)
append_char(m_str, ch);
}
-} // namespace litehtml \ No newline at end of file
+} // namespace litehtml