litehtml update

author: George Hazan <george.hazan@gmail.com> 2024-10-09 18:13:40 +0300
committer: George Hazan <george.hazan@gmail.com> 2024-10-09 18:13:40 +0300
commit: 0e86b853be3b5f809ed1decbf636221c1144a386 (patch)
tree: 4ce84d9849646a559d5afece33fc6e64d39e4a50 /libs/litehtml/src/utf8_strings.cpp
parent: 834cbb58d74215980165eab257538ba918a378cd (diff)
1 files changed, 36 insertions, 36 deletions
diff --git a/libs/litehtml/src/utf8_strings.cpp b/libs/litehtml/src/utf8_strings.cpp
index 864a4d0147..f720f6d26c 100644
--- a/libs/litehtml/src/utf8_strings.cpp
+++ b/libs/litehtml/src/utf8_strings.cpp
@@ -4,27 +4,17 @@
 namespace litehtml
 {
 
-utf8_to_utf32::utf8_to_utf32(const char* val)
+// consume one utf-8 char and increment index accordingly
+// if str[index] == 0 index is not incremented
+char32_t read_utf8_char(const string& str, int& index)
 {
-	m_utf8 = (const byte*) val;
-	if (!m_utf8) return;
-
-	while (true)
-	{
-		char32_t wch = get_char();
-		if (!wch) break;
-		m_str += wch;
-	}
-}
-
-char32_t utf8_to_utf32::get_char()
-{
-	char32_t b1 = getb();
-
-	if (!b1)
+	auto getb = [&]() -> byte
 	{
-		return 0;
-	}
+		if (!str[index]) return 0;
+		return str[index++];
+	};
+	
+	byte b1 = getb();
 
 	// Determine whether we are dealing
 	// with a one-, two-, three-, or four-
@@ -38,31 +28,34 @@ char32_t utf8_to_utf32::get_char()
 	{
 		// 2-byte sequence: 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
 		char32_t r = (b1 & 0x1f) << 6;
-		r |= get_next_utf8(getb());
+		r |= getb() & 0x3f;
 		return r;
 	}
 	else if ((b1 & 0xf0) == 0xe0)
 	{
 		// 3-byte sequence: zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
 		char32_t r = (b1 & 0x0f) << 12;
-		r |= get_next_utf8(getb()) << 6;
-		r |= get_next_utf8(getb());
+		r |= (getb() & 0x3f) << 6;
+		r |= getb() & 0x3f;
 		return r;
 	}
 	else if ((b1 & 0xf8) == 0xf0)
 	{
-		// 4-byte sequence: 11101110wwwwzzzzyy + 110111yyyyxxxxxx
-		//     = 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx
-		// (uuuuu = wwww + 1)
-		char32_t b2 = get_next_utf8(getb());
-		char32_t b3 = get_next_utf8(getb());
-		char32_t b4 = get_next_utf8(getb());
-		return ((b1 & 7) << 18) | ((b2 & 0x3f) << 12) |
-			((b3 & 0x3f) << 6) | (b4 & 0x3f);
+		// 4-byte sequence: uuuzzzzzzyyyyyyxxxxxx = 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx
+		byte b2 = getb() & 0x3f;
+		byte b3 = getb() & 0x3f;
+		byte b4 = getb() & 0x3f;
+		return ((b1 & 7) << 18) | (b2 << 12) | (b3 << 6) | b4;
 	}
 
-	//bad start for UTF-8 multi-byte sequence
-	return '?';
+	return 0xFFFD;
+}
+
+// No error handling, str must be valid UTF-8 (it is ensured by document::parse_html and css_parser::parse_stylesheet).
+// Currently used only in css parser, where actual char value is not needed, so it returns void.
+void prev_utf8_char(const string& str, int& index)
+{
+	while (index && ((byte)str[--index] >> 6) == 0b10); // skip continuation bytes
 }
 
 void append_char(string& str, char32_t code)
@@ -78,7 +71,7 @@ void append_char(string& str, char32_t code)
 	}
 	else if (0xd800 <= code && code <= 0xdfff)
 	{
-		// error: surrogate
+		// error: unexpected surrogate (code is UTF-32, not UTF-16)
 	}
 	else if (code <= 0xFFFF)
 	{
@@ -95,10 +88,17 @@ void append_char(string& str, char32_t code)
 	}
 }
 
-utf32_to_utf8::utf32_to_utf8(const std::u32string& wstr)
+utf8_to_utf32::utf8_to_utf32(const string& val)
+{
+	int index = 0;
+	while (char32_t ch = read_utf8_char(val, index))
+		m_str += ch;
+}
+
+utf32_to_utf8::utf32_to_utf8(const std::u32string& val)
 {
-	for (auto ch: wstr)
+	for (auto ch : val)
 		append_char(m_str, ch);
 }
 
-} // namespace litehtml
-\ No newline at end of file
+} // namespace litehtml
author	George Hazan <george.hazan@gmail.com>	2024-10-09 18:13:40 +0300
committer	George Hazan <george.hazan@gmail.com>	2024-10-09 18:13:40 +0300
commit	0e86b853be3b5f809ed1decbf636221c1144a386 (patch)
tree	4ce84d9849646a559d5afece33fc6e64d39e4a50 /libs/litehtml/src/utf8_strings.cpp
parent	834cbb58d74215980165eab257538ba918a378cd (diff)