1 files changed, 99 insertions, 0 deletions
diff --git a/libs/litehtml/src/utf8_strings.cpp b/libs/litehtml/src/utf8_strings.cpp
new file mode 100644
index 0000000000..787af86f42
--- /dev/null
+++ b/libs/litehtml/src/utf8_strings.cpp
@@ -0,0 +1,99 @@
+#include "html.h"
+#include "utf8_strings.h"
+
+
+// Decodes the UTF-8 string `val` into m_str, one character per get_char()
+// call, stopping as soon as get_char() returns 0. A null `val` leaves m_str
+// untouched.
+//
+// Where wchar_t is only 16 bits wide (e.g. Windows), a code point above
+// U+FFFF does not fit in a single element, so it is stored as a UTF-16
+// surrogate pair instead of being truncated. Values above U+10FFFF cannot be
+// expressed in UTF-16 at all and are replaced with '?', the same placeholder
+// get_char() returns for a bad lead byte. Where wchar_t is 32 bits wide every
+// decoded value is stored as-is.
+//
+// NOTE(review): assumes m_str is a wide string (declared in utf8_strings.h,
+// not visible here) - confirm.
+litehtml::utf8_to_wchar::utf8_to_wchar(const char* val)
+{
+	m_utf8 = (const byte*) val;
+	if (!m_utf8) return;
+
+	while (true)
+	{
+		ucode_t wch = get_char();
+		if (!wch) break;
+
+		if (sizeof(wchar_t) >= 4 || wch <= 0xFFFF)
+		{
+			// Fits in one wchar_t: store it directly.
+			m_str += wch;
+		}
+		else if (wch <= 0x10FFFF)
+		{
+			// Split the 20 bits above 0x10000 into a high and a low surrogate.
+			const ucode_t offset = wch - 0x10000;
+			m_str += static_cast<wchar_t>(0xD800 + (offset >> 10));
+			m_str += static_cast<wchar_t>(0xDC00 + (offset & 0x3FF));
+		}
+		else
+		{
+			// Beyond the Unicode range: not representable in UTF-16.
+			m_str += L'?';
+		}
+	}
+}
+
+// Fetches one UTF-8 encoded character from the input through getb() and
+// returns its decoded value.
+//
+// Returns 0 when the first byte fetched is 0, and '?' when that byte matches
+// none of the 1- to 4-byte lead patterns tested below. Every trailing byte is
+// passed through get_next_utf8() before it is merged into the result.
+litehtml::ucode_t litehtml::utf8_to_wchar::get_char()
+{
+	ucode_t lead = getb();
+
+	// A zero lead byte is reported to the caller as 0.
+	if (!lead)
+	{
+		return 0;
+	}
+
+	// 0xxxxxxx: a 7-bit value, returned unchanged.
+	if ((lead & 0x80) == 0)
+	{
+		return lead;
+	}
+
+	// 110yyyyy 10xxxxxx: 5 payload bits in the lead, one trailing byte.
+	if ((lead & 0xe0) == 0xc0)
+	{
+		ucode_t code = (lead & 0x1f) << 6;
+		code |= get_next_utf8(getb());
+		return code;
+	}
+
+	// 1110zzzz 10yyyyyy 10xxxxxx: 4 payload bits in the lead, two trailing
+	// bytes. The trailing bytes are fetched in stream order.
+	if ((lead & 0xf0) == 0xe0)
+	{
+		ucode_t code = (lead & 0x0f) << 12;
+		code |= get_next_utf8(getb()) << 6;
+		code |= get_next_utf8(getb());
+		return code;
+	}
+
+	// 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx: 3 payload bits in the lead, three
+	// trailing bytes contributing 6 bits each.
+	if ((lead & 0xf8) == 0xf0)
+	{
+		int second = get_next_utf8(getb());
+		int third = get_next_utf8(getb());
+		int fourth = get_next_utf8(getb());
+
+		ucode_t code = (lead & 7) << 18;
+		code |= (second & 0x3f) << 12;
+		code |= (third & 0x3f) << 6;
+		code |= (fourth & 0x3f);
+		return code;
+	}
+
+	// Not a valid first byte of a UTF-8 sequence.
+	return '?';
+}
+
+// Encodes the wide string `val` as UTF-8 and appends the resulting bytes to
+// m_str. Conversion stops at the first NUL element, so anything after an
+// embedded NUL is ignored.
+//
+// Where wchar_t is 16 bits wide (e.g. Windows) the input is UTF-16, so a high
+// surrogate immediately followed by a low surrogate is combined into the
+// supplementary code point it encodes and written as a 4-byte sequence
+// rather than being discarded. Unpaired surrogates, surrogates on platforms
+// with a wider wchar_t, and values above U+10FFFF have no valid UTF-8 form
+// and are skipped without producing any output.
+litehtml::wchar_to_utf8::wchar_to_utf8(const std::wstring& val)
+{
+	for (std::wstring::size_type i = 0; i < val.size() && val[i]; i++)
+	{
+		unsigned int code = val[i];
+
+		// UTF-16 input only: merge a high+low surrogate pair into one code point.
+		if (sizeof(wchar_t) == 2 && 0xd800 <= code && code <= 0xdbff && i + 1 < val.size())
+		{
+			const unsigned int low = val[i + 1];
+			if (0xdc00 <= low && low <= 0xdfff)
+			{
+				code = 0x10000 + ((code - 0xd800) << 10) + (low - 0xdc00);
+				i++; // the low surrogate has been consumed as well
+			}
+		}
+
+		if (code <= 0x7F)
+		{
+			// 1 byte: 0xxxxxxx
+			m_str += static_cast<char>(code);
+		}
+		else if (code <= 0x7FF)
+		{
+			// 2 bytes: 110yyyyy 10xxxxxx
+			m_str += static_cast<char>((code >> 6) + 192);
+			m_str += static_cast<char>((code & 63) + 128);
+		}
+		else if (0xd800 <= code && code <= 0xdfff)
+		{
+			// A surrogate that was not merged above cannot be encoded in UTF-8: skip it.
+		}
+		else if (code <= 0xFFFF)
+		{
+			// 3 bytes: 1110zzzz 10yyyyyy 10xxxxxx
+			m_str += static_cast<char>((code >> 12) + 224);
+			m_str += static_cast<char>(((code >> 6) & 63) + 128);
+			m_str += static_cast<char>((code & 63) + 128);
+		}
+		else if (code <= 0x10FFFF)
+		{
+			// 4 bytes: 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx
+			m_str += static_cast<char>((code >> 18) + 240);
+			m_str += static_cast<char>(((code >> 12) & 63) + 128);
+			m_str += static_cast<char>(((code >> 6) & 63) + 128);
+			m_str += static_cast<char>((code & 63) + 128);
+		}
+	}
+}