blob: 864a4d0147d0fc16bdd8b2478883ea063af1ba0f (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
|
#include "html.h"
#include "utf8_strings.h"
namespace litehtml
{
// Converts `val` into UTF-32 by calling get_char() repeatedly and appending
// each returned code point to m_str. Decoding stops at the first code point
// equal to 0. A null `val` is stored and nothing is decoded.
utf8_to_utf32::utf8_to_utf32(const char* val)
{
    m_utf8 = reinterpret_cast<const byte*>(val);
    if (m_utf8 == nullptr)
    {
        return;
    }

    for (char32_t code_point = get_char(); code_point != 0; code_point = get_char())
    {
        m_str += code_point;
    }
}
// Decodes a single code point, reading each byte through getb().
//
// Returns 0 when the lead byte read from getb() is 0, and '?' when the lead
// byte matches none of the 1- to 4-byte UTF-8 lead patterns. Trailing bytes
// are passed through get_next_utf8(); no further validation of them is done
// in this function.
char32_t utf8_to_utf32::get_char()
{
    const char32_t lead = getb();
    if (lead == 0)
    {
        return 0;
    }

    // 1-byte sequence: 0xxxxxxx
    if ((lead & 0x80) == 0)
    {
        return lead;
    }

    // 2-byte sequence: 110yyyyy 10xxxxxx -> yyyyyxxxxxx
    if ((lead & 0xe0) == 0xc0)
    {
        char32_t result = (lead & 0x1f) << 6;
        result |= get_next_utf8(getb());
        return result;
    }

    // 3-byte sequence: 1110zzzz 10yyyyyy 10xxxxxx -> zzzzyyyyyyxxxxxx
    if ((lead & 0xf0) == 0xe0)
    {
        char32_t result = (lead & 0x0f) << 12;
        result |= get_next_utf8(getb()) << 6;
        result |= get_next_utf8(getb());
        return result;
    }

    // 4-byte sequence: 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx
    //                  -> uuuuuzzzzyyyyyyxxxxxx
    if ((lead & 0xf8) == 0xf0)
    {
        // The trailing bytes must be read strictly in this order.
        const char32_t second = get_next_utf8(getb());
        const char32_t third  = get_next_utf8(getb());
        const char32_t fourth = get_next_utf8(getb());

        const char32_t top    = (lead & 7) << 18;
        const char32_t upper  = (second & 0x3f) << 12;
        const char32_t middle = (third & 0x3f) << 6;
        const char32_t bottom = fourth & 0x3f;
        return top | upper | middle | bottom;
    }

    // Lead byte is not a valid start of a UTF-8 sequence.
    return '?';
}
// Appends the UTF-8 encoding of `code` to `str`.
//
// Code points up to U+007F produce one byte, up to U+07FF two bytes, up to
// U+FFFF three bytes and up to U+10FFFF four bytes. Surrogate values
// (U+D800..U+DFFF) and values above U+10FFFF append nothing.
void append_char(string& str, char32_t code)
{
    const bool is_surrogate = code >= 0xd800 && code <= 0xdfff;
    if (is_surrogate || code > 0x10FFFF)
    {
        // Not encodable: leave the string untouched.
        return;
    }

    // Builds a continuation byte (10xxxxxx) from the six bits at `shift`.
    const auto continuation = [code](unsigned shift)
    {
        return static_cast<char>(0x80 | ((code >> shift) & 0x3F));
    };

    if (code < 0x80)
    {
        str += static_cast<char>(code);
        return;
    }

    if (code < 0x800)
    {
        str += static_cast<char>(0xC0 | (code >> 6));
        str += continuation(0);
        return;
    }

    if (code < 0x10000)
    {
        str += static_cast<char>(0xE0 | (code >> 12));
        str += continuation(6);
        str += continuation(0);
        return;
    }

    str += static_cast<char>(0xF0 | (code >> 18));
    str += continuation(12);
    str += continuation(6);
    str += continuation(0);
}
// Builds m_str by passing every element of `wstr`, in order, to
// append_char().
utf32_to_utf8::utf32_to_utf8(const std::u32string& wstr)
{
    for (std::u32string::const_iterator it = wstr.begin(); it != wstr.end(); ++it)
    {
        append_char(m_str, *it);
    }
}
} // namespace litehtml
|