blob: 50f82d63933e0a2b2d3b269b335ea547530a15f9 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
|
//
// Copyright Aliaksei Levin (levlam@telegram.org), Arseny Smirnov (arseny30@gmail.com) 2014-2018
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
//
#include "td/utils/utf8.h"
#include "td/utils/logging.h" // for UNREACHABLE
#include "td/utils/unicode.h"
namespace td {
// Returns true if str consists solely of well-formed UTF-8 sequences.
//
// Rejected inputs: stray continuation bytes, truncated sequences, overlong encodings,
// UTF-16 surrogates (U+D800..U+DFFF), code points above U+10FFFF and lead bytes that
// would start a sequence longer than four bytes.
//
// NOTE(review): the scan deliberately reads the byte at index str.size() and stops only
// when that byte has its high bit clear; this presumably relies on CSlice guaranteeing a
// zero terminator at that position - confirm against the CSlice contract.
bool check_utf8(CSlice str) {
  const char *cursor = str.data();
  const char *const terminator = cursor + str.size();

  // A continuation byte has the bit pattern 10xxxxxx.
  auto is_continuation = [](unsigned int byte) { return (byte & 0xc0) == 0x80; };

  while (true) {
    const unsigned int lead = static_cast<unsigned char>(*cursor++);

    if ((lead & 0x80) == 0) {
      // One-byte character. If it was the byte located at index size(), the whole input has been accepted.
      if (cursor == terminator + 1) {
        return true;
      }
      continue;
    }

    // 10xxxxxx can't start a sequence.
    if ((lead & 0x40) == 0) {
      return false;
    }

    const unsigned int second = static_cast<unsigned char>(*cursor++);
    if (!is_continuation(second)) {
      return false;
    }
    if ((lead & 0x20) == 0) {
      // Two-byte sequence 110xxxxx 10xxxxxx; leads 0xC0 and 0xC1 would be overlong encodings.
      if ((lead & 0x1e) == 0) {
        return false;
      }
      continue;
    }

    const unsigned int third = static_cast<unsigned char>(*cursor++);
    if (!is_continuation(third)) {
      return false;
    }
    if ((lead & 0x10) == 0) {
      // Three-byte sequence. Only the lead payload and bit 5 of the second byte are needed:
      // 0 means the code point is below U+0800 (overlong), 0x360 means U+D800..U+DFFF (surrogates).
      const unsigned int high_bits = ((lead & 0x0f) << 6) | (second & 0x20);
      if (high_bits == 0 || high_bits == 0x360) {
        return false;
      }
      continue;
    }

    const unsigned int fourth = static_cast<unsigned char>(*cursor++);
    if (!is_continuation(fourth)) {
      return false;
    }
    if ((lead & 0x08) == 0) {
      // Four-byte sequence. 0 means the code point is below U+10000 (overlong),
      // values from 0x110 upwards mean it is above U+10FFFF.
      const unsigned int high_bits = ((lead & 0x07) << 6) | (second & 0x30);
      if (high_bits == 0 || high_bits >= 0x110) {
        return false;
      }
      continue;
    }

    // 11111xxx: sequences longer than four bytes are never valid.
    return false;
  }

  UNREACHABLE();
  return false;
}
// Appends the UTF-8 encoding of the code point ch to str.
//
// Values up to 0x7f produce one byte, up to 0x7ff two bytes, up to 0xffff three bytes and
// everything else four bytes. No validity check is made on ch, so surrogates and values
// above 0x10ffff are encoded mechanically.
void append_utf8_character(string &str, uint32 ch) {
  // Appends a continuation byte (10xxxxxx) holding the six bits of ch that start at bit `shift`.
  auto push_continuation = [&str, ch](int shift) {
    str.push_back(static_cast<char>(0x80 | ((ch >> shift) & 0x3f)));
  };

  if (ch <= 0x7f) {
    str.push_back(static_cast<char>(ch));
    return;
  }

  if (ch <= 0x7ff) {
    // The conversion of values above 0x7f to char is implementation-defined.
    str.push_back(static_cast<char>(0xc0 | (ch >> 6)));
    push_continuation(0);
    return;
  }

  if (ch <= 0xffff) {
    str.push_back(static_cast<char>(0xe0 | (ch >> 12)));
    push_continuation(6);
    push_continuation(0);
    return;
  }

  str.push_back(static_cast<char>(0xf0 | (ch >> 18)));
  push_continuation(12);
  push_continuation(6);
  push_continuation(0);
}
const unsigned char *next_utf8_unsafe(const unsigned char *ptr, uint32 *code) {
uint32 a = ptr[0];
if ((a & 0x80) == 0) {
if (code) {
*code = a;
}
return ptr + 1;
} else if ((a & 0x20) == 0) {
if (code) {
*code = ((a & 0x1f) << 6) | (ptr[1] & 0x3f);
}
return ptr + 2;
} else if ((a & 0x10) == 0) {
if (code) {
*code = ((a & 0x0f) << 12) | ((ptr[1] & 0x3f) << 6) | (ptr[2] & 0x3f);
}
return ptr + 3;
} else if ((a & 0x08) == 0) {
if (code) {
*code = ((a & 0x07) << 18) | ((ptr[1] & 0x3f) << 12) | ((ptr[2] & 0x3f) << 6) | (ptr[3] & 0x3f);
}
return ptr + 4;
}
UNREACHABLE();
if (code) {
*code = 0;
}
return ptr;
}
// Returns a copy of str in which every code point has been passed through unicode_to_lower
// and encoded back to UTF-8.
//
// Decoding is done with next_utf8_unsafe, which validates nothing and performs no bounds
// checks, and the loop ends only when the cursor lands exactly on the end of str, so the
// input has to be well-formed UTF-8.
string utf8_to_lower(Slice str) {
  string lowered;

  auto cursor = str.ubegin();
  const auto stop = str.uend();
  for (; cursor != stop;) {
    uint32 code_point = 0;
    cursor = next_utf8_unsafe(cursor, &code_point);

    const uint32 lower_code_point = unicode_to_lower(code_point);
    append_utf8_character(lowered, lower_code_point);
  }

  return lowered;
}
} // namespace td
|