scripts/gen-unicode-data.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134

import regex
import ctypes
import unicodedata


class CoodepointFlags (ctypes.Structure):
    """Per-codepoint category flags, one bit each, packed into 16-bit storage.

    The field order must stay in sync with the definition in unicode.h.
    Each flag (except ``is_undefined``) corresponds to one Unicode general
    category class as matched by the ``regex`` module:

        is_number      -> \\p{N}
        is_letter      -> \\p{L}
        is_separator   -> \\p{Z}
        is_accent_mark -> \\p{M}
        is_punctuation -> \\p{P}
        is_symbol      -> \\p{S}
        is_control     -> \\p{C}
    """

    # Every entry is a single-bit field carved out of a c_uint16.
    _fields_ = [
        (flag_name, ctypes.c_uint16, 1)
        for flag_name in (
            "is_undefined",
            "is_number",
            "is_letter",
            "is_separator",
            "is_accent_mark",
            "is_punctuation",
            "is_symbol",
            "is_control",
        )
    ]


# The flags are emitted below as a uint16_t, so the struct must be 2 bytes.
assert ctypes.sizeof(CoodepointFlags) == 2


# Number of code points to scan: 0x000000 .. 0x10FFFF inclusive.
MAX_CODEPOINTS = 0x110000

# One flags entry per code point, zero-initialized by ctypes.
codepoint_flags = (CoodepointFlags * MAX_CODEPOINTS)()

# Output tables filled in by the classification loop.
table_whitespace = []  # code points matching \s
table_lowercase = []   # (codepoint, lowercase codepoint) pairs
table_uppercase = []   # (codepoint, uppercase codepoint) pairs
table_nfd = []         # (codepoint, first NFD codepoint) pairs

# Single-character patterns, compiled once because each is applied to every
# code point.
regex_number      = regex.compile(r'\p{N}')
regex_letter      = regex.compile(r'\p{L}')
regex_separator   = regex.compile(r'\p{Z}')
regex_accent_mark = regex.compile(r'\p{M}')
regex_punctuation = regex.compile(r'\p{P}')
regex_symbol      = regex.compile(r'\p{S}')
regex_control     = regex.compile(r'\p{C}')
regex_whitespace  = regex.compile(r'\s')

# (bit-field name, compiled pattern) pairs applied to every code point
category_patterns = (
    ("is_number",      regex_number),
    ("is_letter",      regex_letter),
    ("is_separator",   regex_separator),
    ("is_accent_mark", regex_accent_mark),
    ("is_punctuation", regex_punctuation),
    ("is_symbol",      regex_symbol),
    ("is_control",     regex_control),
)

for codepoint in range(MAX_CODEPOINTS):
    ch = chr(codepoint)

    # regex categories: set one bit per matching category
    flags = codepoint_flags[codepoint]
    for field_name, pattern in category_patterns:
        setattr(flags, field_name, bool(pattern.match(ch)))

    # The first byte is zero only if no flag bit was set above (assumes the
    # eight 1-bit fields all live in that byte). The script expects every
    # code point to match at least one category.
    flags.is_undefined = bytes(flags)[0] == 0
    assert not flags.is_undefined

    # whitespaces
    if regex_whitespace.match(ch):
        table_whitespace.append(codepoint)

    # lowercase / uppercase / NFD mappings.
    # Only the FIRST code point of each converted string is recorded, so a
    # multi-character result is truncated to its first character; identity
    # mappings are not stored.
    conversions = (
        (table_lowercase, ch.lower()),
        (table_uppercase, ch.upper()),
        (table_nfd,       unicodedata.normalize('NFD', ch)),
    )
    for table, converted in conversions:
        mapped = ord(converted[0])
        if mapped != codepoint:
            table.append((codepoint, mapped))


# Collapse consecutive code points that share identical flags into ranges.
# Each entry is (start, flags); a range ends right before the next entry's
# start, and a final sentinel at MAX_CODEPOINTS closes the last range.
ranges_flags = [(0, codepoint_flags[0])]  # start, flags
range_bits = bytes(codepoint_flags[0])  # raw flags of the currently open range
for codepoint, flags in enumerate(codepoint_flags):
    bits = bytes(flags)
    if bits == range_bits:
        continue
    # flags changed: open a new range at this code point
    ranges_flags.append((codepoint, flags))
    range_bits = bits
ranges_flags.append((MAX_CODEPOINTS, CoodepointFlags()))


# Collapse consecutive code points that map to the same NFD code point into
# (start, last, nfd) ranges. The list is seeded with a (0, 0, 0) entry.
ranges_nfd = [(0, 0, 0)]  # start, last, nfd
for codepoint, norm in table_nfd:
    range_start, range_last, range_norm = ranges_nfd[-1]
    if range_last == codepoint - 1 and range_norm == norm:
        # directly adjacent with the same target: extend the open range
        ranges_nfd[-1] = (range_start, codepoint, norm)
    else:
        # gap or different target: start a new single-codepoint range
        ranges_nfd.append((codepoint, codepoint, norm))


# Generate 'unicode-data.cpp'


def out(line=""):
    """Print ``line`` to standard output followed by a single newline.

    Called without arguments it emits an empty line.
    """
    print(line)  # noqa


# File header and includes of the generated C++ source.
out("""\
// generated with scripts/gen-unicode-data.py

#include "unicode-data.h"

#include <cstdint>
#include <vector>
#include <unordered_map>
#include <unordered_set>
""")

# Flag ranges: one {start, flags} entry per range, flags packed as an integer
# read from the struct bytes in little-endian order.
out("const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags = {  // start, flags // last=next_start-1")
for range_start, range_flags in ranges_flags:
    packed = int.from_bytes(bytes(range_flags), "little")
    out("{0x%06X, 0x%04X}," % (range_start, packed))
out("};\n")

# Whitespace set: all code points on a single comma-separated line.
out("const std::unordered_set<uint32_t> unicode_set_whitespace = {")
whitespace_literals = ["0x%06X" % cpt for cpt in table_whitespace]
out(", ".join(whitespace_literals))
out("};\n")

# Lowercase map: {codepoint, lowercase codepoint}.
out("const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase = {")
for source, target in table_lowercase:
    out("{0x%06X, 0x%06X}," % (source, target))
out("};\n")

# Uppercase map: {codepoint, uppercase codepoint}.
out("const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase = {")
for source, target in table_uppercase:
    out("{0x%06X, 0x%06X}," % (source, target))
out("};\n")

# NFD ranges: {start, last, nfd}.
out("const std::vector<range_nfd> unicode_ranges_nfd = {  // start, last, nfd")
for first, last, nfd in ranges_nfd:
    out("{0x%06X, 0x%06X, 0x%06X}," % (first, last, nfd))
out("};\n")