diff options
author | jaime-m-p <167997752+jaime-m-p@users.noreply.github.com> | 2024-05-18 01:09:13 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-05-18 01:09:13 +0200 |
commit | b43272afa29a64dcb8bcf26a96a05bac40792b92 (patch) | |
tree | 1d5e893fd96c3f56b62f6e1ca2ba1274e69deca9 /unicode-data.h | |
parent | 0fc1e820a9900a3dd08ddd3c6abe6604c53b689b (diff) |
Unicode codepoint flags for custom regexs (#7245)
* Replace CODEPOINT_TYPE_* with codepoint_flags
* Update and bugfix brute force random test
* Deterministic brute force random test
* Unicode normalization NFD
* Get rid of BOM
Diffstat (limited to 'unicode-data.h')
-rw-r--r-- | unicode-data.h | 27 |
1 file changed, 15 insertions, 12 deletions
```diff
diff --git a/unicode-data.h b/unicode-data.h
index 3cccf206..e27fe177 100644
--- a/unicode-data.h
+++ b/unicode-data.h
@@ -1,17 +1,20 @@
 #pragma once
 
 #include <cstdint>
-#include <map>
-#include <utility>
 #include <vector>
+#include <unordered_map>
+#include <unordered_set>
 
-extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_number;
-extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_letter;
-extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_separator;
-extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_whitespace;
-extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_accent_mark;
-extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_punctuation;
-extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_symbol;
-extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_control;
-extern const std::multimap<uint32_t, uint32_t> unicode_map_nfd;
-extern const std::map<char32_t, char32_t> unicode_map_lowercase;
+struct range_nfd {
+    uint32_t first;
+    uint32_t last;
+    uint32_t nfd;
+};
+
+static const uint32_t MAX_CODEPOINTS = 0x110000;
+
+extern const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags;
+extern const std::unordered_set<uint32_t> unicode_set_whitespace;
+extern const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase;
+extern const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase;
+extern const std::vector<range_nfd> unicode_ranges_nfd;
```