From b43272afa29a64dcb8bcf26a96a05bac40792b92 Mon Sep 17 00:00:00 2001
From: jaime-m-p <167997752+jaime-m-p@users.noreply.github.com>
Date: Sat, 18 May 2024 01:09:13 +0200
Subject: Unicode codepoint flags for custom regexs (#7245)

* Replace CODEPOINT_TYPE_* with codepoint_flags
* Update and bugfix brute force random test
* Deterministic brute force random test
* Unicode normalization NFD
* Get rid of BOM
---
 unicode-data.h | 27 +++++++++++++++------------
 1 file changed, 15 insertions(+), 12 deletions(-)

(limited to 'unicode-data.h')

diff --git a/unicode-data.h b/unicode-data.h
index 3cccf206..e27fe177 100644
--- a/unicode-data.h
+++ b/unicode-data.h
@@ -1,17 +1,20 @@
 #pragma once
 
 #include <cstdint>
-#include <map>
-#include <utility>
 #include <vector>
+#include <unordered_map>
+#include <unordered_set>
 
-extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_number;
-extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_letter;
-extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_separator;
-extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_whitespace;
-extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_accent_mark;
-extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_punctuation;
-extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_symbol;
-extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_control;
-extern const std::multimap<uint32_t, uint32_t> unicode_map_nfd;
-extern const std::map<char32_t, char32_t> unicode_map_lowercase;
+struct range_nfd {
+    uint32_t first;
+    uint32_t last;
+    uint32_t nfd;
+};
+
+static const uint32_t MAX_CODEPOINTS = 0x110000;
+
+extern const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags;
+extern const std::unordered_set<uint32_t> unicode_set_whitespace;
+extern const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase;
+extern const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase;
+extern const std::vector<range_nfd> unicode_ranges_nfd;
-- 
cgit v1.2.3