From b43272afa29a64dcb8bcf26a96a05bac40792b92 Mon Sep 17 00:00:00 2001
From: jaime-m-p <167997752+jaime-m-p@users.noreply.github.com>
Date: Sat, 18 May 2024 01:09:13 +0200
Subject: Unicode codepoint flags for custom regexs (#7245)

* Replace CODEPOINT_TYPE_* with codepoint_flags
* Update and bugfix brute force random test
* Deterministic brute force random test
* Unicode normalization NFD
* Get rid of BOM
---
 llama.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'llama.cpp')

diff --git a/llama.cpp b/llama.cpp
index e11f0ac4..b752ddc6 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -12576,16 +12576,16 @@ struct llm_tokenizer_wpm {
         // to lowercase, pad chinese characters, pad punctuation
         std::string new_str = "";
         for (uint32_t code : cpts_nfd) {
-            int type = unicode_cpt_type(code);
-            if (type == CODEPOINT_TYPE_ACCENT_MARK || type == CODEPOINT_TYPE_CONTROL) {
+            const codepoint_flags flags = unicode_cpt_flags(code);
+            if (flags.is_accent_mark || flags.is_control) {
                 continue;
             }
             code = unicode_tolower(code);
-            if (type == CODEPOINT_TYPE_SEPARATOR) {
+            if (flags.is_separator || flags.is_whitespace) { //####FIXME: is_separator ?
                 code = ' ';
             }
             std::string s = unicode_cpt_to_utf8(code);
-            if (type == CODEPOINT_TYPE_PUNCTUATION || is_ascii_punct(code) || is_chinese_char(code)) {
+            if (flags.is_punctuation || is_ascii_punct(code) || is_chinese_char(code)) {
                 new_str += " ";
                 new_str += s;
                 new_str += " ";
-- 
cgit v1.2.3