From b43272afa29a64dcb8bcf26a96a05bac40792b92 Mon Sep 17 00:00:00 2001
From: jaime-m-p <167997752+jaime-m-p@users.noreply.github.com>
Date: Sat, 18 May 2024 01:09:13 +0200
Subject: Unicode codepoint flags for custom regexes (#7245)

* Replace CODEPOINT_TYPE_* with codepoint_flags
* Update and fix bugs in the brute-force random test
* Make the brute-force random test deterministic
* Unicode normalization NFD
* Get rid of BOM
---
 llama.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'llama.cpp')

diff --git a/llama.cpp b/llama.cpp
index e11f0ac4..b752ddc6 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -12576,16 +12576,16 @@ struct llm_tokenizer_wpm {
         // to lowercase, pad chinese characters, pad punctuation
         std::string new_str = "";
         for (uint32_t code : cpts_nfd) {
-            int type = unicode_cpt_type(code);
-            if (type == CODEPOINT_TYPE_ACCENT_MARK || type == CODEPOINT_TYPE_CONTROL) {
+            const codepoint_flags flags = unicode_cpt_flags(code);
+            if (flags.is_accent_mark || flags.is_control) {
                 continue;
             }
             code = unicode_tolower(code);
-            if (type == CODEPOINT_TYPE_SEPARATOR) {
+            if (flags.is_separator || flags.is_whitespace) {  //####FIXME: is_separator ?
                 code = ' ';
             }
             std::string s = unicode_cpt_to_utf8(code);
-            if (type == CODEPOINT_TYPE_PUNCTUATION || is_ascii_punct(code) || is_chinese_char(code)) {
+            if (flags.is_punctuation || is_ascii_punct(code) || is_chinese_char(code)) {
                 new_str += " ";
                 new_str += s;
                 new_str += " ";
-- 
cgit v1.2.3