summaryrefslogtreecommitdiff
path: root/llama.cpp
diff options
context:
space:
mode:
author: jaime-m-p <167997752+jaime-m-p@users.noreply.github.com> 2024-05-18 01:09:13 +0200
committer: GitHub <noreply@github.com> 2024-05-18 01:09:13 +0200
commit: b43272afa29a64dcb8bcf26a96a05bac40792b92 (patch)
tree: 1d5e893fd96c3f56b62f6e1ca2ba1274e69deca9 /llama.cpp
parent: 0fc1e820a9900a3dd08ddd3c6abe6604c53b689b (diff)
Unicode codepoint flags for custom regexs (#7245)
* Replace CODEPOINT_TYPE_* with codepoint_flags
* Update and bugfix brute force random test
* Deterministic brute force random test
* Unicode normalization NFD
* Get rid of BOM
Diffstat (limited to 'llama.cpp')
-rw-r--r-- llama.cpp 8
1 file changed, 4 insertions, 4 deletions
diff --git a/llama.cpp b/llama.cpp
index e11f0ac4..b752ddc6 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -12576,16 +12576,16 @@ struct llm_tokenizer_wpm {
// to lowercase, pad chinese characters, pad punctuation
std::string new_str = "";
for (uint32_t code : cpts_nfd) {
- int type = unicode_cpt_type(code);
- if (type == CODEPOINT_TYPE_ACCENT_MARK || type == CODEPOINT_TYPE_CONTROL) {
+ const codepoint_flags flags = unicode_cpt_flags(code);
+ if (flags.is_accent_mark || flags.is_control) {
continue;
}
code = unicode_tolower(code);
- if (type == CODEPOINT_TYPE_SEPARATOR) {
+ if (flags.is_separator || flags.is_whitespace) { //####FIXME: is_separator ?
code = ' ';
}
std::string s = unicode_cpt_to_utf8(code);
- if (type == CODEPOINT_TYPE_PUNCTUATION || is_ascii_punct(code) || is_chinese_char(code)) {
+ if (flags.is_punctuation || is_ascii_punct(code) || is_chinese_char(code)) {
new_str += " ";
new_str += s;
new_str += " ";