diff options
author | aunsane <aunsane@gmail.com> | 2018-04-27 21:33:17 +0300 |
---|---|---|
committer | aunsane <aunsane@gmail.com> | 2018-04-27 21:33:17 +0300 |
commit | e1ec72eab6d00b3ba38e5932bc88920f103b6e4a (patch) | |
tree | 999de2725a83e30fbbf6576200525d4ef0c5fe38 /protocols/Telegram/tdlib/td/tdutils/td/utils/utf8.h | |
parent | b9ce1d4d98525490ca1a38e2d9fd4f3369adb3e0 (diff) |
Telegram: initial commit
- tdlib moved to telegram dir
Diffstat (limited to 'protocols/Telegram/tdlib/td/tdutils/td/utils/utf8.h')
-rw-r--r-- | protocols/Telegram/tdlib/td/tdutils/td/utils/utf8.h | 106 |
1 files changed, 106 insertions, 0 deletions
diff --git a/protocols/Telegram/tdlib/td/tdutils/td/utils/utf8.h b/protocols/Telegram/tdlib/td/tdutils/td/utils/utf8.h new file mode 100644 index 0000000000..6be1952c19 --- /dev/null +++ b/protocols/Telegram/tdlib/td/tdutils/td/utils/utf8.h @@ -0,0 +1,106 @@ +// +// Copyright Aliaksei Levin (levlam@telegram.org), Arseny Smirnov (arseny30@gmail.com) 2014-2018 +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// +#pragma once + +#include "td/utils/int_types.h" +#include "td/utils/Slice.h" + +namespace td { + +/// checks UTF-8 string for correctness +bool check_utf8(CSlice str); + +/// checks if a code unit is a first code unit of a UTF-8 character +inline bool is_utf8_character_first_code_unit(unsigned char c) { + return (c & 0xC0) != 0x80; +} + +/// returns length of UTF-8 string in characters +inline size_t utf8_length(Slice str) { + size_t result = 0; + for (auto c : str) { + result += is_utf8_character_first_code_unit(c); + } + return result; +} + +/// appends a Unicode character using UTF-8 encoding +void append_utf8_character(string &str, uint32 ch); + +/// moves pointer one UTF-8 character back +inline const unsigned char *prev_utf8_unsafe(const unsigned char *ptr) { + while (!is_utf8_character_first_code_unit(*--ptr)) { + // pass + } + return ptr; +} + +/// moves pointer one UTF-8 character forward and saves code of the skipped character in *code +const unsigned char *next_utf8_unsafe(const unsigned char *ptr, uint32 *code); + +/// truncates UTF-8 string to the given length in Unicode characters +template <class T> +T utf8_truncate(T str, size_t length) { + if (str.size() > length) { + for (size_t i = 0; i < str.size(); i++) { + if (is_utf8_character_first_code_unit(static_cast<unsigned char>(str[i]))) { + if (length == 0) { + return str.substr(0, i); + } else { + length--; + } + } + } + } + return str; +} + +/// truncates UTF-8 string to the given length given in UTF-16 code units +template <class T> +T utf8_utf16_truncate(T str, size_t length) { + for (size_t i = 0; i < str.size(); i++) { + auto c = static_cast<unsigned char>(str[i]); + if (is_utf8_character_first_code_unit(c)) { + if (length <= 0) { + return str.substr(0, i); + } else { + length--; + if (c >= 0xf0) { // >= 4 bytes in symbol => surrogaite pair + length--; + } + } + } + } + return str; +} + +template <class T> +T utf8_substr(T str, size_t offset) { + auto offset_pos = utf8_truncate(str, offset).size(); + return str.substr(offset_pos); +} + +template <class T> +T utf8_substr(T str, size_t offset, size_t length) { + return utf8_truncate(utf8_substr(str, offset), length); +} + +template <class T> +T utf8_utf16_substr(T str, size_t offset) { + auto offset_pos = utf8_utf16_truncate(str, offset).size(); + return str.substr(offset_pos); +} + +template <class T> +T utf8_utf16_substr(T str, size_t offset, size_t length) { + return utf8_utf16_truncate(utf8_utf16_substr(str, offset), length); +} + +/// Returns UTF-8 string converted to lower case. +string utf8_to_lower(Slice str); + +} // namespace td |