Telegram: initial commit

- tdlib moved to telegram dir
author: aunsane <aunsane@gmail.com> 2018-04-27 21:33:17 +0300
committer: aunsane <aunsane@gmail.com> 2018-04-27 21:33:17 +0300
commit: e1ec72eab6d00b3ba38e5932bc88920f103b6e4a (patch)
tree: 999de2725a83e30fbbf6576200525d4ef0c5fe38 /protocols/Telegram/tdlib/td/tdutils/td/utils/utf8.h
parent: b9ce1d4d98525490ca1a38e2d9fd4f3369adb3e0 (diff)
1 files changed, 106 insertions, 0 deletions
diff --git a/protocols/Telegram/tdlib/td/tdutils/td/utils/utf8.h b/protocols/Telegram/tdlib/td/tdutils/td/utils/utf8.h
new file mode 100644
index 0000000000..6be1952c19
--- /dev/null
+++ b/protocols/Telegram/tdlib/td/tdutils/td/utils/utf8.h
@@ -0,0 +1,106 @@
+//
+// Copyright Aliaksei Levin (levlam@telegram.org), Arseny Smirnov (arseny30@gmail.com) 2014-2018
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+#pragma once
+
+#include "td/utils/int_types.h"
+#include "td/utils/Slice.h"
+
+namespace td {
+
+/// checks UTF-8 string for correctness
+bool check_utf8(CSlice str);
+
+/// tells whether c starts a UTF-8 character, i.e. is anything but a 10xxxxxx continuation byte
+inline bool is_utf8_character_first_code_unit(unsigned char c) {
+  return (c >> 6) != 2;
+}
+
+/// returns length of UTF-8 string in characters, counted as the number of non-continuation bytes
+inline size_t utf8_length(Slice str) {
+  size_t character_count = 0;
+  for (const auto byte : str) {
+    character_count += is_utf8_character_first_code_unit(byte) ? 1 : 0;
+  }
+  return character_count;
+}
+
+/// appends a Unicode character using UTF-8 encoding
+void append_utf8_character(string &str, uint32 ch);
+
+/// steps ptr back to the first byte of the preceding UTF-8 character; performs no bounds checking
+inline const unsigned char *prev_utf8_unsafe(const unsigned char *ptr) {
+  do {
+    ptr--;  // always move at least one byte, then keep skipping continuation bytes
+  } while (!is_utf8_character_first_code_unit(*ptr));
+  return ptr;
+}
+
+/// moves pointer one UTF-8 character forward and saves code of the skipped character in *code
+const unsigned char *next_utf8_unsafe(const unsigned char *ptr, uint32 *code);
+
+/// keeps at most `length` leading Unicode characters of UTF-8 string str;
+/// the cut is always made right before the first byte of a character
+template <class T>
+T utf8_truncate(T str, size_t length) {
+  if (str.size() <= length) {
+    return str;  // `length` or fewer bytes can't encode more than `length` characters
+  }
+  for (size_t pos = 0; pos < str.size(); pos++) {
+    const bool starts_character = is_utf8_character_first_code_unit(static_cast<unsigned char>(str[pos]));
+    if (starts_character && length == 0) {
+      return str.substr(0, pos);  // the character starting here would exceed the limit
+    }
+    length -= starts_character ? 1 : 0;  // can't underflow: length != 0 whenever a character starts here
+  }
+  return str;
+}
+
+/// truncates UTF-8 string to at most `length` UTF-16 code units, cutting only at a character boundary;
+/// a character encoded as a surrogate pair is dropped as a whole if just one code unit of budget remains
+template <class T>
+T utf8_utf16_truncate(T str, size_t length) {
+  for (size_t i = 0; i < str.size(); i++) {
+    auto c = static_cast<unsigned char>(str[i]);
+    if (is_utf8_character_first_code_unit(c)) {
+      // >= 4 bytes in symbol => surrogate pair, i.e. 2 UTF-16 code units instead of 1
+      size_t utf16_size = c >= 0xf0 ? 2 : 1;
+      if (length < utf16_size) {
+        // the character doesn't fit; subtracting anyway would wrap the unsigned budget around
+        return str.substr(0, i);
+      }
+      length -= utf16_size;
+    }
+  }
+  return str;
+}
+
+/// returns the part of UTF-8 string str that follows its first `offset` Unicode characters
+template <class T>
+T utf8_substr(T str, size_t offset) {
+  return str.substr(utf8_truncate(str, offset).size());
+}
+
+/// returns at most `length` Unicode characters of UTF-8 string str, skipping its first `offset` characters
+template <class T>
+T utf8_substr(T str, size_t offset, size_t length) {
+  return utf8_truncate<T>(utf8_substr<T>(str, offset), length);
+}
+
+/// returns the part of UTF-8 string str that follows the prefix kept by utf8_utf16_truncate(str, offset)
+template <class T>
+T utf8_utf16_substr(T str, size_t offset) {
+  return str.substr(utf8_utf16_truncate(str, offset).size());
+}
+
+/// skips the prefix selected by `offset` and truncates the rest to `length`, both measured in UTF-16 code units
+template <class T>
+T utf8_utf16_substr(T str, size_t offset, size_t length) {
+  return utf8_utf16_truncate<T>(utf8_utf16_substr<T>(str, offset), length);
+}
+
+/// Returns UTF-8 string converted to lower case.
+string utf8_to_lower(Slice str);
+
+}  // namespace td
author	aunsane <aunsane@gmail.com>	2018-04-27 21:33:17 +0300
committer	aunsane <aunsane@gmail.com>	2018-04-27 21:33:17 +0300
commit	e1ec72eab6d00b3ba38e5932bc88920f103b6e4a (patch)
tree	999de2725a83e30fbbf6576200525d4ef0c5fe38 /protocols/Telegram/tdlib/td/tdutils/td/utils/utf8.h
parent	b9ce1d4d98525490ca1a38e2d9fd4f3369adb3e0 (diff)