summaryrefslogtreecommitdiff
path: root/protocols/Sametime/src/glib/gunidecomp.c
diff options
context:
space:
mode:
Diffstat (limited to 'protocols/Sametime/src/glib/gunidecomp.c')
-rw-r--r--protocols/Sametime/src/glib/gunidecomp.c532
1 files changed, 532 insertions, 0 deletions
diff --git a/protocols/Sametime/src/glib/gunidecomp.c b/protocols/Sametime/src/glib/gunidecomp.c
new file mode 100644
index 0000000000..0bd7ced2c8
--- /dev/null
+++ b/protocols/Sametime/src/glib/gunidecomp.c
@@ -0,0 +1,532 @@
+/* decomp.c - Character decomposition.
+ *
+ * Copyright (C) 1999, 2000 Tom Tromey
+ * Copyright 2000 Red Hat, Inc.
+ *
+ * The Gnome Library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * The Gnome Library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with the Gnome Library; see the file COPYING.LIB. If not,
+ * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include "config.h"
+
+#include <stdlib.h>
+
+#include "gunicode.h"
+#include "gunidecomp.h"
+#include "gmem.h"
+#include "gunicomp.h"
+#include "gunicodeprivate.h"
+
+
+/* Combining-class lookup for one character of the first table range.
+ * Page selects an entry of combining_class_table_part1. An entry that is
+ * >= G_UNICODE_MAX_TABLE_INDEX stands for a single value shared by the
+ * whole page (the entry minus G_UNICODE_MAX_TABLE_INDEX); any other entry
+ * selects a row of cclass_data, which is then indexed by Char. */
+#define CC_PART1(Page, Char) \
+ ((combining_class_table_part1[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
+ ? (combining_class_table_part1[Page] - G_UNICODE_MAX_TABLE_INDEX) \
+ : (cclass_data[combining_class_table_part1[Page]][Char]))
+
+/* Same lookup as CC_PART1, but through combining_class_table_part2. */
+#define CC_PART2(Page, Char) \
+ ((combining_class_table_part2[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
+ ? (combining_class_table_part2[Page] - G_UNICODE_MAX_TABLE_INDEX) \
+ : (cclass_data[combining_class_table_part2[Page]][Char]))
+
+/* Combining class of Char.
+ * Characters up to G_UNICODE_LAST_CHAR_PART1 go through CC_PART1 with
+ * page = Char >> 8 and index = Char & 0xff. Characters in the range
+ * 0xe0000 .. G_UNICODE_LAST_CHAR go through CC_PART2 with the page counted
+ * from 0xe0000. Everything else evaluates to 0.
+ * Char is expanded several times, so it must not have side effects. */
+#define COMBINING_CLASS(Char) \
+ (((Char) <= G_UNICODE_LAST_CHAR_PART1) \
+ ? CC_PART1 ((Char) >> 8, (Char) & 0xff) \
+ : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \
+ ? CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
+ : 0))
+
+/**
+ * g_unichar_combining_class:
+ * @uc: the Unicode character to look up
+ *
+ * Looks up the canonical combining class of @uc.
+ *
+ * Return value: the canonical combining class of @uc
+ *
+ * Since: 2.14
+ **/
+gint
+g_unichar_combining_class (gunichar uc)
+{
+  const gint combining_class = COMBINING_CLASS (uc);
+
+  return combining_class;
+}
+
+/* constants for hangul syllable [de]composition
+ *
+ * SBase is subtracted from a character to obtain its syllable index; an
+ * index in [0, SCount) marks a decomposable Hangul syllable.
+ * LBase, VBase and TBase are the bases added to the per-part indices to
+ * form the first, second and optional third character of a decomposition
+ * (presumably the leading consonant, vowel and trailing consonant jamo of
+ * UAX #15 -- confirm against the report). A third part equal to TBase
+ * itself means "no third character".
+ * LCount, VCount and TCount are the number of values each part can take;
+ * NCount is the number of syllables per first part and SCount the total
+ * number of syllables. */
+#define SBase 0xAC00
+#define LBase 0x1100
+#define VBase 0x1161
+#define TBase 0x11A7
+#define LCount 19
+#define VCount 21
+#define TCount 28
+#define NCount (VCount * TCount)
+#define SCount (LCount * NCount)
+
+/**
+ * g_unicode_canonical_ordering:
+ * @string: a UCS-4 encoded string.
+ * @len: the maximum length of @string to use.
+ *
+ * Computes the canonical ordering of a string in-place.
+ * This rearranges decomposed characters in the string
+ * according to their combining classes. See the Unicode
+ * manual for more information.
+ *
+ * A string of fewer than two characters is already ordered and is
+ * left untouched; in particular, @string is never read when @len is 0.
+ **/
+void
+g_unicode_canonical_ordering (gunichar *string,
+                              gsize     len)
+{
+  gsize i;
+  int swap = 1;
+
+  /* len is unsigned: without this guard a len of 0 would turn the
+   * "len - 1" loop bound below into the largest gsize value, and the
+   * scan would read and swap far beyond the end of the buffer. */
+  if (len < 2)
+    return;
+
+  /* Repeat full passes until one completes without moving anything. */
+  while (swap)
+    {
+      int last;
+      swap = 0;
+      last = COMBINING_CLASS (string[0]);
+      for (i = 0; i < len - 1; ++i)
+        {
+          int next = COMBINING_CLASS (string[i + 1]);
+          /* Class 0 characters never move; a non-zero class that is
+           * smaller than its predecessor's is out of order. */
+          if (next != 0 && last > next)
+            {
+              gsize j;
+              /* Percolate item leftward through string. */
+              for (j = i + 1; j > 0; --j)
+                {
+                  gunichar t;
+                  if (COMBINING_CLASS (string[j - 1]) <= next)
+                    break;
+                  t = string[j];
+                  string[j] = string[j - 1];
+                  string[j - 1] = t;
+                  swap = 1;
+                }
+              /* We're re-entering the loop looking at the old
+                 character again. */
+              next = last;
+            }
+          last = next;
+        }
+    }
+}
+
+/* Hangul syllable decomposition, following
+ * http://www.unicode.org/unicode/reports/tr15/#Hangul
+ *
+ * s:          the character to decompose
+ * r:          output buffer, or NULL to compute only the length; room for
+ *             three characters is always sufficient
+ * result_len: receives the number of characters in the decomposition
+ *
+ * A character that is not a Hangul syllable decomposes to itself. */
+static void
+decompose_hangul (gunichar  s,
+                  gunichar *r,
+                  gsize    *result_len)
+{
+  gint syllable = s - SBase;
+  gunichar first, second, third;
+  gboolean has_third;
+
+  if (syllable < 0 || syllable >= SCount)
+    {
+      /* not a hangul syllable: hand the character back unchanged */
+      if (r != NULL)
+        r[0] = s;
+      *result_len = 1;
+      return;
+    }
+
+  first = LBase + syllable / NCount;
+  second = VBase + (syllable % NCount) / TCount;
+  third = TBase + syllable % TCount;
+
+  /* A third part equal to TBase means the syllable has only two parts. */
+  has_third = (third != TBase);
+
+  if (r != NULL)
+    {
+      r[0] = first;
+      r[1] = second;
+      if (has_third)
+        r[2] = third;
+    }
+
+  *result_len = has_third ? 3 : 2;
+}
+
+/* Binary-searches decomp_table for ch.
+ *
+ * With compat set, the compatibility decomposition is preferred and the
+ * canonical one is used when no compatibility entry is present; without
+ * it, only the canonical decomposition is considered.
+ *
+ * Returns a pointer to a null-terminated UTF-8 string inside
+ * decomp_expansion_string, or NULL when ch has no table entry or (for
+ * compat == FALSE) no canonical decomposition. */
+static const gchar *
+find_decomposition (gunichar ch,
+                    gboolean compat)
+{
+  int lo = 0;
+  int hi = G_N_ELEMENTS (decomp_table);
+
+  /* Outside the range covered by the table: nothing to find. */
+  if (ch < decomp_table[lo].ch || ch > decomp_table[hi - 1].ch)
+    return NULL;
+
+  for (;;)
+    {
+      int mid = (lo + hi) / 2;
+
+      if (ch == decomp_table[mid].ch)
+        {
+          int offset;
+
+          if (!compat)
+            {
+              offset = decomp_table[mid].canon_offset;
+              if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
+                return NULL;
+            }
+          else
+            {
+              offset = decomp_table[mid].compat_offset;
+              if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
+                offset = decomp_table[mid].canon_offset;
+            }
+
+          return &(decomp_expansion_string[offset]);
+        }
+
+      /* The interval can no longer shrink: ch is not in the table. */
+      if (mid == lo)
+        return NULL;
+
+      if (ch > decomp_table[mid].ch)
+        lo = mid;
+      else
+        hi = mid;
+    }
+}
+
+/**
+ * g_unicode_canonical_decomposition:
+ * @ch: a Unicode character.
+ * @result_len: location to store the length of the return value.
+ *
+ * Computes the canonical decomposition of a Unicode character.
+ * A character without a decomposition is returned as a string
+ * of length one holding @ch itself.
+ *
+ * Return value: a newly allocated string of Unicode characters.
+ *   @result_len is set to the resulting length of the string.
+ **/
+gunichar *
+g_unicode_canonical_decomposition (gunichar ch,
+                                   gsize   *result_len)
+{
+  const gchar *decomp;
+  const gchar *cursor;
+  gunichar *out;
+  int slot;
+
+  if (ch >= 0xac00 && ch <= 0xd7a3)
+    {
+      /* Hangul syllable: the first call sizes the buffer, the second
+         one fills it. */
+      decompose_hangul (ch, NULL, result_len);
+      out = g_malloc (*result_len * sizeof (gunichar));
+      decompose_hangul (ch, out, result_len);
+      return out;
+    }
+
+  decomp = find_decomposition (ch, FALSE);
+  if (decomp == NULL)
+    {
+      /* Not in our table. */
+      out = g_malloc (sizeof (gunichar));
+      *out = ch;
+      *result_len = 1;
+      return out;
+    }
+
+  /* Found it: convert the UTF-8 table entry to UCS-4. */
+  *result_len = g_utf8_strlen (decomp, -1);
+  out = g_malloc (*result_len * sizeof (gunichar));
+
+  slot = 0;
+  cursor = decomp;
+  while (*cursor != '\0')
+    {
+      out[slot] = g_utf8_get_char (cursor);
+      cursor = g_utf8_next_char (cursor);
+      slot++;
+    }
+
+  /* Supposedly following the Unicode 2.1.9 table means that the
+     decompositions come out in canonical order. I haven't tested
+     this, but we rely on it here. */
+  return out;
+}
+
+/* Hangul composition: L,V => LV and LV,T => LVT.
+ *
+ * Stores the composed syllable in *result and returns TRUE when (a, b) is
+ * one of those two pairs; otherwise returns FALSE and leaves *result
+ * alone. */
+static gboolean
+combine_hangul (gunichar  a,
+                gunichar  b,
+                gunichar *result)
+{
+  gint l_index = a - LBase;
+  gint s_index = a - SBase;
+  gint v_index = b - VBase;
+  gint t_index = b - TBase;
+
+  gboolean a_is_l = (0 <= l_index && l_index < LCount);
+  gboolean b_is_v = (0 <= v_index && v_index < VCount);
+  /* a syllable whose index is a multiple of TCount has no third part yet */
+  gboolean a_is_lv = (0 <= s_index && s_index < SCount
+                      && (s_index % TCount) == 0);
+  /* index 0 is TBase itself, which is not a combinable third part */
+  gboolean b_is_t = (0 < t_index && t_index < TCount);
+
+  if (a_is_l && b_is_v)
+    {
+      *result = SBase + (l_index * VCount + v_index) * TCount;
+      return TRUE;
+    }
+
+  if (a_is_lv && b_is_t)
+    {
+      *result = a + t_index;
+      return TRUE;
+    }
+
+  return FALSE;
+}
+
+/* Compose-index lookup for the low byte Char on page Page.
+ * A compose_table entry that is >= G_UNICODE_MAX_TABLE_INDEX stands for a
+ * single value shared by the whole page (the entry minus
+ * G_UNICODE_MAX_TABLE_INDEX); any other entry selects a row of
+ * compose_data, which is then indexed by Char. */
+#define CI(Page, Char) \
+  ((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
+   ? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
+   : (compose_data[compose_table[Page]][Char]))
+
+/* Compose index of Char, or 0 when its page lies beyond COMPOSE_TABLE_LAST.
+ * Every use of Char is parenthesized so that an argument containing
+ * operators of lower precedence than ">>" still expands correctly.
+ * Char is expanded several times, so it must not have side effects. */
+#define COMPOSE_INDEX(Char) \
+  ((((Char) >> 8) > (COMPOSE_TABLE_LAST)) ? 0 : CI ((Char) >> 8, (Char) & 0xff))
+
+/* Tries to compose the pair (a, b) into a single character.
+ *
+ * Hangul pairs are handled by combine_hangul(). Otherwise the compose
+ * indices of a and b select one of three tables: compose_first_single
+ * (a has exactly one partner), compose_second_single (b has exactly one
+ * partner) or the two-dimensional compose_array.
+ *
+ * Returns TRUE and stores the composition in *result on success; returns
+ * FALSE without touching *result when the pair does not compose. */
+static gboolean
+combine (gunichar  a,
+         gunichar  b,
+         gunichar *result)
+{
+  gushort idx_first, idx_second;
+  gunichar composed;
+
+  if (combine_hangul (a, b, result))
+    return TRUE;
+
+  idx_first = COMPOSE_INDEX(a);
+
+  if (idx_first >= COMPOSE_FIRST_SINGLE_START && idx_first < COMPOSE_SECOND_START)
+    {
+      const int row = idx_first - COMPOSE_FIRST_SINGLE_START;
+
+      if (b != compose_first_single[row][0])
+        return FALSE;
+
+      *result = compose_first_single[row][1];
+      return TRUE;
+    }
+
+  idx_second = COMPOSE_INDEX(b);
+
+  if (idx_second >= COMPOSE_SECOND_SINGLE_START)
+    {
+      const int row = idx_second - COMPOSE_SECOND_SINGLE_START;
+
+      if (a != compose_second_single[row][0])
+        return FALSE;
+
+      *result = compose_second_single[row][1];
+      return TRUE;
+    }
+
+  /* General case: both indices must fall inside the ranges covered by
+     compose_array. */
+  if (!(idx_first >= COMPOSE_FIRST_START && idx_first < COMPOSE_FIRST_SINGLE_START &&
+        idx_second >= COMPOSE_SECOND_START && idx_second < COMPOSE_SECOND_SINGLE_START))
+    return FALSE;
+
+  composed = compose_array[idx_first - COMPOSE_FIRST_START][idx_second - COMPOSE_SECOND_START];
+
+  /* A zero entry means the pair has no composition. */
+  if (!composed)
+    return FALSE;
+
+  *result = composed;
+  return TRUE;
+}
+
+/* Normalizes a UTF-8 string into a newly allocated, 0-terminated UCS-4
+ * buffer.
+ *
+ * str:     the UTF-8 input
+ * max_len: number of bytes of str to look at; a negative value means
+ *          "read up to the terminating NUL". Reading also stops at the
+ *          first NUL byte.
+ * mode:    G_NORMALIZE_NFKC / G_NORMALIZE_NFKD select compatibility
+ *          decompositions; G_NORMALIZE_NFC / G_NORMALIZE_NFKC add the
+ *          final composition pass.
+ *
+ * The work is done in three steps: a first pass over str that only counts
+ * the decomposed length, a second pass that writes the decomposed
+ * characters and puts them in canonical order, and (for the composing
+ * modes) an in-place composition pass.
+ *
+ * The buffer is allocated with g_new(); g_utf8_normalize() releases it
+ * with g_free().
+ *
+ * NOTE(review): no UTF-8 validation of str is visible in this function --
+ * callers presumably have to pass valid UTF-8; confirm. */
+gunichar *
+_g_utf8_normalize_wc (const gchar *str,
+ gssize max_len,
+ GNormalizeMode mode)
+{
+ gsize n_wc;
+ gunichar *wc_buffer;
+ const char *p;
+ gsize last_start;
+ gboolean do_compat = (mode == G_NORMALIZE_NFKC ||
+ mode == G_NORMALIZE_NFKD);
+ gboolean do_compose = (mode == G_NORMALIZE_NFC ||
+ mode == G_NORMALIZE_NFKC);
+
+ /* Pass 1: count how many characters the decomposed string will have. */
+ n_wc = 0;
+ p = str;
+ while ((max_len < 0 || p < str + max_len) && *p)
+ {
+ const gchar *decomp;
+ gunichar wc = g_utf8_get_char (p);
+
+ if (wc >= 0xac00 && wc <= 0xd7a3)
+ {
+ /* Hangul syllable: a NULL buffer asks for the length only. */
+ gsize result_len;
+ decompose_hangul (wc, NULL, &result_len);
+ n_wc += result_len;
+ }
+ else
+ {
+ decomp = find_decomposition (wc, do_compat);
+
+ if (decomp)
+ n_wc += g_utf8_strlen (decomp, -1);
+ else
+ n_wc++;
+ }
+
+ p = g_utf8_next_char (p);
+ }
+
+ /* One extra slot for the terminating 0 written below. */
+ wc_buffer = g_new (gunichar, n_wc + 1);
+
+ /* Pass 2: write the decomposed characters. last_start is the index where
+ the run that still has to be canonically ordered begins. */
+ last_start = 0;
+ n_wc = 0;
+ p = str;
+ while ((max_len < 0 || p < str + max_len) && *p)
+ {
+ gunichar wc = g_utf8_get_char (p);
+ const gchar *decomp;
+ int cc;
+ /* Index at which this input character's decomposition starts. */
+ gsize old_n_wc = n_wc;
+
+ if (wc >= 0xac00 && wc <= 0xd7a3)
+ {
+ gsize result_len;
+ decompose_hangul (wc, wc_buffer + n_wc, &result_len);
+ n_wc += result_len;
+ }
+ else
+ {
+ decomp = find_decomposition (wc, do_compat);
+
+ if (decomp)
+ {
+ const char *pd;
+ for (pd = decomp; *pd != '\0'; pd = g_utf8_next_char (pd))
+ wc_buffer[n_wc++] = g_utf8_get_char (pd);
+ }
+ else
+ wc_buffer[n_wc++] = wc;
+ }
+
+ if (n_wc > 0)
+ {
+ cc = COMBINING_CLASS (wc_buffer[old_n_wc]);
+
+ /* A decomposition that begins with a class 0 character closes the
+ current run: order everything from last_start up to n_wc and start
+ the next run at this character. */
+ if (cc == 0)
+ {
+ g_unicode_canonical_ordering (wc_buffer + last_start, n_wc - last_start);
+ last_start = old_n_wc;
+ }
+ }
+
+ p = g_utf8_next_char (p);
+ }
+
+ /* Order the final run. */
+ if (n_wc > 0)
+ {
+ g_unicode_canonical_ordering (wc_buffer + last_start, n_wc - last_start);
+ /* NOTE(review): this value is not read again -- the compose pass below
+ resets last_start to 0 before using it. */
+ last_start = n_wc;
+ }
+
+ wc_buffer[n_wc] = 0;
+
+ /* All decomposed and reordered */
+
+ /* Pass 3 (composing modes only): combine characters in place.
+ last_start is the index of the most recent class 0 character, i.e. the
+ character later ones are combined into; last_cc is the combining class
+ of the previous character that was kept. */
+ if (do_compose && n_wc > 0)
+ {
+ gsize i, j;
+ int last_cc = 0;
+ last_start = 0;
+
+ for (i = 0; i < n_wc; i++)
+ {
+ int cc = COMBINING_CLASS (wc_buffer[i]);
+
+ /* Only try to combine when the previous character has class 0 or a
+ lower class than this one. On success the composed character
+ replaces wc_buffer[last_start]. */
+ if (i > 0 &&
+ (last_cc == 0 || last_cc < cc) &&
+ combine (wc_buffer[last_start], wc_buffer[i],
+ &wc_buffer[last_start]))
+ {
+ /* Close the gap left by the consumed character, then step i back
+ so the loop's i++ lands on the character that moved into its
+ slot. */
+ for (j = i + 1; j < n_wc; j++)
+ wc_buffer[j-1] = wc_buffer[j];
+ n_wc--;
+ i--;
+
+ if (i == last_start)
+ last_cc = 0;
+ else
+ last_cc = COMBINING_CLASS (wc_buffer[i-1]);
+
+ continue;
+ }
+
+ if (cc == 0)
+ last_start = i;
+
+ last_cc = cc;
+ }
+ }
+
+ /* Terminate again: composition may have shortened the string. */
+ wc_buffer[n_wc] = 0;
+
+ return wc_buffer;
+}
+
+/**
+ * g_utf8_normalize:
+ * @str: a UTF-8 encoded string.
+ * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
+ * @mode: the type of normalization to perform.
+ *
+ * Converts a string into canonical form, so that, for example,
+ * a character with an accent is always represented the same
+ * way: either as a base character followed by a combining
+ * accent, or as a single precomposed character. The string
+ * has to be valid UTF-8, otherwise %NULL is returned. You
+ * should generally call g_utf8_normalize() before comparing
+ * two Unicode strings.
+ *
+ * The normalization mode %G_NORMALIZE_DEFAULT only
+ * standardizes differences that do not affect the text
+ * content, such as the accent representation mentioned
+ * above. %G_NORMALIZE_ALL also standardizes the
+ * "compatibility" characters in Unicode, such as
+ * SUPERSCRIPT THREE, to their standard forms (in this case
+ * DIGIT THREE). Formatting information may be lost, but for
+ * most text operations such characters should be considered
+ * the same.
+ *
+ * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
+ * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL, but
+ * return a result with composed forms rather than a
+ * maximally decomposed form. This is often useful if you
+ * intend to convert the string to a legacy encoding or pass
+ * it to a system with less capable Unicode handling.
+ *
+ * Return value: a newly allocated string, that is the
+ *   normalized form of @str, or %NULL if @str is not
+ *   valid UTF-8.
+ **/
+gchar *
+g_utf8_normalize (const gchar    *str,
+                  gssize          len,
+                  GNormalizeMode  mode)
+{
+  gunichar *wide;
+  gchar *utf8;
+
+  /* Normalize in UCS-4, then convert the 0-terminated result back. */
+  wide = _g_utf8_normalize_wc (str, len, mode);
+  utf8 = g_ucs4_to_utf8 (wide, -1, NULL, NULL, NULL);
+
+  g_free (wide);
+
+  return utf8;
+}