summary refs log tree commit diff
path: root/MirOTR/entities.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'MirOTR/entities.cpp')
-rw-r--r-- MirOTR/entities.cpp 395
1 files changed, 395 insertions, 0 deletions
diff --git a/MirOTR/entities.cpp b/MirOTR/entities.cpp
new file mode 100644
index 0000000..c757b70
--- /dev/null
+++ b/MirOTR/entities.cpp
@@ -0,0 +1,395 @@
+// (C) of entities.cpp: Christoph
+// http://mercurial.intuxication.org/hg/cstuff/raw-file/tip/entities.c
+// http://stackoverflow.com/questions/1082162/how-to-unescape-html-in-c/1082191#1082191
+// modified by ProgAndy
+
+#include "stdafx.h"
+#include "entities.h"
+
+#include <errno.h>
+#include <string.h>
+#include <stdlib.h>
+using namespace std;
+
+#define UNICODE_MAX 0x10FFFFul
+
+
/*
 * Named character references and their UTF-8 replacement text.
 *
 * Column 0 is the entity name without the leading '&' but WITH the
 * trailing ';'; column 1 is the replacement string.
 *
 * The table is searched with bsearch(), so the rows MUST stay in strictly
 * ascending byte-value (strcmp) order of the name.  Two consequences that
 * are easy to get wrong: uppercase letters sort before lowercase ones, and
 * digits sort before ';' -- which is why "sup1;", "sup2;" and "sup3;" come
 * before "sup;".
 *
 * Replacements that are invisible or whitespace-like are written as byte
 * escapes so the exact code point survives editors and copy/paste.
 */
static const char *named_entities[][2] =
{
    { "AElig;", "Æ" },
    { "Aacute;", "Á" },
    { "Acirc;", "Â" },
    { "Agrave;", "À" },
    { "Alpha;", "Α" },
    { "Aring;", "Å" },
    { "Atilde;", "Ã" },
    { "Auml;", "Ä" },
    { "Beta;", "Β" },
    { "Ccedil;", "Ç" },
    { "Chi;", "Χ" },
    { "Dagger;", "‡" },
    { "Delta;", "Δ" },
    { "ETH;", "Ð" },
    { "Eacute;", "É" },
    { "Ecirc;", "Ê" },
    { "Egrave;", "È" },
    { "Epsilon;", "Ε" },
    { "Eta;", "Η" },
    { "Euml;", "Ë" },
    { "Gamma;", "Γ" },
    { "Iacute;", "Í" },
    { "Icirc;", "Î" },
    { "Igrave;", "Ì" },
    { "Iota;", "Ι" },
    { "Iuml;", "Ï" },
    { "Kappa;", "Κ" },
    { "Lambda;", "Λ" },
    { "Mu;", "Μ" },
    { "Ntilde;", "Ñ" },
    { "Nu;", "Ν" },
    { "OElig;", "Œ" },
    { "Oacute;", "Ó" },
    { "Ocirc;", "Ô" },
    { "Ograve;", "Ò" },
    { "Omega;", "Ω" },
    { "Omicron;", "Ο" },
    { "Oslash;", "Ø" },
    { "Otilde;", "Õ" },
    { "Ouml;", "Ö" },
    { "Phi;", "Φ" },
    { "Pi;", "Π" },
    { "Prime;", "″" },
    { "Psi;", "Ψ" },
    { "Rho;", "Ρ" },
    { "Scaron;", "Š" },
    { "Sigma;", "Σ" },
    { "THORN;", "Þ" },
    { "Tau;", "Τ" },
    { "Theta;", "Θ" },
    { "Uacute;", "Ú" },
    { "Ucirc;", "Û" },
    { "Ugrave;", "Ù" },
    { "Upsilon;", "Υ" },
    { "Uuml;", "Ü" },
    { "Xi;", "Ξ" },
    { "Yacute;", "Ý" },
    { "Yuml;", "Ÿ" },
    { "Zeta;", "Ζ" },
    { "aacute;", "á" },
    { "acirc;", "â" },
    { "acute;", "´" },
    { "aelig;", "æ" },
    { "agrave;", "à" },
    { "alefsym;", "ℵ" },
    { "alpha;", "α" },
    { "amp;", "&" },
    { "and;", "∧" },
    { "ang;", "∠" },
    { "apos;", "'" },
    { "aring;", "å" },
    { "asymp;", "≈" },
    { "atilde;", "ã" },
    { "auml;", "ä" },
    { "bdquo;", "„" },
    { "beta;", "β" },
    { "brvbar;", "¦" },
    { "bull;", "•" },
    { "cap;", "∩" },
    { "ccedil;", "ç" },
    { "cedil;", "¸" },
    { "cent;", "¢" },
    { "chi;", "χ" },
    { "circ;", "ˆ" },
    { "clubs;", "♣" },
    { "cong;", "≅" },
    { "copy;", "©" },
    { "crarr;", "↵" },
    { "cup;", "∪" },
    { "curren;", "¤" },
    { "dArr;", "⇓" },
    { "dagger;", "†" },
    { "darr;", "↓" },
    { "deg;", "°" },
    { "delta;", "δ" },
    { "diams;", "♦" },
    { "divide;", "÷" },
    { "eacute;", "é" },
    { "ecirc;", "ê" },
    { "egrave;", "è" },
    { "empty;", "∅" },
    { "emsp;", "\xE2\x80\x83" },   /* U+2003 EM SPACE */
    { "ensp;", "\xE2\x80\x82" },   /* U+2002 EN SPACE */
    { "epsilon;", "ε" },
    { "equiv;", "≡" },
    { "eta;", "η" },
    { "eth;", "ð" },
    { "euml;", "ë" },
    { "euro;", "€" },
    { "exist;", "∃" },
    { "fnof;", "ƒ" },
    { "forall;", "∀" },
    { "frac12;", "½" },
    { "frac14;", "¼" },
    { "frac34;", "¾" },
    { "frasl;", "⁄" },
    { "gamma;", "γ" },
    { "ge;", "≥" },
    { "gt;", ">" },
    { "hArr;", "⇔" },
    { "harr;", "↔" },
    { "hearts;", "♥" },
    { "hellip;", "…" },
    { "iacute;", "í" },
    { "icirc;", "î" },
    { "iexcl;", "¡" },
    { "igrave;", "ì" },
    { "image;", "ℑ" },
    { "infin;", "∞" },
    { "int;", "∫" },
    { "iota;", "ι" },
    { "iquest;", "¿" },
    { "isin;", "∈" },
    { "iuml;", "ï" },
    { "kappa;", "κ" },
    { "lArr;", "⇐" },
    { "lambda;", "λ" },
    { "lang;", "〈" },
    { "laquo;", "«" },
    { "larr;", "←" },
    { "lceil;", "⌈" },
    { "ldquo;", "“" },
    { "le;", "≤" },
    { "lfloor;", "⌊" },
    { "lowast;", "∗" },
    { "loz;", "◊" },
    { "lrm;", "\xE2\x80\x8E" },
    { "lsaquo;", "‹" },
    { "lsquo;", "‘" },
    { "lt;", "<" },
    { "macr;", "¯" },
    { "mdash;", "—" },
    { "micro;", "µ" },
    { "middot;", "·" },
    { "minus;", "−" },
    { "mu;", "μ" },
    { "nabla;", "∇" },
    { "nbsp;", "\xC2\xA0" },       /* U+00A0 NO-BREAK SPACE */
    { "ndash;", "–" },
    { "ne;", "≠" },
    { "ni;", "∋" },
    { "not;", "¬" },
    { "notin;", "∉" },
    { "nsub;", "⊄" },
    { "ntilde;", "ñ" },
    { "nu;", "ν" },
    { "oacute;", "ó" },
    { "ocirc;", "ô" },
    { "oelig;", "œ" },
    { "ograve;", "ò" },
    { "oline;", "‾" },
    { "omega;", "ω" },
    { "omicron;", "ο" },
    { "oplus;", "⊕" },
    { "or;", "∨" },
    { "ordf;", "ª" },
    { "ordm;", "º" },
    { "oslash;", "ø" },
    { "otilde;", "õ" },
    { "otimes;", "⊗" },
    { "ouml;", "ö" },
    { "para;", "¶" },
    { "part;", "∂" },
    { "permil;", "‰" },
    { "perp;", "⊥" },
    { "phi;", "φ" },
    { "pi;", "π" },
    { "piv;", "ϖ" },
    { "plusmn;", "±" },
    { "pound;", "£" },
    { "prime;", "′" },
    { "prod;", "∏" },
    { "prop;", "∝" },
    { "psi;", "ψ" },
    { "quot;", "\"" },
    { "rArr;", "⇒" },
    { "radic;", "√" },
    { "rang;", "〉" },
    { "raquo;", "»" },
    { "rarr;", "→" },
    { "rceil;", "⌉" },
    { "rdquo;", "”" },
    { "real;", "ℜ" },
    { "reg;", "®" },
    { "rfloor;", "⌋" },
    { "rho;", "ρ" },
    { "rlm;", "\xE2\x80\x8F" },
    { "rsaquo;", "›" },
    { "rsquo;", "’" },
    { "sbquo;", "‚" },
    { "scaron;", "š" },
    { "sdot;", "⋅" },
    { "sect;", "§" },
    { "shy;", "\xC2\xAD" },
    { "sigma;", "σ" },
    { "sigmaf;", "ς" },
    { "sim;", "∼" },
    { "spades;", "♠" },
    { "sub;", "⊂" },
    { "sube;", "⊆" },
    { "sum;", "∑" },
    /* '1'..'3' (0x31..0x33) sort before ';' (0x3B): keep these ahead of "sup;". */
    { "sup1;", "¹" },
    { "sup2;", "²" },
    { "sup3;", "³" },
    { "sup;", "⊃" },
    { "supe;", "⊇" },
    { "szlig;", "ß" },
    { "tau;", "τ" },
    { "there4;", "∴" },
    { "theta;", "θ" },
    { "thetasym;", "ϑ" },
    { "thinsp;", "\xE2\x80\x89" }, /* U+2009 THIN SPACE */
    { "thorn;", "þ" },
    { "tilde;", "˜" },
    { "times;", "×" },
    { "trade;", "™" },
    { "uArr;", "⇑" },
    { "uacute;", "ú" },
    { "uarr;", "↑" },
    { "ucirc;", "û" },
    { "ugrave;", "ù" },
    { "uml;", "¨" },
    { "upsih;", "ϒ" },
    { "upsilon;", "υ" },
    { "uuml;", "ü" },
    { "weierp;", "℘" },
    { "xi;", "ξ" },
    { "yacute;", "ý" },
    { "yen;", "¥" },
    { "yuml;", "ÿ" },
    { "zeta;", "ζ" },
    { "zwj;", "\xE2\x80\x8D" },
    { "zwnj;", "\xE2\x80\x8C" }
};
+
/*
 * bsearch() comparator for the entity table.
 *
 * key     - text to look up (a C string).
 * element - pointer to a table row; the row's first string is the name.
 *
 * Only the first strlen(name) characters of the key are compared, so the
 * key matches when it merely *starts with* the row's name.
 * Returns <0, 0 or >0 with strncmp() semantics.
 */
static int cmp(const void *key, const void *element)
{
    const char *wanted = (const char *)key;
    const char *row_name = *(const char **)element;
    size_t row_name_len = strlen(row_name);

    return strncmp(wanted, row_name, row_name_len);
}
+
+static const char *get_named_entity(const char *name)
+{
+ const char **entity = (const char **)bsearch(name, named_entities,
+ sizeof(named_entities) / sizeof(*named_entities),
+ sizeof(*named_entities), cmp);
+
+ return entity ? entity[1] : NULL;
+}
+
/*
 * Encode one Unicode code point as UTF-8.
 *
 * cp     - code point to encode.
 * buffer - destination; must have room for up to 4 bytes.  No NUL
 *          terminator is written.
 *
 * Returns the number of bytes stored (1..4), or 0 -- with nothing
 * written -- when cp cannot be represented in well-formed UTF-8: values
 * above U+10FFFF and the UTF-16 surrogate range U+D800..U+DFFF.
 */
static size_t putc_utf8(unsigned long cp, char *buffer)
{
    unsigned char *bytes = (unsigned char *)buffer;

    /* Surrogates are not scalar values; encoding them yields invalid UTF-8. */
    if(cp >= 0xD800ul && cp <= 0xDFFFul)
        return 0;

    if(cp <= 0x007Ful)
    {
        /* 0xxxxxxx */
        bytes[0] = (unsigned char)cp;
        return 1;
    }

    if(cp <= 0x07FFul)
    {
        /* 110xxxxx 10xxxxxx */
        bytes[0] = (unsigned char)(0xC0u | (cp >> 6));
        bytes[1] = (unsigned char)(0x80u | (cp & 0x3Fu));
        return 2;
    }

    if(cp <= 0xFFFFul)
    {
        /* 1110xxxx 10xxxxxx 10xxxxxx */
        bytes[0] = (unsigned char)(0xE0u | (cp >> 12));
        bytes[1] = (unsigned char)(0x80u | ((cp >> 6) & 0x3Fu));
        bytes[2] = (unsigned char)(0x80u | (cp & 0x3Fu));
        return 3;
    }

    if(cp <= 0x10FFFFul)
    {
        /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
        bytes[0] = (unsigned char)(0xF0u | (cp >> 18));
        bytes[1] = (unsigned char)(0x80u | ((cp >> 12) & 0x3Fu));
        bytes[2] = (unsigned char)(0x80u | ((cp >> 6) & 0x3Fu));
        bytes[3] = (unsigned char)(0x80u | (cp & 0x3Fu));
        return 4;
    }

    return 0;
}
+
+static _Bool parse_entity(const char *current, char **to,
+ const char **from, size_t len)
+{
+ const char *end = (const char *)memchr(current, ';', len);
+ if(!end) return 0;
+
+ if(current[1] == '#')
+ {
+ char *tail = NULL;
+ errno = 0;
+
+ _Bool hex = current[2] == 'x' || current[2] == 'X';
+
+ unsigned long cp = strtoul(
+ current + (hex ? 3 : 2), &tail, hex ? 16 : 10);
+
+ if(tail == end && !errno && cp <= UNICODE_MAX)
+ {
+ *to += putc_utf8(cp, *to);
+ *from = end + 1;
+
+ return 1;
+ }
+ }
+ else
+ {
+ const char *entity = get_named_entity(&current[1]);
+ if(entity)
+ {
+ size_t len = strlen(entity);
+ memcpy(*to, entity, len);
+
+ *to += len;
+ *from = end + 1;
+
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
/*
 * Decode HTML character references in a string, producing UTF-8.
 *
 * dest - output buffer.  It receives the decoded text followed by a NUL
 *        terminator, so it must have room for the decoded length + 1.
 * src  - input text, or NULL to decode `dest` in place.  Apart from the
 *        in-place case (src == NULL or src == dest) the two buffers must
 *        not overlap.
 * len  - maximum number of input bytes to read.  Input also ends at the
 *        first NUL byte if one occurs earlier.
 *
 * Text that parse_entity() does not accept as an entity is copied through
 * unchanged, including its '&'.
 *
 * Returns the length of the decoded string, not counting the terminator.
 */
size_t decode_html_entities_utf8(char *dest, const char *src, size_t len)
{
    const char *from;
    const char *stop;
    const char *nul;
    const char *current;
    char *to = dest;
    size_t tail;

    if(!src) src = dest;

    /* Fix the end of the input once; every later scan is bounded by it. */
    nul = (const char *)memchr(src, '\0', len);
    stop = nul ? nul : src + len;
    from = src;

    while((current = (const char *)memchr(from, '&', (size_t)(stop - from))))
    {
        size_t literal = (size_t)(current - from);

        /* memmove, not memcpy: when decoding in place the ranges overlap. */
        memmove(to, from, literal);
        to += literal;

        if(parse_entity(current, &to, &from, (size_t)(stop - current)))
            continue;

        /* Not an entity: emit the '&' itself and resume right after it. */
        from = current;
        *to++ = *from++;
    }

    tail = (size_t)(stop - from);
    memmove(to, from, tail);
    to += tail;

    *to = 0;
    return (size_t)(to - dest);
}