diff options
Diffstat (limited to 'MirOTR/entities.cpp')
-rw-r--r-- | MirOTR/entities.cpp | 395 |
1 files changed, 395 insertions, 0 deletions
diff --git a/MirOTR/entities.cpp b/MirOTR/entities.cpp new file mode 100644 index 0000000..c757b70 --- /dev/null +++ b/MirOTR/entities.cpp @@ -0,0 +1,395 @@ +// (C) of entities.cpp: Christoph +// http://mercurial.intuxication.org/hg/cstuff/raw-file/tip/entities.c +// http://stackoverflow.com/questions/1082162/how-to-unescape-html-in-c/1082191#1082191 +// modified by ProgAndy + +#include "stdafx.h" +#include "entities.h" + +#include <errno.h> +#include <string.h> +#include <stdlib.h> +using namespace std; + +#define UNICODE_MAX 0x10FFFFul + + +static const char *named_entities[][2] = +{ + { "AElig;", "Æ" }, + { "Aacute;", "Á" }, + { "Acirc;", "Â" }, + { "Agrave;", "À" }, + { "Alpha;", "Α" }, + { "Aring;", "Å" }, + { "Atilde;", "Ã" }, + { "Auml;", "Ä" }, + { "Beta;", "Β" }, + { "Ccedil;", "Ç" }, + { "Chi;", "Χ" }, + { "Dagger;", "‡" }, + { "Delta;", "Δ" }, + { "ETH;", "Ð" }, + { "Eacute;", "É" }, + { "Ecirc;", "Ê" }, + { "Egrave;", "È" }, + { "Epsilon;", "Ε" }, + { "Eta;", "Η" }, + { "Euml;", "Ë" }, + { "Gamma;", "Γ" }, + { "Iacute;", "Í" }, + { "Icirc;", "Î" }, + { "Igrave;", "Ì" }, + { "Iota;", "Ι" }, + { "Iuml;", "Ï" }, + { "Kappa;", "Κ" }, + { "Lambda;", "Λ" }, + { "Mu;", "Μ" }, + { "Ntilde;", "Ñ" }, + { "Nu;", "Ν" }, + { "OElig;", "Œ" }, + { "Oacute;", "Ó" }, + { "Ocirc;", "Ô" }, + { "Ograve;", "Ò" }, + { "Omega;", "Ω" }, + { "Omicron;", "Ο" }, + { "Oslash;", "Ø" }, + { "Otilde;", "Õ" }, + { "Ouml;", "Ö" }, + { "Phi;", "Φ" }, + { "Pi;", "Π" }, + { "Prime;", "″" }, + { "Psi;", "Ψ" }, + { "Rho;", "Ρ" }, + { "Scaron;", "Š" }, + { "Sigma;", "Σ" }, + { "THORN;", "Þ" }, + { "Tau;", "Τ" }, + { "Theta;", "Θ" }, + { "Uacute;", "Ú" }, + { "Ucirc;", "Û" }, + { "Ugrave;", "Ù" }, + { "Upsilon;", "Υ" }, + { "Uuml;", "Ü" }, + { "Xi;", "Ξ" }, + { "Yacute;", "Ý" }, + { "Yuml;", "Ÿ" }, + { "Zeta;", "Ζ" }, + { "aacute;", "á" }, + { "acirc;", "â" }, + { "acute;", "´" }, + { "aelig;", "æ" }, + { "agrave;", "à" }, + { "alefsym;", "ℵ" }, + { "alpha;", "α" }, + { "amp;", "&" }, + { "and;", "∧" }, + { "ang;", "∠" }, + { "apos;", "'" }, + { "aring;", "å" }, + { "asymp;", "≈" }, + { "atilde;", "ã" }, + { "auml;", "ä" }, + { "bdquo;", "„" }, + { "beta;", "β" }, + { "brvbar;", "¦" }, + { "bull;", "•" }, + { "cap;", "∩" }, + { "ccedil;", "ç" }, + { "cedil;", "¸" }, + { "cent;", "¢" }, + { "chi;", "χ" }, + { "circ;", "ˆ" }, + { "clubs;", "♣" }, + { "cong;", "≅" }, + { "copy;", "©" }, + { "crarr;", "↵" }, + { "cup;", "∪" }, + { "curren;", "¤" }, + { "dArr;", "⇓" }, + { "dagger;", "†" }, + { "darr;", "↓" }, + { "deg;", "°" }, + { "delta;", "δ" }, + { "diams;", "♦" }, + { "divide;", "÷" }, + { "eacute;", "é" }, + { "ecirc;", "ê" }, + { "egrave;", "è" }, + { "empty;", "∅" }, + { "emsp;", " " }, + { "ensp;", " " }, + { "epsilon;", "ε" }, + { "equiv;", "≡" }, + { "eta;", "η" }, + { "eth;", "ð" }, + { "euml;", "ë" }, + { "euro;", "€" }, + { "exist;", "∃" }, + { "fnof;", "ƒ" }, + { "forall;", "∀" }, + { "frac12;", "½" }, + { "frac14;", "¼" }, + { "frac34;", "¾" }, + { "frasl;", "⁄" }, + { "gamma;", "γ" }, + { "ge;", "≥" }, + { "gt;", ">" }, + { "hArr;", "⇔" }, + { "harr;", "↔" }, + { "hearts;", "♥" }, + { "hellip;", "…" }, + { "iacute;", "í" }, + { "icirc;", "î" }, + { "iexcl;", "¡" }, + { "igrave;", "ì" }, + { "image;", "ℑ" }, + { "infin;", "∞" }, + { "int;", "∫" }, + { "iota;", "ι" }, + { "iquest;", "¿" }, + { "isin;", "∈" }, + { "iuml;", "ï" }, + { "kappa;", "κ" }, + { "lArr;", "⇐" }, + { "lambda;", "λ" }, + { "lang;", "〈" }, + { "laquo;", "«" }, + { "larr;", "←" }, + { "lceil;", "⌈" }, + { "ldquo;", "“" }, + { "le;", "≤" }, + { "lfloor;", "⌊" }, + { "lowast;", "∗" }, + { "loz;", "◊" }, + { "lrm;", "\xE2\x80\x8E" }, + { "lsaquo;", "‹" }, + { "lsquo;", "‘" }, + { "lt;", "<" }, + { "macr;", "¯" }, + { "mdash;", "—" }, + { "micro;", "µ" }, + { "middot;", "·" }, + { "minus;", "−" }, + { "mu;", "μ" }, + { "nabla;", "∇" }, + { "nbsp;", " " }, + { "ndash;", "–" }, + { "ne;", "≠" }, + { "ni;", "∋" }, + { "not;", "¬" }, + { "notin;", "∉" }, + { "nsub;", "⊄" }, + { "ntilde;", "ñ" }, + { "nu;", "ν" }, + { "oacute;", "ó" }, + { "ocirc;", "ô" }, + { "oelig;", "œ" }, + { "ograve;", "ò" }, + { "oline;", "‾" }, + { "omega;", "ω" }, + { "omicron;", "ο" }, + { "oplus;", "⊕" }, + { "or;", "∨" }, + { "ordf;", "ª" }, + { "ordm;", "º" }, + { "oslash;", "ø" }, + { "otilde;", "õ" }, + { "otimes;", "⊗" }, + { "ouml;", "ö" }, + { "para;", "¶" }, + { "part;", "∂" }, + { "permil;", "‰" }, + { "perp;", "⊥" }, + { "phi;", "φ" }, + { "pi;", "π" }, + { "piv;", "ϖ" }, + { "plusmn;", "±" }, + { "pound;", "£" }, + { "prime;", "′" }, + { "prod;", "∏" }, + { "prop;", "∝" }, + { "psi;", "ψ" }, + { "quot;", "\"" }, + { "rArr;", "⇒" }, + { "radic;", "√" }, + { "rang;", "〉" }, + { "raquo;", "»" }, + { "rarr;", "→" }, + { "rceil;", "⌉" }, + { "rdquo;", "”" }, + { "real;", "ℜ" }, + { "reg;", "®" }, + { "rfloor;", "⌋" }, + { "rho;", "ρ" }, + { "rlm;", "\xE2\x80\x8F" }, + { "rsaquo;", "›" }, + { "rsquo;", "’" }, + { "sbquo;", "‚" }, + { "scaron;", "š" }, + { "sdot;", "⋅" }, + { "sect;", "§" }, + { "shy;", "\xC2\xAD" }, + { "sigma;", "σ" }, + { "sigmaf;", "ς" }, + { "sim;", "∼" }, + { "spades;", "♠" }, + { "sub;", "⊂" }, + { "sube;", "⊆" }, + { "sum;", "∑" }, + { "sup;", "⊃" }, + { "sup1;", "¹" }, + { "sup2;", "²" }, + { "sup3;", "³" }, + { "supe;", "⊇" }, + { "szlig;", "ß" }, + { "tau;", "τ" }, + { "there4;", "∴" }, + { "theta;", "θ" }, + { "thetasym;", "ϑ" }, + { "thinsp;", " " }, + { "thorn;", "þ" }, + { "tilde;", "˜" }, + { "times;", "×" }, + { "trade;", "™" }, + { "uArr;", "⇑" }, + { "uacute;", "ú" }, + { "uarr;", "↑" }, + { "ucirc;", "û" }, + { "ugrave;", "ù" }, + { "uml;", "¨" }, + { "upsih;", "ϒ" }, + { "upsilon;", "υ" }, + { "uuml;", "ü" }, + { "weierp;", "℘" }, + { "xi;", "ξ" }, + { "yacute;", "ý" }, + { "yen;", "¥" }, + { "yuml;", "ÿ" }, + { "zeta;", "ζ" }, + { "zwj;", "\xE2\x80\x8D" }, + { "zwnj;", "\xE2\x80\x8C" } +}; + +static int cmp(const void *key, const void *element) +{ + return strncmp((const char *)key, *(const char **)element, + strlen(*(const char **)element)); +} + +static const char *get_named_entity(const char *name) +{ + const char **entity = (const char **)bsearch(name, named_entities, + sizeof(named_entities) / sizeof(*named_entities), + sizeof(*named_entities), cmp); + + return entity ? entity[1] : NULL; +} + +static size_t putc_utf8(unsigned long cp, char *buffer) +{ + unsigned char *bytes = (unsigned char *)buffer; + + if(cp <= 0x007Ful) + { + bytes[0] = (unsigned char)cp; + return 1; + } + + if(cp <= 0x07FFul) + { + bytes[1] = (unsigned char)((2u << 6) | (cp & 0x3Fu)); + bytes[0] = (unsigned char)((6u << 5) | (cp >> 6)); + return 2; + } + + if(cp <= 0xFFFFul) + { + bytes[2] = (unsigned char)(( 2u << 6) | ( cp & 0x3Fu)); + bytes[1] = (unsigned char)(( 2u << 6) | ((cp >> 6) & 0x3Fu)); + bytes[0] = (unsigned char)((14u << 4) | (cp >> 12)); + return 3; + } + + if(cp <= 0x10FFFFul) + { + bytes[3] = (unsigned char)(( 2u << 6) | ( cp & 0x3Fu)); + bytes[2] = (unsigned char)(( 2u << 6) | ((cp >> 6) & 0x3Fu)); + bytes[1] = (unsigned char)(( 2u << 6) | ((cp >> 12) & 0x3Fu)); + bytes[0] = (unsigned char)((30u << 3) | (cp >> 18)); + return 4; + } + + return 0; +} + +static _Bool parse_entity(const char *current, char **to, + const char **from, size_t len) +{ + const char *end = (const char *)memchr(current, ';', len); + if(!end) return 0; + + if(current[1] == '#') + { + char *tail = NULL; + errno = 0; + + _Bool hex = current[2] == 'x' || current[2] == 'X'; + + unsigned long cp = strtoul( + current + (hex ? 3 : 2), &tail, hex ? 16 : 10); + + if(tail == end && !errno && cp <= UNICODE_MAX) + { + *to += putc_utf8(cp, *to); + *from = end + 1; + + return 1; + } + } + else + { + const char *entity = get_named_entity(¤t[1]); + if(entity) + { + size_t len = strlen(entity); + memcpy(*to, entity, len); + + *to += len; + *from = end + 1; + + return 1; + } + } + + return 0; +} + +size_t decode_html_entities_utf8(char *dest, const char *src, size_t len) +{ + if(!src) src = dest; + + char *to = dest; + const char *from = src; + + const char *current; + while((current = (const char*)memchr(from, '&', len))) + { + memcpy(to, from, (size_t)(current - from)); + to += current - from; + + if(parse_entity(current, &to, &from, len)) + continue; + + from = current; + *to++ = *from++; + } + + size_t remaining = strnlen(from, len); + + memcpy(to, from, remaining); + to += remaining; + + *to = 0; + return (size_t)(to - dest); +} |