/*
 * This code implements decoding encoded MIME header in style
 * =?iso-8859-2?Q? "User using email in central Europe characters such as =E9" ?=
 *
 * (c) majvan 2002-2004
 */

#include "../stdafx.h"

// Table of known charset names. A charset string matches an entry when it
// starts with the first string and the remainder is exactly the second string
// (see GetCharsetFromString); the last field is the code page that is returned.
// Entries are tested in order, so the first match wins.
struct _tcptable CodePageNamesAll[] =
{
	{ "ANSI", "", TRUE, CP_ACP },
	{ "WINDOWS-1", "250", 0, 1250 },
	{ "WINDOWS-1", "251", 0, 1251 },
	{ "WINDOWS-1", "252", 0, 1252 },
	{ "WINDOWS-1", "253", 0, 1253 },
	{ "WINDOWS-1", "254", 0, 1254 },
	{ "WINDOWS-1", "255", 0, 1255 },
	{ "WINDOWS-1", "256", 0, 1256 },
	{ "WINDOWS-1", "257", 0, 1257 },
	{ "WINDOWS-1", "258", 0, 1258 },
	{ "CP1", "250", 0, 1250 },
	{ "CP1", "251", 0, 1251 },
	{ "CP1", "252", 0, 1252 },
	{ "CP1", "253", 0, 1253 },
	{ "CP1", "254", 0, 1254 },
	{ "CP1", "255", 0, 1255 },
	{ "CP1", "256", 0, 1256 },
	{ "CP1", "257", 0, 1257 },
	{ "CP1", "258", 0, 1258 },
	{ "ANSI-1", "250", 0, 1250 },
	{ "ANSI-1", "251", 0, 1251 },
	{ "ANSI-1", "252", 0, 1252 },
	{ "ANSI-1", "253", 0, 1253 },
	{ "ANSI-1", "254", 0, 1254 },
	{ "ANSI-1", "255", 0, 1255 },
	{ "ANSI-1", "256", 0, 1256 },
	{ "ANSI-1", "257", 0, 1257 },
	{ "ANSI-1", "258", 0, 1258 },
	{ "KOI8", "-R", 0, 20866 },
	{ "KOI8", "", 0, 20866 },
	{ "KOI8", "-U", 0, 21866 },
	{ "KOI8", "-RU", 0, 21866 },
	{ "US-", "ASCII", 0, 20127 },
	{ "CP", "367", 0, 20127 },
	{ "ASCII", "", 0, 20127 },
	{ "ASCII", "7", 0, 20127 },
	{ "ISO-8859", "-1", 0, 28591 },
	{ "ISO-8859", "-2", 0, 28592 },
	{ "ISO-8859", "-3", 0, 28593 },
	{ "ISO-8859", "-4", 0, 28594 },
	{ "ISO-8859", "-5", 0, 28595 },
	{ "ISO-8859", "-6", 0, 28596 },
	{ "ISO-8859", "-7", 0, 28597 },
	{ "ISO-8859", "-8", 0, 28598 },
	{ "ISO-8859", "-9", 0, 28599 },
	{ "ISO-8859", "-15", 0, 28605 },
	{ "ISO_8859", "-1", 0, 28591 },
	{ "ISO_8859", "-2", 0, 28592 },
	{ "ISO_8859", "-3", 0, 28593 },
	{ "ISO_8859", "-4", 0, 28594 },
	{ "ISO_8859", "-5", 0, 28595 },
	{ "ISO_8859", "-6", 0, 28596 },
	{ "ISO_8859", "-7", 0, 28597 },
	{ "ISO_8859", "-8", 0, 28598 },
	{ "ISO_8859", "-9", 0, 28599 },
	{ "ISO_8859", "-15", 0, 28605 },
	{ "ISO-", "10646-USC2", 0, 1200 },
	{ "ISO-2022", "/2-JP", 0, 50220 },
	{ "ISO-2022", "-JP", 0, 50221 },
	{ "ISO-2022", "/JIS-JP", 0, 50222 },
	{ "ISO-2022", "-KR", 0, 50225 },
	{ "ISO-2022", "-CH(SP)", 0, 50227 },
	{ "ISO-2022", "-CH(TR)", 0, 50229 },
	{ "UTF-", "7", 0, 65000 },
	{ "UTF-", "8", 0, 65001 },
	{ "ARAB-", "TRANSPARENT", 0, 710 },
	{ "ASMO-", "TRANSPARENT", 0, 720 },
	{ "ASMO-", "449", 0, 709 },
	{ "ASMO-", "708", 0, 708 },
	{ "BIG5", "", 0, 950 },
	{ "EUC-", "CH(SP)", 0, 51936 },
	{ "EUC-", "CH(TR)", 0, 51950 },
	{ "EUC-", "JP", 0, 51932 },
	{ "EUC-", "KR", 0, 51949 },
	{ "GB-", "2312", 0, 20936 },
	{ "GB", "2312", 0, 20936 },
	{ "HZGB-", "2312", 0, 52936 },
	{ "IBM-", "037", 0, 37 },
	{ "IBM-", "290", 0, 290 },
	{ "IBM-", "437", 0, 437 },
	{ "IBM-", "500", 0, 500 },
	{ "IBM-", "775", 0, 775 },
	{ "IBM-", "850", 0, 850 },
	{ "IBM-", "852", 0, 852 },
	{ "IBM-", "855", 0, 855 },
	{ "IBM-", "857", 0, 857 },
	{ "IBM-", "860", 0, 860 },
	{ "IBM-", "861", 0, 861 },
	{ "IBM-", "862", 0, 862 },
	{ "IBM-", "863", 0, 863 },
	{ "IBM-", "864", 0, 864 },
	{ "IBM-", "865", 0, 865 },
	{ "IBM-", "866", 0, 866 },
	{ "IBM-", "869", 0, 869 },
	{ "IBM-", "870", 0, 870 },
	{ "IBM-", "875", 0, 875 },
	{ "IBM-", "1026", 0, 1026 },
	{ "IBM-", "273", 0, 20273 },
	{ "IBM-", "277", 0, 20277 },
	{ "IBM-", "278", 0, 20278 },
	{ "IBM-", "280", 0, 20280 },
	{ "IBM-", "284", 0, 20284 },
	{ "IBM-", "285", 0, 20285 },
	{ "IBM-", "290", 0, 20290 },
	{ "IBM-", "297", 0, 20297 },
	{ "IBM-", "420", 0, 20420 },
	{ "IBM-", "423", 0, 20423 },
	{ "IBM-", "871", 0, 20871 },
	{ "IBM-", "880", 0, 20880 },
	{ "IBM-", "905", 0, 20905 },
	{ "IBM-", "THAI", 0, 20838 },
	{ "ISCII-", "DEVANAGARI", 0, 57002 },
	{ "ISCII-", "BENGALI", 0, 57003 },
	{ "ISCII-", "TAMIL", 0, 57004 },
	{ "ISCII-", "TELUGU", 0, 57005 },
	{ "ISCII-", "ASSAMESE", 0, 57006 },
	{ "ISCII-", "ORIYA", 0, 57007 },
	{ "ISCII-", "KANNADA", 0, 57008 },
	{ "ISCII-", "MALAYALAM", 0, 57009 },
	{ "ISCII-", "GUJARATI", 0, 57010 },
	{ "ISCII-", "PUNJABI", 0, 57011 },
	{ "KOR-", "JOHAB", 0, 1361 },
	{ "KSC-", "5601", 0, 1361 },
	{ "MAC-", "ROMAN", 0, 10000 },
	{ "MAC-", "JP", 0, 10001 },
	{ "MAC-", "CH(SP)(BIG5)", 0, 10002 },
	{ "MAC-", "KR", 0, 10003 },
	{ "MAC-", "AR", 0, 10004 },
	{ "MAC-", "HW", 0, 10005 },
	{ "MAC-", "GR", 0, 10006 },
	{ "MAC-", "CY", 0, 10007 },
	{ "MAC-", "CH(SP)(GB2312)", 0, 10008 },
	{ "MAC-", "ROMANIA", 0, 10010 },
	{ "MAC-", "UA", 0, 10017 },
	{ "MAC-", "TH", 0, 10021 },
	{ "MAC-", "LAT2", 0, 10029 },
	{ "MAC-", "ICE", 0, 10079 },
	{ "MAC-", "TR", 0, 10081 },
	{ "MAC-", "CR", 0, 10082 }
};

// Number of entries in CodePageNamesAll
int CPLENALL = _countof(CodePageNamesAll);

// NOTE(review): not assigned anywhere in this file; presumably filled in
// elsewhere with the subset of code pages supported by the system - confirm.
struct _tcptable *CodePageNamesSupp;
int CPLENSUPP = 1;

// Advances p while WS(p) is false.
// NOTE(review): WS is a macro defined outside this file; whether it stops at
// the terminating zero cannot be verified from here.
void SkipNonSpaces(char *&p)
{
	while (!WS(p))
		p++;
}

// Advances p while WS(p) is true.
void SkipSpaces(char *&p)
{
	while (WS(p))
		p++;
}

/////////////////////////////////////////////////////////////////////////////////////////
// Gets codepage ID from string representing charset such as "iso-8859-1"
// input- the string (does not have to be zero-terminated if size is exact)
// size- max length of input string
// returns the code page from CodePageNamesAll, or -1 if the input is empty
//   or the charset is not in the table

int GetCharsetFromString(char *input, size_t size)
{
	if ((input == nullptr) || (size < 1))
		return -1;

	char *upper = new char[size + 1];
	if (upper == nullptr)
		return -1;

	// make an upper-case copy (ASCII letters only), the table holds capital names
	size_t n = 0;
	for (; (n < size) && (input[n] != 0); n++) {
		char c = input[n];
		if ((c >= 'a') && (c <= 'z'))
			c = c - ('a' - 'A');
		upper[n] = c;
	}
	upper[n] = 0;

#ifdef DEBUG_DECODECODEPAGE
	mir_writeLogA(DecodeFile, "%s", upper);
#endif

	int result = -1; // not found
	for (int i = 0; i < CPLENALL; i++) {
		size_t len = mir_strlen(CodePageNamesAll[i].NameBase);
		// name must start with NameBase and continue with exactly NameSub
		if ((0 == strncmp(upper, CodePageNamesAll[i].NameBase, len)) && (0 == mir_strcmp(upper + len, CodePageNamesAll[i].NameSub))) {
			result = CodePageNamesAll[i].CP;
			break;
		}
	}

	delete[] upper;
	return result;
}

/////////////////////////////////////////////////////////////////////////////////////////
// HexValue to DecValue ('a' to 10)
// HexValue- hexa value ('a'), both upper and lower case digits are accepted
// DecValue- pointer where to store dec value (written only on success)
// returns 0 if not success, 1 otherwise

int FromHexa(char HexValue, char *DecValue)
{
	if (HexValue >= '0' && HexValue <= '9') {
		*DecValue = HexValue - '0';
		return 1;
	}
	if (HexValue >= 'A' && HexValue <= 'F') {
		*DecValue = HexValue - 'A' + 10;
		return 1;
	}
	if (HexValue >= 'a' && HexValue <= 'f') {
		*DecValue = HexValue - 'a' + 10;
		return 1;
	}
	return 0;
}

/////////////////////////////////////////////////////////////////////////////////////////
// Decodes string in quoted printable
// Src- input string (zero-terminated)
// Dst- where to store output string; must have room for DstLen characters
//   plus the terminating zero
// DstLen- how max long should be output string (terminating zero not counted)
// isQ- if is "Q-encoding" modification. should be TRUE in headers
//   ('_' is decoded as space, soft line breaks are not recognized)
// always returns 1

int DecodeQuotedPrintable(char *Src, char *Dst, int DstLen, BOOL isQ)
{
#ifdef DEBUG_DECODEQUOTED
	char *DstTemp = Dst;
	mir_writeLogA(DecodeFile, "%s", Src);
#endif

	for (auto *Limit = Dst + DstLen; *Src != 0 && Dst < Limit; Src++) {
		if (*Src == '=') {
			Src++;
			if (*Src == 0) // lone '=' at the very end is dropped
				break;

			if (!isQ) {
				// soft line break: "=\r\n", "=\r" or "=\n" produce no output
				if (*Src == '\r') {
					if (Src[1] == '\n')
						Src++;
					continue;
				}
				if (*Src == '\n')
					continue;
			}

			char First, Second;
			if (!FromHexa(Src[0], &First) || !FromHexa(Src[1], &Second)) {
				// Not a valid "=XX" sequence: keep '=' literally and step back, so the
				// loop increment lands on the character after '=' and it is processed
				// as ordinary text instead of being lost.
				*Dst++ = '=';
				Src--;
				continue;
			}

			*Dst++ = (char)((First << 4) + Second);
			Src++; // second hex digit; the loop increment skips the first one
		}
		else if (isQ && *Src == '_')
			*Dst++ = ' ';
		else
			*Dst++ = *Src;
	}
	*Dst = 0;

#ifdef DEBUG_DECODEQUOTED
	mir_writeLogA(DecodeFile, "%s", DstTemp);
#endif
	return 1;
}

/////////////////////////////////////////////////////////////////////////////////////////
// Converts string to unicode from string with specified codepage
// stream- input string (zero-terminated)
// cp- codepage of input string; if the system does not know it, CP_ACP is used
// out- pointer to new allocated memory that contains unicode string.
//   If *out is not nullptr on entry, it must have been allocated with new[];
//   the converted text is appended to it and the old buffer is freed.
//   On return *out always points to a zero-terminated buffer allocated with
//   new[], which the caller has to free with delete[] (even on failure).
// returns 1 on success, 0 if the conversion failed (*out then holds only the
//   original content)

int ConvertStringToUnicode(char *stream, unsigned int cp, wchar_t **out)
{
	// codepages, which require to have set 0 in dwFlags parameter when calling MultiByteToWideChar
	static const uint32_t CodePagesZeroFlags[] = {
		50220, 50221, 50222, 50225, 50227, 50229, 52936, 54936, 57002, 57003,
		57004, 57005, 57006, 57007, 57008, 57009, 57010, 57011, 65000, 65001
	};

	CPINFO CPInfo;
	if ((cp != CP_ACP) && (cp != CP_OEMCP) && (cp != CP_MACCP) && (cp != CP_THREAD_ACP) && (cp != CP_SYMBOL) && (cp != CP_UTF7) && (cp != CP_UTF8) && !GetCPInfo(cp, &CPInfo))
		cp = CP_ACP;

#ifdef DEBUG_DECODECODEPAGE
	mir_writeLogA(DecodeFile, "%d", cp);
#endif

	uint32_t flags = MB_USEGLYPHCHARS;
	for (uint32_t zeroFlagCp : CodePagesZeroFlags) {
		if (zeroFlagCp == cp) {
			flags = 0;
			break;
		}
	}

	// first pass: number of wide characters needed, including the terminating zero
	int streamlen = MultiByteToWideChar(cp, flags, stream, -1, nullptr, 0);
	if (streamlen < 0)
		streamlen = 0;

	size_t outlen = (*out != nullptr) ? mir_wstrlen(*out) : 0;

	wchar_t *temp = new wchar_t[streamlen + outlen + 1];
	wchar_t *dest = temp;
	if (*out != nullptr) {
		// copy old string from *out to temp
		for (wchar_t *src = *out; *src != (wchar_t)0; src++, dest++)
			*dest = *src;
		delete[] *out;
	}
	*out = temp;

	// second pass: the real conversion, appended after the old content
	if ((streamlen == 0) || !MultiByteToWideChar(cp, flags, stream, -1, dest, streamlen)) {
		*dest = 0; // keep the result a valid string that holds only the old content
		return 0;
	}
	return 1;
}

/////////////////////////////////////////////////////////////////////////////////////////
// Converts string from MIME header to unicode
// stream- input string (zero-terminated); nullptr gives an empty result
// cp- codepage of input string. It is overwritten by the charset of every
//   encoded word that is found (CP_ACP when the charset is unknown); text
//   outside of encoded words is appended character by character as is.
// mode- MIME_PLAIN or MIME_MAIL (MIME_MAIL deletes '"' from start and end of string)
// returns the decoded string
//
// NOTE(review): CODES, CODED, CODEE, EOS and WS are macros defined outside this
// file; presumably they test for "=?", "?", "?=", end of string and whitespace.

CMStringW ConvertCodedStringToUnicode(char *stream, uint32_t cp, int mode)
{
	CMStringW ret;
	if (stream == nullptr)
		return ret;

	char *start = stream;
	SkipSpaces(start);

	while (*start != 0) {
		if (!CODES(start)) {
			// ordinary character
			ret.AppendChar(*start);
			start++;
			continue;
		}

		// charset name follows the start marker
		char *finder = start + 2;
		char *finderend = finder;
		while (!CODED(finderend) && !EOS(finderend))
			finderend++;

		start = finderend;
		if (!CODED(finderend)) {
			// no delimiter after the charset
			if (!EOS(start))
				start++;
			continue;
		}

		char Encoding = *(finderend + 1);
		if ((Encoding != 'b') && (Encoding != 'B') && (Encoding != 'q') && (Encoding != 'Q')) {
			// unknown encoding: the delimiter itself is copied and parsing goes on behind it
			ret.AppendChar(*start);
			start++;
			continue;
		}

		if (-1 == (cp = (uint32_t)GetCharsetFromString(finder, finderend - finder)))
			cp = CP_ACP;

		// encoded text begins behind the encoding letter and its delimiter
		finder = finderend + 2;
		if (CODED(finder))
			finder++;
		SkipSpaces(finder);

		finderend = finder;
		while (!CODEE(finderend) && !EOS(finderend))
			finderend++;

		char *pcodeend = nullptr;
		int codeend = CODEE(finderend);
		if (codeend)
			pcodeend = finderend;

		// Trim trailing whitespace, but never below the beginning of the text:
		// for an empty payload the characters in front of finder may be the
		// whitespace that was just skipped.
		while ((finderend > finder) && WS(finderend - 1))
			finderend--;

		// both quotes must be present (at least 2 characters), otherwise one
		// single '"' would be counted as the opening and the closing quote
		if ((mode == MIME_MAIL) && (finderend - finder >= 2) && (*finder == '"') && (*(finderend - 1) == '"')) {
			finder++;
			finderend--;
		}

		char *oneWordEncoded = new char[finderend - finder + 1];
		strncpy(oneWordEncoded, finder, finderend - finder);
		oneWordEncoded[finderend - finder] = 0;

		ptrA DecodedResult;
		switch (Encoding) {
		case 'b':
		case 'B':
			DecodedResult = (char*)mir_base64_decode(oneWordEncoded, 0);
			break;

		case 'q':
		case 'Q':
			{
				int size = (int)(finderend - finder + 1 + 1);
				DecodedResult = (char*)mir_alloc(size + 1);
				DecodeQuotedPrintable(oneWordEncoded, DecodedResult, size, TRUE);
			}
			break;
		}
		delete[] oneWordEncoded;

		if (codeend)
			finderend = pcodeend + 2;
		// one whitespace character right behind the encoded word is skipped
		if (WS(finderend))
			finderend++;

		char *decoded = DecodedResult;
		if (decoded != nullptr) {
			wchar_t *oneWord = nullptr;
			if (ConvertStringToUnicode(decoded, cp, &oneWord))
				ret.Append(oneWord);
			// allocated with new[] by ConvertStringToUnicode, also when it fails
			delete[] oneWord;
		}

		start = finderend;
	}

	return ret;
}