diff options
Diffstat (limited to 'plugins/UserInfoEx/src/ex_import/tinyxmlparser.cpp')
-rw-r--r-- | plugins/UserInfoEx/src/ex_import/tinyxmlparser.cpp | 1613 |
1 files changed, 1613 insertions, 0 deletions
diff --git a/plugins/UserInfoEx/src/ex_import/tinyxmlparser.cpp b/plugins/UserInfoEx/src/ex_import/tinyxmlparser.cpp new file mode 100644 index 0000000000..73f2c18679 --- /dev/null +++ b/plugins/UserInfoEx/src/ex_import/tinyxmlparser.cpp @@ -0,0 +1,1613 @@ +/* +www.sourceforge.net/projects/tinyxml +Original code (2.0 and earlier)copyright (c) 2000-2002 Lee Thomason (www.grinninglizard.com) + +This software is provided 'as-is', without any express or implied +warranty. In no event will the authors be held liable for any +damages arising from the use of this software. + +Permission is granted to anyone to use this software for any +purpose, including commercial applications, and to alter it and +redistribute it freely, subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must +not claim that you wrote the original software. If you use this +software in a product, an acknowledgment in the product documentation +would be appreciated but is not required. + +2. Altered source versions must be plainly marked as such, and +must not be misrepresented as being the original software. + +3. This notice may not be removed or altered from any source +distribution. + +=============================================================================== + +UserinfoEx plugin for Miranda IM + +Copyright: +© 2006-2010 DeathAxe, Yasnovidyashii, Merlin, K. 
Romanov, Kreol + +File name : $HeadURL: https://userinfoex.googlecode.com/svn/trunk/ex_import/tinyxmlparser.cpp $ +Revision : $Revision: 187 $ +Last change on : $Date: 2010-09-08 16:05:54 +0400 (ะกั, 08 ัะตะฝ 2010) $ +Last change by : $Author: ing.u.horn $ + +=============================================================================== +*/ + +#include <ctype.h> +#include <stddef.h> + +#ifdef USE_MMGR +#include <string.h> +#include <assert.h> +#include <stdio.h> +#include "mmgr.h" +#endif + +#include "tinyxml.h" + +//#define DEBUG_PARSER +#if defined(DEBUG_PARSER) +# if defined(DEBUG) && defined(_MSC_VER) +# include <windows.h> +# define TIXML_LOG OutputDebugString +# else +# define TIXML_LOG printf +# endif +#endif + +// Note tha "PutString" hardcodes the same list. This +// is less flexible than it appears. Changing the entries +// or order will break putstring. +TiXmlBase::Entity TiXmlBase::entity[ NUM_ENTITY ] = +{ + { "&", 5, '&' }, + { "<", 4, '<' }, + { ">", 4, '>' }, + { """, 6, '\"' }, + { "'", 6, '\'' } +}; + +// Bunch of unicode info at: +// http://www.unicode.org/faq/utf_bom.html +// Including the basic of this table, which determines the #bytes in the +// sequence from the lead byte. 1 placed for invalid sequences -- +// although the result will be junk, pass it through as much as possible. 
+// Beware of the non-characters in UTF-8: +// ef bb bf (Microsoft "lead bytes") +// ef bf be +// ef bf bf + +const unsigned char TIXML_UTF_LEAD_0 = 0xefU; +const unsigned char TIXML_UTF_LEAD_1 = 0xbbU; +const unsigned char TIXML_UTF_LEAD_2 = 0xbfU; + +const int TiXmlBase::utf8ByteTable[256] = +{ + // 0 1 2 3 4 5 6 7 8 9 a b c d e f + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x00 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x10 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x20 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x30 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x40 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x50 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x60 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x70 End of ASCII range + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x80 0x80 to 0xc1 invalid + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x90 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xa0 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xb0 + 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xc0 0xc2 to 0xdf 2 byte + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xd0 + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xe0 0xe0 to 0xef 3 byte + 4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // 0xf0 0xf0 to 0xf4 4 byte, 0xf5 and higher invalid +}; + + +void TiXmlBase::ConvertUTF32ToUTF8(unsigned long input, char* output, int* length) +{ + const unsigned long BYTE_MASK = 0xBF; + const unsigned long BYTE_MARK = 0x80; + const unsigned long FIRST_BYTE_MARK[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; + + if (input < 0x80) + *length = 1; + else if (input < 0x800) + *length = 2; + else if (input < 0x10000) + *length = 3; + else if (input < 0x200000) + *length = 4; + else + { *length = 0; return; } // This code won't covert this correctly anyway. + + output += *length; + + // Scary scary fall throughs. 
+ switch (*length) + { + case 4: + --output; + *output = (char)((input | BYTE_MARK) & BYTE_MASK); + input >>= 6; + case 3: + --output; + *output = (char)((input | BYTE_MARK) & BYTE_MASK); + input >>= 6; + case 2: + --output; + *output = (char)((input | BYTE_MARK) & BYTE_MASK); + input >>= 6; + case 1: + --output; + *output = (char)(input | FIRST_BYTE_MARK[*length]); + } +} + + +/*static*/ int TiXmlBase::IsAlpha(unsigned char anyByte, TiXmlEncoding /*encoding*/) +{ + // This will only work for low-ascii, everything else is assumed to be a valid + // letter. I'm not sure this is the best approach, but it is quite tricky trying + // to figure out alhabetical vs. not across encoding. So take a very + // conservative approach. + +// if (encoding == TIXML_ENCODING_UTF8) +// { + if (anyByte < 127) + return isalpha(anyByte); + else + return 1; // What else to do? The unicode set is huge...get the english ones right. +// } +// else +// { +// return isalpha(anyByte); +// } +} + + +/*static*/ int TiXmlBase::IsAlphaNum(unsigned char anyByte, TiXmlEncoding /*encoding*/) +{ + // This will only work for low-ascii, everything else is assumed to be a valid + // letter. I'm not sure this is the best approach, but it is quite tricky trying + // to figure out alhabetical vs. not across encoding. So take a very + // conservative approach. + +// if (encoding == TIXML_ENCODING_UTF8) +// { + if (anyByte < 127) + return isalnum(anyByte); + else + return 1; // What else to do? The unicode set is huge...get the english ones right. +// } +// else +// { +// return isalnum(anyByte); +// } +} + + +class TiXmlParsingData +{ + friend class TiXmlDocument; + public: + void Stamp(const char* now, TiXmlEncoding encoding); + + const TiXmlCursor& Cursor() { return cursor; } + + private: + // Only used by the document! 
+ TiXmlParsingData(const char* start, int _tabsize, int row, int col) + { + assert(start); + stamp = start; + tabsize = _tabsize; + cursor.row = row; + cursor.col = col; + } + + TiXmlCursor cursor; + const char* stamp; + int tabsize; +}; + + +void TiXmlParsingData::Stamp(const char* now, TiXmlEncoding encoding) +{ + assert(now); + + // Do nothing if the tabsize is 0. + if (tabsize < 1) + { + return; + } + + // Get the current row, column. + int row = cursor.row; + int col = cursor.col; + const char* p = stamp; + assert(p); + + while (p < now) + { + // Treat p as unsigned, so we have a happy compiler. + const unsigned char* pU = (const unsigned char*)p; + + // Code contributed by Fletcher Dunn: (modified by lee) + switch (*pU) { + case 0: + // We *should* never get here, but in case we do, don't + // advance past the terminating null character, ever + return; + + case '\r': + // bump down to the next line + ++row; + col = 0; + // Eat the character + ++p; + + // Check for \r\n sequence, and treat this as a single character + if (*p == '\n') { + ++p; + } + break; + + case '\n': + // bump down to the next line + ++row; + col = 0; + + // Eat the character + ++p; + + // Check for \n\r sequence, and treat this as a single + // character. (Yes, this bizarre thing does occur still + // on some arcane platforms...) + if (*p == '\r') { + ++p; + } + break; + + case '\t': + // Eat the character + ++p; + + // Skip to next tab stop + col = (col / tabsize + 1) * tabsize; + break; + + case TIXML_UTF_LEAD_0: + if (encoding == TIXML_ENCODING_UTF8) + { + if (*(p+1) && *(p+2)) + { + // In these cases, don't advance the column. These are + // 0-width spaces. + if (*(pU+1)==TIXML_UTF_LEAD_1 && *(pU+2)==TIXML_UTF_LEAD_2) + p += 3; + else if (*(pU+1)==0xbfU && *(pU+2)==0xbeU) + p += 3; + else if (*(pU+1)==0xbfU && *(pU+2)==0xbfU) + p += 3; + else + { p +=3; ++col; } // A normal character. 
+ } + } + else + { + ++p; + ++col; + } + break; + + default: + if (encoding == TIXML_ENCODING_UTF8) + { + // Eat the 1 to 4 byte utf8 character. + int step = TiXmlBase::utf8ByteTable[*((unsigned char*)p)]; + if (step == 0) + step = 1; // Error case from bad encoding, but handle gracefully. + p += step; + + // Just advance one column, of course. + ++col; + } + else + { + ++p; + ++col; + } + break; + } + } + cursor.row = row; + cursor.col = col; + assert(cursor.row >= -1); + assert(cursor.col >= -1); + stamp = p; + assert(stamp); +} + + +const char* TiXmlBase::SkipWhiteSpace(const char* p, TiXmlEncoding encoding) +{ + if (!p || !*p) + { + return 0; + } + if (encoding == TIXML_ENCODING_UTF8) + { + while (*p) + { + const unsigned char* pU = (const unsigned char*)p; + + // Skip the stupid Microsoft UTF-8 Byte order marks + if ( *(pU+0)==TIXML_UTF_LEAD_0 + && *(pU+1)==TIXML_UTF_LEAD_1 + && *(pU+2)==TIXML_UTF_LEAD_2) + { + p += 3; + continue; + } + else if (*(pU+0)==TIXML_UTF_LEAD_0 + && *(pU+1)==0xbfU + && *(pU+2)==0xbeU) + { + p += 3; + continue; + } + else if (*(pU+0)==TIXML_UTF_LEAD_0 + && *(pU+1)==0xbfU + && *(pU+2)==0xbfU) + { + p += 3; + continue; + } + + if (IsWhiteSpace(*p) || *p == '\n' || *p =='\r') // Still using old rules for white space. + ++p; + else + break; + } + } + else + { + while (*p && IsWhiteSpace(*p) || *p == '\n' || *p =='\r') + ++p; + } + + return p; +} + +#ifdef TIXML_USE_STL +/*static*/ bool TiXmlBase::StreamWhiteSpace(TIXML_ISTREAM * in, TIXML_STRING * tag) +{ + for (;;) + { + if (!in->good()) return false; + + int c = in->peek(); + // At this scope, we can't get to a document. So fail silently. 
+ if (!IsWhiteSpace(c) || c <= 0) + return true; + + *tag += (char) in->get(); + } +} + +/*static*/ bool TiXmlBase::StreamTo(TIXML_ISTREAM * in, int character, TIXML_STRING * tag) +{ + //assert(character > 0 && character < 128); // else it won't work in utf-8 + while (in->good()) + { + int c = in->peek(); + if (c == character) + return true; + if (c <= 0) // Silent failure: can't get document at this scope + return false; + + in->get(); + *tag += (char) c; + } + return false; +} +#endif + +const char* TiXmlBase::ReadName(const char* p, TIXML_STRING * name, TiXmlEncoding encoding) +{ + *name = ""; + assert(p); + + // Names start with letters or underscores. + // Of course, in unicode, tinyxml has no idea what a letter *is*. The + // algorithm is generous. + // + // After that, they can be letters, underscores, numbers, + // hyphens, or colons. (Colons are valid ony for namespaces, + // but tinyxml can't tell namespaces from names.) + if ( p && *p + && (IsAlpha((unsigned char) *p, encoding) || *p == '_')) + { + while ( p && *p + && ( IsAlphaNum((unsigned char) *p, encoding) + || *p == '_' + || *p == '-' + || *p == '.' + || *p == ':')) + { + (*name) += *p; + ++p; + } + return p; + } + return 0; +} + +const char* TiXmlBase::GetEntity(const char* p, char* value, int* length, TiXmlEncoding encoding) +{ + // Presume an entity, and pull it out. + TIXML_STRING ent; + int i; + *length = 0; + + if (*(p+1) && *(p+1) == '#' && *(p+2)) + { + unsigned long ucs = 0; + ptrdiff_t delta = 0; + unsigned mult = 1; + + if (*(p+2) == 'x') + { + // Hexadecimal. + if (!*(p+3)) return 0; + + const char* q = p+3; + q = strchr(q, ';'); + + if (!q || !*q) return 0; + + delta = q-p; + --q; + + while (*q != 'x') + { + if (*q >= '0' && *q <= '9') + ucs += mult * (*q - '0'); + else if (*q >= 'a' && *q <= 'f') + ucs += mult * (*q - 'a' + 10); + else if (*q >= 'A' && *q <= 'F') + ucs += mult * (*q - 'A' + 10); + else + return 0; + mult *= 16; + --q; + } + } + else + { + // Decimal. 
+ if (!*(p+2)) return 0; + + const char* q = p+2; + q = strchr(q, ';'); + + if (!q || !*q) return 0; + + delta = q-p; + --q; + + while (*q != '#') + { + if (*q >= '0' && *q <= '9') + ucs += mult * (*q - '0'); + else + return 0; + mult *= 10; + --q; + } + } + if (encoding == TIXML_ENCODING_UTF8) + { + // convert the UCS to UTF-8 + ConvertUTF32ToUTF8(ucs, value, length); + } + else + { + *value = (char)ucs; + *length = 1; + } + return p + delta + 1; + } + + // Now try to match it. + for (i=0; i<NUM_ENTITY; ++i) + { + if (strncmp(entity[i].str, p, entity[i].strLength) == 0) + { + assert(strlen(entity[i].str) == entity[i].strLength); + *value = entity[i].chr; + *length = 1; + return (p + entity[i].strLength); + } + } + + // So it wasn't an entity, its unrecognized, or something like that. + *value = *p; // Don't put back the last one, since we return it! + //*length = 1; // Leave unrecognized entities - this doesn't really work. + // Just writes strange XML. + return p+1; +} + + +bool TiXmlBase::StringEqual(const char* p, + const char* tag, + bool ignoreCase, + TiXmlEncoding encoding) +{ + assert(p); + assert(tag); + if (!p || !*p) + { + assert(0); + return false; + } + + const char* q = p; + + if (ignoreCase) + { + while (*q && *tag && ToLower(*q, encoding) == ToLower(*tag, encoding)) + { + ++q; + ++tag; + } + + if (*tag == 0) + return true; + } + else + { + while (*q && *tag && *q == *tag) + { + ++q; + ++tag; + } + + if (*tag == 0) // Have we found the end of the tag, and everything equal? + return true; + } + return false; +} + +const char* TiXmlBase::ReadText( const char* p, + TIXML_STRING * text, + bool trimWhiteSpace, + const char* endTag, + bool caseInsensitive, + TiXmlEncoding encoding) +{ + *text = ""; + if ( !trimWhiteSpace // certain tags always keep whitespace + || !condenseWhiteSpace) // if true, whitespace is always kept + { + // Keep all the white space. 
+ while ( p && *p + && !StringEqual(p, endTag, caseInsensitive, encoding) + ) + { + int len; + char cArr[4] = { 0, 0, 0, 0 }; + p = GetChar(p, cArr, &len, encoding); + text->append(cArr, len); + } + } + else + { + bool whitespace = false; + + // Remove leading white space: + p = SkipWhiteSpace(p, encoding); + while ( p && *p + && !StringEqual(p, endTag, caseInsensitive, encoding)) + { + if (*p == '\r' || *p == '\n') + { + whitespace = true; + ++p; + } + else if (IsWhiteSpace(*p)) + { + whitespace = true; + ++p; + } + else + { + // If we've found whitespace, add it before the + // new character. Any whitespace just becomes a space. + if (whitespace) + { + (*text) += ' '; + whitespace = false; + } + int len; + char cArr[4] = { 0, 0, 0, 0 }; + p = GetChar(p, cArr, &len, encoding); + if (len == 1) + (*text) += cArr[0]; // more efficient + else + text->append(cArr, len); + } + } + } + return p + strlen(endTag); +} + +#ifdef TIXML_USE_STL + +void TiXmlDocument::StreamIn(TIXML_ISTREAM * in, TIXML_STRING * tag) +{ + // The basic issue with a document is that we don't know what we're + // streaming. Read something presumed to be a tag (and hope), then + // identify it, and call the appropriate stream method on the tag. + // + // This "pre-streaming" will never read the closing ">" so the + // sub-tag can orient itself. + + if (!StreamTo(in, '<', tag)) + { + SetError(TIXML_ERROR_PARSING_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN); + return; + } + + while (in->good()) + { + int tagIndex = (int) tag->length(); + while (in->good() && in->peek() != '>') + { + int c = in->get(); + if (c <= 0) + { + SetError(TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN); + break; + } + (*tag) += (char) c; + } + + if (in->good()) + { + // We now have something we presume to be a node of + // some sort. Identify it, and call the node to + // continue streaming. 
+ TiXmlNode* node = Identify(tag->c_str() + tagIndex, TIXML_DEFAULT_ENCODING); + + if (node) + { + node->StreamIn(in, tag); + bool isElement = node->ToElement() != 0; + delete node; + node = 0; + + // If this is the root element, we're done. Parsing will be + // done by the >> operator. + if (isElement) + { + return; + } + } + else + { + SetError(TIXML_ERROR, 0, 0, TIXML_ENCODING_UNKNOWN); + return; + } + } + } + // We should have returned sooner. + SetError(TIXML_ERROR, 0, 0, TIXML_ENCODING_UNKNOWN); +} + +#endif + +const char* TiXmlDocument::Parse(const char* p, TiXmlParsingData* prevData, TiXmlEncoding encoding) +{ + ClearError(); + + // Parse away, at the document level. Since a document + // contains nothing but other tags, most of what happens + // here is skipping white space. + if (!p || !*p) + { + SetError(TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN); + return 0; + } + + // Note that, for a document, this needs to come + // before the while space skip, so that parsing + // starts from the pointer we are given. + location.Clear(); + if (prevData) + { + location.row = prevData->cursor.row; + location.col = prevData->cursor.col; + } + else + { + location.row = 0; + location.col = 0; + } + TiXmlParsingData data(p, TabSize(), location.row, location.col); + location = data.Cursor(); + + if (encoding == TIXML_ENCODING_UNKNOWN) + { + // Check for the Microsoft UTF-8 lead bytes. 
+ const unsigned char* pU = (const unsigned char*)p; + if ( *(pU+0) && *(pU+0) == TIXML_UTF_LEAD_0 + && *(pU+1) && *(pU+1) == TIXML_UTF_LEAD_1 + && *(pU+2) && *(pU+2) == TIXML_UTF_LEAD_2) + { + encoding = TIXML_ENCODING_UTF8; + useMicrosoftBOM = true; + } + } + + p = SkipWhiteSpace(p, encoding); + if (!p) + { + SetError(TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN); + return 0; + } + + while (p && *p) + { + TiXmlNode* node = Identify(p, encoding); + if (node) + { + p = node->Parse(p, &data, encoding); + LinkEndChild(node); + } + else + { + break; + } + + // Did we get encoding info? + if ( encoding == TIXML_ENCODING_UNKNOWN + && node->ToDeclaration()) + { + TiXmlDeclaration* dec = node->ToDeclaration(); + const char* enc = dec->Encoding(); + assert(enc); + + if (*enc == 0) + encoding = TIXML_ENCODING_UTF8; + else if (StringEqual(enc, "UTF-8", true, TIXML_ENCODING_UNKNOWN)) + encoding = TIXML_ENCODING_UTF8; + else if (StringEqual(enc, "UTF8", true, TIXML_ENCODING_UNKNOWN)) + encoding = TIXML_ENCODING_UTF8; // incorrect, but be nice + else + encoding = TIXML_ENCODING_LEGACY; + } + + p = SkipWhiteSpace(p, encoding); + } + + // Was this empty? + if (!firstChild) { + SetError(TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, encoding); + return 0; + } + + // All is well. + return p; +} + +void TiXmlDocument::SetError(int err, const char* pError, TiXmlParsingData* data, TiXmlEncoding encoding) +{ + // The first error in a chain is more accurate - don't set again! 
+ if (error) + return; + + assert(err > 0 && err < TIXML_ERROR_STRING_COUNT); + error = true; + errorId = err; + errorDesc = errorString[ errorId ]; + + errorLocation.Clear(); + if (pError && data) + { + data->Stamp(pError, encoding); + errorLocation = data->Cursor(); + } +} + + +TiXmlNode* TiXmlNode::Identify(const char* p, TiXmlEncoding encoding) +{ + TiXmlNode* returnNode = 0; + + p = SkipWhiteSpace(p, encoding); + if (!p || !*p || *p != '<') + { + return 0; + } + + TiXmlDocument* doc = GetDocument(); + p = SkipWhiteSpace(p, encoding); + + if (!p || !*p) + { + return 0; + } + + // What is this thing? + // - Elements start with a letter or underscore, but xml is reserved. + // - Comments: <!-- + // - Decleration: <?xml + // - Everthing else is unknown to tinyxml. + // + + const char* xmlHeader = { "<?xml" }; + const char* commentHeader = { "<!--" }; + const char* dtdHeader = { "<!" }; + const char* cdataHeader = { "<![CDATA[" }; + + if (StringEqual(p, xmlHeader, true, encoding)) + { + #ifdef DEBUG_PARSER + TIXML_LOG("XML parsing Declaration\n"); + #endif + returnNode = new TiXmlDeclaration(); + } + else if (StringEqual(p, commentHeader, false, encoding)) + { + #ifdef DEBUG_PARSER + TIXML_LOG("XML parsing Comment\n"); + #endif + returnNode = new TiXmlComment(); + } + else if (StringEqual(p, cdataHeader, false, encoding)) + { + #ifdef DEBUG_PARSER + TIXML_LOG("XML parsing CDATA\n"); + #endif + TiXmlText* text = new TiXmlText(""); + text->SetCDATA(true); + returnNode = text; + } + else if (StringEqual(p, dtdHeader, false, encoding)) + { + #ifdef DEBUG_PARSER + TIXML_LOG("XML parsing Unknown(1)\n"); + #endif + returnNode = new TiXmlUnknown(); + } + else if ( IsAlpha(*(p+1), encoding) + || *(p+1) == '_') + { + #ifdef DEBUG_PARSER + TIXML_LOG("XML parsing Element\n"); + #endif + returnNode = new TiXmlElement(""); + } + else + { + #ifdef DEBUG_PARSER + TIXML_LOG("XML parsing Unknown(2)\n"); + #endif + returnNode = new TiXmlUnknown(); + } + + if (returnNode) + { + // Set 
the parent, so it can report errors + returnNode->parent = this; + } + else + { + if (doc) + doc->SetError(TIXML_ERROR_OUT_OF_MEMORY, 0, 0, TIXML_ENCODING_UNKNOWN); + } + return returnNode; +} + +#ifdef TIXML_USE_STL + +void TiXmlElement::StreamIn (TIXML_ISTREAM * in, TIXML_STRING * tag) +{ + // We're called with some amount of pre-parsing. That is, some of "this" + // element is in "tag". Go ahead and stream to the closing ">" + while (in->good()) + { + int c = in->get(); + if (c <= 0) + { + TiXmlDocument* document = GetDocument(); + if (document) + document->SetError(TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN); + return; + } + (*tag) += (char) c ; + + if (c == '>') + break; + } + + if (tag->length() < 3) return; + + // Okay...if we are a "/>" tag, then we're done. We've read a complete tag. + // If not, identify and stream. + + if ( tag->at(tag->length() - 1) == '>' + && tag->at(tag->length() - 2) == '/') + { + // All good! + return; + } + else if (tag->at(tag->length() - 1) == '>') + { + // There is more. Could be: + // text + // closing tag + // another node. + for (;;) + { + StreamWhiteSpace(in, tag); + + // Do we have text? + if (in->good() && in->peek() != '<') + { + // Yep, text. + TiXmlText text(""); + text.StreamIn(in, tag); + + // What follows text is a closing tag or another node. + // Go around again and figure it out. + continue; + } + + // We now have either a closing tag...or another node. + // We should be at a "<", regardless. 
+ if (!in->good()) return; + assert(in->peek() == '<'); + int tagIndex = (int) tag->length(); + + bool closingTag = false; + bool firstCharFound = false; + + for (;;) + { + if (!in->good()) + return; + + int c = in->peek(); + if (c <= 0) + { + TiXmlDocument* document = GetDocument(); + if (document) + document->SetError(TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN); + return; + } + + if (c == '>') + break; + + *tag += (char) c; + in->get(); + + if (!firstCharFound && c != '<' && !IsWhiteSpace(c)) + { + firstCharFound = true; + if (c == '/') + closingTag = true; + } + } + // If it was a closing tag, then read in the closing '>' to clean up the input stream. + // If it was not, the streaming will be done by the tag. + if (closingTag) + { + if (!in->good()) + return; + + int c = in->get(); + if (c <= 0) + { + TiXmlDocument* document = GetDocument(); + if (document) + document->SetError(TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN); + return; + } + assert(c == '>'); + *tag += (char) c; + + // We are done, once we've found our closing tag. + return; + } + else + { + // If not a closing tag, id it, and stream. + const char* tagloc = tag->c_str() + tagIndex; + TiXmlNode* node = Identify(tagloc, TIXML_DEFAULT_ENCODING); + if (!node) + return; + node->StreamIn(in, tag); + delete node; + node = 0; + + // No return: go around from the beginning: text, closing tag, or node. + } + } + } +} +#endif + +const char* TiXmlElement::Parse(const char* p, TiXmlParsingData* data, TiXmlEncoding encoding) +{ + p = SkipWhiteSpace(p, encoding); + TiXmlDocument* document = GetDocument(); + + if (!p || !*p) + { + if (document) document->SetError(TIXML_ERROR_PARSING_ELEMENT, 0, 0, encoding); + return 0; + } + + if (data) + { + data->Stamp(p, encoding); + location = data->Cursor(); + } + + if (*p != '<') + { + if (document) document->SetError(TIXML_ERROR_PARSING_ELEMENT, p, data, encoding); + return 0; + } + + p = SkipWhiteSpace(p+1, encoding); + + // Read the name. 
+ const char* pErr = p; + + p = ReadName(p, &value, encoding); + if (!p || !*p) + { + if (document) document->SetError(TIXML_ERROR_FAILED_TO_READ_ELEMENT_NAME, pErr, data, encoding); + return 0; + } + + TIXML_STRING endTag ("</"); + endTag += value; + endTag += ">"; + + // Check for and read attributes. Also look for an empty + // tag or an end tag. + while (p && *p) + { + pErr = p; + p = SkipWhiteSpace(p, encoding); + if (!p || !*p) + { + if (document) document->SetError(TIXML_ERROR_READING_ATTRIBUTES, pErr, data, encoding); + return 0; + } + if (*p == '/') + { + ++p; + // Empty tag. + if (*p != '>') + { + if (document) document->SetError(TIXML_ERROR_PARSING_EMPTY, p, data, encoding); + return 0; + } + return (p+1); + } + else if (*p == '>') + { + // Done with attributes (if there were any.) + // Read the value -- which can include other + // elements -- read the end tag, and return. + ++p; + p = ReadValue(p, data, encoding); // Note this is an Element method, and will set the error if one happens. 
+ if (!p || !*p) + return 0; + + // We should find the end tag now + if (StringEqual(p, endTag.c_str(), false, encoding)) + { + p += endTag.length(); + return p; + } + else + { + if (document) document->SetError(TIXML_ERROR_READING_END_TAG, p, data, encoding); + return 0; + } + } + else + { + // Try to read an attribute: + TiXmlAttribute* attrib = new TiXmlAttribute(); + if (!attrib) + { + if (document) document->SetError(TIXML_ERROR_OUT_OF_MEMORY, pErr, data, encoding); + return 0; + } + + attrib->SetDocument(document); + const char* pErr = p; + p = attrib->Parse(p, data, encoding); + + if (!p || !*p) + { + if (document) document->SetError(TIXML_ERROR_PARSING_ELEMENT, pErr, data, encoding); + delete attrib; + return 0; + } + + // Handle the strange case of double attributes: + TiXmlAttribute* node = attributeSet.Find(attrib->NameTStr()); + if (node) + { + node->SetValue(attrib->Value()); + delete attrib; + return 0; + } + + attributeSet.Add(attrib); + } + } + return p; +} + + +const char* TiXmlElement::ReadValue(const char* p, TiXmlParsingData* data, TiXmlEncoding encoding) +{ + TiXmlDocument* document = GetDocument(); + + // Read in text and elements in any order. + const char* pWithWhiteSpace = p; + p = SkipWhiteSpace(p, encoding); + + while (p && *p) + { + if (*p != '<') + { + // Take what we have, make a text element. + TiXmlText* textNode = new TiXmlText(""); + + if (!textNode) + { + if (document) document->SetError(TIXML_ERROR_OUT_OF_MEMORY, 0, 0, encoding); + return 0; + } + + if (TiXmlBase::IsWhiteSpaceCondensed()) + { + p = textNode->Parse(p, data, encoding); + } + else + { + // Special case: we want to keep the white space + // so that leading spaces aren't removed. + p = textNode->Parse(pWithWhiteSpace, data, encoding); + } + + if (!textNode->Blank()) + LinkEndChild(textNode); + else + delete textNode; + } + else + { + // We hit a '<' + // Have we hit a new element or an end tag? This could also be + // a TiXmlText in the "CDATA" style. 
+ if (StringEqual(p, "</", false, encoding)) + { + return p; + } + else + { + TiXmlNode* node = Identify(p, encoding); + if (node) + { + p = node->Parse(p, data, encoding); + LinkEndChild(node); + } + else + { + return 0; + } + } + } + pWithWhiteSpace = p; + p = SkipWhiteSpace(p, encoding); + } + + if (!p) + { + if (document) document->SetError(TIXML_ERROR_READING_ELEMENT_VALUE, 0, 0, encoding); + } + return p; +} + + +#ifdef TIXML_USE_STL +void TiXmlUnknown::StreamIn(TIXML_ISTREAM * in, TIXML_STRING * tag) +{ + while (in->good()) + { + int c = in->get(); + if (c <= 0) + { + TiXmlDocument* document = GetDocument(); + if (document) + document->SetError(TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN); + return; + } + (*tag) += (char) c; + + if (c == '>') + { + // All is well. + return; + } + } +} +#endif + + +const char* TiXmlUnknown::Parse(const char* p, TiXmlParsingData* data, TiXmlEncoding encoding) +{ + TiXmlDocument* document = GetDocument(); + p = SkipWhiteSpace(p, encoding); + + if (data) + { + data->Stamp(p, encoding); + location = data->Cursor(); + } + if (!p || !*p || *p != '<') + { + if (document) document->SetError(TIXML_ERROR_PARSING_UNKNOWN, p, data, encoding); + return 0; + } + ++p; + value = ""; + + while (p && *p && *p != '>') + { + value += *p; + ++p; + } + + if (!p) + { + if (document) document->SetError(TIXML_ERROR_PARSING_UNKNOWN, 0, 0, encoding); + } + if (*p == '>') + return p+1; + return p; +} + +#ifdef TIXML_USE_STL +void TiXmlComment::StreamIn(TIXML_ISTREAM * in, TIXML_STRING * tag) +{ + while (in->good()) + { + int c = in->get(); + if (c <= 0) + { + TiXmlDocument* document = GetDocument(); + if (document) + document->SetError(TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN); + return; + } + + (*tag) += (char) c; + + if (c == '>' + && tag->at(tag->length() - 2) == '-' + && tag->at(tag->length() - 3) == '-') + { + // All is well. 
+ return; + } + } +} +#endif + + +const char* TiXmlComment::Parse(const char* p, TiXmlParsingData* data, TiXmlEncoding encoding) +{ + TiXmlDocument* document = GetDocument(); + value = ""; + + p = SkipWhiteSpace(p, encoding); + + if (data) + { + data->Stamp(p, encoding); + location = data->Cursor(); + } + const char* startTag = "<!--"; + const char* endTag = "-->"; + + if (!StringEqual(p, startTag, false, encoding)) + { + document->SetError(TIXML_ERROR_PARSING_COMMENT, p, data, encoding); + return 0; + } + p += strlen(startTag); + p = ReadText(p, &value, false, endTag, false, encoding); + return p; +} + + +const char* TiXmlAttribute::Parse(const char* p, TiXmlParsingData* data, TiXmlEncoding encoding) +{ + p = SkipWhiteSpace(p, encoding); + if (!p || !*p) return 0; + +// int tabsize = 4; +// if (document) +// tabsize = document->TabSize(); + + if (data) + { + data->Stamp(p, encoding); + location = data->Cursor(); + } + // Read the name, the '=' and the value. + const char* pErr = p; + p = ReadName(p, &name, encoding); + if (!p || !*p) + { + if (document) document->SetError(TIXML_ERROR_READING_ATTRIBUTES, pErr, data, encoding); + return 0; + } + p = SkipWhiteSpace(p, encoding); + if (!p || !*p || *p != '=') + { + if (document) document->SetError(TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding); + return 0; + } + + ++p; // skip '=' + p = SkipWhiteSpace(p, encoding); + if (!p || !*p) + { + if (document) document->SetError(TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding); + return 0; + } + + const char* end; + const char SINGLE_QUOTE = '\''; + const char DOUBLE_QUOTE = '\"'; + + if (*p == SINGLE_QUOTE) + { + ++p; + end = "\'"; // single quote in string + p = ReadText(p, &value, false, end, false, encoding); + } + else if (*p == DOUBLE_QUOTE) + { + ++p; + end = "\""; // double quote in string + p = ReadText(p, &value, false, end, false, encoding); + } + else + { + // All attribute values should be in single or double quotes. 
+ // But this is such a common error that the parser will try + // its best, even without them. + value = ""; + while ( p && *p // existence + && !IsWhiteSpace(*p) && *p != '\n' && *p != '\r' // whitespace + && *p != '/' && *p != '>') // tag end + { + if (*p == SINGLE_QUOTE || *p == DOUBLE_QUOTE) { + // [ 1451649 ] Attribute values with trailing quotes not handled correctly + // We did not have an opening quote but seem to have a + // closing one. Give up and throw an error. + if (document) document->SetError(TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding); + return 0; + } + value += *p; + ++p; + } + } + return p; +} + +#ifdef TIXML_USE_STL +void TiXmlText::StreamIn(TIXML_ISTREAM * in, TIXML_STRING * tag) +{ + if (cdata) + { + int c = in->get(); + if (c <= 0) + { + TiXmlDocument* document = GetDocument(); + if (document) + document->SetError(TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN); + return; + } + + (*tag) += (char) c; + + if (c == '>' + && tag->at(tag->length() - 2) == ']' + && tag->at(tag->length() - 3) == ']') + { + // All is well. 
+ return; + } + } + else + { + while (in->good()) + { + int c = in->peek(); + if (c == '<') + return; + if (c <= 0) + { + TiXmlDocument* document = GetDocument(); + if (document) + document->SetError(TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN); + return; + } + + (*tag) += (char) c; + in->get(); + } + } +} +#endif + +const char* TiXmlText::Parse(const char* p, TiXmlParsingData* data, TiXmlEncoding encoding) +{ + value = ""; + TiXmlDocument* document = GetDocument(); + + if (data) + { + data->Stamp(p, encoding); + location = data->Cursor(); + } + + const char* const startTag = "<![CDATA["; + const char* const endTag = "]]>"; + + if (cdata || StringEqual(p, startTag, false, encoding)) + { + cdata = true; + + if (!StringEqual(p, startTag, false, encoding)) + { + document->SetError(TIXML_ERROR_PARSING_CDATA, p, data, encoding); + return 0; + } + p += strlen(startTag); + + // Keep all the white space, ignore the encoding, etc. + while ( p && *p + && !StringEqual(p, endTag, false, encoding) + ) + { + value += *p; + ++p; + } + + TIXML_STRING dummy; + p = ReadText(p, &dummy, false, endTag, false, encoding); + return p; + } + else + { + bool ignoreWhite = true; + + const char* end = "<"; + p = ReadText(p, &value, ignoreWhite, end, false, encoding); + if (p) + return p-1; // don't truncate the '<' + return 0; + } +} + +#ifdef TIXML_USE_STL +void TiXmlDeclaration::StreamIn(TIXML_ISTREAM * in, TIXML_STRING * tag) +{ + while (in->good()) + { + int c = in->get(); + if (c <= 0) + { + TiXmlDocument* document = GetDocument(); + if (document) + document->SetError(TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN); + return; + } + (*tag) += (char) c; + + if (c == '>') + { + // All is well. + return; + } + } +} +#endif + +const char* TiXmlDeclaration::Parse(const char* p, TiXmlParsingData* data, TiXmlEncoding _encoding) +{ + p = SkipWhiteSpace(p, _encoding); + // Find the beginning, find the end, and look for + // the stuff in-between. 
+ TiXmlDocument* document = GetDocument(); + if (!p || !*p || !StringEqual(p, "<?xml", true, _encoding)) + { + if (document) document->SetError(TIXML_ERROR_PARSING_DECLARATION, 0, 0, _encoding); + return 0; + } + if (data) + { + data->Stamp(p, _encoding); + location = data->Cursor(); + } + p += 5; + + version = ""; + encoding = ""; + standalone = ""; + + while (p && *p) + { + if (*p == '>') + { + ++p; + return p; + } + + p = SkipWhiteSpace(p, _encoding); + if (StringEqual(p, "version", true, _encoding)) + { + TiXmlAttribute attrib; + p = attrib.Parse(p, data, _encoding); + version = attrib.Value(); + } + else if (StringEqual(p, "encoding", true, _encoding)) + { + TiXmlAttribute attrib; + p = attrib.Parse(p, data, _encoding); + encoding = attrib.Value(); + } + else if (StringEqual(p, "standalone", true, _encoding)) + { + TiXmlAttribute attrib; + p = attrib.Parse(p, data, _encoding); + standalone = attrib.Value(); + } + else + { + // Read over whatever it is. + while (p && *p && *p != '>' && !IsWhiteSpace(*p)) + ++p; + } + } + return 0; +} + +bool TiXmlText::Blank() const +{ + for (unsigned i=0; i<value.length(); i++) + if (!IsWhiteSpace(value[i])) + return false; + return true; +} + |