diff options
Diffstat (limited to 'plugins/Variables/libxml/HTMLparser.h')
-rw-r--r-- | plugins/Variables/libxml/HTMLparser.h | 159 |
1 files changed, 159 insertions, 0 deletions
diff --git a/plugins/Variables/libxml/HTMLparser.h b/plugins/Variables/libxml/HTMLparser.h new file mode 100644 index 0000000000..c6d8899c30 --- /dev/null +++ b/plugins/Variables/libxml/HTMLparser.h @@ -0,0 +1,159 @@ +/* + * HTMLparser.h : interface for an HTML 4.0 non-verifying parser + * + * See Copyright for the status of this software. + * + * daniel@veillard.com + */ + +#ifndef __HTML_PARSER_H__ +#define __HTML_PARSER_H__ +#include <libxml/parser.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Most of the back-end structures from XML and HTML are shared. + */ +typedef xmlParserCtxt htmlParserCtxt; +typedef xmlParserCtxtPtr htmlParserCtxtPtr; +typedef xmlParserNodeInfo htmlParserNodeInfo; +typedef xmlSAXHandler htmlSAXHandler; +typedef xmlSAXHandlerPtr htmlSAXHandlerPtr; +typedef xmlParserInput htmlParserInput; +typedef xmlParserInputPtr htmlParserInputPtr; +typedef xmlDocPtr htmlDocPtr; +typedef xmlNodePtr htmlNodePtr; + +/* + * Internal description of an HTML element, representing HTML 4.01 + * and XHTML 1.0 (which share the same structure). + */ +typedef struct _htmlElemDesc htmlElemDesc; +typedef htmlElemDesc *htmlElemDescPtr; +struct _htmlElemDesc { + const char *name; /* The tag name */ + char startTag; /* Whether the start tag can be implied */ + char endTag; /* Whether the end tag can be implied */ + char saveEndTag; /* Whether the end tag should be saved */ + char empty; /* Is this an empty element ? */ + char depr; /* Is this a deprecated element ? */ + char dtd; /* 1: only in Loose DTD, 2: only Frameset one */ + char isinline; /* is this a block 0 or inline 1 element */ + const char *desc; /* the description */ + +/* NRK Jan.2003 + * New fields encapsulating HTML structure + * + * Bugs: + * This is a very limited representation. It fails to tell us when + * an element *requires* subelements (we only have whether they're + * allowed or not), and it doesn't tell us where CDATA and PCDATA + * are allowed. Some element relationships are not fully represented: + * these are flagged with the word MODIFIER + */ + const char** subelts; /* allowed sub-elements of this element */ + const char* defaultsubelt; /* subelement for suggested auto-repair + if necessary or NULL */ + const char** attrs_opt; /* Optional Attributes */ + const char** attrs_depr; /* Additional deprecated attributes */ + const char** attrs_req; /* Required attributes */ +}; + +/* + * Internal description of an HTML entity. + */ +typedef struct _htmlEntityDesc htmlEntityDesc; +typedef htmlEntityDesc *htmlEntityDescPtr; +struct _htmlEntityDesc { + unsigned int value; /* the UNICODE value for the character */ + const char *name; /* The entity name */ + const char *desc; /* the description */ +}; + +/* + * There is only few public functions. + */ +const htmlElemDesc * htmlTagLookup (const xmlChar *tag); +const htmlEntityDesc * htmlEntityLookup(const xmlChar *name); +const htmlEntityDesc * htmlEntityValueLookup(unsigned int value); + +int htmlIsAutoClosed(htmlDocPtr doc, + htmlNodePtr elem); +int htmlAutoCloseTag(htmlDocPtr doc, + const xmlChar *name, + htmlNodePtr elem); +const htmlEntityDesc * htmlParseEntityRef(htmlParserCtxtPtr ctxt, + xmlChar **str); +int htmlParseCharRef(htmlParserCtxtPtr ctxt); +void htmlParseElement(htmlParserCtxtPtr ctxt); + +int htmlParseDocument(htmlParserCtxtPtr ctxt); +htmlDocPtr htmlSAXParseDoc (xmlChar *cur, + const char *encoding, + htmlSAXHandlerPtr sax, + void *userData); +htmlDocPtr htmlParseDoc (xmlChar *cur, + const char *encoding); +htmlDocPtr htmlSAXParseFile(const char *filename, + const char *encoding, + htmlSAXHandlerPtr sax, + void *userData); +htmlDocPtr htmlParseFile (const char *filename, + const char *encoding); +int UTF8ToHtml (unsigned char *out, + int *outlen, + const unsigned char *in, + int *inlen); +int htmlEncodeEntities(unsigned char *out, + int *outlen, + const unsigned char *in, + int *inlen, int quoteChar); +int htmlIsScriptAttribute(const xmlChar *name); +int htmlHandleOmittedElem(int val); + +/** + * Interfaces for the Push mode. + */ +void htmlFreeParserCtxt (htmlParserCtxtPtr ctxt); +htmlParserCtxtPtr htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, + void *user_data, + const char *chunk, + int size, + const char *filename, + xmlCharEncoding enc); +int htmlParseChunk (htmlParserCtxtPtr ctxt, + const char *chunk, + int size, + int terminate); + +/* NRK/Jan2003: further knowledge of HTML structure + */ +typedef enum { + HTML_NA = 0 , /* something we don't check at all */ + HTML_INVALID = 0x1 , + HTML_DEPRECATED = 0x2 , + HTML_VALID = 0x4 , + HTML_REQUIRED = 0xc /* VALID bit set so ( & HTML_VALID ) is TRUE */ +} htmlStatus ; + +/* Using htmlElemDesc rather than name here, to emphasise the fact + that otherwise there's a lookup overhead +*/ +htmlStatus htmlAttrAllowed(const htmlElemDesc*, const xmlChar*, int) ; +int htmlElementAllowedHere(const htmlElemDesc*, const xmlChar*) ; +htmlStatus htmlElementStatusHere(const htmlElemDesc*, const htmlElemDesc*) ; +htmlStatus htmlNodeStatus(const htmlNodePtr, int) ; +#define htmlDefaultSubelement(elt) elt->defaultsubelt +#define htmlElementAllowedHereDesc(parent,elt) \ + htmlElementAllowedHere((parent), (elt)->name) +#define htmlRequiredAttrs(elt) (elt)->attrs_req + + +#ifdef __cplusplus +} +#endif + +#endif /* __HTML_PARSER_H__ */ |