diff options
Diffstat (limited to 'plugins/Variables/libxml/HTMLparser.h')
| -rw-r--r-- | plugins/Variables/libxml/HTMLparser.h | 159 | 
1 files changed, 159 insertions, 0 deletions
diff --git a/plugins/Variables/libxml/HTMLparser.h b/plugins/Variables/libxml/HTMLparser.h new file mode 100644 index 0000000000..c6d8899c30 --- /dev/null +++ b/plugins/Variables/libxml/HTMLparser.h @@ -0,0 +1,159 @@ +/* + * HTMLparser.h : interface for an HTML 4.0 non-verifying parser + * + * See Copyright for the status of this software. + * + * daniel@veillard.com + */ + +#ifndef __HTML_PARSER_H__ +#define __HTML_PARSER_H__ +#include <libxml/parser.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Most of the back-end structures from XML and HTML are shared. + */ +typedef xmlParserCtxt htmlParserCtxt; +typedef xmlParserCtxtPtr htmlParserCtxtPtr; +typedef xmlParserNodeInfo htmlParserNodeInfo; +typedef xmlSAXHandler htmlSAXHandler; +typedef xmlSAXHandlerPtr htmlSAXHandlerPtr; +typedef xmlParserInput htmlParserInput; +typedef xmlParserInputPtr htmlParserInputPtr; +typedef xmlDocPtr htmlDocPtr; +typedef xmlNodePtr htmlNodePtr; + +/* + * Internal description of an HTML element, representing HTML 4.01 + * and XHTML 1.0 (which share the same structure). + */ +typedef struct _htmlElemDesc htmlElemDesc; +typedef htmlElemDesc *htmlElemDescPtr; +struct _htmlElemDesc { +    const char *name;	/* The tag name */ +    char startTag;      /* Whether the start tag can be implied */ +    char endTag;        /* Whether the end tag can be implied */ +    char saveEndTag;    /* Whether the end tag should be saved */ +    char empty;         /* Is this an empty element ? */ +    char depr;          /* Is this a deprecated element ? */ +    char dtd;           /* 1: only in Loose DTD, 2: only Frameset one */ +    char isinline;      /* is this a block 0 or inline 1 element */ +    const char *desc;   /* the description */ + +/* NRK Jan.2003 + * New fields encapsulating HTML structure + * + * Bugs: + *	This is a very limited representation.  It fails to tell us when + *	an element *requires* subelements (we only have whether they're + *	allowed or not), and it doesn't tell us where CDATA and PCDATA + *	are allowed.  Some element relationships are not fully represented: + *	these are flagged with the word MODIFIER + */ +    const char** subelts;		/* allowed sub-elements of this element */ +    const char* defaultsubelt;	/* subelement for suggested auto-repair +					   if necessary or NULL */ +    const char** attrs_opt;		/* Optional Attributes */ +    const char** attrs_depr;		/* Additional deprecated attributes */ +    const char** attrs_req;		/* Required attributes */ +}; + +/* + * Internal description of an HTML entity. + */ +typedef struct _htmlEntityDesc htmlEntityDesc; +typedef htmlEntityDesc *htmlEntityDescPtr; +struct _htmlEntityDesc { +    unsigned int value;	/* the UNICODE value for the character */ +    const char *name;	/* The entity name */ +    const char *desc;   /* the description */ +}; + +/* + * There is only few public functions. + */ +const htmlElemDesc * 	htmlTagLookup	(const xmlChar *tag); +const htmlEntityDesc * 	htmlEntityLookup(const xmlChar *name); +const htmlEntityDesc * 	htmlEntityValueLookup(unsigned int value); + +int			htmlIsAutoClosed(htmlDocPtr doc, +					 htmlNodePtr elem); +int			htmlAutoCloseTag(htmlDocPtr doc, +					 const xmlChar *name, +					 htmlNodePtr elem); +const htmlEntityDesc * 	htmlParseEntityRef(htmlParserCtxtPtr ctxt, +					 xmlChar **str); +int			htmlParseCharRef(htmlParserCtxtPtr ctxt); +void			htmlParseElement(htmlParserCtxtPtr ctxt); + +int			htmlParseDocument(htmlParserCtxtPtr ctxt); +htmlDocPtr		htmlSAXParseDoc	(xmlChar *cur, +					 const char *encoding, +					 htmlSAXHandlerPtr sax, +					 void *userData); +htmlDocPtr		htmlParseDoc	(xmlChar *cur, +					 const char *encoding); +htmlDocPtr		htmlSAXParseFile(const char *filename, +					 const char *encoding, +					 htmlSAXHandlerPtr sax, +					 void *userData); +htmlDocPtr		htmlParseFile	(const char *filename, +					 const char *encoding); +int			UTF8ToHtml	(unsigned char *out, +					 int *outlen, +					 const unsigned char *in, +					 int *inlen); +int			htmlEncodeEntities(unsigned char *out, +					 int *outlen, +					 const unsigned char *in, +					 int *inlen, int quoteChar); +int			htmlIsScriptAttribute(const xmlChar *name); +int			htmlHandleOmittedElem(int val); + +/** + * Interfaces for the Push mode. + */ +void			htmlFreeParserCtxt	(htmlParserCtxtPtr ctxt); +htmlParserCtxtPtr	htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, +						 void *user_data, +						 const char *chunk, +						 int size, +						 const char *filename, +						 xmlCharEncoding enc); +int			htmlParseChunk		(htmlParserCtxtPtr ctxt, +						 const char *chunk, +						 int size, +						 int terminate); + +/* NRK/Jan2003: further knowledge of HTML structure + */ +typedef enum { +  HTML_NA = 0 ,		/* something we don't check at all */ +  HTML_INVALID = 0x1 , +  HTML_DEPRECATED = 0x2 , +  HTML_VALID = 0x4 , +  HTML_REQUIRED = 0xc /* VALID bit set so ( & HTML_VALID ) is TRUE */ +} htmlStatus ; + +/* Using htmlElemDesc rather than name here, to emphasise the fact +   that otherwise there's a lookup overhead +*/ +htmlStatus htmlAttrAllowed(const htmlElemDesc*, const xmlChar*, int) ; +int htmlElementAllowedHere(const htmlElemDesc*, const xmlChar*) ; +htmlStatus htmlElementStatusHere(const htmlElemDesc*, const htmlElemDesc*) ; +htmlStatus htmlNodeStatus(const htmlNodePtr, int) ; +#define htmlDefaultSubelement(elt) elt->defaultsubelt +#define htmlElementAllowedHereDesc(parent,elt) \ +	htmlElementAllowedHere((parent), (elt)->name) +#define htmlRequiredAttrs(elt) (elt)->attrs_req + + +#ifdef __cplusplus +} +#endif + +#endif /* __HTML_PARSER_H__ */  | 
