diff options
author | George Hazan <george.hazan@gmail.com> | 2024-03-18 12:13:54 +0300 |
---|---|---|
committer | George Hazan <george.hazan@gmail.com> | 2024-03-18 12:13:54 +0300 |
commit | 705c4d24c9c61edffc82864bf9c24384dc29a8d7 (patch) | |
tree | 4d21f87671db36b99402da3221d45b64c257c1fe /libs/litehtml/src/gumbo/include | |
parent | 5784fc3a62b9136c6690ed45ec7b505f35512e08 (diff) |
litehtml - lightweight html renderer
Diffstat (limited to 'libs/litehtml/src/gumbo/include')
18 files changed, 2178 insertions, 0 deletions
diff --git a/libs/litehtml/src/gumbo/include/gumbo.h b/libs/litehtml/src/gumbo/include/gumbo.h new file mode 100644 index 0000000000..27e6c6c575 --- /dev/null +++ b/libs/litehtml/src/gumbo/include/gumbo.h @@ -0,0 +1,675 @@ +// Copyright 2010 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Author: jdtang@google.com (Jonathan Tang) +// +// We use Gumbo as a prefix for types, gumbo_ as a prefix for functions, and +// GUMBO_ as a prefix for enum constants (static constants get the Google-style +// kGumbo prefix). + +/** + * @file + * @mainpage Gumbo HTML Parser + * + * This provides a conformant, no-dependencies implementation of the HTML5 + * parsing algorithm. It supports only UTF8; if you need to parse a different + * encoding, run a preprocessing step to convert to UTF8. It returns a parse + * tree made of the structs in this file. 
+ * + * Example: + * @code + * GumboOutput* output = gumbo_parse(input); + * do_something_with_doctype(output->document); + * do_something_with_html_tree(output->root); + * gumbo_destroy_output(&kGumboDefaultOptions, output); + * @endcode + * HTML5 Spec: + * + * http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html + */ + +#ifndef GUMBO_GUMBO_H_ +#define GUMBO_GUMBO_H_ + +#ifdef _MSC_VER +#ifndef _CRT_SECURE_NO_WARNINGS +#define _CRT_SECURE_NO_WARNINGS +#endif +#ifndef fileno +#define fileno _fileno +#endif +#endif + +#include <stdbool.h> +#include <stddef.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * A struct representing a character position within the original text buffer. + * Line and column numbers are 1-based and offsets are 0-based, which matches + * how most editors and command-line tools work. Also, columns measure + * positions in terms of characters while offsets measure by bytes; this is + * because the offset field is often used to pull out a particular region of + * text (which in most languages that bind to C implies pointer arithmetic on a + * buffer of bytes), while the column field is often used to reference a + * particular column on a printable display, which nowadays is usually UTF-8. + */ +typedef struct { + unsigned int line; + unsigned int column; + unsigned int offset; +} GumboSourcePosition; + +/** + * A SourcePosition used for elements that have no source position, i.e. + * parser-inserted elements. + */ +extern const GumboSourcePosition kGumboEmptySourcePosition; + +/** + * A struct representing a string or part of a string. Strings within the + * parser are represented by a char* and a length; the char* points into + * an existing data buffer owned by some other code (often the original input). + * GumboStringPieces are assumed (by convention) to be immutable, because they + * may share data. Use GumboStringBuffer if you need to construct a string. 
+ * Clients should assume that it is not NUL-terminated, and should always use + * explicit lengths when manipulating them. + */ +typedef struct { + /** A pointer to the beginning of the string. NULL iff length == 0. */ + const char* data; + + /** The length of the string fragment, in bytes. May be zero. */ + size_t length; +} GumboStringPiece; + +/** A constant to represent a 0-length null string. */ +extern const GumboStringPiece kGumboEmptyString; + +/** + * Compares two GumboStringPieces, and returns true if they're equal or false + * otherwise. + */ +bool gumbo_string_equals( + const GumboStringPiece* str1, const GumboStringPiece* str2); + +/** + * Compares two GumboStringPieces ignoring case, and returns true if they're + * equal or false otherwise. + */ +bool gumbo_string_equals_ignore_case( + const GumboStringPiece* str1, const GumboStringPiece* str2); + +/** + * A simple vector implementation. This stores a pointer to a data array and a + * length. All elements are stored as void*; client code must cast to the + * appropriate type. Overflows upon addition result in reallocation of the data + * array, with the size doubling to maintain O(1) amortized cost. There is no + * removal function, as this isn't needed for any of the operations within this + * library. Iteration can be done through inspecting the structure directly in + * a for-loop. + */ +typedef struct { + /** Data elements. This points to a dynamically-allocated array of capacity + * elements, each a void* to the element itself. + */ + void** data; + + /** Number of elements currently in the vector. */ + unsigned int length; + + /** Current array capacity. */ + unsigned int capacity; +} GumboVector; + +/** An empty (0-length, 0-capacity) GumboVector. */ +extern const GumboVector kGumboEmptyVector; + +/** + * Returns the first index at which an element appears in this vector (testing + * by pointer equality), or -1 if it never does. 
+ */ +int gumbo_vector_index_of(GumboVector* vector, const void* element); + +/** + * An enum for all the tags defined in the HTML5 standard. These correspond to + * the tag names themselves. Enum constants exist only for tags which appear in + * the spec itself (or for tags with special handling in the SVG and MathML + * namespaces); any other tags appear as GUMBO_TAG_UNKNOWN and the actual tag + * name can be obtained through original_tag. + * + * This is mostly for API convenience, so that clients of this library don't + * need to perform a strcasecmp to find the normalized tag name. It also has + * efficiency benefits, by letting the parser work with enums instead of + * strings. + */ +typedef enum { +// Load all the tags from an external source, generated from tag.in. +#include "gumbo/tag_enum.h" + // Used for all tags that don't have special handling in HTML. Add new tags + // to the end of tag.in so as to preserve backwards-compatibility. + GUMBO_TAG_UNKNOWN, + // A marker value to indicate the end of the enum, for iterating over it. + // Also used as the terminator for varargs functions that take tags. + GUMBO_TAG_LAST, +} GumboTag; + +/** + * Returns the normalized (usually all-lowercased, except for foreign content) + * tag name for an GumboTag enum. Return value is static data owned by the + * library. + */ +const char* gumbo_normalized_tagname(GumboTag tag); + +/** + * Extracts the tag name from the original_text field of an element or token by + * stripping off </> characters and attributes and adjusting the passed-in + * GumboStringPiece appropriately. The tag name is in the original case and + * shares a buffer with the original text, to simplify memory management. + * Behavior is undefined if a string-piece that doesn't represent an HTML tag + * (<tagname> or </tagname>) is passed in. If the string piece is completely + * empty (NULL data pointer), then this function will exit successfully as a + * no-op. 
+ */ +void gumbo_tag_from_original_text(GumboStringPiece* text); + +/** + * Fixes the case of SVG elements that are not all lowercase. + * http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#parsing-main-inforeign + * This is not done at parse time because there's no place to store a mutated + * tag name. tag is an enum (which will be TAG_UNKNOWN for most SVG tags + * without special handling), while original_tag is a pointer into the + * original buffer. Instead, we provide this helper function that clients can + * use to rename SVG tags as appropriate. + * Returns the case-normalized SVG tagname if a replacement is found, or NULL if + * no normalization is called for. The return value is static data and owned by + * the library. + */ +const char* gumbo_normalize_svg_tagname(const GumboStringPiece* tagname); + +/** + * Converts a tag name string (which may be in upper or mixed case) to a tag + * enum. gumbo_tag_enum expects `tagname` to be NUL-terminated. + */ +GumboTag gumbo_tag_enum(const char* tagname); +GumboTag gumbo_tagn_enum(const char* tagname, unsigned int length); + +/** + * Attribute namespaces. + * HTML includes special handling for XLink, XML, and XMLNS namespaces on + * attributes. Everything else goes in the generic "NONE" namespace. + */ +typedef enum { + GUMBO_ATTR_NAMESPACE_NONE, + GUMBO_ATTR_NAMESPACE_XLINK, + GUMBO_ATTR_NAMESPACE_XML, + GUMBO_ATTR_NAMESPACE_XMLNS, +} GumboAttributeNamespaceEnum; + +/** + * A struct representing a single attribute on an HTML tag. This is a + * name-value pair, but also includes information about source locations and + * original source text. + */ +typedef struct { + /** + * The namespace for the attribute. 
This will usually be + * GUMBO_ATTR_NAMESPACE_NONE, but some XLink/XMLNS/XML attributes take special + * values, per: + * http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#adjust-foreign-attributes + */ + GumboAttributeNamespaceEnum attr_namespace; + + /** + * The name of the attribute. This is in a freshly-allocated buffer to deal + * with case-normalization, and is null-terminated. + */ + const char* name; + + /** + * The original text of the attribute name, as a pointer into the original + * source buffer. + */ + GumboStringPiece original_name; + + /** + * The value of the attribute. This is in a freshly-allocated buffer to deal + * with unescaping, and is null-terminated. It does not include any quotes + * that surround the attribute. If the attribute has no value (for example, + * 'selected' on a checkbox), this will be an empty string. + */ + const char* value; + + /** + * The original text of the value of the attribute. This points into the + * original source buffer. It includes any quotes that surround the + * attribute, and you can look at original_value.data[0] and + * original_value.data[original_value.length - 1] to determine what the quote + * characters were. If the attribute has no value, this will be a 0-length + * string. + */ + GumboStringPiece original_value; + + /** The starting position of the attribute name. */ + GumboSourcePosition name_start; + + /** + * The ending position of the attribute name. This is not always derivable + * from the starting position of the value because of the possibility of + * whitespace around the = sign. + */ + GumboSourcePosition name_end; + + /** The starting position of the attribute value. */ + GumboSourcePosition value_start; + + /** The ending position of the attribute value. */ + GumboSourcePosition value_end; +} GumboAttribute; + +/** + * Given a vector of GumboAttributes, look up the one with the specified name + * and return it, or NULL if no such attribute exists. 
This uses a + * case-insensitive match, as HTML is case-insensitive. + */ +GumboAttribute* gumbo_get_attribute(const GumboVector* attrs, const char* name); + +/** + * Enum denoting the type of node. This determines the type of the node.v + * union. + */ +typedef enum { + /** Document node. v will be a GumboDocument. */ + GUMBO_NODE_DOCUMENT, + /** Element node. v will be a GumboElement. */ + GUMBO_NODE_ELEMENT, + /** Text node. v will be a GumboText. */ + GUMBO_NODE_TEXT, + /** CDATA node. v will be a GumboText. */ + GUMBO_NODE_CDATA, + /** Comment node. v will be a GumboText, excluding comment delimiters. */ + GUMBO_NODE_COMMENT, + /** Text node, where all contents is whitespace. v will be a GumboText. */ + GUMBO_NODE_WHITESPACE, + /** Template node. This is separate from GUMBO_NODE_ELEMENT because many + * client libraries will want to ignore the contents of template nodes, as + * the spec suggests. Recursing on GUMBO_NODE_ELEMENT will do the right thing + * here, while clients that want to include template contents should also + * check for GUMBO_NODE_TEMPLATE. v will be a GumboElement. */ + GUMBO_NODE_TEMPLATE +} GumboNodeType; + +/** + * Forward declaration of GumboNode so it can be used recursively in + * GumboNode.parent. + */ +typedef struct GumboInternalNode GumboNode; + +/** + * http://www.whatwg.org/specs/web-apps/current-work/complete/dom.html#quirks-mode + */ +typedef enum { + GUMBO_DOCTYPE_NO_QUIRKS, + GUMBO_DOCTYPE_QUIRKS, + GUMBO_DOCTYPE_LIMITED_QUIRKS +} GumboQuirksModeEnum; + +/** + * Namespaces. + * Unlike in X(HT)ML, namespaces in HTML5 are not denoted by a prefix. Rather, + * anything inside an <svg> tag is in the SVG namespace, anything inside the + * <math> tag is in the MathML namespace, and anything else is inside the HTML + * namespace. No other namespaces are supported, so this can be an enum only. + */ +typedef enum { + GUMBO_NAMESPACE_HTML, + GUMBO_NAMESPACE_SVG, + GUMBO_NAMESPACE_MATHML +} GumboNamespaceEnum; + +/** + * Parse flags. 
+ * We track the reasons for parser insertion of nodes and store them in a + * bitvector in the node itself. This lets client code optimize out nodes that + * are implied by the HTML structure of the document, or flag constructs that + * may not be allowed by a style guide, or track the prevalence of incorrect or + * tricky HTML code. + */ +typedef enum { + /** + * A normal node - both start and end tags appear in the source, nothing has + * been reparented. + */ + GUMBO_INSERTION_NORMAL = 0, + + /** + * A node inserted by the parser to fulfill some implicit insertion rule. + * This is usually set in addition to some other flag giving a more specific + * insertion reason; it's a generic catch-all term meaning "The start tag for + * this node did not appear in the document source". + */ + GUMBO_INSERTION_BY_PARSER = 1 << 0, + + /** + * A flag indicating that the end tag for this node did not appear in the + * document source. Note that in some cases, you can still have + * parser-inserted nodes with an explicit end tag: for example, "Text</html>" + * has GUMBO_INSERTION_BY_PARSER set on the <html> node, but + * GUMBO_INSERTION_IMPLICIT_END_TAG is unset, as the </html> tag actually + * exists. This flag will be set only if the end tag is completely missing; + * in some cases, the end tag may be misplaced (eg. a </body> tag with text + * afterwards), which will leave this flag unset and require clients to + * inspect the parse errors for that case. + */ + GUMBO_INSERTION_IMPLICIT_END_TAG = 1 << 1, + + // Value 1 << 2 was for a flag that has since been removed. + + /** + * A flag for nodes that are inserted because their presence is implied by + * other tags, eg. <html>, <head>, <body>, <tbody>, etc. + */ + GUMBO_INSERTION_IMPLIED = 1 << 3, + + /** + * A flag for nodes that are converted from their end tag equivalents. 
For + * example, </p> when no paragraph is open implies that the parser should + * create a <p> tag and immediately close it, while </br> means the same thing + * as <br>. + */ + GUMBO_INSERTION_CONVERTED_FROM_END_TAG = 1 << 4, + + /** A flag for nodes that are converted from the parse of an <isindex> tag. */ + GUMBO_INSERTION_FROM_ISINDEX = 1 << 5, + + /** A flag for <image> tags that are rewritten as <img>. */ + GUMBO_INSERTION_FROM_IMAGE = 1 << 6, + + /** + * A flag for nodes that are cloned as a result of the reconstruction of + * active formatting elements. This is set only on the clone; the initial + * portion of the formatting run is a NORMAL node with an IMPLICIT_END_TAG. + */ + GUMBO_INSERTION_RECONSTRUCTED_FORMATTING_ELEMENT = 1 << 7, + + /** A flag for nodes that are cloned by the adoption agency algorithm. */ + GUMBO_INSERTION_ADOPTION_AGENCY_CLONED = 1 << 8, + + /** A flag for nodes that are moved by the adoption agency algorithm. */ + GUMBO_INSERTION_ADOPTION_AGENCY_MOVED = 1 << 9, + + /** + * A flag for nodes that have been foster-parented out of a table (or + * should've been foster-parented, if verbatim mode is set). + */ + GUMBO_INSERTION_FOSTER_PARENTED = 1 << 10, +} GumboParseFlags; + +/** + * Information specific to document nodes. + */ +typedef struct { + /** + * An array of GumboNodes, containing the children of this element. This will + * normally consist of the <html> element and any comment nodes found. + * Pointers are owned. + */ + GumboVector /* GumboNode* */ children; + + // True if there was an explicit doctype token as opposed to it being omitted. + bool has_doctype; + + // Fields from the doctype token, copied verbatim. + const char* name; + const char* public_identifier; + const char* system_identifier; + + /** + * Whether or not the document is in QuirksMode, as determined by the values + * in the GumboTokenDocType template. 
+ */ + GumboQuirksModeEnum doc_type_quirks_mode; +} GumboDocument; + +/** + * The struct used to represent TEXT, CDATA, COMMENT, and WHITESPACE elements. + * This contains just a block of text and its position. + */ +typedef struct { + /** + * The text of this node, after entities have been parsed and decoded. For + * comment/cdata nodes, this does not include the comment delimiters. + */ + const char* text; + + /** + * The original text of this node, as a pointer into the original buffer. For + * comment/cdata nodes, this includes the comment delimiters. + */ + GumboStringPiece original_text; + + /** + * The starting position of this node. This corresponds to the position of + * original_text, before entities are decoded. + * */ + GumboSourcePosition start_pos; +} GumboText; + +/** + * The struct used to represent all HTML elements. This contains information + * about the tag, attributes, and child nodes. + */ +typedef struct { + /** + * An array of GumboNodes, containing the children of this element. Pointers + * are owned. + */ + GumboVector /* GumboNode* */ children; + + /** The GumboTag enum for this element. */ + GumboTag tag; + + /** The GumboNamespaceEnum for this element. */ + GumboNamespaceEnum tag_namespace; + + /** + * A GumboStringPiece pointing to the original tag text for this element, + * pointing directly into the source buffer. If the tag was inserted + * algorithmically (for example, <head> or <tbody> insertion), this will be a + * zero-length string. + */ + GumboStringPiece original_tag; + + /** + * A GumboStringPiece pointing to the original end tag text for this element. + * If the end tag was inserted algorithmically, (for example, closing a + * self-closing tag), this will be a zero-length string. + */ + GumboStringPiece original_end_tag; + + /** The source position for the start of the start tag. */ + GumboSourcePosition start_pos; + + /** The source position for the start of the end tag. 
*/ + GumboSourcePosition end_pos; + + /** + * An array of GumboAttributes, containing the attributes for this tag in the + * order that they were parsed. Pointers are owned. + */ + GumboVector /* GumboAttribute* */ attributes; +} GumboElement; + +/** + * A supertype for GumboElement and GumboText, so that we can include one + * generic type in lists of children and cast as necessary to subtypes. + */ +struct GumboInternalNode { + /** The type of node that this is. */ + GumboNodeType type; + + /** Pointer back to parent node. Not owned. */ + GumboNode* parent; + + /** The index within the parent's children vector of this node. */ + size_t index_within_parent; + + /** + * A bitvector of flags containing information about why this element was + * inserted into the parse tree, including a variety of special parse + * situations. + */ + GumboParseFlags parse_flags; + + /** The actual node data. */ + union { + GumboDocument document; // For GUMBO_NODE_DOCUMENT. + GumboElement element; // For GUMBO_NODE_ELEMENT. + GumboText text; // For everything else. + } v; +}; + +/** + * The type for an allocator function. Takes the 'userdata' member of the + * GumboParser struct as its first argument. Semantics should be the same as + * malloc, i.e. return a block of size_t bytes on success or NULL on failure. + * Allocating a block of 0 bytes behaves as per malloc. + */ +// TODO(jdtang): Add checks throughout the codebase for out-of-memory condition. +typedef void* (*GumboAllocatorFunction)(void* userdata, size_t size); + +/** + * The type for a deallocator function. Takes the 'userdata' member of the + * GumboParser struct as its first argument. + */ +typedef void (*GumboDeallocatorFunction)(void* userdata, void* ptr); + +/** + * Input struct containing configuration options for the parser. + * These let you specify alternate memory managers, provide different error + * handling, etc. + * Use kGumboDefaultOptions for sensible defaults, and only set what you need. 
+ */ +typedef struct GumboInternalOptions { + /** A memory allocator function. Default: malloc. */ + GumboAllocatorFunction allocator; + + /** A memory deallocator function. Default: free. */ + GumboDeallocatorFunction deallocator; + + /** + * An opaque object that's passed in as the first argument to all callbacks + * used by this library. Default: NULL. + */ + void* userdata; + + /** + * The tab-stop size, for computing positions in source code that uses tabs. + * Default: 8. + */ + int tab_stop; + + /** + * Whether or not to stop parsing when the first error is encountered. + * Default: false. + */ + bool stop_on_first_error; + + /** + * The maximum number of errors before the parser stops recording them. This + * is provided so that if the page is totally borked, we don't completely fill + * up the errors vector and exhaust memory with useless redundant errors. Set + * to -1 to disable the limit. + * Default: -1 + */ + int max_errors; + + /** + * The fragment context for parsing: + * https://html.spec.whatwg.org/multipage/syntax.html#parsing-html-fragments + * + * If GUMBO_TAG_LAST is passed here, it is assumed to be "no fragment", i.e. + * the regular parsing algorithm. Otherwise, pass the tag enum for the + * intended parent of the parsed fragment. We use just the tag enum rather + * than a full node because that's enough to set all the parsing context we + * need, and it provides some additional flexibility for client code to act as + * if parsing a fragment even when a full HTML tree isn't available. + * + * Default: GUMBO_TAG_LAST + */ + GumboTag fragment_context; + + /** + * The namespace for the fragment context. This lets client code + * differentiate between, say, parsing a <title> tag in SVG vs. parsing it in + * HTML. + * Default: GUMBO_NAMESPACE_HTML + */ + GumboNamespaceEnum fragment_namespace; +} GumboOptions; + +/** Default options struct; use this with gumbo_parse_with_options. 
*/ +extern const GumboOptions kGumboDefaultOptions; + +/** The output struct containing the results of the parse. */ +typedef struct GumboInternalOutput { + /** + * Pointer to the document node. This is a GumboNode of type NODE_DOCUMENT + * that contains the entire document as its child. + */ + GumboNode* document; + + /** + * Pointer to the root node. This is the <html> tag that forms the root of the + * document. + */ + GumboNode* root; + + /** + * A list of errors that occurred during the parse. + * NOTE: In version 1.0 of this library, the API for errors hasn't been fully + * fleshed out and may change in the future. For this reason, the GumboError + * header isn't part of the public API. Contact us if you need errors + * reported so we can work out something appropriate for your use-case. + */ + GumboVector /* GumboError */ errors; +} GumboOutput; + +/** + * Parses a buffer of UTF8 text into a GumboNode parse tree. The buffer must + * live at least as long as the parse tree, as some fields (eg. original_text) + * point directly into the original buffer. + * + * This doesn't support buffers longer than 4 gigabytes. + */ +GumboOutput* gumbo_parse(const char* buffer); + +/** + * Extended version of gumbo_parse that takes an explicit options structure, + * buffer, and length. + */ +GumboOutput* gumbo_parse_with_options( + const GumboOptions* options, const char* buffer, size_t buffer_length); + +/** Release the memory used for the parse tree & parse errors. */ +void gumbo_destroy_output(const GumboOptions* options, GumboOutput* output); + +#ifdef __cplusplus +} +#endif + +#endif // GUMBO_GUMBO_H_ diff --git a/libs/litehtml/src/gumbo/include/gumbo/attribute.h b/libs/litehtml/src/gumbo/include/gumbo/attribute.h new file mode 100644 index 0000000000..f9b8aea576 --- /dev/null +++ b/libs/litehtml/src/gumbo/include/gumbo/attribute.h @@ -0,0 +1,37 @@ +// Copyright 2010 Google Inc. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Author: jdtang@google.com (Jonathan Tang) + +#ifndef GUMBO_ATTRIBUTE_H_ +#define GUMBO_ATTRIBUTE_H_ + +#include "gumbo.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct GumboInternalParser; + +// Release the memory used for an GumboAttribute, including the attribute +// itself. +void gumbo_destroy_attribute( + struct GumboInternalParser* parser, GumboAttribute* attribute); + +#ifdef __cplusplus +} +#endif + +#endif // GUMBO_ATTRIBUTE_H_ diff --git a/libs/litehtml/src/gumbo/include/gumbo/char_ref.h b/libs/litehtml/src/gumbo/include/gumbo/char_ref.h new file mode 100644 index 0000000000..09d2598f45 --- /dev/null +++ b/libs/litehtml/src/gumbo/include/gumbo/char_ref.h @@ -0,0 +1,60 @@ +// Copyright 2011 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +// Author: jdtang@google.com (Jonathan Tang) +// +// Internal header for character reference handling; this should not be exposed +// transitively by any public API header. This is why the functions aren't +// namespaced. + +#ifndef GUMBO_CHAR_REF_H_ +#define GUMBO_CHAR_REF_H_ + +#include <stdbool.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct GumboInternalParser; +struct GumboInternalUtf8Iterator; + +// Value that indicates no character was produced. +extern const int kGumboNoChar; + +// Certain named character references generate two codepoints, not one, and so +// the consume_char_ref subroutine needs to return this instead of an int. The +// first field will be kGumboNoChar if no character reference was found; the +// second field will be kGumboNoChar if that is the case or if the character +// reference returns only a single codepoint. +typedef struct { + int first; + int second; +} OneOrTwoCodepoints; + +// Implements the "consume a character reference" section of the spec. +// This reads in characters from the input as necessary, and fills in a +// OneOrTwoCodepoints struct containing the characters read. It may add parse +// errors to the GumboParser's errors vector, if the spec calls for it. Pass a +// space for the "additional allowed char" when the spec says "with no +// additional allowed char". Returns false on parse error, true otherwise. +bool consume_char_ref(struct GumboInternalParser* parser, + struct GumboInternalUtf8Iterator* input, int additional_allowed_char, + bool is_in_attribute, OneOrTwoCodepoints* output); + +#ifdef __cplusplus +} +#endif + +#endif // GUMBO_CHAR_REF_H_ diff --git a/libs/litehtml/src/gumbo/include/gumbo/error.h b/libs/litehtml/src/gumbo/include/gumbo/error.h new file mode 100644 index 0000000000..3aa54a6b27 --- /dev/null +++ b/libs/litehtml/src/gumbo/include/gumbo/error.h @@ -0,0 +1,227 @@ +// Copyright 2010 Google Inc. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Author: jdtang@google.com (Jonathan Tang) +// +// Error types, enums, and handling functions. + +#ifndef GUMBO_ERROR_H_ +#define GUMBO_ERROR_H_ +#ifdef _MSC_VER +#ifndef _CRT_SECURE_NO_WARNINGS +#define _CRT_SECURE_NO_WARNINGS +#endif +#endif +#include <stdint.h> + +#include "gumbo.h" +#include "insertion_mode.h" +#include "string_buffer.h" +#include "token_type.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct GumboInternalParser; + +typedef enum { + GUMBO_ERR_UTF8_INVALID, + GUMBO_ERR_UTF8_TRUNCATED, + GUMBO_ERR_UTF8_NULL, + GUMBO_ERR_NUMERIC_CHAR_REF_NO_DIGITS, + GUMBO_ERR_NUMERIC_CHAR_REF_WITHOUT_SEMICOLON, + GUMBO_ERR_NUMERIC_CHAR_REF_INVALID, + GUMBO_ERR_NAMED_CHAR_REF_WITHOUT_SEMICOLON, + GUMBO_ERR_NAMED_CHAR_REF_INVALID, + GUMBO_ERR_TAG_STARTS_WITH_QUESTION, + GUMBO_ERR_TAG_EOF, + GUMBO_ERR_TAG_INVALID, + GUMBO_ERR_CLOSE_TAG_EMPTY, + GUMBO_ERR_CLOSE_TAG_EOF, + GUMBO_ERR_CLOSE_TAG_INVALID, + GUMBO_ERR_SCRIPT_EOF, + GUMBO_ERR_ATTR_NAME_EOF, + GUMBO_ERR_ATTR_NAME_INVALID, + GUMBO_ERR_ATTR_DOUBLE_QUOTE_EOF, + GUMBO_ERR_ATTR_SINGLE_QUOTE_EOF, + GUMBO_ERR_ATTR_UNQUOTED_EOF, + GUMBO_ERR_ATTR_UNQUOTED_RIGHT_BRACKET, + GUMBO_ERR_ATTR_UNQUOTED_EQUALS, + GUMBO_ERR_ATTR_AFTER_EOF, + GUMBO_ERR_ATTR_AFTER_INVALID, + GUMBO_ERR_DUPLICATE_ATTR, + GUMBO_ERR_SOLIDUS_EOF, + GUMBO_ERR_SOLIDUS_INVALID, + GUMBO_ERR_DASHES_OR_DOCTYPE, + GUMBO_ERR_COMMENT_EOF, + GUMBO_ERR_COMMENT_INVALID, + 
GUMBO_ERR_COMMENT_BANG_AFTER_DOUBLE_DASH, + GUMBO_ERR_COMMENT_DASH_AFTER_DOUBLE_DASH, + GUMBO_ERR_COMMENT_SPACE_AFTER_DOUBLE_DASH, + GUMBO_ERR_COMMENT_END_BANG_EOF, + GUMBO_ERR_DOCTYPE_EOF, + GUMBO_ERR_DOCTYPE_INVALID, + GUMBO_ERR_DOCTYPE_SPACE, + GUMBO_ERR_DOCTYPE_RIGHT_BRACKET, + GUMBO_ERR_DOCTYPE_SPACE_OR_RIGHT_BRACKET, + GUMBO_ERR_DOCTYPE_END, + GUMBO_ERR_PARSER, + GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG, +} GumboErrorType; + +// Additional data for duplicated attributes. +typedef struct GumboInternalDuplicateAttrError { + // The name of the attribute. Owned by this struct. + const char* name; + + // The (0-based) index within the attributes vector of the original + // occurrence. + unsigned int original_index; + + // The (0-based) index where the new occurrence would be. + unsigned int new_index; +} GumboDuplicateAttrError; + +// A simplified representation of the tokenizer state, designed to be more +// useful to clients of this library than the internal representation. This +// condenses the actual states used in the tokenizer state machine into a few +// values that will be familiar to users of HTML. +typedef enum { + GUMBO_ERR_TOKENIZER_DATA, + GUMBO_ERR_TOKENIZER_CHAR_REF, + GUMBO_ERR_TOKENIZER_RCDATA, + GUMBO_ERR_TOKENIZER_RAWTEXT, + GUMBO_ERR_TOKENIZER_PLAINTEXT, + GUMBO_ERR_TOKENIZER_SCRIPT, + GUMBO_ERR_TOKENIZER_TAG, + GUMBO_ERR_TOKENIZER_SELF_CLOSING_TAG, + GUMBO_ERR_TOKENIZER_ATTR_NAME, + GUMBO_ERR_TOKENIZER_ATTR_VALUE, + GUMBO_ERR_TOKENIZER_MARKUP_DECLARATION, + GUMBO_ERR_TOKENIZER_COMMENT, + GUMBO_ERR_TOKENIZER_DOCTYPE, + GUMBO_ERR_TOKENIZER_CDATA, +} GumboTokenizerErrorState; + +// Additional data for tokenizer errors. +// This records the current state and codepoint encountered - this is usually +// enough to reconstruct what went wrong and provide a friendly error message. +typedef struct GumboInternalTokenizerError { + // The bad codepoint encountered. + int codepoint; + + // The state that the tokenizer was in at the time. 
+ GumboTokenizerErrorState state; +} GumboTokenizerError; + +// Additional data for parse errors. +typedef struct GumboInternalParserError { + // The type of input token that resulted in this error. + GumboTokenType input_type; + + // The HTML tag of the input token. TAG_UNKNOWN if this was not a tag token. + GumboTag input_tag; + + // The insertion mode that the parser was in at the time. + GumboInsertionMode parser_state; + + // The tag stack at the point of the error. Note that this is an GumboVector + // of GumboTag's *stored by value* - cast the void* to an GumboTag directly to + // get at the tag. + GumboVector /* GumboTag */ tag_stack; +} GumboParserError; + +// The overall error struct representing an error in decoding/tokenizing/parsing +// the HTML. This contains an enumerated type flag, a source position, and then +// a union of fields containing data specific to the error. +typedef struct GumboInternalError { + // The type of error. + GumboErrorType type; + + // The position within the source file where the error occurred. + GumboSourcePosition position; + + // A pointer to the byte within the original source file text where the error + // occurred (note that this is not the same as position.offset, as that gives + // character-based instead of byte-based offsets). + const char* original_text; + + // Type-specific error information. + union { + // The code point we encountered, for: + // * GUMBO_ERR_UTF8_INVALID + // * GUMBO_ERR_UTF8_TRUNCATED + // * GUMBO_ERR_NUMERIC_CHAR_REF_WITHOUT_SEMICOLON + // * GUMBO_ERR_NUMERIC_CHAR_REF_INVALID + uint64_t codepoint; + + // Tokenizer errors. + GumboTokenizerError tokenizer; + + // Short textual data, for: + // * GUMBO_ERR_NAMED_CHAR_REF_WITHOUT_SEMICOLON + // * GUMBO_ERR_NAMED_CHAR_REF_INVALID + GumboStringPiece text; + + // Duplicate attribute data, for GUMBO_ERR_DUPLICATE_ATTR. + GumboDuplicateAttrError duplicate_attr; + + // Parser state, for GUMBO_ERR_PARSER and + // GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG. 
+ struct GumboInternalParserError parser; + } v; +} GumboError; + +// Adds a new error to the parser's error list, and returns a pointer to it so +// that clients can fill out the rest of its fields. May return NULL if we're +// already over the max_errors field specified in GumboOptions. +GumboError* gumbo_add_error(struct GumboInternalParser* parser); + +// Initializes the errors vector in the parser. +void gumbo_init_errors(struct GumboInternalParser* errors); + +// Frees all the errors in the 'errors_' field of the parser. +void gumbo_destroy_errors(struct GumboInternalParser* errors); + +// Frees the memory used for a single GumboError. +void gumbo_error_destroy(struct GumboInternalParser* parser, GumboError* error); + +// Prints an error to a string. This fills an empty GumboStringBuffer with a +// freshly-allocated buffer containing the error message text. The caller is +// responsible for deleting the buffer. (Note that the buffer is allocated with +// the allocator specified in the GumboParser ~config and hence should be freed +// by gumbo_parser_deallocate().) +void gumbo_error_to_string(struct GumboInternalParser* parser, + const GumboError* error, GumboStringBuffer* output); + +// Prints a caret diagnostic to a string. This fills an empty GumboStringBuffer +// with a freshly-allocated buffer containing the error message text. The +// caller is responsible for deleting the buffer. (Note that the buffer is +// allocated with the allocator specified in the GumboParser ~config and hence +// should be freed by gumbo_parser_deallocate().) +void gumbo_caret_diagnostic_to_string(struct GumboInternalParser* parser, + const GumboError* error, const char* source_text, + GumboStringBuffer* output); + +// Like gumbo_caret_diagnostic_to_string, but prints the text to stdout instead +// of writing to a string. 
+void gumbo_print_caret_diagnostic(struct GumboInternalParser* parser, + const GumboError* error, const char* source_text); + +#ifdef __cplusplus +} +#endif + +#endif // GUMBO_ERROR_H_ diff --git a/libs/litehtml/src/gumbo/include/gumbo/insertion_mode.h b/libs/litehtml/src/gumbo/include/gumbo/insertion_mode.h new file mode 100644 index 0000000000..45134c13b3 --- /dev/null +++ b/libs/litehtml/src/gumbo/include/gumbo/insertion_mode.h @@ -0,0 +1,57 @@ +// Copyright 2011 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Author: jdtang@google.com (Jonathan Tang) + +#ifndef GUMBO_INSERTION_MODE_H_ +#define GUMBO_INSERTION_MODE_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +// http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#insertion-mode +// If new enum values are added, be sure to update the kTokenHandlers dispatch +// table in parser.c. 
+typedef enum { + GUMBO_INSERTION_MODE_INITIAL, + GUMBO_INSERTION_MODE_BEFORE_HTML, + GUMBO_INSERTION_MODE_BEFORE_HEAD, + GUMBO_INSERTION_MODE_IN_HEAD, + GUMBO_INSERTION_MODE_IN_HEAD_NOSCRIPT, + GUMBO_INSERTION_MODE_AFTER_HEAD, + GUMBO_INSERTION_MODE_IN_BODY, + GUMBO_INSERTION_MODE_TEXT, + GUMBO_INSERTION_MODE_IN_TABLE, + GUMBO_INSERTION_MODE_IN_TABLE_TEXT, + GUMBO_INSERTION_MODE_IN_CAPTION, + GUMBO_INSERTION_MODE_IN_COLUMN_GROUP, + GUMBO_INSERTION_MODE_IN_TABLE_BODY, + GUMBO_INSERTION_MODE_IN_ROW, + GUMBO_INSERTION_MODE_IN_CELL, + GUMBO_INSERTION_MODE_IN_SELECT, + GUMBO_INSERTION_MODE_IN_SELECT_IN_TABLE, + GUMBO_INSERTION_MODE_IN_TEMPLATE, + GUMBO_INSERTION_MODE_AFTER_BODY, + GUMBO_INSERTION_MODE_IN_FRAMESET, + GUMBO_INSERTION_MODE_AFTER_FRAMESET, + GUMBO_INSERTION_MODE_AFTER_AFTER_BODY, + GUMBO_INSERTION_MODE_AFTER_AFTER_FRAMESET +} GumboInsertionMode; + +#ifdef __cplusplus +} // extern C +#endif + +#endif // GUMBO_INSERTION_MODE_H_ diff --git a/libs/litehtml/src/gumbo/include/gumbo/parser.h b/libs/litehtml/src/gumbo/include/gumbo/parser.h new file mode 100644 index 0000000000..95019e3eca --- /dev/null +++ b/libs/litehtml/src/gumbo/include/gumbo/parser.h @@ -0,0 +1,57 @@ +// Copyright 2010 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +// Author: jdtang@google.com (Jonathan Tang) +// +// Contains the definition of the top-level GumboParser structure that's +// threaded through basically every internal function in the library. + +#ifndef GUMBO_PARSER_H_ +#define GUMBO_PARSER_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +struct GumboInternalParserState; +struct GumboInternalOutput; +struct GumboInternalOptions; +struct GumboInternalTokenizerState; + +// An overarching struct that's threaded through (nearly) all functions in the +// library, OOP-style. This gives each function access to the options and +// output, along with any internal state needed for the parse. +typedef struct GumboInternalParser { + // Settings for this parse run. + const struct GumboInternalOptions* _options; + + // Output for the parse. + struct GumboInternalOutput* _output; + + // The internal tokenizer state, defined as a pointer to avoid a cyclic + // dependency on html5tokenizer.h. The main parse routine is responsible for + // initializing this on parse start, and destroying it on parse end. + // End-users will never see a non-garbage value in this pointer. + struct GumboInternalTokenizerState* _tokenizer_state; + + // The internal parser state. Initialized on parse start and destroyed on + // parse end; end-users will never see a non-garbage value in this pointer. + struct GumboInternalParserState* _parser_state; +} GumboParser; + +#ifdef __cplusplus +} +#endif + +#endif // GUMBO_PARSER_H_ diff --git a/libs/litehtml/src/gumbo/include/gumbo/string_buffer.h b/libs/litehtml/src/gumbo/include/gumbo/string_buffer.h new file mode 100644 index 0000000000..ee7956acc8 --- /dev/null +++ b/libs/litehtml/src/gumbo/include/gumbo/string_buffer.h @@ -0,0 +1,84 @@ +// Copyright 2010 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Author: jdtang@google.com (Jonathan Tang) +// +#ifndef GUMBO_STRING_BUFFER_H_ +#define GUMBO_STRING_BUFFER_H_ + +#include <stdbool.h> +#include <stddef.h> + +#include "gumbo.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct GumboInternalParser; + +// A struct representing a mutable, growable string. This consists of a +// heap-allocated buffer that may grow (by doubling) as necessary. When +// converting to a string, this allocates a new buffer that is only as long as +// it needs to be. Note that the internal buffer here is *not* nul-terminated, +// so be sure not to use ordinary string manipulation functions on it. +typedef struct { + // A pointer to the beginning of the string. NULL iff length == 0. + char* data; + + // The length of the string fragment, in bytes. May be zero. + size_t length; + + // The capacity of the buffer, in bytes. + size_t capacity; +} GumboStringBuffer; + +// Initializes a new GumboStringBuffer. +void gumbo_string_buffer_init( + struct GumboInternalParser* parser, GumboStringBuffer* output); + +// Ensures that the buffer contains at least a certain amount of space. Most +// useful with snprintf and the other length-delimited string functions, which +// may want to write directly into the buffer. +void gumbo_string_buffer_reserve(struct GumboInternalParser* parser, + size_t min_capacity, GumboStringBuffer* output); + +// Appends a single Unicode codepoint onto the end of the GumboStringBuffer. +// This is essentially a UTF-8 encoder, and may add 1-4 bytes depending on the +// value of the codepoint. 
+void gumbo_string_buffer_append_codepoint( + struct GumboInternalParser* parser, int c, GumboStringBuffer* output); + +// Appends a string onto the end of the GumboStringBuffer. +void gumbo_string_buffer_append_string(struct GumboInternalParser* parser, + GumboStringPiece* str, GumboStringBuffer* output); + +// Converts this string buffer to const char*, allocating a new buffer for it. +char* gumbo_string_buffer_to_string( + struct GumboInternalParser* parser, GumboStringBuffer* input); + +// Reinitialize this string buffer. This clears it by setting length=0. It +// does not zero out the buffer itself. +void gumbo_string_buffer_clear( + struct GumboInternalParser* parser, GumboStringBuffer* input); + +// Deallocates this GumboStringBuffer. +void gumbo_string_buffer_destroy( + struct GumboInternalParser* parser, GumboStringBuffer* buffer); + +#ifdef __cplusplus +} +#endif + +#endif // GUMBO_STRING_BUFFER_H_ diff --git a/libs/litehtml/src/gumbo/include/gumbo/string_piece.h b/libs/litehtml/src/gumbo/include/gumbo/string_piece.h new file mode 100644 index 0000000000..8c8188c500 --- /dev/null +++ b/libs/litehtml/src/gumbo/include/gumbo/string_piece.h @@ -0,0 +1,38 @@ +// Copyright 2010 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +// Author: jdtang@google.com (Jonathan Tang) + +#ifndef GUMBO_STRING_PIECE_H_ +#define GUMBO_STRING_PIECE_H_ + +#include "gumbo.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct GumboInternalParser; + +// Performs a deep-copy of an GumboStringPiece, allocating a fresh buffer in the +// destination and copying over the characters from source. Dest should be +// empty, with no buffer allocated; otherwise, this leaks it. +void gumbo_string_copy(struct GumboInternalParser* parser, + GumboStringPiece* dest, const GumboStringPiece* source); + +#ifdef __cplusplus +} +#endif + +#endif // GUMBO_STRING_PIECE_H_ diff --git a/libs/litehtml/src/gumbo/include/gumbo/tag_enum.h b/libs/litehtml/src/gumbo/include/gumbo/tag_enum.h new file mode 100644 index 0000000000..6d7aeb3d7d --- /dev/null +++ b/libs/litehtml/src/gumbo/include/gumbo/tag_enum.h @@ -0,0 +1,153 @@ +// Generated via `gentags.py src/tag.in`. +// Do not edit; edit src/tag.in instead. +// clang-format off +GUMBO_TAG_HTML, +GUMBO_TAG_HEAD, +GUMBO_TAG_TITLE, +GUMBO_TAG_BASE, +GUMBO_TAG_LINK, +GUMBO_TAG_META, +GUMBO_TAG_STYLE, +GUMBO_TAG_SCRIPT, +GUMBO_TAG_NOSCRIPT, +GUMBO_TAG_TEMPLATE, +GUMBO_TAG_BODY, +GUMBO_TAG_ARTICLE, +GUMBO_TAG_SECTION, +GUMBO_TAG_NAV, +GUMBO_TAG_ASIDE, +GUMBO_TAG_H1, +GUMBO_TAG_H2, +GUMBO_TAG_H3, +GUMBO_TAG_H4, +GUMBO_TAG_H5, +GUMBO_TAG_H6, +GUMBO_TAG_HGROUP, +GUMBO_TAG_HEADER, +GUMBO_TAG_FOOTER, +GUMBO_TAG_ADDRESS, +GUMBO_TAG_P, +GUMBO_TAG_HR, +GUMBO_TAG_PRE, +GUMBO_TAG_BLOCKQUOTE, +GUMBO_TAG_OL, +GUMBO_TAG_UL, +GUMBO_TAG_LI, +GUMBO_TAG_DL, +GUMBO_TAG_DT, +GUMBO_TAG_DD, +GUMBO_TAG_FIGURE, +GUMBO_TAG_FIGCAPTION, +GUMBO_TAG_MAIN, +GUMBO_TAG_DIV, +GUMBO_TAG_A, +GUMBO_TAG_EM, +GUMBO_TAG_STRONG, +GUMBO_TAG_SMALL, +GUMBO_TAG_S, +GUMBO_TAG_CITE, +GUMBO_TAG_Q, +GUMBO_TAG_DFN, +GUMBO_TAG_ABBR, +GUMBO_TAG_DATA, +GUMBO_TAG_TIME, +GUMBO_TAG_CODE, +GUMBO_TAG_VAR, +GUMBO_TAG_SAMP, +GUMBO_TAG_KBD, +GUMBO_TAG_SUB, +GUMBO_TAG_SUP, +GUMBO_TAG_I, +GUMBO_TAG_B, +GUMBO_TAG_U, +GUMBO_TAG_MARK, 
+GUMBO_TAG_RUBY, +GUMBO_TAG_RT, +GUMBO_TAG_RP, +GUMBO_TAG_BDI, +GUMBO_TAG_BDO, +GUMBO_TAG_SPAN, +GUMBO_TAG_BR, +GUMBO_TAG_WBR, +GUMBO_TAG_INS, +GUMBO_TAG_DEL, +GUMBO_TAG_IMAGE, +GUMBO_TAG_IMG, +GUMBO_TAG_IFRAME, +GUMBO_TAG_EMBED, +GUMBO_TAG_OBJECT, +GUMBO_TAG_PARAM, +GUMBO_TAG_VIDEO, +GUMBO_TAG_AUDIO, +GUMBO_TAG_SOURCE, +GUMBO_TAG_TRACK, +GUMBO_TAG_CANVAS, +GUMBO_TAG_MAP, +GUMBO_TAG_AREA, +GUMBO_TAG_MATH, +GUMBO_TAG_MI, +GUMBO_TAG_MO, +GUMBO_TAG_MN, +GUMBO_TAG_MS, +GUMBO_TAG_MTEXT, +GUMBO_TAG_MGLYPH, +GUMBO_TAG_MALIGNMARK, +GUMBO_TAG_ANNOTATION_XML, +GUMBO_TAG_SVG, +GUMBO_TAG_FOREIGNOBJECT, +GUMBO_TAG_DESC, +GUMBO_TAG_TABLE, +GUMBO_TAG_CAPTION, +GUMBO_TAG_COLGROUP, +GUMBO_TAG_COL, +GUMBO_TAG_TBODY, +GUMBO_TAG_THEAD, +GUMBO_TAG_TFOOT, +GUMBO_TAG_TR, +GUMBO_TAG_TD, +GUMBO_TAG_TH, +GUMBO_TAG_FORM, +GUMBO_TAG_FIELDSET, +GUMBO_TAG_LEGEND, +GUMBO_TAG_LABEL, +GUMBO_TAG_INPUT, +GUMBO_TAG_BUTTON, +GUMBO_TAG_SELECT, +GUMBO_TAG_DATALIST, +GUMBO_TAG_OPTGROUP, +GUMBO_TAG_OPTION, +GUMBO_TAG_TEXTAREA, +GUMBO_TAG_KEYGEN, +GUMBO_TAG_OUTPUT, +GUMBO_TAG_PROGRESS, +GUMBO_TAG_METER, +GUMBO_TAG_DETAILS, +GUMBO_TAG_SUMMARY, +GUMBO_TAG_MENU, +GUMBO_TAG_MENUITEM, +GUMBO_TAG_APPLET, +GUMBO_TAG_ACRONYM, +GUMBO_TAG_BGSOUND, +GUMBO_TAG_DIR, +GUMBO_TAG_FRAME, +GUMBO_TAG_FRAMESET, +GUMBO_TAG_NOFRAMES, +GUMBO_TAG_ISINDEX, +GUMBO_TAG_LISTING, +GUMBO_TAG_XMP, +GUMBO_TAG_NEXTID, +GUMBO_TAG_NOEMBED, +GUMBO_TAG_PLAINTEXT, +GUMBO_TAG_RB, +GUMBO_TAG_STRIKE, +GUMBO_TAG_BASEFONT, +GUMBO_TAG_BIG, +GUMBO_TAG_BLINK, +GUMBO_TAG_CENTER, +GUMBO_TAG_FONT, +GUMBO_TAG_MARQUEE, +GUMBO_TAG_MULTICOL, +GUMBO_TAG_NOBR, +GUMBO_TAG_SPACER, +GUMBO_TAG_TT, +GUMBO_TAG_RTC, diff --git a/libs/litehtml/src/gumbo/include/gumbo/tag_gperf.h b/libs/litehtml/src/gumbo/include/gumbo/tag_gperf.h new file mode 100644 index 0000000000..378eaf958c --- /dev/null +++ b/libs/litehtml/src/gumbo/include/gumbo/tag_gperf.h @@ -0,0 +1,105 @@ +static unsigned int tag_hash( + register const char *str, register unsigned int len) { + static 
unsigned short asso_values[] = {296, 296, 296, 296, 296, 296, 296, 296, + 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, + 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, + 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 6, 4, 3, 1, 1, 0, + 1, 0, 0, 296, 296, 296, 296, 296, 296, 296, 22, 73, 151, 4, 13, 59, 65, 2, + 69, 0, 134, 9, 16, 52, 55, 28, 101, 0, 1, 6, 63, 126, 104, 93, 124, 296, + 296, 296, 296, 296, 296, 296, 22, 73, 151, 4, 13, 59, 65, 2, 69, 0, 134, + 9, 16, 52, 55, 28, 101, 0, 1, 6, 63, 126, 104, 93, 124, 296, 296, 296, + 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, + 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, + 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, + 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, + 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, + 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, + 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, + 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, + 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296}; + register unsigned int hval = len; + + switch (hval) { + default: + hval += asso_values[(unsigned char) str[1] + 3]; + /*FALLTHROUGH*/ + case 1: + hval += asso_values[(unsigned char) str[0]]; + break; + } + return hval + asso_values[(unsigned char) str[len - 1]]; +} + +static const unsigned char kGumboTagMap[] = {GUMBO_TAG_LAST, GUMBO_TAG_LAST, + GUMBO_TAG_LAST, GUMBO_TAG_S, GUMBO_TAG_H6, GUMBO_TAG_H5, GUMBO_TAG_H4, + GUMBO_TAG_H3, GUMBO_TAG_SPACER, GUMBO_TAG_H2, GUMBO_TAG_HEADER, + GUMBO_TAG_H1, GUMBO_TAG_HEAD, GUMBO_TAG_LAST, GUMBO_TAG_DETAILS, + GUMBO_TAG_SELECT, GUMBO_TAG_DIR, GUMBO_TAG_LAST, GUMBO_TAG_DEL, + GUMBO_TAG_LAST, GUMBO_TAG_SOURCE, GUMBO_TAG_LEGEND, GUMBO_TAG_DATALIST, + 
GUMBO_TAG_METER, GUMBO_TAG_MGLYPH, GUMBO_TAG_LAST, GUMBO_TAG_MATH, + GUMBO_TAG_LABEL, GUMBO_TAG_TABLE, GUMBO_TAG_TEMPLATE, GUMBO_TAG_LAST, + GUMBO_TAG_RP, GUMBO_TAG_TIME, GUMBO_TAG_TITLE, GUMBO_TAG_DATA, + GUMBO_TAG_APPLET, GUMBO_TAG_HGROUP, GUMBO_TAG_SAMP, GUMBO_TAG_TEXTAREA, + GUMBO_TAG_ABBR, GUMBO_TAG_MARQUEE, GUMBO_TAG_LAST, GUMBO_TAG_MENUITEM, + GUMBO_TAG_SMALL, GUMBO_TAG_META, GUMBO_TAG_A, GUMBO_TAG_LAST, + GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_EMBED, + GUMBO_TAG_MAP, GUMBO_TAG_LAST, GUMBO_TAG_PARAM, GUMBO_TAG_LAST, + GUMBO_TAG_LAST, GUMBO_TAG_NOBR, GUMBO_TAG_P, GUMBO_TAG_SPAN, GUMBO_TAG_EM, + GUMBO_TAG_LAST, GUMBO_TAG_NOFRAMES, GUMBO_TAG_SECTION, GUMBO_TAG_NOEMBED, + GUMBO_TAG_NEXTID, GUMBO_TAG_FOOTER, GUMBO_TAG_NOSCRIPT, GUMBO_TAG_HR, + GUMBO_TAG_LAST, GUMBO_TAG_FONT, GUMBO_TAG_DL, GUMBO_TAG_TR, + GUMBO_TAG_SCRIPT, GUMBO_TAG_MO, GUMBO_TAG_LAST, GUMBO_TAG_DD, + GUMBO_TAG_MAIN, GUMBO_TAG_TD, GUMBO_TAG_FOREIGNOBJECT, GUMBO_TAG_FORM, + GUMBO_TAG_OBJECT, GUMBO_TAG_LAST, GUMBO_TAG_FIELDSET, GUMBO_TAG_LAST, + GUMBO_TAG_BGSOUND, GUMBO_TAG_MENU, GUMBO_TAG_TFOOT, GUMBO_TAG_FIGURE, + GUMBO_TAG_RB, GUMBO_TAG_LI, GUMBO_TAG_LISTING, GUMBO_TAG_BASEFONT, + GUMBO_TAG_OPTGROUP, GUMBO_TAG_LAST, GUMBO_TAG_BASE, GUMBO_TAG_ADDRESS, + GUMBO_TAG_MI, GUMBO_TAG_LAST, GUMBO_TAG_PLAINTEXT, GUMBO_TAG_LAST, + GUMBO_TAG_PROGRESS, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, + GUMBO_TAG_ACRONYM, GUMBO_TAG_ARTICLE, GUMBO_TAG_LAST, GUMBO_TAG_PRE, + GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_AREA, + GUMBO_TAG_RT, GUMBO_TAG_LAST, GUMBO_TAG_OPTION, GUMBO_TAG_IMAGE, + GUMBO_TAG_DT, GUMBO_TAG_LAST, GUMBO_TAG_TT, GUMBO_TAG_HTML, GUMBO_TAG_WBR, + GUMBO_TAG_OL, GUMBO_TAG_LAST, GUMBO_TAG_STYLE, GUMBO_TAG_STRIKE, + GUMBO_TAG_SUP, GUMBO_TAG_MULTICOL, GUMBO_TAG_U, GUMBO_TAG_DFN, GUMBO_TAG_UL, + GUMBO_TAG_FIGCAPTION, GUMBO_TAG_MTEXT, GUMBO_TAG_LAST, GUMBO_TAG_VAR, + GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_FRAMESET, GUMBO_TAG_LAST, + GUMBO_TAG_BR, GUMBO_TAG_I, 
GUMBO_TAG_FRAME, GUMBO_TAG_LAST, GUMBO_TAG_DIV, + GUMBO_TAG_LAST, GUMBO_TAG_TH, GUMBO_TAG_MS, GUMBO_TAG_ANNOTATION_XML, + GUMBO_TAG_B, GUMBO_TAG_TBODY, GUMBO_TAG_THEAD, GUMBO_TAG_BIG, + GUMBO_TAG_BLOCKQUOTE, GUMBO_TAG_XMP, GUMBO_TAG_LAST, GUMBO_TAG_KBD, + GUMBO_TAG_LAST, GUMBO_TAG_LINK, GUMBO_TAG_IFRAME, GUMBO_TAG_MARK, + GUMBO_TAG_CENTER, GUMBO_TAG_OUTPUT, GUMBO_TAG_DESC, GUMBO_TAG_CANVAS, + GUMBO_TAG_COL, GUMBO_TAG_MALIGNMARK, GUMBO_TAG_IMG, GUMBO_TAG_ASIDE, + GUMBO_TAG_LAST, GUMBO_TAG_CODE, GUMBO_TAG_LAST, GUMBO_TAG_SUB, GUMBO_TAG_MN, + GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_INS, GUMBO_TAG_AUDIO, + GUMBO_TAG_STRONG, GUMBO_TAG_CITE, GUMBO_TAG_LAST, GUMBO_TAG_LAST, + GUMBO_TAG_LAST, GUMBO_TAG_INPUT, GUMBO_TAG_LAST, GUMBO_TAG_LAST, + GUMBO_TAG_LAST, GUMBO_TAG_NAV, GUMBO_TAG_LAST, GUMBO_TAG_COLGROUP, + GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, + GUMBO_TAG_LAST, GUMBO_TAG_SVG, GUMBO_TAG_KEYGEN, GUMBO_TAG_VIDEO, + GUMBO_TAG_BDO, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, + GUMBO_TAG_LAST, GUMBO_TAG_BODY, GUMBO_TAG_LAST, GUMBO_TAG_Q, GUMBO_TAG_LAST, + GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_TRACK, + GUMBO_TAG_LAST, GUMBO_TAG_BDI, GUMBO_TAG_LAST, GUMBO_TAG_LAST, + GUMBO_TAG_LAST, GUMBO_TAG_CAPTION, GUMBO_TAG_LAST, GUMBO_TAG_LAST, + GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, + GUMBO_TAG_RUBY, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_BUTTON, + GUMBO_TAG_SUMMARY, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, + GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, + GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, + GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, + GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, + GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, + GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, + GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, + GUMBO_TAG_LAST, GUMBO_TAG_RTC, 
GUMBO_TAG_LAST, GUMBO_TAG_LAST, + GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, + GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_BLINK, GUMBO_TAG_LAST, + GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, + GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, + GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, + GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, + GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, + GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, + GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_ISINDEX}; diff --git a/libs/litehtml/src/gumbo/include/gumbo/tag_sizes.h b/libs/litehtml/src/gumbo/include/gumbo/tag_sizes.h new file mode 100644 index 0000000000..7c92de073b --- /dev/null +++ b/libs/litehtml/src/gumbo/include/gumbo/tag_sizes.h @@ -0,0 +1,4 @@ +// Generated via `gentags.py src/tag.in`. +// Do not edit; edit src/tag.in instead. +// clang-format off +4, 4, 5, 4, 4, 4, 5, 6, 8, 8, 4, 7, 7, 3, 5, 2, 2, 2, 2, 2, 2, 6, 6, 6, 7, 1, 2, 3, 10, 2, 2, 2, 2, 2, 2, 6, 10, 4, 3, 1, 2, 6, 5, 1, 4, 1, 3, 4, 4, 4, 4, 3, 4, 3, 3, 3, 1, 1, 1, 4, 4, 2, 2, 3, 3, 4, 2, 3, 3, 3, 5, 3, 6, 5, 6, 5, 5, 5, 6, 5, 6, 3, 4, 4, 2, 2, 2, 2, 5, 6, 10, 14, 3, 13, 4, 5, 7, 8, 3, 5, 5, 5, 2, 2, 2, 4, 8, 6, 5, 5, 6, 6, 8, 8, 6, 8, 6, 6, 8, 5, 7, 7, 4, 8, 6, 7, 7, 3, 5, 8, 8, 7, 7, 3, 6, 7, 9, 2, 6, 8, 3, 5, 6, 4, 7, 8, 4, 6, 2, 3,
\ No newline at end of file diff --git a/libs/litehtml/src/gumbo/include/gumbo/tag_strings.h b/libs/litehtml/src/gumbo/include/gumbo/tag_strings.h new file mode 100644 index 0000000000..6540e2e6ba --- /dev/null +++ b/libs/litehtml/src/gumbo/include/gumbo/tag_strings.h @@ -0,0 +1,153 @@ +// Generated via `gentags.py src/tag.in`. +// Do not edit; edit src/tag.in instead. +// clang-format off +"html", +"head", +"title", +"base", +"link", +"meta", +"style", +"script", +"noscript", +"template", +"body", +"article", +"section", +"nav", +"aside", +"h1", +"h2", +"h3", +"h4", +"h5", +"h6", +"hgroup", +"header", +"footer", +"address", +"p", +"hr", +"pre", +"blockquote", +"ol", +"ul", +"li", +"dl", +"dt", +"dd", +"figure", +"figcaption", +"main", +"div", +"a", +"em", +"strong", +"small", +"s", +"cite", +"q", +"dfn", +"abbr", +"data", +"time", +"code", +"var", +"samp", +"kbd", +"sub", +"sup", +"i", +"b", +"u", +"mark", +"ruby", +"rt", +"rp", +"bdi", +"bdo", +"span", +"br", +"wbr", +"ins", +"del", +"image", +"img", +"iframe", +"embed", +"object", +"param", +"video", +"audio", +"source", +"track", +"canvas", +"map", +"area", +"math", +"mi", +"mo", +"mn", +"ms", +"mtext", +"mglyph", +"malignmark", +"annotation-xml", +"svg", +"foreignobject", +"desc", +"table", +"caption", +"colgroup", +"col", +"tbody", +"thead", +"tfoot", +"tr", +"td", +"th", +"form", +"fieldset", +"legend", +"label", +"input", +"button", +"select", +"datalist", +"optgroup", +"option", +"textarea", +"keygen", +"output", +"progress", +"meter", +"details", +"summary", +"menu", +"menuitem", +"applet", +"acronym", +"bgsound", +"dir", +"frame", +"frameset", +"noframes", +"isindex", +"listing", +"xmp", +"nextid", +"noembed", +"plaintext", +"rb", +"strike", +"basefont", +"big", +"blink", +"center", +"font", +"marquee", +"multicol", +"nobr", +"spacer", +"tt", +"rtc", diff --git a/libs/litehtml/src/gumbo/include/gumbo/token_type.h b/libs/litehtml/src/gumbo/include/gumbo/token_type.h new file mode 100644 index 
0000000000..eeab507869 --- /dev/null +++ b/libs/litehtml/src/gumbo/include/gumbo/token_type.h @@ -0,0 +1,41 @@ +// Copyright 2011 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Author: jdtang@google.com (Jonathan Tang) + +#ifndef GUMBO_TOKEN_TYPE_H_ +#define GUMBO_TOKEN_TYPE_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +// An enum representing the type of token. +typedef enum { + GUMBO_TOKEN_DOCTYPE, + GUMBO_TOKEN_START_TAG, + GUMBO_TOKEN_END_TAG, + GUMBO_TOKEN_COMMENT, + GUMBO_TOKEN_WHITESPACE, + GUMBO_TOKEN_CHARACTER, + GUMBO_TOKEN_CDATA, + GUMBO_TOKEN_NULL, + GUMBO_TOKEN_EOF +} GumboTokenType; + +#ifdef __cplusplus +} // extern C +#endif + +#endif // GUMBO_TOKEN_TYPE_H_ diff --git a/libs/litehtml/src/gumbo/include/gumbo/tokenizer.h b/libs/litehtml/src/gumbo/include/gumbo/tokenizer.h new file mode 100644 index 0000000000..1e2a2ca730 --- /dev/null +++ b/libs/litehtml/src/gumbo/include/gumbo/tokenizer.h @@ -0,0 +1,123 @@ +// Copyright 2010 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Author: jdtang@google.com (Jonathan Tang) +// +// This contains an implementation of a tokenizer for HTML5. It consumes a +// buffer of UTF-8 characters, and then emits a stream of tokens. + +#ifndef GUMBO_TOKENIZER_H_ +#define GUMBO_TOKENIZER_H_ + +#include <stdbool.h> +#include <stddef.h> + +#include "gumbo.h" +#include "token_type.h" +#include "tokenizer_states.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct GumboInternalParser; + +// Struct containing all information pertaining to doctype tokens. +typedef struct GumboInternalTokenDocType { + const char* name; + const char* public_identifier; + const char* system_identifier; + bool force_quirks; + // There's no way to tell a 0-length public or system ID apart from the + // absence of a public or system ID, but they're handled different by the + // spec, so we need bool flags for them. + bool has_public_identifier; + bool has_system_identifier; +} GumboTokenDocType; + +// Struct containing all information pertaining to start tag tokens. +typedef struct GumboInternalTokenStartTag { + GumboTag tag; + GumboVector /* GumboAttribute */ attributes; + bool is_self_closing; +} GumboTokenStartTag; + +// A data structure representing a single token in the input stream. This +// contains an enum for the type, the source position, a GumboStringPiece +// pointing to the original text, and then a union for any parsed data. 
+typedef struct GumboInternalToken { + GumboTokenType type; + GumboSourcePosition position; + GumboStringPiece original_text; + union { + GumboTokenDocType doc_type; + GumboTokenStartTag start_tag; + GumboTag end_tag; + const char* text; // For comments. + int character; // For character, whitespace, null, and EOF tokens. + } v; +} GumboToken; + +// Initializes the tokenizer state within the GumboParser object, setting up a +// parse of the specified text. +void gumbo_tokenizer_state_init( + struct GumboInternalParser* parser, const char* text, size_t text_length); + +// Destroys the tokenizer state within the GumboParser object, freeing any +// dynamically-allocated structures within it. +void gumbo_tokenizer_state_destroy(struct GumboInternalParser* parser); + +// Sets the tokenizer state to the specified value. This is needed by some +// parser states, which alter the state of the tokenizer in response to tags +// seen. +void gumbo_tokenizer_set_state( + struct GumboInternalParser* parser, GumboTokenizerEnum state); + +// Flags whether the current node is a foreign content element. This is +// necessary for the markup declaration open state, where the tokenizer must be +// aware of the state of the parser to properly tokenize bad comment tags. +// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#markup-declaration-open-state +void gumbo_tokenizer_set_is_current_node_foreign( + struct GumboInternalParser* parser, bool is_foreign); + +// Lexes a single token from the specified buffer, filling the output with the +// parsed GumboToken data structure. Returns true for a successful +// tokenization, false if a parse error occurs. +// +// Example: +// struct GumboInternalParser parser; +// GumboToken output; +// gumbo_tokenizer_state_init(&parser, text, strlen(text)); +// while (gumbo_lex(&parser, &output)) { +// ...do stuff with output. 
+// gumbo_token_destroy(&parser, &output); +// } +// gumbo_tokenizer_state_destroy(&parser); +bool gumbo_lex(struct GumboInternalParser* parser, GumboToken* output); + +// Frees the internally-allocated pointers within an GumboToken. Note that this +// doesn't free the token itself, since oftentimes it will be allocated on the +// stack. A simple call to free() (or GumboParser->deallocator, if +// appropriate) can handle that. +// +// Note that if you are handing over ownership of the internal strings to some +// other data structure - for example, a parse tree - these do not need to be +// freed. +void gumbo_token_destroy(struct GumboInternalParser* parser, GumboToken* token); + +#ifdef __cplusplus +} +#endif + +#endif // GUMBO_TOKENIZER_H_ diff --git a/libs/litehtml/src/gumbo/include/gumbo/tokenizer_states.h b/libs/litehtml/src/gumbo/include/gumbo/tokenizer_states.h new file mode 100644 index 0000000000..80659f5f1a --- /dev/null +++ b/libs/litehtml/src/gumbo/include/gumbo/tokenizer_states.h @@ -0,0 +1,103 @@ +// Copyright 2011 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Author: jdtang@google.com (Jonathan Tang) +// +// This contains the list of states used in the tokenizer. Although at first +// glance it seems like these could be kept internal to the tokenizer, several +// of the actions in the parser require that it reach into the tokenizer and +// reset the tokenizer state. 
For that to work, it needs to have the +// definitions of individual states available. +// +// This may also be useful for providing more detailed error messages for parse +// errors, as we can match up states and inputs in a table without having to +// clutter the tokenizer code with lots of precise error messages. + +#ifndef GUMBO_TOKENIZER_STATES_H_ +#define GUMBO_TOKENIZER_STATES_H_ + +// The ordering of this enum is also used to build the dispatch table for the +// tokenizer state machine, so if it is changed, update that table as well. +typedef enum { + GUMBO_LEX_DATA, + GUMBO_LEX_CHAR_REF_IN_DATA, + GUMBO_LEX_RCDATA, + GUMBO_LEX_CHAR_REF_IN_RCDATA, + GUMBO_LEX_RAWTEXT, + GUMBO_LEX_SCRIPT, + GUMBO_LEX_PLAINTEXT, + GUMBO_LEX_TAG_OPEN, + GUMBO_LEX_END_TAG_OPEN, + GUMBO_LEX_TAG_NAME, + GUMBO_LEX_RCDATA_LT, + GUMBO_LEX_RCDATA_END_TAG_OPEN, + GUMBO_LEX_RCDATA_END_TAG_NAME, + GUMBO_LEX_RAWTEXT_LT, + GUMBO_LEX_RAWTEXT_END_TAG_OPEN, + GUMBO_LEX_RAWTEXT_END_TAG_NAME, + GUMBO_LEX_SCRIPT_LT, + GUMBO_LEX_SCRIPT_END_TAG_OPEN, + GUMBO_LEX_SCRIPT_END_TAG_NAME, + GUMBO_LEX_SCRIPT_ESCAPED_START, + GUMBO_LEX_SCRIPT_ESCAPED_START_DASH, + GUMBO_LEX_SCRIPT_ESCAPED, + GUMBO_LEX_SCRIPT_ESCAPED_DASH, + GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH, + GUMBO_LEX_SCRIPT_ESCAPED_LT, + GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_OPEN, + GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_NAME, + GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_START, + GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED, + GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH, + GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH_DASH, + GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT, + GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_END, + GUMBO_LEX_BEFORE_ATTR_NAME, + GUMBO_LEX_ATTR_NAME, + GUMBO_LEX_AFTER_ATTR_NAME, + GUMBO_LEX_BEFORE_ATTR_VALUE, + GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED, + GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED, + GUMBO_LEX_ATTR_VALUE_UNQUOTED, + GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE, + GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED, + GUMBO_LEX_SELF_CLOSING_START_TAG, + GUMBO_LEX_BOGUS_COMMENT, + GUMBO_LEX_MARKUP_DECLARATION, +
GUMBO_LEX_COMMENT_START, + GUMBO_LEX_COMMENT_START_DASH, + GUMBO_LEX_COMMENT, + GUMBO_LEX_COMMENT_END_DASH, + GUMBO_LEX_COMMENT_END, + GUMBO_LEX_COMMENT_END_BANG, + GUMBO_LEX_DOCTYPE, + GUMBO_LEX_BEFORE_DOCTYPE_NAME, + GUMBO_LEX_DOCTYPE_NAME, + GUMBO_LEX_AFTER_DOCTYPE_NAME, + GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD, + GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID, + GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED, + GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED, + GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID, + GUMBO_LEX_BETWEEN_DOCTYPE_PUBLIC_SYSTEM_ID, + GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD, + GUMBO_LEX_BEFORE_DOCTYPE_SYSTEM_ID, + GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED, + GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED, + GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID, + GUMBO_LEX_BOGUS_DOCTYPE, + GUMBO_LEX_CDATA +} GumboTokenizerEnum; + +#endif // GUMBO_TOKENIZER_STATES_H_ diff --git a/libs/litehtml/src/gumbo/include/gumbo/utf8.h b/libs/litehtml/src/gumbo/include/gumbo/utf8.h new file mode 100644 index 0000000000..ee852abfba --- /dev/null +++ b/libs/litehtml/src/gumbo/include/gumbo/utf8.h @@ -0,0 +1,132 @@ +// Copyright 2010 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Author: jdtang@google.com (Jonathan Tang) +// +// This contains an implementation of a UTF-8 iterator and decoder suitable for +// an HTML5 parser. This does a bit more than straight UTF-8 decoding. The +// HTML5 spec specifies that: +// 1. Decoding errors are parse errors. +// 2.
Certain other codepoints (eg. control characters) are parse errors. +// 3. Carriage returns and CR/LF groups are converted to line feeds. +// http://www.whatwg.org/specs/web-apps/current-work/multipage/infrastructure.html#decoded-as-utf-8,-with-error-handling +// +// Also, we want to keep track of source positions for error handling. As a +// result, we fold all that functionality into this decoder, and can't use an +// off-the-shelf library. +// +// This header is internal-only, which is why we prefix functions with only +// utf8_ or utf8iterator_ instead of gumbo_utf8_. + +#ifndef GUMBO_UTF8_H_ +#define GUMBO_UTF8_H_ + +#include <stdbool.h> +#include <stddef.h> + +#include "gumbo.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct GumboInternalError; +struct GumboInternalParser; + +// Unicode replacement char. +extern const int kUtf8ReplacementChar; + +typedef struct GumboInternalUtf8Iterator { + // Points at the start of the code point most recently read into '_current'. + const char* _start; + + // Points at the mark. The mark is initially set to the beginning of the + // input. + const char* _mark; + + // Points past the end of the iter, like a past-the-end iterator in the STL. + const char* _end; + + // The code point under the cursor. + int _current; + + // The width in bytes of the current code point. + ptrdiff_t _width; + + // The SourcePosition for the current location. + GumboSourcePosition _pos; + + // The SourcePosition for the mark. + GumboSourcePosition _mark_pos; + + // Pointer back to the GumboParser instance, for configuration options and + // error recording. + struct GumboInternalParser* _parser; +} Utf8Iterator; + +// Returns true if this Unicode code point is in the list of characters +// forbidden by the HTML5 spec, such as NUL bytes and undefined control chars. +bool utf8_is_invalid_code_point(int c); + +// Initializes a new Utf8Iterator from the given byte buffer.
The source does +// not have to be NUL-terminated, but the length must be passed in explicitly. +void utf8iterator_init(struct GumboInternalParser* parser, const char* source, + size_t source_length, Utf8Iterator* iter); + +// Advances the current position by one code point. +void utf8iterator_next(Utf8Iterator* iter); + +// Returns the current code point as an integer. +int utf8iterator_current(const Utf8Iterator* iter); + +// Retrieves and fills the output parameter with the current source position. +void utf8iterator_get_position( + const Utf8Iterator* iter, GumboSourcePosition* output); + +// Retrieves a character pointer to the start of the current character. +const char* utf8iterator_get_char_pointer(const Utf8Iterator* iter); + +// Retrieves a character pointer to 1 past the end of the buffer. This is +// necessary for certain state machines and string comparisons that would like +// to look directly for ASCII text in the buffer without going through the +// decoder. +const char* utf8iterator_get_end_pointer(const Utf8Iterator* iter); + +// If the upcoming text in the buffer matches the specified prefix (which has +// length 'length'), consume it and return true. Otherwise, return false with +// no other effects. If the length of the string would overflow the buffer, +// this returns false. Note that prefix should not contain NUL bytes because +// of the use of strncmp/strncasecmp internally. All existing use-cases adhere +// to this. +bool utf8iterator_maybe_consume_match( + Utf8Iterator* iter, const char* prefix, size_t length, bool case_sensitive); + +// "Marks" a particular location of interest in the input stream, so that it can +// later be reset() to. There's also the ability to record an error at the +// point that was marked, as oftentimes that's more useful than the last +// character before the error was detected. +void utf8iterator_mark(Utf8Iterator* iter); + +// Returns the current input stream position to the mark.
+void utf8iterator_reset(Utf8Iterator* iter); + +// Sets the position and original text fields of an error to the values at the +// mark. +void utf8iterator_fill_error_at_mark( + Utf8Iterator* iter, struct GumboInternalError* error); + +#ifdef __cplusplus +} +#endif +#endif // GUMBO_UTF8_H_ diff --git a/libs/litehtml/src/gumbo/include/gumbo/util.h b/libs/litehtml/src/gumbo/include/gumbo/util.h new file mode 100644 index 0000000000..98a7d1c466 --- /dev/null +++ b/libs/litehtml/src/gumbo/include/gumbo/util.h @@ -0,0 +1,62 @@ +// Copyright 2010 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Author: jdtang@google.com (Jonathan Tang) +// +// This contains some utility functions that didn't fit into any of the other +// headers. + +#ifndef GUMBO_UTIL_H_ +#define GUMBO_UTIL_H_ +#ifdef _MSC_VER +#ifndef _CRT_SECURE_NO_WARNINGS +#define _CRT_SECURE_NO_WARNINGS +#endif +#endif +#include <stdbool.h> +#include <stddef.h> + +#ifdef __cplusplus +extern "C" { +#endif + +// Forward declaration since it's passed into some of the functions in this +// header. +struct GumboInternalParser; + +// Utility function for allocating & copying a null-terminated string into a +// freshly-allocated buffer. This is necessary for proper memory management; we +// have the convention that all const char* in parse tree structures are +// freshly-allocated, so if we didn't copy, we'd try to delete a literal string +// when the parse tree is destroyed.
+char* gumbo_copy_stringz(struct GumboInternalParser* parser, const char* str); + +// Allocate a chunk of memory, using the allocator specified in the Parser's +// config options. +void* gumbo_parser_allocate( + struct GumboInternalParser* parser, size_t num_bytes); + +// Deallocate a chunk of memory, using the deallocator specified in the Parser's +// config options. +void gumbo_parser_deallocate(struct GumboInternalParser* parser, void* ptr); + +// Debug wrapper for printf, to make it easier to turn off debugging info when +// required. +void gumbo_debug(const char* format, ...); + +#ifdef __cplusplus +} +#endif + +#endif // GUMBO_UTIL_H_ diff --git a/libs/litehtml/src/gumbo/include/gumbo/vector.h b/libs/litehtml/src/gumbo/include/gumbo/vector.h new file mode 100644 index 0000000000..70fe6fa689 --- /dev/null +++ b/libs/litehtml/src/gumbo/include/gumbo/vector.h @@ -0,0 +1,67 @@ +// Copyright 2010 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Author: jdtang@google.com (Jonathan Tang) + +#ifndef GUMBO_VECTOR_H_ +#define GUMBO_VECTOR_H_ + +#include "gumbo.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// Forward declaration since it's passed into some of the functions in this +// header. +struct GumboInternalParser; + +// Initializes a new GumboVector with the specified initial capacity.
+void gumbo_vector_init(struct GumboInternalParser* parser, + size_t initial_capacity, GumboVector* vector); + +// Frees the memory used by a GumboVector. Does not free the contained +// pointers. +void gumbo_vector_destroy( + struct GumboInternalParser* parser, GumboVector* vector); + +// Adds a new element to a GumboVector. +void gumbo_vector_add( + struct GumboInternalParser* parser, void* element, GumboVector* vector); + +// Removes and returns the element most recently added to the GumboVector. +// Ownership is transferred to caller. Capacity is unchanged. If the vector is +// empty, NULL is returned. +void* gumbo_vector_pop(struct GumboInternalParser* parser, GumboVector* vector); + +// Inserts an element at a specific index. This is potentially O(N) time, but +// is necessary for some of the spec's behavior. +void gumbo_vector_insert_at(struct GumboInternalParser* parser, void* element, + unsigned int index, GumboVector* vector); + +// Removes an element from the vector, or does nothing if the element is not in +// the vector. +void gumbo_vector_remove( + struct GumboInternalParser* parser, void* element, GumboVector* vector); + +// Removes and returns an element at a specific index. Note that this is +// potentially O(N) time and should be used sparingly. +void* gumbo_vector_remove_at(struct GumboInternalParser* parser, + unsigned int index, GumboVector* vector); + +#ifdef __cplusplus +} +#endif + +#endif // GUMBO_VECTOR_H_ |