summaryrefslogtreecommitdiff
path: root/libs/litehtml/src/gumbo/tokenizer.c
diff options
context:
space:
mode:
authorGeorge Hazan <george.hazan@gmail.com>2024-03-18 12:13:54 +0300
committerGeorge Hazan <george.hazan@gmail.com>2024-03-18 12:13:54 +0300
commit705c4d24c9c61edffc82864bf9c24384dc29a8d7 (patch)
tree4d21f87671db36b99402da3221d45b64c257c1fe /libs/litehtml/src/gumbo/tokenizer.c
parent5784fc3a62b9136c6690ed45ec7b505f35512e08 (diff)
litehtml - lightweight html renderer
Diffstat (limited to 'libs/litehtml/src/gumbo/tokenizer.c')
-rw-r--r--libs/litehtml/src/gumbo/tokenizer.c2897
1 files changed, 2897 insertions, 0 deletions
diff --git a/libs/litehtml/src/gumbo/tokenizer.c b/libs/litehtml/src/gumbo/tokenizer.c
new file mode 100644
index 0000000000..0d0ea0f241
--- /dev/null
+++ b/libs/litehtml/src/gumbo/tokenizer.c
@@ -0,0 +1,2897 @@
+// Copyright 2010 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Author: jdtang@google.com (Jonathan Tang)
+//
+// Coding conventions specific to this file:
+//
+// 1. Functions that fill in a token should be named emit_*, and should be
+// followed immediately by a return from the tokenizer (true if no error
+// occurred, false if an error occurred). Sometimes the emit functions
+// themselves return a boolean so that they can be combined with the return
+// statement; in this case, they should match this convention.
+// 2. Functions that shuffle data from temporaries to final API structures
+// should be named finish_*, and be called just before the tokenizer exits the
+// state that accumulates the temporary.
+// 3. All internal data structures should be kept in an initialized state from
+// tokenizer creation onwards, ready to accept input. When a buffer's flushed
+// and reset, it should be deallocated and immediately reinitialized.
+// 4. Make sure there are appropriate break statements following each state.
+// 5. Assertions on the state of the temporary and tag buffers are usually a
+// good idea, and should go at the entry point of each state when added.
+// 6. Statement order within states goes:
+//    1. Add parse errors, if appropriate.
+//    2. Call finish_* functions to build up tag state.
+//    3. Switch to new state. Set _reconsume flag if appropriate.
+//    4. Perform any other temporary buffer manipulation.
+//    5. Emit tokens.
+//    6. Return/break.
+// This order ensures that we can verify that every emit is followed by a
+// return, ensures that the correct state is recorded with any parse errors, and
+// prevents parse error position from being messed up by possible mark/resets in
+// temporary buffer manipulation.
+
+#include "tokenizer.h"
+
+#include <assert.h>
+#include <stdbool.h>
+#include <string.h>
+
+#include "attribute.h"
+#include "char_ref.h"
+#include "error.h"
+#include "gumbo.h"
+#include "parser.h"
+#include "string_buffer.h"
+#include "string_piece.h"
+#include "token_type.h"
+#include "tokenizer_states.h"
+#include "utf8.h"
+#include "util.h"
+#include "vector.h"
+
+// The literal tag name "script" as a GumboStringPiece; 6 is the number of
+// characters in "script". Compared against _script_data_buffer to determine if
+// we're in double-escaped script mode.
+const GumboStringPiece kScriptTag = {"script", 6};
+
+// Outcome reported by each individual state handler. The numeric values are
+// spelled out because emit_current_char hands RETURN_SUCCESS back through a
+// bool return type, which relies on RETURN_SUCCESS being non-zero.
+typedef enum {
+  // Return false (error) from the tokenizer.
+  RETURN_ERROR = 0,
+  // Return true (success) from the tokenizer.
+  RETURN_SUCCESS = 1,
+  // Proceed to the next character and continue lexing.
+  NEXT_CHAR = 2
+} StateResult;
+
+// This is a struct containing state necessary to build up a tag token,
+// character by character.
+typedef struct GumboInternalTagState {
+ // A buffer to accumulate characters for various GumboStringPiece fields.
+ GumboStringBuffer _buffer;
+
+ // A pointer to the start of the original text corresponding to the contents
+ // of the buffer.
+ const char* _original_text;
+
+ // The current tag enum, computed once the tag name state has finished so that
+ // the buffer can be re-used for building up attributes.
+ GumboTag _tag;
+
+ // The starting location of the text in the buffer.
+ GumboSourcePosition _start_pos;
+
+ // The current list of attributes. This is copied (and ownership of its data
+ // transferred) to the GumboStartTag token upon completion of the tag. New
+ // attributes are added as soon as their attribute name state is complete, and
+ // values are filled in by operating on
+ // _attributes.data[_attributes.length - 1].
+ GumboVector /* GumboAttribute */ _attributes;
+
+ // If true, the next attribute value to be finished should be dropped. This
+ // happens if a duplicate attribute name is encountered - we want to consume
+ // the attribute value, but shouldn't overwrite the existing value.
+ bool _drop_next_attr_value;
+
+ // The state that caused the tokenizer to switch into a character reference in
+ // attribute value state. This is used to set the additional allowed
+ // character, and is switched back to on completion. Initialized as the
+ // tokenizer enters the character reference state.
+ GumboTokenizerEnum _attr_value_state;
+
+ // The last start tag to have been emitted by the tokenizer. This is
+ // necessary to check for appropriate end tags.
+ GumboTag _last_start_tag;
+
+ // If true, then this is a start tag. If false, it's an end tag. This is
+ // necessary to generate the appropriate token type at tag-closing time.
+ bool _is_start_tag;
+
+ // If true, then this tag is "self-closing" and doesn't have an end tag.
+ bool _is_self_closing;
+} GumboTagState;
+
+// This is the main tokenizer state struct, containing all state used in
+// tokenizing the input stream.
+typedef struct GumboInternalTokenizerState {
+ // The current lexer state. Starts in GUMBO_LEX_DATA.
+ GumboTokenizerEnum _state;
+
+ // A flag indicating whether the current input character needs to be
+ // reconsumed in another state, or whether the next input character should be
+ // read for the next iteration of the state loop. This is set when the spec
+ // reads "Reconsume the current input character in..."
+ bool _reconsume_current_input;
+
+ // A flag indicating whether the current node is a foreign element. This is
+ // set by gumbo_tokenizer_set_is_current_node_foreign and checked in the
+ // markup declaration state.
+ bool _is_current_node_foreign;
+
+ // A flag indicating whether the tokenizer is in a CDATA section. If so, then
+ // text tokens emitted will be GUMBO_TOKEN_CDATA.
+ bool _is_in_cdata;
+
+ // Certain states (notably character references) may emit two character tokens
+ // at once, but the contract for lex() fills in only one token at a time. The
+ // extra character is buffered here, and then this is checked on entry to
+ // lex(). If a character is stored here, it's immediately emitted and control
+ // returns from the lexer. kGumboNoChar is used to represent 'no character
+ // stored.'
+ //
+ // Note that characters emitted through this mechanism will have their source
+ // position marked as the character under the mark, i.e. multiple characters
+ // may be emitted with the same position. This is desirable for character
+ // references, but unsuitable for many other cases. Use the _temporary_buffer
+ // mechanism if the buffered characters must have their original positions in
+ // the document.
+ int _buffered_emit_char;
+
+ // A temporary buffer to accumulate characters, as described by the "temporary
+ // buffer" phrase in the tokenizer spec. We use this in a somewhat unorthodox
+ // way: we record the specific character to go into the buffer, which may
+ // sometimes be a lowercased version of the actual input character. However,
+ // we *also* use utf8iterator_mark() to record the position at tag start.
+ // When we start flushing the temporary buffer, we set _temporary_buffer_emit
+ // to the start of it, and then increment it for each call to the tokenizer.
+ // We also call utf8iterator_reset(), and utf8iterator_next() through the
+ // input stream, so that tokens emitted by emit_char have the correct position
+ // and original text.
+ GumboStringBuffer _temporary_buffer;
+
+ // The current cursor position we're emitting from within
+ // _temporary_buffer.data. NULL whenever we're not flushing the buffer.
+ const char* _temporary_buffer_emit;
+
+ // The temporary buffer is also used by the spec to check whether we should
+ // enter the script data double escaped state, but we can't use the same
+ // buffer for both because we have to flush out "<s" as emits while still
+ // maintaining the context that will eventually become "script". This is a
+ // separate buffer that's used in place of the temporary buffer for states
+ // that may enter the script data double escape start state.
+ GumboStringBuffer _script_data_buffer;
+
+ // Pointer to the beginning of the current token in the original buffer; used
+ // to record the original text.
+ const char* _token_start;
+
+ // GumboSourcePosition recording the source location of the start of the
+ // current token.
+ GumboSourcePosition _token_start_pos;
+
+ // Current tag state.
+ GumboTagState _tag_state;
+
+ // Doctype state. We use the temporary buffer to accumulate characters (it's
+ // not used for anything else in the doctype states), and then freshly
+ // allocate the strings in the doctype token, then copy it over on emit.
+ GumboTokenDocType _doc_type_state;
+
+ // The UTF8Iterator over the tokenizer input.
+ Utf8Iterator _input;
+} GumboTokenizerState;
+
+// Adds a tokenizer parse error of the given type to the parser's error list.
+// The error records the iterator's current position, a pointer to the current
+// input character, the current code point, and a coarse error-state category
+// derived from the lexer state. If gumbo_add_error returns NULL, nothing is
+// recorded.
+static void tokenizer_add_parse_error(
+ GumboParser* parser, GumboErrorType type) {
+ GumboError* error = gumbo_add_error(parser);
+ if (!error) {
+ return;
+ }
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
+ utf8iterator_get_position(&tokenizer->_input, &error->position);
+ error->original_text = utf8iterator_get_char_pointer(&tokenizer->_input);
+ error->type = type;
+ error->v.tokenizer.codepoint = utf8iterator_current(&tokenizer->_input);
+ // Collapse the fine-grained lexer state into one GUMBO_ERR_TOKENIZER_*
+ // category. There is no default case, so a lexer state that is not listed
+ // below leaves error->v.tokenizer.state unassigned by this function.
+ switch (tokenizer->_state) {
+ case GUMBO_LEX_DATA:
+ error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_DATA;
+ break;
+ case GUMBO_LEX_CHAR_REF_IN_DATA:
+ case GUMBO_LEX_CHAR_REF_IN_RCDATA:
+ case GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE:
+ error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_CHAR_REF;
+ break;
+ case GUMBO_LEX_RCDATA:
+ case GUMBO_LEX_RCDATA_LT:
+ case GUMBO_LEX_RCDATA_END_TAG_OPEN:
+ case GUMBO_LEX_RCDATA_END_TAG_NAME:
+ error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_RCDATA;
+ break;
+ case GUMBO_LEX_RAWTEXT:
+ case GUMBO_LEX_RAWTEXT_LT:
+ case GUMBO_LEX_RAWTEXT_END_TAG_OPEN:
+ case GUMBO_LEX_RAWTEXT_END_TAG_NAME:
+ error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_RAWTEXT;
+ break;
+ case GUMBO_LEX_PLAINTEXT:
+ error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_PLAINTEXT;
+ break;
+ case GUMBO_LEX_SCRIPT:
+ case GUMBO_LEX_SCRIPT_LT:
+ case GUMBO_LEX_SCRIPT_END_TAG_OPEN:
+ case GUMBO_LEX_SCRIPT_END_TAG_NAME:
+ case GUMBO_LEX_SCRIPT_ESCAPED_START:
+ case GUMBO_LEX_SCRIPT_ESCAPED_START_DASH:
+ case GUMBO_LEX_SCRIPT_ESCAPED:
+ case GUMBO_LEX_SCRIPT_ESCAPED_DASH:
+ case GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH:
+ case GUMBO_LEX_SCRIPT_ESCAPED_LT:
+ case GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_OPEN:
+ case GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_NAME:
+ case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_START:
+ case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED:
+ case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH:
+ case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH_DASH:
+ case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT:
+ case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_END:
+ error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_SCRIPT;
+ break;
+ case GUMBO_LEX_TAG_OPEN:
+ case GUMBO_LEX_END_TAG_OPEN:
+ case GUMBO_LEX_TAG_NAME:
+ case GUMBO_LEX_BEFORE_ATTR_NAME:
+ error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_TAG;
+ break;
+ case GUMBO_LEX_SELF_CLOSING_START_TAG:
+ error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_SELF_CLOSING_TAG;
+ break;
+ case GUMBO_LEX_ATTR_NAME:
+ case GUMBO_LEX_AFTER_ATTR_NAME:
+ case GUMBO_LEX_BEFORE_ATTR_VALUE:
+ error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_ATTR_NAME;
+ break;
+ case GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED:
+ case GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED:
+ case GUMBO_LEX_ATTR_VALUE_UNQUOTED:
+ case GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED:
+ error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_ATTR_VALUE;
+ break;
+ case GUMBO_LEX_BOGUS_COMMENT:
+ case GUMBO_LEX_COMMENT_START:
+ case GUMBO_LEX_COMMENT_START_DASH:
+ case GUMBO_LEX_COMMENT:
+ case GUMBO_LEX_COMMENT_END_DASH:
+ case GUMBO_LEX_COMMENT_END:
+ case GUMBO_LEX_COMMENT_END_BANG:
+ error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_COMMENT;
+ break;
+ case GUMBO_LEX_MARKUP_DECLARATION:
+ case GUMBO_LEX_DOCTYPE:
+ case GUMBO_LEX_BEFORE_DOCTYPE_NAME:
+ case GUMBO_LEX_DOCTYPE_NAME:
+ case GUMBO_LEX_AFTER_DOCTYPE_NAME:
+ case GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD:
+ case GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID:
+ case GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED:
+ case GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED:
+ case GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID:
+ case GUMBO_LEX_BETWEEN_DOCTYPE_PUBLIC_SYSTEM_ID:
+ case GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD:
+ case GUMBO_LEX_BEFORE_DOCTYPE_SYSTEM_ID:
+ case GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED:
+ case GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED:
+ case GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID:
+ case GUMBO_LEX_BOGUS_DOCTYPE:
+ error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_DOCTYPE;
+ break;
+ case GUMBO_LEX_CDATA:
+ error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_CDATA;
+ break;
+ }
+}
+
+// Returns true iff c is one of the ASCII letters A-Z or a-z.
+// The ISO C isupper/islower functions are deliberately avoided: they depend
+// on the program's locale, whereas the HTML5 spec's behavior does not.
+static bool is_alpha(int c) {
+  const bool is_upper = 'A' <= c && c <= 'Z';
+  const bool is_lower = 'a' <= c && c <= 'z';
+  return is_upper || is_lower;
+}
+
+// Maps the ASCII uppercase letters A-Z to their lowercase counterparts by
+// adding 0x20; every other value is returned unchanged.
+static int ensure_lowercase(int c) {
+  if (c < 'A' || c > 'Z') {
+    return c;
+  }
+  return c + 0x20;
+}
+
+// Classifies a code point into the token type used when it is emitted as a
+// character token. Inside a CDATA section every positive code point is
+// GUMBO_TOKEN_CDATA; otherwise -1 maps to EOF, 0 to NULL, the five HTML
+// whitespace characters to WHITESPACE, and anything else to CHARACTER.
+static GumboTokenType get_char_token_type(bool is_in_cdata, int c) {
+  if (c > 0 && is_in_cdata) {
+    return GUMBO_TOKEN_CDATA;
+  }
+  if (c == -1) {
+    return GUMBO_TOKEN_EOF;
+  }
+  if (c == 0) {
+    gumbo_debug("Emitted null byte.\n");
+    return GUMBO_TOKEN_NULL;
+  }
+  if (c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == ' ') {
+    return GUMBO_TOKEN_WHITESPACE;
+  }
+  return GUMBO_TOKEN_CHARACTER;
+}
+
+// Begins a fresh recording in the temporary buffer: marks the current input
+// position and empties both the temporary buffer and the script data buffer.
+//
+// Because the iterator mark must sit at the beginning of the text that may
+// later be re-emitted, this has to be called a couple of states earlier than
+// the spec's "Set the temporary buffer to the empty string" -- in general, on
+// every transition into a "less-than sign" state. As a consequence the
+// leading < (and possibly /) are appended to the temporary buffer, so states
+// that compare the buffer against a literal, and spec stanzas that say "emit
+// a < and / character token along with a character token for each character
+// in the temporary buffer", must account for those extra characters.
+static void clear_temporary_buffer(GumboParser* parser) {
+  GumboTokenizerState* state = parser->_tokenizer_state;
+  // Must not be called while the buffer is being flushed.
+  assert(state->_temporary_buffer_emit == NULL);
+
+  utf8iterator_mark(&state->_input);
+  gumbo_string_buffer_clear(parser, &state->_temporary_buffer);
+  // The spec treats the temporary buffer and the script data buffer as one
+  // object, so the latter is emptied too.
+  gumbo_string_buffer_clear(parser, &state->_script_data_buffer);
+}
+
+// Adds one codepoint to the end of the tokenizer's temporary buffer.
+static void append_char_to_temporary_buffer(
+    GumboParser* parser, int codepoint) {
+  GumboStringBuffer* tmp = &parser->_tokenizer_state->_temporary_buffer;
+  gumbo_string_buffer_append_codepoint(parser, codepoint, tmp);
+}
+
+// Returns true iff the temporary buffer holds exactly the bytes of the
+// NUL-terminated string `text`. Only compiled when assertions are enabled;
+// it must stay free of side effects because it is used inside assert().
+#ifndef NDEBUG
+static bool temporary_buffer_equals(GumboParser* parser, const char* text) {
+  const GumboStringBuffer* tmp = &parser->_tokenizer_state->_temporary_buffer;
+  // TODO(jdtang): See if the extra strlen is a performance problem, and
+  // replace it with an explicit sizeof(literal) if necessary. It is only used
+  // in a couple of rare states.
+  const size_t expected_len = strlen(text);
+  if (tmp->length != expected_len) {
+    return false;
+  }
+  return memcmp(tmp->data, text, expected_len) == 0;
+}
+#endif
+
+// Resets the tokenizer's doctype scratch state: all flags false, all strings
+// NULL.
+static void doc_type_state_init(GumboParser* parser) {
+  GumboTokenDocType* doctype = &parser->_tokenizer_state->_doc_type_state;
+
+  doctype->force_quirks = false;
+  doctype->has_public_identifier = false;
+  doctype->has_system_identifier = false;
+
+  // The strings start out NULL so that nothing is leaked if no doctype token
+  // is ever seen. Once a doctype is seen they are replaced by freshly
+  // allocated (possibly empty) strings, giving client code a uniform interface
+  // with no NULL checks; ownership moves to the doctype token on emit.
+  doctype->name = NULL;
+  doctype->public_identifier = NULL;
+  doctype->system_identifier = NULL;
+}
+
+// Makes the current iterator position the start of the next token, updating
+// both the original-text pointer and the source position. Needed because
+// [CDATA[ sections may include text that the tokenizer ignores.
+static void reset_token_start_point(GumboTokenizerState* tokenizer) {
+  Utf8Iterator* input = &tokenizer->_input;
+  tokenizer->_token_start = utf8iterator_get_char_pointer(input);
+  utf8iterator_get_position(input, &tokenizer->_token_start_pos);
+}
+
+// Points the tag buffer's original-text pointer and start position at the
+// current iterator position. Attribute names and values may be preceded by
+// whitespace, so the start of a new piece cannot be assumed to coincide with
+// the end of the previous use of the tag buffer.
+static void reset_tag_buffer_start_point(GumboParser* parser) {
+  GumboTokenizerState* state = parser->_tokenizer_state;
+  Utf8Iterator* input = &state->_input;
+  GumboTagState* tag = &state->_tag_state;
+
+  utf8iterator_get_position(input, &tag->_start_pos);
+  tag->_original_text = utf8iterator_get_char_pointer(input);
+}
+
+// Converts the temporary buffer's contents to a string stored in *output,
+// then clears the temporary buffer via clear_temporary_buffer.
+static void finish_temporary_buffer(GumboParser* parser, const char** output) {
+  GumboStringBuffer* tmp = &parser->_tokenizer_state->_temporary_buffer;
+  *output = gumbo_string_buffer_to_string(parser, tmp);
+  clear_temporary_buffer(parser);
+}
+
+// Advances the iterator past the end of the token, and then fills in the
+// relevant position fields. It's assumed that after every emit, the tokenizer
+// will immediately return (letting the tree-construction stage read the filled
+// in Token). Thus, it's safe to advance the input stream here, since it will
+// bypass the advance at the bottom of the state machine loop.
+//
+// Since this advances the iterator and resets the current input, make sure to
+// call it after you've recorded any other data you need for the token.
+//
+// On return, token->position and token->original_text describe the span from
+// the previous token start up to the iterator's new position, and the
+// tokenizer's token start has been moved to that new position.
+static void finish_token(GumboParser* parser, GumboToken* token) {
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
+ // When the current character is flagged for reconsumption it has to remain
+ // the current character for the next call, so the iterator is not advanced.
+ if (!tokenizer->_reconsume_current_input) {
+ utf8iterator_next(&tokenizer->_input);
+ }
+
+ token->position = tokenizer->_token_start_pos;
+ token->original_text.data = tokenizer->_token_start;
+ // reset_token_start_point moves _token_start to the iterator's current
+ // position, so the subtraction below yields the length of the token that
+ // was just finished.
+ reset_token_start_point(tokenizer);
+ token->original_text.length =
+ tokenizer->_token_start - token->original_text.data;
+ if (token->original_text.length > 0 &&
+ token->original_text.data[token->original_text.length - 1] == '\r') {
+ // The UTF8 iterator will ignore carriage returns in the input stream, which
+ // means that the next token may start one past a \r character. The pointer
+ // arithmetic above results in that \r being appended to the original text
+ // of the preceding token, so we have to adjust its length here to chop the
+ // \r off.
+ --token->original_text.length;
+ }
+}
+
+// Stores the temporary buffer's contents as the doctype public identifier,
+// releasing any previously stored identifier first, and sets
+// has_public_identifier to true.
+static void finish_doctype_public_id(GumboParser* parser) {
+  GumboTokenDocType* doctype = &parser->_tokenizer_state->_doc_type_state;
+  void* previous_id = (void*) doctype->public_identifier;
+
+  gumbo_parser_deallocate(parser, previous_id);
+  finish_temporary_buffer(parser, &doctype->public_identifier);
+  doctype->has_public_identifier = true;
+}
+
+// Stores the temporary buffer's contents as the doctype system identifier,
+// releasing any previously stored identifier first, and sets
+// has_system_identifier to true.
+static void finish_doctype_system_id(GumboParser* parser) {
+  GumboTokenDocType* doctype = &parser->_tokenizer_state->_doc_type_state;
+  void* previous_id = (void*) doctype->system_identifier;
+
+  gumbo_parser_deallocate(parser, previous_id);
+  finish_temporary_buffer(parser, &doctype->system_identifier);
+  doctype->has_system_identifier = true;
+}
+
+// Fills `output` with a character-class token for codepoint `c` (the exact
+// token type is chosen by get_char_token_type) and finishes the token.
+static void emit_char(GumboParser* parser, int c, GumboToken* output) {
+  const bool in_cdata = parser->_tokenizer_state->_is_in_cdata;
+
+  output->type = get_char_token_type(in_cdata, c);
+  output->v.character = c;
+  finish_token(parser, output);
+}
+
+// Records a GUMBO_ERR_UTF8_NULL parse error and emits the Unicode replacement
+// character as a character token. The result is always RETURN_ERROR, per the
+// gumbo_lex return-value convention.
+static StateResult emit_replacement_char(
+    GumboParser* parser, GumboToken* output) {
+  const StateResult result = RETURN_ERROR;
+
+  // Every caller reaches this because of a null byte in the input stream.
+  tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
+  emit_char(parser, kUtf8ReplacementChar, output);
+  return result;
+}
+
+// Emits the end-of-file marker (-1) as a character token.
+// The result is always RETURN_SUCCESS.
+static StateResult emit_eof(GumboParser* parser, GumboToken* output) {
+  const int eof_marker = -1;
+  emit_char(parser, eof_marker, output);
+  return RETURN_SUCCESS;
+}
+
+// Writes the current input character out as a character token.
+// Always returns RETURN_SUCCESS.
+//
+// The return type is StateResult, matching emit_eof, emit_replacement_char and
+// the other emit_* helpers. RETURN_SUCCESS has the value 1, so callers that
+// treat the result as a boolean still see "true".
+static StateResult emit_current_char(GumboParser* parser, GumboToken* output) {
+  const int current = utf8iterator_current(&parser->_tokenizer_state->_input);
+  emit_char(parser, current, output);
+  return RETURN_SUCCESS;
+}
+
+// Emits a doctype token by copying the tokenizer's accumulated doctype state
+// into `output`, then resets that state for the next doctype.
+static void emit_doctype(GumboParser* parser, GumboToken* output) {
+  GumboTokenizerState* state = parser->_tokenizer_state;
+
+  output->v.doc_type = state->_doc_type_state;
+  output->type = GUMBO_TOKEN_DOCTYPE;
+  finish_token(parser, output);
+  // The token now holds the copied fields, so the scratch state is
+  // re-initialized rather than freed.
+  doc_type_state_init(parser);
+}
+
+// In builds with assertions enabled, overwrites the attribute vector with
+// kGumboEmptyVector so that start_new_tag can assert the previous tag's
+// attributes were handed off or freed. In NDEBUG builds this does nothing.
+static void mark_tag_state_as_empty(GumboTagState* tag_state) {
+#ifdef NDEBUG
+  (void) tag_state;
+#else
+  tag_state->_attributes = kGumboEmptyVector;
+#endif
+}
+
+// Writes out the current tag as a start or end tag token, chosen by the tag
+// state's _is_start_tag flag. Always returns RETURN_SUCCESS.
+//
+// For a start tag the attribute vector is copied into the token and the tag
+// is remembered as the last start tag. For an end tag the attributes are
+// destroyed here instead. In both cases the tag buffer is destroyed and the
+// token is finished.
+static StateResult emit_current_tag(GumboParser* parser, GumboToken* output) {
+  GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
+  if (tag_state->_is_start_tag) {
+    output->type = GUMBO_TOKEN_START_TAG;
+    output->v.start_tag.tag = tag_state->_tag;
+    output->v.start_tag.attributes = tag_state->_attributes;
+    output->v.start_tag.is_self_closing = tag_state->_is_self_closing;
+    tag_state->_last_start_tag = tag_state->_tag;
+    mark_tag_state_as_empty(tag_state);
+    gumbo_debug(
+        "Emitted start tag %s.\n", gumbo_normalized_tagname(tag_state->_tag));
+  } else {
+    output->type = GUMBO_TOKEN_END_TAG;
+    output->v.end_tag = tag_state->_tag;
+    // In end tags, ownership of the attributes vector is not transferred to
+    // the token, but it's still initialized as normal, so it must be manually
+    // deallocated. There may also be attributes to destroy, in certain broken
+    // cases like </div</th> (the "th" is an attribute there).
+    for (unsigned int i = 0; i < tag_state->_attributes.length; ++i) {
+      gumbo_destroy_attribute(parser, tag_state->_attributes.data[i]);
+    }
+    gumbo_parser_deallocate(parser, tag_state->_attributes.data);
+    mark_tag_state_as_empty(tag_state);
+    gumbo_debug(
+        "Emitted end tag %s.\n", gumbo_normalized_tagname(tag_state->_tag));
+  }
+  gumbo_string_buffer_destroy(parser, &tag_state->_buffer);
+  finish_token(parser, output);
+  // The precision argument consumed by "%.*s" must have type int, so the
+  // length is cast explicitly instead of being passed with its own type.
+  gumbo_debug("Original text = %.*s.\n", (int) output->original_text.length,
+      output->original_text.data);
+  assert(output->original_text.length >= 2);
+  assert(output->original_text.data[0] == '<');
+  assert(output->original_text.data[output->original_text.length - 1] == '>');
+  return RETURN_SUCCESS;
+}
+
+// Discards a speculatively started tag. Some states begin building a tag
+// before it is known whether the input will end up as a tag token or as plain
+// character tokens; when it turns out to be the latter, the attributes, the
+// attribute vector storage and the tag buffer are released here so they do
+// not leak.
+static void abandon_current_tag(GumboParser* parser) {
+  GumboTagState* tag = &parser->_tokenizer_state->_tag_state;
+  GumboVector* attrs = &tag->_attributes;
+
+  unsigned int idx = 0;
+  while (idx < attrs->length) {
+    gumbo_destroy_attribute(parser, attrs->data[idx]);
+    ++idx;
+  }
+  gumbo_parser_deallocate(parser, attrs->data);
+  mark_tag_state_as_empty(tag);
+  gumbo_string_buffer_destroy(parser, &tag->_buffer);
+  gumbo_debug("Abandoning current tag.\n");
+}
+
+// Wraps the consume_char_ref function to handle its output and make the
+// appropriate TokenizerState modifications. Returns RETURN_ERROR if a parse
+// error occurred, RETURN_SUCCESS otherwise.
+//
+// additional_allowed_char and is_in_attribute are forwarded to
+// consume_char_ref unchanged. If a character reference was decoded, its first
+// codepoint is emitted and the second is stored in _buffered_emit_char;
+// otherwise a literal '&' is emitted.
+static StateResult emit_char_ref(GumboParser* parser,
+    int additional_allowed_char, bool is_in_attribute, GumboToken* output) {
+  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
+  OneOrTwoCodepoints char_ref;
+  // Forward is_in_attribute rather than a hard-coded false, so the parameter
+  // actually takes effect.
+  bool status = consume_char_ref(parser, &tokenizer->_input,
+      additional_allowed_char, is_in_attribute, &char_ref);
+  if (char_ref.first != kGumboNoChar) {
+    // consume_char_ref ends with the iterator pointing at the next character,
+    // so we need to be sure not to advance it again before reading the next
+    // token.
+    tokenizer->_reconsume_current_input = true;
+    emit_char(parser, char_ref.first, output);
+    tokenizer->_buffered_emit_char = char_ref.second;
+  } else {
+    emit_char(parser, '&', output);
+  }
+  return status ? RETURN_SUCCESS : RETURN_ERROR;
+}
+
+// Emits a comment token. Comment text is accumulated in the temporary buffer;
+// here it is converted to a string stored in the token's 'text' field and the
+// buffer is cleared. The result is always RETURN_SUCCESS.
+static StateResult emit_comment(GumboParser* parser, GumboToken* output) {
+  const char** text_slot = &output->v.text;
+
+  finish_temporary_buffer(parser, text_slot);
+  output->type = GUMBO_TOKEN_COMMENT;
+  finish_token(parser, output);
+  return RETURN_SUCCESS;
+}
+
+// Checks to see whether we should be flushing accumulated characters in the
+// temporary buffer, and fills the output token with the next output character
+// if so. Returns true if a character has been emitted and the tokenizer should
+// immediately return, false if we're at the end of the temporary buffer (or
+// no flush is in progress) and should resume normal operation.
+static bool maybe_emit_from_temporary_buffer(
+ GumboParser* parser, GumboToken* output) {
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
+ const char* c = tokenizer->_temporary_buffer_emit;
+ GumboStringBuffer* buffer = &tokenizer->_temporary_buffer;
+
+ // A NULL cursor means no flush is in progress; a cursor at or past the end
+ // of the buffer means the flush has just completed. Either way the cursor
+ // is cleared and nothing is emitted.
+ if (!c || c >= buffer->data + buffer->length) {
+ tokenizer->_temporary_buffer_emit = NULL;
+ return false;
+ }
+
+ assert(*c == utf8iterator_current(&tokenizer->_input));
+ // emit_char also advances the input stream. We need to do some juggling of
+ // the _reconsume_current_input flag to get the proper behavior when emitting
+ // previous tokens. Basically, _reconsume_current_input should *never* be set
+ // when emitting anything from the temporary buffer, since those characters
+ // have already been advanced past. However, it should be preserved so that
+ // when the *next* character is encountered again, the tokenizer knows not to
+ // advance past it.
+ bool saved_reconsume_state = tokenizer->_reconsume_current_input;
+ tokenizer->_reconsume_current_input = false;
+ emit_char(parser, *c, output);
+ ++tokenizer->_temporary_buffer_emit;
+ tokenizer->_reconsume_current_input = saved_reconsume_state;
+ return true;
+}
+
+// Starts flushing the temporary buffer as character tokens. The input
+// iterator is reset to its mark (the start of the last tag), the emit cursor
+// is placed at the first byte of the buffer, and the first character is
+// emitted if the buffer is non-empty. Returns true if a character was
+// emitted, false otherwise.
+static bool emit_temporary_buffer(GumboParser* parser, GumboToken* output) {
+  GumboTokenizerState* state = parser->_tokenizer_state;
+  const char* first = state->_temporary_buffer.data;
+
+  assert(first);
+  utf8iterator_reset(&state->_input);
+  state->_temporary_buffer_emit = first;
+  return maybe_emit_from_temporary_buffer(parser, output);
+}
+
+// Adds one codepoint to the end of the current tag buffer. When
+// reinitilize_position_on_first is true and the buffer is still empty, the
+// tag buffer start point is first moved to the current input position. Pass
+// false only when the original_text should include a character (such as an
+// opening quote) that is not part of the value itself.
+static void append_char_to_tag_buffer(
+    GumboParser* parser, int codepoint, bool reinitilize_position_on_first) {
+  GumboStringBuffer* tag_buf = &parser->_tokenizer_state->_tag_state._buffer;
+  const bool is_first_char = tag_buf->length == 0;
+
+  if (reinitilize_position_on_first && is_first_char) {
+    reset_tag_buffer_start_point(parser);
+  }
+  gumbo_string_buffer_append_codepoint(parser, codepoint, tag_buf);
+}
+
+// (Re-)initializes the tag buffer and moves its original_text pointer and
+// _start_pos to the current input position.
+static void initialize_tag_buffer(GumboParser* parser) {
+  GumboStringBuffer* tag_buf = &parser->_tokenizer_state->_tag_state._buffer;
+
+  gumbo_string_buffer_init(parser, tag_buf);
+  reset_tag_buffer_start_point(parser);
+}
+
+// Prepares the tag state for a new tag whose first letter is the current
+// input character: the tag buffer is initialized (recording the opening
+// position and original text), seeded with the lowercased letter, and the
+// attribute vector and per-tag flags are reset. `is_start_tag` says whether
+// a start tag (true) or an end tag (false) is being built.
+static void start_new_tag(GumboParser* parser, bool is_start_tag) {
+  GumboTokenizerState* state = parser->_tokenizer_state;
+  GumboTagState* tag = &state->_tag_state;
+
+  const int raw = utf8iterator_current(&state->_input);
+  assert(is_alpha(raw));
+  const int first = ensure_lowercase(raw);
+  assert(is_alpha(first));
+
+  initialize_tag_buffer(parser);
+  gumbo_string_buffer_append_codepoint(parser, first, &tag->_buffer);
+
+  // The previous tag's attributes must already have been handed off or freed.
+  assert(tag->_attributes.data == NULL);
+  // Initial size chosen by statistical analysis of a corpus of 60k webpages.
+  // 99.5% of elements have 0 attributes, 93% of the remainder have 1. These
+  // numbers are a bit higher for more modern websites (eg. ~45% = 0, ~40% = 1
+  // for the HTML5 Spec), but still have basically 99% of nodes with <= 2
+  // attrs.
+  gumbo_vector_init(parser, 1, &tag->_attributes);
+  tag->_is_start_tag = is_start_tag;
+  tag->_is_self_closing = false;
+  tag->_drop_next_attr_value = false;
+  gumbo_debug("Starting new tag.\n");
+}
+
+// Stores in *output the string produced from the tag buffer's contents by
+// gumbo_string_buffer_to_string.
+static void copy_over_tag_buffer(GumboParser* parser, const char** output) {
+  GumboStringBuffer* tag_buf = &parser->_tokenizer_state->_tag_state._buffer;
+  *output = gumbo_string_buffer_to_string(parser, tag_buf);
+}
+
+ // Fills in:
+ // * The original_text GumboStringPiece with the portion of the original
+ //   buffer that corresponds to the tag buffer (from the recorded
+ //   _original_text pointer up to the iterator's current char pointer).
+ // * The start_pos GumboSourcePosition with the start position of the tag
+ //   buffer.
+ // * The end_pos GumboSourcePosition with the current source position.
+ // A trailing '\r' is trimmed from original_text; an empty span is left as-is.
+ static void copy_over_original_tag_text(GumboParser* parser,
+     GumboStringPiece* original_text, GumboSourcePosition* start_pos,
+     GumboSourcePosition* end_pos) {
+   GumboTokenizerState* tokenizer = parser->_tokenizer_state;
+   GumboTagState* tag_state = &tokenizer->_tag_state;
+
+   original_text->data = tag_state->_original_text;
+   original_text->length = utf8iterator_get_char_pointer(&tokenizer->_input) -
+                           tag_state->_original_text;
+   // Guard against an empty span: without the length check, length - 1 would
+   // wrap around and index far outside the input buffer.
+   if (original_text->length > 0 &&
+       original_text->data[original_text->length - 1] == '\r') {
+     // Since \r is skipped by the UTF-8 iterator, it can sometimes end up
+     // appended to the end of original text even when it's really the first part
+     // of the next character. If we detect this situation, shrink the length of
+     // the original text by 1 to remove the carriage return.
+     --original_text->length;
+   }
+   *start_pos = tag_state->_start_pos;
+   utf8iterator_get_position(&tokenizer->_input, end_pos);
+ }
+
+ // Frees the tag buffer's current storage through the parser's deallocator
+ // and then sets the buffer up again via initialize_tag_buffer().
+ static void reinitialize_tag_buffer(GumboParser* parser) {
+   GumboTagState* tag = &parser->_tokenizer_state->_tag_state;
+   gumbo_parser_deallocate(parser, tag->_buffer.data);
+   initialize_tag_buffer(parser);
+ }
+
+ // Converts the tag name accumulated in the tag buffer into a tag enum value
+ // (via gumbo_tagn_enum), stores it in the tag state's _tag field, and then
+ // releases and re-initializes the tag buffer.
+ static void finish_tag_name(GumboParser* parser) {
+   GumboTagState* tag = &parser->_tokenizer_state->_tag_state;
+   const unsigned name_length = (unsigned) tag->_buffer.length;
+
+   tag->_tag = gumbo_tagn_enum(tag->_buffer.data, name_length);
+   reinitialize_tag_buffer(parser);
+ }
+
+// Adds an ERR_DUPLICATE_ATTR parse error to the parser's error struct.
+static void add_duplicate_attr_error(GumboParser* parser, const char* attr_name,
+ int original_index, int new_index) {
+ GumboError* error = gumbo_add_error(parser);
+ if (!error) {
+ return;
+ }
+ GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
+ error->type = GUMBO_ERR_DUPLICATE_ATTR;
+ error->position = tag_state->_start_pos;
+ error->original_text = tag_state->_original_text;
+ error->v.duplicate_attr.original_index = original_index;
+ error->v.duplicate_attr.new_index = new_index;
+ copy_over_tag_buffer(parser, &error->v.duplicate_attr.name);
+ reinitialize_tag_buffer(parser);
+}
+
+ // Creates a new attribute in the current tag, copying the current tag buffer to
+ // the attribute's name. The attribute's value starts out as the empty string
+ // (following the "Boolean attributes" section of the spec) and is only
+ // overwritten on finish_attribute_value(). Until then, original_value and the
+ // value_start/value_end positions mirror the name's span, so they always hold
+ // defined values even for attributes that never receive a value. If the
+ // attribute has already been specified, the new attribute is dropped, a parse
+ // error is added, and the function returns false. Otherwise, this returns
+ // true.
+ static bool finish_attribute_name(GumboParser* parser) {
+   GumboTokenizerState* tokenizer = parser->_tokenizer_state;
+   GumboTagState* tag_state = &tokenizer->_tag_state;
+   // May've been set by a previous attribute without a value; reset it here.
+   tag_state->_drop_next_attr_value = false;
+   assert(tag_state->_attributes.data);
+   assert(tag_state->_attributes.capacity);
+
+   GumboVector* /* GumboAttribute* */ attributes = &tag_state->_attributes;
+   for (unsigned int i = 0; i < attributes->length; ++i) {
+     GumboAttribute* attr = attributes->data[i];
+     // The tag buffer is not NUL-terminated, so compare length first and then
+     // the raw bytes.
+     if (strlen(attr->name) == tag_state->_buffer.length &&
+         memcmp(attr->name, tag_state->_buffer.data,
+             tag_state->_buffer.length) == 0) {
+       // Identical attribute; bail.
+       add_duplicate_attr_error(parser, attr->name, i, attributes->length);
+       tag_state->_drop_next_attr_value = true;
+       return false;
+     }
+   }
+
+   GumboAttribute* attr = gumbo_parser_allocate(parser, sizeof(GumboAttribute));
+   attr->attr_namespace = GUMBO_ATTR_NAMESPACE_NONE;
+   copy_over_tag_buffer(parser, &attr->name);
+   copy_over_original_tag_text(
+       parser, &attr->original_name, &attr->name_start, &attr->name_end);
+   attr->value = gumbo_copy_stringz(parser, "");
+   // Fill the value positions (not the name positions a second time) so that
+   // value_start/value_end are initialized for valueless attributes.
+   copy_over_original_tag_text(
+       parser, &attr->original_value, &attr->value_start, &attr->value_end);
+   gumbo_vector_add(parser, attr, attributes);
+   reinitialize_tag_buffer(parser);
+   return true;
+ }
+
+ // Completes an attribute value. Unless the value has been marked to be
+ // dropped (its attribute name was a duplicate), the most recently added
+ // attribute's previous value is freed and replaced with the tag buffer
+ // contents, and its original_value / value_start / value_end are filled in.
+ // In both cases the tag buffer is released and re-initialized afterwards.
+ static void finish_attribute_value(GumboParser* parser) {
+   GumboTagState* tag = &parser->_tokenizer_state->_tag_state;
+   if (!tag->_drop_next_attr_value) {
+     GumboAttribute* last = tag->_attributes.data[tag->_attributes.length - 1];
+     gumbo_parser_deallocate(parser, (void*) last->value);
+     copy_over_tag_buffer(parser, &last->value);
+     copy_over_original_tag_text(
+         parser, &last->original_value, &last->value_start, &last->value_end);
+   } else {
+     // Duplicate attribute name detected in an earlier state, so the value is
+     // ignored; just clear the flag.
+     tag->_drop_next_attr_value = false;
+   }
+   reinitialize_tag_buffer(parser);
+ }
+
+ // Reports whether the end tag name currently held in the tag buffer maps to
+ // the same tag enum as the last start tag. Always false while no start tag
+ // has been recorded (_last_start_tag == GUMBO_TAG_LAST). Must only be called
+ // while tokenizing an end tag.
+ static bool is_appropriate_end_tag(GumboParser* parser) {
+   GumboTagState* tag = &parser->_tokenizer_state->_tag_state;
+   assert(!tag->_is_start_tag);
+   if (tag->_last_start_tag == GUMBO_TAG_LAST) {
+     return false;
+   }
+   return tag->_last_start_tag ==
+          gumbo_tagn_enum(tag->_buffer.data, (unsigned) tag->_buffer.length);
+ }
+
+ // Allocates the tokenizer state, attaches it to the parser and puts it in its
+ // starting configuration: lexer state GUMBO_LEX_DATA, all flags cleared, empty
+ // temporary / script-data buffers, an empty tag state, and an input iterator
+ // over text[0..text_length). The doctype state is initialized last.
+ void gumbo_tokenizer_state_init(
+     GumboParser* parser, const char* text, size_t text_length) {
+   GumboTokenizerState* state = gumbo_parser_allocate(parser, sizeof(*state));
+   parser->_tokenizer_state = state;
+   gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+
+   // Scalar fields.
+   state->_is_in_cdata = false;
+   state->_is_current_node_foreign = false;
+   state->_reconsume_current_input = false;
+   state->_buffered_emit_char = kGumboNoChar;
+   state->_temporary_buffer_emit = NULL;
+   state->_token_start = text;
+   state->_tag_state._last_start_tag = GUMBO_TAG_LAST;
+
+   // Buffers and tag state.
+   gumbo_string_buffer_init(parser, &state->_temporary_buffer);
+   mark_tag_state_as_empty(&state->_tag_state);
+   gumbo_string_buffer_init(parser, &state->_script_data_buffer);
+
+   // Input iterator; the first token starts at the iterator's initial position.
+   utf8iterator_init(parser, text, text_length, &state->_input);
+   utf8iterator_get_position(&state->_input, &state->_token_start_pos);
+   doc_type_state_init(parser);
+ }
+
+ // Tears down the tokenizer state: destroys the temporary and script-data
+ // buffers and frees the state structure itself. The doctype strings are
+ // expected to be NULL by the time this is called.
+ void gumbo_tokenizer_state_destroy(GumboParser* parser) {
+   GumboTokenizerState* state = parser->_tokenizer_state;
+
+   assert(state->_doc_type_state.name == NULL);
+   assert(state->_doc_type_state.public_identifier == NULL);
+   assert(state->_doc_type_state.system_identifier == NULL);
+
+   gumbo_string_buffer_destroy(parser, &state->_temporary_buffer);
+   gumbo_string_buffer_destroy(parser, &state->_script_data_buffer);
+   gumbo_parser_deallocate(parser, state);
+ }
+
+ // Sets the tokenizer's current lexer state to the given value.
+ void gumbo_tokenizer_set_state(GumboParser* parser, GumboTokenizerEnum state) {
+   GumboTokenizerState* tokenizer = parser->_tokenizer_state;
+   tokenizer->_state = state;
+ }
+
+ // Stores the is_foreign flag in the tokenizer state, emitting a debug message
+ // when the stored value actually changes.
+ void gumbo_tokenizer_set_is_current_node_foreign(
+     GumboParser* parser, bool is_foreign) {
+   GumboTokenizerState* tokenizer = parser->_tokenizer_state;
+   const bool changed = tokenizer->_is_current_node_foreign != is_foreign;
+   if (changed) {
+     gumbo_debug("Toggling is_current_node_foreign to %s.\n",
+         is_foreign ? "true" : "false");
+   }
+   tokenizer->_is_current_node_foreign = is_foreign;
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#data-state
+ // Data state: '&' hands off to the char-ref state, '<' starts a tag, NUL is
+ // reported as a parse error but still emitted, anything else is emitted as-is.
+ static StateResult handle_data_state(GumboParser* parser,
+     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
+   if (c == '&') {
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_DATA);
+     // The char_ref machinery expects to be on the & so it can mark that
+     // and return to it if the text isn't a char ref, so we need to
+     // reconsume it.
+     tokenizer->_reconsume_current_input = true;
+     return NEXT_CHAR;
+   }
+   if (c == '<') {
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_TAG_OPEN);
+     clear_temporary_buffer(parser);
+     append_char_to_temporary_buffer(parser, '<');
+     return NEXT_CHAR;
+   }
+   if (c == '\0') {
+     tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
+     emit_char(parser, c, output);
+     return RETURN_ERROR;
+   }
+   return emit_current_char(parser, output);
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#character-reference-in-data-state
+ // Switches back to the data state and emits whatever emit_char_ref() produces
+ // for the current input.
+ static StateResult handle_char_ref_in_data_state(GumboParser* parser,
+     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
+   gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+   StateResult result = emit_char_ref(parser, ' ', false, output);
+   return result;
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rcdata-state
+ // RCDATA state: '&' hands off to the RCDATA char-ref state (reconsuming the
+ // '&'), '<' begins a possible end tag, NUL becomes a replacement character,
+ // EOF emits the EOF token, and anything else is emitted unchanged.
+ static StateResult handle_rcdata_state(GumboParser* parser,
+     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
+   if (c == '&') {
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_RCDATA);
+     tokenizer->_reconsume_current_input = true;
+     return NEXT_CHAR;
+   }
+   if (c == '<') {
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_LT);
+     clear_temporary_buffer(parser);
+     append_char_to_temporary_buffer(parser, '<');
+     return NEXT_CHAR;
+   }
+   if (c == '\0') {
+     return emit_replacement_char(parser, output);
+   }
+   if (c == -1) {
+     return emit_eof(parser, output);
+   }
+   return emit_current_char(parser, output);
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#character-reference-in-rcdata-state
+ // Switches back to the RCDATA state and emits whatever emit_char_ref()
+ // produces for the current input.
+ static StateResult handle_char_ref_in_rcdata_state(GumboParser* parser,
+     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
+   gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
+   StateResult result = emit_char_ref(parser, ' ', false, output);
+   return result;
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-state
+ // RAWTEXT state: '<' begins a possible end tag, NUL becomes a replacement
+ // character, EOF emits the EOF token, anything else is emitted unchanged.
+ static StateResult handle_rawtext_state(GumboParser* parser,
+     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
+   if (c == '<') {
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_LT);
+     clear_temporary_buffer(parser);
+     append_char_to_temporary_buffer(parser, '<');
+     return NEXT_CHAR;
+   }
+   if (c == '\0') {
+     return emit_replacement_char(parser, output);
+   }
+   if (c == -1) {
+     return emit_eof(parser, output);
+   }
+   return emit_current_char(parser, output);
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-state
+ // Script data state: '<' begins a possible end tag or escape sequence, NUL
+ // becomes a replacement character, EOF emits the EOF token, anything else is
+ // emitted unchanged.
+ static StateResult handle_script_state(GumboParser* parser,
+     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
+   if (c == '<') {
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_LT);
+     clear_temporary_buffer(parser);
+     append_char_to_temporary_buffer(parser, '<');
+     return NEXT_CHAR;
+   }
+   if (c == '\0') {
+     return emit_replacement_char(parser, output);
+   }
+   if (c == -1) {
+     return emit_eof(parser, output);
+   }
+   return emit_current_char(parser, output);
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#plaintext-state
+ // PLAINTEXT state: NUL becomes a replacement character, EOF emits the EOF
+ // token, and every other character is emitted unchanged.
+ static StateResult handle_plaintext_state(GumboParser* parser,
+     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
+   if (c == '\0') {
+     return emit_replacement_char(parser, output);
+   }
+   if (c == -1) {
+     return emit_eof(parser, output);
+   }
+   return emit_current_char(parser, output);
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#tag-open-state
+ // Tag open state, entered with "<" in the temporary buffer. '!' starts a
+ // markup declaration, '/' an end tag, '?' a bogus comment (parse error), a
+ // letter starts a start tag. Anything else is a parse error: the tokenizer
+ // returns to the data state and the buffered "<" is emitted.
+ static StateResult handle_tag_open_state(GumboParser* parser,
+     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
+   assert(temporary_buffer_equals(parser, "<"));
+   if (c == '!') {
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_MARKUP_DECLARATION);
+     clear_temporary_buffer(parser);
+     return NEXT_CHAR;
+   }
+   if (c == '/') {
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_END_TAG_OPEN);
+     append_char_to_temporary_buffer(parser, '/');
+     return NEXT_CHAR;
+   }
+   if (c == '?') {
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_COMMENT);
+     clear_temporary_buffer(parser);
+     append_char_to_temporary_buffer(parser, '?');
+     tokenizer_add_parse_error(parser, GUMBO_ERR_TAG_STARTS_WITH_QUESTION);
+     return NEXT_CHAR;
+   }
+   if (is_alpha(c)) {
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_TAG_NAME);
+     start_new_tag(parser, true);
+     return NEXT_CHAR;
+   }
+   tokenizer_add_parse_error(parser, GUMBO_ERR_TAG_INVALID);
+   gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+   emit_temporary_buffer(parser, output);
+   return RETURN_ERROR;
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#end-tag-open-state
+ // End tag open state, entered with "</" in the temporary buffer. '>' is a
+ // parse error that returns to the data state; EOF is a parse error that
+ // returns to the data state and emits the buffered text; a letter starts an
+ // end tag; anything else is a parse error that begins a bogus comment.
+ static StateResult handle_end_tag_open_state(GumboParser* parser,
+     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
+   assert(temporary_buffer_equals(parser, "</"));
+   if (c == '>') {
+     tokenizer_add_parse_error(parser, GUMBO_ERR_CLOSE_TAG_EMPTY);
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+     return NEXT_CHAR;
+   }
+   if (c == -1) {
+     tokenizer_add_parse_error(parser, GUMBO_ERR_CLOSE_TAG_EOF);
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+     return emit_temporary_buffer(parser, output);
+   }
+   if (is_alpha(c)) {
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_TAG_NAME);
+     start_new_tag(parser, false);
+     return NEXT_CHAR;
+   }
+   tokenizer_add_parse_error(parser, GUMBO_ERR_CLOSE_TAG_INVALID);
+   gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_COMMENT);
+   clear_temporary_buffer(parser);
+   append_char_to_temporary_buffer(parser, c);
+   return NEXT_CHAR;
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#tag-name-state
+ // Tag name state. Whitespace, '/' and '>' finish the tag name and move on to
+ // attributes, self-closing handling, or tag emission respectively. NUL is a
+ // parse error and is stored as the replacement character; EOF is a parse
+ // error that abandons the tag. Any other character is lowercased and appended
+ // to the tag buffer.
+ static StateResult handle_tag_name_state(GumboParser* parser,
+     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
+   if (c == '\t' || c == '\n' || c == '\f' || c == ' ') {
+     finish_tag_name(parser);
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
+     return NEXT_CHAR;
+   }
+   if (c == '/') {
+     finish_tag_name(parser);
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
+     return NEXT_CHAR;
+   }
+   if (c == '>') {
+     finish_tag_name(parser);
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+     return emit_current_tag(parser, output);
+   }
+   if (c == '\0') {
+     tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
+     append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
+     return NEXT_CHAR;
+   }
+   if (c == -1) {
+     tokenizer_add_parse_error(parser, GUMBO_ERR_TAG_EOF);
+     abandon_current_tag(parser);
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+     return NEXT_CHAR;
+   }
+   append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
+   return NEXT_CHAR;
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#rcdata-less-than-sign-state
+ // RCDATA less-than-sign state, entered with "<" in the temporary buffer. A
+ // '/' continues towards an end tag; any other character returns to RCDATA,
+ // is marked for reconsumption, and the buffered "<" is emitted.
+ static StateResult handle_rcdata_lt_state(GumboParser* parser,
+     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
+   assert(temporary_buffer_equals(parser, "<"));
+   if (c != '/') {
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
+     tokenizer->_reconsume_current_input = true;
+     return emit_temporary_buffer(parser, output);
+   }
+   gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_END_TAG_OPEN);
+   append_char_to_temporary_buffer(parser, '/');
+   return NEXT_CHAR;
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#rcdata-end-tag-open-state
+ // RCDATA end tag open state, entered with "</" in the temporary buffer. A
+ // letter starts a candidate end tag (the raw character is also kept in the
+ // temporary buffer so the text can be emitted verbatim if the tag turns out
+ // not to match). Anything else returns to RCDATA and emits the buffered text.
+ static StateResult handle_rcdata_end_tag_open_state(GumboParser* parser,
+     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
+   assert(temporary_buffer_equals(parser, "</"));
+   if (is_alpha(c)) {
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_END_TAG_NAME);
+     start_new_tag(parser, false);
+     append_char_to_temporary_buffer(parser, c);
+     return NEXT_CHAR;
+   } else {
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
+     return emit_temporary_buffer(parser, output);
+   }
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#rcdata-end-tag-name-state
+ // RCDATA end tag name state. Letters extend both the (lowercased) tag buffer
+ // and the raw temporary buffer. If the name matches the last start tag, then
+ // whitespace, '/' or '>' finish it like a normal tag. In every other case the
+ // candidate tag is abandoned, the tokenizer returns to RCDATA and the
+ // buffered text is emitted.
+ static StateResult handle_rcdata_end_tag_name_state(GumboParser* parser,
+     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
+   assert(tokenizer->_temporary_buffer.length >= 2);
+   if (is_alpha(c)) {
+     append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
+     append_char_to_temporary_buffer(parser, c);
+     return NEXT_CHAR;
+   }
+   if (is_appropriate_end_tag(parser)) {
+     if (c == '\t' || c == '\n' || c == '\f' || c == ' ') {
+       finish_tag_name(parser);
+       gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
+       return NEXT_CHAR;
+     }
+     if (c == '/') {
+       finish_tag_name(parser);
+       gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
+       return NEXT_CHAR;
+     }
+     if (c == '>') {
+       finish_tag_name(parser);
+       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+       return emit_current_tag(parser, output);
+     }
+   }
+   gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
+   abandon_current_tag(parser);
+   return emit_temporary_buffer(parser, output);
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-less-than-sign-state
+ // RAWTEXT less-than-sign state, entered with "<" in the temporary buffer. A
+ // '/' continues towards an end tag; any other character returns to RAWTEXT,
+ // is marked for reconsumption, and the buffered "<" is emitted.
+ static StateResult handle_rawtext_lt_state(GumboParser* parser,
+     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
+   assert(temporary_buffer_equals(parser, "<"));
+   if (c != '/') {
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
+     tokenizer->_reconsume_current_input = true;
+     return emit_temporary_buffer(parser, output);
+   }
+   gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_END_TAG_OPEN);
+   append_char_to_temporary_buffer(parser, '/');
+   return NEXT_CHAR;
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-end-tag-open-state
+ // RAWTEXT end tag open state, entered with "</" in the temporary buffer. A
+ // letter starts a candidate end tag and is also kept in the temporary buffer;
+ // anything else returns to RAWTEXT and emits the buffered text.
+ static StateResult handle_rawtext_end_tag_open_state(GumboParser* parser,
+     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
+   assert(temporary_buffer_equals(parser, "</"));
+   if (!is_alpha(c)) {
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
+     return emit_temporary_buffer(parser, output);
+   }
+   gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_END_TAG_NAME);
+   start_new_tag(parser, false);
+   append_char_to_temporary_buffer(parser, c);
+   return NEXT_CHAR;
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-end-tag-name-state
+ // RAWTEXT end tag name state. Letters extend both the (lowercased) tag buffer
+ // and the raw temporary buffer. If the name matches the last start tag, then
+ // whitespace, '/' or '>' finish it like a normal tag. In every other case the
+ // candidate tag is abandoned, the tokenizer returns to RAWTEXT and the
+ // buffered text is emitted.
+ static StateResult handle_rawtext_end_tag_name_state(GumboParser* parser,
+     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
+   assert(tokenizer->_temporary_buffer.length >= 2);
+   // Use a precision ("%.*s"), not a field width, to bound the read to the
+   // buffer's length rather than relying on a terminating NUL.
+   gumbo_debug("Last end tag: %.*s\n", (int) tokenizer->_tag_state._buffer.length,
+       tokenizer->_tag_state._buffer.data);
+   if (is_alpha(c)) {
+     append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
+     append_char_to_temporary_buffer(parser, c);
+     return NEXT_CHAR;
+   } else if (is_appropriate_end_tag(parser)) {
+     gumbo_debug("Is an appropriate end tag.\n");
+     switch (c) {
+       case '\t':
+       case '\n':
+       case '\f':
+       case ' ':
+         finish_tag_name(parser);
+         gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
+         return NEXT_CHAR;
+       case '/':
+         finish_tag_name(parser);
+         gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
+         return NEXT_CHAR;
+       case '>':
+         finish_tag_name(parser);
+         gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+         return emit_current_tag(parser, output);
+     }
+   }
+   // Not a matching end tag (or not terminated by whitespace, '/' or '>'):
+   // treat everything buffered so far as plain RAWTEXT.
+   gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
+   abandon_current_tag(parser);
+   return emit_temporary_buffer(parser, output);
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-less-than-sign-state
+ // Script data less-than-sign state, entered with "<" in the temporary buffer.
+ // '/' continues towards an end tag; '!' starts the escape sequence and emits
+ // the buffered "<!"; anything else returns to script data, is marked for
+ // reconsumption, and the buffered "<" is emitted.
+ static StateResult handle_script_lt_state(GumboParser* parser,
+     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
+   assert(temporary_buffer_equals(parser, "<"));
+   switch (c) {
+     case '/':
+       gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_END_TAG_OPEN);
+       append_char_to_temporary_buffer(parser, '/');
+       return NEXT_CHAR;
+     case '!':
+       gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_START);
+       append_char_to_temporary_buffer(parser, '!');
+       return emit_temporary_buffer(parser, output);
+     default:
+       gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
+       tokenizer->_reconsume_current_input = true;
+       return emit_temporary_buffer(parser, output);
+   }
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-end-tag-open-state
+ // Script data end tag open state, entered with "</" in the temporary buffer.
+ // A letter starts a candidate end tag and is also kept in the temporary
+ // buffer; anything else returns to script data and emits the buffered text.
+ static StateResult handle_script_end_tag_open_state(GumboParser* parser,
+     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
+   assert(temporary_buffer_equals(parser, "</"));
+   if (!is_alpha(c)) {
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
+     return emit_temporary_buffer(parser, output);
+   }
+   gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_END_TAG_NAME);
+   start_new_tag(parser, false);
+   append_char_to_temporary_buffer(parser, c);
+   return NEXT_CHAR;
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-end-tag-name-state
+ // Script data end tag name state. Letters extend both the (lowercased) tag
+ // buffer and the raw temporary buffer. If the name matches the last start
+ // tag, then whitespace, '/' or '>' finish it like a normal tag. In every
+ // other case the candidate tag is abandoned, the tokenizer returns to script
+ // data and the buffered text is emitted.
+ static StateResult handle_script_end_tag_name_state(GumboParser* parser,
+     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
+   assert(tokenizer->_temporary_buffer.length >= 2);
+   if (is_alpha(c)) {
+     append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
+     append_char_to_temporary_buffer(parser, c);
+     return NEXT_CHAR;
+   }
+   if (is_appropriate_end_tag(parser)) {
+     if (c == '\t' || c == '\n' || c == '\f' || c == ' ') {
+       finish_tag_name(parser);
+       gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
+       return NEXT_CHAR;
+     }
+     if (c == '/') {
+       finish_tag_name(parser);
+       gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
+       return NEXT_CHAR;
+     }
+     if (c == '>') {
+       finish_tag_name(parser);
+       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+       return emit_current_tag(parser, output);
+     }
+   }
+   gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
+   abandon_current_tag(parser);
+   return emit_temporary_buffer(parser, output);
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escape-start-state
+ // Script data escape start state. A '-' advances to the escape-start-dash
+ // state and is emitted; any other character returns to script data and is
+ // marked for reconsumption.
+ static StateResult handle_script_escaped_start_state(GumboParser* parser,
+     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
+   if (c != '-') {
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
+     tokenizer->_reconsume_current_input = true;
+     return NEXT_CHAR;
+   }
+   gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_START_DASH);
+   return emit_current_char(parser, output);
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escape-start-dash-state
+ // Script data escape start dash state. A second '-' enters the escaped
+ // dash-dash state and is emitted; any other character returns to script data
+ // and is marked for reconsumption.
+ static StateResult handle_script_escaped_start_dash_state(GumboParser* parser,
+     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
+   if (c != '-') {
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
+     tokenizer->_reconsume_current_input = true;
+     return NEXT_CHAR;
+   }
+   gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH);
+   return emit_current_char(parser, output);
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-state
+ // Script data escaped state. '-' moves to the escaped-dash state and is
+ // emitted; '<' is buffered and moves to the escaped less-than state; NUL
+ // becomes a replacement character; EOF is a parse error followed by the EOF
+ // token; anything else is emitted unchanged.
+ static StateResult handle_script_escaped_state(GumboParser* parser,
+     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
+   if (c == '-') {
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH);
+     return emit_current_char(parser, output);
+   }
+   if (c == '<') {
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_LT);
+     clear_temporary_buffer(parser);
+     append_char_to_temporary_buffer(parser, c);
+     return NEXT_CHAR;
+   }
+   if (c == '\0') {
+     return emit_replacement_char(parser, output);
+   }
+   if (c == -1) {
+     tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
+     return emit_eof(parser, output);
+   }
+   return emit_current_char(parser, output);
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-dash-state
+ // Script data escaped dash state. '-' moves to the dash-dash state and is
+ // emitted; '<' is buffered and moves to the escaped less-than state; EOF is a
+ // parse error that returns to the data state. NUL (as a replacement
+ // character) and every other character are emitted after returning to the
+ // escaped state.
+ static StateResult handle_script_escaped_dash_state(GumboParser* parser,
+     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
+   if (c == '-') {
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH);
+     return emit_current_char(parser, output);
+   }
+   if (c == '<') {
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_LT);
+     clear_temporary_buffer(parser);
+     append_char_to_temporary_buffer(parser, c);
+     return NEXT_CHAR;
+   }
+   if (c == -1) {
+     tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+     return NEXT_CHAR;
+   }
+   gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
+   if (c == '\0') {
+     return emit_replacement_char(parser, output);
+   }
+   return emit_current_char(parser, output);
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-dash-dash-state
+ // Script data escaped dash-dash state. Further '-' characters are emitted
+ // without a state change; '<' is buffered and moves to the escaped less-than
+ // state; '>' returns to script data and is emitted; EOF is a parse error that
+ // returns to the data state. NUL (as a replacement character) and every other
+ // character are emitted after returning to the escaped state.
+ static StateResult handle_script_escaped_dash_dash_state(GumboParser* parser,
+     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
+   if (c == '-') {
+     return emit_current_char(parser, output);
+   }
+   if (c == '<') {
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_LT);
+     clear_temporary_buffer(parser);
+     append_char_to_temporary_buffer(parser, c);
+     return NEXT_CHAR;
+   }
+   if (c == '>') {
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
+     return emit_current_char(parser, output);
+   }
+   if (c == -1) {
+     tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+     return NEXT_CHAR;
+   }
+   gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
+   if (c == '\0') {
+     return emit_replacement_char(parser, output);
+   }
+   return emit_current_char(parser, output);
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-less-than-sign-state
+ // Script data escaped less-than-sign state, entered with "<" in the temporary
+ // buffer and an empty script data buffer. '/' continues towards an end tag.
+ // A letter begins a possible double escape: it is added to the temporary
+ // buffer, its lowercase form to the script data buffer, and the buffered text
+ // is emitted. Anything else returns to the escaped state and emits the
+ // buffered text.
+ static StateResult handle_script_escaped_lt_state(GumboParser* parser,
+     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
+   assert(temporary_buffer_equals(parser, "<"));
+   assert(!tokenizer->_script_data_buffer.length);
+   if (c == '/') {
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_OPEN);
+     append_char_to_temporary_buffer(parser, c);
+     return NEXT_CHAR;
+   }
+   if (!is_alpha(c)) {
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
+     return emit_temporary_buffer(parser, output);
+   }
+   gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_START);
+   append_char_to_temporary_buffer(parser, c);
+   gumbo_string_buffer_append_codepoint(
+       parser, ensure_lowercase(c), &tokenizer->_script_data_buffer);
+   return emit_temporary_buffer(parser, output);
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-end-tag-open-state
+ // Script data escaped end tag open state, entered with "</" in the temporary
+ // buffer. A letter starts a candidate end tag and is also kept in the
+ // temporary buffer; anything else returns to the escaped state and emits the
+ // buffered text.
+ static StateResult handle_script_escaped_end_tag_open_state(GumboParser* parser,
+     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
+   assert(temporary_buffer_equals(parser, "</"));
+   if (!is_alpha(c)) {
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
+     return emit_temporary_buffer(parser, output);
+   }
+   gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_NAME);
+   start_new_tag(parser, false);
+   append_char_to_temporary_buffer(parser, c);
+   return NEXT_CHAR;
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-end-tag-name-state
+ // Script data escaped end tag name state. Letters extend both the
+ // (lowercased) tag buffer and the raw temporary buffer. If the name matches
+ // the last start tag, then whitespace, '/' or '>' finish it like a normal
+ // tag. In every other case the candidate tag is abandoned, the tokenizer
+ // returns to the escaped state and the buffered text is emitted.
+ static StateResult handle_script_escaped_end_tag_name_state(GumboParser* parser,
+     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
+   assert(tokenizer->_temporary_buffer.length >= 2);
+   if (is_alpha(c)) {
+     append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
+     append_char_to_temporary_buffer(parser, c);
+     return NEXT_CHAR;
+   }
+   if (is_appropriate_end_tag(parser)) {
+     if (c == '\t' || c == '\n' || c == '\f' || c == ' ') {
+       finish_tag_name(parser);
+       gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
+       return NEXT_CHAR;
+     }
+     if (c == '/') {
+       finish_tag_name(parser);
+       gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
+       return NEXT_CHAR;
+     }
+     if (c == '>') {
+       finish_tag_name(parser);
+       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+       return emit_current_tag(parser, output);
+     }
+   }
+   gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
+   abandon_current_tag(parser);
+   return emit_temporary_buffer(parser, output);
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escape-start-state
+ // Script data double escape start state. Whitespace, '/' or '>' end the name
+ // collected in the script data buffer: if it equals kScriptTag the tokenizer
+ // enters the double-escaped state, otherwise the escaped state; the character
+ // is emitted either way. Letters are lowercased into the script data buffer
+ // and emitted. Anything else returns to the escaped state and is marked for
+ // reconsumption.
+ static StateResult handle_script_double_escaped_start_state(GumboParser* parser,
+     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
+   if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '/' ||
+       c == '>') {
+     if (gumbo_string_equals(&kScriptTag,
+             (GumboStringPiece*) &tokenizer->_script_data_buffer)) {
+       gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
+     } else {
+       gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
+     }
+     return emit_current_char(parser, output);
+   }
+   if (is_alpha(c)) {
+     gumbo_string_buffer_append_codepoint(
+         parser, ensure_lowercase(c), &tokenizer->_script_data_buffer);
+     return emit_current_char(parser, output);
+   }
+   gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
+   tokenizer->_reconsume_current_input = true;
+   return NEXT_CHAR;
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-state
+ // Script data double escaped state. '-' and '<' are emitted after moving to
+ // the double-escaped dash / less-than states; NUL becomes a replacement
+ // character; EOF is a parse error that returns to the data state; anything
+ // else is emitted unchanged.
+ static StateResult handle_script_double_escaped_state(GumboParser* parser,
+     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
+   if (c == '-') {
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH);
+     return emit_current_char(parser, output);
+   }
+   if (c == '<') {
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT);
+     return emit_current_char(parser, output);
+   }
+   if (c == '\0') {
+     return emit_replacement_char(parser, output);
+   }
+   if (c == -1) {
+     tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+     return NEXT_CHAR;
+   }
+   return emit_current_char(parser, output);
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-dash-state
+ // Script data double escaped dash state. '-' moves to the dash-dash state and
+ // '<' to the less-than state, both emitting the character; EOF is a parse
+ // error that returns to the data state. NUL (as a replacement character) and
+ // every other character are emitted after returning to the double-escaped
+ // state.
+ static StateResult handle_script_double_escaped_dash_state(GumboParser* parser,
+     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
+   if (c == '-') {
+     gumbo_tokenizer_set_state(
+         parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH_DASH);
+     return emit_current_char(parser, output);
+   }
+   if (c == '<') {
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT);
+     return emit_current_char(parser, output);
+   }
+   if (c == -1) {
+     tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+     return NEXT_CHAR;
+   }
+   gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
+   if (c == '\0') {
+     return emit_replacement_char(parser, output);
+   }
+   return emit_current_char(parser, output);
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-dash-dash-state
+ // Script data double escaped dash-dash state. Further '-' characters are
+ // emitted without a state change; '<' moves to the less-than state and '>'
+ // back to script data, both emitting the character; EOF is a parse error that
+ // returns to the data state. NUL (as a replacement character) and every other
+ // character are emitted after returning to the double-escaped state.
+ static StateResult handle_script_double_escaped_dash_dash_state(
+     GumboParser* parser, GumboTokenizerState* tokenizer, int c,
+     GumboToken* output) {
+   if (c == '-') {
+     return emit_current_char(parser, output);
+   }
+   if (c == '<') {
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT);
+     return emit_current_char(parser, output);
+   }
+   if (c == '>') {
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
+     return emit_current_char(parser, output);
+   }
+   if (c == -1) {
+     tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+     return NEXT_CHAR;
+   }
+   gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
+   if (c == '\0') {
+     return emit_replacement_char(parser, output);
+   }
+   return emit_current_char(parser, output);
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-less-than-sign-state
+ // Handler for the script-data-double-escaped-less-than-sign state.
+ // A '/' clears the script data buffer, moves to the double-escape-end state
+ // and is emitted. Any other character is reconsumed in the double-escaped
+ // state.
+ static StateResult handle_script_double_escaped_lt_state(GumboParser* parser,
+     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
+   if (c != '/') {
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
+     tokenizer->_reconsume_current_input = true;
+     return NEXT_CHAR;
+   }
+
+   gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_END);
+   gumbo_string_buffer_clear(parser, &tokenizer->_script_data_buffer);
+   return emit_current_char(parser, output);
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escape-end-state
+ // Handler for the script-data-double-escape-end state.
+ // Letters are lowercased into the script data buffer and emitted. A
+ // whitespace character, '/' or '>' compares the buffer against kScriptTag:
+ // on a match the tokenizer goes to the script-escaped state, otherwise it
+ // stays double-escaped; the character is emitted either way. Any other input
+ // is reconsumed in the double-escaped state.
+ static StateResult handle_script_double_escaped_end_state(GumboParser* parser,
+     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
+   const bool ends_name = c == '\t' || c == '\n' || c == '\f' || c == ' ' ||
+                          c == '/' || c == '>';
+   if (ends_name) {
+     if (gumbo_string_equals(&kScriptTag,
+             (GumboStringPiece*) &tokenizer->_script_data_buffer)) {
+       gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
+     } else {
+       gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
+     }
+     return emit_current_char(parser, output);
+   }
+
+   if (!is_alpha(c)) {
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
+     tokenizer->_reconsume_current_input = true;
+     return NEXT_CHAR;
+   }
+
+   gumbo_string_buffer_append_codepoint(
+       parser, ensure_lowercase(c), &tokenizer->_script_data_buffer);
+   return emit_current_char(parser, output);
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#before-attribute-name-state
+ // Handler for the before-attribute-name state.
+ //   c      - current input code point; -1 is treated as end of input.
+ //   output - token filled in by emit_current_tag() when the tag ends.
+ // Whitespace is skipped, '/' moves to the self-closing state, '>' emits the
+ // current tag, and EOF abandons it. Every other character becomes the first
+ // character of a new attribute name, which is accumulated in the tag buffer.
+ static StateResult handle_before_attr_name_state(GumboParser* parser,
+     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
+   switch (c) {
+     case '\t':
+     case '\n':
+     case '\f':
+     case ' ':
+       return NEXT_CHAR;
+     case '/':
+       gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
+       return NEXT_CHAR;
+     case '>':
+       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+       return emit_current_tag(parser, output);
+     case '\0':
+       tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
+       gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
+       // The attribute name lives in the tag buffer (see the default case),
+       // so the replacement character has to be appended there too; putting
+       // it in the temporary buffer would drop it from the name.
+       append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
+       return NEXT_CHAR;
+     case -1:
+       tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_EOF);
+       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+       abandon_current_tag(parser);
+       return NEXT_CHAR;
+     case '"':
+     case '\'':
+     case '<':
+     case '=':
+       // Invalid first character: report it, then treat it as part of the
+       // name like any other character.
+       tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_INVALID);
+       // Fall through.
+     default:
+       gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
+       append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
+       return NEXT_CHAR;
+   }
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-name-state
+ // Handler for the attribute-name state.
+ //   c      - current input code point; -1 is treated as end of input.
+ //   output - token filled in by emit_current_tag() when '>' ends the tag.
+ // Whitespace, '/', '=' and '>' finish the attribute name and move on; EOF
+ // abandons the tag. Any other character is lowercased and appended to the
+ // tag buffer (NUL as the replacement character), with a parse error for
+ // NUL, '"', '\'' and '<'.
+ static StateResult handle_attr_name_state(GumboParser* parser,
+     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
+   const bool is_space = c == '\t' || c == '\n' || c == '\f' || c == ' ';
+
+   if (is_space || c == '/' || c == '=') {
+     finish_attribute_name(parser);
+     if (is_space) {
+       gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_NAME);
+     } else if (c == '/') {
+       gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
+     } else {
+       gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_VALUE);
+     }
+     return NEXT_CHAR;
+   }
+   if (c == '>') {
+     finish_attribute_name(parser);
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+     return emit_current_tag(parser, output);
+   }
+   if (c == '\0') {
+     tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
+     append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
+     return NEXT_CHAR;
+   }
+   if (c == -1) {
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+     abandon_current_tag(parser);
+     tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_EOF);
+     return NEXT_CHAR;
+   }
+
+   // Quotes and '<' are reported but still become part of the name.
+   if (c == '"' || c == '\'' || c == '<') {
+     tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_INVALID);
+   }
+   append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
+   return NEXT_CHAR;
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#after-attribute-name-state
+ // Handler for the after-attribute-name state.
+ //   c      - current input code point; -1 is treated as end of input.
+ //   output - token filled in by emit_current_tag() when the tag ends.
+ // Whitespace is skipped, '/' moves to the self-closing state, '=' moves to
+ // the before-attribute-value state, '>' emits the current tag, and EOF
+ // abandons it. Every other character starts a new attribute name, which is
+ // accumulated in the tag buffer.
+ static StateResult handle_after_attr_name_state(GumboParser* parser,
+     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
+   switch (c) {
+     case '\t':
+     case '\n':
+     case '\f':
+     case ' ':
+       return NEXT_CHAR;
+     case '/':
+       gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
+       return NEXT_CHAR;
+     case '=':
+       gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_VALUE);
+       return NEXT_CHAR;
+     case '>':
+       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+       return emit_current_tag(parser, output);
+     case '\0':
+       tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
+       gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
+       // The attribute name lives in the tag buffer (see the default case),
+       // so the replacement character has to be appended there too; putting
+       // it in the temporary buffer would drop it from the name.
+       append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
+       return NEXT_CHAR;
+     case -1:
+       tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_EOF);
+       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+       abandon_current_tag(parser);
+       return NEXT_CHAR;
+     case '"':
+     case '\'':
+     case '<':
+       // Invalid first character: report it, then treat it as part of the
+       // name like any other character.
+       tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_INVALID);
+       // Fall through.
+     default:
+       gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
+       append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
+       return NEXT_CHAR;
+   }
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#before-attribute-value-state
+ // Handler for the before-attribute-value state.
+ //   c      - current input code point; -1 is treated as end of input.
+ //   output - token filled in by emit_current_tag() when '>' ends the tag.
+ // Whitespace is skipped. A quote selects the matching quoted-value state and
+ // resets the tag buffer start point. '&' is reconsumed in the unquoted-value
+ // state. '>' emits the tag but reports an error; EOF abandons the tag and is
+ // reconsumed in the data state. Anything else starts an unquoted value.
+ static StateResult handle_before_attr_value_state(GumboParser* parser,
+     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
+   if (c == '\t' || c == '\n' || c == '\f' || c == ' ') {
+     return NEXT_CHAR;
+   }
+   if (c == '"' || c == '\'') {
+     if (c == '"') {
+       gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED);
+     } else {
+       gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED);
+     }
+     reset_tag_buffer_start_point(parser);
+     return NEXT_CHAR;
+   }
+   if (c == '&') {
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED);
+     tokenizer->_reconsume_current_input = true;
+     return NEXT_CHAR;
+   }
+   if (c == '\0') {
+     tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED);
+     append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
+     return NEXT_CHAR;
+   }
+   if (c == -1) {
+     tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EOF);
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+     abandon_current_tag(parser);
+     tokenizer->_reconsume_current_input = true;
+     return NEXT_CHAR;
+   }
+   if (c == '>') {
+     // The tag is still emitted, but the missing value is an error.
+     tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_RIGHT_BRACKET);
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+     emit_current_tag(parser, output);
+     return RETURN_ERROR;
+   }
+
+   // '<', '=' and '`' are reported but still start the unquoted value.
+   if (c == '<' || c == '=' || c == '`') {
+     tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EQUALS);
+   }
+   gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED);
+   append_char_to_tag_buffer(parser, c, true);
+   return NEXT_CHAR;
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-value-double-quoted-state
+ // Handler for the attribute-value (double-quoted) state.
+ // The closing '"' moves to the after-attribute-value state. '&' records the
+ // current state in _attr_value_state and is reconsumed in the
+ // character-reference state. EOF abandons the tag and is reconsumed in the
+ // data state. Everything else is appended to the tag buffer, with NUL
+ // reported and replaced by the replacement character.
+ static StateResult handle_attr_value_double_quoted_state(GumboParser* parser,
+     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
+   if (c == '"') {
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED);
+     return NEXT_CHAR;
+   }
+   if (c == '&') {
+     tokenizer->_tag_state._attr_value_state = tokenizer->_state;
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE);
+     tokenizer->_reconsume_current_input = true;
+     return NEXT_CHAR;
+   }
+   if (c == -1) {
+     tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_DOUBLE_QUOTE_EOF);
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+     abandon_current_tag(parser);
+     tokenizer->_reconsume_current_input = true;
+     return NEXT_CHAR;
+   }
+   if (c == '\0') {
+     tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
+     append_char_to_tag_buffer(parser, kUtf8ReplacementChar, false);
+     return NEXT_CHAR;
+   }
+
+   append_char_to_tag_buffer(parser, c, false);
+   return NEXT_CHAR;
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-value-single-quoted-state
+ // Handler for the attribute-value (single-quoted) state.
+ // The closing '\'' moves to the after-attribute-value state. '&' records
+ // the current state in _attr_value_state and is reconsumed in the
+ // character-reference state. EOF abandons the tag and is reconsumed in the
+ // data state. Everything else is appended to the tag buffer, with NUL
+ // reported and replaced by the replacement character.
+ static StateResult handle_attr_value_single_quoted_state(GumboParser* parser,
+     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
+   if (c == '\'') {
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED);
+     return NEXT_CHAR;
+   }
+   if (c == '&') {
+     tokenizer->_tag_state._attr_value_state = tokenizer->_state;
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE);
+     tokenizer->_reconsume_current_input = true;
+     return NEXT_CHAR;
+   }
+   if (c == -1) {
+     tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_SINGLE_QUOTE_EOF);
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+     abandon_current_tag(parser);
+     tokenizer->_reconsume_current_input = true;
+     return NEXT_CHAR;
+   }
+   if (c == '\0') {
+     tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
+     append_char_to_tag_buffer(parser, kUtf8ReplacementChar, false);
+     return NEXT_CHAR;
+   }
+
+   append_char_to_tag_buffer(parser, c, false);
+   return NEXT_CHAR;
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-value-unquoted-state
+ // Handler for the attribute-value (unquoted) state.
+ //   c      - current input code point; -1 is treated as end of input.
+ //   output - token filled in by emit_current_tag() when '>' ends the tag.
+ // Whitespace and '>' finish the value; '&' is reconsumed in the
+ // character-reference state after recording the current state; EOF abandons
+ // the tag. All other characters are appended to the tag buffer, with a
+ // parse error for NUL, '<', '=', '"', '\'' and '`'.
+ static StateResult handle_attr_value_unquoted_state(GumboParser* parser,
+     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
+   if (c == '\t' || c == '\n' || c == '\f' || c == ' ') {
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
+     finish_attribute_value(parser);
+     return NEXT_CHAR;
+   }
+   if (c == '>') {
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+     finish_attribute_value(parser);
+     return emit_current_tag(parser, output);
+   }
+   if (c == '&') {
+     tokenizer->_tag_state._attr_value_state = tokenizer->_state;
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE);
+     tokenizer->_reconsume_current_input = true;
+     return NEXT_CHAR;
+   }
+   if (c == -1) {
+     tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EOF);
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+     tokenizer->_reconsume_current_input = true;
+     abandon_current_tag(parser);
+     return NEXT_CHAR;
+   }
+   if (c == '\0') {
+     tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
+     append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
+     return NEXT_CHAR;
+   }
+
+   // These characters are reported but still become part of the value.
+   if (c == '<' || c == '=' || c == '"' || c == '\'' || c == '`') {
+     tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EQUALS);
+   }
+   append_char_to_tag_buffer(parser, c, true);
+   return NEXT_CHAR;
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#character-reference-in-attribute-value-state
+ // Handler for the character-reference-in-attribute-value state.
+ // Chooses the extra allowed character from the attribute-value state saved
+ // in _attr_value_state, asks consume_char_ref() to decode a reference,
+ // appends the decoded code point(s) (or a literal '&' when nothing was
+ // decoded) to the tag buffer, and switches back to the saved state.
+ static StateResult handle_char_ref_in_attr_value_state(GumboParser* parser,
+     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
+   OneOrTwoCodepoints char_ref;
+   // Pre-initialised so the variable is defined even on the "impossible"
+   // path guarded by assert(0) below.
+   int allowed_char = ' ';
+   bool is_unquoted = false;
+
+   if (tokenizer->_tag_state._attr_value_state ==
+       GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED) {
+     allowed_char = '"';
+   } else if (tokenizer->_tag_state._attr_value_state ==
+              GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED) {
+     allowed_char = '\'';
+   } else if (tokenizer->_tag_state._attr_value_state ==
+              GUMBO_LEX_ATTR_VALUE_UNQUOTED) {
+     allowed_char = '>';
+     is_unquoted = true;
+   } else {
+     // Only the three attribute-value states are expected to lead here.
+     assert(0);
+   }
+
+   // The status returned by consume_char_ref() is deliberately dropped:
+   // there is no convenient way to signal a parse error raised in the middle
+   // of a multi-state token. That would need a flag in the tokenizer state
+   // and is a low-priority fix.
+   consume_char_ref(parser, &tokenizer->_input, allowed_char, true, &char_ref);
+
+   if (char_ref.first == kGumboNoChar) {
+     append_char_to_tag_buffer(parser, '&', is_unquoted);
+   } else {
+     tokenizer->_reconsume_current_input = true;
+     append_char_to_tag_buffer(parser, char_ref.first, is_unquoted);
+     if (char_ref.second != kGumboNoChar) {
+       append_char_to_tag_buffer(parser, char_ref.second, is_unquoted);
+     }
+   }
+
+   gumbo_tokenizer_set_state(parser, tokenizer->_tag_state._attr_value_state);
+   return NEXT_CHAR;
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#after-attribute-value-quoted-state
+ // Handler for the after-attribute-value (quoted) state.
+ // The attribute value is finished unconditionally before the character is
+ // examined. Whitespace and '/' move on, '>' emits the tag, EOF abandons it
+ // and is reconsumed in the data state. Any other character is an error and
+ // is reconsumed in the before-attribute-name state.
+ static StateResult handle_after_attr_value_quoted_state(GumboParser* parser,
+     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
+   finish_attribute_value(parser);
+
+   if (c == '\t' || c == '\n' || c == '\f' || c == ' ') {
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
+     return NEXT_CHAR;
+   }
+   if (c == '/') {
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
+     return NEXT_CHAR;
+   }
+   if (c == '>') {
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+     return emit_current_tag(parser, output);
+   }
+
+   if (c == -1) {
+     tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_AFTER_EOF);
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+     abandon_current_tag(parser);
+   } else {
+     tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_AFTER_INVALID);
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
+   }
+   // Both error paths hand the current character to the new state.
+   tokenizer->_reconsume_current_input = true;
+   return NEXT_CHAR;
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#self-closing-start-tag-state
+ // Handler for the self-closing-start-tag state.
+ // '>' marks the tag as self-closing and emits it. EOF abandons the tag. Any
+ // other character is an error and is reconsumed in the
+ // before-attribute-name state.
+ static StateResult handle_self_closing_start_tag_state(GumboParser* parser,
+     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
+   if (c == '>') {
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+     tokenizer->_tag_state._is_self_closing = true;
+     return emit_current_tag(parser, output);
+   }
+   if (c == -1) {
+     tokenizer_add_parse_error(parser, GUMBO_ERR_SOLIDUS_EOF);
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+     abandon_current_tag(parser);
+     return NEXT_CHAR;
+   }
+
+   tokenizer_add_parse_error(parser, GUMBO_ERR_SOLIDUS_INVALID);
+   gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
+   tokenizer->_reconsume_current_input = true;
+   return NEXT_CHAR;
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#bogus-comment-state
+ // Handler for the bogus-comment state.
+ // Copies input into the temporary buffer, advancing the input iterator
+ // itself, until a '>' or EOF is the current character; NUL is stored as
+ // U+FFFD. Then returns to the data state and emits the buffered text as a
+ // comment.
+ static StateResult handle_bogus_comment_state(GumboParser* parser,
+     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
+   for (; c != '>' && c != -1; c = utf8iterator_current(&tokenizer->_input)) {
+     if (c == '\0') {
+       append_char_to_temporary_buffer(parser, 0xFFFD);
+     } else {
+       append_char_to_temporary_buffer(parser, c);
+     }
+     utf8iterator_next(&tokenizer->_input);
+   }
+   gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+   return emit_comment(parser, output);
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#markup-declaration-open-state
+ // Handler for the markup-declaration-open state.
+ // Tries, in order, to consume "--" (comment), "DOCTYPE" (doctype) and, only
+ // when the current node is foreign, "[CDATA[" (CDATA section). If none
+ // matches, a parse error is recorded and the bogus-comment state takes
+ // over. Every branch asks for the current input to be reconsumed and the
+ // function always returns NEXT_CHAR.
+ static StateResult handle_markup_declaration_state(GumboParser* parser,
+     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
+   if (utf8iterator_maybe_consume_match(
+           &tokenizer->_input, "--", sizeof("--") - 1, true)) {
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_START);
+     tokenizer->_reconsume_current_input = true;
+     return NEXT_CHAR;
+   }
+
+   if (utf8iterator_maybe_consume_match(
+           &tokenizer->_input, "DOCTYPE", sizeof("DOCTYPE") - 1, false)) {
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE);
+     tokenizer->_reconsume_current_input = true;
+     // From here on a doctype token will eventually be emitted, so this is
+     // the point at which the doctype strings are initialized. (Doing it in
+     // doctype_state_init would leak them whenever ownership is never
+     // transferred to a doctype token.)
+     tokenizer->_doc_type_state.name = gumbo_copy_stringz(parser, "");
+     tokenizer->_doc_type_state.public_identifier =
+         gumbo_copy_stringz(parser, "");
+     tokenizer->_doc_type_state.system_identifier =
+         gumbo_copy_stringz(parser, "");
+     return NEXT_CHAR;
+   }
+
+   if (tokenizer->_is_current_node_foreign &&
+       utf8iterator_maybe_consume_match(
+           &tokenizer->_input, "[CDATA[", sizeof("[CDATA[") - 1, true)) {
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_CDATA);
+     tokenizer->_is_in_cdata = true;
+     tokenizer->_reconsume_current_input = true;
+     return NEXT_CHAR;
+   }
+
+   tokenizer_add_parse_error(parser, GUMBO_ERR_DASHES_OR_DOCTYPE);
+   gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_COMMENT);
+   tokenizer->_reconsume_current_input = true;
+   clear_temporary_buffer(parser);
+   return NEXT_CHAR;
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-start-state
+ // Handler for the comment-start state.
+ // '-' moves to the comment-start-dash state. '>' and EOF are errors that
+ // still emit the comment and return to the data state. Anything else moves
+ // to the comment state and is buffered (NUL is reported and stored as the
+ // replacement character).
+ static StateResult handle_comment_start_state(GumboParser* parser,
+     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
+   if (c == '-') {
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_START_DASH);
+     return NEXT_CHAR;
+   }
+   if (c == '>' || c == -1) {
+     if (c == '>') {
+       tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_INVALID);
+     } else {
+       tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
+     }
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+     emit_comment(parser, output);
+     return RETURN_ERROR;
+   }
+
+   if (c == '\0') {
+     tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
+     append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
+   } else {
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
+     append_char_to_temporary_buffer(parser, c);
+   }
+   return NEXT_CHAR;
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-start-dash-state
+ // Handler for the comment-start-dash state.
+ // A second '-' moves to the comment-end state. '>' and EOF are errors that
+ // still emit the comment and return to the data state. Anything else moves
+ // to the comment state and buffers a '-' followed by the character (NUL is
+ // reported and stored as the replacement character).
+ static StateResult handle_comment_start_dash_state(GumboParser* parser,
+     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
+   if (c == '-') {
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END);
+     return NEXT_CHAR;
+   }
+   if (c == '>' || c == -1) {
+     if (c == '>') {
+       tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_INVALID);
+     } else {
+       tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
+     }
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+     emit_comment(parser, output);
+     return RETURN_ERROR;
+   }
+
+   if (c == '\0') {
+     tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
+   }
+   gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
+   // The dash that led into this state turned out to be comment text.
+   append_char_to_temporary_buffer(parser, '-');
+   if (c == '\0') {
+     append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
+   } else {
+     append_char_to_temporary_buffer(parser, c);
+   }
+   return NEXT_CHAR;
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-state
+ // Handler for the comment state.
+ // '-' moves to the comment-end-dash state. EOF is an error that still emits
+ // the comment and returns to the data state. Every other character is
+ // buffered (NUL is reported and stored as the replacement character).
+ static StateResult handle_comment_state(GumboParser* parser,
+     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
+   if (c == '-') {
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_DASH);
+     return NEXT_CHAR;
+   }
+   if (c == -1) {
+     tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+     emit_comment(parser, output);
+     return RETURN_ERROR;
+   }
+
+   if (c == '\0') {
+     tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
+     append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
+   } else {
+     append_char_to_temporary_buffer(parser, c);
+   }
+   return NEXT_CHAR;
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-end-dash-state
+ // Handler for the comment-end-dash state.
+ // A second '-' moves to the comment-end state. EOF is an error that still
+ // emits the comment and returns to the data state. Anything else returns to
+ // the comment state and buffers a '-' followed by the character (NUL is
+ // reported and stored as the replacement character).
+ static StateResult handle_comment_end_dash_state(GumboParser* parser,
+     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
+   if (c == '-') {
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END);
+     return NEXT_CHAR;
+   }
+   if (c == -1) {
+     tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+     emit_comment(parser, output);
+     return RETURN_ERROR;
+   }
+
+   if (c == '\0') {
+     tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
+   }
+   gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
+   // The pending dash was ordinary comment text after all.
+   append_char_to_temporary_buffer(parser, '-');
+   if (c == '\0') {
+     append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
+   } else {
+     append_char_to_temporary_buffer(parser, c);
+   }
+   return NEXT_CHAR;
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-end-state
+ // Handler for the comment-end state.
+ //   c      - current input code point; -1 is treated as end of input.
+ //   output - token filled in by emit_comment().
+ // '>' emits the comment. '!' moves to the comment-end-bang state and an
+ // extra '-' is buffered, both with a parse error. EOF is an error that
+ // still emits the comment. Anything else is an error that returns to the
+ // comment state, buffering "--" plus the character (NUL as the replacement
+ // character).
+ static StateResult handle_comment_end_state(GumboParser* parser,
+     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
+   switch (c) {
+     case '>':
+       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+       return emit_comment(parser, output);
+     case '\0':
+       tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
+       gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
+       append_char_to_temporary_buffer(parser, '-');
+       append_char_to_temporary_buffer(parser, '-');
+       append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
+       return NEXT_CHAR;
+     case '!':
+       tokenizer_add_parse_error(
+           parser, GUMBO_ERR_COMMENT_BANG_AFTER_DOUBLE_DASH);
+       gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_BANG);
+       return NEXT_CHAR;
+     case '-':
+       tokenizer_add_parse_error(
+           parser, GUMBO_ERR_COMMENT_DASH_AFTER_DOUBLE_DASH);
+       append_char_to_temporary_buffer(parser, '-');
+       return NEXT_CHAR;
+     case -1:
+       // End of input inside a comment is an EOF error, not a NUL-character
+       // error; this matches the EOF branches of the other comment states.
+       tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
+       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+       emit_comment(parser, output);
+       return RETURN_ERROR;
+     default:
+       tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_INVALID);
+       gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
+       append_char_to_temporary_buffer(parser, '-');
+       append_char_to_temporary_buffer(parser, '-');
+       append_char_to_temporary_buffer(parser, c);
+       return NEXT_CHAR;
+   }
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-end-bang-state
+ // Handler for the comment-end-bang state.
+ // '>' emits the comment. EOF is an error that still emits it. For every
+ // other character the pending "--!" is buffered as comment text: '-' then
+ // moves to the comment-end-dash state, while anything else returns to the
+ // comment state and is buffered too (NUL is reported and stored as the
+ // replacement character).
+ static StateResult handle_comment_end_bang_state(GumboParser* parser,
+     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
+   if (c == '>') {
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+     return emit_comment(parser, output);
+   }
+   if (c == -1) {
+     tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_END_BANG_EOF);
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+     emit_comment(parser, output);
+     return RETURN_ERROR;
+   }
+
+   if (c == '\0') {
+     tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
+   }
+   if (c == '-') {
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_DASH);
+   } else {
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
+   }
+
+   append_char_to_temporary_buffer(parser, '-');
+   append_char_to_temporary_buffer(parser, '-');
+   append_char_to_temporary_buffer(parser, '!');
+   if (c == '\0') {
+     append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
+   } else if (c != '-') {
+     append_char_to_temporary_buffer(parser, c);
+   }
+   return NEXT_CHAR;
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#doctype-state
+ // Handler for the doctype state. Asserts that the temporary buffer is
+ // empty on entry.
+ // Whitespace moves to the before-doctype-name state. EOF sets force_quirks,
+ // emits the doctype and reports an error. Any other character is an error
+ // that sets force_quirks and is reconsumed in the before-doctype-name state.
+ static StateResult handle_doctype_state(GumboParser* parser,
+     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
+   assert(!tokenizer->_temporary_buffer.length);
+
+   if (c == '\t' || c == '\n' || c == '\f' || c == ' ') {
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_NAME);
+     return NEXT_CHAR;
+   }
+   if (c == -1) {
+     tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+     tokenizer->_doc_type_state.force_quirks = true;
+     emit_doctype(parser, output);
+     return RETURN_ERROR;
+   }
+
+   tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_SPACE);
+   gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_NAME);
+   tokenizer->_reconsume_current_input = true;
+   tokenizer->_doc_type_state.force_quirks = true;
+   return NEXT_CHAR;
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#before-doctype-name-state
+ // Handler for the before-doctype-name state.
+ // Whitespace is skipped. '>' and EOF are errors that set force_quirks and
+ // emit the doctype. NUL is an error that sets force_quirks and starts the
+ // name with the replacement character. Any other character clears
+ // force_quirks and starts the name with its lowercased form.
+ static StateResult handle_before_doctype_name_state(GumboParser* parser,
+     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
+   if (c == '\t' || c == '\n' || c == '\f' || c == ' ') {
+     return NEXT_CHAR;
+   }
+   if (c == '>' || c == -1) {
+     if (c == '>') {
+       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_RIGHT_BRACKET);
+     } else {
+       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
+     }
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+     tokenizer->_doc_type_state.force_quirks = true;
+     emit_doctype(parser, output);
+     return RETURN_ERROR;
+   }
+
+   if (c == '\0') {
+     tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE_NAME);
+     tokenizer->_doc_type_state.force_quirks = true;
+     append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
+   } else {
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE_NAME);
+     tokenizer->_doc_type_state.force_quirks = false;
+     append_char_to_temporary_buffer(parser, ensure_lowercase(c));
+   }
+   return NEXT_CHAR;
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#doctype-name-state
+ // Handler for the doctype-name state.
+ // Name characters accumulate in the temporary buffer (lowercased; NUL is
+ // reported and stored as the replacement character). Whitespace, '>' and
+ // EOF end the name: the previous name string is deallocated and replaced
+ // by the buffered one. '>' then emits the doctype successfully; EOF sets
+ // force_quirks, emits it and reports an error.
+ static StateResult handle_doctype_name_state(GumboParser* parser,
+     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
+   if (c == '\0') {
+     tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
+     append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
+     return NEXT_CHAR;
+   }
+   if (c == '\t' || c == '\n' || c == '\f' || c == ' ') {
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_NAME);
+     gumbo_parser_deallocate(parser, (void*) tokenizer->_doc_type_state.name);
+     finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name);
+     return NEXT_CHAR;
+   }
+   if (c == '>') {
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+     gumbo_parser_deallocate(parser, (void*) tokenizer->_doc_type_state.name);
+     finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name);
+     emit_doctype(parser, output);
+     return RETURN_SUCCESS;
+   }
+   if (c == -1) {
+     tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+     tokenizer->_doc_type_state.force_quirks = true;
+     gumbo_parser_deallocate(parser, (void*) tokenizer->_doc_type_state.name);
+     finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name);
+     emit_doctype(parser, output);
+     return RETURN_ERROR;
+   }
+
+   // Ordinary name character: stay in this state and keep accumulating.
+   gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE_NAME);
+   tokenizer->_doc_type_state.force_quirks = false;
+   append_char_to_temporary_buffer(parser, ensure_lowercase(c));
+   return NEXT_CHAR;
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-name-state
+ // Handler for the after-doctype-name state.
+ // Whitespace is skipped. '>' emits the doctype successfully. EOF sets
+ // force_quirks, emits it and reports an error. Otherwise the input is
+ // matched against the keywords "PUBLIC" and then "SYSTEM"; a match moves to
+ // the corresponding after-keyword state with the current input reconsumed.
+ // With no match, an error is recorded, force_quirks is set and the
+ // bogus-doctype state takes over.
+ static StateResult handle_after_doctype_name_state(GumboParser* parser,
+     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
+   if (c == '\t' || c == '\n' || c == '\f' || c == ' ') {
+     return NEXT_CHAR;
+   }
+   if (c == '>') {
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+     emit_doctype(parser, output);
+     return RETURN_SUCCESS;
+   }
+   if (c == -1) {
+     tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+     tokenizer->_doc_type_state.force_quirks = true;
+     emit_doctype(parser, output);
+     return RETURN_ERROR;
+   }
+
+   if (utf8iterator_maybe_consume_match(
+           &tokenizer->_input, "PUBLIC", sizeof("PUBLIC") - 1, false)) {
+     gumbo_tokenizer_set_state(
+         parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD);
+     tokenizer->_reconsume_current_input = true;
+     return NEXT_CHAR;
+   }
+   if (utf8iterator_maybe_consume_match(
+           &tokenizer->_input, "SYSTEM", sizeof("SYSTEM") - 1, false)) {
+     gumbo_tokenizer_set_state(
+         parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD);
+     tokenizer->_reconsume_current_input = true;
+     return NEXT_CHAR;
+   }
+
+   tokenizer_add_parse_error(
+       parser, GUMBO_ERR_DOCTYPE_SPACE_OR_RIGHT_BRACKET);
+   gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
+   tokenizer->_doc_type_state.force_quirks = true;
+   return NEXT_CHAR;
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-public-keyword-state
+ // Handler for the after-doctype-public-keyword state.
+ // Whitespace moves to the before-doctype-public-identifier state. A quote
+ // directly after the keyword is an error but still opens the matching
+ // quoted public-identifier state. '>', EOF and any other character are
+ // errors that set force_quirks and emit the doctype.
+ // NOTE(review): the catch-all branch emits a doctype *and* enters the
+ // bogus-doctype state; confirm against the bogus-doctype handler that this
+ // cannot produce a second doctype token.
+ static StateResult handle_after_doctype_public_keyword_state(
+     GumboParser* parser, GumboTokenizerState* tokenizer, int c,
+     GumboToken* output) {
+   if (c == '\t' || c == '\n' || c == '\f' || c == ' ') {
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID);
+     return NEXT_CHAR;
+   }
+   if (c == '"' || c == '\'') {
+     tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
+     assert(temporary_buffer_equals(parser, ""));
+     if (c == '"') {
+       gumbo_tokenizer_set_state(
+           parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED);
+     } else {
+       gumbo_tokenizer_set_state(
+           parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED);
+     }
+     return NEXT_CHAR;
+   }
+
+   // Every remaining input ends with an emitted, force-quirks doctype; only
+   // the reported error and the next state differ.
+   if (c == '>') {
+     tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_RIGHT_BRACKET);
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+   } else if (c == -1) {
+     tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+   } else {
+     tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
+   }
+   tokenizer->_doc_type_state.force_quirks = true;
+   emit_doctype(parser, output);
+   return RETURN_ERROR;
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#before-doctype-public-identifier-state
+ // Handler for the before-doctype-public-identifier state.
+ // Whitespace is skipped. A quote opens the matching quoted
+ // public-identifier state (the temporary buffer is asserted to be empty).
+ // '>', EOF and any other character are errors that set force_quirks and
+ // emit the doctype.
+ // NOTE(review): the catch-all branch emits a doctype *and* enters the
+ // bogus-doctype state; confirm against the bogus-doctype handler that this
+ // cannot produce a second doctype token.
+ static StateResult handle_before_doctype_public_id_state(GumboParser* parser,
+     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
+   if (c == '\t' || c == '\n' || c == '\f' || c == ' ') {
+     return NEXT_CHAR;
+   }
+   if (c == '"' || c == '\'') {
+     assert(temporary_buffer_equals(parser, ""));
+     if (c == '"') {
+       gumbo_tokenizer_set_state(
+           parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED);
+     } else {
+       gumbo_tokenizer_set_state(
+           parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED);
+     }
+     return NEXT_CHAR;
+   }
+
+   // Every remaining input ends with an emitted, force-quirks doctype; only
+   // the reported error and the next state differ.
+   if (c == '>') {
+     tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+   } else if (c == -1) {
+     tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+   } else {
+     tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
+   }
+   tokenizer->_doc_type_state.force_quirks = true;
+   emit_doctype(parser, output);
+   return RETURN_ERROR;
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-public-identifier-(double-quoted)-state
+ // Handler for the doctype-public-identifier (double-quoted) state.
+ // The closing '"' finishes the public identifier and moves on. '>' and EOF
+ // are errors that set force_quirks, finish the identifier and emit the
+ // doctype. All other characters are buffered (NUL is reported and stored
+ // as the replacement character).
+ static StateResult handle_doctype_public_id_double_quoted_state(
+     GumboParser* parser, GumboTokenizerState* tokenizer, int c,
+     GumboToken* output) {
+   if (c == '"') {
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID);
+     finish_doctype_public_id(parser);
+     return NEXT_CHAR;
+   }
+   if (c == '>' || c == -1) {
+     if (c == '>') {
+       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
+     } else {
+       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
+     }
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+     tokenizer->_doc_type_state.force_quirks = true;
+     finish_doctype_public_id(parser);
+     emit_doctype(parser, output);
+     return RETURN_ERROR;
+   }
+
+   if (c == '\0') {
+     tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
+     append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
+   } else {
+     append_char_to_temporary_buffer(parser, c);
+   }
+   return NEXT_CHAR;
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-public-identifier-(single-quoted)-state
+ //
+ // Single-quoted counterpart of the double-quoted public identifier state:
+ // characters are appended to the temporary buffer until the closing '\''.
+ // A premature '>' or end of input (c == -1) finishes the identifier, emits a
+ // force-quirks DOCTYPE into `output` and returns RETURN_ERROR.
+ static StateResult handle_doctype_public_id_single_quoted_state(
+     GumboParser* parser, GumboTokenizerState* tokenizer, int c,
+     GumboToken* output) {
+   const bool ended_early = (c == '>' || c == -1);
+
+   if (ended_early) {
+     if (c == '>') {
+       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
+     } else {
+       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
+     }
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+     tokenizer->_doc_type_state.force_quirks = true;
+     finish_doctype_public_id(parser);
+     emit_doctype(parser, output);
+     return RETURN_ERROR;
+   }
+
+   if (c == '\'') {
+     // Closing quote: the identifier is complete.
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID);
+     finish_doctype_public_id(parser);
+   } else if (c == '\0') {
+     // NUL is reported and stored as the replacement character.
+     tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
+     append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
+   } else {
+     append_char_to_temporary_buffer(parser, c);
+   }
+   return NEXT_CHAR;
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-public-identifier-state
+ //
+ // Runs right after a DOCTYPE public identifier has been closed. Whitespace
+ // moves on to the "between public and system identifiers" state, '>' emits
+ // the DOCTYPE successfully, and a quote starts the system identifier (with a
+ // parse error, since no whitespace separated the two identifiers).
+ static StateResult handle_after_doctype_public_id_state(GumboParser* parser,
+     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
+   if (c == '\t' || c == '\n' || c == '\f' || c == ' ') {
+     gumbo_tokenizer_set_state(
+         parser, GUMBO_LEX_BETWEEN_DOCTYPE_PUBLIC_SYSTEM_ID);
+     return NEXT_CHAR;
+   }
+
+   if (c == '>') {
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+     emit_doctype(parser, output);
+     return RETURN_SUCCESS;
+   }
+
+   if (c == '"' || c == '\'') {
+     tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
+     assert(temporary_buffer_equals(parser, ""));
+     gumbo_tokenizer_set_state(parser,
+         (c == '"') ? GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED
+                    : GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
+     return NEXT_CHAR;
+   }
+
+   if (c == -1) {
+     // End of input: emit what we have as a force-quirks DOCTYPE.
+     tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+     tokenizer->_reconsume_current_input = true;
+     tokenizer->_doc_type_state.force_quirks = true;
+     emit_doctype(parser, output);
+     return RETURN_ERROR;
+   }
+
+   // Anything else: the DOCTYPE is bogus; the bogus state emits it later.
+   tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
+   gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
+   tokenizer->_doc_type_state.force_quirks = true;
+   return NEXT_CHAR;
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#between-doctype-public-and-system-identifiers-state
+ //
+ // Handles one input character `c` after the whitespace that follows a DOCTYPE
+ // public identifier, i.e. while an optional system identifier may still
+ // start. `c == -1` is treated as end of input.
+ //
+ // Returns NEXT_CHAR to keep lexing, RETURN_SUCCESS when '>' closes a valid
+ // DOCTYPE (token written to `output`), or RETURN_ERROR when end of input
+ // forces a quirks-mode DOCTYPE to be emitted.
+ static StateResult handle_between_doctype_public_system_id_state(
+     GumboParser* parser, GumboTokenizerState* tokenizer, int c,
+     GumboToken* output) {
+   switch (c) {
+     case '\t':
+     case '\n':
+     case '\f':
+     case ' ':
+       // Additional whitespace is skipped.
+       return NEXT_CHAR;
+     case '>':
+       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+       emit_doctype(parser, output);
+       return RETURN_SUCCESS;
+     case '"':
+       assert(temporary_buffer_equals(parser, ""));
+       gumbo_tokenizer_set_state(
+           parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
+       return NEXT_CHAR;
+     case '\'':
+       assert(temporary_buffer_equals(parser, ""));
+       gumbo_tokenizer_set_state(
+           parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
+       return NEXT_CHAR;
+     case -1:
+       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
+       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+       tokenizer->_doc_type_state.force_quirks = true;
+       emit_doctype(parser, output);
+       return RETURN_ERROR;
+     default:
+       // Anything else makes the DOCTYPE bogus. The token is deliberately NOT
+       // emitted here: handle_bogus_doctype_state() emits it when it reaches
+       // '>' or end of input, so emitting now would emit the DOCTYPE twice.
+       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
+       gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
+       tokenizer->_doc_type_state.force_quirks = true;
+       return NEXT_CHAR;
+   }
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-system-keyword-state
+ //
+ // Handles the character that follows the SYSTEM keyword of a DOCTYPE.
+ // Whitespace is the well-formed continuation; a quote directly after the
+ // keyword still starts the system identifier but records a parse error.
+ static StateResult handle_after_doctype_system_keyword_state(
+     GumboParser* parser, GumboTokenizerState* tokenizer, int c,
+     GumboToken* output) {
+   if (c == '\t' || c == '\n' || c == '\f' || c == ' ') {
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_SYSTEM_ID);
+     return NEXT_CHAR;
+   }
+
+   if (c == '"' || c == '\'') {
+     // Missing whitespace between the keyword and the identifier.
+     tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
+     assert(temporary_buffer_equals(parser, ""));
+     if (c == '"') {
+       gumbo_tokenizer_set_state(
+           parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
+     } else {
+       gumbo_tokenizer_set_state(
+           parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
+     }
+     return NEXT_CHAR;
+   }
+
+   if (c == '>' || c == -1) {
+     // The DOCTYPE ended without a system identifier: emit it in quirks mode.
+     tokenizer_add_parse_error(
+         parser, (c == '>') ? GUMBO_ERR_DOCTYPE_END : GUMBO_ERR_DOCTYPE_EOF);
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+     tokenizer->_doc_type_state.force_quirks = true;
+     emit_doctype(parser, output);
+     return RETURN_ERROR;
+   }
+
+   // Anything else: the DOCTYPE is bogus; the bogus state emits it later.
+   tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
+   gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
+   tokenizer->_doc_type_state.force_quirks = true;
+   return NEXT_CHAR;
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#before-doctype-system-identifier-state
+ //
+ // Skips whitespace while waiting for the opening quote of a DOCTYPE system
+ // identifier. '>' or end of input (c == -1) before any identifier emits a
+ // force-quirks DOCTYPE into `output` and returns RETURN_ERROR.
+ static StateResult handle_before_doctype_system_id_state(GumboParser* parser,
+     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
+   if (c == '\t' || c == '\n' || c == '\f' || c == ' ') {
+     return NEXT_CHAR;
+   }
+
+   if (c == '"') {
+     assert(temporary_buffer_equals(parser, ""));
+     gumbo_tokenizer_set_state(
+         parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
+     return NEXT_CHAR;
+   }
+
+   if (c == '\'') {
+     assert(temporary_buffer_equals(parser, ""));
+     gumbo_tokenizer_set_state(
+         parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
+     return NEXT_CHAR;
+   }
+
+   if (c == '>' || c == -1) {
+     if (c == '>') {
+       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
+     } else {
+       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
+     }
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+     tokenizer->_doc_type_state.force_quirks = true;
+     emit_doctype(parser, output);
+     return RETURN_ERROR;
+   }
+
+   // Anything else: the DOCTYPE is bogus; the bogus state emits it later.
+   tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
+   gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
+   tokenizer->_doc_type_state.force_quirks = true;
+   return NEXT_CHAR;
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-system-identifier-(double-quoted)-state
+ //
+ // Accumulates the characters of a double-quoted DOCTYPE system identifier in
+ // the temporary buffer until the closing '"' is seen. A premature '>' or end
+ // of input (c == -1) finishes the identifier, emits a force-quirks DOCTYPE
+ // into `output` and returns RETURN_ERROR; every other path returns NEXT_CHAR.
+ static StateResult handle_doctype_system_id_double_quoted_state(
+     GumboParser* parser, GumboTokenizerState* tokenizer, int c,
+     GumboToken* output) {
+   if (c == '"') {
+     // Closing quote: the identifier is complete.
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID);
+     finish_doctype_system_id(parser);
+     return NEXT_CHAR;
+   }
+
+   if (c == '\0') {
+     // NUL is reported and stored as the replacement character.
+     tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
+     append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
+     return NEXT_CHAR;
+   }
+
+   if (c == '>' || c == -1) {
+     // The DOCTYPE ended before the identifier was closed.
+     tokenizer_add_parse_error(
+         parser, (c == '>') ? GUMBO_ERR_DOCTYPE_END : GUMBO_ERR_DOCTYPE_EOF);
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+     tokenizer->_doc_type_state.force_quirks = true;
+     finish_doctype_system_id(parser);
+     emit_doctype(parser, output);
+     return RETURN_ERROR;
+   }
+
+   append_char_to_temporary_buffer(parser, c);
+   return NEXT_CHAR;
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-system-identifier-(single-quoted)-state
+ //
+ // Accumulates the characters of a single-quoted DOCTYPE system identifier in
+ // the temporary buffer until the closing '\'' is seen. `c == -1` is treated
+ // as end of input.
+ //
+ // Returns NEXT_CHAR to keep lexing, or RETURN_ERROR after a force-quirks
+ // DOCTYPE token has been written to `output` because '>' or end of input
+ // arrived before the closing quote.
+ static StateResult handle_doctype_system_id_single_quoted_state(
+     GumboParser* parser, GumboTokenizerState* tokenizer, int c,
+     GumboToken* output) {
+   switch (c) {
+     case '\'':
+       // Closing quote: the identifier is complete.
+       gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID);
+       finish_doctype_system_id(parser);
+       return NEXT_CHAR;
+     case '\0':
+       // NUL is reported and stored as the replacement character.
+       tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
+       append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
+       return NEXT_CHAR;
+     case '>':
+       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
+       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+       tokenizer->_doc_type_state.force_quirks = true;
+       finish_doctype_system_id(parser);
+       emit_doctype(parser, output);
+       return RETURN_ERROR;
+     case -1:
+       // End of input is reported as DOCTYPE_EOF, matching the double-quoted
+       // variant and every other DOCTYPE state's end-of-input branch.
+       tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
+       gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+       tokenizer->_doc_type_state.force_quirks = true;
+       finish_doctype_system_id(parser);
+       emit_doctype(parser, output);
+       return RETURN_ERROR;
+     default:
+       append_char_to_temporary_buffer(parser, c);
+       return NEXT_CHAR;
+   }
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-system-identifier-state
+ //
+ // Runs after a DOCTYPE system identifier has been closed. Only whitespace and
+ // the terminating '>' are well-formed here. Trailing garbage switches to the
+ // bogus-DOCTYPE state; unlike the other DOCTYPE states, this branch does not
+ // set force_quirks.
+ static StateResult handle_after_doctype_system_id_state(GumboParser* parser,
+     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
+   if (c == '\t' || c == '\n' || c == '\f' || c == ' ') {
+     return NEXT_CHAR;
+   }
+
+   if (c == '>') {
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+     emit_doctype(parser, output);
+     return RETURN_SUCCESS;
+   }
+
+   if (c == -1) {
+     // End of input: emit what we have as a force-quirks DOCTYPE.
+     tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
+     gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+     tokenizer->_doc_type_state.force_quirks = true;
+     emit_doctype(parser, output);
+     return RETURN_ERROR;
+   }
+
+   tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
+   gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
+   return NEXT_CHAR;
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#bogus-doctype-state
+ //
+ // Skips every character of a malformed DOCTYPE until '>' or end of input
+ // (c == -1), then emits the DOCTYPE token into `output`. The emission is
+ // always reported as RETURN_ERROR.
+ static StateResult handle_bogus_doctype_state(GumboParser* parser,
+     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
+   if (c != '>' && c != -1) {
+     // Still inside the bogus DOCTYPE: discard the character.
+     return NEXT_CHAR;
+   }
+
+   gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+   emit_doctype(parser, output);
+   return RETURN_ERROR;
+ }
+
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#cdata-section-state
+ //
+ // Emits the characters of a CDATA section one at a time. The section ends at
+ // end of input (c == -1) or when the input matches "]]>"; the tokenizer then
+ // returns to the data state without emitting a token.
+ static StateResult handle_cdata_state(GumboParser* parser,
+     GumboTokenizerState* tokenizer, int c, GumboToken* output) {
+   // The end-of-input test must stay first so that the match (which may
+   // consume input) is only attempted when there is a real character.
+   const bool section_ended =
+       (c == -1) || utf8iterator_maybe_consume_match(&tokenizer->_input, "]]>",
+                        sizeof("]]>") - 1, true);
+
+   if (!section_ended) {
+     return emit_current_char(parser, output);
+   }
+
+   tokenizer->_reconsume_current_input = true;
+   reset_token_start_point(tokenizer);
+   gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+   tokenizer->_is_in_cdata = false;
+   return NEXT_CHAR;
+ }
+
+ // Signature shared by every per-state handler: (parser, tokenizer state,
+ // current code point, token to fill in).
+ typedef StateResult (*GumboLexerStateFunction)(
+     GumboParser*, GumboTokenizerState*, int, GumboToken*);
+
+ // Handler lookup table. gumbo_lex() indexes it with tokenizer->_state, so the
+ // entries must stay in exactly this order.
+ static GumboLexerStateFunction dispatch_table[] = {
+     // Data / RCDATA / RAWTEXT / script / plaintext.
+     handle_data_state,
+     handle_char_ref_in_data_state,
+     handle_rcdata_state,
+     handle_char_ref_in_rcdata_state,
+     handle_rawtext_state,
+     handle_script_state,
+     handle_plaintext_state,
+     // Tags.
+     handle_tag_open_state,
+     handle_end_tag_open_state,
+     handle_tag_name_state,
+     handle_rcdata_lt_state,
+     handle_rcdata_end_tag_open_state,
+     handle_rcdata_end_tag_name_state,
+     handle_rawtext_lt_state,
+     handle_rawtext_end_tag_open_state,
+     handle_rawtext_end_tag_name_state,
+     // Script data.
+     handle_script_lt_state,
+     handle_script_end_tag_open_state,
+     handle_script_end_tag_name_state,
+     handle_script_escaped_start_state,
+     handle_script_escaped_start_dash_state,
+     handle_script_escaped_state,
+     handle_script_escaped_dash_state,
+     handle_script_escaped_dash_dash_state,
+     handle_script_escaped_lt_state,
+     handle_script_escaped_end_tag_open_state,
+     handle_script_escaped_end_tag_name_state,
+     handle_script_double_escaped_start_state,
+     handle_script_double_escaped_state,
+     handle_script_double_escaped_dash_state,
+     handle_script_double_escaped_dash_dash_state,
+     handle_script_double_escaped_lt_state,
+     handle_script_double_escaped_end_state,
+     // Attributes.
+     handle_before_attr_name_state,
+     handle_attr_name_state,
+     handle_after_attr_name_state,
+     handle_before_attr_value_state,
+     handle_attr_value_double_quoted_state,
+     handle_attr_value_single_quoted_state,
+     handle_attr_value_unquoted_state,
+     handle_char_ref_in_attr_value_state,
+     handle_after_attr_value_quoted_state,
+     handle_self_closing_start_tag_state,
+     // Comments and markup declarations.
+     handle_bogus_comment_state,
+     handle_markup_declaration_state,
+     handle_comment_start_state,
+     handle_comment_start_dash_state,
+     handle_comment_state,
+     handle_comment_end_dash_state,
+     handle_comment_end_state,
+     handle_comment_end_bang_state,
+     // DOCTYPE.
+     handle_doctype_state,
+     handle_before_doctype_name_state,
+     handle_doctype_name_state,
+     handle_after_doctype_name_state,
+     handle_after_doctype_public_keyword_state,
+     handle_before_doctype_public_id_state,
+     handle_doctype_public_id_double_quoted_state,
+     handle_doctype_public_id_single_quoted_state,
+     handle_after_doctype_public_id_state,
+     handle_between_doctype_public_system_id_state,
+     handle_after_doctype_system_keyword_state,
+     handle_before_doctype_system_id_state,
+     handle_doctype_system_id_double_quoted_state,
+     handle_doctype_system_id_single_quoted_state,
+     handle_after_doctype_system_id_state,
+     handle_bogus_doctype_state,
+     // CDATA.
+     handle_cdata_state,
+ };
+
+ // Produces the next token into `output`. Returns true when the token was
+ // produced without error (handler returned RETURN_SUCCESS, or a buffered
+ // character was flushed) and false when the handler returned RETURN_ERROR.
+ bool gumbo_lex(GumboParser* parser, GumboToken* output) {
+   // The spec requires that:
+   //
+   // 1. Tokens are handled by the parser immediately upon emission.
+   // 2. Some states (e.g. CDATA, or various error conditions) emit several
+   //    tokens from the same state.
+   // 3. The tokenizer often reconsumes the same character in another state.
+   //
+   // Consequently all state lives in the GumboTokenizer struct rather than in
+   // locals of this function. That lets us return with a token and later jump
+   // straight back into the same state with the same input when another token
+   // is due. The emit_* functions are responsible for changing state (e.g.
+   // flushing the chardata buffer, reading the next input character) so that
+   // this does not loop forever.
+   GumboTokenizerState* tokenizer = parser->_tokenizer_state;
+
+   if (tokenizer->_buffered_emit_char != kGumboNoChar) {
+     // Flush the pending character without advancing the input, then clear
+     // the reconsume flag again so the *next* character is not consumed twice.
+     tokenizer->_reconsume_current_input = true;
+     emit_char(parser, tokenizer->_buffered_emit_char, output);
+     tokenizer->_reconsume_current_input = false;
+     tokenizer->_buffered_emit_char = kGumboNoChar;
+     return true;
+   }
+
+   if (maybe_emit_from_temporary_buffer(parser, output)) {
+     return true;
+   }
+
+   for (;;) {
+     assert(!tokenizer->_temporary_buffer_emit);
+     assert(tokenizer->_buffered_emit_char == kGumboNoChar);
+
+     const int c = utf8iterator_current(&tokenizer->_input);
+     gumbo_debug(
+         "Lexing character '%c' (%d) in state %d.\n", c, c, tokenizer->_state);
+
+     const GumboLexerStateFunction handler = dispatch_table[tokenizer->_state];
+     const StateResult result = handler(parser, tokenizer, c, output);
+
+     // The reconsume flag is sampled and cleared before any return to prevent
+     // certain infinite-loop states.
+     const bool reconsume = tokenizer->_reconsume_current_input;
+     tokenizer->_reconsume_current_input = false;
+
+     switch (result) {
+       case RETURN_SUCCESS:
+         return true;
+       case RETURN_ERROR:
+         return false;
+       default:
+         break;
+     }
+
+     if (!reconsume) {
+       utf8iterator_next(&tokenizer->_input);
+     }
+   }
+ }
+
+ // Releases the heap data referenced by `token` through
+ // gumbo_parser_deallocate(): the three DOCTYPE strings, the attributes (and
+ // their backing array) of a start tag, or the text of a comment. Other token
+ // types reference nothing that is freed here. A null `token` is a no-op.
+ void gumbo_token_destroy(GumboParser* parser, GumboToken* token) {
+   if (!token) {
+     return;
+   }
+
+   if (token->type == GUMBO_TOKEN_DOCTYPE) {
+     gumbo_parser_deallocate(parser, (void*) token->v.doc_type.name);
+     gumbo_parser_deallocate(
+         parser, (void*) token->v.doc_type.public_identifier);
+     gumbo_parser_deallocate(
+         parser, (void*) token->v.doc_type.system_identifier);
+   } else if (token->type == GUMBO_TOKEN_START_TAG) {
+     unsigned int idx = 0;
+     while (idx < token->v.start_tag.attributes.length) {
+       GumboAttribute* attribute = token->v.start_tag.attributes.data[idx];
+       // A slot may have been nulled out if this token was merged with
+       // another one.
+       if (attribute) {
+         gumbo_destroy_attribute(parser, attribute);
+       }
+       ++idx;
+     }
+     gumbo_parser_deallocate(
+         parser, (void*) token->v.start_tag.attributes.data);
+   } else if (token->type == GUMBO_TOKEN_COMMENT) {
+     gumbo_parser_deallocate(parser, (void*) token->v.text);
+   }
+ }