summary | refs | log | tree | commit | diff
path: root/libs/litehtml/src/gumbo/parser.c
diff options
context:
space:
mode:
Diffstat (limited to 'libs/litehtml/src/gumbo/parser.c')
-rw-r--r-- libs/litehtml/src/gumbo/parser.c | 218
1 files changed, 67 insertions, 151 deletions
diff --git a/libs/litehtml/src/gumbo/parser.c b/libs/litehtml/src/gumbo/parser.c
index 968fcc0f41..0ab3f92084 100644
--- a/libs/litehtml/src/gumbo/parser.c
+++ b/libs/litehtml/src/gumbo/parser.c
@@ -291,17 +291,16 @@ typedef struct _NamespacedAttributeReplacement {
static const NamespacedAttributeReplacement kForeignAttributeReplacements[] = {
{"xlink:actuate", "actuate", GUMBO_ATTR_NAMESPACE_XLINK},
- {"xlink:actuate", "actuate", GUMBO_ATTR_NAMESPACE_XLINK},
- {"xlink:href", "href", GUMBO_ATTR_NAMESPACE_XLINK},
- {"xlink:role", "role", GUMBO_ATTR_NAMESPACE_XLINK},
- {"xlink:show", "show", GUMBO_ATTR_NAMESPACE_XLINK},
- {"xlink:title", "title", GUMBO_ATTR_NAMESPACE_XLINK},
- {"xlink:type", "type", GUMBO_ATTR_NAMESPACE_XLINK},
- {"xml:base", "base", GUMBO_ATTR_NAMESPACE_XML},
- {"xml:lang", "lang", GUMBO_ATTR_NAMESPACE_XML},
- {"xml:space", "space", GUMBO_ATTR_NAMESPACE_XML},
- {"xmlns", "xmlns", GUMBO_ATTR_NAMESPACE_XMLNS},
- {"xmlns:xlink", "xlink", GUMBO_ATTR_NAMESPACE_XMLNS},
+ {"xlink:arcrole", "arcrole", GUMBO_ATTR_NAMESPACE_XLINK},
+ {"xlink:href", "href", GUMBO_ATTR_NAMESPACE_XLINK},
+ {"xlink:role", "role", GUMBO_ATTR_NAMESPACE_XLINK},
+ {"xlink:show", "show", GUMBO_ATTR_NAMESPACE_XLINK},
+ {"xlink:title", "title", GUMBO_ATTR_NAMESPACE_XLINK},
+ {"xlink:type", "type", GUMBO_ATTR_NAMESPACE_XLINK},
+ {"xml:lang", "lang", GUMBO_ATTR_NAMESPACE_XML},
+ {"xml:space", "space", GUMBO_ATTR_NAMESPACE_XML},
+ {"xmlns", "xmlns", GUMBO_ATTR_NAMESPACE_XMLNS},
+ {"xmlns:xlink", "xlink", GUMBO_ATTR_NAMESPACE_XMLNS},
};
// The "scope marker" for the list of active formatting elements. We use a
@@ -1564,12 +1563,12 @@ static bool is_special_node(const GumboNode* node) {
(gumbo_tagset){TAG(ADDRESS), TAG(APPLET), TAG(AREA), TAG(ARTICLE),
TAG(ASIDE), TAG(BASE), TAG(BASEFONT), TAG(BGSOUND), TAG(BLOCKQUOTE),
TAG(BODY), TAG(BR), TAG(BUTTON), TAG(CAPTION), TAG(CENTER), TAG(COL),
- TAG(COLGROUP), TAG(MENUITEM), TAG(DD), TAG(DETAILS), TAG(DIR),
+ TAG(COLGROUP), TAG(DD), TAG(DETAILS), TAG(DIR),
TAG(DIV), TAG(DL), TAG(DT), TAG(EMBED), TAG(FIELDSET),
TAG(FIGCAPTION), TAG(FIGURE), TAG(FOOTER), TAG(FORM), TAG(FRAME),
TAG(FRAMESET), TAG(H1), TAG(H2), TAG(H3), TAG(H4), TAG(H5), TAG(H6),
TAG(HEAD), TAG(HEADER), TAG(HGROUP), TAG(HR), TAG(HTML), TAG(IFRAME),
- TAG(IMG), TAG(INPUT), TAG(ISINDEX), TAG(LI), TAG(LINK), TAG(LISTING),
+ TAG(IMG), TAG(INPUT), TAG(LI), TAG(LINK), TAG(LISTING),
TAG(MARQUEE), TAG(MENU), TAG(META), TAG(NAV), TAG(NOEMBED),
TAG(NOFRAMES), TAG(NOSCRIPT), TAG(OBJECT), TAG(OL), TAG(P),
TAG(PARAM), TAG(PLAINTEXT), TAG(PRE), TAG(SCRIPT), TAG(SECTION),
@@ -2179,7 +2178,7 @@ static bool handle_in_head(GumboParser* parser, GumboToken* token) {
return handle_in_body(parser, token);
} else if (tag_in(token, kStartTag,
(gumbo_tagset){TAG(BASE), TAG(BASEFONT), TAG(BGSOUND),
- TAG(MENUITEM), TAG(LINK)})) {
+ TAG(LINK)})) {
insert_element_from_token(parser, token);
pop_current_node(parser);
acknowledge_self_closing_tag(parser);
@@ -2419,7 +2418,7 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
return false;
} else if (tag_in(token, kStartTag,
(gumbo_tagset){TAG(BASE), TAG(BASEFONT), TAG(BGSOUND),
- TAG(MENUITEM), TAG(LINK), TAG(META), TAG(NOFRAMES),
+ TAG(LINK), TAG(META), TAG(NOFRAMES),
TAG(SCRIPT), TAG(STYLE), TAG(TEMPLATE), TAG(TITLE)}) ||
tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
return handle_in_head(parser, token);
@@ -2514,13 +2513,13 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
record_end_of_element(state->_current_token, &body->v.element);
}
return success;
- } else if (tag_in(token, kStartTag,
- (gumbo_tagset){TAG(ADDRESS), TAG(ARTICLE), TAG(ASIDE),
- TAG(BLOCKQUOTE), TAG(CENTER), TAG(DETAILS), TAG(DIALOG),
- TAG(DIR), TAG(DIV), TAG(DL), TAG(FIELDSET), TAG(FIGCAPTION),
- TAG(FIGURE), TAG(FOOTER), TAG(HEADER), TAG(HGROUP),
- TAG(MENU), TAG(MAIN), TAG(NAV), TAG(OL), TAG(P),
- TAG(SECTION), TAG(SUMMARY), TAG(UL)})) {
+ } else if (tag_in(token, kStartTag, (gumbo_tagset){
+ TAG(ADDRESS), TAG(ARTICLE), TAG(ASIDE), TAG(BLOCKQUOTE), TAG(CENTER),
+ TAG(DETAILS), TAG(DIALOG), TAG(DIR), TAG(DIV), TAG(DL), TAG(FIELDSET),
+ TAG(FIGCAPTION), TAG(FIGURE), TAG(FOOTER), TAG(HEADER), TAG(HGROUP),
+ TAG(MENU), TAG(MAIN), TAG(NAV), TAG(OL), TAG(P), TAG(SECTION),
+ TAG(SUMMARY), TAG(UL), TAG(SEARCH)}))
+ {
bool result = maybe_implicitly_close_p_tag(parser, token);
insert_element_from_token(parser, token);
return result;
@@ -2583,13 +2582,13 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
insert_element_from_token(parser, token);
state->_frameset_ok = false;
return true;
- } else if (tag_in(token, kEndTag,
- (gumbo_tagset){TAG(ADDRESS), TAG(ARTICLE), TAG(ASIDE),
- TAG(BLOCKQUOTE), TAG(BUTTON), TAG(CENTER), TAG(DETAILS),
- TAG(DIALOG), TAG(DIR), TAG(DIV), TAG(DL), TAG(FIELDSET),
- TAG(FIGCAPTION), TAG(FIGURE), TAG(FOOTER), TAG(HEADER),
- TAG(HGROUP), TAG(LISTING), TAG(MAIN), TAG(MENU), TAG(NAV),
- TAG(OL), TAG(PRE), TAG(SECTION), TAG(SUMMARY), TAG(UL)})) {
+ } else if (tag_in(token, kEndTag, (gumbo_tagset){
+ TAG(ADDRESS), TAG(ARTICLE), TAG(ASIDE), TAG(BLOCKQUOTE), TAG(BUTTON),
+ TAG(CENTER), TAG(DETAILS), TAG(DIALOG), TAG(DIR), TAG(DIV), TAG(DL),
+ TAG(FIELDSET), TAG(FIGCAPTION), TAG(FIGURE), TAG(FOOTER), TAG(HEADER),
+ TAG(HGROUP), TAG(LISTING), TAG(MAIN), TAG(MENU), TAG(NAV), TAG(OL),
+ TAG(PRE), TAG(SECTION), TAG(SUMMARY), TAG(UL), TAG(SEARCH)}))
+ {
GumboTag tag = token->v.end_tag;
if (!has_an_element_in_scope(parser, tag)) {
parser_add_parse_error(parser, token);
@@ -2820,100 +2819,13 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
set_frameset_not_ok(parser);
return result;
} else if (tag_is(token, kStartTag, GUMBO_TAG_ISINDEX)) {
- parser_add_parse_error(parser, token);
- if (parser->_parser_state->_form_element != NULL &&
- !has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
- ignore_token(parser);
- return false;
- }
- acknowledge_self_closing_tag(parser);
- maybe_implicitly_close_p_tag(parser, token);
- set_frameset_not_ok(parser);
-
- GumboVector* token_attrs = &token->v.start_tag.attributes;
- GumboAttribute* prompt_attr = gumbo_get_attribute(token_attrs, "prompt");
- GumboAttribute* action_attr = gumbo_get_attribute(token_attrs, "action");
- GumboAttribute* name_attr = gumbo_get_attribute(token_attrs, "name");
-
- GumboNode* form = insert_element_of_tag_type(
- parser, GUMBO_TAG_FORM, GUMBO_INSERTION_FROM_ISINDEX);
- if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
- parser->_parser_state->_form_element = form;
- }
- if (action_attr) {
- gumbo_vector_add(parser, action_attr, &form->v.element.attributes);
- }
- insert_element_of_tag_type(
- parser, GUMBO_TAG_HR, GUMBO_INSERTION_FROM_ISINDEX);
- pop_current_node(parser); // <hr>
-
- insert_element_of_tag_type(
- parser, GUMBO_TAG_LABEL, GUMBO_INSERTION_FROM_ISINDEX);
- TextNodeBufferState* text_state = &parser->_parser_state->_text_node;
- text_state->_start_original_text = token->original_text.data;
- text_state->_start_position = token->position;
- text_state->_type = GUMBO_NODE_TEXT;
- if (prompt_attr) {
- int prompt_attr_length = strlen(prompt_attr->value);
- gumbo_string_buffer_destroy(parser, &text_state->_buffer);
- text_state->_buffer.data = gumbo_copy_stringz(parser, prompt_attr->value);
- text_state->_buffer.length = prompt_attr_length;
- text_state->_buffer.capacity = prompt_attr_length + 1;
- gumbo_destroy_attribute(parser, prompt_attr);
- } else {
- GumboStringPiece prompt_text =
- GUMBO_STRING("This is a searchable index. Enter search keywords: ");
- gumbo_string_buffer_append_string(
- parser, &prompt_text, &text_state->_buffer);
- }
-
- GumboNode* input = insert_element_of_tag_type(
- parser, GUMBO_TAG_INPUT, GUMBO_INSERTION_FROM_ISINDEX);
- for (unsigned int i = 0; i < token_attrs->length; ++i) {
- GumboAttribute* attr = token_attrs->data[i];
- if (attr != prompt_attr && attr != action_attr && attr != name_attr) {
- gumbo_vector_add(parser, attr, &input->v.element.attributes);
- }
- token_attrs->data[i] = NULL;
- }
-
- // All attributes have been successfully transferred and nulled out at this
- // point, so the call to ignore_token will free the memory for it without
- // touching the attributes.
- ignore_token(parser);
-
- // The name attribute, if present, should be destroyed since it's ignored
- // when copying over. The action attribute should be kept since it's moved
- // to the form.
- if (name_attr) {
- gumbo_destroy_attribute(parser, name_attr);
- }
-
- GumboAttribute* name =
- gumbo_parser_allocate(parser, sizeof(GumboAttribute));
- GumboStringPiece name_str = GUMBO_STRING("name");
- GumboStringPiece isindex_str = GUMBO_STRING("isindex");
- name->attr_namespace = GUMBO_ATTR_NAMESPACE_NONE;
- name->name = gumbo_copy_stringz(parser, "name");
- name->value = gumbo_copy_stringz(parser, "isindex");
- name->original_name = name_str;
- name->original_value = isindex_str;
- name->name_start = kGumboEmptySourcePosition;
- name->name_end = kGumboEmptySourcePosition;
- name->value_start = kGumboEmptySourcePosition;
- name->value_end = kGumboEmptySourcePosition;
- gumbo_vector_add(parser, name, &input->v.element.attributes);
-
- pop_current_node(parser); // <input>
- pop_current_node(parser); // <label>
- insert_element_of_tag_type(
- parser, GUMBO_TAG_HR, GUMBO_INSERTION_FROM_ISINDEX);
- pop_current_node(parser); // <hr>
- pop_current_node(parser); // <form>
- if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
- parser->_parser_state->_form_element = NULL;
+ reconstruct_active_formatting_elements(parser);
+ insert_element_from_token(parser, token);
+ if (token->v.start_tag.is_self_closing) {
+ pop_current_node(parser);
+ acknowledge_self_closing_tag(parser);
}
- return false;
+ return true;
} else if (tag_is(token, kStartTag, GUMBO_TAG_TEXTAREA)) {
run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RCDATA);
parser->_parser_state->_ignore_next_linefeed = true;
@@ -3491,6 +3403,17 @@ static bool handle_in_select(GumboParser* parser, GumboToken* token) {
}
insert_element_from_token(parser, token);
return true;
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_HR)) {
+ if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
+ pop_current_node(parser);
+ }
+ if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTGROUP)) {
+ pop_current_node(parser);
+ }
+ insert_element_from_token(parser, token);
+ pop_current_node(parser);
+ acknowledge_self_closing_tag(parser);
+ return true;
} else if (tag_is(token, kEndTag, GUMBO_TAG_OPTGROUP)) {
GumboVector* open_elements = &parser->_parser_state->_open_elements;
if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION) &&
@@ -3854,40 +3777,33 @@ static bool handle_in_foreign_content(GumboParser* parser, GumboToken* token) {
// Fall through to the if-statements below.
break;
}
- // Order matters for these clauses.
- if (tag_in(token, kStartTag,
- (gumbo_tagset){TAG(B), TAG(BIG), TAG(BLOCKQUOTE), TAG(BODY), TAG(BR),
- TAG(CENTER), TAG(CODE), TAG(DD), TAG(DIV), TAG(DL), TAG(DT),
- TAG(EM), TAG(EMBED), TAG(H1), TAG(H2), TAG(H3), TAG(H4), TAG(H5),
- TAG(H6), TAG(HEAD), TAG(HR), TAG(I), TAG(IMG), TAG(LI),
- TAG(LISTING), TAG(MENU), TAG(META), TAG(NOBR), TAG(OL), TAG(P),
- TAG(PRE), TAG(RUBY), TAG(S), TAG(SMALL), TAG(SPAN), TAG(STRONG),
- TAG(STRIKE), TAG(SUB), TAG(SUP), TAG(TABLE), TAG(TT), TAG(U),
- TAG(UL), TAG(VAR)}) ||
- (tag_is(token, kStartTag, GUMBO_TAG_FONT) &&
- (token_has_attribute(token, "color") ||
- token_has_attribute(token, "face") ||
- token_has_attribute(token, "size")))) {
+
+ if (tag_in(token, kStartTag, (gumbo_tagset){
+ TAG(B), TAG(BIG), TAG(BLOCKQUOTE), TAG(BODY), TAG(BR), TAG(CENTER),
+ TAG(CODE), TAG(DD), TAG(DIV), TAG(DL), TAG(DT), TAG(EM), TAG(EMBED),
+ TAG(H1), TAG(H2), TAG(H3), TAG(H4), TAG(H5), TAG(H6), TAG(HEAD),
+ TAG(HR), TAG(I), TAG(IMG), TAG(LI), TAG(LISTING), TAG(MENU), TAG(META),
+ TAG(NOBR), TAG(OL), TAG(P), TAG(PRE), TAG(RUBY), TAG(S), TAG(SMALL),
+ TAG(SPAN), TAG(STRONG), TAG(STRIKE), TAG(SUB), TAG(SUP), TAG(TABLE),
+ TAG(TT), TAG(U), TAG(UL), TAG(VAR)})
+ || tag_in(token, kEndTag, (gumbo_tagset){TAG(BR), TAG(P)})
+ || (tag_is(token, kStartTag, GUMBO_TAG_FONT)
+ && (token_has_attribute(token, "color")
+ || token_has_attribute(token, "face")
+ || token_has_attribute(token, "size"))))
+ {
/* Parse error */
parser_add_parse_error(parser, token);
- /*
- * Fragment case: If the parser was originally created for the HTML
- * fragment parsing algorithm, then act as described in the "any other
- * start tag" entry below.
- */
- if (!is_fragment_parser(parser)) {
- do {
- pop_current_node(parser);
- } while (!(is_mathml_integration_point(get_current_node(parser)) ||
- is_html_integration_point(get_current_node(parser)) ||
- get_current_node(parser)->v.element.tag_namespace ==
- GUMBO_NAMESPACE_HTML));
- parser->_parser_state->_reprocess_current_token = true;
- return false;
+ while (!is_mathml_integration_point(get_current_node(parser))
+ && !is_html_integration_point(get_current_node(parser))
+ && get_current_node(parser)->v.element.tag_namespace != GUMBO_NAMESPACE_HTML)
+ {
+ pop_current_node(parser);
}
- assert(token->type == GUMBO_TOKEN_START_TAG);
+ handle_html_content(parser, token);
+ return false;
}
if (token->type == GUMBO_TOKEN_START_TAG) {