#include "html.h" #include "css_selector.h" #include "css_parser.h" #include "internal.h" namespace litehtml { void css_selector::calc_specificity() { if(m_right.m_tag != star_id) { m_specificity.d = 1; } for(const auto& attr : m_right.m_attrs) { if(attr.type == select_id) { m_specificity.b++; } else { m_specificity.c++; } } if(m_left) { m_left->calc_specificity(); m_specificity += m_left->m_specificity; } } void css_selector::add_media_to_doc( document* doc ) const { if(m_media_query && doc) { doc->add_media_list(m_media_query); } } // https://www.w3.org/TR/selectors-4/#type-nmsp // = [ | '*' ]? '|' https://www.w3.org/TR/selectors-4/#typedef-ns-prefix string parse_ns_prefix(const css_token_vector& tokens, int& index) { const auto& a = at(tokens, index); const auto& b = at(tokens, index + 1); if (a.ch == '|') { index++; return ""; } if ((a.type == IDENT || a.ch == '*') && b.ch == '|') { index += 2; return a.type == IDENT ? a.name : "*"; } return ""; } struct wq_name { string prefix; string name; }; // = ? // Whitespace is forbidden between any of the components of a . wq_name parse_wq_name(const css_token_vector& tokens, int& index) { int start = index; string prefix = parse_ns_prefix(tokens, index); auto tok = at(tokens, index); if (tok.type == IDENT) { index++; return { prefix, tok.name }; } // restore index to before if failed to parse index = start; // handle situation when a name is erroneously parsed as prefix, eg. [x|=a] tok = at(tokens, index); if (tok.type == IDENT) { index++; return { "", tok.name }; } return {}; } // https://www.w3.org/TR/selectors-4/#typedef-type-selector // = | ? '*' // = ? // So: // = ? [ | '*' ] wq_name parse_type_selector(const css_token_vector& tokens, int& index) { int start = index; string prefix = parse_ns_prefix(tokens, index); const auto& tok = at(tokens, index); if (tok.type == IDENT || tok.ch == '*') { index++; string name = tok.type == IDENT ? tok.name : "*"; // type selector is always ASCII-case-insensitive for HTML documents, regardless of document mode (quirks/no quirks) return { lowcase(prefix), lowcase(name) }; } // restore index to before if failed to parse or '*' index = start; return {}; } // = [ '~' | '|' | '^' | '$' | '*' ]? '=' bool parse_attr_matcher(const css_token_vector& tokens, int& index, attr_matcher& matcher) { const auto& a = at(tokens, index); const auto& b = at(tokens, index + 1); if (a.ch == '=') { index++; matcher = attribute_equals; return true; } if (!(is_one_of(a.ch, '~', '|', '^', '$', '*') && b.ch == '=')) return false; index += 2; matcher = (attr_matcher)a.ch; return true; } // https://www.w3.org/TR/selectors-4/#typedef-attribute-selector // = '[' ']' | // '[' [ | ] ? ']' // = [ '~' | '|' | '^' | '$' | '*' ]? '=' // = i | s css_attribute_selector parse_attribute_selector(const css_token& block) { css_attribute_selector selector; const css_token_vector& tokens = block.value; int index = 0; // skip_whitespace(tokens, index); wq_name wq_name = parse_wq_name(tokens, index); if (wq_name.name == "") return {}; // attribute name in attribute selector is ASCII case-insensitive for HTML documents, regardless of document mode (quirks/no quirks) auto prefix = lowcase(wq_name.prefix); auto name = lowcase(wq_name.name); skip_whitespace(tokens, index); if (index == (int)tokens.size()) // [name] { selector.type = select_attr; selector.prefix = _id(prefix); selector.name = _id(name); selector.matcher = attribute_exists; return selector; } // skip_whitespace(tokens, index); attr_matcher matcher; if (!parse_attr_matcher(tokens, index, matcher)) return {}; // | skip_whitespace(tokens, index); const css_token& value = at(tokens, index); if (value.type != STRING && value.type != IDENT) return {}; index++; // ? skip_whitespace(tokens, index); char modifier = 0; const css_token& tok = at(tokens, index); if (tok.type == IDENT) { if (tok.ident() == "s") modifier = 's'; else if (tok.ident() == "i") modifier = 'i'; else return {}; // junk at the end of attribute selector index++; } skip_whitespace(tokens, index); if (index != (int)tokens.size()) return {}; // junk at the end of attribute selector // https://html.spec.whatwg.org/multipage/semantics-other.html#case-sensitivity-of-selectors // Attribute selectors on an HTML element in an HTML document must treat the values // of attributes with the following names as ASCII case-insensitive (unless s modifier is specified): static string_vector special_attributes = { "accept", "accept-charset", "align", "alink", "axis", "bgcolor", "charset", "checked", "clear", "codetype", "color", "compact", "declare", "defer", "dir", "direction", "disabled", "enctype", "face", "frame", "hreflang", "http-equiv", "lang", "language", "link", "media", "method", "multiple", "nohref", "noresize", "noshade", "nowrap", "readonly", "rel", "rev", "rules", "scope", "scrolling", "selected", "shape", "target", "text", "type", "valign", "valuetype", "vlink", }; selector.type = select_attr; selector.prefix = _id(prefix); selector.name = _id(name); selector.matcher = matcher; selector.caseless_match = modifier == 'i' || (!modifier && name in special_attributes); selector.value = selector.caseless_match ? lowcase(value.str) : value.str; return selector; } struct an_b { int a, b; bool valid; an_b() : a(), b(), valid(false) {} an_b(int a, int b) : a(a), b(b), valid(true) {} }; // NOTE: "+ 5" is not valid, and strtol correctly fails to parse it bool to_int(string s, int& number) { if (s == "") return false; const char* ptr = s.c_str(); char* end; int n = strtol(ptr, &end, 10); if (end != ptr + s.size()) return false; number = n; return true; } // https://www.w3.org/TR/css-syntax-3/#anb-syntax // I don't use the formal grammar because it creates a lot of unnecessary complexity. // Deviations from the standard: // * escapes are not allowed // * comments are allowed inside numbers and identifiers: ev/**/en an_b parse_an_b(string s) { lcase(trim(s)); if (s == "even") return {2, 0}; if (s == "odd") return {2, 1}; int a, b; int i = (int)s.find('n'); if (i == -1) { if (!to_int(s, b)) return {}; return {0, b}; } auto str_a = s.substr(0, i); auto str_b = s.substr(i + 1); if (is_one_of(str_a, "", "+", "-")) a = str_a == "-" ? -1 : 1; else { if (!to_int(str_a, a)) return {}; } trim(str_b); // spaces after n are allowed: 2n + 3 if (str_b != "") { if (str_b[0] == '+' || str_b[0] == '-') while (is_whitespace(str_b[1])) str_b.erase(1, 1); // spaces after sign are allowed if (!to_int(str_b, b)) return {}; } else b = 0; return {a, b}; } int find_of_keyword(const css_token_vector& tokens) { for (int i = 0; i < (int)tokens.size(); i++) { if (tokens[i].ident() == "of") return i; } return -1; } // :nth-child(An+B [of S]?) https://www.w3.org/TR/selectors-4/#the-nth-child-pseudo // :nth-last-child(An+B [of S]?) // where S is a forgiving // // :nth-of-type(An+B) https://www.w3.org/TR/selectors-4/#the-nth-of-type-pseudo // :nth-last-of-type(An+B) // css_attribute_selector parse_nth_child(const css_token& token, bool of_keyword, document_mode mode) { css_attribute_selector selector(select_pseudo_class, lowcase(token.name)); const auto& tokens = token.value; // find "of" keyword int i = of_keyword ? find_of_keyword(tokens) : -1; if (i >= 0) { const auto& selector_tokens = slice(tokens, i + 1); // The standard doesn't specify if pseudo-elements are allowed in this selector list. // But specifying them will make selector match nothing anyway because // "The structural pseudo-classes only apply to elements in the document tree; // they must never match pseudo-elements." https://www.w3.org/TR/selectors-4/#structural-pseudos // So I parse as if they were not allowed. selector.selector_list = parse_selector_list(selector_tokens, forgiving_mode + forbid_pseudo_elements, mode); // NOTE: selector_list may be empty, this does not invalidate the selector. // Chrome/Firefox behavior differs from the standard: they treat S as unforgiving and allow pseudo-elements. // NOTE: :is(), which also accepts , is handled correctly by Chrome/Firefox. // Use this code instead of above to match Chrome/Firefox behavior: //selector.selector_list = parse_selector_list(selector_tokens, strict_mode); //if (selector.selector_list.empty()) return {}; } // get string representation of everything between "nth-child(" and "of" or ")", except for comments string str = get_repr(tokens, 0, i); // Note: i == -1 works as expected an_b x = parse_an_b(str); if (!x.valid) return {}; selector.a = x.a; selector.b = x.b; return selector; } css_attribute_selector parse_function_pseudo_class(const css_token& token, document_mode mode) { string name = lowcase(token.name); if (name == "nth-child" || name == "nth-last-child") { return parse_nth_child(token, true, mode); } else if (name == "nth-of-type" || name == "nth-last-of-type") { return parse_nth_child(token, false, mode); } else if (name == "is") // https://www.w3.org/TR/selectors-4/#matches { css_attribute_selector selector(select_pseudo_class, name); // "taking a as its sole argument" // "Pseudo-elements... are not valid within :is()." selector.selector_list = parse_selector_list(token.value, forgiving_mode + forbid_pseudo_elements, mode); return selector; } else if (name == "not") // https://www.w3.org/TR/selectors-4/#negation { css_attribute_selector selector(select_pseudo_class, name); // "taking a selector list as an argument" // "Pseudo-elements... are not valid within :not()." selector.selector_list = parse_selector_list(token.value, strict_mode + forbid_pseudo_elements, mode); if (selector.selector_list.empty()) return {}; return selector; } else if (name == "lang") // https://www.w3.org/TR/selectors-4/#the-lang-pseudo { css_attribute_selector selector(select_pseudo_class, name); selector.value = get_repr(token.value); return selector; } return {}; } // simple = non-functional (without parentheses) bool is_supported_simple_pseudo_class(const string& name) { static std::set supported_simple_pseudo_classes = { // Location Pseudo-classes https://www.w3.org/TR/selectors-4/#location "any-link", "link", "visited", "local-link", "target", "target-within", "scope", // User Action Pseudo-classes https://www.w3.org/TR/selectors-4/#useraction-pseudos "hover", "active", "focus", "focus-visible", "focus-within", // Tree-Structural pseudo-classes https://www.w3.org/TR/selectors-4/#structural-pseudos "root", "empty", "first-child", "last-child", "only-child", "first-of-type", "last-of-type", "only-of-type", }; return supported_simple_pseudo_classes.count(lowcase(name)) == 1; } // https://www.w3.org/TR/selectors-4/#typedef-pseudo-class-selector // = ':' | // ':' ')' // where is not before, after, first-line or first-letter css_attribute_selector parse_pseudo_class(const css_token_vector& tokens, int& index, document_mode mode) { const auto& a = at(tokens, index); const auto& b = at(tokens, index + 1); if (a.ch != ':') return {}; if (b.type == IDENT) { // unsupported pseudo-classes must be treated as invalid: https://www.w3.org/TR/selectors-4/#w3c-partial if (!is_supported_simple_pseudo_class(b.ident())) return {}; index += 2; return { select_pseudo_class, b.ident() }; } if (b.type == CV_FUNCTION) { css_attribute_selector sel = parse_function_pseudo_class(b, mode); if (sel) index += 2; return sel; } return {}; } // https://www.w3.org/TR/selectors-4/#typedef-subclass-selector // = | | | // = with hash_type == ID // = '.' css_attribute_selector parse_subclass_selector(const css_token_vector& tokens, int& index, document_mode mode) { css_attribute_selector selector; const auto& tok0 = at(tokens, index); const auto& tok1 = at(tokens, index + 1); switch (tok0.type) { case HASH: if (tok0.hash_type == css_hash_id) { index++; selector.type = select_id; string name = tok0.name; // ids are matched ASCII case-insensitively in quirks mode if (mode == quirks_mode) lcase(name); selector.name = _id(name); return selector; } return {}; case '.': if (tok1.type == IDENT) { index += 2; selector.type = select_class; string name = tok1.name; // class names are matched ASCII case-insensitively in quirks mode if (mode == quirks_mode) lcase(name); selector.name = _id(name); return selector; } return {}; case SQUARE_BLOCK: selector = parse_attribute_selector(tok0); if (selector) index++; return selector; default: return parse_pseudo_class(tokens, index, mode); } } // simple = non-functional (without parentheses) bool is_supported_simple_pseudo_element(const string& name) { return is_one_of(lowcase(name), // Typographic Pseudo-elements https://www.w3.org/TR/css-pseudo-4/#typographic-pseudos //"first-line", "first-letter", // Highlight Pseudo-elements https://www.w3.org/TR/css-pseudo-4/#highlight-pseudos //"selection", // Tree-Abiding Pseudo-elements https://www.w3.org/TR/css-pseudo-4/#treelike "before", "after" //"marker", "placeholder", ); } css_attribute_selector parse_pseudo_element(const css_token_vector& tokens, int& index) { const auto& a = at(tokens, index); const auto& b = at(tokens, index + 1); const auto& c = at(tokens, index + 2); if (a.ch != ':') return {}; if (b.ch != ':' && b.type != IDENT) return {}; if (b.type == IDENT) // legacy syntax with one ':' https://www.w3.org/TR/selectors-4/#single-colon-pseudos { if (!is_one_of(b.ident(), "before", "after")) // first-line/letter are not supported return {}; index += 2; return {select_pseudo_element, b.ident()}; } if (c.type == IDENT) // normal syntax with '::' { if (!is_supported_simple_pseudo_element(c.ident())) return {}; index += 3; return {select_pseudo_element, c.ident()}; } return {}; } // https://www.w3.org/TR/selectors-4/#typedef-compound-selector // = [ ? * // [ * ]* ]! // NOTE: This grammar allows pseudo-classes to go before #id and .class and [attr]. // Whitespace is forbidden: // * Between any of the top-level components of a css_element_selector::ptr parse_compound_selector(const css_token_vector& tokens, int& index, document_mode mode) { auto selector = make_shared(); // ? wq_name wq_name = parse_type_selector(tokens, index); selector->m_prefix = _id(wq_name.prefix); selector->m_tag = _id(wq_name.name); // * while (css_attribute_selector sel = parse_subclass_selector(tokens, index, mode)) selector->m_attrs.push_back(sel); // [ * ]* while (true) { auto sel = parse_pseudo_element(tokens, index); if (!sel) break; selector->m_attrs.push_back(sel); while ((sel = parse_pseudo_class(tokens, index, mode))) selector->m_attrs.push_back(sel); } // [..]! "must produce at least one value" https://www.w3.org/TR/css-values-4/#mult-req if (selector->m_tag == empty_id && selector->m_attrs.empty()) return nullptr; if (selector->m_tag == empty_id) selector->m_tag = star_id; return selector; } // = '>' | '+' | '~' | [ '|' '|' ] // combinator is also handled here // parse_combinator consumes all leading and trailing whitespace // column combinator || is at-risk https://www.w3.org/TR/selectors-4/ and not implemented in Chrome/Firefox https://caniuse.com/mdn-css_selectors_column int parse_combinator(const css_token_vector& tokens, int& index) { bool ws = skip_whitespace(tokens, index); const css_token& tok = at(tokens, index); if (is_one_of(tok.ch, '>', '+', '~')) // if (tok.ch in ${'>', '+', '~'}) { index++; skip_whitespace(tokens, index); return tok.ch; } return ws ? ' ' : 0; } css_selector::ptr parse_complex_selector(const css_token_vector& tokens, document_mode mode) { int index = 0; skip_whitespace(tokens, index); auto sel = parse_compound_selector(tokens, index, mode); if (!sel) return nullptr; auto selector = make_shared(); selector->m_right = *sel; // NOTE: all the whitespace is handled by parse_combinator, that's why skip_whitespace is never called in the loop // NOTE: parse_complex_selector is different from most other parse_xxx functions in that it's required // to parse all input tokens, it doesn't just parse as much as possible. while (true) { int combinator = parse_combinator(tokens, index); if (index == (int)tokens.size()) // combinator == 0 means index already was at the end before the call to parse_combinator return !combinator || combinator == ' ' ? selector : nullptr; if (!combinator) // not the end and combinator failed to parse return nullptr; // otherwise: index is not at the end, combinator is good and tokens[index] is not whitespace // it means if parse_compound_selector fails it's an error sel = parse_compound_selector(tokens, index, mode); if (!sel) return nullptr; auto new_selector = make_shared(); new_selector->m_left = selector; new_selector->m_right = *sel; new_selector->m_combinator = (css_combinator)combinator; selector = new_selector; } } // Return true if `selector` has (in any of its css_element_selector's) a css_attribute_selector // of type `type` and name `name`. name == "" matches any name. bool has_selector(const css_selector& selector, attr_select_type type, const string& name = "") { for (const auto& sel : selector.m_right.m_attrs) { if (sel.type == type && (name == "" || equal_i(_s(sel.name), name))) return true; } if (selector.m_left) return has_selector(*selector.m_left, type, name); return false; } // https://www.w3.org/TR/css-syntax-3/#parse-comma-list // https://www.w3.org/TR/selectors-4/#selector-list // https://www.w3.org/TR/selectors-4/#forgiving-selector // Parse comma-separated list of complex selectors. css_selector::vector parse_selector_list(const css_token_vector& tokens, int options, document_mode mode) { // NOTE: this is unnecessary: "If input contains only s, return an empty list." vector list_of_lists = parse_comma_separated_list(tokens); css_selector::vector result; for (const auto& list: list_of_lists) { css_selector::ptr selector = parse_complex_selector(list, mode); // if selector is failed to parse or not allowed by the options if (!selector || ((options & forbid_pseudo_elements) && has_selector(*selector, select_pseudo_element))) { // in forgiving mode, ignore the bad selector if (options & forgiving_mode) continue; // in strict mode, entire selector-list fails to parse because of one bad selector return {}; } result.push_back(selector); } return result; } bool css_selector::parse(const string& text, document_mode mode) { auto tokens = normalize(text, f_componentize); auto ptr = parse_complex_selector(tokens, mode); if (!ptr) return false; *this = *ptr; return true; } } // namespace litehtml