diff options
Diffstat (limited to 'libs/litehtml/src/css_parser.cpp')
-rw-r--r-- | libs/litehtml/src/css_parser.cpp | 514 |
1 files changed, 514 insertions, 0 deletions
diff --git a/libs/litehtml/src/css_parser.cpp b/libs/litehtml/src/css_parser.cpp new file mode 100644 index 0000000000..24e49a78c8 --- /dev/null +++ b/libs/litehtml/src/css_parser.cpp @@ -0,0 +1,514 @@ +#include "html.h" +#include "css_parser.h" + +namespace litehtml +{ + +// https://www.w3.org/TR/css-syntax-3/#css-filter-code-points +void filter_code_points(string& input) +{ + const char* xFFFD = "\xEF\xBF\xBD"; + + size_t null_count = std::count(input.begin(), input.end(), 0); + + string result(input.size() + 2 * null_count, 0); + + for (int i = 0, j = 0; i < (int)input.size(); i++) + { + switch (input[i]) + { + case '\r': + result[j++] = '\n'; + if (i + 1 < (int)input.size() && input[i + 1] == '\n') i++; // skip \n after \r + break; + case '\f': + result[j++] = '\n'; + break; + case 0: + memcpy(&result[j], xFFFD, 3); + j += 3; + break; + default: + result[j++] = input[i]; + } + } + + // trim trailing NULs + result.resize(strlen(result.c_str())); + input = result; +} + +void remove_whitespace(css_token_vector& tokens, keep_whitespace_fn keep_whitespace) +{ + for (int i = 0; i < (int)tokens.size(); i++) + { + auto& tok = tokens[i]; + if (tok.type == ' ') + { + const auto& left = i > 0 ? tokens[i - 1] : css_token(); + const auto& right = at(tokens, i + 1); + bool keep = keep_whitespace && keep_whitespace(left, right); + if (!keep) remove(tokens, i), i--; + } + else if (tok.is_component_value()) + remove_whitespace(tok.value, keep_whitespace); + } +} + +void componentize(css_token_vector& tokens) +{ + css_parser parser(tokens); + css_token_vector result; + while (true) + { + css_token tok = parser.consume_component_value(); + if (tok.type == EOF) break; + result.push_back(tok); + } + tokens = result; +} + +// https://www.w3.org/TR/css-syntax-3/#normalize-into-a-token-stream +template<> +css_token_vector normalize(css_token_vector input, int options, keep_whitespace_fn keep_whitespace) +{ + if (options & f_componentize) componentize(input); + if (options & f_remove_whitespace) remove_whitespace(input, keep_whitespace); + return input; +} +template<> +css_token_vector normalize(string input, int options, keep_whitespace_fn keep_whitespace) +{ + filter_code_points(input); + auto tokens = tokenize(input); + return normalize(tokens, options, keep_whitespace); +} + +// https://www.w3.org/TR/css-syntax-3/#parse-stylesheet +// I don't create a stylesheet because its only perpose is to pass a list of rules to +// parse_css_stylesheet. I just return the list of rules directly instead. +raw_rule::vector css_parser::parse_stylesheet(const string& input, bool top_level) +{ + // 1. If input is a byte stream for stylesheet, decode bytes from input, and set input to the result. + // not implemented, utf-8 is always assumed + string str = decode(input, encoding::utf_8); // decoding potentially broken UTF-8 into valid UTF-8 + + // 2. Normalize input, and set input to the result. + auto tokens = normalize(str); + + return parse_stylesheet(tokens, top_level); +} +raw_rule::vector css_parser::parse_stylesheet(const css_token_vector& input, bool top_level) +{ + // 3. Create a new stylesheet, with its location set to location. + // 4. Consume a list of rules from input, with the top-level flag set, and set the stylesheet’s value to the result. + // 5. Return the stylesheet. + return css_parser(input).consume_list_of_rules(top_level); +} + +// https://www.w3.org/TR/css-syntax-3/#consume-the-next-input-token +css_token css_parser::next_token() +{ + if (m_index == (int)m_tokens.size()) + return css_token_type(EOF); + else + return m_tokens[m_index++]; +} + +css_token css_parser::peek_token() +{ + if (m_index == (int)m_tokens.size()) + return css_token_type(EOF); + else + return m_tokens[m_index]; +} + +// https://www.w3.org/TR/css-syntax-3/#consume-list-of-rules +raw_rule::vector css_parser::consume_list_of_rules(bool top_level) +{ + raw_rule::vector rules; + raw_rule::ptr rule; + + while (true) + { + // Repeatedly consume the next input token: + css_token token = next_token(); + + switch (token.type) + { + case WHITESPACE: + break; // Do nothing. + + case EOF: + return rules; // Return the list of rules. + + case CDO: + case CDC: + // If the top-level flag is set, do nothing. + if (top_level) break; + + // Otherwise, reconsume the current input token. Consume a qualified rule. + // If anything is returned, append it to the list of rules. + m_index--; + rule = consume_qualified_rule(); + if (rule) rules.push_back(rule); + break; + + case AT_KEYWORD: + // Reconsume the current input token. Consume an at-rule, and append the returned value to the list of rules. + m_index--; + rule = consume_at_rule(); + if (rule) rules.push_back(rule); + break; + + default: + // Reconsume the current input token. Consume a qualified rule. If anything is returned, append it to the list of rules. + m_index--; + rule = consume_qualified_rule(); + if (rule) rules.push_back(rule); + break; + } + } +} + +// https://www.w3.org/TR/css-syntax-3/#consume-qualified-rule +raw_rule::ptr css_parser::consume_qualified_rule() +{ + // Create a new qualified rule with its prelude initially set to an empty list, and its value initially set to nothing. + raw_rule::ptr rule = make_shared<raw_rule>(raw_rule::qualified); + + while (true) + { + // Repeatedly consume the next input token: + css_token token = next_token(); + + switch (token.type) + { + case EOF: + // This is a parse error. Return nothing. + css_parse_error("eof in qualified rule"); + return nullptr; + case '{': + // Consume a simple block and assign it to the qualified rule’s block. Return the qualified rule. + rule->block = consume_simple_block('{'); + return rule; + case CURLY_BLOCK: + // Assign the block to the qualified rule’s block. Return the qualified rule. + rule->block = token; + return rule; + default: + // Reconsume the current input token. Consume a component value. Append the returned value to the qualified rule’s prelude. + m_index--; + css_token value = consume_component_value(); + rule->prelude.push_back(value); + } + } +} + +// https://www.w3.org/TR/css-syntax-3/#consume-at-rule +raw_rule::ptr css_parser::consume_at_rule() +{ + // Consume the next input token. Create a new at-rule with its name set to the value of the current input token, + // its prelude initially set to an empty list, and its value initially set to nothing. + css_token token = next_token(); + raw_rule::ptr rule = make_shared<raw_rule>(raw_rule::at, token.str); + + while (true) + { + // Repeatedly consume the next input token: + token = next_token(); + + switch (token.type) + { + case ';': + return rule; + case EOF: + // This is a parse error. Return the at-rule. + css_parse_error("eof in at-rule"); + return rule; + case '{': + // Consume a simple block and assign it to the at-rule’s block. Return the at-rule. + rule->block = consume_simple_block('{'); + return rule; + case CURLY_BLOCK: + // Assign the block to the at-rule’s block. Return the at-rule. + rule->block = token; + return rule; + default: + // Reconsume the current input token. Consume a component value. Append the returned value to the at-rule’s prelude. + m_index--; + css_token value = consume_component_value(); + rule->prelude.push_back(value); + } + } +} + +char mirror(char c); + +// https://www.w3.org/TR/css-syntax-3/#consume-simple-block +css_token css_parser::consume_simple_block(char opening_bracket) +{ + // Create a simple block with its associated token set to the current input token and with its value initially set to an empty list. + auto block_type = css_token_type(-100 - opening_bracket); // see css_token_type + css_token block(block_type); + + char closing_bracket = mirror(opening_bracket); + + while (true) + { + // Repeatedly consume the next input token and process it as follows: + css_token token = next_token(); + + if (token.type == closing_bracket) + { + return block; + } + else if (token.type == EOF) + { + css_parse_error("eof in simple block"); + return block; + } + else + { + // Reconsume the current input token. Consume a component value and append it to the value of the block. + m_index--; + css_token val = consume_component_value(); + block.value.push_back(val); + } + } +} + +// https://www.w3.org/TR/css-syntax-3/#consume-component-value +css_token css_parser::consume_component_value() +{ + // Consume the next input token. + css_token token = next_token(); + + switch (token.type) + { + // If the current input token is a <{-token>, <[-token>, or <(-token>, consume a simple block and return it. + case '{': case '[': case '(': + return consume_simple_block((char)token.ch); + + // Otherwise, if the current input token is a <function-token>, consume a function and return it. + case FUNCTION: + return consume_function(token.name); + + // Otherwise, return the current input token. + default: + return token; + } +} + +// https://www.w3.org/TR/css-syntax-3/#consume-function +css_token css_parser::consume_function(const string& name) +{ + // Create a function with its name equal to the value of the current input token and with its value initially set to an empty list. + css_token function(CV_FUNCTION, name); + + while (true) + { + // Repeatedly consume the next input token and process it as follows: + css_token token = next_token(); + + switch (token.type) + { + case ')': + return function; + + case EOF: + css_parse_error("eof in function"); + return function; + + default: + // Reconsume the current input token. Consume a component value and append the returned value to the function’s value. + m_index--; + css_token val = consume_component_value(); + function.value.push_back(val); + } + } +} + + +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + +void trim_whitespace(css_token_vector& tokens) +{ + while (at(tokens, 0).type == ' ') remove(tokens, 0); + while (at(tokens, -1).type == ' ') remove(tokens, -1); +} + +// https://www.w3.org/TR/css-syntax-3/#consume-declaration +// next token is guaranteed to be IDENT +raw_declaration css_parser::consume_declaration() +{ + // Consume the next input token. Create a new declaration with its name set to the value of + // the current input token and its value initially set to an empty list. + css_token token = next_token(); + raw_declaration decl = {token.name}; + auto& value = decl.value; + + // 1. While the next input token is a <whitespace-token>, consume the next input token. + while (peek_token().type == ' ') next_token(); + + // 2. If the next input token is anything other than a <colon-token>, this is a parse error. Return nothing. + if (peek_token().ch != ':') + { + css_parse_error("consume_declaration: ':' not found"); + return {}; + } + // Otherwise, consume the next input token. + next_token(); + + // 3. While the next input token is a <whitespace-token>, consume the next input token. + while (peek_token().type == ' ') next_token(); + + // 4. As long as the next input token is anything other than an <EOF-token>, + // consume a component value and append it to the declaration’s value. + while (peek_token().type != EOF) + value.push_back(consume_component_value()); + + // 5. If the last two non-<whitespace-token>s in the declaration’s value are a <delim-token> with the value "!" + // followed by an <ident-token> with a value that is an ASCII case-insensitive match for "important", + // remove them from the declaration’s value and set the declaration’s important flag to true. + + trim_whitespace(value); // deviation from standard: removing leading whitespace as well + + if (at(value, -1).ident() == "important" && at(value, -2).ch == '!') + { + remove(value, -2, 2); + decl.important = true; + } + + // 6. While the last token in the declaration’s value is a <whitespace-token>, remove that token. + trim_whitespace(value); + + // 7. Return the declaration. + return decl; +} + +// https://www.w3.org/TR/css-syntax-3/#consume-style-block +void css_parser::consume_style_block_contents(/*out*/ raw_declaration::vector& decls, /*out*/ raw_rule::vector& rules) +{ + while (true) + { + // Repeatedly consume the next input token: + css_token token = next_token(); + + switch (token.type) + { + case WHITESPACE: + case ';': + break; // Do nothing. + + case EOF: + // "Extend decls with rules, then return decls." + // NOTE: I just return decls and rules separately + return; + + case AT_KEYWORD: { + // Reconsume the current input token. Consume an at-rule, and append the result to rules. + m_index--; + auto rule = consume_at_rule(); + if (rule) rules.push_back(rule); + break; + } + case IDENT: { + // Initialize a temporary list initially filled with the current input token. + css_token_vector temp = { token }; + // As long as the next input token is anything other than a <semicolon-token> or <EOF-token>, + // consume a component value and append it to the temporary list. + while (!is_one_of(peek_token().type, ';', EOF)) + temp.push_back(consume_component_value()); + + css_parser parser(temp); + // Consume a declaration from the temporary list. + auto decl = parser.consume_declaration(); + // If anything was returned, append it to decls. + if (decl) decls.push_back(decl); + break; + } + case '&': { + // Reconsume the current input token. Consume a qualified rule. If anything was returned, append it to rules. + m_index--; + auto rule = consume_qualified_rule(); + if (rule) rules.push_back(rule); + break; + } + default: + // This is a parse error. Reconsume the current input token. As long as the next input token is + // anything other than a <;> or <EOF>, consume a component value and throw away the returned value. + css_parse_error("unexpected token in a style block"); + m_index--; + while (!is_one_of(peek_token().type, ';', EOF)) + consume_component_value(); + break; + } + } +} + + +// https://www.w3.org/TR/css-syntax-3/#parse-comma-separated-list-of-component-values +// Note: result is never empty. If input is empty result is {{}}. +vector<css_token_vector> parse_comma_separated_list(const css_token_vector& tokens) +{ + vector<css_token_vector> result; + + css_token_vector list; + for (auto& tok : tokens) + { + if (tok.type == ',') // Note: EOF token is not stored in arrays + { + result.push_back(list); + list.clear(); + continue; + } + list.push_back(tok); + } + result.push_back(list); + + return result; +} + +// https://drafts.csswg.org/css-syntax-3/#typedef-any-value +// assumes that tokens have been componentized +bool is_any_value(const css_token_vector& tokens) +{ + if (tokens.empty()) return false; + for (auto& tok : tokens) + { + if (is_one_of(tok.type, BAD_STRING, BAD_URL, ')', ']', '}')) + return false; + else if (tok.is_component_value() && !is_any_value(tok.value)) + return false; + } + return true; +} + +// https://drafts.csswg.org/css-syntax-3/#typedef-declaration-value +// assumes that tokens have been componentized +bool is_declaration_value(const css_token_vector& tokens, int index) +{ + if (index >= (int)tokens.size()) return false; + for (int i = index; i < (int)tokens.size(); i++) + { + auto& tok = tokens[i]; + if (is_one_of(tok.type, BAD_STRING, BAD_URL, ')', ']', '}', ';', '!')) + return false; + // Note: ';' '!' inside component values are allowed, so using is_any_value here. + else if (tok.is_component_value() && !is_any_value(tok.value)) + return false; + } + return true; +} + +// Note: it is possible to have several whitespace tokens in a row: " /**/ /**/ " +bool skip_whitespace(const css_token_vector& tokens, int& index) +{ + int start = index; + while (at(tokens, index).type == ' ') index++; + return index != start; +} + +} // namespace litehtml
\ No newline at end of file |