#include "html.h"
#include "css_parser.h"
namespace litehtml
{
// https://www.w3.org/TR/css-syntax-3/#css-filter-code-points
// Preprocesses a CSS source string in place:
//   * CR LF pairs, lone CR and FF are each replaced by a single LF;
//   * every NUL byte is replaced by U+FFFD REPLACEMENT CHARACTER (3 bytes in UTF-8).
// All other bytes are copied through unchanged.
void filter_code_points(string& input)
{
    // UTF-8 encoding of U+FFFD
    const char* xFFFD = "\xEF\xBF\xBD";
    const size_t size = input.size();

    // Each NUL grows from 1 byte to 3, so this is an upper bound on the output length
    // (CR LF pairs can only make the output shorter).
    const size_t null_count = std::count(input.begin(), input.end(), '\0');

    string result;
    result.reserve(size + 2 * null_count);

    // size_t indices: an int index would wrap for inputs larger than INT_MAX.
    for (size_t i = 0; i < size; i++)
    {
        switch (input[i])
        {
        case '\r':
            result += '\n';
            if (i + 1 < size && input[i + 1] == '\n') i++; // skip \n after \r
            break;
        case '\f':
            result += '\n';
            break;
        case 0:
            result.append(xFFFD, 3);
            break;
        default:
            result += input[i];
        }
    }

    // result already has its exact final length; hand the buffer over without copying.
    input.swap(result);
}
// Removes whitespace tokens from `tokens`, descending into the values of component-value tokens.
// If `keep_whitespace` is set, it is called with the tokens on either side of each whitespace
// token; the whitespace token is kept when the callback returns true and removed otherwise.
// For a whitespace token at index 0 the left neighbour is a default-constructed css_token.
void remove_whitespace(css_token_vector& tokens, keep_whitespace_fn keep_whitespace)
{
    int pos = 0;
    while (pos < (int)tokens.size())
    {
        auto& current = tokens[pos];

        if (current.type != ' ')
        {
            // Not whitespace: recurse into nested values, then move on.
            if (current.is_component_value())
                remove_whitespace(current.value, keep_whitespace);
            pos++;
            continue;
        }

        const auto& before = pos > 0 ? tokens[pos - 1] : css_token();
        const auto& after = at(tokens, pos + 1);

        if (keep_whitespace && keep_whitespace(before, after))
            pos++;               // caller asked to keep this one
        else
            remove(tokens, pos); // the next token slides into `pos`, so do not advance
    }
}
// Replaces `tokens` with the list of component values obtained by repeatedly calling
// css_parser::consume_component_value() on it until an EOF token is returned.
void componentize(css_token_vector& tokens)
{
    css_parser parser(tokens);
    css_token_vector components;

    for (;;)
    {
        css_token component = parser.consume_component_value();
        if (component.type == EOF)
        {
            // Input exhausted: publish the collected component values.
            tokens = components;
            return;
        }
        components.push_back(component);
    }
}
// https://www.w3.org/TR/css-syntax-3/#normalize-into-a-token-stream
// Token-vector specialization: applies the post-processing steps selected by the
// f_componentize and f_remove_whitespace bits of `options` (in that order) to a copy
// of the input and returns the result.
template<>
css_token_vector normalize(css_token_vector input, int options, keep_whitespace_fn keep_whitespace)
{
    const bool want_components = (options & f_componentize) != 0;
    const bool strip_whitespace = (options & f_remove_whitespace) != 0;

    if (want_components)
        componentize(input);
    if (strip_whitespace)
        remove_whitespace(input, keep_whitespace);

    return input;
}
// String specialization: filters the code points of `input`, tokenizes it, and then
// forwards the tokens together with `options` and `keep_whitespace` to normalize() again.
template<>
css_token_vector normalize(string input, int options, keep_whitespace_fn keep_whitespace)
{
    // `input` is a by-value copy, so filtering it in place does not affect the caller.
    filter_code_points(input);

    auto raw_tokens = tokenize(input);
    return normalize(raw_tokens, options, keep_whitespace);
}
// https://www.w3.org/TR/css-syntax-3/#parse-stylesheet
// I don't create a stylesheet object because its only purpose is to pass a list of rules to
// parse_css_stylesheet. I just return the list of rules directly instead.
// Parses a stylesheet given as text: decodes it as UTF-8, normalizes it into tokens and
// delegates to the token-vector overload. `top_level` is passed through unchanged.
raw_rule::vector css_parser::parse_stylesheet(const string& input, bool top_level)
{
    // Step 1 of the spec (detecting the encoding of a byte stream) is not implemented:
    // UTF-8 is always assumed. decode() turns potentially broken UTF-8 into valid UTF-8.
    string decoded = decode(input, encoding::utf_8);

    // Step 2: normalize the decoded text into a token list.
    auto token_list = normalize(decoded);

    return parse_stylesheet(token_list, top_level);
}
// Parses a stylesheet given as tokens. Covers steps 3-5 of the spec algorithm, except that
// no stylesheet object is built: the list of rules consumed from `input` is returned directly.
raw_rule::vector css_parser::parse_stylesheet(const css_token_vector& input, bool top_level)
{
    css_parser parser(input);
    return parser.consume_list_of_rules(top_level);
}
// https://www.w3.org/TR/css-syntax-3/#consume-the-next-input-token
// Returns the token at the current position and advances past it.
// When the position equals the number of tokens, returns an EOF token and does not advance.
css_token css_parser::next_token()
{
    const int count = (int)m_tokens.size();
    if (m_index == count)
        return css_token_type(EOF);

    return m_tokens[m_index++];
}
// Returns the token at the current position without advancing.
// When the position equals the number of tokens, returns an EOF token.
css_token css_parser::peek_token()
{
    const int count = (int)m_tokens.size();
    if (m_index == count)
        return css_token_type(EOF);

    return m_tokens[m_index];
}
// https://www.w3.org/TR/css-syntax-3/#consume-list-of-rules
// Consumes tokens until EOF and returns the rules found along the way.
//   * whitespace is skipped;
//   * CDO/CDC tokens are skipped when `top_level` is set, otherwise they start a qualified rule;
//   * an at-keyword starts an at-rule;
//   * anything else starts a qualified rule.
// Rules for which the consume function returned nothing are not added to the list.
raw_rule::vector css_parser::consume_list_of_rules(bool top_level)
{
    raw_rule::vector rules;

    for (;;)
    {
        css_token token = next_token();

        if (token.type == EOF)
            return rules;
        if (token.type == WHITESPACE)
            continue;
        if ((token.type == CDO || token.type == CDC) && top_level)
            continue;

        // Reconsume the current token so the rule parser sees it as the first token of the rule.
        m_index--;
        raw_rule::ptr rule = token.type == AT_KEYWORD ? consume_at_rule() : consume_qualified_rule();
        if (rule) rules.push_back(rule);
    }
}
// https://www.w3.org/TR/css-syntax-3/#consume-qualified-rule
// Consumes a qualified rule: component values are appended to the rule's prelude until a
// {}-block is reached, which becomes the rule's block.
// Returns the rule, or nullptr (after reporting a parse error) if EOF is hit before a block.
raw_rule::ptr css_parser::consume_qualified_rule()
{
    // Create a new qualified rule with its prelude initially set to an empty list, and its value initially set to nothing.
    // make_shared cannot deduce the pointee type, so it has to be named explicitly.
    raw_rule::ptr rule = make_shared<raw_rule>(raw_rule::qualified);

    while (true)
    {
        // Repeatedly consume the next input token:
        css_token token = next_token();

        switch (token.type)
        {
        case EOF:
            // This is a parse error. Return nothing.
            css_parse_error("eof in qualified rule");
            return nullptr;
        case '{':
            // Consume a simple block and assign it to the qualified rule's block. Return the qualified rule.
            rule->block = consume_simple_block('{');
            return rule;
        case CURLY_BLOCK:
            // The input was already componentized: assign the block to the qualified rule's block.
            // Return the qualified rule.
            rule->block = token;
            return rule;
        default:
            // Reconsume the current input token. Consume a component value.
            // Append the returned value to the qualified rule's prelude.
            m_index--;
            css_token value = consume_component_value();
            rule->prelude.push_back(value);
        }
    }
}
// https://www.w3.org/TR/css-syntax-3/#consume-at-rule
// Consumes an at-rule. The first token supplies the rule's name (its `str` field); the
// following component values are appended to the prelude until a ';' (rule without a block),
// a {}-block (which becomes the rule's block), or EOF (parse error) is reached.
// The rule built so far is returned in every case, including EOF.
raw_rule::ptr css_parser::consume_at_rule()
{
    // Consume the next input token. Create a new at-rule with its name set to the value of the current input token,
    // its prelude initially set to an empty list, and its value initially set to nothing.
    css_token token = next_token();
    // make_shared cannot deduce the pointee type, so it has to be named explicitly.
    raw_rule::ptr rule = make_shared<raw_rule>(raw_rule::at, token.str);

    while (true)
    {
        // Repeatedly consume the next input token:
        token = next_token();

        switch (token.type)
        {
        case ';':
            return rule;
        case EOF:
            // This is a parse error. Return the at-rule.
            css_parse_error("eof in at-rule");
            return rule;
        case '{':
            // Consume a simple block and assign it to the at-rule's block. Return the at-rule.
            rule->block = consume_simple_block('{');
            return rule;
        case CURLY_BLOCK:
            // The input was already componentized: assign the block to the at-rule's block.
            // Return the at-rule.
            rule->block = token;
            return rule;
        default:
            // Reconsume the current input token. Consume a component value.
            // Append the returned value to the at-rule's prelude.
            m_index--;
            css_token value = consume_component_value();
            rule->prelude.push_back(value);
        }
    }
}
char mirror(char c);
// https://www.w3.org/TR/css-syntax-3/#consume-simple-block
// Consumes the contents of a block whose opening bracket has already been consumed.
// The block token's type is computed from the opening bracket as -100 - opening_bracket
// (see css_token_type). Component values are appended to the block's value until the
// bracket returned by mirror(opening_bracket) is seen, or until EOF (parse error; the
// block collected so far is still returned).
css_token css_parser::consume_simple_block(char opening_bracket)
{
    css_token block(css_token_type(-100 - opening_bracket));
    const char closer = mirror(opening_bracket);

    for (;;)
    {
        css_token token = next_token();

        if (token.type == closer)
            break;

        if (token.type == EOF)
        {
            css_parse_error("eof in simple block");
            break;
        }

        // Reconsume the current input token, then consume it as a (possibly nested) component value.
        m_index--;
        block.value.push_back(consume_component_value());
    }

    return block;
}
// https://www.w3.org/TR/css-syntax-3/#consume-component-value
// Consumes one component value:
//   * a <{-token>, <[-token> or <(-token> starts a simple block, which is consumed and returned;
//   * a <function-token> starts a function, which is consumed and returned;
//   * any other token (including EOF) is returned as is.
css_token css_parser::consume_component_value()
{
    css_token token = next_token();
    const auto kind = token.type;

    if (kind == '{' || kind == '[' || kind == '(')
        return consume_simple_block((char)token.ch);

    if (kind == FUNCTION)
        return consume_function(token.name);

    return token;
}
// https://www.w3.org/TR/css-syntax-3/#consume-function
// Consumes the arguments of a function whose <function-token> has already been consumed.
// Returns a CV_FUNCTION token named `name` whose value holds the component values found
// before the closing ')'. Hitting EOF first is a parse error; the function collected so far
// is still returned.
css_token css_parser::consume_function(const string& name)
{
    css_token function(CV_FUNCTION, name);

    for (;;)
    {
        css_token token = next_token();

        if (token.type == ')')
            return function;

        if (token.type == EOF)
        {
            css_parse_error("eof in function");
            return function;
        }

        // Reconsume the current input token, then consume it as a component value.
        m_index--;
        function.value.push_back(consume_component_value());
    }
}
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Removes whitespace tokens from the front and then from the back of `tokens`.
// Relies on at() accepting index 0 on an empty vector and -1 as "last element".
void trim_whitespace(css_token_vector& tokens)
{
    auto strip_at = [&tokens](int index)
    {
        while (at(tokens, index).type == ' ')
            remove(tokens, index);
    };

    strip_at(0);  // leading whitespace
    strip_at(-1); // trailing whitespace
}
// https://www.w3.org/TR/css-syntax-3/#consume-declaration
// next token is guaranteed to be IDENT
// Consumes a declaration `name : value [! important]` from the parser's tokens.
// The first token supplies the declaration name (the caller guarantees it is an IDENT).
// Returns an empty raw_declaration (after reporting a parse error) when the name is not
// followed by ':'. Otherwise returns the declaration with leading/trailing whitespace removed
// from its value and the important flag set if the value ended with "!" "important".
raw_declaration css_parser::consume_declaration()
{
    // Consume the next input token. Create a new declaration with its name set to the value of
    // the current input token and its value initially set to an empty list.
    css_token token = next_token();
    raw_declaration decl = {token.name};
    auto& value = decl.value;

    // 1. While the next input token is a <whitespace-token>, consume the next input token.
    while (peek_token().type == ' ') next_token();

    // 2. If the next input token is anything other than a <colon-token>, this is a parse error. Return nothing.
    if (peek_token().ch != ':')
    {
        css_parse_error("consume_declaration: ':' not found");
        return {};
    }
    // Otherwise, consume the next input token.
    next_token();

    // 3. While the next input token is a <whitespace-token>, consume the next input token.
    while (peek_token().type == ' ') next_token();

    // 4. As long as the next input token is anything other than an <EOF-token>,
    //    consume a component value and append it to the declaration's value.
    while (peek_token().type != EOF)
        value.push_back(consume_component_value());

    // 5. If the last two non-<whitespace-token>s in the declaration's value are a <delim-token> with the value "!"
    //    followed by an <ident-token> with a value that is an ASCII case-insensitive match for "important",
    //    remove them from the declaration's value and set the declaration's important flag to true.
    trim_whitespace(value); // deviation from standard: removing leading whitespace as well

    if (at(value, -1).ident() == "important")
    {
        // Whitespace is allowed between "!" and "important", so step back over it
        // to find the previous non-whitespace token.
        int bang = (int)value.size() - 2;
        while (bang >= 0 && value[bang].type == ' ') bang--;

        if (bang >= 0 && value[bang].ch == '!')
        {
            // Drop "!", any whitespace after it, and "important".
            remove(value, bang, (int)value.size() - bang);
            decl.important = true;
        }
    }

    // 6. While the last token in the declaration's value is a <whitespace-token>, remove that token.
    trim_whitespace(value);

    // 7. Return the declaration.
    return decl;
}
// https://www.w3.org/TR/css-syntax-3/#consume-style-block
// Consumes the contents of a style block until EOF, appending declarations to `decls` and
// nested rules to `rules`. (The spec returns one combined list; here the two lists are
// reported separately through the out parameters.)
void css_parser::consume_style_block_contents(/*out*/ raw_declaration::vector& decls, /*out*/ raw_rule::vector& rules)
{
    for (;;)
    {
        css_token token = next_token();

        if (token.type == EOF)
            return;

        // Whitespace and stray semicolons between items are ignored.
        if (token.type == WHITESPACE || token.type == ';')
            continue;

        if (token.type == AT_KEYWORD)
        {
            // Reconsume the current input token. Consume an at-rule, and append the result to rules.
            m_index--;
            auto rule = consume_at_rule();
            if (rule) rules.push_back(rule);
        }
        else if (token.type == IDENT)
        {
            // Collect the current token plus everything up to (not including) the next
            // <semicolon-token> or <EOF-token> into a temporary list...
            css_token_vector temp = { token };
            while (!is_one_of(peek_token().type, ';', EOF))
                temp.push_back(consume_component_value());

            // ...and consume a declaration from that list with a separate parser.
            css_parser parser(temp);
            auto decl = parser.consume_declaration();
            if (decl) decls.push_back(decl);
        }
        else if (token.type == '&')
        {
            // Reconsume the current input token. Consume a qualified rule.
            // If anything was returned, append it to rules.
            m_index--;
            auto rule = consume_qualified_rule();
            if (rule) rules.push_back(rule);
        }
        else
        {
            // This is a parse error. Reconsume the current input token. As long as the next input token is
            // anything other than a <semicolon-token> or <EOF-token>, consume a component value and throw
            // away the returned value.
            css_parse_error("unexpected token in a style block");
            m_index--;
            while (!is_one_of(peek_token().type, ';', EOF))
                consume_component_value();
        }
    }
}
// https://www.w3.org/TR/css-syntax-3/#parse-comma-separated-list-of-component-values
// Note: result is never empty. If input is empty result is {{}}.
// Splits `tokens` at top-level comma tokens and returns the resulting sub-lists in order.
// Comma tokens themselves are not included in any sub-list. A sub-list may be empty
// (for example between two adjacent commas), and the result always has at least one
// element: for empty input it is a single empty list.
vector<css_token_vector> parse_comma_separated_list(const css_token_vector& tokens)
{
    vector<css_token_vector> result;

    css_token_vector list;
    for (auto& tok : tokens)
    {
        if (tok.type == ',') // Note: EOF token is not stored in arrays
        {
            // A comma closes the current sub-list and starts a new one.
            result.push_back(list);
            list.clear();
            continue;
        }
        list.push_back(tok);
    }
    // The last sub-list has no terminating comma.
    result.push_back(list);

    return result;
}
// https://drafts.csswg.org/css-syntax-3/#typedef-any-value
// assumes that tokens have been componentized
// Returns true if `tokens` is non-empty and contains no BAD_STRING, BAD_URL, ')', ']' or '}'
// token, checking the values of component-value tokens recursively with the same rule.
// NOTE(review): because the recursion uses this same function, a nested block or function
// with an empty value makes the whole check fail — confirm that this is intended.
bool is_any_value(const css_token_vector& tokens)
{
    if (tokens.empty())
        return false;

    for (const auto& tok : tokens)
    {
        const bool forbidden = is_one_of(tok.type, BAD_STRING, BAD_URL, ')', ']', '}');
        if (forbidden)
            return false;

        if (tok.is_component_value() && !is_any_value(tok.value))
            return false;
    }

    return true;
}
// https://drafts.csswg.org/css-syntax-3/#typedef-declaration-value
// assumes that tokens have been componentized
// Checks the tokens from `index` to the end of `tokens`. Returns false if that range is
// empty, if it contains a top-level BAD_STRING, BAD_URL, ')', ']', '}', ';' or '!' token,
// or if the value of any component-value token in it fails is_any_value().
bool is_declaration_value(const css_token_vector& tokens, int index)
{
    const int count = (int)tokens.size();
    if (index >= count)
        return false;

    int pos = index;
    while (pos < count)
    {
        const auto& tok = tokens[pos++];

        const bool forbidden = is_one_of(tok.type, BAD_STRING, BAD_URL, ')', ']', '}', ';', '!');
        if (forbidden)
            return false;

        // Note: ';' and '!' are allowed inside component values, hence is_any_value for nested content.
        if (tok.is_component_value() && !is_any_value(tok.value))
            return false;
    }

    return true;
}
// Note: it is possible to have several whitespace tokens in a row: " /**/ /**/ "
// Advances `index` past any run of consecutive whitespace tokens starting at `index`.
// Returns true if `index` was moved, false if the token at `index` was not whitespace.
bool skip_whitespace(const css_token_vector& tokens, int& index)
{
    const int initial = index;

    for (; at(tokens, index).type == ' '; index++)
    {
        // nothing to do: the loop header does the stepping
    }

    return initial != index;
}
} // namespace litehtml