#include "html.h"
#include "css_tokenizer.h"
namespace litehtml
{
// Hook invoked by the tokenizer whenever it detects a CSS parse error.
// It is currently a no-op: the message parameter is unnamed and the only
// diagnostic (the printf below) is commented out.
void css_parse_error(string /*msg*/)
{
//printf("%s\n", msg.c_str());
}
// Returns the identifier carried by this token, or an empty string when the
// token is not an IDENT. A name that starts with "--" is returned unchanged;
// any other name is returned through lowcase().
string css_token::ident() const
{
    if (type != IDENT)
        return "";

    const bool starts_with_two_dashes = name.compare(0, 2, "--") == 0;
    if (starts_with_two_dashes)
        return name;

    return lowcase(name);
}
// Maps an opening bracket ('{', '[' or '(') to its matching closing bracket.
// Any other character is returned unchanged.
char mirror(char c)
{
    switch (c)
    {
    case '{': return '}';
    case '[': return ']';
    case '(': return ')';
    default:  return c;
    }
}
// Returns the textual representation of this token.
//  - A token that is not a component value returns its stored repr.
//  - A CV_FUNCTION returns name + '(' + representation of its value tokens + ')'.
//  - Any other component value is a bracketed block: the opening bracket is
//    recovered from the type as char(-type - 100) and the closing bracket is its mirror.
// insert_spaces is forwarded to the nested tokens.
string css_token::get_repr(bool insert_spaces) const
{
    if (!is_component_value())
        return repr;

    // The member function hides the free function of the same name.
    using litehtml::get_repr;
    const string inner = get_repr(value, 0, -1, insert_spaces);

    if (type == CV_FUNCTION)
        return name + '(' + inner + ')';

    const char left = char(-type - 100);
    return left + inner + mirror(left);
}
// Concatenates the string representations of tokens[index .. index + count).
//   tokens        - the token sequence to print
//   index         - position of the first token to include
//   count         - number of tokens to include; -1 means "up to the end of tokens"
//   insert_spaces - when true, a single space is placed between adjacent tokens
//                   (and is forwarded to each token's own get_repr)
// Returns an empty string for an empty range. No bounds checking is done on
// index/count: tokens[i] is accessed directly.
string get_repr(const css_token_vector& tokens, int index, int count, bool insert_spaces)
{
    if (count == -1) count = (int)tokens.size() - index;

    string str;
    for (int i = index; i < index + count; i++)
    {
        // Put the separator in front of every token except the first, so that
        // no trailing space has to be removed afterwards. This keeps an empty
        // range safe: nothing is ever erased from an empty string.
        if (insert_spaces && i != index)
            str += ' ';
        str += tokens[i].get_repr(insert_spaces);
    }
    return str;
}
// https://www.w3.org/TR/css-syntax-3/#whitespace
// True for newline, tab and space.
// NOTE: \r and \f are expected to have been converted to \n by filter_code_points
// (not shown here), which is why they are not tested.
bool css_tokenizer::is_whitespace(int ch)
{
    switch (ch)
    {
    case '\n':
    case '\t':
    case ' ':
        return true;
    default:
        return false;
    }
}
// https://www.w3.org/TR/css-syntax-3/#non-printable-code-point
// True for U+0000..U+0008, U+000B, U+000E..U+001F and U+007F.
bool css_tokenizer::is_non_printable_code_point(int ch)
{
    if (ch >= 0 && ch <= 8)
        return true;
    if (ch == 0xB)
        return true;
    if (ch >= 0xE && ch <= 0x1F)
        return true;
    return ch == 0x7F;
}
// https://www.w3.org/TR/css-syntax-3/#ident-start-code-point
// True for a letter (as decided by is_letter), any code point >= 0x80, or '_'.
bool css_tokenizer::is_ident_start_code_point(int ch)
{
    if (is_letter(ch))
        return true;
    return ch >= 0x80 || ch == '_';
}
// https://www.w3.org/TR/css-syntax-3/#ident-code-point
// True for an ident-start code point, a digit, or '-'.
bool css_tokenizer::is_ident_code_point(int ch)
{
    if (is_ident_start_code_point(ch))
        return true;
    if (is_digit(ch))
        return true;
    return ch == '-';
}
// Consumes the next input code point, remembers it in current_char and returns it.
// When the next input char is known to be ASCII and not NUL, callers may simply
// write str[index++] instead.
int css_tokenizer::consume_char()
{
    // NOTE: if str[index] == 0 index is not incremented
    current_char = read_utf8_char(str, index);
    return current_char;
}
// https://www.w3.org/TR/css-syntax-3/#reconsume-the-current-input-code-point
// The spec calls this "reconsume", but "unconsume" describes it better: the code
// point is pushed back now and is actually consumed again by the next consume_char call.
// When the current input char is known to be ASCII and index != 0, callers may
// simply write index-- instead.
void css_tokenizer::unconsume_char()
{
    // Nothing to push back when the current char is 0 (see the comment for current_char).
    if (current_char != 0)
    {
        // NOTE: if index == 0 index is not decremented
        prev_utf8_char(str, index);
    }
}
// Returns the next input code point without consuming it: the read is done
// through a local copy of index, so the tokenizer position is left untouched.
int css_tokenizer::peek_char()
{
    int lookahead = index;
    const int next_code_point = read_utf8_char(str, lookahead);
    return next_code_point;
}
// Returns the next three input code points without consuming them: the reads
// advance a local copy of index only.
css_tokenizer::three_chars css_tokenizer::peek_chars()
{
    int lookahead = index;
    const int first  = read_utf8_char(str, lookahead);
    const int second = read_utf8_char(str, lookahead);
    const int third  = read_utf8_char(str, lookahead);
    return { first, second, third };
}
// https://www.w3.org/TR/css-syntax-3/#consume-comments
// Skips every comment that starts at the current position, including several
// comments directly following one another. An unterminated comment is a parse
// error and moves index to the end of the input.
void css_tokenizer::consume_comments()
{
    while (str[index] == '/' && str[index + 1] == '*')
    {
        const int terminator = (int)str.find("*/", index + 2);
        if (terminator == -1)
        {
            index = (int)str.size();
            css_parse_error("eof in comment");
            return;
        }
        // continue right after the closing "*/"
        index = terminator + 2;
    }
}
// https://www.w3.org/TR/css-syntax-3/#consume-escaped-code-point
// Assumes that the U+005C (\) has already been consumed and that the next input code point
// is not a newline (see https://www.w3.org/TR/css-syntax-3/#check-if-two-code-points-are-a-valid-escape).
// Returns the escaped code point, or U+FFFD for EOF and for hex escapes that are zero,
// a surrogate, or greater than 0x10FFFF.
int css_tokenizer::consume_escaped_code_point()
{
    // Consume the next input code point.
    int ch = consume_char();

    if (!is_hex_digit(ch))
    {
        if (ch == 0) // EOF
        {
            // This is a parse error. Return U+FFFD.
            css_parse_error("eof in escaped codepoint");
            return 0xFFFD;
        }
        // Anything else: return the current input code point.
        return ch;
    }

    // Hex escape: one digit is already consumed; take as many more as possible, but no more than 5.
    int number = digit_value(ch);
    for (int remaining = 5; remaining > 0 && is_hex_digit(str[index]); remaining--)
    {
        ch = consume_char();
        number = number * 16 + digit_value(ch);
    }

    // If the next input code point is whitespace, consume it as well.
    if (is_whitespace(str[index]))
        consume_char();

    // Zero, surrogates and values above the maximum allowed code point are replaced by U+FFFD.
    const bool usable = number != 0 && !is_surrogate(number) && number <= 0x10FFFF;
    return usable ? number : 0xFFFD;
}
// https://www.w3.org/TR/css-syntax-3/#consume-string-token
// Consumes code points up to and including ending_code_point and returns a STRING token
// holding them. EOF ends the string early (parse error, the STRING token is still
// returned); an unescaped newline is a parse error and yields a BAD_STRING token.
css_token css_tokenizer::consume_string_token(int ending_code_point)
{
    // Initially create a string-token with its value set to the empty string.
    css_token token(STRING);

    for (;;)
    {
        // Repeatedly consume the next input code point from the stream:
        int ch = consume_char();

        if (ch == 0) // EOF
        {
            // This is a parse error. Return the string-token.
            css_parse_error("eof in string");
            return token;
        }

        if (ch == '\n')
        {
            // This is a parse error. Reconsume the current input code point,
            // create a bad-string-token, and return it.
            css_parse_error("newline in string");
            unconsume_char();
            return {BAD_STRING};
        }

        if (ch == '\\')
        {
            if (str[index] == '\n')
                // An escaped newline is consumed and contributes nothing to the value.
                index++;
            else if (str[index] != 0)
                // The stream starts with a valid escape: consume an escaped code point
                // and append the returned code point to the string-token's value.
                append_char(token.str, consume_escaped_code_point());
            // If the next input code point is EOF, do nothing.
            continue;
        }

        if (ch == ending_code_point)
            return token;

        // Anything else: append the current input code point to the string-token's value.
        append_char(token.str, ch);
    }
}
// https://www.w3.org/TR/css-syntax-3/#would-start-an-identifier
// Decides whether the three given code points would start an ident sequence.
bool css_tokenizer::would_start_ident_sequence(three_chars chars)
{
    const int first  = chars._1;
    const int second = chars._2;
    const int third  = chars._3;

    if (first == '-')
    {
        // True if the second code point is an ident-start code point or a U+002D HYPHEN-MINUS,
        // or if the second and third code points are a valid escape.
        if (is_ident_start_code_point(second) || second == '-')
            return true;
        return second == '\\' && third != '\n';
    }

    if (is_ident_start_code_point(first))
        return true;

    // Otherwise only a valid escape (a backslash not followed by a newline) qualifies.
    return first == '\\' && second != '\n';
}
// https://www.w3.org/TR/css-syntax-3/#consume-name
// Consumes the longest run of ident code points and valid escapes and returns it
// as a string. The first code point that does not belong to the sequence is pushed back.
string css_tokenizer::consume_ident_sequence()
{
    string sequence;
    for (;;)
    {
        // Repeatedly consume the next input code point from the stream:
        const int ch = consume_char();

        if (is_ident_code_point(ch))
        {
            append_char(sequence, ch);
            continue;
        }

        // "The stream starts with a valid escape": ch has already been consumed here,
        // so the check is made on ch and on the code point that follows it.
        const bool valid_escape = ch == '\\' && str[index] != '\n';
        if (!valid_escape)
        {
            // Reconsume the current input code point. Return the result.
            unconsume_char();
            return sequence;
        }

        // Consume an escaped code point. Append the returned code point to the result.
        append_char(sequence, consume_escaped_code_point());
    }
}
// https://www.w3.org/TR/css-syntax-3/#starts-with-a-number
// Decides whether the three code points x, y, z would start a number.
bool css_tokenizer::would_start_a_number(int x, int y, int z)
{
    switch (x)
    {
    case '+':
    case '-':
        // A sign must be followed by a digit, or by '.' and a digit.
        return is_digit(y) || (y == '.' && is_digit(z));
    case '.':
        // A leading dot must be followed by a digit.
        return is_digit(y);
    default:
        // Otherwise the first code point itself must be a digit.
        return is_digit(x);
    }
}
// https://www.w3.org/TR/css-syntax-3/#convert-string-to-number
// Converts the textual representation of a number to a double by splitting it into the
// seven components described by the spec (sign, integer part, decimal point, fractional
// part, exponent indicator, exponent sign, exponent), read in order from left to right.
// Every component may be absent.
double css_tokenizer::convert_string_to_number(const string& str)
{
    const char* cursor = str.c_str();

    // 1. Sign: a single '+' or '-', or nothing. s is -1 for '-', otherwise 1.
    double sign = 1;
    if (*cursor == '-')
    {
        sign = -1;
        cursor++;
    }
    else if (*cursor == '+')
    {
        cursor++;
    }

    // 2. Integer part: zero or more digits read as a base-10 integer (0 if absent).
    double integer_part = 0;
    for (; is_digit(*cursor); cursor++)
        integer_part = integer_part * 10 + digit_value(*cursor);

    // 3. Decimal point: a single '.', or nothing.
    if (*cursor == '.')
        cursor++;

    // 4. Fractional part: zero or more digits read as a base-10 integer,
    //    together with the number of digits (both 0 if absent).
    double fraction = 0, fraction_digits = 0;
    for (; is_digit(*cursor); cursor++)
    {
        fraction = fraction * 10 + digit_value(*cursor);
        fraction_digits++;
    }

    // 5. Exponent indicator: a single 'E' or 'e', or nothing.
    if (*cursor == 'e' || *cursor == 'E')
        cursor++;

    // 6. Exponent sign: a single '+' or '-', or nothing. t is -1 for '-', otherwise 1.
    double exponent_sign = 1;
    if (*cursor == '-')
    {
        exponent_sign = -1;
        cursor++;
    }
    else if (*cursor == '+')
    {
        cursor++;
    }

    // 7. Exponent: zero or more digits read as a base-10 integer (0 if absent).
    double exponent = 0;
    for (; is_digit(*cursor); cursor++)
        exponent = exponent * 10 + digit_value(*cursor);

    // Return s * (i + f * 10^-d) * 10^(t*e).
    return sign * (integer_part + fraction * pow(10, -fraction_digits)) * pow(10, exponent_sign * exponent);
}
// https://www.w3.org/TR/css-syntax-3/#consume-number
// Consumes a number from the input and returns its value. type is set to
// css_number_integer unless a fractional part or an exponent is present, in which
// case it is set to css_number_number.
double css_tokenizer::consume_number(css_number_type& type)
{
    // 1. Initially set type to "integer". Let repr be the empty string.
    type = css_number_integer;
    string repr;

    // Every code point consumed here has just been matched against an ASCII
    // character, so it can be read with str[index++].
    auto take_one = [&] { append_char(repr, str[index++]); };
    auto take_digits = [&] { while (is_digit(str[index])) take_one(); };

    // 2. If the next input code point is U+002B (+) or U+002D (-), consume it and append it to repr.
    if (is_one_of(str[index], '+', '-'))
        take_one();

    // 3. While the next input code point is a digit, consume it and append it to repr.
    take_digits();

    // 4. If the next 2 input code points are U+002E (.) followed by a digit:
    //    consume and append them, set type to "number", then take the remaining digits.
    if (str[index] == '.' && is_digit(str[index+1]))
    {
        take_one();
        take_one();
        type = css_number_number;
        take_digits();
    }

    // 5. If the next 2 or 3 input code points are U+0045 (E) or U+0065 (e),
    //    optionally followed by U+002D (-) or U+002B (+), followed by a digit:
    //    consume and append them, set type to "number", then take the remaining digits.
    const bool has_indicator = lowcase(str[index]) == 'e';
    const bool signed_exponent = has_indicator && is_one_of(str[index+1], '+', '-') && is_digit(str[index+2]);
    const bool plain_exponent = has_indicator && is_digit(str[index+1]);
    if (signed_exponent || plain_exponent)
    {
        take_one();
        take_one();
        if (signed_exponent)
            take_one();
        type = css_number_number;
        take_digits();
    }

    // 6. Convert repr to a number. 7. Return the value (type is returned through the parameter).
    return convert_string_to_number(repr);
}
// https://www.w3.org/TR/css-syntax-3/#consume-numeric-token
// Consumes a number and returns it as a DIMENSION token (number followed by an ident
// sequence, stored as the unit), a PERCENTAGE token (number followed by '%'),
// or a plain NUMBER token.
css_token css_tokenizer::consume_numeric_token()
{
    // Consume a number and let value be the result.
    css_number_type number_type;
    float value = (float)consume_number(number_type);

    // If the next 3 input code points would start an ident sequence, create a
    // dimension-token with the same value and type flag as the number, consume an
    // ident sequence into the token's unit, and return the token.
    if (would_start_ident_sequence(peek_chars()))
    {
        css_token dimension(DIMENSION, value, number_type);
        dimension.unit = consume_ident_sequence();
        return dimension;
    }

    // Otherwise, unless the next input code point is U+0025 (%), create a number-token
    // with the same value and type flag as the number, and return it.
    if (str[index] != '%')
        return {NUMBER, value, number_type};

    // Consume the '%' and return a percentage-token with the same value as the number.
    index++;
    return {PERCENTAGE, value}; // NOTE: number_type is unused in the percentage-token
}
// https://www.w3.org/TR/css-syntax-3/#consume-remnants-of-bad-url
// Discards input up to and including the next unescaped ')' or up to EOF.
void css_tokenizer::consume_remnants_of_bad_url()
{
    // Repeatedly consume the next input code point from the stream until ')' or EOF.
    for (int ch = consume_char(); ch != ')' && ch != 0; ch = consume_char())
    {
        // "The input stream starts with a valid escape": ch has already been consumed
        // here, so the check is made on ch and on the code point that follows it.
        // The escaped code point is consumed so that an escaped ')' does not end the url.
        if (ch == '\\' && str[index] != '\n')
            consume_escaped_code_point();
        // Anything else: do nothing.
    }
}
// https://www.w3.org/TR/css-syntax-3/#consume-url-token
// Consumes the contents of an unquoted url up to and including the closing ')'.
// Returns a URL token holding the url text, or a BAD_URL token when the contents are malformed.
css_token css_tokenizer::consume_url_token()
{
    // Initially create a url-token with its value set to the empty string.
    css_token token(URL);

    // Parse error path: report, skip the remnants of the bad url, produce a bad-url-token.
    auto bad_url = [this](const char* message) -> css_token
    {
        css_parse_error(message);
        consume_remnants_of_bad_url();
        return {BAD_URL};
    };

    // Consume as much whitespace as possible.
    while (is_whitespace(str[index]))
        index++;

    for (;;)
    {
        // Repeatedly consume the next input code point from the stream:
        const int ch = consume_char();

        if (ch == ')')
            return token;

        if (ch == 0) // EOF
        {
            // This is a parse error. Return the url-token.
            css_parse_error("eof in unquoted url");
            return token;
        }

        if (ch == '\n' || ch == '\t' || ch == ' ')
        {
            // Consume as much whitespace as possible.
            while (is_whitespace(str[index]))
                index++;

            // If the next input code point is U+0029 ()) consume it and return the url-token.
            if (str[index] == ')')
            {
                index++;
                return token;
            }
            // If it is EOF, this is a parse error; the url-token is still returned.
            if (str[index] == 0)
            {
                css_parse_error("eof in unquoted url");
                return token;
            }
            // Otherwise, consume the remnants of a bad url, create a bad-url-token, and return it.
            consume_remnants_of_bad_url();
            return {BAD_URL};
        }

        if (ch == '"' || ch == '\'' || ch == '(')
            return bad_url("invalid char in unquoted url");

        if (ch == '\\')
        {
            // A backslash followed by a newline is not a valid escape: parse error.
            if (str[index] == '\n')
                return bad_url("escaped newline in unquoted url");

            // Valid escape: consume an escaped code point and append it to the url-token's value.
            append_char(token.str, consume_escaped_code_point());
            continue;
        }

        // Non-printable code points are handled like quotes and '('.
        if (is_non_printable_code_point(ch))
            return bad_url("invalid char in unquoted url");

        // Anything else: append the current input code point to the url-token's value.
        append_char(token.str, ch);
    }
}
// https://www.w3.org/TR/css-syntax-3/#consume-ident-like-token
// Consumes an ident sequence and returns an IDENT token, a FUNCTION token (the sequence is
// followed by '('), or, for an unquoted url(...), the token produced by consume_url_token.
css_token css_tokenizer::consume_ident_like_token()
{
    // Consume an ident sequence.
    auto ident_name = consume_ident_sequence();

    // Not followed by U+0028 ((): create an ident-token with its value set to the name.
    if (str[index] != '(')
        return {IDENT, ident_name};

    index++; // consume '('

    // Any name other than an ASCII case-insensitive "url": create a function-token.
    if (lowcase(ident_name) != "url")
        return {FUNCTION, ident_name};

    // url( ... : skip all whitespace (not looking for exactly 2 spaces, see next comment).
    while (is_whitespace(str[index]))
        index++;

    if (is_one_of(str[index], '"', '\''))
    {
        // Quoted url: this is not exactly what the standard says, but equivalent.
        // One whitespace char is given back so that a whitespace token is preserved.
        if (is_whitespace(str[index-1]))
            index--;
        return {FUNCTION, ident_name};
    }

    // Otherwise, consume a url token, and return it.
    return consume_url_token();
}
// https://www.w3.org/TR/css-syntax-3/#consume-token
// Consumes and returns the next token from the input.
// Comments preceding the token are skipped first. The first code point is consumed and
// selects the kind of token; where a sub-algorithm needs to see that code point again,
// it is pushed back with unconsume_char() before the sub-algorithm is called.
// Before returning, the exact source text covered by the token (from the position after
// the skipped comments up to the final index) is stored in token.repr.
// At the end of input a token of type _EOF is returned.
css_token css_tokenizer::consume_token()
{
consume_comments();
css_token token;
// remember where the token starts so that its source text can be stored in repr
int start = index;
// Consume the next input code point.
int ch = consume_char();
three_chars next;
switch (ch)
{
// whitespace
case '\n':
case '\t':
case ' ':
// Consume as much whitespace as possible. Return a whitespace-token.
while (is_whitespace(str[index]))
index++;
token.type = WHITESPACE;
break;
case '"':
case '\'':
// The quote just consumed is also the code point that ends the string.
token = consume_string_token(ch);
break;
case '#':
// If the next input code point is an ident code point or the next two input code points are a valid escape, then:
if (is_ident_code_point(peek_char()) || (str[index] == '\\' && str[index+1] != '\n'))
{
// 1. Create a hash-token.
token.type = HASH;
// 2. If the next 3 input code points would start an ident sequence, set the hash-token's type flag to "id".
token.hash_type = would_start_ident_sequence(peek_chars()) ? css_hash_id : css_hash_unrestricted;
// 3. Consume an ident sequence, and set the hash-token's value to the returned string.
token.name = consume_ident_sequence();
// 4. Return the hash-token.
}
else
// Otherwise, return a delim-token with its value set to the current input code point.
token.ch = ch;
break;
case '+':
case '.':
// If the input stream starts with a number, reconsume the current input code point, consume a numeric token, and return it.
// ch is already consumed, so the check uses ch followed by the next two code points.
next = peek_chars();
if (would_start_a_number(ch, next._1, next._2))
{
unconsume_char();
token = consume_numeric_token();
}
else
// Otherwise, return a delim-token with its value set to the current input code point.
token.ch = ch;
break;
case '-':
// If the input stream starts with a number, reconsume the current input code point, consume a numeric token, and return it.
next = peek_chars();
if (would_start_a_number(ch, next._1, next._2))
{
unconsume_char();
token = consume_numeric_token();
}
// Otherwise, if the next 2 input code points are U+002D U+003E (->), consume them and return a CDC-token.
else if (next._1 == '-' && next._2 == '>')
{
index += 2;
token.type = CDC;
}
// Otherwise, if the input stream starts with an ident sequence, reconsume the current input code point,
// consume an ident-like token, and return it.
else if (would_start_ident_sequence({ ch, next._1, next._2 }))
{
unconsume_char();
token = consume_ident_like_token();
}
else
// Otherwise, return a delim-token with its value set to the current input code point.
token.ch = ch;
break;
case '<':
// If the next 3 input code points are !--, consume them and return a CDO-token.
if (match(str, index, "!--"))
{
index += 3;
token.type = CDO;
}
else
// Otherwise, return a delim-token with its value set to the current input code point.
token.ch = ch;
break;
case '@':
// If the next 3 input code points would start an ident sequence, consume an ident sequence,
// create an at-keyword-token with its value set to the returned value, and return it.
if (would_start_ident_sequence(peek_chars()))
{
token.type = AT_KEYWORD;
token.name = consume_ident_sequence();
}
else
// Otherwise, return a delim-token with its value set to the current input code point.
token.ch = ch;
break;
case '\\':
// If the input stream starts with a valid escape, reconsume the current input code point,
// consume an ident-like token, and return it.
if (str[index] != '\n')
{
unconsume_char();
token = consume_ident_like_token();
}
else
{
// Otherwise, this is a parse error. Return a delim-token with its value set to the current input code point.
css_parse_error("escaped newline outside a string");
token.ch = ch;
}
break;
case 0: // EOF
token.type = _EOF;
break;
default:
if (is_digit(ch))
{
// Reconsume the current input code point, consume a numeric token, and return it.
unconsume_char();
token = consume_numeric_token();
}
else if (is_ident_start_code_point(ch))
{
// Reconsume the current input code point, consume an ident-like token, and return it.
unconsume_char();
token = consume_ident_like_token();
}
else // anything else
// Return a delim-token with its value set to the current input code point.
token.ch = ch; // NOTE: :;,()[]{} tokens are also handled here
}
// Store the source text of the token: everything consumed since start.
token.repr = str.substr(start, index - start);
return token;
}
// Runs the tokenizer over the remaining input and returns every token produced by
// consume_token, in order. The terminating end-of-input token is not included.
// NOTE(review): the loop tests against the EOF macro while consume_token assigns the
// _EOF enumerator; presumably _EOF is defined equal to EOF - confirm in the header.
css_token_vector css_tokenizer::tokenize()
{
    css_token_vector result;
    for (css_token token = consume_token(); !(token.type == EOF); token = consume_token())
        result.push_back(token);
    return result;
}
} // namespace litehtml