1 files changed, 724 insertions, 0 deletions
diff --git a/libs/litehtml/src/css_tokenizer.cpp b/libs/litehtml/src/css_tokenizer.cpp
new file mode 100644
index 0000000000..b2b8761fa4
--- /dev/null
+++ b/libs/litehtml/src/css_tokenizer.cpp
@@ -0,0 +1,724 @@
+#include "html.h"
+#include "css_tokenizer.h"
+
+namespace litehtml
+{
+
+void css_parse_error(string /*msg*/)
+{
+	//printf("%s\n", msg.c_str());
+}
+
+string css_token::ident() const
+{
+	if (type != IDENT) return "";
+	return name.substr(0, 2) == "--" ? name : lowcase(name);
+}
+
+
+char mirror(char c)
+{
+	if (c == '{') return '}';
+	if (c == '[') return ']';
+	if (c == '(') return ')';
+	return c;
+}
+
+string css_token::get_repr(bool insert_spaces) const
+{
+	if (!is_component_value()) return repr;
+
+	using litehtml::get_repr;
+	if (type == CV_FUNCTION) return name + '(' + get_repr(value, 0, -1, insert_spaces) + ')';
+
+	char opening_bracket = char(-type - 100);
+	char closing_bracket = mirror(opening_bracket);
+	return opening_bracket + get_repr(value, 0, -1, insert_spaces) + closing_bracket;
+}
+
+// concatenate string representations of tokens
+string get_repr(const css_token_vector& tokens, int index, int count, bool insert_spaces)
+{
+	if (count == -1) count = (int)tokens.size() - index;
+	string str;
+	string space = insert_spaces ? " " : "";
+	for (int i = index; i < index + count; i++)
+	{
+		str += tokens[i].get_repr(insert_spaces) + space;
+	}
+	if (insert_spaces) remove(str, -1);
+	return str;
+}
+
+// https://www.w3.org/TR/css-syntax-3/#whitespace
+bool css_tokenizer::is_whitespace(int ch) {
+	// NOTE: \r and \f are converted to \n in filter_code_points
+	return ch == '\n' || ch == '\t' || ch == ' ';
+}
+
+// https://www.w3.org/TR/css-syntax-3/#non-printable-code-point
+bool css_tokenizer::is_non_printable_code_point(int ch) {
+	return (ch >= 0 && ch <= 8) || ch == 0xB || (ch >= 0xE && ch <= 0x1F) || ch == 0x7F;
+}
+
+// https://www.w3.org/TR/css-syntax-3/#ident-start-code-point
+bool css_tokenizer::is_ident_start_code_point(int ch) {
+	return is_letter(ch) || ch >= 0x80 || ch == '_';
+}
+
+// https://www.w3.org/TR/css-syntax-3/#ident-code-point
+bool css_tokenizer::is_ident_code_point(int ch) {
+	return is_ident_start_code_point(ch) || is_digit(ch) || ch == '-';
+}
+
+
+// Consume the next input code point. Return the current input code point.
+// When we know that next input char is ASCII and not NUL, we can just write str[index++] instead.
+int css_tokenizer::consume_char()
+{
+	// NOTE: if str[index] == 0 index is not incremented
+	return current_char = read_utf8_char(str, index);
+}
+
+// https://www.w3.org/TR/css-syntax-3/#reconsume-the-current-input-code-point
+// "reconsume" is not a good name - it should be called unconsume (the char will actually be reconsumed later when consume_char is called).
+// When we know that current input char is ASCII and index != 0, we can just write index-- instead.
+void css_tokenizer::unconsume_char()
+{
+	// see comment for current_char
+	if (current_char == 0)
+		return;
+
+	// NOTE: if index == 0 index is not decremented
+	prev_utf8_char(str, index);
+}
+
+int css_tokenizer::peek_char()
+{
+	int i = index;
+	return read_utf8_char(str, i);
+}
+
+css_tokenizer::three_chars  css_tokenizer::peek_chars()
+{
+	three_chars chars;
+	int i = index;
+	chars._1 = read_utf8_char(str, i);
+	chars._2 = read_utf8_char(str, i);
+	chars._3 = read_utf8_char(str, i);
+	return chars;
+}
+
+
+// https://www.w3.org/TR/css-syntax-3/#consume-comments
+void css_tokenizer::consume_comments()
+{
+	while (true)
+	{
+		if (str[index] == '/' && str[index + 1] == '*')
+		{
+			int i = (int)str.find("*/", index + 2);
+			
+			if (i != -1)
+				index = i + 2;
+			else
+			{
+				index = (int)str.size();
+				css_parse_error("eof in comment");
+				break;
+			}
+		}
+		else
+			break;
+	}
+}
+
+// https://www.w3.org/TR/css-syntax-3/#consume-escaped-code-point
+// It assumes that the U+005C (\) has already been consumed and that the next input code point
+// is not a newline (see https://www.w3.org/TR/css-syntax-3/#check-if-two-code-points-are-a-valid-escape).
+int css_tokenizer::consume_escaped_code_point()
+{
+	// Consume the next input code point.
+	int ch = consume_char();
+
+	if (is_hex_digit(ch))
+	{
+		int number = digit_value(ch);
+		// Consume as many hex digits as possible, but no more than 5.
+		int max = 5;
+		while (max-- > 0 && is_hex_digit(str[index]))
+		{
+			ch = consume_char();
+			number = number * 16 + digit_value(ch);
+		}
+		// If the next input code point is whitespace, consume it as well.
+		if (is_whitespace(str[index]))
+			consume_char();
+		// If this number is zero, or is for a surrogate, or is greater than the maximum allowed code point
+		if (number == 0 || is_surrogate(number) || number > 0x10FFFF)
+			return 0xFFFD;
+		// Otherwise, return the code point with that value.
+		return number;
+	}
+	else if (ch == 0) // EOF
+	{
+		// This is a parse error. Return U+FFFD.
+		css_parse_error("eof in escaped codepoint");
+		return 0xFFFD;
+	}
+	else // anything else
+		// Return the current input code point.
+		return ch;
+}
+
+// https://www.w3.org/TR/css-syntax-3/#consume-string-token
+css_token css_tokenizer::consume_string_token(int ending_code_point)
+{
+	// Initially create a <string-token> with its value set to the empty string.
+	css_token token(STRING);
+
+	while (true)
+	{
+		// Repeatedly consume the next input code point from the stream:
+		int ch = consume_char();
+		switch (ch)
+		{
+		case 0: // EOF
+			// This is a parse error. Return the <string-token>.
+			css_parse_error("eof in string");
+			return token;
+		case '\n':
+			// This is a parse error. Reconsume the current input code point, create a <bad-string-token>, and return it.
+			css_parse_error("newline in string");
+			unconsume_char();
+			return {BAD_STRING};
+		case '\\':
+			// If the next input code point is EOF, do nothing.
+			if (str[index] == 0)
+				break;
+			// Otherwise, if the next input code point is a newline, consume it.
+			else if (str[index] == '\n')
+				index++;
+			// Otherwise, (the stream starts with a valid escape) consume an escaped code point and 
+			// append the returned code point to the <string-token>’s value.
+			else
+				append_char(token.str, consume_escaped_code_point());
+			break;
+		default:
+			if (ch == ending_code_point)
+				return token;
+			else // anything else
+				// Append the current input code point to the <string-token>’s value.
+				append_char(token.str, ch);
+			break;
+		}
+	}
+}
+
+// https://www.w3.org/TR/css-syntax-3/#would-start-an-identifier
+bool css_tokenizer::would_start_ident_sequence(three_chars chars)
+{
+	int c1 = chars._1;
+	int c2 = chars._2;
+	int c3 = chars._3;
+
+	if (c1 == '-')
+	{
+		// If the second code point is an ident-start code point or a U+002D HYPHEN-MINUS, or 
+		// the second and third code points are a valid escape, return true. Otherwise, return false.
+		return is_ident_start_code_point(c2) || c2 == '-' || (c2 == '\\' && c3 != '\n');
+	}
+	else if (is_ident_start_code_point(c1))
+		return true;
+	else if (c1 == '\\')
+		// If the first and second code points are a valid escape, return true. Otherwise, return false.
+		return c2 != '\n';
+	else
+		return false;
+}
+
+// https://www.w3.org/TR/css-syntax-3/#consume-name
+string css_tokenizer::consume_ident_sequence()
+{
+	string result;
+
+	while (true)
+	{
+		// Repeatedly consume the next input code point from the stream:
+		int ch = consume_char();
+
+		if (is_ident_code_point(ch))
+			append_char(result, ch); // Append the code point to result.
+
+		// else if the stream starts with a valid escape
+		// NOTE: the wording is confusing because ch is not in the input stream anymore (it has been consumed)
+		else if (ch == '\\' && str[index] != '\n')
+			// Consume an escaped code point. Append the returned code point to result.
+			append_char(result, consume_escaped_code_point());
+
+		else
+		{
+			// Reconsume the current input code point. Return result.
+			unconsume_char();
+			return result;
+		}
+	}
+}
+
+// https://www.w3.org/TR/css-syntax-3/#starts-with-a-number
+bool css_tokenizer::would_start_a_number(int x, int y, int z)
+{
+	if (x == '+' || x == '-')
+	{
+		// If the second code point is a digit, return true.
+		if (is_digit(y)) return true;
+		// Otherwise, if the second code point is a U+002E (.) and the third code point is a digit, return true.
+		else if (y == '.' && is_digit(z)) return true;
+		// Otherwise, return false.
+		else return false;
+	}
+	else if (x == '.')
+		// If the second code point is a digit, return true. Otherwise, return false.
+		return is_digit(y);
+	else
+		return is_digit(x);
+}
+
+// https://www.w3.org/TR/css-syntax-3/#convert-string-to-number
+double css_tokenizer::convert_string_to_number(const string& str)
+{
+	const char* p = str.c_str();
+
+	// Divide the string into seven components, in order from left to right:
+
+	// 1. A sign: a single U+002B (+) or U+002D (-), or the empty string. 
+	//    Let s be the number -1 if the sign is U+002D (-); otherwise, let s be the number 1.
+	double s = 1;
+	if (*p == '-') s = -1, p++;
+	else if (*p == '+') p++;
+
+	// 2. An integer part: zero or more digits. If there is at least one digit, let i be the number formed by 
+	//    interpreting the digits as a base-10 integer; otherwise, let i be the number 0.
+	double i = 0;
+	while (is_digit(*p)) i = i * 10 + digit_value(*p++);
+
+	// 3. A decimal point: a single U+002E (.), or the empty string.
+	if (*p == '.') p++;
+
+	// 4. A fractional part: zero or more digits. If there is at least one digit, let f be the number formed by 
+	//    interpreting the digits as a base-10 integer and d be the number of digits; 
+	//    otherwise, let f and d be the number 0.
+	double f = 0, d = 0;
+	while (is_digit(*p)) f = f * 10 + digit_value(*p++), d++;
+
+	// 5. An exponent indicator: a single U+0045 (E) or U+0065 (e), or the empty string.
+	if (*p == 'e' || *p == 'E') p++;
+
+	// 6. An exponent sign: a single U+002B (+) or U+002D (-), or the empty string. 
+	//    Let t be the number -1 if the sign is U+002D (-); otherwise, let t be the number 1.
+	double t = 1;
+	if (*p == '-') t = -1, p++;
+	else if (*p == '+') p++;
+
+	// 7. An exponent: zero or more digits. If there is at least one digit, let e be the number formed by 
+	//    interpreting the digits as a base-10 integer; otherwise, let e be the number 0.
+	double e = 0;
+	while (is_digit(*p)) e = e * 10 + digit_value(*p++);
+
+	// Return the number s·(i + f·10ᐨᵈ)·10ᵗᵉ.
+	return s * (i + f * pow(10, -d)) * pow(10, t * e);
+}
+
+// https://www.w3.org/TR/css-syntax-3/#consume-number
+double css_tokenizer::consume_number(css_number_type& type)
+{
+	// 1. Initially set type to "integer". Let repr be the empty string.
+	type = css_number_integer;
+	string repr;
+
+	// 2. If the next input code point is U+002B (+) or U+002D (-), consume it and append it to repr.
+	if (is_one_of(str[index], '+', '-'))
+		append_char(repr, str[index++]);
+
+	// 3. While the next input code point is a digit, consume it and append it to repr.
+	while (is_digit(str[index]))
+		append_char(repr, str[index++]);
+
+	// 4. If the next 2 input code points are U+002E (.) followed by a digit, then:
+	if (str[index] == '.' && is_digit(str[index+1]))
+	{
+		// 1. Consume them.
+		// 2. Append them to repr.
+		append_char(repr, str[index++]);
+		append_char(repr, str[index++]);
+		// 3. Set type to "number".
+		type = css_number_number;
+		// 4. While the next input code point is a digit, consume it and append it to repr.
+		while (is_digit(str[index]))
+			append_char(repr, str[index++]);
+	}
+
+	// 5. If the next 2 or 3 input code points are U+0045 (E) or U+0065 (e), 
+	//    optionally followed by U+002D (-) or U+002B (+), followed by a digit, then:
+	bool a = lowcase(str[index]) == 'e' && is_one_of(str[index+1], '+', '-') && is_digit(str[index+2]);
+	bool b = lowcase(str[index]) == 'e' && is_digit(str[index+1]);
+			 
+	if (a || b)
+	{
+		// 1. Consume them.
+		// 2. Append them to repr.
+		append_char(repr, str[index++]);
+		append_char(repr, str[index++]);
+		if (a) append_char(repr, str[index++]);
+		// 3. Set type to "number".
+		type = css_number_number;
+		// 4. While the next input code point is a digit, consume it and append it to repr.
+		while (is_digit(str[index]))
+			append_char(repr, str[index++]);
+	}
+
+	// 6. Convert repr to a number, and set the value to the returned value.
+	double value = convert_string_to_number(repr);
+
+	// 7. Return value and type.
+	return value;
+}
+
+// https://www.w3.org/TR/css-syntax-3/#consume-numeric-token
+css_token css_tokenizer::consume_numeric_token()
+{
+	// Consume a number and let number be the result.
+	css_number_type type;
+	float number = (float)consume_number(type);
+
+	// If the next 3 input code points would start an ident sequence, then:
+	if (would_start_ident_sequence(peek_chars()))
+	{
+		// 1. Create a <dimension-token> with the same value and type flag as number, and 
+		//    a unit set initially to the empty string.
+		css_token token(DIMENSION, number, type);
+
+		// 2. Consume an ident sequence. Set the <dimension-token>’s unit to the returned value.
+		token.unit = consume_ident_sequence();
+
+		// 3. Return the <dimension-token>.
+		return token;
+	}
+
+	// Otherwise, if the next input code point is U+0025 (%), consume it. 
+	// Create a <percentage-token> with the same value as number, and return it.
+	if (str[index] == '%')
+	{
+		index++;
+		return {PERCENTAGE, number}; // NOTE: number_type is unused in <percentage-token>
+	}
+
+	// Otherwise, create a <number-token> with the same value and type flag as number, and return it.
+	return {NUMBER, number, type};
+}
+
+// https://www.w3.org/TR/css-syntax-3/#consume-remnants-of-bad-url
+void css_tokenizer::consume_remnants_of_bad_url()
+{
+	while (true)
+	{
+		// Repeatedly consume the next input code point from the stream:
+		int ch = consume_char();
+		if (ch == ')' || ch == 0) // ')' or EOF
+			return;
+		// else if the input stream starts with a valid escape
+		// NOTE: the wording is confusing because ch is not in the input stream anymore (it has been consumed)
+		else if (ch == '\\' && str[index] != '\n')
+		{
+			consume_escaped_code_point();
+		}
+		// anything else: Do nothing.
+	}
+}
+
+// https://www.w3.org/TR/css-syntax-3/#consume-url-token
+css_token css_tokenizer::consume_url_token()
+{
+	// Initially create a <url-token> with its value set to the empty string.
+	css_token token(URL);
+
+	// Consume as much whitespace as possible.
+	while (is_whitespace(str[index]))
+		index++;
+
+	while (true)
+	{
+		// Repeatedly consume the next input code point from the stream:
+		int ch = consume_char();
+		switch (ch)
+		{
+		case ')':
+			// Return the <url-token>.
+			return token;
+
+		case 0: // EOF
+			// This is a parse error. Return the <url-token>.
+			css_parse_error("eof in unquoted url");
+			return token;
+
+		case '\n':
+		case '\t':
+		case ' ':
+			// Consume as much whitespace as possible.
+			while (is_whitespace(str[index]))
+				index++;
+			// If the next input code point is U+0029 ()) or EOF, consume it and return the <url-token>
+			// (if EOF was encountered, this is a parse error);
+			if (str[index] == ')' || str[index] == 0)
+			{
+				if (str[index] == 0)
+					css_parse_error("eof in unquoted url");
+				else
+					index++; // consume ')'
+				return token;
+			}
+			// otherwise, consume the remnants of a bad url, create a <bad-url-token>, and return it.
+			consume_remnants_of_bad_url();
+			return {BAD_URL};
+
+		case '"':
+		case '\'':
+		case '(':
+		bad_url:
+			// This is a parse error. Consume the remnants of a bad url, create a <bad-url-token>, and return it.
+			css_parse_error("invalid char in unquoted url");
+			consume_remnants_of_bad_url();
+			return {BAD_URL};
+
+		case '\\':
+			// If the stream starts with a valid escape, consume an escaped code point and 
+			// append the returned code point to the <url-token>’s value.
+			if (str[index] != '\n')
+				append_char(token.str, consume_escaped_code_point());
+			// Otherwise, this is a parse error. Consume the remnants of a bad url, create a <bad-url-token>, and return it.
+			else
+			{
+				css_parse_error("escaped newline in unquoted url");
+				consume_remnants_of_bad_url();
+				return {BAD_URL};
+			}
+			break;
+
+		default:
+			if (is_non_printable_code_point(ch))
+				goto bad_url;
+			else // anything else
+				// Append the current input code point to the <url-token>’s value.
+				append_char(token.str, ch);
+			break;
+		}
+	}
+}
+
+
+// https://www.w3.org/TR/css-syntax-3/#consume-ident-like-token
+css_token css_tokenizer::consume_ident_like_token()
+{
+	// Consume an ident sequence, and let string be the result.
+	auto string = consume_ident_sequence();
+
+	// If string’s value is an ASCII case-insensitive match for "url", and the next input code point is 
+	// U+0028 ((), consume it.
+	if (lowcase(string) == "url" && str[index] == '(')
+	{
+		index++; // consume '('
+
+		while (is_whitespace(str[index])) // not looking for 2 spaces, see next comment
+			index++;
+
+		if (is_one_of(str[index], '"', '\''))
+		{
+			// This is not exactly what standard says, but equivalent. The purpose is to preserve a whitespace token.
+			if (is_whitespace(str[index-1])) index--;
+			return {FUNCTION, string};
+		}
+		else // Otherwise, consume a url token, and return it.
+		{
+			return consume_url_token();
+		}
+	}
+
+	// Otherwise, if the next input code point is U+0028 ((), consume it. 
+	// Create a <function-token> with its value set to string and return it.
+	else if (str[index] == '(')
+	{
+		index++;
+		return {FUNCTION, string};
+	}
+
+	// Otherwise, create an <ident-token> with its value set to string and return it.
+	return {IDENT, string};
+}
+
+// https://www.w3.org/TR/css-syntax-3/#consume-token
+css_token css_tokenizer::consume_token()
+{
+	consume_comments();
+
+	css_token token;
+	int start = index;
+
+	// Consume the next input code point.
+	int ch = consume_char();
+	three_chars next;
+
+	switch (ch)
+	{
+		// whitespace
+	case '\n':
+	case '\t':
+	case ' ':
+		// Consume as much whitespace as possible. Return a <whitespace-token>.
+		while (is_whitespace(str[index]))
+			index++;
+		token.type = WHITESPACE;
+		break;
+
+	case '"':
+	case '\'':
+		token = consume_string_token(ch);
+		break;
+
+	case '#':
+		// If the next input code point is an ident code point or the next two input code points are a valid escape, then:
+		if (is_ident_code_point(peek_char()) || (str[index] == '\\' && str[index+1] != '\n'))
+		{
+			// 1. Create a <hash-token>.
+			token.type = HASH;
+			// 2. If the next 3 input code points would start an ident sequence, set the <hash-token>’s type flag to "id".
+			token.hash_type = would_start_ident_sequence(peek_chars()) ? css_hash_id : css_hash_unrestricted;
+			// 3. Consume an ident sequence, and set the <hash-token>’s value to the returned string.
+			token.name = consume_ident_sequence();
+			// 4. Return the <hash-token>.
+		}
+		else
+			// Otherwise, return a <delim-token> with its value set to the current input code point.
+			token.ch = ch;
+		break;
+
+	case '+':
+	case '.':
+		// If the input stream starts with a number, reconsume the current input code point, consume a numeric token, and return it.
+		next = peek_chars();
+		if (would_start_a_number(ch, next._1, next._2))
+		{
+			unconsume_char();
+			token = consume_numeric_token();
+		}
+		else
+			// Otherwise, return a <delim-token> with its value set to the current input code point.
+			token.ch = ch;
+		break;
+
+	case '-':
+		// If the input stream starts with a number, reconsume the current input code point, consume a numeric token, and return it.
+		next = peek_chars();
+		if (would_start_a_number(ch, next._1, next._2))
+		{
+			unconsume_char();
+			token = consume_numeric_token();
+		}
+		// Otherwise, if the next 2 input code points are U+002D U+003E (->), consume them and return a <CDC-token>.
+		else if (next._1 == '-' && next._2 == '>')
+		{
+			index += 2;
+			token.type = CDC;
+		}
+		// Otherwise, if the input stream starts with an ident sequence, reconsume the current input code point, 
+		// consume an ident-like token, and return it.
+		else if (would_start_ident_sequence({ ch, next._1, next._2 }))
+		{
+			unconsume_char();
+			token = consume_ident_like_token();
+		}
+		else
+			// Otherwise, return a <delim-token> with its value set to the current input code point.
+			token.ch = ch;
+		break;
+
+	case '<':
+		// If the next 3 input code points are !--, consume them and return a <CDO-token>.
+		if (match(str, index, "!--"))
+		{
+			index += 3;
+			token.type = CDO;
+		}
+		else
+			// Otherwise, return a <delim-token> with its value set to the current input code point.
+			token.ch = ch;
+		break;
+
+	case '@':
+		// If the next 3 input code points would start an ident sequence, consume an ident sequence, 
+		// create an <at-keyword-token> with its value set to the returned value, and return it.
+		if (would_start_ident_sequence(peek_chars()))
+		{
+			token.type = AT_KEYWORD;
+			token.name = consume_ident_sequence();
+		}
+		else
+			// Otherwise, return a <delim-token> with its value set to the current input code point.
+			token.ch = ch;
+		break;
+
+	case '\\':
+		// If the input stream starts with a valid escape, reconsume the current input code point, 
+		// consume an ident-like token, and return it.
+		if (str[index] != '\n')
+		{
+			unconsume_char();
+			token = consume_ident_like_token();
+		}
+		else
+		{
+			// Otherwise, this is a parse error. Return a <delim-token> with its value set to the current input code point.
+			css_parse_error("escaped newline outside a string");
+			token.ch = ch;
+		}
+		break;
+
+	case 0: // EOF
+		token.type = _EOF;
+		break;
+
+	default:
+		if (is_digit(ch))
+		{
+			// Reconsume the current input code point, consume a numeric token, and return it.
+			unconsume_char();
+			token = consume_numeric_token();
+		}
+		else if (is_ident_start_code_point(ch))
+		{
+			// Reconsume the current input code point, consume an ident-like token, and return it.
+			unconsume_char();
+			token = consume_ident_like_token();
+		}
+		else // anything else
+			// Return a <delim-token> with its value set to the current input code point.
+			token.ch = ch; // NOTE: :;,()[]{} tokens are also handled here
+	}
+
+	token.repr = str.substr(start, index - start);
+	return token;
+}
+
+css_token_vector css_tokenizer::tokenize()
+{
+	css_token_vector tokens;
+	while (true)
+	{
+		css_token token = consume_token();
+		if (token.type == EOF) break;
+		tokens.push_back(token);
+	}
+	return tokens;
+}
+
+
+} // namespace litehtml