summary | refs | log | tree | commit | diff
path: root/libs/litehtml/src/css_tokenizer.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'libs/litehtml/src/css_tokenizer.cpp')
-rw-r--r-- libs/litehtml/src/css_tokenizer.cpp 724
1 files changed, 724 insertions, 0 deletions
diff --git a/libs/litehtml/src/css_tokenizer.cpp b/libs/litehtml/src/css_tokenizer.cpp
new file mode 100644
index 0000000000..b2b8761fa4
--- /dev/null
+++ b/libs/litehtml/src/css_tokenizer.cpp
@@ -0,0 +1,724 @@
+#include "html.h"
+#include "css_tokenizer.h"
+
+namespace litehtml
+{
+
+void css_parse_error(string /*msg*/)
+{
+ //printf("%s\n", msg.c_str());
+}
+
+// Returns the identifier held by an IDENT token; any other token type yields
+// an empty string. A name that begins with "--" is returned unchanged, every
+// other name is passed through lowcase().
+string css_token::ident() const
+{
+    if (type == IDENT)
+    {
+        const bool starts_with_dashes = name.substr(0, 2) == "--";
+        return starts_with_dashes ? name : lowcase(name);
+    }
+    return "";
+}
+
+
+// Maps an opening bracket ('{', '[' or '(') to its closing counterpart.
+// Every other character is returned as is.
+char mirror(char c)
+{
+    switch (c)
+    {
+    case '{': return '}';
+    case '[': return ']';
+    case '(': return ')';
+    default:  return c;
+    }
+}
+
+// Builds the textual representation of this token.
+//  - A token that is not a component value returns its stored repr.
+//  - A CV_FUNCTION returns name + '(' + contents + ')'.
+//  - Any other component value returns its contents wrapped in the matching
+//    pair of brackets.
+// insert_spaces is forwarded to the representation of the nested tokens.
+string css_token::get_repr(bool insert_spaces) const
+{
+    if (!is_component_value())
+        return repr;
+
+    // Qualified on purpose: inside this member function the plain name
+    // get_repr would refer to the member itself, not the namespace-level helper.
+    const string inner = litehtml::get_repr(value, 0, -1, insert_spaces);
+
+    if (type == CV_FUNCTION)
+        return name + '(' + inner + ')';
+
+    // The opening bracket character is recovered from the type code.
+    const char open = char(-type - 100);
+    const char close = mirror(open);
+    return open + inner + close;
+}
+
+// Concatenates the string representations of `count` tokens starting at
+// `index`. A count of -1 selects everything from `index` to the end of the
+// vector. When insert_spaces is true the tokens are joined with a single space
+// (no leading or trailing space); the flag is also forwarded to each token.
+string get_repr(const css_token_vector& tokens, int index, int count, bool insert_spaces)
+{
+    if (count == -1) count = (int)tokens.size() - index;
+
+    string str;
+    for (int i = index; i < index + count; i++)
+    {
+        // The separator goes *between* tokens instead of being appended after
+        // each one and trimmed afterwards, so an empty range never requires
+        // removing a character from an empty string.
+        if (insert_spaces && i != index) str += ' ';
+        str += tokens[i].get_repr(insert_spaces);
+    }
+    return str;
+}
+
+// https://www.w3.org/TR/css-syntax-3/#whitespace
+// True for newline, tab and space.
+// NOTE: \r and \f are converted to \n in filter_code_points, so they are not tested here.
+bool css_tokenizer::is_whitespace(int ch)
+{
+    switch (ch)
+    {
+    case '\n':
+    case '\t':
+    case ' ':
+        return true;
+    default:
+        return false;
+    }
+}
+
+// https://www.w3.org/TR/css-syntax-3/#non-printable-code-point
+// True for the code points 0x00-0x08, 0x0B, 0x0E-0x1F and 0x7F.
+bool css_tokenizer::is_non_printable_code_point(int ch)
+{
+    if (ch >= 0x00 && ch <= 0x08) return true;
+    if (ch == 0x0B) return true;
+    if (ch >= 0x0E && ch <= 0x1F) return true;
+    return ch == 0x7F;
+}
+
+// https://www.w3.org/TR/css-syntax-3/#ident-start-code-point
+// True for a letter, any code point >= 0x80, or an underscore.
+bool css_tokenizer::is_ident_start_code_point(int ch)
+{
+    if (is_letter(ch)) return true;
+    const bool non_ascii = ch >= 0x80;
+    return non_ascii || ch == '_';
+}
+
+// https://www.w3.org/TR/css-syntax-3/#ident-code-point
+// True for an ident-start code point, a digit, or a hyphen-minus.
+bool css_tokenizer::is_ident_code_point(int ch)
+{
+    if (is_ident_start_code_point(ch)) return true;
+    if (is_digit(ch)) return true;
+    return ch == '-';
+}
+
+
+// Consumes the next input code point, remembers it in current_char and returns it.
+// Callers that know the next input char is ASCII and not NUL may simply use
+// str[index++] instead.
+int css_tokenizer::consume_char()
+{
+    // NOTE: when str[index] == 0 the index is left where it is.
+    current_char = read_utf8_char(str, index);
+    return current_char;
+}
+
+// https://www.w3.org/TR/css-syntax-3/#reconsume-the-current-input-code-point
+// Steps the input position back by one code point. The specification calls
+// this "reconsume", but "unconsume" describes it better: the code point is
+// only read again by a later consume_char call.
+// Callers that know the current input char is ASCII and index != 0 may simply
+// use index-- instead.
+void css_tokenizer::unconsume_char()
+{
+    // A current_char of 0 is left alone (see the comment for current_char).
+    // NOTE: when index == 0 the index is not decremented.
+    if (current_char != 0)
+        prev_utf8_char(str, index);
+}
+
+// Returns the next input code point without consuming it: the read works on a
+// copy of the position, so index itself is untouched.
+int css_tokenizer::peek_char()
+{
+    int lookahead = index;
+    return read_utf8_char(str, lookahead);
+}
+
+// Returns the next three input code points without consuming them. The reads
+// advance a copy of the position, so index itself is untouched.
+css_tokenizer::three_chars css_tokenizer::peek_chars()
+{
+    int lookahead = index;
+    const int first  = read_utf8_char(str, lookahead);
+    const int second = read_utf8_char(str, lookahead);
+    const int third  = read_utf8_char(str, lookahead);
+
+    three_chars chars;
+    chars._1 = first;
+    chars._2 = second;
+    chars._3 = third;
+    return chars;
+}
+
+
+// https://www.w3.org/TR/css-syntax-3/#consume-comments
+// Skips every /* ... */ comment that starts at the current position, including
+// several comments in a row. An unterminated comment is a parse error and
+// moves the position to the end of the input.
+void css_tokenizer::consume_comments()
+{
+    while (str[index] == '/' && str[index + 1] == '*')
+    {
+        // Test the search result against npos before narrowing it to int,
+        // instead of relying on npos turning into -1 when cast.
+        const auto end = str.find("*/", index + 2);
+        if (end == string::npos)
+        {
+            index = (int)str.size();
+            css_parse_error("eof in comment");
+            return;
+        }
+        index = (int)end + 2; // continue right after the closing "*/"
+    }
+}
+
+// https://www.w3.org/TR/css-syntax-3/#consume-escaped-code-point
+// Precondition: the U+005C (\) has already been consumed and the next input
+// code point is not a newline
+// (see https://www.w3.org/TR/css-syntax-3/#check-if-two-code-points-are-a-valid-escape).
+// Returns the code point the escape stands for, or U+FFFD when the escape is
+// cut off by EOF or denotes zero, a surrogate or a value above U+10FFFF.
+int css_tokenizer::consume_escaped_code_point()
+{
+    const int first = consume_char();
+
+    if (!is_hex_digit(first))
+    {
+        // Any non-hex code point other than EOF simply stands for itself.
+        if (first != 0)
+            return first;
+
+        // EOF right after the backslash is a parse error.
+        css_parse_error("eof in escaped codepoint");
+        return 0xFFFD;
+    }
+
+    // Hex escape: one digit is already consumed, up to five more may follow.
+    int number = digit_value(first);
+    for (int remaining = 5; remaining > 0 && is_hex_digit(str[index]); remaining--)
+        number = number * 16 + digit_value(consume_char());
+
+    // A single whitespace directly after the digits belongs to the escape.
+    if (is_whitespace(str[index]))
+        consume_char();
+
+    const bool invalid = number == 0 || is_surrogate(number) || number > 0x10FFFF;
+    return invalid ? 0xFFFD : number;
+}
+
+// https://www.w3.org/TR/css-syntax-3/#consume-string-token
+// Consumes the rest of a string whose opening quote has already been consumed.
+// ending_code_point is the code point that terminates the string.
+// Returns a STRING token, or a BAD_STRING token when an unescaped newline is met.
+css_token css_tokenizer::consume_string_token(int ending_code_point)
+{
+    css_token token(STRING);
+
+    for (;;)
+    {
+        const int ch = consume_char();
+
+        if (ch == 0)
+        {
+            // EOF: parse error, but whatever was collected so far is returned.
+            css_parse_error("eof in string");
+            return token;
+        }
+
+        if (ch == '\n')
+        {
+            // Unescaped newline: parse error. The newline is put back and a
+            // <bad-string-token> is produced.
+            css_parse_error("newline in string");
+            unconsume_char();
+            return {BAD_STRING};
+        }
+
+        if (ch == '\\')
+        {
+            if (str[index] == '\n')
+                index++; // escaped newline: consumed, contributes nothing
+            else if (str[index] != 0)
+                append_char(token.str, consume_escaped_code_point()); // valid escape
+            // Backslash directly before EOF: do nothing.
+            continue;
+        }
+
+        if (ch == ending_code_point)
+            return token;
+
+        // Anything else becomes part of the string value.
+        append_char(token.str, ch);
+    }
+}
+
+// https://www.w3.org/TR/css-syntax-3/#would-start-an-identifier
+// Tells whether the three given code points would start an ident sequence.
+bool css_tokenizer::would_start_ident_sequence(three_chars chars)
+{
+    const int first  = chars._1;
+    const int second = chars._2;
+    const int third  = chars._3;
+
+    if (first == '-')
+    {
+        // After a hyphen: an ident-start code point, another hyphen, or a
+        // valid escape (backslash not followed by a newline).
+        if (is_ident_start_code_point(second) || second == '-')
+            return true;
+        return second == '\\' && third != '\n';
+    }
+
+    if (is_ident_start_code_point(first))
+        return true;
+
+    // Otherwise only a valid escape in the first two code points qualifies.
+    return first == '\\' && second != '\n';
+}
+
+// https://www.w3.org/TR/css-syntax-3/#consume-name
+// Consumes the longest run of ident code points and valid escapes and returns
+// it as a string. The code point that ends the run is put back.
+string css_tokenizer::consume_ident_sequence()
+{
+    string result;
+
+    for (;;)
+    {
+        const int ch = consume_char();
+
+        if (is_ident_code_point(ch))
+        {
+            append_char(result, ch);
+            continue;
+        }
+
+        // "The stream starts with a valid escape": ch has already been
+        // consumed here, so only the code point after the backslash is checked.
+        if (ch == '\\' && str[index] != '\n')
+        {
+            append_char(result, consume_escaped_code_point());
+            continue;
+        }
+
+        break;
+    }
+
+    // Put back the code point that terminated the sequence.
+    unconsume_char();
+    return result;
+}
+
+// https://www.w3.org/TR/css-syntax-3/#starts-with-a-number
+// Tells whether the code points x, y, z (in this order) would start a number.
+bool css_tokenizer::would_start_a_number(int x, int y, int z)
+{
+    // Sign: a digit must follow, either directly or after a '.'.
+    if (x == '+' || x == '-')
+        return is_digit(y) || (y == '.' && is_digit(z));
+
+    // Leading '.': the digit is expected in second position; otherwise the
+    // first code point itself has to be a digit.
+    return is_digit(x == '.' ? y : x);
+}
+
+// https://www.w3.org/TR/css-syntax-3/#convert-string-to-number
+// Converts the textual form of a number to a double. The text is read as seven
+// consecutive components, each of which may be empty:
+//   sign, integer part, decimal point, fractional part,
+//   exponent indicator, exponent sign, exponent.
+// The result is sign * (integer + fraction * 10^-digits) * 10^(exp_sign * exponent).
+double css_tokenizer::convert_string_to_number(const string& str)
+{
+    const char* cur = str.c_str();
+
+    // 1. Sign: '+' or '-' or nothing.
+    double sign = 1;
+    if (*cur == '-')
+    {
+        sign = -1;
+        cur++;
+    }
+    else if (*cur == '+')
+    {
+        cur++;
+    }
+
+    // 2. Integer part: zero or more digits, base 10.
+    double integer = 0;
+    for (; is_digit(*cur); cur++)
+        integer = integer * 10 + digit_value(*cur);
+
+    // 3. Decimal point.
+    if (*cur == '.') cur++;
+
+    // 4. Fractional part: its digits read as an integer, plus the digit count.
+    double fraction = 0;
+    double fraction_digits = 0;
+    for (; is_digit(*cur); cur++, fraction_digits++)
+        fraction = fraction * 10 + digit_value(*cur);
+
+    // 5. Exponent indicator.
+    if (*cur == 'e' || *cur == 'E') cur++;
+
+    // 6. Exponent sign.
+    double exponent_sign = 1;
+    if (*cur == '-')
+    {
+        exponent_sign = -1;
+        cur++;
+    }
+    else if (*cur == '+')
+    {
+        cur++;
+    }
+
+    // 7. Exponent: zero or more digits, base 10.
+    double exponent = 0;
+    for (; is_digit(*cur); cur++)
+        exponent = exponent * 10 + digit_value(*cur);
+
+    return sign * (integer + fraction * pow(10, -fraction_digits)) * pow(10, exponent_sign * exponent);
+}
+
+// https://www.w3.org/TR/css-syntax-3/#consume-number
+// Consumes a number from the input and returns its value.
+// type is an output: css_number_integer unless a fractional part or an
+// exponent was consumed, in which case it becomes css_number_number.
+double css_tokenizer::consume_number(css_number_type& type)
+{
+    type = css_number_integer;
+    string repr;
+
+    // Consume one char (known to be ASCII at every call site) and append it to repr.
+    auto take = [&] { append_char(repr, str[index++]); };
+    // Consume a run of digits.
+    auto take_digits = [&] { while (is_digit(str[index])) take(); };
+
+    // Optional sign.
+    if (is_one_of(str[index], '+', '-'))
+        take();
+
+    // Integer digits.
+    take_digits();
+
+    // Fraction: '.' followed by at least one digit.
+    if (str[index] == '.' && is_digit(str[index + 1]))
+    {
+        take(); // '.'
+        take(); // first fractional digit
+        type = css_number_number;
+        take_digits();
+    }
+
+    // Exponent: 'e' or 'E', an optional sign, then at least one digit.
+    const bool has_e = lowcase(str[index]) == 'e';
+    const bool signed_exponent = has_e && is_one_of(str[index + 1], '+', '-') && is_digit(str[index + 2]);
+    const bool plain_exponent = has_e && is_digit(str[index + 1]);
+
+    if (signed_exponent || plain_exponent)
+    {
+        take(); // 'e' / 'E'
+        take(); // sign or first digit
+        if (signed_exponent) take(); // first digit after the sign
+        type = css_number_number;
+        take_digits();
+    }
+
+    return convert_string_to_number(repr);
+}
+
+// https://www.w3.org/TR/css-syntax-3/#consume-numeric-token
+// Consumes a number and whatever follows it directly, producing
+//  - a DIMENSION token when an ident sequence (the unit) follows,
+//  - a PERCENTAGE token when a '%' follows,
+//  - a NUMBER token otherwise.
+css_token css_tokenizer::consume_numeric_token()
+{
+    css_number_type type;
+    const float number = (float)consume_number(type);
+
+    if (would_start_ident_sequence(peek_chars()))
+    {
+        // Same value and type flag as the number; the unit is the ident sequence.
+        css_token dimension(DIMENSION, number, type);
+        dimension.unit = consume_ident_sequence();
+        return dimension;
+    }
+
+    if (str[index] != '%')
+        return {NUMBER, number, type};
+
+    index++; // consume '%'
+    return {PERCENTAGE, number}; // NOTE: number_type is unused in <percentage-token>
+}
+
+// https://www.w3.org/TR/css-syntax-3/#consume-remnants-of-bad-url
+// Discards input up to and including the next ')' or up to EOF, so that
+// tokenizing can resume after a malformed url(...). A valid escape is consumed
+// as a unit, so an escaped ')' does not end the url.
+void css_tokenizer::consume_remnants_of_bad_url()
+{
+    for (int ch = consume_char(); ch != ')' && ch != 0; ch = consume_char())
+    {
+        // "The input stream starts with a valid escape": ch has already been
+        // consumed, so only the code point after the backslash is checked.
+        if (ch == '\\' && str[index] != '\n')
+            consume_escaped_code_point();
+        // Anything else is dropped.
+    }
+}
+
+// https://www.w3.org/TR/css-syntax-3/#consume-url-token
+// Consumes an unquoted url; the caller has already consumed "url(".
+// Returns a URL token whose str holds the url, or a BAD_URL token when the
+// contents are malformed (the rest of the url is then discarded).
+css_token css_tokenizer::consume_url_token()
+{
+    css_token token(URL);
+
+    auto skip_whitespace = [&] { while (is_whitespace(str[index])) index++; };
+
+    // Discards the rest of the malformed url and yields a <bad-url-token>.
+    auto bad_url = [&]() -> css_token
+    {
+        consume_remnants_of_bad_url();
+        return {BAD_URL};
+    };
+
+    // Same as bad_url, but reports the offending character first.
+    auto invalid_char = [&]() -> css_token
+    {
+        css_parse_error("invalid char in unquoted url");
+        return bad_url();
+    };
+
+    // Leading whitespace is not part of the url.
+    skip_whitespace();
+
+    for (;;)
+    {
+        const int ch = consume_char();
+
+        if (ch == ')')
+            return token;
+
+        if (ch == 0)
+        {
+            // EOF: parse error, but the url collected so far is returned.
+            css_parse_error("eof in unquoted url");
+            return token;
+        }
+
+        if (ch == '\n' || ch == '\t' || ch == ' ')
+        {
+            // Whitespace is only allowed as trailing whitespace, i.e. it must
+            // be followed by ')' or EOF.
+            skip_whitespace();
+            if (str[index] == ')')
+            {
+                index++; // consume ')'
+                return token;
+            }
+            if (str[index] == 0)
+            {
+                css_parse_error("eof in unquoted url");
+                return token;
+            }
+            return bad_url();
+        }
+
+        if (ch == '"' || ch == '\'' || ch == '(')
+            return invalid_char();
+
+        if (ch == '\\')
+        {
+            if (str[index] == '\n')
+            {
+                // Backslash-newline is not a valid escape.
+                css_parse_error("escaped newline in unquoted url");
+                return bad_url();
+            }
+            append_char(token.str, consume_escaped_code_point());
+            continue;
+        }
+
+        if (is_non_printable_code_point(ch))
+            return invalid_char();
+
+        // Anything else becomes part of the url.
+        append_char(token.str, ch);
+    }
+}
+
+
+// https://www.w3.org/TR/css-syntax-3/#consume-ident-like-token
+// Consumes an ident sequence and classifies what it starts:
+//  - IDENT when no '(' follows,
+//  - FUNCTION when '(' follows (including url( with a quoted argument),
+//  - the result of consume_url_token() for url( with an unquoted argument.
+css_token css_tokenizer::consume_ident_like_token()
+{
+    auto name = consume_ident_sequence();
+
+    if (str[index] != '(')
+        return {IDENT, name};
+
+    index++; // consume '('
+
+    // Names other than "url" (ASCII case-insensitive) are ordinary functions.
+    if (lowcase(name) != "url")
+        return {FUNCTION, name};
+
+    // Skip all whitespace instead of looking ahead at two code points;
+    // see the comment below on how the difference is compensated.
+    while (is_whitespace(str[index]))
+        index++;
+
+    if (!is_one_of(str[index], '"', '\''))
+        return consume_url_token();
+
+    // Quoted argument: url( is treated as a regular function. This is not
+    // literally what the standard says, but equivalent: stepping back by one
+    // code point preserves a whitespace token before the string.
+    if (is_whitespace(str[index - 1]))
+        index--;
+    return {FUNCTION, name};
+}
+
+// https://www.w3.org/TR/css-syntax-3/#consume-token
+// Skips comments, then consumes and returns the next token. The token's repr
+// is set to the exact slice of the input the token was read from.
+css_token css_tokenizer::consume_token()
+{
+    consume_comments();
+
+    css_token token;
+    const int start = index;
+    const int ch = consume_char();
+
+    // <delim-token> whose value is the current input code point.
+    auto as_delim = [&] { token.ch = ch; };
+    // Put the current code point back and read a numeric token.
+    auto as_numeric = [&]
+    {
+        unconsume_char();
+        token = consume_numeric_token();
+    };
+    // Put the current code point back and read an ident-like token.
+    auto as_ident_like = [&]
+    {
+        unconsume_char();
+        token = consume_ident_like_token();
+    };
+
+    switch (ch)
+    {
+    case '\n':
+    case '\t':
+    case ' ':
+        // A whole run of whitespace collapses into one <whitespace-token>.
+        while (is_whitespace(str[index]))
+            index++;
+        token.type = WHITESPACE;
+        break;
+
+    case '"':
+    case '\'':
+        token = consume_string_token(ch);
+        break;
+
+    case '#':
+    {
+        // A hash needs an ident code point or a valid escape right after '#'.
+        const bool name_follows =
+            is_ident_code_point(peek_char()) || (str[index] == '\\' && str[index + 1] != '\n');
+        if (!name_follows)
+        {
+            as_delim();
+            break;
+        }
+        token.type = HASH;
+        // The type flag is "id" when the name would also be a valid ident sequence.
+        token.hash_type = would_start_ident_sequence(peek_chars()) ? css_hash_id : css_hash_unrestricted;
+        token.name = consume_ident_sequence();
+        break;
+    }
+
+    case '+':
+    case '.':
+    {
+        const three_chars next = peek_chars();
+        if (would_start_a_number(ch, next._1, next._2))
+            as_numeric();
+        else
+            as_delim();
+        break;
+    }
+
+    case '-':
+    {
+        const three_chars next = peek_chars();
+        if (would_start_a_number(ch, next._1, next._2))
+        {
+            as_numeric();
+        }
+        else if (next._1 == '-' && next._2 == '>')
+        {
+            // "-->" is a <CDC-token>; the two remaining chars are consumed here.
+            index += 2;
+            token.type = CDC;
+        }
+        else if (would_start_ident_sequence({ ch, next._1, next._2 }))
+        {
+            as_ident_like();
+        }
+        else
+        {
+            as_delim();
+        }
+        break;
+    }
+
+    case '<':
+        // "<!--" is a <CDO-token>.
+        if (match(str, index, "!--"))
+        {
+            index += 3;
+            token.type = CDO;
+        }
+        else
+        {
+            as_delim();
+        }
+        break;
+
+    case '@':
+        // '@' followed by an ident sequence is an <at-keyword-token>.
+        if (would_start_ident_sequence(peek_chars()))
+        {
+            token.type = AT_KEYWORD;
+            token.name = consume_ident_sequence();
+        }
+        else
+        {
+            as_delim();
+        }
+        break;
+
+    case '\\':
+        // A valid escape starts an ident-like token; backslash-newline is a
+        // parse error and yields a <delim-token>.
+        if (str[index] != '\n')
+        {
+            as_ident_like();
+        }
+        else
+        {
+            css_parse_error("escaped newline outside a string");
+            as_delim();
+        }
+        break;
+
+    case 0: // EOF
+        token.type = _EOF;
+        break;
+
+    default:
+        if (is_digit(ch))
+            as_numeric();
+        else if (is_ident_start_code_point(ch))
+            as_ident_like();
+        else
+            as_delim(); // NOTE: :;,()[]{} tokens are also handled here
+    }
+
+    token.repr = str.substr(start, index - start);
+    return token;
+}
+
+// Tokenizes the whole input and returns the tokens in source order.
+// The terminating end-of-file token is not included in the result.
+css_token_vector css_tokenizer::tokenize()
+{
+    css_token_vector tokens;
+    while (true)
+    {
+        css_token token = consume_token();
+        // Compare with the _EOF enumerator, the value consume_token() assigns
+        // at end of input, rather than with the stdio EOF macro.
+        if (token.type == _EOF) break;
+        tokens.push_back(token);
+    }
+    return tokens;
+}
+
+
+} // namespace litehtml