diff options
Diffstat (limited to 'libs/litehtml/src/css_tokenizer.cpp')
-rw-r--r-- | libs/litehtml/src/css_tokenizer.cpp | 724 |
1 files changed, 724 insertions, 0 deletions
diff --git a/libs/litehtml/src/css_tokenizer.cpp b/libs/litehtml/src/css_tokenizer.cpp new file mode 100644 index 0000000000..b2b8761fa4 --- /dev/null +++ b/libs/litehtml/src/css_tokenizer.cpp @@ -0,0 +1,724 @@ +#include "html.h" +#include "css_tokenizer.h" + +namespace litehtml +{ + +void css_parse_error(string /*msg*/) +{ + //printf("%s\n", msg.c_str()); +} + +string css_token::ident() const +{ + if (type != IDENT) return ""; + return name.substr(0, 2) == "--" ? name : lowcase(name); +} + + +char mirror(char c) +{ + if (c == '{') return '}'; + if (c == '[') return ']'; + if (c == '(') return ')'; + return c; +} + +string css_token::get_repr(bool insert_spaces) const +{ + if (!is_component_value()) return repr; + + using litehtml::get_repr; + if (type == CV_FUNCTION) return name + '(' + get_repr(value, 0, -1, insert_spaces) + ')'; + + char opening_bracket = char(-type - 100); + char closing_bracket = mirror(opening_bracket); + return opening_bracket + get_repr(value, 0, -1, insert_spaces) + closing_bracket; +} + +// concatenate string representations of tokens +string get_repr(const css_token_vector& tokens, int index, int count, bool insert_spaces) +{ + if (count == -1) count = (int)tokens.size() - index; + string str; + string space = insert_spaces ? " " : ""; + for (int i = index; i < index + count; i++) + { + str += tokens[i].get_repr(insert_spaces) + space; + } + if (insert_spaces) remove(str, -1); + return str; +} + +// https://www.w3.org/TR/css-syntax-3/#whitespace +bool css_tokenizer::is_whitespace(int ch) { + // NOTE: \r and \f are converted to \n in filter_code_points + return ch == '\n' || ch == '\t' || ch == ' '; +} + +// https://www.w3.org/TR/css-syntax-3/#non-printable-code-point +bool css_tokenizer::is_non_printable_code_point(int ch) { + return (ch >= 0 && ch <= 8) || ch == 0xB || (ch >= 0xE && ch <= 0x1F) || ch == 0x7F; +} + +// https://www.w3.org/TR/css-syntax-3/#ident-start-code-point +bool css_tokenizer::is_ident_start_code_point(int ch) { + return is_letter(ch) || ch >= 0x80 || ch == '_'; +} + +// https://www.w3.org/TR/css-syntax-3/#ident-code-point +bool css_tokenizer::is_ident_code_point(int ch) { + return is_ident_start_code_point(ch) || is_digit(ch) || ch == '-'; +} + + +// Consume the next input code point. Return the current input code point. +// When we know that next input char is ASCII and not NUL, we can just write str[index++] instead. +int css_tokenizer::consume_char() +{ + // NOTE: if str[index] == 0 index is not incremented + return current_char = read_utf8_char(str, index); +} + +// https://www.w3.org/TR/css-syntax-3/#reconsume-the-current-input-code-point +// "reconsume" is not a good name - it should be called unconsume (the char will actually be reconsumed later when consume_char is called). +// When we know that current input char is ASCII and index != 0, we can just write index-- instead. +void css_tokenizer::unconsume_char() +{ + // see comment for current_char + if (current_char == 0) + return; + + // NOTE: if index == 0 index is not decremented + prev_utf8_char(str, index); +} + +int css_tokenizer::peek_char() +{ + int i = index; + return read_utf8_char(str, i); +} + +css_tokenizer::three_chars css_tokenizer::peek_chars() +{ + three_chars chars; + int i = index; + chars._1 = read_utf8_char(str, i); + chars._2 = read_utf8_char(str, i); + chars._3 = read_utf8_char(str, i); + return chars; +} + + +// https://www.w3.org/TR/css-syntax-3/#consume-comments +void css_tokenizer::consume_comments() +{ + while (true) + { + if (str[index] == '/' && str[index + 1] == '*') + { + int i = (int)str.find("*/", index + 2); + + if (i != -1) + index = i + 2; + else + { + index = (int)str.size(); + css_parse_error("eof in comment"); + break; + } + } + else + break; + } +} + +// https://www.w3.org/TR/css-syntax-3/#consume-escaped-code-point +// It assumes that the U+005C (\) has already been consumed and that the next input code point +// is not a newline (see https://www.w3.org/TR/css-syntax-3/#check-if-two-code-points-are-a-valid-escape). +int css_tokenizer::consume_escaped_code_point() +{ + // Consume the next input code point. + int ch = consume_char(); + + if (is_hex_digit(ch)) + { + int number = digit_value(ch); + // Consume as many hex digits as possible, but no more than 5. + int max = 5; + while (max-- > 0 && is_hex_digit(str[index])) + { + ch = consume_char(); + number = number * 16 + digit_value(ch); + } + // If the next input code point is whitespace, consume it as well. + if (is_whitespace(str[index])) + consume_char(); + // If this number is zero, or is for a surrogate, or is greater than the maximum allowed code point + if (number == 0 || is_surrogate(number) || number > 0x10FFFF) + return 0xFFFD; + // Otherwise, return the code point with that value. + return number; + } + else if (ch == 0) // EOF + { + // This is a parse error. Return U+FFFD. + css_parse_error("eof in escaped codepoint"); + return 0xFFFD; + } + else // anything else + // Return the current input code point. + return ch; +} + +// https://www.w3.org/TR/css-syntax-3/#consume-string-token +css_token css_tokenizer::consume_string_token(int ending_code_point) +{ + // Initially create a <string-token> with its value set to the empty string. + css_token token(STRING); + + while (true) + { + // Repeatedly consume the next input code point from the stream: + int ch = consume_char(); + switch (ch) + { + case 0: // EOF + // This is a parse error. Return the <string-token>. + css_parse_error("eof in string"); + return token; + case '\n': + // This is a parse error. Reconsume the current input code point, create a <bad-string-token>, and return it. + css_parse_error("newline in string"); + unconsume_char(); + return {BAD_STRING}; + case '\\': + // If the next input code point is EOF, do nothing. + if (str[index] == 0) + break; + // Otherwise, if the next input code point is a newline, consume it. + else if (str[index] == '\n') + index++; + // Otherwise, (the stream starts with a valid escape) consume an escaped code point and + // append the returned code point to the <string-token>’s value. + else + append_char(token.str, consume_escaped_code_point()); + break; + default: + if (ch == ending_code_point) + return token; + else // anything else + // Append the current input code point to the <string-token>’s value. + append_char(token.str, ch); + break; + } + } +} + +// https://www.w3.org/TR/css-syntax-3/#would-start-an-identifier +bool css_tokenizer::would_start_ident_sequence(three_chars chars) +{ + int c1 = chars._1; + int c2 = chars._2; + int c3 = chars._3; + + if (c1 == '-') + { + // If the second code point is an ident-start code point or a U+002D HYPHEN-MINUS, or + // the second and third code points are a valid escape, return true. Otherwise, return false. + return is_ident_start_code_point(c2) || c2 == '-' || (c2 == '\\' && c3 != '\n'); + } + else if (is_ident_start_code_point(c1)) + return true; + else if (c1 == '\\') + // If the first and second code points are a valid escape, return true. Otherwise, return false. + return c2 != '\n'; + else + return false; +} + +// https://www.w3.org/TR/css-syntax-3/#consume-name +string css_tokenizer::consume_ident_sequence() +{ + string result; + + while (true) + { + // Repeatedly consume the next input code point from the stream: + int ch = consume_char(); + + if (is_ident_code_point(ch)) + append_char(result, ch); // Append the code point to result. + + // else if the stream starts with a valid escape + // NOTE: the wording is confusing because ch is not in the input stream anymore (it has been consumed) + else if (ch == '\\' && str[index] != '\n') + // Consume an escaped code point. Append the returned code point to result. + append_char(result, consume_escaped_code_point()); + + else + { + // Reconsume the current input code point. Return result. + unconsume_char(); + return result; + } + } +} + +// https://www.w3.org/TR/css-syntax-3/#starts-with-a-number +bool css_tokenizer::would_start_a_number(int x, int y, int z) +{ + if (x == '+' || x == '-') + { + // If the second code point is a digit, return true. + if (is_digit(y)) return true; + // Otherwise, if the second code point is a U+002E (.) and the third code point is a digit, return true. + else if (y == '.' && is_digit(z)) return true; + // Otherwise, return false. + else return false; + } + else if (x == '.') + // If the second code point is a digit, return true. Otherwise, return false. + return is_digit(y); + else + return is_digit(x); +} + +// https://www.w3.org/TR/css-syntax-3/#convert-string-to-number +double css_tokenizer::convert_string_to_number(const string& str) +{ + const char* p = str.c_str(); + + // Divide the string into seven components, in order from left to right: + + // 1. A sign: a single U+002B (+) or U+002D (-), or the empty string. + // Let s be the number -1 if the sign is U+002D (-); otherwise, let s be the number 1. + double s = 1; + if (*p == '-') s = -1, p++; + else if (*p == '+') p++; + + // 2. An integer part: zero or more digits. If there is at least one digit, let i be the number formed by + // interpreting the digits as a base-10 integer; otherwise, let i be the number 0. + double i = 0; + while (is_digit(*p)) i = i * 10 + digit_value(*p++); + + // 3. A decimal point: a single U+002E (.), or the empty string. + if (*p == '.') p++; + + // 4. A fractional part: zero or more digits. If there is at least one digit, let f be the number formed by + // interpreting the digits as a base-10 integer and d be the number of digits; + // otherwise, let f and d be the number 0. + double f = 0, d = 0; + while (is_digit(*p)) f = f * 10 + digit_value(*p++), d++; + + // 5. An exponent indicator: a single U+0045 (E) or U+0065 (e), or the empty string. + if (*p == 'e' || *p == 'E') p++; + + // 6. An exponent sign: a single U+002B (+) or U+002D (-), or the empty string. + // Let t be the number -1 if the sign is U+002D (-); otherwise, let t be the number 1. + double t = 1; + if (*p == '-') t = -1, p++; + else if (*p == '+') p++; + + // 7. An exponent: zero or more digits. If there is at least one digit, let e be the number formed by + // interpreting the digits as a base-10 integer; otherwise, let e be the number 0. + double e = 0; + while (is_digit(*p)) e = e * 10 + digit_value(*p++); + + // Return the number s·(i + f·10ᐨᵈ)·10ᵗᵉ. + return s * (i + f * pow(10, -d)) * pow(10, t * e); +} + +// https://www.w3.org/TR/css-syntax-3/#consume-number +double css_tokenizer::consume_number(css_number_type& type) +{ + // 1. Initially set type to "integer". Let repr be the empty string. + type = css_number_integer; + string repr; + + // 2. If the next input code point is U+002B (+) or U+002D (-), consume it and append it to repr. + if (is_one_of(str[index], '+', '-')) + append_char(repr, str[index++]); + + // 3. While the next input code point is a digit, consume it and append it to repr. + while (is_digit(str[index])) + append_char(repr, str[index++]); + + // 4. If the next 2 input code points are U+002E (.) followed by a digit, then: + if (str[index] == '.' && is_digit(str[index+1])) + { + // 1. Consume them. + // 2. Append them to repr. + append_char(repr, str[index++]); + append_char(repr, str[index++]); + // 3. Set type to "number". + type = css_number_number; + // 4. While the next input code point is a digit, consume it and append it to repr. + while (is_digit(str[index])) + append_char(repr, str[index++]); + } + + // 5. If the next 2 or 3 input code points are U+0045 (E) or U+0065 (e), + // optionally followed by U+002D (-) or U+002B (+), followed by a digit, then: + bool a = lowcase(str[index]) == 'e' && is_one_of(str[index+1], '+', '-') && is_digit(str[index+2]); + bool b = lowcase(str[index]) == 'e' && is_digit(str[index+1]); + + if (a || b) + { + // 1. Consume them. + // 2. Append them to repr. + append_char(repr, str[index++]); + append_char(repr, str[index++]); + if (a) append_char(repr, str[index++]); + // 3. Set type to "number". + type = css_number_number; + // 4. While the next input code point is a digit, consume it and append it to repr. + while (is_digit(str[index])) + append_char(repr, str[index++]); + } + + // 6. Convert repr to a number, and set the value to the returned value. + double value = convert_string_to_number(repr); + + // 7. Return value and type. + return value; +} + +// https://www.w3.org/TR/css-syntax-3/#consume-numeric-token +css_token css_tokenizer::consume_numeric_token() +{ + // Consume a number and let number be the result. + css_number_type type; + float number = (float)consume_number(type); + + // If the next 3 input code points would start an ident sequence, then: + if (would_start_ident_sequence(peek_chars())) + { + // 1. Create a <dimension-token> with the same value and type flag as number, and + // a unit set initially to the empty string. + css_token token(DIMENSION, number, type); + + // 2. Consume an ident sequence. Set the <dimension-token>’s unit to the returned value. + token.unit = consume_ident_sequence(); + + // 3. Return the <dimension-token>. + return token; + } + + // Otherwise, if the next input code point is U+0025 (%), consume it. + // Create a <percentage-token> with the same value as number, and return it. + if (str[index] == '%') + { + index++; + return {PERCENTAGE, number}; // NOTE: number_type is unused in <percentage-token> + } + + // Otherwise, create a <number-token> with the same value and type flag as number, and return it. + return {NUMBER, number, type}; +} + +// https://www.w3.org/TR/css-syntax-3/#consume-remnants-of-bad-url +void css_tokenizer::consume_remnants_of_bad_url() +{ + while (true) + { + // Repeatedly consume the next input code point from the stream: + int ch = consume_char(); + if (ch == ')' || ch == 0) // ')' or EOF + return; + // else if the input stream starts with a valid escape + // NOTE: the wording is confusing because ch is not in the input stream anymore (it has been consumed) + else if (ch == '\\' && str[index] != '\n') + { + consume_escaped_code_point(); + } + // anything else: Do nothing. + } +} + +// https://www.w3.org/TR/css-syntax-3/#consume-url-token +css_token css_tokenizer::consume_url_token() +{ + // Initially create a <url-token> with its value set to the empty string. + css_token token(URL); + + // Consume as much whitespace as possible. + while (is_whitespace(str[index])) + index++; + + while (true) + { + // Repeatedly consume the next input code point from the stream: + int ch = consume_char(); + switch (ch) + { + case ')': + // Return the <url-token>. + return token; + + case 0: // EOF + // This is a parse error. Return the <url-token>. + css_parse_error("eof in unquoted url"); + return token; + + case '\n': + case '\t': + case ' ': + // Consume as much whitespace as possible. + while (is_whitespace(str[index])) + index++; + // If the next input code point is U+0029 ()) or EOF, consume it and return the <url-token> + // (if EOF was encountered, this is a parse error); + if (str[index] == ')' || str[index] == 0) + { + if (str[index] == 0) + css_parse_error("eof in unquoted url"); + else + index++; // consume ')' + return token; + } + // otherwise, consume the remnants of a bad url, create a <bad-url-token>, and return it. + consume_remnants_of_bad_url(); + return {BAD_URL}; + + case '"': + case '\'': + case '(': + bad_url: + // This is a parse error. Consume the remnants of a bad url, create a <bad-url-token>, and return it. + css_parse_error("invalid char in unquoted url"); + consume_remnants_of_bad_url(); + return {BAD_URL}; + + case '\\': + // If the stream starts with a valid escape, consume an escaped code point and + // append the returned code point to the <url-token>’s value. + if (str[index] != '\n') + append_char(token.str, consume_escaped_code_point()); + // Otherwise, this is a parse error. Consume the remnants of a bad url, create a <bad-url-token>, and return it. + else + { + css_parse_error("escaped newline in unquoted url"); + consume_remnants_of_bad_url(); + return {BAD_URL}; + } + break; + + default: + if (is_non_printable_code_point(ch)) + goto bad_url; + else // anything else + // Append the current input code point to the <url-token>’s value. + append_char(token.str, ch); + break; + } + } +} + + +// https://www.w3.org/TR/css-syntax-3/#consume-ident-like-token +css_token css_tokenizer::consume_ident_like_token() +{ + // Consume an ident sequence, and let string be the result. + auto string = consume_ident_sequence(); + + // If string’s value is an ASCII case-insensitive match for "url", and the next input code point is + // U+0028 ((), consume it. + if (lowcase(string) == "url" && str[index] == '(') + { + index++; // consume '(' + + while (is_whitespace(str[index])) // not looking for 2 spaces, see next comment + index++; + + if (is_one_of(str[index], '"', '\'')) + { + // This is not exactly what standard says, but equivalent. The purpose is to preserve a whitespace token. + if (is_whitespace(str[index-1])) index--; + return {FUNCTION, string}; + } + else // Otherwise, consume a url token, and return it. + { + return consume_url_token(); + } + } + + // Otherwise, if the next input code point is U+0028 ((), consume it. + // Create a <function-token> with its value set to string and return it. + else if (str[index] == '(') + { + index++; + return {FUNCTION, string}; + } + + // Otherwise, create an <ident-token> with its value set to string and return it. + return {IDENT, string}; +} + +// https://www.w3.org/TR/css-syntax-3/#consume-token +css_token css_tokenizer::consume_token() +{ + consume_comments(); + + css_token token; + int start = index; + + // Consume the next input code point. + int ch = consume_char(); + three_chars next; + + switch (ch) + { + // whitespace + case '\n': + case '\t': + case ' ': + // Consume as much whitespace as possible. Return a <whitespace-token>. + while (is_whitespace(str[index])) + index++; + token.type = WHITESPACE; + break; + + case '"': + case '\'': + token = consume_string_token(ch); + break; + + case '#': + // If the next input code point is an ident code point or the next two input code points are a valid escape, then: + if (is_ident_code_point(peek_char()) || (str[index] == '\\' && str[index+1] != '\n')) + { + // 1. Create a <hash-token>. + token.type = HASH; + // 2. If the next 3 input code points would start an ident sequence, set the <hash-token>’s type flag to "id". + token.hash_type = would_start_ident_sequence(peek_chars()) ? css_hash_id : css_hash_unrestricted; + // 3. Consume an ident sequence, and set the <hash-token>’s value to the returned string. + token.name = consume_ident_sequence(); + // 4. Return the <hash-token>. + } + else + // Otherwise, return a <delim-token> with its value set to the current input code point. + token.ch = ch; + break; + + case '+': + case '.': + // If the input stream starts with a number, reconsume the current input code point, consume a numeric token, and return it. + next = peek_chars(); + if (would_start_a_number(ch, next._1, next._2)) + { + unconsume_char(); + token = consume_numeric_token(); + } + else + // Otherwise, return a <delim-token> with its value set to the current input code point. + token.ch = ch; + break; + + case '-': + // If the input stream starts with a number, reconsume the current input code point, consume a numeric token, and return it. + next = peek_chars(); + if (would_start_a_number(ch, next._1, next._2)) + { + unconsume_char(); + token = consume_numeric_token(); + } + // Otherwise, if the next 2 input code points are U+002D U+003E (->), consume them and return a <CDC-token>. + else if (next._1 == '-' && next._2 == '>') + { + index += 2; + token.type = CDC; + } + // Otherwise, if the input stream starts with an ident sequence, reconsume the current input code point, + // consume an ident-like token, and return it. + else if (would_start_ident_sequence({ ch, next._1, next._2 })) + { + unconsume_char(); + token = consume_ident_like_token(); + } + else + // Otherwise, return a <delim-token> with its value set to the current input code point. + token.ch = ch; + break; + + case '<': + // If the next 3 input code points are !--, consume them and return a <CDO-token>. + if (match(str, index, "!--")) + { + index += 3; + token.type = CDO; + } + else + // Otherwise, return a <delim-token> with its value set to the current input code point. + token.ch = ch; + break; + + case '@': + // If the next 3 input code points would start an ident sequence, consume an ident sequence, + // create an <at-keyword-token> with its value set to the returned value, and return it. + if (would_start_ident_sequence(peek_chars())) + { + token.type = AT_KEYWORD; + token.name = consume_ident_sequence(); + } + else + // Otherwise, return a <delim-token> with its value set to the current input code point. + token.ch = ch; + break; + + case '\\': + // If the input stream starts with a valid escape, reconsume the current input code point, + // consume an ident-like token, and return it. + if (str[index] != '\n') + { + unconsume_char(); + token = consume_ident_like_token(); + } + else + { + // Otherwise, this is a parse error. Return a <delim-token> with its value set to the current input code point. + css_parse_error("escaped newline outside a string"); + token.ch = ch; + } + break; + + case 0: // EOF + token.type = _EOF; + break; + + default: + if (is_digit(ch)) + { + // Reconsume the current input code point, consume a numeric token, and return it. + unconsume_char(); + token = consume_numeric_token(); + } + else if (is_ident_start_code_point(ch)) + { + // Reconsume the current input code point, consume an ident-like token, and return it. + unconsume_char(); + token = consume_ident_like_token(); + } + else // anything else + // Return a <delim-token> with its value set to the current input code point. + token.ch = ch; // NOTE: :;,()[]{} tokens are also handled here + } + + token.repr = str.substr(start, index - start); + return token; +} + +css_token_vector css_tokenizer::tokenize() +{ + css_token_vector tokens; + while (true) + { + css_token token = consume_token(); + if (token.type == EOF) break; + tokens.push_back(token); + } + return tokens; +} + + +} // namespace litehtml |