#include "html.h"
#include "css_tokenizer.h"

namespace litehtml
{

// Parse-error hook. Reporting is currently disabled, so the message is ignored.
void css_parse_error(string /*msg*/)
{
	//printf("%s\n", msg.c_str());
}

// Value of an IDENT token: a name starting with "--" is returned unchanged,
// any other name is passed through lowcase(). Non-IDENT tokens yield an empty string.
string css_token::ident() const
{
	if (type == IDENT)
	{
		const bool starts_with_two_dashes = name.substr(0, 2) == "--";
		if (starts_with_two_dashes)
			return name;
		return lowcase(name);
	}
	return "";
}

// Maps an opening bracket to the matching closing bracket; any other char is returned as is.
char mirror(char c)
{
	switch (c)
	{
	case '{': return '}';
	case '[': return ']';
	case '(': return ')';
	default:  return c;
	}
}

// String representation of a single token.
// Plain tokens return their stored repr; component values are rebuilt from their nested tokens.
string css_token::get_repr(bool insert_spaces) const
{
	if (!is_component_value())
		return repr;

	// Qualified call: inside the class an unqualified get_repr would name this member function.
	const string inner = litehtml::get_repr(value, 0, -1, insert_spaces);

	if (type == CV_FUNCTION)
		return name + '(' + inner + ')';

	// For the remaining component values the opening bracket char is derived from the type value.
	const char open = char(-type - 100);
	return open + inner + mirror(open);
}

// Concatenates string representations of count tokens starting at index.
// count == -1 means "everything from index to the end of tokens".
string get_repr(const css_token_vector& tokens, int index, int count, bool insert_spaces)
{
	const int last = (count == -1) ? (int)tokens.size() : index + count;

	string result;
	for (int pos = index; pos < last; ++pos)
	{
		result += tokens[pos].get_repr(insert_spaces);
		if (insert_spaces)
			result += ' ';
	}

	// NOTE(review): presumably strips the separator appended after the last token - confirm against remove()
	if (insert_spaces)
		remove(result, -1);
	return result;
}

// https://www.w3.org/TR/css-syntax-3/#whitespace
bool css_tokenizer::is_whitespace(int ch)
{
	// NOTE: \r and \f are converted to \n in filter_code_points
	switch (ch)
	{
	case '\n':
	case '\t':
	case ' ':
		return true;
	default:
		return false;
	}
}

// https://www.w3.org/TR/css-syntax-3/#non-printable-code-point
bool css_tokenizer::is_non_printable_code_point(int ch)
{
	if (0 <= ch && ch <= 8)
		return true;
	if (ch == 0xB)
		return true;
	if (0xE <= ch && ch <= 0x1F)
		return true;
	return ch == 0x7F;
}

// https://www.w3.org/TR/css-syntax-3/#ident-start-code-point
bool css_tokenizer::is_ident_start_code_point(int ch)
{
	if (is_letter(ch))
		return true;
	return ch >= 0x80 || ch == '_';
}

// https://www.w3.org/TR/css-syntax-3/#ident-code-point
bool css_tokenizer::is_ident_code_point(int ch)
{
	if (is_ident_start_code_point(ch))
		return true;
	return is_digit(ch) || ch == '-';
}

// Consume the next input code point. Return the current input code point.
// When we know that next input char is ASCII and not NUL, we can just write str[index++] instead. int css_tokenizer::consume_char() { // NOTE: if str[index] == 0 index is not incremented return current_char = read_utf8_char(str, index); } // https://www.w3.org/TR/css-syntax-3/#reconsume-the-current-input-code-point // "reconsume" is not a good name - it should be called unconsume (the char will actually be reconsumed later when consume_char is called). // When we know that current input char is ASCII and index != 0, we can just write index-- instead. void css_tokenizer::unconsume_char() { // see comment for current_char if (current_char == 0) return; // NOTE: if index == 0 index is not decremented prev_utf8_char(str, index); } int css_tokenizer::peek_char() { int i = index; return read_utf8_char(str, i); } css_tokenizer::three_chars css_tokenizer::peek_chars() { three_chars chars; int i = index; chars._1 = read_utf8_char(str, i); chars._2 = read_utf8_char(str, i); chars._3 = read_utf8_char(str, i); return chars; } // https://www.w3.org/TR/css-syntax-3/#consume-comments void css_tokenizer::consume_comments() { while (true) { if (str[index] == '/' && str[index + 1] == '*') { int i = (int)str.find("*/", index + 2); if (i != -1) index = i + 2; else { index = (int)str.size(); css_parse_error("eof in comment"); break; } } else break; } } // https://www.w3.org/TR/css-syntax-3/#consume-escaped-code-point // It assumes that the U+005C (\) has already been consumed and that the next input code point // is not a newline (see https://www.w3.org/TR/css-syntax-3/#check-if-two-code-points-are-a-valid-escape). int css_tokenizer::consume_escaped_code_point() { // Consume the next input code point. int ch = consume_char(); if (is_hex_digit(ch)) { int number = digit_value(ch); // Consume as many hex digits as possible, but no more than 5. 
int max = 5; while (max-- > 0 && is_hex_digit(str[index])) { ch = consume_char(); number = number * 16 + digit_value(ch); } // If the next input code point is whitespace, consume it as well. if (is_whitespace(str[index])) consume_char(); // If this number is zero, or is for a surrogate, or is greater than the maximum allowed code point if (number == 0 || is_surrogate(number) || number > 0x10FFFF) return 0xFFFD; // Otherwise, return the code point with that value. return number; } else if (ch == 0) // EOF { // This is a parse error. Return U+FFFD. css_parse_error("eof in escaped codepoint"); return 0xFFFD; } else // anything else // Return the current input code point. return ch; } // https://www.w3.org/TR/css-syntax-3/#consume-string-token css_token css_tokenizer::consume_string_token(int ending_code_point) { // Initially create a with its value set to the empty string. css_token token(STRING); while (true) { // Repeatedly consume the next input code point from the stream: int ch = consume_char(); switch (ch) { case 0: // EOF // This is a parse error. Return the . css_parse_error("eof in string"); return token; case '\n': // This is a parse error. Reconsume the current input code point, create a , and return it. css_parse_error("newline in string"); unconsume_char(); return {BAD_STRING}; case '\\': // If the next input code point is EOF, do nothing. if (str[index] == 0) break; // Otherwise, if the next input code point is a newline, consume it. else if (str[index] == '\n') index++; // Otherwise, (the stream starts with a valid escape) consume an escaped code point and // append the returned code point to the ’s value. else append_char(token.str, consume_escaped_code_point()); break; default: if (ch == ending_code_point) return token; else // anything else // Append the current input code point to the ’s value. 
append_char(token.str, ch); break; } } } // https://www.w3.org/TR/css-syntax-3/#would-start-an-identifier bool css_tokenizer::would_start_ident_sequence(three_chars chars) { int c1 = chars._1; int c2 = chars._2; int c3 = chars._3; if (c1 == '-') { // If the second code point is an ident-start code point or a U+002D HYPHEN-MINUS, or // the second and third code points are a valid escape, return true. Otherwise, return false. return is_ident_start_code_point(c2) || c2 == '-' || (c2 == '\\' && c3 != '\n'); } else if (is_ident_start_code_point(c1)) return true; else if (c1 == '\\') // If the first and second code points are a valid escape, return true. Otherwise, return false. return c2 != '\n'; else return false; } // https://www.w3.org/TR/css-syntax-3/#consume-name string css_tokenizer::consume_ident_sequence() { string result; while (true) { // Repeatedly consume the next input code point from the stream: int ch = consume_char(); if (is_ident_code_point(ch)) append_char(result, ch); // Append the code point to result. // else if the stream starts with a valid escape // NOTE: the wording is confusing because ch is not in the input stream anymore (it has been consumed) else if (ch == '\\' && str[index] != '\n') // Consume an escaped code point. Append the returned code point to result. append_char(result, consume_escaped_code_point()); else { // Reconsume the current input code point. Return result. unconsume_char(); return result; } } } // https://www.w3.org/TR/css-syntax-3/#starts-with-a-number bool css_tokenizer::would_start_a_number(int x, int y, int z) { if (x == '+' || x == '-') { // If the second code point is a digit, return true. if (is_digit(y)) return true; // Otherwise, if the second code point is a U+002E (.) and the third code point is a digit, return true. else if (y == '.' && is_digit(z)) return true; // Otherwise, return false. else return false; } else if (x == '.') // If the second code point is a digit, return true. Otherwise, return false. 
return is_digit(y); else return is_digit(x); } // https://www.w3.org/TR/css-syntax-3/#convert-string-to-number double css_tokenizer::convert_string_to_number(const string& str) { const char* p = str.c_str(); // Divide the string into seven components, in order from left to right: // 1. A sign: a single U+002B (+) or U+002D (-), or the empty string. // Let s be the number -1 if the sign is U+002D (-); otherwise, let s be the number 1. double s = 1; if (*p == '-') s = -1, p++; else if (*p == '+') p++; // 2. An integer part: zero or more digits. If there is at least one digit, let i be the number formed by // interpreting the digits as a base-10 integer; otherwise, let i be the number 0. double i = 0; while (is_digit(*p)) i = i * 10 + digit_value(*p++); // 3. A decimal point: a single U+002E (.), or the empty string. if (*p == '.') p++; // 4. A fractional part: zero or more digits. If there is at least one digit, let f be the number formed by // interpreting the digits as a base-10 integer and d be the number of digits; // otherwise, let f and d be the number 0. double f = 0, d = 0; while (is_digit(*p)) f = f * 10 + digit_value(*p++), d++; // 5. An exponent indicator: a single U+0045 (E) or U+0065 (e), or the empty string. if (*p == 'e' || *p == 'E') p++; // 6. An exponent sign: a single U+002B (+) or U+002D (-), or the empty string. // Let t be the number -1 if the sign is U+002D (-); otherwise, let t be the number 1. double t = 1; if (*p == '-') t = -1, p++; else if (*p == '+') p++; // 7. An exponent: zero or more digits. If there is at least one digit, let e be the number formed by // interpreting the digits as a base-10 integer; otherwise, let e be the number 0. double e = 0; while (is_digit(*p)) e = e * 10 + digit_value(*p++); // Return the number s·(i + f·10ᐨᵈ)·10ᵗᵉ. return s * (i + f * pow(10, -d)) * pow(10, t * e); } // https://www.w3.org/TR/css-syntax-3/#consume-number double css_tokenizer::consume_number(css_number_type& type) { // 1. 
Initially set type to "integer". Let repr be the empty string. type = css_number_integer; string repr; // 2. If the next input code point is U+002B (+) or U+002D (-), consume it and append it to repr. if (is_one_of(str[index], '+', '-')) append_char(repr, str[index++]); // 3. While the next input code point is a digit, consume it and append it to repr. while (is_digit(str[index])) append_char(repr, str[index++]); // 4. If the next 2 input code points are U+002E (.) followed by a digit, then: if (str[index] == '.' && is_digit(str[index+1])) { // 1. Consume them. // 2. Append them to repr. append_char(repr, str[index++]); append_char(repr, str[index++]); // 3. Set type to "number". type = css_number_number; // 4. While the next input code point is a digit, consume it and append it to repr. while (is_digit(str[index])) append_char(repr, str[index++]); } // 5. If the next 2 or 3 input code points are U+0045 (E) or U+0065 (e), // optionally followed by U+002D (-) or U+002B (+), followed by a digit, then: bool a = lowcase(str[index]) == 'e' && is_one_of(str[index+1], '+', '-') && is_digit(str[index+2]); bool b = lowcase(str[index]) == 'e' && is_digit(str[index+1]); if (a || b) { // 1. Consume them. // 2. Append them to repr. append_char(repr, str[index++]); append_char(repr, str[index++]); if (a) append_char(repr, str[index++]); // 3. Set type to "number". type = css_number_number; // 4. While the next input code point is a digit, consume it and append it to repr. while (is_digit(str[index])) append_char(repr, str[index++]); } // 6. Convert repr to a number, and set the value to the returned value. double value = convert_string_to_number(repr); // 7. Return value and type. return value; } // https://www.w3.org/TR/css-syntax-3/#consume-numeric-token css_token css_tokenizer::consume_numeric_token() { // Consume a number and let number be the result. 
css_number_type type; float number = (float)consume_number(type); // If the next 3 input code points would start an ident sequence, then: if (would_start_ident_sequence(peek_chars())) { // 1. Create a with the same value and type flag as number, and // a unit set initially to the empty string. css_token token(DIMENSION, number, type); // 2. Consume an ident sequence. Set the ’s unit to the returned value. token.unit = consume_ident_sequence(); // 3. Return the . return token; } // Otherwise, if the next input code point is U+0025 (%), consume it. // Create a with the same value as number, and return it. if (str[index] == '%') { index++; return {PERCENTAGE, number}; // NOTE: number_type is unused in } // Otherwise, create a with the same value and type flag as number, and return it. return {NUMBER, number, type}; } // https://www.w3.org/TR/css-syntax-3/#consume-remnants-of-bad-url void css_tokenizer::consume_remnants_of_bad_url() { while (true) { // Repeatedly consume the next input code point from the stream: int ch = consume_char(); if (ch == ')' || ch == 0) // ')' or EOF return; // else if the input stream starts with a valid escape // NOTE: the wording is confusing because ch is not in the input stream anymore (it has been consumed) else if (ch == '\\' && str[index] != '\n') { consume_escaped_code_point(); } // anything else: Do nothing. } } // https://www.w3.org/TR/css-syntax-3/#consume-url-token css_token css_tokenizer::consume_url_token() { // Initially create a with its value set to the empty string. css_token token(URL); // Consume as much whitespace as possible. while (is_whitespace(str[index])) index++; while (true) { // Repeatedly consume the next input code point from the stream: int ch = consume_char(); switch (ch) { case ')': // Return the . return token; case 0: // EOF // This is a parse error. Return the . css_parse_error("eof in unquoted url"); return token; case '\n': case '\t': case ' ': // Consume as much whitespace as possible. 
while (is_whitespace(str[index])) index++; // If the next input code point is U+0029 ()) or EOF, consume it and return the // (if EOF was encountered, this is a parse error); if (str[index] == ')' || str[index] == 0) { if (str[index] == 0) css_parse_error("eof in unquoted url"); else index++; // consume ')' return token; } // otherwise, consume the remnants of a bad url, create a , and return it. consume_remnants_of_bad_url(); return {BAD_URL}; case '"': case '\'': case '(': bad_url: // This is a parse error. Consume the remnants of a bad url, create a , and return it. css_parse_error("invalid char in unquoted url"); consume_remnants_of_bad_url(); return {BAD_URL}; case '\\': // If the stream starts with a valid escape, consume an escaped code point and // append the returned code point to the ’s value. if (str[index] != '\n') append_char(token.str, consume_escaped_code_point()); // Otherwise, this is a parse error. Consume the remnants of a bad url, create a , and return it. else { css_parse_error("escaped newline in unquoted url"); consume_remnants_of_bad_url(); return {BAD_URL}; } break; default: if (is_non_printable_code_point(ch)) goto bad_url; else // anything else // Append the current input code point to the ’s value. append_char(token.str, ch); break; } } } // https://www.w3.org/TR/css-syntax-3/#consume-ident-like-token css_token css_tokenizer::consume_ident_like_token() { // Consume an ident sequence, and let string be the result. auto string = consume_ident_sequence(); // If string’s value is an ASCII case-insensitive match for "url", and the next input code point is // U+0028 ((), consume it. if (lowcase(string) == "url" && str[index] == '(') { index++; // consume '(' while (is_whitespace(str[index])) // not looking for 2 spaces, see next comment index++; if (is_one_of(str[index], '"', '\'')) { // This is not exactly what standard says, but equivalent. The purpose is to preserve a whitespace token. 
if (is_whitespace(str[index-1])) index--; return {FUNCTION, string}; } else // Otherwise, consume a url token, and return it. { return consume_url_token(); } } // Otherwise, if the next input code point is U+0028 ((), consume it. // Create a with its value set to string and return it. else if (str[index] == '(') { index++; return {FUNCTION, string}; } // Otherwise, create an with its value set to string and return it. return {IDENT, string}; } // https://www.w3.org/TR/css-syntax-3/#consume-token css_token css_tokenizer::consume_token() { consume_comments(); css_token token; int start = index; // Consume the next input code point. int ch = consume_char(); three_chars next; switch (ch) { // whitespace case '\n': case '\t': case ' ': // Consume as much whitespace as possible. Return a . while (is_whitespace(str[index])) index++; token.type = WHITESPACE; break; case '"': case '\'': token = consume_string_token(ch); break; case '#': // If the next input code point is an ident code point or the next two input code points are a valid escape, then: if (is_ident_code_point(peek_char()) || (str[index] == '\\' && str[index+1] != '\n')) { // 1. Create a . token.type = HASH; // 2. If the next 3 input code points would start an ident sequence, set the ’s type flag to "id". token.hash_type = would_start_ident_sequence(peek_chars()) ? css_hash_id : css_hash_unrestricted; // 3. Consume an ident sequence, and set the ’s value to the returned string. token.name = consume_ident_sequence(); // 4. Return the . } else // Otherwise, return a with its value set to the current input code point. token.ch = ch; break; case '+': case '.': // If the input stream starts with a number, reconsume the current input code point, consume a numeric token, and return it. next = peek_chars(); if (would_start_a_number(ch, next._1, next._2)) { unconsume_char(); token = consume_numeric_token(); } else // Otherwise, return a with its value set to the current input code point. 
token.ch = ch; break; case '-': // If the input stream starts with a number, reconsume the current input code point, consume a numeric token, and return it. next = peek_chars(); if (would_start_a_number(ch, next._1, next._2)) { unconsume_char(); token = consume_numeric_token(); } // Otherwise, if the next 2 input code points are U+002D U+003E (->), consume them and return a . else if (next._1 == '-' && next._2 == '>') { index += 2; token.type = CDC; } // Otherwise, if the input stream starts with an ident sequence, reconsume the current input code point, // consume an ident-like token, and return it. else if (would_start_ident_sequence({ ch, next._1, next._2 })) { unconsume_char(); token = consume_ident_like_token(); } else // Otherwise, return a with its value set to the current input code point. token.ch = ch; break; case '<': // If the next 3 input code points are !--, consume them and return a . if (match(str, index, "!--")) { index += 3; token.type = CDO; } else // Otherwise, return a with its value set to the current input code point. token.ch = ch; break; case '@': // If the next 3 input code points would start an ident sequence, consume an ident sequence, // create an with its value set to the returned value, and return it. if (would_start_ident_sequence(peek_chars())) { token.type = AT_KEYWORD; token.name = consume_ident_sequence(); } else // Otherwise, return a with its value set to the current input code point. token.ch = ch; break; case '\\': // If the input stream starts with a valid escape, reconsume the current input code point, // consume an ident-like token, and return it. if (str[index] != '\n') { unconsume_char(); token = consume_ident_like_token(); } else { // Otherwise, this is a parse error. Return a with its value set to the current input code point. 
css_parse_error("escaped newline outside a string"); token.ch = ch; } break; case 0: // EOF token.type = _EOF; break; default: if (is_digit(ch)) { // Reconsume the current input code point, consume a numeric token, and return it. unconsume_char(); token = consume_numeric_token(); } else if (is_ident_start_code_point(ch)) { // Reconsume the current input code point, consume an ident-like token, and return it. unconsume_char(); token = consume_ident_like_token(); } else // anything else // Return a with its value set to the current input code point. token.ch = ch; // NOTE: :;,()[]{} tokens are also handled here } token.repr = str.substr(start, index - start); return token; } css_token_vector css_tokenizer::tokenize() { css_token_vector tokens; while (true) { css_token token = consume_token(); if (token.type == EOF) break; tokens.push_back(token); } return tokens; } } // namespace litehtml