summaryrefslogtreecommitdiff
path: root/libs/litehtml
diff options
context:
space:
mode:
authorGeorge Hazan <george.hazan@gmail.com>2024-10-08 17:12:27 +0300
committerGeorge Hazan <george.hazan@gmail.com>2024-10-08 17:12:27 +0300
commitc917ae9a7abdfde50c0bb1ceb85b91b9e55aa641 (patch)
tree75f891aa74fb8b8bfed8dbe7fec67e0adce45120 /libs/litehtml
parenta7551fa14b3ab1d5f999182313b0a820272e2be6 (diff)
new files in LiteHtml
Diffstat (limited to 'libs/litehtml')
-rw-r--r--libs/litehtml/.github/FUNDING.yml13
-rw-r--r--libs/litehtml/.github/workflows/cmake.yml44
-rw-r--r--libs/litehtml/include/litehtml/css_parser.h52
-rw-r--r--libs/litehtml/include/litehtml/css_tokenizer.h215
-rw-r--r--libs/litehtml/include/litehtml/html_microsyntaxes.h21
-rw-r--r--libs/litehtml/src/css_parser.cpp514
-rw-r--r--libs/litehtml/src/css_tokenizer.cpp724
-rw-r--r--libs/litehtml/src/html_microsyntaxes.cpp102
-rw-r--r--libs/litehtml/src/internal.h26
9 files changed, 1711 insertions, 0 deletions
diff --git a/libs/litehtml/.github/FUNDING.yml b/libs/litehtml/.github/FUNDING.yml
new file mode 100644
index 0000000000..1299f24686
--- /dev/null
+++ b/libs/litehtml/.github/FUNDING.yml
@@ -0,0 +1,13 @@
+# These are supported funding model platforms
+
+github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
+patreon: # Replace with a single Patreon username
+open_collective: # Replace with a single Open Collective username
+ko_fi: # Replace with a single Ko-fi username
+tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
+community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
+liberapay: # Replace with a single Liberapay username
+issuehunt: # Replace with a single IssueHunt username
+otechie: # Replace with a single Otechie username
+lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry
+custom: ["http://www.litehtml.com/donate.html"] # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
diff --git a/libs/litehtml/.github/workflows/cmake.yml b/libs/litehtml/.github/workflows/cmake.yml
new file mode 100644
index 0000000000..855ba6660e
--- /dev/null
+++ b/libs/litehtml/.github/workflows/cmake.yml
@@ -0,0 +1,44 @@
+name: CMake
+
+on:
+  push:
+    branches: [ master ]
+  pull_request:
+    branches: [ master ]
+
+env:
+  # Customize the CMake build type here (Release, Debug, RelWithDebInfo, etc.)
+  BUILD_TYPE: Release
+
+jobs:
+  build:
+    # The CMake configure and build commands are platform agnostic and should work equally
+    # well on Windows or Mac. You can convert this to a matrix build if you need
+    # cross-platform coverage.
+    # See: https://docs.github.com/en/free-pro-team@latest/actions/learn-github-actions/managing-complex-workflows#using-a-build-matrix
+    runs-on: ubuntu-24.04
+
+    steps:
+      - uses: actions/checkout@v2
+
+      - name: Packages
+        # Install required packages
+        run: |
+          sudo apt-get update
+          sudo apt-get install -yq --no-install-recommends --no-install-suggests libcairo2-dev libpango1.0-dev libgtk-3-dev
+
+      - name: Configure CMake
+        # Configure CMake in a 'build' subdirectory. `CMAKE_BUILD_TYPE` is only required if you are using a single-configuration generator such as make.
+        # See https://cmake.org/cmake/help/latest/variable/CMAKE_BUILD_TYPE.html?highlight=cmake_build_type
+        run: cmake -B ${{github.workspace}}/build -DLITEHTML_BUILD_TESTING=ON -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}}
+
+      - name: Build
+        # Build your program with the given configuration
+        run: cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}} -j3
+
+      - name: Test
+        working-directory: ${{github.workspace}}/build
+        # Execute tests defined by the CMake configuration.
+        # See https://cmake.org/cmake/help/latest/manual/ctest.1.html for more detail
+        run: ctest -C ${{env.BUILD_TYPE}} --test-dir litehtml-tests-build --rerun-failed --output-on-failure -j3
+
diff --git a/libs/litehtml/include/litehtml/css_parser.h b/libs/litehtml/include/litehtml/css_parser.h
new file mode 100644
index 0000000000..cbd88e3b84
--- /dev/null
+++ b/libs/litehtml/include/litehtml/css_parser.h
@@ -0,0 +1,52 @@
+#ifndef LH_CSS_PARSER_H
+#define LH_CSS_PARSER_H
+
+#include "css_tokenizer.h"
+#include "stylesheet.h"
+
+namespace litehtml
+{
+
+class css_parser // consumes a css_token_vector following the CSS Syntax Level 3 parsing algorithms
+{
+ css_token_vector m_tokens; // token stream being parsed (a copy of the vector passed to the constructor)
+ int m_index = 0; // index into m_tokens of the next token to be consumed
+
+ css_token next_token(); // returns the next token and advances m_index; returns an EOF token once m_tokens is exhausted
+ css_token peek_token(); // like next_token() but does not advance m_index
+
+public:
+ css_parser() {}
+ css_parser(const css_token_vector& tokens) : m_tokens(tokens) {}
+
+ static raw_rule::vector parse_stylesheet(const string& input, bool top_level); // decodes and normalizes input into tokens, then parses them
+ static raw_rule::vector parse_stylesheet(const css_token_vector& input, bool top_level); // consumes a list of rules from input
+ raw_rule::vector consume_list_of_rules(bool top_level); // top_level: CDO/CDC tokens are skipped instead of starting a qualified rule
+ raw_rule::ptr consume_qualified_rule(); // returns nullptr if EOF is reached before a block
+ raw_rule::ptr consume_at_rule(); // on EOF returns the at-rule collected so far
+ css_token consume_simple_block(char opening_bracket); // the opening bracket token must already have been consumed
+ css_token consume_component_value(); // returns a block, a function or the plain next token
+ css_token consume_function(const string& name); // the function token itself must already have been consumed
+
+ raw_declaration consume_declaration(); // expects the next token to be the IDENT holding the declaration name
+ void consume_style_block_contents(/*out*/ raw_declaration::vector& decls, /*out*/ raw_rule::vector& rules);
+};
+
+// Predicate deciding whether the whitespace token located between `left` and `right` must be kept.
+using keep_whitespace_fn = std::function<bool (const css_token& left, const css_token& right)>;
+// Removes whitespace tokens from `tokens`, including those nested inside component values.
+// When keep_whitespace is empty (the default) every whitespace token is removed.
+void remove_whitespace(css_token_vector& tokens, keep_whitespace_fn keep_whitespace = nullptr);
+
+// Option flags for normalize(); they are tested with bitwise AND, so they may be combined with |.
+enum {
+    f_componentize = 1,     // turn blocks and functions into component values
+    f_remove_whitespace = 2 // strip whitespace tokens (see remove_whitespace)
+};
+// Normalizes the input into a token vector. Explicit specializations for string and
+// css_token_vector are defined in css_parser.cpp.
+template<class Input>
+css_token_vector normalize(Input input, int options = 0, keep_whitespace_fn keep_whitespace = nullptr);
+
+vector<css_token_vector> parse_comma_separated_list(const css_token_vector& tokens); // splits on ',' tokens; the result is never empty
+bool is_declaration_value(const css_token_vector& tokens, int index = 0); // checks tokens starting at index; false if that range is empty
+bool is_any_value(const css_token_vector& tokens); // false for an empty list
+bool skip_whitespace(const css_token_vector& tokens, int& index); // advances index past whitespace tokens; returns true if index moved
+
+} // namespace litehtml
+
+#endif // LH_CSS_PARSER_H \ No newline at end of file
diff --git a/libs/litehtml/include/litehtml/css_tokenizer.h b/libs/litehtml/include/litehtml/css_tokenizer.h
new file mode 100644
index 0000000000..119c925c79
--- /dev/null
+++ b/libs/litehtml/include/litehtml/css_tokenizer.h
@@ -0,0 +1,215 @@
+#ifndef LH_CSS_TOKENIZER_H
+#define LH_CSS_TOKENIZER_H
+
+namespace litehtml
+{
+
+// https://www.w3.org/TR/css-syntax-3/#tokenization
+// :;,()[]{} token or delim token: type == this char
+// EOF token: type == EOF (-1)
+// type may be 0 to indicate an error, see at()
+enum css_token_type
+{
+ WHITESPACE = ' ',
+
+ // Giving EOF and some chars explicit names to facilitate debugging and to get rid of warning C4063: case '41' is not a valid value for switch of enum 'litehtml::css_token_type'
+ _EOF = EOF,
+ LEFT_BRACE = '{',
+ RIGHT_BRACE = '}',
+ LEFT_BRACKET = '[',
+ RIGHT_BRACKET = ']',
+ LEFT_PAREN = '(',
+ RIGHT_PAREN = ')',
+ COLON = ':',
+ SEMICOLON = ';',
+ COMMA = ',',
+ BANG = '!',
+ DOT = '.',
+ AMPERSAND = '&',
+
+ IDENT = -20, // do not collide with any unicode chars
+ FUNCTION, // calc(
+ AT_KEYWORD, // @media
+ HASH, // #foo
+ STRING, // "xxx" or 'xxx'
+ BAD_STRING,
+ URL, // url(x.com) - but not url("x.com"), which is function + string + ')'
+ BAD_URL,
+ NUMBER, // 25
+ PERCENTAGE, // 25%
+ DIMENSION, // 25px
+ CDO, // <!--
+ CDC, // -->
+
+ // https://www.w3.org/TR/css-syntax-3/#component-value
+ CV_FUNCTION = -100, // every value <= CV_FUNCTION is a component value (see css_token::is_component_value)
+ // simple block: the value is -100 minus the opening bracket char, as computed in css_parser::consume_simple_block
+ CURLY_BLOCK = -100 - '{',
+ ROUND_BLOCK = -100 - '(',
+ SQUARE_BLOCK = -100 - '['
+};
+
+enum css_number_type
+{
+ css_number_integer,
+ css_number_number
+};
+
+enum css_hash_type
+{
+ css_hash_unrestricted,
+ css_hash_id
+};
+
+// css_token: CSS token or component value ("fat" token)
+// Tokens exist in uncomponentized form only a short time after tokenization, most of the time they are "fat".
+// All functions in css_parser work regardless of whether tokens are fat or not, as per standard.
+// All functions outside of css_parser that parse media queries, selectors, property values assume tokens are componentized.
+struct css_token
+{
+    // Builds a token of the given type with an optional numeric payload and string.
+    // For component-value types the union member `value` is activated as an empty vector
+    // (it shares storage with the numeric payload).
+    css_token(css_token_type type = css_token_type(),
+        float number = 0, css_number_type number_type = css_number_integer, string str = "")
+        : type(type), str(str), n{number, number_type}
+    {
+        if (is_component_value()) new(&value) vector<css_token>;
+    }
+
+    // Builds a token that carries only a string (str/name/unit share the same storage).
+    css_token(css_token_type type, const string& str)
+        : type(type), str(str), n()
+    {
+        if (is_component_value()) new(&value) vector<css_token>;
+    }
+
+    // Copies the type, string and representation, plus whichever member of the payload
+    // union is meaningful for the token type.
+    css_token(const css_token& token) : type(token.type), str(token.str), repr(token.repr)
+    {
+        switch (type)
+        {
+        case HASH:
+            hash_type = token.hash_type;
+            break;
+
+        case NUMBER:
+        case PERCENTAGE:
+        case DIMENSION:
+            n = token.n;
+            break;
+
+        case CV_FUNCTION:
+        case CURLY_BLOCK:
+        case ROUND_BLOCK:
+        case SQUARE_BLOCK:
+            new(&value) vector(token.value);
+            break;
+
+        default:;
+        }
+    }
+
+    // Assignment is implemented as destroy + copy-construct in place, because the active
+    // union member may differ between *this and token.
+    // NOTE: token must not be an element of this->value: it would be destroyed before
+    // it is copied from.
+    css_token& operator=(const css_token& token)
+    {
+        // Self-assignment guard: without it the object would be destroyed and then
+        // copy-constructed from its own destroyed state.
+        if (this == &token) return *this;
+
+        this->~css_token();
+        new(this) css_token(token);
+        return *this;
+    }
+
+    // Members of anonymous unions are not destroyed automatically, so str and
+    // (for component values) value are destroyed by hand.
+    ~css_token()
+    {
+        str.~string();
+        if (is_component_value()) value.~vector();
+    }
+
+    // True for CV_FUNCTION and the *_BLOCK types, i.e. the tokens that own a `value` vector.
+    bool is_component_value() const
+    {
+        return type <= CV_FUNCTION;
+    }
+
+    string ident() const;
+    string get_repr(bool insert_spaces = false) const;
+
+    union {
+        css_token_type type;
+        int ch; // used for <delim-token> or :;,()[]{}
+    };
+    union {
+        string str;  // STRING, URL
+        string name; // HASH, IDENT, AT_KEYWORD, FUNCTION, CV_FUNCTION
+        string unit; // DIMENSION
+    };
+    struct number {
+        float number;                // NUMBER, PERCENTAGE, DIMENSION
+        css_number_type number_type; // NUMBER, DIMENSION
+    };
+    union {
+        css_hash_type hash_type; // HASH
+        number n;
+        vector<css_token> value; // CV_FUNCTION, XXX_BLOCK
+    };
+
+    string repr; // https://www.w3.org/TR/css-syntax-3/#representation
+};
+
+using css_token_vector = vector<css_token>;
+string get_repr(const css_token_vector& tokens, int index = 0, int count = -1, bool insert_spaces = false);
+
+class css_tokenizer
+{
+public:
+ css_tokenizer(const string& input) : str(input), index(0), current_char(0) {} // keeps its own copy of the input
+
+ css_token_vector tokenize();
+
+private:
+ // Input stream. Valid UTF-8; no NUL bytes. https://www.w3.org/TR/css-syntax-3/#input-stream
+ string str;
+
+ // Index of the next input char. https://www.w3.org/TR/css-syntax-3/#next-input-code-point
+ int index;
+
+ // https://www.w3.org/TR/css-syntax-3/#current-input-code-point
+ // This is needed to handle the situation when unconsume_char is called when index == str.size().
+ // We need to distinguish between the situation when we just read the last char and
+ // the situation when we already have been at the end and just read NUL.
+ // If we don't do this tokenizer will loop forever on input "a".
+ int current_char;
+
+private:
+ static bool is_whitespace(int ch); // newline, tab or space
+ static bool is_non_printable_code_point(int ch); // 0..8, 0xB, 0xE..0x1F or 0x7F
+ static bool is_ident_start_code_point(int ch); // letter, '_' or any code point >= 0x80
+ static bool is_ident_code_point(int ch);
+
+ struct three_chars { int _1, _2, _3; };
+
+ int consume_char();
+ void unconsume_char();
+ int peek_char();
+ three_chars peek_chars();
+
+ void consume_comments();
+ int consume_escaped_code_point();
+ css_token consume_string_token(int ending_code_point);
+
+ static bool would_start_ident_sequence(three_chars chars);
+ string consume_ident_sequence();
+
+ static bool would_start_a_number(int x, int y, int z);
+ static double convert_string_to_number(const string& str);
+ double consume_number(css_number_type& number_type);
+ css_token consume_numeric_token();
+
+ void consume_remnants_of_bad_url();
+ css_token consume_url_token();
+
+ css_token consume_ident_like_token();
+ css_token consume_token();
+};
+
+void css_parse_error(string msg);
+// Convenience wrapper: tokenizes str with a temporary css_tokenizer.
+inline css_token_vector tokenize(const string& str)
+{
+    css_tokenizer tokenizer(str);
+    return tokenizer.tokenize();
+}
+
+} // namespace litehtml
+
+#endif // LH_CSS_TOKENIZER_H \ No newline at end of file
diff --git a/libs/litehtml/include/litehtml/html_microsyntaxes.h b/libs/litehtml/include/litehtml/html_microsyntaxes.h
new file mode 100644
index 0000000000..193dd20d62
--- /dev/null
+++ b/libs/litehtml/include/litehtml/html_microsyntaxes.h
@@ -0,0 +1,21 @@
+#ifndef LH_HTML_MICROSYNTAXES_H
+#define LH_HTML_MICROSYNTAXES_H
+
+namespace litehtml
+{
+
+bool html_parse_integer(const string& str, int& val);
+bool html_parse_non_negative_integer(const string& str, int& val);
+
+enum html_dimension_type
+{
+ html_length,
+ html_percentage
+};
+
+bool html_parse_dimension_value(const string& str, float& val, html_dimension_type& type);
+bool html_parse_nonzero_dimension_value(const string& str, float& val, html_dimension_type& type);
+
+} // namespace litehtml
+
+#endif // LH_HTML_MICROSYNTAXES_H
diff --git a/libs/litehtml/src/css_parser.cpp b/libs/litehtml/src/css_parser.cpp
new file mode 100644
index 0000000000..24e49a78c8
--- /dev/null
+++ b/libs/litehtml/src/css_parser.cpp
@@ -0,0 +1,514 @@
+#include "html.h"
+#include "css_parser.h"
+
+namespace litehtml
+{
+
+// https://www.w3.org/TR/css-syntax-3/#css-filter-code-points
+// Rewrites input in place: CR, FF and CR LF each become a single LF,
+// and every NUL byte becomes U+FFFD encoded as UTF-8.
+void filter_code_points(string& input)
+{
+    const char* replacement = "\xEF\xBF\xBD"; // U+FFFD in UTF-8 (3 bytes)
+
+    // Each NUL grows from 1 byte to 3, so this is the largest possible output size.
+    size_t nul_count = std::count(input.begin(), input.end(), 0);
+    string filtered;
+    filtered.reserve(input.size() + 2 * nul_count);
+
+    const int length = (int)input.size();
+    for (int pos = 0; pos < length; pos++)
+    {
+        const char c = input[pos];
+        if (c == '\r')
+        {
+            filtered += '\n';
+            if (pos + 1 < length && input[pos + 1] == '\n') pos++; // CR LF counts as one newline
+        }
+        else if (c == '\f')
+            filtered += '\n';
+        else if (c == 0)
+            filtered.append(replacement, 3);
+        else
+            filtered += c;
+    }
+
+    input = filtered;
+}
+
+// Deletes whitespace tokens from tokens, recursing into component values.
+// A whitespace token survives only when keep_whitespace is set and returns true
+// for the tokens on its left and right.
+void remove_whitespace(css_token_vector& tokens, keep_whitespace_fn keep_whitespace)
+{
+    int i = 0;
+    while (i < (int)tokens.size())
+    {
+        auto& tok = tokens[i];
+        if (tok.type != ' ')
+        {
+            if (tok.is_component_value())
+                remove_whitespace(tok.value, keep_whitespace);
+            i++;
+            continue;
+        }
+
+        // the first token has no left neighbour: a default-constructed token stands in for it
+        const auto& left = i > 0 ? tokens[i - 1] : css_token();
+        const auto& right = at(tokens, i + 1);
+        if (keep_whitespace && keep_whitespace(left, right))
+            i++;
+        else
+            remove(tokens, i); // the following token moves into slot i, so i stays put
+    }
+}
+
+// Replaces tokens with the list of component values obtained by repeatedly
+// consuming a component value until the EOF token is returned.
+void componentize(css_token_vector& tokens)
+{
+    css_parser parser(tokens);
+    css_token_vector components;
+    for (;;)
+    {
+        css_token component = parser.consume_component_value();
+        if (component.type == EOF)
+        {
+            tokens = components;
+            return;
+        }
+        components.push_back(component);
+    }
+}
+
+// https://www.w3.org/TR/css-syntax-3/#normalize-into-a-token-stream
+// Token-list input: componentize and/or strip whitespace, as selected by options.
+template<>
+css_token_vector normalize(css_token_vector input, int options, keep_whitespace_fn keep_whitespace)
+{
+    const bool want_components = (options & f_componentize) != 0;
+    const bool strip_whitespace = (options & f_remove_whitespace) != 0;
+
+    if (want_components) componentize(input);
+    if (strip_whitespace) remove_whitespace(input, keep_whitespace);
+    return input;
+}
+// String input: filter code points, tokenize, then normalize the resulting token list.
+template<>
+css_token_vector normalize(string input, int options, keep_whitespace_fn keep_whitespace)
+{
+    filter_code_points(input);
+    return normalize(tokenize(input), options, keep_whitespace);
+}
+
+// https://www.w3.org/TR/css-syntax-3/#parse-stylesheet
+// No stylesheet object is created because its only purpose would be to pass a list of rules to
+// parse_css_stylesheet. The list of rules is returned directly instead.
+raw_rule::vector css_parser::parse_stylesheet(const string& input, bool top_level)
+{
+    // Step 1 (decoding a byte stream) is not implemented as such: UTF-8 is always assumed.
+    // decode() only turns potentially broken UTF-8 into valid UTF-8.
+    string utf8 = decode(input, encoding::utf_8);
+
+    // Step 2: normalize the input into a token stream, then parse the tokens.
+    return parse_stylesheet(normalize(utf8), top_level);
+}
+// Token-list overload. Steps 3-5 of the spec algorithm (create a stylesheet, fill it, return it)
+// reduce to consuming a list of rules and returning it.
+raw_rule::vector css_parser::parse_stylesheet(const css_token_vector& input, bool top_level)
+{
+    css_parser parser(input);
+    return parser.consume_list_of_rules(top_level);
+}
+
+// https://www.w3.org/TR/css-syntax-3/#consume-the-next-input-token
+// Returns the token at m_index and advances; returns an EOF token once the end is reached.
+css_token css_parser::next_token()
+{
+    const bool exhausted = m_index == (int)m_tokens.size();
+    if (exhausted) return css_token_type(EOF);
+    return m_tokens[m_index++];
+}
+
+// Returns the token at m_index without advancing; returns an EOF token once the end is reached.
+css_token css_parser::peek_token()
+{
+    const bool exhausted = m_index == (int)m_tokens.size();
+    if (exhausted) return css_token_type(EOF);
+    return m_tokens[m_index];
+}
+
+// https://www.w3.org/TR/css-syntax-3/#consume-list-of-rules
+// Consumes tokens until EOF and returns the rules found. Rules for which the
+// consume_* helpers return null are not added.
+raw_rule::vector css_parser::consume_list_of_rules(bool top_level)
+{
+    raw_rule::vector rules;
+
+    for (;;)
+    {
+        css_token token = next_token();
+
+        if (token.type == EOF) return rules;
+        if (token.type == WHITESPACE) continue;
+
+        // With the top-level flag set, CDO and CDC are ignored; otherwise they start a qualified rule.
+        const bool is_cdo_or_cdc = token.type == CDO || token.type == CDC;
+        if (is_cdo_or_cdc && top_level) continue;
+
+        // Reconsume the current token and let the matching rule consumer handle it.
+        m_index--;
+        raw_rule::ptr rule;
+        if (token.type == AT_KEYWORD)
+            rule = consume_at_rule();
+        else
+            rule = consume_qualified_rule();
+        if (rule) rules.push_back(rule);
+    }
+}
+
+// https://www.w3.org/TR/css-syntax-3/#consume-qualified-rule
+// Collects component values into the prelude until a {}-block is found.
+// Returns nullptr (after reporting a parse error) if EOF comes first.
+raw_rule::ptr css_parser::consume_qualified_rule()
+{
+    raw_rule::ptr rule = make_shared<raw_rule>(raw_rule::qualified);
+
+    for (;;)
+    {
+        css_token token = next_token();
+
+        if (token.type == EOF)
+        {
+            css_parse_error("eof in qualified rule");
+            return nullptr;
+        }
+        if (token.type == '{')
+        {
+            // raw '{' token: the block still has to be consumed
+            rule->block = consume_simple_block('{');
+            return rule;
+        }
+        if (token.type == CURLY_BLOCK)
+        {
+            // the block has already been componentized
+            rule->block = token;
+            return rule;
+        }
+
+        // Anything else belongs to the prelude: reconsume it as a component value.
+        m_index--;
+        css_token value = consume_component_value();
+        rule->prelude.push_back(value);
+    }
+}
+
+// https://www.w3.org/TR/css-syntax-3/#consume-at-rule
+// The next token supplies the rule name. Component values are then collected into the
+// prelude until ';', a {}-block or EOF. On EOF the at-rule built so far is returned.
+raw_rule::ptr css_parser::consume_at_rule()
+{
+    css_token keyword = next_token();
+    raw_rule::ptr rule = make_shared<raw_rule>(raw_rule::at, keyword.str);
+
+    for (;;)
+    {
+        css_token token = next_token();
+
+        if (token.type == ';') return rule;
+        if (token.type == EOF)
+        {
+            css_parse_error("eof in at-rule");
+            return rule;
+        }
+        if (token.type == '{')
+        {
+            // raw '{' token: the block still has to be consumed
+            rule->block = consume_simple_block('{');
+            return rule;
+        }
+        if (token.type == CURLY_BLOCK)
+        {
+            // the block has already been componentized
+            rule->block = token;
+            return rule;
+        }
+
+        // Anything else belongs to the prelude: reconsume it as a component value.
+        m_index--;
+        css_token value = consume_component_value();
+        rule->prelude.push_back(value);
+    }
+}
+
+char mirror(char c);
+
+// https://www.w3.org/TR/css-syntax-3/#consume-simple-block
+// Called after the opening bracket has been consumed. Collects component values until the
+// matching closing bracket; on EOF reports a parse error and returns what was collected.
+css_token css_parser::consume_simple_block(char opening_bracket)
+{
+    // the block type is derived from the bracket char, see css_token_type
+    const auto type = css_token_type(-100 - opening_bracket);
+    css_token block(type);
+    const char closer = mirror(opening_bracket);
+
+    for (;;)
+    {
+        css_token token = next_token();
+
+        if (token.type == closer) return block;
+        if (token.type == EOF)
+        {
+            css_parse_error("eof in simple block");
+            return block;
+        }
+
+        // Reconsume the token as a component value and add it to the block.
+        m_index--;
+        css_token val = consume_component_value();
+        block.value.push_back(val);
+    }
+}
+
+// https://www.w3.org/TR/css-syntax-3/#consume-component-value
+// Returns a simple block for '{' '[' '(', a function for a function token,
+// and the consumed token itself for everything else.
+css_token css_parser::consume_component_value()
+{
+    css_token token = next_token();
+
+    const bool opens_block = token.type == '{' || token.type == '[' || token.type == '(';
+    if (opens_block) return consume_simple_block((char)token.ch);
+
+    if (token.type == FUNCTION) return consume_function(token.name);
+
+    return token;
+}
+
+// https://www.w3.org/TR/css-syntax-3/#consume-function
+// Called after the function token has been consumed. Collects component values until ')';
+// on EOF reports a parse error and returns what was collected.
+css_token css_parser::consume_function(const string& name)
+{
+    css_token function(CV_FUNCTION, name);
+
+    for (;;)
+    {
+        css_token token = next_token();
+
+        if (token.type == ')') return function;
+        if (token.type == EOF)
+        {
+            css_parse_error("eof in function");
+            return function;
+        }
+
+        // Reconsume the token as a component value and add it to the function's arguments.
+        m_index--;
+        css_token val = consume_component_value();
+        function.value.push_back(val);
+    }
+}
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+// Strips whitespace tokens from both ends of tokens.
+// NOTE(review): relies on the at()/remove() helpers (defined elsewhere) accepting index -1
+// for the last element and tolerating an empty vector - confirm against their definitions.
+void trim_whitespace(css_token_vector& tokens)
+{
+    while (at(tokens, 0).type == WHITESPACE)
+        remove(tokens, 0);
+    while (at(tokens, -1).type == WHITESPACE)
+        remove(tokens, -1);
+}
+
+// https://www.w3.org/TR/css-syntax-3/#consume-declaration
+// next token is guaranteed to be IDENT
+// Returns the declaration (name, value, important flag). If the name is not followed by ':'
+// a parse error is reported and an empty raw_declaration is returned.
+raw_declaration css_parser::consume_declaration()
+{
+    // Consume the next input token. Create a new declaration with its name set to the value of
+    // the current input token and its value initially set to an empty list.
+    css_token token = next_token();
+    raw_declaration decl = {token.name};
+    auto& value = decl.value;
+
+    // 1. While the next input token is a <whitespace-token>, consume the next input token.
+    while (peek_token().type == ' ') next_token();
+
+    // 2. If the next input token is anything other than a <colon-token>, this is a parse error. Return nothing.
+    if (peek_token().ch != ':')
+    {
+        css_parse_error("consume_declaration: ':' not found");
+        return {};
+    }
+    // Otherwise, consume the next input token.
+    next_token();
+
+    // 3. While the next input token is a <whitespace-token>, consume the next input token.
+    while (peek_token().type == ' ') next_token();
+
+    // 4. As long as the next input token is anything other than an <EOF-token>,
+    //    consume a component value and append it to the declaration's value.
+    while (peek_token().type != EOF)
+        value.push_back(consume_component_value());
+
+    // 5. If the last two non-<whitespace-token>s in the declaration's value are a <delim-token> with the value "!"
+    //    followed by an <ident-token> with a value that is an ASCII case-insensitive match for "important",
+    //    remove them from the declaration's value and set the declaration's important flag to true.
+
+    trim_whitespace(value); // deviation from standard: removing leading whitespace as well
+
+    // After trimming, the last token (if any) is not whitespace. Whitespace between '!' and
+    // "important" is allowed, so look for '!' at the nearest non-whitespace token before it.
+    const int last = (int)value.size() - 1;
+    if (last >= 1 && value[last].ident() == "important")
+    {
+        int bang = last - 1;
+        while (bang >= 0 && value[bang].type == WHITESPACE) bang--;
+        if (bang >= 0 && value[bang].ch == '!')
+        {
+            // drops '!', "important" and any whitespace between them
+            value.erase(value.begin() + bang, value.end());
+            decl.important = true;
+        }
+    }
+
+    // 6. While the last token in the declaration's value is a <whitespace-token>, remove that token.
+    trim_whitespace(value);
+
+    // 7. Return the declaration.
+    return decl;
+}
+
+// https://www.w3.org/TR/css-syntax-3/#consume-style-block - appends declarations to decls and nested rules to rules until EOF
+void css_parser::consume_style_block_contents(/*out*/ raw_declaration::vector& decls, /*out*/ raw_rule::vector& rules)
+{
+ while (true)
+ {
+ // Repeatedly consume the next input token:
+ css_token token = next_token();
+
+ switch (token.type)
+ {
+ case WHITESPACE:
+ case ';':
+ break; // Do nothing.
+
+ case EOF:
+ // "Extend decls with rules, then return decls."
+ // NOTE: I just return decls and rules separately
+ return;
+
+ case AT_KEYWORD: {
+ // Reconsume the current input token. Consume an at-rule, and append the result to rules.
+ m_index--;
+ auto rule = consume_at_rule();
+ if (rule) rules.push_back(rule);
+ break;
+ }
+ case IDENT: {
+ // Initialize a temporary list initially filled with the current input token.
+ css_token_vector temp = { token };
+ // As long as the next input token is anything other than a <semicolon-token> or <EOF-token>,
+ // consume a component value and append it to the temporary list.
+ while (!is_one_of(peek_token().type, ';', EOF))
+ temp.push_back(consume_component_value());
+
+ css_parser parser(temp); // separate parser, so the declaration sees EOF where the temporary list ends
+ // Consume a declaration from the temporary list.
+ auto decl = parser.consume_declaration();
+ // If anything was returned, append it to decls.
+ if (decl) decls.push_back(decl);
+ break;
+ }
+ case '&': {
+ // Reconsume the current input token. Consume a qualified rule. If anything was returned, append it to rules.
+ m_index--;
+ auto rule = consume_qualified_rule();
+ if (rule) rules.push_back(rule);
+ break;
+ }
+ default:
+ // This is a parse error. Reconsume the current input token. As long as the next input token is
+ // anything other than a <;> or <EOF>, consume a component value and throw away the returned value.
+ css_parse_error("unexpected token in a style block");
+ m_index--;
+ while (!is_one_of(peek_token().type, ';', EOF))
+ consume_component_value();
+ break;
+ }
+ }
+}
+
+
+// https://www.w3.org/TR/css-syntax-3/#parse-comma-separated-list-of-component-values
+// Splits tokens at every ',' token. The result always holds at least one list:
+// an empty input yields {{}}.
+vector<css_token_vector> parse_comma_separated_list(const css_token_vector& tokens)
+{
+    vector<css_token_vector> result(1); // start with a single empty list
+
+    for (const auto& tok : tokens)
+    {
+        if (tok.type == ',') // Note: EOF token is not stored in arrays
+            result.emplace_back(); // a comma opens the next list
+        else
+            result.back().push_back(tok);
+    }
+
+    return result;
+}
+
+// https://drafts.csswg.org/css-syntax-3/#typedef-any-value
+// assumes that tokens have been componentized
+// Returns false for an empty list, for bad-string/bad-url tokens and unmatched closing
+// brackets, and when the contents of any nested component value fail the same check
+// (this includes nested component values with empty contents).
+bool is_any_value(const css_token_vector& tokens)
+{
+    if (tokens.empty()) return false;
+
+    for (const auto& tok : tokens)
+    {
+        const bool forbidden = is_one_of(tok.type, BAD_STRING, BAD_URL, ')', ']', '}');
+        if (forbidden) return false;
+
+        if (tok.is_component_value() && !is_any_value(tok.value)) return false;
+    }
+    return true;
+}
+
+// https://drafts.csswg.org/css-syntax-3/#typedef-declaration-value
+// assumes that tokens have been componentized
+// Checks the tokens from index to the end. Like is_any_value, but ';' and '!' are
+// additionally rejected at the top level.
+bool is_declaration_value(const css_token_vector& tokens, int index)
+{
+    const int count = (int)tokens.size();
+    if (index >= count) return false;
+
+    for (int pos = index; pos < count; pos++)
+    {
+        const auto& tok = tokens[pos];
+
+        const bool forbidden = is_one_of(tok.type, BAD_STRING, BAD_URL, ')', ']', '}', ';', '!');
+        if (forbidden) return false;
+
+        // Note: ';' '!' inside component values are allowed, so using is_any_value here.
+        if (tok.is_component_value() && !is_any_value(tok.value)) return false;
+    }
+    return true;
+}
+
+// Advances index past consecutive whitespace tokens and reports whether it moved.
+// Note: it is possible to have several whitespace tokens in a row: " /**/ /**/ "
+bool skip_whitespace(const css_token_vector& tokens, int& index)
+{
+    const int start = index;
+    for (; at(tokens, index).type == WHITESPACE; index++) {}
+    return start != index;
+}
+
+} // namespace litehtml \ No newline at end of file
diff --git a/libs/litehtml/src/css_tokenizer.cpp b/libs/litehtml/src/css_tokenizer.cpp
new file mode 100644
index 0000000000..b2b8761fa4
--- /dev/null
+++ b/libs/litehtml/src/css_tokenizer.cpp
@@ -0,0 +1,724 @@
+#include "html.h"
+#include "css_tokenizer.h"
+
+namespace litehtml
+{
+
+// Reporting hook called by the tokenizer whenever it hits a parse error.
+// Currently a no-op: the message is ignored. Re-enable the printf below to trace errors.
+void css_parse_error(string /*msg*/)
+{
+ //printf("%s\n", msg.c_str());
+}
+
+// Returns the identifier carried by an IDENT token, or "" for any other token type.
+// Names that start with "--" are returned unchanged; all others are lowercased.
+string css_token::ident() const
+{
+    if (type != IDENT) return "";
+
+    const bool starts_with_two_dashes = name.compare(0, 2, "--") == 0;
+    if (starts_with_two_dashes) return name;
+    return lowcase(name);
+}
+
+
+// Maps an opening bracket to its closing counterpart: '{' -> '}', '[' -> ']', '(' -> ')'.
+// Any other character is returned unchanged.
+char mirror(char c)
+{
+    switch (c)
+    {
+    case '{': return '}';
+    case '[': return ']';
+    case '(': return ')';
+    default:  return c;
+    }
+}
+
+// Returns the textual representation of this token.
+// Plain tokens return their stored `repr`. Component values are rebuilt from their children:
+// functions as name(children), blocks as the children wrapped in their bracket pair.
+string css_token::get_repr(bool insert_spaces) const
+{
+    if (!is_component_value()) return repr;
+
+    // The member function hides the free function of the same name; bring it back into scope.
+    using litehtml::get_repr;
+    const string inner = get_repr(value, 0, -1, insert_spaces);
+
+    if (type == CV_FUNCTION) return name + '(' + inner + ')';
+
+    // For block types the opening bracket is recovered from the type value itself.
+    const char open = char(-type - 100);
+    return open + inner + mirror(open);
+}
+
+// Concatenates the string representations of `count` tokens starting at `index`.
+//   count == -1    : take every token from `index` to the end of the list
+//   insert_spaces  : put a single space between adjacent tokens (never a trailing one)
+// The caller must make sure that [index, index + count) lies inside `tokens`.
+string get_repr(const css_token_vector& tokens, int index, int count, bool insert_spaces)
+{
+    if (count == -1) count = (int)tokens.size() - index;
+
+    string str;
+    for (int i = index; i < index + count; i++)
+    {
+        // Emit the separator before every token but the first. This gives the same text as
+        // appending a space after each token and stripping the last one, and it stays
+        // well-defined when the range is empty (there is nothing to strip).
+        if (insert_spaces && i != index) str += ' ';
+        str += tokens[i].get_repr(insert_spaces);
+    }
+    return str;
+}
+
+// https://www.w3.org/TR/css-syntax-3/#whitespace
+// True for newline, tab and space.
+bool css_tokenizer::is_whitespace(int ch) {
+    // NOTE: \r and \f are converted to \n in filter_code_points
+    switch (ch)
+    {
+    case '\n':
+    case '\t':
+    case ' ':
+        return true;
+    default:
+        return false;
+    }
+}
+
+// https://www.w3.org/TR/css-syntax-3/#non-printable-code-point
+// True for U+0000..U+0008, U+000B, U+000E..U+001F and U+007F.
+bool css_tokenizer::is_non_printable_code_point(int ch) {
+    if (ch == 0xB || ch == 0x7F) return true;
+    return (0 <= ch && ch <= 8) || (0xE <= ch && ch <= 0x1F);
+}
+
+// https://www.w3.org/TR/css-syntax-3/#ident-start-code-point
+// True for letters, '_' and any code point >= U+0080.
+bool css_tokenizer::is_ident_start_code_point(int ch) {
+    if (ch == '_' || ch >= 0x80) return true;
+    return is_letter(ch);
+}
+
+// https://www.w3.org/TR/css-syntax-3/#ident-code-point
+// True for ident-start code points, digits and '-'.
+bool css_tokenizer::is_ident_code_point(int ch) {
+    if (ch == '-' || is_digit(ch)) return true;
+    return is_ident_start_code_point(ch);
+}
+
+
+// Consumes the next input code point, remembers it in current_char and returns it.
+// When the next input char is known to be ASCII and not NUL, str[index++] can be used instead.
+int css_tokenizer::consume_char()
+{
+    // NOTE: if str[index] == 0 index is not incremented
+    current_char = read_utf8_char(str, index);
+    return current_char;
+}
+
+// https://www.w3.org/TR/css-syntax-3/#reconsume-the-current-input-code-point
+// The standard calls this "reconsume", but "unconsume" describes it better: the code point is
+// pushed back now and is actually consumed again by the next consume_char call.
+// When the current input char is known to be ASCII and index != 0, index-- can be used instead.
+void css_tokenizer::unconsume_char()
+{
+    // see comment for current_char
+    if (current_char != 0)
+    {
+        // NOTE: if index == 0 index is not decremented
+        prev_utf8_char(str, index);
+    }
+}
+
+// Returns the next input code point without consuming it.
+int css_tokenizer::peek_char()
+{
+    // Decode through a scratch copy of the position so that index stays where it is.
+    int lookahead = index;
+    return read_utf8_char(str, lookahead);
+}
+
+// Returns the next three input code points without consuming them.
+css_tokenizer::three_chars css_tokenizer::peek_chars()
+{
+    // Decode through a scratch copy of the position so that index stays where it is.
+    int lookahead = index;
+    const int first  = read_utf8_char(str, lookahead);
+    const int second = read_utf8_char(str, lookahead);
+    const int third  = read_utf8_char(str, lookahead);
+    return { first, second, third };
+}
+
+
+// https://www.w3.org/TR/css-syntax-3/#consume-comments
+// Skips every comment that starts at the current position (several may follow each other).
+// An unterminated comment is a parse error and consumes the rest of the input.
+void css_tokenizer::consume_comments()
+{
+    while (str[index] == '/' && str[index + 1] == '*')
+    {
+        const int close = (int)str.find("*/", index + 2);
+
+        if (close == -1)
+        {
+            index = (int)str.size();
+            css_parse_error("eof in comment");
+            return;
+        }
+
+        // Continue right after the closing "*/".
+        index = close + 2;
+    }
+}
+
+// https://www.w3.org/TR/css-syntax-3/#consume-escaped-code-point
+// Precondition: the U+005C (\) has already been consumed and the next input code point is not
+// a newline (see https://www.w3.org/TR/css-syntax-3/#check-if-two-code-points-are-a-valid-escape).
+// Returns the escaped code point, or U+FFFD for EOF and for invalid hex values.
+int css_tokenizer::consume_escaped_code_point()
+{
+    const int first = consume_char();
+
+    if (first == 0) // EOF
+    {
+        // This is a parse error. Return U+FFFD.
+        css_parse_error("eof in escaped codepoint");
+        return 0xFFFD;
+    }
+
+    // Anything that is not a hex digit simply stands for itself.
+    if (!is_hex_digit(first)) return first;
+
+    // Hex escape: one digit is already consumed, take up to 5 more.
+    int number = digit_value(first);
+    for (int remaining = 5; remaining > 0 && is_hex_digit(str[index]); remaining--)
+        number = number * 16 + digit_value(consume_char());
+
+    // A single whitespace directly after the hex digits belongs to the escape.
+    if (is_whitespace(str[index]))
+        consume_char();
+
+    // Zero, surrogates and values above the maximum allowed code point map to U+FFFD.
+    const bool invalid = number == 0 || is_surrogate(number) || number > 0x10FFFF;
+    return invalid ? 0xFFFD : number;
+}
+
+// https://www.w3.org/TR/css-syntax-3/#consume-string-token
+// Precondition: the opening quote has already been consumed; ending_code_point is the quote
+// that closes the string.
+// Returns a STRING token, or a BAD_STRING token when an unescaped newline is found.
+css_token css_tokenizer::consume_string_token(int ending_code_point)
+{
+    // The string value starts out empty and is built up in token.str.
+    css_token token(STRING);
+
+    for (;;)
+    {
+        const int ch = consume_char();
+
+        if (ch == 0) // EOF
+        {
+            // Parse error, but the string collected so far is still returned.
+            css_parse_error("eof in string");
+            return token;
+        }
+
+        if (ch == '\n')
+        {
+            // Parse error: put the newline back and give up on this string.
+            css_parse_error("newline in string");
+            unconsume_char();
+            return {BAD_STRING};
+        }
+
+        if (ch == '\\')
+        {
+            if (str[index] == 0)
+                continue;   // backslash right before EOF: do nothing
+            if (str[index] == '\n')
+            {
+                index++;    // escaped newline: consume it, nothing is appended
+                continue;
+            }
+            // Valid escape: append the escaped code point.
+            append_char(token.str, consume_escaped_code_point());
+            continue;
+        }
+
+        if (ch == ending_code_point)
+            return token;
+
+        // Anything else becomes part of the string value.
+        append_char(token.str, ch);
+    }
+}
+
+// https://www.w3.org/TR/css-syntax-3/#would-start-an-identifier
+// Tells whether the three given code points would start an ident sequence.
+// A "valid escape" is a backslash that is not followed by a newline.
+bool css_tokenizer::would_start_ident_sequence(three_chars chars)
+{
+    const int first  = chars._1;
+    const int second = chars._2;
+    const int third  = chars._3;
+
+    if (first == '-')
+    {
+        // '-' must be followed by an ident-start code point, another '-', or a valid escape.
+        return is_ident_start_code_point(second) || second == '-' || (second == '\\' && third != '\n');
+    }
+
+    if (is_ident_start_code_point(first))
+        return true;
+
+    // Otherwise only a valid escape in the first two code points can start an ident sequence.
+    return first == '\\' && second != '\n';
+}
+
+// https://www.w3.org/TR/css-syntax-3/#consume-name
+// Consumes ident code points and valid escapes for as long as possible and returns them as a
+// string. The first code point that does not belong to the sequence is left in the stream.
+string css_tokenizer::consume_ident_sequence()
+{
+    string name;
+
+    for (;;)
+    {
+        const int ch = consume_char();
+
+        if (is_ident_code_point(ch))
+        {
+            append_char(name, ch);
+            continue;
+        }
+
+        // "The stream starts with a valid escape": ch is the already consumed backslash
+        // and str[index] is the code point that follows it.
+        if (ch == '\\' && str[index] != '\n')
+        {
+            append_char(name, consume_escaped_code_point());
+            continue;
+        }
+
+        break;
+    }
+
+    // Put back the code point that ended the sequence.
+    unconsume_char();
+    return name;
+}
+
+// https://www.w3.org/TR/css-syntax-3/#starts-with-a-number
+// Tells whether the three code points x, y, z would start a number.
+bool css_tokenizer::would_start_a_number(int x, int y, int z)
+{
+    switch (x)
+    {
+    case '+':
+    case '-':
+        // A sign must be followed by a digit, or by '.' and a digit.
+        return is_digit(y) || (y == '.' && is_digit(z));
+
+    case '.':
+        // A leading '.' must be followed by a digit.
+        return is_digit(y);
+
+    default:
+        return is_digit(x);
+    }
+}
+
+// https://www.w3.org/TR/css-syntax-3/#convert-string-to-number
+// Converts the textual representation of a CSS number to a double.
+// The string is read left to right as seven optional components:
+//   sign, integer part, decimal point, fractional part, exponent indicator,
+//   exponent sign, exponent.
+// The result is s * (i + f * 10^-d) * 10^(t*e).
+double css_tokenizer::convert_string_to_number(const string& str)
+{
+    const char* cursor = str.c_str();
+
+    // Reads an optional '+' or '-'. Returns -1 for '-', otherwise 1.
+    auto read_sign = [&]() -> double
+    {
+        if (*cursor == '-') { cursor++; return -1; }
+        if (*cursor == '+') cursor++;
+        return 1;
+    };
+
+    // Reads zero or more digits as a base-10 integer (0 if there are none) and
+    // records how many digits were read in digit_count.
+    double digit_count = 0;
+    auto read_digits = [&]() -> double
+    {
+        double number = 0;
+        digit_count = 0;
+        while (is_digit(*cursor))
+        {
+            number = number * 10 + digit_value(*cursor++);
+            digit_count++;
+        }
+        return number;
+    };
+
+    const double s = read_sign();                       // 1. sign
+    const double i = read_digits();                     // 2. integer part
+    if (*cursor == '.') cursor++;                       // 3. decimal point
+    const double f = read_digits();                     // 4. fractional part ...
+    const double d = digit_count;                       //    ... and its number of digits
+    if (*cursor == 'e' || *cursor == 'E') cursor++;     // 5. exponent indicator
+    const double t = read_sign();                       // 6. exponent sign
+    const double e = read_digits();                     // 7. exponent
+
+    return s * (i + f * pow(10, -d)) * pow(10, t * e);
+}
+
+// https://www.w3.org/TR/css-syntax-3/#consume-number
+// Consumes a number from the input and returns its value.
+// `type` is set to css_number_integer unless a fractional part or an exponent is present,
+// in which case it is set to css_number_number.
+double css_tokenizer::consume_number(css_number_type& type)
+{
+    type = css_number_integer;
+    string repr;
+
+    // Consumes one (ASCII) input char and appends it to repr.
+    auto take = [&]() { append_char(repr, str[index++]); };
+    // Consumes digits for as long as possible.
+    auto take_digits = [&]() { while (is_digit(str[index])) take(); };
+
+    // Optional sign followed by the integer part.
+    if (is_one_of(str[index], '+', '-'))
+        take();
+    take_digits();
+
+    // Fractional part: '.' followed by at least one digit.
+    if (str[index] == '.' && is_digit(str[index + 1]))
+    {
+        take(); // '.'
+        take(); // first digit
+        type = css_number_number;
+        take_digits();
+    }
+
+    // Exponent: 'e' or 'E', optionally followed by a sign, followed by at least one digit.
+    const bool has_e       = lowcase(str[index]) == 'e';
+    const bool signed_exp  = has_e && is_one_of(str[index + 1], '+', '-') && is_digit(str[index + 2]);
+    const bool plain_exp   = has_e && is_digit(str[index + 1]);
+
+    if (signed_exp || plain_exp)
+    {
+        take(); // 'e'
+        take(); // sign or first digit
+        if (signed_exp) take(); // first digit after the sign
+        type = css_number_number;
+        take_digits();
+    }
+
+    return convert_string_to_number(repr);
+}
+
+// https://www.w3.org/TR/css-syntax-3/#consume-numeric-token
+// Consumes a number and returns it as one of:
+//   DIMENSION  - the number is directly followed by an ident sequence (its unit)
+//   PERCENTAGE - the number is directly followed by '%'
+//   NUMBER     - everything else
+css_token css_tokenizer::consume_numeric_token()
+{
+    css_number_type number_type;
+    const float value = (float)consume_number(number_type);
+
+    if (would_start_ident_sequence(peek_chars()))
+    {
+        // The unit is filled in after construction.
+        css_token dimension(DIMENSION, value, number_type);
+        dimension.unit = consume_ident_sequence();
+        return dimension;
+    }
+
+    if (str[index] != '%')
+        return {NUMBER, value, number_type};
+
+    index++; // consume '%'
+    return {PERCENTAGE, value}; // NOTE: number_type is unused in <percentage-token>
+}
+
+// https://www.w3.org/TR/css-syntax-3/#consume-remnants-of-bad-url
+// Skips input up to and including the closing ')' (or up to EOF), so that tokenizing can
+// resume after a malformed url. Escapes are consumed as a unit, so an escaped ')' does not
+// end the url.
+void css_tokenizer::consume_remnants_of_bad_url()
+{
+    for (int ch = consume_char(); ch != ')' && ch != 0; ch = consume_char())
+    {
+        // "The input stream starts with a valid escape": ch is the already consumed
+        // backslash and str[index] is the code point that follows it.
+        if (ch == '\\' && str[index] != '\n')
+            consume_escaped_code_point();
+        // anything else: do nothing
+    }
+}
+
+// https://www.w3.org/TR/css-syntax-3/#consume-url-token
+// Consumes an unquoted url. The caller (consume_ident_like_token) has already consumed "url(".
+// Returns a URL token whose str holds the url, or a BAD_URL token if the url is malformed
+// (in which case the input up to the closing ')' is skipped).
+css_token css_tokenizer::consume_url_token()
+{
+ // Initially create a <url-token> with its value set to the empty string.
+ css_token token(URL);
+
+ // Consume as much whitespace as possible.
+ while (is_whitespace(str[index]))
+ index++;
+
+ while (true)
+ {
+ // Repeatedly consume the next input code point from the stream:
+ int ch = consume_char();
+ switch (ch)
+ {
+ case ')':
+ // Return the <url-token>.
+ return token;
+
+ case 0: // EOF
+ // This is a parse error. Return the <url-token>.
+ css_parse_error("eof in unquoted url");
+ return token;
+
+ case '\n':
+ case '\t':
+ case ' ':
+ // Whitespace inside an unquoted url is only allowed as trailing whitespace before ')' or EOF.
+ // Consume as much whitespace as possible.
+ while (is_whitespace(str[index]))
+ index++;
+ // If the next input code point is U+0029 ()) or EOF, consume it and return the <url-token>
+ // (if EOF was encountered, this is a parse error);
+ if (str[index] == ')' || str[index] == 0)
+ {
+ if (str[index] == 0)
+ css_parse_error("eof in unquoted url");
+ else
+ index++; // consume ')'
+ return token;
+ }
+ // otherwise, consume the remnants of a bad url, create a <bad-url-token>, and return it.
+ consume_remnants_of_bad_url();
+ return {BAD_URL};
+
+ case '"':
+ case '\'':
+ case '(':
+ // The bad_url label is also the jump target for non-printable code points (see default).
+ bad_url:
+ // This is a parse error. Consume the remnants of a bad url, create a <bad-url-token>, and return it.
+ css_parse_error("invalid char in unquoted url");
+ consume_remnants_of_bad_url();
+ return {BAD_URL};
+
+ case '\\':
+ // If the stream starts with a valid escape, consume an escaped code point and
+ // append the returned code point to the <url-token>’s value.
+ // (the backslash is already consumed, so only the code point after it is checked here)
+ if (str[index] != '\n')
+ append_char(token.str, consume_escaped_code_point());
+ // Otherwise, this is a parse error. Consume the remnants of a bad url, create a <bad-url-token>, and return it.
+ else
+ {
+ css_parse_error("escaped newline in unquoted url");
+ consume_remnants_of_bad_url();
+ return {BAD_URL};
+ }
+ break;
+
+ default:
+ if (is_non_printable_code_point(ch))
+ goto bad_url;
+ else // anything else
+ // Append the current input code point to the <url-token>’s value.
+ append_char(token.str, ch);
+ break;
+ }
+ }
+}
+
+
+// https://www.w3.org/TR/css-syntax-3/#consume-ident-like-token
+// Consumes an ident sequence and returns one of:
+//   IDENT    - the sequence is not followed by '('
+//   FUNCTION - the sequence is followed by '(' (this includes url( with a quoted argument)
+//   URL / BAD_URL - the sequence is "url" (ASCII case-insensitive) followed by '(' and an
+//                   unquoted argument
+css_token css_tokenizer::consume_ident_like_token()
+{
+    auto name = consume_ident_sequence();
+
+    // Not followed by '(': a plain identifier.
+    if (str[index] != '(')
+        return {IDENT, name};
+
+    index++; // consume '('
+
+    // Any function other than url( needs no further lookahead.
+    if (lowcase(name) != "url")
+        return {FUNCTION, name};
+
+    // url( : skip all whitespace to find out whether the argument is quoted
+    // (not looking for 2 spaces as the standard does, see next comment).
+    while (is_whitespace(str[index]))
+        index++;
+
+    if (is_one_of(str[index], '"', '\''))
+    {
+        // This is not exactly what the standard says, but equivalent: step back over one
+        // whitespace char so that a whitespace token is preserved for the caller.
+        if (is_whitespace(str[index - 1])) index--;
+        return {FUNCTION, name};
+    }
+
+    // Unquoted argument: consume a url token and return it.
+    return consume_url_token();
+}
+
+// https://www.w3.org/TR/css-syntax-3/#consume-token
+// Consumes and returns the next token from the input. Leading comments are skipped first.
+// At the end of input a token of type _EOF is returned.
+// token.repr is set to the exact source text the token was read from (comments excluded).
+// NOTE(review): delim tokens are produced by assigning token.ch only; presumably ch and type
+// share storage in css_token - confirm against css_tokenizer.h.
+css_token css_tokenizer::consume_token()
+{
+ consume_comments();
+
+ css_token token;
+ // remember where the token starts so that its source text can be stored in token.repr
+ int start = index;
+
+ // Consume the next input code point.
+ int ch = consume_char();
+ // lookahead buffer, filled only by the cases that need it
+ three_chars next;
+
+ switch (ch)
+ {
+ // whitespace
+ case '\n':
+ case '\t':
+ case ' ':
+ // Consume as much whitespace as possible. Return a <whitespace-token>.
+ while (is_whitespace(str[index]))
+ index++;
+ token.type = WHITESPACE;
+ break;
+
+ case '"':
+ case '\'':
+ token = consume_string_token(ch);
+ break;
+
+ case '#':
+ // If the next input code point is an ident code point or the next two input code points are a valid escape, then:
+ if (is_ident_code_point(peek_char()) || (str[index] == '\\' && str[index+1] != '\n'))
+ {
+ // 1. Create a <hash-token>.
+ token.type = HASH;
+ // 2. If the next 3 input code points would start an ident sequence, set the <hash-token>’s type flag to "id".
+ token.hash_type = would_start_ident_sequence(peek_chars()) ? css_hash_id : css_hash_unrestricted;
+ // 3. Consume an ident sequence, and set the <hash-token>’s value to the returned string.
+ token.name = consume_ident_sequence();
+ // 4. Return the <hash-token>.
+ }
+ else
+ // Otherwise, return a <delim-token> with its value set to the current input code point.
+ token.ch = ch;
+ break;
+
+ case '+':
+ case '.':
+ // If the input stream starts with a number, reconsume the current input code point, consume a numeric token, and return it.
+ next = peek_chars();
+ if (would_start_a_number(ch, next._1, next._2))
+ {
+ unconsume_char();
+ token = consume_numeric_token();
+ }
+ else
+ // Otherwise, return a <delim-token> with its value set to the current input code point.
+ token.ch = ch;
+ break;
+
+ case '-':
+ // If the input stream starts with a number, reconsume the current input code point, consume a numeric token, and return it.
+ next = peek_chars();
+ if (would_start_a_number(ch, next._1, next._2))
+ {
+ unconsume_char();
+ token = consume_numeric_token();
+ }
+ // Otherwise, if the next 2 input code points are U+002D U+003E (->), consume them and return a <CDC-token>.
+ else if (next._1 == '-' && next._2 == '>')
+ {
+ index += 2;
+ token.type = CDC;
+ }
+ // Otherwise, if the input stream starts with an ident sequence, reconsume the current input code point,
+ // consume an ident-like token, and return it.
+ else if (would_start_ident_sequence({ ch, next._1, next._2 }))
+ {
+ unconsume_char();
+ token = consume_ident_like_token();
+ }
+ else
+ // Otherwise, return a <delim-token> with its value set to the current input code point.
+ token.ch = ch;
+ break;
+
+ case '<':
+ // If the next 3 input code points are !--, consume them and return a <CDO-token>.
+ if (match(str, index, "!--"))
+ {
+ index += 3;
+ token.type = CDO;
+ }
+ else
+ // Otherwise, return a <delim-token> with its value set to the current input code point.
+ token.ch = ch;
+ break;
+
+ case '@':
+ // If the next 3 input code points would start an ident sequence, consume an ident sequence,
+ // create an <at-keyword-token> with its value set to the returned value, and return it.
+ if (would_start_ident_sequence(peek_chars()))
+ {
+ token.type = AT_KEYWORD;
+ token.name = consume_ident_sequence();
+ }
+ else
+ // Otherwise, return a <delim-token> with its value set to the current input code point.
+ token.ch = ch;
+ break;
+
+ case '\\':
+ // If the input stream starts with a valid escape, reconsume the current input code point,
+ // consume an ident-like token, and return it.
+ if (str[index] != '\n')
+ {
+ unconsume_char();
+ token = consume_ident_like_token();
+ }
+ else
+ {
+ // Otherwise, this is a parse error. Return a <delim-token> with its value set to the current input code point.
+ css_parse_error("escaped newline outside a string");
+ token.ch = ch;
+ }
+ break;
+
+ case 0: // EOF
+ token.type = _EOF;
+ break;
+
+ default:
+ if (is_digit(ch))
+ {
+ // Reconsume the current input code point, consume a numeric token, and return it.
+ unconsume_char();
+ token = consume_numeric_token();
+ }
+ else if (is_ident_start_code_point(ch))
+ {
+ // Reconsume the current input code point, consume an ident-like token, and return it.
+ unconsume_char();
+ token = consume_ident_like_token();
+ }
+ else // anything else
+ // Return a <delim-token> with its value set to the current input code point.
+ token.ch = ch; // NOTE: :;,()[]{} tokens are also handled here
+ }
+
+ // Store the source text of the token; this also overwrites the (empty) repr of tokens
+ // that were assigned from one of the consume_*_token helpers above.
+ token.repr = str.substr(start, index - start);
+ return token;
+}
+
+// Tokenizes the whole input and returns the tokens in source order.
+// The terminating EOF token is not stored in the result.
+css_token_vector css_tokenizer::tokenize()
+{
+    css_token_vector tokens;
+    while (true)
+    {
+        css_token token = consume_token();
+        // consume_token marks the end of input with the _EOF token type. Test for that
+        // enumerator instead of relying on the <cstdio> EOF macro having the same value:
+        // consume_char does not advance at the end of input, so a mismatch would mean
+        // an endless loop here.
+        if (token.type == _EOF) break;
+        tokens.push_back(token);
+    }
+    return tokens;
+}
+
+
+} // namespace litehtml
diff --git a/libs/litehtml/src/html_microsyntaxes.cpp b/libs/litehtml/src/html_microsyntaxes.cpp
new file mode 100644
index 0000000000..f16bd622e0
--- /dev/null
+++ b/libs/litehtml/src/html_microsyntaxes.cpp
@@ -0,0 +1,102 @@
+#include "html.h"
+
+#include <climits>
+
+namespace litehtml
+{
+
+// https://html.spec.whatwg.org/multipage/common-microsyntaxes.html#rules-for-parsing-integers
+// Parses a base-10 integer from the start of str (optional leading whitespace, optional sign,
+// digits); anything after the digits is ignored.
+// Returns false and leaves val untouched if no digits could be parsed.
+// Values that do not fit into an int are clamped to INT_MIN / INT_MAX.
+bool html_parse_integer(const string& str, int& val)
+{
+    const char* ptr = str.c_str();
+    char* end;
+    // AFAICT strtol does exactly what's required by the standard
+    // Keep the result as long: where long is wider than int, assigning it straight to an int
+    // would silently wrap large values (e.g. "4294967297" would be read as 1).
+    const long n = strtol(ptr, &end, 10);
+    if (end == ptr) return false;
+
+    if (n > INT_MAX)
+        val = INT_MAX;
+    else if (n < INT_MIN)
+        val = INT_MIN;
+    else
+        val = (int)n;
+    return true;
+}
+
+// https://html.spec.whatwg.org/multipage/common-microsyntaxes.html#rules-for-parsing-non-negative-integers
+// Like html_parse_integer, but negative results count as failure.
+// val is written only on success.
+bool html_parse_non_negative_integer(const string& str, int& val)
+{
+    int parsed = 0;
+    const bool ok = html_parse_integer(str, parsed) && parsed >= 0;
+    if (ok) val = parsed;
+    return ok;
+}
+
+// https://html.spec.whatwg.org/multipage/common-microsyntaxes.html#rules-for-parsing-dimension-values
+// Parses a dimension value: optional leading whitespace, digits, and an optional fractional
+// part. Returns false (leaving result and type untouched) if the first non-whitespace
+// character is not a digit.
+// On success result receives the number, and type is html_percentage if the character right
+// after the number is '%', html_length otherwise.
+bool html_parse_dimension_value(const string& str, float& result, html_dimension_type& type)
+{
+    const char* cursor = str.c_str();
+
+    // Skip ASCII whitespace; the value must then start with a digit.
+    while (is_whitespace(*cursor)) cursor++;
+    if (!is_digit(*cursor)) return false;
+
+    // Integer part.
+    char* integer_end;
+    float value = (float)strtol(cursor, &integer_end, 10);
+    cursor = integer_end;
+
+    // Optional fractional part: each further digit adds digit / 10^k to the value.
+    if (*cursor == '.')
+    {
+        cursor++;
+        float divisor = 1;
+        while (is_digit(*cursor))
+        {
+            divisor *= 10;
+            value += digit_value(*cursor) / divisor;
+            cursor++;
+        }
+    }
+
+    // cursor now points right after the number. At the end of input the comparison below
+    // fails, so the value is returned as a length, as the standard requires.
+    result = value;
+    type = *cursor == '%' ? html_percentage : html_length;
+    return true;
+}
+
+// https://html.spec.whatwg.org/multipage/common-microsyntaxes.html#rules-for-parsing-non-zero-dimension-values
+// Like html_parse_dimension_value, but a value of 0 counts as failure.
+// val and type are written only on success.
+bool html_parse_nonzero_dimension_value(const string& str, float& val, html_dimension_type& type)
+{
+    float parsed;
+    html_dimension_type parsed_type;
+
+    const bool ok = html_parse_dimension_value(str, parsed, parsed_type);
+    if (!ok || parsed == 0)
+        return false;
+
+    val = parsed;
+    type = parsed_type;
+    return true;
+}
+
+} // namespace litehtml
diff --git a/libs/litehtml/src/internal.h b/libs/litehtml/src/internal.h
new file mode 100644
index 0000000000..9b53a436df
--- /dev/null
+++ b/libs/litehtml/src/internal.h
@@ -0,0 +1,26 @@
+#ifndef LH_INTERNAL_H
+#define LH_INTERNAL_H
+// internal.h should not be included in header files
+// internal.h should be included after all other headers in a source file
+
+namespace litehtml
+{
+
+// Membership test spelled as an operator: `x / xx` evaluates contains(xx, x).
+// Together with the `in` macro below this allows writing `x in xx`.
+// NOTE(review): the template is unconstrained, so it is a candidate for any `/` expression
+// with a class-type operand that is looked up in namespace litehtml.
+template<class T, class TT>
+bool operator/(const T& x, const TT& xx)
+{
+ return contains(xx, x);
+}
+// a in b if b contains a
+// The macro replaces every later `in` token with `/`; presumably this is why internal.h
+// has to be included after all other headers and never from a header file.
+#define in /
+
+/* Limitations of overloaded operators compared to regular function calls:
+* 1. at least one operand must be a class, so cannot just write `ch in "abc"`
+* (possible solution: ch in "abc"_s)
+* 2. operand cannot be initializer list (exception: assignment ops), so cannot just write `ch in {'a','b','c'}`
+* (possible solution: ch in ${'a','b','c'})
+*/
+
+} // namespace litehtml
+
+#endif // LH_INTERNAL_H \ No newline at end of file