diff options
author | George Hazan <george.hazan@gmail.com> | 2024-10-08 17:12:27 +0300 |
---|---|---|
committer | George Hazan <george.hazan@gmail.com> | 2024-10-08 17:12:27 +0300 |
commit | c917ae9a7abdfde50c0bb1ceb85b91b9e55aa641 (patch) | |
tree | 75f891aa74fb8b8bfed8dbe7fec67e0adce45120 /libs/litehtml/include | |
parent | a7551fa14b3ab1d5f999182313b0a820272e2be6 (diff) |
new files in LiteHtml
Diffstat (limited to 'libs/litehtml/include')
-rw-r--r-- | libs/litehtml/include/litehtml/css_parser.h | 52 | ||||
-rw-r--r-- | libs/litehtml/include/litehtml/css_tokenizer.h | 215 | ||||
-rw-r--r-- | libs/litehtml/include/litehtml/html_microsyntaxes.h | 21 |
3 files changed, 288 insertions, 0 deletions
diff --git a/libs/litehtml/include/litehtml/css_parser.h b/libs/litehtml/include/litehtml/css_parser.h new file mode 100644 index 0000000000..cbd88e3b84 --- /dev/null +++ b/libs/litehtml/include/litehtml/css_parser.h @@ -0,0 +1,52 @@ +#ifndef LH_CSS_PARSER_H +#define LH_CSS_PARSER_H + +#include "css_tokenizer.h" +#include "stylesheet.h" + +namespace litehtml +{ + +class css_parser +{ + css_token_vector m_tokens; + int m_index = 0; + + css_token next_token(); + css_token peek_token(); + +public: + css_parser() {} + css_parser(const css_token_vector& tokens) : m_tokens(tokens) {} + + static raw_rule::vector parse_stylesheet(const string& input, bool top_level); + static raw_rule::vector parse_stylesheet(const css_token_vector& input, bool top_level); + raw_rule::vector consume_list_of_rules(bool top_level); + raw_rule::ptr consume_qualified_rule(); + raw_rule::ptr consume_at_rule(); + css_token consume_simple_block(char opening_bracket); + css_token consume_component_value(); + css_token consume_function(const string& name); + + raw_declaration consume_declaration(); + void consume_style_block_contents(/*out*/ raw_declaration::vector& decls, /*out*/ raw_rule::vector& rules); +}; + +using keep_whitespace_fn = std::function<bool (const css_token& left, const css_token& right)>; +void remove_whitespace(css_token_vector& tokens, keep_whitespace_fn keep_whitespace = 0); + +enum { + f_componentize = 1, + f_remove_whitespace = 2 +}; +template<class Input> +css_token_vector normalize(Input input, int options = 0, keep_whitespace_fn keep_whitespace = 0); + +vector<css_token_vector> parse_comma_separated_list(const css_token_vector& tokens); +bool is_declaration_value(const css_token_vector& tokens, int index = 0); +bool is_any_value(const css_token_vector& tokens); +bool skip_whitespace(const css_token_vector& tokens, int& index); + +} // namespace litehtml + +#endif // LH_CSS_PARSER_H
// File: libs/litehtml/include/litehtml/css_tokenizer.h (new file in this commit)
#ifndef LH_CSS_TOKENIZER_H
#define LH_CSS_TOKENIZER_H

#include <cstdio>  // EOF
#include <new>     // placement new
#include <string>
#include <utility> // std::move
#include <vector>

// NOTE(review): `string` and `vector` are used unqualified below, so this header
// presumably relies on an earlier include that brings std::string / std::vector
// into namespace litehtml - confirm, or include that header here.

namespace litehtml
{

// https://www.w3.org/TR/css-syntax-3/#tokenization
// :;,()[]{} token or delim token: type == this char
// EOF token: type == EOF (-1)
// type may be 0 to indicate an error, see at()
enum css_token_type
{
	WHITESPACE = ' ',

	// Giving EOF and some chars explicit names to facilitate debugging and to get rid of
	// warning C4063: case '41' is not a valid value for switch of enum 'litehtml::css_token_type'
	_EOF = EOF,
	LEFT_BRACE = '{',
	RIGHT_BRACE = '}',
	LEFT_BRACKET = '[',
	RIGHT_BRACKET = ']',
	LEFT_PAREN = '(',
	RIGHT_PAREN = ')',
	COLON = ':',
	SEMICOLON = ';',
	COMMA = ',',
	BANG = '!',
	DOT = '.',
	AMPERSAND = '&',

	IDENT = -20, // do not collide with any unicode chars
	FUNCTION,    // calc(
	AT_KEYWORD,  // @media
	HASH,        // #foo
	STRING,      // "xxx" or 'xxx'
	BAD_STRING,
	URL,         // url(x.com) - but not url("x.com"), which is function + string + ')'
	BAD_URL,
	NUMBER,      // 25
	PERCENTAGE,  // 25%
	DIMENSION,   // 25px
	CDO,         // <!--
	CDC,         // -->

	// https://www.w3.org/tr/css-syntax-3/#component-value
	CV_FUNCTION = -100,
	// simple block:
	CURLY_BLOCK = -100 - '{',
	ROUND_BLOCK = -100 - '(',
	SQUARE_BLOCK = -100 - '['
};

enum css_number_type
{
	css_number_integer,
	css_number_number
};

enum css_hash_type
{
	css_hash_unrestricted,
	css_hash_id
};

// css_token: CSS token or component value ("fat" token)
// Tokens exist in uncomponentized form only a short time after tokenization, most of the time they are "fat".
// All functions in css_parser work regardless of whether tokens are fat or not, as per standard.
// All functions outside of css_parser that parse media queries, selectors, property values assume tokens are componentized.
//
// Implementation note: the payload lives in anonymous unions with non-trivial members
// (string, vector), so construction and destruction of those members is done by hand.
// `type` decides which member of the last union is alive:
//   HASH -> hash_type; NUMBER/PERCENTAGE/DIMENSION -> n; component values -> value.
struct css_token
{
	css_token(css_token_type type = css_token_type(),
		float number = 0, css_number_type number_type = css_number_integer, string str = "")
		: type(type), str(str), n{number, number_type}
	{
		// For component values the vector replaces the (trivial) number payload.
		if (is_component_value()) new(&value) vector<css_token>;
	}

	css_token(css_token_type type, const string& str)
		: type(type), str(str), n()
	{
		if (is_component_value()) new(&value) vector<css_token>;
	}

	css_token(const css_token& token) : type(token.type), str(token.str), repr(token.repr)
	{
		switch (type)
		{
		case HASH:
			hash_type = token.hash_type;
			break;

		case NUMBER:
		case PERCENTAGE:
		case DIMENSION:
			n = token.n;
			break;

		case CV_FUNCTION:
		case CURLY_BLOCK:
		case ROUND_BLOCK:
		case SQUARE_BLOCK:
			new(&value) vector<css_token>(token.value);
			break;

		default:;
		}
	}

	// Move constructor: steals the strings and (for component values) the child vector.
	// The source keeps its `type`, and a moved-from vector is still a live object,
	// so the source's destructor remains valid.
	css_token(css_token&& token) noexcept
		: type(token.type), str(std::move(token.str)), repr(std::move(token.repr))
	{
		switch (type)
		{
		case HASH:
			hash_type = token.hash_type;
			break;

		case NUMBER:
		case PERCENTAGE:
		case DIMENSION:
			n = token.n;
			break;

		case CV_FUNCTION:
		case CURLY_BLOCK:
		case ROUND_BLOCK:
		case SQUARE_BLOCK:
			new(&value) vector<css_token>(std::move(token.value));
			break;

		default:;
		}
	}

	// Copy assignment.
	// The source is copied BEFORE *this is destroyed because
	//  - `token` may be *this (self-assignment), or
	//  - `token` may live inside this->value (e.g. `tok = tok.value[0]`),
	// and in both cases destroying *this first would read a dead object.
	// If the copy throws, *this is left untouched; the rebuild itself is noexcept.
	css_token& operator=(const css_token& token)
	{
		if (this != &token)
		{
			css_token copy(token);
			this->~css_token();
			new(this) css_token(std::move(copy));
		}
		return *this;
	}

	~css_token()
	{
		// Union members are not destroyed automatically.
		str.~string();
		if (is_component_value()) value.~vector();
	}

	// True for CV_FUNCTION and the *_BLOCK types (all are <= CV_FUNCTION).
	bool is_component_value() const
	{
		return type <= CV_FUNCTION;
	}

	string ident() const;
	string get_repr(bool insert_spaces = false) const;

	union {
		css_token_type type;
		int ch; // used for <delim-token> or :;,()[]{}
	};
	union {
		string str;  // STRING, URL
		string name; // HASH, IDENT, AT_KEYWORD, FUNCTION, CV_FUNCTION
		string unit; // DIMENSION
	};
	struct number {
		float number;                // NUMBER, PERCENTAGE, DIMENSION
		css_number_type number_type; // NUMBER, DIMENSION
	};
	union {
		css_hash_type hash_type; // HASH
		number n;
		vector<css_token> value; // CV_FUNCTION, XXX_BLOCK
	};

	string repr; // https://www.w3.org/TR/css-syntax-3/#representation
};

using css_token_vector = vector<css_token>;
string get_repr(const css_token_vector& tokens, int index = 0, int count = -1, bool insert_spaces = false);

class css_tokenizer
{
public:
	css_tokenizer(const string& input) : str(input), index(0), current_char(0) {}

	css_token_vector tokenize();

private:
	// Input stream. Valid UTF-8; no NUL bytes. https://www.w3.org/TR/css-syntax-3/#input-stream
	string str;

	// Index of the next input char. https://www.w3.org/TR/css-syntax-3/#next-input-code-point
	int index;

	// https://www.w3.org/TR/css-syntax-3/#current-input-code-point
	// This is needed to handle the situation when unconsume_char is called when index == str.size().
	// We need to distinguish between the situation when we just read the last char and
	// the situation when we already have been at the end and just read NUL.
	// If we don't do this tokenizer will loop forever on input "a".
	int current_char;

private:
	static bool is_whitespace(int ch);
	static bool is_non_printable_code_point(int ch);
	static bool is_ident_start_code_point(int ch);
	static bool is_ident_code_point(int ch);

	struct three_chars { int _1, _2, _3; };

	int consume_char();
	void unconsume_char();
	int peek_char();
	three_chars peek_chars();

	void consume_comments();
	int consume_escaped_code_point();
	css_token consume_string_token(int ending_code_point);

	static bool would_start_ident_sequence(three_chars chars);
	string consume_ident_sequence();

	static bool would_start_a_number(int x, int y, int z);
	static double convert_string_to_number(const string& str);
	double consume_number(css_number_type& number_type);
	css_token consume_numeric_token();

	void consume_remnants_of_bad_url();
	css_token consume_url_token();

	css_token consume_ident_like_token();
	css_token consume_token();
};

void css_parse_error(string msg);

// Convenience wrapper: tokenizes `str` with a temporary css_tokenizer.
inline css_token_vector tokenize(const string& str)
{
	return css_tokenizer(str).tokenize();
}

} // namespace litehtml

#endif // LH_CSS_TOKENIZER_H
// File: libs/litehtml/include/litehtml/html_microsyntaxes.h (new file in this commit)
#ifndef LH_HTML_MICROSYNTAXES_H
#define LH_HTML_MICROSYNTAXES_H

#include <string>

// NOTE(review): `string` is used unqualified below, so this header presumably relies
// on an earlier include that brings std::string into namespace litehtml - confirm,
// or include that header here.

namespace litehtml
{

// Parsers for attribute-value microsyntaxes. Each takes the text in `str`, returns
// a bool and writes its result through the reference parameter(s).
// NOTE(review): presumably the return value signals parse success and the outputs are
// only meaningful on `true`; the definitions are not visible here - confirm.
bool html_parse_integer(const string& str, int& val);
bool html_parse_non_negative_integer(const string& str, int& val);

// Kind of value reported by the dimension parsers below.
enum html_dimension_type
{
	html_length,
	html_percentage
};

bool html_parse_dimension_value(const string& str, float& val, html_dimension_type& type);
bool html_parse_nonzero_dimension_value(const string& str, float& val, html_dimension_type& type);

} // namespace litehtml

#endif // LH_HTML_MICROSYNTAXES_H