new files in LiteHtml

author: George Hazan <george.hazan@gmail.com> 2024-10-08 17:12:27 +0300
committer: George Hazan <george.hazan@gmail.com> 2024-10-08 17:12:27 +0300
commit: c917ae9a7abdfde50c0bb1ceb85b91b9e55aa641 (patch)
tree: 75f891aa74fb8b8bfed8dbe7fec67e0adce45120 /libs/litehtml/include
parent: a7551fa14b3ab1d5f999182313b0a820272e2be6 (diff)
3 files changed, 288 insertions, 0 deletions
diff --git a/libs/litehtml/include/litehtml/css_parser.h b/libs/litehtml/include/litehtml/css_parser.h
new file mode 100644
index 0000000000..cbd88e3b84
--- /dev/null
+++ b/libs/litehtml/include/litehtml/css_parser.h
@@ -0,0 +1,52 @@
+#ifndef LH_CSS_PARSER_H
+#define LH_CSS_PARSER_H
+
+#include "css_tokenizer.h"
+#include "stylesheet.h"
+
+namespace litehtml
+{
+
+class css_parser // CSS parser working on a css_token_vector; only the declarations are visible in this header
+{
+	css_token_vector m_tokens; // tokens to parse, copied from the constructor argument
+	int m_index = 0; // presumably the position of the next token in m_tokens - TODO confirm in the implementation
+
+	css_token next_token(); // presumably returns the token at m_index and advances - TODO confirm
+	css_token peek_token(); // presumably returns the token at m_index without advancing - TODO confirm
+
+public:
+	css_parser() {} // starts with an empty token list
+	css_parser(const css_token_vector& tokens) : m_tokens(tokens) {} // parses a private copy of tokens
+
+	static raw_rule::vector parse_stylesheet(const string& input,           bool top_level); // overload taking a string
+	static raw_rule::vector parse_stylesheet(const css_token_vector& input, bool top_level); // overload taking tokens
+	raw_rule::vector		consume_list_of_rules(bool top_level); // consume_* names presumably mirror the "consume a ..." algorithms of css-syntax-3 - TODO confirm
+	raw_rule::ptr			consume_qualified_rule();
+	raw_rule::ptr			consume_at_rule();
+	css_token				consume_simple_block(char opening_bracket); // opening_bracket: presumably one of '{', '(', '[' - TODO confirm
+	css_token				consume_component_value();
+	css_token				consume_function(const string& name);
+
+	raw_declaration			consume_declaration();
+	void					consume_style_block_contents(/*out*/ raw_declaration::vector& decls, /*out*/ raw_rule::vector& rules); // no return value; results go to the two out-parameters
+};
+
+using keep_whitespace_fn = std::function<bool (const css_token& left, const css_token& right)>; // presumably decides whether whitespace between left and right is kept - TODO confirm
+void remove_whitespace(css_token_vector& tokens, keep_whitespace_fn keep_whitespace = nullptr); // tokens is modified in place; nullptr yields an empty std::function
+
+enum { // presumably bit flags for the options parameter of normalize() - TODO confirm
+	f_componentize = 1,
+	f_remove_whitespace = 2 // distinct bits, so both values can be OR-ed together
+};
+template<class Input> // Input: presumably a string or a css_token_vector - TODO confirm against the definitions
+css_token_vector normalize(Input input, int options = 0, keep_whitespace_fn keep_whitespace = nullptr); // options defaults to 0 (no flags); nullptr yields an empty std::function
+
+vector<css_token_vector> parse_comma_separated_list(const css_token_vector& tokens); // presumably splits tokens at COMMA tokens - TODO confirm
+bool is_declaration_value(const css_token_vector& tokens, int index = 0); // index defaults to 0; presumably the position to start checking from - TODO confirm
+bool is_any_value(const css_token_vector& tokens); // presumably tests against the <any-value> production of css-syntax-3 - TODO confirm
+bool skip_whitespace(const css_token_vector& tokens, int& index); // index is in/out (non-const reference); the meaning of the bool result is not visible here
+
+} // namespace litehtml
+
+#endif // LH_CSS_PARSER_H
+\ No newline at end of file
diff --git a/libs/litehtml/include/litehtml/css_tokenizer.h b/libs/litehtml/include/litehtml/css_tokenizer.h
new file mode 100644
index 0000000000..119c925c79
--- /dev/null
+++ b/libs/litehtml/include/litehtml/css_tokenizer.h
@@ -0,0 +1,215 @@
+#ifndef LH_CSS_TOKENIZER_H
+#define LH_CSS_TOKENIZER_H
+
+namespace litehtml
+{
+
+// https://www.w3.org/TR/css-syntax-3/#tokenization
+// :;,()[]{} token or delim token: type == this char
+// EOF token: type == EOF (-1)
+// type may be 0 to indicate an error, see at()
+enum css_token_type // single-char tokens use the char itself as type; all named multi-char types are negative
+{
+	WHITESPACE = ' ',
+	
+	// Giving EOF and some chars explicit names to facilitate debugging and to get rid of warning C4063: case '41' is not a valid value for switch of enum 'litehtml::css_token_type'
+	_EOF = EOF, // NOTE(review): EOF is a <cstdio> macro and this header has no #include of its own; names starting with underscore + uppercase letter are reserved in C++
+	LEFT_BRACE = '{',
+	RIGHT_BRACE = '}',
+	LEFT_BRACKET = '[',
+	RIGHT_BRACKET = ']',
+	LEFT_PAREN = '(',
+	RIGHT_PAREN = ')',
+	COLON = ':',
+	SEMICOLON = ';',
+	COMMA = ',',
+	BANG = '!',
+	DOT = '.',
+	AMPERSAND = '&',
+	
+	IDENT = -20,    // negative, so it does not collide with any unicode chars; the following enumerators count up from here (-19, -18, ...)
+	FUNCTION,       // calc(
+	AT_KEYWORD,     // @media
+	HASH,           // #foo
+	STRING,         // "xxx" or 'xxx'
+	BAD_STRING,
+	URL,            // url(x.com)  - but not url("x.com"), which is function + string + ')'
+	BAD_URL,
+	NUMBER,         // 25
+	PERCENTAGE,     // 25%
+	DIMENSION,      // 25px
+	CDO,            // <!--
+	CDC,            // -->
+
+	// https://www.w3.org/TR/css-syntax-3/#component-value
+	CV_FUNCTION  = -100, // every component-value type is <= CV_FUNCTION, which is what css_token::is_component_value() tests
+	// simple blocks: -100 minus the code of the opening bracket char
+	CURLY_BLOCK  = -100 - '{',
+	ROUND_BLOCK  = -100 - '(',
+	SQUARE_BLOCK = -100 - '['
+};
+
+enum css_number_type // stored in css_token::number::number_type
+{
+	css_number_integer, // value 0; also the default number_type of css_token's first constructor
+	css_number_number
+};
+
+enum css_hash_type // stored in css_token::hash_type, which is used for HASH tokens
+{
+	css_hash_unrestricted, // presumably the "unrestricted" type flag of a css-syntax-3 <hash-token> - TODO confirm
+	css_hash_id            // presumably the "id" type flag - TODO confirm
+};
+
+// css_token: CSS token or component value ("fat" token)
+// Tokens exist in uncomponentized form only a short time after tokenization, most of the time they are "fat". 
+// All functions in css_parser work regardless of whether tokens are fat or not, as per standard.
+// All functions outside of css_parser that parse media queries, selectors, property values assume tokens are componentized.
+struct css_token // tagged union: `type` selects which members of the anonymous unions below are in use
+{
+	css_token(css_token_type type = css_token_type(), // a default-constructed type is 0
+		float number = 0, css_number_type number_type = css_number_integer, string str = "")
+		: type(type), str(str), n{number, number_type}
+	{
+		if (is_component_value()) new(&value) vector<css_token>; // component values hold a child list in the union that otherwise holds n
+	}
+
+	css_token(css_token_type type, const string& str) // token with a string payload; the numeric part is value-initialized
+		: type(type), str(str), n()
+	{
+		if (is_component_value()) new(&value) vector<css_token>;
+	}
+
+	css_token(const css_token& token) : type(token.type), str(token.str), repr(token.repr) // copies only the union member that belongs to type
+	{
+		switch (type)
+		{
+		case HASH:
+			hash_type = token.hash_type;
+			break;
+		
+		case NUMBER:
+		case PERCENTAGE:
+		case DIMENSION:
+			n = token.n;
+			break;
+		
+		case CV_FUNCTION:
+		case CURLY_BLOCK:
+		case ROUND_BLOCK:
+		case SQUARE_BLOCK:
+			new(&value) vector(token.value); // no vector exists in this object yet, so construct it in place instead of assigning
+			break;
+		default:; // per the member comments below, other types use none of hash_type / n / value
+		}
+	}
+	
+	css_token& operator=(const css_token& token)
+	{
+		if (this == &token) return *this; // self-assignment: the destructor call below would destroy the source as well
+		this->~css_token(); // NOTE: token must not be an element of this->value, it would be destroyed by this call
+		new(this) css_token(token); // rebuild in place so that the union member matching the new type is constructed
+		return *this;
+	}
+	
+	~css_token()
+	{
+		str.~string(); // members of an anonymous union are not destroyed automatically
+		if (is_component_value()) value.~vector();
+	}
+
+	bool is_component_value() const // true for CV_FUNCTION and the *_BLOCK types, whose enum values are all <= CV_FUNCTION
+	{
+		return type <= CV_FUNCTION;
+	}
+
+	string ident() const;
+	string get_repr(bool insert_spaces = false) const;
+
+	union {
+		css_token_type type;
+		int ch; // used for <delim-token> or :;,()[]{}
+	};
+	union {
+		string str;  // STRING, URL
+		string name; // HASH, IDENT, AT_KEYWORD, FUNCTION, CV_FUNCTION
+		string unit; // DIMENSION
+	};
+	struct number {
+		float number; // NUMBER, PERCENTAGE, DIMENSION
+		css_number_type number_type; // NUMBER, DIMENSION
+	};
+	union {
+		css_hash_type hash_type; // HASH
+		number n;
+		vector<css_token> value; // CV_FUNCTION, XXX_BLOCK
+	};
+	
+	string repr; // https://www.w3.org/TR/css-syntax-3/#representation
+};
+
+using css_token_vector = vector<css_token>; // same type as css_token::value, so token lists can nest
+string get_repr(const css_token_vector& tokens, int index = 0, int count = -1, bool insert_spaces = false); // presumably count == -1 means "up to the end of tokens" - TODO confirm
+
+class css_tokenizer // produces a css_token_vector from a string, see tokenize()
+{
+public:
+	css_tokenizer(const string& input) : str(input), index(0), current_char(0) {} // keeps its own copy of input and starts at index 0
+	
+	css_token_vector tokenize();
+
+private:
+	// Input stream. Valid UTF-8; no NUL bytes. https://www.w3.org/TR/css-syntax-3/#input-stream
+	string	str;
+
+	// Index of the next input char.  https://www.w3.org/TR/css-syntax-3/#next-input-code-point
+	int		index;
+
+	// https://www.w3.org/TR/css-syntax-3/#current-input-code-point
+	// This is needed to handle the situation when unconsume_char is called when index == str.size().
+	// We need to distinguish between the situation when we have just read the last char and
+	// the situation when we were already at the end and have just read NUL.
+	// If we don't do this, the tokenizer will loop forever on input "a".
+	int		current_char;
+
+private: // helper names below presumably follow the algorithm names of css-syntax-3 section 4.3 - TODO confirm
+	static bool is_whitespace(int ch);
+	static bool is_non_printable_code_point(int ch);
+	static bool is_ident_start_code_point(int ch);
+	static bool is_ident_code_point(int ch);
+
+	struct three_chars { int _1, _2, _3; }; // return type of peek_chars() and parameter type of would_start_ident_sequence()
+
+	int			consume_char();
+	void		unconsume_char();
+	int			peek_char();
+	three_chars peek_chars();
+
+	void		consume_comments();
+	int			consume_escaped_code_point();
+	css_token	consume_string_token(int ending_code_point);
+	
+	static bool	would_start_ident_sequence(three_chars chars);
+	string		consume_ident_sequence();
+
+	static bool	would_start_a_number(int x, int y, int z); // takes three separate ints, unlike would_start_ident_sequence()
+	static double convert_string_to_number(const string& str); // returns double, while css_token::number::number is a float
+	double		consume_number(css_number_type& number_type); // number_type is a non-const reference, presumably an out-parameter - TODO confirm
+	css_token	consume_numeric_token();
+
+	void		consume_remnants_of_bad_url();
+	css_token	consume_url_token();
+
+	css_token	consume_ident_like_token();
+	css_token	consume_token();
+};
+
+void css_parse_error(string msg); // takes msg by value; what is done with it is not visible in this header
+inline css_token_vector tokenize(const string& str) // convenience wrapper: tokenizes str with a temporary css_tokenizer
+{
+	return css_tokenizer(str).tokenize();
+}
+
+} // namespace litehtml
+
+#endif // LH_CSS_TOKENIZER_H
+\ No newline at end of file
diff --git a/libs/litehtml/include/litehtml/html_microsyntaxes.h b/libs/litehtml/include/litehtml/html_microsyntaxes.h
new file mode 100644
index 0000000000..193dd20d62
--- /dev/null
+++ b/libs/litehtml/include/litehtml/html_microsyntaxes.h
@@ -0,0 +1,21 @@
+#ifndef LH_HTML_MICROSYNTAXES_H
+#define LH_HTML_MICROSYNTAXES_H
+
+namespace litehtml
+{
+
+bool html_parse_integer(const string& str, int& val); // presumably returns true on success and stores the result in val - TODO confirm
+bool html_parse_non_negative_integer(const string& str, int& val); // same signature as html_parse_integer
+
+enum html_dimension_type // type of the `type` reference parameter of the html_parse_*dimension_value functions
+{
+	html_length,
+	html_percentage
+};
+
+bool html_parse_dimension_value(const string& str, float& val, html_dimension_type& type); // presumably returns true on success and fills val and type - TODO confirm
+bool html_parse_nonzero_dimension_value(const string& str, float& val, html_dimension_type& type); // same signature as html_parse_dimension_value
+
+} // namespace litehtml
+
+#endif // LH_HTML_MICROSYNTAXES_H
author	George Hazan <george.hazan@gmail.com>	2024-10-08 17:12:27 +0300
committer	George Hazan <george.hazan@gmail.com>	2024-10-08 17:12:27 +0300
commit	c917ae9a7abdfde50c0bb1ceb85b91b9e55aa641 (patch)
tree	75f891aa74fb8b8bfed8dbe7fec67e0adce45120 /libs/litehtml/include
parent	a7551fa14b3ab1d5f999182313b0a820272e2be6 (diff)