1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
|
#include "html.h"
#include "css_parser.h"
namespace litehtml
{
// https://www.w3.org/TR/css-syntax-3/#css-filter-code-points
// Rewrites `input` in place:
//   * "\r\n", a lone '\r' and '\f' each become a single '\n';
//   * every NUL byte becomes the UTF-8 encoding of U+FFFD (3 bytes).
// All other bytes are copied through unchanged.
void filter_code_points(string& input)
{
	const char* xFFFD = "\xEF\xBF\xBD";
	const size_t size = input.size();
	// Each NUL grows from 1 byte to 3, so this is an upper bound on the output length
	// (collapsing "\r\n" can only make the output shorter).
	const size_t null_count = std::count(input.begin(), input.end(), '\0');

	string result;
	result.reserve(size + 2 * null_count);
	// size_t indices: no truncation for inputs larger than INT_MAX.
	for (size_t i = 0; i < size; i++)
	{
		switch (input[i])
		{
		case '\r':
			result += '\n';
			if (i + 1 < size && input[i + 1] == '\n') i++; // skip \n after \r
			break;
		case '\f':
			result += '\n';
			break;
		case 0:
			result.append(xFFFD, 3);
			break;
		default:
			result += input[i];
		}
	}
	// The output was appended, so its length is already exact; hand it over without copying.
	input.swap(result);
}
// Deletes whitespace tokens from `tokens`, recursing into the value list of every
// component value. If `keep_whitespace` is non-null it is called with the tokens on
// either side of a whitespace token (a default-constructed css_token stands in for the
// left neighbour of the first token); the whitespace is kept when it returns true.
void remove_whitespace(css_token_vector& tokens, keep_whitespace_fn keep_whitespace)
{
	int pos = 0;
	while (pos < (int)tokens.size())
	{
		auto& current = tokens[pos];
		if (current.type != ' ')
		{
			if (current.is_component_value())
				remove_whitespace(current.value, keep_whitespace);
			pos++;
			continue;
		}

		const auto& before = pos > 0 ? tokens[pos - 1] : css_token();
		const auto& after = at(tokens, pos + 1);
		if (keep_whitespace && keep_whitespace(before, after))
			pos++;
		else
			remove(tokens, pos); // the following token now sits at `pos`, so do not advance
	}
}
// Replaces `tokens` with the sequence obtained by running a css_parser over it and
// collecting consume_component_value() results until an EOF token is returned.
void componentize(css_token_vector& tokens)
{
	css_parser parser(tokens);
	css_token_vector components;
	for (;;)
	{
		css_token component = parser.consume_component_value();
		if (component.type == EOF)
			break;
		components.push_back(component);
	}
	tokens = components;
}
// https://www.w3.org/TR/css-syntax-3/#normalize-into-a-token-stream
// Token-list specialization: runs the passes selected by the bits in `options`
// (componentize first, then whitespace removal) on the by-value copy and returns it.
template<>
css_token_vector normalize(css_token_vector input, int options, keep_whitespace_fn keep_whitespace)
{
	const bool want_components = (options & f_componentize) != 0;
	const bool strip_whitespace = (options & f_remove_whitespace) != 0;

	if (want_components)
		componentize(input);
	if (strip_whitespace)
		remove_whitespace(input, keep_whitespace);
	return input;
}
// String specialization: filters code points in the by-value copy of `input`,
// tokenizes it, then forwards the tokens to the token-list normalize.
template<>
css_token_vector normalize(string input, int options, keep_whitespace_fn keep_whitespace)
{
	filter_code_points(input);
	auto token_list = tokenize(input);
	auto normalized = normalize(token_list, options, keep_whitespace);
	return normalized;
}
// https://www.w3.org/TR/css-syntax-3/#parse-stylesheet
// No stylesheet object is created here: its only purpose would be to carry a list of
// rules to parse_css_stylesheet, so the list of rules is returned directly instead.
raw_rule::vector css_parser::parse_stylesheet(const string& input, bool top_level)
{
	// Step 1 (decode a byte stream) is not implemented: UTF-8 is always assumed.
	// The input is still passed through decode() to turn potentially broken UTF-8 into valid UTF-8.
	string decoded = decode(input, encoding::utf_8);

	// Step 2: normalize the input, then continue with the token-list overload.
	auto token_list = normalize(decoded);
	return parse_stylesheet(token_list, top_level);
}
// Steps 3-5 of parse-stylesheet: consume a list of rules from the already normalized
// `input`, passing `top_level` through as the top-level flag, and return that list.
raw_rule::vector css_parser::parse_stylesheet(const css_token_vector& input, bool top_level)
{
	css_parser parser(input);
	return parser.consume_list_of_rules(top_level);
}
// https://www.w3.org/TR/css-syntax-3/#consume-the-next-input-token
// Returns the token at m_index and advances m_index by one. Once m_index equals the
// number of tokens, an EOF token is returned and m_index is left unchanged.
css_token css_parser::next_token()
{
	const bool exhausted = m_index == (int)m_tokens.size();
	if (exhausted)
		return css_token_type(EOF);
	return m_tokens[m_index++];
}
// Returns the token at m_index without advancing, or an EOF token when m_index equals
// the number of tokens.
css_token css_parser::peek_token()
{
	const bool exhausted = m_index == (int)m_tokens.size();
	if (exhausted)
		return css_token_type(EOF);
	return m_tokens[m_index];
}
// https://www.w3.org/TR/css-syntax-3/#consume-list-of-rules
// Consumes tokens until EOF and returns the rules that were parsed along the way.
raw_rule::vector css_parser::consume_list_of_rules(bool top_level)
{
	raw_rule::vector collected;
	for (;;)
	{
		css_token current = next_token();

		// <EOF-token>: return the list of rules.
		if (current.type == EOF)
			return collected;

		// <whitespace-token>: do nothing.
		if (current.type == WHITESPACE)
			continue;

		// <CDO-token> / <CDC-token>: ignored when the top-level flag is set,
		// otherwise handled like any other token below (as a qualified rule).
		const bool is_cdo_cdc = current.type == CDO || current.type == CDC;
		if (is_cdo_cdc && top_level)
			continue;

		// Everything else: reconsume the current token, then consume an at-rule for
		// <at-keyword-token> or a qualified rule otherwise. Null results are not appended.
		m_index--;
		raw_rule::ptr parsed = current.type == AT_KEYWORD ? consume_at_rule() : consume_qualified_rule();
		if (parsed)
			collected.push_back(parsed);
	}
}
// https://www.w3.org/TR/css-syntax-3/#consume-qualified-rule
// Returns the qualified rule once its block has been found, or nullptr if EOF is hit first.
raw_rule::ptr css_parser::consume_qualified_rule()
{
	// New qualified rule: the prelude and block are filled in by the loop below.
	raw_rule::ptr result = make_shared<raw_rule>(raw_rule::qualified);
	for (;;)
	{
		css_token current = next_token();

		if (current.type == EOF)
		{
			// Parse error: return nothing.
			css_parse_error("eof in qualified rule");
			return nullptr;
		}
		if (current.type == '{')
		{
			// Consume a simple block; it becomes the rule's block.
			result->block = consume_simple_block('{');
			return result;
		}
		if (current.type == CURLY_BLOCK)
		{
			// Already a {}-block: use it as the rule's block.
			result->block = current;
			return result;
		}

		// Anything else: reconsume, consume a component value, append it to the prelude.
		m_index--;
		css_token component = consume_component_value();
		result->prelude.push_back(component);
	}
}
// https://www.w3.org/TR/css-syntax-3/#consume-at-rule
// Consumes one at-rule. The rule is returned in every case, including on EOF.
raw_rule::ptr css_parser::consume_at_rule()
{
	// The next input token supplies the at-rule's name; the prelude and block are
	// filled in by the loop below.
	css_token keyword = next_token();
	raw_rule::ptr result = make_shared<raw_rule>(raw_rule::at, keyword.str);
	for (;;)
	{
		css_token current = next_token();

		if (current.type == ';')
			return result;
		if (current.type == EOF)
		{
			// Parse error: return the at-rule as it stands.
			css_parse_error("eof in at-rule");
			return result;
		}
		if (current.type == '{')
		{
			// Consume a simple block; it becomes the at-rule's block.
			result->block = consume_simple_block('{');
			return result;
		}
		if (current.type == CURLY_BLOCK)
		{
			// Already a {}-block: use it as the at-rule's block.
			result->block = current;
			return result;
		}

		// Anything else: reconsume, consume a component value, append it to the prelude.
		m_index--;
		css_token component = consume_component_value();
		result->prelude.push_back(component);
	}
}
char mirror(char c);
// https://www.w3.org/TR/css-syntax-3/#consume-simple-block
// Collects component values into a block token until the bracket returned by
// mirror(opening_bracket) or EOF is reached. The block is returned in both cases.
css_token css_parser::consume_simple_block(char opening_bracket)
{
	// The block's token type is derived from the opening bracket; see css_token_type.
	const auto kind = css_token_type(-100 - opening_bracket);
	css_token result(kind);
	const char closer = mirror(opening_bracket);
	for (;;)
	{
		css_token current = next_token();

		if (current.type == closer)
			return result;
		if (current.type == EOF)
		{
			css_parse_error("eof in simple block");
			return result;
		}

		// Reconsume the current token, consume a component value, append it to the block.
		m_index--;
		css_token component = consume_component_value();
		result.value.push_back(component);
	}
}
// https://www.w3.org/TR/css-syntax-3/#consume-component-value
// Consumes the next token and returns either the simple block or function it opens,
// or the token itself.
css_token css_parser::consume_component_value()
{
	css_token current = next_token();

	// <{-token>, <[-token> or <(-token>: consume a simple block and return it.
	const bool opens_block = current.type == '{' || current.type == '[' || current.type == '(';
	if (opens_block)
		return consume_simple_block((char)current.ch);

	// <function-token>: consume a function and return it.
	if (current.type == FUNCTION)
		return consume_function(current.name);

	// Otherwise return the token unchanged.
	return current;
}
// https://www.w3.org/TR/css-syntax-3/#consume-function
// Builds a CV_FUNCTION token named `name` and fills its value with component values
// until ')' or EOF is reached. The function token is returned in both cases.
css_token css_parser::consume_function(const string& name)
{
	css_token result(CV_FUNCTION, name);
	for (;;)
	{
		css_token current = next_token();

		if (current.type == ')')
			return result;
		if (current.type == EOF)
		{
			css_parse_error("eof in function");
			return result;
		}

		// Reconsume the current token, consume a component value, append it to the function's value.
		m_index--;
		css_token component = consume_component_value();
		result.value.push_back(component);
	}
}
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Removes whitespace tokens from both ends of `tokens`: first all leading ones,
// then all trailing ones.
void trim_whitespace(css_token_vector& tokens)
{
	for (;;)
	{
		if (at(tokens, 0).type != ' ') break;
		remove(tokens, 0);
	}
	for (;;)
	{
		if (at(tokens, -1).type != ' ') break;
		remove(tokens, -1);
	}
}
// https://www.w3.org/TR/css-syntax-3/#consume-declaration
// next token is guaranteed to be IDENT
// Returns the parsed declaration, or a default-constructed raw_declaration ("nothing")
// when the name is not followed by ':'.
raw_declaration css_parser::consume_declaration()
{
	// Consume the next input token. Create a new declaration with its name set to the value of
	// the current input token and its value initially set to an empty list.
	css_token token = next_token();
	raw_declaration decl = {token.name};
	auto& value = decl.value;

	// 1. While the next input token is a <whitespace-token>, consume the next input token.
	while (peek_token().type == ' ') next_token();

	// 2. If the next input token is anything other than a <colon-token>, this is a parse error. Return nothing.
	if (peek_token().ch != ':')
	{
		css_parse_error("consume_declaration: ':' not found");
		return {};
	}
	// Otherwise, consume the next input token.
	next_token();

	// 3. While the next input token is a <whitespace-token>, consume the next input token.
	while (peek_token().type == ' ') next_token();

	// 4. As long as the next input token is anything other than an <EOF-token>,
	//    consume a component value and append it to the declaration's value.
	while (peek_token().type != EOF)
		value.push_back(consume_component_value());

	// 5. If the last two non-<whitespace-token>s in the declaration's value are a <delim-token> with the value "!"
	//    followed by an <ident-token> with a value that is an ASCII case-insensitive match for "important",
	//    remove them from the declaration's value and set the declaration's important flag to true.
	trim_whitespace(value); // deviation from standard: removing leading whitespace as well
	if (at(value, -1).ident() == "important")
	{
		// Whitespace may separate "!" from "important" ("! important"), so step back over
		// it to reach the non-whitespace token that precedes the ident.
		int bang = (int)value.size() - 2;
		while (bang >= 0 && value[bang].type == ' ') bang--;
		if (bang >= 0 && value[bang].ch == '!')
		{
			// Drop "!", any whitespace after it, and the trailing "important".
			remove(value, bang, (int)value.size() - bang);
			decl.important = true;
		}
	}

	// 6. While the last token in the declaration's value is a <whitespace-token>, remove that token.
	trim_whitespace(value);

	// 7. Return the declaration.
	return decl;
}
// https://www.w3.org/TR/css-syntax-3/#consume-style-block
// Consumes tokens until EOF, appending parsed declarations to `decls` and parsed
// at-rules / qualified rules to `rules`.
// NOTE: the spec says "extend decls with rules, then return decls"; here the two lists
// are handed back separately through the out-parameters.
void css_parser::consume_style_block_contents(/*out*/ raw_declaration::vector& decls, /*out*/ raw_rule::vector& rules)
{
	for (;;)
	{
		css_token current = next_token();

		// <EOF-token>: done.
		if (current.type == EOF)
			return;

		// <whitespace-token> / <semicolon-token>: do nothing.
		if (current.type == WHITESPACE || current.type == ';')
			continue;

		if (current.type == AT_KEYWORD)
		{
			// Reconsume the current token, consume an at-rule, append the result to rules.
			m_index--;
			auto nested = consume_at_rule();
			if (nested) rules.push_back(nested);
			continue;
		}

		if (current.type == IDENT)
		{
			// Temporary list: the current token plus every component value up to,
			// but not including, the next <semicolon-token> or <EOF-token>.
			css_token_vector pending = { current };
			while (!is_one_of(peek_token().type, ';', EOF))
				pending.push_back(consume_component_value());

			// Consume a declaration from the temporary list; append it if anything was returned.
			css_parser sub_parser(pending);
			auto parsed = sub_parser.consume_declaration();
			if (parsed) decls.push_back(parsed);
			continue;
		}

		if (current.type == '&')
		{
			// Reconsume the current token, consume a qualified rule, append it if anything was returned.
			m_index--;
			auto nested = consume_qualified_rule();
			if (nested) rules.push_back(nested);
			continue;
		}

		// Anything else is a parse error: reconsume the current token and discard
		// component values until the next <semicolon-token> or <EOF-token>.
		css_parse_error("unexpected token in a style block");
		m_index--;
		while (!is_one_of(peek_token().type, ';', EOF))
			consume_component_value();
	}
}
// https://www.w3.org/TR/css-syntax-3/#parse-comma-separated-list-of-component-values
// Splits `tokens` at every ',' token; the commas themselves are dropped.
// Note: result is never empty. If input is empty result is {{}}.
vector<css_token_vector> parse_comma_separated_list(const css_token_vector& tokens)
{
	// Start with one empty group; each comma opens a new one.
	vector<css_token_vector> groups(1);
	for (const auto& tok : tokens)
	{
		// Note: EOF token is not stored in arrays
		if (tok.type == ',')
			groups.emplace_back();
		else
			groups.back().push_back(tok);
	}
	return groups;
}
// https://drafts.csswg.org/css-syntax-3/#typedef-any-value
// assumes that tokens have been componentized
// Returns false for an empty list, for a list containing BAD_STRING, BAD_URL, ')', ']'
// or '}', and when any nested component value's own value list fails this same check.
// NOTE(review): because of the empty-list rule, a nested component value with an empty
// value list also makes the result false - confirm this is intended.
bool is_any_value(const css_token_vector& tokens)
{
	if (tokens.empty())
		return false;

	for (const auto& tok : tokens)
	{
		const bool forbidden = is_one_of(tok.type, BAD_STRING, BAD_URL, ')', ']', '}');
		if (forbidden)
			return false;
		if (!tok.is_component_value())
			continue;
		if (!is_any_value(tok.value))
			return false;
	}
	return true;
}
// https://drafts.csswg.org/css-syntax-3/#typedef-declaration-value
// assumes that tokens have been componentized
// Checks the tokens from `index` to the end. Returns false when that range is empty,
// when it contains BAD_STRING, BAD_URL, ')', ']', '}', ';' or '!' at the top level,
// or when a nested component value's value list fails is_any_value.
bool is_declaration_value(const css_token_vector& tokens, int index)
{
	const int count = (int)tokens.size();
	if (index >= count)
		return false;

	for (int pos = index; pos < count; pos++)
	{
		const auto& tok = tokens[pos];
		const bool forbidden = is_one_of(tok.type, BAD_STRING, BAD_URL, ')', ']', '}', ';', '!');
		if (forbidden)
			return false;
		// Note: ';' and '!' are allowed inside component values, hence is_any_value here.
		if (tok.is_component_value() && !is_any_value(tok.value))
			return false;
	}
	return true;
}
// Advances `index` past consecutive whitespace tokens and reports whether it moved.
// Note: it is possible to have several whitespace tokens in a row: " /**/ /**/ "
bool skip_whitespace(const css_token_vector& tokens, int& index)
{
	const int initial = index;
	for (; at(tokens, index).type == ' '; index++)
	{
	}
	return initial != index;
}
} // namespace litehtml
|