litehtml - lightweight html renderer

author: George Hazan <george.hazan@gmail.com> 2024-03-18 12:13:54 +0300
committer: George Hazan <george.hazan@gmail.com> 2024-03-18 12:13:54 +0300
commit: 705c4d24c9c61edffc82864bf9c24384dc29a8d7 (patch)
tree: 4d21f87671db36b99402da3221d45b64c257c1fe /libs/litehtml/src/gumbo/utf8.c
parent: 5784fc3a62b9136c6690ed45ec7b505f35512e08 (diff)
1 files changed, 270 insertions, 0 deletions
diff --git a/libs/litehtml/src/gumbo/utf8.c b/libs/litehtml/src/gumbo/utf8.c
new file mode 100644
index 0000000000..ad73cefa6a
--- /dev/null
+++ b/libs/litehtml/src/gumbo/utf8.c
@@ -0,0 +1,270 @@
+// Copyright 2010 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Author: jdtang@google.com (Jonathan Tang)
+
+#include "utf8.h"
+
+#include <assert.h>
+#include <stdint.h>
+#include <string.h>
+#include <strings.h>  // For strncasecmp.
+
+#include "error.h"
+#include "gumbo.h"
+#include "parser.h"
+#include "util.h"
+#include "vector.h"
+
+const int kUtf8ReplacementChar = 0xFFFD;
+
+// Reference material:
+// Wikipedia: http://en.wikipedia.org/wiki/UTF-8#Description
+// RFC 3629: http://tools.ietf.org/html/rfc3629
+// HTML5 Unicode handling:
+// http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html#preprocessing-the-input-stream
+//
+// This implementation is based on a DFA-based decoder by Bjoern Hoehrmann
+// <bjoern@hoehrmann.de>.  We wrap the inner table-based decoder routine in our
+// own handling for newlines, tabs, invalid continuation bytes, and other
+// conditions that the HTML5 spec fully specifies but normal UTF8 decoders do
+// not handle.
+// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.  Full text of
+// the license agreement and code follows.
+
+// Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to
+// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+// of the Software, and to permit persons to whom the Software is furnished to
+// do
+// so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+
+#define UTF8_ACCEPT 0
+#define UTF8_REJECT 12
+
+static const uint8_t utf8d[] = {
+    // The first part of the table maps bytes to character classes that
+    // to reduce the size of the transition table and create bitmasks.
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9,
+    9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 2, 2, 2, 2, 2, 2,
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 10,
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 11, 6, 6, 6, 5, 8, 8, 8, 8, 8,
+    8, 8, 8, 8, 8, 8,
+
+    // The second part is a transition table that maps a combination
+    // of a state of the automaton and a character class to a state.
+    0, 12, 24, 36, 60, 96, 84, 12, 12, 12, 48, 72, 12, 12, 12, 12, 12, 12, 12,
+    12, 12, 12, 12, 12, 12, 0, 12, 12, 12, 12, 12, 0, 12, 0, 12, 12, 12, 24, 12,
+    12, 12, 12, 12, 24, 12, 24, 12, 12, 12, 12, 12, 12, 12, 12, 12, 24, 12, 12,
+    12, 12, 12, 24, 12, 12, 12, 12, 12, 12, 12, 24, 12, 12, 12, 12, 12, 12, 12,
+    12, 12, 36, 12, 36, 12, 12, 12, 36, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12,
+    12, 36, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+};
+
+uint32_t static inline decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
+  uint32_t type = utf8d[byte];
+
+  *codep = (*state != UTF8_ACCEPT) ? (byte & 0x3fu) | (*codep << 6)
+                                   : (0xff >> type) & (byte);
+
+  *state = utf8d[256 + *state + type];
+  return *state;
+}
+
+// END COPIED CODE.
+
+// Adds a decoding error to the parser's error list, based on the current state
+// of the Utf8Iterator.
+static void add_error(Utf8Iterator* iter, GumboErrorType type) {
+  GumboParser* parser = iter->_parser;
+
+  GumboError* error = gumbo_add_error(parser);
+  if (!error) {
+    return;
+  }
+  error->type = type;
+  error->position = iter->_pos;
+  error->original_text = iter->_start;
+
+  // At the point the error is recorded, the code point hasn't been computed
+  // yet (and can't be, because it's invalid), so we need to build up the raw
+  // hex value from the bytes under the cursor.
+  uint64_t code_point = 0;
+  for (int i = 0; i < iter->_width; ++i) {
+    code_point = (code_point << 8) | (unsigned char) iter->_start[i];
+  }
+  error->v.codepoint = code_point;
+}
+
+// Reads the next UTF-8 character in the iter.
+// This assumes that iter->_start points to the beginning of the character.
+// When this method returns, iter->_width and iter->_current will be set
+// appropriately, as well as any error flags.
+static void read_char(Utf8Iterator* iter) {
+  if (iter->_start >= iter->_end) {
+    // No input left to consume; emit an EOF and set width = 0.
+    iter->_current = -1;
+    iter->_width = 0;
+    return;
+  }
+
+  uint32_t code_point = 0;
+  uint32_t state = UTF8_ACCEPT;
+  for (const char* c = iter->_start; c < iter->_end; ++c) {
+    decode(&state, &code_point, (uint32_t)(unsigned char) (*c));
+    if (state == UTF8_ACCEPT) {
+      iter->_width = (int)(c - iter->_start + 1);
+      // This is the special handling for carriage returns that is mandated by
+      // the HTML5 spec.  Since we're looking for particular 7-bit literal
+      // characters, we operate in terms of chars and only need a check for iter
+      // overrun, instead of having to read in a full next code point.
+      // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#preprocessing-the-input-stream
+      if (code_point == '\r') {
+        assert(iter->_width == 1);
+        const char* next = c + 1;
+        if (next < iter->_end && *next == '\n') {
+          // Advance the iter, as if the carriage return didn't exist.
+          ++iter->_start;
+          // Preserve the true offset, since other tools that look at it may be
+          // unaware of HTML5's rules for converting \r into \n.
+          ++iter->_pos.offset;
+        }
+        code_point = '\n';
+      }
+      if (utf8_is_invalid_code_point(code_point)) {
+        add_error(iter, GUMBO_ERR_UTF8_INVALID);
+        code_point = kUtf8ReplacementChar;
+      }
+      iter->_current = code_point;
+      return;
+    } else if (state == UTF8_REJECT) {
+      // We don't want to consume the invalid continuation byte of a multi-byte
+      // run, but we do want to skip past an invalid first byte.
+      iter->_width = c - iter->_start + (c == iter->_start);
+      iter->_current = kUtf8ReplacementChar;
+      add_error(iter, GUMBO_ERR_UTF8_INVALID);
+      return;
+    }
+  }
+  // If we got here without exiting early, then we've reached the end of the
+  // iterator.  Add an error for truncated input, set the width to consume the
+  // rest of the iterator, and emit a replacement character.  The next time we
+  // enter this method, it will detect that there's no input to consume and
+  // output an EOF.
+  iter->_current = kUtf8ReplacementChar;
+  iter->_width = iter->_end - iter->_start;
+  add_error(iter, GUMBO_ERR_UTF8_TRUNCATED);
+}
+
+static void update_position(Utf8Iterator* iter) {
+  iter->_pos.offset += (int)iter->_width;
+  if (iter->_current == '\n') {
+    ++iter->_pos.line;
+    iter->_pos.column = 1;
+  } else if (iter->_current == '\t') {
+    int tab_stop = iter->_parser->_options->tab_stop;
+    iter->_pos.column = ((iter->_pos.column / tab_stop) + 1) * tab_stop;
+  } else if (iter->_current != -1) {
+    ++iter->_pos.column;
+  }
+}
+
+// Returns true if this Unicode code point is in the list of characters
+// forbidden by the HTML5 spec, such as undefined control chars.
+bool utf8_is_invalid_code_point(int c) {
+  return (c >= 0x1 && c <= 0x8) || c == 0xB || (c >= 0xE && c <= 0x1F) ||
+         (c >= 0x7F && c <= 0x9F) || (c >= 0xFDD0 && c <= 0xFDEF) ||
+         ((c & 0xFFFF) == 0xFFFE) || ((c & 0xFFFF) == 0xFFFF);
+}
+
+void utf8iterator_init(GumboParser* parser, const char* source,
+    size_t source_length, Utf8Iterator* iter) {
+  iter->_start = source;
+  iter->_end = source + source_length;
+  iter->_pos.line = 1;
+  iter->_pos.column = 1;
+  iter->_pos.offset = 0;
+  iter->_parser = parser;
+  read_char(iter);
+}
+
+void utf8iterator_next(Utf8Iterator* iter) {
+  // We update positions based on the *last* character read, so that the first
+  // character following a newline is at column 1 in the next line.
+  update_position(iter);
+  iter->_start += iter->_width;
+  read_char(iter);
+}
+
+int utf8iterator_current(const Utf8Iterator* iter) { return iter->_current; }
+
+void utf8iterator_get_position(
+    const Utf8Iterator* iter, GumboSourcePosition* output) {
+  *output = iter->_pos;
+}
+
+const char* utf8iterator_get_char_pointer(const Utf8Iterator* iter) {
+  return iter->_start;
+}
+
+const char* utf8iterator_get_end_pointer(const Utf8Iterator* iter) {
+  return iter->_end;
+}
+
+bool utf8iterator_maybe_consume_match(Utf8Iterator* iter, const char* prefix,
+    size_t length, bool case_sensitive) {
+  bool matched = (iter->_start + length <= iter->_end) &&
+                 (case_sensitive ? !strncmp(iter->_start, prefix, length)
+                                 : !strncasecmp(iter->_start, prefix, length));
+  if (matched) {
+    for (unsigned int i = 0; i < length; ++i) {
+      utf8iterator_next(iter);
+    }
+    return true;
+  } else {
+    return false;
+  }
+}
+
+void utf8iterator_mark(Utf8Iterator* iter) {
+  iter->_mark = iter->_start;
+  iter->_mark_pos = iter->_pos;
+}
+
+// Returns the current input stream position to the mark.
+void utf8iterator_reset(Utf8Iterator* iter) {
+  iter->_start = iter->_mark;
+  iter->_pos = iter->_mark_pos;
+  read_char(iter);
+}
+
+// Sets the position and original text fields of an error to the value at the
+// mark.
+void utf8iterator_fill_error_at_mark(Utf8Iterator* iter, GumboError* error) {
+  error->position = iter->_mark_pos;
+  error->original_text = iter->_mark;
+}
author	George Hazan <george.hazan@gmail.com>	2024-03-18 12:13:54 +0300
committer	George Hazan <george.hazan@gmail.com>	2024-03-18 12:13:54 +0300
commit	705c4d24c9c61edffc82864bf9c24384dc29a8d7 (patch)
tree	4d21f87671db36b99402da3221d45b64c257c1fe /libs/litehtml/src/gumbo/utf8.c
parent	5784fc3a62b9136c6690ed45ec7b505f35512e08 (diff)