summaryrefslogtreecommitdiff
path: root/libs/litehtml/src/gumbo/include/gumbo.h
blob: 27e6c6c575d8259fee279ffaacc7942e32e10bb7 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
// Copyright 2010 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Author: jdtang@google.com (Jonathan Tang)
//
// We use Gumbo as a prefix for types, gumbo_ as a prefix for functions, and
// GUMBO_ as a prefix for enum constants (static constants get the Google-style
// kGumbo prefix).

/**
 * @file
 * @mainpage Gumbo HTML Parser
 *
 * This provides a conformant, no-dependencies implementation of the HTML5
 * parsing algorithm.  It supports only UTF8; if you need to parse a different
 * encoding, run a preprocessing step to convert to UTF8.  It returns a parse
 * tree made of the structs in this file.
 *
 * Example:
 * @code
 *    GumboOutput* output = gumbo_parse(input);
 *    do_something_with_doctype(output->document);
 *    do_something_with_html_tree(output->root);
 *    gumbo_destroy_output(&options, output);
 * @endcode
 * HTML5 Spec:
 *
 * http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html
 */

#ifndef GUMBO_GUMBO_H_
#define GUMBO_GUMBO_H_

#ifdef _MSC_VER
#ifndef _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_WARNINGS
#endif
#ifndef fileno
#define fileno _fileno
#endif
#endif

#include <stdbool.h>
#include <stddef.h>

#ifdef __cplusplus
extern "C" {
#endif

/**
 * A struct representing a character position within the original text buffer.
 * Line and column numbers are 1-based and offsets are 0-based, which matches
 * how most editors and command-line tools work.  Also, columns measure
 * positions in terms of characters while offsets measure by bytes; this is
 * because the offset field is often used to pull out a particular region of
 * text (which in most languages that bind to C implies pointer arithmetic on a
 * buffer of bytes), while the column field is often used to reference a
 * particular column on a printable display, which nowadays is usually UTF-8.
 */
typedef struct {
  unsigned int line;
  unsigned int column;
  unsigned int offset;
} GumboSourcePosition;

/**
 * A SourcePosition used for elements that have no source position, i.e.
 * parser-inserted elements.
 */
extern const GumboSourcePosition kGumboEmptySourcePosition;

/**
 * A struct representing a string or part of a string.  Strings within the
 * parser are represented by a char* and a length; the char* points into
 * an existing data buffer owned by some other code (often the original input).
 * GumboStringPieces are assumed (by convention) to be immutable, because they
 * may share data.  Use GumboStringBuffer if you need to construct a string.
 * Clients should assume that it is not NUL-terminated, and should always use
 * explicit lengths when manipulating them.
 */
typedef struct {
  /** A pointer to the beginning of the string.  NULL iff length == 0. */
  const char* data;

  /** The length of the string fragment, in bytes.  May be zero. */
  size_t length;
} GumboStringPiece;

/** A constant to represent a 0-length null string. */
extern const GumboStringPiece kGumboEmptyString;

/**
 * Compares two GumboStringPieces, and returns true if they're equal or false
 * otherwise.
 */
bool gumbo_string_equals(
    const GumboStringPiece* str1, const GumboStringPiece* str2);

/**
 * Compares two GumboStringPieces ignoring case, and returns true if they're
 * equal or false otherwise.
 */
bool gumbo_string_equals_ignore_case(
    const GumboStringPiece* str1, const GumboStringPiece* str2);

/**
 * A simple vector implementation.  This stores a pointer to a data array and a
 * length.  All elements are stored as void*; client code must cast to the
 * appropriate type.  Overflows upon addition result in reallocation of the data
 * array, with the size doubling to maintain O(1) amortized cost.  There is no
 * removal function, as this isn't needed for any of the operations within this
 * library.  Iteration can be done through inspecting the structure directly in
 * a for-loop.
 */
typedef struct {
  /** Data elements.  This points to a dynamically-allocated array of capacity
   * elements, each a void* to the element itself.
   */
  void** data;

  /** Number of elements currently in the vector. */
  unsigned int length;

  /** Current array capacity. */
  unsigned int capacity;
} GumboVector;

/** An empty (0-length, 0-capacity) GumboVector. */
extern const GumboVector kGumboEmptyVector;

/**
 * Returns the first index at which an element appears in this vector (testing
 * by pointer equality), or -1 if it never does.
 */
int gumbo_vector_index_of(GumboVector* vector, const void* element);

/**
 * An enum for all the tags defined in the HTML5 standard.  These correspond to
 * the tag names themselves.  Enum constants exist only for tags which appear in
 * the spec itself (or for tags with special handling in the SVG and MathML
 * namespaces); any other tags appear as GUMBO_TAG_UNKNOWN and the actual tag
 * name can be obtained through original_tag.
 *
 * This is mostly for API convenience, so that clients of this library don't
 * need to perform a strcasecmp to find the normalized tag name.  It also has
 * efficiency benefits, by letting the parser work with enums instead of
 * strings.
 */
typedef enum {
// Load all the tags from an external source, generated from tag.in.
#include "gumbo/tag_enum.h"
  // Used for all tags that don't have special handling in HTML.  Add new tags
  // to the end of tag.in so as to preserve backwards-compatibility.
  GUMBO_TAG_UNKNOWN,
  // A marker value to indicate the end of the enum, for iterating over it.
  // Also used as the terminator for varargs functions that take tags.
  GUMBO_TAG_LAST,
} GumboTag;

/**
 * Returns the normalized (usually all-lowercased, except for foreign content)
 * tag name for an GumboTag enum.  Return value is static data owned by the
 * library.
 */
const char* gumbo_normalized_tagname(GumboTag tag);

/**
 * Extracts the tag name from the original_text field of an element or token by
 * stripping off </> characters and attributes and adjusting the passed-in
 * GumboStringPiece appropriately.  The tag name is in the original case and
 * shares a buffer with the original text, to simplify memory management.
 * Behavior is undefined if a string-piece that doesn't represent an HTML tag
 * (<tagname> or </tagname>) is passed in.  If the string piece is completely
 * empty (NULL data pointer), then this function will exit successfully as a
 * no-op.
 */
void gumbo_tag_from_original_text(GumboStringPiece* text);

/**
 * Fixes the case of SVG elements that are not all lowercase.
 * http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#parsing-main-inforeign
 * This is not done at parse time because there's no place to store a mutated
 * tag name.  tag_name is an enum (which will be TAG_UNKNOWN for most SVG tags
 * without special handling), while original_tag_name is a pointer into the
 * original buffer.  Instead, we provide this helper function that clients can
 * use to rename SVG tags as appropriate.
 * Returns the case-normalized SVG tagname if a replacement is found, or NULL if
 * no normalization is called for.  The return value is static data and owned by
 * the library.
 */
const char* gumbo_normalize_svg_tagname(const GumboStringPiece* tagname);

/**
 * Converts a tag name string (which may be in upper or mixed case) to a tag
 * enum. The `tag` version expects `tagname` to be NULL-terminated
 */
GumboTag gumbo_tag_enum(const char* tagname);
GumboTag gumbo_tagn_enum(const char* tagname, unsigned int length);

/**
 * Attribute namespaces.
 * HTML includes special handling for XLink, XML, and XMLNS namespaces on
 * attributes.  Everything else goes in the generic "NONE" namespace.
 */
typedef enum {
  GUMBO_ATTR_NAMESPACE_NONE,
  GUMBO_ATTR_NAMESPACE_XLINK,
  GUMBO_ATTR_NAMESPACE_XML,
  GUMBO_ATTR_NAMESPACE_XMLNS,
} GumboAttributeNamespaceEnum;

/**
 * A struct representing a single attribute on an HTML tag.  This is a
 * name-value pair, but also includes information about source locations and
 * original source text.
 */
typedef struct {
  /**
   * The namespace for the attribute.  This will usually be
   * GUMBO_ATTR_NAMESPACE_NONE, but some XLink/XMLNS/XML attributes take special
   * values, per:
   * http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#adjust-foreign-attributes
   */
  GumboAttributeNamespaceEnum attr_namespace;

  /**
   * The name of the attribute.  This is in a freshly-allocated buffer to deal
   * with case-normalization, and is null-terminated.
   */
  const char* name;

  /**
   * The original text of the attribute name, as a pointer into the original
   * source buffer.
   */
  GumboStringPiece original_name;

  /**
   * The value of the attribute.  This is in a freshly-allocated buffer to deal
   * with unescaping, and is null-terminated.  It does not include any quotes
   * that surround the attribute.  If the attribute has no value (for example,
   * 'selected' on a checkbox), this will be an empty string.
   */
  const char* value;

  /**
   * The original text of the value of the attribute.  This points into the
   * original source buffer.  It includes any quotes that surround the
   * attribute, and you can look at original_value.data[0] and
   * original_value.data[original_value.length - 1] to determine what the quote
   * characters were.  If the attribute has no value, this will be a 0-length
   * string.
   */
  GumboStringPiece original_value;

  /** The starting position of the attribute name. */
  GumboSourcePosition name_start;

  /**
   * The ending position of the attribute name.  This is not always derivable
   * from the starting position of the value because of the possibility of
   * whitespace around the = sign.
   */
  GumboSourcePosition name_end;

  /** The starting position of the attribute value. */
  GumboSourcePosition value_start;

  /** The ending position of the attribute value. */
  GumboSourcePosition value_end;
} GumboAttribute;

/**
 * Given a vector of GumboAttributes, look up the one with the specified name
 * and return it, or NULL if no such attribute exists.  This uses a
 * case-insensitive match, as HTML is case-insensitive.
 */
GumboAttribute* gumbo_get_attribute(const GumboVector* attrs, const char* name);

/**
 * Enum denoting the type of node.  This determines the type of the node.v
 * union.
 */
typedef enum {
  /** Document node.  v will be a GumboDocument. */
  GUMBO_NODE_DOCUMENT,
  /** Element node.  v will be a GumboElement. */
  GUMBO_NODE_ELEMENT,
  /** Text node.  v will be a GumboText. */
  GUMBO_NODE_TEXT,
  /** CDATA node. v will be a GumboText. */
  GUMBO_NODE_CDATA,
  /** Comment node.  v will be a GumboText, excluding comment delimiters. */
  GUMBO_NODE_COMMENT,
  /** Text node, where all contents is whitespace.  v will be a GumboText. */
  GUMBO_NODE_WHITESPACE,
  /** Template node.  This is separate from GUMBO_NODE_ELEMENT because many
   * client libraries will want to ignore the contents of template nodes, as
   * the spec suggests.  Recursing on GUMBO_NODE_ELEMENT will do the right thing
   * here, while clients that want to include template contents should also
   * check for GUMBO_NODE_TEMPLATE.  v will be a GumboElement.  */
  GUMBO_NODE_TEMPLATE
} GumboNodeType;

/**
 * Forward declaration of GumboNode so it can be used recursively in
 * GumboNode.parent.
 */
typedef struct GumboInternalNode GumboNode;

/**
 * http://www.whatwg.org/specs/web-apps/current-work/complete/dom.html#quirks-mode
 */
typedef enum {
  GUMBO_DOCTYPE_NO_QUIRKS,
  GUMBO_DOCTYPE_QUIRKS,
  GUMBO_DOCTYPE_LIMITED_QUIRKS
} GumboQuirksModeEnum;

/**
 * Namespaces.
 * Unlike in X(HT)ML, namespaces in HTML5 are not denoted by a prefix.  Rather,
 * anything inside an <svg> tag is in the SVG namespace, anything inside the
 * <math> tag is in the MathML namespace, and anything else is inside the HTML
 * namespace.  No other namespaces are supported, so this can be an enum only.
 */
typedef enum {
  GUMBO_NAMESPACE_HTML,
  GUMBO_NAMESPACE_SVG,
  GUMBO_NAMESPACE_MATHML
} GumboNamespaceEnum;

/**
 * Parse flags.
 * We track the reasons for parser insertion of nodes and store them in a
 * bitvector in the node itself.  This lets client code optimize out nodes that
 * are implied by the HTML structure of the document, or flag constructs that
 * may not be allowed by a style guide, or track the prevalence of incorrect or
 * tricky HTML code.
 */
typedef enum {
  /**
   * A normal node - both start and end tags appear in the source, nothing has
   * been reparented.
   */
  GUMBO_INSERTION_NORMAL = 0,

  /**
   * A node inserted by the parser to fulfill some implicit insertion rule.
   * This is usually set in addition to some other flag giving a more specific
   * insertion reason; it's a generic catch-all term meaning "The start tag for
   * this node did not appear in the document source".
   */
  GUMBO_INSERTION_BY_PARSER = 1 << 0,

  /**
   * A flag indicating that the end tag for this node did not appear in the
   * document source.  Note that in some cases, you can still have
   * parser-inserted nodes with an explicit end tag: for example, "Text</html>"
   * has GUMBO_INSERTED_BY_PARSER set on the <html> node, but
   * GUMBO_INSERTED_END_TAG_IMPLICITLY is unset, as the </html> tag actually
   * exists.  This flag will be set only if the end tag is completely missing;
   * in some cases, the end tag may be misplaced (eg. a </body> tag with text
   * afterwards), which will leave this flag unset and require clients to
   * inspect the parse errors for that case.
   */
  GUMBO_INSERTION_IMPLICIT_END_TAG = 1 << 1,

  // Value 1 << 2 was for a flag that has since been removed.

  /**
   * A flag for nodes that are inserted because their presence is implied by
   * other tags, eg. <html>, <head>, <body>, <tbody>, etc.
   */
  GUMBO_INSERTION_IMPLIED = 1 << 3,

  /**
   * A flag for nodes that are converted from their end tag equivalents.  For
   * example, </p> when no paragraph is open implies that the parser should
   * create a <p> tag and immediately close it, while </br> means the same thing
   * as <br>.
   */
  GUMBO_INSERTION_CONVERTED_FROM_END_TAG = 1 << 4,

  /** A flag for nodes that are converted from the parse of an <isindex> tag. */
  GUMBO_INSERTION_FROM_ISINDEX = 1 << 5,

  /** A flag for <image> tags that are rewritten as <img>. */
  GUMBO_INSERTION_FROM_IMAGE = 1 << 6,

  /**
   * A flag for nodes that are cloned as a result of the reconstruction of
   * active formatting elements.  This is set only on the clone; the initial
   * portion of the formatting run is a NORMAL node with an IMPLICIT_END_TAG.
   */
  GUMBO_INSERTION_RECONSTRUCTED_FORMATTING_ELEMENT = 1 << 7,

  /** A flag for nodes that are cloned by the adoption agency algorithm. */
  GUMBO_INSERTION_ADOPTION_AGENCY_CLONED = 1 << 8,

  /** A flag for nodes that are moved by the adoption agency algorithm. */
  GUMBO_INSERTION_ADOPTION_AGENCY_MOVED = 1 << 9,

  /**
   * A flag for nodes that have been foster-parented out of a table (or
   * should've been foster-parented, if verbatim mode is set).
   */
  GUMBO_INSERTION_FOSTER_PARENTED = 1 << 10,
} GumboParseFlags;

/**
 * Information specific to document nodes.
 */
typedef struct {
  /**
   * An array of GumboNodes, containing the children of this element.  This will
   * normally consist of the <html> element and any comment nodes found.
   * Pointers are owned.
   */
  GumboVector /* GumboNode* */ children;

  // True if there was an explicit doctype token as opposed to it being omitted.
  bool has_doctype;

  // Fields from the doctype token, copied verbatim.
  const char* name;
  const char* public_identifier;
  const char* system_identifier;

  /**
   * Whether or not the document is in QuirksMode, as determined by the values
   * in the GumboTokenDocType template.
   */
  GumboQuirksModeEnum doc_type_quirks_mode;
} GumboDocument;

/**
 * The struct used to represent TEXT, CDATA, COMMENT, and WHITESPACE elements.
 * This contains just a block of text and its position.
 */
typedef struct {
  /**
   * The text of this node, after entities have been parsed and decoded.  For
   * comment/cdata nodes, this does not include the comment delimiters.
   */
  const char* text;

  /**
   * The original text of this node, as a pointer into the original buffer.  For
   * comment/cdata nodes, this includes the comment delimiters.
   */
  GumboStringPiece original_text;

  /**
   * The starting position of this node.  This corresponds to the position of
   * original_text, before entities are decoded.
   * */
  GumboSourcePosition start_pos;
} GumboText;

/**
 * The struct used to represent all HTML elements.  This contains information
 * about the tag, attributes, and child nodes.
 */
typedef struct {
  /**
   * An array of GumboNodes, containing the children of this element.  Pointers
   * are owned.
   */
  GumboVector /* GumboNode* */ children;

  /** The GumboTag enum for this element. */
  GumboTag tag;

  /** The GumboNamespaceEnum for this element. */
  GumboNamespaceEnum tag_namespace;

  /**
   * A GumboStringPiece pointing to the original tag text for this element,
   * pointing directly into the source buffer.  If the tag was inserted
   * algorithmically (for example, <head> or <tbody> insertion), this will be a
   * zero-length string.
   */
  GumboStringPiece original_tag;

  /**
   * A GumboStringPiece pointing to the original end tag text for this element.
   * If the end tag was inserted algorithmically, (for example, closing a
   * self-closing tag), this will be a zero-length string.
   */
  GumboStringPiece original_end_tag;

  /** The source position for the start of the start tag. */
  GumboSourcePosition start_pos;

  /** The source position for the start of the end tag. */
  GumboSourcePosition end_pos;

  /**
   * An array of GumboAttributes, containing the attributes for this tag in the
   * order that they were parsed.  Pointers are owned.
   */
  GumboVector /* GumboAttribute* */ attributes;
} GumboElement;

/**
 * A supertype for GumboElement and GumboText, so that we can include one
 * generic type in lists of children and cast as necessary to subtypes.
 */
struct GumboInternalNode {
  /** The type of node that this is. */
  GumboNodeType type;

  /** Pointer back to parent node.  Not owned. */
  GumboNode* parent;

  /** The index within the parent's children vector of this node. */
  size_t index_within_parent;

  /**
   * A bitvector of flags containing information about why this element was
   * inserted into the parse tree, including a variety of special parse
   * situations.
   */
  GumboParseFlags parse_flags;

  /** The actual node data. */
  union {
    GumboDocument document;  // For GUMBO_NODE_DOCUMENT.
    GumboElement element;    // For GUMBO_NODE_ELEMENT.
    GumboText text;          // For everything else.
  } v;
};

/**
 * The type for an allocator function.  Takes the 'userdata' member of the
 * GumboParser struct as its first argument.  Semantics should be the same as
 * malloc, i.e. return a block of size_t bytes on success or NULL on failure.
 * Allocating a block of 0 bytes behaves as per malloc.
 */
// TODO(jdtang): Add checks throughout the codebase for out-of-memory condition.
typedef void* (*GumboAllocatorFunction)(void* userdata, size_t size);

/**
 * The type for a deallocator function.  Takes the 'userdata' member of the
 * GumboParser struct as its first argument.
 */
typedef void (*GumboDeallocatorFunction)(void* userdata, void* ptr);

/**
 * Input struct containing configuration options for the parser.
 * These let you specify alternate memory managers, provide different error
 * handling, etc.
 * Use kGumboDefaultOptions for sensible defaults, and only set what you need.
 */
typedef struct GumboInternalOptions {
  /** A memory allocator function.  Default: malloc. */
  GumboAllocatorFunction allocator;

  /** A memory deallocator function. Default: free. */
  GumboDeallocatorFunction deallocator;

  /**
   * An opaque object that's passed in as the first argument to all callbacks
   * used by this library.  Default: NULL.
   */
  void* userdata;

  /**
   * The tab-stop size, for computing positions in source code that uses tabs.
   * Default: 8.
   */
  int tab_stop;

  /**
   * Whether or not to stop parsing when the first error is encountered.
   * Default: false.
   */
  bool stop_on_first_error;

  /**
   * The maximum number of errors before the parser stops recording them.  This
   * is provided so that if the page is totally borked, we don't completely fill
   * up the errors vector and exhaust memory with useless redundant errors.  Set
   * to -1 to disable the limit.
   * Default: -1
   */
  int max_errors;

  /**
   * The fragment context for parsing:
   * https://html.spec.whatwg.org/multipage/syntax.html#parsing-html-fragments
   *
   * If GUMBO_TAG_LAST is passed here, it is assumed to be "no fragment", i.e.
   * the regular parsing algorithm.  Otherwise, pass the tag enum for the
   * intended parent of the parsed fragment.  We use just the tag enum rather
   * than a full node because that's enough to set all the parsing context we
   * need, and it provides some additional flexibility for client code to act as
   * if parsing a fragment even when a full HTML tree isn't available.
   *
   * Default: GUMBO_TAG_LAST
   */
  GumboTag fragment_context;

  /**
   * The namespace for the fragment context.  This lets client code
   * differentiate between, say, parsing a <title> tag in SVG vs. parsing it in
   * HTML.
   * Default: GUMBO_NAMESPACE_HTML
   */
  GumboNamespaceEnum fragment_namespace;
} GumboOptions;

/** Default options struct; use this with gumbo_parse_with_options. */
extern const GumboOptions kGumboDefaultOptions;

/** The output struct containing the results of the parse. */
typedef struct GumboInternalOutput {
  /**
   * Pointer to the document node.  This is a GumboNode of type NODE_DOCUMENT
   * that contains the entire document as its child.
   */
  GumboNode* document;

  /**
   * Pointer to the root node.  This the <html> tag that forms the root of the
   * document.
   */
  GumboNode* root;

  /**
   * A list of errors that occurred during the parse.
   * NOTE: In version 1.0 of this library, the API for errors hasn't been fully
   * fleshed out and may change in the future.  For this reason, the GumboError
   * header isn't part of the public API.  Contact us if you need errors
   * reported so we can work out something appropriate for your use-case.
   */
  GumboVector /* GumboError */ errors;
} GumboOutput;

/**
 * Parses a buffer of UTF8 text into an GumboNode parse tree.  The buffer must
 * live at least as long as the parse tree, as some fields (eg. original_text)
 * point directly into the original buffer.
 *
 * This doesn't support buffers longer than 4 gigabytes.
 */
GumboOutput* gumbo_parse(const char* buffer);

/**
 * Extended version of gumbo_parse that takes an explicit options structure,
 * buffer, and length.
 */
GumboOutput* gumbo_parse_with_options(
    const GumboOptions* options, const char* buffer, size_t buffer_length);

/** Release the memory used for the parse tree & parse errors. */
void gumbo_destroy_output(const GumboOptions* options, GumboOutput* output);

#ifdef __cplusplus
}
#endif

#endif  // GUMBO_GUMBO_H_