Gumbo  1.0
A C library for parsing HTML.
 All Data Structures Files Functions Variables Typedefs Enumerations Enumerator Pages
gumbo.h
Go to the documentation of this file.
1 // Copyright 2010 Google Inc. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 // Author: jdtang@google.com (Jonathan Tang)
16 //
17 // We use Gumbo as a prefix for types, gumbo_ as a prefix for functions, and
18 // GUMBO_ as a prefix for enum constants (static constants get the Google-style
19 // kGumbo prefix).
20 
42 #ifndef GUMBO_GUMBO_H_
43 #define GUMBO_GUMBO_H_
44 
// MSVC-only compatibility shim: defines _CRT_SECURE_NO_WARNINGS (the macro the
// Microsoft CRT checks to suppress its "secure" deprecation warnings) and maps
// fileno to the underscore-prefixed _fileno.
// NOTE(review): both macros are defined unconditionally in a public header, so
// they leak into every translation unit that includes it, and
// _CRT_SECURE_NO_WARNINGS may produce a macro-redefinition warning when the
// build already defines it — consider wrapping it in #ifndef.
45 #ifdef _MSC_VER
46 #define _CRT_SECURE_NO_WARNINGS
47 #define fileno _fileno
48 #endif
49 
50 #include <stdbool.h>
51 #include <stddef.h>
52 
53 #ifdef __cplusplus
54 extern "C" {
55 #endif
56 
67 typedef struct {
68  unsigned int line;
69  unsigned int column;
70  unsigned int offset;
72 
78 
79 
89 typedef struct {
91  const char* data;
92 
94  size_t length;
96 
99 
105  const GumboStringPiece* str1, const GumboStringPiece* str2);
106 
112  const GumboStringPiece* str1, const GumboStringPiece* str2);
113 
114 
// Container of untyped element pointers: `data` is an array of void*, so users
// must cast each element to the type they expect.
// NOTE(review): presumably `length` is the number of elements in use and
// `capacity` the number of slots allocated in `data` — confirm against the
// vector implementation.
124 typedef struct {
128  void** data;
129 
131  unsigned int length;
132 
134  unsigned int capacity;
135 } GumboVector;
136 
// Declaration of a shared, read-only GumboVector constant; the definition lives
// in another translation unit.
// NOTE(review): by its name this is an empty vector — verify its field values
// at the definition.
138 extern const GumboVector kGumboEmptyVector;
139 
// Searches `vector` for `element` and reports its position as an int.
// NOTE(review): presumably compares by pointer identity and returns -1 when
// the element is absent — confirm against the implementation.
144 int gumbo_vector_index_of(GumboVector* vector, void* element);
145 
146 
// Enumeration of the tags that have a dedicated constant. Enumerators are
// grouped by topic, and each group is preceded by a link to the corresponding
// section of the WHATWG HTML specification.
// No enumerator has an explicit value, so they are numbered consecutively
// starting with GUMBO_TAG_HTML == 0; inserting or reordering entries changes
// the numeric value of everything after the edit.
// GUMBO_TAG_UNKNOWN stands for any tag without special handling, and
// GUMBO_TAG_LAST is a sentinel (end-of-enum marker / varargs terminator), not a
// real tag.
159 typedef enum {
160  // http://www.whatwg.org/specs/web-apps/current-work/multipage/semantics.html#the-root-element
161  GUMBO_TAG_HTML,
162  // http://www.whatwg.org/specs/web-apps/current-work/multipage/semantics.html#document-metadata
163  GUMBO_TAG_HEAD,
164  GUMBO_TAG_TITLE,
165  GUMBO_TAG_BASE,
166  GUMBO_TAG_LINK,
167  GUMBO_TAG_META,
168  GUMBO_TAG_STYLE,
169  // http://www.whatwg.org/specs/web-apps/current-work/multipage/scripting-1.html#scripting-1
170  GUMBO_TAG_SCRIPT,
171  GUMBO_TAG_NOSCRIPT,
172  GUMBO_TAG_TEMPLATE,
173  // http://www.whatwg.org/specs/web-apps/current-work/multipage/sections.html#sections
174  GUMBO_TAG_BODY,
175  GUMBO_TAG_ARTICLE,
176  GUMBO_TAG_SECTION,
177  GUMBO_TAG_NAV,
178  GUMBO_TAG_ASIDE,
179  GUMBO_TAG_H1,
180  GUMBO_TAG_H2,
181  GUMBO_TAG_H3,
182  GUMBO_TAG_H4,
183  GUMBO_TAG_H5,
184  GUMBO_TAG_H6,
185  GUMBO_TAG_HGROUP,
186  GUMBO_TAG_HEADER,
187  GUMBO_TAG_FOOTER,
188  GUMBO_TAG_ADDRESS,
189  // http://www.whatwg.org/specs/web-apps/current-work/multipage/grouping-content.html#grouping-content
190  GUMBO_TAG_P,
191  GUMBO_TAG_HR,
192  GUMBO_TAG_PRE,
193  GUMBO_TAG_BLOCKQUOTE,
194  GUMBO_TAG_OL,
195  GUMBO_TAG_UL,
196  GUMBO_TAG_LI,
197  GUMBO_TAG_DL,
198  GUMBO_TAG_DT,
199  GUMBO_TAG_DD,
200  GUMBO_TAG_FIGURE,
201  GUMBO_TAG_FIGCAPTION,
202  GUMBO_TAG_MAIN,
203  GUMBO_TAG_DIV,
204  // http://www.whatwg.org/specs/web-apps/current-work/multipage/text-level-semantics.html#text-level-semantics
205  GUMBO_TAG_A,
206  GUMBO_TAG_EM,
207  GUMBO_TAG_STRONG,
208  GUMBO_TAG_SMALL,
209  GUMBO_TAG_S,
210  GUMBO_TAG_CITE,
211  GUMBO_TAG_Q,
212  GUMBO_TAG_DFN,
213  GUMBO_TAG_ABBR,
214  GUMBO_TAG_DATA,
215  GUMBO_TAG_TIME,
216  GUMBO_TAG_CODE,
217  GUMBO_TAG_VAR,
218  GUMBO_TAG_SAMP,
219  GUMBO_TAG_KBD,
220  GUMBO_TAG_SUB,
221  GUMBO_TAG_SUP,
222  GUMBO_TAG_I,
223  GUMBO_TAG_B,
224  GUMBO_TAG_U,
225  GUMBO_TAG_MARK,
226  GUMBO_TAG_RUBY,
227  GUMBO_TAG_RT,
228  GUMBO_TAG_RP,
229  GUMBO_TAG_BDI,
230  GUMBO_TAG_BDO,
231  GUMBO_TAG_SPAN,
232  GUMBO_TAG_BR,
233  GUMBO_TAG_WBR,
234  // http://www.whatwg.org/specs/web-apps/current-work/multipage/edits.html#edits
235  GUMBO_TAG_INS,
236  GUMBO_TAG_DEL,
237  // http://www.whatwg.org/specs/web-apps/current-work/multipage/embedded-content-1.html#embedded-content-1
238  GUMBO_TAG_IMAGE,
239  GUMBO_TAG_IMG,
240  GUMBO_TAG_IFRAME,
241  GUMBO_TAG_EMBED,
242  GUMBO_TAG_OBJECT,
243  GUMBO_TAG_PARAM,
244  GUMBO_TAG_VIDEO,
245  GUMBO_TAG_AUDIO,
246  GUMBO_TAG_SOURCE,
247  GUMBO_TAG_TRACK,
248  GUMBO_TAG_CANVAS,
249  GUMBO_TAG_MAP,
250  GUMBO_TAG_AREA,
251  // http://www.whatwg.org/specs/web-apps/current-work/multipage/the-map-element.html#mathml
252  GUMBO_TAG_MATH,
253  GUMBO_TAG_MI,
254  GUMBO_TAG_MO,
255  GUMBO_TAG_MN,
256  GUMBO_TAG_MS,
257  GUMBO_TAG_MTEXT,
258  GUMBO_TAG_MGLYPH,
259  GUMBO_TAG_MALIGNMARK,
260  GUMBO_TAG_ANNOTATION_XML,
261  // http://www.whatwg.org/specs/web-apps/current-work/multipage/the-map-element.html#svg-0
262  GUMBO_TAG_SVG,
263  GUMBO_TAG_FOREIGNOBJECT,
264  GUMBO_TAG_DESC,
265  // SVG title tags will have GUMBO_TAG_TITLE as with HTML.
266  // http://www.whatwg.org/specs/web-apps/current-work/multipage/tabular-data.html#tabular-data
267  GUMBO_TAG_TABLE,
268  GUMBO_TAG_CAPTION,
269  GUMBO_TAG_COLGROUP,
270  GUMBO_TAG_COL,
271  GUMBO_TAG_TBODY,
272  GUMBO_TAG_THEAD,
273  GUMBO_TAG_TFOOT,
274  GUMBO_TAG_TR,
275  GUMBO_TAG_TD,
276  GUMBO_TAG_TH,
277  // http://www.whatwg.org/specs/web-apps/current-work/multipage/forms.html#forms
278  GUMBO_TAG_FORM,
279  GUMBO_TAG_FIELDSET,
280  GUMBO_TAG_LEGEND,
281  GUMBO_TAG_LABEL,
282  GUMBO_TAG_INPUT,
283  GUMBO_TAG_BUTTON,
284  GUMBO_TAG_SELECT,
285  GUMBO_TAG_DATALIST,
286  GUMBO_TAG_OPTGROUP,
287  GUMBO_TAG_OPTION,
288  GUMBO_TAG_TEXTAREA,
289  GUMBO_TAG_KEYGEN,
290  GUMBO_TAG_OUTPUT,
291  GUMBO_TAG_PROGRESS,
292  GUMBO_TAG_METER,
293  // http://www.whatwg.org/specs/web-apps/current-work/multipage/interactive-elements.html#interactive-elements
294  GUMBO_TAG_DETAILS,
295  GUMBO_TAG_SUMMARY,
296  GUMBO_TAG_MENU,
297  GUMBO_TAG_MENUITEM,
298  // Non-conforming elements that nonetheless appear in the HTML5 spec.
299  // http://www.whatwg.org/specs/web-apps/current-work/multipage/obsolete.html#non-conforming-features
300  GUMBO_TAG_APPLET,
301  GUMBO_TAG_ACRONYM,
302  GUMBO_TAG_BGSOUND,
303  GUMBO_TAG_DIR,
304  GUMBO_TAG_FRAME,
305  GUMBO_TAG_FRAMESET,
306  GUMBO_TAG_NOFRAMES,
307  GUMBO_TAG_ISINDEX,
308  GUMBO_TAG_LISTING,
309  GUMBO_TAG_XMP,
310  GUMBO_TAG_NEXTID,
311  GUMBO_TAG_NOEMBED,
312  GUMBO_TAG_PLAINTEXT,
313  GUMBO_TAG_RB,
314  GUMBO_TAG_STRIKE,
315  GUMBO_TAG_BASEFONT,
316  GUMBO_TAG_BIG,
317  GUMBO_TAG_BLINK,
318  GUMBO_TAG_CENTER,
319  GUMBO_TAG_FONT,
320  GUMBO_TAG_MARQUEE,
321  GUMBO_TAG_MULTICOL,
322  GUMBO_TAG_NOBR,
323  GUMBO_TAG_SPACER,
324  GUMBO_TAG_TT,
325  // Used for all tags that don't have special handling in HTML.
326  GUMBO_TAG_UNKNOWN,
327  // A marker value to indicate the end of the enum, for iterating over it.
328  // Also used as the terminator for varargs functions that take tags.
329  GUMBO_TAG_LAST,
330 } GumboTag;
331 
// Maps a GumboTag value to a C string.
// NOTE(review): presumably the canonical tag name in storage owned by the
// library (the return type is const char*) — confirm ownership and what is
// returned for GUMBO_TAG_UNKNOWN / GUMBO_TAG_LAST.
337 const char* gumbo_normalized_tagname(GumboTag tag);
338 
350 
// Takes a tag name as a GumboStringPiece (pointer + length, so it need not be
// NUL-terminated) and returns a C string.
// NOTE(review): presumably returns the canonical-case SVG tag name, or NULL
// when `tagname` is not a known SVG tag — confirm against the implementation.
363 const char* gumbo_normalize_svg_tagname(const GumboStringPiece* tagname);
364 
// Converts a tag name given as a C string into the matching GumboTag value.
// NOTE(review): presumably yields GUMBO_TAG_UNKNOWN for names without a
// dedicated enumerator; case sensitivity is not visible here — confirm.
369 GumboTag gumbo_tag_enum(const char* tagname);
370 
376 typedef enum {
377  GUMBO_ATTR_NAMESPACE_NONE,
378  GUMBO_ATTR_NAMESPACE_XLINK,
379  GUMBO_ATTR_NAMESPACE_XML,
380  GUMBO_ATTR_NAMESPACE_XMLNS,
382 
388 typedef struct {
396 
401  const char* name;
402 
408 
415  const char* value;
416 
426 
429 
436 
439 
443 
// Looks up an attribute by `name` in the vector `attrs` and returns a pointer
// to it. The vector itself is not modified (it is taken as const).
// NOTE(review): presumably `attrs` holds GumboAttribute* elements and the
// result is NULL when no attribute matches; whether the name comparison is
// case-insensitive is not visible here — confirm.
449 GumboAttribute* gumbo_get_attribute(const GumboVector* attrs, const char* name);
450 
455 typedef enum {
468 } GumboNodeType;
469 
// Forward declaration: introduces the name GumboNode for struct
// GumboInternalNode so that declarations in this header can refer to GumboNode*
// before the struct body is defined.
474 typedef struct GumboInternalNode GumboNode;
475 
477 typedef enum {
478  GUMBO_DOCTYPE_NO_QUIRKS,
479  GUMBO_DOCTYPE_QUIRKS,
480  GUMBO_DOCTYPE_LIMITED_QUIRKS
482 
490 typedef enum {
491  GUMBO_NAMESPACE_HTML,
492  GUMBO_NAMESPACE_SVG,
493  GUMBO_NAMESPACE_MATHML
495 
504 typedef enum {
510 
518 
531 
532  // Value 1 << 2 was for a flag that has since been removed.
533 
539 
547 
550 
553 
560 
563 
566 
573 
574 
// Per-node payload for document nodes (the `document` member of the node's
// union, used for GUMBO_NODE_DOCUMENT).
578 typedef struct {
// Child nodes of the document; elements are GumboNode* stored as void*.
584  GumboVector /* GumboNode* */ children;
585 
586  // True if there was an explicit doctype token as opposed to it being omitted.
587  bool has_doctype;
588 
589  // Fields from the doctype token, copied verbatim.
590  const char* name;
591  const char* public_identifier;
592  const char* system_identifier;
593 
599 } GumboDocument;
600 
// Per-node payload used for every node type other than document and element
// (the `text` member of the node's union).
605 typedef struct {
// The node's text as a C string.
// NOTE(review): encoding and ownership are not visible here — confirm.
610  const char* text;
611 
617 
623 } GumboText;
624 
// Per-node payload for element nodes (the `element` member of the node's
// union, used for GUMBO_NODE_ELEMENT).
629 typedef struct {
// Child nodes of this element; elements are GumboNode* stored as void*.
634  GumboVector /* GumboNode* */ children;
635 
638 
641 
649 
656 
659 
662 
// Attributes of this element; elements are GumboAttribute* stored as void*.
667  GumboVector /* GumboAttribute* */ attributes;
668 } GumboElement;
669 
677 
679  GumboNode* parent;
680 
683 
690 
692  union {
693  GumboDocument document; // For GUMBO_NODE_DOCUMENT.
694  GumboElement element; // For GUMBO_NODE_ELEMENT.
695  GumboText text; // For everything else.
696  } v;
697 };
698 
// Signature of a user-supplied allocation callback: receives an opaque
// `userdata` pointer and the requested `size` in bytes, and returns the
// allocated memory as void*.
// NOTE(review): presumably malloc-like; behavior on failure is not defined
// here, and the TODO below indicates out-of-memory is not checked everywhere.
705 // TODO(jdtang): Add checks throughout the codebase for out-of-memory condition.
706 typedef void* (*GumboAllocatorFunction)(void* userdata, size_t size);
707 
// Signature of a user-supplied deallocation callback: receives an opaque
// `userdata` pointer and the pointer `ptr` to release; returns nothing.
// NOTE(review): presumably free-like and paired with GumboAllocatorFunction —
// confirm whether `ptr` may be NULL.
712 typedef void (*GumboDeallocatorFunction)(void* userdata, void* ptr);
713 
// Options structure passed by const pointer to the option-taking entry points
// declared later in this header.
720 typedef struct GumboInternalOptions {
723 
726 
// Opaque pointer supplied by the caller.
// NOTE(review): presumably forwarded as the first argument of the allocator /
// deallocator callbacks, which both take a `void* userdata` — confirm.
731  void* userdata;
732 
// NOTE(review): presumably the number of columns a tab character advances
// when computing source positions — confirm.
737  int tab_stop;
738 
744 
753 } GumboOptions;
754 
757 
// Result object returned (by pointer) from gumbo_parse.
759 typedef struct GumboInternalOutput {
// NOTE(review): presumably the node of type GUMBO_NODE_DOCUMENT at the top of
// the tree — confirm.
764  GumboNode* document;
765 
// NOTE(review): presumably the root (<html>) element node — confirm.
770  GumboNode* root;
771 
// Errors recorded for this output; elements are GumboError entries per the
// inline annotation.
779  GumboVector /* GumboError */ errors;
780 } GumboOutput;
781 
// Parses the text in `buffer` and returns a pointer to a GumboOutput.
// NOTE(review): no length is passed, so `buffer` is presumably required to be
// NUL-terminated; presumably default options are used and the caller must
// release the result through the library's destroy function — confirm.
789 GumboOutput* gumbo_parse(const char* buffer);
790 
796  const GumboOptions* options, const char* buffer, size_t buffer_length);
797 
800  const GumboOptions* options, GumboOutput* output);
801 
802 
803 #ifdef __cplusplus
804 }
805 #endif
806 
807 #endif // GUMBO_GUMBO_H_
Gumbo: gumbo.h Source File
Gumbo  1.0
A C library for parsing HTML.