PageRenderTime 50ms CodeModel.GetById 22ms RepoModel.GetById 0ms app.codeStats 0ms

/src/error.h

https://github.com/lvfeng1130/gumbo-parser
C Header | 225 lines | 115 code | 33 blank | 77 comment | 0 complexity | c796a3ae1a0ac7f339012439b1baf146 MD5 | raw file
Possible License(s): Apache-2.0
  // Copyright 2010 Google Inc. All Rights Reserved.
  //
  // Licensed under the Apache License, Version 2.0 (the "License");
  // you may not use this file except in compliance with the License.
  // You may obtain a copy of the License at
  //
  // http://www.apache.org/licenses/LICENSE-2.0
  //
  // Unless required by applicable law or agreed to in writing, software
  // distributed under the License is distributed on an "AS IS" BASIS,
  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  // See the License for the specific language governing permissions and
  // limitations under the License.
  //
  // Author: jdtang@google.com (Jonathan Tang)
  //
  // Error types, enums, and handling functions.
  18. #ifndef GUMBO_ERROR_H_
  19. #define GUMBO_ERROR_H_
  20. #include <stdint.h>
  21. #include "gumbo.h"
  22. #include "insertion_mode.h"
  23. #include "string_buffer.h"
  24. #include "token_type.h"
  25. #ifdef __cplusplus
  26. extern "C" {
  27. #endif
  28. struct _GumboParser;
  // Identifies the kind of error stored in a GumboError (see its `type`
  // field). Enumerators use the default consecutive values, starting at 0.
  //
  // According to the comments on the GumboError union, the type-specific
  // payload is:
  //   * v.codepoint      - GUMBO_ERR_UTF8_INVALID, GUMBO_ERR_UTF8_TRUNCATED,
  //                        GUMBO_ERR_NUMERIC_CHAR_REF_WITHOUT_SEMICOLON,
  //                        GUMBO_ERR_NUMERIC_CHAR_REF_INVALID
  //   * v.text           - GUMBO_ERR_NAMED_CHAR_REF_WITHOUT_SEMICOLON,
  //                        GUMBO_ERR_NAMED_CHAR_REF_INVALID
  //   * v.duplicate_attr - GUMBO_ERR_DUPLICATE_ATTR
  //   * v.parser         - GUMBO_ERR_PARSER,
  //                        GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG
  typedef enum {
    GUMBO_ERR_UTF8_INVALID,
    GUMBO_ERR_UTF8_TRUNCATED,
    GUMBO_ERR_UTF8_NULL,
    GUMBO_ERR_NUMERIC_CHAR_REF_NO_DIGITS,
    GUMBO_ERR_NUMERIC_CHAR_REF_WITHOUT_SEMICOLON,
    GUMBO_ERR_NUMERIC_CHAR_REF_INVALID,
    GUMBO_ERR_NAMED_CHAR_REF_WITHOUT_SEMICOLON,
    GUMBO_ERR_NAMED_CHAR_REF_INVALID,
    GUMBO_ERR_TAG_STARTS_WITH_QUESTION,
    GUMBO_ERR_TAG_EOF,
    GUMBO_ERR_TAG_INVALID,
    GUMBO_ERR_CLOSE_TAG_EMPTY,
    GUMBO_ERR_CLOSE_TAG_EOF,
    GUMBO_ERR_CLOSE_TAG_INVALID,
    GUMBO_ERR_SCRIPT_EOF,
    GUMBO_ERR_ATTR_NAME_EOF,
    GUMBO_ERR_ATTR_NAME_INVALID,
    GUMBO_ERR_ATTR_DOUBLE_QUOTE_EOF,
    GUMBO_ERR_ATTR_SINGLE_QUOTE_EOF,
    GUMBO_ERR_ATTR_UNQUOTED_EOF,
    GUMBO_ERR_ATTR_UNQUOTED_RIGHT_BRACKET,
    GUMBO_ERR_ATTR_UNQUOTED_EQUALS,
    GUMBO_ERR_ATTR_AFTER_EOF,
    GUMBO_ERR_ATTR_AFTER_INVALID,
    GUMBO_ERR_DUPLICATE_ATTR,
    GUMBO_ERR_SOLIDUS_EOF,
    GUMBO_ERR_SOLIDUS_INVALID,
    GUMBO_ERR_DASHES_OR_DOCTYPE,
    GUMBO_ERR_COMMENT_EOF,
    GUMBO_ERR_COMMENT_INVALID,
    GUMBO_ERR_COMMENT_BANG_AFTER_DOUBLE_DASH,
    GUMBO_ERR_COMMENT_DASH_AFTER_DOUBLE_DASH,
    GUMBO_ERR_COMMENT_SPACE_AFTER_DOUBLE_DASH,
    GUMBO_ERR_COMMENT_END_BANG_EOF,
    GUMBO_ERR_DOCTYPE_EOF,
    GUMBO_ERR_DOCTYPE_INVALID,
    GUMBO_ERR_DOCTYPE_SPACE,
    GUMBO_ERR_DOCTYPE_RIGHT_BRACKET,
    GUMBO_ERR_DOCTYPE_SPACE_OR_RIGHT_BRACKET,
    GUMBO_ERR_DOCTYPE_END,
    GUMBO_ERR_PARSER,
    GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG,
  } GumboErrorType;
  // Additional data for duplicated attributes (the payload stored in
  // GumboError for GUMBO_ERR_DUPLICATE_ATTR).
  typedef struct _GumboDuplicateAttrError {
    // The name of the attribute. Owned by this struct.
    const char* name;

    // The (0-based) index within the attributes vector of the original
    // occurrence.
    unsigned int original_index;

    // The (0-based) index where the new occurrence would be.
    unsigned int new_index;
  } GumboDuplicateAttrError;
  // A simplified representation of the tokenizer state, designed to be more
  // useful to clients of this library than the internal representation. This
  // condenses the actual states used in the tokenizer state machine into a few
  // values that will be familiar to users of HTML.
  //
  // Enumerators use the default consecutive values, starting at 0.
  typedef enum {
    GUMBO_ERR_TOKENIZER_DATA,
    GUMBO_ERR_TOKENIZER_CHAR_REF,
    GUMBO_ERR_TOKENIZER_RCDATA,
    GUMBO_ERR_TOKENIZER_RAWTEXT,
    GUMBO_ERR_TOKENIZER_PLAINTEXT,
    GUMBO_ERR_TOKENIZER_SCRIPT,
    GUMBO_ERR_TOKENIZER_TAG,
    GUMBO_ERR_TOKENIZER_SELF_CLOSING_TAG,
    GUMBO_ERR_TOKENIZER_ATTR_NAME,
    GUMBO_ERR_TOKENIZER_ATTR_VALUE,
    GUMBO_ERR_TOKENIZER_MARKUP_DECLARATION,
    GUMBO_ERR_TOKENIZER_COMMENT,
    GUMBO_ERR_TOKENIZER_DOCTYPE,
    GUMBO_ERR_TOKENIZER_CDATA,
  } GumboTokenizerErrorState;
  103. // Additional data for tokenizer errors.
  104. // This records the current state and codepoint encountered - this is usually
  105. // enough to reconstruct what went wrong and provide a friendly error message.
  106. typedef struct _GumboTokenizerError {
  107. // The bad codepoint encountered.
  108. int codepoint;
  109. // The state that the tokenizer was in at the time.
  110. GumboTokenizerErrorState state;
  111. } GumboTokenizerError;
  112. // Additional data for parse errors.
  113. typedef struct _GumboParserError {
  114. // The type of input token that resulted in this error.
  115. GumboTokenType input_type;
  116. // The HTML tag of the input token. TAG_UNKNOWN if this was not a tag token.
  117. GumboTag input_tag;
  118. // The insertion mode that the parser was in at the time.
  119. GumboInsertionMode parser_state;
  120. // The tag stack at the point of the error. Note that this is an GumboVector
  121. // of GumboTag's *stored by value* - cast the void* to an GumboTag directly to
  122. // get at the tag.
  123. GumboVector /* GumboTag */ tag_stack;
  124. } GumboParserError;
  125. // The overall error struct representing an error in decoding/tokenizing/parsing
  126. // the HTML. This contains an enumerated type flag, a source position, and then
  127. // a union of fields containing data specific to the error.
  128. typedef struct _GumboError {
  129. // The type of error.
  130. GumboErrorType type;
  131. // The position within the source file where the error occurred.
  132. GumboSourcePosition position;
  133. // A pointer to the byte within the original source file text where the error
  134. // occurred (note that this is not the same as position.offset, as that gives
  135. // character-based instead of byte-based offsets).
  136. const char* original_text;
  137. // Type-specific error information.
  138. union {
  139. // The code point we encountered, for:
  140. // * GUMBO_ERR_UTF8_INVALID
  141. // * GUMBO_ERR_UTF8_TRUNCATED
  142. // * GUMBO_ERR_NUMERIC_CHAR_REF_WITHOUT_SEMICOLON
  143. // * GUMBO_ERR_NUMERIC_CHAR_REF_INVALID
  144. uint64_t codepoint;
  145. // Tokenizer errors.
  146. GumboTokenizerError tokenizer;
  147. // Short textual data, for:
  148. // * GUMBO_ERR_NAMED_CHAR_REF_WITHOUT_SEMICOLON
  149. // * GUMBO_ERR_NAMED_CHAR_REF_INVALID
  150. GumboStringPiece text;
  151. // Duplicate attribute data, for GUMBO_ERR_DUPLICATE_ATTR.
  152. GumboDuplicateAttrError duplicate_attr;
  153. // Parser state, for GUMBO_ERR_PARSER and
  154. // GUMBO_ERR_UNACKNOWLEDGE_SELF_CLOSING_TAG.
  155. struct _GumboParserError parser;
  156. } v;
  157. } GumboError;
  158. // Adds a new error to the parser's error list, and returns a pointer to it so
  159. // that clients can fill out the rest of its fields. May return NULL if we're
  160. // already over the max_errors field specified in GumboOptions.
  161. GumboError* gumbo_add_error(struct _GumboParser* parser);
  // Initializes the errors vector in the parser.
  // The argument is the parser that holds the errors vector, not the vector
  // itself.
  void gumbo_init_errors(struct _GumboParser* parser);
  // Frees all the errors in the 'errors_' field of the parser.
  // The argument is the parser that holds the errors, not the error list
  // itself.
  void gumbo_destroy_errors(struct _GumboParser* parser);
  166. // Frees the memory used for a single GumboError.
  167. void gumbo_error_destroy(struct _GumboParser* parser, GumboError* error);
  168. // Prints an error to a string. This fills an empty GumboStringBuffer with a
  169. // freshly-allocated buffer containing the error message text. The caller is
  170. // responsible for deleting the buffer. (Note that the buffer is allocated with
  171. // the allocator specified in the GumboParser config and hence should be freed
  172. // by gumbo_parser_deallocate().)
  173. void gumbo_error_to_string(
  174. struct _GumboParser* parser, const GumboError* error,
  175. GumboStringBuffer* output);
  176. // Prints a caret diagnostic to a string. This fills an empty GumboStringBuffer
  177. // with a freshly-allocated buffer containing the error message text. The
  178. // caller is responsible for deleting the buffer. (Note that the buffer is
  179. // allocated with the allocator specified in the GumboParser config and hence
  180. // should be freed by gumbo_parser_deallocate().)
  181. void gumbo_caret_diagnostic_to_string(
  182. struct _GumboParser* parser, const GumboError* error,
  183. const char* source_text, GumboStringBuffer* output);
  184. // Like gumbo_caret_diagnostic_to_string, but prints the text to stdout instead
  185. // of writing to a string.
  186. void gumbo_print_caret_diagnostic(
  187. struct _GumboParser* parser, const GumboError* error,
  188. const char* source_text);
  189. #ifdef __cplusplus
  190. }
  191. #endif
  192. #endif // GUMBO_ERROR_H_