PageRenderTime 48ms CodeModel.GetById 24ms RepoModel.GetById 0ms app.codeStats 0ms

/src/error.h

https://github.com/igottogetoff/gumbo-parser
C Header | 227 lines | 118 code | 32 blank | 77 comment | 0 complexity | 25d7e54e7188d59a4aebd674d9dd9446 MD5 | raw file
Possible License(s): Apache-2.0
  1. // Copyright 2010 Google Inc. All Rights Reserved.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. //
  15. // Author: jdtang@google.com (Jonathan Tang)
  16. //
  17. // Error types, enums, and handling functions.
  18. #ifndef GUMBO_ERROR_H_
  19. #define GUMBO_ERROR_H_
  20. #ifdef _MSC_VER
  21. #define _CRT_SECURE_NO_WARNINGS
  22. #endif
  23. #include <stdint.h>
  24. #include "gumbo.h"
  25. #include "insertion_mode.h"
  26. #include "string_buffer.h"
  27. #include "token_type.h"
  28. #ifdef __cplusplus
  29. extern "C" {
  30. #endif
  31. struct GumboInternalParser;
  // Discriminator stored in GumboError.type. Enumerators take consecutive
  // values starting at zero, in the order listed. The section labels below
  // simply follow the name prefixes.
  typedef enum {
    // UTF8_*
    GUMBO_ERR_UTF8_INVALID = 0,
    GUMBO_ERR_UTF8_TRUNCATED,
    GUMBO_ERR_UTF8_NULL,

    // NUMERIC_CHAR_REF_* / NAMED_CHAR_REF_*
    GUMBO_ERR_NUMERIC_CHAR_REF_NO_DIGITS,
    GUMBO_ERR_NUMERIC_CHAR_REF_WITHOUT_SEMICOLON,
    GUMBO_ERR_NUMERIC_CHAR_REF_INVALID,
    GUMBO_ERR_NAMED_CHAR_REF_WITHOUT_SEMICOLON,
    GUMBO_ERR_NAMED_CHAR_REF_INVALID,

    // TAG_* / CLOSE_TAG_*
    GUMBO_ERR_TAG_STARTS_WITH_QUESTION,
    GUMBO_ERR_TAG_EOF,
    GUMBO_ERR_TAG_INVALID,
    GUMBO_ERR_CLOSE_TAG_EMPTY,
    GUMBO_ERR_CLOSE_TAG_EOF,
    GUMBO_ERR_CLOSE_TAG_INVALID,

    // SCRIPT_*
    GUMBO_ERR_SCRIPT_EOF,

    // ATTR_* / DUPLICATE_ATTR
    GUMBO_ERR_ATTR_NAME_EOF,
    GUMBO_ERR_ATTR_NAME_INVALID,
    GUMBO_ERR_ATTR_DOUBLE_QUOTE_EOF,
    GUMBO_ERR_ATTR_SINGLE_QUOTE_EOF,
    GUMBO_ERR_ATTR_UNQUOTED_EOF,
    GUMBO_ERR_ATTR_UNQUOTED_RIGHT_BRACKET,
    GUMBO_ERR_ATTR_UNQUOTED_EQUALS,
    GUMBO_ERR_ATTR_AFTER_EOF,
    GUMBO_ERR_ATTR_AFTER_INVALID,
    GUMBO_ERR_DUPLICATE_ATTR,

    // SOLIDUS_*
    GUMBO_ERR_SOLIDUS_EOF,
    GUMBO_ERR_SOLIDUS_INVALID,

    // DASHES_OR_DOCTYPE / COMMENT_*
    GUMBO_ERR_DASHES_OR_DOCTYPE,
    GUMBO_ERR_COMMENT_EOF,
    GUMBO_ERR_COMMENT_INVALID,
    GUMBO_ERR_COMMENT_BANG_AFTER_DOUBLE_DASH,
    GUMBO_ERR_COMMENT_DASH_AFTER_DOUBLE_DASH,
    GUMBO_ERR_COMMENT_SPACE_AFTER_DOUBLE_DASH,
    GUMBO_ERR_COMMENT_END_BANG_EOF,

    // DOCTYPE_*
    GUMBO_ERR_DOCTYPE_EOF,
    GUMBO_ERR_DOCTYPE_INVALID,
    GUMBO_ERR_DOCTYPE_SPACE,
    GUMBO_ERR_DOCTYPE_RIGHT_BRACKET,
    GUMBO_ERR_DOCTYPE_SPACE_OR_RIGHT_BRACKET,
    GUMBO_ERR_DOCTYPE_END,

    // PARSER / UNACKNOWLEDGED_SELF_CLOSING_TAG
    GUMBO_ERR_PARSER,
    GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG,
  } GumboErrorType;
  // Extra detail attached to a duplicated-attribute error.
  typedef struct GumboInternalDuplicateAttrError {
    // Attribute name. This struct owns the string.
    const char* name;

    // 0-based index, within the attributes vector, of the original occurrence.
    unsigned int original_index;

    // 0-based index at which the new occurrence would be.
    unsigned int new_index;
  } GumboDuplicateAttrError;
  // Client-facing summary of the tokenizer state. The state machine's actual
  // states are condensed into a handful of values that should be familiar to
  // anyone who knows HTML, which makes them more useful to users of this
  // library than the internal representation. Values are consecutive from
  // zero in the order listed.
  typedef enum {
    GUMBO_ERR_TOKENIZER_DATA = 0,
    GUMBO_ERR_TOKENIZER_CHAR_REF,
    GUMBO_ERR_TOKENIZER_RCDATA,
    GUMBO_ERR_TOKENIZER_RAWTEXT,
    GUMBO_ERR_TOKENIZER_PLAINTEXT,
    GUMBO_ERR_TOKENIZER_SCRIPT,
    GUMBO_ERR_TOKENIZER_TAG,
    GUMBO_ERR_TOKENIZER_SELF_CLOSING_TAG,
    GUMBO_ERR_TOKENIZER_ATTR_NAME,
    GUMBO_ERR_TOKENIZER_ATTR_VALUE,
    GUMBO_ERR_TOKENIZER_MARKUP_DECLARATION,
    GUMBO_ERR_TOKENIZER_COMMENT,
    GUMBO_ERR_TOKENIZER_DOCTYPE,
    GUMBO_ERR_TOKENIZER_CDATA,
  } GumboTokenizerErrorState;
  106. // Additional data for tokenizer errors.
  107. // This records the current state and codepoint encountered - this is usually
  108. // enough to reconstruct what went wrong and provide a friendly error message.
  109. typedef struct GumboInternalTokenizerError {
  110. // The bad codepoint encountered.
  111. int codepoint;
  112. // The state that the tokenizer was in at the time.
  113. GumboTokenizerErrorState state;
  114. } GumboTokenizerError;
  115. // Additional data for parse errors.
  116. typedef struct GumboInternalParserError {
  117. // The type of input token that resulted in this error.
  118. GumboTokenType input_type;
  119. // The HTML tag of the input token. TAG_UNKNOWN if this was not a tag token.
  120. GumboTag input_tag;
  121. // The insertion mode that the parser was in at the time.
  122. GumboInsertionMode parser_state;
  123. // The tag stack at the point of the error. Note that this is an GumboVector
  124. // of GumboTag's *stored by value* - cast the void* to an GumboTag directly to
  125. // get at the tag.
  126. GumboVector /* GumboTag */ tag_stack;
  127. } GumboParserError;
  128. // The overall error struct representing an error in decoding/tokenizing/parsing
  129. // the HTML. This contains an enumerated type flag, a source position, and then
  130. // a union of fields containing data specific to the error.
  131. typedef struct GumboInternalError {
  132. // The type of error.
  133. GumboErrorType type;
  134. // The position within the source file where the error occurred.
  135. GumboSourcePosition position;
  136. // A pointer to the byte within the original source file text where the error
  137. // occurred (note that this is not the same as position.offset, as that gives
  138. // character-based instead of byte-based offsets).
  139. const char* original_text;
  140. // Type-specific error information.
  141. union {
  142. // The code point we encountered, for:
  143. // * GUMBO_ERR_UTF8_INVALID
  144. // * GUMBO_ERR_UTF8_TRUNCATED
  145. // * GUMBO_ERR_NUMERIC_CHAR_REF_WITHOUT_SEMICOLON
  146. // * GUMBO_ERR_NUMERIC_CHAR_REF_INVALID
  147. uint64_t codepoint;
  148. // Tokenizer errors.
  149. GumboTokenizerError tokenizer;
  150. // Short textual data, for:
  151. // * GUMBO_ERR_NAMED_CHAR_REF_WITHOUT_SEMICOLON
  152. // * GUMBO_ERR_NAMED_CHAR_REF_INVALID
  153. GumboStringPiece text;
  154. // Duplicate attribute data, for GUMBO_ERR_DUPLICATE_ATTR.
  155. GumboDuplicateAttrError duplicate_attr;
  156. // Parser state, for GUMBO_ERR_PARSER and
  157. // GUMBO_ERR_UNACKNOWLEDGE_SELF_CLOSING_TAG.
  158. struct GumboInternalParserError parser;
  159. } v;
  160. } GumboError;
  161. // Adds a new error to the parser's error list, and returns a pointer to it so
  162. // that clients can fill out the rest of its fields. May return NULL if we're
  163. // already over the max_errors field specified in GumboOptions.
  164. GumboError* gumbo_add_error(struct GumboInternalParser* parser);
  // Initializes the errors vector in the parser.
  //
  // The argument is the parser itself, not an error list, so it is named
  // 'parser' to match the other declarations in this header.
  void gumbo_init_errors(struct GumboInternalParser* parser);
  // Frees all the errors in the 'errors_' field of the parser.
  //
  // The argument is the parser itself, not an error list, so it is named
  // 'parser' to match the other declarations in this header.
  void gumbo_destroy_errors(struct GumboInternalParser* parser);
  169. // Frees the memory used for a single GumboError.
  170. void gumbo_error_destroy(struct GumboInternalParser* parser, GumboError* error);
  171. // Prints an error to a string. This fills an empty GumboStringBuffer with a
  172. // freshly-allocated buffer containing the error message text. The caller is
  173. // responsible for deleting the buffer. (Note that the buffer is allocated with
  174. // the allocator specified in the GumboParser config and hence should be freed
  175. // by gumbo_parser_deallocate().)
  176. void gumbo_error_to_string(
  177. struct GumboInternalParser* parser, const GumboError* error,
  178. GumboStringBuffer* output);
  179. // Prints a caret diagnostic to a string. This fills an empty GumboStringBuffer
  180. // with a freshly-allocated buffer containing the error message text. The
  181. // caller is responsible for deleting the buffer. (Note that the buffer is
  182. // allocated with the allocator specified in the GumboParser config and hence
  183. // should be freed by gumbo_parser_deallocate().)
  184. void gumbo_caret_diagnostic_to_string(
  185. struct GumboInternalParser* parser, const GumboError* error,
  186. const char* source_text, GumboStringBuffer* output);
  187. // Like gumbo_caret_diagnostic_to_string, but prints the text to stdout instead
  188. // of writing to a string.
  189. void gumbo_print_caret_diagnostic(
  190. struct GumboInternalParser* parser, const GumboError* error,
  191. const char* source_text);
  192. #ifdef __cplusplus
  193. }
  194. #endif
  195. #endif // GUMBO_ERROR_H_