PageRenderTime 19ms CodeModel.GetById 2ms app.highlight 13ms RepoModel.GetById 1ms app.codeStats 0ms

/thirdparty/breakpad/third_party/protobuf/protobuf/src/google/protobuf/io/tokenizer.h

http://github.com/tomahawk-player/tomahawk
C++ Header | 313 lines | 105 code | 55 blank | 153 comment | 0 complexity | 41daeb774d73f97c5a8c961b67c55cd4 MD5 | raw file
  1// Protocol Buffers - Google's data interchange format
  2// Copyright 2008 Google Inc.  All rights reserved.
  3// http://code.google.com/p/protobuf/
  4//
  5// Redistribution and use in source and binary forms, with or without
  6// modification, are permitted provided that the following conditions are
  7// met:
  8//
  9//     * Redistributions of source code must retain the above copyright
 10// notice, this list of conditions and the following disclaimer.
 11//     * Redistributions in binary form must reproduce the above
 12// copyright notice, this list of conditions and the following disclaimer
 13// in the documentation and/or other materials provided with the
 14// distribution.
 15//     * Neither the name of Google Inc. nor the names of its
 16// contributors may be used to endorse or promote products derived from
 17// this software without specific prior written permission.
 18//
 19// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 20// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 21// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 22// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 23// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 24// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 25// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 26// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 27// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 28// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 29// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 30
 31// Author: kenton@google.com (Kenton Varda)
 32//  Based on original Protocol Buffers design by
 33//  Sanjay Ghemawat, Jeff Dean, and others.
 34//
 35// Class for parsing tokenized text from a ZeroCopyInputStream.
 36
 37#ifndef GOOGLE_PROTOBUF_IO_TOKENIZER_H__
 38#define GOOGLE_PROTOBUF_IO_TOKENIZER_H__
 39
 40#include <string>
 41#include <google/protobuf/stubs/common.h>
 42
 43namespace google {
 44namespace protobuf {
 45namespace io {
 46
 47class ZeroCopyInputStream;     // zero_copy_stream.h
 48
 49// Defined in this file.
 50class ErrorCollector;
 51class Tokenizer;
 52
 53// Abstract interface for an object which collects the errors that occur
 54// during parsing.  A typical implementation might simply print the errors
 55// to stdout.
 56class LIBPROTOBUF_EXPORT ErrorCollector {
 57 public:
 58  inline ErrorCollector() {}
 59  virtual ~ErrorCollector();
 60
 61  // Indicates that there was an error in the input at the given line and
 62  // column numbers.  The numbers are zero-based, so you may want to add
 63  // 1 to each before printing them.
 64  virtual void AddError(int line, int column, const string& message) = 0;
 65
 66  // Indicates that there was a warning in the input at the given line and
 67  // column numbers.  The numbers are zero-based, so you may want to add
 68  // 1 to each before printing them.
 69  virtual void AddWarning(int line, int column, const string& message) { }
 70
 71 private:
 72  GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(ErrorCollector);
 73};
 74
 75// This class converts a stream of raw text into a stream of tokens for
 76// the protocol definition parser to parse.  The tokens recognized are
 77// similar to those that make up the C language; see the TokenType enum for
 78// precise descriptions.  Whitespace and comments are skipped.  By default,
 79// C- and C++-style comments are recognized, but other styles can be used by
 80// calling set_comment_style().
 81class LIBPROTOBUF_EXPORT Tokenizer {
 82 public:
 83  // Construct a Tokenizer that reads and tokenizes text from the given
 84  // input stream and writes errors to the given error_collector.
 85  // The caller keeps ownership of input and error_collector.
 86  Tokenizer(ZeroCopyInputStream* input, ErrorCollector* error_collector);
 87  ~Tokenizer();
 88
 89  enum TokenType {
 90    TYPE_START,       // Next() has not yet been called.
 91    TYPE_END,         // End of input reached.  "text" is empty.
 92
 93    TYPE_IDENTIFIER,  // A sequence of letters, digits, and underscores, not
 94                      // starting with a digit.  It is an error for a number
 95                      // to be followed by an identifier with no space in
 96                      // between.
 97    TYPE_INTEGER,     // A sequence of digits representing an integer.  Normally
 98                      // the digits are decimal, but a prefix of "0x" indicates
 99                      // a hex number and a leading zero indicates octal, just
100                      // like with C numeric literals.  A leading negative sign
101                      // is NOT included in the token; it's up to the parser to
102                      // interpret the unary minus operator on its own.
103    TYPE_FLOAT,       // A floating point literal, with a fractional part and/or
104                      // an exponent.  Always in decimal.  Again, never
105                      // negative.
106    TYPE_STRING,      // A quoted sequence of escaped characters.  Either single
107                      // or double quotes can be used, but they must match.
108                      // A string literal cannot cross a line break.
109    TYPE_SYMBOL,      // Any other printable character, like '!' or '+'.
110                      // Symbols are always a single character, so "!+$%" is
111                      // four tokens.
112  };
113
114  // Structure representing a token read from the token stream.
115  struct Token {
116    TokenType type;
117    string text;       // The exact text of the token as it appeared in
118                       // the input.  e.g. tokens of TYPE_STRING will still
119                       // be escaped and in quotes.
120
121    // "line" and "column" specify the position of the first character of
122    // the token within the input stream.  They are zero-based.
123    int line;
124    int column;
125    int end_column;
126  };
127
128  // Get the current token.  This is updated when Next() is called.  Before
129  // the first call to Next(), current() has type TYPE_START and no contents.
130  const Token& current();
131
132  // Return the previous token -- i.e. what current() returned before the
133  // previous call to Next().
134  const Token& previous();
135
136  // Advance to the next token.  Returns false if the end of the input is
137  // reached.
138  bool Next();
139
140  // Parse helpers ---------------------------------------------------
141
142  // Parses a TYPE_FLOAT token.  This never fails, so long as the text actually
143  // comes from a TYPE_FLOAT token parsed by Tokenizer.  If it doesn't, the
144  // result is undefined (possibly an assert failure).
145  static double ParseFloat(const string& text);
146
147  // Parses a TYPE_STRING token.  This never fails, so long as the text actually
148  // comes from a TYPE_STRING token parsed by Tokenizer.  If it doesn't, the
149  // result is undefined (possibly an assert failure).
150  static void ParseString(const string& text, string* output);
151
152  // Identical to ParseString, but appends to output.
153  static void ParseStringAppend(const string& text, string* output);
154
155  // Parses a TYPE_INTEGER token.  Returns false if the result would be
156  // greater than max_value.  Otherwise, returns true and sets *output to the
157  // result.  If the text is not from a Token of type TYPE_INTEGER originally
158  // parsed by a Tokenizer, the result is undefined (possibly an assert
159  // failure).
160  static bool ParseInteger(const string& text, uint64 max_value,
161                           uint64* output);
162
163  // Options ---------------------------------------------------------
164
165  // Set true to allow floats to be suffixed with the letter 'f'.  Tokens
166  // which would otherwise be integers but which have the 'f' suffix will be
167  // forced to be interpreted as floats.  For all other purposes, the 'f' is
168  // ignored.
169  void set_allow_f_after_float(bool value) { allow_f_after_float_ = value; }
170
171  // Valid values for set_comment_style().
172  enum CommentStyle {
173    // Line comments begin with "//", block comments are delimited by "/*" and
174    // "*/".
175    CPP_COMMENT_STYLE,
176    // Line comments begin with "#".  No way to write block comments.
177    SH_COMMENT_STYLE
178  };
179
180  // Sets the comment style.
181  void set_comment_style(CommentStyle style) { comment_style_ = style; }
182
183  // -----------------------------------------------------------------
184 private:
185  GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(Tokenizer);
186
187  Token current_;           // Returned by current().
188  Token previous_;          // Returned by previous().
189
190  ZeroCopyInputStream* input_;
191  ErrorCollector* error_collector_;
192
193  char current_char_;       // == buffer_[buffer_pos_], updated by NextChar().
194  const char* buffer_;      // Current buffer returned from input_.
195  int buffer_size_;         // Size of buffer_.
196  int buffer_pos_;          // Current position within the buffer.
197  bool read_error_;         // Did we previously encounter a read error?
198
199  // Line and column number of current_char_ within the whole input stream.
200  int line_;
201  int column_;
202
203  // Position in buffer_ where StartToken() was called.  If the token
204  // started in the previous buffer, this is zero, and current_.text already
205  // contains the part of the token from the previous buffer.  If not
206  // currently parsing a token, this is -1.
207  int token_start_;
208
209  // Options.
210  bool allow_f_after_float_;
211  CommentStyle comment_style_;
212
213  // Since we count columns we need to interpret tabs somehow.  We'll take
214  // the standard 8-character definition for lack of any way to do better.
215  static const int kTabWidth = 8;
216
217  // -----------------------------------------------------------------
218  // Helper methods.
219
220  // Consume this character and advance to the next one.
221  void NextChar();
222
223  // Read a new buffer from the input.
224  void Refresh();
225
226  // Called when the current character is the first character of a new
227  // token (not including whitespace or comments).
228  inline void StartToken();
229  // Called when the current character is the first character after the
230  // end of the last token.  After this returns, current_.text will
231  // contain all text consumed since StartToken() was called.
232  inline void EndToken();
233
234  // Convenience method to add an error at the current line and column.
235  void AddError(const string& message) {
236    error_collector_->AddError(line_, column_, message);
237  }
238
239  // -----------------------------------------------------------------
240  // The following four methods are used to consume tokens of specific
241  // types.  They are actually used to consume all characters *after*
242  // the first, since the calling function consumes the first character
243  // in order to decide what kind of token is being read.
244
245  // Read and consume a string, ending when the given delimiter is
246  // consumed.
247  void ConsumeString(char delimiter);
248
249  // Read and consume a number, returning TYPE_FLOAT or TYPE_INTEGER
250  // depending on what was read.  This needs to know if the first
251  // character was a zero in order to correctly recognize hex and octal
252  // numbers.
253  // It also needs to know if the first characted was a . to parse floating
254  // point correctly.
255  TokenType ConsumeNumber(bool started_with_zero, bool started_with_dot);
256
257  // Consume the rest of a line.
258  void ConsumeLineComment();
259  // Consume until "*/".
260  void ConsumeBlockComment();
261
262  // -----------------------------------------------------------------
263  // These helper methods make the parsing code more readable.  The
264  // "character classes" refered to are defined at the top of the .cc file.
265  // Basically it is a C++ class with one method:
266  //   static bool InClass(char c);
267  // The method returns true if c is a member of this "class", like "Letter"
268  // or "Digit".
269
270  // Returns true if the current character is of the given character
271  // class, but does not consume anything.
272  template<typename CharacterClass>
273  inline bool LookingAt();
274
275  // If the current character is in the given class, consume it and return
276  // true.  Otherwise return false.
277  // e.g. TryConsumeOne<Letter>()
278  template<typename CharacterClass>
279  inline bool TryConsumeOne();
280
281  // Like above, but try to consume the specific character indicated.
282  inline bool TryConsume(char c);
283
284  // Consume zero or more of the given character class.
285  template<typename CharacterClass>
286  inline void ConsumeZeroOrMore();
287
288  // Consume one or more of the given character class or log the given
289  // error message.
290  // e.g. ConsumeOneOrMore<Digit>("Expected digits.");
291  template<typename CharacterClass>
292  inline void ConsumeOneOrMore(const char* error);
293};
294
295// inline methods ====================================================
296inline const Tokenizer::Token& Tokenizer::current() {
297  return current_;
298}
299
300inline const Tokenizer::Token& Tokenizer::previous() {
301  return previous_;
302}
303
304inline void Tokenizer::ParseString(const string& text, string* output) {
305  output->clear();
306  ParseStringAppend(text, output);
307}
308
309}  // namespace io
310}  // namespace protobuf
311
312}  // namespace google
313#endif  // GOOGLE_PROTOBUF_IO_TOKENIZER_H__