/thirdparty/breakpad/third_party/protobuf/protobuf/src/google/protobuf/io/tokenizer_unittest.cc
http://github.com/tomahawk-player/tomahawk

// Protocol Buffers - Google's data interchange format
// Copyright 2008 Google Inc.  All rights reserved.
// http://code.google.com/p/protobuf/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// Author: kenton@google.com (Kenton Varda)
//  Based on original Protocol Buffers design by
//  Sanjay Ghemawat, Jeff Dean, and others.

#include <vector>
#include <math.h>
#include <limits.h>

#include <google/protobuf/io/tokenizer.h>
#include <google/protobuf/io/zero_copy_stream_impl.h>

#include <google/protobuf/stubs/common.h>
#include <google/protobuf/stubs/strutil.h>
#include <google/protobuf/stubs/substitute.h>
#include <google/protobuf/testing/googletest.h>
#include <gtest/gtest.h>

namespace google {
namespace protobuf {
namespace io {
namespace {

// ===================================================================
// Data-Driven Test Infrastructure

// TODO(kenton):  This is copied from coded_stream_unittest.  This is
//   temporary until these features are integrated into gTest itself.

// TEST_1D and TEST_2D are macros I'd eventually like to see added to
// gTest.  These macros can be used to declare tests which should be
// run multiple times, once for each item in some input array.  TEST_1D
// tests all cases in a single input array.  TEST_2D tests all
// combinations of cases from two arrays.  The arrays must be statically
// defined such that the GOOGLE_ARRAYSIZE() macro works on them.  Example:
//
// int kCases[] = {1, 2, 3, 4};
// TEST_1D(MyFixture, MyTest, kCases) {
//   EXPECT_GT(kCases_case, 0);
// }
//
// This test iterates through the numbers 1, 2, 3, and 4 and tests that
// they are all greater than zero.  In case of failure, the exact case
// which failed will be printed.  The case type must be printable using
// ostream::operator<<.

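// For illustration, a hypothetical TEST_2D usage analogous to the TEST_1D
// example above; the fixture and array names here are made up for this
// sketch.  Inside the body, the current element of each array is available
// as <array>_case, and both element types must be printable via
// ostream::operator<<:
//
// const char* kWords[] = {"foo", "bar"};
// int kCounts[] = {1, 2, 3};
// TEST_2D(MyFixture, MyPairTest, kWords, kCounts) {
//   EXPECT_FALSE(string(kWords_case).empty());
//   EXPECT_GT(kCounts_case, 0);
// }
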
#define TEST_1D(FIXTURE, NAME, CASES)                                      \
  class FIXTURE##_##NAME##_DD : public FIXTURE {                           \
   protected:                                                              \
    template <typename CaseType>                                           \
    void DoSingleCase(const CaseType& CASES##_case);                       \
  };                                                                       \
                                                                           \
  TEST_F(FIXTURE##_##NAME##_DD, NAME) {                                    \
    for (int i = 0; i < GOOGLE_ARRAYSIZE(CASES); i++) {                    \
      SCOPED_TRACE(testing::Message()                                      \
        << #CASES " case #" << i << ": " << CASES[i]);                     \
      DoSingleCase(CASES[i]);                                              \
    }                                                                      \
  }                                                                        \
                                                                           \
  template <typename CaseType>                                             \
  void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType& CASES##_case)

#define TEST_2D(FIXTURE, NAME, CASES1, CASES2)                             \
  class FIXTURE##_##NAME##_DD : public FIXTURE {                           \
   protected:                                                              \
    template <typename CaseType1, typename CaseType2>                      \
    void DoSingleCase(const CaseType1& CASES1##_case,                      \
                      const CaseType2& CASES2##_case);                     \
  };                                                                       \
                                                                           \
  TEST_F(FIXTURE##_##NAME##_DD, NAME) {                                    \
    for (int i = 0; i < GOOGLE_ARRAYSIZE(CASES1); i++) {                   \
      for (int j = 0; j < GOOGLE_ARRAYSIZE(CASES2); j++) {                 \
        SCOPED_TRACE(testing::Message()                                    \
          << #CASES1 " case #" << i << ": " << CASES1[i] << ", "           \
          << #CASES2 " case #" << j << ": " << CASES2[j]);                 \
        DoSingleCase(CASES1[i], CASES2[j]);                                \
      }                                                                    \
    }                                                                      \
  }                                                                        \
                                                                           \
  template <typename CaseType1, typename CaseType2>                        \
  void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType1& CASES1##_case, \
                                           const CaseType2& CASES2##_case)

// -------------------------------------------------------------------

// An input stream that is basically like an ArrayInputStream but sometimes
// returns empty buffers, just to throw us off.
class TestInputStream : public ZeroCopyInputStream {
 public:
  TestInputStream(const void* data, int size, int block_size)
    : array_stream_(data, size, block_size), counter_(0) {}
  ~TestInputStream() {}

  // implements ZeroCopyInputStream ----------------------------------
  bool Next(const void** data, int* size) {
    // Return an empty buffer (but still report success) whenever the call
    // count is divisible by 3 or 5, including the very first call, to make
    // sure the tokenizer copes with zero-length reads.
    if (counter_ % 3 == 0 || counter_ % 5 == 0) {
      *data = NULL;
      *size = 0;
      ++counter_;
      return true;
    } else {
      ++counter_;
      return array_stream_.Next(data, size);
    }
  }

  void BackUp(int count)  { return array_stream_.BackUp(count); }
  bool Skip(int count)    { return array_stream_.Skip(count);   }
  int64 ByteCount() const { return array_stream_.ByteCount();   }

 private:
  ArrayInputStream array_stream_;
  int counter_;
};

// -------------------------------------------------------------------

// An error collector which simply concatenates all its errors into a big
// block of text which can be checked.
class TestErrorCollector : public ErrorCollector {
 public:
  TestErrorCollector() {}
  ~TestErrorCollector() {}

  string text_;

  // implements ErrorCollector ---------------------------------------
  void AddError(int line, int column, const string& message) {
    strings::SubstituteAndAppend(&text_, "$0:$1: $2\n",
                                 line, column, message);
  }
};

// -------------------------------------------------------------------

// We test each operation over a variety of block sizes to ensure that
// we test cases where reads cross buffer boundaries as well as cases
// where they don't.  This is sort of a brute-force approach to this,
// but it's easy to write and easy to understand.
const int kBlockSizes[] = {1, 2, 3, 5, 7, 13, 32, 1024};

class TokenizerTest : public testing::Test {
 protected:
  // For easy testing.
  uint64 ParseInteger(const string& text) {
    uint64 result;
    EXPECT_TRUE(Tokenizer::ParseInteger(text, kuint64max, &result));
    return result;
  }
};

// ===================================================================

// These tests cause gcc 3.3.5 (and earlier?) to give the cryptic error:
//   "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
#if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)

// In each test case, the entire input text should parse as a single token
// of the given type.
struct SimpleTokenCase {
  string input;
  Tokenizer::TokenType type;
};

inline ostream& operator<<(ostream& out,
                           const SimpleTokenCase& test_case) {
  return out << CEscape(test_case.input);
}

SimpleTokenCase kSimpleTokenCases[] = {
  // Test identifiers.
  { "hello",       Tokenizer::TYPE_IDENTIFIER },

  // Test integers.
  { "123",         Tokenizer::TYPE_INTEGER },
  { "0xab6",       Tokenizer::TYPE_INTEGER },
  { "0XAB6",       Tokenizer::TYPE_INTEGER },
  { "0X1234567",   Tokenizer::TYPE_INTEGER },
  { "0x89abcdef",  Tokenizer::TYPE_INTEGER },
  { "0x89ABCDEF",  Tokenizer::TYPE_INTEGER },
  { "01234567",    Tokenizer::TYPE_INTEGER },

  // Test floats.
  { "123.45",      Tokenizer::TYPE_FLOAT },
  { "1.",          Tokenizer::TYPE_FLOAT },
  { "1e3",         Tokenizer::TYPE_FLOAT },
  { "1E3",         Tokenizer::TYPE_FLOAT },
  { "1e-3",        Tokenizer::TYPE_FLOAT },
  { "1e+3",        Tokenizer::TYPE_FLOAT },
  { "1.e3",        Tokenizer::TYPE_FLOAT },
  { "1.2e3",       Tokenizer::TYPE_FLOAT },
  { ".1",          Tokenizer::TYPE_FLOAT },
  { ".1e3",        Tokenizer::TYPE_FLOAT },
  { ".1e-3",       Tokenizer::TYPE_FLOAT },
  { ".1e+3",       Tokenizer::TYPE_FLOAT },

  // Test strings.
  { "'hello'",     Tokenizer::TYPE_STRING },
  { "\"foo\"",     Tokenizer::TYPE_STRING },
  { "'a\"b'",      Tokenizer::TYPE_STRING },
  { "\"a'b\"",     Tokenizer::TYPE_STRING },
  { "'a\\'b'",     Tokenizer::TYPE_STRING },
  { "\"a\\\"b\"",  Tokenizer::TYPE_STRING },
  { "'\\xf'",      Tokenizer::TYPE_STRING },
  { "'\\0'",       Tokenizer::TYPE_STRING },

  // Test symbols.
  { "+",           Tokenizer::TYPE_SYMBOL },
  { ".",           Tokenizer::TYPE_SYMBOL },
};

TEST_2D(TokenizerTest, SimpleTokens, kSimpleTokenCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kSimpleTokenCases_case.input.data(),
                        kSimpleTokenCases_case.input.size(),
                        kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Before Next() is called, the initial token should always be TYPE_START.
  EXPECT_EQ(Tokenizer::TYPE_START, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);
  EXPECT_EQ(0, tokenizer.current().end_column);

  // Parse the token.
  ASSERT_TRUE(tokenizer.Next());

  // Check that it has the right type.
  EXPECT_EQ(kSimpleTokenCases_case.type, tokenizer.current().type);
  // Check that it contains the complete input text.
  EXPECT_EQ(kSimpleTokenCases_case.input, tokenizer.current().text);
  // Check that it is located at the beginning of the input.
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);
  EXPECT_EQ(kSimpleTokenCases_case.input.size(),
            tokenizer.current().end_column);

  // There should be no more input.
  EXPECT_FALSE(tokenizer.Next());

  // After Next() returns false, the token should have type TYPE_END.
  EXPECT_EQ(Tokenizer::TYPE_END, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(kSimpleTokenCases_case.input.size(), tokenizer.current().column);
  EXPECT_EQ(kSimpleTokenCases_case.input.size(),
            tokenizer.current().end_column);

  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

TEST_1D(TokenizerTest, FloatSuffix, kBlockSizes) {
  // Test the "allow_f_after_float" option.

  // Set up the tokenizer.
  const char* text = "1f 2.5f 6e3f 7F";
  TestInputStream input(text, strlen(text), kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);
  tokenizer.set_allow_f_after_float(true);

  // Advance through tokens and check that they are parsed as expected.
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "1f");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "2.5f");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "6e3f");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "7F");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);

  // There should be no more input.
  EXPECT_FALSE(tokenizer.Next());
  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

#endif

// -------------------------------------------------------------------

// In each case, the input is parsed to produce a list of tokens.  The
// last token in "output" must have type TYPE_END.
struct MultiTokenCase {
  string input;
  Tokenizer::Token output[10];  // The compiler wants a constant array
                                // size for initialization to work.  There
                                // is no reason this can't be increased if
                                // needed.
};

inline ostream& operator<<(ostream& out,
                           const MultiTokenCase& test_case) {
  return out << CEscape(test_case.input);
}

MultiTokenCase kMultiTokenCases[] = {
  // Test empty input.
  { "", {
    { Tokenizer::TYPE_END       , ""     , 0,  0 },
  }},

  // Test all token types at the same time.
  { "foo 1 1.2 + 'bar'", {
    { Tokenizer::TYPE_IDENTIFIER, "foo"  , 0,  0,  3 },
    { Tokenizer::TYPE_INTEGER   , "1"    , 0,  4,  5 },
    { Tokenizer::TYPE_FLOAT     , "1.2"  , 0,  6,  9 },
    { Tokenizer::TYPE_SYMBOL    , "+"    , 0, 10, 11 },
    { Tokenizer::TYPE_STRING    , "'bar'", 0, 12, 17 },
    { Tokenizer::TYPE_END       , ""     , 0, 17, 17 },
  }},

  // Test that consecutive symbols are parsed as separate tokens.
  { "!@+%", {
    { Tokenizer::TYPE_SYMBOL    , "!"    , 0, 0, 1 },
    { Tokenizer::TYPE_SYMBOL    , "@"    , 0, 1, 2 },
    { Tokenizer::TYPE_SYMBOL    , "+"    , 0, 2, 3 },
    { Tokenizer::TYPE_SYMBOL    , "%"    , 0, 3, 4 },
    { Tokenizer::TYPE_END       , ""     , 0, 4, 4 },
  }},

  // Test that newlines affect line numbers correctly.
  { "foo bar\nrab oof", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0, 3 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0,  4, 7 },
    { Tokenizer::TYPE_IDENTIFIER, "rab", 1,  0, 3 },
    { Tokenizer::TYPE_IDENTIFIER, "oof", 1,  4, 7 },
    { Tokenizer::TYPE_END       , ""   , 1,  7, 7 },
  }},

  // Test that tabs affect column numbers correctly.
  { "foo\tbar  \tbaz", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0,  3 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0,  8, 11 },
    { Tokenizer::TYPE_IDENTIFIER, "baz", 0, 16, 19 },
    { Tokenizer::TYPE_END       , ""   , 0, 19, 19 },
  }},

  // Test that tabs in string literals affect column numbers correctly.
  { "\"foo\tbar\" baz", {
    { Tokenizer::TYPE_STRING    , "\"foo\tbar\"", 0,  0, 12 },
    { Tokenizer::TYPE_IDENTIFIER, "baz"         , 0, 13, 16 },
    { Tokenizer::TYPE_END       , ""            , 0, 16, 16 },
  }},

  // Test that line comments are ignored.
  { "foo // This is a comment\n"
    "bar // This is another comment", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0,  3 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 1,  0,  3 },
    { Tokenizer::TYPE_END       , ""   , 1, 30, 30 },
  }},

  // Test that block comments are ignored.
  { "foo /* This is a block comment */ bar", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0,  3 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0, 34, 37 },
    { Tokenizer::TYPE_END       , ""   , 0, 37, 37 },
  }},

  // Test that sh-style comments are not ignored by default.
  { "foo # bar\n"
    "baz", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3 },
    { Tokenizer::TYPE_SYMBOL    , "#"  , 0, 4, 5 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0, 6, 9 },
    { Tokenizer::TYPE_IDENTIFIER, "baz", 1, 0, 3 },
    { Tokenizer::TYPE_END       , ""   , 1, 3, 3 },
  }},

  // Bytes with the high-order bit set should not be seen as control characters.
  { "\300", {
    { Tokenizer::TYPE_SYMBOL, "\300", 0, 0, 1 },
    { Tokenizer::TYPE_END   , ""    , 0, 1, 1 },
  }},

  // Test all whitespace chars.
  { "foo\n\t\r\v\fbar", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0,  3 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 1, 11, 14 },
    { Tokenizer::TYPE_END       , ""   , 1, 14, 14 },
  }},
};

TEST_2D(TokenizerTest, MultipleTokens, kMultiTokenCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kMultiTokenCases_case.input.data(),
                        kMultiTokenCases_case.input.size(),
                        kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Before Next() is called, the initial token should always be TYPE_START.
  EXPECT_EQ(Tokenizer::TYPE_START, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);
  EXPECT_EQ(0, tokenizer.current().end_column);

  // Loop through all expected tokens.
  int i = 0;
  Tokenizer::Token token;
  do {
    token = kMultiTokenCases_case.output[i++];

    SCOPED_TRACE(testing::Message() << "Token #" << i << ": " << token.text);

    Tokenizer::Token previous = tokenizer.current();

    // Next() should only return false when it hits the end token.
    if (token.type != Tokenizer::TYPE_END) {
      ASSERT_TRUE(tokenizer.Next());
    } else {
      ASSERT_FALSE(tokenizer.Next());
    }

    // Check that the previous token is set correctly.
    EXPECT_EQ(previous.type, tokenizer.previous().type);
    EXPECT_EQ(previous.text, tokenizer.previous().text);
    EXPECT_EQ(previous.line, tokenizer.previous().line);
    EXPECT_EQ(previous.column, tokenizer.previous().column);
    EXPECT_EQ(previous.end_column, tokenizer.previous().end_column);

    // Check that the token matches the expected one.
    EXPECT_EQ(token.type, tokenizer.current().type);
    EXPECT_EQ(token.text, tokenizer.current().text);
    EXPECT_EQ(token.line, tokenizer.current().line);
    EXPECT_EQ(token.column, tokenizer.current().column);
    EXPECT_EQ(token.end_column, tokenizer.current().end_column);

  } while (token.type != Tokenizer::TYPE_END);

  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

// This test causes gcc 3.3.5 (and earlier?) to give the cryptic error:
//   "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
#if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)

TEST_1D(TokenizerTest, ShCommentStyle, kBlockSizes) {
  // Test the "comment_style" option.

  const char* text = "foo # bar\n"
                     "baz // qux\n"
                     "corge /* grault */\n"
                     "garply";
  const char* const kTokens[] = {"foo",  // "# bar" is ignored
                                 "baz", "/", "/", "qux",
                                 "corge", "/", "*", "grault", "*", "/",
                                 "garply"};

  // Set up the tokenizer.
  TestInputStream input(text, strlen(text), kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);
  tokenizer.set_comment_style(Tokenizer::SH_COMMENT_STYLE);

  // Advance through tokens and check that they are parsed as expected.
  for (int i = 0; i < GOOGLE_ARRAYSIZE(kTokens); i++) {
    EXPECT_TRUE(tokenizer.Next());
    EXPECT_EQ(tokenizer.current().text, kTokens[i]);
  }

  // There should be no more input.
  EXPECT_FALSE(tokenizer.Next());
  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

#endif

// -------------------------------------------------------------------

// Test parse helpers.  It's not really worth setting up a full data-driven
// test here.
TEST_F(TokenizerTest, ParseInteger) {
  EXPECT_EQ(0, ParseInteger("0"));
  EXPECT_EQ(123, ParseInteger("123"));
  EXPECT_EQ(0xabcdef12u, ParseInteger("0xabcdef12"));
  EXPECT_EQ(0xabcdef12u, ParseInteger("0xABCDEF12"));
  EXPECT_EQ(kuint64max, ParseInteger("0xFFFFFFFFFFFFFFFF"));
  EXPECT_EQ(01234567, ParseInteger("01234567"));
  EXPECT_EQ(0X123, ParseInteger("0X123"));

  // Test invalid integers that may still be tokenized as integers.
  EXPECT_EQ(0, ParseInteger("0x"));

  uint64 i;
#ifdef GTEST_HAS_DEATH_TEST  // death tests do not work on Windows yet
  // Test invalid integers that will never be tokenized as integers.
  EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("zxy", kuint64max, &i),
    "passed text that could not have been tokenized as an integer");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("1.2", kuint64max, &i),
    "passed text that could not have been tokenized as an integer");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("08", kuint64max, &i),
    "passed text that could not have been tokenized as an integer");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("0xg", kuint64max, &i),
    "passed text that could not have been tokenized as an integer");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("-1", kuint64max, &i),
    "passed text that could not have been tokenized as an integer");
#endif  // GTEST_HAS_DEATH_TEST

  // Test overflows.
  EXPECT_TRUE (Tokenizer::ParseInteger("0", 0, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("1", 0, &i));
  EXPECT_TRUE (Tokenizer::ParseInteger("1", 1, &i));
  EXPECT_TRUE (Tokenizer::ParseInteger("12345", 12345, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("12346", 12345, &i));
  EXPECT_TRUE (Tokenizer::ParseInteger("0xFFFFFFFFFFFFFFFF" , kuint64max, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("0x10000000000000000", kuint64max, &i));
}

TEST_F(TokenizerTest, ParseFloat) {
  EXPECT_DOUBLE_EQ(1    , Tokenizer::ParseFloat("1."));
  EXPECT_DOUBLE_EQ(1e3  , Tokenizer::ParseFloat("1e3"));
  EXPECT_DOUBLE_EQ(1e3  , Tokenizer::ParseFloat("1E3"));
  EXPECT_DOUBLE_EQ(1.5e3, Tokenizer::ParseFloat("1.5e3"));
  EXPECT_DOUBLE_EQ(.1   , Tokenizer::ParseFloat(".1"));
  EXPECT_DOUBLE_EQ(.25  , Tokenizer::ParseFloat(".25"));
  EXPECT_DOUBLE_EQ(.1e3 , Tokenizer::ParseFloat(".1e3"));
  EXPECT_DOUBLE_EQ(.25e3, Tokenizer::ParseFloat(".25e3"));
  EXPECT_DOUBLE_EQ(.1e+3, Tokenizer::ParseFloat(".1e+3"));
  EXPECT_DOUBLE_EQ(.1e-3, Tokenizer::ParseFloat(".1e-3"));
  EXPECT_DOUBLE_EQ(5    , Tokenizer::ParseFloat("5"));
  EXPECT_DOUBLE_EQ(6e-12, Tokenizer::ParseFloat("6e-12"));
  EXPECT_DOUBLE_EQ(1.2  , Tokenizer::ParseFloat("1.2"));
  EXPECT_DOUBLE_EQ(1.e2 , Tokenizer::ParseFloat("1.e2"));

  // Test invalid floats that may still be tokenized as floats.
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1e"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1e-"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1.e"));

  // Test 'f' suffix.
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1f"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1.0f"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1F"));

  // These should parse successfully even though they are out of range.
  // Overflows become infinity and underflows become zero.
  EXPECT_EQ(     0.0, Tokenizer::ParseFloat("1e-9999999999999999999999999999"));
  EXPECT_EQ(HUGE_VAL, Tokenizer::ParseFloat("1e+9999999999999999999999999999"));

#ifdef GTEST_HAS_DEATH_TEST  // death tests do not work on Windows yet
  // Test invalid floats that will never be tokenized as floats.
  EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("zxy"),
    "passed text that could not have been tokenized as a float");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("1-e0"),
    "passed text that could not have been tokenized as a float");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("-1.0"),
    "passed text that could not have been tokenized as a float");
#endif  // GTEST_HAS_DEATH_TEST
}

TEST_F(TokenizerTest, ParseString) {
  string output;
  Tokenizer::ParseString("'hello'", &output);
  EXPECT_EQ("hello", output);
  Tokenizer::ParseString("\"blah\\nblah2\"", &output);
  EXPECT_EQ("blah\nblah2", output);
  Tokenizer::ParseString("'\\1x\\1\\123\\739\\52\\334n\\3'", &output);
  EXPECT_EQ("\1x\1\123\739\52\334n\3", output);
  Tokenizer::ParseString("'\\x20\\x4'", &output);
  EXPECT_EQ("\x20\x4", output);

  // Test invalid strings that may still be tokenized as strings.
  Tokenizer::ParseString("\"\\a\\l\\v\\t", &output);  // \l is invalid
  EXPECT_EQ("\a?\v\t", output);
  Tokenizer::ParseString("'", &output);
  EXPECT_EQ("", output);
  Tokenizer::ParseString("'\\", &output);
  EXPECT_EQ("\\", output);

  // Test invalid strings that will never be tokenized as strings.
#ifdef GTEST_HAS_DEATH_TEST  // death tests do not work on Windows yet
  EXPECT_DEBUG_DEATH(Tokenizer::ParseString("", &output),
    "passed text that could not have been tokenized as a string");
#endif  // GTEST_HAS_DEATH_TEST
}

TEST_F(TokenizerTest, ParseStringAppend) {
  // Check that ParseString and ParseStringAppend differ.
  string output("stuff+");
  Tokenizer::ParseStringAppend("'hello'", &output);
  EXPECT_EQ("stuff+hello", output);
  Tokenizer::ParseString("'hello'", &output);
  EXPECT_EQ("hello", output);
}

// -------------------------------------------------------------------

// Each case parses some input text, ignoring the tokens produced, and
// checks that the error output matches what is expected.
struct ErrorCase {
  string input;
  bool recoverable;  // True if the tokenizer should be able to recover and
                     // parse more tokens after seeing this error.  Cases
                     // for which this is true must end with "foo" as
                     // the last token, which the test will check for.
  const char* errors;
};

inline ostream& operator<<(ostream& out,
                           const ErrorCase& test_case) {
  return out << CEscape(test_case.input);
}

ErrorCase kErrorCases[] = {
  // String errors.
  { "'\\l' foo", true,
    "0:2: Invalid escape sequence in string literal.\n" },
  { "'\\x' foo", true,
    "0:3: Expected hex digits for escape sequence.\n" },
  { "'foo", false,
    "0:4: String literals cannot cross line boundaries.\n" },
  { "'bar\nfoo", true,
    "0:4: String literals cannot cross line boundaries.\n" },

  // Integer errors.
  { "123foo", true,
    "0:3: Need space between number and identifier.\n" },

  // Hex/octal errors.
  { "0x foo", true,
    "0:2: \"0x\" must be followed by hex digits.\n" },
  { "0541823 foo", true,
    "0:4: Numbers starting with leading zero must be in octal.\n" },
  { "0x123z foo", true,
    "0:5: Need space between number and identifier.\n" },
  { "0x123.4 foo", true,
    "0:5: Hex and octal numbers must be integers.\n" },
  { "0123.4 foo", true,
    "0:4: Hex and octal numbers must be integers.\n" },

  // Float errors.
  { "1e foo", true,
    "0:2: \"e\" must be followed by exponent.\n" },
  { "1e- foo", true,
    "0:3: \"e\" must be followed by exponent.\n" },
  { "1.2.3 foo", true,
    "0:3: Already saw decimal point or exponent; can't have another one.\n" },
  { "1e2.3 foo", true,
    "0:3: Already saw decimal point or exponent; can't have another one.\n" },
  { "a.1 foo", true,
    "0:1: Need space between identifier and decimal point.\n" },
  // allow_f_after_float not enabled, so this should be an error.
  { "1.0f foo", true,
    "0:3: Need space between number and identifier.\n" },

  // Block comment errors.
  { "/*", false,
    "0:2: End-of-file inside block comment.\n"
    "0:0:   Comment started here.\n"},
  { "/*/*/ foo", true,
    "0:3: \"/*\" inside block comment.  Block comments cannot be nested.\n"},

  // Control characters.  Multiple consecutive control characters should only
  // produce one error.
  { "\b foo", true,
    "0:0: Invalid control characters encountered in text.\n" },
  { "\b\b foo", true,
    "0:0: Invalid control characters encountered in text.\n" },

  // Check that control characters at end of input don't result in an
  // infinite loop.
  { "\b", false,
    "0:0: Invalid control characters encountered in text.\n" },

  // Check recovery from '\0'.  We have to explicitly specify the length of
  // these strings because otherwise the string constructor will just call
  // strlen() which will see the first '\0' and think that is the end of the
  // string.
  { string("\0foo", 4), true,
    "0:0: Invalid control characters encountered in text.\n" },
  { string("\0\0foo", 5), true,
    "0:0: Invalid control characters encountered in text.\n" },
};

TEST_2D(TokenizerTest, Errors, kErrorCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kErrorCases_case.input.data(),
                        kErrorCases_case.input.size(),
                        kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Ignore all input, except remember if the last token was "foo".
  bool last_was_foo = false;
  while (tokenizer.Next()) {
    last_was_foo = tokenizer.current().text == "foo";
  }

  // Check that the errors match what was expected.
  EXPECT_EQ(error_collector.text_, kErrorCases_case.errors);

  // If the error was recoverable, make sure we saw "foo" after it.
  if (kErrorCases_case.recoverable) {
    EXPECT_TRUE(last_was_foo);
  }
}

// -------------------------------------------------------------------

TEST_1D(TokenizerTest, BackUpOnDestruction, kBlockSizes) {
  string text = "foo bar";
  TestInputStream input(text.data(), text.size(), kBlockSizes_case);

  // Create a tokenizer, read one token, then destroy it.
  {
    TestErrorCollector error_collector;
    Tokenizer tokenizer(&input, &error_collector);

    tokenizer.Next();
  }

  // Only "foo" should have been read.
  EXPECT_EQ(strlen("foo"), input.ByteCount());
}

}  // namespace
}  // namespace io
}  // namespace protobuf
}  // namespace google