/thirdparty/breakpad/third_party/protobuf/protobuf/src/google/protobuf/io/tokenizer_unittest.cc


// Protocol Buffers - Google's data interchange format
// Copyright 2008 Google Inc. All rights reserved.
// http://code.google.com/p/protobuf/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// Author: kenton@google.com (Kenton Varda)
//  Based on original Protocol Buffers design by
//  Sanjay Ghemawat, Jeff Dean, and others.
#include <vector>
#include <math.h>
#include <limits.h>

#include <google/protobuf/io/tokenizer.h>
#include <google/protobuf/io/zero_copy_stream_impl.h>
#include <google/protobuf/stubs/common.h>
#include <google/protobuf/stubs/strutil.h>
#include <google/protobuf/stubs/substitute.h>
#include <google/protobuf/testing/googletest.h>
#include <gtest/gtest.h>
namespace google {
namespace protobuf {
namespace io {
namespace {
// ===================================================================
// Data-Driven Test Infrastructure

// TODO(kenton): This is copied from coded_stream_unittest. This is
// temporary until these features are integrated into gTest itself.

// TEST_1D and TEST_2D are macros I'd eventually like to see added to
// gTest. These macros can be used to declare tests which should be
// run multiple times, once for each item in some input array. TEST_1D
// tests all cases in a single input array. TEST_2D tests all
// combinations of cases from two arrays. The arrays must be statically
// defined such that the GOOGLE_ARRAYSIZE() macro works on them. Example:
//
// int kCases[] = {1, 2, 3, 4};
// TEST_1D(MyFixture, MyTest, kCases) {
//   EXPECT_GT(kCases_case, 0);
// }
//
// This test iterates through the numbers 1, 2, 3, and 4 and tests that
// they are all greater than zero. In case of failure, the exact case
// which failed will be printed. The case type must be printable using
// ostream::operator<<.
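//
// TEST_2D works the same way but takes two case arrays, and the test body
// receives one "<array>_case" parameter per array. As an illustrative
// sketch (these names are hypothetical, not part of this file):
//
// int kCases[] = {1, 2, 3, 4};
// const char* kNames[] = {"foo", "bar"};
// TEST_2D(MyFixture, MyTest, kCases, kNames) {
//   EXPECT_GT(kCases_case, 0);
//   EXPECT_TRUE(kNames_case != NULL);
// }
//
// The body runs once for every (kCases[i], kNames[j]) combination, and a
// failure message identifies both cases.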

#define TEST_1D(FIXTURE, NAME, CASES) \
  class FIXTURE##_##NAME##_DD : public FIXTURE { \
   protected: \
    template <typename CaseType> \
    void DoSingleCase(const CaseType& CASES##_case); \
  }; \
  \
  TEST_F(FIXTURE##_##NAME##_DD, NAME) { \
    for (int i = 0; i < GOOGLE_ARRAYSIZE(CASES); i++) { \
      SCOPED_TRACE(testing::Message() \
                   << #CASES " case #" << i << ": " << CASES[i]); \
      DoSingleCase(CASES[i]); \
    } \
  } \
  \
  template <typename CaseType> \
  void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType& CASES##_case)

#define TEST_2D(FIXTURE, NAME, CASES1, CASES2) \
  class FIXTURE##_##NAME##_DD : public FIXTURE { \
   protected: \
    template <typename CaseType1, typename CaseType2> \
    void DoSingleCase(const CaseType1& CASES1##_case, \
                      const CaseType2& CASES2##_case); \
  }; \
  \
  TEST_F(FIXTURE##_##NAME##_DD, NAME) { \
    for (int i = 0; i < GOOGLE_ARRAYSIZE(CASES1); i++) { \
      for (int j = 0; j < GOOGLE_ARRAYSIZE(CASES2); j++) { \
        SCOPED_TRACE(testing::Message() \
                     << #CASES1 " case #" << i << ": " << CASES1[i] << ", " \
                     << #CASES2 " case #" << j << ": " << CASES2[j]); \
        DoSingleCase(CASES1[i], CASES2[j]); \
      } \
    } \
  } \
  \
  template <typename CaseType1, typename CaseType2> \
  void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType1& CASES1##_case, \
                                           const CaseType2& CASES2##_case)
// -------------------------------------------------------------------

// An input stream that is basically like an ArrayInputStream but sometimes
// returns empty buffers, just to throw us off.
class TestInputStream : public ZeroCopyInputStream {
 public:
  TestInputStream(const void* data, int size, int block_size)
    : array_stream_(data, size, block_size), counter_(0) {}
  ~TestInputStream() {}

  // implements ZeroCopyInputStream ----------------------------------
  bool Next(const void** data, int* size) {
    // We'll return empty buffers starting with the first buffer, and every
    // 3 and 5 buffers after that.
    if (counter_ % 3 == 0 || counter_ % 5 == 0) {
      *data = NULL;
      *size = 0;
      ++counter_;
      return true;
    } else {
      ++counter_;
      return array_stream_.Next(data, size);
    }
  }

  void BackUp(int count)  { return array_stream_.BackUp(count); }
  bool Skip(int count)    { return array_stream_.Skip(count); }
  int64 ByteCount() const { return array_stream_.ByteCount(); }

 private:
  ArrayInputStream array_stream_;
  int counter_;
};
// -------------------------------------------------------------------

// An error collector which simply concatenates all its errors into a big
// block of text which can be checked.
class TestErrorCollector : public ErrorCollector {
 public:
  TestErrorCollector() {}
  ~TestErrorCollector() {}

  string text_;

  // implements ErrorCollector ---------------------------------------
  void AddError(int line, int column, const string& message) {
    strings::SubstituteAndAppend(&text_, "$0:$1: $2\n",
                                 line, column, message);
  }
};
// -------------------------------------------------------------------

// We test each operation over a variety of block sizes to ensure that
// we test cases where reads cross buffer boundaries as well as cases
// where they don't. This is sort of a brute-force approach to this,
// but it's easy to write and easy to understand.
const int kBlockSizes[] = {1, 2, 3, 5, 7, 13, 32, 1024};

class TokenizerTest : public testing::Test {
 protected:
  // For easy testing.
  uint64 ParseInteger(const string& text) {
    uint64 result;
    EXPECT_TRUE(Tokenizer::ParseInteger(text, kuint64max, &result));
    return result;
  }
};
// ===================================================================

// These tests cause gcc 3.3.5 (and earlier?) to give the cryptic error:
//   "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
#if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)

// In each test case, the entire input text should parse as a single token
// of the given type.
struct SimpleTokenCase {
  string input;
  Tokenizer::TokenType type;
};

inline ostream& operator<<(ostream& out,
                           const SimpleTokenCase& test_case) {
  return out << CEscape(test_case.input);
}
SimpleTokenCase kSimpleTokenCases[] = {
  // Test identifiers.
  { "hello",       Tokenizer::TYPE_IDENTIFIER },

  // Test integers.
  { "123",         Tokenizer::TYPE_INTEGER },
  { "0xab6",       Tokenizer::TYPE_INTEGER },
  { "0XAB6",       Tokenizer::TYPE_INTEGER },
  { "0X1234567",   Tokenizer::TYPE_INTEGER },
  { "0x89abcdef",  Tokenizer::TYPE_INTEGER },
  { "0x89ABCDEF",  Tokenizer::TYPE_INTEGER },
  { "01234567",    Tokenizer::TYPE_INTEGER },

  // Test floats.
  { "123.45",      Tokenizer::TYPE_FLOAT },
  { "1.",          Tokenizer::TYPE_FLOAT },
  { "1e3",         Tokenizer::TYPE_FLOAT },
  { "1E3",         Tokenizer::TYPE_FLOAT },
  { "1e-3",        Tokenizer::TYPE_FLOAT },
  { "1e+3",        Tokenizer::TYPE_FLOAT },
  { "1.e3",        Tokenizer::TYPE_FLOAT },
  { "1.2e3",       Tokenizer::TYPE_FLOAT },
  { ".1",          Tokenizer::TYPE_FLOAT },
  { ".1e3",        Tokenizer::TYPE_FLOAT },
  { ".1e-3",       Tokenizer::TYPE_FLOAT },
  { ".1e+3",       Tokenizer::TYPE_FLOAT },

  // Test strings.
  { "'hello'",     Tokenizer::TYPE_STRING },
  { "\"foo\"",     Tokenizer::TYPE_STRING },
  { "'a\"b'",      Tokenizer::TYPE_STRING },
  { "\"a'b\"",     Tokenizer::TYPE_STRING },
  { "'a\\'b'",     Tokenizer::TYPE_STRING },
  { "\"a\\\"b\"",  Tokenizer::TYPE_STRING },
  { "'\\xf'",      Tokenizer::TYPE_STRING },
  { "'\\0'",       Tokenizer::TYPE_STRING },

  // Test symbols.
  { "+",           Tokenizer::TYPE_SYMBOL },
  { ".",           Tokenizer::TYPE_SYMBOL },
};
TEST_2D(TokenizerTest, SimpleTokens, kSimpleTokenCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kSimpleTokenCases_case.input.data(),
                        kSimpleTokenCases_case.input.size(),
                        kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Before Next() is called, the initial token should always be TYPE_START.
  EXPECT_EQ(Tokenizer::TYPE_START, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);
  EXPECT_EQ(0, tokenizer.current().end_column);

  // Parse the token.
  ASSERT_TRUE(tokenizer.Next());

  // Check that it has the right type.
  EXPECT_EQ(kSimpleTokenCases_case.type, tokenizer.current().type);
  // Check that it contains the complete input text.
  EXPECT_EQ(kSimpleTokenCases_case.input, tokenizer.current().text);
  // Check that it is located at the beginning of the input.
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);
  EXPECT_EQ(kSimpleTokenCases_case.input.size(),
            tokenizer.current().end_column);

  // There should be no more input.
  EXPECT_FALSE(tokenizer.Next());

  // After Next() returns false, the token should have type TYPE_END.
  EXPECT_EQ(Tokenizer::TYPE_END, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(kSimpleTokenCases_case.input.size(), tokenizer.current().column);
  EXPECT_EQ(kSimpleTokenCases_case.input.size(),
            tokenizer.current().end_column);

  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}
TEST_1D(TokenizerTest, FloatSuffix, kBlockSizes) {
  // Test the "allow_f_after_float" option.

  // Set up the tokenizer.
  const char* text = "1f 2.5f 6e3f 7F";
  TestInputStream input(text, strlen(text), kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);
  tokenizer.set_allow_f_after_float(true);

  // Advance through tokens and check that they are parsed as expected.
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "1f");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "2.5f");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "6e3f");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "7F");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);

  // There should be no more input.
  EXPECT_FALSE(tokenizer.Next());
  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

#endif
// -------------------------------------------------------------------

// In each case, the input is parsed to produce a list of tokens. The
// last token in "output" must have type TYPE_END.
struct MultiTokenCase {
  string input;
  Tokenizer::Token output[10];  // The compiler wants a constant array
                                // size for initialization to work. There
                                // is no reason this can't be increased if
                                // needed.
};

inline ostream& operator<<(ostream& out,
                           const MultiTokenCase& test_case) {
  return out << CEscape(test_case.input);
}
MultiTokenCase kMultiTokenCases[] = {
  // Test empty input.
  { "", {
    { Tokenizer::TYPE_END       , ""     , 0,  0 },
  }},

  // Test all token types at the same time.
  { "foo 1 1.2 + 'bar'", {
    { Tokenizer::TYPE_IDENTIFIER, "foo"  , 0,  0,  3 },
    { Tokenizer::TYPE_INTEGER   , "1"    , 0,  4,  5 },
    { Tokenizer::TYPE_FLOAT     , "1.2"  , 0,  6,  9 },
    { Tokenizer::TYPE_SYMBOL    , "+"    , 0, 10, 11 },
    { Tokenizer::TYPE_STRING    , "'bar'", 0, 12, 17 },
    { Tokenizer::TYPE_END       , ""     , 0, 17, 17 },
  }},

  // Test that consecutive symbols are parsed as separate tokens.
  { "!@+%", {
    { Tokenizer::TYPE_SYMBOL    , "!"    , 0, 0, 1 },
    { Tokenizer::TYPE_SYMBOL    , "@"    , 0, 1, 2 },
    { Tokenizer::TYPE_SYMBOL    , "+"    , 0, 2, 3 },
    { Tokenizer::TYPE_SYMBOL    , "%"    , 0, 3, 4 },
    { Tokenizer::TYPE_END       , ""     , 0, 4, 4 },
  }},

  // Test that newlines affect line numbers correctly.
  { "foo bar\nrab oof", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0, 4, 7 },
    { Tokenizer::TYPE_IDENTIFIER, "rab", 1, 0, 3 },
    { Tokenizer::TYPE_IDENTIFIER, "oof", 1, 4, 7 },
    { Tokenizer::TYPE_END       , ""   , 1, 7, 7 },
  }},

  // Test that tabs affect column numbers correctly.
  { "foo\tbar \tbaz", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0,  3 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0,  8, 11 },
    { Tokenizer::TYPE_IDENTIFIER, "baz", 0, 16, 19 },
    { Tokenizer::TYPE_END       , ""   , 0, 19, 19 },
  }},

  // Test that tabs in string literals affect column numbers correctly.
  { "\"foo\tbar\" baz", {
    { Tokenizer::TYPE_STRING    , "\"foo\tbar\"", 0,  0, 12 },
    { Tokenizer::TYPE_IDENTIFIER, "baz"         , 0, 13, 16 },
    { Tokenizer::TYPE_END       , ""            , 0, 16, 16 },
  }},

  // Test that line comments are ignored.
  { "foo // This is a comment\n"
    "bar // This is another comment", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0,  3 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 1,  0,  3 },
    { Tokenizer::TYPE_END       , ""   , 1, 30, 30 },
  }},

  // Test that block comments are ignored.
  { "foo /* This is a block comment */ bar", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0,  3 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0, 34, 37 },
    { Tokenizer::TYPE_END       , ""   , 0, 37, 37 },
  }},

  // Test that sh-style comments are not ignored by default.
  { "foo # bar\n"
    "baz", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3 },
    { Tokenizer::TYPE_SYMBOL    , "#"  , 0, 4, 5 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0, 6, 9 },
    { Tokenizer::TYPE_IDENTIFIER, "baz", 1, 0, 3 },
    { Tokenizer::TYPE_END       , ""   , 1, 3, 3 },
  }},

  // Bytes with the high-order bit set should not be seen as control
  // characters.
  { "\300", {
    { Tokenizer::TYPE_SYMBOL, "\300", 0, 0, 1 },
    { Tokenizer::TYPE_END   , ""    , 0, 1, 1 },
  }},

  // Test all whitespace chars.
  { "foo\n\t\r\v\fbar", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0,  3 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 1, 11, 14 },
    { Tokenizer::TYPE_END       , ""   , 1, 14, 14 },
  }},
};
TEST_2D(TokenizerTest, MultipleTokens, kMultiTokenCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kMultiTokenCases_case.input.data(),
                        kMultiTokenCases_case.input.size(),
                        kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Before Next() is called, the initial token should always be TYPE_START.
  EXPECT_EQ(Tokenizer::TYPE_START, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);
  EXPECT_EQ(0, tokenizer.current().end_column);

  // Loop through all expected tokens.
  int i = 0;
  Tokenizer::Token token;
  do {
    token = kMultiTokenCases_case.output[i++];

    SCOPED_TRACE(testing::Message() << "Token #" << i << ": " << token.text);

    Tokenizer::Token previous = tokenizer.current();

    // Next() should only return false when it hits the end token.
    if (token.type != Tokenizer::TYPE_END) {
      ASSERT_TRUE(tokenizer.Next());
    } else {
      ASSERT_FALSE(tokenizer.Next());
    }

    // Check that the previous token is set correctly.
    EXPECT_EQ(previous.type, tokenizer.previous().type);
    EXPECT_EQ(previous.text, tokenizer.previous().text);
    EXPECT_EQ(previous.line, tokenizer.previous().line);
    EXPECT_EQ(previous.column, tokenizer.previous().column);
    EXPECT_EQ(previous.end_column, tokenizer.previous().end_column);

    // Check that the token matches the expected one.
    EXPECT_EQ(token.type, tokenizer.current().type);
    EXPECT_EQ(token.text, tokenizer.current().text);
    EXPECT_EQ(token.line, tokenizer.current().line);
    EXPECT_EQ(token.column, tokenizer.current().column);
    EXPECT_EQ(token.end_column, tokenizer.current().end_column);
  } while (token.type != Tokenizer::TYPE_END);

  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}
// This test causes gcc 3.3.5 (and earlier?) to give the cryptic error:
//   "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
#if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)

TEST_1D(TokenizerTest, ShCommentStyle, kBlockSizes) {
  // Test the "comment_style" option.

  const char* text = "foo # bar\n"
                     "baz // qux\n"
                     "corge /* grault */\n"
                     "garply";
  const char* const kTokens[] = {"foo",  // "# bar" is ignored
                                 "baz", "/", "/", "qux",
                                 "corge", "/", "*", "grault", "*", "/",
                                 "garply"};

  // Set up the tokenizer.
  TestInputStream input(text, strlen(text), kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);
  tokenizer.set_comment_style(Tokenizer::SH_COMMENT_STYLE);

  // Advance through tokens and check that they are parsed as expected.
  for (int i = 0; i < GOOGLE_ARRAYSIZE(kTokens); i++) {
    EXPECT_TRUE(tokenizer.Next());
    EXPECT_EQ(tokenizer.current().text, kTokens[i]);
  }

  // There should be no more input.
  EXPECT_FALSE(tokenizer.Next());
  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

#endif
// -------------------------------------------------------------------

// Test parse helpers. It's not really worth setting up a full data-driven
// test here.
TEST_F(TokenizerTest, ParseInteger) {
  EXPECT_EQ(0, ParseInteger("0"));
  EXPECT_EQ(123, ParseInteger("123"));
  EXPECT_EQ(0xabcdef12u, ParseInteger("0xabcdef12"));
  EXPECT_EQ(0xabcdef12u, ParseInteger("0xABCDEF12"));
  EXPECT_EQ(kuint64max, ParseInteger("0xFFFFFFFFFFFFFFFF"));
  EXPECT_EQ(01234567, ParseInteger("01234567"));
  EXPECT_EQ(0X123, ParseInteger("0X123"));

  // Test invalid integers that may still be tokenized as integers.
  EXPECT_EQ(0, ParseInteger("0x"));

  uint64 i;
#ifdef GTEST_HAS_DEATH_TEST  // death tests do not work on Windows yet
  // Test invalid integers that will never be tokenized as integers.
  EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("zxy", kuint64max, &i),
    "passed text that could not have been tokenized as an integer");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("1.2", kuint64max, &i),
    "passed text that could not have been tokenized as an integer");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("08", kuint64max, &i),
    "passed text that could not have been tokenized as an integer");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("0xg", kuint64max, &i),
    "passed text that could not have been tokenized as an integer");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("-1", kuint64max, &i),
    "passed text that could not have been tokenized as an integer");
#endif  // GTEST_HAS_DEATH_TEST

  // Test overflows.
  EXPECT_TRUE (Tokenizer::ParseInteger("0", 0, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("1", 0, &i));
  EXPECT_TRUE (Tokenizer::ParseInteger("1", 1, &i));
  EXPECT_TRUE (Tokenizer::ParseInteger("12345", 12345, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("12346", 12345, &i));
  EXPECT_TRUE (Tokenizer::ParseInteger("0xFFFFFFFFFFFFFFFF" , kuint64max, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("0x10000000000000000", kuint64max, &i));
}
TEST_F(TokenizerTest, ParseFloat) {
  EXPECT_DOUBLE_EQ(1    , Tokenizer::ParseFloat("1."));
  EXPECT_DOUBLE_EQ(1e3  , Tokenizer::ParseFloat("1e3"));
  EXPECT_DOUBLE_EQ(1e3  , Tokenizer::ParseFloat("1E3"));
  EXPECT_DOUBLE_EQ(1.5e3, Tokenizer::ParseFloat("1.5e3"));
  EXPECT_DOUBLE_EQ(.1   , Tokenizer::ParseFloat(".1"));
  EXPECT_DOUBLE_EQ(.25  , Tokenizer::ParseFloat(".25"));
  EXPECT_DOUBLE_EQ(.1e3 , Tokenizer::ParseFloat(".1e3"));
  EXPECT_DOUBLE_EQ(.25e3, Tokenizer::ParseFloat(".25e3"));
  EXPECT_DOUBLE_EQ(.1e+3, Tokenizer::ParseFloat(".1e+3"));
  EXPECT_DOUBLE_EQ(.1e-3, Tokenizer::ParseFloat(".1e-3"));
  EXPECT_DOUBLE_EQ(5    , Tokenizer::ParseFloat("5"));
  EXPECT_DOUBLE_EQ(6e-12, Tokenizer::ParseFloat("6e-12"));
  EXPECT_DOUBLE_EQ(1.2  , Tokenizer::ParseFloat("1.2"));
  EXPECT_DOUBLE_EQ(1.e2 , Tokenizer::ParseFloat("1.e2"));

  // Test invalid floats that may still be tokenized as floats.
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1e"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1e-"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1.e"));

  // Test 'f' suffix.
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1f"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1.0f"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1F"));

  // These should parse successfully even though they are out of range.
  // Overflows become infinity and underflows become zero.
  EXPECT_EQ(     0.0, Tokenizer::ParseFloat("1e-9999999999999999999999999999"));
  EXPECT_EQ(HUGE_VAL, Tokenizer::ParseFloat("1e+9999999999999999999999999999"));

#ifdef GTEST_HAS_DEATH_TEST  // death tests do not work on Windows yet
  // Test invalid floats that will never be tokenized as floats.
  EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("zxy"),
    "passed text that could not have been tokenized as a float");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("1-e0"),
    "passed text that could not have been tokenized as a float");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("-1.0"),
    "passed text that could not have been tokenized as a float");
#endif  // GTEST_HAS_DEATH_TEST
}
TEST_F(TokenizerTest, ParseString) {
  string output;
  Tokenizer::ParseString("'hello'", &output);
  EXPECT_EQ("hello", output);
  Tokenizer::ParseString("\"blah\\nblah2\"", &output);
  EXPECT_EQ("blah\nblah2", output);
  Tokenizer::ParseString("'\\1x\\1\\123\\739\\52\\334n\\3'", &output);
  EXPECT_EQ("\1x\1\123\739\52\334n\3", output);
  Tokenizer::ParseString("'\\x20\\x4'", &output);
  EXPECT_EQ("\x20\x4", output);

  // Test invalid strings that may still be tokenized as strings.
  Tokenizer::ParseString("\"\\a\\l\\v\\t", &output);  // \l is invalid
  EXPECT_EQ("\a?\v\t", output);
  Tokenizer::ParseString("'", &output);
  EXPECT_EQ("", output);
  Tokenizer::ParseString("'\\", &output);
  EXPECT_EQ("\\", output);

  // Test invalid strings that will never be tokenized as strings.
#ifdef GTEST_HAS_DEATH_TEST  // death tests do not work on Windows yet
  EXPECT_DEBUG_DEATH(Tokenizer::ParseString("", &output),
    "passed text that could not have been tokenized as a string");
#endif  // GTEST_HAS_DEATH_TEST
}
TEST_F(TokenizerTest, ParseStringAppend) {
  // Check that ParseString and ParseStringAppend differ.
  string output("stuff+");
  Tokenizer::ParseStringAppend("'hello'", &output);
  EXPECT_EQ("stuff+hello", output);
  Tokenizer::ParseString("'hello'", &output);
  EXPECT_EQ("hello", output);
}
// -------------------------------------------------------------------

// Each case parses some input text, ignoring the tokens produced, and
// checks that the error output matches what is expected.
struct ErrorCase {
  string input;
  bool recoverable;  // True if the tokenizer should be able to recover and
                     // parse more tokens after seeing this error. Cases
                     // for which this is true must end with "foo" as
                     // the last token, which the test will check for.
  const char* errors;
};

inline ostream& operator<<(ostream& out,
                           const ErrorCase& test_case) {
  return out << CEscape(test_case.input);
}
ErrorCase kErrorCases[] = {
  // String errors.
  { "'\\l' foo", true,
    "0:2: Invalid escape sequence in string literal.\n" },
  { "'\\x' foo", true,
    "0:3: Expected hex digits for escape sequence.\n" },
  { "'foo", false,
    "0:4: String literals cannot cross line boundaries.\n" },
  { "'bar\nfoo", true,
    "0:4: String literals cannot cross line boundaries.\n" },

  // Integer errors.
  { "123foo", true,
    "0:3: Need space between number and identifier.\n" },

  // Hex/octal errors.
  { "0x foo", true,
    "0:2: \"0x\" must be followed by hex digits.\n" },
  { "0541823 foo", true,
    "0:4: Numbers starting with leading zero must be in octal.\n" },
  { "0x123z foo", true,
    "0:5: Need space between number and identifier.\n" },
  { "0x123.4 foo", true,
    "0:5: Hex and octal numbers must be integers.\n" },
  { "0123.4 foo", true,
    "0:4: Hex and octal numbers must be integers.\n" },

  // Float errors.
  { "1e foo", true,
    "0:2: \"e\" must be followed by exponent.\n" },
  { "1e- foo", true,
    "0:3: \"e\" must be followed by exponent.\n" },
  { "1.2.3 foo", true,
    "0:3: Already saw decimal point or exponent; can't have another one.\n" },
  { "1e2.3 foo", true,
    "0:3: Already saw decimal point or exponent; can't have another one.\n" },
  { "a.1 foo", true,
    "0:1: Need space between identifier and decimal point.\n" },
  // allow_f_after_float not enabled, so this should be an error.
  { "1.0f foo", true,
    "0:3: Need space between number and identifier.\n" },

  // Block comment errors.
  { "/*", false,
    "0:2: End-of-file inside block comment.\n"
    "0:0: Comment started here.\n"},
  { "/*/*/ foo", true,
    "0:3: \"/*\" inside block comment. Block comments cannot be nested.\n"},

  // Control characters. Multiple consecutive control characters should only
  // produce one error.
  { "\b foo", true,
    "0:0: Invalid control characters encountered in text.\n" },
  { "\b\b foo", true,
    "0:0: Invalid control characters encountered in text.\n" },

  // Check that control characters at end of input don't result in an
  // infinite loop.
  { "\b", false,
    "0:0: Invalid control characters encountered in text.\n" },

  // Check recovery from '\0'. We have to explicitly specify the length of
  // these strings because otherwise the string constructor will just call
  // strlen() which will see the first '\0' and think that is the end of the
  // string.
  { string("\0foo", 4), true,
    "0:0: Invalid control characters encountered in text.\n" },
  { string("\0\0foo", 5), true,
    "0:0: Invalid control characters encountered in text.\n" },
};
TEST_2D(TokenizerTest, Errors, kErrorCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kErrorCases_case.input.data(),
                        kErrorCases_case.input.size(),
                        kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Ignore all input, except remember if the last token was "foo".
  bool last_was_foo = false;
  while (tokenizer.Next()) {
    last_was_foo = tokenizer.current().text == "foo";
  }

  // Check that the errors match what was expected.
  EXPECT_EQ(error_collector.text_, kErrorCases_case.errors);

  // If the error was recoverable, make sure we saw "foo" after it.
  if (kErrorCases_case.recoverable) {
    EXPECT_TRUE(last_was_foo);
  }
}
// -------------------------------------------------------------------

TEST_1D(TokenizerTest, BackUpOnDestruction, kBlockSizes) {
  string text = "foo bar";
  TestInputStream input(text.data(), text.size(), kBlockSizes_case);

  // Create a tokenizer, read one token, then destroy it.
  {
    TestErrorCollector error_collector;
    Tokenizer tokenizer(&input, &error_collector);

    tokenizer.Next();
  }

  // Only "foo" should have been read.
  EXPECT_EQ(strlen("foo"), input.ByteCount());
}
}  // namespace
}  // namespace io
}  // namespace protobuf
}  // namespace google