/thirdparty/breakpad/third_party/protobuf/protobuf/src/google/protobuf/io/tokenizer_unittest.cc


// Protocol Buffers - Google's data interchange format
// Copyright 2008 Google Inc. All rights reserved.
// http://code.google.com/p/protobuf/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// Author: kenton@google.com (Kenton Varda)
//  Based on original Protocol Buffers design by
//  Sanjay Ghemawat, Jeff Dean, and others.
#include <vector>
#include <math.h>
#include <limits.h>

#include <google/protobuf/io/tokenizer.h>
#include <google/protobuf/io/zero_copy_stream_impl.h>
#include <google/protobuf/stubs/common.h>
#include <google/protobuf/stubs/strutil.h>
#include <google/protobuf/stubs/substitute.h>
#include <google/protobuf/testing/googletest.h>
#include <gtest/gtest.h>
namespace google {
namespace protobuf {
namespace io {
namespace {
// ===================================================================
// Data-Driven Test Infrastructure

// TODO(kenton): This is copied from coded_stream_unittest. This is
// temporary until these features are integrated into gTest itself.

// TEST_1D and TEST_2D are macros I'd eventually like to see added to
// gTest. These macros can be used to declare tests which should be
// run multiple times, once for each item in some input array. TEST_1D
// tests all cases in a single input array. TEST_2D tests all
// combinations of cases from two arrays. The arrays must be statically
// defined such that the GOOGLE_ARRAYSIZE() macro works on them. Example:
//
// int kCases[] = {1, 2, 3, 4};
// TEST_1D(MyFixture, MyTest, kCases) {
//   EXPECT_GT(kCases_case, 0);
// }
//
// This test iterates through the numbers 1, 2, 3, and 4 and tests that
// they are all greater than zero. In case of failure, the exact case
// which failed will be printed. The case type must be printable using
// ostream::operator<<.
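//
// TEST_2D works the same way but takes two case arrays, and the test body
// receives one "<array>_case" parameter per array. As an illustrative
// sketch (these names are hypothetical, not part of this file):
//
// int kCases[] = {1, 2, 3, 4};
// const char* kNames[] = {"foo", "bar"};
// TEST_2D(MyFixture, MyTest, kCases, kNames) {
//   EXPECT_GT(kCases_case, 0);
//   EXPECT_TRUE(kNames_case != NULL);
// }
//
// The body runs once for every (kCases[i], kNames[j]) combination, and a
// failure message identifies both cases.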

#define TEST_1D(FIXTURE, NAME, CASES) \
  class FIXTURE##_##NAME##_DD : public FIXTURE { \
   protected: \
    template <typename CaseType> \
    void DoSingleCase(const CaseType& CASES##_case); \
  }; \
  \
  TEST_F(FIXTURE##_##NAME##_DD, NAME) { \
    for (int i = 0; i < GOOGLE_ARRAYSIZE(CASES); i++) { \
      SCOPED_TRACE(testing::Message() \
                   << #CASES " case #" << i << ": " << CASES[i]); \
      DoSingleCase(CASES[i]); \
    } \
  } \
  \
  template <typename CaseType> \
  void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType& CASES##_case)

#define TEST_2D(FIXTURE, NAME, CASES1, CASES2) \
  class FIXTURE##_##NAME##_DD : public FIXTURE { \
   protected: \
    template <typename CaseType1, typename CaseType2> \
    void DoSingleCase(const CaseType1& CASES1##_case, \
                      const CaseType2& CASES2##_case); \
  }; \
  \
  TEST_F(FIXTURE##_##NAME##_DD, NAME) { \
    for (int i = 0; i < GOOGLE_ARRAYSIZE(CASES1); i++) { \
      for (int j = 0; j < GOOGLE_ARRAYSIZE(CASES2); j++) { \
        SCOPED_TRACE(testing::Message() \
                     << #CASES1 " case #" << i << ": " << CASES1[i] << ", " \
                     << #CASES2 " case #" << j << ": " << CASES2[j]); \
        DoSingleCase(CASES1[i], CASES2[j]); \
      } \
    } \
  } \
  \
  template <typename CaseType1, typename CaseType2> \
  void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType1& CASES1##_case, \
                                           const CaseType2& CASES2##_case)
// -------------------------------------------------------------------

// An input stream that is basically like an ArrayInputStream but sometimes
// returns empty buffers, just to throw us off.
class TestInputStream : public ZeroCopyInputStream {
 public:
  TestInputStream(const void* data, int size, int block_size)
    : array_stream_(data, size, block_size), counter_(0) {}
  ~TestInputStream() {}

  // implements ZeroCopyInputStream ----------------------------------
  bool Next(const void** data, int* size) {
    // We'll return empty buffers starting with the first buffer, and every
    // 3 and 5 buffers after that.
    if (counter_ % 3 == 0 || counter_ % 5 == 0) {
      *data = NULL;
      *size = 0;
      ++counter_;
      return true;
    } else {
      ++counter_;
      return array_stream_.Next(data, size);
    }
  }

  void BackUp(int count)  { return array_stream_.BackUp(count); }
  bool Skip(int count)    { return array_stream_.Skip(count); }
  int64 ByteCount() const { return array_stream_.ByteCount(); }

 private:
  ArrayInputStream array_stream_;
  int counter_;
};
// -------------------------------------------------------------------

// An error collector which simply concatenates all its errors into a big
// block of text which can be checked.
class TestErrorCollector : public ErrorCollector {
 public:
  TestErrorCollector() {}
  ~TestErrorCollector() {}

  string text_;

  // implements ErrorCollector ---------------------------------------
  void AddError(int line, int column, const string& message) {
    strings::SubstituteAndAppend(&text_, "$0:$1: $2\n",
                                 line, column, message);
  }
};
// -------------------------------------------------------------------

// We test each operation over a variety of block sizes to ensure that
// we test cases where reads cross buffer boundaries as well as cases
// where they don't. This is sort of a brute-force approach to this,
// but it's easy to write and easy to understand.
const int kBlockSizes[] = {1, 2, 3, 5, 7, 13, 32, 1024};

class TokenizerTest : public testing::Test {
 protected:
  // For easy testing.
  uint64 ParseInteger(const string& text) {
    uint64 result;
    EXPECT_TRUE(Tokenizer::ParseInteger(text, kuint64max, &result));
    return result;
  }
};
// ===================================================================

// These tests cause gcc 3.3.5 (and earlier?) to give the cryptic error:
//   "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
#if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)

// In each test case, the entire input text should parse as a single token
// of the given type.
struct SimpleTokenCase {
  string input;
  Tokenizer::TokenType type;
};

inline ostream& operator<<(ostream& out,
                           const SimpleTokenCase& test_case) {
  return out << CEscape(test_case.input);
}
SimpleTokenCase kSimpleTokenCases[] = {
  // Test identifiers.
  { "hello",       Tokenizer::TYPE_IDENTIFIER },

  // Test integers.
  { "123",         Tokenizer::TYPE_INTEGER },
  { "0xab6",       Tokenizer::TYPE_INTEGER },
  { "0XAB6",       Tokenizer::TYPE_INTEGER },
  { "0X1234567",   Tokenizer::TYPE_INTEGER },
  { "0x89abcdef",  Tokenizer::TYPE_INTEGER },
  { "0x89ABCDEF",  Tokenizer::TYPE_INTEGER },
  { "01234567",    Tokenizer::TYPE_INTEGER },

  // Test floats.
  { "123.45",      Tokenizer::TYPE_FLOAT },
  { "1.",          Tokenizer::TYPE_FLOAT },
  { "1e3",         Tokenizer::TYPE_FLOAT },
  { "1E3",         Tokenizer::TYPE_FLOAT },
  { "1e-3",        Tokenizer::TYPE_FLOAT },
  { "1e+3",        Tokenizer::TYPE_FLOAT },
  { "1.e3",        Tokenizer::TYPE_FLOAT },
  { "1.2e3",       Tokenizer::TYPE_FLOAT },
  { ".1",          Tokenizer::TYPE_FLOAT },
  { ".1e3",        Tokenizer::TYPE_FLOAT },
  { ".1e-3",       Tokenizer::TYPE_FLOAT },
  { ".1e+3",       Tokenizer::TYPE_FLOAT },

  // Test strings.
  { "'hello'",     Tokenizer::TYPE_STRING },
  { "\"foo\"",     Tokenizer::TYPE_STRING },
  { "'a\"b'",      Tokenizer::TYPE_STRING },
  { "\"a'b\"",     Tokenizer::TYPE_STRING },
  { "'a\\'b'",     Tokenizer::TYPE_STRING },
  { "\"a\\\"b\"",  Tokenizer::TYPE_STRING },
  { "'\\xf'",      Tokenizer::TYPE_STRING },
  { "'\\0'",       Tokenizer::TYPE_STRING },

  // Test symbols.
  { "+",           Tokenizer::TYPE_SYMBOL },
  { ".",           Tokenizer::TYPE_SYMBOL },
};
TEST_2D(TokenizerTest, SimpleTokens, kSimpleTokenCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kSimpleTokenCases_case.input.data(),
                        kSimpleTokenCases_case.input.size(),
                        kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Before Next() is called, the initial token should always be TYPE_START.
  EXPECT_EQ(Tokenizer::TYPE_START, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);
  EXPECT_EQ(0, tokenizer.current().end_column);

  // Parse the token.
  ASSERT_TRUE(tokenizer.Next());

  // Check that it has the right type.
  EXPECT_EQ(kSimpleTokenCases_case.type, tokenizer.current().type);
  // Check that it contains the complete input text.
  EXPECT_EQ(kSimpleTokenCases_case.input, tokenizer.current().text);
  // Check that it is located at the beginning of the input.
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);
  EXPECT_EQ(kSimpleTokenCases_case.input.size(),
            tokenizer.current().end_column);

  // There should be no more input.
  EXPECT_FALSE(tokenizer.Next());

  // After Next() returns false, the token should have type TYPE_END.
  EXPECT_EQ(Tokenizer::TYPE_END, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(kSimpleTokenCases_case.input.size(), tokenizer.current().column);
  EXPECT_EQ(kSimpleTokenCases_case.input.size(),
            tokenizer.current().end_column);

  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}
TEST_1D(TokenizerTest, FloatSuffix, kBlockSizes) {
  // Test the "allow_f_after_float" option.

  // Set up the tokenizer.
  const char* text = "1f 2.5f 6e3f 7F";
  TestInputStream input(text, strlen(text), kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);
  tokenizer.set_allow_f_after_float(true);

  // Advance through tokens and check that they are parsed as expected.
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "1f");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "2.5f");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "6e3f");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "7F");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);

  // There should be no more input.
  EXPECT_FALSE(tokenizer.Next());
  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

#endif
// -------------------------------------------------------------------

// In each case, the input is parsed to produce a list of tokens. The
// last token in "output" must have type TYPE_END.
struct MultiTokenCase {
  string input;
  Tokenizer::Token output[10];  // The compiler wants a constant array
                                // size for initialization to work. There
                                // is no reason this can't be increased if
                                // needed.
};

inline ostream& operator<<(ostream& out,
                           const MultiTokenCase& test_case) {
  return out << CEscape(test_case.input);
}
MultiTokenCase kMultiTokenCases[] = {
  // Test empty input.
  { "", {
    { Tokenizer::TYPE_END       , ""     , 0,  0 },
  }},

  // Test all token types at the same time.
  { "foo 1 1.2 + 'bar'", {
    { Tokenizer::TYPE_IDENTIFIER, "foo"  , 0,  0,  3 },
    { Tokenizer::TYPE_INTEGER   , "1"    , 0,  4,  5 },
    { Tokenizer::TYPE_FLOAT     , "1.2"  , 0,  6,  9 },
    { Tokenizer::TYPE_SYMBOL    , "+"    , 0, 10, 11 },
    { Tokenizer::TYPE_STRING    , "'bar'", 0, 12, 17 },
    { Tokenizer::TYPE_END       , ""     , 0, 17, 17 },
  }},

  // Test that consecutive symbols are parsed as separate tokens.
  { "!@+%", {
    { Tokenizer::TYPE_SYMBOL    , "!"    , 0, 0, 1 },
    { Tokenizer::TYPE_SYMBOL    , "@"    , 0, 1, 2 },
    { Tokenizer::TYPE_SYMBOL    , "+"    , 0, 2, 3 },
    { Tokenizer::TYPE_SYMBOL    , "%"    , 0, 3, 4 },
    { Tokenizer::TYPE_END       , ""     , 0, 4, 4 },
  }},

  // Test that newlines affect line numbers correctly.
  { "foo bar\nrab oof", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0, 4, 7 },
    { Tokenizer::TYPE_IDENTIFIER, "rab", 1, 0, 3 },
    { Tokenizer::TYPE_IDENTIFIER, "oof", 1, 4, 7 },
    { Tokenizer::TYPE_END       , ""   , 1, 7, 7 },
  }},

  // Test that tabs affect column numbers correctly.
  { "foo\tbar \tbaz", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0,  3 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0,  8, 11 },
    { Tokenizer::TYPE_IDENTIFIER, "baz", 0, 16, 19 },
    { Tokenizer::TYPE_END       , ""   , 0, 19, 19 },
  }},

  // Test that tabs in string literals affect column numbers correctly.
  { "\"foo\tbar\" baz", {
    { Tokenizer::TYPE_STRING    , "\"foo\tbar\"", 0,  0, 12 },
    { Tokenizer::TYPE_IDENTIFIER, "baz"         , 0, 13, 16 },
    { Tokenizer::TYPE_END       , ""            , 0, 16, 16 },
  }},

  // Test that line comments are ignored.
  { "foo // This is a comment\n"
    "bar // This is another comment", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0,  3 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 1,  0,  3 },
    { Tokenizer::TYPE_END       , ""   , 1, 30, 30 },
  }},

  // Test that block comments are ignored.
  { "foo /* This is a block comment */ bar", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0,  3 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0, 34, 37 },
    { Tokenizer::TYPE_END       , ""   , 0, 37, 37 },
  }},

  // Test that sh-style comments are not ignored by default.
  { "foo # bar\n"
    "baz", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3 },
    { Tokenizer::TYPE_SYMBOL    , "#"  , 0, 4, 5 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0, 6, 9 },
    { Tokenizer::TYPE_IDENTIFIER, "baz", 1, 0, 3 },
    { Tokenizer::TYPE_END       , ""   , 1, 3, 3 },
  }},

  // Bytes with the high-order bit set should not be seen as control
  // characters.
  { "\300", {
    { Tokenizer::TYPE_SYMBOL, "\300", 0, 0, 1 },
    { Tokenizer::TYPE_END   , ""    , 0, 1, 1 },
  }},

  // Test all whitespace chars.
  { "foo\n\t\r\v\fbar", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0,  3 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 1, 11, 14 },
    { Tokenizer::TYPE_END       , ""   , 1, 14, 14 },
  }},
};
TEST_2D(TokenizerTest, MultipleTokens, kMultiTokenCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kMultiTokenCases_case.input.data(),
                        kMultiTokenCases_case.input.size(),
                        kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Before Next() is called, the initial token should always be TYPE_START.
  EXPECT_EQ(Tokenizer::TYPE_START, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);
  EXPECT_EQ(0, tokenizer.current().end_column);

  // Loop through all expected tokens.
  int i = 0;
  Tokenizer::Token token;
  do {
    token = kMultiTokenCases_case.output[i++];

    SCOPED_TRACE(testing::Message() << "Token #" << i << ": " << token.text);

    Tokenizer::Token previous = tokenizer.current();

    // Next() should only return false when it hits the end token.
    if (token.type != Tokenizer::TYPE_END) {
      ASSERT_TRUE(tokenizer.Next());
    } else {
      ASSERT_FALSE(tokenizer.Next());
    }

    // Check that the previous token is set correctly.
    EXPECT_EQ(previous.type, tokenizer.previous().type);
    EXPECT_EQ(previous.text, tokenizer.previous().text);
    EXPECT_EQ(previous.line, tokenizer.previous().line);
    EXPECT_EQ(previous.column, tokenizer.previous().column);
    EXPECT_EQ(previous.end_column, tokenizer.previous().end_column);

    // Check that the token matches the expected one.
    EXPECT_EQ(token.type, tokenizer.current().type);
    EXPECT_EQ(token.text, tokenizer.current().text);
    EXPECT_EQ(token.line, tokenizer.current().line);
    EXPECT_EQ(token.column, tokenizer.current().column);
    EXPECT_EQ(token.end_column, tokenizer.current().end_column);
  } while (token.type != Tokenizer::TYPE_END);

  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}
// This test causes gcc 3.3.5 (and earlier?) to give the cryptic error:
//   "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
#if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)

TEST_1D(TokenizerTest, ShCommentStyle, kBlockSizes) {
  // Test the "comment_style" option.

  const char* text = "foo # bar\n"
                     "baz // qux\n"
                     "corge /* grault */\n"
                     "garply";
  const char* const kTokens[] = {"foo",  // "# bar" is ignored
                                 "baz", "/", "/", "qux",
                                 "corge", "/", "*", "grault", "*", "/",
                                 "garply"};

  // Set up the tokenizer.
  TestInputStream input(text, strlen(text), kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);
  tokenizer.set_comment_style(Tokenizer::SH_COMMENT_STYLE);

  // Advance through tokens and check that they are parsed as expected.
  for (int i = 0; i < GOOGLE_ARRAYSIZE(kTokens); i++) {
    EXPECT_TRUE(tokenizer.Next());
    EXPECT_EQ(tokenizer.current().text, kTokens[i]);
  }

  // There should be no more input.
  EXPECT_FALSE(tokenizer.Next());
  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

#endif
// -------------------------------------------------------------------

// Test parse helpers. It's not really worth setting up a full data-driven
// test here.
TEST_F(TokenizerTest, ParseInteger) {
  EXPECT_EQ(0, ParseInteger("0"));
  EXPECT_EQ(123, ParseInteger("123"));
  EXPECT_EQ(0xabcdef12u, ParseInteger("0xabcdef12"));
  EXPECT_EQ(0xabcdef12u, ParseInteger("0xABCDEF12"));
  EXPECT_EQ(kuint64max, ParseInteger("0xFFFFFFFFFFFFFFFF"));
  EXPECT_EQ(01234567, ParseInteger("01234567"));
  EXPECT_EQ(0X123, ParseInteger("0X123"));

  // Test invalid integers that may still be tokenized as integers.
  EXPECT_EQ(0, ParseInteger("0x"));

  uint64 i;
#ifdef GTEST_HAS_DEATH_TEST  // death tests do not work on Windows yet
  // Test invalid integers that will never be tokenized as integers.
  EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("zxy", kuint64max, &i),
    "passed text that could not have been tokenized as an integer");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("1.2", kuint64max, &i),
    "passed text that could not have been tokenized as an integer");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("08", kuint64max, &i),
    "passed text that could not have been tokenized as an integer");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("0xg", kuint64max, &i),
    "passed text that could not have been tokenized as an integer");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("-1", kuint64max, &i),
    "passed text that could not have been tokenized as an integer");
#endif  // GTEST_HAS_DEATH_TEST

  // Test overflows.
  EXPECT_TRUE (Tokenizer::ParseInteger("0", 0, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("1", 0, &i));
  EXPECT_TRUE (Tokenizer::ParseInteger("1", 1, &i));
  EXPECT_TRUE (Tokenizer::ParseInteger("12345", 12345, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("12346", 12345, &i));
  EXPECT_TRUE (Tokenizer::ParseInteger("0xFFFFFFFFFFFFFFFF" , kuint64max, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("0x10000000000000000", kuint64max, &i));
}
TEST_F(TokenizerTest, ParseFloat) {
  EXPECT_DOUBLE_EQ(1    , Tokenizer::ParseFloat("1."));
  EXPECT_DOUBLE_EQ(1e3  , Tokenizer::ParseFloat("1e3"));
  EXPECT_DOUBLE_EQ(1e3  , Tokenizer::ParseFloat("1E3"));
  EXPECT_DOUBLE_EQ(1.5e3, Tokenizer::ParseFloat("1.5e3"));
  EXPECT_DOUBLE_EQ(.1   , Tokenizer::ParseFloat(".1"));
  EXPECT_DOUBLE_EQ(.25  , Tokenizer::ParseFloat(".25"));
  EXPECT_DOUBLE_EQ(.1e3 , Tokenizer::ParseFloat(".1e3"));
  EXPECT_DOUBLE_EQ(.25e3, Tokenizer::ParseFloat(".25e3"));
  EXPECT_DOUBLE_EQ(.1e+3, Tokenizer::ParseFloat(".1e+3"));
  EXPECT_DOUBLE_EQ(.1e-3, Tokenizer::ParseFloat(".1e-3"));
  EXPECT_DOUBLE_EQ(5    , Tokenizer::ParseFloat("5"));
  EXPECT_DOUBLE_EQ(6e-12, Tokenizer::ParseFloat("6e-12"));
  EXPECT_DOUBLE_EQ(1.2  , Tokenizer::ParseFloat("1.2"));
  EXPECT_DOUBLE_EQ(1.e2 , Tokenizer::ParseFloat("1.e2"));

  // Test invalid floats that may still be tokenized as floats.
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1e"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1e-"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1.e"));

  // Test 'f' suffix.
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1f"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1.0f"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1F"));

  // These should parse successfully even though they are out of range.
  // Overflows become infinity and underflows become zero.
  EXPECT_EQ(     0.0, Tokenizer::ParseFloat("1e-9999999999999999999999999999"));
  EXPECT_EQ(HUGE_VAL, Tokenizer::ParseFloat("1e+9999999999999999999999999999"));

#ifdef GTEST_HAS_DEATH_TEST  // death tests do not work on Windows yet
  // Test invalid floats that will never be tokenized as floats.
  EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("zxy"),
    "passed text that could not have been tokenized as a float");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("1-e0"),
    "passed text that could not have been tokenized as a float");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("-1.0"),
    "passed text that could not have been tokenized as a float");
#endif  // GTEST_HAS_DEATH_TEST
}
TEST_F(TokenizerTest, ParseString) {
  string output;
  Tokenizer::ParseString("'hello'", &output);
  EXPECT_EQ("hello", output);
  Tokenizer::ParseString("\"blah\\nblah2\"", &output);
  EXPECT_EQ("blah\nblah2", output);
  Tokenizer::ParseString("'\\1x\\1\\123\\739\\52\\334n\\3'", &output);
  EXPECT_EQ("\1x\1\123\739\52\334n\3", output);
  Tokenizer::ParseString("'\\x20\\x4'", &output);
  EXPECT_EQ("\x20\x4", output);

  // Test invalid strings that may still be tokenized as strings.
  Tokenizer::ParseString("\"\\a\\l\\v\\t", &output);  // \l is invalid
  EXPECT_EQ("\a?\v\t", output);
  Tokenizer::ParseString("'", &output);
  EXPECT_EQ("", output);
  Tokenizer::ParseString("'\\", &output);
  EXPECT_EQ("\\", output);

  // Test invalid strings that will never be tokenized as strings.
#ifdef GTEST_HAS_DEATH_TEST  // death tests do not work on Windows yet
  EXPECT_DEBUG_DEATH(Tokenizer::ParseString("", &output),
    "passed text that could not have been tokenized as a string");
#endif  // GTEST_HAS_DEATH_TEST
}
TEST_F(TokenizerTest, ParseStringAppend) {
  // Check that ParseString and ParseStringAppend differ.
  string output("stuff+");
  Tokenizer::ParseStringAppend("'hello'", &output);
  EXPECT_EQ("stuff+hello", output);
  Tokenizer::ParseString("'hello'", &output);
  EXPECT_EQ("hello", output);
}
// -------------------------------------------------------------------

// Each case parses some input text, ignoring the tokens produced, and
// checks that the error output matches what is expected.
struct ErrorCase {
  string input;
  bool recoverable;  // True if the tokenizer should be able to recover and
                     // parse more tokens after seeing this error. Cases
                     // for which this is true must end with "foo" as
                     // the last token, which the test will check for.
  const char* errors;
};

inline ostream& operator<<(ostream& out,
                           const ErrorCase& test_case) {
  return out << CEscape(test_case.input);
}
ErrorCase kErrorCases[] = {
  // String errors.
  { "'\\l' foo", true,
    "0:2: Invalid escape sequence in string literal.\n" },
  { "'\\x' foo", true,
    "0:3: Expected hex digits for escape sequence.\n" },
  { "'foo", false,
    "0:4: String literals cannot cross line boundaries.\n" },
  { "'bar\nfoo", true,
    "0:4: String literals cannot cross line boundaries.\n" },

  // Integer errors.
  { "123foo", true,
    "0:3: Need space between number and identifier.\n" },

  // Hex/octal errors.
  { "0x foo", true,
    "0:2: \"0x\" must be followed by hex digits.\n" },
  { "0541823 foo", true,
    "0:4: Numbers starting with leading zero must be in octal.\n" },
  { "0x123z foo", true,
    "0:5: Need space between number and identifier.\n" },
  { "0x123.4 foo", true,
    "0:5: Hex and octal numbers must be integers.\n" },
  { "0123.4 foo", true,
    "0:4: Hex and octal numbers must be integers.\n" },

  // Float errors.
  { "1e foo", true,
    "0:2: \"e\" must be followed by exponent.\n" },
  { "1e- foo", true,
    "0:3: \"e\" must be followed by exponent.\n" },
  { "1.2.3 foo", true,
    "0:3: Already saw decimal point or exponent; can't have another one.\n" },
  { "1e2.3 foo", true,
    "0:3: Already saw decimal point or exponent; can't have another one.\n" },
  { "a.1 foo", true,
    "0:1: Need space between identifier and decimal point.\n" },
  // allow_f_after_float not enabled, so this should be an error.
  { "1.0f foo", true,
    "0:3: Need space between number and identifier.\n" },

  // Block comment errors.
  { "/*", false,
    "0:2: End-of-file inside block comment.\n"
    "0:0: Comment started here.\n"},
  { "/*/*/ foo", true,
    "0:3: \"/*\" inside block comment. Block comments cannot be nested.\n"},

  // Control characters. Multiple consecutive control characters should only
  // produce one error.
  { "\b foo", true,
    "0:0: Invalid control characters encountered in text.\n" },
  { "\b\b foo", true,
    "0:0: Invalid control characters encountered in text.\n" },

  // Check that control characters at end of input don't result in an
  // infinite loop.
  { "\b", false,
    "0:0: Invalid control characters encountered in text.\n" },

  // Check recovery from '\0'. We have to explicitly specify the length of
  // these strings because otherwise the string constructor will just call
  // strlen() which will see the first '\0' and think that is the end of the
  // string.
  { string("\0foo", 4), true,
    "0:0: Invalid control characters encountered in text.\n" },
  { string("\0\0foo", 5), true,
    "0:0: Invalid control characters encountered in text.\n" },
};
TEST_2D(TokenizerTest, Errors, kErrorCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kErrorCases_case.input.data(),
                        kErrorCases_case.input.size(),
                        kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Ignore all input, except remember if the last token was "foo".
  bool last_was_foo = false;
  while (tokenizer.Next()) {
    last_was_foo = tokenizer.current().text == "foo";
  }

  // Check that the errors match what was expected.
  EXPECT_EQ(error_collector.text_, kErrorCases_case.errors);

  // If the error was recoverable, make sure we saw "foo" after it.
  if (kErrorCases_case.recoverable) {
    EXPECT_TRUE(last_was_foo);
  }
}
// -------------------------------------------------------------------

TEST_1D(TokenizerTest, BackUpOnDestruction, kBlockSizes) {
  string text = "foo bar";
  TestInputStream input(text.data(), text.size(), kBlockSizes_case);

  // Create a tokenizer, read one token, then destroy it.
  {
    TestErrorCollector error_collector;
    Tokenizer tokenizer(&input, &error_collector);

    tokenizer.Next();
  }

  // Only "foo" should have been read.
  EXPECT_EQ(strlen("foo"), input.ByteCount());
}
}  // namespace
}  // namespace io
}  // namespace protobuf
}  // namespace google