PageRenderTime 47ms CodeModel.GetById 11ms RepoModel.GetById 1ms app.codeStats 0ms

/src/tokenize.cpp

http://github.com/bengardner/uncrustify
C++ | 2047 lines | 1590 code | 201 blank | 256 comment | 371 complexity | e2414d84fb693b154ad3e8152180e063 MD5 | raw file
Possible License(s): GPL-2.0
  1. /**
  2. * @file tokenize.cpp
  3. * This file breaks up the text stream into tokens or chunks.
  4. *
  5. * Each routine needs to set pc.len and pc.type.
  6. *
  7. * @author Ben Gardner
  8. * @license GPL v2+
  9. */
  10. #include "tokenize.h"
  11. #include "uncrustify_types.h"
  12. #include "char_table.h"
  13. #include "prototypes.h"
  14. #include "chunk_list.h"
  15. #include <cstdio>
  16. #include <cstdlib>
  17. #include <cstring>
  18. #include "unc_ctype.h"
  19. #include "uncrustify.h"
  20. #include "keywords.h"
  /**
   * Snapshot of a position in the character stream being tokenized.
   * A freshly constructed snapshot refers to the start of the input:
   * index 0, row 1, column 1, with no character consumed yet.
   */
  struct tok_info
  {
     int last_ch; /* character most recently consumed (0 before any read) */
     int idx;     /* index of the next character to read */
     int row;     /* current row, starting at 1 */
     int col;     /* current column, starting at 1 */

     tok_info()
        : last_ch(0), idx(0), row(1), col(1)
     {
     }
  };
  35. struct tok_ctx
  36. {
  37. tok_ctx(const deque<int> &d)
  38. : data(d)
  39. {
  40. }
  41. /* save before trying to parse something that may fail */
  42. void save()
  43. {
  44. save(s);
  45. }
  46. void save(tok_info &info)
  47. {
  48. info = c;
  49. }
  50. /* restore previous saved state */
  51. void restore()
  52. {
  53. restore(s);
  54. }
  55. void restore(const tok_info &info)
  56. {
  57. c = info;
  58. }
  59. bool more()
  60. {
  61. return(c.idx < (int)data.size());
  62. }
  63. int peek()
  64. {
  65. return(more() ? data[c.idx] : -1);
  66. }
  67. int peek(int idx)
  68. {
  69. idx += c.idx;
  70. return((idx < (int)data.size()) ? data[idx] : -1);
  71. }
  72. int get()
  73. {
  74. if (more())
  75. {
  76. int ch = data[c.idx++];
  77. switch (ch)
  78. {
  79. case '\t':
  80. c.col = calc_next_tab_column(c.col, cpd.settings[UO_input_tab_size].u);
  81. break;
  82. case '\n':
  83. if (c.last_ch != '\r')
  84. {
  85. c.row++;
  86. c.col = 1;
  87. }
  88. break;
  89. case '\r':
  90. c.row++;
  91. c.col = 1;
  92. break;
  93. default:
  94. c.col++;
  95. break;
  96. }
  97. c.last_ch = ch;
  98. return(ch);
  99. }
  100. return(-1);
  101. }
  102. bool expect(int ch)
  103. {
  104. if (peek() == ch)
  105. {
  106. get();
  107. return(true);
  108. }
  109. return(false);
  110. }
  111. const deque<int> &data;
  112. tok_info c; /* current */
  113. tok_info s; /* saved */
  114. };
  115. /**
  116. * Count the number of characters in a quoted string.
  117. * The next bit of text starts with a quote char " or ' or <.
  118. * Count the number of characters until the matching character.
  119. *
  120. * @param pc The structure to update, str is an input.
  121. * @return Whether a string was parsed
  122. */
  123. static bool parse_string(tok_ctx &ctx, chunk_t &pc, int quote_idx, bool allow_escape);
  124. /**
  125. * Literal string, ends with single "
  126. * Two "" don't end the string.
  127. *
  128. * @param pc The structure to update, str is an input.
  129. * @return Whether a string was parsed
  130. */
  131. static bool parse_cs_string(tok_ctx &ctx, chunk_t &pc);
  132. /**
  133. * Interpolated strings start with $" end with a single "
  134. * Double quotes are escaped by doubling.
  135. * Need to track embedded { } pairs and ignore anything between.
  136. *
  137. * @param pc The structure to update, str is an input.
  138. * @return Whether a string was parsed
  139. */
  140. static bool parse_cs_interpolated_string(tok_ctx &ctx, chunk_t &pc);
  141. /**
  142. * VALA verbatim string, ends with three quotes (""")
  143. *
  144. * @param pc The structure to update, str is an input.
  145. */
  146. static void parse_verbatim_string(tok_ctx &ctx, chunk_t &pc);
  147. static bool tag_compare(const deque<int> &d, int a_idx, int b_idx, int len);
  148. /**
  149. * Parses a C++0x 'R' string. R"( xxx )" R"tag( )tag" u8R"(x)" uR"(x)"
  150. * Newlines may be in the string.
  151. */
  152. static bool parse_cr_string(tok_ctx &ctx, chunk_t &pc, int q_idx);
  153. /**
  154. * Count the number of whitespace characters.
  155. *
  156. * @param pc The structure to update, str is an input.
  157. * @return Whether whitespace was parsed
  158. */
  159. static bool parse_whitespace(tok_ctx &ctx, chunk_t &pc);
  160. /**
  161. * Called when we hit a backslash.
  162. * If there is nothing but whitespace until the newline, then this is a
  163. * backslash newline
  164. */
  165. static bool parse_bs_newline(tok_ctx &ctx, chunk_t &pc);
  166. /**
  167. * Parses any number of tab or space chars followed by a newline.
  168. * Does not change pc.len if a newline isn't found.
  169. * This is not the same as parse_whitespace() because it only consumes until
  170. * a single newline is encountered.
  171. */
  172. static bool parse_newline(tok_ctx &ctx);
  173. /**
  174. * PAWN #define is different than C/C++.
  175. * #define PATTERN REPLACEMENT_TEXT
  176. * The PATTERN may not contain a space or '[' or ']'.
  177. * A generic whitespace check should be good enough.
  178. * Do not change the pattern.
  179. */
  180. static void parse_pawn_pattern(tok_ctx &ctx, chunk_t &pc, c_token_t tt);
  181. static bool parse_ignored(tok_ctx &ctx, chunk_t &pc);
  182. /**
  183. * Skips the next bit of whatever and returns the type of block.
  184. *
  185. * pc.str is the input text.
  186. * pc.len is the output length.
  187. * pc.type is the output type
  188. * pc.column is output column
  189. *
  190. * @param pc The structure to update, str is an input.
  191. * @return true/false - whether anything was parsed
  192. */
  193. static bool parse_next(tok_ctx &ctx, chunk_t &pc);
  194. /**
  195. * Parses all legal D string constants.
  196. *
  197. * Quoted strings:
  198. * r"Wysiwyg" # WYSIWYG string
  199. * x"hexstring" # Hexadecimal array
  200. * `Wysiwyg` # WYSIWYG string
  201. * 'char' # single character
  202. * "reg_string" # regular string
  203. *
  204. * Non-quoted strings:
  205. * \x12 # 1-byte hex constant
  206. * \u1234 # 2-byte hex constant
  207. * \U12345678 # 4-byte hex constant
  208. * \123 # octal constant
  209. * \&amp; # named entity
  210. * \n # single character
  211. *
  212. * @param pc The structure to update, str is an input.
  213. * @return Whether a string was parsed
  214. */
  215. static bool d_parse_string(tok_ctx &ctx, chunk_t &pc);
  216. /**
  217. * Figure out the length of the comment at text.
  218. * The next bit of text starts with a '/', so it might be a comment.
  219. * There are three types of comments:
  220. * - C comments that start with '/ *' and end with '* /'
  221. * - C++ comments that start with //
  222. * - D nestable comments '/+' '+/'
  223. *
  224. * @param pc The structure to update, str is an input.
  225. * @return Whether a comment was parsed
  226. */
  227. static bool parse_comment(tok_ctx &ctx, chunk_t &pc);
  228. /**
  229. * Figure out the length of the code placeholder at text, if present.
  230. * This is only for Xcode which sometimes inserts temporary code placeholder chunks, which in plaintext <#look like this#>.
  231. *
  232. * @param pc The structure to update, str is an input.
  233. * @return Whether a placeholder was parsed.
  234. */
  235. static bool parse_code_placeholder(tok_ctx &ctx, chunk_t &pc);
  236. /**
  237. * Parse any attached suffix, which may be a user-defined literal suffix.
  238. * If for a string, explicitly exclude common format and scan specifiers, ie,
  239. * PRIx32 and SCNx64.
  240. */
  241. static void parse_suffix(tok_ctx &ctx, chunk_t &pc, bool forstring);
  242. static bool is_bin(int ch);
  243. static bool is_bin_(int ch);
  244. static bool is_oct(int ch);
  245. static bool is_oct_(int ch);
  246. static bool is_dec(int ch);
  247. static bool is_dec_(int ch);
  248. static bool is_hex(int ch);
  249. static bool is_hex_(int ch);
  250. /**
  251. * Count the number of characters in the number.
  252. * The next bit of text starts with a number (0-9 or '.'), so it is a number.
  253. * Count the number of characters in the number.
  254. *
  255. * This should cover all number formats for all languages.
  256. * Note that this is not a strict parser. It will happily parse numbers in
  257. * an invalid format.
  258. *
  259. * For example, only D allows underscores in the numbers, but they are
  260. * allowed in all formats.
  261. *
  262. * @param pc The structure to update, str is an input.
  263. * @return Whether a number was parsed
  264. */
  265. static bool parse_number(tok_ctx &ctx, chunk_t &pc);
  266. static bool d_parse_string(tok_ctx &ctx, chunk_t &pc)
  267. {
  268. int ch = ctx.peek();
  269. if ((ch == '"') || (ch == '\'') || (ch == '`'))
  270. {
  271. return(parse_string(ctx, pc, 0, true));
  272. }
  273. else if (ch == '\\')
  274. {
  275. ctx.save();
  276. int cnt;
  277. pc.str.clear();
  278. while (ctx.peek() == '\\')
  279. {
  280. pc.str.append(ctx.get());
  281. /* Check for end of file */
  282. switch (ctx.peek())
  283. {
  284. case 'x':
  285. /* \x HexDigit HexDigit */
  286. cnt = 3;
  287. while (cnt--)
  288. {
  289. pc.str.append(ctx.get());
  290. }
  291. break;
  292. case 'u':
  293. /* \u HexDigit HexDigit HexDigit HexDigit */
  294. cnt = 5;
  295. while (cnt--)
  296. {
  297. pc.str.append(ctx.get());
  298. }
  299. break;
  300. case 'U':
  301. /* \U HexDigit (x8) */
  302. cnt = 9;
  303. while (cnt--)
  304. {
  305. pc.str.append(ctx.get());
  306. }
  307. break;
  308. case '0':
  309. case '1':
  310. case '2':
  311. case '3':
  312. case '4':
  313. case '5':
  314. case '6':
  315. case '7':
  316. /* handle up to 3 octal digits */
  317. pc.str.append(ctx.get());
  318. ch = ctx.peek();
  319. if ((ch >= '0') && (ch <= '7'))
  320. {
  321. pc.str.append(ctx.get());
  322. ch = ctx.peek();
  323. if ((ch >= '0') && (ch <= '7'))
  324. {
  325. pc.str.append(ctx.get());
  326. }
  327. }
  328. break;
  329. case '&':
  330. /* \& NamedCharacterEntity ; */
  331. pc.str.append(ctx.get());
  332. while (unc_isalpha(ctx.peek()))
  333. {
  334. pc.str.append(ctx.get());
  335. }
  336. if (ctx.peek() == ';')
  337. {
  338. pc.str.append(ctx.get());
  339. }
  340. break;
  341. default:
  342. /* Everything else is a single character */
  343. pc.str.append(ctx.get());
  344. break;
  345. } // switch
  346. }
  347. if (pc.str.size() > 1)
  348. {
  349. pc.type = CT_STRING;
  350. return(true);
  351. }
  352. ctx.restore();
  353. }
  354. else if (((ch == 'r') || (ch == 'x')) && (ctx.peek(1) == '"'))
  355. {
  356. return(parse_string(ctx, pc, 1, false));
  357. }
  358. return(false);
  359. } // d_parse_string
  360. #if 0
  /**
   * Searches the first 'haystack_len' bytes of 'haystack' for the
   * NUL-terminated string 'needle' (strstr() with an explicit haystack length).
   *
   * @return pointer to the first match inside 'haystack', or NULL if none
   */
  static const char *str_search(const char *needle, const char *haystack, int haystack_len)
  {
     const int needle_len = strlen(needle);

     /* stop as soon as fewer bytes remain than the needle needs */
     for (int remaining = haystack_len; remaining >= needle_len; remaining--, haystack++)
     {
        if (memcmp(needle, haystack, needle_len) == 0)
        {
           return(haystack);
        }
     }
     return(NULL);
  }
  377. #endif
  378. static bool parse_comment(tok_ctx &ctx, chunk_t &pc)
  379. {
  380. bool is_d = (cpd.lang_flags & LANG_D) != 0; // forcing value to bool
  381. bool is_cs = (cpd.lang_flags & LANG_CS) != 0; // forcing value to bool
  382. int d_level = 0;
  383. /* does this start with '/ /' or '/ *' or '/ +' (d) */
  384. if ((ctx.peek() != '/') ||
  385. ((ctx.peek(1) != '*') && (ctx.peek(1) != '/') &&
  386. ((ctx.peek(1) != '+') || !is_d)))
  387. {
  388. return(false);
  389. }
  390. ctx.save();
  391. /* account for opening two chars */
  392. pc.str = ctx.get(); /* opening '/' */
  393. int ch = ctx.get();
  394. pc.str.append(ch); /* second char */
  395. if (ch == '/')
  396. {
  397. pc.type = CT_COMMENT_CPP;
  398. while (true)
  399. {
  400. int bs_cnt = 0;
  401. while (ctx.more())
  402. {
  403. ch = ctx.peek();
  404. if ((ch == '\r') || (ch == '\n'))
  405. {
  406. break;
  407. }
  408. if ((ch == '\\') && !is_cs) /* backslashes aren't special in comments in C# */
  409. {
  410. bs_cnt++;
  411. }
  412. else
  413. {
  414. bs_cnt = 0;
  415. }
  416. pc.str.append(ctx.get());
  417. }
  418. /* If we hit an odd number of backslashes right before the newline,
  419. * then we keep going.
  420. */
  421. if (((bs_cnt & 1) == 0) || !ctx.more())
  422. {
  423. break;
  424. }
  425. if (ctx.peek() == '\r')
  426. {
  427. pc.str.append(ctx.get());
  428. }
  429. if (ctx.peek() == '\n')
  430. {
  431. pc.str.append(ctx.get());
  432. }
  433. pc.nl_count++;
  434. cpd.did_newline = true;
  435. }
  436. }
  437. else if (!ctx.more())
  438. {
  439. /* unexpected end of file */
  440. ctx.restore();
  441. return(false);
  442. }
  443. else if (ch == '+')
  444. {
  445. pc.type = CT_COMMENT;
  446. d_level++;
  447. while ((d_level > 0) && ctx.more())
  448. {
  449. if ((ctx.peek() == '+') && (ctx.peek(1) == '/'))
  450. {
  451. pc.str.append(ctx.get()); /* store the '+' */
  452. pc.str.append(ctx.get()); /* store the '/' */
  453. d_level--;
  454. continue;
  455. }
  456. if ((ctx.peek() == '/') && (ctx.peek(1) == '+'))
  457. {
  458. pc.str.append(ctx.get()); /* store the '/' */
  459. pc.str.append(ctx.get()); /* store the '+' */
  460. d_level++;
  461. continue;
  462. }
  463. ch = ctx.get();
  464. pc.str.append(ch);
  465. if ((ch == '\n') || (ch == '\r'))
  466. {
  467. pc.type = CT_COMMENT_MULTI;
  468. pc.nl_count++;
  469. if (ch == '\r')
  470. {
  471. if (ctx.peek() == '\n')
  472. {
  473. cpd.le_counts[LE_CRLF]++;
  474. pc.str.append(ctx.get()); /* store the '\n' */
  475. }
  476. else
  477. {
  478. cpd.le_counts[LE_CR]++;
  479. }
  480. }
  481. else
  482. {
  483. cpd.le_counts[LE_LF]++;
  484. }
  485. }
  486. }
  487. }
  488. else /* must be '/ *' */
  489. {
  490. pc.type = CT_COMMENT;
  491. while (ctx.more())
  492. {
  493. if ((ctx.peek() == '*') && (ctx.peek(1) == '/'))
  494. {
  495. pc.str.append(ctx.get()); /* store the '*' */
  496. pc.str.append(ctx.get()); /* store the '/' */
  497. tok_info ss;
  498. ctx.save(ss);
  499. int oldsize = pc.str.size();
  500. /* If there is another C comment right after this one, combine them */
  501. while ((ctx.peek() == ' ') || (ctx.peek() == '\t'))
  502. {
  503. pc.str.append(ctx.get());
  504. }
  505. if ((ctx.peek() != '/') || (ctx.peek(1) != '*'))
  506. {
  507. /* undo the attempt to join */
  508. ctx.restore(ss);
  509. pc.str.resize(oldsize);
  510. break;
  511. }
  512. }
  513. ch = ctx.get();
  514. pc.str.append(ch);
  515. if ((ch == '\n') || (ch == '\r'))
  516. {
  517. pc.type = CT_COMMENT_MULTI;
  518. pc.nl_count++;
  519. if (ch == '\r')
  520. {
  521. if (ctx.peek() == '\n')
  522. {
  523. cpd.le_counts[LE_CRLF]++;
  524. pc.str.append(ctx.get()); /* store the '\n' */
  525. }
  526. else
  527. {
  528. cpd.le_counts[LE_CR]++;
  529. }
  530. }
  531. else
  532. {
  533. cpd.le_counts[LE_LF]++;
  534. }
  535. }
  536. }
  537. }
  538. if (cpd.unc_off)
  539. {
  540. const char *ontext = cpd.settings[UO_enable_processing_cmt].str;
  541. if ((ontext == NULL) || !ontext[0])
  542. {
  543. ontext = UNCRUSTIFY_ON_TEXT;
  544. }
  545. if (pc.str.find(ontext) >= 0)
  546. {
  547. LOG_FMT(LBCTRL, "Found '%s' on line %zu\n", ontext, pc.orig_line);
  548. cpd.unc_off = false;
  549. }
  550. }
  551. else
  552. {
  553. const char *offtext = cpd.settings[UO_disable_processing_cmt].str;
  554. if ((offtext == NULL) || !offtext[0])
  555. {
  556. offtext = UNCRUSTIFY_OFF_TEXT;
  557. }
  558. if (pc.str.find(offtext) >= 0)
  559. {
  560. LOG_FMT(LBCTRL, "Found '%s' on line %zu\n", offtext, pc.orig_line);
  561. cpd.unc_off = true;
  562. // Issue #842
  563. cpd.unc_off_used = true;
  564. }
  565. }
  566. return(true);
  567. } // parse_comment
  568. static bool parse_code_placeholder(tok_ctx &ctx, chunk_t &pc)
  569. {
  570. if ((ctx.peek() != '<') || (ctx.peek(1) != '#'))
  571. {
  572. return(false);
  573. }
  574. ctx.save();
  575. /* account for opening two chars '<#' */
  576. pc.str = ctx.get();
  577. pc.str.append(ctx.get());
  578. /* grab everything until '#>', fail if not found. */
  579. int last1 = 0;
  580. while (ctx.more())
  581. {
  582. int last2 = last1;
  583. last1 = ctx.get();
  584. pc.str.append(last1);
  585. if ((last2 == '#') && (last1 == '>'))
  586. {
  587. pc.type = CT_WORD;
  588. return(true);
  589. }
  590. }
  591. ctx.restore();
  592. return(false);
  593. }
  594. static void parse_suffix(tok_ctx &ctx, chunk_t &pc, bool forstring = false)
  595. {
  596. if (CharTable::IsKw1(ctx.peek()))
  597. {
  598. int slen = 0;
  599. int oldsize = pc.str.size();
  600. /* don't add the suffix if we see L" or L' or S" */
  601. int p1 = ctx.peek();
  602. int p2 = ctx.peek(1);
  603. if (forstring &&
  604. (((p1 == 'L') && ((p2 == '"') || (p2 == '\''))) ||
  605. ((p1 == 'S') && (p2 == '"'))))
  606. {
  607. return;
  608. }
  609. tok_info ss;
  610. ctx.save(ss);
  611. while (ctx.more() && CharTable::IsKw2(ctx.peek()))
  612. {
  613. slen++;
  614. pc.str.append(ctx.get());
  615. }
  616. if (forstring && (slen >= 4) &&
  617. (pc.str.startswith("PRI", oldsize) ||
  618. pc.str.startswith("SCN", oldsize)))
  619. {
  620. ctx.restore(ss);
  621. pc.str.resize(oldsize);
  622. }
  623. }
  624. }
  /* @return whether ch is a binary digit ('0' or '1') */
  static bool is_bin(int ch)
  {
     switch (ch)
     {
     case '0':
     case '1':
        return(true);

     default:
        return(false);
     }
  }


  /* @return whether ch is a binary digit or an underscore */
  static bool is_bin_(int ch)
  {
     if (ch == '_')
     {
        return(true);
     }
     return(is_bin(ch));
  }
  /* @return whether ch is an octal digit ('0' through '7') */
  static bool is_oct(int ch)
  {
     const bool outside = (ch < '0') || (ch > '7');

     return(!outside);
  }


  /* @return whether ch is an octal digit or an underscore */
  static bool is_oct_(int ch)
  {
     if (ch == '_')
     {
        return(true);
     }
     return(is_oct(ch));
  }
  /* @return whether ch is a decimal digit ('0' through '9') */
  static bool is_dec(int ch)
  {
     const bool outside = (ch < '0') || (ch > '9');

     return(!outside);
  }


  /* @return whether ch is a decimal digit or an underscore */
  static bool is_dec_(int ch)
  {
     if (ch == '_')
     {
        return(true);
     }
     return(is_dec(ch));
  }
  /* @return whether ch is a hexadecimal digit (0-9, a-f or A-F) */
  static bool is_hex(int ch)
  {
     const bool digit = (ch >= '0') && (ch <= '9');
     const bool lower = (ch >= 'a') && (ch <= 'f');
     const bool upper = (ch >= 'A') && (ch <= 'F');

     return(digit || lower || upper);
  }


  /* @return whether ch is a hexadecimal digit or an underscore */
  static bool is_hex_(int ch)
  {
     if (ch == '_')
     {
        return(true);
     }
     return(is_hex(ch));
  }
  659. static bool parse_number(tok_ctx &ctx, chunk_t &pc)
  660. {
  661. /* A number must start with a digit or a dot, followed by a digit */
  662. if (!is_dec(ctx.peek()) &&
  663. ((ctx.peek() != '.') || !is_dec(ctx.peek(1))))
  664. {
  665. return(false);
  666. }
  667. bool is_float = (ctx.peek() == '.');
  668. if (is_float && (ctx.peek(1) == '.'))
  669. {
  670. return(false);
  671. }
  672. /* Check for Hex, Octal, or Binary
  673. * Note that only D and Pawn support binary, but who cares?
  674. */
  675. bool did_hex = false;
  676. if (ctx.peek() == '0')
  677. {
  678. pc.str.append(ctx.get()); /* store the '0' */
  679. int ch;
  680. chunk_t pc_temp;
  681. size_t pc_length;
  682. pc_temp.str.append('0');
  683. // MS constant might have an "h" at the end. Look for it
  684. ctx.save();
  685. while (ctx.more() && CharTable::IsKw2(ctx.peek()))
  686. {
  687. ch = ctx.get();
  688. pc_temp.str.append(ch);
  689. }
  690. pc_length = pc_temp.len();
  691. ch = pc_temp.str[pc_length - 1];
  692. ctx.restore();
  693. LOG_FMT(LGUY, "%s(%d): pc_temp:%s\n", __func__, __LINE__, pc_temp.text());
  694. if (ch == 'h')
  695. {
  696. // we have an MS hexadecimal number with "h" at the end
  697. LOG_FMT(LGUY, "%s(%d): MS hexadecimal number\n", __func__, __LINE__);
  698. did_hex = true;
  699. do
  700. {
  701. pc.str.append(ctx.get()); /* store the rest */
  702. } while (is_hex_(ctx.peek()));
  703. pc.str.append(ctx.get()); /* store the h */
  704. LOG_FMT(LGUY, "%s(%d): pc:%s\n", __func__, __LINE__, pc.text());
  705. }
  706. else
  707. {
  708. switch (unc_toupper(ctx.peek()))
  709. {
  710. case 'X': /* hex */
  711. did_hex = true;
  712. do
  713. {
  714. pc.str.append(ctx.get()); /* store the 'x' and then the rest */
  715. } while (is_hex_(ctx.peek()));
  716. break;
  717. case 'B': /* binary */
  718. do
  719. {
  720. pc.str.append(ctx.get()); /* store the 'b' and then the rest */
  721. } while (is_bin_(ctx.peek()));
  722. break;
  723. case '0': /* octal or decimal */
  724. case '1':
  725. case '2':
  726. case '3':
  727. case '4':
  728. case '5':
  729. case '6':
  730. case '7':
  731. case '8':
  732. case '9':
  733. do
  734. {
  735. pc.str.append(ctx.get());
  736. } while (is_oct_(ctx.peek()));
  737. break;
  738. default:
  739. /* either just 0 or 0.1 or 0UL, etc */
  740. break;
  741. }
  742. }
  743. }
  744. else
  745. {
  746. /* Regular int or float */
  747. while (is_dec_(ctx.peek()))
  748. {
  749. pc.str.append(ctx.get());
  750. }
  751. }
  752. /* Check if we stopped on a decimal point & make sure it isn't '..' */
  753. if ((ctx.peek() == '.') && (ctx.peek(1) != '.'))
  754. {
  755. pc.str.append(ctx.get());
  756. is_float = true;
  757. if (did_hex)
  758. {
  759. while (is_hex_(ctx.peek()))
  760. {
  761. pc.str.append(ctx.get());
  762. }
  763. }
  764. else
  765. {
  766. while (is_dec_(ctx.peek()))
  767. {
  768. pc.str.append(ctx.get());
  769. }
  770. }
  771. }
  772. /* Check exponent
  773. * Valid exponents per language (not that it matters):
  774. * C/C++/D/Java: eEpP
  775. * C#/Pawn: eE
  776. */
  777. int tmp = unc_toupper(ctx.peek());
  778. if ((tmp == 'E') || (tmp == 'P'))
  779. {
  780. is_float = true;
  781. pc.str.append(ctx.get());
  782. if ((ctx.peek() == '+') || (ctx.peek() == '-'))
  783. {
  784. pc.str.append(ctx.get());
  785. }
  786. while (is_dec_(ctx.peek()))
  787. {
  788. pc.str.append(ctx.get());
  789. }
  790. }
  791. /* Check the suffixes
  792. * Valid suffixes per language (not that it matters):
  793. * Integer Float
  794. * C/C++: uUlL64 lLfF
  795. * C#: uUlL fFdDMm
  796. * D: uUL ifFL
  797. * Java: lL fFdD
  798. * Pawn: (none) (none)
  799. *
  800. * Note that i, f, d, and m only appear in floats.
  801. */
  802. while (1)
  803. {
  804. int tmp = unc_toupper(ctx.peek());
  805. if ((tmp == 'I') || (tmp == 'F') || (tmp == 'D') || (tmp == 'M'))
  806. {
  807. is_float = true;
  808. }
  809. else if ((tmp != 'L') && (tmp != 'U'))
  810. {
  811. break;
  812. }
  813. pc.str.append(ctx.get());
  814. }
  815. /* skip the Microsoft-specific '64' suffix */
  816. if ((ctx.peek() == '6') && (ctx.peek(1) == '4'))
  817. {
  818. pc.str.append(ctx.get());
  819. pc.str.append(ctx.get());
  820. }
  821. pc.type = is_float ? CT_NUMBER_FP : CT_NUMBER;
  822. /* If there is anything left, then we are probably dealing with garbage or
  823. * some sick macro junk. Eat it.
  824. */
  825. parse_suffix(ctx, pc);
  826. return(true);
  827. } // parse_number
  828. static bool parse_string(tok_ctx &ctx, chunk_t &pc, int quote_idx, bool allow_escape)
  829. {
  830. char escape_char = cpd.settings[UO_string_escape_char].n;
  831. char escape_char2 = cpd.settings[UO_string_escape_char2].n;
  832. bool should_escape_tabs = cpd.settings[UO_string_replace_tab_chars].b && (cpd.lang_flags & LANG_ALLC);
  833. pc.str.clear();
  834. while (quote_idx-- > 0)
  835. {
  836. pc.str.append(ctx.get());
  837. }
  838. pc.type = CT_STRING;
  839. int end_ch = CharTable::Get(ctx.peek()) & 0xff;
  840. pc.str.append(ctx.get()); /* store the " */
  841. bool escaped = false;
  842. while (ctx.more())
  843. {
  844. int lastcol = ctx.c.col;
  845. int ch = ctx.get();
  846. if ((ch == '\t') && should_escape_tabs)
  847. {
  848. ctx.c.col = lastcol + 2;
  849. pc.str.append(escape_char);
  850. pc.str.append('t');
  851. continue;
  852. }
  853. pc.str.append(ch);
  854. if (ch == '\n')
  855. {
  856. pc.nl_count++;
  857. pc.type = CT_STRING_MULTI;
  858. escaped = false;
  859. continue;
  860. }
  861. if ((ch == '\r') && (ctx.peek() != '\n'))
  862. {
  863. pc.str.append(ctx.get());
  864. pc.nl_count++;
  865. pc.type = CT_STRING_MULTI;
  866. escaped = false;
  867. continue;
  868. }
  869. if (!escaped)
  870. {
  871. if (ch == escape_char)
  872. {
  873. escaped = (escape_char != 0);
  874. }
  875. else if ((ch == escape_char2) && (ctx.peek() == end_ch))
  876. {
  877. escaped = allow_escape;
  878. }
  879. else if (ch == end_ch)
  880. {
  881. break;
  882. }
  883. }
  884. else
  885. {
  886. escaped = false;
  887. }
  888. }
  889. parse_suffix(ctx, pc, true);
  890. return(true);
  891. } // parse_string
  892. static bool parse_cs_string(tok_ctx &ctx, chunk_t &pc)
  893. {
  894. pc.str = ctx.get();
  895. pc.str.append(ctx.get());
  896. pc.type = CT_STRING;
  897. bool should_escape_tabs = cpd.settings[UO_string_replace_tab_chars].b;
  898. /* go until we hit a zero (end of file) or a single " */
  899. while (ctx.more())
  900. {
  901. int ch = ctx.get();
  902. pc.str.append(ch);
  903. if ((ch == '\n') || (ch == '\r'))
  904. {
  905. pc.type = CT_STRING_MULTI;
  906. pc.nl_count++;
  907. }
  908. else if (ch == '\t')
  909. {
  910. if (should_escape_tabs && !cpd.warned_unable_string_replace_tab_chars)
  911. {
  912. cpd.warned_unable_string_replace_tab_chars = true;
  913. log_sev_t warnlevel = (log_sev_t)cpd.settings[UO_warn_level_tabs_found_in_verbatim_string_literals].n;
  914. /* a tab char can't be replaced with \\t because escapes don't work in here-strings. best we can do is warn. */
  915. LOG_FMT(warnlevel, "%s:%zu Detected non-replaceable tab char in literal string\n", cpd.filename, pc.orig_line);
  916. if (warnlevel < LWARN)
  917. {
  918. cpd.error_count++;
  919. }
  920. }
  921. }
  922. else if (ch == '"')
  923. {
  924. if (ctx.peek() == '"')
  925. {
  926. pc.str.append(ctx.get());
  927. }
  928. else
  929. {
  930. break;
  931. }
  932. }
  933. }
  934. return(true);
  935. } // parse_cs_string
  936. static bool parse_cs_interpolated_string(tok_ctx &ctx, chunk_t &pc)
  937. {
  938. pc.str = ctx.get(); // '$'
  939. pc.str.append(ctx.get()); // '"'
  940. pc.type = CT_STRING;
  941. int depth = 0;
  942. /* go until we hit a zero (end of file) or a single " */
  943. while (ctx.more())
  944. {
  945. int ch = ctx.get();
  946. pc.str.append(ch);
  947. /* if we are inside a { }, then we only look for a } */
  948. if (depth > 0)
  949. {
  950. if (ch == '}')
  951. {
  952. if (ctx.peek() == '}')
  953. {
  954. // }} doesn't decrease the depth
  955. pc.str.append(ctx.get()); // '{'
  956. }
  957. else
  958. {
  959. depth--;
  960. }
  961. }
  962. }
  963. else
  964. {
  965. if (ch == '{')
  966. {
  967. if (ctx.peek() == '{')
  968. {
  969. // {{ doesn't increase the depth
  970. pc.str.append(ctx.get());
  971. }
  972. else
  973. {
  974. depth++;
  975. }
  976. }
  977. else if (ch == '"')
  978. {
  979. if (ctx.peek() == '"')
  980. {
  981. pc.str.append(ctx.get());
  982. }
  983. else
  984. {
  985. break;
  986. }
  987. }
  988. }
  989. }
  990. return(true);
  991. } // parse_cs_interpolated_string
  992. static void parse_verbatim_string(tok_ctx &ctx, chunk_t &pc)
  993. {
  994. pc.type = CT_STRING;
  995. // consume the initial """
  996. pc.str = ctx.get();
  997. pc.str.append(ctx.get());
  998. pc.str.append(ctx.get());
  999. /* go until we hit a zero (end of file) or a """ */
  1000. while (ctx.more())
  1001. {
  1002. int ch = ctx.get();
  1003. pc.str.append(ch);
  1004. if ((ch == '"') &&
  1005. (ctx.peek() == '"') &&
  1006. (ctx.peek(1) == '"'))
  1007. {
  1008. pc.str.append(ctx.get());
  1009. pc.str.append(ctx.get());
  1010. break;
  1011. }
  1012. if ((ch == '\n') || (ch == '\r'))
  1013. {
  1014. pc.type = CT_STRING_MULTI;
  1015. pc.nl_count++;
  1016. }
  1017. }
  1018. }
  /**
   * Compares two runs of 'len' characters inside the same character data.
   *
   * @param d      The character data
   * @param a_idx  Index of the first character of the first run
   * @param b_idx  Index of the first character of the second run
   * @param len    Number of characters to compare
   * @return       true if both runs hold the same characters (trivially true
   *               when both start at the same index or len is not positive);
   *               false on a mismatch or if a run does not fit inside 'd'
   */
  static bool tag_compare(const deque<int> &d, int a_idx, int b_idx, int len)
  {
     if ((a_idx == b_idx) || (len <= 0))
     {
        return(true);
     }

     /* never index outside of the data */
     const int size = (int)d.size();
     if ((a_idx < 0) || (b_idx < 0) ||
         (len > (size - a_idx)) || (len > (size - b_idx)))
     {
        return(false);
     }

     /* walk both runs in step; every position has to match */
     for (int off = 0; off < len; off++)
     {
        if (d[a_idx + off] != d[b_idx + off])
        {
           return(false);
        }
     }
     return(true);
  }
  1033. static bool parse_cr_string(tok_ctx &ctx, chunk_t &pc, int q_idx)
  1034. {
  1035. int tag_idx = ctx.c.idx + q_idx + 1;
  1036. int tag_len = 0;
  1037. ctx.save();
  1038. /* Copy the prefix + " to the string */
  1039. pc.str.clear();
  1040. int cnt = q_idx + 1;
  1041. while (cnt--)
  1042. {
  1043. pc.str.append(ctx.get());
  1044. }
  1045. /* Add the tag and get the length of the tag */
  1046. while (ctx.more() && (ctx.peek() != '('))
  1047. {
  1048. tag_len++;
  1049. pc.str.append(ctx.get());
  1050. }
  1051. if (ctx.peek() != '(')
  1052. {
  1053. ctx.restore();
  1054. return(false);
  1055. }
  1056. pc.type = CT_STRING;
  1057. while (ctx.more())
  1058. {
  1059. if ((ctx.peek() == ')') &&
  1060. (ctx.peek(tag_len + 1) == '"') &&
  1061. tag_compare(ctx.data, tag_idx, ctx.c.idx + 1, tag_len))
  1062. {
  1063. cnt = tag_len + 2; /* for the )" */
  1064. while (cnt--)
  1065. {
  1066. pc.str.append(ctx.get());
  1067. }
  1068. parse_suffix(ctx, pc);
  1069. return(true);
  1070. }
  1071. if (ctx.peek() == '\n')
  1072. {
  1073. pc.str.append(ctx.get());
  1074. pc.nl_count++;
  1075. pc.type = CT_STRING_MULTI;
  1076. }
  1077. else
  1078. {
  1079. pc.str.append(ctx.get());
  1080. }
  1081. }
  1082. ctx.restore();
  1083. return(false);
  1084. } // parse_cr_string
  1085. /**
  1086. * Count the number of characters in a word.
  1087. * The first character is already valid for a keyword
  1088. *
  1089. * @param pc The structure to update, str is an input.
  1090. * @return Whether a word was parsed (always true)
  1091. */
  1092. bool parse_word(tok_ctx &ctx, chunk_t &pc, bool skipcheck)
  1093. {
  1094. static unc_text intr_txt("@interface");
  1095. /* The first character is already valid */
  1096. pc.str.clear();
  1097. pc.str.append(ctx.get());
  1098. while (ctx.more())
  1099. {
  1100. int ch = ctx.peek();
  1101. if (CharTable::IsKw2(ch))
  1102. {
  1103. pc.str.append(ctx.get());
  1104. }
  1105. else if ((ch == '\\') && (unc_tolower(ctx.peek(1)) == 'u'))
  1106. {
  1107. pc.str.append(ctx.get());
  1108. pc.str.append(ctx.get());
  1109. skipcheck = true;
  1110. }
  1111. else
  1112. {
  1113. break;
  1114. }
  1115. /* HACK: Non-ASCII character are only allowed in identifiers */
  1116. if (ch > 0x7f)
  1117. {
  1118. skipcheck = true;
  1119. }
  1120. }
  1121. pc.type = CT_WORD;
  1122. if (skipcheck)
  1123. {
  1124. return(true);
  1125. }
  1126. /* Detect pre-processor functions now */
  1127. if ((cpd.in_preproc == CT_PP_DEFINE) &&
  1128. (cpd.preproc_ncnl_count == 1))
  1129. {
  1130. if (ctx.peek() == '(')
  1131. {
  1132. pc.type = CT_MACRO_FUNC;
  1133. }
  1134. else
  1135. {
  1136. pc.type = CT_MACRO;
  1137. }
  1138. }
  1139. else
  1140. {
  1141. /* '@interface' is reserved, not an interface itself */
  1142. if ((cpd.lang_flags & LANG_JAVA) && pc.str.startswith("@") &&
  1143. !pc.str.equals(intr_txt))
  1144. {
  1145. pc.type = CT_ANNOTATION;
  1146. }
  1147. else
  1148. {
  1149. /* Turn it into a keyword now */
  1150. pc.type = find_keyword_type(pc.text(), pc.str.size());
  1151. }
  1152. }
  1153. return(true);
  1154. } // parse_word
  1155. static bool parse_whitespace(tok_ctx &ctx, chunk_t &pc)
  1156. {
  1157. int nl_count = 0;
  1158. int ch = -2;
  1159. /* REVISIT: use a better whitespace detector? */
  1160. while (ctx.more() && unc_isspace(ctx.peek()))
  1161. {
  1162. ch = ctx.get(); /* throw away the whitespace char */
  1163. switch (ch)
  1164. {
  1165. case '\r':
  1166. if (ctx.expect('\n'))
  1167. {
  1168. /* CRLF ending */
  1169. cpd.le_counts[LE_CRLF]++;
  1170. }
  1171. else
  1172. {
  1173. /* CR ending */
  1174. cpd.le_counts[LE_CR]++;
  1175. }
  1176. nl_count++;
  1177. pc.orig_prev_sp = 0;
  1178. break;
  1179. case '\n':
  1180. /* LF ending */
  1181. cpd.le_counts[LE_LF]++;
  1182. nl_count++;
  1183. pc.orig_prev_sp = 0;
  1184. break;
  1185. case '\t':
  1186. pc.orig_prev_sp += calc_next_tab_column(cpd.column, cpd.settings[UO_input_tab_size].u) - cpd.column;
  1187. break;
  1188. case ' ':
  1189. pc.orig_prev_sp++;
  1190. break;
  1191. default:
  1192. break;
  1193. }
  1194. }
  1195. if (ch != -2)
  1196. {
  1197. pc.str.clear();
  1198. pc.nl_count = nl_count;
  1199. pc.type = nl_count ? CT_NEWLINE : CT_WHITESPACE;
  1200. pc.after_tab = (ctx.c.last_ch == '\t');
  1201. return(true);
  1202. }
  1203. return(false);
  1204. } // parse_whitespace
  1205. static bool parse_bs_newline(tok_ctx &ctx, chunk_t &pc)
  1206. {
  1207. ctx.save();
  1208. ctx.get(); /* skip the '\' */
  1209. int ch;
  1210. while (ctx.more() && unc_isspace(ch = ctx.peek()))
  1211. {
  1212. ctx.get();
  1213. if ((ch == '\r') || (ch == '\n'))
  1214. {
  1215. if (ch == '\r')
  1216. {
  1217. ctx.expect('\n');
  1218. }
  1219. pc.str = "\\";
  1220. pc.type = CT_NL_CONT;
  1221. pc.nl_count = 1;
  1222. return(true);
  1223. }
  1224. }
  1225. ctx.restore();
  1226. return(false);
  1227. }
  1228. static bool parse_newline(tok_ctx &ctx)
  1229. {
  1230. ctx.save();
  1231. /* Eat whitespace */
  1232. while ((ctx.peek() == ' ') || (ctx.peek() == '\t'))
  1233. {
  1234. ctx.get();
  1235. }
  1236. if ((ctx.peek() == '\r') || (ctx.peek() == '\n'))
  1237. {
  1238. if (!ctx.expect('\n'))
  1239. {
  1240. ctx.get();
  1241. ctx.expect('\n');
  1242. }
  1243. return(true);
  1244. }
  1245. ctx.restore();
  1246. return(false);
  1247. }
  1248. static void parse_pawn_pattern(tok_ctx &ctx, chunk_t &pc, c_token_t tt)
  1249. {
  1250. pc.str.clear();
  1251. pc.type = tt;
  1252. while (!unc_isspace(ctx.peek()))
  1253. {
  1254. /* end the pattern on an escaped newline */
  1255. if (ctx.peek() == '\\')
  1256. {
  1257. int ch = ctx.peek(1);
  1258. if ((ch == '\n') || (ch == '\r'))
  1259. {
  1260. break;
  1261. }
  1262. }
  1263. pc.str.append(ctx.get());
  1264. }
  1265. }
  1266. static bool parse_ignored(tok_ctx &ctx, chunk_t &pc)
  1267. {
  1268. int nl_count = 0;
  1269. /* Parse off newlines/blank lines */
  1270. while (parse_newline(ctx))
  1271. {
  1272. nl_count++;
  1273. }
  1274. if (nl_count > 0)
  1275. {
  1276. pc.nl_count = nl_count;
  1277. pc.type = CT_NEWLINE;
  1278. return(true);
  1279. }
  1280. /* See if the UO_enable_processing_cmt text is on this line */
  1281. ctx.save();
  1282. pc.str.clear();
  1283. while (ctx.more() &&
  1284. (ctx.peek() != '\r') &&
  1285. (ctx.peek() != '\n'))
  1286. {
  1287. pc.str.append(ctx.get());
  1288. }
  1289. if (pc.str.size() == 0)
  1290. {
  1291. /* end of file? */
  1292. return(false);
  1293. }
  1294. /* Note that we aren't actually making sure this is in a comment, yet */
  1295. const char *ontext = cpd.settings[UO_enable_processing_cmt].str;
  1296. if (ontext == NULL)
  1297. {
  1298. ontext = UNCRUSTIFY_ON_TEXT;
  1299. }
  1300. if (pc.str.find(ontext) < 0)
  1301. {
  1302. pc.type = CT_IGNORED;
  1303. return(true);
  1304. }
  1305. ctx.restore();
  1306. /* parse off whitespace leading to the comment */
  1307. if (parse_whitespace(ctx, pc))
  1308. {
  1309. pc.type = CT_IGNORED;
  1310. return(true);
  1311. }
  1312. /* Look for the ending comment and let it pass */
  1313. if (parse_comment(ctx, pc) && !cpd.unc_off)
  1314. {
  1315. return(true);
  1316. }
  1317. /* Reset the chunk & scan to until a newline */
  1318. pc.str.clear();
  1319. while (ctx.more() &&
  1320. (ctx.peek() != '\r') &&
  1321. (ctx.peek() != '\n'))
  1322. {
  1323. pc.str.append(ctx.get());
  1324. }
  1325. if (pc.str.size() > 0)
  1326. {
  1327. pc.type = CT_IGNORED;
  1328. return(true);
  1329. }
  1330. return(false);
  1331. } // parse_ignored
  1332. static bool parse_next(tok_ctx &ctx, chunk_t &pc)
  1333. {
  1334. if (!ctx.more())
  1335. {
  1336. //fprintf(stderr, "All done!\n");
  1337. return(false);
  1338. }
  1339. /* Save off the current column */
  1340. pc.orig_line = ctx.c.row;
  1341. pc.column = ctx.c.col;
  1342. pc.orig_col = ctx.c.col;
  1343. pc.type = CT_NONE;
  1344. pc.nl_count = 0;
  1345. pc.flags = 0;
  1346. /* If it is turned off, we put everything except newlines into CT_UNKNOWN */
  1347. if (cpd.unc_off)
  1348. {
  1349. if (parse_ignored(ctx, pc))
  1350. {
  1351. return(true);
  1352. }
  1353. }
  1354. /**
  1355. * Parse whitespace
  1356. */
  1357. if (parse_whitespace(ctx, pc))
  1358. {
  1359. return(true);
  1360. }
  1361. /**
  1362. * Handle unknown/unhandled preprocessors
  1363. */
  1364. if ((cpd.in_preproc > CT_PP_BODYCHUNK) &&
  1365. (cpd.in_preproc <= CT_PP_OTHER))
  1366. {
  1367. pc.str.clear();
  1368. tok_info ss;
  1369. ctx.save(ss);
  1370. /* Chunk to a newline or comment */
  1371. pc.type = CT_PREPROC_BODY;
  1372. int last = 0;
  1373. while (ctx.more())
  1374. {
  1375. int ch = ctx.peek();
  1376. if ((ch == '\n') || (ch == '\r'))
  1377. {
  1378. /* Back off if this is an escaped newline */
  1379. if (last == '\\')
  1380. {
  1381. ctx.restore(ss);
  1382. pc.str.pop_back();
  1383. }
  1384. break;
  1385. }
  1386. /* Quit on a C++ comment start */
  1387. if ((ch == '/') && (ctx.peek(1) == '/'))
  1388. {
  1389. break;
  1390. }
  1391. last = ch;
  1392. ctx.save(ss);
  1393. pc.str.append(ctx.get());
  1394. }
  1395. if (pc.str.size() > 0)
  1396. {
  1397. return(true);
  1398. }
  1399. }
  1400. /**
  1401. * Detect backslash-newline
  1402. */
  1403. if ((ctx.peek() == '\\') && parse_bs_newline(ctx, pc))
  1404. {
  1405. return(true);
  1406. }
  1407. /**
  1408. * Parse comments
  1409. */
  1410. if (parse_comment(ctx, pc))
  1411. {
  1412. return(true);
  1413. }
  1414. /* Parse code placeholders */
  1415. if (parse_code_placeholder(ctx, pc))
  1416. {
  1417. return(true);
  1418. }
  1419. /* Check for C# literal strings, ie @"hello" and identifiers @for*/
  1420. if ((cpd.lang_flags & LANG_CS) && (ctx.peek() == '@'))
  1421. {
  1422. if (ctx.peek(1) == '"')
  1423. {
  1424. parse_cs_string(ctx, pc);
  1425. return(true);
  1426. }
  1427. /* check for non-keyword identifiers such as @if @switch, etc */
  1428. if (CharTable::IsKw1(ctx.peek(1)))
  1429. {
  1430. parse_word(ctx, pc, true);
  1431. return(true);
  1432. }
  1433. }
  1434. /* Check for C# Interpolated strings */
  1435. if ((cpd.lang_flags & LANG_CS) && (ctx.peek() == '$') && (ctx.peek(1) == '"'))
  1436. {
  1437. parse_cs_interpolated_string(ctx, pc);
  1438. return(true);
  1439. }
  1440. /* handle VALA """ strings """ */
  1441. if ((cpd.lang_flags & LANG_VALA) &&
  1442. (ctx.peek() == '"') &&
  1443. (ctx.peek(1) == '"') &&
  1444. (ctx.peek(2) == '"'))
  1445. {
  1446. parse_verbatim_string(ctx, pc);
  1447. return(true);
  1448. }
  1449. /* handle C++0x strings u8"x" u"x" U"x" R"x" u8R"XXX(I'm a "raw UTF-8" string.)XXX" */
  1450. int ch = ctx.peek();
  1451. if ((cpd.lang_flags & LANG_CPP) &&
  1452. ((ch == 'u') || (ch == 'U') || (ch == 'R')))
  1453. {
  1454. int idx = 0;
  1455. bool is_real = false;
  1456. if ((ch == 'u') && (ctx.peek(1) == '8'))
  1457. {
  1458. idx = 2;
  1459. }
  1460. else if (unc_tolower(ch) == 'u')
  1461. {
  1462. idx++;
  1463. }
  1464. if (ctx.peek(idx) == 'R')
  1465. {
  1466. idx++;
  1467. is_real = true;
  1468. }
  1469. if (ctx.peek(idx) == '"')
  1470. {
  1471. if (is_real)
  1472. {
  1473. if (parse_cr_string(ctx, pc, idx))
  1474. {
  1475. return(true);
  1476. }
  1477. }
  1478. else
  1479. {
  1480. if (parse_string(ctx, pc, idx, true))
  1481. {
  1482. parse_suffix(ctx, pc, true);
  1483. return(true);
  1484. }
  1485. }
  1486. }
  1487. }
  1488. /* PAWN specific stuff */
  1489. if (cpd.lang_flags & LANG_PAWN)
  1490. {
  1491. if ((cpd.preproc_ncnl_count == 1) &&
  1492. ((cpd.in_preproc == CT_PP_DEFINE) ||
  1493. (cpd.in_preproc == CT_PP_EMIT)))
  1494. {
  1495. parse_pawn_pattern(ctx, pc, CT_MACRO);
  1496. return(true);
  1497. }
  1498. /* Check for PAWN strings: \"hi" or !"hi" or !\"hi" or \!"hi" */
  1499. if ((ctx.peek() == '\\') || (ctx.peek() == '!'))
  1500. {
  1501. if (ctx.peek(1) == '"')
  1502. {
  1503. parse_string(ctx, pc, 1, (ctx.peek() == '!'));
  1504. return(true);
  1505. }
  1506. else if (((ctx.peek(1) == '\\') || (ctx.peek(1) == '!')) &&
  1507. (ctx.peek(2) == '"'))
  1508. {
  1509. parse_string(ctx, pc, 2, false);
  1510. return(true);
  1511. }
  1512. }
  1513. /* handle PAWN preprocessor args %0 .. %9 */
  1514. if ((cpd.in_preproc == CT_PP_DEFINE) &&
  1515. (ctx.peek() == '%') &&
  1516. unc_isdigit(ctx.peek(1)))
  1517. {
  1518. pc.str.clear();
  1519. pc.str.append(ctx.get());
  1520. pc.str.append(ctx.get());
  1521. pc.type = CT_WORD;
  1522. return(true);
  1523. }
  1524. }
  1525. /**
  1526. * Parse strings and character constants
  1527. */
  1528. //parse_word(ctx, pc_temp, true);
  1529. //ctx.restore(ctx.c);
  1530. if (parse_number(ctx, pc))
  1531. {
  1532. return(true);
  1533. }
  1534. if (cpd.lang_flags & LANG_D)
  1535. {
  1536. /* D specific stuff */
  1537. if (d_parse_string(ctx, pc))
  1538. {
  1539. return(true);
  1540. }
  1541. }
  1542. else
  1543. {
  1544. /* Not D stuff */
  1545. /* Check for L'a', L"abc", 'a', "abc", <abc> strings */
  1546. ch = ctx.peek();
  1547. int ch1 = ctx.peek(1);
  1548. if ((((ch == 'L') || (ch == 'S')) &&
  1549. ((ch1 == '"') || (ch1 == '\''))) ||
  1550. (ch == '"') ||
  1551. (ch == '\'') ||
  1552. ((ch == '<') && (cpd.in_preproc == CT_PP_INCLUDE)))
  1553. {
  1554. parse_string(ctx, pc, unc_isalpha(ch) ? 1 : 0, true);
  1555. return(true);
  1556. }
  1557. if ((ch == '<') && (cpd.in_preproc == CT_PP_DEFINE))
  1558. {
  1559. if (chunk_get_tail()->type == CT_MACRO)
  1560. {
  1561. /* We have "#define XXX <", assume '<' starts an include string */
  1562. parse_string(ctx, pc, 0, false);
  1563. return(true);
  1564. }
  1565. }
  1566. }
  1567. /* Check for Objective C literals and VALA identifiers ('@1', '@if')*/
  1568. if ((cpd.lang_flags & (LANG_OC | LANG_VALA)) && (ctx.peek() == '@'))
  1569. {
  1570. int nc = ctx.peek(1);
  1571. if ((nc == '"') || (nc == '\''))
  1572. {
  1573. /* literal string */
  1574. parse_string(ctx, pc, 1, true);
  1575. return(true);
  1576. }
  1577. else if ((nc >= '0') && (nc <= '9'))
  1578. {
  1579. /* literal number */
  1580. pc.str.append(ctx.get()); /* store the '@' */
  1581. parse_number(ctx, pc);
  1582. return(true);
  1583. }
  1584. }
  1585. /* Check for pawn/ObjectiveC/Java and normal identifiers */
  1586. if (CharTable::IsKw1(ctx.peek()) ||
  1587. ((ctx.peek() == '\\') && (unc_tolower(ctx.peek(1)) == 'u')) ||
  1588. ((ctx.peek() == '@') && CharTable::IsKw1(ctx.peek(1))))
  1589. {
  1590. parse_word(ctx, pc, false);
  1591. return(true);
  1592. }
  1593. /* see if we have a punctuator */
  1594. char punc_txt[4];
  1595. punc_txt[0] = ctx.peek();
  1596. punc_txt[1] = ctx.peek(1);
  1597. punc_txt[2] = ctx.peek(2);
  1598. punc_txt[3] = ctx.peek(3);
  1599. const chunk_tag_t *punc;
  1600. if ((punc = find_punctuator(punc_txt, cpd.lang_flags)) != NULL)
  1601. {
  1602. int cnt = strlen(punc->tag);
  1603. while (cnt--)
  1604. {
  1605. pc.str.append(ctx.get());
  1606. }
  1607. pc.type = punc->type;
  1608. pc.flags |= PCF_PUNCTUATOR;
  1609. return(true);
  1610. }
  1611. /* throw away this character */
  1612. pc.type = CT_UNKNOWN;
  1613. pc.str.append(ctx.get());
  1614. LOG_FMT(LWARN, "%s:%zu Garbage in col %d: %x\n",
  1615. cpd.filename, pc.orig_line, (int)ctx.c.col, pc.str[0]);
  1616. cpd.error_count++;
  1617. return(true);
  1618. } // parse_next
  1619. /**
  1620. * This function parses or tokenizes the whole buffer into a list.
  1621. * It has to do some tricks to parse preprocessors.
  1622. *
  1623. * If output_text() were called immediately after, two things would happen:
  1624. * - trailing whitespace are removed.
  1625. * - leading space & tabs are converted to the appropriate format.
  1626. *
  1627. * All the tokens are inserted before ref. If ref is NULL, they are inserted
  1628. * at the end of the list. Line numbers are relative to the start of the data.
  1629. */
  1630. void tokenize(const deque<int> &data, chunk_t *ref)
  1631. {
  1632. tok_ctx ctx(data);
  1633. chunk_t chunk;
  1634. chunk_t *pc = NULL;
  1635. chunk_t *rprev = NULL;
  1636. parse_frame_t frm;
  1637. bool last_was_tab = false;
  1638. int prev_sp = 0;
  1639. cpd.unc_stage = US_TOKENIZE;
  1640. memset(&frm, 0, sizeof(frm));
  1641. while (ctx.more())
  1642. {
  1643. chunk.reset();
  1644. if (!parse_next(ctx, chunk))
  1645. {
  1646. LOG_FMT(LERR, "%s:%d Bailed before the end?\n",
  1647. cpd.filename, ctx.c.row);
  1648. cpd.error_count++;
  1649. break;
  1650. }
  1651. /* Don't create an entry for whitespace */
  1652. if (chunk.type == CT_WHITESPACE)
  1653. {
  1654. last_was_tab = chunk.after_tab;
  1655. prev_sp = chunk.orig_prev_sp;
  1656. continue;
  1657. }
  1658. chunk.orig_prev_sp = prev_sp;
  1659. prev_sp = 0;
  1660. if (chunk.type == CT_NEWLINE)
  1661. {
  1662. last_was_tab = chunk.after_tab;
  1663. chunk.after_tab = false;
  1664. chunk.str.clear();
  1665. }
  1666. else if (chunk.type == CT_NL_CONT)
  1667. {
  1668. last_was_tab = chunk.after_tab;
  1669. chunk.after_tab = false;
  1670. chunk.str = "\\\n";
  1671. }
  1672. else
  1673. {
  1674. chunk.after_tab = last_was_tab;
  1675. last_was_tab = false;
  1676. }
  1677. /* Strip trailing whitespace (for CPP comments and PP blocks) */
  1678. while ((chunk.str.size() > 0) &&
  1679. ((chunk.str[chunk.str.size() - 1] == ' ') ||
  1680. (chunk.str[chunk.str.size() - 1] == '\t')))
  1681. {
  1682. // If comment contains backslash '\' followed by whitespace chars, keep last one;
  1683. // this will prevent it from turning '\' into line continuation.
  1684. if ((chunk.str.size() > 1) && (chunk.str[chunk.str.size() - 2] == '\\'))
  1685. {
  1686. break;
  1687. }
  1688. chunk.str.pop_back();
  1689. }
  1690. /* Store off the end column */
  1691. chunk.orig_col_end = ctx.c.col;
  1692. /* Add the chunk to the list */
  1693. rprev = pc;
  1694. if (rprev != NULL)
  1695. {
  1696. chunk_flags_set(pc, rprev->flags & PCF_COPY_FLAGS);
  1697. /* a newline can't be in a preprocessor */
  1698. if (pc->type == CT_NEWLINE)
  1699. {
  1700. chunk_flags_clr(pc, PCF_IN_PREPROC);
  1701. }
  1702. }
  1703. if (ref != NULL)
  1704. {
  1705. chunk.flags |= PCF_INSERTED;
  1706. }
  1707. else
  1708. {
  1709. chunk.flags &= ~PCF_INSERTED;
  1710. }
  1711. pc = chunk_add_before(&chunk, ref);
  1712. /* A newline marks the end of a preprocessor */
  1713. if (pc->type == CT_NEWLINE) // || (pc->type == CT_COMMENT_MULTI))
  1714. {
  1715. cpd.in_preproc = CT_NONE;
  1716. cpd.preproc_ncnl_count = 0;
  1717. }
  1718. /* Special handling for preprocessor stuff */
  1719. if (cpd.in_preproc != CT_NONE)
  1720. {
  1721. chunk_flags_set(pc, PCF_IN_PREPROC);
  1722. /* Count words after the preprocessor */
  1723. if (!chunk_is_comment(pc) && !chunk_is_newline(pc))
  1724. {
  1725. cpd.preproc_ncnl_count++;
  1726. }
  1727. /* Figure out the type of preprocessor for #include parsing */
  1728. if (cpd.in_preproc == CT_PREPROC)
  1729. {
  1730. if ((pc->type < CT_PP_DEFINE) || (pc->type > CT_PP_OTHER))
  1731. {
  1732. set_chunk_type(pc, CT_PP_OTHER);
  1733. }
  1734. cpd.in_preproc = pc->type;
  1735. }
  1736. }
  1737. else
  1738. {
  1739. /* Check for a preprocessor start */
  1740. if ((pc->type == CT_POUND) &&
  1741. ((rprev == NULL) || (rprev->type == CT_NEWLINE)))
  1742. {
  1743. set_chunk_type(pc, CT_PREPROC);
  1744. pc->flags |= PCF_IN_PREPROC;
  1745. cpd.in_preproc = CT_PREPROC;
  1746. }
  1747. }
  1748. if (pc->type == CT_NEWLINE)
  1749. {
  1750. LOG_FMT(LGUY, "%s(%d): (%zu)<NL> col=%zu\n",
  1751. __func__, __LINE__, pc->orig_line, pc->orig_col);
  1752. }
  1753. else
  1754. {
  1755. LOG_FMT(LGUY, "%s(%d): text():%s, type:%s, orig_col=%zu, orig_col_end=%d\n",
  1756. __func__, __LINE__, pc->text(), get_token_name(pc->type), pc->orig_col, pc->orig_col_end);
  1757. }
  1758. }
  1759. /* Set the cpd.newline string for this file */
  1760. if ((cpd.settings[UO_newlines].le == LE_LF) ||
  1761. ((cpd.settings[UO_newlines].le == LE_AUTO) &&
  1762. (cpd.le_counts[LE_LF] >= cpd.le_counts[LE_CRLF]) &&
  1763. (cpd.le_counts[LE_LF] >= cpd.le_counts[LE_CR])))
  1764. {
  1765. /* LF line ends */
  1766. cpd.newline = "\n";
  1767. LOG_FMT(LLINEENDS, "Using LF line endings\n");
  1768. }
  1769. else if ((cpd.settings[UO_newlines].le == LE_CRLF) ||
  1770. ((cpd.settings[UO_newlines].le == LE_AUTO) &&
  1771. (cpd.le_counts[LE_CRLF] >= cpd.le_counts[LE_LF]) &&
  1772. (cpd.le_counts[LE_CRLF] >= cpd.le_counts[LE_CR])))
  1773. {
  1774. /* CRLF line ends */
  1775. cpd.newline = "\r\n";
  1776. LOG_FMT(LLINEENDS, "Using CRLF line endings\n");
  1777. }
  1778. else
  1779. {
  1780. /* CR line ends */
  1781. cpd.newline = "\r";
  1782. LOG_FMT(LLINEENDS, "Using CR line endings\n");
  1783. }
  1784. } // tokenize
  1785. // /**
  1786. // * A simplistic fixed-sized needle in the fixed-size haystack string search.
  1787. // */
  1788. // int str_find(const char *needle, int needle_len,
  1789. // const char *haystack, int haystack_len)
  1790. // {
  1791. // for (int idx = 0; idx < (haystack_len - needle_len); idx++)
  1792. // {
  1793. // if (memcmp(needle, haystack + idx, needle_len) == 0)
  1794. // {
  1795. // return(idx);
  1796. // }
  1797. // }
  1798. // return(-1);
  1799. // }