PageRenderTime 119ms CodeModel.GetById 22ms RepoModel.GetById 2ms app.codeStats 1ms

/std/d/lexer.d

https://github.com/sinkuu/Dscanner
D | 2165 lines | 1979 code | 109 blank | 77 comment | 340 complexity | db342afc581162bad932f3eb5d95968f MD5 | raw file
  1. module std.d.lexer;
  2. import std.typecons;
  3. import std.typetuple;
  4. import std.array;
  5. import std.algorithm;
  6. import std.range;
  7. import std.lexer;
  /// Spellings of every fixed operator / punctuation token. This table is passed
  /// to `TokenIdType`, `TokenId` and the `Lexer` mixin further down.
  /// NOTE(review): token IDs presumably depend on element order - keep it stable.
  private enum operators = [
      ",", ".", "..", "...",
      "/", "/=",
      "!", "!<", "!<=", "!<>", "!<>=", "!=", "!>", "!>=",
      "$", "%", "%=",
      "&", "&&", "&=",
      "(", ")",
      "*", "*=",
      "+", "++", "+=",
      "-", "--", "-=",
      ":", ";",
      "<", "<<", "<<=", "<=", "<>", "<>=",
      "=", "==", "=>",
      ">", ">=", ">>", ">>=", ">>>", ">>>=",
      "?", "@", "[", "]",
      "^", "^=", "^^", "^^=",
      "{", "|", "|=", "||", "}",
      "~", "~="
  ];
  /// Spellings of every keyword token (including basic type names and the
  /// double-underscore special keywords). Passed to `TokenIdType`, `TokenId`
  /// and the `Lexer` mixin.
  /// NOTE(review): token IDs presumably depend on element order - keep it stable.
  private enum keywords = [
      "abstract", "alias", "align", "asm", "assert", "auto",
      "body", "bool", "break", "byte",
      "case", "cast", "catch", "cdouble", "cent", "cfloat", "char", "class",
      "const", "continue", "creal",
      "dchar", "debug", "default", "delegate", "delete", "deprecated", "do",
      "double",
      "else", "enum", "export", "extern",
      "false", "final", "finally", "float", "for", "foreach",
      "foreach_reverse", "function",
      "goto",
      "idouble", "if", "ifloat", "immutable", "import", "in", "inout", "int",
      "interface", "invariant", "ireal", "is",
      "lazy", "long",
      "macro", "mixin", "module",
      "new", "nothrow", "null",
      "out", "override",
      "package", "pragma", "private", "protected", "public", "pure",
      "real", "ref", "return",
      "scope", "shared", "short", "static", "struct", "super", "switch",
      "synchronized",
      "template", "this", "throw", "true", "try", "typedef", "typeid", "typeof",
      "ubyte", "ucent", "uint", "ulong", "union", "unittest", "ushort",
      "version", "virtual", "void", "volatile",
      "wchar", "while", "with",
      "__DATE__", "__EOF__", "__FILE__", "__FUNCTION__", "__gshared",
      "__LINE__", "__MODULE__", "__parameters", "__PRETTY_FUNCTION__",
      "__TIME__", "__TIMESTAMP__", "__traits", "__vector", "__VENDOR__",
      "__VERSION__"
  ];
  /// Names of the token kinds whose text is not fixed (literals, identifiers,
  /// comments, whitespace, ...). Passed to `TokenIdType`, `TokenId` and the
  /// `Lexer` mixin.
  /// NOTE(review): token IDs presumably depend on element order - keep it stable.
  private enum dynamicTokens = [
      "specialTokenSequence", "comment", "identifier", "scriptLine", "whitespace",
      "doubleLiteral", "floatLiteral", "idoubleLiteral", "ifloatLiteral",
      "intLiteral", "longLiteral", "realLiteral", "irealLiteral",
      "uintLiteral", "ulongLiteral",
      "characterLiteral",
      "dstringLiteral", "stringLiteral", "wstringLiteral"
  ];
  /// Flat list of (prefix, handler-name) pairs handed to the `Lexer` mixin:
  /// each even element is an input prefix and the element after it is the name
  /// of the `DLexer` member function that lexes input starting with that prefix.
  private enum pseudoTokenHandlers = [
      // string and character literals
      "\"", "lexStringLiteral",
      "`", "lexWysiwygString",
      // comments
      "//", "lexSlashSlashComment",
      "/*", "lexSlashStarComment",
      "/+", "lexSlashPlusComment",
      ".", "lexDot",
      "'", "lexCharacterLiteral",
      // numbers: a leading zero may introduce a hex or binary literal
      "0", "lexNumber",
      "1", "lexDecimal", "2", "lexDecimal", "3", "lexDecimal",
      "4", "lexDecimal", "5", "lexDecimal", "6", "lexDecimal",
      "7", "lexDecimal", "8", "lexDecimal", "9", "lexDecimal",
      // prefixed string literals
      "q\"", "lexDelimitedString",
      "q{", "lexTokenString",
      "r\"", "lexWysiwygString",
      "x\"", "lexHexString",
      // whitespace
      " ", "lexWhitespace",
      "\t", "lexWhitespace",
      "\r", "lexWhitespace",
      "\n", "lexWhitespace",
      "\u2028", "lexLongNewline",
      "\u2029", "lexLongNewline",
      // special lines
      "#!", "lexScriptLine",
      "#line", "lexSpecialTokenSequence"
  ];
  /// Integral type used to identify token kinds; built from the three tables.
  public alias IdType = TokenIdType!(
      operators, dynamicTokens, keywords);

  /// Maps an `IdType` value back to its textual representation.
  public alias str = tokenStringRepresentation!(
      IdType, operators, dynamicTokens, keywords);

  /**
   * Compile-time lookup of the token ID for a spelling, e.g. `tok!"if"` or
   * `tok!"intLiteral"`.
   */
  public template tok(string token)
  {
      alias tok = TokenId!(
          IdType, operators, dynamicTokens, keywords, token);
  }
  /// Extra members spliced into the token struct: the attached comment text
  /// and a three-way comparison of the token's `index` against a position.
  private enum extraFields = q{
      string comment;

      int opCmp(size_t i) const pure nothrow @safe
      {
          return index < i ? -1 : (index > i ? 1 : 0);
      }
  };

  /// The token type produced by `DLexer`.
  public alias Token = std.lexer.TokenStructure!(IdType, extraFields);
  /**
   * Bit flags selecting how string literal tokens are presented.
   */
  public enum StringBehavior : ubyte
  {
      /// Quote characters are stripped and escape sequences are processed
      compiler = 0,
      /// The opening quote, closing quote and any string suffix stay part of
      /// the token text
      includeQuoteChars = 1 << 0,
      /// Escape sequences are left as written
      notEscaped = 1 << 1,
      /// Both flags: the text is untouched. Useful for formatters or
      /// highlighters
      source = includeQuoteChars | notEscaped
  }
  /**
   * Selects what the lexer does with whitespace.
   */
  public enum WhitespaceBehavior : ubyte
  {
      /// Whitespace never appears in the token stream
      skip = 0,
      /// Whitespace is emitted as a token of its own
      include = 1
  }
  /**
   * Selects what the lexer does with special token sequences.
   */
  public enum SpecialTokenBehavior : ubyte
  {
      /// Special token sequences never appear in the token stream
      skip = 0,
      /// Special token sequences are emitted as tokens
      include = 1
  }
  /**
   * Selects what the lexer does with comments.
   */
  public enum CommentBehavior : ubyte
  {
      /// Comments are stored on the non-whitespace token that follows them
      attach = 0,
      /// Comments are ordinary tokens returned by the token range's front()
      include = 1
  }
  /**
   * Settings handed to `DLexer`. The explicit initializers below are the
   * same values the fields would get by default.
   */
  public struct LexerConfig
  {
      /// Name of the file being lexed
      string fileName;
      /// How string literal text is presented
      StringBehavior stringBehavior = StringBehavior.compiler;
      /// Whether whitespace tokens are skipped or emitted
      WhitespaceBehavior whitespaceBehavior = WhitespaceBehavior.skip;
      /// Whether comments are attached to the next token or emitted
      CommentBehavior commentBehavior = CommentBehavior.attach;
      /// Whether special token sequences are skipped or emitted
      SpecialTokenBehavior specialTokenBehavior = SpecialTokenBehavior.skip;
  }
  /**
   * Returns: true if `type` is the token of one of the built-in basic type
   * keywords (integral, floating point, imaginary, complex, character,
   * `bool` or `void`).
   */
  public bool isBasicType(IdType type) nothrow pure @safe
  {
      switch (type)
      {
      case tok!"int", tok!"uint", tok!"double", tok!"idouble":
      case tok!"float", tok!"ifloat", tok!"short", tok!"ushort":
      case tok!"long", tok!"ulong", tok!"char", tok!"wchar":
      case tok!"dchar", tok!"bool", tok!"void", tok!"cent":
      case tok!"ucent", tok!"real", tok!"ireal", tok!"byte":
      case tok!"ubyte", tok!"cdouble", tok!"cfloat", tok!"creal":
          return true;
      default:
          return false;
      }
  }
  /**
   * Returns: true if `type` is one of the integer or floating point literal
   * token kinds.
   */
  public bool isNumberLiteral(IdType type) nothrow pure @safe
  {
      switch (type)
      {
      case tok!"doubleLiteral", tok!"floatLiteral":
      case tok!"idoubleLiteral", tok!"ifloatLiteral":
      case tok!"intLiteral", tok!"longLiteral":
      case tok!"realLiteral", tok!"irealLiteral":
      case tok!"uintLiteral", tok!"ulongLiteral":
          return true;
      default:
          return false;
      }
  }
  /**
   * Returns: true if `type` is one of the operator / punctuation tokens
   * listed in the `operators` table.
   */
  public bool isOperator(IdType type) nothrow pure @safe
  {
      switch (type)
      {
      case tok!",", tok!".", tok!"..", tok!"...":
      case tok!"/", tok!"/=":
      case tok!"!", tok!"!<", tok!"!<=", tok!"!<>", tok!"!<>=", tok!"!=":
      case tok!"!>", tok!"!>=":
      case tok!"$", tok!"%", tok!"%=":
      case tok!"&", tok!"&&", tok!"&=":
      case tok!"(", tok!")":
      case tok!"*", tok!"*=":
      case tok!"+", tok!"++", tok!"+=":
      case tok!"-", tok!"--", tok!"-=":
      case tok!":", tok!";":
      case tok!"<", tok!"<<", tok!"<<=", tok!"<=", tok!"<>", tok!"<>=":
      case tok!"=", tok!"==", tok!"=>":
      case tok!">", tok!">=", tok!">>", tok!">>=", tok!">>>", tok!">>>=":
      case tok!"?", tok!"@", tok!"[", tok!"]":
      case tok!"^", tok!"^=", tok!"^^", tok!"^^=":
      case tok!"{", tok!"|", tok!"|=", tok!"||", tok!"}":
      case tok!"~", tok!"~=":
          return true;
      default:
          return false;
      }
  }
  /**
   * Returns: true if `type` is a keyword token other than a basic type name
   * (basic type names are reported by `isBasicType` instead).
   *
   * `virtual` is part of the `keywords` table and is not a basic type, so it
   * is reported as a keyword here as well.
   */
  public bool isKeyword(IdType type) pure nothrow @safe
  {
      switch (type)
      {
      case tok!"abstract", tok!"alias", tok!"align", tok!"asm":
      case tok!"assert", tok!"auto":
      case tok!"body", tok!"break":
      case tok!"case", tok!"cast", tok!"catch", tok!"class":
      case tok!"const", tok!"continue":
      case tok!"debug", tok!"default", tok!"delegate", tok!"delete":
      case tok!"deprecated", tok!"do":
      case tok!"else", tok!"enum", tok!"export", tok!"extern":
      case tok!"false", tok!"final", tok!"finally", tok!"for":
      case tok!"foreach", tok!"foreach_reverse", tok!"function":
      case tok!"goto":
      case tok!"if", tok!"immutable", tok!"import", tok!"in":
      case tok!"inout", tok!"interface", tok!"invariant", tok!"is":
      case tok!"lazy":
      case tok!"macro", tok!"mixin", tok!"module":
      case tok!"new", tok!"nothrow", tok!"null":
      case tok!"out", tok!"override":
      case tok!"package", tok!"pragma", tok!"private", tok!"protected":
      case tok!"public", tok!"pure":
      case tok!"ref", tok!"return":
      case tok!"scope", tok!"shared", tok!"static", tok!"struct":
      case tok!"super", tok!"switch", tok!"synchronized":
      case tok!"template", tok!"this", tok!"throw", tok!"true":
      case tok!"try", tok!"typedef", tok!"typeid", tok!"typeof":
      case tok!"union", tok!"unittest":
      case tok!"version", tok!"virtual", tok!"volatile":
      case tok!"while", tok!"with":
      case tok!"__DATE__", tok!"__EOF__", tok!"__FILE__":
      case tok!"__FUNCTION__", tok!"__gshared", tok!"__LINE__":
      case tok!"__MODULE__", tok!"__parameters", tok!"__PRETTY_FUNCTION__":
      case tok!"__TIME__", tok!"__TIMESTAMP__", tok!"__traits":
      case tok!"__vector", tok!"__VENDOR__", tok!"__VERSION__":
          return true;
      default:
          return false;
      }
  }
  /**
   * Returns: true if `type` is a narrow, wide or double-wide string literal
   * token.
   */
  public bool isStringLiteral(IdType type) pure nothrow @safe
  {
      return type == tok!"dstringLiteral"
          || type == tok!"stringLiteral"
          || type == tok!"wstringLiteral";
  }
  /**
   * Returns: true if `type` is one of the protection attribute keywords
   * (`export`, `package`, `private`, `public`, `protected`).
   */
  public bool isProtection(IdType type) pure nothrow @safe
  {
      return type == tok!"export"
          || type == tok!"package"
          || type == tok!"private"
          || type == tok!"public"
          || type == tok!"protected";
  }
  391. public struct DLexer
  392. {
  393. import core.vararg;
  394. mixin Lexer!(Token, lexIdentifier, isSeparating, operators, dynamicTokens,
  395. keywords, pseudoTokenHandlers);
  396. @disable this();
  /**
   * Creates a lexer over `range` and lexes the first token.
   *
   * Params:
   *     range = source bytes; a leading UTF-8 byte order mark (EF BB BF) is
   *         dropped before lexing
   *     config = lexer settings
   *     cache = string cache used to intern token text
   */
  this(ubyte[] range, const LexerConfig config, StringCache* cache)
  {
      ubyte[] source = range;
      if (source.length >= 3
          && source[0] == 0xef && source[1] == 0xbb && source[2] == 0xbf)
      {
          source = source[3 .. $];
      }
      this.range = LexerRange(source);
      this.config = config;
      this.cache = cache;
      // Prime `front` with the first token.
      popFront();
  }
  /**
   * Returns: true if `comment` is at least three characters long and starts
   * with one of the documentation comment openers `///`, `/++` or `/**`.
   */
  private static bool isDocComment(string comment) pure nothrow @safe
  {
      if (comment.length < 3)
          return false;
      immutable opener = comment[0 .. 3];
      return opener == "///" || opener == "/++" || opener == "/**";
  }
  /**
   * Advances to the next token, applying the configured comment, whitespace
   * and special-token-sequence behaviors.
   *
   * When comments are attached, the text of every documentation comment that
   * is skipped is collected (joined with '\n') and stored in the `comment`
   * field of the token that ends up at the front. Previously a documentation
   * comment that directly followed another comment, with no token between
   * them, was dropped without being collected.
   */
  public void popFront() pure
  {
      _popFront();
      string comment;
      switch (front.type)
      {
      case tok!"comment":
          if (config.commentBehavior == CommentBehavior.attach)
          {
              import std.string : format;
              // Collect each doc comment before skipping it, so that runs of
              // adjacent comments are all accounted for.
              do
              {
                  if (isDocComment(front.text))
                  {
                      comment = comment is null
                          ? front.text
                          : format("%s\n%s", comment, front.text);
                  }
                  _popFront();
              }
              while (front == tok!"comment");
              if (front == tok!"whitespace") goto case tok!"whitespace";
              if (front == tok!"specialTokenSequence") goto case tok!"specialTokenSequence";
          }
          break;
      case tok!"whitespace":
          if (config.whitespaceBehavior == WhitespaceBehavior.skip)
          {
              do _popFront(); while (front == tok!"whitespace");
              if (front == tok!"comment") goto case tok!"comment";
              if (front == tok!"specialTokenSequence") goto case tok!"specialTokenSequence";
          }
          break;
      case tok!"specialTokenSequence":
          if (config.specialTokenBehavior == SpecialTokenBehavior.skip)
          {
              do _popFront(); while (front == tok!"specialTokenSequence");
              if (front == tok!"comment") goto case tok!"comment";
              if (front == tok!"whitespace") goto case tok!"whitespace";
          }
          break;
      default:
          break;
      }
      _front.comment = comment;
  }
  /**
   * Returns: true if the input at the current position starts a whitespace
   * character: space, CR, LF, tab, or the three-byte UTF-8 encoding of
   * U+2028 / U+2029 (E2 80 A8 / E2 80 A9).
   *
   * The multi-byte check uses `canPeek` / `peekAt`, whose offsets are
   * relative to the current byte (as in `lexHex` and `lexDecimal`). The
   * earlier `peek(2)` form assumed a slice length and starting offset that
   * disagree with how `lexNumber` uses `peek`.
   */
  bool isWhitespace() pure /*const*/ nothrow
  {
      switch (range.front)
      {
      case ' ', '\r', '\n', '\t':
          return true;
      case 0xe2:
          return range.canPeek(2)
              && range.peekAt(1) == 0x80
              && (range.peekAt(2) == 0xa8 || range.peekAt(2) == 0xa9);
      default:
          return false;
      }
  }
  /**
   * Consumes one character and keeps the line counter in step: "\r\n", a
   * lone '\r', '\n' and the UTF-8 encodings of U+2028 / U+2029 each count as
   * one line break. Any other byte is consumed on its own.
   *
   * The U+2028 / U+2029 check uses `canPeek` / `peekAt` rather than
   * `peek(3).length == 3`, which disagrees with how `lexNumber` uses `peek`.
   */
  void popFrontWhitespaceAware() pure nothrow
  {
      switch (range.front)
      {
      case '\r':
          range.popFront();
          // Treat CR LF as a single line break.
          if (!range.empty && range.front == '\n')
              range.popFront();
          range.incrementLine();
          return;
      case '\n':
          range.popFront();
          range.incrementLine();
          return;
      case 0xe2:
          if (range.canPeek(2) && range.peekAt(1) == 0x80
              && (range.peekAt(2) == 0xa8 || range.peekAt(2) == 0xa9))
          {
              range.popFront();
              range.popFront();
              range.popFront();
              range.incrementLine();
          }
          else
              range.popFront();
          return;
      default:
          range.popFront();
          return;
      }
  }
  /**
   * Lexes a run of whitespace (space, tab, CR, LF, U+2028, U+2029) into a
   * single `whitespace` token. The token text is null when whitespace is
   * configured to be skipped.
   *
   * On x86-64 with inline assembly available, a fast path compares 16 bytes
   * at a time against space, tab, CR and LF and counts the LF bytes inside
   * the skipped prefix.
   *
   * Fixes relative to the previous version:
   * $(UL
   *   $(LI the fast path tracks its position through `range.index` and no
   *        longer adds to the local `index`, which is the start offset
   *        stored in the returned token)
   *   $(LI the byte-wise loop checks `range.empty` before reading
   *        `range.front`, so input ending right after a full 16-byte block
   *        is safe)
   *   $(LI U+2028 / U+2029 are detected with `canPeek` / `peekAt`)
   * )
   */
  Token lexWhitespace() pure nothrow
  {
      mixin (tokenStart);
      static if (__VERSION__ > 2065) version (D_InlineAsm_X86_64) while (range.index + 16 <= range.bytes.length)
      {
          ulong startAddr = (cast(ulong) range.bytes.ptr) + range.index;
          enum space = (cast(ulong) ' ') * 0x0101010101010101L;
          enum tab = (cast(ulong) '\t') * 0x0101010101010101L;
          enum cr = (cast(ulong) '\r') * 0x0101010101010101L;
          enum lf = (cast(ulong) '\n') * 0x0101010101010101L;
          ulong charsSkipped;
          ulong lineIncrement;
          asm
          {
              mov R10, space;
              mov R11, tab;
              mov R12, cr;
              mov R13, lf;
              mov R8, startAddr;
              movdqu XMM0, [R8];
              mov R9, line;
              // space pattern
              movq XMM1, R10;
              shufpd XMM1, XMM1, 0;
              pcmpeqb XMM1, XMM0;
              // tab pattern
              movq XMM2, R11;
              shufpd XMM2, XMM2, 0;
              pcmpeqb XMM2, XMM0;
              // CR pattern
              movq XMM3, R12;
              shufpd XMM3, XMM3, 0;
              pcmpeqb XMM3, XMM0;
              // LF pattern
              movq XMM4, R13;
              shufpd XMM4, XMM4, 0;
              pcmpeqb XMM4, XMM0;
              // Bit mask-of newlines to r10
              pmovmskb R10, XMM4;
              // and the masks together
              por XMM1, XMM2;
              por XMM1, XMM3;
              por XMM1, XMM4;
              pmovmskb RAX, XMM1;
              // index of the first non-whitespace byte (16 if none)
              not RAX;
              bsf RCX, RAX;
              mov charsSkipped, RCX;
              // keep only the newline bits inside the skipped prefix
              mov RBX, 1;
              inc CL;
              shl RBX, CL;
              sub RBX, 1;
              and R10, RBX;
              popcnt R10, R10;
              mov lineIncrement, R10;
          }
          range.incrementLine(lineIncrement);
          range.popFrontN(charsSkipped);
          if (charsSkipped < 16)
              goto end;
      }
      loop: while (!range.empty)
      {
          switch (range.front)
          {
          case '\r':
              range.popFront();
              if (!range.empty && range.front == '\n')
                  range.popFront();
              range.incrementLine();
              break;
          case '\n':
              range.popFront();
              range.incrementLine();
              break;
          case ' ':
          case '\t':
              range.popFront();
              break;
          case 0xe2:
              if (!range.canPeek(2) || range.peekAt(1) != 0x80
                  || (range.peekAt(2) != 0xa8 && range.peekAt(2) != 0xa9))
              {
                  break loop;
              }
              range.popFront();
              range.popFront();
              range.popFront();
              range.incrementLine();
              break;
          default:
              break loop;
          }
      }
  end:
      string text = config.whitespaceBehavior == WhitespaceBehavior.skip
          ? null : cache.intern(range.slice(mark));
      return Token(tok!"whitespace", text, line, column, index);
  }
  /**
   * Lexes a numeric literal that starts with '0'. A following 'x'/'X' or
   * 'b'/'B' selects the hexadecimal or binary lexer (after consuming the
   * two-character prefix); anything else is lexed as a decimal literal.
   */
  Token lexNumber() pure nothrow
  {
      mixin (tokenStart);
      if (range.front != '0' || !range.canPeek(1))
          return lexDecimal(mark, line, column, index);

      immutable marker = range.peek(1)[1];
      if (marker == 'x' || marker == 'X')
      {
          range.popFront();
          range.popFront();
          return lexHex(mark, line, column, index);
      }
      if (marker == 'b' || marker == 'B')
      {
          range.popFront();
          range.popFront();
          return lexBinary(mark, line, column, index);
      }
      return lexDecimal(mark, line, column, index);
  }
  /// Lexes hexadecimal digits starting at the current position.
  Token lexHex() pure nothrow
  {
      mixin (tokenStart);
      return lexHex(mark, line, column, index);
  }

  /**
   * Lexes the digits, optional fraction, exponent and suffix of a
   * hexadecimal literal. The token text runs from `mark` to wherever lexing
   * stops; `line`, `column` and `index` are stored in the token unchanged.
   */
  Token lexHex(size_t mark, size_t line, size_t column, size_t index) pure nothrow
  {
      IdType type = tok!"intLiteral";
      bool foundDot;
      hexLoop: while (!range.empty)
      {
          switch (range.front)
          {
          case 'a': .. case 'f':
          case 'A': .. case 'F':
          case '0': .. case '9':
          case '_':
              range.popFront();
              break;
          case 'u', 'U':
              lexIntSuffix(type);
              break hexLoop;
          case 'i':
              if (foundDot)
                  lexFloatSuffix(type);
              break hexLoop;
          case 'L':
              if (foundDot)
                  lexFloatSuffix(type);
              else
                  lexIntSuffix(type);
              break hexLoop;
          case 'p', 'P':
              lexExponent(type);
              break hexLoop;
          case '.':
              // A second dot, a dot at end of input, or a dot not followed
              // by a hex digit (e.g. "..", ".identifier") ends the literal.
              if (foundDot || !range.canPeek(1))
                  break hexLoop;
              switch (range.peekAt(1))
              {
              case '0': .. case '9':
              case 'A': .. case 'F':
              case 'a': .. case 'f':
                  range.popFront();
                  foundDot = true;
                  type = tok!"doubleLiteral";
                  break;
              default:
                  break hexLoop;
              }
              break;
          default:
              break hexLoop;
          }
      }
      return Token(type, cache.intern(range.slice(mark)), line, column,
          index);
  }
  /// Lexes binary digits starting at the current position.
  Token lexBinary() pure nothrow
  {
      mixin (tokenStart);
      return lexBinary(mark, line, column, index);
  }

  /**
   * Lexes the digits ('0', '1', '_') and optional integer suffix of a binary
   * literal. The token text runs from `mark` to wherever lexing stops.
   */
  Token lexBinary(size_t mark, size_t line, size_t column, size_t index) pure nothrow
  {
      IdType type = tok!"intLiteral";
      binaryLoop: while (!range.empty)
      {
          switch (range.front)
          {
          case '0', '1', '_':
              range.popFront();
              break;
          case 'u', 'U', 'L':
              lexIntSuffix(type);
              break binaryLoop;
          default:
              break binaryLoop;
          }
      }
      return Token(type, cache.intern(range.slice(mark)), line, column,
          index);
  }
  /// Lexes a decimal literal starting at the current position.
  Token lexDecimal() pure nothrow
  {
      mixin (tokenStart);
      return lexDecimal(mark, line, column, index);
  }

  /**
   * Lexes the digits, optional fraction, exponent and suffix of a decimal
   * literal. A literal may begin with '.', in which case it starts out as a
   * double literal. The token text runs from `mark` to wherever lexing stops.
   */
  Token lexDecimal(size_t mark, size_t line, size_t column, size_t index) pure nothrow
  {
      IdType type = tok!"intLiteral";
      bool foundDot = range.front == '.';
      if (foundDot)
      {
          range.popFront();
          type = tok!"doubleLiteral";
      }
      decimalLoop: while (!range.empty)
      {
          switch (range.front)
          {
          case '0': .. case '9':
          case '_':
              range.popFront();
              break;
          case 'u', 'U':
              if (!foundDot)
                  lexIntSuffix(type);
              break decimalLoop;
          case 'i', 'f', 'F':
              lexFloatSuffix(type);
              break decimalLoop;
          case 'L':
              if (foundDot)
                  lexFloatSuffix(type);
              else
                  lexIntSuffix(type);
              break decimalLoop;
          case 'e', 'E':
              lexExponent(type);
              break decimalLoop;
          case '.':
          {
              if (foundDot || !range.canPeek(1) || range.peekAt(1) == '.')
                  break decimalLoop;
              // Tell "int dot identifier" apart from a floating point
              // literal: the dot belongs to the number only when the byte
              // after it is ASCII 0x00-0x40, 0x5B-0x60 or 0x7B-0x7E, i.e.
              // not an ASCII letter, not DEL and not a byte >= 0x80.
              immutable next = range.peekAt(1);
              immutable partOfNumber = next <= '@'
                  || (next >= '[' && next <= '`')
                  || (next >= '{' && next <= '~');
              if (!partOfNumber)
                  break decimalLoop;
              range.popFront();
              foundDot = true;
              type = tok!"doubleLiteral";
              break;
          }
          default:
              break decimalLoop;
          }
      }
      return Token(type, cache.intern(range.slice(mark)), line, column,
          index);
  }
  /**
   * Consumes an integer literal suffix and adjusts `type` to match.
   *
   * At most one unsigned marker ('u' / 'U') and one long marker
   * ('L' / 'l') are consumed, in either order. Each marker upgrades the
   * type: int -> uint / long, and the combination -> ulong.
   *
   * Unlike the previous goto-based version, the range is checked for
   * emptiness before every read, so a suffix that is the last character of
   * the input (e.g. "1u" at end of file) no longer reads past the end.
   *
   * Params:
   *     type = token type of the literal so far; updated in place
   */
  void lexIntSuffix(ref IdType type) pure nothrow @safe
  {
      bool sawUnsigned;
      bool sawLong;
      while (!range.empty)
      {
          immutable ch = range.front;
          if (!sawUnsigned && (ch == 'u' || ch == 'U'))
          {
              sawUnsigned = true;
              type = type == tok!"intLiteral"
                  ? tok!"uintLiteral" : tok!"ulongLiteral";
          }
          else if (!sawLong && (ch == 'L' || ch == 'l'))
          {
              sawLong = true;
              type = type == tok!"uintLiteral"
                  ? tok!"ulongLiteral" : tok!"longLiteral";
          }
          else
              return;
          range.popFront();
      }
  }
  /**
   * Consumes a floating point literal suffix and adjusts `type` to match.
   *
   * 'f' / 'F' selects `floatLiteral` and 'L' selects `realLiteral`. A
   * following 'i' turns the result into the matching imaginary literal type
   * and emits a deprecation warning.
   *
   * Previously the 'L' suffix was classified as `doubleLiteral` and
   * `irealLiteral` was never produced, even though both real token kinds
   * are declared in `dynamicTokens` and accepted by `isNumberLiteral`.
   *
   * Params:
   *     type = token type of the literal so far; updated in place
   */
  void lexFloatSuffix(ref IdType type) pure nothrow @safe
  {
      if (range.empty)
          return;
      switch (range.front)
      {
      case 'L':
          range.popFront();
          type = tok!"realLiteral";
          break;
      case 'f', 'F':
          range.popFront();
          type = tok!"floatLiteral";
          break;
      default:
          break;
      }
      if (!range.empty && range.front == 'i')
      {
          warning("Complex number literals are deprecated");
          range.popFront();
          if (type == tok!"floatLiteral")
              type = tok!"ifloatLiteral";
          else if (type == tok!"realLiteral")
              type = tok!"irealLiteral";
          else
              type = tok!"idoubleLiteral";
      }
  }
  /**
   * Lexes the exponent part of a floating point literal. The current
   * character must be the exponent marker ('e', 'E', 'p' or 'P').
   *
   * Fixes relative to the previous version:
   * $(UL
   *   $(LI a sign is accepted only directly after the exponent marker, so
   *        `1e5+3` is no longer swallowed as a single literal)
   *   $(LI the literal becomes a `doubleLiteral` as soon as an exponent is
   *        seen (a suffix may refine it), so `1e5` is no longer reported as
   *        an int literal)
   *   $(LI an exponent without digits is reported even when the input ends
   *        or a suffix follows immediately)
   * )
   *
   * Params:
   *     type = token type of the literal so far; updated in place
   */
  void lexExponent(ref IdType type) pure nothrow @safe
  {
      range.popFront(); // the exponent marker itself
      type = tok!"doubleLiteral";
      if (!range.empty && (range.front == '-' || range.front == '+'))
          range.popFront();
      bool foundDigit = false;
      while (!range.empty)
      {
          switch (range.front)
          {
          case '0': .. case '9':
          case '_':
              foundDigit = true;
              range.popFront();
              break;
          case 'L', 'f', 'F', 'i':
              if (!foundDigit)
                  error("Expected an exponent");
              lexFloatSuffix(type);
              return;
          default:
              if (!foundDigit)
                  error("Expected an exponent");
              return;
          }
      }
      if (!foundDigit)
          error("Expected an exponent");
  }
  /**
   * Lexes a script ("#!") line: consumes input up to, but not including,
   * the next newline (or the end of input) and returns it as a `scriptLine`
   * token.
   */
  Token lexScriptLine() pure
  {
      mixin (tokenStart);
      for (; !range.empty && !isNewline; range.popFront())
      {
      }
      auto text = cache.intern(range.slice(mark));
      return Token(tok!"scriptLine", text, line, column, index);
  }
  /**
   * Lexes a special token sequence ("#line ..."): consumes input up to, but
   * not including, the next newline (or the end of input) and returns it as
   * a `specialTokenSequence` token.
   */
  Token lexSpecialTokenSequence() pure
  {
      mixin (tokenStart);
      for (; !range.empty && !isNewline; range.popFront())
      {
      }
      auto text = cache.intern(range.slice(mark));
      return Token(tok!"specialTokenSequence", text, line, column, index);
  }
  /**
   * Lexes a block comment. The opening two characters are consumed first;
   * the token text covers everything from the opener through the closing
   * "*" "/" pair, or through the end of input if the comment is never
   * closed.
   *
   * On x86-64 with inline assembly available, a fast path scans 16 bytes at
   * a time for an adjacent star-slash pair and counts LF bytes in the
   * consumed part.
   *
   * Fix: the fast path only finds a terminator that lies entirely inside
   * one 16-byte window. When no terminator is found and the last byte of
   * the window is a star, only 15 bytes are now consumed, so that star
   * starts the next window (or is seen by the byte-wise loop) together with
   * a following slash. Previously such a terminator was skipped and the
   * comment ran on.
   */
  Token lexSlashStarComment() pure
  {
      mixin (tokenStart);
      IdType type = tok!"comment";
      range.popFrontN(2);
      static if (__VERSION__ > 2065) version (D_InlineAsm_X86_64) while (range.index + 16 <= range.bytes.length)
      {
          ulong startAddress = cast(ulong) range.bytes.ptr + range.index;
          enum slash = (cast(ulong) '/') * 0x0101010101010101L;
          enum star = (cast(ulong) '*') * 0x0101010101010101L;
          enum lf = (cast(ulong) '\n') * 0x0101010101010101L;
          ulong charsSkipped;
          ulong newlineCount;
          bool done;
          asm
          {
              mov RAX, startAddress;
              movdqu XMM0, [RAX];
              // R15 = bit mask of LF bytes
              mov R10, lf;
              movq XMM2, R10;
              shufpd XMM2, XMM2, 0;
              pcmpeqb XMM2, XMM0;
              pmovmskb R15, XMM2;
              // R8 = bit mask of star bytes
              mov R10, star;
              movq XMM3, R10;
              shufpd XMM3, XMM3, 0;
              pcmpeqb XMM3, XMM0;
              pmovmskb R8, XMM3;
              // R9 = bit mask of slash bytes
              mov R10, slash;
              movq XMM4, R10;
              shufpd XMM4, XMM4, 0;
              pcmpeqb XMM4, XMM0;
              pmovmskb R9, XMM4;
          loop:
              cmp R8, 0;
              je notFound;
              cmp R9, 0;
              je notFound;
              bsf RAX, R8; // stIndex
              bsf RBX, R9; // slIndex
              mov RDX, RAX;
              inc RDX;
              cmp RDX, RBX;
              je found;
              cmp RAX, RBX;
              jae maskSlash;
          maskStar:
              mov RCX, RAX;
              mov R10, 1;
              shl R10, CL;
              xor R8, R10;
              jmp loop;
          maskSlash:
              mov RCX, RBX;
              mov R10, 1;
              shl R10, CL;
              xor R9, R10;
              jmp loop;
          notFound:
              mov R14, 16;
              mov charsSkipped, R14;
              popcnt R14, R15;
              mov newlineCount, R14;
              jmp asmEnd;
          found:
              inc RBX;
              mov charsSkipped, RBX;
              mov RAX, 1;
              mov done, AL;
              mov RCX, RBX;
              mov RBX, 1;
              shl RBX, CL;
              dec RBX;
              and R15, RBX;
              popcnt R14, R15;
              mov newlineCount, R14;
          asmEnd:
              nop;
          }
          // A star in the last byte of the window may pair with a slash at
          // the start of the next one, so leave it unconsumed. That byte is
          // not an LF, so newlineCount is unaffected.
          if (!done && range.bytes[range.index + 15] == '*')
              charsSkipped = 15;
          range.popFrontN(charsSkipped);
          range.incrementLine(newlineCount);
          if (done)
              goto end;
      }
      while (!range.empty)
      {
          if (range.front == '*')
          {
              range.popFront();
              if (!range.empty && range.front == '/')
              {
                  range.popFront();
                  break;
              }
          }
          else
              popFrontWhitespaceAware();
      }
  end:
      return Token(type, cache.intern(range.slice(mark)), line, column,
          index);
  }
  /**
   * Lexes a line comment: consumes the two-character opener and everything
   * up to, but not including, the next CR or LF (or the end of input).
   *
   * On x86-64 with inline assembly available, a fast path scans 16 bytes at
   * a time for CR / LF.
   *
   * Fix: the fast path used to add the number of skipped bytes to the local
   * `index` and `column`, which are the token's start position as stored in
   * the returned token. Comments of 16 bytes or more were therefore reported
   * at a shifted position. The locals are now left untouched.
   */
  Token lexSlashSlashComment() pure nothrow
  {
      mixin (tokenStart);
      IdType type = tok!"comment";
      range.popFrontN(2);
      static if (__VERSION__ > 2065) version (D_InlineAsm_X86_64) while (range.index + 16 <= range.bytes.length)
      {
          ulong startAddress = cast(ulong) range.bytes.ptr + range.index;
          enum cr = (cast(ulong) '\r') * 0x0101010101010101L;
          enum lf = (cast(ulong) '\n') * 0x0101010101010101L;
          ulong charsSkipped;
          asm
          {
              mov RAX, startAddress;
              movdqu XMM0, [RAX];
              mov R10, cr;
              movq XMM1, R10;
              shufpd XMM1, XMM1, 0;
              pcmpeqb XMM1, XMM0;
              mov R10, lf;
              movq XMM2, R10;
              shufpd XMM2, XMM2, 0;
              pcmpeqb XMM2, XMM0;
              por XMM1, XMM2;
              pmovmskb RBX, XMM1;
              // offset of the first CR/LF, or 16 when the window has none
              bsf RCX, RBX;
              mov RDX, 16;
              cmp RBX, 0;
              cmove RCX, RDX;
              mov charsSkipped, RCX;
          }
          assert (charsSkipped <= 16);
          range.popFrontN(charsSkipped);
          if (charsSkipped < 16)
              goto end;
      }
      while (!range.empty)
      {
          if (range.front == '\r' || range.front == '\n')
              break;
          range.popFront();
      }
  end:
      return Token(type, cache.intern(range.slice(mark)), line, column,
          index);
  }
  /**
   * Lexes a nesting comment. After the two-character opener the nesting
   * depth starts at 1; each further opener increases it and each closer
   * decreases it. Lexing stops when the depth reaches zero or the input
   * ends.
   */
  Token lexSlashPlusComment() pure nothrow
  {
      mixin (tokenStart);
      IdType type = tok!"comment";
      range.popFront();
      range.popFront();
      for (int nesting = 1; nesting > 0 && !range.empty;)
      {
          switch (range.front)
          {
          case '+':
              range.popFront();
              if (!range.empty && range.front == '/')
              {
                  range.popFront();
                  --nesting;
              }
              break;
          case '/':
              range.popFront();
              if (!range.empty && range.front == '+')
              {
                  range.popFront();
                  ++nesting;
              }
              break;
          default:
              popFrontWhitespaceAware();
              break;
          }
      }
      return Token(type, cache.intern(range.slice(mark)), line, column,
          index);
  }
  /**
   * Lexes a double-quoted string literal, including escape sequences and an
   * optional string suffix. Reports an error and returns a
   * default-constructed token if the input ends before the closing quote.
   */
  Token lexStringLiteral() pure nothrow
  {
      mixin (tokenStart);
      range.popFront(); // opening quote
      for (;;)
      {
          if (range.empty)
          {
              error("Error: unterminated string literal");
              return Token();
          }
          if (range.front == '"')
          {
              range.popFront();
              break;
          }
          if (range.front == '\\')
              lexEscapeSequence();
          else
              popFrontWhitespaceAware();
      }
      IdType type = tok!"stringLiteral";
      lexStringSuffix(type);
      return Token(type, cache.intern(range.slice(mark)), line, column,
          index);
  }
  /**
   * Lexes a wysiwyg string, either backtick-delimited or written as r"...".
   * No escape sequences are processed. Reports an error and returns a
   * `tok!""` token if the input ends before the closing delimiter.
   */
  Token lexWysiwygString() pure nothrow
  {
      mixin (tokenStart);
      IdType type = tok!"stringLiteral";
      immutable bool backtick = range.front == '`';
      if (!backtick)
      {
          // Consume the 'r' prefix; the opening quote must follow.
          range.popFront();
          if (range.empty)
          {
              error("Error: unterminated string literal");
              return Token(tok!"");
          }
      }
      range.popFront(); // opening delimiter
      immutable char closing = backtick ? '`' : '"';
      for (;;)
      {
          if (range.empty)
          {
              error("Error: unterminated string literal");
              return Token(tok!"");
          }
          if (range.front == closing)
          {
              range.popFront();
              break;
          }
          popFrontWhitespaceAware();
      }
      lexStringSuffix(type);
      return Token(type, cache.intern(range.slice(mark)), line, column,
          index);
  }
  1208. void lexStringSuffix(ref IdType type) pure nothrow
  1209. {
  1210. if (range.empty)
  1211. type = tok!"stringLiteral";
  1212. else
  1213. {
  1214. switch (range.front)
  1215. {
  1216. case 'w': range.popFront(); type = tok!"wstringLiteral"; break;
  1217. case 'd': range.popFront(); type = tok!"dstringLiteral"; break;
  1218. case 'c': range.popFront(); type = tok!"stringLiteral"; break;
  1219. default: type = tok!"stringLiteral"; break;
  1220. }
  1221. }
  1222. }
  1223. Token lexDelimitedString() pure nothrow
  1224. {
  1225. import std.traits;
  1226. mixin (tokenStart);
  1227. range.popFront();
  1228. range.popFront();
  1229. ubyte open;
  1230. ubyte close;
  1231. switch (range.front)
  1232. {
  1233. case '<':
  1234. open = '<';
  1235. close = '>';
  1236. range.popFront();
  1237. return lexNormalDelimitedString(mark, line, column, index, open, close);
  1238. case '{':
  1239. open = '{';
  1240. close = '}';
  1241. range.popFront();
  1242. return lexNormalDelimitedString(mark, line, column, index, open, close);
  1243. case '[':
  1244. open = '[';
  1245. close = ']';
  1246. range.popFront();
  1247. return lexNormalDelimitedString(mark, line, column, index, open, close);
  1248. case '(':
  1249. open = '(';
  1250. close = ')';
  1251. range.popFront();
  1252. return lexNormalDelimitedString(mark, line, column, index, open, close);
  1253. default:
  1254. return lexHeredocString(mark, line, column, index);
  1255. }
  1256. }
  1257. Token lexNormalDelimitedString(size_t mark, size_t line, size_t column,
  1258. size_t index, ubyte open, ubyte close)
  1259. pure nothrow
  1260. {
  1261. int depth = 1;
  1262. while (!range.empty && depth > 0)
  1263. {
  1264. if (range.front == open)
  1265. {
  1266. depth++;
  1267. range.popFront();
  1268. }
  1269. else if (range.front == close)
  1270. {
  1271. depth--;
  1272. range.popFront();
  1273. if (depth <= 0)
  1274. {
  1275. if (range.front == '"')
  1276. range.popFront();
  1277. else
  1278. {
  1279. error("Error: \" expected to end delimited string literal");
  1280. return Token(tok!"");
  1281. }
  1282. }
  1283. }
  1284. else
  1285. popFrontWhitespaceAware();
  1286. }
  1287. IdType type = tok!"stringLiteral";
  1288. lexStringSuffix(type);
  1289. return Token(type, cache.intern(range.slice(mark)), line, column, index);
  1290. }
  1291. Token lexHeredocString(size_t mark, size_t line, size_t column, size_t index)
  1292. pure nothrow
  1293. {
  1294. import std.regex;
  1295. Token ident = lexIdentifier();
  1296. if (isNewline())
  1297. popFrontWhitespaceAware();
  1298. else
  1299. error("Newline expected");
  1300. while (!range.empty)
  1301. {
  1302. if (isNewline())
  1303. {
  1304. popFrontWhitespaceAware();
  1305. if (!range.canPeek(ident.text.length))
  1306. {
  1307. error(ident.text ~ " expected");
  1308. break;
  1309. }
  1310. if (range.peek(ident.text.length - 1) == ident.text)
  1311. {
  1312. range.popFrontN(ident.text.length);
  1313. break;
  1314. }
  1315. }
  1316. else
  1317. range.popFront();
  1318. }
  1319. if (!range.empty() && range.front == '"')
  1320. range.popFront();
  1321. else
  1322. error(`" expected`);
  1323. IdType type = tok!"stringLiteral";
  1324. lexStringSuffix(type);
  1325. return Token(type, cache.intern(range.slice(mark)), line, column, index);
  1326. }
  1327. Token lexTokenString() pure
  1328. {
  1329. mixin (tokenStart);
  1330. assert (range.front == 'q');
  1331. range.popFront();
  1332. assert (range.front == '{');
  1333. range.popFront();
  1334. auto app = appender!string();
  1335. app.put("q{");
  1336. int depth = 1;
  1337. LexerConfig c = config;
  1338. scope(exit) config = c;
  1339. config.whitespaceBehavior = WhitespaceBehavior.include;
  1340. config.stringBehavior = StringBehavior.source;
  1341. config.commentBehavior = CommentBehavior.include;
  1342. _front = advance();
  1343. while (depth > 0 && !empty)
  1344. {
  1345. auto t = front();
  1346. if (t.text is null)
  1347. app.put(str(t.type));
  1348. else
  1349. app.put(t.text);
  1350. if (t.type == tok!"}")
  1351. {
  1352. depth--;
  1353. if (depth > 0)
  1354. popFront();
  1355. }
  1356. else if (t.type == tok!"{")
  1357. {
  1358. depth++;
  1359. popFront();
  1360. }
  1361. else
  1362. popFront();
  1363. }
  1364. IdType type = tok!"stringLiteral";
  1365. lexStringSuffix(type);
  1366. return Token(type, cache.intern(cast(const(ubyte)[]) app.data), line,
  1367. column, index);
  1368. }
  1369. Token lexHexString() pure nothrow
  1370. {
  1371. mixin (tokenStart);
  1372. range.popFront();
  1373. range.popFront();
  1374. loop: while (true)
  1375. {
  1376. if (range.empty)
  1377. {
  1378. error("Error: unterminated hex string literal");
  1379. return Token();
  1380. }
  1381. else if (isWhitespace())
  1382. popFrontWhitespaceAware();
  1383. else switch (range.front)
  1384. {
  1385. case '0': .. case '9':
  1386. case 'A': .. case 'F':
  1387. case 'a': .. case 'f':
  1388. range.popFront();
  1389. break;
  1390. case '"':
  1391. range.popFront();
  1392. break loop;
  1393. default:
  1394. error("Error: invalid character in hex string");
  1395. return Token();
  1396. }
  1397. }
  1398. IdType type = tok!"stringLiteral";
  1399. lexStringSuffix(type);
  1400. return Token(type, cache.intern(range.slice(mark)), line, column,
  1401. index);
  1402. }
  1403. bool lexEscapeSequence() pure nothrow
  1404. {
  1405. range.popFront();
  1406. if (range.empty)
  1407. {
  1408. error("Error: non-terminated character escape sequence.");
  1409. return false;
  1410. }
  1411. switch (range.front)
  1412. {
  1413. case '\'':
  1414. case '"':
  1415. case '?':
  1416. case '\\':
  1417. case '0':
  1418. case 'a':
  1419. case 'b':
  1420. case 'f':
  1421. case 'n':
  1422. case 'r':
  1423. case 't':
  1424. case 'v':
  1425. range.popFront();
  1426. break;
  1427. case 'x':
  1428. range.popFront();
  1429. foreach (i; 0 .. 2)
  1430. {
  1431. if (range.empty)
  1432. {
  1433. error("Error: 2 hex digits expected.");
  1434. return false;
  1435. }
  1436. switch (range.front)
  1437. {
  1438. case '0': .. case '9':
  1439. case 'a': .. case 'f':
  1440. case 'A': .. case 'F':
  1441. range.popFront();
  1442. break;
  1443. default:
  1444. error("Error: 2 hex digits expected.");
  1445. return false;
  1446. }
  1447. }
  1448. break;
  1449. case '1': .. case '7':
  1450. for (size_t i = 0; i < 3 && !range.empty && range.front >= '0' && range.front <= '7'; i++)
  1451. range.popFront();
  1452. break;
  1453. case 'u':
  1454. range.popFront();
  1455. foreach (i; 0 .. 4)
  1456. {
  1457. if (range.empty)
  1458. {
  1459. error("Error: at least 4 hex digits expected.");
  1460. return false;
  1461. }
  1462. switch (range.front)
  1463. {
  1464. case '0': .. case '9':
  1465. case 'a': .. case 'f':
  1466. case 'A': .. case 'F':
  1467. range.popFront();
  1468. break;
  1469. default:
  1470. error("Error: at least 4 hex digits expected.");
  1471. return false;
  1472. }
  1473. }
  1474. break;
  1475. case 'U':
  1476. range.popFront();
  1477. foreach (i; 0 .. 8)
  1478. {
  1479. if (range.empty)
  1480. {
  1481. error("Error: at least 8 hex digits expected.");
  1482. return false;
  1483. }
  1484. switch (range.front)
  1485. {
  1486. case '0': .. case '9':
  1487. case 'a': .. case 'f':
  1488. case 'A': .. case 'F':
  1489. range.popFront();
  1490. break;
  1491. default:
  1492. error("Error: at least 8 hex digits expected.");
  1493. return false;
  1494. }
  1495. }
  1496. break;
  1497. default:
  1498. while (true)
  1499. {
  1500. if (range.empty)
  1501. {
  1502. error("Error: non-terminated character escape sequence.");
  1503. return false;
  1504. }
  1505. if (range.front == ';')
  1506. {
  1507. range.popFront();
  1508. break;
  1509. }
  1510. else
  1511. range.popFront();
  1512. }
  1513. }
  1514. return true;
  1515. }
  1516. Token lexCharacterLiteral() pure nothrow
  1517. {
  1518. mixin (tokenStart);
  1519. range.popFront();
  1520. if (range.front == '\\')
  1521. {
  1522. lexEscapeSequence();
  1523. goto close;
  1524. }
  1525. else if (range.front == '\'')
  1526. {
  1527. range.popFront();
  1528. return Token(tok!"characterLiteral", cache.intern(range.slice(mark)),
  1529. line, column, index);
  1530. }
  1531. else if (range.front & 0x80)
  1532. {
  1533. while (range.front & 0x80)
  1534. range.popFront();
  1535. goto close;
  1536. }
  1537. else
  1538. {
  1539. popFrontWhitespaceAware();
  1540. goto close;
  1541. }
  1542. close:
  1543. if (range.front == '\'')
  1544. {
  1545. range.popFront();
  1546. return Token(tok!"characterLiteral", cache.intern(range.slice(mark)),
  1547. line, column, index);
  1548. }
  1549. else
  1550. {
  1551. error("Error: Expected ' to end character literal");
  1552. return Token();
  1553. }
  1554. }
  1555. Token lexIdentifier() pure nothrow
  1556. {
  1557. import std.stdio;
  1558. mixin (tokenStart);
  1559. uint hash = 0;
  1560. if (isSeparating(0) || range.empty)
  1561. {
  1562. error("Invalid identifier");
  1563. range.popFront();
  1564. }
  1565. while (!range.empty && !isSeparating(0))
  1566. {
  1567. hash = StringCache.hashStep(range.front, hash);
  1568. range.popFront();
  1569. }
  1570. return Token(tok!"identifier", cache.intern(range.slice(mark), hash), line,
  1571. column, index);
  1572. }
  1573. Token lexDot() pure nothrow
  1574. {
  1575. mixin (tokenStart);
  1576. if (!range.canPeek(1))
  1577. {
  1578. range.popFront();
  1579. return Token(tok!".", null, line, column, index);
  1580. }
  1581. switch (range.peekAt(1))
  1582. {
  1583. case '0': .. case '9':
  1584. return lexNumber();
  1585. case '.':
  1586. range.popFront();
  1587. range.popFront();
  1588. if (!range.empty && range.front == '.')
  1589. {
  1590. range.popFront();
  1591. return Token(tok!"...", null, line, column, index);
  1592. }
  1593. else
  1594. return Token(tok!"..", null, line, column, index);
  1595. default:
  1596. range.popFront();
  1597. return Token(tok!".", null, line, column, index);
  1598. }
  1599. }
    /**
     * Lexes a three-byte newline sequence: consumes three bytes, increments
     * the range's line counter and returns them as a tok!"whitespace" token.
     * The bytes themselves are not inspected here.
     * NOTE(review): presumably called for the UTF-8 encodings of U+2028 and
     * U+2029 (see isNewline()) - confirm against the caller.
     */
    Token lexLongNewline() pure nothrow
    {
        mixin (tokenStart);
        range.popFront();
        range.popFront();
        range.popFront();
        range.incrementLine();
        return Token(tok!"whitespace", cache.intern(range.slice(mark)), line,
            column, index);
    }
  1610. bool isNewline() pure @safe nothrow
  1611. {
  1612. if (range.front == '\n') return true;
  1613. if (range.front == '\r') return true;
  1614. return (range.front & 0x80) && range.canPeek(2)
  1615. && (range.peek(2) == "\u2028" || range.peek(2) == "\u2029");
  1616. }
  1617. bool isSeparating(size_t offset) pure nothrow @safe
  1618. {
  1619. if (!range.canPeek(offset)) return true;
  1620. auto c = range.peekAt(offset);
  1621. if (c >= 'A' && c <= 'Z') return false;
  1622. if (c >= 'a' && c <= 'z') return false;
  1623. if (c <= 0x2f) return true;
  1624. if (c >= ':' && c <= '@') return true;
  1625. if (c >= '[' && c <= '^') return true;
  1626. if (c >= '{' && c <= '~') return true;
  1627. if (c == '`') return true;
  1628. if (c & 0x80)
  1629. {
  1630. auto r = range;
  1631. range.popFrontN(offset);
  1632. return (r.canPeek(2) && (r.peek(2) == "\u2028"
  1633. || r.peek(2) == "\u2029"));
  1634. }
  1635. return false;
  1636. }
    /**
     * Code mixed in at the top of the lex* functions. Declares the locals
     * `index`, `column`, `line` and `mark`, capturing the range's position
     * before any input of the token is consumed.
     */
    enum tokenStart = q{
        size_t index = range.index;
        size_t column = range.column;
        size_t line = range.line;
        auto mark = range.mark();
    };
    /**
     * Records an error message at the range's current line and column.
     * Params:
     *     message = the error text
     */
    void error(string message) pure nothrow @safe
    {
        messages ~= Message(range.line, range.column, message, true);
    }
    /**
     * Records a warning message (isError == false) at the range's current
     * line and column.
     * Params:
     *     message = the warning text
     */
    void warning(string message) pure nothrow @safe
    {
        messages ~= Message(range.line, range.column, message, false);
        // Sanity check that the append took effect.
        assert (messages.length > 0);
    }
    /**
     * A diagnostic produced while lexing; created by error() and warning().
     */
    struct Message
    {
        /// Line of the range when the message was recorded
        size_t line;
        /// Column of the range when the message was recorded
        size_t column;
        /// The diagnostic text
        string message;
        /// true for messages recorded by error(), false for warning()
        bool isError;
    }
    /// Diagnostics appended by error() and warning()
    Message[] messages;
    /// String cache used to intern the text of the tokens
    StringCache* cache;
    /// Lexer configuration; temporarily overridden while lexing token strings
    LexerConfig config;
  1662. }
/**
 * Creates a DLexer over the given bytes using a default-initialized
 * configuration and a newly allocated StringCache with the default bucket
 * count.
 * Params:
 *     range = the source bytes to lex
 */
public auto byToken(ubyte[] range)
{
    LexerConfig config;
    StringCache* cache = new StringCache(StringCache.defaultBucketCount);
    return DLexer(range, config, cache);
}
/**
 * Creates a DLexer over the given bytes using a default-initialized
 * configuration and the supplied string cache.
 * Params:
 *     range = the source bytes to lex
 *     cache = the string cache handed to the lexer
 */
public auto byToken(ubyte[] range, StringCache* cache)
{
    LexerConfig config;
    return DLexer(range, config, cache);
}
/**
 * Creates a DLexer over the given bytes with the supplied configuration and
 * string cache.
 * Params:
 *     range = the source bytes to lex
 *     config = the lexer configuration
 *     cache = the string cache handed to the lexer
 */
public auto byToken(ubyte[] range, const LexerConfig config, StringCache* cache)
{
    return DLexer(range, config, cache);
}
  1678. /**
  1679. * Removes "decoration" such as leading whitespace, leading + and * characters,
  1680. * and places the result into the given output range
  1681. */
  1682. public void unDecorateComment(T)(string comment, auto ref T outputRange)
  1683. if (isOutputRange!(T, string))
  1684. in
  1685. {
  1686. assert (comment.length >= 3);
  1687. }
  1688. body
  1689. {
  1690. switch (comment[0 .. 3])
  1691. {
  1692. case "///":
  1693. size_t i = 3;
  1694. while (comment[i] == ' ' || comment[i] == '\t')
  1695. i++;
  1696. outputRange.put(comment[i .. $]);
  1697. break;
  1698. case "/++":
  1699. case "/**":
  1700. size_t i = 3;
  1701. immutable char c = comment[1];
  1702. // Skip leading * and + characters
  1703. while (comment[i] == c) i++;
  1704. // Skip trailing * and + characters
  1705. size_t j = comment.length - 2;
  1706. while (j > i && comment[j] == c)
  1707. j--;
  1708. while (j > i && (comment[j] == ' ' || comment[j] == '\t'))
  1709. j--;
  1710. if (comment[i] == '\r') i++;
  1711. if (comment[i] == '\n') i++;
  1712. while (comment[i] == ' ' || comment[i] == '\t') i++;
  1713. immutable bool skipBeginningChar = comment[i] == c;
  1714. if (skipBeginningChar)
  1715. i++;
  1716. size_t whitespaceToSkip;
  1717. while (comment[i] == ' ' || comment[i] == '\t')
  1718. {
  1719. whitespaceToSkip++;
  1720. i++;
  1721. }
  1722. size_t l = i;
  1723. while (i < j)
  1724. {
  1725. if (comment[i++] == '\n')
  1726. break;
  1727. }
  1728. outputRange.put(comment[l .. i]);
  1729. while (true)
  1730. {
  1731. if (skipBeginningChar)
  1732. {
  1733. while (i < j && (comment[i] == ' ' || comment[i] == '\t')) i++;
  1734. if (i < j && comment[i] == c) i++;
  1735. }
  1736. for (size_t s = 0; (i < j) && (s <= whitespaceToSkip)
  1737. && (comment[i] == ' ' || comment[i] == '\t');)
  1738. {
  1739. s++;
  1740. i++;
  1741. }
  1742. size_t k = i;
  1743. inner: while (k < j)
  1744. {
  1745. if (comment[k] == '\n')
  1746. {
  1747. k++;
  1748. break inner;
  1749. }
  1750. k++;
  1751. }
  1752. outputRange.put(comment[i .. k]);
  1753. i = k;
  1754. if (i >= j)
  1755. break;
  1756. }
  1757. break;
  1758. default:
  1759. assert (false, "Invalid doc comment");
  1760. }
  1761. }
  1762. struct StringCache
  1763. {
  1764. public:
  1765. @disable this();
  1766. /**
  1767. * Params: bucketCount = the initial number of buckets. Must be a
  1768. * power of two
  1769. */
  1770. this(size_t bucketCount)
  1771. {
  1772. buckets = (cast(Node**) calloc((Node*).sizeof, bucketCount))[0 .. bucketCount];
  1773. }
  1774. ~this()
  1775. {
  1776. Block* current = rootBlock;
  1777. while (current !is null)
  1778. {
  1779. Block* prev = current;
  1780. current = current.next;
  1781. free(cast(void*) prev.bytes.ptr);
  1782. free(cast(void*) prev);
  1783. }
  1784. foreach (nodePointer; buckets)
  1785. {
  1786. Node* currentNode = nodePointer;
  1787. while (currentNode !is null)
  1788. {
  1789. Node* prev = currentNode;
  1790. currentNode = currentNode.next;
  1791. free(prev);
  1792. }
  1793. }
  1794. rootBlock = null;
  1795. free(buckets.ptr);
  1796. buckets = null;
  1797. }
  1798. /**
  1799. * Caches a string.
  1800. */
  1801. string intern(const(ubyte)[] str) pure nothrow @safe
  1802. {
  1803. if (str is null || str.length == 0)
  1804. return "";
  1805. immutable uint hash = hashBytes(str);
  1806. return intern(str, hash);
  1807. }
  1808. /**
  1809. * ditto
  1810. */
  1811. string intern(string str) pure nothrow @trusted
  1812. {
  1813. return intern(cast(ubyte[]) str);
  1814. }
  1815. /**
  1816. * Caches a string as above, but uses the given hash code instead of
  1817. * calculating one itself. Use this alongside $(LREF hashStep)() can reduce the
  1818. * amount of work necessary when lexing dynamic tokens.
  1819. */
  1820. string intern(const(ubyte)[] str, uint hash) pure nothrow @safe
  1821. in
  1822. {
  1823. assert (str.length > 0);
  1824. }
  1825. body
  1826. {
  1827. return _intern(str, hash);
  1828. // string s = _intern(str, hash);
  1829. // size_t* ptr = s in debugMap;
  1830. // if (ptr is null)
  1831. // debugMap[s] = cast(size_t) s.ptr;
  1832. // else
  1833. // assert (*ptr == cast(size_t) s.ptr);
  1834. // return s;
  1835. }
  1836. /**
  1837. * Incremental hashing.
  1838. * Params:
  1839. * b = the byte to add to the hash
  1840. * h = the hash that has been calculated so far
  1841. * Returns: the new hash code for the string.
  1842. */
  1843. static uint hashStep(ubyte b, uint h) pure nothrow @safe
  1844. {
  1845. return (h ^ sbox[b]) * 3;
  1846. }
  1847. /**
  1848. * The default bucket count for the string cache.
  1849. */
  1850. static enum defaultBucketCount = 4096;
  1851. size_t allocated() pure nothrow @safe @property
  1852. {
  1853. return _allocated;
  1854. }
  1855. private:
  1856. string _intern(const(ubyte)[] bytes, uint hash) pure nothrow @trusted
  1857. {
  1858. if (bytes is null || bytes.length == 0)
  1859. return "";
  1860. immutable size_t index = hash & (buckets.length - 1);
  1861. Node* s = find(bytes, hash);
  1862. if (s !is null)
  1863. return cast(string) s.str;
  1864. _allocated += bytes.length;
  1865. ubyte[] mem = allocate(bytes.length);
  1866. mem[] = bytes[];
  1867. Node* node = cast(Node*) malloc(Node.sizeof);
  1868. node.str = mem;
  1869. node.hash = hash;
  1870. node.next = buckets[index];
  1871. buckets[index] = node;
  1872. return cast(string) mem;
  1873. }
  1874. Node* find(const(ubyte)[] bytes, uint hash) pure nothrow @trusted
  1875. {
  1876. import std.algorithm;
  1877. immutable size_t index = hash & (buckets.length - 1);
  1878. Node* node = buckets[index];
  1879. while (node !is null)
  1880. {
  1881. if (node.hash == hash && bytes.equal(cast(ubyte[]) node.str))
  1882. return node;
  1883. node = node.next;
  1884. }
  1885. return node;
  1886. }
  1887. static uint hashBytes(const(ubyte)[] data) pure nothrow @trusted
  1888. in
  1889. {
  1890. assert (data !is null);
  1891. assert (data.length > 0);
  1892. }
  1893. body
  1894. {
  1895. uint hash = 0;
  1896. foreach (ubyte b; data)
  1897. {
  1898. hash ^= sbox[b];
  1899. hash *= 3;
  1900. }
  1901. return hash;
  1902. }
  1903. ubyte[] allocate(size_t numBytes) pure nothrow @trusted
  1904. in
  1905. {
  1906. assert (numBytes != 0);
  1907. }
  1908. out (result)
  1909. {
  1910. assert (result.length == numBytes);
  1911. }
  1912. body
  1913. {
  1914. if (numBytes > (blockSize / 4))
  1915. return (cast(ubyte*) malloc(numBytes))[0 .. numBytes];
  1916. Block* r = rootBlock;
  1917. size_t i = 0;
  1918. while (i <= 3 && r !is null)
  1919. {
  1920. immutable size_t available = r.bytes.length;
  1921. immutable size_t oldUsed = r.used;
  1922. immutable size_t newUsed = oldUsed + numBytes;
  1923. if (newUsed <= available)
  1924. {
  1925. r.used = newUsed;
  1926. return r.bytes[oldUsed .. newUsed];
  1927. }
  1928. i++;
  1929. r = r.next;
  1930. }
  1931. Block* b = cast(Block*) malloc(Block.sizeof);
  1932. b.bytes = (cast(ubyte*) malloc(blockSize))[0 .. blockSize];
  1933. b.used = numBytes;
  1934. b.next = rootBlock;
  1935. rootBlock = b;
  1936. return b.bytes[0 .. numBytes];
  1937. }
  1938. static struct Node
  1939. {
  1940. ubyte[] str;
  1941. uint hash;
  1942. Node* next;
  1943. }
  1944. static struct Block
  1945. {
  1946. ubyte[] bytes;
  1947. size_t used;
  1948. Block* next;
  1949. }
  1950. static enum blockSize = 1024 * 16;
  1951. static immutable uint[] sbox = [
  1952. 0xF53E1837, 0x5F14C86B, 0x9EE3964C, 0xFA796D53,
  1953. 0x32223FC3, 0x4D82BC98, 0xA0C7FA62, 0x63E2C982,
  1954. 0x24994A5B, 0x1ECE7BEE, 0x292B38EF, 0xD5CD4E56,
  1955. 0x514F4303, 0x7BE12B83, 0x7192F195, 0x82DC7300,
  1956. 0x084380B4, 0x480B55D3, 0x5F430471, 0x13F75991,
  1957. 0x3F9CF22C, 0x2FE0907A, 0xFD8E1E69, 0x7B1D5DE8,
  1958. 0xD575A85C, 0xAD01C50A, 0x7EE00737, 0x3CE981E8,
  1959. 0x0E447EFA, 0x23089DD6, 0xB59F149F, 0x13600EC7,
  1960. 0xE802C8E6, 0x670921E4, 0x7207EFF0, 0xE74761B0,
  1961. 0x69035234, 0xBFA40F19, 0xF63651A0, 0x29E64C26,
  1962. 0x1F98CCA7, 0xD957007E, 0xE71DDC75, 0x3E729595,
  1963. 0x7580B7CC, 0xD7FAF60B, 0x92484323, 0xA44113EB,
  1964. 0xE4CBDE08, 0x346827C9, 0x3CF32AFA, 0x0B29BCF1,
  1965. 0x6E29F7DF, 0xB01E71CB, 0x3BFBC0D1, 0x62EDC5B8,
  1966. 0xB7DE789A, 0xA4748EC9, 0xE17A4C4F, 0x67E5BD03,
  1967. 0xF3B33D1A, 0x97D8D3E9, 0x09121BC0, 0x347B2D2C,
  1968. 0x79A1913C, 0x504172DE, 0x7F1F8483, 0x13AC3CF6,
  1969. 0x7A2094DB, 0xC778FA12, 0xADF7469F, 0x21786B7B,
  1970. 0x71A445D0, 0xA8896C1B, 0x656F62FB, 0x83A059B3,
  1971. 0x972DFE6E, 0x4122000C, 0x97D9DA19, 0x17D5947B,
  1972. 0xB1AFFD0C, 0x6EF83B97, 0xAF7F780B, 0x4613138A,
  1973. 0x7C3E73A6, 0xCF15E03D, 0x41576322, 0x672DF292,
  1974. 0xB658588D, 0x33EBEFA9, 0x938CBF06, 0x06B67381,
  1975. 0x07F192C6, 0x2BDA5855, 0x348EE0E8, 0x19DBB6E3,
  1976. 0x3222184B, 0xB69D5DBA, 0x7E760B88, 0xAF4D8154,
  1977. 0x007A51AD, 0x35112500, 0xC9CD2D7D, 0x4F4FB761,
  1978. 0x694772E3, 0x694C8351, 0x4A7E3AF5, 0x67D65CE1,
  1979. 0x9287DE92, 0x2518DB3C, 0x8CB4EC06, 0xD154D38F,
  1980. 0xE19A26BB, 0x295EE439, 0xC50A1104, 0x2153C6A7,
  1981. 0x82366656, 0x0713BC2F, 0x6462215A, 0x21D9BFCE,
  1982. 0xBA8EACE6, 0xAE2DF4C1, 0x2A8D5E80, 0x3F7E52D1,
  1983. 0x29359399, 0xFEA1D19C, 0x18879313, 0x455AFA81,
  1984. 0xFADFE838, 0x62609838, 0xD1028839, 0x0736E92F,
  1985. 0x3BCA22A3, 0x1485B08A, 0x2DA7900B, 0x852C156D,
  1986. 0xE8F24803, 0x00078472, 0x13F0D332, 0x2ACFD0CF,
  1987. 0x5F747F5C, 0x87BB1E2F, 0xA7EFCB63, 0x23F432F0,
  1988. 0xE6CE7C5C, 0x1F954EF6, 0xB609C91B, 0x3B4571BF,
  1989. 0xEED17DC0, 0xE556CDA0, 0xA7846A8D, 0xFF105F94,
  1990. 0x52B7CCDE, 0x0E33E801, 0x664455EA, 0xF2C70414,
  1991. 0x73E7B486, 0x8F830661, 0x8B59E826, 0xBB8AEDCA,
  1992. 0xF3D70AB9, 0xD739F2B9, 0x4A04C34A, 0x88D0F089,
  1993. 0xE02191A2, 0xD89D9C78, 0x192C2749, 0xFC43A78F,
  1994. 0x0AAC88CB, 0x9438D42D, 0x9E280F7A, 0x36063802,
  1995. 0x38E8D018, 0x1C42A9CB, 0x92AAFF6C, 0xA24820C5,
  1996. 0x007F077F, 0xCE5BC543, 0x69668D58, 0x10D6FF74,
  1997. 0xBE00F621, 0x21300BBE, 0x2E9E8F46, 0x5ACEA629,
  1998. 0xFA1F86C7, 0x52F206B8, 0x3EDF1A75, 0x6DA8D843,
  1999. 0xCF719928, 0x73E3891F, 0xB4B95DD6, 0xB2A42D27,
  2000. 0xEDA20BBF, 0x1A58DBDF, 0xA449AD03, 0x6DDEF22B,
  2001. 0x900531E6, 0x3D3BFF35, 0x5B24ABA2, 0x472B3E4C,
  2002. 0x387F2D75, 0x4D8DBA36, 0x71CB5641, 0xE3473F3F,
  2003. 0xF6CD4B7F, 0xBF7D1428, 0x344B64D0, 0xC5CDFCB6,
  2004. 0xFE2E0182, 0x2C37A673, 0xDE4EB7A3, 0x63FDC933,
  2005. 0x01DC4063, 0x611F3571, 0xD167BFAF, 0x4496596F,
  2006. 0x3DEE0689, 0xD8704910, 0x7052A114, 0x068C9EC5,
  2007. 0x75D0E766, 0x4D54CC20, 0xB44ECDE2, 0x4ABC653E,
  2008. 0x2C550A21, 0x1A52C0DB, 0xCFED03D0, 0x119BAFE2,
  2009. 0x876A6133, 0xBC232088, 0x435BA1B2, 0xAE99BBFA,
  2010. 0xBB4F08E4, 0xA62B5F49, 0x1DA4B695, 0x336B84DE,
  2011. 0xDC813D31, 0x00C134FB, 0x397A98E6, 0x151F0E64,
  2012. 0xD9EB3E69, 0xD3C7DF60, 0xD2F2C336, 0x2DDD067B,
  2013. 0xBD122835, 0xB0B3BD3A, 0xB0D54E46, 0x8641F1E4,
  2014. 0xA0B38F96, 0x51D39199, 0x37A6AD75, 0xDF84EE41,
  2015. 0x3C034CBA, 0xACDA62FC, 0x11923B8B, 0x45EF170A,
  2016. ];
  2017. // deprecated size_t[string] debugMap;
  2018. size_t _allocated;
  2019. Node*[] buckets;
  2020. Block* rootBlock;
  2021. }
// C allocation functions used by StringCache. They are declared here with
// the `pure` attribute so that StringCache's pure functions can call them.
private extern(C) void* calloc(size_t, size_t) nothrow pure;
private extern(C) void* malloc(size_t) nothrow pure;
private extern(C) void free(void*) nothrow pure;
/// Checks the token types produced for a simple import declaration
unittest
{
    import std.stdio;
    auto source = cast(ubyte[]) q{ import std.stdio;}c;
    auto tokens = byToken(source);
    assert (tokens.map!"a.type"().equal([tok!"import", tok!"identifier", tok!".",
        tok!"identifier", tok!";"]));
}
/// Test \x char sequence
unittest
{
    auto toks = (string s) => byToken(cast(ubyte[])s);
    // valid: every pair of hex digits must lex as a character literal
    enum hex = ['0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f','A','B','C','D','E','F'];
    auto source = "";
    foreach (h1; hex)
        foreach (h2; hex)
            source ~= "'\\x" ~ h1 ~ h2 ~ "'";
    assert (toks(source).filter!(t => t.type != tok!"characterLiteral").empty);
    // invalid: the first message is the hex-digit error, recorded at the
    // line and column of the offending character
    assert (toks(`'\x'`).messages[0] == DLexer.Message(1,4,"Error: 2 hex digits expected.",true));
    assert (toks(`'\x_'`).messages[0] == DLexer.Message(1,4,"Error: 2 hex digits expected.",true));
    assert (toks(`'\xA'`).messages[0] == DLexer.Message(1,5,"Error: 2 hex digits expected.",true));
    assert (toks(`'\xAY'`).messages[0] == DLexer.Message(1,5,"Error: 2 hex digits expected.",true));
    assert (toks(`'\xXX'`).messages[0] == DLexer.Message(1,4,"Error: 2 hex digits expected.",true));
}