PageRenderTime 57ms CodeModel.GetById 19ms RepoModel.GetById 0ms app.codeStats 0ms

/SmarterSql/SmarterSql/Parsing/Tokenizer.cs

#
C# | 920 lines | 901 code | 15 blank | 4 comment | 9 complexity | efea56541a6f35a3e183d045a334de4b MD5 | raw file
  1. // ---------------------------------
  2. // SmarterSql (c) Johan Sassner 2008
  3. // ---------------------------------
  4. using System;
  5. using System.Diagnostics;
  6. using System.Text;
  7. using Sassner.SmarterSql.Objects;
  8. using Sassner.SmarterSql.ParsingObjects;
  9. namespace Sassner.SmarterSql.Parsing {
  10. public class Tokenizer {
  #region Member variables

  // Character codes. EOF is the sentinel NextChar()/PeekChar() return when
  // the read position is outside the buffer.
  private const int EOF = -1;
  private const int LF = 10;
  private const int CR = 13;
  private const int TABSIZE = 4;

  // Source buffer and its length (captured once in the constructor).
  private readonly char[] data;
  private readonly int length;

  // Read position: offset into data plus the matching line/column.
  private int index;
  private Location current;

  // Extent of the token being scanned, maintained by SetStart()/SetEnd().
  private int start;
  private int end;
  private Location startLoc;
  private Location endLoc;

  // Incremented on '(' and decremented on ')' by NextOperator().
  private int parenLevel;

  // Number of newline tokens ReadEof() still emits before EndOfFileToken.
  private int pendingNewlines;

  #endregion
  /// <summary>
  /// Creates a tokenizer over the given character buffer, positioned at
  /// line 1 with one newline token pending for end of file.
  /// </summary>
  /// <param name="data">The characters to tokenize; must not be null.</param>
  /// <exception cref="ArgumentNullException"><paramref name="data"/> is null.</exception>
  public Tokenizer(char[] data) {
      // Fail with a descriptive exception instead of a NullReferenceException
      // on data.Length below.
      if (data == null) {
          throw new ArgumentNullException("data");
      }
      pendingNewlines = 1;
      this.data = data;
      length = data.Length;
      current.Line = 1;
      startLoc.Line = 1;
      endLoc.Line = 1;
  }
  #region Public properties

  /// <summary>Line/column of the current read position.</summary>
  public Location CurrentLocation {
      get {
          return current;
      }
      set {
          current = value;
      }
  }

  /// <summary>Location recorded by the most recent SetEnd() call.</summary>
  public Location EndLocation {
      get {
          return endLoc;
      }
      set {
          endLoc = value;
      }
  }

  /// <summary>Current parenthesis nesting depth.</summary>
  public int GroupingLevel {
      get {
          return parenLevel;
      }
      set {
          parenLevel = value;
      }
  }

  /// <summary>True when the token being scanned started at offset 0.</summary>
  private bool IsBeginningOfFile {
      get {
          return start == 0;
      }
  }

  /// <summary>True when no character is available at the read position.</summary>
  public bool IsEndOfFile {
      get {
          return PeekChar() == EOF;
      }
  }

  /// <summary>Location recorded by the most recent SetStart() call.</summary>
  public Location StartLocation {
      get {
          return startLoc;
      }
      set {
          startLoc = value;
      }
  }

  #endregion
  /// <summary>
  /// Scans the next token starting at the current read position.
  /// Blanks, a leading byte-order mark and backslash line continuations
  /// are skipped; everything else is dispatched on its first character.
  /// </summary>
  /// <returns>The token read, or an ErrorToken for an unexpected character.</returns>
  public Token Next() {
      while (true) {
          SetStart();
          int ch = NextChar();
          switch (ch) {
              case ' ':
              case '\t':
              case 12:
                  // Blank or form feed: nothing to emit, scan again.
                  if (IsBeginningOfFile) {
                      SkipInitialWhitespace();
                  }
                  continue;

              case 'N':
                  // N'...' / N"..." is a unicode string literal.
                  if (NextChar('\'')) {
                      return ReadString('\'', true);
                  }
                  if (NextChar('"')) {
                      return ReadString('"', true);
                  }
                  break;

              case '\'':
              case '"':
                  return ReadString((char)ch, false);

              case '/':
                  if (NextChar('*')) {
                      return ReadEOComment(false);
                  }
                  break;

              case '-':
                  if (NextChar('-')) {
                      return ReadEolComment();
                  }
                  break;

              case '.':
                  if (IsDigit(PeekChar())) {
                      return ReadFloatPostDot();
                  }
                  SetEnd();
                  return Tokens.DotToken;

              case '0':
              case '1':
              case '2':
              case '3':
              case '4':
              case '5':
              case '6':
              case '7':
              case '8':
              case '9':
                  return ReadNumber((char)ch);

              case LF:
                  return ReadNewline();

              case EOF:
                  SetEnd();
                  return ReadEof();

              case '\\': {
                  // A backslash is only accepted directly before a line break.
                  int following = PeekChar();
                  if (following != LF && following != CR) {
                      return BadChar(ch);
                  }
                  NextChar();
                  continue;
              }

              case '[':
                  if (IsNameStart(PeekChar())) {
                      return ReadQuotedName();
                  }
                  return BadChar(ch);

              case 0xef:
                  // EF BB BF at the very start of the buffer is skipped.
                  if (start == 0 && NextChar(0xbb) && NextChar(0xbf)) {
                      continue;
                  }
                  break;
          }

          Token op = NextOperator(ch);
          if (op != null) {
              return op;
          }
          if (IsNameStart(ch)) {
              return ReadName();
          }
          return BadChar(ch);
      }
  }
  /// <summary>
  /// Moves the read position back by one logical character. A CR LF pair is
  /// stepped over as a single newline. When a newline is un-read, the line
  /// counter is decremented and the column is recomputed by scanning back
  /// to the previous line break.
  /// </summary>
  protected void Backup() {
      index--;
      current.Column--;

      int ch = PeekChar();
      if (ch == LF) {
          // Guard index > 0: an LF at offset 0 has no predecessor to inspect.
          if (index > 0 && data[index - 1] == '\r') {
              index--;
          }
      } else if (ch != CR) {
          // Not a line break: position and column are already correct.
          return;
      }

      current.Line--;
      current.Column = 0;
      // Count the characters between the previous line break and index.
      while (current.Column < index) {
          int previous = data[index - current.Column - 1];
          if (previous == LF || previous == CR) {
              break;
          }
          current.Column++;
      }
  }
  /// <summary>
  /// Ends the current token at the read position and builds an error token
  /// naming the offending character.
  /// </summary>
  private ErrorToken BadChar(int ch) {
      SetEnd();
      string message = "bad character '" + (char)ch + "'";
      return new ErrorToken(message);
  }
  /// <summary>
  /// Returns the source text between the recorded start and end offsets.
  /// </summary>
  internal string GetImage() {
      int count = end - start;
      return new string(data, start, count);
  }
  /// <summary>
  /// Returns the text of the given 1-based line, without its line terminator.
  /// CR, LF and CR LF are each treated as a single line break.
  /// </summary>
  /// <param name="lineNo">1-based line number.</param>
  /// <returns>The line's text, or an empty string when the line does not exist.</returns>
  public string GetRawLineForError(int lineNo) {
      int lineStart = EOF;
      if (lineNo == 1) {
          lineStart = 0;
      } else {
          int line = 1;
          for (int i = 0; i < data.Length; i++) {
              char c = data[i];
              if (c != '\r' && c != '\n') {
                  continue;
              }
              // Consume the LF of a CR LF pair so the pair counts once.
              // (The original compared data[i] with '\n' here, which can never
              // be true after matching '\r', so CR LF was counted as two lines.)
              if (c == '\r' && (i + 1) < data.Length && data[i + 1] == '\n') {
                  i++;
              }
              line++;
              if (line == lineNo) {
                  lineStart = i + 1;
                  break;
              }
          }
          if (lineStart == EOF) {
              return string.Empty;
          }
      }

      int lineEnd = lineStart;
      while (lineEnd < data.Length && data[lineEnd] != '\r' && data[lineEnd] != '\n') {
          lineEnd++;
      }
      return new string(data, lineStart, lineEnd - lineStart);
  }
  /// <summary>True while the parenthesis nesting depth is non-zero.</summary>
  private bool InGrouping() {
      bool nested = parenLevel != 0;
      return nested;
  }
  /// <summary>
  /// True for the ASCII digits '0'..'9' only. The number readers in this
  /// class consume only ASCII digits, so other Unicode decimal digits
  /// (which char.IsDigit accepts) must not be reported as digits here.
  /// </summary>
  /// <param name="ch">Character code, or EOF (-1).</param>
  private static bool IsDigit(int ch) {
      return ch >= '0' && ch <= '9';
  }
  /// <summary>
  /// True when the character may appear inside a name: any letter or digit,
  /// or one of '_', '@', '#', '$', '!'.
  /// </summary>
  private static bool IsNamePart(int ch) {
      if (char.IsLetterOrDigit((char)ch)) {
          return true;
      }
      switch (ch) {
          case '_':
          case '@':
          case '#':
          case '$':
          case '!':
              return true;
          default:
              return false;
      }
  }
  /// <summary>
  /// True when the character may begin a name: any letter, or one of
  /// '_', '@', '#'.
  /// </summary>
  private static bool IsNameStart(int ch) {
      if (char.IsLetter((char)ch)) {
          return true;
      }
      switch (ch) {
          case '_':
          case '@':
          case '#':
              return true;
          default:
              return false;
      }
  }
  /// <summary>
  /// Consumes the next character only if it equals <paramref name="ch"/>.
  /// </summary>
  /// <returns>True when the character matched and was consumed.</returns>
  protected bool NextChar(int ch) {
      if (PeekChar() != ch) {
          return false;
      }
      NextChar();
      return true;
  }
  /// <summary>
  /// Consumes and returns the next character, updating line and column.
  /// CR, LF and CR LF are all reported as a single LF. Past the end of the
  /// buffer the position still advances and EOF is returned.
  /// </summary>
  protected int NextChar() {
      if (index >= length) {
          index++;
          current.Column++;
          return EOF;
      }

      int ch = data[index];
      index++;
      current.Column++;
      if (ch != LF && ch != CR) {
          return ch;
      }

      // Line break: swallow the LF of a CR LF pair, then start a new line.
      if (ch == CR && PeekChar() == LF) {
          index++;
      }
      current.Line++;
      current.Column = 0;
      return LF;
  }
  /// <summary>
  /// Tries to read an operator or variable token whose first character,
  /// <paramref name="ch"/>, has already been consumed.
  /// </summary>
  /// <returns>
  /// The matching token; an ErrorToken for a '!' that is not followed by
  /// '=', '&lt;' or '&gt;'; or null when <paramref name="ch"/> does not start
  /// a supported operator. The compound forms %=, &amp;=, +=, -=, /=, //, &lt;&lt;,
  /// &gt;&gt; and |= are consumed but yield null.
  /// </returns>
  private Token NextOperator(int ch) {
      Token op;
      switch (ch) {
          case '!':
              if (NextChar('=')) {
                  op = Tokens.symNotEqualToToken;
              } else if (NextChar('<')) {
                  op = Tokens.symNotLessThanToken;
              } else if (NextChar('>')) {
                  op = Tokens.symNotGreaterThanToken;
              } else {
                  return BadChar(ch);
              }
              break;

          case '%':
              if (NextChar('=')) {
                  return null;
              }
              op = Tokens.symModToken;
              break;

          case '&':
              if (NextChar('=')) {
                  return null;
              }
              op = Tokens.symBitwiseAndToken;
              break;

          case '(':
              parenLevel++;
              op = Tokens.symLeftParenthesisToken;
              break;

          case ')':
              parenLevel--;
              op = Tokens.symRightParenthesisToken;
              break;

          case '*':
              op = Tokens.symMultiplyToken;
              break;

          case '+':
              if (NextChar('=')) {
                  return null;
              }
              op = Tokens.symAddToken;
              break;

          case ',':
              op = Tokens.symCommaToken;
              break;

          case '-':
              if (NextChar('=')) {
                  return null;
              }
              op = Tokens.symSubtractToken;
              break;

          case '/':
              if (NextChar('/') || NextChar('=')) {
                  return null;
              }
              op = Tokens.symDivideToken;
              break;

          case ':':
              op = Tokens.symColonToken;
              break;

          case ';':
              op = Tokens.symSemicolonToken;
              break;

          case '<':
              if (NextChar('<')) {
                  return null;
              }
              if (NextChar('=')) {
                  op = Tokens.symLessThanOrEqualToken;
              } else if (NextChar('>')) {
                  op = Tokens.symNotEqualToken;
              } else {
                  op = Tokens.symLessThanToken;
              }
              break;

          case '=':
              op = Tokens.symAssignToken;
              break;

          case '>':
              if (NextChar('>')) {
                  return null;
              }
              if (NextChar('=')) {
                  op = Tokens.symGreaterThanOrEqualToken;
              } else {
                  op = Tokens.symGreaterThanToken;
              }
              break;

          case '@': {
              // "@name" is a variable, "@@name" a system variable.
              bool isSystem = NextChar('@');
              int part = NextChar();
              while (IsNamePart(part)) {
                  part = NextChar();
              }
              Backup();
              SetEnd();
              if (isSystem) {
                  return new SystemVariableToken(GetImage());
              }
              return new VariableToken(GetImage());
          }

          case '^':
              op = Tokens.symPowerToken;
              break;

          case '|':
              if (NextChar('=')) {
                  return null;
              }
              op = Tokens.symBitwiseOrToken;
              break;

          case '~':
              op = Tokens.symTwiddleToken;
              break;

          default:
              return null;
      }

      SetEnd();
      return op;
  }
  /// <summary>
  /// Parses a floating point literal via LiteralParser. Any exception is
  /// reported as a syntax error and 0 is returned instead.
  /// </summary>
  private object ParseFloat(string s) {
      object parsed;
      try {
          parsed = LiteralParser.ParseFloat(s);
      } catch (Exception failure) {
          ReportSyntaxError("ParseFloat(): " + failure.Message);
          parsed = 0;
      }
      return parsed;
  }
  /// <summary>
  /// Parses an integer literal in the given radix via LiteralParser. An
  /// ArgumentException is reported as a syntax error and 0 is returned.
  /// </summary>
  private object ParseInteger(string s, int radix) {
      object parsed = 0;
      try {
          parsed = LiteralParser.ParseInteger(s, radix);
      } catch (ArgumentException failure) {
          ReportSyntaxError("ParseInteger(): " + failure.Message);
      }
      return parsed;
  }
  /// <summary>
  /// Returns the character at the read position without consuming it, or
  /// EOF when the position lies outside the buffer.
  /// </summary>
  protected int PeekChar() {
      if (index < 0 || index >= length) {
          return EOF;
      }
      return data[index];
  }
  /// <summary>
  /// Produces the token for end of input: a newline token while any are
  /// still pending, otherwise the end-of-file token. The pending counter is
  /// decremented on every call.
  /// </summary>
  private Token ReadEof() {
      int owed = pendingNewlines;
      pendingNewlines = owed - 1;
      if (owed > 0) {
          return ReadNewline();
      }
      return Tokens.EndOfFileToken;
  }
  /// <summary>
  /// Resumes scanning a block comment from the current position, treating
  /// it as a multi-line comment.
  /// </summary>
  public Token ContinueMultiLineComment() {
      SetStart();
      Token comment = ReadEOComment(true);
      return comment;
  }
  /// <summary>
  /// Reads block comment text up to the closing "*/" or the end of the line.
  /// </summary>
  /// <param name="isMultiLine">
  /// Selects the kind of a comment that is closed here: MultiLineComment when
  /// true, SingleLineComment when false.
  /// </param>
  /// <returns>
  /// A complete CommentToken when "*/" was found; otherwise an incomplete
  /// MultiLineComment token ending just before the line break or EOF.
  /// </returns>
  private Token ReadEOComment(bool isMultiLine) {
      StringBuilder text = new StringBuilder();
      while (true) {
          int ch = NextChar();

          if (ch == '*' && NextChar('/')) {
              SetEnd();
              TokenKind kind = isMultiLine ? TokenKind.MultiLineComment : TokenKind.SingleLineComment;
              return new CommentToken(text.ToString(), kind, true);
          }

          if (ch == EOF || ch == CR || ch == LF) {
              // Mark the end before the terminator, then step over it again.
              Backup();
              SetEnd();
              NextChar();
              return new CommentToken(text.ToString(), TokenKind.MultiLineComment, false);
          }

          text.Append((char)ch);
      }
  }
  /// <summary>
  /// Reads the text of a "--" comment up to, but not including, the line
  /// break or EOF that ends it. The terminator is left unread.
  /// </summary>
  private Token ReadEolComment() {
      StringBuilder text = new StringBuilder();
      int ch = NextChar();
      while (ch != EOF && ch != LF) {
          text.Append((char)ch);
          ch = NextChar();
      }

      Backup();
      if (ch == EOF) {
          // NOTE(review): the EOF path reverts the end by one in addition to
          // the Backup() above, unlike the LF path - confirm this is intended.
          SetEnd(1);
      } else {
          SetEnd();
      }
      return new CommentToken(text.ToString(), TokenKind.SingleLineComment, true);
  }
  /// <summary>
  /// Reads the fractional digits of a number after its '.', handing over to
  /// ReadFloatPostE() when an exponent marker follows.
  /// </summary>
  private Token ReadFloatPostDot() {
      int ch = NextChar();
      while (ch >= '0' && ch <= '9') {
          ch = NextChar();
      }
      if (ch == 'e' || ch == 'E') {
          return ReadFloatPostE();
      }

      // The character that ended the digits belongs to the next token.
      Backup();
      SetEnd();
      return new ValueNumberToken(ParseFloat(GetImage()));
  }
  /// <summary>
  /// Reads the exponent of a number after its 'e'/'E': an optional sign
  /// followed by decimal digits, then parses the whole literal as a float.
  /// </summary>
  private Token ReadFloatPostE() {
      int ch = NextChar();
      if (ch == '+' || ch == '-') {
          ch = NextChar();
      }
      while (ch >= '0' && ch <= '9') {
          ch = NextChar();
      }

      // The character that ended the exponent belongs to the next token.
      Backup();
      SetEnd();
      return new ValueNumberToken(ParseFloat(GetImage()));
  }
  /// <summary>
  /// Reads the digits of a hexadecimal literal after its "0x"/"0X" prefix.
  /// A trailing 'l'/'L' suffix is consumed but not implemented: the token
  /// then carries the value 0.
  /// </summary>
  private Token ReadHexNumber() {
      int ch = NextChar();
      while ((ch >= '0' && ch <= '9') || (ch >= 'a' && ch <= 'f') || (ch >= 'A' && ch <= 'F')) {
          ch = NextChar();
      }

      if (ch == 'l' || ch == 'L') {
          SetEnd();
          // TODO: Implement parsing of the suffixed form.
          return new ValueNumberToken(0);
      }

      Backup();
      SetEnd();
      // Drop the two-character prefix before parsing the digits.
      string digits = GetImage().Substring(2);
      return new ValueNumberToken(ParseInteger(digits, 16));
  }
  /// <summary>
  /// Reads a bracketed name after its '['. Only spaces and name characters
  /// are accepted before the closing ']'; anything else yields an error
  /// token for the offending character.
  /// </summary>
  private Token ReadQuotedName() {
      int ch;
      do {
          ch = NextChar();
      } while (ch != ']' && (ch == ' ' || IsNamePart(ch)));

      SetEnd();
      if (ch == ']') {
          return ConstructNameToken(true);
      }
      return BadChar(ch);
  }
  /// <summary>
  /// Reads the remaining characters of an unquoted name. A ':' directly
  /// after the name is consumed as part of the token.
  /// </summary>
  private Token ReadName() {
      int ch;
      do {
          ch = NextChar();
      } while (IsNamePart(ch));

      // The character that ended the name belongs to the next token.
      Backup();
      NextChar(':');
      SetEnd();
      return ConstructNameToken(false);
  }
  /// <summary>
  /// Builds the token for the name just scanned: a keyword token when the
  /// name is a known keyword, a temporary token when it starts with '#',
  /// otherwise a plain name token.
  /// </summary>
  /// <param name="isQuoted">True when the image includes surrounding brackets.</param>
  private Token ConstructNameToken(bool isQuoted) {
      string image = GetImage();
      // Look the name up without its surrounding brackets.
      string bareName = image;
      if (isQuoted) {
          bareName = image.Substring(1, image.Length - 2);
      }

      SymbolId key = SymbolTable.StringToId(bareName);
      if (key == SymbolTable.None) {
          return Tokens.NoneToken;
      }

      Token keyword;
      if (!Tokens.Keywords.TryGetValue(key, out keyword)) {
          if (image.Length > 0 && image[0] == '#') {
              return new TemporaryToken(image, isQuoted);
          }
          return new NameToken(image, isQuoted);
      }

      keyword.ScannedText = image;
      if (isQuoted && keyword is SymbolToken) {
          // A quoted symbol keyword that is not a known data type is an ordinary name.
          DataType dataType;
          if (!Instance.StaticData.DataTypes.TryGetValue(keyword, out dataType)) {
              return new NameToken(image, isQuoted);
          }
      }
      keyword.Image = image;
      return keyword;
  }
  /// <summary>
  /// Handles a line break: skips the blank lines and leading whitespace
  /// that follow it, then returns a newline token. A "--" comment that
  /// starts the next line is returned as a comment token instead. Inside
  /// parentheses the line break is ignored and the next token is returned.
  /// </summary>
  private Token ReadNewline() {
      while (true) {
          int ch = NextChar();
          if (ch == ' ' || ch == '\t' || ch == 12 || ch == LF) {
              // Leading whitespace and empty lines are skipped.
              continue;
          }
          if (ch == EOF) {
              return Tokens.NewLineToken;
          }
          if (ch == '-' && PeekChar() == '-') {
              // Start the comment token at the first '-'.
              SetStart(1);
              NextChar();
              return ReadEolComment();
          }
          // Any other character, including a lone '-', belongs to the next
          // token. (The original looped again on a lone '-', which silently
          // dropped a subtraction operator at the start of a line.)
          break;
      }

      Backup();
      if (InGrouping()) {
          return Next();
      }
      SetNewLine(startLoc);
      return Tokens.NewLineToken;
  }
  /// <summary>
  /// Reads a numeric literal whose first digit has already been consumed.
  /// "0x"/"0X" starts a hexadecimal literal; '.', 'e' or 'E' switches to
  /// float parsing; otherwise the digits are parsed as a decimal integer.
  /// A trailing 'l'/'L' suffix is consumed but not implemented: the token
  /// then carries the value 0.
  /// </summary>
  /// <param name="startChar">The digit that started the literal.</param>
  private Token ReadNumber(char startChar) {
      // Integer literals are always decimal: T-SQL has no octal notation, so
      // a leading zero must not switch the radix (the original parsed "010"
      // as 8 and rejected "09").
      const int radix = 10;

      if (startChar == '0' && (NextChar('x') || NextChar('X'))) {
          return ReadHexNumber();
      }

      int ch = NextChar();
      while (ch >= '0' && ch <= '9') {
          ch = NextChar();
      }

      if (ch == '.') {
          return ReadFloatPostDot();
      }
      if (ch == 'e' || ch == 'E') {
          return ReadFloatPostE();
      }
      if (ch == 'l' || ch == 'L') {
          SetEnd();
          // TODO: Implement parsing of the suffixed form.
          return new ValueNumberToken(0);
      }

      // The character that ended the digits belongs to the next token.
      Backup();
      SetEnd();
      return new ValueNumberToken(ParseInteger(GetImage(), radix));
  }
  /// <summary>
  /// Resumes scanning a non-unicode string literal from the current
  /// position, using the given quote character.
  /// </summary>
  public Token ContinueReadString(char quote) {
      SetStart();
      Token literal = ReadString(quote, false);
      return literal;
  }
  /// <summary>
  /// Reads a string literal after its opening quote. A doubled quote
  /// character stands for one literal quote. The literal ends at the closing
  /// quote, or incomplete at a line break or EOF.
  /// </summary>
  /// <param name="quote">The quote character that delimits the literal.</param>
  /// <param name="isUnicode">Passed through to the resulting token.</param>
  public Token ReadString(char quote, bool isUnicode) {
      StringBuilder text = new StringBuilder();
      while (true) {
          int ch = NextChar();

          if (ch == quote) {
              if (!NextChar(quote)) {
                  SetEnd();
                  return new ValueStringToken(text.ToString(), true, isUnicode);
              }
              // Doubled quote: one literal quote character.
              text.Append(quote);
              continue;
          }

          if (ch == EOF || ch == LF) {
              // Unterminated: end the token before the terminator.
              SetEnd(1);
              return new ValueStringToken(text.ToString(), false, isUnicode);
          }

          text.Append((char)ch);
      }
  }
  /// <summary>
  /// Scans a backslash-escaped string from the current position and builds
  /// its token from the raw source text between the recorded start and the
  /// position reached.
  /// </summary>
  /// <param name="quote">The quote character that closes the string.</param>
  /// <param name="isTriple">True when three consecutive quotes close the string and line breaks are allowed inside it.</param>
  /// <param name="startAdd">Number of leading characters to leave out of the text.</param>
  /// <returns>
  /// The string token (flagged incomplete when it was not properly closed),
  /// or an ErrorToken when EOF directly follows an escaped backslash or
  /// line break.
  /// </returns>
  public Token ContinueString(char quote, bool isTriple, int startAdd) {
      bool complete = true;
      int closingQuotes = 0;

      while (true) {
          int ch = NextChar();

          if (ch == EOF) {
              complete = !isTriple;
              break;
          }

          if (ch == quote) {
              if (!isTriple) {
                  closingQuotes++;
                  break;
              }
              if (NextChar(quote) && NextChar(quote)) {
                  closingQuotes += 3;
                  break;
              }
              continue;
          }

          if (ch == '\\') {
              int escaped = PeekChar();
              if (escaped == EOF) {
                  complete = false;
                  break;
              }
              if (escaped == CR || escaped == '\\' || escaped == LF) {
                  NextChar();
                  if (PeekChar() == EOF) {
                      UnexpectedEndOfString(isTriple, true);
                      return new ErrorToken("<eof> while reading string");
                  }
                  continue;
              }
              if (escaped == quote) {
                  // An escaped quote does not close the string.
                  NextChar();
              }
              continue;
          }

          if ((ch == LF || ch == CR) && !isTriple) {
              // A line break ends a single-quoted string prematurely.
              complete = false;
              break;
          }
      }

      SetEnd();
      // The read position may have run past the buffer at EOF.
      int limit = end;
      if (limit >= length) {
          limit = length;
      }
      string text = new string(data, start + startAdd, (limit - start) - (startAdd + closingQuotes));
      if (isTriple) {
          text = text.Replace("\r\n", "\n");
      }
      return new ValueStringToken(text, complete);
  }
  /// <summary>Reports a syntax error with the default error code 16.</summary>
  private void ReportSyntaxError(string message) {
      const int defaultErrorCode = 16;
      ReportSyntaxError(message, defaultErrorCode);
  }

  /// <summary>
  /// Writes a syntax error to the debug output, together with the raw text
  /// of the line the current token starts on and the token's start/end
  /// locations.
  /// </summary>
  private void ReportSyntaxError(string message, int errorCode) {
      string rawLineForError = GetRawLineForError(startLoc.Line);

      StringBuilder report = new StringBuilder();
      report.Append("'").Append(message).Append("', ").Append(rawLineForError);
      report.Append(", sline=").Append(startLoc.Line);
      report.Append(", scol=").Append(startLoc.Column);
      report.Append(", eline=").Append(endLoc.Line);
      report.Append(", ecol=").Append(endLoc.Column);
      report.Append(", errcode=").Append(errorCode);
      report.Append(", ").Append(Severity.Error);

      Debug.WriteLine(report.ToString());
  }
  /// <summary>Marks the current read position as the start of the token.</summary>
  private void SetStart() {
      SetStart(0);
  }

  /// <summary>
  /// Marks the start of the token <paramref name="revert"/> characters
  /// before the current read position (same line, column reduced likewise).
  /// </summary>
  private void SetStart(int revert) {
      start = index - revert;
      startLoc.Line = current.Line;
      startLoc.Column = current.Column - revert;
  }

  /// <summary>Marks the current read position as the end of the token.</summary>
  private void SetEnd() {
      SetEnd(0);
  }

  /// <summary>
  /// Marks the end of the token <paramref name="revert"/> characters
  /// before the current read position (same line, column reduced likewise).
  /// </summary>
  private void SetEnd(int revert) {
      end = index - revert;
      endLoc.Line = current.Line;
      endLoc.Column = current.Column - revert;
  }
  /// <summary>
  /// Sets the token extent for a newline token: it starts at
  /// <paramref name="loc"/> and ends one column after it.
  /// </summary>
  private void SetNewLine(Location loc) {
      startLoc = loc;
      endLoc = loc;
      endLoc.Column = endLoc.Column + 1;
  }
  /// <summary>
  /// Skips spaces, tabs and form feeds, leaving the first other character
  /// unread. A control character (code 12 or below) other than a line break
  /// is reported as a syntax error before it is left unread.
  /// </summary>
  private void SkipInitialWhitespace() {
      int ch = NextChar();
      while (ch == ' ' || ch == '\t' || ch == 12) {
          ch = NextChar();
      }

      bool endsCleanly = ch > 12 || ch == LF || ch == EOF;
      if (!endsCleanly) {
          ReportSyntaxError("SkipInitialWhitespace(): invalid syntax");
      }
      Backup();
  }
  /// <summary>
  /// Reports a string that ended prematurely. The message depends on
  /// <paramref name="isTriple"/>; the error code is 18 when
  /// <paramref name="isIncomplete"/> is true and 16 otherwise.
  /// </summary>
  private void UnexpectedEndOfString(bool isTriple, bool isIncomplete) {
      string message;
      if (isTriple) {
          message = "EOF while scanning triple-quoted string";
      } else {
          message = "EOL while scanning single-quoted string";
      }

      int errorCode = 16;
      if (isIncomplete) {
          errorCode = 18;
      }
      ReportSyntaxError("UnexpectedEndOfString(): " + message, errorCode);
  }
  822. }
  823. }