PageRenderTime 64ms CodeModel.GetById 35ms RepoModel.GetById 0ms app.codeStats 0ms

/Scanner.cs

https://bitbucket.org/cvillamor/compiler
C# | 488 lines | 421 code | 50 blank | 17 comment | 83 complexity | 0f49322992a25512a4e38ed2eda315bb MD5 | raw file
  1. using System;
  2. using System.Collections.Generic;
  3. using System.Collections;
  4. using System.Linq;
  5. using System.Text;
  6. using System.IO;
  7. namespace Compiler_
  8. {
  9. class Scanner
  10. {
  11. private string m_fileLoc;
  12. private char[] m_input;
  13. private int m_charPtr;
  14. private int m_lineNum;
  15. private int m_colNum;
  16. private Token m_prev_Token;
  17. Stack<Dictionary<string, Record>> SYMBOL_TABLE_STACK;
  /// <summary>
  /// Creates a scanner positioned at the first character of <paramref name="input"/>.
  /// </summary>
  /// <param name="file">Location of the source file; stored in m_fileLoc.</param>
  /// <param name="input">Characters to scan.</param>
  /// <param name="Symb">Symbol-table stack consulted when classifying identifiers.</param>
  public Scanner(string file, char[] input, Stack<Dictionary<string, Record>> Symb)
  {
      // What is being scanned, and the tables used to classify identifiers.
      m_input = input;
      m_fileLoc = file;
      SYMBOL_TABLE_STACK = Symb;

      // The cursor starts at offset 0, which is reported as line 1, column 1.
      m_charPtr = 0;
      m_lineNum = 1;
      m_colNum = 1;
  }
  /// <summary>
  /// Bootstrap / entry point: asks <paramref name="scan"/> to read its next token.
  /// The result is published through the scanner's TOKEN property.
  /// </summary>
  /// <param name="scan">Scanner to advance by one token.</param>
  public static void Scan(Scanner scan)
  {
      scan.GetToken();
  }
  /// <summary>
  /// Advances <paramref name="scan"/> by one token and returns that token.
  /// </summary>
  /// <param name="scan">Scanner to advance by one token.</param>
  /// <returns>The token that was just scanned, read back from TOKEN.</returns>
  public static Token ScanAndRet(Scanner scan)
  {
      scan.GetToken();
      Token scanned = scan.TOKEN;
      return scanned;
  }
  /// <summary>
  /// Scans the next token from m_input and publishes it through TOKEN.
  /// </summary>
  /// <remarks>
  /// The Process* helpers report special conditions by throwing:
  /// RestartException (whitespace or a comment was consumed, scan again),
  /// IndexOutOfRangeException (the cursor ran past the end of m_input, which yields TK_EOF),
  /// TruncationCharErrorException (an identifier exceeded the length limit; what was read
  /// so far is classified as the token) and BadIdentifierFormatException (a malformed
  /// numeric literal, which yields TK_ERROR).
  /// </remarks>
  private void GetToken()
  {
      Token tk;
      bool rescan;
      do
      {
          rescan = false;
          // Start every pass from a clean token so nothing recorded for a skipped
          // comment opener leaks into the token that is finally returned.
          tk = new Token();
          try
          {
              if (IsLetter())
              {
                  // Keyword or identifier.
                  ProcessKeyword(ref tk);
              }
              else if (IsNumeric())
              {
                  // Integer or real literal.
                  ProcessNumerics(ref tk);
              }
              else if (IsWhiteSpaceOrSemiColon())
              {
                  // Always ends in a RestartException.
                  ProcessWhitespace(ref tk);
              }
              else if (IsCharOperator(m_input[m_charPtr]))
              {
                  // Operator, punctuation or comment opener.
                  ProcessOperators(ref tk);
              }
              else if (IsString(m_input[m_charPtr]))
              {
                  // Quoted string literal.
                  ProcessString(ref tk);
              }
              else
              {
                  // Unrecognised character. It must be consumed here: retrying without
                  // advancing the cursor would loop forever on the same character.
                  tk.TOKEN_COL = m_colNum;
                  tk.TOKEN_LINE = m_lineNum;
                  tk.TOKEN_CURRENT = m_input[m_charPtr].ToString();
                  tk.TOKEN_VALUE = tk.TOKEN_CURRENT;
                  tk.TOKEN_TYPE = TOKEN_TYPES.TK_ERROR;
                  m_charPtr++;
                  m_colNum++;
              }
          }
          catch (IndexOutOfRangeException)
          {
              // Ran off the end of the input.
              tk.TOKEN_TYPE = TOKEN_TYPES.TK_EOF;
          }
          catch (TruncationCharErrorException)
          {
              // Over-long identifier: classify the part that was read.
              tk.TOKEN_VALUE = tk.TOKEN_CURRENT;
              tk.TOKEN_TYPE = GetKeywordOrIdentType(tk);
              tk.TOKEN_LINE = m_lineNum;
          }
          catch (BadIdentifierFormatException)
          {
              // Mark the token itself as an error so callers that look at TOKEN_TYPE do
              // not mistake it for a valid literal. TOKEN_VALUE keeps the marker it has
              // always carried on this path.
              tk.TOKEN_TYPE = TOKEN_TYPES.TK_ERROR;
              tk.TOKEN_VALUE = TOKEN_TYPES.TK_ERROR;
              tk.TOKEN_LINE = m_lineNum;
          }
          catch (RestartException)
          {
              rescan = true;
          }
      } while (rescan);

      TOKEN = tk;
  }
  /// <summary>
  /// Reads a run of letters and digits starting at the cursor and classifies it as a
  /// keyword or identifier via GetKeywordOrIdentType.
  /// </summary>
  /// <param name="tk">Token to fill in (column, text, value, type and line).</param>
  /// <exception cref="TruncationCharErrorException">
  /// Thrown as soon as the accumulated text is longer than 31 characters.
  /// </exception>
  void ProcessKeyword(ref Token tk)
  {
      const int MaxIdentifierLength = 31;

      tk.TOKEN_COL = m_colNum;
      // The bounds check lets an identifier that ends the input be returned as a token;
      // without it the read past the end turned that identifier into TK_EOF.
      while (m_charPtr < m_input.Length && char.IsLetterOrDigit(m_input[m_charPtr]))
      {
          tk.TOKEN_CURRENT = tk.TOKEN_CURRENT + m_input[m_charPtr++];
          m_colNum++;
          if (tk.TOKEN_CURRENT.Length > MaxIdentifierLength)
          {
              throw new TruncationCharErrorException();
          }
      }

      tk.TOKEN_VALUE = tk.TOKEN_CURRENT;
      tk.TOKEN_TYPE = GetKeywordOrIdentType(tk);
      tk.TOKEN_LINE = m_lineNum;
  }
  /// <summary>
  /// Reads an integer literal, or a real literal of the form digits '.' digits, starting
  /// at the cursor.
  /// </summary>
  /// <param name="tk">Token to fill in (column, text, value, type and line).</param>
  /// <exception cref="BadIdentifierFormatException">
  /// Thrown when the literal is directly followed by a character that is neither
  /// whitespace/control nor an operator character (for example "12abc").
  /// </exception>
  void ProcessNumerics(ref Token tk)
  {
      tk.TOKEN_COL = m_colNum;

      // Integer part.
      AppendDigits(ref tk);
      tk.TOKEN_TYPE = TOKEN_TYPES.TK_INTLIT;

      bool atDot = m_charPtr < m_input.Length && m_input[m_charPtr] == '.';
      // ".." is the range operator: leave both dots for the operator scanner.
      bool atRange = atDot
          && m_charPtr + 1 < m_input.Length
          && m_input[m_charPtr + 1] == '.';

      if (atDot && !atRange)
      {
          // Real literal: consume the decimal point (keeping the column in step with
          // the cursor) and then the fractional digits.
          tk.TOKEN_CURRENT += m_input[m_charPtr++];
          m_colNum++;
          AppendDigits(ref tk);
          tk.TOKEN_TYPE = TOKEN_TYPES.TK_REALLIT;
      }

      // A literal must be followed by whitespace, an operator or the end of input.
      if (m_charPtr < m_input.Length
          && !IsWhiteSpaceOrSemiColon()
          && !IsCharOperator(m_input[m_charPtr]))
      {
          throw new BadIdentifierFormatException();
      }

      tk.TOKEN_VALUE = tk.TOKEN_CURRENT;
      tk.TOKEN_LINE = m_lineNum;
  }

  // Appends the run of decimal digits at the cursor to tk.TOKEN_CURRENT, stopping at the
  // first non-digit or at the end of the input.
  private void AppendDigits(ref Token tk)
  {
      while (m_charPtr < m_input.Length && char.IsDigit(m_input[m_charPtr]))
      {
          tk.TOKEN_CURRENT = tk.TOKEN_CURRENT + m_input[m_charPtr++];
          m_colNum++;
      }
  }
  /// <summary>
  /// Consumes the single whitespace/control character at the cursor, updates the
  /// line/column counters, and asks the caller to scan again.
  /// </summary>
  /// <param name="tk">Not used by this method.</param>
  /// <exception cref="RestartException">Always thrown once the character is consumed.</exception>
  private void ProcessWhitespace(ref Token tk)
  {
      // 0xA is the line-feed character.
      bool isLineFeed = m_input[m_charPtr] == '\n';
      m_charPtr += 1;

      if (isLineFeed)
      {
          m_lineNum += 1;
          m_colNum = 1;
      }
      else
      {
          m_colNum += 1;
      }

      throw new RestartException();
  }
  /// <summary>
  /// Reads the operator at the cursor into <paramref name="tk"/>. When the operator opens
  /// a comment (a left brace, or a left parenthesis followed by an asterisk) the whole
  /// comment is skipped instead and a rescan is requested.
  /// </summary>
  /// <param name="tk">Token to fill in (column, value, type and line).</param>
  /// <exception cref="RestartException">Thrown after a comment has been skipped.</exception>
  /// <exception cref="IndexOutOfRangeException">
  /// Propagates when a comment is not terminated before the end of the input.
  /// </exception>
  private void ProcessOperators(ref Token tk)
  {
      tk.TOKEN_COL = m_colNum;
      // GetCharOperator also advances the cursor past the operator it returns.
      tk.TOKEN_VALUE = GetCharOperator(m_input[m_charPtr]);
      tk.TOKEN_TYPE = GetOperatorType(tk);
      tk.TOKEN_LINE = m_lineNum;

      if (tk.TOKEN_TYPE == TOKEN_TYPES.TK_LBRACKETOP)
      {
          // Brace comment: skip up to and including the closing brace. Each character
          // is inspected before it is skipped, so a newline directly after the opener
          // is counted too.
          while (m_input[m_charPtr] != '}')
          {
              SkipCommentChar();
          }
          m_charPtr++;
          m_colNum++;
          throw new RestartException();
      }

      if (tk.TOKEN_TYPE == TOKEN_TYPES.TK_LCOLASTOP)
      {
          // Parenthesis-asterisk comment: skip up to and including the closing
          // asterisk-parenthesis pair. Checking before advancing means an empty
          // comment is closed by the pair that immediately follows the opener.
          while (!(m_input[m_charPtr] == '*' && m_input[m_charPtr + 1] == ')'))
          {
              SkipCommentChar();
          }
          m_charPtr += 2;
          m_colNum += 2;
          throw new RestartException();
      }
  }

  // Skips one character inside a comment, keeping the line/column counters in step
  // the same way ProcessWhitespace does.
  private void SkipCommentChar()
  {
      if (m_input[m_charPtr] == '\n')
      {
          m_lineNum++;
          m_colNum = 1;
      }
      else
      {
          m_colNum++;
      }
      m_charPtr++;
  }
  /// <summary>
  /// Reads a single-quoted string literal starting at the opening quote under the cursor.
  /// Two consecutive quotes inside the literal stand for one quote character; the
  /// enclosing quotes are not part of the token text.
  /// </summary>
  /// <param name="tk">Token to fill in (column, text, value, type and line).</param>
  /// <exception cref="IndexOutOfRangeException">
  /// Propagates when the literal is not closed before the end of the input.
  /// </exception>
  private void ProcessString(ref Token tk)
  {
      tk.TOKEN_COL = m_colNum;

      // Step over the opening quote.
      m_colNum++;
      m_charPtr++;

      // Seeding with TOKEN_CURRENT keeps any text already on the token; a null seed is
      // treated as empty, so an empty literal yields "" rather than null.
      StringBuilder text = new StringBuilder(tk.TOKEN_CURRENT);
      while (true)
      {
          char current = m_input[m_charPtr];
          if (current == '\'')
          {
              bool doubled = m_charPtr + 1 < m_input.Length
                  && m_input[m_charPtr + 1] == '\'';
              if (!doubled)
              {
                  // Closing quote (possibly the very last character of the input).
                  break;
              }

              // Escaped quote: skip the first of the pair, the second is appended below.
              m_charPtr++;
              m_colNum++;
          }

          text.Append(current);
          m_charPtr++;
          m_colNum++;
      }

      tk.TOKEN_CURRENT = text.ToString();
      tk.TOKEN_VALUE = tk.TOKEN_CURRENT;
      tk.TOKEN_TYPE = TOKEN_TYPES.TK_STRING;
      tk.TOKEN_LINE = m_lineNum;

      // Step over the closing quote.
      m_colNum++;
      m_charPtr++;
  }
  // Reserved words (lower case, matched exactly) and the token type each one maps to.
  private static readonly Dictionary<string, TOKEN_TYPES> s_keywordTypes =
      new Dictionary<string, TOKEN_TYPES>
      {
          { "and", TOKEN_TYPES.TK_AND },
          { "absolute", TOKEN_TYPES.TK_ABSOLUTE },
          { "array", TOKEN_TYPES.TK_ARRAY },
          { "asm", TOKEN_TYPES.TK_ASM },
          { "begin", TOKEN_TYPES.TK_BEGIN },
          { "boolean", TOKEN_TYPES.TK_BOOLEAN },
          { "case", TOKEN_TYPES.TK_CASE },
          { "const", TOKEN_TYPES.TK_CONST },
          { "constructor", TOKEN_TYPES.TK_CONSTRUCTOR },
          { "destructor", TOKEN_TYPES.TK_DESTRUCTOR },
          { "div", TOKEN_TYPES.TK_DIV },
          { "do", TOKEN_TYPES.TK_DO },
          { "downto", TOKEN_TYPES.TK_DOWNTO },
          { "else", TOKEN_TYPES.TK_ELSE },
          { "end", TOKEN_TYPES.TK_END },
          { "false", TOKEN_TYPES.TK_FALSE },
          { "file", TOKEN_TYPES.TK_FILE },
          { "for", TOKEN_TYPES.TK_FOR },
          { "function", TOKEN_TYPES.TK_FUNCTION },
          { "goto", TOKEN_TYPES.TK_GOTO },
          { "if", TOKEN_TYPES.TK_IF },
          { "implementation", TOKEN_TYPES.TK_IMPLEMENTATION },
          { "in", TOKEN_TYPES.TK_IN },
          { "inherited", TOKEN_TYPES.TK_INHERITED },
          { "inline", TOKEN_TYPES.TK_INLINE },
          { "interface", TOKEN_TYPES.TK_INTERFACE },
          { "integer", TOKEN_TYPES.TK_INTEGER },
          { "label", TOKEN_TYPES.TK_LABEL },
          { "mod", TOKEN_TYPES.TK_MOD },
          { "nil", TOKEN_TYPES.TK_NIL },
          { "not", TOKEN_TYPES.TK_NOT },
          { "object", TOKEN_TYPES.TK_OBJECT },
          { "of", TOKEN_TYPES.TK_OF },
          { "on", TOKEN_TYPES.TK_ON },
          { "operator", TOKEN_TYPES.TK_OPERATOR },
          { "or", TOKEN_TYPES.TK_OR },
          { "packed", TOKEN_TYPES.TK_PACKED },
          { "procedure", TOKEN_TYPES.TK_PROCEDURE },
          { "program", TOKEN_TYPES.TK_PROGRAM },
          { "record", TOKEN_TYPES.TK_RECORD },
          { "reintroduce", TOKEN_TYPES.TK_REINTRODUCE },
          { "repeat", TOKEN_TYPES.TK_REPEAT },
          { "self", TOKEN_TYPES.TK_SELF },
          { "set", TOKEN_TYPES.TK_SET },
          { "shl", TOKEN_TYPES.TK_SHL },
          { "shr", TOKEN_TYPES.TK_SHR },
          { "string", TOKEN_TYPES.TK_STRING },
          { "then", TOKEN_TYPES.TK_THEN },
          { "true", TOKEN_TYPES.TK_TRUE },
          { "to", TOKEN_TYPES.TK_TO },
          { "type", TOKEN_TYPES.TK_TYPE },
          { "unit", TOKEN_TYPES.TK_UNIT },
          { "until", TOKEN_TYPES.TK_UNTIL },
          { "uses", TOKEN_TYPES.TK_USES },
          { "var", TOKEN_TYPES.TK_VAR },
          { "while", TOKEN_TYPES.TK_WHILE },
          { "with", TOKEN_TYPES.TK_WITH },
          { "xor", TOKEN_TYPES.TK_XOR },
      };

  /// <summary>
  /// Classifies the text held in <paramref name="tk"/>'s TOKEN_VALUE: a reserved word
  /// yields its keyword type, anything else is resolved by GetTokenIfNotKeyword.
  /// </summary>
  /// <param name="tk">Token whose TOKEN_VALUE holds the scanned text as a string.</param>
  /// <returns>The keyword type, or the result of the symbol-table lookup.</returns>
  private TOKEN_TYPES GetKeywordOrIdentType(Token tk)
  {
      string word = (string)tk.TOKEN_VALUE;

      TOKEN_TYPES keywordType;
      // The null guard keeps a missing value on the non-keyword path.
      if (word != null && s_keywordTypes.TryGetValue(word, out keywordType))
      {
          return keywordType;
      }

      return GetTokenIfNotKeyword(word);
  }
  /// <summary>
  /// Resolves a non-keyword name against the symbol-table stack.
  /// </summary>
  /// <remarks>
  /// Tables are searched from the top of the stack downwards and the first match wins,
  /// so a name declared in a more recently pushed table hides the same name in the
  /// tables beneath it. The stack is only enumerated, never popped or pushed.
  /// </remarks>
  /// <param name="key">Identifier text to look up.</param>
  /// <returns>
  /// The TYPE_LT of the matching record, or TK_ID when the name is not in any table
  /// (including when the stack is empty).
  /// </returns>
  private TOKEN_TYPES GetTokenIfNotKeyword(string key)
  {
      if (key != null)
      {
          // Stack<T> enumerates from the most recently pushed element to the oldest.
          foreach (Dictionary<string, Record> symbolTable in SYMBOL_TABLE_STACK)
          {
              Record rec;
              if (symbolTable.TryGetValue(key, out rec))
              {
                  return rec.TYPE_LT;
              }
          }
      }

      return TOKEN_TYPES.TK_ID;
  }
  // Maps the operator text held in tk.TOKEN_VALUE to its token type. Text with no entry
  // below yields the default TOKEN_TYPES value.
  private TOKEN_TYPES GetOperatorType(Token tk)
  {
      string symbol = (string)tk.TOKEN_VALUE;
      switch (symbol)
      {
          // Single-character operators and punctuation.
          case "+": return TOKEN_TYPES.TK_PLUSOP;
          case "-": return TOKEN_TYPES.TK_MINUSOP;
          case "*": return TOKEN_TYPES.TK_MULTOP;
          case "/": return TOKEN_TYPES.TK_DIVOP;
          case "=": return TOKEN_TYPES.TK_EQUALOP;
          case "<": return TOKEN_TYPES.TK_LTOP;
          case ">": return TOKEN_TYPES.TK_GTOP;
          case ".": return TOKEN_TYPES.TK_DOTOP;
          case ",": return TOKEN_TYPES.TK_COMMAOP;
          case "(": return TOKEN_TYPES.TK_LPARENOP;
          case ")": return TOKEN_TYPES.TK_RPARENOP;
          case ":": return TOKEN_TYPES.TK_COLONOP;
          case ";": return TOKEN_TYPES.TK_SEMICOLON;
          case "^": return TOKEN_TYPES.TK_POWEROP;
          case "@": return TOKEN_TYPES.TK_ATOP;
          case "{": return TOKEN_TYPES.TK_LBRACKETOP;
          case "}": return TOKEN_TYPES.TK_RBRACKETOP;
          case "$": return TOKEN_TYPES.TK_DOLLAROP;
          case "#": return TOKEN_TYPES.TK_POUNDOP;
          case "&": return TOKEN_TYPES.TK_AMPERSANDOP;
          case "%": return TOKEN_TYPES.TK_PERCENTOP;
          case "[": return TOKEN_TYPES.TK_LARRAYBRACK;
          case "]": return TOKEN_TYPES.TK_RARRAYBRACK;

          // Two-character operators.
          case "<<": return TOKEN_TYPES.TK_SHIFTLOP;
          case ">>": return TOKEN_TYPES.TK_SHIFTROP;
          case "**": return TOKEN_TYPES.TK_DOUBASTOP;
          case "<>": return TOKEN_TYPES.TK_NOTEQOP;
          case "><": return TOKEN_TYPES.TK_NOTEQ2OP;
          case "<=": return TOKEN_TYPES.TK_LTEQOP;
          case ">=": return TOKEN_TYPES.TK_GTEQOP;
          case ":=": return TOKEN_TYPES.TK_COLEQOP;
          case "+=": return TOKEN_TYPES.TK_PLUSEQOP;
          case "-=": return TOKEN_TYPES.TK_MINUSEQOP;
          case "*=": return TOKEN_TYPES.TK_MULTEQOP;
          case "/=": return TOKEN_TYPES.TK_DIVEQOP;
          case "(*": return TOKEN_TYPES.TK_LCOLASTOP;
          case "..": return TOKEN_TYPES.TK_ARRAYRNGOP;

          default: return default(TOKEN_TYPES);
      }
  }
  /// <summary>
  /// The most recently scanned token. The value is read from and written to
  /// Global.CURRENT_TOKEN rather than being held on this scanner; assigning a new
  /// token first moves the old current token into Global.PREV_TOKEN.
  /// </summary>
  public Token TOKEN
  {
      get
      {
          return Global.CURRENT_TOKEN;
      }
      set
      {
          Token outgoing = Global.CURRENT_TOKEN;
          Global.PREV_TOKEN = outgoing;
          Global.CURRENT_TOKEN = value;
      }
  }
  /// <summary>
  /// The token that was current before the latest assignment to TOKEN, read from
  /// Global.PREV_TOKEN.
  /// </summary>
  public Token PREVTOKEN
  {
      get
      {
          Token previous = Global.PREV_TOKEN;
          return previous;
      }
  }
  /// <summary>Tells whether the character under the cursor is a letter.</summary>
  /// <exception cref="IndexOutOfRangeException">The cursor is outside m_input.</exception>
  public bool IsLetter()
  {
      char current = m_input[m_charPtr];
      return char.IsLetter(current);
  }
  /// <summary>Tells whether the character under the cursor is a decimal digit.</summary>
  /// <exception cref="IndexOutOfRangeException">The cursor is outside m_input.</exception>
  public bool IsNumeric()
  {
      char current = m_input[m_charPtr];
      return char.IsDigit(current);
  }
  /// <summary>
  /// Tells whether <paramref name="x"/> is the single-quote character that opens a
  /// string literal.
  /// </summary>
  public bool IsString(char x)
  {
      const char Quote = '\'';
      return Quote == x;
  }
  /// <summary>
  /// Tells whether the character under the cursor is a space or any character with a
  /// lower code (tab, line feed, carriage return and the other control characters).
  /// Despite the method name, a semicolon does not satisfy this test.
  /// </summary>
  /// <exception cref="IndexOutOfRangeException">The cursor is outside m_input.</exception>
  public bool IsWhiteSpaceOrSemiColon()
  {
      char current = m_input[m_charPtr];
      return current <= ' ';
  }
  /// <summary>
  /// Tells whether <paramref name="x"/> is one of the characters that can start an
  /// operator or punctuation token.
  /// </summary>
  public bool IsCharOperator(char x)
  {
      // Every character the operator scanner accepts as a first character.
      const string OperatorChars = "+-*/=<>[].,():^@{}$#&%;";
      return OperatorChars.IndexOf(x) >= 0;
  }
  /// <summary>
  /// Consumes the operator that starts with <paramref name="x"/> and returns its text.
  /// The cursor is expected to be on <paramref name="x"/>; it is advanced past one
  /// character, or past two when the next character completes a two-character operator.
  /// </summary>
  /// <param name="x">First character of the operator (the character under the cursor).</param>
  /// <returns>The one- or two-character operator text.</returns>
  public string GetCharOperator(char x)
  {
      string op = x.ToString();
      m_charPtr++;
      m_colNum++;

      // Nothing left to pair with: the operator is the single character.
      if (m_charPtr >= m_input.Length)
      {
          return op;
      }

      // Characters that may follow x to form a two-character operator.
      string followers;
      switch (x)
      {
          case '<':
          case '>':
              followers = "><=";
              break;
          case '*':
              // '=' is accepted here so that "*=" is formed like "+=", "-=" and "/=";
              // GetOperatorType already has an entry for it.
              followers = "*=";
              break;
          case ':':
          case '+':
          case '-':
          case '/':
              followers = "=";
              break;
          case '(':
              followers = "*.";
              break;
          case '.':
              followers = ").";
              break;
          default:
              followers = "";
              break;
      }

      if (followers.IndexOf(m_input[m_charPtr]) >= 0)
      {
          op += m_input[m_charPtr++];
          m_colNum++;
      }

      return op;
  }
  437. }
  438. }