/source/Lexer.ooc

http://github.com/fperrad/ooc-lua · Unknown · 535 lines · 506 code · 29 blank · 0 comment · 0 complexity · eff9477d9b522ca34adc0132661d5a0c MD5 · raw file

  1. import io/Reader
  2. import structs/ArrayList
  3. import structs/HashMap
  4. LUA_MINBUFFER := const 32
  5. Token: cover {
  6. token: Int
  7. str: String
  8. num: Double
  9. }
  10. Lexer: class {
  11. current: Int /* current character (charint) */
  12. linenumber: Int /* input line counter */
  13. lastline: Int /* line of last token `consumed' */
  14. t: Token /* current token */
  15. lookahead: Token /* look ahead token */
  16. z: Reader /* input stream */
  17. buff: ArrayList<Char> /* buffer for tokens */
  18. source: String /* current source name */
  19. envn: String /* environment variable name */
  20. init: func {}
  21. setInput: func(=z, =source) {
  22. lookahead token = TK_EOS /* no look-ahead token */
  23. linenumber = 1
  24. lastline = 1
  25. envn = "_ENV"
  26. buff = ArrayList<Char> new(LUA_MINBUFFER) /* initialize buffer */
  27. _next() /* read first char */
  28. }
  29. next: func {
  30. lastline = linenumber
  31. if (lookahead token != TK_EOS) { /* is there a look-ahead token? */
  32. t = lookahead /* use this one */
  33. lookahead token = TK_EOS /* and discharge it */
  34. }
  35. else
  36. t token = _lex(t&) /* read next token */
  37. }
  38. lookahead: func -> Int {
  39. version(debug) {
  40. assert(lookahead token == TK_EOS)
  41. }
  42. lookahead token = _lex(lookahead&)
  43. return lookahead token
  44. }
  45. shebang: func {
  46. if (current == '#') {
  47. while (current != '\n')
  48. _next()
  49. _incLineNumber()
  50. }
  51. }
  52. syntaxError: func(msg: String) {
  53. _error(msg, t token)
  54. }
  55. _error: func(message: String, token: Int) {
  56. msg := "%s:%d: %s" format(source, linenumber, message)
  57. if (token)
  58. msg += " near " + _txtToken(token)
  59. Exception new(This, msg) throw()
  60. }
  61. _txtToken: func(token: Int) -> String {
  62. if (token == TK_NAME || token == TK_STRING || token == TK_NUMBER) {
  63. s := String new(buff data as CString, buff getSize())
  64. return "'%s'" format(s)
  65. }
  66. else
  67. return token2str(token)
  68. }
  69. token2str: func(token: Int) -> String {
  70. if (token < FIRST_RESERVED) {
  71. if (token as Char printable?())
  72. return "'%c'" format(token)
  73. else
  74. return "char(%d)" format(token)
  75. }
  76. else
  77. match (token) {
  78. case TK_AND => return "and"
  79. case TK_BREAK => return "break"
  80. case TK_DO => return "do"
  81. case TK_ELSE => return "else"
  82. case TK_ELSEIF => return "elseif"
  83. case TK_END => return "end"
  84. case TK_FALSE => return "false"
  85. case TK_FOR => return "for"
  86. case TK_FUNCTION=> return "function"
  87. case TK_IF => return "if"
  88. case TK_IN => return "in"
  89. case TK_LOCAL => return "local"
  90. case TK_NIL => return "nil"
  91. case TK_NOT => return "not"
  92. case TK_OR => return "or"
  93. case TK_REPEAT => return "repeat"
  94. case TK_RETURN => return "return"
  95. case TK_THEN => return "then"
  96. case TK_TRUE => return "true"
  97. case TK_UNTIL => return "until"
  98. case TK_WHILE => return "while"
  99. // other terminal symbols
  100. case TK_CONCAT => return ".."
  101. case TK_DOTS => return "..."
  102. case TK_EQ => return "=="
  103. case TK_GE => return ">="
  104. case TK_LE => return "<="
  105. case TK_NE => return "~="
  106. }
  107. return "???"
  108. }
  109. _lex: func(tok: Token@) -> Int {
  110. buff clear()
  111. while (true) {
  112. match current {
  113. case '\n' =>
  114. _incLineNumber()
  115. case '\r' =>
  116. _incLineNumber()
  117. case ' ' =>
  118. _next()
  119. case '\f' =>
  120. _next()
  121. case '\t' =>
  122. _next()
  123. case '\v' =>
  124. _next()
  125. case '-' => /* '-' or '--' (comment) */
  126. _next()
  127. if (current != '-')
  128. return '-' as Int
  129. /* else is a comment */
  130. _next()
  131. if (current == '[') { /* long comment? */
  132. sep := _skipSep()
  133. buff clear() /* `skip_sep' may dirty the buffer */
  134. if (sep >= 0) {
  135. _readLongString(null, sep) /* skip long comment */
  136. buff clear() /* previous call may dirty the buff. */
  137. continue
  138. }
  139. }
  140. /* else short comment */
  141. while (!(current == '\n' || current == '\r') && current != -1)
  142. _next() /* skip until end of line (or end of file) */
  143. case '[' => /* long string or simply '[' */
  144. sep := _skipSep()
  145. if (sep >= 0) {
  146. _readLongString(tok&, sep)
  147. return TK_STRING
  148. }
  149. else if (sep == -1)
  150. return '[' as Int
  151. else
  152. _error("invalid long string delimiter", TK_STRING)
  153. case '=' =>
  154. _next()
  155. if (current != '=')
  156. return '=' as Int
  157. else {
  158. _next()
  159. return TK_EQ
  160. }
  161. case '<' =>
  162. _next()
  163. if (current != '=')
  164. return '<' as Int
  165. else {
  166. _next()
  167. return TK_LE
  168. }
  169. case '>' =>
  170. _next()
  171. if (current != '=')
  172. return '>' as Int
  173. else {
  174. _next()
  175. return TK_GE
  176. }
  177. case '~' =>
  178. _next()
  179. if (current != '=')
  180. return '~' as Int
  181. else {
  182. _next()
  183. return TK_NE
  184. }
  185. case '"' => /* short literal strings */
  186. _readString(current, tok&)
  187. return TK_STRING
  188. case '\'' =>
  189. _readString(current, tok&)
  190. return TK_STRING
  191. case '.' => /* '.', '..', '...', or number */
  192. buff add(current as Char)
  193. _next()
  194. if (_checkNext(".")) {
  195. if (_checkNext("."))
  196. return TK_DOTS; /* '...' */
  197. else
  198. return TK_CONCAT; /* '..' */
  199. }
  200. else if (! current as Char digit?())
  201. return '.' as Int
  202. else {
  203. _readNumeral(tok&)
  204. return TK_NUMBER
  205. }
  206. case -1 =>
  207. return TK_EOS
  208. case =>
  209. if (current as Char digit?()) {
  210. _readNumeral(tok&)
  211. return TK_NUMBER
  212. }
  213. else if (current as Char alpha?() || /* identifier or reserved word? */
  214. current == '_') {
  215. buff add(current as Char)
  216. _next()
  217. while (current as Char alphaNumeric?() ||
  218. current == '_') {
  219. buff add(current as Char)
  220. _next()
  221. }
  222. s := String new(buff data as CString, buff getSize())
  223. if (keywords contains?(s))
  224. return keywords get(s)
  225. else {
  226. tok str = s
  227. return TK_NAME
  228. }
  229. }
  230. else { /* single-char tokens (+ - / ...) */
  231. c := current
  232. _next()
  233. return c
  234. }
  235. }
  236. }
  237. return 0 // avoid error
  238. }
  239. _next: func -> Int {
  240. current = z hasNext?() ? z read() as Int : -1
  241. if (current == 0 && ! z hasNext?())
  242. current = -1
  243. return current
  244. }
  245. _checkNext: func(set: String) -> Bool {
  246. // if (current == '\0' || ! set contains?(current as Char))
  247. if (! set contains?(current as Char))
  248. return false
  249. buff add(current as Char)
  250. _next()
  251. return true
  252. }
  253. _incLineNumber: func {
  254. old := current
  255. _next()
  256. if ((current == '\n' || current == '\r') && current != old)
  257. _next()
  258. linenumber += 1
  259. }
  260. _readString: func(delim: Int, tok: Token@) {
  261. buff add(current as Char) /* keep delimiter (for error messages) */
  262. _next()
  263. while (current != delim) {
  264. match current {
  265. case -1 =>
  266. _error("unfinished string", TK_EOS)
  267. case '\n' =>
  268. _error("unfinished string", TK_STRING)
  269. case '\r' =>
  270. _error("unfinished string", TK_STRING)
  271. case '\\' => /* escape sequences */
  272. nextDone := false
  273. c: Char /* final character to be saved */
  274. _next() /* do not save the `\' */
  275. match current {
  276. case 'a' =>
  277. c = '\a'
  278. case 'b' =>
  279. c = '\b'
  280. case 'f' =>
  281. c = '\f'
  282. case 'n' =>
  283. c = '\n'
  284. case 'r' =>
  285. c = '\r'
  286. case 't' =>
  287. c = '\t'
  288. case 'v' =>
  289. c = '\v'
  290. case 'x' =>
  291. c = _readHexaEsc()
  292. case '\n' =>
  293. buff add('\n')
  294. _incLineNumber()
  295. continue
  296. case '\r' =>
  297. buff add('\n')
  298. _incLineNumber()
  299. continue
  300. case -1 =>
  301. continue /* will raise an error next loop */
  302. case '*' => /* skip following span of spaces */
  303. _next() /* skip the '*' */
  304. while (current as Char whitespace?()) {
  305. if (current == '\n' || current == '\r')
  306. _incLineNumber()
  307. else
  308. _next()
  309. }
  310. continue /* do not save 'c' */
  311. case =>
  312. if (! current as Char digit?())
  313. c = current as Char /* handles \\, \", \', and \? */
  314. else { /* digital escape \ddd */
  315. c = _readDecEsc(nextDone&)
  316. }
  317. }
  318. buff add(c)
  319. if (! nextDone)
  320. _next()
  321. case =>
  322. buff add(current as Char)
  323. _next()
  324. }
  325. }
  326. buff add(current as Char) /* skip delimiter */
  327. _next()
  328. tmp := buff slice(1, buff getSize() -2)
  329. n := tmp getSize()
  330. tok str = (n != 0) ? String new(tmp data as CString, n) : ""
  331. }
  332. _readHexaEsc: func() -> Char {
  333. c1 := _next() as Char
  334. c2 := _next() as Char
  335. if (! c1 hexDigit?() || ! c2 hexDigit?()) {
  336. buff clear()
  337. buff add('\\')
  338. buff add('x')
  339. if (c1 == -1)
  340. buff add(c1)
  341. if (c2 == -1)
  342. buff add(c2)
  343. _error("hexadecimal digit expected", TK_STRING)
  344. }
  345. return (_hexaValue(c1) << 4) + _hexaValue(c2) as Char
  346. }
  347. _hexaValue: func(c: Char) -> Char {
  348. if (c digit?())
  349. return c - '0'
  350. else if (c upper?())
  351. return c - 'A' + 10
  352. else
  353. return c - 'a' + 10
  354. }
  355. _readDecEsc: func(nextDone: Bool@) -> Char {
  356. c1 := current as Char
  357. c := (c1 - '0') as Int
  358. c2 := _next() as Char
  359. if (c2 digit?()) {
  360. c = 10 * c + (c2 - '0') as Int
  361. c3 := _next() as Char
  362. if (c3 digit?()) {
  363. c = 10 * c + (c3 - '0') as Int
  364. if (c > 255) {
  365. buff clear()
  366. buff add('\\')
  367. buff add(c1)
  368. buff add(c2)
  369. buff add(c3)
  370. _error("decimal escape too large", TK_STRING)
  371. }
  372. return c as Char
  373. }
  374. }
  375. /* else, has read one character that was not a digit */
  376. nextDone = true
  377. return c as Char
  378. }
  379. _readNumeral: func(tok: Token@) {
  380. buff add(current as Char)
  381. _next()
  382. while (current as Char alphaNumeric?() || current == '.') {
  383. buff add(current as Char)
  384. _next()
  385. if (_checkNext("EePp")) /* exponent part? */
  386. _checkNext("+-") /* optional exponent sign */
  387. }
  388. buff add('\0')
  389. s1, s2: CString
  390. s1 = buff data as CString
  391. d := strtod(s1, s2&)
  392. if (s1 != s2 && s2[0] == '\0')
  393. tok num = d
  394. else {
  395. n := strtoul(s1, s2&, 0)
  396. if (s1 != s2 && s2[0] == '\0')
  397. tok num = n
  398. else
  399. _error("malformed number", TK_NUMBER)
  400. }
  401. }
  402. _skipSep: func -> Int {
  403. count := 0
  404. s := current
  405. buff add(current as Char)
  406. _next()
  407. while (current == '=') {
  408. buff add(current as Char)
  409. _next()
  410. count += 1
  411. }
  412. return (current == s) ? count : - 1
  413. }
  414. _readLongString: func(tok: Token@, sep: Int) {
  415. buff add(current as Char) /* skip 2nd `[' */
  416. _next()
  417. if (current == '\n' || current == '\r') /* string starts with a newline? */
  418. _incLineNumber() /* skip it */
  419. while (true) {
  420. match current {
  421. case -1 =>
  422. _error(tok& ? "unfinished long string" :
  423. "unfinished long comment", TK_EOS)
  424. case ']' =>
  425. if (_skipSep() == sep) {
  426. buff add(current as Char) /* skip 2nd `]' */
  427. _next()
  428. break
  429. }
  430. case '\n' =>
  431. buff add('\n')
  432. _incLineNumber()
  433. if (! tok&)
  434. buff clear() /* avoid wasting space */
  435. case '\r' =>
  436. buff add('\n')
  437. _incLineNumber()
  438. if (! tok&)
  439. buff clear() /* avoid wasting space */
  440. case =>
  441. if (tok&)
  442. buff add(current as Char)
  443. _next()
  444. }
  445. }
  446. if (tok&) {
  447. tmp := buff slice(2 + sep, buff getSize() - sep - 3)
  448. tok str = String new(tmp data as CString, tmp getSize())
  449. }
  450. }
  451. }
  452. FIRST_RESERVED := const 257
  453. TK_AND := const 257
  454. TK_BREAK := const 258
  455. TK_DO := const 259
  456. TK_ELSE := const 260
  457. TK_ELSEIF := const 261
  458. TK_END := const 262
  459. TK_FALSE := const 263
  460. TK_FOR := const 264
  461. TK_FUNCTION := const 265
  462. TK_IF := const 266
  463. TK_IN := const 267
  464. TK_LOCAL := const 268
  465. TK_NIL := const 269
  466. TK_NOT := const 270
  467. TK_OR := const 271
  468. TK_REPEAT := const 272
  469. TK_RETURN := const 273
  470. TK_THEN := const 274
  471. TK_TRUE := const 275
  472. TK_UNTIL := const 276
  473. TK_WHILE := const 277
  474. // other terminal symbols
  475. TK_CONCAT := const 278
  476. TK_DOTS := const 279
  477. TK_EQ := const 280
  478. TK_GE := const 281
  479. TK_LE := const 282
  480. TK_NE := const 283
  481. TK_EOS := const 284
  482. TK_NUMBER := const 285
  483. TK_NAME := const 286
  484. TK_STRING := const 287
  485. keywords := HashMap<String, Int> new()
  486. keywords put("and", TK_AND)
  487. keywords put("break", TK_BREAK)
  488. keywords put("do", TK_DO)
  489. keywords put("else", TK_ELSE)
  490. keywords put("elseif", TK_ELSEIF)
  491. keywords put("end", TK_END)
  492. keywords put("false", TK_FALSE)
  493. keywords put("for", TK_FOR)
  494. keywords put("function", TK_FUNCTION)
  495. keywords put("if", TK_IF)
  496. keywords put("in", TK_IN)
  497. keywords put("local", TK_LOCAL)
  498. keywords put("nil", TK_NIL)
  499. keywords put("not", TK_NOT)
  500. keywords put("or", TK_OR)
  501. keywords put("repeat", TK_REPEAT)
  502. keywords put("return", TK_RETURN)
  503. keywords put("then", TK_THEN)
  504. keywords put("true", TK_TRUE)
  505. keywords put("until", TK_UNTIL)
  506. keywords put("while", TK_WHILE)