PageRenderTime 43ms CodeModel.GetById 11ms app.highlight 8ms RepoModel.GetById 18ms app.codeStats 0ms

/source/Lexer.ooc

http://github.com/fperrad/ooc-lua
Unknown | 535 lines | 506 code | 29 blank | 0 comment | 0 complexity | eff9477d9b522ca34adc0132661d5a0c MD5 | raw file
  1
  2import io/Reader
  3import structs/ArrayList
  4import structs/HashMap
  5
/* Initial capacity given to the lexer's token buffer (see Lexer setInput). */
LUA_MINBUFFER := const 32
  7
/**
 * One lexical token as produced by Lexer.
 *
 * Only the payload field matching the token kind is written by the lexer:
 * `str` for names and string literals, `num` for numerals.
 */
Token: cover {
    token: Int      /* token code: a character code or one of the TK_* constants */
    str: String     /* text payload (set for TK_NAME and TK_STRING) */
    num: Double     /* numeric payload (set for TK_NUMBER) */
}
 13
 14Lexer: class {
 15    current: Int  /* current character (charint) */
 16    linenumber: Int  /* input line counter */
 17    lastline: Int  /* line of last token `consumed' */
 18    t: Token  /* current token */
 19    lookahead: Token  /* look ahead token */
 20    z: Reader  /* input stream */
 21    buff: ArrayList<Char>  /* buffer for tokens */
 22    source: String  /* current source name */
 23    envn: String  /* environment variable name */
 24
 25    init: func {}
 26
 27    setInput: func(=z, =source) {
 28        lookahead token = TK_EOS  /* no look-ahead token */
 29        linenumber = 1
 30        lastline = 1
 31        envn = "_ENV"
 32        buff = ArrayList<Char> new(LUA_MINBUFFER)  /* initialize buffer */
 33        _next()  /* read first char */
 34    }
 35
 36    next: func {
 37        lastline = linenumber
 38        if (lookahead token != TK_EOS) {  /* is there a look-ahead token? */
 39            t = lookahead  /* use this one */
 40            lookahead token = TK_EOS  /* and discharge it */
 41        }
 42        else
 43            t token = _lex(t&)  /* read next token */
 44    }
 45
 46    lookahead: func -> Int {
 47        version(debug) {
 48            assert(lookahead token == TK_EOS)
 49        }
 50        lookahead token = _lex(lookahead&)
 51        return lookahead token
 52    }
 53
 54    shebang: func {
 55        if (current == '#') {
 56            while (current != '\n')
 57                _next()
 58            _incLineNumber()
 59        }
 60    }
 61
 62    syntaxError: func(msg: String) {
 63        _error(msg, t token)
 64    }
 65
 66    _error: func(message: String, token: Int) {
 67        msg := "%s:%d: %s" format(source, linenumber, message)
 68        if (token)
 69            msg += " near " + _txtToken(token)
 70        Exception new(This, msg) throw()
 71    }
 72
 73    _txtToken: func(token: Int) -> String {
 74        if (token == TK_NAME || token == TK_STRING || token == TK_NUMBER) {
 75            s := String new(buff data as CString, buff getSize())
 76            return "'%s'" format(s)
 77        }
 78        else
 79            return token2str(token)
 80    }
 81
 82    token2str: func(token: Int) -> String {
 83        if (token < FIRST_RESERVED) {
 84            if (token as Char printable?())
 85                return "'%c'" format(token)
 86            else
 87                return "char(%d)" format(token)
 88        }
 89        else
 90            match (token) {
 91                case TK_AND     => return "and"
 92                case TK_BREAK   => return "break"
 93                case TK_DO      => return "do"
 94                case TK_ELSE    => return "else"
 95                case TK_ELSEIF  => return "elseif"
 96                case TK_END     => return "end"
 97                case TK_FALSE   => return "false"
 98                case TK_FOR     => return "for"
 99                case TK_FUNCTION=> return "function"
100                case TK_IF      => return "if"
101                case TK_IN      => return "in"
102                case TK_LOCAL   => return "local"
103                case TK_NIL     => return "nil"
104                case TK_NOT     => return "not"
105                case TK_OR      => return "or"
106                case TK_REPEAT  => return "repeat"
107                case TK_RETURN  => return "return"
108                case TK_THEN    => return "then"
109                case TK_TRUE    => return "true"
110                case TK_UNTIL   => return "until"
111                case TK_WHILE   => return "while"
112                // other terminal symbols
113                case TK_CONCAT  => return ".."
114                case TK_DOTS    => return "..."
115                case TK_EQ      => return "=="
116                case TK_GE      => return ">="
117                case TK_LE      => return "<="
118                case TK_NE      => return "~="
119            }
120        return "???"
121    }
122
123    _lex: func(tok: Token@) -> Int {
124        buff clear()
125        while (true) {
126            match current {
127                case '\n' =>
128                    _incLineNumber()
129                case '\r' =>
130                    _incLineNumber()
131                case ' ' =>
132                    _next()
133                case '\f' =>
134                    _next()
135                case '\t' =>
136                    _next()
137                case '\v' =>
138                    _next()
139                case '-' => /* '-' or '--' (comment) */
140                    _next()
141                    if (current != '-')
142                        return '-' as Int
143                    /* else is a comment */
144                    _next()
145                    if (current == '[') {  /* long comment? */
146                        sep := _skipSep()
147                        buff clear()  /* `skip_sep' may dirty the buffer */
148                        if (sep >= 0) {
149                            _readLongString(null, sep)  /* skip long comment */
150                            buff clear()  /* previous call may dirty the buff. */
151                            continue
152                        }
153                    }
154                    /* else short comment */
155                    while (!(current == '\n' || current == '\r') && current != -1)
156                        _next() /* skip until end of line (or end of file) */
157                case '[' =>  /* long string or simply '[' */
158                    sep := _skipSep()
159                    if (sep >= 0) {
160                        _readLongString(tok&, sep)
161                       return TK_STRING
162                    }
163                    else if (sep == -1)
164                        return '[' as Int
165                    else
166                        _error("invalid long string delimiter", TK_STRING)
167                case '=' =>
168                    _next()
169                    if (current != '=')
170                        return '=' as Int
171                    else {
172                        _next()
173                        return TK_EQ
174                    }
175                case '<' =>
176                    _next()
177                    if (current != '=')
178                        return '<' as Int
179                    else {
180                        _next()
181                        return TK_LE
182                    }
183                case '>' =>
184                    _next()
185                    if (current != '=')
186                        return '>' as Int
187                    else {
188                        _next()
189                        return TK_GE
190                    }
191                case '~' =>
192                    _next()
193                    if (current != '=')
194                        return '~' as Int
195                    else {
196                        _next()
197                        return TK_NE
198                    }
199                case '"' =>  /* short literal strings */
200                    _readString(current, tok&)
201                    return TK_STRING
202                case '\'' =>
203                    _readString(current, tok&)
204                    return TK_STRING
205                case '.' =>  /* '.', '..', '...', or number */
206                    buff add(current as Char)
207                    _next()
208                    if (_checkNext(".")) {
209                        if (_checkNext("."))
210                            return TK_DOTS;   /* '...' */
211                        else
212                            return TK_CONCAT;   /* '..' */
213                    }
214                    else if (! current as Char digit?())
215                        return '.' as Int
216                    else {
217                        _readNumeral(tok&)
218                        return TK_NUMBER
219                    }
220                case -1 =>
221                    return TK_EOS
222                case =>
223                    if (current as Char digit?()) {
224                        _readNumeral(tok&)
225                        return TK_NUMBER
226                    }
227                    else if (current as Char alpha?() ||  /* identifier or reserved word? */
228                             current == '_') {
229                        buff add(current as Char)
230                        _next()
231                        while (current as Char alphaNumeric?() ||
232                               current == '_') {
233                            buff add(current as Char)
234                            _next()
235                        }
236                        s := String new(buff data as CString, buff getSize())
237                        if (keywords contains?(s))
238                            return keywords get(s)
239                        else {
240                            tok str = s
241                            return TK_NAME
242                        }
243                    }
244                    else {  /* single-char tokens (+ - / ...) */
245                        c := current
246                        _next()
247                        return c
248                    }
249            }
250        }
251        return 0 // avoid error
252    }
253
254    _next: func -> Int {
255        current = z hasNext?() ? z read() as Int : -1
256        if (current == 0 && ! z hasNext?())
257            current = -1
258        return current
259    }
260
261    _checkNext: func(set: String) -> Bool {
262//        if (current == '\0' || ! set contains?(current as Char))
263        if (! set contains?(current as Char))
264            return false
265        buff add(current as Char)
266        _next()
267        return true
268    }
269
270    _incLineNumber: func {
271        old := current
272        _next()
273        if ((current == '\n' || current == '\r') && current != old)
274            _next()
275        linenumber += 1
276    }
277
278    _readString: func(delim: Int, tok: Token@) {
279        buff add(current as Char)  /* keep delimiter (for error messages) */
280        _next()
281        while (current != delim) {
282            match current {
283                case -1 =>
284                    _error("unfinished string", TK_EOS)
285                case '\n' =>
286                    _error("unfinished string", TK_STRING)
287                case '\r' =>
288                    _error("unfinished string", TK_STRING)
289                case '\\' =>  /* escape sequences */
290                    nextDone := false
291                    c: Char  /* final character to be saved */
292                    _next()  /* do not save the `\' */
293                    match current {
294                        case 'a' =>
295                            c = '\a'
296                        case 'b' =>
297                            c = '\b'
298                        case 'f' =>
299                            c = '\f'
300                        case 'n' =>
301                            c = '\n'
302                        case 'r' =>
303                            c = '\r'
304                        case 't' =>
305                            c = '\t'
306                        case 'v' =>
307                            c = '\v'
308                        case 'x' =>
309                            c = _readHexaEsc()
310                        case '\n' =>
311                            buff add('\n')
312                            _incLineNumber()
313                            continue
314                        case '\r' =>
315                            buff add('\n')
316                            _incLineNumber()
317                            continue
318                        case -1 =>
319                            continue  /* will raise an error next loop */
320                        case '*' =>  /* skip following span of spaces */
321                            _next()  /* skip the '*' */
322                            while (current as Char whitespace?()) {
323                                if (current == '\n' || current == '\r')
324                                    _incLineNumber()
325                                else
326                                    _next()
327                            }
328                            continue  /* do not save 'c' */
329                        case =>
330                            if (! current as Char digit?())
331                                c = current as Char /* handles \\, \", \', and \? */
332                            else { /* digital escape \ddd */
333                                c = _readDecEsc(nextDone&)
334                            }
335                    }
336                    buff add(c)
337                    if (! nextDone)
338                        _next()
339                case =>
340                    buff add(current as Char)
341                    _next()
342            }
343        }
344        buff add(current as Char)  /* skip delimiter */
345        _next()
346        tmp := buff slice(1, buff getSize() -2)
347        n := tmp getSize()
348        tok str = (n != 0) ? String new(tmp data as CString, n) : ""
349    }
350
351    _readHexaEsc: func() -> Char {
352        c1 := _next() as Char
353        c2 := _next() as Char
354        if (! c1 hexDigit?() || ! c2 hexDigit?()) {
355            buff clear()
356            buff add('\\')
357            buff add('x')
358            if (c1 == -1)
359                buff add(c1)
360            if (c2 == -1)
361                buff add(c2)
362            _error("hexadecimal digit expected", TK_STRING)
363        }
364        return (_hexaValue(c1) << 4) + _hexaValue(c2) as Char
365    }
366
367    _hexaValue: func(c: Char) -> Char {
368        if (c digit?())
369            return c - '0'
370        else if (c upper?())
371            return c - 'A' + 10
372        else
373            return c - 'a' + 10
374    }
375
376    _readDecEsc: func(nextDone: Bool@) -> Char {
377        c1 := current as Char
378        c := (c1 - '0') as Int
379        c2 := _next() as Char
380        if (c2 digit?()) {
381            c = 10 * c + (c2 - '0') as Int
382            c3 := _next() as Char
383            if (c3 digit?()) {
384                c = 10 * c + (c3 - '0') as Int
385                if (c > 255) {
386                    buff clear()
387                    buff add('\\')
388                    buff add(c1)
389                    buff add(c2)
390                    buff add(c3)
391                    _error("decimal escape too large", TK_STRING)
392                 }
393                 return c as Char
394            }
395        }
396        /* else, has read one character that was not a digit */
397        nextDone = true
398        return c as Char
399    }
400
401    _readNumeral: func(tok: Token@) {
402        buff add(current as Char)
403        _next()
404        while (current as Char alphaNumeric?() || current == '.') {
405            buff add(current as Char)
406            _next()
407            if (_checkNext("EePp"))  /* exponent part? */
408                _checkNext("+-")  /* optional exponent sign */
409
410        }
411        buff add('\0')
412        s1, s2: CString
413        s1 = buff data as CString
414        d := strtod(s1, s2&)
415        if (s1 != s2 && s2[0] == '\0')
416            tok num = d
417        else {
418            n := strtoul(s1, s2&, 0)
419            if (s1 != s2 && s2[0] == '\0')
420                tok num = n
421            else
422                _error("malformed number", TK_NUMBER)
423        }
424    }
425
426    _skipSep: func -> Int {
427        count := 0
428        s := current
429        buff add(current as Char)
430        _next()
431        while (current == '=') {
432            buff add(current as Char)
433            _next()
434            count += 1
435        }
436        return (current == s) ? count : - 1
437    }
438
439    _readLongString: func(tok: Token@, sep: Int) {
440        buff add(current as Char) /* skip 2nd `[' */
441        _next()
442        if (current == '\n' || current == '\r')  /* string starts with a newline? */
443            _incLineNumber()  /* skip it */
444        while (true) {
445            match current {
446                case -1 =>
447                    _error(tok& ? "unfinished long string" :
448                                  "unfinished long comment", TK_EOS)
449                case ']' =>
450                    if (_skipSep() == sep) {
451                        buff add(current as Char) /* skip 2nd `]' */
452                        _next()
453                        break
454                    }
455                case '\n' =>
456                    buff add('\n')
457                    _incLineNumber()
458                    if (! tok&)
459                        buff clear()  /* avoid wasting space */
460                case '\r' =>
461                    buff add('\n')
462                    _incLineNumber()
463                    if (! tok&)
464                        buff clear()  /* avoid wasting space */
465                case =>
466                    if (tok&)
467                        buff add(current as Char)
468                    _next()
469            }
470        }
471        if (tok&) {
472            tmp := buff slice(2 + sep, buff getSize() - sep - 3)
473            tok str = String new(tmp data as CString, tmp getSize())
474        }
475    }
476}
477
/*
 * Token codes. Codes below FIRST_RESERVED stand for the character with
 * that code (single-character tokens); the TK_* codes start at
 * FIRST_RESERVED.
 */
FIRST_RESERVED  := const 257

// reserved words (same order as the entries of the `keywords' table)
TK_AND      := const 257
TK_BREAK    := const 258
TK_DO       := const 259
TK_ELSE     := const 260
TK_ELSEIF   := const 261
TK_END      := const 262
TK_FALSE    := const 263
TK_FOR      := const 264
TK_FUNCTION := const 265
TK_IF       := const 266
TK_IN       := const 267
TK_LOCAL    := const 268
TK_NIL      := const 269
TK_NOT      := const 270
TK_OR       := const 271
TK_REPEAT   := const 272
TK_RETURN   := const 273
TK_THEN     := const 274
TK_TRUE     := const 275
TK_UNTIL    := const 276
TK_WHILE    := const 277
// other terminal symbols
TK_CONCAT   := const 278  // ..
TK_DOTS     := const 279  // ...
TK_EQ       := const 280  // ==
TK_GE       := const 281  // >=
TK_LE       := const 282  // <=
TK_NE       := const 283  // ~=
TK_EOS      := const 284  // end of input; also marks "no look-ahead token"
TK_NUMBER   := const 285  // numeral, value in Token num
TK_NAME     := const 286  // identifier, text in Token str
TK_STRING   := const 287  // string literal, text in Token str
512
/*
 * Reserved-word table: maps the spelling of each reserved word to its
 * token code. Lexer _lex looks every scanned identifier up here; a hit
 * yields the keyword token, a miss yields TK_NAME.
 */
keywords := HashMap<String, Int> new()
keywords put("and", TK_AND)
keywords put("break", TK_BREAK)
keywords put("do", TK_DO)
keywords put("else", TK_ELSE)
keywords put("elseif", TK_ELSEIF)
keywords put("end", TK_END)
keywords put("false", TK_FALSE)
keywords put("for", TK_FOR)
keywords put("function", TK_FUNCTION)
keywords put("if", TK_IF)
keywords put("in", TK_IN)
keywords put("local", TK_LOCAL)
keywords put("nil", TK_NIL)
keywords put("not", TK_NOT)
keywords put("or", TK_OR)
keywords put("repeat", TK_REPEAT)
keywords put("return", TK_RETURN)
keywords put("then", TK_THEN)
keywords put("true", TK_TRUE)
keywords put("until", TK_UNTIL)
keywords put("while", TK_WHILE)
535