PageRenderTime 59ms CodeModel.GetById 0ms RepoModel.GetById 0ms app.codeStats 0ms

/third_party/gofrontend/libgo/go/go/scanner/scanner.go

http://github.com/axw/llgo
Go | 761 lines | 629 code | 55 blank | 77 comment | 182 complexity | 06aefca2fd87c7e4aaaae997bb52d19b MD5 | raw file
Possible License(s): BSD-3-Clause, MIT
  1. // Copyright 2009 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. // Package scanner implements a scanner for Go source text.
  5. // It takes a []byte as source which can then be tokenized
  6. // through repeated calls to the Scan method.
  7. //
  8. package scanner
  9. import (
  10. "bytes"
  11. "fmt"
  12. "go/token"
  13. "path/filepath"
  14. "strconv"
  15. "unicode"
  16. "unicode/utf8"
  17. )
// An ErrorHandler may be provided to Scanner.Init. If a syntax error is
// encountered and a handler was installed, the handler is called with a
// position and an error message. The position points to the beginning of
// the offending token.
//
// The scanner counts every error in Scanner.ErrorCount regardless of
// whether a handler is installed.
//
type ErrorHandler func(pos token.Position, msg string)
// A Scanner holds the scanner's internal state while processing
// a given text. It can be allocated as part of another data
// structure but must be initialized via Init before use.
//
type Scanner struct {
	// immutable state (assigned by Init)
	file *token.File  // source file handle
	dir  string       // directory portion of file.Name()
	src  []byte       // source
	err  ErrorHandler // error reporting; or nil
	mode Mode         // scanning mode

	// scanning state
	ch         rune // current character; a negative value means end-of-file
	offset     int  // character offset (index of ch in src)
	rdOffset   int  // reading offset (position after current character)
	lineOffset int  // current line offset
	insertSemi bool // insert a semicolon before next newline

	// public state - ok to modify
	ErrorCount int // number of errors encountered
}
// bom is the Unicode byte order mark. It is only permitted as the very
// first character of the source; anywhere else it is reported as an error.
const bom = 0xFEFF
  45. // Read the next Unicode char into s.ch.
  46. // s.ch < 0 means end-of-file.
  47. //
  48. func (s *Scanner) next() {
  49. if s.rdOffset < len(s.src) {
  50. s.offset = s.rdOffset
  51. if s.ch == '\n' {
  52. s.lineOffset = s.offset
  53. s.file.AddLine(s.offset)
  54. }
  55. r, w := rune(s.src[s.rdOffset]), 1
  56. switch {
  57. case r == 0:
  58. s.error(s.offset, "illegal character NUL")
  59. case r >= 0x80:
  60. // not ASCII
  61. r, w = utf8.DecodeRune(s.src[s.rdOffset:])
  62. if r == utf8.RuneError && w == 1 {
  63. s.error(s.offset, "illegal UTF-8 encoding")
  64. } else if r == bom && s.offset > 0 {
  65. s.error(s.offset, "illegal byte order mark")
  66. }
  67. }
  68. s.rdOffset += w
  69. s.ch = r
  70. } else {
  71. s.offset = len(s.src)
  72. if s.ch == '\n' {
  73. s.lineOffset = s.offset
  74. s.file.AddLine(s.offset)
  75. }
  76. s.ch = -1 // eof
  77. }
  78. }
// A Mode value is a set of flags (or 0).
// They control scanner behavior.
//
type Mode uint

// Scanner mode flags; each constant occupies its own bit.
const (
	ScanComments    Mode = 1 << iota // return comments as COMMENT tokens
	dontInsertSemis                  // do not automatically insert semicolons - for testing only
)
  87. // Init prepares the scanner s to tokenize the text src by setting the
  88. // scanner at the beginning of src. The scanner uses the file set file
  89. // for position information and it adds line information for each line.
  90. // It is ok to re-use the same file when re-scanning the same file as
  91. // line information which is already present is ignored. Init causes a
  92. // panic if the file size does not match the src size.
  93. //
  94. // Calls to Scan will invoke the error handler err if they encounter a
  95. // syntax error and err is not nil. Also, for each error encountered,
  96. // the Scanner field ErrorCount is incremented by one. The mode parameter
  97. // determines how comments are handled.
  98. //
  99. // Note that Init may call err if there is an error in the first character
  100. // of the file.
  101. //
  102. func (s *Scanner) Init(file *token.File, src []byte, err ErrorHandler, mode Mode) {
  103. // Explicitly initialize all fields since a scanner may be reused.
  104. if file.Size() != len(src) {
  105. panic(fmt.Sprintf("file size (%d) does not match src len (%d)", file.Size(), len(src)))
  106. }
  107. s.file = file
  108. s.dir, _ = filepath.Split(file.Name())
  109. s.src = src
  110. s.err = err
  111. s.mode = mode
  112. s.ch = ' '
  113. s.offset = 0
  114. s.rdOffset = 0
  115. s.lineOffset = 0
  116. s.insertSemi = false
  117. s.ErrorCount = 0
  118. s.next()
  119. if s.ch == bom {
  120. s.next() // ignore BOM at file beginning
  121. }
  122. }
  123. func (s *Scanner) error(offs int, msg string) {
  124. if s.err != nil {
  125. s.err(s.file.Position(s.file.Pos(offs)), msg)
  126. }
  127. s.ErrorCount++
  128. }
// prefix is the leading text that marks a line comment as a
// "//line filename:line" directive.
var prefix = []byte("//line ")
  130. func (s *Scanner) interpretLineComment(text []byte) {
  131. if bytes.HasPrefix(text, prefix) {
  132. // get filename and line number, if any
  133. if i := bytes.LastIndex(text, []byte{':'}); i > 0 {
  134. if line, err := strconv.Atoi(string(text[i+1:])); err == nil && line > 0 {
  135. // valid //line filename:line comment
  136. filename := string(bytes.TrimSpace(text[len(prefix):i]))
  137. if filename != "" {
  138. filename = filepath.Clean(filename)
  139. if !filepath.IsAbs(filename) {
  140. // make filename relative to current directory
  141. filename = filepath.Join(s.dir, filename)
  142. }
  143. }
  144. // update scanner position
  145. s.file.AddLineInfo(s.lineOffset+len(text)+1, filename, line) // +len(text)+1 since comment applies to next line
  146. }
  147. }
  148. }
  149. }
  150. func (s *Scanner) scanComment() string {
  151. // initial '/' already consumed; s.ch == '/' || s.ch == '*'
  152. offs := s.offset - 1 // position of initial '/'
  153. hasCR := false
  154. if s.ch == '/' {
  155. //-style comment
  156. s.next()
  157. for s.ch != '\n' && s.ch >= 0 {
  158. if s.ch == '\r' {
  159. hasCR = true
  160. }
  161. s.next()
  162. }
  163. if offs == s.lineOffset {
  164. // comment starts at the beginning of the current line
  165. s.interpretLineComment(s.src[offs:s.offset])
  166. }
  167. goto exit
  168. }
  169. /*-style comment */
  170. s.next()
  171. for s.ch >= 0 {
  172. ch := s.ch
  173. if ch == '\r' {
  174. hasCR = true
  175. }
  176. s.next()
  177. if ch == '*' && s.ch == '/' {
  178. s.next()
  179. goto exit
  180. }
  181. }
  182. s.error(offs, "comment not terminated")
  183. exit:
  184. lit := s.src[offs:s.offset]
  185. if hasCR {
  186. lit = stripCR(lit)
  187. }
  188. return string(lit)
  189. }
  190. func (s *Scanner) findLineEnd() bool {
  191. // initial '/' already consumed
  192. defer func(offs int) {
  193. // reset scanner state to where it was upon calling findLineEnd
  194. s.ch = '/'
  195. s.offset = offs
  196. s.rdOffset = offs + 1
  197. s.next() // consume initial '/' again
  198. }(s.offset - 1)
  199. // read ahead until a newline, EOF, or non-comment token is found
  200. for s.ch == '/' || s.ch == '*' {
  201. if s.ch == '/' {
  202. //-style comment always contains a newline
  203. return true
  204. }
  205. /*-style comment: look for newline */
  206. s.next()
  207. for s.ch >= 0 {
  208. ch := s.ch
  209. if ch == '\n' {
  210. return true
  211. }
  212. s.next()
  213. if ch == '*' && s.ch == '/' {
  214. s.next()
  215. break
  216. }
  217. }
  218. s.skipWhitespace() // s.insertSemi is set
  219. if s.ch < 0 || s.ch == '\n' {
  220. return true
  221. }
  222. if s.ch != '/' {
  223. // non-comment token
  224. return false
  225. }
  226. s.next() // consume '/'
  227. }
  228. return false
  229. }
  230. func isLetter(ch rune) bool {
  231. return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch == '_' || ch >= 0x80 && unicode.IsLetter(ch)
  232. }
  233. func isDigit(ch rune) bool {
  234. return '0' <= ch && ch <= '9' || ch >= 0x80 && unicode.IsDigit(ch)
  235. }
  236. func (s *Scanner) scanIdentifier() string {
  237. offs := s.offset
  238. for isLetter(s.ch) || isDigit(s.ch) {
  239. s.next()
  240. }
  241. return string(s.src[offs:s.offset])
  242. }
  243. func digitVal(ch rune) int {
  244. switch {
  245. case '0' <= ch && ch <= '9':
  246. return int(ch - '0')
  247. case 'a' <= ch && ch <= 'f':
  248. return int(ch - 'a' + 10)
  249. case 'A' <= ch && ch <= 'F':
  250. return int(ch - 'A' + 10)
  251. }
  252. return 16 // larger than any legal digit val
  253. }
  254. func (s *Scanner) scanMantissa(base int) {
  255. for digitVal(s.ch) < base {
  256. s.next()
  257. }
  258. }
  259. func (s *Scanner) scanNumber(seenDecimalPoint bool) (token.Token, string) {
  260. // digitVal(s.ch) < 10
  261. offs := s.offset
  262. tok := token.INT
  263. if seenDecimalPoint {
  264. offs--
  265. tok = token.FLOAT
  266. s.scanMantissa(10)
  267. goto exponent
  268. }
  269. if s.ch == '0' {
  270. // int or float
  271. offs := s.offset
  272. s.next()
  273. if s.ch == 'x' || s.ch == 'X' {
  274. // hexadecimal int
  275. s.next()
  276. s.scanMantissa(16)
  277. if s.offset-offs <= 2 {
  278. // only scanned "0x" or "0X"
  279. s.error(offs, "illegal hexadecimal number")
  280. }
  281. } else {
  282. // octal int or float
  283. seenDecimalDigit := false
  284. s.scanMantissa(8)
  285. if s.ch == '8' || s.ch == '9' {
  286. // illegal octal int or float
  287. seenDecimalDigit = true
  288. s.scanMantissa(10)
  289. }
  290. if s.ch == '.' || s.ch == 'e' || s.ch == 'E' || s.ch == 'i' {
  291. goto fraction
  292. }
  293. // octal int
  294. if seenDecimalDigit {
  295. s.error(offs, "illegal octal number")
  296. }
  297. }
  298. goto exit
  299. }
  300. // decimal int or float
  301. s.scanMantissa(10)
  302. fraction:
  303. if s.ch == '.' {
  304. tok = token.FLOAT
  305. s.next()
  306. s.scanMantissa(10)
  307. }
  308. exponent:
  309. if s.ch == 'e' || s.ch == 'E' {
  310. tok = token.FLOAT
  311. s.next()
  312. if s.ch == '-' || s.ch == '+' {
  313. s.next()
  314. }
  315. s.scanMantissa(10)
  316. }
  317. if s.ch == 'i' {
  318. tok = token.IMAG
  319. s.next()
  320. }
  321. exit:
  322. return tok, string(s.src[offs:s.offset])
  323. }
  324. // scanEscape parses an escape sequence where rune is the accepted
  325. // escaped quote. In case of a syntax error, it stops at the offending
  326. // character (without consuming it) and returns false. Otherwise
  327. // it returns true.
  328. func (s *Scanner) scanEscape(quote rune) bool {
  329. offs := s.offset
  330. var n int
  331. var base, max uint32
  332. switch s.ch {
  333. case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
  334. s.next()
  335. return true
  336. case '0', '1', '2', '3', '4', '5', '6', '7':
  337. n, base, max = 3, 8, 255
  338. case 'x':
  339. s.next()
  340. n, base, max = 2, 16, 255
  341. case 'u':
  342. s.next()
  343. n, base, max = 4, 16, unicode.MaxRune
  344. case 'U':
  345. s.next()
  346. n, base, max = 8, 16, unicode.MaxRune
  347. default:
  348. msg := "unknown escape sequence"
  349. if s.ch < 0 {
  350. msg = "escape sequence not terminated"
  351. }
  352. s.error(offs, msg)
  353. return false
  354. }
  355. var x uint32
  356. for n > 0 {
  357. d := uint32(digitVal(s.ch))
  358. if d >= base {
  359. msg := fmt.Sprintf("illegal character %#U in escape sequence", s.ch)
  360. if s.ch < 0 {
  361. msg = "escape sequence not terminated"
  362. }
  363. s.error(s.offset, msg)
  364. return false
  365. }
  366. x = x*base + d
  367. s.next()
  368. n--
  369. }
  370. if x > max || 0xD800 <= x && x < 0xE000 {
  371. s.error(offs, "escape sequence is invalid Unicode code point")
  372. return false
  373. }
  374. return true
  375. }
  376. func (s *Scanner) scanRune() string {
  377. // '\'' opening already consumed
  378. offs := s.offset - 1
  379. valid := true
  380. n := 0
  381. for {
  382. ch := s.ch
  383. if ch == '\n' || ch < 0 {
  384. // only report error if we don't have one already
  385. if valid {
  386. s.error(offs, "rune literal not terminated")
  387. valid = false
  388. }
  389. break
  390. }
  391. s.next()
  392. if ch == '\'' {
  393. break
  394. }
  395. n++
  396. if ch == '\\' {
  397. if !s.scanEscape('\'') {
  398. valid = false
  399. }
  400. // continue to read to closing quote
  401. }
  402. }
  403. if valid && n != 1 {
  404. s.error(offs, "illegal rune literal")
  405. }
  406. return string(s.src[offs:s.offset])
  407. }
  408. func (s *Scanner) scanString() string {
  409. // '"' opening already consumed
  410. offs := s.offset - 1
  411. for {
  412. ch := s.ch
  413. if ch == '\n' || ch < 0 {
  414. s.error(offs, "string literal not terminated")
  415. break
  416. }
  417. s.next()
  418. if ch == '"' {
  419. break
  420. }
  421. if ch == '\\' {
  422. s.scanEscape('"')
  423. }
  424. }
  425. return string(s.src[offs:s.offset])
  426. }
  427. func stripCR(b []byte) []byte {
  428. c := make([]byte, len(b))
  429. i := 0
  430. for _, ch := range b {
  431. if ch != '\r' {
  432. c[i] = ch
  433. i++
  434. }
  435. }
  436. return c[:i]
  437. }
  438. func (s *Scanner) scanRawString() string {
  439. // '`' opening already consumed
  440. offs := s.offset - 1
  441. hasCR := false
  442. for {
  443. ch := s.ch
  444. if ch < 0 {
  445. s.error(offs, "raw string literal not terminated")
  446. break
  447. }
  448. s.next()
  449. if ch == '`' {
  450. break
  451. }
  452. if ch == '\r' {
  453. hasCR = true
  454. }
  455. }
  456. lit := s.src[offs:s.offset]
  457. if hasCR {
  458. lit = stripCR(lit)
  459. }
  460. return string(lit)
  461. }
  462. func (s *Scanner) skipWhitespace() {
  463. for s.ch == ' ' || s.ch == '\t' || s.ch == '\n' && !s.insertSemi || s.ch == '\r' {
  464. s.next()
  465. }
  466. }
  467. // Helper functions for scanning multi-byte tokens such as >> += >>= .
  468. // Different routines recognize different length tok_i based on matches
  469. // of ch_i. If a token ends in '=', the result is tok1 or tok3
  470. // respectively. Otherwise, the result is tok0 if there was no other
  471. // matching character, or tok2 if the matching character was ch2.
  472. func (s *Scanner) switch2(tok0, tok1 token.Token) token.Token {
  473. if s.ch == '=' {
  474. s.next()
  475. return tok1
  476. }
  477. return tok0
  478. }
  479. func (s *Scanner) switch3(tok0, tok1 token.Token, ch2 rune, tok2 token.Token) token.Token {
  480. if s.ch == '=' {
  481. s.next()
  482. return tok1
  483. }
  484. if s.ch == ch2 {
  485. s.next()
  486. return tok2
  487. }
  488. return tok0
  489. }
  490. func (s *Scanner) switch4(tok0, tok1 token.Token, ch2 rune, tok2, tok3 token.Token) token.Token {
  491. if s.ch == '=' {
  492. s.next()
  493. return tok1
  494. }
  495. if s.ch == ch2 {
  496. s.next()
  497. if s.ch == '=' {
  498. s.next()
  499. return tok3
  500. }
  501. return tok2
  502. }
  503. return tok0
  504. }
  505. // Scan scans the next token and returns the token position, the token,
  506. // and its literal string if applicable. The source end is indicated by
  507. // token.EOF.
  508. //
  509. // If the returned token is a literal (token.IDENT, token.INT, token.FLOAT,
  510. // token.IMAG, token.CHAR, token.STRING) or token.COMMENT, the literal string
  511. // has the corresponding value.
  512. //
  513. // If the returned token is a keyword, the literal string is the keyword.
  514. //
  515. // If the returned token is token.SEMICOLON, the corresponding
  516. // literal string is ";" if the semicolon was present in the source,
  517. // and "\n" if the semicolon was inserted because of a newline or
  518. // at EOF.
  519. //
  520. // If the returned token is token.ILLEGAL, the literal string is the
  521. // offending character.
  522. //
  523. // In all other cases, Scan returns an empty literal string.
  524. //
  525. // For more tolerant parsing, Scan will return a valid token if
  526. // possible even if a syntax error was encountered. Thus, even
  527. // if the resulting token sequence contains no illegal tokens,
  528. // a client may not assume that no error occurred. Instead it
  529. // must check the scanner's ErrorCount or the number of calls
  530. // of the error handler, if there was one installed.
  531. //
  532. // Scan adds line information to the file added to the file
  533. // set with Init. Token positions are relative to that file
  534. // and thus relative to the file set.
  535. //
  536. func (s *Scanner) Scan() (pos token.Pos, tok token.Token, lit string) {
  537. scanAgain:
  538. s.skipWhitespace()
  539. // current token start
  540. pos = s.file.Pos(s.offset)
  541. // determine token value
  542. insertSemi := false
  543. switch ch := s.ch; {
  544. case isLetter(ch):
  545. lit = s.scanIdentifier()
  546. if len(lit) > 1 {
  547. // keywords are longer than one letter - avoid lookup otherwise
  548. tok = token.Lookup(lit)
  549. switch tok {
  550. case token.IDENT, token.BREAK, token.CONTINUE, token.FALLTHROUGH, token.RETURN:
  551. insertSemi = true
  552. }
  553. } else {
  554. insertSemi = true
  555. tok = token.IDENT
  556. }
  557. case '0' <= ch && ch <= '9':
  558. insertSemi = true
  559. tok, lit = s.scanNumber(false)
  560. default:
  561. s.next() // always make progress
  562. switch ch {
  563. case -1:
  564. if s.insertSemi {
  565. s.insertSemi = false // EOF consumed
  566. return pos, token.SEMICOLON, "\n"
  567. }
  568. tok = token.EOF
  569. case '\n':
  570. // we only reach here if s.insertSemi was
  571. // set in the first place and exited early
  572. // from s.skipWhitespace()
  573. s.insertSemi = false // newline consumed
  574. return pos, token.SEMICOLON, "\n"
  575. case '"':
  576. insertSemi = true
  577. tok = token.STRING
  578. lit = s.scanString()
  579. case '\'':
  580. insertSemi = true
  581. tok = token.CHAR
  582. lit = s.scanRune()
  583. case '`':
  584. insertSemi = true
  585. tok = token.STRING
  586. lit = s.scanRawString()
  587. case ':':
  588. tok = s.switch2(token.COLON, token.DEFINE)
  589. case '.':
  590. if '0' <= s.ch && s.ch <= '9' {
  591. insertSemi = true
  592. tok, lit = s.scanNumber(true)
  593. } else if s.ch == '.' {
  594. s.next()
  595. if s.ch == '.' {
  596. s.next()
  597. tok = token.ELLIPSIS
  598. }
  599. } else {
  600. tok = token.PERIOD
  601. }
  602. case ',':
  603. tok = token.COMMA
  604. case ';':
  605. tok = token.SEMICOLON
  606. lit = ";"
  607. case '(':
  608. tok = token.LPAREN
  609. case ')':
  610. insertSemi = true
  611. tok = token.RPAREN
  612. case '[':
  613. tok = token.LBRACK
  614. case ']':
  615. insertSemi = true
  616. tok = token.RBRACK
  617. case '{':
  618. tok = token.LBRACE
  619. case '}':
  620. insertSemi = true
  621. tok = token.RBRACE
  622. case '+':
  623. tok = s.switch3(token.ADD, token.ADD_ASSIGN, '+', token.INC)
  624. if tok == token.INC {
  625. insertSemi = true
  626. }
  627. case '-':
  628. tok = s.switch3(token.SUB, token.SUB_ASSIGN, '-', token.DEC)
  629. if tok == token.DEC {
  630. insertSemi = true
  631. }
  632. case '*':
  633. tok = s.switch2(token.MUL, token.MUL_ASSIGN)
  634. case '/':
  635. if s.ch == '/' || s.ch == '*' {
  636. // comment
  637. if s.insertSemi && s.findLineEnd() {
  638. // reset position to the beginning of the comment
  639. s.ch = '/'
  640. s.offset = s.file.Offset(pos)
  641. s.rdOffset = s.offset + 1
  642. s.insertSemi = false // newline consumed
  643. return pos, token.SEMICOLON, "\n"
  644. }
  645. comment := s.scanComment()
  646. if s.mode&ScanComments == 0 {
  647. // skip comment
  648. s.insertSemi = false // newline consumed
  649. goto scanAgain
  650. }
  651. tok = token.COMMENT
  652. lit = comment
  653. } else {
  654. tok = s.switch2(token.QUO, token.QUO_ASSIGN)
  655. }
  656. case '%':
  657. tok = s.switch2(token.REM, token.REM_ASSIGN)
  658. case '^':
  659. tok = s.switch2(token.XOR, token.XOR_ASSIGN)
  660. case '<':
  661. if s.ch == '-' {
  662. s.next()
  663. tok = token.ARROW
  664. } else {
  665. tok = s.switch4(token.LSS, token.LEQ, '<', token.SHL, token.SHL_ASSIGN)
  666. }
  667. case '>':
  668. tok = s.switch4(token.GTR, token.GEQ, '>', token.SHR, token.SHR_ASSIGN)
  669. case '=':
  670. tok = s.switch2(token.ASSIGN, token.EQL)
  671. case '!':
  672. tok = s.switch2(token.NOT, token.NEQ)
  673. case '&':
  674. if s.ch == '^' {
  675. s.next()
  676. tok = s.switch2(token.AND_NOT, token.AND_NOT_ASSIGN)
  677. } else {
  678. tok = s.switch3(token.AND, token.AND_ASSIGN, '&', token.LAND)
  679. }
  680. case '|':
  681. tok = s.switch3(token.OR, token.OR_ASSIGN, '|', token.LOR)
  682. default:
  683. // next reports unexpected BOMs - don't repeat
  684. if ch != bom {
  685. s.error(s.file.Offset(pos), fmt.Sprintf("illegal character %#U", ch))
  686. }
  687. insertSemi = s.insertSemi // preserve insertSemi info
  688. tok = token.ILLEGAL
  689. lit = string(ch)
  690. }
  691. }
  692. if s.mode&dontInsertSemis == 0 {
  693. s.insertSemi = insertSemi
  694. }
  695. return
  696. }