PageRenderTime 111ms CodeModel.GetById 29ms RepoModel.GetById 0ms app.codeStats 0ms

/deps/v8/src/scanner.cc

http://github.com/joyent/node
C++ | 1111 lines | 896 code | 90 blank | 125 comment | 189 complexity | 56c246bf5fd5eae9ec942a36e316682d MD5 | raw file
Possible License(s): 0BSD, BSD-3-Clause, MPL-2.0-no-copyleft-exception, GPL-2.0, ISC, Apache-2.0, MIT, AGPL-3.0
  1. // Copyright 2011 the V8 project authors. All rights reserved.
  2. // Redistribution and use in source and binary forms, with or without
  3. // modification, are permitted provided that the following conditions are
  4. // met:
  5. //
  6. // * Redistributions of source code must retain the above copyright
  7. // notice, this list of conditions and the following disclaimer.
  8. // * Redistributions in binary form must reproduce the above
  9. // copyright notice, this list of conditions and the following
  10. // disclaimer in the documentation and/or other materials provided
  11. // with the distribution.
  12. // * Neither the name of Google Inc. nor the names of its
  13. // contributors may be used to endorse or promote products derived
  14. // from this software without specific prior written permission.
  15. //
  16. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  17. // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  18. // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  19. // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  20. // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  21. // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  22. // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  23. // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  24. // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  25. // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  26. // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  27. // Features shared by parsing and pre-parsing scanners.
  28. #include "scanner.h"
  29. #include "../include/v8stdint.h"
  30. #include "char-predicates-inl.h"
  31. namespace v8 {
  32. namespace internal {
  33. // ----------------------------------------------------------------------------
  34. // Scanner
  35. Scanner::Scanner(UnicodeCache* unicode_cache)
  36. : unicode_cache_(unicode_cache),
  37. octal_pos_(Location::invalid()),
  38. harmony_scoping_(false),
  39. harmony_modules_(false),
  40. harmony_numeric_literals_(false) { }
  41. void Scanner::Initialize(Utf16CharacterStream* source) {
  42. source_ = source;
  43. // Need to capture identifiers in order to recognize "get" and "set"
  44. // in object literals.
  45. Init();
  46. // Skip initial whitespace allowing HTML comment ends just like
  47. // after a newline and scan first token.
  48. has_line_terminator_before_next_ = true;
  49. SkipWhiteSpace();
  50. Scan();
  51. }
  52. uc32 Scanner::ScanHexNumber(int expected_length) {
  53. ASSERT(expected_length <= 4); // prevent overflow
  54. uc32 digits[4] = { 0, 0, 0, 0 };
  55. uc32 x = 0;
  56. for (int i = 0; i < expected_length; i++) {
  57. digits[i] = c0_;
  58. int d = HexValue(c0_);
  59. if (d < 0) {
  60. // According to ECMA-262, 3rd, 7.8.4, page 18, these hex escapes
  61. // should be illegal, but other JS VMs just return the
  62. // non-escaped version of the original character.
  63. // Push back digits that we have advanced past.
  64. for (int j = i-1; j >= 0; j--) {
  65. PushBack(digits[j]);
  66. }
  67. return -1;
  68. }
  69. x = x * 16 + d;
  70. Advance();
  71. }
  72. return x;
  73. }
  74. // Ensure that tokens can be stored in a byte.
  75. STATIC_ASSERT(Token::NUM_TOKENS <= 0x100);
  76. // Table of one-character tokens, by character (0x00..0x7f only).
  77. static const byte one_char_tokens[] = {
  78. Token::ILLEGAL,
  79. Token::ILLEGAL,
  80. Token::ILLEGAL,
  81. Token::ILLEGAL,
  82. Token::ILLEGAL,
  83. Token::ILLEGAL,
  84. Token::ILLEGAL,
  85. Token::ILLEGAL,
  86. Token::ILLEGAL,
  87. Token::ILLEGAL,
  88. Token::ILLEGAL,
  89. Token::ILLEGAL,
  90. Token::ILLEGAL,
  91. Token::ILLEGAL,
  92. Token::ILLEGAL,
  93. Token::ILLEGAL,
  94. Token::ILLEGAL,
  95. Token::ILLEGAL,
  96. Token::ILLEGAL,
  97. Token::ILLEGAL,
  98. Token::ILLEGAL,
  99. Token::ILLEGAL,
  100. Token::ILLEGAL,
  101. Token::ILLEGAL,
  102. Token::ILLEGAL,
  103. Token::ILLEGAL,
  104. Token::ILLEGAL,
  105. Token::ILLEGAL,
  106. Token::ILLEGAL,
  107. Token::ILLEGAL,
  108. Token::ILLEGAL,
  109. Token::ILLEGAL,
  110. Token::ILLEGAL,
  111. Token::ILLEGAL,
  112. Token::ILLEGAL,
  113. Token::ILLEGAL,
  114. Token::ILLEGAL,
  115. Token::ILLEGAL,
  116. Token::ILLEGAL,
  117. Token::ILLEGAL,
  118. Token::LPAREN, // 0x28
  119. Token::RPAREN, // 0x29
  120. Token::ILLEGAL,
  121. Token::ILLEGAL,
  122. Token::COMMA, // 0x2c
  123. Token::ILLEGAL,
  124. Token::ILLEGAL,
  125. Token::ILLEGAL,
  126. Token::ILLEGAL,
  127. Token::ILLEGAL,
  128. Token::ILLEGAL,
  129. Token::ILLEGAL,
  130. Token::ILLEGAL,
  131. Token::ILLEGAL,
  132. Token::ILLEGAL,
  133. Token::ILLEGAL,
  134. Token::ILLEGAL,
  135. Token::ILLEGAL,
  136. Token::COLON, // 0x3a
  137. Token::SEMICOLON, // 0x3b
  138. Token::ILLEGAL,
  139. Token::ILLEGAL,
  140. Token::ILLEGAL,
  141. Token::CONDITIONAL, // 0x3f
  142. Token::ILLEGAL,
  143. Token::ILLEGAL,
  144. Token::ILLEGAL,
  145. Token::ILLEGAL,
  146. Token::ILLEGAL,
  147. Token::ILLEGAL,
  148. Token::ILLEGAL,
  149. Token::ILLEGAL,
  150. Token::ILLEGAL,
  151. Token::ILLEGAL,
  152. Token::ILLEGAL,
  153. Token::ILLEGAL,
  154. Token::ILLEGAL,
  155. Token::ILLEGAL,
  156. Token::ILLEGAL,
  157. Token::ILLEGAL,
  158. Token::ILLEGAL,
  159. Token::ILLEGAL,
  160. Token::ILLEGAL,
  161. Token::ILLEGAL,
  162. Token::ILLEGAL,
  163. Token::ILLEGAL,
  164. Token::ILLEGAL,
  165. Token::ILLEGAL,
  166. Token::ILLEGAL,
  167. Token::ILLEGAL,
  168. Token::ILLEGAL,
  169. Token::LBRACK, // 0x5b
  170. Token::ILLEGAL,
  171. Token::RBRACK, // 0x5d
  172. Token::ILLEGAL,
  173. Token::ILLEGAL,
  174. Token::ILLEGAL,
  175. Token::ILLEGAL,
  176. Token::ILLEGAL,
  177. Token::ILLEGAL,
  178. Token::ILLEGAL,
  179. Token::ILLEGAL,
  180. Token::ILLEGAL,
  181. Token::ILLEGAL,
  182. Token::ILLEGAL,
  183. Token::ILLEGAL,
  184. Token::ILLEGAL,
  185. Token::ILLEGAL,
  186. Token::ILLEGAL,
  187. Token::ILLEGAL,
  188. Token::ILLEGAL,
  189. Token::ILLEGAL,
  190. Token::ILLEGAL,
  191. Token::ILLEGAL,
  192. Token::ILLEGAL,
  193. Token::ILLEGAL,
  194. Token::ILLEGAL,
  195. Token::ILLEGAL,
  196. Token::ILLEGAL,
  197. Token::ILLEGAL,
  198. Token::ILLEGAL,
  199. Token::ILLEGAL,
  200. Token::ILLEGAL,
  201. Token::LBRACE, // 0x7b
  202. Token::ILLEGAL,
  203. Token::RBRACE, // 0x7d
  204. Token::BIT_NOT, // 0x7e
  205. Token::ILLEGAL
  206. };
  207. Token::Value Scanner::Next() {
  208. current_ = next_;
  209. has_line_terminator_before_next_ = false;
  210. has_multiline_comment_before_next_ = false;
  211. if (static_cast<unsigned>(c0_) <= 0x7f) {
  212. Token::Value token = static_cast<Token::Value>(one_char_tokens[c0_]);
  213. if (token != Token::ILLEGAL) {
  214. int pos = source_pos();
  215. next_.token = token;
  216. next_.location.beg_pos = pos;
  217. next_.location.end_pos = pos + 1;
  218. Advance();
  219. return current_.token;
  220. }
  221. }
  222. Scan();
  223. return current_.token;
  224. }
  225. static inline bool IsByteOrderMark(uc32 c) {
  226. // The Unicode value U+FFFE is guaranteed never to be assigned as a
  227. // Unicode character; this implies that in a Unicode context the
  228. // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF
  229. // character expressed in little-endian byte order (since it could
  230. // not be a U+FFFE character expressed in big-endian byte
  231. // order). Nevertheless, we check for it to be compatible with
  232. // Spidermonkey.
  233. return c == 0xFEFF || c == 0xFFFE;
  234. }
  235. bool Scanner::SkipWhiteSpace() {
  236. int start_position = source_pos();
  237. while (true) {
  238. // We treat byte-order marks (BOMs) as whitespace for better
  239. // compatibility with Spidermonkey and other JavaScript engines.
  240. while (unicode_cache_->IsWhiteSpace(c0_) || IsByteOrderMark(c0_)) {
  241. // IsWhiteSpace() includes line terminators!
  242. if (unicode_cache_->IsLineTerminator(c0_)) {
  243. // Ignore line terminators, but remember them. This is necessary
  244. // for automatic semicolon insertion.
  245. has_line_terminator_before_next_ = true;
  246. }
  247. Advance();
  248. }
  249. // If there is an HTML comment end '-->' at the beginning of a
  250. // line (with only whitespace in front of it), we treat the rest
  251. // of the line as a comment. This is in line with the way
  252. // SpiderMonkey handles it.
  253. if (c0_ == '-' && has_line_terminator_before_next_) {
  254. Advance();
  255. if (c0_ == '-') {
  256. Advance();
  257. if (c0_ == '>') {
  258. // Treat the rest of the line as a comment.
  259. SkipSingleLineComment();
  260. // Continue skipping white space after the comment.
  261. continue;
  262. }
  263. PushBack('-'); // undo Advance()
  264. }
  265. PushBack('-'); // undo Advance()
  266. }
  267. // Return whether or not we skipped any characters.
  268. return source_pos() != start_position;
  269. }
  270. }
  271. Token::Value Scanner::SkipSingleLineComment() {
  272. Advance();
  273. // The line terminator at the end of the line is not considered
  274. // to be part of the single-line comment; it is recognized
  275. // separately by the lexical grammar and becomes part of the
  276. // stream of input elements for the syntactic grammar (see
  277. // ECMA-262, section 7.4).
  278. while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) {
  279. Advance();
  280. }
  281. return Token::WHITESPACE;
  282. }
  283. Token::Value Scanner::SkipMultiLineComment() {
  284. ASSERT(c0_ == '*');
  285. Advance();
  286. while (c0_ >= 0) {
  287. uc32 ch = c0_;
  288. Advance();
  289. if (unicode_cache_->IsLineTerminator(ch)) {
  290. // Following ECMA-262, section 7.4, a comment containing
  291. // a newline will make the comment count as a line-terminator.
  292. has_multiline_comment_before_next_ = true;
  293. }
  294. // If we have reached the end of the multi-line comment, we
  295. // consume the '/' and insert a whitespace. This way all
  296. // multi-line comments are treated as whitespace.
  297. if (ch == '*' && c0_ == '/') {
  298. c0_ = ' ';
  299. return Token::WHITESPACE;
  300. }
  301. }
  302. // Unterminated multi-line comment.
  303. return Token::ILLEGAL;
  304. }
  305. Token::Value Scanner::ScanHtmlComment() {
  306. // Check for <!-- comments.
  307. ASSERT(c0_ == '!');
  308. Advance();
  309. if (c0_ == '-') {
  310. Advance();
  311. if (c0_ == '-') return SkipSingleLineComment();
  312. PushBack('-'); // undo Advance()
  313. }
  314. PushBack('!'); // undo Advance()
  315. ASSERT(c0_ == '!');
  316. return Token::LT;
  317. }
  318. void Scanner::Scan() {
  319. next_.literal_chars = NULL;
  320. Token::Value token;
  321. do {
  322. // Remember the position of the next token
  323. next_.location.beg_pos = source_pos();
  324. switch (c0_) {
  325. case ' ':
  326. case '\t':
  327. Advance();
  328. token = Token::WHITESPACE;
  329. break;
  330. case '\n':
  331. Advance();
  332. has_line_terminator_before_next_ = true;
  333. token = Token::WHITESPACE;
  334. break;
  335. case '"': case '\'':
  336. token = ScanString();
  337. break;
  338. case '<':
  339. // < <= << <<= <!--
  340. Advance();
  341. if (c0_ == '=') {
  342. token = Select(Token::LTE);
  343. } else if (c0_ == '<') {
  344. token = Select('=', Token::ASSIGN_SHL, Token::SHL);
  345. } else if (c0_ == '!') {
  346. token = ScanHtmlComment();
  347. } else {
  348. token = Token::LT;
  349. }
  350. break;
  351. case '>':
  352. // > >= >> >>= >>> >>>=
  353. Advance();
  354. if (c0_ == '=') {
  355. token = Select(Token::GTE);
  356. } else if (c0_ == '>') {
  357. // >> >>= >>> >>>=
  358. Advance();
  359. if (c0_ == '=') {
  360. token = Select(Token::ASSIGN_SAR);
  361. } else if (c0_ == '>') {
  362. token = Select('=', Token::ASSIGN_SHR, Token::SHR);
  363. } else {
  364. token = Token::SAR;
  365. }
  366. } else {
  367. token = Token::GT;
  368. }
  369. break;
  370. case '=':
  371. // = == ===
  372. Advance();
  373. if (c0_ == '=') {
  374. token = Select('=', Token::EQ_STRICT, Token::EQ);
  375. } else {
  376. token = Token::ASSIGN;
  377. }
  378. break;
  379. case '!':
  380. // ! != !==
  381. Advance();
  382. if (c0_ == '=') {
  383. token = Select('=', Token::NE_STRICT, Token::NE);
  384. } else {
  385. token = Token::NOT;
  386. }
  387. break;
  388. case '+':
  389. // + ++ +=
  390. Advance();
  391. if (c0_ == '+') {
  392. token = Select(Token::INC);
  393. } else if (c0_ == '=') {
  394. token = Select(Token::ASSIGN_ADD);
  395. } else {
  396. token = Token::ADD;
  397. }
  398. break;
  399. case '-':
  400. // - -- --> -=
  401. Advance();
  402. if (c0_ == '-') {
  403. Advance();
  404. if (c0_ == '>' && has_line_terminator_before_next_) {
  405. // For compatibility with SpiderMonkey, we skip lines that
  406. // start with an HTML comment end '-->'.
  407. token = SkipSingleLineComment();
  408. } else {
  409. token = Token::DEC;
  410. }
  411. } else if (c0_ == '=') {
  412. token = Select(Token::ASSIGN_SUB);
  413. } else {
  414. token = Token::SUB;
  415. }
  416. break;
  417. case '*':
  418. // * *=
  419. token = Select('=', Token::ASSIGN_MUL, Token::MUL);
  420. break;
  421. case '%':
  422. // % %=
  423. token = Select('=', Token::ASSIGN_MOD, Token::MOD);
  424. break;
  425. case '/':
  426. // / // /* /=
  427. Advance();
  428. if (c0_ == '/') {
  429. token = SkipSingleLineComment();
  430. } else if (c0_ == '*') {
  431. token = SkipMultiLineComment();
  432. } else if (c0_ == '=') {
  433. token = Select(Token::ASSIGN_DIV);
  434. } else {
  435. token = Token::DIV;
  436. }
  437. break;
  438. case '&':
  439. // & && &=
  440. Advance();
  441. if (c0_ == '&') {
  442. token = Select(Token::AND);
  443. } else if (c0_ == '=') {
  444. token = Select(Token::ASSIGN_BIT_AND);
  445. } else {
  446. token = Token::BIT_AND;
  447. }
  448. break;
  449. case '|':
  450. // | || |=
  451. Advance();
  452. if (c0_ == '|') {
  453. token = Select(Token::OR);
  454. } else if (c0_ == '=') {
  455. token = Select(Token::ASSIGN_BIT_OR);
  456. } else {
  457. token = Token::BIT_OR;
  458. }
  459. break;
  460. case '^':
  461. // ^ ^=
  462. token = Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR);
  463. break;
  464. case '.':
  465. // . Number
  466. Advance();
  467. if (IsDecimalDigit(c0_)) {
  468. token = ScanNumber(true);
  469. } else {
  470. token = Token::PERIOD;
  471. }
  472. break;
  473. case ':':
  474. token = Select(Token::COLON);
  475. break;
  476. case ';':
  477. token = Select(Token::SEMICOLON);
  478. break;
  479. case ',':
  480. token = Select(Token::COMMA);
  481. break;
  482. case '(':
  483. token = Select(Token::LPAREN);
  484. break;
  485. case ')':
  486. token = Select(Token::RPAREN);
  487. break;
  488. case '[':
  489. token = Select(Token::LBRACK);
  490. break;
  491. case ']':
  492. token = Select(Token::RBRACK);
  493. break;
  494. case '{':
  495. token = Select(Token::LBRACE);
  496. break;
  497. case '}':
  498. token = Select(Token::RBRACE);
  499. break;
  500. case '?':
  501. token = Select(Token::CONDITIONAL);
  502. break;
  503. case '~':
  504. token = Select(Token::BIT_NOT);
  505. break;
  506. default:
  507. if (unicode_cache_->IsIdentifierStart(c0_)) {
  508. token = ScanIdentifierOrKeyword();
  509. } else if (IsDecimalDigit(c0_)) {
  510. token = ScanNumber(false);
  511. } else if (SkipWhiteSpace()) {
  512. token = Token::WHITESPACE;
  513. } else if (c0_ < 0) {
  514. token = Token::EOS;
  515. } else {
  516. token = Select(Token::ILLEGAL);
  517. }
  518. break;
  519. }
  520. // Continue scanning for tokens as long as we're just skipping
  521. // whitespace.
  522. } while (token == Token::WHITESPACE);
  523. next_.location.end_pos = source_pos();
  524. next_.token = token;
  525. }
  526. void Scanner::SeekForward(int pos) {
  527. // After this call, we will have the token at the given position as
  528. // the "next" token. The "current" token will be invalid.
  529. if (pos == next_.location.beg_pos) return;
  530. int current_pos = source_pos();
  531. ASSERT_EQ(next_.location.end_pos, current_pos);
  532. // Positions inside the lookahead token aren't supported.
  533. ASSERT(pos >= current_pos);
  534. if (pos != current_pos) {
  535. source_->SeekForward(pos - source_->pos());
  536. Advance();
  537. // This function is only called to seek to the location
  538. // of the end of a function (at the "}" token). It doesn't matter
  539. // whether there was a line terminator in the part we skip.
  540. has_line_terminator_before_next_ = false;
  541. has_multiline_comment_before_next_ = false;
  542. }
  543. Scan();
  544. }
  545. bool Scanner::ScanEscape() {
  546. uc32 c = c0_;
  547. Advance();
  548. // Skip escaped newlines.
  549. if (unicode_cache_->IsLineTerminator(c)) {
  550. // Allow CR+LF newlines in multiline string literals.
  551. if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance();
  552. // Allow LF+CR newlines in multiline string literals.
  553. if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance();
  554. return true;
  555. }
  556. switch (c) {
  557. case '\'': // fall through
  558. case '"' : // fall through
  559. case '\\': break;
  560. case 'b' : c = '\b'; break;
  561. case 'f' : c = '\f'; break;
  562. case 'n' : c = '\n'; break;
  563. case 'r' : c = '\r'; break;
  564. case 't' : c = '\t'; break;
  565. case 'u' : {
  566. c = ScanHexNumber(4);
  567. if (c < 0) return false;
  568. break;
  569. }
  570. case 'v' : c = '\v'; break;
  571. case 'x' : {
  572. c = ScanHexNumber(2);
  573. if (c < 0) return false;
  574. break;
  575. }
  576. case '0' : // fall through
  577. case '1' : // fall through
  578. case '2' : // fall through
  579. case '3' : // fall through
  580. case '4' : // fall through
  581. case '5' : // fall through
  582. case '6' : // fall through
  583. case '7' : c = ScanOctalEscape(c, 2); break;
  584. }
  585. // According to ECMA-262, section 7.8.4, characters not covered by the
  586. // above cases should be illegal, but they are commonly handled as
  587. // non-escaped characters by JS VMs.
  588. AddLiteralChar(c);
  589. return true;
  590. }
  591. // Octal escapes of the forms '\0xx' and '\xxx' are not a part of
  592. // ECMA-262. Other JS VMs support them.
  593. uc32 Scanner::ScanOctalEscape(uc32 c, int length) {
  594. uc32 x = c - '0';
  595. int i = 0;
  596. for (; i < length; i++) {
  597. int d = c0_ - '0';
  598. if (d < 0 || d > 7) break;
  599. int nx = x * 8 + d;
  600. if (nx >= 256) break;
  601. x = nx;
  602. Advance();
  603. }
  604. // Anything except '\0' is an octal escape sequence, illegal in strict mode.
  605. // Remember the position of octal escape sequences so that an error
  606. // can be reported later (in strict mode).
  607. // We don't report the error immediately, because the octal escape can
  608. // occur before the "use strict" directive.
  609. if (c != '0' || i > 0) {
  610. octal_pos_ = Location(source_pos() - i - 1, source_pos() - 1);
  611. }
  612. return x;
  613. }
  614. Token::Value Scanner::ScanString() {
  615. uc32 quote = c0_;
  616. Advance(); // consume quote
  617. LiteralScope literal(this);
  618. while (c0_ != quote && c0_ >= 0
  619. && !unicode_cache_->IsLineTerminator(c0_)) {
  620. uc32 c = c0_;
  621. Advance();
  622. if (c == '\\') {
  623. if (c0_ < 0 || !ScanEscape()) return Token::ILLEGAL;
  624. } else {
  625. AddLiteralChar(c);
  626. }
  627. }
  628. if (c0_ != quote) return Token::ILLEGAL;
  629. literal.Complete();
  630. Advance(); // consume quote
  631. return Token::STRING;
  632. }
  633. void Scanner::ScanDecimalDigits() {
  634. while (IsDecimalDigit(c0_))
  635. AddLiteralCharAdvance();
  636. }
  637. Token::Value Scanner::ScanNumber(bool seen_period) {
  638. ASSERT(IsDecimalDigit(c0_)); // the first digit of the number or the fraction
  639. enum { DECIMAL, HEX, OCTAL, IMPLICIT_OCTAL, BINARY } kind = DECIMAL;
  640. LiteralScope literal(this);
  641. if (seen_period) {
  642. // we have already seen a decimal point of the float
  643. AddLiteralChar('.');
  644. ScanDecimalDigits(); // we know we have at least one digit
  645. } else {
  646. // if the first character is '0' we must check for octals and hex
  647. if (c0_ == '0') {
  648. int start_pos = source_pos(); // For reporting octal positions.
  649. AddLiteralCharAdvance();
  650. // either 0, 0exxx, 0Exxx, 0.xxx, a hex number, a binary number or
  651. // an octal number.
  652. if (c0_ == 'x' || c0_ == 'X') {
  653. // hex number
  654. kind = HEX;
  655. AddLiteralCharAdvance();
  656. if (!IsHexDigit(c0_)) {
  657. // we must have at least one hex digit after 'x'/'X'
  658. return Token::ILLEGAL;
  659. }
  660. while (IsHexDigit(c0_)) {
  661. AddLiteralCharAdvance();
  662. }
  663. } else if (harmony_numeric_literals_ && (c0_ == 'o' || c0_ == 'O')) {
  664. kind = OCTAL;
  665. AddLiteralCharAdvance();
  666. if (!IsOctalDigit(c0_)) {
  667. // we must have at least one octal digit after 'o'/'O'
  668. return Token::ILLEGAL;
  669. }
  670. while (IsOctalDigit(c0_)) {
  671. AddLiteralCharAdvance();
  672. }
  673. } else if (harmony_numeric_literals_ && (c0_ == 'b' || c0_ == 'B')) {
  674. kind = BINARY;
  675. AddLiteralCharAdvance();
  676. if (!IsBinaryDigit(c0_)) {
  677. // we must have at least one binary digit after 'b'/'B'
  678. return Token::ILLEGAL;
  679. }
  680. while (IsBinaryDigit(c0_)) {
  681. AddLiteralCharAdvance();
  682. }
  683. } else if ('0' <= c0_ && c0_ <= '7') {
  684. // (possible) octal number
  685. kind = IMPLICIT_OCTAL;
  686. while (true) {
  687. if (c0_ == '8' || c0_ == '9') {
  688. kind = DECIMAL;
  689. break;
  690. }
  691. if (c0_ < '0' || '7' < c0_) {
  692. // Octal literal finished.
  693. octal_pos_ = Location(start_pos, source_pos());
  694. break;
  695. }
  696. AddLiteralCharAdvance();
  697. }
  698. }
  699. }
  700. // Parse decimal digits and allow trailing fractional part.
  701. if (kind == DECIMAL) {
  702. ScanDecimalDigits(); // optional
  703. if (c0_ == '.') {
  704. AddLiteralCharAdvance();
  705. ScanDecimalDigits(); // optional
  706. }
  707. }
  708. }
  709. // scan exponent, if any
  710. if (c0_ == 'e' || c0_ == 'E') {
  711. ASSERT(kind != HEX); // 'e'/'E' must be scanned as part of the hex number
  712. if (kind != DECIMAL) return Token::ILLEGAL;
  713. // scan exponent
  714. AddLiteralCharAdvance();
  715. if (c0_ == '+' || c0_ == '-')
  716. AddLiteralCharAdvance();
  717. if (!IsDecimalDigit(c0_)) {
  718. // we must have at least one decimal digit after 'e'/'E'
  719. return Token::ILLEGAL;
  720. }
  721. ScanDecimalDigits();
  722. }
  723. // The source character immediately following a numeric literal must
  724. // not be an identifier start or a decimal digit; see ECMA-262
  725. // section 7.8.3, page 17 (note that we read only one decimal digit
  726. // if the value is 0).
  727. if (IsDecimalDigit(c0_) || unicode_cache_->IsIdentifierStart(c0_))
  728. return Token::ILLEGAL;
  729. literal.Complete();
  730. return Token::NUMBER;
  731. }
  732. uc32 Scanner::ScanIdentifierUnicodeEscape() {
  733. Advance();
  734. if (c0_ != 'u') return -1;
  735. Advance();
  736. uc32 result = ScanHexNumber(4);
  737. if (result < 0) PushBack('u');
  738. return result;
  739. }
  // --------------------------------------------------------------------------
  // Keyword Matcher

  // List of keywords, grouped by first character. KEYWORD_GROUP(ch) opens
  // the group for first character |ch|; KEYWORD(string, token) names one
  // keyword of that group and the token it maps to. Some token expressions
  // refer to |harmony_scoping| and |harmony_modules|, which therefore have
  // to be in scope wherever the list is expanded.
  #define KEYWORDS(KEYWORD_GROUP, KEYWORD)                                   \
    KEYWORD_GROUP('b')                                                       \
      KEYWORD("break", Token::BREAK)                                         \
    KEYWORD_GROUP('c')                                                       \
      KEYWORD("case", Token::CASE)                                           \
      KEYWORD("catch", Token::CATCH)                                         \
      KEYWORD("class", Token::FUTURE_RESERVED_WORD)                          \
      KEYWORD("const", Token::CONST)                                         \
      KEYWORD("continue", Token::CONTINUE)                                   \
    KEYWORD_GROUP('d')                                                       \
      KEYWORD("debugger", Token::DEBUGGER)                                   \
      KEYWORD("default", Token::DEFAULT)                                     \
      KEYWORD("delete", Token::DELETE)                                       \
      KEYWORD("do", Token::DO)                                               \
    KEYWORD_GROUP('e')                                                       \
      KEYWORD("else", Token::ELSE)                                           \
      KEYWORD("enum", Token::FUTURE_RESERVED_WORD)                           \
      KEYWORD("export", harmony_modules                                      \
                        ? Token::EXPORT : Token::FUTURE_RESERVED_WORD)       \
      KEYWORD("extends", Token::FUTURE_RESERVED_WORD)                        \
    KEYWORD_GROUP('f')                                                       \
      KEYWORD("false", Token::FALSE_LITERAL)                                 \
      KEYWORD("finally", Token::FINALLY)                                     \
      KEYWORD("for", Token::FOR)                                             \
      KEYWORD("function", Token::FUNCTION)                                   \
    KEYWORD_GROUP('i')                                                       \
      KEYWORD("if", Token::IF)                                               \
      KEYWORD("implements", Token::FUTURE_STRICT_RESERVED_WORD)              \
      KEYWORD("import", harmony_modules                                      \
                        ? Token::IMPORT : Token::FUTURE_RESERVED_WORD)       \
      KEYWORD("in", Token::IN)                                               \
      KEYWORD("instanceof", Token::INSTANCEOF)                               \
      KEYWORD("interface", Token::FUTURE_STRICT_RESERVED_WORD)               \
    KEYWORD_GROUP('l')                                                       \
      KEYWORD("let", harmony_scoping                                         \
                     ? Token::LET : Token::FUTURE_STRICT_RESERVED_WORD)      \
    KEYWORD_GROUP('n')                                                       \
      KEYWORD("new", Token::NEW)                                             \
      KEYWORD("null", Token::NULL_LITERAL)                                   \
    KEYWORD_GROUP('p')                                                       \
      KEYWORD("package", Token::FUTURE_STRICT_RESERVED_WORD)                 \
      KEYWORD("private", Token::FUTURE_STRICT_RESERVED_WORD)                 \
      KEYWORD("protected", Token::FUTURE_STRICT_RESERVED_WORD)               \
      KEYWORD("public", Token::FUTURE_STRICT_RESERVED_WORD)                  \
    KEYWORD_GROUP('r')                                                       \
      KEYWORD("return", Token::RETURN)                                       \
    KEYWORD_GROUP('s')                                                       \
      KEYWORD("static", Token::FUTURE_STRICT_RESERVED_WORD)                  \
      KEYWORD("super", Token::FUTURE_RESERVED_WORD)                          \
      KEYWORD("switch", Token::SWITCH)                                       \
    KEYWORD_GROUP('t')                                                       \
      KEYWORD("this", Token::THIS)                                           \
      KEYWORD("throw", Token::THROW)                                         \
      KEYWORD("true", Token::TRUE_LITERAL)                                   \
      KEYWORD("try", Token::TRY)                                             \
      KEYWORD("typeof", Token::TYPEOF)                                       \
    KEYWORD_GROUP('v')                                                       \
      KEYWORD("var", Token::VAR)                                             \
      KEYWORD("void", Token::VOID)                                           \
    KEYWORD_GROUP('w')                                                       \
      KEYWORD("while", Token::WHILE)                                         \
      KEYWORD("with", Token::WITH)                                           \
    KEYWORD_GROUP('y')                                                       \
      KEYWORD("yield", Token::YIELD)
  806. static Token::Value KeywordOrIdentifierToken(const char* input,
  807. int input_length,
  808. bool harmony_scoping,
  809. bool harmony_modules) {
  810. ASSERT(input_length >= 1);
  811. const int kMinLength = 2;
  812. const int kMaxLength = 10;
  813. if (input_length < kMinLength || input_length > kMaxLength) {
  814. return Token::IDENTIFIER;
  815. }
  816. switch (input[0]) {
  817. default:
  818. #define KEYWORD_GROUP_CASE(ch) \
  819. break; \
  820. case ch:
  821. #define KEYWORD(keyword, token) \
  822. { \
  823. /* 'keyword' is a char array, so sizeof(keyword) is */ \
  824. /* strlen(keyword) plus 1 for the NUL char. */ \
  825. const int keyword_length = sizeof(keyword) - 1; \
  826. STATIC_ASSERT(keyword_length >= kMinLength); \
  827. STATIC_ASSERT(keyword_length <= kMaxLength); \
  828. if (input_length == keyword_length && \
  829. input[1] == keyword[1] && \
  830. (keyword_length <= 2 || input[2] == keyword[2]) && \
  831. (keyword_length <= 3 || input[3] == keyword[3]) && \
  832. (keyword_length <= 4 || input[4] == keyword[4]) && \
  833. (keyword_length <= 5 || input[5] == keyword[5]) && \
  834. (keyword_length <= 6 || input[6] == keyword[6]) && \
  835. (keyword_length <= 7 || input[7] == keyword[7]) && \
  836. (keyword_length <= 8 || input[8] == keyword[8]) && \
  837. (keyword_length <= 9 || input[9] == keyword[9])) { \
  838. return token; \
  839. } \
  840. }
  841. KEYWORDS(KEYWORD_GROUP_CASE, KEYWORD)
  842. }
  843. return Token::IDENTIFIER;
  844. }
  845. Token::Value Scanner::ScanIdentifierOrKeyword() {
  846. ASSERT(unicode_cache_->IsIdentifierStart(c0_));
  847. LiteralScope literal(this);
  848. // Scan identifier start character.
  849. if (c0_ == '\\') {
  850. uc32 c = ScanIdentifierUnicodeEscape();
  851. // Only allow legal identifier start characters.
  852. if (c < 0 ||
  853. c == '\\' || // No recursive escapes.
  854. !unicode_cache_->IsIdentifierStart(c)) {
  855. return Token::ILLEGAL;
  856. }
  857. AddLiteralChar(c);
  858. return ScanIdentifierSuffix(&literal);
  859. }
  860. uc32 first_char = c0_;
  861. Advance();
  862. AddLiteralChar(first_char);
  863. // Scan the rest of the identifier characters.
  864. while (unicode_cache_->IsIdentifierPart(c0_)) {
  865. if (c0_ != '\\') {
  866. uc32 next_char = c0_;
  867. Advance();
  868. AddLiteralChar(next_char);
  869. continue;
  870. }
  871. // Fallthrough if no longer able to complete keyword.
  872. return ScanIdentifierSuffix(&literal);
  873. }
  874. literal.Complete();
  875. if (next_.literal_chars->is_ascii()) {
  876. Vector<const char> chars = next_.literal_chars->ascii_literal();
  877. return KeywordOrIdentifierToken(chars.start(),
  878. chars.length(),
  879. harmony_scoping_,
  880. harmony_modules_);
  881. }
  882. return Token::IDENTIFIER;
  883. }
  884. Token::Value Scanner::ScanIdentifierSuffix(LiteralScope* literal) {
  885. // Scan the rest of the identifier characters.
  886. while (unicode_cache_->IsIdentifierPart(c0_)) {
  887. if (c0_ == '\\') {
  888. uc32 c = ScanIdentifierUnicodeEscape();
  889. // Only allow legal identifier part characters.
  890. if (c < 0 ||
  891. c == '\\' ||
  892. !unicode_cache_->IsIdentifierPart(c)) {
  893. return Token::ILLEGAL;
  894. }
  895. AddLiteralChar(c);
  896. } else {
  897. AddLiteralChar(c0_);
  898. Advance();
  899. }
  900. }
  901. literal->Complete();
  902. return Token::IDENTIFIER;
  903. }
  904. bool Scanner::ScanRegExpPattern(bool seen_equal) {
  905. // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags
  906. bool in_character_class = false;
  907. // Previous token is either '/' or '/=', in the second case, the
  908. // pattern starts at =.
  909. next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1);
  910. next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0);
  911. // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,
  912. // the scanner should pass uninterpreted bodies to the RegExp
  913. // constructor.
  914. LiteralScope literal(this);
  915. if (seen_equal) {
  916. AddLiteralChar('=');
  917. }
  918. while (c0_ != '/' || in_character_class) {
  919. if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false;
  920. if (c0_ == '\\') { // Escape sequence.
  921. AddLiteralCharAdvance();
  922. if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false;
  923. AddLiteralCharAdvance();
  924. // If the escape allows more characters, i.e., \x??, \u????, or \c?,
  925. // only "safe" characters are allowed (letters, digits, underscore),
  926. // otherwise the escape isn't valid and the invalid character has
  927. // its normal meaning. I.e., we can just continue scanning without
  928. // worrying whether the following characters are part of the escape
  929. // or not, since any '/', '\\' or '[' is guaranteed to not be part
  930. // of the escape sequence.
  931. // TODO(896): At some point, parse RegExps more throughly to capture
  932. // octal esacpes in strict mode.
  933. } else { // Unescaped character.
  934. if (c0_ == '[') in_character_class = true;
  935. if (c0_ == ']') in_character_class = false;
  936. AddLiteralCharAdvance();
  937. }
  938. }
  939. Advance(); // consume '/'
  940. literal.Complete();
  941. return true;
  942. }
  943. bool Scanner::ScanLiteralUnicodeEscape() {
  944. ASSERT(c0_ == '\\');
  945. uc32 chars_read[6] = {'\\', 'u', 0, 0, 0, 0};
  946. Advance();
  947. int i = 1;
  948. if (c0_ == 'u') {
  949. i++;
  950. while (i < 6) {
  951. Advance();
  952. if (!IsHexDigit(c0_)) break;
  953. chars_read[i] = c0_;
  954. i++;
  955. }
  956. }
  957. if (i < 6) {
  958. // Incomplete escape. Undo all advances and return false.
  959. while (i > 0) {
  960. i--;
  961. PushBack(chars_read[i]);
  962. }
  963. return false;
  964. }
  965. // Complete escape. Add all chars to current literal buffer.
  966. for (int i = 0; i < 6; i++) {
  967. AddLiteralChar(chars_read[i]);
  968. }
  969. return true;
  970. }
  971. bool Scanner::ScanRegExpFlags() {
  972. // Scan regular expression flags.
  973. LiteralScope literal(this);
  974. while (unicode_cache_->IsIdentifierPart(c0_)) {
  975. if (c0_ != '\\') {
  976. AddLiteralCharAdvance();
  977. } else {
  978. if (!ScanLiteralUnicodeEscape()) {
  979. break;
  980. }
  981. Advance();
  982. }
  983. }
  984. literal.Complete();
  985. next_.location.end_pos = source_pos() - 1;
  986. return true;
  987. }
  988. } } // namespace v8::internal