PageRenderTime 66ms CodeModel.GetById 20ms RepoModel.GetById 0ms app.codeStats 0ms

/core/src/main/java/org/jruby/lexer/yacc/RubyLexer.java

https://github.com/davidraj/jruby
Java | 2739 lines | 2219 code | 327 blank | 193 comment | 570 complexity | 004ee903289b702c9e9e075bc1b2f7ce MD5 | raw file
Possible License(s): GPL-3.0
  1. /*
  2. ***** BEGIN LICENSE BLOCK *****
  3. * Version: EPL 1.0/GPL 2.0/LGPL 2.1
  4. *
  5. * The contents of this file are subject to the Eclipse Public
  6. * License Version 1.0 (the "License"); you may not use this file
  7. * except in compliance with the License. You may obtain a copy of
  8. * the License at http://www.eclipse.org/legal/epl-v10.html
  9. *
  10. * Software distributed under the License is distributed on an "AS
  11. * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
  12. * implied. See the License for the specific language governing
  13. * rights and limitations under the License.
  14. *
  15. * Copyright (C) 2002 Benoit Cerrina <b.cerrina@wanadoo.fr>
  16. * Copyright (C) 2002-2004 Anders Bengtsson <ndrsbngtssn@yahoo.se>
  17. * Copyright (C) 2002-2004 Jan Arne Petersen <jpetersen@uni-bonn.de>
  18. * Copyright (C) 2004-2006 Thomas E Enebo <enebo@acm.org>
  19. * Copyright (C) 2004 Stefan Matthias Aust <sma@3plus4.de>
  20. * Copyright (C) 2004-2005 David Corbin <dcorbin@users.sourceforge.net>
  21. * Copyright (C) 2005 Zach Dennis <zdennis@mktec.com>
  22. * Copyright (C) 2006 Thomas Corbat <tcorbat@hsr.ch>
  23. *
  24. * Alternatively, the contents of this file may be used under the terms of
  25. * either of the GNU General Public License Version 2 or later (the "GPL"),
  26. * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
  27. * in which case the provisions of the GPL or the LGPL are applicable instead
  28. * of those above. If you wish to allow use of your version of this file only
  29. * under the terms of either the GPL or the LGPL, and not to allow others to
  30. * use your version of this file under the terms of the EPL, indicate your
  31. * decision by deleting the provisions above and replace them with the notice
  32. * and other provisions required by the GPL or the LGPL. If you do not delete
  33. * the provisions above, a recipient may use your version of this file under
  34. * the terms of any one of the EPL, the GPL or the LGPL.
  35. ***** END LICENSE BLOCK *****/
  36. package org.jruby.lexer.yacc;
  37. import java.io.IOException;
  38. import java.math.BigInteger;
  39. import java.util.HashMap;
  40. import org.jcodings.Encoding;
  41. import org.jcodings.specific.ASCIIEncoding;
  42. import org.jcodings.specific.USASCIIEncoding;
  43. import org.jcodings.specific.UTF8Encoding;
  44. import org.joni.Matcher;
  45. import org.joni.Option;
  46. import org.joni.Regex;
  47. import org.jruby.RubyRegexp;
  48. import org.jruby.ast.BackRefNode;
  49. import org.jruby.ast.BignumNode;
  50. import org.jruby.ast.ComplexNode;
  51. import org.jruby.ast.FixnumNode;
  52. import org.jruby.ast.FloatNode;
  53. import org.jruby.ast.Node;
  54. import org.jruby.ast.NthRefNode;
  55. import org.jruby.ast.RationalNode;
  56. import org.jruby.ast.StrNode;
  57. import org.jruby.common.IRubyWarnings;
  58. import org.jruby.common.IRubyWarnings.ID;
  59. import org.jruby.lexer.yacc.SyntaxException.PID;
  60. import org.jruby.parser.ParserSupport;
  61. import org.jruby.parser.Tokens;
  62. import org.jruby.util.ByteList;
  63. import org.jruby.util.SafeDoubleParser;
  64. import org.jruby.util.StringSupport;
  /** This is a port of the MRI lexer to Java; it is compatible with Ruby 1.8.1.
   */
  67. public class RubyLexer {
  68. public static final Encoding UTF8_ENCODING = UTF8Encoding.INSTANCE;
  69. public static final Encoding USASCII_ENCODING = USASCIIEncoding.INSTANCE;
  70. public static final Encoding ASCII8BIT_ENCODING = ASCIIEncoding.INSTANCE;
  71. private static final ByteList END_MARKER = new ByteList(new byte[] {'_', 'E', 'N', 'D', '_', '_'});
  72. private static final ByteList BEGIN_DOC_MARKER = new ByteList(new byte[] {'b', 'e', 'g', 'i', 'n'});
  73. private static final ByteList END_DOC_MARKER = new ByteList(new byte[] {'e', 'n', 'd'});
  74. private static final HashMap<String, Keyword> map;
  75. private static final int SUFFIX_R = 1<<0;
  76. private static final int SUFFIX_I = 1<<1;
  77. private static final int SUFFIX_ALL = 3;
  // Build the keyword lookup table. Every Keyword constant carries its own source
  // text in Keyword.name, so the table is derived from the enum itself rather than
  // from a second hand-maintained list of the same strings.
  static {
      map = new HashMap<String, Keyword>();
      for (Keyword keyword : Keyword.values()) {
          map.put(keyword.name, keyword);
      }
  }
  122. private Encoding encoding;
  123. public Encoding getEncoding() {
  124. return encoding;
  125. }
  126. private int getFloatToken(String number) {
  127. // FIXME: Rational support is needed here.
  128. double d;
  129. try {
  130. d = SafeDoubleParser.parseDouble(number);
  131. } catch (NumberFormatException e) {
  132. warnings.warn(ID.FLOAT_OUT_OF_RANGE, getPosition(), "Float " + number + " out of range.");
  133. d = number.startsWith("-") ? Double.NEGATIVE_INFINITY : Double.POSITIVE_INFINITY;
  134. }
  135. yaccValue = new FloatNode(getPosition(), d);
  136. return Tokens.tFLOAT;
  137. }
  138. private BignumNode newBignumNode(String value, int radix) {
  139. return new BignumNode(getPosition(), new BigInteger(value, radix));
  140. }
  141. private FixnumNode newFixnumNode(String value, int radix) throws NumberFormatException {
  142. return new FixnumNode(getPosition(), Long.parseLong(value, radix));
  143. }
  144. private RationalNode newRationalNode(String value, int radix) throws NumberFormatException {
  145. return new RationalNode(getPosition(), Long.parseLong(value, radix));
  146. }
  147. private ComplexNode newComplexNode(Node number) {
  148. return new ComplexNode(getPosition(), number);
  149. }
  150. private void ambiguousOperator(String op, String syn) {
  151. warnings.warn(ID.AMBIGUOUS_ARGUMENT, "`" + op + "' after local variable is interpreted as binary operator\nevent though it seems like \"" + syn + "\"");
  152. }
  153. private void warn_balanced(int c, boolean spaceSeen, String op, String syn) {
  154. if (false && last_state != LexState.EXPR_CLASS && last_state != LexState.EXPR_DOT &&
  155. last_state != LexState.EXPR_FNAME && last_state != LexState.EXPR_ENDFN &&
  156. last_state != LexState.EXPR_ENDARG && spaceSeen && !Character.isWhitespace(c)) {
  157. ambiguousOperator(op, syn);
  158. }
  159. }
  160. // FIXME: Also sucks that matchMarker will strip off valuable bytes and not work for this (could be a one-liner)
  161. private void detectUTF8BOM() throws IOException {
  162. int b1 = src.read();
  163. if (b1 == 0xef) {
  164. int b2 = src.read();
  165. if (b2 == 0xbb) {
  166. int b3 = src.read();
  167. if (b3 == 0xbf) {
  168. setEncoding(UTF8_ENCODING);
  169. } else {
  170. src.unread(b3);
  171. src.unread(b2);
  172. src.unread(b1);
  173. }
  174. } else {
  175. src.unread(b2);
  176. src.unread(b1);
  177. }
  178. } else {
  179. src.unread(b1);
  180. }
  181. }
  182. private int numberLiteralSuffix(int mask) throws IOException {
  183. int c = src.read();
  184. if (c == 'i') return (mask & SUFFIX_I) != 0 ? mask & SUFFIX_I : 0;
  185. if (c == 'r') {
  186. int result = 0;
  187. if ((mask & SUFFIX_R) != 0) result |= (mask & SUFFIX_R);
  188. if (src.peek('i') && (mask & SUFFIX_I) != 0) {
  189. c = src.read();
  190. result |= (mask & SUFFIX_I);
  191. }
  192. return result;
  193. }
  194. src.unread(c);
  195. return 0;
  196. }
  197. public enum Keyword {
  198. END ("end", Tokens.kEND, Tokens.kEND, LexState.EXPR_END),
  199. ELSE ("else", Tokens.kELSE, Tokens.kELSE, LexState.EXPR_BEG),
  200. CASE ("case", Tokens.kCASE, Tokens.kCASE, LexState.EXPR_BEG),
  201. ENSURE ("ensure", Tokens.kENSURE, Tokens.kENSURE, LexState.EXPR_BEG),
  202. MODULE ("module", Tokens.kMODULE, Tokens.kMODULE, LexState.EXPR_BEG),
  203. ELSIF ("elsif", Tokens.kELSIF, Tokens.kELSIF, LexState.EXPR_BEG),
  204. DEF ("def", Tokens.kDEF, Tokens.kDEF, LexState.EXPR_FNAME),
  205. RESCUE ("rescue", Tokens.kRESCUE, Tokens.kRESCUE_MOD, LexState.EXPR_MID),
  206. NOT ("not", Tokens.kNOT, Tokens.kNOT, LexState.EXPR_BEG),
  207. THEN ("then", Tokens.kTHEN, Tokens.kTHEN, LexState.EXPR_BEG),
  208. YIELD ("yield", Tokens.kYIELD, Tokens.kYIELD, LexState.EXPR_ARG),
  209. FOR ("for", Tokens.kFOR, Tokens.kFOR, LexState.EXPR_BEG),
  210. SELF ("self", Tokens.kSELF, Tokens.kSELF, LexState.EXPR_END),
  211. FALSE ("false", Tokens.kFALSE, Tokens.kFALSE, LexState.EXPR_END),
  212. RETRY ("retry", Tokens.kRETRY, Tokens.kRETRY, LexState.EXPR_END),
  213. RETURN ("return", Tokens.kRETURN, Tokens.kRETURN, LexState.EXPR_MID),
  214. TRUE ("true", Tokens.kTRUE, Tokens.kTRUE, LexState.EXPR_END),
  215. IF ("if", Tokens.kIF, Tokens.kIF_MOD, LexState.EXPR_BEG),
  216. DEFINED_P ("defined?", Tokens.kDEFINED, Tokens.kDEFINED, LexState.EXPR_ARG),
  217. SUPER ("super", Tokens.kSUPER, Tokens.kSUPER, LexState.EXPR_ARG),
  218. UNDEF ("undef", Tokens.kUNDEF, Tokens.kUNDEF, LexState.EXPR_FNAME),
  219. BREAK ("break", Tokens.kBREAK, Tokens.kBREAK, LexState.EXPR_MID),
  220. IN ("in", Tokens.kIN, Tokens.kIN, LexState.EXPR_BEG),
  221. DO ("do", Tokens.kDO, Tokens.kDO, LexState.EXPR_BEG),
  222. NIL ("nil", Tokens.kNIL, Tokens.kNIL, LexState.EXPR_END),
  223. UNTIL ("until", Tokens.kUNTIL, Tokens.kUNTIL_MOD, LexState.EXPR_BEG),
  224. UNLESS ("unless", Tokens.kUNLESS, Tokens.kUNLESS_MOD, LexState.EXPR_BEG),
  225. OR ("or", Tokens.kOR, Tokens.kOR, LexState.EXPR_BEG),
  226. NEXT ("next", Tokens.kNEXT, Tokens.kNEXT, LexState.EXPR_MID),
  227. WHEN ("when", Tokens.kWHEN, Tokens.kWHEN, LexState.EXPR_BEG),
  228. REDO ("redo", Tokens.kREDO, Tokens.kREDO, LexState.EXPR_END),
  229. AND ("and", Tokens.kAND, Tokens.kAND, LexState.EXPR_BEG),
  230. BEGIN ("begin", Tokens.kBEGIN, Tokens.kBEGIN, LexState.EXPR_BEG),
  231. __LINE__ ("__LINE__", Tokens.k__LINE__, Tokens.k__LINE__, LexState.EXPR_END),
  232. CLASS ("class", Tokens.kCLASS, Tokens.kCLASS, LexState.EXPR_CLASS),
  233. __FILE__("__FILE__", Tokens.k__FILE__, Tokens.k__FILE__, LexState.EXPR_END),
  234. LEND ("END", Tokens.klEND, Tokens.klEND, LexState.EXPR_END),
  235. LBEGIN ("BEGIN", Tokens.klBEGIN, Tokens.klBEGIN, LexState.EXPR_END),
  236. WHILE ("while", Tokens.kWHILE, Tokens.kWHILE_MOD, LexState.EXPR_BEG),
  237. ALIAS ("alias", Tokens.kALIAS, Tokens.kALIAS, LexState.EXPR_FNAME),
  238. __ENCODING__("__ENCODING__", Tokens.k__ENCODING__, Tokens.k__ENCODING__, LexState.EXPR_END);
  239. public final String name;
  240. public final int id0;
  241. public final int id1;
  242. public final LexState state;
  243. Keyword(String name, int id0, int id1, LexState state) {
  244. this.name = name;
  245. this.id0 = id0;
  246. this.id1 = id1;
  247. this.state = state;
  248. }
  249. }
  250. public enum LexState {
  251. EXPR_BEG, EXPR_END, EXPR_ARG, EXPR_CMDARG, EXPR_ENDARG, EXPR_MID,
  252. EXPR_FNAME, EXPR_DOT, EXPR_CLASS, EXPR_VALUE, EXPR_ENDFN
  253. }
  254. public static Keyword getKeyword(String str) {
  255. return (Keyword) map.get(str);
  256. }
  257. // Last token read via yylex().
  258. private int token;
  259. // Value of last token which had a value associated with it.
  260. Object yaccValue;
  261. // Stream of data that yylex() examines.
  262. private LexerSource src;
  263. // Used for tiny smidgen of grammar in lexer (see setParserSupport())
  264. private ParserSupport parserSupport = null;
  265. // What handles warnings
  266. private IRubyWarnings warnings;
  267. // Additional context surrounding tokens that both the lexer and
  268. // grammar use.
  269. private LexState lex_state;
  270. private LexState last_state;
  271. public ISourcePosition tokline;
  272. public void newtok() {
  273. tokline = getPosition();
  274. }
  275. // Tempory buffer to build up a potential token. Consumer takes responsibility to reset
  276. // this before use.
  277. private StringBuilder tokenBuffer = new StringBuilder(60);
  278. private StackState conditionState = new StackState();
  279. private StackState cmdArgumentState = new StackState();
  280. private StrTerm lex_strterm;
  281. public boolean commandStart;
  282. // Give a name to a value. Enebo: This should be used more.
  283. static final int EOF = -1;
  284. // ruby constants for strings (should this be moved somewhere else?)
  285. static final int STR_FUNC_ESCAPE=0x01;
  286. static final int STR_FUNC_EXPAND=0x02;
  287. static final int STR_FUNC_REGEXP=0x04;
  288. static final int STR_FUNC_QWORDS=0x08;
  289. static final int STR_FUNC_SYMBOL=0x10;
  290. // When the heredoc identifier specifies <<-EOF that indents before ident. are ok (the '-').
  291. static final int STR_FUNC_INDENT=0x20;
  292. private static final int str_squote = 0;
  293. private static final int str_dquote = STR_FUNC_EXPAND;
  294. private static final int str_xquote = STR_FUNC_EXPAND;
  295. private static final int str_regexp = STR_FUNC_REGEXP | STR_FUNC_ESCAPE | STR_FUNC_EXPAND;
  296. private static final int str_ssym = STR_FUNC_SYMBOL;
  297. private static final int str_dsym = STR_FUNC_SYMBOL | STR_FUNC_EXPAND;
  298. // Count of nested parentheses
  299. private int parenNest = 0;
  300. private int leftParenBegin = 0;
  301. public int incrementParenNest() {
  302. parenNest++;
  303. return parenNest;
  304. }
  305. public int getLeftParenBegin() {
  306. return leftParenBegin;
  307. }
  308. public void setLeftParenBegin(int value) {
  309. leftParenBegin = value;
  310. }
  /** Create a lexer in its freshly reset state; a source must be supplied via setSource(). */
  public RubyLexer() {
      this.reset();
  }
  314. public final void reset() {
  315. token = 0;
  316. tokline = null;
  317. yaccValue = null;
  318. src = null;
  319. setState(null);
  320. resetStacks();
  321. lex_strterm = null;
  322. commandStart = true;
  323. }
  324. public int nextToken() throws IOException {
  325. token = yylex();
  326. final ISourcePosition p = getPosition();
  327. return token == EOF ? 0 : token;
  328. }
  329. /**
  330. * Last token read from the lexer at the end of a call to yylex()
  331. *
  332. * @return last token read
  333. */
  334. public int token() {
  335. return token;
  336. }
  337. public StringBuilder getTokenBuffer() {
  338. return tokenBuffer;
  339. }
  340. /**
  341. * Value of last token (if it is a token which has a value).
  342. *
  343. * @return value of last value-laden token
  344. */
  345. public Object value() {
  346. return yaccValue;
  347. }
  348. /**
  349. * Get position information for Token/Node that follows node represented by startPosition
  350. * and current lexer location.
  351. *
  352. * @param startPosition previous node/token
  353. * @return a new position
  354. */
  355. public ISourcePosition getPosition(ISourcePosition startPosition) {
  356. return src.getPosition(startPosition);
  357. }
  358. public ISourcePosition getPosition() {
  359. return src.getPosition();
  360. }
  361. public String getCurrentLine() {
  362. return src.getCurrentLine();
  363. }
  364. /**
  365. * Parse must pass its support object for some check at bottom of
  366. * yylex(). Ruby does it this way as well (i.e. a little parsing
  367. * logic in the lexer).
  368. *
  369. * @param parserSupport
  370. */
  371. public void setParserSupport(ParserSupport parserSupport) {
  372. this.parserSupport = parserSupport;
  373. }
  374. private void setEncoding(ByteList name) {
  375. Encoding newEncoding = parserSupport.getConfiguration().getEncodingService().loadEncoding(name);
  376. if (newEncoding == null) {
  377. throw new SyntaxException(PID.UNKNOWN_ENCODING, getPosition(),
  378. null, "unknown encoding name: " + name.toString());
  379. }
  380. if (!newEncoding.isAsciiCompatible()) {
  381. throw new SyntaxException(PID.NOT_ASCII_COMPATIBLE, getPosition(),
  382. null, name.toString() + " is not ASCII compatible");
  383. }
  384. setEncoding(newEncoding);
  385. }
  386. public void setEncoding(Encoding encoding) {
  387. this.encoding = encoding;
  388. }
  389. /**
  390. * Allow the parser to set the source for its lexer.
  391. *
  392. * @param source where the lexer gets raw data
  393. */
  394. public void setSource(LexerSource source) {
  395. this.src = source;
  396. }
  397. public StrTerm getStrTerm() {
  398. return lex_strterm;
  399. }
  400. public void setStrTerm(StrTerm strterm) {
  401. this.lex_strterm = strterm;
  402. }
  403. public void resetStacks() {
  404. conditionState.reset();
  405. cmdArgumentState.reset();
  406. }
  407. public void setWarnings(IRubyWarnings warnings) {
  408. this.warnings = warnings;
  409. }
  410. private void printState() {
  411. if (lex_state == null) {
  412. System.out.println("NULL");
  413. } else {
  414. System.out.println(lex_state);
  415. }
  416. }
  417. public void setState(LexState state) {
  418. this.lex_state = state;
  419. // printState();
  420. }
  421. public StackState getCmdArgumentState() {
  422. return cmdArgumentState;
  423. }
  424. public StackState getConditionState() {
  425. return conditionState;
  426. }
  427. public void setValue(Object yaccValue) {
  428. this.yaccValue = yaccValue;
  429. }
  430. private boolean isNext_identchar() throws IOException {
  431. int c = src.read();
  432. src.unread(c);
  433. return c != EOF && (Character.isLetterOrDigit(c) || c == '_');
  434. }
  435. private boolean isBEG() {
  436. return lex_state == LexState.EXPR_BEG || lex_state == LexState.EXPR_MID ||
  437. lex_state == LexState.EXPR_CLASS || (lex_state == LexState.EXPR_VALUE);
  438. }
  439. private boolean isEND() {
  440. return lex_state == LexState.EXPR_END || lex_state == LexState.EXPR_ENDARG ||
  441. (lex_state == LexState.EXPR_ENDFN);
  442. }
  443. private boolean isARG() {
  444. return lex_state == LexState.EXPR_ARG || lex_state == LexState.EXPR_CMDARG;
  445. }
  446. private boolean isLabelPossible(boolean commandState) {
  447. return ((lex_state == LexState.EXPR_BEG || lex_state == LexState.EXPR_ENDFN) && !commandState) || isARG();
  448. }
  449. private boolean isSpaceArg(int c, boolean spaceSeen) {
  450. return isARG() && spaceSeen && !Character.isWhitespace(c);
  451. }
  452. private void determineExpressionState() {
  453. switch (lex_state) {
  454. case EXPR_FNAME: case EXPR_DOT:
  455. setState(LexState.EXPR_ARG);
  456. break;
  457. default:
  458. setState(LexState.EXPR_BEG);
  459. break;
  460. }
  461. }
  462. private Object getInteger(String value, int radix, int suffix) {
  463. Node literalValue = null;
  464. if ((suffix & SUFFIX_R) != 0) {
  465. literalValue = newRationalNode(value, radix);
  466. } else {
  467. try {
  468. literalValue = newFixnumNode(value, radix);
  469. } catch (NumberFormatException e) {
  470. literalValue = newBignumNode(value, radix);
  471. }
  472. }
  473. return (suffix & SUFFIX_I) != 0 ? newComplexNode(literalValue) : literalValue;
  474. }
  475. /**
  476. * @param c the character to test
  477. * @return true if character is a hex value (0-9a-f)
  478. */
  479. static boolean isHexChar(int c) {
  480. return Character.isDigit(c) || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F');
  481. }
  482. /**
  483. * @param c the character to test
  484. * @return true if character is an octal value (0-7)
  485. */
  486. static boolean isOctChar(int c) {
  487. return '0' <= c && c <= '7';
  488. }
  489. /**
  490. * This is a valid character for an identifier?
  491. *
  492. * @param c is character to be compared
  493. * @return whether c is an identifier or not
  494. *
  495. * mri: is_identchar
  496. */
  497. public boolean isIdentifierChar(int c) {
  498. return Character.isLetterOrDigit(c) || c == '_' || isMultiByteChar(c);
  499. }
  500. public boolean isASCII(int c) {
  501. return !isMultiByteChar(c);
  502. }
  503. /**
  504. * Is this a multibyte character from a multibyte encoding?
  505. *
  506. * @param c
  507. * @return whether c is an multibyte char or not
  508. */
  509. protected boolean isMultiByteChar(int c) {
  510. return encoding.codeToMbcLength(c) != 1;
  511. }
  512. // STR_NEW3/parser_str_new
  513. public StrNode createStrNode(ISourcePosition position, ByteList buffer, int flags) {
  514. Encoding bufferEncoding = buffer.getEncoding();
  515. int codeRange = StringSupport.codeRangeScan(bufferEncoding, buffer);
  516. if ((flags & RubyLexer.STR_FUNC_REGEXP) == 0 && bufferEncoding.isAsciiCompatible()) {
  517. // If we have characters outside 7-bit range and we are still ascii then change to ascii-8bit
  518. if (codeRange == StringSupport.CR_7BIT) {
  519. // Do nothing like MRI
  520. } else if (getEncoding() == RubyLexer.USASCII_ENCODING &&
  521. bufferEncoding != RubyLexer.UTF8_ENCODING) {
  522. codeRange = ParserSupport.associateEncoding(buffer, RubyLexer.ASCII8BIT_ENCODING, codeRange);
  523. }
  524. }
  525. return new StrNode(position, buffer, codeRange);
  526. }
  527. /**
  528. * What type/kind of quote are we dealing with?
  529. *
  530. * @param c first character the the quote construct
  531. * @return a token that specifies the quote type
  532. */
  533. private int parseQuote(int c) throws IOException {
  534. int begin, end;
  535. boolean shortHand;
  536. // Short-hand (e.g. %{,%.,%!,... versus %Q{).
  537. if (!Character.isLetterOrDigit(c)) {
  538. begin = c;
  539. c = 'Q';
  540. shortHand = true;
  541. // Long-hand (e.g. %Q{}).
  542. } else {
  543. shortHand = false;
  544. begin = src.read();
  545. if (Character.isLetterOrDigit(begin) /* no mb || ismbchar(term)*/) {
  546. throw new SyntaxException(PID.STRING_UNKNOWN_TYPE, getPosition(), getCurrentLine(), "unknown type of %string");
  547. }
  548. }
  549. if (c == EOF || begin == EOF) {
  550. throw new SyntaxException(PID.STRING_HITS_EOF, getPosition(), getCurrentLine(), "unterminated quoted string meets end of file");
  551. }
  552. // Figure end-char. '\0' is special to indicate begin=end and that no nesting?
  553. switch(begin) {
  554. case '(': end = ')'; break;
  555. case '[': end = ']'; break;
  556. case '{': end = '}'; break;
  557. case '<': end = '>'; break;
  558. default:
  559. end = begin;
  560. begin = '\0';
  561. }
  562. switch (c) {
  563. case 'Q':
  564. lex_strterm = new StringTerm(str_dquote, begin ,end);
  565. yaccValue = "%"+ (shortHand ? (""+end) : ("" + c + begin));
  566. return Tokens.tSTRING_BEG;
  567. case 'q':
  568. lex_strterm = new StringTerm(str_squote, begin, end);
  569. yaccValue = "%"+c+begin;
  570. return Tokens.tSTRING_BEG;
  571. case 'W':
  572. lex_strterm = new StringTerm(str_dquote | STR_FUNC_QWORDS, begin, end);
  573. do {c = src.read();} while (Character.isWhitespace(c));
  574. src.unread(c);
  575. yaccValue = "%"+c+begin;
  576. return Tokens.tWORDS_BEG;
  577. case 'w':
  578. lex_strterm = new StringTerm(/* str_squote | */ STR_FUNC_QWORDS, begin, end);
  579. do {c = src.read();} while (Character.isWhitespace(c));
  580. src.unread(c);
  581. yaccValue = "%"+c+begin;
  582. return Tokens.tQWORDS_BEG;
  583. case 'x':
  584. lex_strterm = new StringTerm(str_xquote, begin, end);
  585. yaccValue = "%"+c+begin;
  586. return Tokens.tXSTRING_BEG;
  587. case 'r':
  588. lex_strterm = new StringTerm(str_regexp, begin, end);
  589. yaccValue = "%"+c+begin;
  590. return Tokens.tREGEXP_BEG;
  591. case 's':
  592. lex_strterm = new StringTerm(str_ssym, begin, end);
  593. setState(LexState.EXPR_FNAME);
  594. yaccValue = "%"+c+begin;
  595. return Tokens.tSYMBEG;
  596. case 'I':
  597. lex_strterm = new StringTerm(str_dquote | STR_FUNC_QWORDS, begin, end);
  598. do {c = src.read();} while (Character.isWhitespace(c));
  599. src.unread(c);
  600. yaccValue = "%" + c + begin;
  601. return Tokens.tSYMBOLS_BEG;
  602. case 'i':
  603. lex_strterm = new StringTerm(/* str_squote | */STR_FUNC_QWORDS, begin, end);
  604. do {c = src.read();} while (Character.isWhitespace(c));
  605. src.unread(c);
  606. yaccValue = "%" + c + begin;
  607. return Tokens.tQSYMBOLS_BEG;
  608. default:
  609. throw new SyntaxException(PID.STRING_UNKNOWN_TYPE,
  610. getPosition(), getCurrentLine(), "unknown type of %string");
  611. }
  612. }
  613. private int hereDocumentIdentifier() throws IOException {
  614. int c = src.read();
  615. int term;
  616. int func = 0;
  617. if (c == '-') {
  618. c = src.read();
  619. func = STR_FUNC_INDENT;
  620. }
  621. ByteList markerValue;
  622. if (c == '\'' || c == '"' || c == '`') {
  623. if (c == '\'') {
  624. func |= str_squote;
  625. } else if (c == '"') {
  626. func |= str_dquote;
  627. } else {
  628. func |= str_xquote;
  629. }
  630. newtok();
  631. markerValue = new ByteList();
  632. term = c;
  633. while ((c = src.read()) != EOF && c != term) {
  634. markerValue.append(c);
  635. }
  636. if (c == EOF) {
  637. throw new SyntaxException(PID.STRING_MARKER_MISSING, getPosition(),
  638. getCurrentLine(), "unterminated here document identifier");
  639. }
  640. } else {
  641. if (!isIdentifierChar(c)) {
  642. src.unread(c);
  643. if ((func & STR_FUNC_INDENT) != 0) {
  644. src.unread('-');
  645. }
  646. return 0;
  647. }
  648. newtok();
  649. markerValue = new ByteList();
  650. term = '"';
  651. func |= str_dquote;
  652. do {
  653. markerValue.append(c);
  654. } while ((c = src.read()) != EOF && isIdentifierChar(c));
  655. src.unread(c);
  656. }
  657. ByteList lastLine = src.readLineBytes();
  658. lastLine.append('\n');
  659. lex_strterm = new HeredocTerm(markerValue, func, lastLine);
  660. if (term == '`') {
  661. yaccValue = "`";
  662. return Tokens.tXSTRING_BEG;
  663. }
  664. yaccValue = "\"";
  665. // Hacky: Advance position to eat newline here....
  666. getPosition();
  667. return Tokens.tSTRING_BEG;
  668. }
  669. private void arg_ambiguous() {
  670. if (warnings.isVerbose()) warnings.warning(ID.AMBIGUOUS_ARGUMENT, getPosition(), "Ambiguous first argument; make sure.");
  671. }
  672. /* MRI: magic_comment_marker */
  673. /* This impl is a little sucky. We basically double scan the same bytelist twice. Once here
  674. * and once in parseMagicComment.
  675. */
  676. private int magicCommentMarker(ByteList str, int begin) {
  677. int i = begin;
  678. int len = str.length();
  679. while (i < len) {
  680. switch (str.charAt(i)) {
  681. case '-':
  682. if (i >= 2 && str.charAt(i - 1) == '*' && str.charAt(i - 2) == '-') return i + 1;
  683. i += 2;
  684. break;
  685. case '*':
  686. if (i + 1 >= len) return -1;
  687. if (str.charAt(i + 1) != '-') {
  688. i += 4;
  689. } else if (str.charAt(i - 1) != '-') {
  690. i += 2;
  691. } else {
  692. return i + 2;
  693. }
  694. break;
  695. default:
  696. i += 3;
  697. break;
  698. }
  699. }
  700. return -1;
  701. }
  702. private boolean magicCommentSpecialChar(char c) {
  703. switch (c) {
  704. case '\'': case '"': case ':': case ';': return true;
  705. }
  706. return false;
  707. }
  708. private static final String magicString = "([^\\s\'\":;]+)\\s*:\\s*(\"(?:\\\\.|[^\"])*\"|[^\"\\s;]+)[\\s;]*";
  709. private static final Regex magicRegexp = new Regex(magicString.getBytes(), 0, magicString.length(), 0, Encoding.load("ASCII"));
  710. // MRI: parser_magic_comment
  711. protected boolean parseMagicComment(ByteList magicLine) throws IOException {
  712. int length = magicLine.length();
  713. if (length <= 7) return false;
  714. int beg = magicCommentMarker(magicLine, 0);
  715. if (beg < 0) return false;
  716. int end = magicCommentMarker(magicLine, beg);
  717. if (end < 0) return false;
  718. // We only use a regex if -*- ... -*- is found. Not too hot a path?
  719. int realSize = magicLine.getRealSize();
  720. int begin = magicLine.getBegin();
  721. Matcher matcher = magicRegexp.matcher(magicLine.getUnsafeBytes(), begin, begin + realSize);
  722. int result = RubyRegexp.matcherSearch(parserSupport.getConfiguration().getRuntime(), matcher, begin, begin + realSize, Option.NONE);
  723. if (result < 0) return false;
  724. // Regexp is guarateed to have three matches
  725. int begs[] = matcher.getRegion().beg;
  726. int ends[] = matcher.getRegion().end;
  727. String name = magicLine.subSequence(begs[1], ends[1]).toString();
  728. if (!name.equalsIgnoreCase("encoding")) return false;
  729. setEncoding(new ByteList(magicLine.getUnsafeBytes(), begs[2], ends[2] - begs[2]));
  730. return true;
  731. }
  732. // TODO: Make hand-rolled version of this
  733. private static final String encodingString = "[cC][oO][dD][iI][nN][gG]\\s*[=:]\\s*([a-zA-Z0-9\\-_]+)";
  734. private static final Regex encodingRegexp = new Regex(encodingString.getBytes(), 0,
  735. encodingString.length(), 0, Encoding.load("ASCII"));
  736. protected void handleFileEncodingComment(ByteList encodingLine) throws IOException {
  737. int realSize = encodingLine.getRealSize();
  738. int begin = encodingLine.getBegin();
  739. Matcher matcher = encodingRegexp.matcher(encodingLine.getUnsafeBytes(), begin, begin + realSize);
  740. int result = RubyRegexp.matcherSearch(parserSupport.getConfiguration().getRuntime(), matcher, begin, begin + realSize, Option.IGNORECASE);
  741. if (result < 0) return;
  742. int begs[] = matcher.getRegion().beg;
  743. int ends[] = matcher.getRegion().end;
  744. setEncoding(new ByteList(encodingLine.getUnsafeBytes(), begs[1], ends[1] - begs[1]));
  745. }
  746. /**
  747. * Read a comment up to end of line.
  748. *
  749. * @return something or eof value
  750. */
  751. protected int readComment() throws IOException {
  752. // 1.9 - first line comment handling
  753. ByteList commentLine;
  754. if (src.getLine() == 0 && token == 0) {
  755. // Skip first line if it is a shebang line?
  756. // (not the same as MRI:parser_prepare/comment_at_top)
  757. if (src.peek('!')) {
  758. int c = src.skipUntil('\n');
  759. // TODO: Eat whitespace
  760. if (!src.peek('#')) return c; // Next line better also be a comment
  761. }
  762. commentLine = src.readUntil('\n');
  763. if (commentLine != null) {
  764. boolean handledMagicComment = parseMagicComment(commentLine);
  765. if (!handledMagicComment) {
  766. handleFileEncodingComment(commentLine);
  767. }
  768. }
  769. return 0;
  770. }
  771. return src.skipUntil('\n');
  772. }
  773. /*
  774. * Not normally used, but is left in here since it can be useful in debugging
  775. * grammar and lexing problems.
  776. *
  777. */
  778. private void printToken(int token) {
  779. //System.out.print("LOC: " + support.getPosition() + " ~ ");
  780. switch (token) {
  781. case Tokens.yyErrorCode: System.err.print("yyErrorCode,"); break;
  782. case Tokens.kCLASS: System.err.print("kClass,"); break;
  783. case Tokens.kMODULE: System.err.print("kModule,"); break;
  784. case Tokens.kDEF: System.err.print("kDEF,"); break;
  785. case Tokens.kUNDEF: System.err.print("kUNDEF,"); break;
  786. case Tokens.kBEGIN: System.err.print("kBEGIN,"); break;
  787. case Tokens.kRESCUE: System.err.print("kRESCUE,"); break;
  788. case Tokens.kENSURE: System.err.print("kENSURE,"); break;
  789. case Tokens.kEND: System.err.print("kEND,"); break;
  790. case Tokens.kIF: System.err.print("kIF,"); break;
  791. case Tokens.kUNLESS: System.err.print("kUNLESS,"); break;
  792. case Tokens.kTHEN: System.err.print("kTHEN,"); break;
  793. case Tokens.kELSIF: System.err.print("kELSIF,"); break;
  794. case Tokens.kELSE: System.err.print("kELSE,"); break;
  795. case Tokens.kCASE: System.err.print("kCASE,"); break;
  796. case Tokens.kWHEN: System.err.print("kWHEN,"); break;
  797. case Tokens.kWHILE: System.err.print("kWHILE,"); break;
  798. case Tokens.kUNTIL: System.err.print("kUNTIL,"); break;
  799. case Tokens.kFOR: System.err.print("kFOR,"); break;
  800. case Tokens.kBREAK: System.err.print("kBREAK,"); break;
  801. case Tokens.kNEXT: System.err.print("kNEXT,"); break;
  802. case Tokens.kREDO: System.err.print("kREDO,"); break;
  803. case Tokens.kRETRY: System.err.print("kRETRY,"); break;
  804. case Tokens.kIN: System.err.print("kIN,"); break;
  805. case Tokens.kDO: System.err.print("kDO,"); break;
  806. case Tokens.kDO_COND: System.err.print("kDO_COND,"); break;
  807. case Tokens.kDO_BLOCK: System.err.print("kDO_BLOCK,"); break;
  808. case Tokens.kRETURN: System.err.print("kRETURN,"); break;
  809. case Tokens.kYIELD: System.err.print("kYIELD,"); break;
  810. case Tokens.kSUPER: System.err.print("kSUPER,"); break;
  811. case Tokens.kSELF: System.err.print("kSELF,"); break;
  812. case Tokens.kNIL: System.err.print("kNIL,"); break;
  813. case Tokens.kTRUE: System.err.print("kTRUE,"); break;
  814. case Tokens.kFALSE: System.err.print("kFALSE,"); break;
  815. case Tokens.kAND: System.err.print("kAND,"); break;
  816. case Tokens.kOR: System.err.print("kOR,"); break;
  817. case Tokens.kNOT: System.err.print("kNOT,"); break;
  818. case Tokens.kIF_MOD: System.err.print("kIF_MOD,"); break;
  819. case Tokens.kUNLESS_MOD: System.err.print("kUNLESS_MOD,"); break;
  820. case Tokens.kWHILE_MOD: System.err.print("kWHILE_MOD,"); break;
  821. case Tokens.kUNTIL_MOD: System.err.print("kUNTIL_MOD,"); break;
  822. case Tokens.kRESCUE_MOD: System.err.print("kRESCUE_MOD,"); break;
  823. case Tokens.kALIAS: System.err.print("kALIAS,"); break;
  824. case Tokens.kDEFINED: System.err.print("kDEFINED,"); break;
  825. case Tokens.klBEGIN: System.err.print("klBEGIN,"); break;
  826. case Tokens.klEND: System.err.print("klEND,"); break;
  827. case Tokens.k__LINE__: System.err.print("k__LINE__,"); break;
  828. case Tokens.k__FILE__: System.err.print("k__FILE__,"); break;
  829. case Tokens.k__ENCODING__: System.err.print("k__ENCODING__,"); break;
  830. case Tokens.kDO_LAMBDA: System.err.print("kDO_LAMBDA,"); break;
  831. case Tokens.tIDENTIFIER: System.err.print("tIDENTIFIER["+ value() + "],"); break;
  832. case Tokens.tFID: System.err.print("tFID[" + value() + "],"); break;
  833. case Tokens.tGVAR: System.err.print("tGVAR[" + value() + "],"); break;
  834. case Tokens.tIVAR: System.err.print("tIVAR[" + value() +"],"); break;
  835. case Tokens.tCONSTANT: System.err.print("tCONSTANT["+ value() +"],"); break;
  836. case Tokens.tCVAR: System.err.print("tCVAR,"); break;
  837. case Tokens.tINTEGER: System.err.print("tINTEGER,"); break;
  838. case Tokens.tFLOAT: System.err.print("tFLOAT,"); break;
  839. case Tokens.tSTRING_CONTENT: System.err.print("tSTRING_CONTENT[" + ((StrNode) value()).getValue() + "],"); break;
  840. case Tokens.tSTRING_BEG: System.err.print("tSTRING_BEG,"); break;
  841. case Tokens.tSTRING_END: System.err.print("tSTRING_END,"); break;
  842. case Tokens.tSTRING_DBEG: System.err.print("tSTRING_DBEG,"); break;
  843. case Tokens.tSTRING_DVAR: System.err.print("tSTRING_DVAR,"); break;
  844. case Tokens.tXSTRING_BEG: System.err.print("tXSTRING_BEG,"); break;
  845. case Tokens.tREGEXP_BEG: System.err.print("tREGEXP_BEG,"); break;
  846. case Tokens.tREGEXP_END: System.err.print("tREGEXP_END,"); break;
  847. case Tokens.tWORDS_BEG: System.err.print("tWORDS_BEG,"); break;
  848. case Tokens.tQWORDS_BEG: System.err.print("tQWORDS_BEG,"); break;
  849. case Tokens.tBACK_REF: System.err.print("tBACK_REF,"); break;
  850. case Tokens.tBACK_REF2: System.err.print("tBACK_REF2,"); break;
  851. case Tokens.tNTH_REF: System.err.print("tNTH_REF,"); break;
  852. case Tokens.tUPLUS: System.err.print("tUPLUS"); break;
  853. case Tokens.tUMINUS: System.err.print("tUMINUS,"); break;
  854. case Tokens.tPOW: System.err.print("tPOW,"); break;
  855. case Tokens.tCMP: System.err.print("tCMP,"); break;
  856. case Tokens.tEQ: System.err.print("tEQ,"); break;
  857. case Tokens.tEQQ: System.err.print("tEQQ,"); break;
  858. case Tokens.tNEQ: System.err.print("tNEQ,"); break;
  859. case Tokens.tGEQ: System.err.print("tGEQ,"); break;
  860. case Tokens.tLEQ: System.err.print("tLEQ,"); break;
  861. case Tokens.tANDOP: System.err.print("tANDOP,"); break;
  862. case Tokens.tOROP: System.err.print("tOROP,"); break;
  863. case Tokens.tMATCH: System.err.print("tMATCH,"); break;
  864. case Tokens.tNMATCH: System.err.print("tNMATCH,"); break;
  865. case Tokens.tDOT: System.err.print("tDOT,"); break;
  866. case Tokens.tDOT2: System.err.print("tDOT2,"); break;
  867. case Tokens.tDOT3: System.err.print("tDOT3,"); break;
  868. case Tokens.tAREF: System.err.print("tAREF,"); break;
  869. case Tokens.tASET: System.err.print("tASET,"); break;
  870. case Tokens.tLSHFT: System.err.print("tLSHFT,"); break;
  871. case Tokens.tRSHFT: System.err.print("tRSHFT,"); break;
  872. case Tokens.tCOLON2: System.err.print("tCOLON2,"); break;
  873. case Tokens.tCOLON3: System.err.print("tCOLON3,"); break;
  874. case Tokens.tOP_ASGN: System.err.print("tOP_ASGN,"); break;
  875. case Tokens.tASSOC: System.err.print("tASSOC,"); break;
  876. case Tokens.tLPAREN: System.err.print("tLPAREN,"); break;
  877. case Tokens.tLPAREN2: System.err.print("tLPAREN2,"); break;
  878. case Tokens.tLPAREN_ARG: System.err.print("tLPAREN_ARG,"); break;
  879. case Tokens.tLBRACK: System.err.print("tLBRACK,"); break;
  880. case Tokens.tRBRACK: System.err.print("tRBRACK,"); break;
  881. case Tokens.tLBRACE: System.err.print("tLBRACE,"); break;
  882. case Tokens.tLBRACE_ARG: System.err.print("tLBRACE_ARG,"); break;
  883. case Tokens.tSTAR: System.err.print("tSTAR,"); break;
  884. case Tokens.tSTAR2: System.err.print("tSTAR2,"); break;
  885. case Tokens.tAMPER: System.err.print("tAMPER,"); break;
  886. case Tokens.tAMPER2: System.err.print("tAMPER2,"); break;
  887. case Tokens.tSYMBEG: System.err.print("tSYMBEG,"); break;
  888. case Tokens.tTILDE: System.err.print("tTILDE,"); break;
  889. case Tokens.tPERCENT: System.err.print("tPERCENT,"); break;
  890. case Tokens.tDIVIDE: System.err.print("tDIVIDE,"); break;
  891. case Tokens.tPLUS: System.err.print("tPLUS,"); break;
  892. case Tokens.tMINUS: System.err.print("tMINUS,"); break;
  893. case Tokens.tLT: System.err.print("tLT,"); break;
  894. case Tokens.tGT: System.err.print("tGT,"); break;
  895. case Tokens.tCARET: System.err.print("tCARET,"); break;
  896. case Tokens.tBANG: System.err.print("tBANG,"); break;
  897. case Tokens.tLCURLY: System.err.print("tTLCURLY,"); break;
  898. case Tokens.tRCURLY: System.err.print("tRCURLY,"); break;
  899. case Tokens.tPIPE: System.err.print("tTPIPE,"); break;
  900. case Tokens.tLAMBDA: System.err.print("tLAMBDA,"); break;
  901. case Tokens.tLAMBEG: System.err.print("tLAMBEG,"); break;
  902. case Tokens.tRPAREN: System.err.print("tRPAREN,"); break;
  903. case Tokens.tLABEL: System.err.print("tLABEL("+ value() +":),"); break;
  904. case '\n': System.err.println("NL"); break;
  905. case EOF: System.out.println("EOF"); break;
  906. case Tokens.tDSTAR: System.err.print("tDSTAR"); break;
  907. default: System.err.print("'" + (char)token + "',"); break;
  908. }
  909. }
  910. // DEBUGGING HELP
  911. private int yylex2() throws IOException {
  912. int currentToken = yylex2();
  913. printToken(currentToken);
  914. return currentToken;
  915. }
  916. /**
  917. * Returns the next token. Also sets yyVal is needed.
  918. *
  919. *@return Description of the Returned Value
  920. */
  921. private int yylex() throws IOException {
  922. int c;
  923. boolean spaceSeen = false;
  924. boolean commandState;
  925. // FIXME: Sucks we do this n times versus one since it is only important at beginning of parse but we need to change
  926. // setup of parser differently.
  927. if (token == 0 && src.getLine() == 0) detectUTF8BOM();
  928. if (lex_strterm != null) {
  929. int tok = lex_strterm.parseString(this, src);
  930. if (tok == Tokens.tSTRING_END || tok == Tokens.tREGEXP_END) {
  931. lex_strterm = null;
  932. setState(LexState.EXPR_END);
  933. }
  934. return tok;
  935. }
  936. commandState = commandStart;
  937. commandStart = false;
  938. loop: for(;;) {
  939. src.startOfToken();
  940. last_state = lex_state;
  941. c = src.read();
  942. switch(c) {
  943. case '\000': /* NUL */
  944. case '\004': /* ^D */
  945. case '\032': /* ^Z */
  946. case EOF: /* end of script. */
  947. return EOF;
  948. /* white spaces */
  949. case ' ': case '\t': case '\f': case '\r':
  950. case '\13': /* '\v' */
  951. getPosition();
  952. spaceSeen = true;
  953. continue;
  954. case '#': /* it's a comment */
  955. if (readComment() == EOF) return EOF;
  956. /* fall through */
  957. case '\n':
  958. switch (lex_state) {
  959. case EXPR_BEG: case EXPR_FNAME: case EXPR_DOT:
  960. case EXPR_CLASS: case EXPR_VALUE:
  961. continue loop;
  962. }
  963. boolean done = false;
  964. while(!done) {
  965. c = src.read();
  966. switch (c) {
  967. case ' ': case '\t': case '\f': case '\r': case '\13': /* '\v' */
  968. spaceSeen = true;
  969. continue;
  970. case '.': {
  971. if ((c = src.read()) != '.') {
  972. src.unread(c);
  973. src.unread('.');
  974. continue loop;
  975. }
  976. }
  977. default:
  978. case -1: // EOF (ENEBO: After default?
  979. done = true;
  980. }
  981. }
  982. if (c == -1) return EOF;
  983. src.unread(c);
  984. getPosition();
  985. switch (lex_state) {
  986. case EXPR_BEG: case EXPR_FNAME: case EXPR_DOT: case EXPR_CLASS:
  987. continue loop;
  988. }
  989. commandStart = true;
  990. setState(LexState.EXPR_BEG);
  991. return '\n';
  992. case '*':
  993. return star(spaceSeen);
  994. case '!':
  995. return bang();
  996. case '=':
  997. // documentation nodes
  998. if (src.wasBeginOfLine()) {
  999. if (src.matchMarker(BEGIN_DOC_MARKER, false, false)) {
  1000. c = src.read();
  1001. if (Character.isWhitespace(c)) {
  1002. // In case last next was the newline.
  1003. src.unread(c);
  1004. for (;;) {
  1005. c = src.read();
  1006. // If a line is followed by a blank line put
  1007. // it back.
  1008. while (c == '\n') {
  1009. c = src.read();
  1010. }
  1011. if (c == EOF) {
  1012. throw new SyntaxException(PID.STRING_HITS_EOF, getPosition(),
  1013. getCurrentLine(), "embedded document meets end of file");
  1014. }
  1015. if (c != '=') continue;
  1016. if (src.wasBeginOfLine() && src.matchMarker(END_DOC_MARKER, false, false)) {
  1017. ByteList list = src.readLineBytes();
  1018. src.unread('\n');
  1019. break;
  1020. }
  1021. }
  1022. continue;
  1023. }
  1024. src.unread(c);
  1025. }
  1026. }
  1027. determineExpressionState();
  1028. c = src.read();
  1029. if (c == '=') {
  1030. c = src.read();
  1031. if (c == '=') {
  1032. yaccValue = "===";
  1033. return Tokens.tEQQ;
  1034. }
  1035. src.unread(c);
  1036. yaccValue = "==";
  1037. return Tokens.tEQ;
  1038. }
  1039. if (c == '~') {
  1040. yaccValue = "=~";
  1041. return Tokens.tMATCH;
  1042. } else if (c == '>') {
  1043. yaccValue = "=>";
  1044. return Tokens.tASSOC;
  1045. }
  1046. src.unread(c);
  1047. yaccValue = "=";
  1048. return '=';
  1049. case '<':
  1050. return lessThan(spaceSeen);
  1051. case '>':
  1052. return greaterThan();
  1053. case '"':
  1054. return doubleQuote();
  1055. case '`':
  1056. return backtick(commandState);
  1057. case '\'':
  1058. return singleQuote();
  1059. case '?':
  1060. return questionMark();
  1061. case '&':
  1062. return ampersand(spaceSeen);
  1063. case '|':
  1064. return pipe();
  1065. case '+':
  1066. return plus(spaceSeen);
  1067. case '-':
  1068. return minus(spaceSeen);
  1069. case '.':
  1070. return dot();
  1071. case '0' : case '1' : case '2' : case '3' : case '4' :
  1072. case '5' : case '6' : case '7' : case '8' : case '9' :
  1073. return parseNumber(c);
  1074. case ')':
  1075. return rightParen();
  1076. case ']':
  1077. return rightBracket();
  1078. case '}':
  1079. return rightCurly();
  1080. case ':':
  1081. return colon(spaceSeen);
  1082. case '/':
  1083. return slash(spaceSeen);
  1084. case '^':
  1085. return caret();
  1086. case ';':
  1087. commandStart = true;
  1088. setState(LexState.EXPR_BEG);
  1089. yaccValue = ";";
  1090. return ';';
  1091. case ',':
  1092. return comma(c);
  1093. case '~':
  1094. return tilde();
  1095. case '(':
  1096. return leftParen(spaceSeen);
  1097. case '[':
  1098. return leftBracket(spaceSeen);
  1099. case '{':
  1100. return leftCurly();
  1101. case '\\':
  1102. c = src.read();
  1103. if (c == '\n') {
  1104. spaceSeen = true;
  1105. continue;
  1106. }
  1107. src.unread(c);
  1108. yaccValue = "\\";
  1109. return '\\';
  1110. case '%':
  1111. return percent(spaceSeen);
  1112. case '$':
  1113. return dollar();
  1114. case '@':
  1115. return at();
  1116. case '_':
  1117. if (src.wasBeginOfLine() && src.matchMarker(END_MARKER, false, true)) {
  1118. parserSupport.getResult().setEndOffset(src.getOffset());
  1119. return EOF;
  1120. }
  1121. return identifier(c, commandState);
  1122. default:
  1123. return identifier(c, commandState);
  1124. }
  1125. }
  1126. }
  1127. private int identifierToken(int result, String value) {
  1128. if (result == Tokens.tIDENTIFIER && last_state != LexState.EXPR_DOT &&
  1129. parserSupport.getCurrentScope().isDefined(value) >= 0) {
  1130. setState(LexState.EXPR_END);
  1131. }
  1132. yaccValue = value;
  1133. return result;
  1134. }
  1135. private int getIdentifier(int first) throws IOException {
  1136. if (isMultiByteChar(first)) first = src.readCodepoint(first, encoding);
  1137. if (!isIdentifierChar(first)) return first;
  1138. tokenBuffer.append((char) first);
  1139. int c;
  1140. for (c = src.read(); c != EOF; c = src.read()) {
  1141. if (isMultiByteChar(c)) c = src.readCodepoint(c, encoding);
  1142. if (!isIdentifierChar(c)) break;
  1143. tokenBuffer.append((char) c);
  1144. }
  1145. src.unread(c);
  1146. return first;
  1147. }
  1148. private int ampersand(boolean spaceSeen) throws IOException {
  1149. int c = src.read();
  1150. switch (c) {
  1151. case '&':
  1152. setState(LexState.EXPR_BEG);
  1153. if ((c = src.read()) == '=') {
  1154. yaccValue = "&&";
  1155. setState(LexState.EXPR_BEG);
  1156. return Tokens.tOP_ASGN;
  1157. }
  1158. src.unread(c);
  1159. yaccValue = "&&";
  1160. return Tokens.tANDOP;
  1161. case '=':
  1162. yaccValue = "&";
  1163. setState(LexState.EXPR_BEG);
  1164. return Tokens.tOP_ASGN;
  1165. }
  1166. src.unread(c);
  1167. //tmpPosition is required because getPosition()'s side effects.
  1168. //if the warning is generated, the getPosition() on line 954 (this line + 18) will create
  1169. //a wrong position if the "inclusive" flag is not set.
  1170. ISourcePosition tmpPosition = getPosition();
  1171. if (isSpaceArg(c, spaceSeen)) {
  1172. if (warnings.isVerbose()) warnings.warning(ID.ARGUMENT_AS_PREFIX, tmpPosition, "`&' interpreted as argument prefix");
  1173. c = Tokens.tAMPER;
  1174. } else if (isBEG()) {
  1175. c = Tokens.tAMPER;
  1176. } else {
  1177. warn_balanced(c, spaceSeen, "&", "argument prefix");
  1178. c = Tokens.tAMPER2;
  1179. }
  1180. determineExpressionState();
  1181. yaccValue = "&";
  1182. return c;
  1183. }
  1184. private int at() throws IOException {
  1185. newtok();
  1186. int c = src.read();
  1187. int result;
  1188. tokenBuffer.setLength(0);
  1189. tokenBuffer.append('@');
  1190. if (c == '@') {
  1191. tokenBuffer.append('@');
  1192. c = src.read();
  1193. result = Tokens.tCVAR;
  1194. } else {
  1195. result = Tokens.tIVAR;
  1196. }
  1197. if (Character.isDigit(c)) {
  1198. if (tokenBuffer.length() == 1) {
  1199. throw new SyntaxException(PID.IVAR_BAD_NAME, getPosition(), getCurrentLine(),
  1200. "`@" + c + "' is not allowed as an instance variable name");
  1201. }
  1202. throw new SyntaxException(PID.CVAR_BAD_NAME, getPosition(), getCurrentLine(),
  1203. "`@@" + c + "' is not allowed as a class variable name");
  1204. }
  1205. if (!isIdentifierChar(c)) {
  1206. src.unread(c);
  1207. yaccValue = "@";
  1208. return '@';
  1209. }
  1210. getIdentifier(c);
  1211. last_state = lex_state;
  1212. setState(LexState.EXPR_END);
  1213. return identifierToken(result, tokenBuffer.toString().intern());
  1214. }
  1215. private int backtick(boolean commandState) throws IOException {
  1216. yaccValue = "`";
  1217. switch (lex_state) {
  1218. case EXPR_FNAME:
  1219. setState(LexState.EXPR_ENDFN);
  1220. return Tokens.tBACK_REF2;
  1221. case EXPR_DOT:
  1222. setState(commandState ? LexState.EXPR_CMDARG : LexState.EXPR_ARG);
  1223. return Tokens.tBACK_REF2;
  1224. default:
  1225. lex_strterm = new StringTerm(str_xquote, '\0', '`');
  1226. return Tokens.tXSTRING_BEG;
  1227. }
  1228. }
  1229. private int bang() throws IOException {
  1230. int c = src.read();
  1231. if (lex_state == LexState.EXPR_FNAME || lex_state == LexState.EXPR_DOT) {
  1232. setState(LexState.EXPR_ARG);
  1233. if (c == '@') {
  1234. yaccValue = "!";
  1235. return Tokens.tBANG;
  1236. }
  1237. } else {
  1238. setState(LexState.EXPR_BEG);
  1239. }
  1240. switch (c) {
  1241. case '=':
  1242. yaccValue = "!=";
  1243. return Tokens.tNEQ;
  1244. case '~':
  1245. yaccValue = "!~";
  1246. return Tokens.tNMATCH;
  1247. default: // Just a plain bang
  1248. src.unread(c);
  1249. yaccValue = "!";
  1250. return Tokens.tBANG;
  1251. }
  1252. }
  1253. private int caret() throws IOException {
  1254. int c = src.read();
  1255. if (c == '=') {
  1256. setState(LexState.EXPR_BEG);
  1257. yaccValue = "^";
  1258. return Tokens.tOP_ASGN;
  1259. }
  1260. determineExpressionState();
  1261. src.unread(c);
  1262. yaccValue = "^";
  1263. return Tokens.tCARET;
  1264. }
  1265. private int colon(boolean spaceSeen) throws IOException {
  1266. int c = src.read();
  1267. if (c == ':') {
  1268. if (isBEG() || lex_state == LexState.EXPR_CLASS || (isARG() && spaceSeen)) {
  1269. setState(LexState.EXPR_BEG);
  1270. yaccValue = "::";
  1271. return Tokens.tCOLON3;
  1272. }
  1273. setState(LexState.EXPR_DOT);
  1274. yaccValue = ":";
  1275. return Tokens.tCOLON2;
  1276. }
  1277. if (isEND() || Character.isWhitespace(c)) {
  1278. src.unread(c);
  1279. setState(LexState.EXPR_BEG);
  1280. yaccValue = ":";
  1281. warn_balanced(c, spaceSeen, ":", "symbol literal");
  1282. return ':';
  1283. }
  1284. switch (c) {
  1285. case '\'':
  1286. lex_strterm = new StringTerm(str_ssym, '\0', c);
  1287. break;
  1288. case '"':
  1289. lex_strterm = new StringTerm(str_dsym, '\0', c);
  1290. break;
  1291. default:
  1292. src.unread(c);
  1293. break;
  1294. }
  1295. setState(LexState.EXPR_FNAME);
  1296. yaccValue = ":";
  1297. return Tokens.tSYMBEG;
  1298. }
  1299. private int comma(int c) throws IOException {
  1300. setState(LexState.EXPR_BEG);
  1301. yaccValue = ",";
  1302. return c;
  1303. }
  1304. private int doKeyword(LexState state) {
  1305. commandStart = true;
  1306. if (leftParenBegin > 0 && leftParenBegin == parenNest) {
  1307. leftParenBegin = 0;
  1308. parenNest--;
  1309. return Tokens.kDO_LAMBDA;
  1310. }
  1311. if (conditionState.isInState()) return Tokens.kDO_COND;
  1312. if (state != LexState.EXPR_CMDARG && cmdArgumentState.isInState()) {
  1313. return Tokens.kDO_BLOCK;
  1314. }
  1315. if (state == LexState.EXPR_ENDARG || state == LexState.EXPR_BEG) {
  1316. return Tokens.kDO_BLOCK;
  1317. }
  1318. return Tokens.kDO;
  1319. }
  1320. private int dollar() throws IOException {
  1321. newtok();
  1322. last_state = lex_state;
  1323. setState(LexState.EXPR_END);
  1324. int c = src.read();
  1325. switch (c) {
  1326. case '_': /* $_: last read line string */
  1327. c = src.read();
  1328. if (isIdentifierChar(c)) {
  1329. tokenBuffer.setLength(0);
  1330. tokenBuffer.append("$_");
  1331. getIdentifier(c);
  1332. last_state = lex_state;
  1333. setState(LexState.EXPR_END);
  1334. return identifierToken(Tokens.tGVAR, tokenBuffer.toString().intern());
  1335. }
  1336. src.unread(c);
  1337. c = '_';
  1338. // fall through
  1339. case '~': /* $~: match-data */
  1340. case '*': /* $*: argv */
  1341. case '$': /* $$: pid */
  1342. case '?': /* $?: last status */
  1343. case '!': /* $!: error string */
  1344. case '@': /* $@: error position */
  1345. case '/': /* $/: input record separator */
  1346. case '\\': /* $\: output record separator */
  1347. case ';': /* $;: field separator */
  1348. case ',': /* $,: output field separator */
  1349. case '.': /* $.: last read line number */
  1350. case '=': /* $=: ignorecase */
  1351. case ':': /* $:: load path */
  1352. case '<': /* $<: reading filename */
  1353. case '>': /* $>: default output handle */
  1354. case '\"': /* $": already loaded files */
  1355. yaccValue = "$" + (char) c;
  1356. return Tokens.tGVAR;
  1357. case '-':
  1358. tokenBuffer.setLength(0);
  1359. tokenBuffer.append('$');
  1360. tokenBuffer.append((char) c);
  1361. c = src.read();
  1362. if (isIdentifierChar(c)) {
  1363. tokenBuffer.append((char) c);
  1364. } else {
  1365. src.unread(c);
  1366. }
  1367. yaccValue = tokenBuffer.toString();
  1368. /* xxx shouldn't check if valid option variable */
  1369. return Tokens.tGVAR;
  1370. case '&': /* $&: last match */
  1371. case '`': /* $`: string before last match */
  1372. case '\'': /* $': string after last match */
  1373. case '+': /* $+: string matches last paren. */
  1374. // Explicit reference to these vars as symbols...
  1375. if (last_state == LexState.EXPR_FNAME) {
  1376. yaccValue = "$" + (char) c;
  1377. return Tokens.tGVAR;
  1378. }
  1379. yaccValue = new BackRefNode(getPosition(), c);
  1380. return Tokens.tBACK_REF;
  1381. case '1': case '2': case '3': case '4': case '5': case '6':
  1382. case '7': case '8': case '9':
  1383. tokenBuffer.setLength(0);
  1384. tokenBuffer.append('$');
  1385. do {
  1386. tokenBuffer.append((char) c);
  1387. c = src.read();
  1388. } while (Character.isDigit(c));
  1389. src.unread(c);
  1390. if (last_state == LexState.EXPR_FNAME) {
  1391. yaccValue = tokenBuffer.toString();
  1392. return Tokens.tGVAR;
  1393. }
  1394. yaccValue = new NthRefNode(getPosition(), Integer.parseInt(tokenBuffer.substring(1)));
  1395. return Tokens.tNTH_REF;
  1396. case '0':
  1397. setState(LexState.EXPR_END);
  1398. return identifierToken(Tokens.tGVAR, ("$" + (char) c).intern());
  1399. default:
  1400. if (!isIdentifierChar(c)) {
  1401. src.unread(c);
  1402. yaccValue = "$";
  1403. return '$';
  1404. }
  1405. // $blah
  1406. tokenBuffer.setLength(0);
  1407. tokenBuffer.append('$');
  1408. getIdentifier(c);
  1409. last_state = lex_state;
  1410. setState(LexState.EXPR_END);
  1411. return identifierToken(Tokens.tGVAR, tokenBuffer.toString().intern());
  1412. }
  1413. }
  1414. private int dot() throws IOException {
  1415. int c;
  1416. setState(LexState.EXPR_BEG);
  1417. if ((c = src.read()) == '.') {
  1418. if ((c = src.read()) == '.') {
  1419. yaccValue = "...";
  1420. return Tokens.tDOT3;
  1421. }
  1422. src.unread(c);
  1423. yaccValue = "..";
  1424. return Tokens.tDOT2;
  1425. }
  1426. src.unread(c);
  1427. if (Character.isDigit(c)) {
  1428. throw new SyntaxException(PID.FLOAT_MISSING_ZERO, getPosition(), getCurrentLine(),
  1429. "no .<digit> floating literal anymore; put 0 before dot");
  1430. }
  1431. setState(LexState.EXPR_DOT);
  1432. yaccValue = ".";
  1433. return Tokens.tDOT;
  1434. }
  1435. private int doubleQuote() throws IOException {
  1436. lex_strterm = new StringTerm(str_dquote, '\0', '"');
  1437. yaccValue = "\"";
  1438. return Tokens.tSTRING_BEG;
  1439. }
  1440. private int greaterThan() throws IOException {
  1441. determineExpressionState();
  1442. int c = src.read();
  1443. switch (c) {
  1444. case '=':
  1445. yaccValue = ">=";
  1446. return Tokens.tGEQ;
  1447. case '>':
  1448. if ((c = src.read()) == '=') {
  1449. setState(LexState.EXPR_BEG);
  1450. yaccValue = ">>";
  1451. return Tokens.tOP_ASGN;
  1452. }
  1453. src.unread(c);
  1454. yaccValue = ">>";
  1455. return Tokens.tRSHFT;
  1456. default:
  1457. src.unread(c);
  1458. yaccValue = ">";
  1459. return Tokens.tGT;
  1460. }
  1461. }
  1462. private int identifier(int c, boolean commandState) throws IOException {
  1463. if (!isIdentifierChar(c)) {
  1464. String badChar = "\\" + Integer.toOctalString(c & 0xff);
  1465. throw new SyntaxException(PID.CHARACTER_BAD, getPosition(), getCurrentLine(),
  1466. "Invalid char `" + badChar + "' ('" + (char) c + "') in expression", badChar);
  1467. }
  1468. newtok();
  1469. tokenBuffer.setLength(0);
  1470. int first = getIdentifier(c);
  1471. c = src.read();
  1472. boolean lastBangOrPredicate = false;
  1473. // methods 'foo!' and 'foo?' are possible but if followed by '=' it is relop
  1474. if (c == '!' || c == '?') {
  1475. if (!src.peek('=')) {
  1476. lastBangOrPredicate = true;
  1477. tokenBuffer.append((char) c);
  1478. } else {
  1479. src.unread(c);
  1480. }
  1481. } else {
  1482. src.unread(c);
  1483. }
  1484. int result = 0;
  1485. last_state = lex_state;
  1486. if (lastBangOrPredicate) {
  1487. result = Tokens.tFID;
  1488. } else {
  1489. if (lex_state == LexState.EXPR_FNAME) {
  1490. if ((c = src.read()) == '=') {
  1491. int c2 = src.read();
  1492. if (c2 != '~' && c2 != '>' &&
  1493. (c2 != '=' || src.peek('>'))) {
  1494. result = Tokens.tIDENTIFIER;
  1495. tokenBuffer.append((char) c);
  1496. src.unread(c2);
  1497. } else {
  1498. src.unread(c2);
  1499. src.unread(c);
  1500. }
  1501. } else {
  1502. src.unread(c);
  1503. }
  1504. }
  1505. if (result == 0 && Character.isUpperCase(first)) {
  1506. result = Tokens.tCONSTANT;
  1507. } else {
  1508. result = Tokens.tIDENTIFIER;
  1509. }
  1510. }
  1511. String tempVal = tokenBuffer.toString().intern();
  1512. if (isLabelPossible(commandState)) {
  1513. int c2 = src.read();
  1514. if (c2 == ':' && !src.peek(':')) {
  1515. setState(LexState.EXPR_BEG);
  1516. yaccValue = tempVal;
  1517. return Tokens.tLABEL;
  1518. }
  1519. src.unread(c2);
  1520. }
  1521. if (lex_state != LexState.EXPR_DOT) {
  1522. Keyword keyword = getKeyword(tempVal); // Is it is a keyword?
  1523. if (keyword != null) {
  1524. LexState state = lex_state; // Save state at time keyword is encountered
  1525. if (keyword == Keyword.NOT) {
  1526. setState(LexState.EXPR_ARG);
  1527. } else {
  1528. setState(keyword.state);
  1529. }
  1530. if (state == LexState.EXPR_FNAME) {
  1531. yaccValue = keyword.name;
  1532. } else {
  1533. yaccValue = getPosition();
  1534. if (keyword.id0 == Tokens.kDO) return doKeyword(state);
  1535. }
  1536. if (state == LexState.EXPR_BEG || state == LexState.EXPR_VALUE) return keyword.id0;
  1537. if (keyword.id0 != keyword.id1) setState(LexState.EXPR_BEG);
  1538. return keyword.id1;
  1539. }
  1540. }
  1541. if (isBEG() || lex_state == LexState.EXPR_DOT || isARG()) {
  1542. setState(commandState ? LexState.EXPR_CMDARG : LexState.EXPR_ARG);
  1543. } else if (lex_state == LexState.EXPR_FNAME) {
  1544. setState(LexState.EXPR_ENDFN);
  1545. } else {
  1546. setState(LexState.EXPR_END);
  1547. }
  1548. return identifierToken(result, tempVal);
  1549. }
  1550. private int leftBracket(boolean spaceSeen) throws IOException {
  1551. parenNest++;
  1552. int c = '[';
  1553. if (lex_state == LexState.EXPR_FNAME || lex_state == LexState.EXPR_DOT) {
  1554. setState(LexState.EXPR_ARG);
  1555. if ((c = src.read()) == ']') {
  1556. if (src.peek('=')) {
  1557. src.read();
  1558. yaccValue = "[]=";
  1559. return Tokens.tASET;
  1560. }
  1561. yaccValue = "[]";
  1562. return Tokens.tAREF;
  1563. }
  1564. src.unread(c);
  1565. yaccValue = "[";
  1566. return '[';
  1567. } else if (isBEG() || (isARG() && spaceSeen)) {
  1568. c = Tokens.tLBRACK;
  1569. }
  1570. setState(LexState.EXPR_BEG);
  1571. conditionState.stop();
  1572. cmdArgumentState.stop();
  1573. yaccValue = "[";
  1574. return c;
  1575. }
  1576. private int leftCurly() {
  1577. if (leftParenBegin > 0 && leftParenBegin == parenNest) {
  1578. setState(LexState.EXPR_BEG);
  1579. leftParenBegin = 0;
  1580. parenNest--;
  1581. conditionState.stop();
  1582. cmdArgumentState.stop();
  1583. yaccValue = "{";
  1584. return Tokens.tLAMBEG;
  1585. }
  1586. char c;
  1587. if (isARG() || lex_state == LexState.EXPR_END || lex_state == LexState.EXPR_ENDFN) { // block (primary)
  1588. c = Tokens.tLCURLY;
  1589. } else if (lex_state == LexState.EXPR_ENDARG) { // block (expr)
  1590. c = Tokens.tLBRACE_ARG;
  1591. } else { // hash
  1592. c = Tokens.tLBRACE;
  1593. }
  1594. conditionState.stop();
  1595. cmdArgumentState.stop();
  1596. setState(LexState.EXPR_BEG);
  1597. yaccValue = "{";
  1598. if (c != Tokens.tLBRACE) commandStart = true;
  1599. return c;
  1600. }
  1601. private int leftParen(boolean spaceSeen) throws IOException {
  1602. int result = Tokens.tLPAREN2;
  1603. if (isBEG()) {
  1604. result = Tokens.tLPAREN;
  1605. } else if (spaceSeen) {
  1606. // ENEBO: 1.9 is IS_ARG, but we need to break apart for 1.8 support.
  1607. if (lex_state == LexState.EXPR_CMDARG) {
  1608. result = Tokens.tLPAREN_ARG;
  1609. } else if (lex_state == LexState.EXPR_ARG) {
  1610. result = Tokens.tLPAREN_ARG;
  1611. }
  1612. if (token == Tokens.tLAMBDA) {
  1613. result = Tokens.tLPAREN2;
  1614. }
  1615. }
  1616. parenNest++;
  1617. conditionState.stop();
  1618. cmdArgumentState.stop();
  1619. setState(LexState.EXPR_BEG);
  1620. yaccValue = "(";
  1621. return result;
  1622. }
  1623. private int lessThan(boolean spaceSeen) throws IOException {
  1624. last_state = lex_state;
  1625. int c = src.read();
  1626. if (c == '<' && lex_state != LexState.EXPR_DOT && lex_state != LexState.EXPR_CLASS &&
  1627. !isEND() && (!isARG() || spaceSeen)) {
  1628. int tok = hereDocumentIdentifier();
  1629. if (tok != 0) return tok;
  1630. }
  1631. determineExpressionState();
  1632. switch (c) {
  1633. case '=':
  1634. if ((c = src.read()) == '>') {
  1635. yaccValue = "<=>";
  1636. return Tokens.tCMP;
  1637. }
  1638. src.unread(c);
  1639. yaccValue = "<=";
  1640. return Tokens.tLEQ;
  1641. case '<':
  1642. if ((c = src.read()) == '=') {
  1643. setState(LexState.EXPR_BEG);
  1644. yaccValue = "<<";
  1645. return Tokens.tOP_ASGN;
  1646. }
  1647. src.unread(c);
  1648. yaccValue = "<<";
  1649. warn_balanced(c, spaceSeen, "<<", "here document");
  1650. return Tokens.tLSHFT;
  1651. default:
  1652. yaccValue = "<";
  1653. src.unread(c);
  1654. return Tokens.tLT;
  1655. }
  1656. }
  1657. private int minus(boolean spaceSeen) throws IOException {
  1658. int c = src.read();
  1659. if (lex_state == LexState.EXPR_FNAME || lex_state == LexState.EXPR_DOT) {
  1660. setState(LexState.EXPR_ARG);
  1661. if (c == '@') {
  1662. yaccValue = "-@";
  1663. return Tokens.tUMINUS;
  1664. }
  1665. src.unread(c);
  1666. yaccValue = "-";
  1667. return Tokens.tMINUS;
  1668. }
  1669. if (c == '=') {
  1670. setState(LexState.EXPR_BEG);
  1671. yaccValue = "-";
  1672. return Tokens.tOP_ASGN;
  1673. }
  1674. if (c == '>') {
  1675. setState(LexState.EXPR_ARG);
  1676. yaccValue = "->";
  1677. return Tokens.tLAMBDA;
  1678. }
  1679. if (isBEG() || isSpaceArg(c, spaceSeen)) {
  1680. if (isARG()) arg_ambiguous();
  1681. setState(LexState.EXPR_BEG);
  1682. src.unread(c);
  1683. yaccValue = "-";
  1684. if (Character.isDigit(c)) {
  1685. return Tokens.tUMINUS_NUM;
  1686. }
  1687. return Tokens.tUMINUS;
  1688. }
  1689. setState(LexState.EXPR_BEG);
  1690. src.unread(c);
  1691. yaccValue = "-";
  1692. warn_balanced(c, spaceSeen, "-", "unary operator");
  1693. return Tokens.tMINUS;
  1694. }
  1695. private int percent(boolean spaceSeen) throws IOException {
  1696. if (isBEG()) return parseQuote(src.read());
  1697. int c = src.read();
  1698. if (c == '=') {
  1699. setState(LexState.EXPR_BEG);
  1700. yaccValue = "%";
  1701. return Tokens.tOP_ASGN;
  1702. }
  1703. if (isSpaceArg(c, spaceSeen)) return parseQuote(c);
  1704. determineExpressionState();
  1705. src.unread(c);
  1706. yaccValue = "%";
  1707. warn_balanced(c, spaceSeen, "%%", "string literal");
  1708. return Tokens.tPERCENT;
  1709. }
  1710. private int pipe() throws IOException {
  1711. int c = src.read();
  1712. switch (c) {
  1713. case '|':
  1714. setState(LexState.EXPR_BEG);
  1715. if ((c = src.read()) == '=') {
  1716. setState(LexState.EXPR_BEG);
  1717. yaccValue = "||";
  1718. return Tokens.tOP_ASGN;
  1719. }
  1720. src.unread(c);
  1721. yaccValue = "||";
  1722. return Tokens.tOROP;
  1723. case '=':
  1724. setState(LexState.EXPR_BEG);
  1725. yaccValue = "|";
  1726. return Tokens.tOP_ASGN;
  1727. default:
  1728. determineExpressionState();
  1729. src.unread(c);
  1730. yaccValue = "|";
  1731. return Tokens.tPIPE;
  1732. }
  1733. }
  1734. private int plus(boolean spaceSeen) throws IOException {
  1735. int c = src.read();
  1736. if (lex_state == LexState.EXPR_FNAME || lex_state == LexState.EXPR_DOT) {
  1737. setState(LexState.EXPR_ARG);
  1738. if (c == '@') {
  1739. yaccValue = "+@";
  1740. return Tokens.tUPLUS;
  1741. }
  1742. src.unread(c);
  1743. yaccValue = "+";
  1744. return Tokens.tPLUS;
  1745. }
  1746. if (c == '=') {
  1747. setState(LexState.EXPR_BEG);
  1748. yaccValue = "+";
  1749. return Tokens.tOP_ASGN;
  1750. }
  1751. if (isBEG() || isSpaceArg(c, spaceSeen)) { //FIXME: arg_ambiguous missing
  1752. if (isARG()) arg_ambiguous();
  1753. setState(LexState.EXPR_BEG);
  1754. src.unread(c);
  1755. if (Character.isDigit(c)) {
  1756. c = '+';
  1757. return parseNumber(c);
  1758. }
  1759. yaccValue = "+";
  1760. return Tokens.tUPLUS;
  1761. }
  1762. setState(LexState.EXPR_BEG);
  1763. src.unread(c);
  1764. yaccValue = "+";
  1765. warn_balanced(c, spaceSeen, "+", "unary operator");
  1766. return Tokens.tPLUS;
  1767. }
  1768. private int questionMark() throws IOException {
  1769. int c;
  1770. if (isEND()) {
  1771. setState(LexState.EXPR_VALUE);
  1772. yaccValue = "?";
  1773. return '?';
  1774. }
  1775. c = src.read();
  1776. if (c == EOF) throw new SyntaxException(PID.INCOMPLETE_CHAR_SYNTAX, getPosition(),
  1777. getCurrentLine(), "incomplete character syntax");
  1778. if (Character.isWhitespace(c)){
  1779. if (!isARG()) {
  1780. int c2 = 0;
  1781. switch (c) {
  1782. case ' ':
  1783. c2 = 's';
  1784. break;
  1785. case '\n':
  1786. c2 = 'n';
  1787. break;
  1788. case '\t':
  1789. c2 = 't';
  1790. break;
  1791. /* What is \v in C?
  1792. case '\v':
  1793. c2 = 'v';
  1794. break;
  1795. */
  1796. case '\r':
  1797. c2 = 'r';
  1798. break;
  1799. case '\f':
  1800. c2 = 'f';
  1801. break;
  1802. }
  1803. if (c2 != 0) {
  1804. warnings.warn(ID.INVALID_CHAR_SEQUENCE, getPosition(), "invalid character syntax; use ?\\" + c2);
  1805. }
  1806. }
  1807. src.unread(c);
  1808. setState(LexState.EXPR_VALUE);
  1809. yaccValue = "?";
  1810. return '?';
  1811. /*} else if (ismbchar(c)) { // ruby - we don't support them either?
  1812. rb_warn("multibyte character literal not supported yet; use ?\\" + c);
  1813. support.unread(c);
  1814. lexState = LexState.EXPR_BEG;
  1815. return '?';*/
  1816. } else if (isIdentifierChar(c) && !src.peek('\n') && isNext_identchar()) {
  1817. newtok();
  1818. src.unread(c);
  1819. setState(LexState.EXPR_VALUE);
  1820. yaccValue = "?";
  1821. return '?';
  1822. } else if (c == '\\') {
  1823. newtok();
  1824. if (src.peek('u')) {
  1825. src.read(); // Eat 'u'
  1826. ByteList oneCharBL = new ByteList(2);
  1827. c = readUTFEscape(oneCharBL, false, false);
  1828. if (c >= 0x80) {
  1829. tokenAddMBC(c, oneCharBL);
  1830. } else {
  1831. oneCharBL.append(c);
  1832. }
  1833. setState(LexState.EXPR_END);
  1834. yaccValue = new StrNode(getPosition(), oneCharBL);
  1835. return Tokens.tINTEGER; // FIXME: This should be something else like a tCHAR in 1.9/2.0
  1836. } else {
  1837. c = readEscape();
  1838. }
  1839. } else {
  1840. newtok();
  1841. }
  1842. setState(LexState.EXPR_END);
  1843. ByteList oneCharBL = new ByteList(1);
  1844. oneCharBL.append(c);
  1845. yaccValue = new StrNode(getPosition(), oneCharBL);
  1846. return Tokens.tINTEGER;
  1847. }
  1848. private int rightBracket() {
  1849. parenNest--;
  1850. conditionState.restart();
  1851. cmdArgumentState.restart();
  1852. setState(LexState.EXPR_ENDARG);
  1853. yaccValue = "]";
  1854. return Tokens.tRBRACK;
  1855. }
  1856. private int rightCurly() {
  1857. conditionState.restart();
  1858. cmdArgumentState.restart();
  1859. setState(LexState.EXPR_ENDARG);
  1860. yaccValue = "}";
  1861. return Tokens.tRCURLY;
  1862. }
  1863. private int rightParen() {
  1864. parenNest--;
  1865. conditionState.restart();
  1866. cmdArgumentState.restart();
  1867. setState(LexState.EXPR_ENDFN);
  1868. yaccValue = ")";
  1869. return Tokens.tRPAREN;
  1870. }
  1871. private int singleQuote() throws IOException {
  1872. lex_strterm = new StringTerm(str_squote, '\0', '\'');
  1873. yaccValue = "'";
  1874. return Tokens.tSTRING_BEG;
  1875. }
  1876. private int slash(boolean spaceSeen) throws IOException {
  1877. if (isBEG()) {
  1878. lex_strterm = new StringTerm(str_regexp, '\0', '/');
  1879. yaccValue = "/";
  1880. return Tokens.tREGEXP_BEG;
  1881. }
  1882. int c = src.read();
  1883. if (c == '=') {
  1884. yaccValue = "/";
  1885. setState(LexState.EXPR_BEG);
  1886. return Tokens.tOP_ASGN;
  1887. }
  1888. src.unread(c);
  1889. if (isSpaceArg(c, spaceSeen)) {
  1890. arg_ambiguous();
  1891. lex_strterm = new StringTerm(str_regexp, '\0', '/');
  1892. yaccValue = "/";
  1893. return Tokens.tREGEXP_BEG;
  1894. }
  1895. determineExpressionState();
  1896. yaccValue = "/";
  1897. warn_balanced(c, spaceSeen, "/", "regexp literal");
  1898. return Tokens.tDIVIDE;
  1899. }
  1900. private int star(boolean spaceSeen) throws IOException {
  1901. int c = src.read();
  1902. switch (c) {
  1903. case '*':
  1904. if ((c = src.read()) == '=') {
  1905. setState(LexState.EXPR_BEG);
  1906. yaccValue = "**";
  1907. return Tokens.tOP_ASGN;
  1908. }
  1909. src.unread(c); // not a '=' put it back
  1910. yaccValue = "**";
  1911. if (isSpaceArg(c, spaceSeen)) {
  1912. if (warnings.isVerbose()) warnings.warning(ID.ARGUMENT_AS_PREFIX, getPosition(), "`**' interpreted as argument prefix");
  1913. c = Tokens.tDSTAR;
  1914. } else if (isBEG()) {
  1915. c = Tokens.tDSTAR;
  1916. } else {
  1917. warn_balanced(c, spaceSeen, "*", "argument prefix");
  1918. c = Tokens.tPOW;
  1919. }
  1920. break;
  1921. case '=':
  1922. setState(LexState.EXPR_BEG);
  1923. yaccValue = "*";
  1924. return Tokens.tOP_ASGN;
  1925. default:
  1926. src.unread(c);
  1927. if (isSpaceArg(c, spaceSeen)) {
  1928. if (warnings.isVerbose()) warnings.warning(ID.ARGUMENT_AS_PREFIX, getPosition(), "`*' interpreted as argument prefix");
  1929. c = Tokens.tSTAR;
  1930. } else if (isBEG()) {
  1931. c = Tokens.tSTAR;
  1932. } else {
  1933. warn_balanced(c, spaceSeen, "*", "argument prefix");
  1934. c = Tokens.tSTAR2;
  1935. }
  1936. yaccValue = "*";
  1937. }
  1938. determineExpressionState();
  1939. return c;
  1940. }
  1941. private int tilde() throws IOException {
  1942. int c;
  1943. if (lex_state == LexState.EXPR_FNAME || lex_state == LexState.EXPR_DOT) {
  1944. if ((c = src.read()) != '@') src.unread(c);
  1945. setState(LexState.EXPR_ARG);
  1946. } else {
  1947. setState(LexState.EXPR_BEG);
  1948. }
  1949. yaccValue = "~";
  1950. return Tokens.tTILDE;
  1951. }
  1952. /**
  1953. * Parse a number from the input stream.
  1954. *
  1955. *@param c The first character of the number.
  1956. *@return A int constant wich represents a token.
  1957. */
  1958. private int parseNumber(int c) throws IOException {
  1959. setState(LexState.EXPR_END);
  1960. newtok();
  1961. tokenBuffer.setLength(0);
  1962. if (c == '-') {
  1963. tokenBuffer.append((char) c);
  1964. c = src.read();
  1965. } else if (c == '+') {
  1966. // We don't append '+' since Java number parser gets confused
  1967. c = src.read();
  1968. }
  1969. int nondigit = 0;
  1970. if (c == '0') {
  1971. int startLen = tokenBuffer.length();
  1972. switch (c = src.read()) {
  1973. case 'x' :
  1974. case 'X' : // hexadecimal
  1975. c = src.read();
  1976. if (isHexChar(c)) {
  1977. for (;; c = src.read()) {
  1978. if (c == '_') {
  1979. if (nondigit != '\0') break;
  1980. nondigit = c;
  1981. } else if (isHexChar(c)) {
  1982. nondigit = '\0';
  1983. tokenBuffer.append((char) c);
  1984. } else {
  1985. break;
  1986. }
  1987. }
  1988. }
  1989. src.unread(c);
  1990. if (tokenBuffer.length() == startLen) {
  1991. throw new SyntaxException(PID.BAD_HEX_NUMBER, getPosition(),
  1992. getCurrentLine(), "Hexadecimal number without hex-digits.");
  1993. } else if (nondigit != '\0') {
  1994. throw new SyntaxException(PID.TRAILING_UNDERSCORE_IN_NUMBER,
  1995. getPosition(), getCurrentLine(), "Trailing '_' in number.");
  1996. }
  1997. yaccValue = getInteger(tokenBuffer.toString(), 16, numberLiteralSuffix(SUFFIX_ALL));
  1998. return Tokens.tINTEGER;
  1999. case 'b' :
  2000. case 'B' : // binary
  2001. c = src.read();
  2002. if (c == '0' || c == '1') {
  2003. for (;; c = src.read()) {
  2004. if (c == '_') {
  2005. if (nondigit != '\0') break;
  2006. nondigit = c;
  2007. } else if (c == '0' || c == '1') {
  2008. nondigit = '\0';
  2009. tokenBuffer.append((char) c);
  2010. } else {
  2011. break;
  2012. }
  2013. }
  2014. }
  2015. src.unread(c);
  2016. if (tokenBuffer.length() == startLen) {
  2017. throw new SyntaxException(PID.EMPTY_BINARY_NUMBER, getPosition(),
  2018. getCurrentLine(), "Binary number without digits.");
  2019. } else if (nondigit != '\0') {
  2020. throw new SyntaxException(PID.TRAILING_UNDERSCORE_IN_NUMBER,
  2021. getPosition(), getCurrentLine(), "Trailing '_' in number.");
  2022. }
  2023. yaccValue = getInteger(tokenBuffer.toString(), 2, numberLiteralSuffix(SUFFIX_ALL));
  2024. return Tokens.tINTEGER;
  2025. case 'd' :
  2026. case 'D' : // decimal
  2027. c = src.read();
  2028. if (Character.isDigit(c)) {
  2029. for (;; c = src.read()) {
  2030. if (c == '_') {
  2031. if (nondigit != '\0') break;
  2032. nondigit = c;
  2033. } else if (Character.isDigit(c)) {
  2034. nondigit = '\0';
  2035. tokenBuffer.append((char) c);
  2036. } else {
  2037. break;
  2038. }
  2039. }
  2040. }
  2041. src.unread(c);
  2042. if (tokenBuffer.length() == startLen) {
  2043. throw new SyntaxException(PID.EMPTY_BINARY_NUMBER, getPosition(),
  2044. getCurrentLine(), "Binary number without digits.");
  2045. } else if (nondigit != '\0') {
  2046. throw new SyntaxException(PID.TRAILING_UNDERSCORE_IN_NUMBER, getPosition(),
  2047. getCurrentLine(), "Trailing '_' in number.");
  2048. }
  2049. yaccValue = getInteger(tokenBuffer.toString(), 10, numberLiteralSuffix(SUFFIX_ALL));
  2050. return Tokens.tINTEGER;
  2051. case 'o':
  2052. case 'O':
  2053. c = src.read();
  2054. case '0': case '1': case '2': case '3': case '4': //Octal
  2055. case '5': case '6': case '7': case '_':
  2056. for (;; c = src.read()) {
  2057. if (c == '_') {
  2058. if (nondigit != '\0') break;
  2059. nondigit = c;
  2060. } else if (c >= '0' && c <= '7') {
  2061. nondigit = '\0';
  2062. tokenBuffer.append((char) c);
  2063. } else {
  2064. break;
  2065. }
  2066. }
  2067. if (tokenBuffer.length() > startLen) {
  2068. src.unread(c);
  2069. if (nondigit != '\0') {
  2070. throw new SyntaxException(PID.TRAILING_UNDERSCORE_IN_NUMBER,
  2071. getPosition(), getCurrentLine(), "Trailing '_' in number.");
  2072. }
  2073. yaccValue = getInteger(tokenBuffer.toString(), 8, numberLiteralSuffix(SUFFIX_ALL));
  2074. return Tokens.tINTEGER;
  2075. }
  2076. case '8' :
  2077. case '9' :
  2078. throw new SyntaxException(PID.BAD_OCTAL_DIGIT, getPosition(),
  2079. getCurrentLine(), "Illegal octal digit.");
  2080. case '.' :
  2081. case 'e' :
  2082. case 'E' :
  2083. tokenBuffer.append('0');
  2084. break;
  2085. default :
  2086. src.unread(c);
  2087. yaccValue = new FixnumNode(getPosition(), 0);
  2088. return Tokens.tINTEGER;
  2089. }
  2090. }
  2091. boolean seen_point = false;
  2092. boolean seen_e = false;
  2093. for (;; c = src.read()) {
  2094. switch (c) {
  2095. case '0' :
  2096. case '1' :
  2097. case '2' :
  2098. case '3' :
  2099. case '4' :
  2100. case '5' :
  2101. case '6' :
  2102. case '7' :
  2103. case '8' :
  2104. case '9' :
  2105. nondigit = '\0';
  2106. tokenBuffer.append((char) c);
  2107. break;
  2108. case '.' :
  2109. if (nondigit != '\0') {
  2110. src.unread(c);
  2111. throw new SyntaxException(PID.TRAILING_UNDERSCORE_IN_NUMBER, getPosition(),
  2112. getCurrentLine(), "Trailing '_' in number.");
  2113. } else if (seen_point || seen_e) {
  2114. src.unread(c);
  2115. return getNumberToken(tokenBuffer.toString(), seen_e, seen_point, nondigit);
  2116. } else {
  2117. int c2;
  2118. if (!Character.isDigit(c2 = src.read())) {
  2119. src.unread(c2);
  2120. src.unread('.');
  2121. if (c == '_') {
  2122. // Enebo: c can never be antrhign but '.'
  2123. // Why did I put this here?
  2124. } else {
  2125. yaccValue = getInteger(tokenBuffer.toString(), 10, numberLiteralSuffix(SUFFIX_ALL));
  2126. return Tokens.tINTEGER;
  2127. }
  2128. } else {
  2129. tokenBuffer.append('.');
  2130. tokenBuffer.append((char) c2);
  2131. seen_point = true;
  2132. nondigit = '\0';
  2133. }
  2134. }
  2135. break;
  2136. case 'e' :
  2137. case 'E' :
  2138. if (nondigit != '\0') {
  2139. throw new SyntaxException(PID.TRAILING_UNDERSCORE_IN_NUMBER, getPosition(),
  2140. getCurrentLine(), "Trailing '_' in number.");
  2141. } else if (seen_e) {
  2142. src.unread(c);
  2143. return getNumberToken(tokenBuffer.toString(), seen_e, seen_point, nondigit);
  2144. } else {
  2145. tokenBuffer.append((char) c);
  2146. seen_e = true;
  2147. nondigit = c;
  2148. c = src.read();
  2149. if (c == '-' || c == '+') {
  2150. tokenBuffer.append((char) c);
  2151. nondigit = c;
  2152. } else {
  2153. src.unread(c);
  2154. }
  2155. }
  2156. break;
  2157. case '_' : // '_' in number just ignored
  2158. if (nondigit != '\0') {
  2159. throw new SyntaxException(PID.TRAILING_UNDERSCORE_IN_NUMBER, getPosition(),
  2160. getCurrentLine(), "Trailing '_' in number.");
  2161. }
  2162. nondigit = c;
  2163. break;
  2164. default :
  2165. src.unread(c);
  2166. return getNumberToken(tokenBuffer.toString(), seen_e, seen_point, nondigit);
  2167. }
  2168. }
  2169. }
  2170. private int getNumberToken(String number, boolean seen_e, boolean seen_point, int nondigit) throws IOException {
  2171. boolean isFloat = seen_e || seen_point;
  2172. if (nondigit != '\0') {
  2173. throw new SyntaxException(PID.TRAILING_UNDERSCORE_IN_NUMBER, getPosition(),
  2174. getCurrentLine(), "Trailing '_' in number.");
  2175. } else if (isFloat) {
  2176. return getFloatToken(number);
  2177. }
  2178. yaccValue = getInteger(number, 10, numberLiteralSuffix(SUFFIX_ALL));
  2179. return Tokens.tINTEGER;
  2180. }
  2181. // Note: parser_tokadd_utf8 variant just for regexp literal parsing. This variant is to be
  2182. // called when string_literal and regexp_literal.
  2183. public void readUTFEscapeRegexpLiteral(ByteList buffer) throws IOException {
  2184. buffer.append('\\');
  2185. buffer.append('u');
  2186. if (src.peek('{')) { // handle \\u{...}
  2187. do {
  2188. buffer.append(src.read());
  2189. if (scanHexLiteral(buffer, 6, false, "invalid Unicode escape") > 0x10ffff) {
  2190. throw new SyntaxException(PID.INVALID_ESCAPE_SYNTAX, getPosition(),
  2191. getCurrentLine(), "invalid Unicode codepoint (too large)");
  2192. }
  2193. } while (src.peek(' ') || src.peek('\t'));
  2194. int c = src.read();
  2195. if (c != '}') {
  2196. throw new SyntaxException(PID.INVALID_ESCAPE_SYNTAX, getPosition(),
  2197. getCurrentLine(), "unterminated Unicode escape");
  2198. }
  2199. buffer.append((char) c);
  2200. } else { // handle \\uxxxx
  2201. scanHexLiteral(buffer, 4, true, "Invalid Unicode escape");
  2202. }
  2203. }
  2204. private byte[] mbcBuf = new byte[6];
  2205. //FIXME: This seems like it could be more efficient to ensure size in bytelist and then pass
  2206. // in bytelists byte backing store. This method would look ugly since realSize would need
  2207. // to be tweaked and I don't know how many bytes this codepoint has up front so I would need
  2208. // to grow by 6 (which may be wasteful). Another idea is to make Encoding accept an interface
  2209. // for populating bytes and then make ByteList implement that interface. I like this last idea
  2210. // since it would not leak bytelist impl details all over the place.
  2211. public int tokenAddMBC(int codepoint, ByteList buffer) {
  2212. int length = buffer.getEncoding().codeToMbc(codepoint, mbcBuf, 0);
  2213. if (length <= 0) return EOF;
  2214. buffer.append(mbcBuf, 0, length);
  2215. return length;
  2216. }
  2217. public void tokenAddMBCFromSrc(int c, ByteList buffer) throws IOException {
  2218. // read bytes for length of character
  2219. int length = buffer.getEncoding().length((byte)c);
  2220. buffer.append((byte)c);
  2221. for (int off = 0; off < length - 1; off++) {
  2222. buffer.append((byte)src.read());
  2223. }
  2224. }
  2225. // MRI: parser_tokadd_utf8 sans regexp literal parsing
  2226. public int readUTFEscape(ByteList buffer, boolean stringLiteral, boolean symbolLiteral) throws IOException {
  2227. int codepoint;
  2228. int c;
  2229. if (src.peek('{')) { // handle \\u{...}
  2230. do {
  2231. src.read(); // Eat curly or whitespace
  2232. codepoint = scanHex(6, false, "invalid Unicode escape");
  2233. if (codepoint > 0x10ffff) {
  2234. throw new SyntaxException(PID.INVALID_ESCAPE_SYNTAX, getPosition(),
  2235. getCurrentLine(), "invalid Unicode codepoint (too large)");
  2236. }
  2237. if (buffer != null) readUTF8EscapeIntoBuffer(codepoint, buffer, stringLiteral);
  2238. } while (src.peek(' ') || src.peek('\t'));
  2239. c = src.read();
  2240. if (c != '}') {
  2241. throw new SyntaxException(PID.INVALID_ESCAPE_SYNTAX, getPosition(),
  2242. getCurrentLine(), "unterminated Unicode escape");
  2243. }
  2244. } else { // handle \\uxxxx
  2245. codepoint = scanHex(4, true, "Invalid Unicode escape");
  2246. if (buffer != null) readUTF8EscapeIntoBuffer(codepoint, buffer, stringLiteral);
  2247. }
  2248. return codepoint;
  2249. }
  2250. private void readUTF8EscapeIntoBuffer(int codepoint, ByteList buffer, boolean stringLiteral) {
  2251. if (codepoint >= 0x80) {
  2252. buffer.setEncoding(UTF8_ENCODING);
  2253. if (stringLiteral) tokenAddMBC(codepoint, buffer);
  2254. } else if (stringLiteral) {
  2255. buffer.append((char) codepoint);
  2256. }
  2257. }
  2258. public int readEscape() throws IOException {
  2259. int c = src.read();
  2260. switch (c) {
  2261. case '\\' : // backslash
  2262. return c;
  2263. case 'n' : // newline
  2264. return '\n';
  2265. case 't' : // horizontal tab
  2266. return '\t';
  2267. case 'r' : // carriage return
  2268. return '\r';
  2269. case 'f' : // form feed
  2270. return '\f';
  2271. case 'v' : // vertical tab
  2272. return '\u000B';
  2273. case 'a' : // alarm(bell)
  2274. return '\u0007';
  2275. case 'e' : // escape
  2276. return '\u001B';
  2277. case '0' : case '1' : case '2' : case '3' : // octal constant
  2278. case '4' : case '5' : case '6' : case '7' :
  2279. src.unread(c);
  2280. return scanOct(3);
  2281. case 'x' : // hex constant
  2282. return scanHex(2, false, "Invalid escape character syntax");
  2283. case 'b' : // backspace
  2284. return '\010';
  2285. case 's' : // space
  2286. return ' ';
  2287. case 'M' :
  2288. if ((c = src.read()) != '-') {
  2289. throw new SyntaxException(PID.INVALID_ESCAPE_SYNTAX, getPosition(),
  2290. getCurrentLine(), "Invalid escape character syntax");
  2291. } else if ((c = src.read()) == '\\') {
  2292. return (char) (readEscape() | 0x80);
  2293. } else if (c == EOF) {
  2294. throw new SyntaxException(PID.INVALID_ESCAPE_SYNTAX, getPosition(),
  2295. getCurrentLine(), "Invalid escape character syntax");
  2296. }
  2297. return (char) ((c & 0xff) | 0x80);
  2298. case 'C' :
  2299. if (src.read() != '-') {
  2300. throw new SyntaxException(PID.INVALID_ESCAPE_SYNTAX, getPosition(),
  2301. getCurrentLine(), "Invalid escape character syntax");
  2302. }
  2303. case 'c' :
  2304. if ((c = src.read()) == '\\') {
  2305. c = readEscape();
  2306. } else if (c == '?') {
  2307. return '\177';
  2308. } else if (c == EOF) {
  2309. throw new SyntaxException(PID.INVALID_ESCAPE_SYNTAX, getPosition(),
  2310. getCurrentLine(), "Invalid escape character syntax");
  2311. }
  2312. return (char) (c & 0x9f);
  2313. case EOF :
  2314. throw new SyntaxException(PID.INVALID_ESCAPE_SYNTAX, getPosition(),
  2315. getCurrentLine(), "Invalid escape character syntax");
  2316. default :
  2317. return c;
  2318. }
  2319. }
  2320. /**
  2321. * Read up to count hexadecimal digits and store those digits in a token buffer. If strict is
  2322. * provided then count number of hex digits must be present. If no digits can be read a syntax
  2323. * exception will be thrown. This will also return the codepoint as a value so codepoint
  2324. * ranges can be checked.
  2325. */
  2326. private char scanHexLiteral(ByteList buffer, int count, boolean strict, String errorMessage)
  2327. throws IOException {
  2328. int i = 0;
  2329. char hexValue = '\0';
  2330. for (; i < count; i++) {
  2331. int h1 = src.read();
  2332. if (!isHexChar(h1)) {
  2333. src.unread(h1);
  2334. break;
  2335. }
  2336. buffer.append(h1);
  2337. hexValue <<= 4;
  2338. hexValue |= Integer.parseInt("" + (char) h1, 16) & 15;
  2339. }
  2340. // No hex value after the 'x'.
  2341. if (i == 0 || strict && count != i) {
  2342. throw new SyntaxException(PID.INVALID_ESCAPE_SYNTAX, getPosition(),
  2343. getCurrentLine(), errorMessage);
  2344. }
  2345. return hexValue;
  2346. }
  2347. /**
  2348. * Read up to count hexadecimal digits. If strict is provided then count number of hex
  2349. * digits must be present. If no digits can be read a syntax exception will be thrown.
  2350. */
  2351. private int scanHex(int count, boolean strict, String errorMessage) throws IOException {
  2352. int i = 0;
  2353. int hexValue = '\0';
  2354. for (; i < count; i++) {
  2355. int h1 = src.read();
  2356. if (!isHexChar(h1)) {
  2357. src.unread(h1);
  2358. break;
  2359. }
  2360. hexValue <<= 4;
  2361. hexValue |= Integer.parseInt("" + (char) h1, 16) & 15;
  2362. }
  2363. // No hex value after the 'x'.
  2364. if (i == 0 || (strict && count != i)) {
  2365. throw new SyntaxException(PID.INVALID_ESCAPE_SYNTAX, getPosition(),
  2366. getCurrentLine(), errorMessage);
  2367. }
  2368. return hexValue;
  2369. }
  2370. private char scanOct(int count) throws IOException {
  2371. char value = '\0';
  2372. for (int i = 0; i < count; i++) {
  2373. int c = src.read();
  2374. if (!isOctChar(c)) {
  2375. src.unread(c);
  2376. break;
  2377. }
  2378. value <<= 3;
  2379. value |= Integer.parseInt("" + (char) c, 8);
  2380. }
  2381. return value;
  2382. }
  2383. }