
/pypy/interpreter/pyparser/pyparse.py

https://bitbucket.org/dac_io/pypy
from pypy.interpreter import gateway
from pypy.interpreter.error import OperationError
from pypy.interpreter.pyparser import future, parser, pytokenizer, pygram, error
from pypy.interpreter.astcompiler import consts


_recode_to_utf8 = gateway.applevel(r'''
def _recode_to_utf8(text, encoding):
    return unicode(text, encoding).encode("utf-8")
''').interphook('_recode_to_utf8')

def recode_to_utf8(space, text, encoding):
    return space.str_w(_recode_to_utf8(space, space.wrap(text),
                                       space.wrap(encoding)))

def _normalize_encoding(encoding):
    """returns normalized name for <encoding>

    see dist/src/Parser/tokenizer.c 'get_normal_name()'
    for implementation details / reference

    NOTE: for now, parser.suite() raises a MemoryError when
          a bad encoding is used. (SF bug #979739)
    """
    if encoding is None:
        return None
    # lower() + '_' / '-' conversion
    encoding = encoding.replace('_', '-').lower()
    if encoding == 'utf-8' or encoding.startswith('utf-8-'):
        return 'utf-8'
    for variant in ['latin-1', 'iso-latin-1', 'iso-8859-1']:
        if (encoding == variant or
            encoding.startswith(variant + '-')):
            return 'iso-8859-1'
    return encoding
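
# Illustrative results of the normalization above (added comment, not in the
# original file):
#   _normalize_encoding('UTF_8')      -> 'utf-8'
#   _normalize_encoding('Latin-1')    -> 'iso-8859-1'
#   _normalize_encoding('ISO_8859_1') -> 'iso-8859-1'
#   _normalize_encoding('cp1252')     -> 'cp1252'  (unknown names pass through)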

def _check_for_encoding(s):
    eol = s.find('\n')
    if eol < 0:
        return _check_line_for_encoding(s)
    enc = _check_line_for_encoding(s[:eol])
    if enc:
        return enc
    eol2 = s.find('\n', eol + 1)
    if eol2 < 0:
        return _check_line_for_encoding(s[eol + 1:])
    return _check_line_for_encoding(s[eol + 1:eol2])


def _check_line_for_encoding(line):
    """returns the declared encoding or None"""
    i = 0
    for i in range(len(line)):
        if line[i] == '#':
            break
        if line[i] not in ' \t\014':
            return None
    return pytokenizer.match_encoding_declaration(line[i:])
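
# Note (added comment): together these helpers implement the PEP 263 coding
# cookie check, looking only at the first two lines of the source.  For a
# line such as
#   # -*- coding: latin-1 -*-
# match_encoding_declaration() is expected to return 'latin-1'.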


class CompileInfo(object):
    """Stores information about the source being compiled.

    * filename: The filename of the source.
    * mode: The parse mode to use. ('exec', 'eval', or 'single')
    * flags: Parser and compiler flags.
    * encoding: The source encoding.
    * last_future_import: The line number and offset of the last __future__
      import.
    * hidden_applevel: Will this code unit and sub units be hidden at the
      applevel?
    """

    def __init__(self, filename, mode="exec", flags=0, future_pos=(0, 0),
                 hidden_applevel=False):
        self.filename = filename
        self.mode = mode
        self.encoding = None
        self.flags = flags
        self.last_future_import = future_pos
        self.hidden_applevel = hidden_applevel
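
# Example construction (added for illustration, not part of the original file):
#   info = CompileInfo("<stdin>", "single")
#   info = CompileInfo("module.py", "exec", flags=consts.PyCF_SOURCE_IS_UTF8)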


_targets = {
    'eval'   : pygram.syms.eval_input,
    'single' : pygram.syms.single_input,
    'exec'   : pygram.syms.file_input,
}


class PythonParser(parser.Parser):

    def __init__(self, space, future_flags=future.futureFlags_2_7,
                 grammar=pygram.python_grammar):
        parser.Parser.__init__(self, grammar)
        self.space = space
        self.future_flags = future_flags

    def parse_source(self, textsrc, compile_info):
        """Main entry point for parsing Python source.

        Everything from decoding the source to tokenizing to building the parse
        tree is handled here.
        """
        # Detect source encoding.
        enc = None
        if textsrc.startswith("\xEF\xBB\xBF"):
            textsrc = textsrc[3:]
            enc = 'utf-8'
            # If an encoding is explicitly given check that it is utf-8.
            decl_enc = _check_for_encoding(textsrc)
            if decl_enc and decl_enc != "utf-8":
                raise error.SyntaxError("UTF-8 BOM with non-utf8 coding cookie",
                                        filename=compile_info.filename)
        elif compile_info.flags & consts.PyCF_SOURCE_IS_UTF8:
            enc = 'utf-8'
            if _check_for_encoding(textsrc) is not None:
                raise error.SyntaxError("coding declaration in unicode string",
                                        filename=compile_info.filename)
        else:
            enc = _normalize_encoding(_check_for_encoding(textsrc))
            if enc is not None and enc not in ('utf-8', 'iso-8859-1'):
                try:
                    textsrc = recode_to_utf8(self.space, textsrc, enc)
                except OperationError, e:
                    # if the codec is not found, LookupError is raised.  we
                    # check using 'is_w' not to mask potential IndexError or
                    # KeyError
                    space = self.space
                    if e.match(space, space.w_LookupError):
                        raise error.SyntaxError("Unknown encoding: %s" % enc,
                                                filename=compile_info.filename)
                    # Transform unicode errors into SyntaxError
                    if e.match(space, space.w_UnicodeDecodeError):
                        e.normalize_exception(space)
                        w_message = space.str(e.get_w_value(space))
                        raise error.SyntaxError(space.str_w(w_message))
                    raise

        f_flags, future_info = future.get_futures(self.future_flags, textsrc)
        compile_info.last_future_import = future_info
        compile_info.flags |= f_flags
        flags = compile_info.flags

        if flags & consts.CO_FUTURE_PRINT_FUNCTION:
            self.grammar = pygram.python_grammar_no_print
        else:
            self.grammar = pygram.python_grammar

        # The tokenizer is very picky about how it wants its input.
        source_lines = textsrc.splitlines(True)
        if source_lines and not source_lines[-1].endswith("\n"):
            source_lines[-1] += '\n'
        if textsrc and textsrc[-1] == "\n":
            flags &= ~consts.PyCF_DONT_IMPLY_DEDENT

        self.prepare(_targets[compile_info.mode])
        tp = 0
        try:
            try:
                tokens = pytokenizer.generate_tokens(source_lines, flags)
                for tp, value, lineno, column, line in tokens:
                    if self.add_token(tp, value, lineno, column, line):
                        break
            except error.TokenError, e:
                e.filename = compile_info.filename
                raise
            except parser.ParseError, e:
                # Catch parse errors, pretty them up and reraise them as a
                # SyntaxError.
                new_err = error.IndentationError
                if tp == pygram.tokens.INDENT:
                    msg = "unexpected indent"
                elif e.expected == pygram.tokens.INDENT:
                    msg = "expected an indented block"
                else:
                    new_err = error.SyntaxError
                    msg = "invalid syntax"
                raise new_err(msg, e.lineno, e.column, e.line,
                              compile_info.filename)
            else:
                tree = self.root
        finally:
            # Avoid hanging onto the tree.
            self.root = None
        if enc is not None:
            compile_info.encoding = enc
        return tree
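
For orientation, here is a minimal sketch of how this entry point could be driven
from interpreter-level code. It assumes an object space instance `space` is
already available (as it is inside the running PyPy interpreter); the helper
name `parse_snippet` is made up for the example and does not appear in the file
above.

    from pypy.interpreter.pyparser.pyparse import CompileInfo, PythonParser

    def parse_snippet(space, source):
        # Parse `source` in 'exec' mode and return the raw parse tree
        # (self.root from the Parser base class, not an AST) together with
        # any encoding detected while parsing.
        info = CompileInfo("<string>", "exec")
        pyparser = PythonParser(space)
        tree = pyparser.parse_source(source, info)
        return tree, info.encoding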