
/pypy/interpreter/pyparser/pytokenizer.py

https://bitbucket.org/kkris/pypy
Python | 259 lines
from pypy.interpreter.pyparser import automata
from pypy.interpreter.pyparser.pygram import tokens
from pypy.interpreter.pyparser.pytoken import python_opmap
from pypy.interpreter.pyparser.error import TokenError, TokenIndentationError
from pypy.interpreter.pyparser.pytokenize import tabsize, whiteSpaceDFA, \
     triple_quoted, endDFAs, single_quoted, pseudoDFA
from pypy.interpreter.astcompiler import consts

NAMECHARS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_'
NUMCHARS = '0123456789'
ALNUMCHARS = NAMECHARS + NUMCHARS
EXTENDED_ALNUMCHARS = ALNUMCHARS + '-.'
WHITESPACES = ' \t\n\r\v\f'

def match_encoding_declaration(comment):
    """returns the declared encoding or None

    This function is a replacement for :
    >>> py_encoding = re.compile(r"coding[:=]\s*([-\w.]+)")
    >>> py_encoding.search(comment)
    """
    index = comment.find('coding')
    if index < 0:
        return None
    next_char = comment[index + 6]
    if next_char not in ':=':
        return None
    end_of_decl = comment[index + 7:]
    index = 0
    for char in end_of_decl:
        if char not in WHITESPACES:
            break
        index += 1
    else:
        return None
    encoding = ''
    for char in end_of_decl[index:]:
        if char in EXTENDED_ALNUMCHARS:
            encoding += char
        else:
            break
    if encoding != '':
        return encoding
    return None
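
# Example (illustrative only, not part of the original module): for a typical
# PEP 263 header comment the helper above extracts the encoding name, e.g.
#
#     match_encoding_declaration("# -*- coding: utf-8 -*-")  ->  'utf-8'
#     match_encoding_declaration("# plain comment")          ->  None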

def generate_tokens(lines, flags):
    """
    This is a rewrite of pypy.module.parser.pytokenize.generate_tokens since
    the original function is not RPYTHON (uses yield)
    It was also slightly modified to generate Token instances instead
    of the original 5-tuples -- it's now a 4-tuple of

    * the Token instance
    * the whole line as a string
    * the line number (the real one, counting continuation lines)
    * the position on the line of the end of the token.

    Original docstring ::

        The generate_tokens() generator requires one argument, readline, which
        must be a callable object which provides the same interface as the
        readline() method of built-in file objects. Each call to the function
        should return one line of input as a string.

        The generator produces 5-tuples with these members: the token type; the
        token string; a 2-tuple (srow, scol) of ints specifying the row and
        column where the token begins in the source; a 2-tuple (erow, ecol) of
        ints specifying the row and column where the token ends in the source;
        and the line on which the token was found. The line passed is the
        logical line; continuation lines are included.
    """
    token_list = []
    lnum = parenlev = continued = 0
    namechars = NAMECHARS
    numchars = NUMCHARS
    contstr, needcont = '', 0
    contline = None
    indents = [0]
    last_comment = ''
    parenlevstart = (0, 0, "")

    # make the annotator happy
    endDFA = automata.DFA([], [])
    # make the annotator happy
    line = ''
    pos = 0
    lines.append("")
    strstart = (0, 0, "")
    for line in lines:
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:
            if not line:
                raise TokenError(
                    "EOF while scanning triple-quoted string literal",
                    strstart[2], strstart[0], strstart[1]+1,
                    token_list, lnum-1)
            endmatch = endDFA.recognize(line)
            if endmatch >= 0:
                pos = end = endmatch
                tok = (tokens.STRING, contstr + line[:end], strstart[0],
                       strstart[1], line)
                token_list.append(tok)
                last_comment = ''
                contstr, needcont = '', 0
                contline = None
            elif (needcont and not line.endswith('\\\n') and
                               not line.endswith('\\\r\n')):
                tok = (tokens.ERRORTOKEN, contstr + line, strstart[0],
                       strstart[1], line)
                token_list.append(tok)
                last_comment = ''
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue
        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':
                # skip comments or blank lines
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                token_list.append((tokens.INDENT, line[:pos], lnum, 0, line))
                last_comment = ''
            while column < indents[-1]:
                indents = indents[:-1]
                token_list.append((tokens.DEDENT, '', lnum, pos, line))
                last_comment = ''
            if column != indents[-1]:
                err = "unindent does not match any outer indentation level"
                raise TokenIndentationError(err, line, lnum, 0, token_list)

        else:                                  # continued statement
            if not line:
                if parenlev > 0:
                    lnum1, start1, line1 = parenlevstart
                    raise TokenError("parenthesis is never closed", line1,
                                     lnum1, start1 + 1, token_list, lnum)
                raise TokenError("EOF in multi-line statement", line,
                                 lnum, 0, token_list)
            continued = 0
        while pos < max:
            pseudomatch = pseudoDFA.recognize(line, pos)
            if pseudomatch >= 0:                           # scan for tokens
                # JDR: Modified
                start = whiteSpaceDFA.recognize(line, pos)
                if start < 0:
                    start = pos
                end = pseudomatch

                if start == end:
                    raise TokenError("Unknown character", line,
                                     lnum, start + 1, token_list)

                pos = end
                token, initial = line[start:end], line[start]
                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    token_list.append((tokens.NUMBER, token, lnum, start, line))
                    last_comment = ''
                elif initial in '\r\n':
                    if parenlev <= 0:
                        tok = (tokens.NEWLINE, last_comment, lnum, start, line)
                        token_list.append(tok)
                    last_comment = ''
                elif initial == '#':
                    # skip comment
                    last_comment = token
                elif token in triple_quoted:
                    endDFA = endDFAs[token]
                    endmatch = endDFA.recognize(line, pos)
                    if endmatch >= 0:                       # all on one line
                        pos = endmatch
                        token = line[start:pos]
                        tok = (tokens.STRING, token, lnum, start, line)
                        token_list.append(tok)
                        last_comment = ''
                    else:
                        strstart = (lnum, start, line)
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                   # continued string
                        strstart = (lnum, start, line)
                        endDFA = (endDFAs[initial] or endDFAs[token[1]] or
                                  endDFAs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                   # ordinary string
                        tok = (tokens.STRING, token, lnum, start, line)
                        token_list.append(tok)
                        last_comment = ''
                elif initial in namechars:                  # ordinary name
                    token_list.append((tokens.NAME, token, lnum, start, line))
                    last_comment = ''
                elif initial == '\\':                       # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        if parenlev == 0:
                            parenlevstart = (lnum, start, line)
                        parenlev = parenlev + 1
                    elif initial in ')]}':
                        parenlev = parenlev - 1
                        if parenlev < 0:
                            raise TokenError("unmatched '%s'" % initial, line,
                                             lnum, start + 1, token_list)
                    if token in python_opmap:
                        punct = python_opmap[token]
                    else:
                        punct = tokens.OP
                    token_list.append((punct, token, lnum, start, line))
                    last_comment = ''
            else:
                start = whiteSpaceDFA.recognize(line, pos)
                if start < 0:
                    start = pos
                if start < max and line[start] in single_quoted:
                    raise TokenError("EOL while scanning string literal",
                                     line, lnum, start+1, token_list)
                tok = (tokens.ERRORTOKEN, line[pos], lnum, pos, line)
                token_list.append(tok)
                last_comment = ''
                pos = pos + 1
    lnum -= 1
    if not (flags & consts.PyCF_DONT_IMPLY_DEDENT):
        if token_list and token_list[-1][0] != tokens.NEWLINE:
            tok = (tokens.NEWLINE, '', lnum, 0, '\n')
            token_list.append(tok)
        for indent in indents[1:]:              # pop remaining indent levels
            token_list.append((tokens.DEDENT, '', lnum, pos, line))
    tok = (tokens.NEWLINE, '', lnum, 0, '\n')
    token_list.append(tok)

    token_list.append((tokens.ENDMARKER, '', lnum, pos, line))
    return token_list
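
# Usage sketch (an illustrative assumption, not code taken from this repository):
# callers are expected to pass the source already split into lines that keep
# their trailing newline, e.g. source.splitlines(True), together with the
# compiler flags.  Each entry of the returned token_list is a 5-tuple of
# (token type, token string, line number, start column, whole line), roughly:
#
#     toks = generate_tokens("x = 1\n".splitlines(True), 0)
#     # toks[0] -> (tokens.NAME,   'x', 1, 0, 'x = 1\n')
#     # toks[2] -> (tokens.NUMBER, '1', 1, 4, 'x = 1\n')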