PageRenderTime 47ms CodeModel.GetById 18ms RepoModel.GetById 0ms app.codeStats 0ms

/lib-python/2.7/idlelib/PyParse.py

https://bitbucket.org/pwaller/pypy
Python | 594 lines | 541 code | 20 blank | 33 comment | 37 complexity | 4655381659547d909590c0c4e956f11b MD5 | raw file
  1. import re
  2. import sys
  3. # Reason last stmt is continued (or C_NONE if it's not).
  4. (C_NONE, C_BACKSLASH, C_STRING_FIRST_LINE,
  5. C_STRING_NEXT_LINES, C_BRACKET) = range(5)
  6. if 0: # for throwaway debugging output
  7. def dump(*stuff):
  8. sys.__stdout__.write(" ".join(map(str, stuff)) + "\n")
  9. # Find what looks like the start of a popular stmt.
  10. _synchre = re.compile(r"""
  11. ^
  12. [ \t]*
  13. (?: while
  14. | else
  15. | def
  16. | return
  17. | assert
  18. | break
  19. | class
  20. | continue
  21. | elif
  22. | try
  23. | except
  24. | raise
  25. | import
  26. | yield
  27. )
  28. \b
  29. """, re.VERBOSE | re.MULTILINE).search
  30. # Match blank line or non-indenting comment line.
  31. _junkre = re.compile(r"""
  32. [ \t]*
  33. (?: \# \S .* )?
  34. \n
  35. """, re.VERBOSE).match
  36. # Match any flavor of string; the terminating quote is optional
  37. # so that we're robust in the face of incomplete program text.
  38. _match_stringre = re.compile(r"""
  39. \""" [^"\\]* (?:
  40. (?: \\. | "(?!"") )
  41. [^"\\]*
  42. )*
  43. (?: \""" )?
  44. | " [^"\\\n]* (?: \\. [^"\\\n]* )* "?
  45. | ''' [^'\\]* (?:
  46. (?: \\. | '(?!'') )
  47. [^'\\]*
  48. )*
  49. (?: ''' )?
  50. | ' [^'\\\n]* (?: \\. [^'\\\n]* )* '?
  51. """, re.VERBOSE | re.DOTALL).match
  52. # Match a line that starts with something interesting;
  53. # used to find the first item of a bracket structure.
  54. _itemre = re.compile(r"""
  55. [ \t]*
  56. [^\s#\\] # if we match, m.end()-1 is the interesting char
  57. """, re.VERBOSE).match
  58. # Match start of stmts that should be followed by a dedent.
  59. _closere = re.compile(r"""
  60. \s*
  61. (?: return
  62. | break
  63. | continue
  64. | raise
  65. | pass
  66. )
  67. \b
  68. """, re.VERBOSE).match
  69. # Chew up non-special chars as quickly as possible. If match is
  70. # successful, m.end() less 1 is the index of the last boring char
  71. # matched. If match is unsuccessful, the string starts with an
  72. # interesting char.
  73. _chew_ordinaryre = re.compile(r"""
  74. [^[\](){}#'"\\]+
  75. """, re.VERBOSE).match
  76. # Build translation table to map uninteresting chars to "x", open
  77. # brackets to "(", and close brackets to ")".
  78. _tran = ['x'] * 256
  79. for ch in "({[":
  80. _tran[ord(ch)] = '('
  81. for ch in ")}]":
  82. _tran[ord(ch)] = ')'
  83. for ch in "\"'\\\n#":
  84. _tran[ord(ch)] = ch
  85. _tran = ''.join(_tran)
  86. del ch
  87. try:
  88. UnicodeType = type(unicode(""))
  89. except NameError:
  90. UnicodeType = None
  91. class Parser:
  92. def __init__(self, indentwidth, tabwidth):
  93. self.indentwidth = indentwidth
  94. self.tabwidth = tabwidth
  95. def set_str(self, str):
  96. assert len(str) == 0 or str[-1] == '\n'
  97. if type(str) is UnicodeType:
  98. # The parse functions have no idea what to do with Unicode, so
  99. # replace all Unicode characters with "x". This is "safe"
  100. # so long as the only characters germane to parsing the structure
  101. # of Python are 7-bit ASCII. It's *necessary* because Unicode
  102. # strings don't have a .translate() method that supports
  103. # deletechars.
  104. uniphooey = str
  105. str = []
  106. push = str.append
  107. for raw in map(ord, uniphooey):
  108. push(raw < 127 and chr(raw) or "x")
  109. str = "".join(str)
  110. self.str = str
  111. self.study_level = 0
  112. # Return index of a good place to begin parsing, as close to the
  113. # end of the string as possible. This will be the start of some
  114. # popular stmt like "if" or "def". Return None if none found:
  115. # the caller should pass more prior context then, if possible, or
  116. # if not (the entire program text up until the point of interest
  117. # has already been tried) pass 0 to set_lo.
  118. #
  119. # This will be reliable iff given a reliable is_char_in_string
  120. # function, meaning that when it says "no", it's absolutely
  121. # guaranteed that the char is not in a string.
  122. def find_good_parse_start(self, is_char_in_string=None,
  123. _synchre=_synchre):
  124. str, pos = self.str, None
  125. if not is_char_in_string:
  126. # no clue -- make the caller pass everything
  127. return None
  128. # Peek back from the end for a good place to start,
  129. # but don't try too often; pos will be left None, or
  130. # bumped to a legitimate synch point.
  131. limit = len(str)
  132. for tries in range(5):
  133. i = str.rfind(":\n", 0, limit)
  134. if i < 0:
  135. break
  136. i = str.rfind('\n', 0, i) + 1 # start of colon line
  137. m = _synchre(str, i, limit)
  138. if m and not is_char_in_string(m.start()):
  139. pos = m.start()
  140. break
  141. limit = i
  142. if pos is None:
  143. # Nothing looks like a block-opener, or stuff does
  144. # but is_char_in_string keeps returning true; most likely
  145. # we're in or near a giant string, the colorizer hasn't
  146. # caught up enough to be helpful, or there simply *aren't*
  147. # any interesting stmts. In any of these cases we're
  148. # going to have to parse the whole thing to be sure, so
  149. # give it one last try from the start, but stop wasting
  150. # time here regardless of the outcome.
  151. m = _synchre(str)
  152. if m and not is_char_in_string(m.start()):
  153. pos = m.start()
  154. return pos
  155. # Peeking back worked; look forward until _synchre no longer
  156. # matches.
  157. i = pos + 1
  158. while 1:
  159. m = _synchre(str, i)
  160. if m:
  161. s, i = m.span()
  162. if not is_char_in_string(s):
  163. pos = s
  164. else:
  165. break
  166. return pos
  167. # Throw away the start of the string. Intended to be called with
  168. # find_good_parse_start's result.
  169. def set_lo(self, lo):
  170. assert lo == 0 or self.str[lo-1] == '\n'
  171. if lo > 0:
  172. self.str = self.str[lo:]
  173. # As quickly as humanly possible <wink>, find the line numbers (0-
  174. # based) of the non-continuation lines.
  175. # Creates self.{goodlines, continuation}.
  176. def _study1(self):
  177. if self.study_level >= 1:
  178. return
  179. self.study_level = 1
  180. # Map all uninteresting characters to "x", all open brackets
  181. # to "(", all close brackets to ")", then collapse runs of
  182. # uninteresting characters. This can cut the number of chars
  183. # by a factor of 10-40, and so greatly speed the following loop.
  184. str = self.str
  185. str = str.translate(_tran)
  186. str = str.replace('xxxxxxxx', 'x')
  187. str = str.replace('xxxx', 'x')
  188. str = str.replace('xx', 'x')
  189. str = str.replace('xx', 'x')
  190. str = str.replace('\nx', '\n')
  191. # note that replacing x\n with \n would be incorrect, because
  192. # x may be preceded by a backslash
  193. # March over the squashed version of the program, accumulating
  194. # the line numbers of non-continued stmts, and determining
  195. # whether & why the last stmt is a continuation.
  196. continuation = C_NONE
  197. level = lno = 0 # level is nesting level; lno is line number
  198. self.goodlines = goodlines = [0]
  199. push_good = goodlines.append
  200. i, n = 0, len(str)
  201. while i < n:
  202. ch = str[i]
  203. i = i+1
  204. # cases are checked in decreasing order of frequency
  205. if ch == 'x':
  206. continue
  207. if ch == '\n':
  208. lno = lno + 1
  209. if level == 0:
  210. push_good(lno)
  211. # else we're in an unclosed bracket structure
  212. continue
  213. if ch == '(':
  214. level = level + 1
  215. continue
  216. if ch == ')':
  217. if level:
  218. level = level - 1
  219. # else the program is invalid, but we can't complain
  220. continue
  221. if ch == '"' or ch == "'":
  222. # consume the string
  223. quote = ch
  224. if str[i-1:i+2] == quote * 3:
  225. quote = quote * 3
  226. firstlno = lno
  227. w = len(quote) - 1
  228. i = i+w
  229. while i < n:
  230. ch = str[i]
  231. i = i+1
  232. if ch == 'x':
  233. continue
  234. if str[i-1:i+w] == quote:
  235. i = i+w
  236. break
  237. if ch == '\n':
  238. lno = lno + 1
  239. if w == 0:
  240. # unterminated single-quoted string
  241. if level == 0:
  242. push_good(lno)
  243. break
  244. continue
  245. if ch == '\\':
  246. assert i < n
  247. if str[i] == '\n':
  248. lno = lno + 1
  249. i = i+1
  250. continue
  251. # else comment char or paren inside string
  252. else:
  253. # didn't break out of the loop, so we're still
  254. # inside a string
  255. if (lno - 1) == firstlno:
  256. # before the previous \n in str, we were in the first
  257. # line of the string
  258. continuation = C_STRING_FIRST_LINE
  259. else:
  260. continuation = C_STRING_NEXT_LINES
  261. continue # with outer loop
  262. if ch == '#':
  263. # consume the comment
  264. i = str.find('\n', i)
  265. assert i >= 0
  266. continue
  267. assert ch == '\\'
  268. assert i < n
  269. if str[i] == '\n':
  270. lno = lno + 1
  271. if i+1 == n:
  272. continuation = C_BACKSLASH
  273. i = i+1
  274. # The last stmt may be continued for all 3 reasons.
  275. # String continuation takes precedence over bracket
  276. # continuation, which beats backslash continuation.
  277. if (continuation != C_STRING_FIRST_LINE
  278. and continuation != C_STRING_NEXT_LINES and level > 0):
  279. continuation = C_BRACKET
  280. self.continuation = continuation
  281. # Push the final line number as a sentinel value, regardless of
  282. # whether it's continued.
  283. assert (continuation == C_NONE) == (goodlines[-1] == lno)
  284. if goodlines[-1] != lno:
  285. push_good(lno)
  286. def get_continuation_type(self):
  287. self._study1()
  288. return self.continuation
  289. # study1 was sufficient to determine the continuation status,
  290. # but doing more requires looking at every character. study2
  291. # does this for the last interesting statement in the block.
  292. # Creates:
  293. # self.stmt_start, stmt_end
  294. # slice indices of last interesting stmt
  295. # self.stmt_bracketing
  296. # the bracketing structure of the last interesting stmt;
  297. # for example, for the statement "say(boo) or die", stmt_bracketing
  298. # will be [(0, 0), (3, 1), (8, 0)]. Strings and comments are
  299. # treated as brackets, for the matter.
  300. # self.lastch
  301. # last non-whitespace character before optional trailing
  302. # comment
  303. # self.lastopenbracketpos
  304. # if continuation is C_BRACKET, index of last open bracket
  305. def _study2(self):
  306. if self.study_level >= 2:
  307. return
  308. self._study1()
  309. self.study_level = 2
  310. # Set p and q to slice indices of last interesting stmt.
  311. str, goodlines = self.str, self.goodlines
  312. i = len(goodlines) - 1
  313. p = len(str) # index of newest line
  314. while i:
  315. assert p
  316. # p is the index of the stmt at line number goodlines[i].
  317. # Move p back to the stmt at line number goodlines[i-1].
  318. q = p
  319. for nothing in range(goodlines[i-1], goodlines[i]):
  320. # tricky: sets p to 0 if no preceding newline
  321. p = str.rfind('\n', 0, p-1) + 1
  322. # The stmt str[p:q] isn't a continuation, but may be blank
  323. # or a non-indenting comment line.
  324. if _junkre(str, p):
  325. i = i-1
  326. else:
  327. break
  328. if i == 0:
  329. # nothing but junk!
  330. assert p == 0
  331. q = p
  332. self.stmt_start, self.stmt_end = p, q
  333. # Analyze this stmt, to find the last open bracket (if any)
  334. # and last interesting character (if any).
  335. lastch = ""
  336. stack = [] # stack of open bracket indices
  337. push_stack = stack.append
  338. bracketing = [(p, 0)]
  339. while p < q:
  340. # suck up all except ()[]{}'"#\\
  341. m = _chew_ordinaryre(str, p, q)
  342. if m:
  343. # we skipped at least one boring char
  344. newp = m.end()
  345. # back up over totally boring whitespace
  346. i = newp - 1 # index of last boring char
  347. while i >= p and str[i] in " \t\n":
  348. i = i-1
  349. if i >= p:
  350. lastch = str[i]
  351. p = newp
  352. if p >= q:
  353. break
  354. ch = str[p]
  355. if ch in "([{":
  356. push_stack(p)
  357. bracketing.append((p, len(stack)))
  358. lastch = ch
  359. p = p+1
  360. continue
  361. if ch in ")]}":
  362. if stack:
  363. del stack[-1]
  364. lastch = ch
  365. p = p+1
  366. bracketing.append((p, len(stack)))
  367. continue
  368. if ch == '"' or ch == "'":
  369. # consume string
  370. # Note that study1 did this with a Python loop, but
  371. # we use a regexp here; the reason is speed in both
  372. # cases; the string may be huge, but study1 pre-squashed
  373. # strings to a couple of characters per line. study1
  374. # also needed to keep track of newlines, and we don't
  375. # have to.
  376. bracketing.append((p, len(stack)+1))
  377. lastch = ch
  378. p = _match_stringre(str, p, q).end()
  379. bracketing.append((p, len(stack)))
  380. continue
  381. if ch == '#':
  382. # consume comment and trailing newline
  383. bracketing.append((p, len(stack)+1))
  384. p = str.find('\n', p, q) + 1
  385. assert p > 0
  386. bracketing.append((p, len(stack)))
  387. continue
  388. assert ch == '\\'
  389. p = p+1 # beyond backslash
  390. assert p < q
  391. if str[p] != '\n':
  392. # the program is invalid, but can't complain
  393. lastch = ch + str[p]
  394. p = p+1 # beyond escaped char
  395. # end while p < q:
  396. self.lastch = lastch
  397. if stack:
  398. self.lastopenbracketpos = stack[-1]
  399. self.stmt_bracketing = tuple(bracketing)
  400. # Assuming continuation is C_BRACKET, return the number
  401. # of spaces the next line should be indented.
  402. def compute_bracket_indent(self):
  403. self._study2()
  404. assert self.continuation == C_BRACKET
  405. j = self.lastopenbracketpos
  406. str = self.str
  407. n = len(str)
  408. origi = i = str.rfind('\n', 0, j) + 1
  409. j = j+1 # one beyond open bracket
  410. # find first list item; set i to start of its line
  411. while j < n:
  412. m = _itemre(str, j)
  413. if m:
  414. j = m.end() - 1 # index of first interesting char
  415. extra = 0
  416. break
  417. else:
  418. # this line is junk; advance to next line
  419. i = j = str.find('\n', j) + 1
  420. else:
  421. # nothing interesting follows the bracket;
  422. # reproduce the bracket line's indentation + a level
  423. j = i = origi
  424. while str[j] in " \t":
  425. j = j+1
  426. extra = self.indentwidth
  427. return len(str[i:j].expandtabs(self.tabwidth)) + extra
  428. # Return number of physical lines in last stmt (whether or not
  429. # it's an interesting stmt! this is intended to be called when
  430. # continuation is C_BACKSLASH).
  431. def get_num_lines_in_stmt(self):
  432. self._study1()
  433. goodlines = self.goodlines
  434. return goodlines[-1] - goodlines[-2]
  435. # Assuming continuation is C_BACKSLASH, return the number of spaces
  436. # the next line should be indented. Also assuming the new line is
  437. # the first one following the initial line of the stmt.
  438. def compute_backslash_indent(self):
  439. self._study2()
  440. assert self.continuation == C_BACKSLASH
  441. str = self.str
  442. i = self.stmt_start
  443. while str[i] in " \t":
  444. i = i+1
  445. startpos = i
  446. # See whether the initial line starts an assignment stmt; i.e.,
  447. # look for an = operator
  448. endpos = str.find('\n', startpos) + 1
  449. found = level = 0
  450. while i < endpos:
  451. ch = str[i]
  452. if ch in "([{":
  453. level = level + 1
  454. i = i+1
  455. elif ch in ")]}":
  456. if level:
  457. level = level - 1
  458. i = i+1
  459. elif ch == '"' or ch == "'":
  460. i = _match_stringre(str, i, endpos).end()
  461. elif ch == '#':
  462. break
  463. elif level == 0 and ch == '=' and \
  464. (i == 0 or str[i-1] not in "=<>!") and \
  465. str[i+1] != '=':
  466. found = 1
  467. break
  468. else:
  469. i = i+1
  470. if found:
  471. # found a legit =, but it may be the last interesting
  472. # thing on the line
  473. i = i+1 # move beyond the =
  474. found = re.match(r"\s*\\", str[i:endpos]) is None
  475. if not found:
  476. # oh well ... settle for moving beyond the first chunk
  477. # of non-whitespace chars
  478. i = startpos
  479. while str[i] not in " \t\n":
  480. i = i+1
  481. return len(str[self.stmt_start:i].expandtabs(\
  482. self.tabwidth)) + 1
  483. # Return the leading whitespace on the initial line of the last
  484. # interesting stmt.
  485. def get_base_indent_string(self):
  486. self._study2()
  487. i, n = self.stmt_start, self.stmt_end
  488. j = i
  489. str = self.str
  490. while j < n and str[j] in " \t":
  491. j = j + 1
  492. return str[i:j]
  493. # Did the last interesting stmt open a block?
  494. def is_block_opener(self):
  495. self._study2()
  496. return self.lastch == ':'
  497. # Did the last interesting stmt close a block?
  498. def is_block_closer(self):
  499. self._study2()
  500. return _closere(self.str, self.stmt_start) is not None
  501. # index of last open bracket ({[, or None if none
  502. lastopenbracketpos = None
  503. def get_last_open_bracket_pos(self):
  504. self._study2()
  505. return self.lastopenbracketpos
  506. # the structure of the bracketing of the last interesting statement,
  507. # in the format defined in _study2, or None if the text didn't contain
  508. # anything
  509. stmt_bracketing = None
  510. def get_last_stmt_bracketing(self):
  511. self._study2()
  512. return self.stmt_bracketing