sphinx/pycode/__init__.py

Source: https://bitbucket.org/birkenfeld/sphinx/
Python | 350 lines | 258 code | 27 blank | 65 comment
Possible License(s): BSD-2-Clause
  1. # -*- coding: utf-8 -*-
  2. """
  3. sphinx.pycode
  4. ~~~~~~~~~~~~~
  5. Utilities parsing and analyzing Python code.
  6. :copyright: Copyright 2007-2014 by the Sphinx team, see AUTHORS.
  7. :license: BSD, see LICENSE for details.
  8. """
  9. from __future__ import print_function
  10. import sys
  11. from os import path
  12. from six import iteritems, text_type, BytesIO, StringIO
  13. from sphinx import package_dir
  14. from sphinx.errors import PycodeError
  15. from sphinx.pycode import nodes
  16. from sphinx.pycode.pgen2 import driver, token, tokenize, parse, literals
  17. from sphinx.util import get_module_source, detect_encoding
  18. from sphinx.util.pycompat import TextIOWrapper
  19. from sphinx.util.docstrings import prepare_docstring, prepare_commentdoc
# load the Python grammar matching the major version of the running
# interpreter (Grammar-py2.txt or Grammar-py3.txt, shipped with Sphinx)
_grammarfile = path.join(package_dir, 'pycode',
                         'Grammar-py%d.txt' % sys.version_info[0])
pygrammar = driver.load_grammar(_grammarfile)
# driver that turns a token stream into a pycode.nodes parse tree
pydriver = driver.Driver(pygrammar, convert=nodes.convert)

# an object with attributes corresponding to token and symbol names
class sym: pass
for k, v in iteritems(pygrammar.symbol2number):
    setattr(sym, k, v)
for k, v in iteritems(token.tok_name):
    # tok_name maps number -> name, so assign the other way round
    setattr(sym, v, k)

# a dict mapping terminal and nonterminal numbers to their names
number2name = pygrammar.number2symbol.copy()
number2name.update(token.tok_name)

# leaf node used to recognize a plain '=' assignment in expr_stmt children
_eq = nodes.Leaf(token.EQUAL, '=')
class AttrDocVisitor(nodes.NodeVisitor):
    """
    Visitor that collects docstrings for attribute assignments on toplevel and
    in classes (class attributes and attributes set in __init__).

    The docstrings can either be in special '#:' comments before the assignment
    or in a docstring after it.
    """
    def init(self, scope, encoding):
        # called by NodeVisitor.__init__ with the extra constructor args;
        # NodeVisitor itself stores number2name on self (see find_attr_docs)
        self.scope = scope
        self.in_init = 0          # nesting depth of __init__ methods
        self.encoding = encoding  # source encoding for decoding byte prefixes
        self.namespace = []       # current dotted class namespace
        self.collected = {}       # (namespace, attrname) -> docstring lines
        self.tagnumber = 0        # running counter for definition order
        self.tagorder = {}        # fullname -> order of appearance

    def add_tag(self, name):
        # record the definition order of a (possibly nested) name
        name = '.'.join(self.namespace + [name])
        self.tagorder[name] = self.tagnumber
        self.tagnumber += 1

    def visit_classdef(self, node):
        """Visit a class."""
        # node[1] is the NAME token following the 'class' keyword
        self.add_tag(node[1].value)
        self.namespace.append(node[1].value)
        self.generic_visit(node)
        self.namespace.pop()

    def visit_funcdef(self, node):
        """Visit a function (or method)."""
        # usually, don't descend into functions -- nothing interesting there
        self.add_tag(node[1].value)
        if node[1].value == '__init__':
            # however, collect attributes set in __init__ methods
            self.in_init += 1
            self.generic_visit(node)
            self.in_init -= 1

    def visit_expr_stmt(self, node):
        """Visit an assignment which may have a special comment before (or
        after) it.
        """
        if _eq not in node.children:
            # not an assignment (we don't care for augmented assignments)
            return
        # look *after* the node; there may be a comment prefixing the NEWLINE
        # of the simple_stmt
        parent = node.parent
        idx = parent.children.index(node) + 1
        while idx < len(parent):
            if parent[idx].type == sym.SEMI:
                idx += 1
                continue  # skip over semicolon
            if parent[idx].type == sym.NEWLINE:
                # the comment text (if any) lives in the NEWLINE's prefix
                prefix = parent[idx].get_prefix()
                if not isinstance(prefix, text_type):
                    prefix = prefix.decode(self.encoding)
                docstring = prepare_commentdoc(prefix)
                if docstring:
                    self.add_docstring(node, docstring)
                    return  # don't allow docstrings both before and after
            break
        # now look *before* the node
        pnode = node[0]
        prefix = pnode.get_prefix()
        # if the assignment is the first statement on a new indentation
        # level, its preceding whitespace and comments are not assigned
        # to that token, but the first INDENT or DEDENT token
        while not prefix:
            pnode = pnode.get_prev_leaf()
            if not pnode or pnode.type not in (token.INDENT, token.DEDENT):
                break
            prefix = pnode.get_prefix()
        if not isinstance(prefix, text_type):
            prefix = prefix.decode(self.encoding)
        docstring = prepare_commentdoc(prefix)
        self.add_docstring(node, docstring)

    def visit_simple_stmt(self, node):
        """Visit a docstring statement which may have an assignment before."""
        if node[0].type != token.STRING:
            # not a docstring; but still need to visit children
            return self.generic_visit(node)
        prev = node.get_prev_sibling()
        if not prev:
            return
        if prev.type == sym.simple_stmt and \
           prev[0].type == sym.expr_stmt and _eq in prev[0].children:
            # need to "eval" the string because it's returned in its
            # original form
            docstring = literals.evalString(node[0].value, self.encoding)
            docstring = prepare_docstring(docstring)
            self.add_docstring(prev[0], docstring)

    def add_docstring(self, node, docstring):
        """Record *docstring* for every assignment target of *node*."""
        # add an item for each assignment target
        # (children alternate target, '=', target, '=', ..., value)
        for i in range(0, len(node) - 1, 2):
            target = node[i]
            if self.in_init and self.number2name[target.type] == 'power':
                # maybe an attribute assignment -- check necessary conditions
                if (# node must have two children
                    len(target) != 2 or
                    # first child must be "self"
                    target[0].type != token.NAME or target[0].value != 'self' or
                    # second child must be a "trailer" with two children
                    self.number2name[target[1].type] != 'trailer' or
                    len(target[1]) != 2 or
                    # first child must be a dot, second child a name
                    target[1][0].type != token.DOT or
                    target[1][1].type != token.NAME):
                    continue
                name = target[1][1].value
            elif target.type != token.NAME:
                # don't care about other complex targets
                continue
            else:
                name = target.value
            self.add_tag(name)
            if docstring:
                namespace = '.'.join(self.namespace)
                if namespace.startswith(self.scope):
                    self.collected[namespace, name] = docstring
class ModuleAnalyzer(object):
    """Analyze a Python module: tokenize, parse, find attribute docs/tags."""

    # cache for analyzer objects -- caches both by module and file name
    cache = {}

    @classmethod
    def for_string(cls, string, modname, srcname='<string>'):
        """Create an analyzer for in-memory source (bytes or text)."""
        if isinstance(string, bytes):
            return cls(BytesIO(string), modname, srcname)
        return cls(StringIO(string), modname, srcname, decoded=True)

    @classmethod
    def for_file(cls, filename, modname):
        """Create (or fetch a cached) analyzer for a source file on disk."""
        if ('file', filename) in cls.cache:
            return cls.cache['file', filename]
        try:
            fileobj = open(filename, 'rb')
        except Exception as err:
            raise PycodeError('error opening %r' % filename, err)
        obj = cls(fileobj, modname, filename)
        cls.cache['file', filename] = obj
        return obj

    @classmethod
    def for_module(cls, modname):
        """Create (or fetch a cached) analyzer for an importable module.

        Failures are cached too, so repeated lookups of a broken module
        re-raise the original PycodeError instead of retrying.
        """
        if ('module', modname) in cls.cache:
            entry = cls.cache['module', modname]
            if isinstance(entry, PycodeError):
                raise entry
            return entry

        try:
            type, source = get_module_source(modname)
            if type == 'string':
                obj = cls.for_string(source, modname)
            else:
                obj = cls.for_file(source, modname)
        except PycodeError as err:
            cls.cache['module', modname] = err
            raise
        cls.cache['module', modname] = obj
        return obj

    def __init__(self, source, modname, srcname, decoded=False):
        # name of the module
        self.modname = modname
        # name of the source file
        self.srcname = srcname
        # file-like object yielding source lines
        self.source = source

        # cache the source code as well
        pos = self.source.tell()
        if not decoded:
            # byte stream: sniff the coding cookie, cache the decoded text,
            # then rewind and wrap so readline() yields text lines
            self.encoding = detect_encoding(self.source.readline)
            self.source.seek(pos)
            self.code = self.source.read().decode(self.encoding)
            self.source.seek(pos)
            self.source = TextIOWrapper(self.source, self.encoding)
        else:
            # already-decoded text stream
            self.encoding = None
            self.code = self.source.read()
            self.source.seek(pos)

        # will be filled by tokenize()
        self.tokens = None
        # will be filled by parse()
        self.parsetree = None
        # will be filled by find_attr_docs()
        self.attr_docs = None
        self.tagorder = None
        # will be filled by find_tags()
        self.tags = None

    def tokenize(self):
        """Generate tokens from the source."""
        if self.tokens is not None:
            return
        try:
            self.tokens = list(tokenize.generate_tokens(self.source.readline))
        except tokenize.TokenError as err:
            raise PycodeError('tokenizing failed', err)
        # source stream fully consumed; further work uses self.tokens
        self.source.close()

    def parse(self):
        """Parse the generated source tokens."""
        if self.parsetree is not None:
            return
        self.tokenize()
        try:
            self.parsetree = pydriver.parse_tokens(self.tokens)
        except parse.ParseError as err:
            raise PycodeError('parsing failed', err)

    def find_attr_docs(self, scope=''):
        """Find class and module-level attributes and their documentation."""
        if self.attr_docs is not None:
            return self.attr_docs
        self.parse()
        attr_visitor = AttrDocVisitor(number2name, scope, self.encoding)
        attr_visitor.visit(self.parsetree)
        self.attr_docs = attr_visitor.collected
        self.tagorder = attr_visitor.tagorder
        # now that we found everything we could in the tree, throw it away
        # (it takes quite a bit of memory for large modules)
        self.parsetree = None
        return attr_visitor.collected

    def find_tags(self):
        """Find class, function and method definitions and their location."""
        if self.tags is not None:
            return self.tags
        self.tokenize()
        result = {}      # fullname -> (def/class, startline, endline)
        namespace = []   # current dotted namespace during the scan
        stack = []       # open def/class blocks: (type, fullname, line, indent)
        indent = 0
        defline = False        # current logical line contains a def/class
        expect_indent = False  # next token should be the suite's INDENT
        def tokeniter(ignore = (token.COMMENT, token.NL)):
            # yield tokens with comments and non-logical newlines filtered out
            for tokentup in self.tokens:
                if tokentup[0] not in ignore:
                    yield tokentup
        tokeniter = tokeniter()
        for type, tok, spos, epos, line in tokeniter:
            if expect_indent:
                if type != token.INDENT:
                    # no suite -- one-line definition
                    assert stack
                    dtype, fullname, startline, _ = stack.pop()
                    endline = epos[0]
                    namespace.pop()
                    result[fullname] = (dtype, startline, endline)
                expect_indent = False
            if tok in ('def', 'class'):
                # the NAME token immediately after the keyword
                name = next(tokeniter)[1]
                namespace.append(name)
                fullname = '.'.join(namespace)
                stack.append((tok, fullname, spos[0], indent))
                defline = True
            elif type == token.INDENT:
                expect_indent = False
                indent += 1
            elif type == token.DEDENT:
                indent -= 1
                # if the stacklevel is the same as it was before the last
                # def/class block, this dedent closes that block
                if stack and indent == stack[-1][3]:
                    dtype, fullname, startline, _ = stack.pop()
                    endline = spos[0]
                    namespace.pop()
                    result[fullname] = (dtype, startline, endline)
            elif type == token.NEWLINE:
                # if this line contained a definition, expect an INDENT
                # to start the suite; if there is no such INDENT
                # it's a one-line definition
                if defline:
                    defline = False
                    expect_indent = True
        self.tags = result
        return result
  300. if __name__ == '__main__':
  301. import time, pprint
  302. x0 = time.time()
  303. #ma = ModuleAnalyzer.for_file(__file__.rstrip('c'), 'sphinx.builders.html')
  304. ma = ModuleAnalyzer.for_file('sphinx/environment.py',
  305. 'sphinx.environment')
  306. ma.tokenize()
  307. x1 = time.time()
  308. ma.parse()
  309. x2 = time.time()
  310. #for (ns, name), doc in iteritems(ma.find_attr_docs()):
  311. # print '>>', ns, name
  312. # print '\n'.join(doc)
  313. pprint.pprint(ma.find_tags())
  314. x3 = time.time()
  315. #print nodes.nice_repr(ma.parsetree, number2name)
  316. print("tokenizing %.4f, parsing %.4f, finding %.4f" % (x1-x0, x2-x1, x3-x2))