PageRenderTime 54ms CodeModel.GetById 22ms RepoModel.GetById 0ms app.codeStats 0ms

/closure_linter/javascripttokenizer.py

http://closure-linter.googlecode.com/
Python | 363 lines | 327 code | 11 blank | 25 comment | 0 complexity | aefa02d8b351b10417b3cce5cd44919e MD5 | raw file
Possible License(s): Apache-2.0
  1. #!/usr/bin/env python
  2. #
  3. # Copyright 2007 The Closure Linter Authors. All Rights Reserved.
  4. #
  5. # Licensed under the Apache License, Version 2.0 (the "License");
  6. # you may not use this file except in compliance with the License.
  7. # You may obtain a copy of the License at
  8. #
  9. # http://www.apache.org/licenses/LICENSE-2.0
  10. #
  11. # Unless required by applicable law or agreed to in writing, software
  12. # distributed under the License is distributed on an "AS-IS" BASIS,
  13. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. # See the License for the specific language governing permissions and
  15. # limitations under the License.
"""Regular expression based JavaScript parsing classes.

Defines JavaScriptModes, the names of the matcher modes, and
JavaScriptTokenizer, a tokenizer.Tokenizer subclass configured with the
regular expression matchers used to split JavaScript source into tokens.
"""

__author__ = ('robbyw@google.com (Robert Walker)',
              'ajp@google.com (Andy Perelson)')
  19. import copy
  20. import re
  21. from closure_linter import javascripttokens
  22. from closure_linter.common import matcher
  23. from closure_linter.common import tokenizer
# Shorthand aliases for the token type enumeration and the Matcher class, both
# of which are referenced repeatedly in the matcher tables below.
Type = javascripttokens.JavaScriptTokenType
Matcher = matcher.Matcher
  27. class JavaScriptModes(object):
  28. """Enumeration of the different matcher modes used for JavaScript."""
  29. TEXT_MODE = 'text'
  30. SINGLE_QUOTE_STRING_MODE = 'single_quote_string'
  31. DOUBLE_QUOTE_STRING_MODE = 'double_quote_string'
  32. BLOCK_COMMENT_MODE = 'block_comment'
  33. DOC_COMMENT_MODE = 'doc_comment'
  34. DOC_COMMENT_LEX_SPACES_MODE = 'doc_comment_spaces'
  35. LINE_COMMENT_MODE = 'line_comment'
  36. PARAMETER_MODE = 'parameter'
  37. FUNCTION_MODE = 'function'
  38. class JavaScriptTokenizer(tokenizer.Tokenizer):
  39. """JavaScript tokenizer.
  40. Convert JavaScript code in to an array of tokens.
  41. """
  42. # Useful patterns for JavaScript parsing.
  43. IDENTIFIER_CHAR = r'A-Za-z0-9_$.'
  44. # Number patterns based on:
  45. # http://www.mozilla.org/js/language/js20-2000-07/formal/lexer-grammar.html
  46. MANTISSA = r"""
  47. (\d+(?!\.)) | # Matches '10'
  48. (\d+\.(?!\d)) | # Matches '10.'
  49. (\d*\.\d+) # Matches '.5' or '10.5'
  50. """
  51. DECIMAL_LITERAL = r'(%s)([eE][-+]?\d+)?' % MANTISSA
  52. HEX_LITERAL = r'0[xX][0-9a-fA-F]+'
  53. NUMBER = re.compile(r"""
  54. ((%s)|(%s))
  55. """ % (HEX_LITERAL, DECIMAL_LITERAL), re.VERBOSE)
  56. # Strings come in three parts - first we match the start of the string, then
  57. # the contents, then the end. The contents consist of any character except a
  58. # backslash or end of string, or a backslash followed by any character, or a
  59. # backslash followed by end of line to support correct parsing of multi-line
  60. # strings.
  61. SINGLE_QUOTE = re.compile(r"'")
  62. SINGLE_QUOTE_TEXT = re.compile(r"([^'\\]|\\(.|$))+")
  63. DOUBLE_QUOTE = re.compile(r'"')
  64. DOUBLE_QUOTE_TEXT = re.compile(r'([^"\\]|\\(.|$))+')
  65. START_SINGLE_LINE_COMMENT = re.compile(r'//')
  66. END_OF_LINE_SINGLE_LINE_COMMENT = re.compile(r'//$')
  67. START_DOC_COMMENT = re.compile(r'/\*\*')
  68. START_BLOCK_COMMENT = re.compile(r'/\*')
  69. END_BLOCK_COMMENT = re.compile(r'\*/')
  70. BLOCK_COMMENT_TEXT = re.compile(r'([^*]|\*(?!/))+')
  71. # Comment text is anything that we are not going to parse into another special
  72. # token like (inline) flags or end comments. Complicated regex to match
  73. # most normal characters, and '*', '{', '}', and '@' when we are sure that
  74. # it is safe. Expression [^*{\s]@ must come first, or the other options will
  75. # match everything before @, and we won't match @'s that aren't part of flags
  76. # like in email addresses in the @author tag.
  77. DOC_COMMENT_TEXT = re.compile(r'([^*{}\s]@|[^*{}@]|\*(?!/))+')
  78. DOC_COMMENT_NO_SPACES_TEXT = re.compile(r'([^*{}\s]@|[^*{}@\s]|\*(?!/))+')
  79. # Match the prefix ' * ' that starts every line of jsdoc. Want to include
  80. # spaces after the '*', but nothing else that occurs after a '*', and don't
  81. # want to match the '*' in '*/'.
  82. DOC_PREFIX = re.compile(r'\s*\*(\s+|(?!/))')
  83. START_BLOCK = re.compile('{')
  84. END_BLOCK = re.compile('}')
  85. REGEX_CHARACTER_CLASS = r"""
  86. \[ # Opening bracket
  87. ([^\]\\]|\\.)* # Anything but a ] or \,
  88. # or a backslash followed by anything
  89. \] # Closing bracket
  90. """
  91. # We ensure the regex is followed by one of the above tokens to avoid
  92. # incorrectly parsing something like x / y / z as x REGEX(/ y /) z
  93. POST_REGEX_LIST = [
  94. ';', ',', r'\.', r'\)', r'\]', '$', r'\/\/', r'\/\*', ':', '}']
  95. REGEX = re.compile(r"""
  96. / # opening slash
  97. (?!\*) # not the start of a comment
  98. (\\.|[^\[\/\\]|(%s))* # a backslash followed by anything,
  99. # or anything but a / or [ or \,
  100. # or a character class
  101. / # closing slash
  102. [gimsx]* # optional modifiers
  103. (?=\s*(%s))
  104. """ % (REGEX_CHARACTER_CLASS, '|'.join(POST_REGEX_LIST)),
  105. re.VERBOSE)
  106. ANYTHING = re.compile(r'.*')
  107. PARAMETERS = re.compile(r'[^\)]+')
  108. CLOSING_PAREN_WITH_SPACE = re.compile(r'\)\s*')
  109. FUNCTION_DECLARATION = re.compile(r'\bfunction\b')
  110. OPENING_PAREN = re.compile(r'\(')
  111. CLOSING_PAREN = re.compile(r'\)')
  112. OPENING_BRACKET = re.compile(r'\[')
  113. CLOSING_BRACKET = re.compile(r'\]')
  114. # We omit these JS keywords from the list:
  115. # function - covered by FUNCTION_DECLARATION.
  116. # delete, in, instanceof, new, typeof - included as operators.
  117. # this - included in identifiers.
  118. # null, undefined - not included, should go in some "special constant" list.
  119. KEYWORD_LIST = ['break', 'case', 'catch', 'continue', 'default', 'do', 'else',
  120. 'finally', 'for', 'if', 'return', 'switch', 'throw', 'try', 'var',
  121. 'while', 'with']
  122. # Match a keyword string followed by a non-identifier character in order to
  123. # not match something like doSomething as do + Something.
  124. KEYWORD = re.compile('(%s)((?=[^%s])|$)' % (
  125. '|'.join(KEYWORD_LIST), IDENTIFIER_CHAR))
  126. # List of regular expressions to match as operators. Some notes: for our
  127. # purposes, the comma behaves similarly enough to a normal operator that we
  128. # include it here. r'\bin\b' actually matches 'in' surrounded by boundary
  129. # characters - this may not match some very esoteric uses of the in operator.
  130. # Operators that are subsets of larger operators must come later in this list
  131. # for proper matching, e.g., '>>' must come AFTER '>>>'.
  132. OPERATOR_LIST = [',', r'\+\+', '===', '!==', '>>>=', '>>>', '==', '>=', '<=',
  133. '!=', '<<=', '>>=', '<<', '>>', '>', '<', r'\+=', r'\+',
  134. '--', '\^=', '-=', '-', '/=', '/', r'\*=', r'\*', '%=', '%',
  135. '&&', r'\|\|', '&=', '&', r'\|=', r'\|', '=', '!', ':', '\?',
  136. r'\^', r'\bdelete\b', r'\bin\b', r'\binstanceof\b',
  137. r'\bnew\b', r'\btypeof\b', r'\bvoid\b']
  138. OPERATOR = re.compile('|'.join(OPERATOR_LIST))
  139. WHITESPACE = re.compile(r'\s+')
  140. SEMICOLON = re.compile(r';')
  141. # Technically JavaScript identifiers can't contain '.', but we treat a set of
  142. # nested identifiers as a single identifier.
  143. NESTED_IDENTIFIER = r'[a-zA-Z_$][%s.]*' % IDENTIFIER_CHAR
  144. IDENTIFIER = re.compile(NESTED_IDENTIFIER)
  145. SIMPLE_LVALUE = re.compile(r"""
  146. (?P<identifier>%s) # a valid identifier
  147. (?=\s* # optional whitespace
  148. \= # look ahead to equal sign
  149. (?!=)) # not follwed by equal
  150. """ % NESTED_IDENTIFIER, re.VERBOSE)
  151. # A doc flag is a @ sign followed by non-space characters that appears at the
  152. # beginning of the line, after whitespace, or after a '{'. The look-behind
  153. # check is necessary to not match someone@google.com as a flag.
  154. DOC_FLAG = re.compile(r'(^|(?<=\s))@(?P<name>[a-zA-Z]+)')
  155. # To properly parse parameter names, we need to tokenize whitespace into a
  156. # token.
  157. DOC_FLAG_LEX_SPACES = re.compile(r'(^|(?<=\s))@(?P<name>%s)\b' %
  158. '|'.join(['param']))
  159. DOC_INLINE_FLAG = re.compile(r'(?<={)@(?P<name>[a-zA-Z]+)')
  160. # Star followed by non-slash, i.e a star that does not end a comment.
  161. # This is used for TYPE_GROUP below.
  162. SAFE_STAR = r'(\*(?!/))'
  163. COMMON_DOC_MATCHERS = [
  164. # Find the end of the comment.
  165. Matcher(END_BLOCK_COMMENT, Type.END_DOC_COMMENT,
  166. JavaScriptModes.TEXT_MODE),
  167. # Tokenize documented flags like @private.
  168. Matcher(DOC_INLINE_FLAG, Type.DOC_INLINE_FLAG),
  169. Matcher(DOC_FLAG_LEX_SPACES, Type.DOC_FLAG,
  170. JavaScriptModes.DOC_COMMENT_LEX_SPACES_MODE),
  171. # Encountering a doc flag should leave lex spaces mode.
  172. Matcher(DOC_FLAG, Type.DOC_FLAG, JavaScriptModes.DOC_COMMENT_MODE),
  173. # Tokenize braces so we can find types.
  174. Matcher(START_BLOCK, Type.DOC_START_BRACE),
  175. Matcher(END_BLOCK, Type.DOC_END_BRACE),
  176. Matcher(DOC_PREFIX, Type.DOC_PREFIX, None, True)]
  177. # The token matcher groups work as follows: it is an list of Matcher objects.
  178. # The matchers will be tried in this order, and the first to match will be
  179. # returned. Hence the order is important because the matchers that come first
  180. # overrule the matchers that come later.
  181. JAVASCRIPT_MATCHERS = {
  182. # Matchers for basic text mode.
  183. JavaScriptModes.TEXT_MODE: [
  184. # Check a big group - strings, starting comments, and regexes - all
  185. # of which could be intertwined. 'string with /regex/',
  186. # /regex with 'string'/, /* comment with /regex/ and string */ (and so
  187. # on)
  188. Matcher(START_DOC_COMMENT, Type.START_DOC_COMMENT,
  189. JavaScriptModes.DOC_COMMENT_MODE),
  190. Matcher(START_BLOCK_COMMENT, Type.START_BLOCK_COMMENT,
  191. JavaScriptModes.BLOCK_COMMENT_MODE),
  192. Matcher(END_OF_LINE_SINGLE_LINE_COMMENT,
  193. Type.START_SINGLE_LINE_COMMENT),
  194. Matcher(START_SINGLE_LINE_COMMENT, Type.START_SINGLE_LINE_COMMENT,
  195. JavaScriptModes.LINE_COMMENT_MODE),
  196. Matcher(SINGLE_QUOTE, Type.SINGLE_QUOTE_STRING_START,
  197. JavaScriptModes.SINGLE_QUOTE_STRING_MODE),
  198. Matcher(DOUBLE_QUOTE, Type.DOUBLE_QUOTE_STRING_START,
  199. JavaScriptModes.DOUBLE_QUOTE_STRING_MODE),
  200. Matcher(REGEX, Type.REGEX),
  201. # Next we check for start blocks appearing outside any of the items
  202. # above.
  203. Matcher(START_BLOCK, Type.START_BLOCK),
  204. Matcher(END_BLOCK, Type.END_BLOCK),
  205. # Then we search for function declarations.
  206. Matcher(FUNCTION_DECLARATION, Type.FUNCTION_DECLARATION,
  207. JavaScriptModes.FUNCTION_MODE),
  208. # Next, we convert non-function related parens to tokens.
  209. Matcher(OPENING_PAREN, Type.START_PAREN),
  210. Matcher(CLOSING_PAREN, Type.END_PAREN),
  211. # Next, we convert brackets to tokens.
  212. Matcher(OPENING_BRACKET, Type.START_BRACKET),
  213. Matcher(CLOSING_BRACKET, Type.END_BRACKET),
  214. # Find numbers. This has to happen before operators because scientific
  215. # notation numbers can have + and - in them.
  216. Matcher(NUMBER, Type.NUMBER),
  217. # Find operators and simple assignments
  218. Matcher(SIMPLE_LVALUE, Type.SIMPLE_LVALUE),
  219. Matcher(OPERATOR, Type.OPERATOR),
  220. # Find key words and whitespace.
  221. Matcher(KEYWORD, Type.KEYWORD),
  222. Matcher(WHITESPACE, Type.WHITESPACE),
  223. # Find identifiers.
  224. Matcher(IDENTIFIER, Type.IDENTIFIER),
  225. # Finally, we convert semicolons to tokens.
  226. Matcher(SEMICOLON, Type.SEMICOLON)],
  227. # Matchers for single quote strings.
  228. JavaScriptModes.SINGLE_QUOTE_STRING_MODE: [
  229. Matcher(SINGLE_QUOTE_TEXT, Type.STRING_TEXT),
  230. Matcher(SINGLE_QUOTE, Type.SINGLE_QUOTE_STRING_END,
  231. JavaScriptModes.TEXT_MODE)],
  232. # Matchers for double quote strings.
  233. JavaScriptModes.DOUBLE_QUOTE_STRING_MODE: [
  234. Matcher(DOUBLE_QUOTE_TEXT, Type.STRING_TEXT),
  235. Matcher(DOUBLE_QUOTE, Type.DOUBLE_QUOTE_STRING_END,
  236. JavaScriptModes.TEXT_MODE)],
  237. # Matchers for block comments.
  238. JavaScriptModes.BLOCK_COMMENT_MODE: [
  239. # First we check for exiting a block comment.
  240. Matcher(END_BLOCK_COMMENT, Type.END_BLOCK_COMMENT,
  241. JavaScriptModes.TEXT_MODE),
  242. # Match non-comment-ending text..
  243. Matcher(BLOCK_COMMENT_TEXT, Type.COMMENT)],
  244. # Matchers for doc comments.
  245. JavaScriptModes.DOC_COMMENT_MODE: COMMON_DOC_MATCHERS + [
  246. Matcher(DOC_COMMENT_TEXT, Type.COMMENT)],
  247. JavaScriptModes.DOC_COMMENT_LEX_SPACES_MODE: COMMON_DOC_MATCHERS + [
  248. Matcher(WHITESPACE, Type.COMMENT),
  249. Matcher(DOC_COMMENT_NO_SPACES_TEXT, Type.COMMENT)],
  250. # Matchers for single line comments.
  251. JavaScriptModes.LINE_COMMENT_MODE: [
  252. # We greedy match until the end of the line in line comment mode.
  253. Matcher(ANYTHING, Type.COMMENT, JavaScriptModes.TEXT_MODE)],
  254. # Matchers for code after the function keyword.
  255. JavaScriptModes.FUNCTION_MODE: [
  256. # Must match open paren before anything else and move into parameter
  257. # mode, otherwise everything inside the parameter list is parsed
  258. # incorrectly.
  259. Matcher(OPENING_PAREN, Type.START_PARAMETERS,
  260. JavaScriptModes.PARAMETER_MODE),
  261. Matcher(WHITESPACE, Type.WHITESPACE),
  262. Matcher(IDENTIFIER, Type.FUNCTION_NAME)],
  263. # Matchers for function parameters
  264. JavaScriptModes.PARAMETER_MODE: [
  265. # When in function parameter mode, a closing paren is treated specially.
  266. # Everything else is treated as lines of parameters.
  267. Matcher(CLOSING_PAREN_WITH_SPACE, Type.END_PARAMETERS,
  268. JavaScriptModes.TEXT_MODE),
  269. Matcher(PARAMETERS, Type.PARAMETERS, JavaScriptModes.PARAMETER_MODE)]}
  270. # When text is not matched, it is given this default type based on mode.
  271. # If unspecified in this map, the default default is Type.NORMAL.
  272. JAVASCRIPT_DEFAULT_TYPES = {
  273. JavaScriptModes.DOC_COMMENT_MODE: Type.COMMENT,
  274. JavaScriptModes.DOC_COMMENT_LEX_SPACES_MODE: Type.COMMENT
  275. }
  276. def __init__(self, parse_js_doc = True):
  277. """Create a tokenizer object.
  278. Args:
  279. parse_js_doc: Whether to do detailed parsing of javascript doc comments,
  280. or simply treat them as normal comments. Defaults to parsing JsDoc.
  281. """
  282. matchers = self.JAVASCRIPT_MATCHERS
  283. if not parse_js_doc:
  284. # Make a copy so the original doesn't get modified.
  285. matchers = copy.deepcopy(matchers)
  286. matchers[JavaScriptModes.DOC_COMMENT_MODE] = matchers[
  287. JavaScriptModes.BLOCK_COMMENT_MODE]
  288. tokenizer.Tokenizer.__init__(self, JavaScriptModes.TEXT_MODE, matchers,
  289. self.JAVASCRIPT_DEFAULT_TYPES)
  290. def _CreateToken(self, string, token_type, line, line_number, values=None):
  291. """Creates a new JavaScriptToken object.
  292. Args:
  293. string: The string of input the token contains.
  294. token_type: The type of token.
  295. line: The text of the line this token is in.
  296. line_number: The line number of the token.
  297. values: A dict of named values within the token. For instance, a
  298. function declaration may have a value called 'name' which captures the
  299. name of the function.
  300. """
  301. return javascripttokens.JavaScriptToken(string, token_type, line,
  302. line_number, values, line_number)