PageRenderTime 40ms CodeModel.GetById 15ms RepoModel.GetById 0ms app.codeStats 0ms

/distribution/libraries/Babel-1.0dev-py3.2/babel/messages/jslexer.py

https://github.com/tictactatic/Superdesk
Python | 176 lines | 131 code | 19 blank | 26 comment | 14 complexity | ef3f52c5457b6bcc4dc5cd48a22d3822 MD5 | raw file
Possible License(s): BSD-3-Clause, GPL-3.0, GPL-2.0
  1. # -*- coding: utf-8 -*-
  2. #
  3. # Copyright (C) 2008-2011 Edgewall Software
  4. # All rights reserved.
  5. #
  6. # This software is licensed as described in the file COPYING, which
  7. # you should have received as part of this distribution. The terms
  8. # are also available at http://babel.edgewall.org/wiki/License.
  9. #
  10. # This software consists of voluntary contributions made by many
  11. # individuals. For the exact contribution history, see the revision
  12. # history and logs, available at http://babel.edgewall.org/log/.
  13. """A simple JavaScript 1.5 lexer which is used for the JavaScript
  14. extractor.
  15. """
  16. from operator import itemgetter
  17. import re
  18. from babel.compat import unichr, u
  19. operators = [
  20. '+', '-', '*', '%', '!=', '==', '<', '>', '<=', '>=', '=',
  21. '+=', '-=', '*=', '%=', '<<', '>>', '>>>', '<<=', '>>=',
  22. '>>>=', '&', '&=', '|', '|=', '&&', '||', '^', '^=', '(', ')',
  23. '[', ']', '{', '}', '!', '--', '++', '~', ',', ';', '.', ':'
  24. ]
  25. operators.sort(key=lambda a: -len(a))
  26. escapes = {'b': '\b', 'f': '\f', 'n': '\n', 'r': '\r', 't': '\t'}
  27. rules = [
  28. (None, re.compile(r'\s+(?u)')),
  29. (None, re.compile(r'<!--.*')),
  30. ('linecomment', re.compile(r'//.*')),
  31. ('multilinecomment', re.compile(r'/\*.*?\*/(?us)')),
  32. ('name', re.compile(r'(\$+\w*|[^\W\d]\w*)(?u)')),
  33. ('number', re.compile(r'''(?x)(
  34. (?:0|[1-9]\d*)
  35. (\.\d+)?
  36. ([eE][-+]?\d+)? |
  37. (0x[a-fA-F0-9]+)
  38. )''')),
  39. ('operator', re.compile(r'(%s)' % '|'.join(map(re.escape, operators)))),
  40. ('string', re.compile(r'''(?xs)(
  41. '(?:[^'\\]*(?:\\.[^'\\]*)*)' |
  42. "(?:[^"\\]*(?:\\.[^"\\]*)*)"
  43. )'''))
  44. ]
  45. division_re = re.compile(r'/=?')
  46. regex_re = re.compile(r'/(?:[^/\\]*(?:\\.[^/\\]*)*)/[a-zA-Z]*(?s)')
  47. line_re = re.compile(r'(\r\n|\n|\r)')
  48. line_join_re = re.compile(r'\\' + line_re.pattern)
  49. uni_escape_re = re.compile(r'[a-fA-F0-9]{1,4}')
  50. class Token(tuple):
  51. """Represents a token as returned by `tokenize`."""
  52. __slots__ = ()
  53. def __new__(cls, type, value, lineno):
  54. return tuple.__new__(cls, (type, value, lineno))
  55. type = property(itemgetter(0))
  56. value = property(itemgetter(1))
  57. lineno = property(itemgetter(2))
  58. def indicates_division(token):
  59. """A helper function that helps the tokenizer to decide if the current
  60. token may be followed by a division operator.
  61. """
  62. if token.type == 'operator':
  63. return token.value in (')', ']', '}', '++', '--')
  64. return token.type in ('name', 'number', 'string', 'regexp')
  65. def unquote_string(string):
  66. """Unquote a string with JavaScript rules. The string has to start with
  67. string delimiters (``'`` or ``"``.)
  68. :return: a string
  69. """
  70. assert string and string[0] == string[-1] and string[0] in '"\'', \
  71. 'string provided is not properly delimited'
  72. string = line_join_re.sub('\\1', string[1:-1])
  73. result = []
  74. add = result.append
  75. pos = 0
  76. while 1:
  77. # scan for the next escape
  78. escape_pos = string.find('\\', pos)
  79. if escape_pos < 0:
  80. break
  81. add(string[pos:escape_pos])
  82. # check which character is escaped
  83. next_char = string[escape_pos + 1]
  84. if next_char in escapes:
  85. add(escapes[next_char])
  86. # unicode escapes. trie to consume up to four characters of
  87. # hexadecimal characters and try to interpret them as unicode
  88. # character point. If there is no such character point, put
  89. # all the consumed characters into the string.
  90. elif next_char in 'uU':
  91. escaped = uni_escape_re.match(string, escape_pos + 2)
  92. if escaped is not None:
  93. escaped_value = escaped.group()
  94. if len(escaped_value) == 4:
  95. try:
  96. add(unichr(int(escaped_value, 16)))
  97. except ValueError:
  98. pass
  99. else:
  100. pos = escape_pos + 6
  101. continue
  102. add(next_char + escaped_value)
  103. pos = escaped.end()
  104. continue
  105. else:
  106. add(next_char)
  107. # bogus escape. Just remove the backslash.
  108. else:
  109. add(next_char)
  110. pos = escape_pos + 2
  111. if pos < len(string):
  112. add(string[pos:])
  113. return u('').join(result)
  114. def tokenize(source):
  115. """Tokenize a JavaScript source.
  116. :return: generator of `Token`\s
  117. """
  118. may_divide = False
  119. pos = 0
  120. lineno = 1
  121. end = len(source)
  122. while pos < end:
  123. # handle regular rules first
  124. for token_type, rule in rules:
  125. match = rule.match(source, pos)
  126. if match is not None:
  127. break
  128. # if we don't have a match we don't give up yet, but check for
  129. # division operators or regular expression literals, based on
  130. # the status of `may_divide` which is determined by the last
  131. # processed non-whitespace token using `indicates_division`.
  132. else:
  133. if may_divide:
  134. match = division_re.match(source, pos)
  135. token_type = 'operator'
  136. else:
  137. match = regex_re.match(source, pos)
  138. token_type = 'regexp'
  139. if match is None:
  140. # woops. invalid syntax. jump one char ahead and try again.
  141. pos += 1
  142. continue
  143. token_value = match.group()
  144. if token_type is not None:
  145. token = Token(token_type, token_value, lineno)
  146. may_divide = indicates_division(token)
  147. yield token
  148. lineno += len(line_re.findall(token_value))
  149. pos = match.end()