PageRenderTime 60ms CodeModel.GetById 18ms RepoModel.GetById 0ms app.codeStats 0ms

/lib/extractors/JSLiteParser.py

http://raft.googlecode.com/
Python | 286 lines | 257 code | 9 blank | 20 comment | 25 complexity | 213f5e13ea702de78230f9d56226cf89 MD5 | raw file
Possible License(s): GPL-3.0
  1. #
  2. # Author: Gregory Fleischer (gfleischer@gmail.com)
  3. #
  4. # Copyright (c) 2011 RAFT Team
  5. #
  6. # This file is part of RAFT.
  7. #
  8. # RAFT is free software: you can redistribute it and/or modify
  9. # it under the terms of the GNU General Public License as published by
  10. # the Free Software Foundation, either version 3 of the License, or
  11. # (at your option) any later version.
  12. #
  13. # RAFT is distributed in the hope that it will be useful,
  14. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16. # GNU General Public License for more details.
  17. #
  18. # You should have received a copy of the GNU General Public License
  19. # along with RAFT. If not, see <http://www.gnu.org/licenses/>.
  20. #
  21. import re
  22. class JSLiteParser():
  23. S_BEGIN = 0
  24. S_QUOTE = 1
  25. S_REGEXP = 3
  26. S_COMMENT = 4
  27. S_LINE_COMMENT = 5
  28. S_SLASH = 6
  29. S_STAR = 7
  30. S_REGEXP_CC = 8
  31. def __init__(self):
  32. self._strings = []
  33. self._comments = []
  34. self.re_octal_digits = re.compile('[0-7]{3}')
  35. self.re_escape_string = re.compile(r'\\(?:[0-7]{3}|x[0-9a-fA-F]{2}|u[0-9a-fA-F]{4}|.)')
  36. self.re_identifier = re.compile(r'^[$_a-zA-Z0-9.]+$')
  37. self.re_keywords = re.compile(r'^(?:break|case|catch|const|continue|debugger|default|delete|do|else|enum|false|finally|for|function|if|in|instanceof|new|null|return|switch|this|throw|true|try|typeof|var|void|while|with)$')
  38. self.re_no_regex_start = re.compile(r'[\]]')
  39. self.re_space = re.compile(r'\s')
  40. def reset(self):
  41. self._strings = []
  42. self._comments = []
  43. def strings(self):
  44. return self._strings
  45. def comments(self):
  46. return self._comments
  47. def parse_inline(self, script, filename = '', lineno = 0):
  48. self.process(script)
  49. def parse(self, script, filename = '', lineno = 0):
  50. self.process(script)
  51. def parse_file(self, script, filename = '', lineno = 0):
  52. self.reset()
  53. self.process(script)
  54. def interpretEscape(self, match):
  55. m = match.group(0)[1:]
  56. if '0' == m:
  57. return '\0'
  58. elif 'b' == m:
  59. return '\b'
  60. elif 'f' == m:
  61. return '\f'
  62. elif 'n' == m:
  63. return '\n'
  64. elif 'r' == m:
  65. return '\r'
  66. elif 't' == m:
  67. return '\t'
  68. elif 'v' == m:
  69. return '\v'
  70. elif '\'' == m:
  71. return '\''
  72. elif '"' == m:
  73. return '"'
  74. elif '\\' == m:
  75. return '\\'
  76. else:
  77. try:
  78. i = None
  79. if 'u' == m[0] and 5 == len(m):
  80. i = int(m[1:], 16)
  81. elif 'x' == m[0] and 3 == len(m):
  82. i = int(m[1:], 16)
  83. elif 3 == len(m) and self.re_octal_digits.match(m):
  84. i = int(m, 8)
  85. if i is not None:
  86. if i < 128:
  87. return chr(i)
  88. else:
  89. return unichr(i)
  90. else:
  91. return m
  92. except ValueError:
  93. return m
  94. def parseString(self, value):
  95. if '\\' in value:
  96. try:
  97. return self.re_escape_string.sub(self.interpretEscape, value)
  98. except UnicodeDecodeError:
  99. print('oops', value)
  100. return value
  101. def process(self, script):
  102. state = self.S_BEGIN
  103. escape_next = False
  104. qchar = ''
  105. last_token = ''
  106. regex_paren_level = 0
  107. regex_use_heuristic = False
  108. pos = 0
  109. start_pos = pos
  110. s_len = len(script)
  111. rewind_pos = 0
  112. last_was_identifier = False
  113. while pos < s_len:
  114. c = script[pos]
  115. try:
  116. if escape_next:
  117. escape_next = False
  118. elif self.S_COMMENT == state:
  119. if '*' == c:
  120. state = self.S_STAR
  121. else:
  122. pass
  123. elif self.S_STAR == state:
  124. if '/' == c:
  125. self._comments.append(script[start_pos:pos+1])
  126. state = self.S_BEGIN
  127. start_pos = pos
  128. elif '*' == c:
  129. pass
  130. else:
  131. state = self.S_COMMENT
  132. elif '\n' == c:
  133. # newlines break everything except multiline-comment
  134. if self.S_LINE_COMMENT == state:
  135. self._comments.append(script[start_pos:pos])
  136. elif state in (self.S_REGEXP, self.S_REGEXP_CC):
  137. # invalid regex
  138. pos = rewind_pos
  139. if state != self.S_COMMENT:
  140. state = self.S_BEGIN
  141. start_pos = pos
  142. last_token = None
  143. elif self.S_QUOTE == state:
  144. if '\\' == c:
  145. escape_next = True
  146. elif qchar == c:
  147. self._strings.append(self.parseString(script[start_pos+1:pos]))
  148. state = self.S_BEGIN
  149. start_pos = pos
  150. else:
  151. pass
  152. elif self.S_REGEXP == state:
  153. if '\\' == c:
  154. escape_next = True
  155. elif '[' == c:
  156. state = self.S_REGEXP_CC
  157. elif ')' == c and 0 == regex_paren_level:
  158. # not valid
  159. pos = rewind_pos
  160. state = self.S_BEGIN
  161. start_pos = pos
  162. elif ';' == c and regex_use_heuristic and self.re_identifier.match(script[start_pos:pos].rstrip()):
  163. # probably not valid
  164. pos = rewind_pos
  165. state = self.S_BEGIN
  166. start_pos = pos
  167. elif '/' == c:
  168. # print('regex=',script[start_pos:pos])
  169. state = self.S_BEGIN
  170. start_pos = pos
  171. else:
  172. if '(' == c:
  173. regex_paren_level += 1
  174. elif ')' == c:
  175. regex_paren_level -= 1
  176. elif self.S_REGEXP_CC == state:
  177. if '\\' == c:
  178. escape_next = True
  179. elif ']' == c:
  180. state = self.S_REGEXP
  181. else:
  182. pass
  183. elif self.S_LINE_COMMENT == state:
  184. pass
  185. elif self.S_SLASH == state:
  186. if '*' == c:
  187. state = self.S_COMMENT
  188. elif '/' == c:
  189. state = self.S_LINE_COMMENT
  190. last_token = None
  191. else:
  192. if last_token:
  193. if self.re_identifier.match(last_token) and not self.re_keywords.match(last_token):
  194. is_re = False
  195. elif self.re_no_regex_start.match(last_token):
  196. is_re = False
  197. else:
  198. is_re = True
  199. else:
  200. is_re = True
  201. if is_re:
  202. if ')' == last_token:
  203. regex_use_heuristic = True
  204. else:
  205. regex_use_heuristic = False
  206. regex_paren_level = 0
  207. rewind_pos = pos
  208. if '[' == c:
  209. state = self.S_REGEXP_CC
  210. else:
  211. state = self.S_REGEXP
  212. if '\\' == c:
  213. escape_next = True
  214. elif '(' == c:
  215. regex_paren_level += 1
  216. start_pos = pos
  217. else:
  218. state = self.S_BEGIN
  219. elif self.S_BEGIN == state:
  220. if ';' == c:
  221. last_token = None
  222. start_pos = pos
  223. last_was_identifier = False
  224. elif '"' == c:
  225. state = self.S_QUOTE
  226. qchar = c
  227. start_pos = pos
  228. last_token = '__string__'
  229. last_was_identifier = False
  230. elif "'" == c:
  231. state = self.S_QUOTE
  232. qchar = c
  233. start_pos = pos
  234. last_token = '__string__'
  235. last_was_identifier = False
  236. elif '/' == c:
  237. state = self.S_SLASH
  238. this_token = script[start_pos:pos]
  239. if this_token:
  240. last_token = this_token
  241. start_pos = pos
  242. last_was_identifier = False
  243. elif self.re_identifier.match(c):
  244. if not last_was_identifier:
  245. this_token = script[start_pos:pos]
  246. if this_token:
  247. last_token = this_token
  248. start_pos = pos
  249. last_was_identifier = True
  250. else:
  251. this_token = script[start_pos:pos]
  252. if this_token:
  253. last_token = this_token
  254. start_pos = pos
  255. last_was_identifier = False
  256. else:
  257. raise Exception('unhandled state=' + state)
  258. except UnicodeEncodeError:
  259. # TODO: do something with char
  260. pass
  261. pos += 1
  262. if '__main__' == __name__:
  263. import sys
  264. parser = JSLiteParser()
  265. for a in sys.argv[1:]:
  266. script=open(a).read()
  267. parser.parse_file(script)
  268. print('\n'.join([s.encode('ascii', 'ignore') for s in parser.strings()]))
  269. print(parser.comments())