PageRenderTime 55ms CodeModel.GetById 23ms RepoModel.GetById 0ms app.codeStats 1ms

/src/python/snow/lexer/transformations.py

https://gitlab.com/oytunistrator/snowscript
Python | 353 lines | 330 code | 20 blank | 3 comment | 26 complexity | c681a5bfe20e4436ec2f1e99fe10c5ce MD5 | raw file
  1. """
  2. This file holds a series of transformations of the lexer tokens.
  3. """
  4. from ply import lex
  5. import re
  6. from error import raise_indentation_error, raise_syntax_error
  7. from tokens import MISSING_PARENTHESIS, CASTS
  8. def build_token(_type, value, t):
  9. t2 = lex.LexToken()
  10. t2.type = _type
  11. t2.value = value
  12. t2.lineno = t.lineno
  13. t2.lexpos = -1
  14. try:
  15. t2.lexer = t.lexer
  16. except AttributeError:
  17. pass
  18. return t2
  19. def t_error(t):
  20. "Error token."
  21. raise_syntax_error("invalid syntax", t)
  22. def trim_beginning_newlines(token_stream):
  23. still_trim = True
  24. for t in token_stream:
  25. if still_trim and t.type == 'NEWLINE':
  26. continue
  27. else:
  28. still_trim = False
  29. yield t
  30. def delete_multiple_newlines(token_stream):
  31. prev_is_nl = False
  32. for t in token_stream:
  33. is_nl = t.type == 'NEWLINE'
  34. if prev_is_nl and is_nl:
  35. continue
  36. prev_is_nl = is_nl
  37. yield t
  38. def inject_case_tokens(token_stream):
  39. inside_switch = False
  40. case_indent = 0
  41. for t in token_stream:
  42. yield t
  43. if inside_switch:
  44. if t.type == 'NEWLINE':
  45. t2 = token_stream.next()
  46. yield t2
  47. if t2.type == 'WS':
  48. indent = len(t2.value)
  49. if case_indent == 0:
  50. case_indent = indent
  51. yield build_token('CASE', 'case', t2)
  52. else:
  53. if indent == case_indent:
  54. yield build_token('CASE', 'case', t2)
  55. elif indent < case_indent:
  56. inside_switch = False
  57. case_indent = 0
  58. elif t2.type == "SWITCH":
  59. case_indent = 0
  60. else:
  61. inside_switch = False
  62. case_indent = 0
  63. if t.type == "SWITCH":
  64. inside_switch = True
  65. case_indent = 0
  66. INDENT_ERROR = "Dedention matches no previous level."
  67. def inject_indent_tokens(lexer, token_stream):
  68. levels = [0]
  69. try:
  70. for t in token_stream:
  71. lexer.at_line_start = False
  72. if t.type == "NEWLINE":
  73. yield t
  74. lexer.at_line_start = True
  75. t2 = token_stream.next()
  76. level = len(t2.value) if t2.type == 'WS' else 0
  77. if level > levels[-1]:
  78. levels.append(level)
  79. yield build_token('INDENT', '', t2)
  80. elif level < levels[-1]:
  81. if level not in levels:
  82. raise_indentation_error(INDENT_ERROR, t2)
  83. while levels.pop() > level:
  84. yield build_token('DEDENT', '', t2)
  85. levels.append(level)
  86. if levels == []:
  87. levels = [0]
  88. if t2.type != 'WS':
  89. yield t2
  90. elif t.type == "WS":
  91. continue
  92. else:
  93. yield t
  94. except StopIteration:
  95. for level in range(0, len(levels) - 1):
  96. yield build_token('DEDENT', '', t)
  97. def mark_indentation_level(lexer, token_stream):
  98. lexer.indent_level = 0
  99. for t in token_stream:
  100. if t.type == 'INDENT':
  101. lexer.indent_level += 1
  102. elif t.type == 'DEDENT':
  103. lexer.indent_level -= 1
  104. yield t
  105. def add_endmarker(token_stream):
  106. for t in token_stream:
  107. yield t
  108. yield build_token("ENDMARKER", None, t)
  109. _add_endmarker = add_endmarker
  110. def remove_empty_concats(token_stream):
  111. for t in token_stream:
  112. if t.type == "STRING_WITH_CONCAT" and t.value == "":
  113. continue
  114. if t.type == "PERCENT":
  115. try:
  116. t2 = token_stream.next()
  117. except StopIteration:
  118. yield t
  119. raise StopIteration
  120. if not(t2.type in ("STRING_SINGLE", "STRING_DOUBLE")
  121. and t2.value == ""):
  122. yield t
  123. yield t2
  124. else:
  125. yield t
  126. def nuke_newlines_around_indent(token_stream):
  127. for t in token_stream:
  128. if t.type == 'NEWLINE':
  129. try:
  130. t2 = token_stream.next()
  131. except StopIteration:
  132. yield t
  133. raise StopIteration
  134. if t2.type in ('INDENT', 'PASS'):
  135. yield t2
  136. else:
  137. yield t
  138. yield t2
  139. elif t.type in ('INDENT', 'PASS'):
  140. try:
  141. t2 = token_stream.next()
  142. except StopIteration:
  143. yield t
  144. raise StopIteration
  145. if t2.type == 'NEWLINE':
  146. yield t
  147. else:
  148. yield t
  149. yield t2
  150. else:
  151. yield t
  152. def insert_missing_new(token_stream):
  153. prev_was_new = False
  154. prev_was_class = False
  155. for t in token_stream:
  156. if t.type == 'CLASS_NAME':
  157. t2 = token_stream.next()
  158. if t2.type == 'LPAR' and not prev_was_new and not prev_was_class:
  159. yield build_token('NEW', 'new', t)
  160. yield t
  161. yield t2
  162. else:
  163. yield t
  164. prev_was_new = t.type == 'NEW'
  165. prev_was_class = t.type == 'CLASS'
  166. def correct_class_accessor_names(token_stream):
  167. for t in token_stream:
  168. if t.type == 'DOT':
  169. t2 = token_stream.next()
  170. if t2.type == 'NAME':
  171. t2.type = 'PHP_STRING'
  172. yield t
  173. yield t2
  174. else:
  175. yield t
  176. def correct_function_call(token_stream):
  177. for t in token_stream:
  178. if t.type in ('NAME'):
  179. yield t
  180. t2 = token_stream.next()
  181. if t2.type == 'LPAR':
  182. t.type = 'PHP_STRING'
  183. yield t2
  184. else:
  185. yield t
  186. def correct_function_definition(token_stream):
  187. for t in token_stream:
  188. if t.type == 'FN':
  189. yield t
  190. t2 = token_stream.next()
  191. if t2.type == 'NAME':
  192. t2.type = 'PHP_STRING'
  193. yield t2
  194. else:
  195. yield t
  196. def casts_as_functioncalls(token_stream):
  197. remove_at_level = None
  198. for t in token_stream:
  199. if t.type in CASTS:
  200. t2 = token_stream.next()
  201. if t2.type == 'LPAR':
  202. remove_at_level = t2.lexer.bracket_level - 1
  203. yield build_token('%s_CAST' % t.type, '(int)', t)
  204. else:
  205. yield t
  206. yield t2
  207. elif t.type == 'RPAR' and t.lexer.bracket_level == remove_at_level:
  208. remove_at_level = None
  209. else:
  210. yield t
  211. def add_missing_parenthesis(token_stream):
  212. inside_expression = False
  213. for t in token_stream:
  214. if hasattr(t, 'lexer'):
  215. bracket_level = t.lexer.bracket_level
  216. else:
  217. bracket_level = 0
  218. if not inside_expression and t.type in MISSING_PARENTHESIS:
  219. start_bracket_level = t.lexer.bracket_level
  220. inside_expression = True
  221. yield t
  222. yield build_token('LPAR', '(', t)
  223. continue
  224. if (inside_expression and t.type in ('INDENT', 'COLON', 'THEN')
  225. and bracket_level == start_bracket_level):
  226. inside_expression = False
  227. yield build_token('RPAR', ')', t)
  228. yield t
  229. def add_missing_parenthesis_after_functions(token_stream):
  230. for t in token_stream:
  231. yield t
  232. if t.type == 'FN':
  233. t1 = token_stream.next()
  234. yield t1
  235. if t1.type == 'PHP_STRING':
  236. t2 = token_stream.next()
  237. if t2.type in ('INDENT', 'COLON'):
  238. yield build_token('LPAR', '(', t2)
  239. yield build_token('RPAR', ')', t2)
  240. yield t2
  241. def add_missing_this(token_stream):
  242. tks = ('PHP_STRING', 'NAME', 'CLASS_NAME', 'RPAR', 'RSQB')
  243. for t in token_stream:
  244. if t.type == 'DOT' and prev_t.type not in tks:
  245. yield build_token("NAME", "this", t)
  246. yield t
  247. prev_t = t
  248. def add_missing_self(token_stream):
  249. for t in token_stream:
  250. if t.type == 'DOUBLE_DOT' and prev_t.type not in (
  251. 'PHP_STRING', 'NAME', 'CLASS_NAME') and prev_t.value != 'parent':
  252. yield build_token("PHP_STRING", "self", t)
  253. yield t
  254. prev_t = t
  255. def debug(token_stream):
  256. print
  257. for t in token_stream:
  258. print t
  259. yield t
  260. def make_token_stream(lexer, add_endmarker=True):
  261. token_stream = iter(lexer.token, None)
  262. token_stream = trim_beginning_newlines(token_stream)
  263. token_stream = inject_case_tokens(token_stream)
  264. token_stream = inject_indent_tokens(lexer, token_stream)
  265. token_stream = mark_indentation_level(lexer, token_stream)
  266. token_stream = remove_empty_concats(token_stream)
  267. # TODO: Fix nuke_... so it is not neccessary to double call it.
  268. token_stream = nuke_newlines_around_indent(token_stream)
  269. token_stream = nuke_newlines_around_indent(token_stream)
  270. token_stream = insert_missing_new(token_stream)
  271. token_stream = correct_class_accessor_names(token_stream)
  272. token_stream = correct_function_call(token_stream)
  273. token_stream = correct_function_definition(token_stream)
  274. token_stream = casts_as_functioncalls(token_stream)
  275. token_stream = add_missing_parenthesis(token_stream)
  276. token_stream = add_missing_parenthesis_after_functions(token_stream)
  277. token_stream = delete_multiple_newlines(token_stream)
  278. token_stream = add_missing_this(token_stream)
  279. token_stream = add_missing_self(token_stream)
  280. #token_stream = debug(token_stream)
  281. if add_endmarker:
  282. token_stream = _add_endmarker(token_stream)
  283. return token_stream
  284. _newline_pattern = re.compile(r"\n")
  285. def get_line_offsets(text):
  286. offsets = [0]
  287. for m in _newline_pattern.finditer(text):
  288. offsets.append(m.end())
  289. # This is only really needed if the input does not end with a newline
  290. offsets.append(len(text))
  291. return offsets