""" This file holds a series of transformations of the lexer tokens. """ from ply import lex import re from error import raise_indentation_error, raise_syntax_error from tokens import MISSING_PARENTHESIS, CASTS def build_token(_type, value, t): t2 = lex.LexToken() t2.type = _type t2.value = value t2.lineno = t.lineno t2.lexpos = -1 try: t2.lexer = t.lexer except AttributeError: pass return t2 def t_error(t): "Error token." raise_syntax_error("invalid syntax", t) def trim_beginning_newlines(token_stream): still_trim = True for t in token_stream: if still_trim and t.type == 'NEWLINE': continue else: still_trim = False yield t def delete_multiple_newlines(token_stream): prev_is_nl = False for t in token_stream: is_nl = t.type == 'NEWLINE' if prev_is_nl and is_nl: continue prev_is_nl = is_nl yield t def inject_case_tokens(token_stream): inside_switch = False case_indent = 0 for t in token_stream: yield t if inside_switch: if t.type == 'NEWLINE': t2 = token_stream.next() yield t2 if t2.type == 'WS': indent = len(t2.value) if case_indent == 0: case_indent = indent yield build_token('CASE', 'case', t2) else: if indent == case_indent: yield build_token('CASE', 'case', t2) elif indent < case_indent: inside_switch = False case_indent = 0 elif t2.type == "SWITCH": case_indent = 0 else: inside_switch = False case_indent = 0 if t.type == "SWITCH": inside_switch = True case_indent = 0 INDENT_ERROR = "Dedention matches no previous level." 
def inject_indent_tokens(lexer, token_stream):
    """Translate leading whitespace after each NEWLINE into INDENT/DEDENT tokens.

    ``levels`` is the stack of active indentation widths (always starting at
    0).  At stream end, a DEDENT is emitted for every level still open.
    """
    levels = [0]
    try:
        for t in token_stream:
            lexer.at_line_start = False
            if t.type == "NEWLINE":
                yield t
                lexer.at_line_start = True
                t2 = next(token_stream)
                level = len(t2.value) if t2.type == 'WS' else 0
                if level > levels[-1]:
                    levels.append(level)
                    yield build_token('INDENT', '', t2)
                elif level < levels[-1]:
                    if level not in levels:
                        raise_indentation_error(INDENT_ERROR, t2)
                    # Pop (and emit a DEDENT for) every level deeper than
                    # the new one; the final pop removes the matching level
                    # itself, so push it back.
                    while levels.pop() > level:
                        yield build_token('DEDENT', '', t2)
                    levels.append(level)
                    if levels == []:
                        levels = [0]
                if t2.type != 'WS':
                    yield t2
            elif t.type == "WS":
                # Whitespace not following a NEWLINE is insignificant.
                continue
            else:
                yield t
    except StopIteration:
        # Stream ended right after a NEWLINE: close all open levels.
        for _ in range(len(levels) - 1):
            yield build_token('DEDENT', '', t)


def mark_indentation_level(lexer, token_stream):
    """Track the current indentation depth on ``lexer.indent_level``."""
    lexer.indent_level = 0
    for t in token_stream:
        if t.type == 'INDENT':
            lexer.indent_level += 1
        elif t.type == 'DEDENT':
            lexer.indent_level -= 1
        yield t


def add_endmarker(token_stream):
    """Append an ENDMARKER token after the last token of the stream."""
    for t in token_stream:
        yield t
    yield build_token("ENDMARKER", None, t)

_add_endmarker = add_endmarker


def remove_empty_concats(token_stream):
    """Drop empty-string concatenations (``"" %`` / ``% ""``) from the stream."""
    for t in token_stream:
        if t.type == "STRING_WITH_CONCAT" and t.value == "":
            continue
        if t.type == "PERCENT":
            try:
                t2 = next(token_stream)
            except StopIteration:
                yield t
                return  # PEP 479: `raise StopIteration` is illegal here
            if not (t2.type in ("STRING_SINGLE", "STRING_DOUBLE")
                    and t2.value == ""):
                yield t
                yield t2
        else:
            yield t


def nuke_newlines_around_indent(token_stream):
    """Remove NEWLINE tokens directly before/after INDENT and PASS tokens."""
    for t in token_stream:
        if t.type == 'NEWLINE':
            try:
                t2 = next(token_stream)
            except StopIteration:
                yield t
                return  # PEP 479: `raise StopIteration` is illegal here
            if t2.type in ('INDENT', 'PASS'):
                # NEWLINE followed by INDENT/PASS: drop the NEWLINE.
                yield t2
            else:
                yield t
                yield t2
        elif t.type in ('INDENT', 'PASS'):
            try:
                t2 = next(token_stream)
            except StopIteration:
                yield t
                return
            if t2.type == 'NEWLINE':
                # INDENT/PASS followed by NEWLINE: drop the NEWLINE.
                yield t
            else:
                yield t
                yield t2
        else:
            yield t


def insert_missing_new(token_stream):
    """Insert a NEW token before ``ClassName(...)`` instantiations.

    No NEW is inserted when one is already present or when the name follows
    the ``class`` keyword (a definition, not an instantiation).
    """
    prev_was_new = False
    prev_was_class = False
    for t in token_stream:
        if t.type == 'CLASS_NAME':
            try:
                t2 = next(token_stream)
            except StopIteration:
                yield t
                return
            if t2.type == 'LPAR' and not prev_was_new and not prev_was_class:
                yield build_token('NEW', 'new', t)
            yield t
            yield t2
        else:
            yield t
        prev_was_new = t.type == 'NEW'
        prev_was_class = t.type == 'CLASS'


def correct_class_accessor_names(token_stream):
    """Retype NAME tokens following a DOT as PHP_STRING (attribute access)."""
    for t in token_stream:
        if t.type == 'DOT':
            try:
                t2 = next(token_stream)
            except StopIteration:
                yield t
                return
            if t2.type == 'NAME':
                t2.type = 'PHP_STRING'
            yield t
            yield t2
        else:
            yield t


def correct_function_call(token_stream):
    """Retype a NAME directly followed by ``(`` as PHP_STRING (a call)."""
    for t in token_stream:
        # Bug fix: the original tested `t.type in ('NAME')`, which is a
        # substring test against the string 'NAME', not tuple membership.
        if t.type == 'NAME':
            yield t
            try:
                t2 = next(token_stream)
            except StopIteration:
                return
            if t2.type == 'LPAR':
                t.type = 'PHP_STRING'
            yield t2
        else:
            yield t


def correct_function_definition(token_stream):
    """Retype the NAME following FN as PHP_STRING (a function name)."""
    for t in token_stream:
        if t.type == 'FN':
            yield t
            try:
                t2 = next(token_stream)
            except StopIteration:
                return
            if t2.type == 'NAME':
                t2.type = 'PHP_STRING'
            yield t2
        else:
            yield t


def casts_as_functioncalls(token_stream):
    """Rewrite cast calls like ``int(x)`` into cast tokens, dropping the parens.

    The LPAR after the cast keyword is replaced by a ``*_CAST`` token and the
    matching RPAR (identified by bracket level) is removed.
    """
    remove_at_level = None
    for t in token_stream:
        if t.type in CASTS:
            try:
                t2 = next(token_stream)
            except StopIteration:
                yield t
                return
            if t2.type == 'LPAR':
                remove_at_level = t2.lexer.bracket_level - 1
                yield build_token('%s_CAST' % t.type, '(int)', t)
            else:
                yield t
                yield t2
        elif t.type == 'RPAR' and t.lexer.bracket_level == remove_at_level:
            # This is the closing paren of the cast: swallow it.
            remove_at_level = None
        else:
            yield t


def add_missing_parenthesis(token_stream):
    """Wrap header expressions (if/while/...) in explicit parentheses.

    After a token listed in MISSING_PARENTHESIS an LPAR is injected; the
    matching RPAR is injected before the INDENT/COLON/THEN that terminates
    the expression at the same bracket level.
    """
    inside_expression = False
    start_bracket_level = 0  # only meaningful while inside_expression is True
    for t in token_stream:
        bracket_level = t.lexer.bracket_level if hasattr(t, 'lexer') else 0
        if not inside_expression and t.type in MISSING_PARENTHESIS:
            start_bracket_level = t.lexer.bracket_level
            inside_expression = True
            yield t
            yield build_token('LPAR', '(', t)
            continue
        if (inside_expression and t.type in ('INDENT', 'COLON', 'THEN')
                and bracket_level == start_bracket_level):
            inside_expression = False
            yield build_token('RPAR', ')', t)
        yield t


def add_missing_parenthesis_after_functions(token_stream):
    """Give parameterless function definitions an empty ``()`` pair."""
    for t in token_stream:
        yield t
        if t.type == 'FN':
            try:
                t1 = next(token_stream)
            except StopIteration:
                return
            yield t1
            if t1.type == 'PHP_STRING':
                try:
                    t2 = next(token_stream)
                except StopIteration:
                    return
                if t2.type in ('INDENT', 'COLON'):
                    yield build_token('LPAR', '(', t2)
                    yield build_token('RPAR', ')', t2)
                yield t2


def add_missing_this(token_stream):
    """Insert a ``this`` NAME before a leading DOT (e.g. ``.attr``)."""
    tks = ('PHP_STRING', 'NAME', 'CLASS_NAME', 'RPAR', 'RSQB')
    prev_t = None  # guard: original crashed (NameError) on a leading DOT
    for t in token_stream:
        if t.type == 'DOT' and (prev_t is None or prev_t.type not in tks):
            yield build_token("NAME", "this", t)
        yield t
        prev_t = t


def add_missing_self(token_stream):
    """Insert ``self`` before a leading DOUBLE_DOT (static access)."""
    prev_t = None  # guard: original crashed (NameError) on a leading DOUBLE_DOT
    for t in token_stream:
        if t.type == 'DOUBLE_DOT' and (
                prev_t is None
                or (prev_t.type not in ('PHP_STRING', 'NAME', 'CLASS_NAME')
                    and prev_t.value != 'parent')):
            yield build_token("PHP_STRING", "self", t)
        yield t
        prev_t = t


def debug(token_stream):
    """Print every token as it passes through (diagnostic pass-through)."""
    print()
    for t in token_stream:
        print(t)
        yield t


def make_token_stream(lexer, add_endmarker=True):
    """Chain all token-stream transformations over the raw lexer tokens."""
    token_stream = iter(lexer.token, None)
    token_stream = trim_beginning_newlines(token_stream)
    token_stream = inject_case_tokens(token_stream)
    token_stream = inject_indent_tokens(lexer, token_stream)
    token_stream = mark_indentation_level(lexer, token_stream)
    token_stream = remove_empty_concats(token_stream)
    # TODO: Fix nuke_... so it is not necessary to double call it.
    token_stream = nuke_newlines_around_indent(token_stream)
    token_stream = nuke_newlines_around_indent(token_stream)
    token_stream = insert_missing_new(token_stream)
    token_stream = correct_class_accessor_names(token_stream)
    token_stream = correct_function_call(token_stream)
    token_stream = correct_function_definition(token_stream)
    token_stream = casts_as_functioncalls(token_stream)
    token_stream = add_missing_parenthesis(token_stream)
    token_stream = add_missing_parenthesis_after_functions(token_stream)
    token_stream = delete_multiple_newlines(token_stream)
    token_stream = add_missing_this(token_stream)
    token_stream = add_missing_self(token_stream)
    #token_stream = debug(token_stream)
    if add_endmarker:
        # The parameter shadows the function; use the module-level alias.
        token_stream = _add_endmarker(token_stream)
    return token_stream


_newline_pattern = re.compile(r"\n")


def get_line_offsets(text):
    """Return the character offset of the start of each line in *text*."""
    offsets = [0]
    for m in _newline_pattern.finditer(text):
        offsets.append(m.end())
    # This is only really needed if the input does not end with a newline
    offsets.append(len(text))
    return offsets