/src/python/snow/lexer/transformations.py
- """
- This file holds a series of transformations of the lexer tokens.
- """
- from ply import lex
- import re
- from error import raise_indentation_error, raise_syntax_error
- from tokens import MISSING_PARENTHESIS, CASTS
def build_token(_type, value, t):
    """Create a synthetic LexToken of the given type and value, copying the
    line number (and, when present, the lexer) from the existing token `t`."""
    t2 = lex.LexToken()
    t2.type = _type
    t2.value = value
    t2.lineno = t.lineno
    t2.lexpos = -1
    try:
        t2.lexer = t.lexer
    except AttributeError:
        pass
    return t2


def t_error(t):
    "Error token."
    raise_syntax_error("invalid syntax", t)


def trim_beginning_newlines(token_stream):
    still_trim = True
    for t in token_stream:
        if still_trim and t.type == 'NEWLINE':
            continue
        else:
            still_trim = False
            yield t


def delete_multiple_newlines(token_stream):
    prev_is_nl = False
    for t in token_stream:
        is_nl = t.type == 'NEWLINE'

        if prev_is_nl and is_nl:
            continue

        prev_is_nl = is_nl
        yield t


def inject_case_tokens(token_stream):
    """Inside a switch block, prefix every branch at the case indentation
    level with an explicit CASE token."""
    inside_switch = False
    case_indent = 0
    for t in token_stream:
        yield t
        if inside_switch:
            if t.type == 'NEWLINE':
                t2 = next(token_stream)
                yield t2
                if t2.type == 'WS':
                    indent = len(t2.value)
                    if case_indent == 0:
                        case_indent = indent
                        yield build_token('CASE', 'case', t2)
                    else:
                        if indent == case_indent:
                            yield build_token('CASE', 'case', t2)
                        elif indent < case_indent:
                            inside_switch = False
                            case_indent = 0
                elif t2.type == "SWITCH":
                    case_indent = 0
                else:
                    inside_switch = False
                    case_indent = 0
        if t.type == "SWITCH":
            inside_switch = True
            case_indent = 0


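# A sketch of the intent (hypothetical Snow source; the exact surface syntax
# may differ): given a block such as
#
#     switch x
#         1: doOne()
#         2: doTwo()
#
# each branch line at the case indentation level gets a CASE token injected
# right after its leading whitespace, so the grammar can treat the branches
# like explicit `case 1: ...` clauses.

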
INDENT_ERROR = "Dedent matches no previous indentation level."


def inject_indent_tokens(lexer, token_stream):
    """Translate the whitespace after each NEWLINE into INDENT/DEDENT tokens,
    keeping a stack of the indentation levels seen so far."""
    levels = [0]
    try:
        for t in token_stream:
            lexer.at_line_start = False
            if t.type == "NEWLINE":
                yield t
                lexer.at_line_start = True
                t2 = next(token_stream)
                level = len(t2.value) if t2.type == 'WS' else 0
                if level > levels[-1]:
                    levels.append(level)
                    yield build_token('INDENT', '', t2)
                elif level < levels[-1]:
                    if level not in levels:
                        raise_indentation_error(INDENT_ERROR, t2)
                    while levels.pop() > level:
                        yield build_token('DEDENT', '', t2)
                    levels.append(level)
                if t2.type != 'WS':
                    yield t2
            elif t.type == "WS":
                continue
            else:
                yield t
    except StopIteration:
        # The stream ended right after a NEWLINE; close all open blocks.
        for level in range(0, len(levels) - 1):
            yield build_token('DEDENT', '', t)


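# For illustration (hypothetical token stream): a block like
#
#     if a
#         b = 1
#     c = 2
#
# comes out roughly as ... NEWLINE INDENT ... NEWLINE DEDENT ..., mirroring
# how Python's own tokenizer reports the offside rule.

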
def mark_indentation_level(lexer, token_stream):
    lexer.indent_level = 0
    for t in token_stream:
        if t.type == 'INDENT':
            lexer.indent_level += 1
        elif t.type == 'DEDENT':
            lexer.indent_level -= 1
        yield t


def add_endmarker(token_stream):
    for t in token_stream:
        yield t
    yield build_token("ENDMARKER", None, t)

# Alias kept because the `add_endmarker` parameter of make_token_stream
# shadows the function name there.
_add_endmarker = add_endmarker


def remove_empty_concats(token_stream):
    """Drop empty string literals (and the PERCENT that precedes them) so
    they do not produce useless concatenations in the output."""
    for t in token_stream:
        if t.type == "STRING_WITH_CONCAT" and t.value == "":
            continue
        if t.type == "PERCENT":
            try:
                t2 = next(token_stream)
            except StopIteration:
                yield t
                return
            if not (t2.type in ("STRING_SINGLE", "STRING_DOUBLE")
                    and t2.value == ""):
                yield t
                yield t2
        else:
            yield t


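# Sketch of the effect, assuming PERCENT is Snow's string concatenation
# operator: the tokens for `x % ""` collapse to just `x`, and an empty
# STRING_WITH_CONCAT disappears outright.

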
def nuke_newlines_around_indent(token_stream):
    """Swallow NEWLINE tokens that directly precede or follow an INDENT (or
    PASS), so the parser sees a single clean block delimiter."""
    for t in token_stream:
        if t.type == 'NEWLINE':
            try:
                t2 = next(token_stream)
            except StopIteration:
                yield t
                return
            if t2.type in ('INDENT', 'PASS'):
                yield t2
            else:
                yield t
                yield t2
        elif t.type in ('INDENT', 'PASS'):
            try:
                t2 = next(token_stream)
            except StopIteration:
                yield t
                return
            if t2.type == 'NEWLINE':
                yield t
            else:
                yield t
                yield t2
        else:
            yield t


def insert_missing_new(token_stream):
    """Insert a NEW token before `ClassName(...)` when the source omits the
    `new` keyword and the name is not part of a class declaration."""
    prev_was_new = False
    prev_was_class = False
    for t in token_stream:
        if t.type == 'CLASS_NAME':
            t2 = next(token_stream)
            if t2.type == 'LPAR' and not prev_was_new and not prev_was_class:
                yield build_token('NEW', 'new', t)
            yield t
            yield t2
        else:
            yield t
        prev_was_new = t.type == 'NEW'
        prev_was_class = t.type == 'CLASS'


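# Illustration (hypothetical Snow source): `p = Point(1, 2)` arrives as
# NAME '=' CLASS_NAME LPAR ..., and this stage rewrites it to the equivalent
# of `p = new Point(1, 2)`; `new Point(1, 2)` and `class Point(...)` pass
# through unchanged.

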
def correct_class_accessor_names(token_stream):
    """After a DOT, a NAME is really a property/method name."""
    for t in token_stream:
        if t.type == 'DOT':
            t2 = next(token_stream)
            if t2.type == 'NAME':
                t2.type = 'PHP_STRING'
            yield t
            yield t2
        else:
            yield t


def correct_function_call(token_stream):
    """A NAME directly followed by LPAR is a function call; retag it as
    PHP_STRING before yielding so downstream stages see the final type."""
    for t in token_stream:
        if t.type == 'NAME':
            t2 = next(token_stream)
            if t2.type == 'LPAR':
                t.type = 'PHP_STRING'
            yield t
            yield t2
        else:
            yield t


def correct_function_definition(token_stream):
    """The NAME following FN is a function name; retag it as PHP_STRING."""
    for t in token_stream:
        if t.type == 'FN':
            yield t
            t2 = next(token_stream)
            if t2.type == 'NAME':
                t2.type = 'PHP_STRING'
            yield t2
        else:
            yield t


def casts_as_functioncalls(token_stream):
    """Rewrite cast-style calls such as `int(...)` into a single CAST token,
    dropping the now-redundant parentheses."""
    remove_at_level = None
    for t in token_stream:
        if t.type in CASTS:
            t2 = next(token_stream)
            if t2.type == 'LPAR':
                remove_at_level = t2.lexer.bracket_level - 1
                yield build_token('%s_CAST' % t.type, '(int)', t)
            else:
                yield t
                yield t2
        elif t.type == 'RPAR' and t.lexer.bracket_level == remove_at_level:
            remove_at_level = None
        else:
            yield t


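# Sketch (hypothetical source): assuming INT is listed in CASTS, the tokens
# for `int(x)` become INT_CAST NAME, i.e. the PHP-style `(int) x`, and the
# matching RPAR at the recorded bracket level is silently consumed.

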
def add_missing_parenthesis(token_stream):
    """Wrap the condition of the constructs listed in MISSING_PARENTHESIS
    (e.g. `if`/`while`) in explicit LPAR/RPAR tokens for the parser."""
    inside_expression = False
    start_bracket_level = 0
    for t in token_stream:
        if hasattr(t, 'lexer'):
            bracket_level = t.lexer.bracket_level
        else:
            bracket_level = 0
        if not inside_expression and t.type in MISSING_PARENTHESIS:
            start_bracket_level = t.lexer.bracket_level
            inside_expression = True
            yield t
            yield build_token('LPAR', '(', t)
            continue
        if (inside_expression and t.type in ('INDENT', 'COLON', 'THEN')
                and bracket_level == start_bracket_level):
            inside_expression = False
            yield build_token('RPAR', ')', t)
        yield t


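# Illustration (hypothetical): assuming IF is in MISSING_PARENTHESIS, a
# condition written Snow-style as `if x > 1` reaches the parser as the token
# sequence for `if (x > 1)`, closed just before the block's
# INDENT/COLON/THEN.

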
def add_missing_parenthesis_after_functions(token_stream):
    """Give parameterless function definitions an empty parameter list."""
    for t in token_stream:
        yield t
        if t.type == 'FN':
            t1 = next(token_stream)
            yield t1
            if t1.type == 'PHP_STRING':
                t2 = next(token_stream)
                if t2.type in ('INDENT', 'COLON'):
                    yield build_token('LPAR', '(', t2)
                    yield build_token('RPAR', ')', t2)
                yield t2


def add_missing_this(token_stream):
    """Prefix a bare `.attr` accessor with an implicit `this`."""
    tks = ('PHP_STRING', 'NAME', 'CLASS_NAME', 'RPAR', 'RSQB')
    prev_t = None
    for t in token_stream:
        if t.type == 'DOT' and (prev_t is None or prev_t.type not in tks):
            yield build_token("NAME", "this", t)
        yield t
        prev_t = t


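# Illustration (hypothetical Snow source inside a method): `.name = "x"`
# becomes the token stream for `this.name = "x"`, whereas `foo.name` is left
# alone because its DOT follows a NAME.

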
def add_missing_self(token_stream):
    """Prefix a bare static accessor (DOUBLE_DOT) with an implicit `self`."""
    prev_t = None
    for t in token_stream:
        if (t.type == 'DOUBLE_DOT'
                and (prev_t is None
                     or (prev_t.type not in
                         ('PHP_STRING', 'NAME', 'CLASS_NAME')
                         and prev_t.value != 'parent'))):
            yield build_token("PHP_STRING", "self", t)
        yield t
        prev_t = t


def debug(token_stream):
    """Print every token as it passes through (development aid)."""
    print('')
    for t in token_stream:
        print(t)
        yield t


def make_token_stream(lexer, add_endmarker=True):
    """Chain all transformations into the final token stream for the parser."""
    token_stream = iter(lexer.token, None)
    token_stream = trim_beginning_newlines(token_stream)
    token_stream = inject_case_tokens(token_stream)
    token_stream = inject_indent_tokens(lexer, token_stream)
    token_stream = mark_indentation_level(lexer, token_stream)
    token_stream = remove_empty_concats(token_stream)
    # TODO: Fix nuke_newlines_around_indent so a single pass suffices.
    token_stream = nuke_newlines_around_indent(token_stream)
    token_stream = nuke_newlines_around_indent(token_stream)
    token_stream = insert_missing_new(token_stream)
    token_stream = correct_class_accessor_names(token_stream)
    token_stream = correct_function_call(token_stream)
    token_stream = correct_function_definition(token_stream)
    token_stream = casts_as_functioncalls(token_stream)
    token_stream = add_missing_parenthesis(token_stream)
    token_stream = add_missing_parenthesis_after_functions(token_stream)
    token_stream = delete_multiple_newlines(token_stream)
    token_stream = add_missing_this(token_stream)
    token_stream = add_missing_self(token_stream)
    # token_stream = debug(token_stream)
    if add_endmarker:
        token_stream = _add_endmarker(token_stream)
    return token_stream


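# A minimal usage sketch (assumes a PLY lexer built elsewhere from the Snow
# token definitions; the import path and constructor are hypothetical):
#
#     import lexer as snow_lexer
#     lex = snow_lexer.get_lexer()      # hypothetical constructor
#     lex.input(source_code)
#     for tok in make_token_stream(lex):
#         print(tok)

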
_newline_pattern = re.compile(r"\n")


def get_line_offsets(text):
    offsets = [0]
    for m in _newline_pattern.finditer(text):
        offsets.append(m.end())
    # Add a final offset so the last line has an end position even when the
    # input does not end with a newline.
    offsets.append(len(text))
    return offsets
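# For example:
#
#     >>> get_line_offsets("a\nbb\n")
#     [0, 2, 5, 5]
#
# (the duplicate final offset appears when the input already ends with a
# newline).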