/tools/stats/dna_filtering.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 349 lines · 279 code · 34 blank · 36 comment · 80 complexity · 49d311dc3d3fb050ef0a1e4048a4e92e MD5 · raw file

  1. #!/usr/bin/env python
  2. """
  3. This tool takes a tab-delimited text file as input and creates filters on columns based on certain properties. The tool will skip over invalid lines within the file, informing the user about the number of lines skipped.
  4. usage: %prog [options]
  5. -i, --input=i: tabular input file
  6. -o, --output=o: filtered output file
  7. -c, --cond=c: conditions to filter on
  8. -n, --n_handling=n: how to handle N and X
  9. -l, --columns=l: columns
  10. -t, --col_types=t: column types
  11. """
  12. #from __future__ import division
  13. import os.path, re, string, string, sys
  14. from galaxy import eggs
  15. import pkg_resources; pkg_resources.require( "bx-python" )
  16. from bx.cookbook import doc_optparse
  17. from ast import parse, Module, walk
  18. # Older py compatibility
  19. try:
  20. set()
  21. except:
  22. from sets import Set as set
  23. AST_NODE_TYPE_WHITELIST = [
  24. 'Expr', 'Load',
  25. 'Str', 'Num', 'BoolOp',
  26. 'Compare', 'And', 'Eq',
  27. 'NotEq', 'Or', 'GtE',
  28. 'LtE', 'Lt', 'Gt',
  29. 'BinOp', 'Add', 'Div',
  30. 'Sub', 'Mult', 'Mod',
  31. 'Pow', 'LShift', 'GShift',
  32. 'BitAnd', 'BitOr', 'BitXor',
  33. 'UnaryOp', 'Invert', 'Not',
  34. 'NotIn', 'In', 'Is',
  35. 'IsNot', 'List',
  36. 'Index', 'Subscript',
  37. # Further checks
  38. 'Name', 'Call', 'Attribute',
  39. ]
  40. BUILTIN_AND_MATH_FUNCTIONS = 'abs|all|any|bin|chr|cmp|complex|divmod|float|hex|int|len|long|max|min|oct|ord|pow|range|reversed|round|sorted|str|sum|type|unichr|unicode|log|exp|sqrt|ceil|floor'.split('|')
  41. STRING_AND_LIST_METHODS = [ name for name in dir('') + dir([]) if not name.startswith('_') ]
  42. VALID_FUNCTIONS = BUILTIN_AND_MATH_FUNCTIONS + STRING_AND_LIST_METHODS
  43. def __check_name( ast_node ):
  44. name = ast_node.id
  45. if re.match(r'^c\d+$', name):
  46. return True
  47. elif name == "codes":
  48. return True
  49. else:
  50. return name in VALID_FUNCTIONS
  51. def __check_attribute( ast_node ):
  52. attribute_name = ast_node.attr
  53. if attribute_name not in STRING_AND_LIST_METHODS:
  54. return False
  55. return True
  56. def __check_call( ast_node ):
  57. # If we are calling a function or method, it better be a math,
  58. # string or list function.
  59. ast_func = ast_node.func
  60. ast_func_class = ast_func.__class__.__name__
  61. if ast_func_class == 'Name':
  62. if ast_func.id not in BUILTIN_AND_MATH_FUNCTIONS:
  63. return False
  64. elif ast_func_class == 'Attribute':
  65. if not __check_attribute( ast_func ):
  66. return False
  67. else:
  68. return False
  69. return True
  70. def check_expression( text ):
  71. """
  72. >>> check_expression("c1=='chr1' and c3-c2>=2000 and c6=='+'")
  73. True
  74. >>> check_expression("eval('1+1')")
  75. False
  76. >>> check_expression("import sys")
  77. False
  78. >>> check_expression("[].__str__")
  79. False
  80. >>> check_expression("__builtins__")
  81. False
  82. >>> check_expression("'x' in globals")
  83. False
  84. >>> check_expression("'x' in [1,2,3]")
  85. True
  86. >>> check_expression("c3=='chr1' and c5>5")
  87. True
  88. >>> check_expression("c3=='chr1' and d5>5") # Invalid d5 reference
  89. False
  90. >>> check_expression("c3=='chr1' and c5>5 or exec")
  91. False
  92. >>> check_expression("type(c1) != type(1)")
  93. True
  94. >>> check_expression("c1.split(',')[1] == '1'")
  95. True
  96. >>> check_expression("exec 1")
  97. False
  98. >>> check_expression("str(c2) in [\\\"a\\\",\\\"b\\\"]")
  99. True
  100. """
  101. try:
  102. module = parse( text )
  103. except SyntaxError:
  104. return False
  105. if not isinstance(module, Module):
  106. return False
  107. statements = module.body
  108. if not len( statements ) == 1:
  109. return False
  110. expression = statements[0]
  111. if expression.__class__.__name__ != 'Expr':
  112. return False
  113. for ast_node in walk( expression ):
  114. ast_node_class = ast_node.__class__.__name__
  115. # Toss out everything that is not a "simple" expression,
  116. # imports, error handling, etc...
  117. if ast_node_class not in AST_NODE_TYPE_WHITELIST:
  118. return False
  119. # White-list more potentially dangerous types AST elements.
  120. if ast_node_class == 'Name':
  121. # In order to prevent loading 'exec', 'eval', etc...
  122. # put string restriction on names allowed.
  123. if not __check_name( ast_node ):
  124. return False
  125. # Check only valid, white-listed functions are called.
  126. elif ast_node_class == 'Call':
  127. if not __check_call( ast_node ):
  128. return False
  129. # Check only valid, white-listed attributes are accessed
  130. elif ast_node_class == 'Attribute':
  131. if not __check_attribute( ast_node ):
  132. return False
  133. return True
  134. def get_operands( filter_condition ):
  135. # Note that the order of all_operators is important
  136. items_to_strip = [ '==', '!=', ' and ', ' or ' ]
  137. for item in items_to_strip:
  138. if filter_condition.find( item ) >= 0:
  139. filter_condition = filter_condition.replace( item, ' ' )
  140. operands = set( filter_condition.split( ' ' ) )
  141. return operands
  142. def stop_err( msg ):
  143. sys.stderr.write( msg )
  144. sys.exit()
def __main__():
    """Parse options, sanitize/expand the filter condition, then stream the
    tabular input through an exec'd filter loop, writing kept lines to the
    output file and reporting skipped/invalid lines."""
    #Parse Command Line
    options, args = doc_optparse.parse( __doc__ )
    input = options.input
    output = options.output
    cond = options.cond
    n_handling = options.n_handling
    columns = options.columns
    col_types = options.col_types
    try:
        in_columns = int( columns )
        assert col_types #check to see that the column types variable isn't null
        in_column_types = col_types.split( ',' )
    except:
        stop_err( "Data does not appear to be tabular. This tool can only be used with tab-delimited data." )
    # Unescape if input has been escaped
    cond_text = cond.replace( '__eq__', '==' ).replace( '__ne__', '!=' ).replace( '__sq__', "'" )
    orig_cond_text = cond_text
    # Expand to allow for DNA codes
    # Letters that are NOT IUPAC nucleotide codes; these (plus '.') are what
    # a '.' is considered to match.
    dot_letters = [ letter for letter in string.uppercase if letter not in \
                    [ 'A', 'C', 'G', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'N', 'R', 'S', 'V', 'W', 'X', 'Y' ] ]
    dot_letters.append( '.' )
    # IUPAC ambiguity map: each code maps to the list of codes it matches.
    codes = {'A': [ 'A', 'D', 'H', 'M', 'R', 'V', 'W' ],
             'C': [ 'C', 'B', 'H', 'M', 'S', 'V', 'Y' ],
             'G': [ 'G', 'B', 'D', 'K', 'R', 'S', 'V' ],
             'T': [ 'T', 'U', 'B', 'D', 'H', 'K', 'W', 'Y' ],
             'U': [ 'T', 'U', 'B', 'D', 'H', 'K', 'W', 'Y' ],
             'K': [ 'G', 'T', 'U', 'B', 'D', 'H', 'K', 'R', 'S', 'V', 'W', 'Y' ],
             'M': [ 'A', 'C', 'B', 'D', 'H', 'M', 'R', 'S', 'V', 'W', 'Y' ],
             'R': [ 'A', 'G', 'B', 'D', 'H', 'K', 'M', 'R', 'S', 'V', 'W' ],
             'Y': [ 'C', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'S', 'V', 'W', 'Y' ],
             'S': [ 'C', 'G', 'B', 'D', 'H', 'K', 'M', 'R', 'S', 'V', 'Y' ],
             'W': [ 'A', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'R', 'V', 'W', 'Y' ],
             'B': [ 'C', 'G', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'R', 'S', 'V', 'W', 'Y' ],
             'V': [ 'A', 'C', 'G', 'B', 'D', 'H', 'K', 'M', 'R', 'S', 'V', 'W' ],
             'H': [ 'A', 'C', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'R', 'S', 'V', 'W', 'Y' ],
             'D': [ 'A', 'G', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'R', 'S', 'V', 'W', 'Y' ],
             '.': dot_letters,
             '-': [ '-' ]}
    # Add handling for N and X
    if n_handling == "all":
        # N/X match every code, and every code (except '.'/'-') matches N/X.
        codes[ 'N' ] = [ 'A', 'C', 'G', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'N', 'R', 'S', 'V', 'W', 'X', 'Y' ]
        codes[ 'X' ] = [ 'A', 'C', 'G', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'N', 'R', 'S', 'V', 'W', 'X', 'Y' ]
        for code in codes.keys():
            if code != '.' and code != '-':
                codes[code].append( 'N' )
                codes[code].append( 'X' )
    else:
        # Otherwise N/X behave like '.', matching only non-code letters.
        codes[ 'N' ] = dot_letters
        codes[ 'X' ] = dot_letters
        codes[ '.' ].extend( [ 'N', 'X' ] )
    # Expand conditions to allow for DNA codes
    try:
        match_replace = {}
        # Finds simple comparisons such as "c1 == 'A'" or "c2 != c3".
        pat = re.compile( 'c\d+\s*[!=]=\s*[\w\d"\'+-.]+' )
        matches = pat.findall( cond_text )
        for match in matches:
            # Chromosome/scaffold names and strand ('+') comparisons are kept
            # as literal (in)equality, just parenthesized.
            if match.find( 'chr' ) >= 0 or match.find( 'scaffold' ) >= 0 or match.find( '+' ) >= 0:
                if match.find( '==' ) >= 0:
                    match_parts = match.split( '==' )
                elif match.find( '!=' ) >= 0:
                    match_parts = match.split( '!=' )
                else:
                    raise Exception, "The operators '==' and '!=' were not found."
                left = match_parts[0].strip()
                right = match_parts[1].strip()
                new_match = "(%s)" % ( match )
            elif match.find( '==' ) > 0:
                # Base comparison: rewrite equality as mutual membership in
                # the ambiguity map built above.
                match_parts = match.split( '==' )
                left = match_parts[0].strip()
                right = match_parts[1].strip()
                new_match = '(%s in codes[%s] and %s in codes[%s])' % ( left, right, right, left )
            elif match.find( '!=' ) > 0 :
                match_parts = match.split( '!=' )
                left = match_parts[0].strip()
                right = match_parts[1].strip()
                new_match = '(%s not in codes[%s] or %s not in codes[%s])' % ( left, right, right, left )
            else:
                raise Exception, "The operators '==' and '!=' were not found."
            assert left.startswith( 'c' ), 'The column names should start with c (lowercase)'
            if right.find( "'" ) >= 0 or right.find( '"' ) >= 0:
                # Quoted literal RHS: must be a base/code letter, '+', '.',
                # '-', or a chromosome/scaffold name.
                test = right.replace( "'", '' ).replace( '"', '' )
                assert test in string.uppercase or test.find( '+' ) >= 0 or test.find( '.' ) >= 0 or test.find( '-' ) >= 0\
                    or test.startswith( 'chr' ) or test.startswith( 'scaffold' ), \
                    'The value to search for should be a valid base, code, plus sign, chromosome (like "chr1") or scaffold (like "scaffold5"). ' \
                    'Use the general filter tool to filter on anything else first'
            else:
                # Unquoted RHS must itself be a column reference.
                assert right.startswith( 'c' ), 'The column names should start with c (lowercase)'
            match_replace[match] = new_match
        if len( match_replace.keys() ) == 0:
            raise Exception, 'There do not appear to be any valid conditions'
        for match in match_replace.keys():
            cond_text = cond_text.replace( match, match_replace[match] )
    except Exception, e:
        stop_err( "At least one of your conditions is invalid. Make sure to use only '!=' or '==', valid column numbers, and valid base values.\n" + str(e) )
    # Attempt to determine if the condition includes executable stuff and, if so, exit
    secured = dir()  # local names defined so far; operands must not shadow them
    operands = get_operands( cond_text )
    for operand in operands:
        try:
            check = int( operand )  # purely numeric operands are always safe
        except:
            if operand in secured:
                stop_err( "Illegal value '%s' in condition '%s'" % ( operand, cond_text ) )
    if not check_expression( cond_text ):
        stop_err( "Illegal/invalid in condition '%s'" % ( cond_text ) )
    # Prepare the column variable names and wrappers for column data types
    cols, type_casts = [], []
    for col in range( 1, in_columns + 1 ):
        col_name = "c%d" % col
        cols.append( col_name )
        col_type = in_column_types[ col - 1 ]
        type_cast = "%s(%s)" % ( col_type, col_name )
        type_casts.append( type_cast )
    col_str = ', '.join( cols ) # 'c1, c2, c3, c4'
    type_cast_str = ', '.join( type_casts ) # 'str(c1), int(c2), int(c3), str(c4)'
    assign = "%s = line.split( '\\t' )" % col_str
    wrap = "%s = %s" % ( col_str, type_cast_str )
    skipped_lines = 0
    first_invalid_line = 0
    invalid_line = None
    lines_kept = 0
    total_lines = 0
    out = open( output, 'wt' )
    # Read and filter input file, skipping invalid lines
    # NOTE(review): the filter loop is built as source text and exec'd so the
    # user condition can reference c1..cN as plain variables; cond_text has
    # already been vetted by check_expression above, so only whitelisted
    # constructs can reach the exec.
    code = '''
for i, line in enumerate( file( input ) ):
    total_lines += 1
    line = line.rstrip( '\\r\\n' )
    if not line or line.startswith( '#' ):
        skipped_lines += 1
        if not invalid_line:
            first_invalid_line = i + 1
            invalid_line = line
        continue
    try:
        %s = line.split( '\\t' )
        %s = %s
        if %s:
            lines_kept += 1
            print >> out, line
    except Exception, e:
        skipped_lines += 1
        if not invalid_line:
            first_invalid_line = i + 1
            invalid_line = line
''' % ( col_str, col_str, type_cast_str, cond_text )
    valid_filter = True
    try:
        exec code
    except Exception, e:
        out.close()
        # A SyntaxError from exec means the (already sanitized) condition was
        # still not valid Python; anything else is reported verbatim.
        if str( e ).startswith( 'invalid syntax' ):
            valid_filter = False
            stop_err( 'Filter condition "%s" likely invalid. See tool tips, syntax and examples.' % orig_cond_text + ' '+str(e))
        else:
            stop_err( str( e ) )
    if valid_filter:
        out.close()
        valid_lines = total_lines - skipped_lines
        print 'Filtering with %s, ' % orig_cond_text
        if valid_lines > 0:
            print 'kept %4.2f%% of %d lines.' % ( 100.0*lines_kept/valid_lines, total_lines )
        else:
            print 'Possible invalid filter condition "%s" or non-existent column referenced. See tool tips, syntax and examples.' % orig_cond_text
        if skipped_lines > 0:
            print 'Skipped %d invalid lines starting at line #%d: "%s"' % ( skipped_lines, first_invalid_line, invalid_line )

if __name__ == "__main__" : __main__()