PageRenderTime 43ms CodeModel.GetById 17ms app.highlight 22ms RepoModel.GetById 1ms app.codeStats 0ms

/tools/stats/dna_filtering.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 349 lines | 321 code | 8 blank | 20 comment | 26 complexity | 49d311dc3d3fb050ef0a1e4048a4e92e MD5 | raw file
  1#!/usr/bin/env python
  2
  3"""
  4This tool takes a tab-delimited text file as input and creates filters on columns based on certain properties. The tool will skip over invalid lines within the file, informing the user about the number of lines skipped.
  5
  6usage: %prog [options]
  7    -i, --input=i: tabular input file
  8    -o, --output=o: filtered output file
  9    -c, --cond=c: conditions to filter on
 10    -n, --n_handling=n: how to handle N and X
 11    -l, --columns=l: columns 
 12    -t, --col_types=t: column types    
 13
 14"""
 15
 16#from __future__ import division
 17import os.path, re, string, string, sys
 18from galaxy import eggs
 19import pkg_resources; pkg_resources.require( "bx-python" )
 20from bx.cookbook import doc_optparse
 21
 22from ast import parse, Module, walk
 23
 24# Older py compatibility
 25try:
 26    set()
 27except:
 28    from sets import Set as set
 29
 30AST_NODE_TYPE_WHITELIST = [
 31    'Expr',    'Load',
 32    'Str',     'Num',    'BoolOp',
 33    'Compare', 'And',    'Eq',
 34    'NotEq',   'Or',     'GtE',
 35    'LtE',     'Lt',     'Gt',
 36    'BinOp',   'Add',    'Div',
 37    'Sub',     'Mult',   'Mod',
 38    'Pow',     'LShift', 'GShift',
 39    'BitAnd',  'BitOr',  'BitXor',
 40    'UnaryOp', 'Invert', 'Not',
 41    'NotIn',   'In',     'Is',
 42    'IsNot',   'List',
 43    'Index',   'Subscript',
 44    # Further checks
 45    'Name',    'Call',    'Attribute',
 46]
 47
 48
 49BUILTIN_AND_MATH_FUNCTIONS = 'abs|all|any|bin|chr|cmp|complex|divmod|float|hex|int|len|long|max|min|oct|ord|pow|range|reversed|round|sorted|str|sum|type|unichr|unicode|log|exp|sqrt|ceil|floor'.split('|')
 50STRING_AND_LIST_METHODS = [ name for name in dir('') + dir([]) if not name.startswith('_') ]
 51VALID_FUNCTIONS = BUILTIN_AND_MATH_FUNCTIONS + STRING_AND_LIST_METHODS
 52
 53
 54def __check_name( ast_node ):
 55    name = ast_node.id
 56    if re.match(r'^c\d+$', name):
 57        return True
 58    elif name == "codes":
 59        return True
 60    else:
 61        return name in VALID_FUNCTIONS
 62
 63
 64def __check_attribute( ast_node ):
 65    attribute_name = ast_node.attr
 66    if attribute_name not in STRING_AND_LIST_METHODS:
 67        return False
 68    return True
 69
 70
 71def __check_call( ast_node ):
 72    # If we are calling a function or method, it better be a math,
 73    # string or list function.
 74    ast_func = ast_node.func
 75    ast_func_class = ast_func.__class__.__name__
 76    if ast_func_class == 'Name':
 77        if ast_func.id not in BUILTIN_AND_MATH_FUNCTIONS:
 78            return False
 79    elif ast_func_class == 'Attribute':
 80        if not __check_attribute( ast_func ):
 81            return False
 82    else:
 83        return False
 84
 85    return True
 86
 87
 88def check_expression( text ):
 89    """
 90
 91    >>> check_expression("c1=='chr1' and c3-c2>=2000 and c6=='+'")
 92    True
 93    >>> check_expression("eval('1+1')")
 94    False
 95    >>> check_expression("import sys")
 96    False
 97    >>> check_expression("[].__str__")
 98    False
 99    >>> check_expression("__builtins__")
100    False
101    >>> check_expression("'x' in globals")
102    False
103    >>> check_expression("'x' in [1,2,3]")
104    True
105    >>> check_expression("c3=='chr1' and c5>5")
106    True
107    >>> check_expression("c3=='chr1' and d5>5")  # Invalid d5 reference
108    False
109    >>> check_expression("c3=='chr1' and c5>5 or exec")
110    False
111    >>> check_expression("type(c1) != type(1)")
112    True
113    >>> check_expression("c1.split(',')[1] == '1'")
114    True
115    >>> check_expression("exec 1")
116    False
117    >>> check_expression("str(c2) in [\\\"a\\\",\\\"b\\\"]")
118    True
119    """
120    try:
121        module = parse( text )
122    except SyntaxError:
123        return False
124
125    if not isinstance(module, Module):
126        return False
127    statements = module.body
128    if not len( statements ) == 1:
129        return False
130    expression = statements[0]
131    if expression.__class__.__name__ != 'Expr':
132        return False
133
134    for ast_node in walk( expression ):
135        ast_node_class = ast_node.__class__.__name__
136
137        # Toss out everything that is not a "simple" expression,
138        # imports, error handling, etc...
139        if ast_node_class not in AST_NODE_TYPE_WHITELIST:
140            return False
141
142        # White-list more potentially dangerous types AST elements.
143        if ast_node_class == 'Name':
144            # In order to prevent loading 'exec', 'eval', etc...
145            # put string restriction on names allowed.
146            if not __check_name( ast_node ):
147                return False
148        # Check only valid, white-listed functions are called.
149        elif ast_node_class == 'Call':
150            if not __check_call( ast_node ):
151                return False
152        # Check only valid, white-listed attributes are accessed
153        elif ast_node_class == 'Attribute':
154            if not __check_attribute( ast_node ):
155                return False
156
157    return True
158
159
160def get_operands( filter_condition ):
161    # Note that the order of all_operators is important
162    items_to_strip = [ '==', '!=', ' and ', ' or ' ]
163    for item in items_to_strip:
164        if filter_condition.find( item ) >= 0:
165            filter_condition = filter_condition.replace( item, ' ' )
166    operands = set( filter_condition.split( ' ' ) )
167    return operands
168
169def stop_err( msg ):
170    sys.stderr.write( msg )
171    sys.exit()
172
173def __main__():
174    #Parse Command Line
175    options, args = doc_optparse.parse( __doc__ )
176    input = options.input
177    output = options.output
178    cond = options.cond
179    n_handling = options.n_handling
180    columns = options.columns
181    col_types = options.col_types
182
183    try:
184        in_columns = int( columns )
185        assert col_types  #check to see that the column types variable isn't null
186        in_column_types = col_types.split( ',' )
187    except:
188        stop_err( "Data does not appear to be tabular.  This tool can only be used with tab-delimited data." )
189
190    # Unescape if input has been escaped
191    cond_text = cond.replace( '__eq__', '==' ).replace( '__ne__', '!=' ).replace( '__sq__', "'" )
192    orig_cond_text = cond_text
193    # Expand to allow for DNA codes
194    dot_letters = [ letter for letter in string.uppercase if letter not in \
195                   [ 'A', 'C', 'G', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'N', 'R', 'S', 'V', 'W', 'X', 'Y' ] ]
196    dot_letters.append( '.' )
197    codes = {'A': [ 'A', 'D', 'H', 'M', 'R', 'V', 'W' ],
198             'C': [ 'C', 'B', 'H', 'M', 'S', 'V', 'Y' ],
199             'G': [ 'G', 'B', 'D', 'K', 'R', 'S', 'V' ],
200             'T': [ 'T', 'U', 'B', 'D', 'H', 'K', 'W', 'Y' ],
201             'U': [ 'T', 'U', 'B', 'D', 'H', 'K', 'W', 'Y' ],
202             'K': [ 'G', 'T', 'U', 'B', 'D', 'H', 'K', 'R', 'S', 'V', 'W', 'Y' ],
203             'M': [ 'A', 'C', 'B', 'D', 'H', 'M', 'R', 'S', 'V', 'W', 'Y' ],
204             'R': [ 'A', 'G', 'B', 'D', 'H', 'K', 'M', 'R', 'S', 'V', 'W' ],
205             'Y': [ 'C', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'S', 'V', 'W', 'Y' ],
206             'S': [ 'C', 'G', 'B', 'D', 'H', 'K', 'M', 'R', 'S', 'V', 'Y' ],
207             'W': [ 'A', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'R', 'V', 'W', 'Y' ],
208             'B': [ 'C', 'G', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'R', 'S', 'V', 'W', 'Y' ],
209             'V': [ 'A', 'C', 'G', 'B', 'D', 'H', 'K', 'M', 'R', 'S', 'V', 'W' ],
210             'H': [ 'A', 'C', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'R', 'S', 'V', 'W', 'Y' ],
211             'D': [ 'A', 'G', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'R', 'S', 'V', 'W', 'Y' ],
212             '.': dot_letters,
213             '-': [ '-' ]}
214    # Add handling for N and X
215    if n_handling == "all":
216        codes[ 'N' ] = [ 'A', 'C', 'G', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'N', 'R', 'S', 'V', 'W', 'X', 'Y' ]
217        codes[ 'X' ] = [ 'A', 'C', 'G', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'N', 'R', 'S', 'V', 'W', 'X', 'Y' ]
218        for code in codes.keys():
219            if code != '.' and code != '-':
220                codes[code].append( 'N' )
221                codes[code].append( 'X' )
222    else:
223        codes[ 'N' ] = dot_letters
224        codes[ 'X' ] = dot_letters
225        codes[ '.' ].extend( [ 'N', 'X' ] )
226    # Expand conditions to allow for DNA codes
227    try:
228        match_replace = {}
229        pat = re.compile( 'c\d+\s*[!=]=\s*[\w\d"\'+-.]+' )
230        matches = pat.findall( cond_text )
231        for match in matches:
232            if match.find( 'chr' ) >= 0 or match.find( 'scaffold' ) >= 0 or match.find( '+' ) >= 0:
233                if match.find( '==' ) >= 0:
234                    match_parts = match.split( '==' )
235                elif match.find( '!=' ) >= 0:
236                    match_parts = match.split( '!=' )
237                else:
238                    raise Exception, "The operators '==' and '!=' were not found."
239                left = match_parts[0].strip()
240                right = match_parts[1].strip()
241                new_match = "(%s)" % ( match )
242            elif match.find( '==' ) > 0:
243                match_parts = match.split( '==' )
244                left = match_parts[0].strip()
245                right = match_parts[1].strip()
246                new_match = '(%s in codes[%s] and %s in codes[%s])' % ( left, right, right, left )
247            elif match.find( '!=' ) > 0 :
248                match_parts = match.split( '!=' )
249                left = match_parts[0].strip()
250                right = match_parts[1].strip()
251                new_match = '(%s not in codes[%s] or %s not in codes[%s])' % ( left, right, right, left )
252            else:
253                raise Exception, "The operators '==' and '!=' were not found." 
254            assert left.startswith( 'c' ), 'The column names should start with c (lowercase)'
255            if right.find( "'" ) >= 0 or right.find( '"' ) >= 0:
256                test = right.replace( "'", '' ).replace( '"', '' )
257                assert test in string.uppercase or test.find( '+' ) >= 0 or test.find( '.' ) >= 0 or test.find( '-' ) >= 0\
258                        or test.startswith( 'chr' ) or test.startswith( 'scaffold' ), \
259                        'The value to search for should be a valid base, code, plus sign, chromosome (like "chr1") or scaffold (like "scaffold5"). ' \
260                        'Use the general filter tool to filter on anything else first'
261            else:
262                assert right.startswith( 'c' ), 'The column names should start with c (lowercase)'
263            match_replace[match] = new_match
264        if len( match_replace.keys() ) == 0:
265            raise Exception, 'There do not appear to be any valid conditions'
266        for match in match_replace.keys():
267            cond_text = cond_text.replace( match, match_replace[match] )
268    except Exception, e:
269        stop_err( "At least one of your conditions is invalid. Make sure to use only '!=' or '==', valid column numbers, and valid base values.\n" + str(e) )
270
271    # Attempt to determine if the condition includes executable stuff and, if so, exit
272    secured = dir()
273    operands = get_operands( cond_text )
274    for operand in operands:
275        try:
276            check = int( operand )
277        except:
278            if operand in secured:
279                stop_err( "Illegal value '%s' in condition '%s'" % ( operand, cond_text ) )
280
281    if not check_expression( cond_text ):
282        stop_err( "Illegal/invalid in condition '%s'" % ( cond_text ) )
283
284    # Prepare the column variable names and wrappers for column data types
285    cols, type_casts = [], []
286    for col in range( 1, in_columns + 1 ):
287        col_name = "c%d" % col
288        cols.append( col_name )
289        col_type = in_column_types[ col - 1 ]
290        type_cast = "%s(%s)" % ( col_type, col_name )
291        type_casts.append( type_cast )
292
293    col_str = ', '.join( cols )    # 'c1, c2, c3, c4'
294    type_cast_str = ', '.join( type_casts )  # 'str(c1), int(c2), int(c3), str(c4)'
295    assign = "%s = line.split( '\\t' )" % col_str
296    wrap = "%s = %s" % ( col_str, type_cast_str )
297    skipped_lines = 0
298    first_invalid_line = 0
299    invalid_line = None
300    lines_kept = 0
301    total_lines = 0
302    out = open( output, 'wt' )
303    # Read and filter input file, skipping invalid lines
304    code = '''
305for i, line in enumerate( file( input ) ):
306    total_lines += 1
307    line = line.rstrip( '\\r\\n' )
308    if not line or line.startswith( '#' ):
309        skipped_lines += 1
310        if not invalid_line:
311            first_invalid_line = i + 1
312            invalid_line = line
313        continue
314    try:
315        %s = line.split( '\\t' )
316        %s = %s
317        if %s:
318            lines_kept += 1
319            print >> out, line
320    except Exception, e:
321        skipped_lines += 1
322        if not invalid_line:
323            first_invalid_line = i + 1
324            invalid_line = line
325''' % ( col_str, col_str, type_cast_str, cond_text )
326
327    valid_filter = True
328    try:
329        exec code
330    except Exception, e:
331        out.close()
332        if str( e ).startswith( 'invalid syntax' ):
333            valid_filter = False
334            stop_err( 'Filter condition "%s" likely invalid. See tool tips, syntax and examples.' % orig_cond_text + ' '+str(e))
335        else:
336            stop_err( str( e ) )
337
338    if valid_filter:
339        out.close()
340        valid_lines = total_lines - skipped_lines
341        print 'Filtering with %s, ' % orig_cond_text
342        if valid_lines > 0:
343            print 'kept %4.2f%% of %d lines.' % ( 100.0*lines_kept/valid_lines, total_lines )
344        else:
345            print 'Possible invalid filter condition "%s" or non-existent column referenced. See tool tips, syntax and examples.' % orig_cond_text
346        if skipped_lines > 0:
347            print 'Skipped %d invalid lines starting at line #%d: "%s"' % ( skipped_lines, first_invalid_line, invalid_line )
348    
349if __name__ == "__main__" : __main__()