PageRenderTime 37ms CodeModel.GetById 17ms app.highlight 17ms RepoModel.GetById 0ms app.codeStats 1ms

/tools/stats/dna_filtering.py

https://bitbucket.org/h_morita_dbcls/galaxy-central
Python | 216 lines | 185 code | 9 blank | 22 comment | 24 complexity | 011a8e08d940eef49a6f5a129ed614b4 MD5 | raw file
  1#!/usr/bin/env python
  2
  3"""
  4This tool takes a tab-delimited text file as input and creates filters on columns based on certain properties. The tool will skip over invalid lines within the file, informing the user about the number of lines skipped.
  5
  6usage: %prog [options]
  7    -i, --input=i: tabular input file
  8    -o, --output=o: filtered output file
  9    -c, --cond=c: conditions to filter on
 10    -n, --n_handling=n: how to handle N and X
 11    -l, --columns=l: columns 
 12    -t, --col_types=t: column types    
 13
 14"""
 15
 16#from __future__ import division
 17import os.path, re, string, string, sys
 18from galaxy import eggs
 19import pkg_resources; pkg_resources.require( "bx-python" )
 20from bx.cookbook import doc_optparse
 21
 22# Older py compatibility
 23try:
 24    set()
 25except:
 26    from sets import Set as set
 27
 28#assert sys.version_info[:2] >= ( 2, 4 )
 29
 30def get_operands( filter_condition ):
 31    # Note that the order of all_operators is important
 32    items_to_strip = [ '==', '!=', ' and ', ' or ' ]
 33    for item in items_to_strip:
 34        if filter_condition.find( item ) >= 0:
 35            filter_condition = filter_condition.replace( item, ' ' )
 36    operands = set( filter_condition.split( ' ' ) )
 37    return operands
 38
 39def stop_err( msg ):
 40    sys.stderr.write( msg )
 41    sys.exit()
 42
 43def __main__():
 44    #Parse Command Line
 45    options, args = doc_optparse.parse( __doc__ )
 46    input = options.input
 47    output = options.output
 48    cond = options.cond
 49    n_handling = options.n_handling
 50    columns = options.columns
 51    col_types = options.col_types
 52
 53    try:
 54        in_columns = int( columns )
 55        assert col_types  #check to see that the column types variable isn't null
 56        in_column_types = col_types.split( ',' )
 57    except:
 58        stop_err( "Data does not appear to be tabular.  This tool can only be used with tab-delimited data." )
 59
 60    # Unescape if input has been escaped
 61    cond_text = cond.replace( '__eq__', '==' ).replace( '__ne__', '!=' ).replace( '__sq__', "'" )
 62    orig_cond_text = cond_text
 63    # Expand to allow for DNA codes
 64    dot_letters = [ letter for letter in string.uppercase if letter not in \
 65                   [ 'A', 'C', 'G', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'N', 'R', 'S', 'V', 'W', 'X', 'Y' ] ]
 66    dot_letters.append( '.' )
 67    codes = {'A': [ 'A', 'D', 'H', 'M', 'R', 'V', 'W' ],
 68             'C': [ 'C', 'B', 'H', 'M', 'S', 'V', 'Y' ],
 69             'G': [ 'G', 'B', 'D', 'K', 'R', 'S', 'V' ],
 70             'T': [ 'T', 'U', 'B', 'D', 'H', 'K', 'W', 'Y' ],
 71             'U': [ 'T', 'U', 'B', 'D', 'H', 'K', 'W', 'Y' ],
 72             'K': [ 'G', 'T', 'U', 'B', 'D', 'H', 'K', 'R', 'S', 'V', 'W', 'Y' ],
 73             'M': [ 'A', 'C', 'B', 'D', 'H', 'M', 'R', 'S', 'V', 'W', 'Y' ],
 74             'R': [ 'A', 'G', 'B', 'D', 'H', 'K', 'M', 'R', 'S', 'V', 'W' ],
 75             'Y': [ 'C', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'S', 'V', 'W', 'Y' ],
 76             'S': [ 'C', 'G', 'B', 'D', 'H', 'K', 'M', 'R', 'S', 'V', 'Y' ],
 77             'W': [ 'A', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'R', 'V', 'W', 'Y' ],
 78             'B': [ 'C', 'G', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'R', 'S', 'V', 'W', 'Y' ],
 79             'V': [ 'A', 'C', 'G', 'B', 'D', 'H', 'K', 'M', 'R', 'S', 'V', 'W' ],
 80             'H': [ 'A', 'C', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'R', 'S', 'V', 'W', 'Y' ],
 81             'D': [ 'A', 'G', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'R', 'S', 'V', 'W', 'Y' ],
 82             '.': dot_letters,
 83             '-': [ '-' ]}
 84    # Add handling for N and X
 85    if n_handling == "all":
 86        codes[ 'N' ] = [ 'A', 'C', 'G', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'N', 'R', 'S', 'V', 'W', 'X', 'Y' ]
 87        codes[ 'X' ] = [ 'A', 'C', 'G', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'N', 'R', 'S', 'V', 'W', 'X', 'Y' ]
 88        for code in codes.keys():
 89            if code != '.' and code != '-':
 90                codes[code].append( 'N' )
 91                codes[code].append( 'X' )
 92    else:
 93        codes[ 'N' ] = dot_letters
 94        codes[ 'X' ] = dot_letters
 95        codes[ '.' ].extend( [ 'N', 'X' ] )
 96    # Expand conditions to allow for DNA codes
 97    try:
 98        match_replace = {}
 99        pat = re.compile( 'c\d+\s*[!=]=\s*[\w\d"\'+-.]+' )
100        matches = pat.findall( cond_text )
101        for match in matches:
102            if match.find( 'chr' ) >= 0 or match.find( 'scaffold' ) >= 0 or match.find( '+' ) >= 0:
103                if match.find( '==' ) >= 0:
104                    match_parts = match.split( '==' )
105                elif match.find( '!=' ) >= 0:
106                    match_parts = match.split( '!=' )
107                else:
108                    raise Exception, "The operators '==' and '!=' were not found."
109                left = match_parts[0].strip()
110                right = match_parts[1].strip()
111                new_match = "(%s)" % ( match )
112            elif match.find( '==' ) > 0:
113                match_parts = match.split( '==' )
114                left = match_parts[0].strip()
115                right = match_parts[1].strip()
116                new_match = '(%s in codes[%s] and %s in codes[%s])' % ( left, right, right, left )
117            elif match.find( '!=' ) > 0 :
118                match_parts = match.split( '!=' )
119                left = match_parts[0].strip()
120                right = match_parts[1].strip()
121                new_match = '(%s not in codes[%s] or %s not in codes[%s])' % ( left, right, right, left )
122            else:
123                raise Exception, "The operators '==' and '!=' were not found." 
124            assert left.startswith( 'c' ), 'The column names should start with c (lowercase)'
125            if right.find( "'" ) >= 0 or right.find( '"' ) >= 0:
126                test = right.replace( "'", '' ).replace( '"', '' )
127                assert test in string.uppercase or test.find( '+' ) >= 0 or test.find( '.' ) >= 0 or test.find( '-' ) >= 0\
128                        or test.startswith( 'chr' ) or test.startswith( 'scaffold' ), \
129                        'The value to search for should be a valid base, code, plus sign, chromosome (like "chr1") or scaffold (like "scaffold5"). ' \
130                        'Use the general filter tool to filter on anything else first'
131            else:
132                assert right.startswith( 'c' ), 'The column names should start with c (lowercase)'
133            match_replace[match] = new_match
134        if len( match_replace.keys() ) == 0:
135            raise Exception, 'There do not appear to be any valid conditions'
136        for match in match_replace.keys():
137            cond_text = cond_text.replace( match, match_replace[match] )
138    except Exception, e:
139        stop_err( "At least one of your conditions is invalid. Make sure to use only '!=' or '==', valid column numbers, and valid base values.\n" + str(e) )
140
141    # Attempt to determine if the condition includes executable stuff and, if so, exit
142    secured = dir()
143    operands = get_operands( cond_text )
144    for operand in operands:
145        try:
146            check = int( operand )
147        except:
148            if operand in secured:
149                stop_err( "Illegal value '%s' in condition '%s'" % ( operand, cond_text ) )
150
151    # Prepare the column variable names and wrappers for column data types
152    cols, type_casts = [], []
153    for col in range( 1, in_columns + 1 ):
154        col_name = "c%d" % col
155        cols.append( col_name )
156        col_type = in_column_types[ col - 1 ]
157        type_cast = "%s(%s)" % ( col_type, col_name )
158        type_casts.append( type_cast )
159
160    col_str = ', '.join( cols )    # 'c1, c2, c3, c4'
161    type_cast_str = ', '.join( type_casts )  # 'str(c1), int(c2), int(c3), str(c4)'
162    assign = "%s = line.split( '\\t' )" % col_str
163    wrap = "%s = %s" % ( col_str, type_cast_str )
164    skipped_lines = 0
165    first_invalid_line = 0
166    invalid_line = None
167    lines_kept = 0
168    total_lines = 0
169    out = open( output, 'wt' )
170    # Read and filter input file, skipping invalid lines
171    code = '''
172for i, line in enumerate( file( input ) ):
173    total_lines += 1
174    line = line.rstrip( '\\r\\n' )
175    if not line or line.startswith( '#' ):
176        skipped_lines += 1
177        if not invalid_line:
178            first_invalid_line = i + 1
179            invalid_line = line
180        continue
181    try:
182        %s = line.split( '\\t' )
183        %s = %s
184        if %s:
185            lines_kept += 1
186            print >> out, line
187    except Exception, e:
188        skipped_lines += 1
189        if not invalid_line:
190            first_invalid_line = i + 1
191            invalid_line = line
192''' % ( col_str, col_str, type_cast_str, cond_text )
193
194    valid_filter = True
195    try:
196        exec code
197    except Exception, e:
198        out.close()
199        if str( e ).startswith( 'invalid syntax' ):
200            valid_filter = False
201            stop_err( 'Filter condition "%s" likely invalid. See tool tips, syntax and examples.' % orig_cond_text + ' '+str(e))
202        else:
203            stop_err( str( e ) )
204
205    if valid_filter:
206        out.close()
207        valid_lines = total_lines - skipped_lines
208        print 'Filtering with %s, ' % orig_cond_text
209        if valid_lines > 0:
210            print 'kept %4.2f%% of %d lines.' % ( 100.0*lines_kept/valid_lines, total_lines )
211        else:
212            print 'Possible invalid filter condition "%s" or non-existent column referenced. See tool tips, syntax and examples.' % orig_cond_text
213        if skipped_lines > 0:
214            print 'Skipped %d invalid lines starting at line #%d: "%s"' % ( skipped_lines, first_invalid_line, invalid_line )
215    
216if __name__ == "__main__" : __main__()