/tools/stats/dna_filtering.py
#!/usr/bin/env python

"""
This tool takes a tab-delimited text file as input and creates filters on columns based on certain properties. The tool will skip over invalid lines within the file, informing the user about the number of lines skipped.

usage: %prog [options]
    -i, --input=i: tabular input file
    -o, --output=o: filtered output file
    -c, --cond=c: conditions to filter on
    -n, --n_handling=n: how to handle N and X
    -l, --columns=l: columns
    -t, --col_types=t: column types

"""

#from __future__ import division
import os.path, re, string, sys
from galaxy import eggs
import pkg_resources; pkg_resources.require( "bx-python" )
from bx.cookbook import doc_optparse

# Older py compatibility
try:
    set()
except:
    from sets import Set as set

#assert sys.version_info[:2] >= ( 2, 4 )

def get_operands( filter_condition ):
    # Note that the order of items_to_strip is important
    items_to_strip = [ '==', '!=', ' and ', ' or ' ]
    for item in items_to_strip:
        if filter_condition.find( item ) >= 0:
            filter_condition = filter_condition.replace( item, ' ' )
    operands = set( filter_condition.split( ' ' ) )
    return operands

def stop_err( msg ):
    sys.stderr.write( msg )
    sys.exit()

def __main__():
    # Parse command line
    options, args = doc_optparse.parse( __doc__ )
    input = options.input
    output = options.output
    cond = options.cond
    n_handling = options.n_handling
    columns = options.columns
    col_types = options.col_types

    try:
        in_columns = int( columns )
        assert col_types  # check to see that the column types variable isn't null
        in_column_types = col_types.split( ',' )
    except:
        stop_err( "Data does not appear to be tabular. This tool can only be used with tab-delimited data." )

    # Unescape if input has been escaped
    cond_text = cond.replace( '__eq__', '==' ).replace( '__ne__', '!=' ).replace( '__sq__', "'" )
    orig_cond_text = cond_text
    # Expand to allow for DNA codes
    dot_letters = [ letter for letter in string.uppercase if letter not in \
                    [ 'A', 'C', 'G', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'N', 'R', 'S', 'V', 'W', 'X', 'Y' ] ]
    dot_letters.append( '.' )
    # IUPAC ambiguity codes: each base/code maps to the set of codes it is allowed to match
    codes = {'A': [ 'A', 'D', 'H', 'M', 'R', 'V', 'W' ],
             'C': [ 'C', 'B', 'H', 'M', 'S', 'V', 'Y' ],
             'G': [ 'G', 'B', 'D', 'K', 'R', 'S', 'V' ],
             'T': [ 'T', 'U', 'B', 'D', 'H', 'K', 'W', 'Y' ],
             'U': [ 'T', 'U', 'B', 'D', 'H', 'K', 'W', 'Y' ],
             'K': [ 'G', 'T', 'U', 'B', 'D', 'H', 'K', 'R', 'S', 'V', 'W', 'Y' ],
             'M': [ 'A', 'C', 'B', 'D', 'H', 'M', 'R', 'S', 'V', 'W', 'Y' ],
             'R': [ 'A', 'G', 'B', 'D', 'H', 'K', 'M', 'R', 'S', 'V', 'W' ],
             'Y': [ 'C', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'S', 'V', 'W', 'Y' ],
             'S': [ 'C', 'G', 'B', 'D', 'H', 'K', 'M', 'R', 'S', 'V', 'Y' ],
             'W': [ 'A', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'R', 'V', 'W', 'Y' ],
             'B': [ 'C', 'G', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'R', 'S', 'V', 'W', 'Y' ],
             'V': [ 'A', 'C', 'G', 'B', 'D', 'H', 'K', 'M', 'R', 'S', 'V', 'W' ],
             'H': [ 'A', 'C', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'R', 'S', 'V', 'W', 'Y' ],
             'D': [ 'A', 'G', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'R', 'S', 'V', 'W', 'Y' ],
             '.': dot_letters,
             '-': [ '-' ]}
    # Add handling for N and X
    if n_handling == "all":
        codes[ 'N' ] = [ 'A', 'C', 'G', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'N', 'R', 'S', 'V', 'W', 'X', 'Y' ]
        codes[ 'X' ] = [ 'A', 'C', 'G', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'N', 'R', 'S', 'V', 'W', 'X', 'Y' ]
        for code in codes.keys():
            if code != '.' and code != '-':
                codes[code].append( 'N' )
                codes[code].append( 'X' )
    else:
        codes[ 'N' ] = dot_letters
        codes[ 'X' ] = dot_letters
        codes[ '.' ].extend( [ 'N', 'X' ] )
    # Expand conditions to allow for DNA codes
    try:
        match_replace = {}
        pat = re.compile( 'c\d+\s*[!=]=\s*[\w\d"\'+-.]+' )
        matches = pat.findall( cond_text )
        for match in matches:
            if match.find( 'chr' ) >= 0 or match.find( 'scaffold' ) >= 0 or match.find( '+' ) >= 0:
                if match.find( '==' ) >= 0:
                    match_parts = match.split( '==' )
                elif match.find( '!=' ) >= 0:
                    match_parts = match.split( '!=' )
                else:
                    raise Exception, "The operators '==' and '!=' were not found."
                left = match_parts[0].strip()
                right = match_parts[1].strip()
                new_match = "(%s)" % ( match )
            elif match.find( '==' ) > 0:
                match_parts = match.split( '==' )
                left = match_parts[0].strip()
                right = match_parts[1].strip()
                new_match = '(%s in codes[%s] and %s in codes[%s])' % ( left, right, right, left )
            elif match.find( '!=' ) > 0:
                match_parts = match.split( '!=' )
                left = match_parts[0].strip()
                right = match_parts[1].strip()
                new_match = '(%s not in codes[%s] or %s not in codes[%s])' % ( left, right, right, left )
            else:
                raise Exception, "The operators '==' and '!=' were not found."
            assert left.startswith( 'c' ), 'The column names should start with c (lowercase)'
            if right.find( "'" ) >= 0 or right.find( '"' ) >= 0:
                test = right.replace( "'", '' ).replace( '"', '' )
                assert test in string.uppercase or test.find( '+' ) >= 0 or test.find( '.' ) >= 0 or test.find( '-' ) >= 0 \
                    or test.startswith( 'chr' ) or test.startswith( 'scaffold' ), \
                    'The value to search for should be a valid base, code, plus sign, chromosome (like "chr1") or scaffold (like "scaffold5"). ' \
                    'Use the general filter tool to filter on anything else first'
            else:
                assert right.startswith( 'c' ), 'The column names should start with c (lowercase)'
            match_replace[match] = new_match
        if len( match_replace.keys() ) == 0:
            raise Exception, 'There do not appear to be any valid conditions'
        for match in match_replace.keys():
            cond_text = cond_text.replace( match, match_replace[match] )
    except Exception, e:
        stop_err( "At least one of your conditions is invalid. Make sure to use only '!=' or '==', valid column numbers, and valid base values.\n" + str( e ) )

    # Attempt to determine if the condition includes executable stuff and, if so, exit
    secured = dir()
    operands = get_operands( cond_text )
    for operand in operands:
        try:
            check = int( operand )
        except:
            if operand in secured:
                stop_err( "Illegal value '%s' in condition '%s'" % ( operand, cond_text ) )

    # Prepare the column variable names and wrappers for column data types
    cols, type_casts = [], []
    for col in range( 1, in_columns + 1 ):
        col_name = "c%d" % col
        cols.append( col_name )
        col_type = in_column_types[ col - 1 ]
        type_cast = "%s(%s)" % ( col_type, col_name )
        type_casts.append( type_cast )

    col_str = ', '.join( cols )              # 'c1, c2, c3, c4'
    type_cast_str = ', '.join( type_casts )  # 'str(c1), int(c2), int(c3), str(c4)'
    assign = "%s = line.split( '\\t' )" % col_str
    wrap = "%s = %s" % ( col_str, type_cast_str )
    skipped_lines = 0
    first_invalid_line = 0
    invalid_line = None
    lines_kept = 0
    total_lines = 0
    out = open( output, 'wt' )
    # Read and filter input file, skipping invalid lines
    code = '''
for i, line in enumerate( file( input ) ):
    total_lines += 1
    line = line.rstrip( '\\r\\n' )
    if not line or line.startswith( '#' ):
        skipped_lines += 1
        if not invalid_line:
            first_invalid_line = i + 1
            invalid_line = line
        continue
    try:
        %s = line.split( '\\t' )
        %s = %s
        if %s:
            lines_kept += 1
            print >> out, line
    except Exception, e:
        skipped_lines += 1
        if not invalid_line:
            first_invalid_line = i + 1
            invalid_line = line
''' % ( col_str, col_str, type_cast_str, cond_text )

    valid_filter = True
    try:
        exec code
    except Exception, e:
        out.close()
        if str( e ).startswith( 'invalid syntax' ):
            valid_filter = False
            stop_err( 'Filter condition "%s" likely invalid. See tool tips, syntax and examples.' % orig_cond_text + ' ' + str( e ) )
        else:
            stop_err( str( e ) )

    if valid_filter:
        out.close()
        valid_lines = total_lines - skipped_lines
        print 'Filtering with %s, ' % orig_cond_text
        if valid_lines > 0:
            print 'kept %4.2f%% of %d lines.' % ( 100.0 * lines_kept / valid_lines, total_lines )
        else:
            print 'Possible invalid filter condition "%s" or non-existent column referenced. See tool tips, syntax and examples.' % orig_cond_text
        if skipped_lines > 0:
            print 'Skipped %d invalid lines starting at line #%d: "%s"' % ( skipped_lines, first_invalid_line, invalid_line )

if __name__ == "__main__": __main__()
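# ---------------------------------------------------------------------------
# Illustrative invocation (a sketch only, not part of the tool): the file
# names, column count and condition below are hypothetical, and the script
# must run in a Galaxy environment that provides the `galaxy` and `bx-python`
# packages.
#
#   python dna_filtering.py -i snps.tabular -o filtered.tabular \
#          -c "c4 == 'A'" -n all -l 6 -t str,int,int,str,str,str
#
# With n_handling set to "all", a condition such as c4 == 'A' is expanded to
# (c4 in codes['A'] and 'A' in codes[c4]), so IUPAC ambiguity codes that can
# stand for an A (e.g. R, W, M, N) also satisfy the filter.
# ---------------------------------------------------------------------------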