/tools/stats/dna_filtering.py

https://bitbucket.org/h_morita_dbcls/galaxy-central · Python · 216 lines · 176 code · 15 blank · 25 comment · 49 complexity · 011a8e08d940eef49a6f5a129ed614b4 MD5 · raw file

  1. #!/usr/bin/env python
  2. """
  3. This tool takes a tab-delimited text file as input and creates filters on columns based on certain properties. The tool will skip over invalid lines within the file, informing the user about the number of lines skipped.
  4. usage: %prog [options]
  5. -i, --input=i: tabular input file
  6. -o, --output=o: filtered output file
  7. -c, --cond=c: conditions to filter on
  8. -n, --n_handling=n: how to handle N and X
  9. -l, --columns=l: columns
  10. -t, --col_types=t: column types
  11. """
  12. #from __future__ import division
  13. import os.path, re, string, string, sys
  14. from galaxy import eggs
  15. import pkg_resources; pkg_resources.require( "bx-python" )
  16. from bx.cookbook import doc_optparse
  17. # Older py compatibility
  18. try:
  19. set()
  20. except:
  21. from sets import Set as set
  22. #assert sys.version_info[:2] >= ( 2, 4 )
  23. def get_operands( filter_condition ):
  24. # Note that the order of all_operators is important
  25. items_to_strip = [ '==', '!=', ' and ', ' or ' ]
  26. for item in items_to_strip:
  27. if filter_condition.find( item ) >= 0:
  28. filter_condition = filter_condition.replace( item, ' ' )
  29. operands = set( filter_condition.split( ' ' ) )
  30. return operands
  31. def stop_err( msg ):
  32. sys.stderr.write( msg )
  33. sys.exit()
def __main__():
    """Filter a tab-delimited file on DNA base / IUPAC-ambiguity-code conditions.

    Workflow:
      1. parse the options documented in the module docstring;
      2. build ``codes``, mapping each base/ambiguity code to every code whose
         base set overlaps it, with N/X handled per --n_handling;
      3. rewrite the user's condition so ``==``/``!=`` comparisons between a
         column and a code become symmetric overlap tests against ``codes``;
      4. exec a generated per-line loop that splits, type-casts and tests each
         input line, copying matches to the output file;
      5. print a summary and report skipped (blank/comment/invalid) lines.
    """
    # Parse Command Line
    options, args = doc_optparse.parse( __doc__ )
    input = options.input            # path of the tab-delimited input file
    output = options.output          # path of the filtered output file
    cond = options.cond              # filter condition (may be Galaxy-escaped)
    n_handling = options.n_handling  # "all": N/X overlap every code; otherwise they act like '.'
    columns = options.columns        # column count, still in string form
    col_types = options.col_types    # comma-separated type names, e.g. "str,int,int,str"
    try:
        in_columns = int( columns )
        assert col_types #check to see that the column types variable isn't null
        in_column_types = col_types.split( ',' )
    except:
        stop_err( "Data does not appear to be tabular. This tool can only be used with tab-delimited data." )
    # Unescape if input has been escaped
    cond_text = cond.replace( '__eq__', '==' ).replace( '__ne__', '!=' ).replace( '__sq__', "'" )
    orig_cond_text = cond_text  # keep the pristine condition for user-facing messages
    # Expand to allow for DNA codes.
    # Uppercase letters that are neither bases nor IUPAC codes; they only match via '.'
    dot_letters = [ letter for letter in string.uppercase if letter not in \
                    [ 'A', 'C', 'G', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'N', 'R', 'S', 'V', 'W', 'X', 'Y' ] ]
    dot_letters.append( '.' )
    # codes[b] = every code sharing at least one concrete base with b, so
    # "x == y" is interpreted below as "x and y can represent a common base".
    codes = {'A': [ 'A', 'D', 'H', 'M', 'R', 'V', 'W' ],
             'C': [ 'C', 'B', 'H', 'M', 'S', 'V', 'Y' ],
             'G': [ 'G', 'B', 'D', 'K', 'R', 'S', 'V' ],
             'T': [ 'T', 'U', 'B', 'D', 'H', 'K', 'W', 'Y' ],
             'U': [ 'T', 'U', 'B', 'D', 'H', 'K', 'W', 'Y' ],
             'K': [ 'G', 'T', 'U', 'B', 'D', 'H', 'K', 'R', 'S', 'V', 'W', 'Y' ],
             'M': [ 'A', 'C', 'B', 'D', 'H', 'M', 'R', 'S', 'V', 'W', 'Y' ],
             'R': [ 'A', 'G', 'B', 'D', 'H', 'K', 'M', 'R', 'S', 'V', 'W' ],
             'Y': [ 'C', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'S', 'V', 'W', 'Y' ],
             'S': [ 'C', 'G', 'B', 'D', 'H', 'K', 'M', 'R', 'S', 'V', 'Y' ],
             'W': [ 'A', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'R', 'V', 'W', 'Y' ],
             'B': [ 'C', 'G', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'R', 'S', 'V', 'W', 'Y' ],
             'V': [ 'A', 'C', 'G', 'B', 'D', 'H', 'K', 'M', 'R', 'S', 'V', 'W' ],
             'H': [ 'A', 'C', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'R', 'S', 'V', 'W', 'Y' ],
             'D': [ 'A', 'G', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'R', 'S', 'V', 'W', 'Y' ],
             '.': dot_letters,
             '-': [ '-' ]}
    # Add handling for N and X
    if n_handling == "all":
        # N/X overlap everything, and everything except '.'/'-' overlaps N/X.
        codes[ 'N' ] = [ 'A', 'C', 'G', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'N', 'R', 'S', 'V', 'W', 'X', 'Y' ]
        codes[ 'X' ] = [ 'A', 'C', 'G', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'N', 'R', 'S', 'V', 'W', 'X', 'Y' ]
        for code in codes.keys():
            if code != '.' and code != '-':
                codes[code].append( 'N' )
                codes[code].append( 'X' )
    else:
        # N/X behave like unknown letters: they only match via '.'
        codes[ 'N' ] = dot_letters
        codes[ 'X' ] = dot_letters
        codes[ '.' ].extend( [ 'N', 'X' ] )
    # Expand conditions to allow for DNA codes
    try:
        match_replace = {}
        # Matches "cN == value" / "cN != value".  NOTE(review): '+-.' inside the
        # character class is a range from '+' to '.', so it also admits ',' —
        # presumably harmless, but confirm before tightening the pattern.
        pat = re.compile( 'c\d+\s*[!=]=\s*[\w\d"\'+-.]+' )
        matches = pat.findall( cond_text )
        for match in matches:
            # Chromosome/scaffold names and '+' strands compare literally, not via codes.
            if match.find( 'chr' ) >= 0 or match.find( 'scaffold' ) >= 0 or match.find( '+' ) >= 0:
                if match.find( '==' ) >= 0:
                    match_parts = match.split( '==' )
                elif match.find( '!=' ) >= 0:
                    match_parts = match.split( '!=' )
                else:
                    raise Exception, "The operators '==' and '!=' were not found."
                left = match_parts[0].strip()
                right = match_parts[1].strip()
                new_match = "(%s)" % ( match )  # keep the literal comparison, just parenthesized
            elif match.find( '==' ) > 0:
                match_parts = match.split( '==' )
                left = match_parts[0].strip()
                right = match_parts[1].strip()
                # Equality becomes a symmetric overlap test against the codes table.
                new_match = '(%s in codes[%s] and %s in codes[%s])' % ( left, right, right, left )
            elif match.find( '!=' ) > 0 :
                match_parts = match.split( '!=' )
                left = match_parts[0].strip()
                right = match_parts[1].strip()
                new_match = '(%s not in codes[%s] or %s not in codes[%s])' % ( left, right, right, left )
            else:
                raise Exception, "The operators '==' and '!=' were not found."
            assert left.startswith( 'c' ), 'The column names should start with c (lowercase)'
            if right.find( "'" ) >= 0 or right.find( '"' ) >= 0:
                # Quoted literal: restrict to bases/codes, '+', '.', '-', chr*/scaffold* names.
                test = right.replace( "'", '' ).replace( '"', '' )
                assert test in string.uppercase or test.find( '+' ) >= 0 or test.find( '.' ) >= 0 or test.find( '-' ) >= 0\
                       or test.startswith( 'chr' ) or test.startswith( 'scaffold' ), \
                       'The value to search for should be a valid base, code, plus sign, chromosome (like "chr1") or scaffold (like "scaffold5"). ' \
                       'Use the general filter tool to filter on anything else first'
            else:
                # Unquoted right-hand side: must be a column-vs-column comparison.
                assert right.startswith( 'c' ), 'The column names should start with c (lowercase)'
            match_replace[match] = new_match
        if len( match_replace.keys() ) == 0:
            raise Exception, 'There do not appear to be any valid conditions'
        for match in match_replace.keys():
            cond_text = cond_text.replace( match, match_replace[match] )
    except Exception, e:
        stop_err( "At least one of your conditions is invalid. Make sure to use only '!=' or '==', valid column numbers, and valid base values.\n" + str(e) )
    # Attempt to determine if the condition includes executable stuff and, if so, exit.
    # NOTE(review): this is a best-effort guard (it rejects operands that name
    # local variables), not a real sandbox for the exec below.
    secured = dir()
    operands = get_operands( cond_text )
    for operand in operands:
        try:
            check = int( operand )  # purely numeric operands are always allowed
        except:
            if operand in secured:
                stop_err( "Illegal value '%s' in condition '%s'" % ( operand, cond_text ) )
    # Prepare the column variable names and wrappers for column data types
    cols, type_casts = [], []
    for col in range( 1, in_columns + 1 ):
        col_name = "c%d" % col
        cols.append( col_name )
        col_type = in_column_types[ col - 1 ]
        type_cast = "%s(%s)" % ( col_type, col_name )
        type_casts.append( type_cast )
    col_str = ', '.join( cols )    # 'c1, c2, c3, c4'
    type_cast_str = ', '.join( type_casts )    # 'str(c1), int(c2), int(c3), str(c4)'
    # NOTE(review): assign/wrap are built but never used — the code template
    # below re-derives the same text from col_str/type_cast_str directly.
    assign = "%s = line.split( '\\t' )" % col_str
    wrap = "%s = %s" % ( col_str, type_cast_str )
    skipped_lines = 0
    first_invalid_line = 0    # 1-based number of the first skipped line
    invalid_line = None
    lines_kept = 0
    total_lines = 0
    out = open( output, 'wt' )
    # Read and filter input file, skipping invalid lines.
    # Blank and '#'-comment lines are counted among the skipped lines; a line
    # whose split/type-cast/condition evaluation raises is skipped the same way.
    code = '''
for i, line in enumerate( file( input ) ):
    total_lines += 1
    line = line.rstrip( '\\r\\n' )
    if not line or line.startswith( '#' ):
        skipped_lines += 1
        if not invalid_line:
            first_invalid_line = i + 1
            invalid_line = line
        continue
    try:
        %s = line.split( '\\t' )
        %s = %s
        if %s:
            lines_kept += 1
            print >> out, line
    except Exception, e:
        skipped_lines += 1
        if not invalid_line:
            first_invalid_line = i + 1
            invalid_line = line
''' % ( col_str, col_str, type_cast_str, cond_text )
    valid_filter = True
    try:
        exec code
    except Exception, e:
        out.close()
        # 'invalid syntax' means the generated code (hence the condition) was malformed.
        if str( e ).startswith( 'invalid syntax' ):
            valid_filter = False
            stop_err( 'Filter condition "%s" likely invalid. See tool tips, syntax and examples.' % orig_cond_text + ' '+str(e))
        else:
            stop_err( str( e ) )
    if valid_filter:
        out.close()
        valid_lines = total_lines - skipped_lines
        print 'Filtering with %s, ' % orig_cond_text
        if valid_lines > 0:
            print 'kept %4.2f%% of %d lines.' % ( 100.0*lines_kept/valid_lines, total_lines )
        else:
            print 'Possible invalid filter condition "%s" or non-existent column referenced. See tool tips, syntax and examples.' % orig_cond_text
        if skipped_lines > 0:
            print 'Skipped %d invalid lines starting at line #%d: "%s"' % ( skipped_lines, first_invalid_line, invalid_line )
  199. if __name__ == "__main__" : __main__()