/tools/stats/dna_filtering.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 349 lines · 279 code · 34 blank · 36 comment · 80 complexity · 49d311dc3d3fb050ef0a1e4048a4e92e MD5 · raw file

  1. #!/usr/bin/env python
  2. """
  3. This tool takes a tab-delimited text file as input and creates filters on columns based on certain properties. The tool will skip over invalid lines within the file, informing the user about the number of lines skipped.
  4. usage: %prog [options]
  5. -i, --input=i: tabular input file
  6. -o, --output=o: filtered output file
  7. -c, --cond=c: conditions to filter on
  8. -n, --n_handling=n: how to handle N and X
  9. -l, --columns=l: columns
  10. -t, --col_types=t: column types
  11. """
  12. #from __future__ import division
  13. import os.path, re, string, string, sys
  14. from galaxy import eggs
  15. import pkg_resources; pkg_resources.require( "bx-python" )
  16. from bx.cookbook import doc_optparse
  17. from ast import parse, Module, walk
  18. # Older py compatibility
  19. try:
  20. set()
  21. except:
  22. from sets import Set as set
  23. AST_NODE_TYPE_WHITELIST = [
  24. 'Expr', 'Load',
  25. 'Str', 'Num', 'BoolOp',
  26. 'Compare', 'And', 'Eq',
  27. 'NotEq', 'Or', 'GtE',
  28. 'LtE', 'Lt', 'Gt',
  29. 'BinOp', 'Add', 'Div',
  30. 'Sub', 'Mult', 'Mod',
  31. 'Pow', 'LShift', 'GShift',
  32. 'BitAnd', 'BitOr', 'BitXor',
  33. 'UnaryOp', 'Invert', 'Not',
  34. 'NotIn', 'In', 'Is',
  35. 'IsNot', 'List',
  36. 'Index', 'Subscript',
  37. # Further checks
  38. 'Name', 'Call', 'Attribute',
  39. ]
  40. BUILTIN_AND_MATH_FUNCTIONS = 'abs|all|any|bin|chr|cmp|complex|divmod|float|hex|int|len|long|max|min|oct|ord|pow|range|reversed|round|sorted|str|sum|type|unichr|unicode|log|exp|sqrt|ceil|floor'.split('|')
  41. STRING_AND_LIST_METHODS = [ name for name in dir('') + dir([]) if not name.startswith('_') ]
  42. VALID_FUNCTIONS = BUILTIN_AND_MATH_FUNCTIONS + STRING_AND_LIST_METHODS
  43. def __check_name( ast_node ):
  44. name = ast_node.id
  45. if re.match(r'^c\d+$', name):
  46. return True
  47. elif name == "codes":
  48. return True
  49. else:
  50. return name in VALID_FUNCTIONS
  51. def __check_attribute( ast_node ):
  52. attribute_name = ast_node.attr
  53. if attribute_name not in STRING_AND_LIST_METHODS:
  54. return False
  55. return True
  56. def __check_call( ast_node ):
  57. # If we are calling a function or method, it better be a math,
  58. # string or list function.
  59. ast_func = ast_node.func
  60. ast_func_class = ast_func.__class__.__name__
  61. if ast_func_class == 'Name':
  62. if ast_func.id not in BUILTIN_AND_MATH_FUNCTIONS:
  63. return False
  64. elif ast_func_class == 'Attribute':
  65. if not __check_attribute( ast_func ):
  66. return False
  67. else:
  68. return False
  69. return True
  70. def check_expression( text ):
  71. """
  72. >>> check_expression("c1=='chr1' and c3-c2>=2000 and c6=='+'")
  73. True
  74. >>> check_expression("eval('1+1')")
  75. False
  76. >>> check_expression("import sys")
  77. False
  78. >>> check_expression("[].__str__")
  79. False
  80. >>> check_expression("__builtins__")
  81. False
  82. >>> check_expression("'x' in globals")
  83. False
  84. >>> check_expression("'x' in [1,2,3]")
  85. True
  86. >>> check_expression("c3=='chr1' and c5>5")
  87. True
  88. >>> check_expression("c3=='chr1' and d5>5") # Invalid d5 reference
  89. False
  90. >>> check_expression("c3=='chr1' and c5>5 or exec")
  91. False
  92. >>> check_expression("type(c1) != type(1)")
  93. True
  94. >>> check_expression("c1.split(',')[1] == '1'")
  95. True
  96. >>> check_expression("exec 1")
  97. False
  98. >>> check_expression("str(c2) in [\\\"a\\\",\\\"b\\\"]")
  99. True
  100. """
  101. try:
  102. module = parse( text )
  103. except SyntaxError:
  104. return False
  105. if not isinstance(module, Module):
  106. return False
  107. statements = module.body
  108. if not len( statements ) == 1:
  109. return False
  110. expression = statements[0]
  111. if expression.__class__.__name__ != 'Expr':
  112. return False
  113. for ast_node in walk( expression ):
  114. ast_node_class = ast_node.__class__.__name__
  115. # Toss out everything that is not a "simple" expression,
  116. # imports, error handling, etc...
  117. if ast_node_class not in AST_NODE_TYPE_WHITELIST:
  118. return False
  119. # White-list more potentially dangerous types AST elements.
  120. if ast_node_class == 'Name':
  121. # In order to prevent loading 'exec', 'eval', etc...
  122. # put string restriction on names allowed.
  123. if not __check_name( ast_node ):
  124. return False
  125. # Check only valid, white-listed functions are called.
  126. elif ast_node_class == 'Call':
  127. if not __check_call( ast_node ):
  128. return False
  129. # Check only valid, white-listed attributes are accessed
  130. elif ast_node_class == 'Attribute':
  131. if not __check_attribute( ast_node ):
  132. return False
  133. return True
  134. def get_operands( filter_condition ):
  135. # Note that the order of all_operators is important
  136. items_to_strip = [ '==', '!=', ' and ', ' or ' ]
  137. for item in items_to_strip:
  138. if filter_condition.find( item ) >= 0:
  139. filter_condition = filter_condition.replace( item, ' ' )
  140. operands = set( filter_condition.split( ' ' ) )
  141. return operands
  142. def stop_err( msg ):
  143. sys.stderr.write( msg )
  144. sys.exit()
def __main__():
    """Parse options, sanitize/expand the filter condition, then stream the
    tabular input through an exec'd filter loop, writing kept lines to the
    output file and reporting skipped/invalid lines."""
    #Parse Command Line
    options, args = doc_optparse.parse( __doc__ )
    input = options.input
    output = options.output
    cond = options.cond
    n_handling = options.n_handling
    columns = options.columns
    col_types = options.col_types
    try:
        in_columns = int( columns )
        assert col_types #check to see that the column types variable isn't null
        in_column_types = col_types.split( ',' )
    except:
        stop_err( "Data does not appear to be tabular. This tool can only be used with tab-delimited data." )
    # Unescape if input has been escaped
    cond_text = cond.replace( '__eq__', '==' ).replace( '__ne__', '!=' ).replace( '__sq__', "'" )
    orig_cond_text = cond_text
    # Expand to allow for DNA codes
    # Letters that are NOT IUPAC nucleotide codes; these (plus '.') are what
    # a '.' is considered to match.
    dot_letters = [ letter for letter in string.uppercase if letter not in \
                    [ 'A', 'C', 'G', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'N', 'R', 'S', 'V', 'W', 'X', 'Y' ] ]
    dot_letters.append( '.' )
    # IUPAC ambiguity map: each code maps to the list of codes it matches.
    codes = {'A': [ 'A', 'D', 'H', 'M', 'R', 'V', 'W' ],
             'C': [ 'C', 'B', 'H', 'M', 'S', 'V', 'Y' ],
             'G': [ 'G', 'B', 'D', 'K', 'R', 'S', 'V' ],
             'T': [ 'T', 'U', 'B', 'D', 'H', 'K', 'W', 'Y' ],
             'U': [ 'T', 'U', 'B', 'D', 'H', 'K', 'W', 'Y' ],
             'K': [ 'G', 'T', 'U', 'B', 'D', 'H', 'K', 'R', 'S', 'V', 'W', 'Y' ],
             'M': [ 'A', 'C', 'B', 'D', 'H', 'M', 'R', 'S', 'V', 'W', 'Y' ],
             'R': [ 'A', 'G', 'B', 'D', 'H', 'K', 'M', 'R', 'S', 'V', 'W' ],
             'Y': [ 'C', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'S', 'V', 'W', 'Y' ],
             'S': [ 'C', 'G', 'B', 'D', 'H', 'K', 'M', 'R', 'S', 'V', 'Y' ],
             'W': [ 'A', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'R', 'V', 'W', 'Y' ],
             'B': [ 'C', 'G', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'R', 'S', 'V', 'W', 'Y' ],
             'V': [ 'A', 'C', 'G', 'B', 'D', 'H', 'K', 'M', 'R', 'S', 'V', 'W' ],
             'H': [ 'A', 'C', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'R', 'S', 'V', 'W', 'Y' ],
             'D': [ 'A', 'G', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'R', 'S', 'V', 'W', 'Y' ],
             '.': dot_letters,
             '-': [ '-' ]}
    # Add handling for N and X
    if n_handling == "all":
        # N/X match every code, and every code (except '.'/'-') matches N/X.
        codes[ 'N' ] = [ 'A', 'C', 'G', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'N', 'R', 'S', 'V', 'W', 'X', 'Y' ]
        codes[ 'X' ] = [ 'A', 'C', 'G', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'N', 'R', 'S', 'V', 'W', 'X', 'Y' ]
        for code in codes.keys():
            if code != '.' and code != '-':
                codes[code].append( 'N' )
                codes[code].append( 'X' )
    else:
        # Otherwise N/X behave like '.', matching only non-code letters.
        codes[ 'N' ] = dot_letters
        codes[ 'X' ] = dot_letters
        codes[ '.' ].extend( [ 'N', 'X' ] )
    # Expand conditions to allow for DNA codes
    try:
        match_replace = {}
        # Finds simple comparisons such as "c1 == 'A'" or "c2 != c3".
        pat = re.compile( 'c\d+\s*[!=]=\s*[\w\d"\'+-.]+' )
        matches = pat.findall( cond_text )
        for match in matches:
            # Chromosome/scaffold names and strand ('+') comparisons are kept
            # as literal (in)equality, just parenthesized.
            if match.find( 'chr' ) >= 0 or match.find( 'scaffold' ) >= 0 or match.find( '+' ) >= 0:
                if match.find( '==' ) >= 0:
                    match_parts = match.split( '==' )
                elif match.find( '!=' ) >= 0:
                    match_parts = match.split( '!=' )
                else:
                    raise Exception, "The operators '==' and '!=' were not found."
                left = match_parts[0].strip()
                right = match_parts[1].strip()
                new_match = "(%s)" % ( match )
            elif match.find( '==' ) > 0:
                # Base comparison: rewrite equality as mutual membership in
                # the ambiguity map built above.
                match_parts = match.split( '==' )
                left = match_parts[0].strip()
                right = match_parts[1].strip()
                new_match = '(%s in codes[%s] and %s in codes[%s])' % ( left, right, right, left )
            elif match.find( '!=' ) > 0 :
                match_parts = match.split( '!=' )
                left = match_parts[0].strip()
                right = match_parts[1].strip()
                new_match = '(%s not in codes[%s] or %s not in codes[%s])' % ( left, right, right, left )
            else:
                raise Exception, "The operators '==' and '!=' were not found."
            assert left.startswith( 'c' ), 'The column names should start with c (lowercase)'
            if right.find( "'" ) >= 0 or right.find( '"' ) >= 0:
                # Quoted literal RHS: must be a base/code letter, '+', '.',
                # '-', or a chromosome/scaffold name.
                test = right.replace( "'", '' ).replace( '"', '' )
                assert test in string.uppercase or test.find( '+' ) >= 0 or test.find( '.' ) >= 0 or test.find( '-' ) >= 0\
                    or test.startswith( 'chr' ) or test.startswith( 'scaffold' ), \
                    'The value to search for should be a valid base, code, plus sign, chromosome (like "chr1") or scaffold (like "scaffold5"). ' \
                    'Use the general filter tool to filter on anything else first'
            else:
                # Unquoted RHS must itself be a column reference.
                assert right.startswith( 'c' ), 'The column names should start with c (lowercase)'
            match_replace[match] = new_match
        if len( match_replace.keys() ) == 0:
            raise Exception, 'There do not appear to be any valid conditions'
        for match in match_replace.keys():
            cond_text = cond_text.replace( match, match_replace[match] )
    except Exception, e:
        stop_err( "At least one of your conditions is invalid. Make sure to use only '!=' or '==', valid column numbers, and valid base values.\n" + str(e) )
    # Attempt to determine if the condition includes executable stuff and, if so, exit
    secured = dir()  # local names defined so far; operands must not shadow them
    operands = get_operands( cond_text )
    for operand in operands:
        try:
            check = int( operand )  # purely numeric operands are always safe
        except:
            if operand in secured:
                stop_err( "Illegal value '%s' in condition '%s'" % ( operand, cond_text ) )
    if not check_expression( cond_text ):
        stop_err( "Illegal/invalid in condition '%s'" % ( cond_text ) )
    # Prepare the column variable names and wrappers for column data types
    cols, type_casts = [], []
    for col in range( 1, in_columns + 1 ):
        col_name = "c%d" % col
        cols.append( col_name )
        col_type = in_column_types[ col - 1 ]
        type_cast = "%s(%s)" % ( col_type, col_name )
        type_casts.append( type_cast )
    col_str = ', '.join( cols ) # 'c1, c2, c3, c4'
    type_cast_str = ', '.join( type_casts ) # 'str(c1), int(c2), int(c3), str(c4)'
    assign = "%s = line.split( '\\t' )" % col_str
    wrap = "%s = %s" % ( col_str, type_cast_str )
    skipped_lines = 0
    first_invalid_line = 0
    invalid_line = None
    lines_kept = 0
    total_lines = 0
    out = open( output, 'wt' )
    # Read and filter input file, skipping invalid lines
    # NOTE(review): the filter loop is built as source text and exec'd so the
    # user condition can reference c1..cN as plain variables; cond_text has
    # already been vetted by check_expression above, so only whitelisted
    # constructs can reach the exec.
    code = '''
for i, line in enumerate( file( input ) ):
    total_lines += 1
    line = line.rstrip( '\\r\\n' )
    if not line or line.startswith( '#' ):
        skipped_lines += 1
        if not invalid_line:
            first_invalid_line = i + 1
            invalid_line = line
        continue
    try:
        %s = line.split( '\\t' )
        %s = %s
        if %s:
            lines_kept += 1
            print >> out, line
    except Exception, e:
        skipped_lines += 1
        if not invalid_line:
            first_invalid_line = i + 1
            invalid_line = line
''' % ( col_str, col_str, type_cast_str, cond_text )
    valid_filter = True
    try:
        exec code
    except Exception, e:
        out.close()
        # A SyntaxError from exec means the (already sanitized) condition was
        # still not valid Python; anything else is reported verbatim.
        if str( e ).startswith( 'invalid syntax' ):
            valid_filter = False
            stop_err( 'Filter condition "%s" likely invalid. See tool tips, syntax and examples.' % orig_cond_text + ' '+str(e))
        else:
            stop_err( str( e ) )
    if valid_filter:
        out.close()
        valid_lines = total_lines - skipped_lines
        print 'Filtering with %s, ' % orig_cond_text
        if valid_lines > 0:
            print 'kept %4.2f%% of %d lines.' % ( 100.0*lines_kept/valid_lines, total_lines )
        else:
            print 'Possible invalid filter condition "%s" or non-existent column referenced. See tool tips, syntax and examples.' % orig_cond_text
        if skipped_lines > 0:
            print 'Skipped %d invalid lines starting at line #%d: "%s"' % ( skipped_lines, first_invalid_line, invalid_line )

if __name__ == "__main__" : __main__()