/tools/filters/gff/gff_filter_by_attribute.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 163 lines · 122 code · 19 blank · 22 comment · 19 complexity · 2984579bfcbaa3dae77aaf822bf04fca MD5 · raw file

  1. #!/usr/bin/env python
  2. # This tool takes a GFF file as input and keeps only the lines whose attributes satisfy a user-supplied filter condition.
  3. # The tool will skip over invalid lines within the file, informing the user about the number of lines skipped.
  4. # TODO: much of this code is copied from the Filter1 tool (filtering.py in tools/stats/). The commonalities should be
  5. # abstracted and leveraged in each filtering tool.
  6. from __future__ import division
  7. import sys
  8. from galaxy import eggs
  9. from galaxy.util.json import to_json_string, from_json_string
  10. # Older py compatibility
  11. try:
  12. set()
  13. except:
  14. from sets import Set as set
  15. assert sys.version_info[:2] >= ( 2, 4 )
  16. #
  17. # Helper functions.
  18. #
  19. def get_operands( filter_condition ):
  20. # Note that the order of all_operators is important
  21. items_to_strip = ['+', '-', '**', '*', '//', '/', '%', '<<', '>>', '&', '|', '^', '~', '<=', '<', '>=', '>', '==', '!=', '<>', ' and ', ' or ', ' not ', ' is ', ' is not ', ' in ', ' not in ']
  22. for item in items_to_strip:
  23. if filter_condition.find( item ) >= 0:
  24. filter_condition = filter_condition.replace( item, ' ' )
  25. operands = set( filter_condition.split( ' ' ) )
  26. return operands
  27. def stop_err( msg ):
  28. sys.stderr.write( msg )
  29. sys.exit()
  30. def check_for_executable( text, description='' ):
  31. # Attempt to determine if the condition includes executable stuff and, if so, exit.
  32. secured = dir()
  33. operands = get_operands( text )
  34. for operand in operands:
  35. try:
  36. check = int( operand )
  37. except:
  38. if operand in secured:
  39. stop_err( "Illegal value '%s' in %s '%s'" % ( operand, description, text ) )
  40. #
  41. # Process inputs.
  42. #
  43. in_fname = sys.argv[1]
  44. out_fname = sys.argv[2]
  45. cond_text = sys.argv[3]
  46. attribute_types = from_json_string( sys.argv[4] )
  47. # Convert types from str to type objects.
  48. for name, a_type in attribute_types.items():
  49. check_for_executable(a_type)
  50. attribute_types[ name ] = eval( a_type )
  51. # Unescape if input has been escaped
  52. mapped_str = {
  53. '__lt__': '<',
  54. '__le__': '<=',
  55. '__eq__': '==',
  56. '__ne__': '!=',
  57. '__gt__': '>',
  58. '__ge__': '>=',
  59. '__sq__': '\'',
  60. '__dq__': '"',
  61. }
  62. for key, value in mapped_str.items():
  63. cond_text = cond_text.replace( key, value )
  64. # Attempt to determine if the condition includes executable stuff and, if so, exit.
  65. check_for_executable( cond_text, 'condition')
  66. # Prepare the column variable names and wrappers for column data types. Only
  67. # prepare columns up to largest column in condition.
  68. attrs, type_casts = [], []
  69. for name, attr_type in attribute_types.items():
  70. attrs.append( name )
  71. type_cast = "get_value('%(name)s', attribute_types['%(name)s'], attribute_values)" % ( {'name': name} )
  72. type_casts.append( type_cast )
  73. attr_str = ', '.join( attrs ) # 'c1, c2, c3, c4'
  74. type_cast_str = ', '.join( type_casts ) # 'str(c1), int(c2), int(c3), str(c4)'
  75. wrap = "%s = %s" % ( attr_str, type_cast_str )
  76. # Stats
  77. skipped_lines = 0
  78. first_invalid_line = 0
  79. invalid_line = None
  80. lines_kept = 0
  81. total_lines = 0
  82. out = open( out_fname, 'wt' )
  83. # Helper function to safely get and type cast a value in a dict.
  84. def get_value(name, a_type, values_dict):
  85. if name in values_dict:
  86. return (a_type)(values_dict[ name ])
  87. else:
  88. return None
  89. # Read and filter input file, skipping invalid lines
  90. code = '''
  91. for i, line in enumerate( file( in_fname ) ):
  92. total_lines += 1
  93. line = line.rstrip( '\\r\\n' )
  94. if not line or line.startswith( '#' ):
  95. skipped_lines += 1
  96. if not invalid_line:
  97. first_invalid_line = i + 1
  98. invalid_line = line
  99. continue
  100. try:
  101. # Place attribute values into variables with attribute
  102. # name; type casting is done as well.
  103. elems = line.split( '\t' )
  104. attribute_values = {}
  105. for name_value_pair in elems[8].split(";"):
  106. pair = name_value_pair.strip().split(" ")
  107. if pair == '':
  108. continue
  109. name = pair[0].strip()
  110. if name == '':
  111. continue
  112. # Need to strip double quote from value and typecast.
  113. attribute_values[name] = pair[1].strip(" \\"")
  114. %s
  115. if %s:
  116. lines_kept += 1
  117. print >> out, line
  118. except Exception, e:
  119. print e
  120. skipped_lines += 1
  121. if not invalid_line:
  122. first_invalid_line = i + 1
  123. invalid_line = line
  124. ''' % ( wrap, cond_text )
  125. valid_filter = True
  126. try:
  127. exec code
  128. except Exception, e:
  129. out.close()
  130. if str( e ).startswith( 'invalid syntax' ):
  131. valid_filter = False
  132. stop_err( 'Filter condition "%s" likely invalid. See tool tips, syntax and examples.' % cond_text )
  133. else:
  134. stop_err( str( e ) )
  135. if valid_filter:
  136. out.close()
  137. valid_lines = total_lines - skipped_lines
  138. print 'Filtering with %s, ' % ( cond_text )
  139. if valid_lines > 0:
  140. print 'kept %4.2f%% of %d lines.' % ( 100.0*lines_kept/valid_lines, total_lines )
  141. else:
  142. print 'Possible invalid filter condition "%s" or non-existent column referenced. See tool tips, syntax and examples.' % cond_text
  143. if skipped_lines > 0:
  144. print 'Skipped %d invalid lines starting at line #%d: "%s"' % ( skipped_lines, first_invalid_line, invalid_line )