/tools/filters/gff/gff_filter_by_feature_count.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 88 lines · 61 code · 10 blank · 17 comment · 14 complexity · 200da793eaa81023b16e6f824fc6709a MD5 · raw file

  1. #!/usr/bin/env python
  2. """
  3. Filter a gff file using a criterion based on feature counts for a transcript.
  4. Usage:
  5. %prog input_name output_name feature_name condition
  6. """
  7. import sys
  8. from galaxy import eggs
  9. from galaxy.datatypes.util.gff_util import GFFReaderWrapper
  10. from bx.intervals.io import GenomicInterval
  11. # Valid operators, ordered so that complex operators (e.g. '>=') are
  12. # recognized before simple operators (e.g. '>')
  13. ops = [
  14. '>=',
  15. '<=',
  16. '<',
  17. '>',
  18. '==',
  19. '!='
  20. ]
  21. # Escape sequences for valid operators.
  22. mapped_ops = {
  23. '__ge__': ops[0],
  24. '__le__': ops[1],
  25. '__lt__': ops[2],
  26. '__gt__': ops[3],
  27. '__eq__': ops[4],
  28. '__ne__': ops[5],
  29. }
  30. def __main__():
  31. # Get args.
  32. input_name = sys.argv[1]
  33. output_name = sys.argv[2]
  34. feature_name = sys.argv[3]
  35. condition = sys.argv[4]
  36. # Unescape operations in condition str.
  37. for key, value in mapped_ops.items():
  38. condition = condition.replace( key, value )
  39. # Error checking: condition should be of the form <operator><number>
  40. for op in ops:
  41. if op in condition:
  42. empty, number_str = condition.split( op )
  43. try:
  44. number = float( number_str )
  45. except:
  46. number = None
  47. if empty != "" or not number:
  48. print >> sys.stderr, "Invalid condition: %s, cannot filter." % condition
  49. return
  50. break
  51. # Do filtering.
  52. kept_features = 0
  53. skipped_lines = 0
  54. first_skipped_line = 0
  55. out = open( output_name, 'w' )
  56. for i, feature in enumerate( GFFReaderWrapper( open( input_name ) ) ):
  57. if not isinstance( feature, GenomicInterval ):
  58. continue
  59. count = 0
  60. for interval in feature.intervals:
  61. if interval.feature == feature_name:
  62. count += 1
  63. if eval( '%s %s' % ( count, condition ) ):
  64. # Keep feature.
  65. for interval in feature.intervals:
  66. out.write( "\t".join(interval.fields) + '\n' )
  67. kept_features += 1
  68. # Needed because i is 0-based but want to display stats using 1-based.
  69. i += 1
  70. # Clean up.
  71. out.close()
  72. info_msg = "%i of %i features kept (%.2f%%) using condition %s. " % \
  73. ( kept_features, i, float(kept_features)/i * 100.0, feature_name + condition )
  74. if skipped_lines > 0:
  75. info_msg += "Skipped %d blank/comment/invalid lines starting with line #%d." %( skipped_lines, first_skipped_line )
  76. print info_msg
  77. if __name__ == "__main__": __main__()