/tools/new_operations/subtract_query.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 113 lines · 90 code · 9 blank · 14 comment · 19 complexity · f915a61acdd2764902d769518e7ba8ec MD5 · raw file

  1. #!/usr/bin/env python
  2. # Greg Von Kuster
  3. """
  4. Subtract an entire query from another query
  5. usage: %prog in_file_1 in_file_2 begin_col end_col output
  6. --ignore-empty-end-cols: ignore empty end columns when subtracting
  7. """
  8. import sys, re
  9. from galaxy import eggs
  10. import pkg_resources; pkg_resources.require( "bx-python" )
  11. from bx.cookbook import doc_optparse
  12. # Older py compatibility
  13. try:
  14. set()
  15. except:
  16. from sets import Set as set
  17. assert sys.version_info[:2] >= ( 2, 4 )
  18. def get_lines(fname, begin_col='', end_col='', ignore_empty_end_cols=False):
  19. lines = set([])
  20. i = 0
  21. for i, line in enumerate(file(fname)):
  22. line = line.rstrip('\r\n')
  23. if line and not line.startswith('#'):
  24. if begin_col and end_col:
  25. """Both begin_col and end_col must be integers at this point."""
  26. try:
  27. line = line.split('\t')
  28. line = '\t'.join([line[j] for j in range(begin_col-1, end_col)])
  29. if ignore_empty_end_cols:
  30. # removing empty fields, we do not compare empty fields at the end of a line.
  31. line = line.rstrip()
  32. lines.add( line )
  33. except: pass
  34. else:
  35. if ignore_empty_end_cols:
  36. # removing empty fields, we do not compare empty fields at the end of a line.
  37. line = line.rstrip()
  38. lines.add( line )
  39. if i: return (i+1, lines)
  40. else: return (i, lines)
  41. def main():
  42. # Parsing Command Line here
  43. options, args = doc_optparse.parse( __doc__ )
  44. try:
  45. inp1_file, inp2_file, begin_col, end_col, out_file = args
  46. except:
  47. doc_optparse.exception()
  48. begin_col = begin_col.strip()
  49. end_col = end_col.strip()
  50. if begin_col != 'None' or end_col != 'None':
  51. """
  52. The user selected columns for restriction. We'll allow default
  53. values for both begin_col and end_col as long as the user selected
  54. at least one of them for restriction.
  55. """
  56. if begin_col == 'None':
  57. begin_col = end_col
  58. elif end_col == 'None':
  59. end_col = begin_col
  60. begin_col = int(begin_col)
  61. end_col = int(end_col)
  62. """Make sure that begin_col <= end_col (switch if not)"""
  63. if begin_col > end_col:
  64. tmp_col = end_col
  65. end_col = begin_col
  66. begin_col = tmp_col
  67. else:
  68. begin_col = end_col = ''
  69. try:
  70. fo = open(out_file,'w')
  71. except:
  72. print >> sys.stderr, "Unable to open output file"
  73. sys.exit()
  74. """
  75. len1 is the number of lines in inp1_file
  76. lines1 is the set of unique lines in inp1_file
  77. diff1 is the number of duplicate lines removed from inp1_file
  78. """
  79. len1, lines1 = get_lines(inp1_file, begin_col, end_col, options.ignore_empty_end_cols)
  80. diff1 = len1 - len(lines1)
  81. len2, lines2 = get_lines(inp2_file, begin_col, end_col, options.ignore_empty_end_cols)
  82. lines1.difference_update(lines2)
  83. """lines1 is now the set of unique lines in inp1_file - the set of unique lines in inp2_file"""
  84. for line in lines1:
  85. print >> fo, line
  86. fo.close()
  87. info_msg = 'Subtracted %d lines. ' %((len1 - diff1) - len(lines1))
  88. if begin_col and end_col:
  89. info_msg += 'Restricted to columns c' + str(begin_col) + ' thru c' + str(end_col) + '. '
  90. if diff1 > 0:
  91. info_msg += 'Eliminated %d duplicate/blank/comment/invalid lines from first query.' %diff1
  92. print info_msg
  93. if __name__ == "__main__":
  94. main()