PageRenderTime 17ms CodeModel.GetById 6ms RepoModel.GetById 1ms app.codeStats 0ms

/tools/stats/grouping.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 180 lines | 176 code | 0 blank | 4 comment | 1 complexity | 24a067ca46027d8a099bbc2ad0f36533 MD5 | raw file
  1. #!/usr/bin/env python
  2. # Guruprasad Ananda
  3. # Refactored 2011 to use numpy instead of rpy, Kanwei Li
  4. """
  5. This tool provides the SQL "group by" functionality.
  6. """
  7. import sys, commands, tempfile, random
  8. try:
  9. import numpy
  10. except:
  11. from galaxy import eggs
  12. eggs.require( "numpy" )
  13. import numpy
  14. from itertools import groupby
  15. def stop_err(msg):
  16. sys.stderr.write(msg)
  17. sys.exit()
  18. def mode(data):
  19. counts = {}
  20. for x in data:
  21. counts[x] = counts.get(x,0) + 1
  22. maxcount = max(counts.values())
  23. modelist = []
  24. for x in counts:
  25. if counts[x] == maxcount:
  26. modelist.append( str(x) )
  27. return ','.join(modelist)
  28. def main():
  29. inputfile = sys.argv[2]
  30. ignorecase = int(sys.argv[4])
  31. ops = []
  32. cols = []
  33. round_val = []
  34. data_ary = []
  35. if sys.argv[5] != "None":
  36. oldfile = open(inputfile,'r')
  37. oldfilelines = oldfile.readlines()
  38. newinputfile = "input_cleaned.tsv"
  39. newfile = open(newinputfile,'w')
  40. asciitodelete = sys.argv[5].split(',')
  41. for i in range(len(asciitodelete)):
  42. asciitodelete[i] = chr(int(asciitodelete[i]))
  43. for line in oldfilelines:
  44. if line[0] not in asciitodelete:
  45. newfile.write(line)
  46. oldfile.close()
  47. newfile.close()
  48. inputfile = newinputfile
  49. for var in sys.argv[6:]:
  50. op, col, do_round = var.split()
  51. ops.append(op)
  52. cols.append(col)
  53. round_val.append(do_round)
  54. """
  55. At this point, ops, cols and rounds will look something like this:
  56. ops: ['mean', 'min', 'c']
  57. cols: ['1', '3', '4']
  58. round_val: ['no', 'yes' 'no']
  59. """
  60. try:
  61. group_col = int( sys.argv[3] )-1
  62. except:
  63. stop_err( "Group column not specified." )
  64. str_ops = ['c', 'length', 'unique', 'random', 'cuniq', 'Mode'] #ops that can handle string/non-numeric inputs
  65. tmpfile = tempfile.NamedTemporaryFile()
  66. try:
  67. """
  68. The -k option for the Posix sort command is as follows:
  69. -k, --key=POS1[,POS2]
  70. start a key at POS1, end it at POS2 (origin 1)
  71. In other words, column positions start at 1 rather than 0, so
  72. we need to add 1 to group_col.
  73. if POS2 is not specified, the newer versions of sort will consider the entire line for sorting. To prevent this, we set POS2=POS1.
  74. """
  75. case = ''
  76. if ignorecase == 1:
  77. case = '-f'
  78. command_line = "sort -t ' ' %s -k%s,%s -o %s %s" % (case, group_col+1, group_col+1, tmpfile.name, inputfile)
  79. except Exception, exc:
  80. stop_err( 'Initialization error -> %s' %str(exc) )
  81. error_code, stdout = commands.getstatusoutput(command_line)
  82. if error_code != 0:
  83. stop_err( "Sorting input dataset resulted in error: %s: %s" %( error_code, stdout ))
  84. fout = open(sys.argv[1], "w")
  85. def is_new_item(line):
  86. try:
  87. item = line.strip().split("\t")[group_col]
  88. except IndexError:
  89. stop_err( "The following line didn't have %s columns: %s" % (group_col+1, line) )
  90. if ignorecase == 1:
  91. return item.lower()
  92. return item
  93. for key, line_list in groupby(tmpfile, key=is_new_item):
  94. op_vals = [ [] for op in ops ]
  95. out_str = key
  96. multiple_modes = False
  97. mode_index = None
  98. for line in line_list:
  99. fields = line.strip().split("\t")
  100. for i, col in enumerate(cols):
  101. col = int(col)-1 # cXX from galaxy is 1-based
  102. try:
  103. val = fields[col].strip()
  104. op_vals[i].append(val)
  105. except IndexError:
  106. sys.stderr.write( 'Could not access the value for column %s on line: "%s". Make sure file is tab-delimited.\n' % (col+1, line) )
  107. sys.exit( 1 )
  108. # Generate string for each op for this group
  109. for i, op in enumerate( ops ):
  110. data = op_vals[i]
  111. rval = ""
  112. if op == "mode":
  113. rval = mode( data )
  114. elif op == "length":
  115. rval = len( data )
  116. elif op == "random":
  117. rval = random.choice(data)
  118. elif op in ['cat', 'cat_uniq']:
  119. if op == 'cat_uniq':
  120. data = numpy.unique(data)
  121. rval = ','.join(data)
  122. elif op == "unique":
  123. rval = len( numpy.unique(data) )
  124. else:
  125. # some kind of numpy fn
  126. try:
  127. data = map(float, data)
  128. except ValueError:
  129. sys.stderr.write( "Operation %s expected number values but got %s instead.\n" % (op, data) )
  130. sys.exit( 1 )
  131. rval = getattr(numpy, op)( data )
  132. if round_val[i] == 'yes':
  133. rval = round(rval)
  134. else:
  135. rval = '%g' % rval
  136. out_str += "\t%s" % rval
  137. fout.write(out_str + "\n")
  138. # Generate a useful info message.
  139. msg = "--Group by c%d: " %(group_col+1)
  140. for i, op in enumerate(ops):
  141. if op == 'cat':
  142. op = 'concat'
  143. elif op == 'cat_uniq':
  144. op = 'concat_distinct'
  145. elif op == 'length':
  146. op = 'count'
  147. elif op == 'unique':
  148. op = 'count_distinct'
  149. elif op == 'random':
  150. op = 'randomly_pick'
  151. msg += op + "[c" + cols[i] + "] "
  152. print msg
  153. fout.close()
  154. tmpfile.close()
  155. if __name__ == "__main__":
  156. main()