PageRenderTime 27ms CodeModel.GetById 8ms app.highlight 15ms RepoModel.GetById 1ms app.codeStats 0ms

/tools/stats/grouping.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 180 lines | 176 code | 0 blank | 4 comment | 4 complexity | 24a067ca46027d8a099bbc2ad0f36533 MD5 | raw file
  1#!/usr/bin/env python
  2# Guruprasad Ananda
  3# Refactored 2011 to use numpy instead of rpy, Kanwei Li
  4"""
  5This tool provides the SQL "group by" functionality.
  6"""
  7import sys, commands, tempfile, random
  8try:
  9    import numpy
 10except:
 11    from galaxy import eggs
 12    eggs.require( "numpy" )
 13    import numpy
 14
 15from itertools import groupby
 16
 17def stop_err(msg):
 18    sys.stderr.write(msg)
 19    sys.exit()
 20
 21def mode(data):
 22    counts = {}
 23    for x in data:
 24        counts[x] = counts.get(x,0) + 1
 25    maxcount = max(counts.values())
 26    modelist = []
 27    for x in counts:
 28        if counts[x] == maxcount:
 29            modelist.append( str(x) )
 30    return ','.join(modelist)
 31    
 32def main():
 33    inputfile = sys.argv[2]
 34    ignorecase = int(sys.argv[4])
 35    ops = []
 36    cols = []
 37    round_val = []
 38    data_ary = []
 39    
 40    if sys.argv[5] != "None":
 41        oldfile = open(inputfile,'r')
 42        oldfilelines = oldfile.readlines()
 43        newinputfile = "input_cleaned.tsv"
 44        newfile = open(newinputfile,'w')
 45        asciitodelete = sys.argv[5].split(',')
 46        for i in range(len(asciitodelete)):
 47            asciitodelete[i] = chr(int(asciitodelete[i]))
 48        for line in oldfilelines:
 49            if line[0] not in asciitodelete:
 50                newfile.write(line)
 51        oldfile.close()
 52        newfile.close()
 53        inputfile = newinputfile
 54
 55    for var in sys.argv[6:]:
 56        op, col, do_round = var.split()
 57        ops.append(op)
 58        cols.append(col)
 59        round_val.append(do_round)
 60    """
 61    At this point, ops, cols and rounds will look something like this:
 62    ops:  ['mean', 'min', 'c']
 63    cols: ['1', '3', '4']
 64    round_val: ['no', 'yes' 'no']
 65    """
 66
 67    try:
 68        group_col = int( sys.argv[3] )-1
 69    except:
 70        stop_err( "Group column not specified." )
 71    
 72    str_ops = ['c', 'length', 'unique', 'random', 'cuniq', 'Mode'] #ops that can handle string/non-numeric inputs
 73    
 74    tmpfile = tempfile.NamedTemporaryFile()
 75    
 76    try:
 77        """
 78        The -k option for the Posix sort command is as follows:
 79        -k, --key=POS1[,POS2]
 80        start a key at POS1, end it at POS2 (origin 1)
 81        In other words, column positions start at 1 rather than 0, so 
 82        we need to add 1 to group_col.
 83        if POS2 is not specified, the newer versions of sort will consider the entire line for sorting. To prevent this, we set POS2=POS1.
 84        """
 85        case = ''
 86        if ignorecase == 1:
 87            case = '-f' 
 88        command_line = "sort -t '	' %s -k%s,%s -o %s %s" % (case, group_col+1, group_col+1, tmpfile.name, inputfile)
 89    except Exception, exc:
 90        stop_err( 'Initialization error -> %s' %str(exc) )
 91    
 92    error_code, stdout = commands.getstatusoutput(command_line)
 93    
 94    if error_code != 0:
 95        stop_err( "Sorting input dataset resulted in error: %s: %s" %( error_code, stdout ))
 96        
 97    fout = open(sys.argv[1], "w")
 98    
 99    def is_new_item(line):
100        try:
101            item = line.strip().split("\t")[group_col]
102        except IndexError:
103            stop_err( "The following line didn't have %s columns: %s" % (group_col+1, line) )
104            
105        if ignorecase == 1:
106            return item.lower()
107        return item
108        
109    for key, line_list in groupby(tmpfile, key=is_new_item):
110        op_vals = [ [] for op in ops ]
111        out_str = key
112        multiple_modes = False
113        mode_index = None
114        
115        for line in line_list:
116            fields = line.strip().split("\t")
117            for i, col in enumerate(cols):
118                col = int(col)-1 # cXX from galaxy is 1-based
119                try:
120                    val = fields[col].strip()
121                    op_vals[i].append(val)
122                except IndexError:
123                    sys.stderr.write( 'Could not access the value for column %s on line: "%s". Make sure file is tab-delimited.\n' % (col+1, line) )
124                    sys.exit( 1 )
125                
126        # Generate string for each op for this group
127        for i, op in enumerate( ops ):
128            data = op_vals[i]
129            rval = ""
130            if op == "mode":
131                rval = mode( data )
132            elif op == "length":
133                rval = len( data )
134            elif op == "random":
135                rval = random.choice(data)
136            elif op in ['cat', 'cat_uniq']:
137                if op == 'cat_uniq':
138                    data = numpy.unique(data)
139                rval = ','.join(data)
140            elif op == "unique":
141                rval = len( numpy.unique(data) )
142            else:
143                # some kind of numpy fn
144                try:
145                    data = map(float, data)
146                except ValueError:
147                    sys.stderr.write( "Operation %s expected number values but got %s instead.\n" % (op, data) )
148                    sys.exit( 1 )
149                rval = getattr(numpy, op)( data )
150                if round_val[i] == 'yes':
151                    rval = round(rval)
152                else:
153                    rval = '%g' % rval
154                        
155            out_str += "\t%s" % rval
156        
157        fout.write(out_str + "\n")
158    
159    # Generate a useful info message.
160    msg = "--Group by c%d: " %(group_col+1)
161    for i, op in enumerate(ops):
162        if op == 'cat':
163            op = 'concat'
164        elif op == 'cat_uniq':
165            op = 'concat_distinct'
166        elif op == 'length':
167            op = 'count'
168        elif op == 'unique':
169            op = 'count_distinct'
170        elif op == 'random':
171            op = 'randomly_pick'
172        
173        msg += op + "[c" + cols[i] + "] "
174    
175    print msg
176    fout.close()
177    tmpfile.close()
178
179if __name__ == "__main__":
180    main()