grouping.py - This is a Python script that performs various…

/tools/stats/grouping.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 180 lines · 168 code · 6 blank · 6 comment · 9 complexity · 24a067ca46027d8a099bbc2ad0f36533 MD5 · raw file


#!/usr/bin/env python
# Guruprasad Ananda
# Refactored 2011 to use numpy instead of rpy, Kanwei Li
"""
This tool provides the SQL "group by" functionality.
"""
import sys, commands, tempfile, random
# Prefer an installed numpy; fall back to the copy bundled as a Galaxy egg.
# Only ImportError triggers the fallback, so unrelated failures (and
# SystemExit / KeyboardInterrupt) are no longer silently swallowed.
try:
    import numpy
except ImportError:
    from galaxy import eggs
    eggs.require( "numpy" )
    import numpy

from itertools import groupby

def stop_err(msg):
    """
    Write msg to standard error and terminate the process.

    The process exits with status 1 so that callers and shells inspecting
    the exit code see the failure; a bare sys.exit() would report success
    (status 0) even though an error message was emitted.
    """
    sys.stderr.write(msg)
    sys.exit(1)

def mode(data):
    """
    Return the most frequent value(s) of data as a comma-separated string.

    Every distinct value whose number of occurrences equals the highest
    count is included, converted with str(). Values must be hashable, and
    data must contain at least one item (max() raises ValueError otherwise).
    """
    tally = {}
    for value in data:
        if value in tally:
            tally[value] += 1
        else:
            tally[value] = 1
    highest = max(tally.values())
    winners = [str(value) for value in tally if tally[value] == highest]
    return ','.join(winners)
    
def main():
    """
    Group a tab-delimited file by one column and summarise other columns.

    Arguments are read from sys.argv:
        1   path of the output file
        2   path of the input file
        3   1-based number of the column to group by
        4   integer flag; 1 means grouping ignores case
        5   "None", or a comma-separated list of ASCII codes; input lines
            whose first character is one of them are dropped (the filtered
            copy is written to "input_cleaned.tsv" in the working directory)
        6+  one "<op> <column> <round>" triple per requested operation,
            e.g. "mean 2 no"

    The input is sorted on the group column with the external `sort`
    command, consecutive lines sharing a key are grouped, and one output
    line per group is written: the key followed by one tab-separated
    result per operation. A one-line summary is printed to stdout.

    Errors are reported on stderr and terminate the process via stop_err()
    or sys.exit(1).
    """
    # Local import: subprocess replaces the shell-string based `commands`
    # call so file paths are passed to `sort` as discrete arguments.
    import subprocess

    inputfile = sys.argv[2]
    ignorecase = int(sys.argv[4])
    ops = []
    cols = []
    round_val = []

    if sys.argv[5] != "None":
        # Drop every line whose first character is one of the given codes.
        newinputfile = "input_cleaned.tsv"
        asciitodelete = [chr(int(code)) for code in sys.argv[5].split(',')]
        with open(inputfile, 'r') as oldfile:
            with open(newinputfile, 'w') as newfile:
                for line in oldfile:
                    if line[0] not in asciitodelete:
                        newfile.write(line)
        inputfile = newinputfile

    for var in sys.argv[6:]:
        op, col, do_round = var.split()
        ops.append(op)
        cols.append(col)
        round_val.append(do_round)
    # At this point, ops, cols and round_val will look something like this:
    #   ops:       ['mean', 'min', 'c']
    #   cols:      ['1', '3', '4']
    #   round_val: ['no', 'yes', 'no']

    try:
        group_col = int(sys.argv[3]) - 1
    except ValueError:
        stop_err("Group column not specified.")

    # cXX column numbers from galaxy are 1-based; convert once up front.
    col_indexes = [int(col) - 1 for col in cols]

    def group_key(line):
        # NOTE: strip() also removes leading/trailing tabs, so a line whose
        # first or last field is empty has its columns shifted.
        try:
            item = line.strip().split("\t")[group_col]
        except IndexError:
            stop_err("The following line didn't have %s columns: %s" % (group_col+1, line))
        if ignorecase == 1:
            return item.lower()
        return item

    tmpfile = tempfile.NamedTemporaryFile()
    try:
        # The -k option of the POSIX sort command takes 1-based column
        # positions (-k POS1[,POS2]), hence group_col+1. Without POS2 newer
        # versions of sort use the rest of the line as the key, so POS2 is
        # set equal to POS1.
        sort_key = group_col + 1
        command = ["sort", "-t", "\t"]
        if ignorecase == 1:
            command.append("-f")
        command.extend(["-k%s,%s" % (sort_key, sort_key), "-o", tmpfile.name, "--", inputfile])

        try:
            proc = subprocess.Popen(command, stdout=subprocess.PIPE,
                                    stderr=subprocess.STDOUT,
                                    universal_newlines=True)
            output = proc.communicate()[0]
        except OSError as exc:
            stop_err('Initialization error -> %s' % str(exc))

        if proc.returncode != 0:
            stop_err("Sorting input dataset resulted in error: %s: %s" % (proc.returncode, output))

        with open(sys.argv[1], "w") as fout:
            # Read the sorted data back by path, in text mode.
            with open(tmpfile.name, "r") as sorted_file:
                for key, line_list in groupby(sorted_file, key=group_key):
                    op_vals = [[] for op in ops]
                    out_str = key

                    # Collect, per operation, the values of its column.
                    for line in line_list:
                        fields = line.strip().split("\t")
                        for i, col in enumerate(col_indexes):
                            try:
                                op_vals[i].append(fields[col].strip())
                            except IndexError:
                                sys.stderr.write('Could not access the value for column %s on line: "%s". Make sure file is tab-delimited.\n' % (col+1, line))
                                sys.exit(1)

                    # Generate string for each op for this group
                    for i, op in enumerate(ops):
                        data = op_vals[i]
                        if op == "mode":
                            rval = mode(data)
                        elif op == "length":
                            rval = len(data)
                        elif op == "random":
                            rval = random.choice(data)
                        elif op in ('cat', 'cat_uniq'):
                            if op == 'cat_uniq':
                                data = numpy.unique(data)
                            rval = ','.join(data)
                        elif op == "unique":
                            rval = len(numpy.unique(data))
                        else:
                            # Any other op is looked up as a numpy function
                            # and applied to the values converted to float.
                            try:
                                numbers = [float(value) for value in data]
                            except ValueError:
                                sys.stderr.write("Operation %s expected number values but got %s instead.\n" % (op, data))
                                sys.exit(1)
                            rval = getattr(numpy, op)(numbers)
                            if round_val[i] == 'yes':
                                rval = round(rval)
                            else:
                                rval = '%g' % rval

                        out_str += "\t%s" % rval

                    fout.write(out_str + "\n")
    finally:
        # Closing the NamedTemporaryFile also removes it, on every exit path.
        tmpfile.close()

    # Generate a useful info message.
    display_names = {
        'cat': 'concat',
        'cat_uniq': 'concat_distinct',
        'length': 'count',
        'unique': 'count_distinct',
        'random': 'randomly_pick',
    }
    msg = "--Group by c%d: " % (group_col+1)
    for op, col in zip(ops, cols):
        msg += display_names.get(op, op) + "[c" + col + "] "

    print(msg)

# Run the tool only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

Summary ✨

This is a Python script that performs various statistical operations on a dataset and groups the results by a specified column. It takes in several arguments, including the output file, the input file, the group column, and the operations to perform. The script sorts the input on the group column, uses the groupby function from the itertools module to group the data based on that column, and then applies the specified operations to each group. The output is written to the file path given as the first command-line argument.

Tech Fingerprint

Alerts (9)

'except:' Avoid catching all exceptions; specify exception types to catch only expected errors
10 69
'def' Ensure functions have docstrings for documentation
17 21 32 99
'open(' Use 'with open()' to ensure Files are properly closed
41 44 97