PageRenderTime 47ms CodeModel.GetById 19ms app.highlight 24ms RepoModel.GetById 0ms app.codeStats 0ms

/tools/filters/uniq.py

https://bitbucket.org/ialbert/galaxy-genetrack
Python | 117 lines | 88 code | 11 blank | 18 comment | 13 complexity | 0a6a294f09633743fabc8dfc7e67b611 MD5 | raw file
# Filename: uniq.py
# Author: Ian N. Schenck
# Version: 19/12/2005
#
# This script accepts an input file, an output file, a column
# delimiter, and a list of columns.  The script then grabs unique
# lines based on the columns, and returns those records with a count
# of occurrences of each unique column, inserted before the columns.
#
# This executes the command pipeline:
#       cut -f $fields | sort  | uniq -c
#
# -i            Input file
# -o            Output file
# -d            Delimiter
# -c            Column list (Comma Separated)
 17
 18import sys
 19import re
 20import string
 21import commands
 22
 23# This function is exceedingly useful, perhaps package for reuse?
 24def getopts(argv):
 25    opts = {}
 26    while argv:
 27        if argv[0][0] == '-':
 28            opts[argv[0]] = argv[1]
 29            argv = argv[2:]
 30        else:
 31            argv = argv[1:]
 32    return opts
 33
 34def main():
 35    args = sys.argv[1:]
 36
 37    try:
 38        opts = getopts(args)
 39    except IndexError:
 40        print "Usage:"
 41        print " -i        Input file"
 42        print " -o        Output file"
 43        print " -c        Column list (comma seperated)"
 44        print " -d        Delimiter:"
 45        print "                     T   Tab"
 46        print "                     C   Comma"
 47        print "                     D   Dash"
 48        print "                     U   Underscore"
 49        print "                     P   Pipe"
 50        print "                     Dt  Dot"
 51        print "                     Sp  Space"
 52        return 0
 53
 54    outputfile = opts.get("-o")
 55    if outputfile == None:
 56        print "No output file specified."
 57        return -1
 58    
 59    inputfile = opts.get("-i")
 60    if inputfile == None:
 61        print "No input file specified."
 62        return -2
 63
 64    delim = opts.get("-d")
 65    if delim == None:
 66        print "Field delimiter not specified."
 67        return -3
 68
 69    columns = opts.get("-c")
 70    if columns == None or columns == 'None':
 71        print "Columns not specified."
 72        return -4
 73
 74    # All inputs have been specified at this point, now validate.
 75    fileRegEx = re.compile("^[A-Za-z0-9./\-_]+$")
 76    columnRegEx = re.compile("([0-9]{1,},?)+")
 77
 78    if not columnRegEx.match(columns):
 79        print "Illegal column specification."
 80        return -4
 81    if not fileRegEx.match(outputfile):
 82        print "Illegal output filename."
 83        return -5
 84    if not fileRegEx.match(inputfile):
 85        print "Illegal input filename."
 86        return -6
 87
 88    column_list = re.split(",",columns)
 89    columns_for_display = ""
 90    for col in column_list:
 91        columns_for_display += "c"+col+", "
 92
 93    commandline = "cut "
 94    # Set delimiter
 95    if delim=='C':
 96        commandline += "-d \",\" "
 97    if delim=='D':
 98        commandline += "-d \"-\" "
 99    if delim=='U':
100        commandline += "-d \"_\" "
101    if delim=='P':
102        commandline += "-d \"|\" "
103    if delim=='Dt':
104        commandline += "-d \".\" "
105    if delim=='Sp':
106        commandline += "-d \" \" "
107
108    # set columns
109    commandline += "-f " + columns
110    commandline += " " + inputfile + " | sed s/\ //g | sort | uniq -c | sed s/^\ *// | tr \" \" \"\t\" > " + outputfile
111    errorcode, stdout = commands.getstatusoutput(commandline)
112    
113    print "Count of unique values in " + columns_for_display
114    return errorcode
115
if __name__ == "__main__":
    # main() signals failure through its return value (negative codes
    # for bad arguments, otherwise the status of the shell pipeline).
    # Hand that to the calling process instead of discarding it.
    status = main()
    if status:
        # Exit codes are truncated to 8 bits, so a status such as 256
        # would turn into 0; make sure any failure stays non-zero.
        sys.exit((status & 0xFF) or 1)