/tools/filters/uniq.py
Python | 117 lines | 88 code | 11 blank | 18 comment | 13 complexity | 0a6a294f09633743fabc8dfc7e67b611 MD5 | raw file
1# Filename: uniq.py
2# Author: Ian N. Schenck
3# Version: 19/12/2005
4#
# This script accepts an input file, an output file, a column
# delimiter, and a list of columns. The script then grabs unique
# lines based on the columns, and returns those records with a count
# of occurrences of each unique column, inserted before the columns.
9#
# This executes the command pipeline:
# cut -d $delim -f $fields | sed 's/ //g' | sort | uniq -c | sed 's/^ *//' | tr ' ' '\t'
12#
# -i Input file
# -o Output file
# -d Delimiter
# -c Column list (Comma Separated)
17
18import sys
19import re
20import string
21import commands
22
23# This function is exceedingly useful, perhaps package for reuse?
def getopts(argv):
    """Collect ``-flag value`` pairs from *argv* into a dict.

    Every argument that starts with '-' is taken as an option name, and the
    argument immediately after it as that option's value (even when the value
    itself starts with '-'). Arguments that are not option names are skipped.
    When an option is repeated, the last value wins.

    Args:
        argv: sequence of argument strings, without the program name.

    Returns:
        dict mapping each option name (leading '-' included) to its value.

    Raises:
        IndexError: if the final argument is an option name with no value
            after it.
    """
    argv = list(argv or ())
    opts = {}
    position = 0
    while position < len(argv):
        token = argv[position]
        # startswith() rather than token[0]: an empty-string argument has no
        # first character and should be skipped, not raise IndexError.
        if token.startswith('-'):
            # Deliberately unguarded: a flag with nothing after it raises
            # IndexError, which signals "missing option value" to the caller.
            opts[token] = argv[position + 1]
            position += 2
        else:
            position += 1
    return opts
33
def main():
    """Count distinct values of the selected columns with a shell pipeline.

    Reads the ``-i``, ``-o``, ``-d`` and ``-c`` options from ``sys.argv``,
    validates them, then runs ``cut | sed | sort | uniq -c | sed | tr``
    through the shell with the result redirected into the output file.
    Each output line is the occurrence count, a tab, and the selected
    column values with all spaces removed.

    Returns:
        0 after printing the usage text (an option was missing its value);
        -1, -2, -3 or -4 when the output file, input file, delimiter or
        column list is not given; -4, -5 or -6 when the column list, output
        filename or input filename contains illegal characters; otherwise
        the exit status of the shell pipeline (0 on success).
    """
    # Local import: the file-level imports only provide the Python-2-only
    # `commands` module, which subprocess replaces here.
    import subprocess

    args = sys.argv[1:]

    try:
        opts = getopts(args)
    except IndexError:
        # getopts raises IndexError when an option has no value after it.
        usage_lines = (
            "Usage:",
            " -i Input file",
            " -o Output file",
            " -c Column list (comma seperated)",
            " -d Delimiter:",
            " T Tab",
            " C Comma",
            " D Dash",
            " U Underscore",
            " P Pipe",
            " Dt Dot",
            " Sp Space",
        )
        for line in usage_lines:
            print(line)
        return 0

    outputfile = opts.get("-o")
    if outputfile is None:
        print("No output file specified.")
        return -1

    inputfile = opts.get("-i")
    if inputfile is None:
        print("No input file specified.")
        return -2

    delim = opts.get("-d")
    if delim is None:
        print("Field delimiter not specified.")
        return -3

    columns = opts.get("-c")
    if columns is None or columns == 'None':
        print("Columns not specified.")
        return -4

    # All inputs have been specified at this point, now validate.
    # These values are pasted into a shell command line below, so each
    # pattern must cover the WHOLE string: re.match() only anchors the start,
    # and '$' would still let a trailing newline through, hence '\Z'.
    fileRegEx = re.compile(r"^[A-Za-z0-9./\-_]+\Z")
    # Digit groups separated by single commas, optional trailing comma.
    # Written without nested quantifiers so bad input fails fast.
    columnRegEx = re.compile(r"^[0-9]+(,[0-9]+)*,?\Z")

    if not columnRegEx.match(columns):
        print("Illegal column specification.")
        return -4
    if not fileRegEx.match(outputfile):
        print("Illegal output filename.")
        return -5
    if not fileRegEx.match(inputfile):
        print("Illegal input filename.")
        return -6

    columns_for_display = "".join(
        ["c" + col + ", " for col in columns.split(",")]
    )

    # Map delimiter codes to the character handed to `cut -d`.  'T' (and any
    # unrecognised code) adds no -d option, leaving cut on its default, TAB.
    delimiter_chars = {
        'C': ",",
        'D': "-",
        'U': "_",
        'P': "|",
        'Dt': ".",
        'Sp': " ",
    }
    commandline = "cut "
    if delim in delimiter_chars:
        commandline += "-d \"" + delimiter_chars[delim] + "\" "

    # Set columns and input; every interpolated value has been restricted to
    # shell-safe characters by the validation above.
    commandline += "-f " + columns + " " + inputfile
    # Strip spaces from the cut output, count distinct lines, drop the
    # padding uniq -c puts before the count, and turn the remaining space
    # (between count and value) into a tab.  The sed scripts are quoted so
    # the shell cannot glob-expand the '*'.
    # NOTE(review): because every space is stripped, fields cut with the
    # 'Sp' delimiter end up concatenated in the output -- confirm intended.
    commandline += (" | sed 's/ //g' | sort | uniq -c"
                    " | sed 's/^ *//' | tr ' ' '\t' > " + outputfile)

    # Capture and discard anything the pipeline prints (stderr included), so
    # only the exit status is used and nothing leaks to the console.
    pipeline = subprocess.Popen(commandline, shell=True,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT)
    pipeline.communicate()
    errorcode = pipeline.returncode

    print("Count of unique values in " + columns_for_display)
    return errorcode
115
if __name__ == "__main__":
    # Propagate main()'s return code as the process exit status so that
    # callers can detect validation and pipeline failures; previously the
    # value was discarded and the script always exited 0.
    sys.exit(main())