PageRenderTime 50ms CodeModel.GetById 38ms app.highlight 9ms RepoModel.GetById 1ms app.codeStats 0ms

/tools/filters/grep.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 135 lines | 99 code | 14 blank | 22 comment | 15 complexity | 5f0242f8aeb42e48cc1c7eee961ac3c0 MD5 | raw file
  1# Filename: grep.py
  2# Author: Ian N. Schenck
  3# Version: 8/23/2005
  4#
  5# This script accepts regular expressions, as well as an "invert"
  6# option, and applies the regular expression using grep.  This wrapper
  7# provides security and pipeline.
  8#
  9# Grep is launched based on these inputs:
 10# -i		Input file
 11# -o		Output file
 12# -pattern	RegEx pattern
 13# -v	        true or false (output NON-matching lines)
 14
 15import sys
 16import os
 17import re
 18import string
 19import commands
 20from subprocess import Popen, PIPE
 21from tempfile import NamedTemporaryFile
 22
 23# This function is exceedingly useful, perhaps package for reuse?
 24def getopts(argv):
 25    opts = {}
 26    while argv:
 27	if argv[0][0] == '-':
 28	    opts[argv[0]] = argv[1]
 29	    argv = argv[2:]
 30	else:
 31	    argv = argv[1:]
 32    return opts
 33
 34def main():
 35    args = sys.argv[1:]
 36
 37    try:
 38        opts = getopts(args)
 39    except IndexError:
 40        print "Usage:"
 41        print " -i		Input file"
 42        print " -o		Output file"
 43        print " -pattern	RegEx pattern"
 44        print " -v		true or false (Invert match)"
 45        return 0
 46
 47    outputfile = opts.get("-o")
 48    if outputfile == None:
 49        print "No output file specified."
 50        return -1
 51    
 52    inputfile = opts.get("-i")
 53    if inputfile == None:
 54        print "No input file specified."
 55        return -2
 56
 57    invert = opts.get("-v")
 58    if invert == None:
 59        print "Match style (Invert or normal) not specified."
 60        return -3
 61
 62    pattern = opts.get("-pattern")
 63    if pattern == None:
 64        print "RegEx pattern not specified."
 65        return -4
 66
 67    # All inputs have been specified at this point, now validate.
 68
 69    # replace if input has been escaped, remove sq
 70    # characters that are allowed but need to be escaped
 71    mapped_chars =  {   '>' :'__gt__',
 72                        '<' :'__lt__',
 73                        '\'':'__sq__',
 74                        '"' :'__dq__',
 75                        '[' :'__ob__',
 76                        ']' :'__cb__',
 77                        '{' :'__oc__',
 78                        '}' :'__cc__'
 79                    }
 80    
 81    # with new sanitizing we only need to replace for single quote,
 82    # but this needs to remain for backwards compatibility
 83    for key, value in mapped_chars.items():
 84        pattern = pattern.replace(value, key)
 85
 86    # match filename and invert flag
 87    fileRegEx = re.compile("^[A-Za-z0-9./\-_]+$")
 88    invertRegEx = re.compile("(true)|(false)")
 89
 90    # verify that filename and inversion flag are in the correct format
 91    if not fileRegEx.match(outputfile):
 92        print "Illegal output filename."
 93        return -5
 94    if not fileRegEx.match(inputfile):
 95        print "Illegal input filename."
 96        return -6
 97    if not invertRegEx.match(invert):
 98        print "Illegal invert option."
 99        return -7
100
101    # invert grep search?
102    if invert == "true":
103        invertflag = "-v"
104        print "Not matching pattern: %s" % pattern
105    else:
106        invertflag = ""
107        print "Matching pattern: %s" % pattern
108
109    # set version flag
110    versionflag  = "-P"
111    
112    # MacOS 10.8.2 does not support -P option for perl-regex anymore
113    versionmatch = Popen("grep -V | grep 'BSD'", shell=True, stdout=PIPE).communicate()[0];
114    if versionmatch:
115        versionflag = "-E"
116
117    # create temp file holding pattern
118    # by using a file to hold the pattern, we don't have worry about sanitizing grep commandline and can include single quotes in pattern
119    pattern_file_name = NamedTemporaryFile().name
120    open( pattern_file_name, 'w' ).write( pattern )
121    
122    # generate grep command
123    commandline = "grep %s %s -f %s %s > %s" % ( versionflag, invertflag, pattern_file_name, inputfile, outputfile )
124    
125    # run grep
126    errorcode, stdout = commands.getstatusoutput(commandline)
127    
128    # remove temp pattern file
129    os.unlink( pattern_file_name )
130    
131    # return error code
132    return errorcode
133
# Script entry point: run the wrapper only when executed directly.
# The value returned by main() is not passed to sys.exit(), so it does
# not become the process exit status.
if __name__ == "__main__":
    main()