/tools/filters/grep.py
https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 135 lines · 83 code · 22 blank · 30 comment · 17 complexity · 5f0242f8aeb42e48cc1c7eee961ac3c0 MD5 · raw file
- # Filename: grep.py
- # Author: Ian N. Schenck
- # Version: 8/23/2005
- #
- # This script accepts a regular expression and an "invert" option,
- # and applies the expression to the input file using grep. The wrapper
- # validates its arguments before running grep (security) and lets grep
- # be used as a step in a pipeline.
- #
- # Grep is launched based on these inputs:
- # -i Input file
- # -o Output file
- # -pattern RegEx pattern
- # -v true or false (output NON-matching lines)
- import sys
- import os
- import re
- import string
- import commands
- from subprocess import Popen, PIPE
- from tempfile import NamedTemporaryFile
- # This function is exceedingly useful, perhaps package for reuse?
def getopts(argv):
    """Collect ``-flag value`` pairs from *argv* into a dict.

    Every argument that starts with '-' is treated as an option name and the
    argument right after it as that option's value (even when the value
    itself starts with '-').  Arguments that are not options are skipped.
    When an option is repeated, the last value wins.

    Args:
        argv: sequence of command-line argument strings (without the
            program name).

    Returns:
        dict mapping each option name (including its leading '-') to its
        value.

    Raises:
        IndexError: if the last argument is an option with no value after
            it.  main() catches this to print the usage text.
    """
    opts = {}
    pos = 0
    while pos < len(argv):
        arg = argv[pos]
        # startswith() rather than arg[0]: an empty-string argument is just
        # a non-option and must not raise IndexError (which the caller
        # would misreport as a usage error).
        if arg.startswith('-'):
            opts[arg] = argv[pos + 1]
            pos += 2
        else:
            pos += 1
    return opts
def _unescape_pattern(pattern):
    """Return *pattern* with ``__xx__`` escape tokens turned back into characters."""
    # characters that are allowed but arrive escaped
    mapped_chars = {
        '>': '__gt__',
        '<': '__lt__',
        '\'': '__sq__',
        '"': '__dq__',
        '[': '__ob__',
        ']': '__cb__',
        '{': '__oc__',
        '}': '__cc__',
    }
    # with new sanitizing we only need to replace for single quote,
    # but this needs to remain for backwards compatibility
    for char, token in mapped_chars.items():
        pattern = pattern.replace(token, char)
    return pattern


def _run_grep(flags, pattern, inputfile, outputfile):
    """Run grep with *pattern* on *inputfile*, writing matches to *outputfile*.

    Returns grep's exit status, or -8 if grep could not be started or the
    output file could not be opened.
    """
    # The pattern is handed to grep through a file (-f), so it never has to
    # be quoted for a command line and may contain any character.  The
    # temporary file is kept open until grep has finished: it is created
    # exclusively, nobody else can claim its name in the meantime, and it is
    # removed automatically when closed.
    pattern_file = NamedTemporaryFile(mode="w")
    try:
        pattern_file.write(pattern)
        pattern_file.flush()  # make the contents visible to grep

        # Argument list instead of a shell string; "--" stops a file name
        # that starts with '-' from being read as a grep option.
        command = ["grep"] + flags + ["-f", pattern_file.name, "--", inputfile]
        try:
            with open(outputfile, "wb") as output:
                grep = Popen(command, stdout=output, stderr=PIPE)
                # grep's stderr is captured and dropped, as this wrapper has
                # always done; only the exit status is reported.
                grep.communicate()
        except EnvironmentError as err:
            print("Unable to run grep: %s" % err)
            return -8
        return grep.returncode
    finally:
        pattern_file.close()


def main():
    """Validate the command-line options and run grep with them.

    Options, read from ``sys.argv`` (all four are required):
        -i        input file
        -o        output file
        -pattern  regular expression; ``__xx__`` escape tokens such as
                  ``__sq__`` are converted back to their characters
        -v        "true" to output the NON-matching lines, else "false"

    File names may only contain letters, digits and the characters
    ``. / - _``.

    Returns:
        0 after printing the usage text (an option was given without a
        value); -1 to -4 when -o, -i, -v or -pattern is missing; -5 to -7
        when the output name, input name or invert option is illegal; -8
        when grep could not be run; otherwise grep's exit status.
    """
    # print(...) with a single argument behaves the same under the Python 2
    # print statement and the Python 3 print function.
    try:
        opts = getopts(sys.argv[1:])
    except IndexError:
        print("Usage:")
        print(" -i Input file")
        print(" -o Output file")
        print(" -pattern RegEx pattern")
        print(" -v true or false (Invert match)")
        return 0

    outputfile = opts.get("-o")
    if outputfile is None:
        print("No output file specified.")
        return -1

    inputfile = opts.get("-i")
    if inputfile is None:
        print("No input file specified.")
        return -2

    invert = opts.get("-v")
    if invert is None:
        print("Match style (Invert or normal) not specified.")
        return -3

    pattern = opts.get("-pattern")
    if pattern is None:
        print("RegEx pattern not specified.")
        return -4

    # All inputs have been specified at this point, now validate.
    pattern = _unescape_pattern(pattern)

    # \Z instead of $: "$" would also accept a name with a trailing newline.
    file_regex = re.compile(r"^[A-Za-z0-9./\-_]+\Z")
    if not file_regex.match(outputfile):
        print("Illegal output filename.")
        return -5
    if not file_regex.match(inputfile):
        print("Illegal input filename.")
        return -6
    # Exact comparison: a prefix match would let values such as "truex"
    # through and silently treat them as "false".
    if invert not in ("true", "false"):
        print("Illegal invert option.")
        return -7

    # invert grep search?
    flags = []
    if invert == "true":
        flags.append("-v")
        print("Not matching pattern: %s" % pattern)
    else:
        print("Matching pattern: %s" % pattern)

    # Perl-compatible regexes by default; BSD grep (e.g. MacOS 10.8.2) no
    # longer supports -P, so fall back to extended regexes there.
    versionflag = "-P"
    versionmatch = Popen("grep -V | grep 'BSD'", shell=True, stdout=PIPE).communicate()[0]
    if versionmatch:
        versionflag = "-E"
    flags.insert(0, versionflag)

    return _run_grep(flags, pattern, inputfile, outputfile)
# Script entry point: run the wrapper only when this file is executed
# directly.  Note that the status returned by main() is not passed on as the
# process exit code.
if __name__ == "__main__":
    main()