/tools/filters/grep.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 135 lines · 83 code · 22 blank · 30 comment · 17 complexity · 5f0242f8aeb42e48cc1c7eee961ac3c0 MD5 · raw file

  1. # Filename: grep.py
  2. # Author: Ian N. Schenck
  3. # Version: 8/23/2005
  4. #
  5. # This script accepts regular expressions, as well as an "invert"
  6. # option, and applies the regular expression using grep. This wrapper
  7. # provides security and pipeline.
  8. #
  9. # Grep is launched based on these inputs:
  10. # -i Input file
  11. # -o Output file
  12. # -pattern RegEx pattern
  13. # -v true or false (output NON-matching lines)
  14. import sys
  15. import os
  16. import re
  17. import string
  18. import commands
  19. from subprocess import Popen, PIPE
  20. from tempfile import NamedTemporaryFile
  21. # This function is exceedingly useful, perhaps package for reuse?
  22. def getopts(argv):
  23. opts = {}
  24. while argv:
  25. if argv[0][0] == '-':
  26. opts[argv[0]] = argv[1]
  27. argv = argv[2:]
  28. else:
  29. argv = argv[1:]
  30. return opts
  31. def main():
  32. args = sys.argv[1:]
  33. try:
  34. opts = getopts(args)
  35. except IndexError:
  36. print "Usage:"
  37. print " -i Input file"
  38. print " -o Output file"
  39. print " -pattern RegEx pattern"
  40. print " -v true or false (Invert match)"
  41. return 0
  42. outputfile = opts.get("-o")
  43. if outputfile == None:
  44. print "No output file specified."
  45. return -1
  46. inputfile = opts.get("-i")
  47. if inputfile == None:
  48. print "No input file specified."
  49. return -2
  50. invert = opts.get("-v")
  51. if invert == None:
  52. print "Match style (Invert or normal) not specified."
  53. return -3
  54. pattern = opts.get("-pattern")
  55. if pattern == None:
  56. print "RegEx pattern not specified."
  57. return -4
  58. # All inputs have been specified at this point, now validate.
  59. # replace if input has been escaped, remove sq
  60. # characters that are allowed but need to be escaped
  61. mapped_chars = { '>' :'__gt__',
  62. '<' :'__lt__',
  63. '\'':'__sq__',
  64. '"' :'__dq__',
  65. '[' :'__ob__',
  66. ']' :'__cb__',
  67. '{' :'__oc__',
  68. '}' :'__cc__'
  69. }
  70. # with new sanitizing we only need to replace for single quote,
  71. # but this needs to remain for backwards compatibility
  72. for key, value in mapped_chars.items():
  73. pattern = pattern.replace(value, key)
  74. # match filename and invert flag
  75. fileRegEx = re.compile("^[A-Za-z0-9./\-_]+$")
  76. invertRegEx = re.compile("(true)|(false)")
  77. # verify that filename and inversion flag are in the correct format
  78. if not fileRegEx.match(outputfile):
  79. print "Illegal output filename."
  80. return -5
  81. if not fileRegEx.match(inputfile):
  82. print "Illegal input filename."
  83. return -6
  84. if not invertRegEx.match(invert):
  85. print "Illegal invert option."
  86. return -7
  87. # invert grep search?
  88. if invert == "true":
  89. invertflag = "-v"
  90. print "Not matching pattern: %s" % pattern
  91. else:
  92. invertflag = ""
  93. print "Matching pattern: %s" % pattern
  94. # set version flag
  95. versionflag = "-P"
  96. # MacOS 10.8.2 does not support -P option for perl-regex anymore
  97. versionmatch = Popen("grep -V | grep 'BSD'", shell=True, stdout=PIPE).communicate()[0];
  98. if versionmatch:
  99. versionflag = "-E"
  100. # create temp file holding pattern
  101. # by using a file to hold the pattern, we don't have worry about sanitizing grep commandline and can include single quotes in pattern
  102. pattern_file_name = NamedTemporaryFile().name
  103. open( pattern_file_name, 'w' ).write( pattern )
  104. # generate grep command
  105. commandline = "grep %s %s -f %s %s > %s" % ( versionflag, invertflag, pattern_file_name, inputfile, outputfile )
  106. # run grep
  107. errorcode, stdout = commands.getstatusoutput(commandline)
  108. # remove temp pattern file
  109. os.unlink( pattern_file_name )
  110. # return error code
  111. return errorcode
  112. if __name__ == "__main__":
  113. main()