/tools/vcf_tools/filter.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 162 lines · 118 code · 25 blank · 19 comment · 52 complexity · 8ad5363c6a3fea957daaa44448494747 MD5 · raw file

  1. #!/usr/bin/python
  2. import os.path
  3. import sys
  4. import optparse
  5. import vcfClass
  6. from vcfClass import *
  7. import tools
  8. from tools import *
  9. if __name__ == "__main__":
  10. main()
  11. def filterFail(text, file):
  12. print >> sys.stderr, text
  13. if file != None: os.remove(file)
  14. exit(1)
  15. def main():
  16. # Parse the command line options
  17. usage = "Usage: vcfPytools.py filter [options]"
  18. parser = optparse.OptionParser(usage = usage)
  19. parser.add_option("-i", "--in",
  20. action="store", type="string",
  21. dest="vcfFile", help="input vcf file")
  22. parser.add_option("-o", "--out",
  23. action="store", type="string",
  24. dest="output", help="output vcf file")
  25. parser.add_option("-q", "--quality",
  26. action="store", type="int",
  27. dest="quality", help="filter out SNPs with qualities lower than selected value")
  28. parser.add_option("-n", "--info",
  29. action="append", type="string", nargs=3,
  30. dest="infoFilters", help="filter based on entries in the info string")
  31. parser.add_option("-r", "--remove-genotypes",
  32. action="store_true", default=False,
  33. dest="removeGeno", help="remove the genotype strings from the vcf file")
  34. parser.add_option("-m", "--mark-as-pass",
  35. action="store_true", default=False,
  36. dest="markPass", help="Mark all records as having passed filters")
  37. (options, args) = parser.parse_args()
  38. # Check that a single vcf file is given.
  39. if options.vcfFile == None:
  40. parser.print_help()
  41. print >> sys.stderr, "\nInput vcf file (-i, --input) is required for vcf filtering."
  42. exit(1)
  43. # The --mark-as-pass option can only be used if no actual filters
  44. # have been specified.
  45. if options.markPass and options.infoFilters:
  46. print >> sys.stderr, "--mark-as-pass cannot be used in conjunction with filters."
  47. exit(1)
  48. # Set the output file to stdout if no output file was specified.
  49. outputFile, writeOut = setOutput(options.output) # tools.py
  50. v = vcf() # Define vcf object.
  51. # Open the vcf file.
  52. v.openVcf(options.vcfFile)
  53. # Read in the header information.
  54. v.parseHeader(options.vcfFile, writeOut)
  55. taskDescriptor = "##vcfPytools="
  56. if options.infoFilters:
  57. taskDescriptor += "filtered using the following filters: "
  58. for filter, value, logic in options.infoFilters: taskDescriptor += str(filter) + str(value) + ","
  59. taskDescriptor = taskDescriptor.rstrip(",")
  60. if options.markPass: taskDescriptor += "marked all records as PASS"
  61. writeHeader(outputFile, v, options.removeGeno, taskDescriptor)
  62. # Check that specified filters from the info field are either integers or floats.
  63. if options.infoFilters:
  64. v.processInfo = True # Process the info string
  65. filters = {}
  66. filterValues = {}
  67. filterLogic = {}
  68. for filter, value, logic in options.infoFilters:
  69. filterName = str(filter) + str(value)
  70. if "-" in filter or "-" in value or "-" in logic:
  71. print >> sys.stderr, "\n--info (-n) requires three arguments, for example:"
  72. print >> sys.stderr, "\t--info DP 5 lt: filter records with DP less than (lt) 5.\n"
  73. print >> sys.stderr, "allowed logic arguments:\n\tgt: greater than\n\tlt: less than."
  74. print >> sys.stderr, "\nError in:", filter
  75. exit(1)
  76. if logic != "gt" and logic != "lt":
  77. print >> sys.stderr, "\nfilter logic not recognised."
  78. print >> sys.stderr, "allowed logic arguments:\n\tgt: greater than\n\tlt: less than."
  79. print >> sys.stderr, "\nError in:", filter
  80. exit(1)
  81. if v.infoHeaderTags.has_key(filter):
  82. if v.infoHeaderTags[filter][1].lower() == "integer":
  83. try:
  84. filters[filterName] = filter
  85. filterValues[filterName] = int(value)
  86. filterLogic[filterName] = logic
  87. #filterLogic[filterName] = logic
  88. except ValueError:
  89. text = "Filter " + filter + " requires an integer entry, not " + str(type(value))
  90. filterFail(text, options.output)
  91. if v.infoHeaderTags[filter][1].lower() == "float":
  92. try:
  93. filters[filterName] = filter
  94. filterValues[filterName] = float(value)
  95. filterLogic[filterName] = logic
  96. #filters[filterName] = float(value)
  97. #filterLogic[filterName] = logic
  98. except ValueError:
  99. text = "Filter " + filter + " requires an float entry, not " + str(type(value))
  100. filterFail(text, options.output)
  101. else:
  102. text = "Filter " + filter + " has no explanation in the header. Unknown type for the entry."
  103. filterFail(text, options.output)
  104. # Parse the vcf file and check if any of the filters are failed. If
  105. # so, build up a string of failed filters.
  106. while v.getRecord():
  107. filterString = ""
  108. # Mark the record as "PASS" if --mark-as-pass was applied.
  109. if options.markPass: v.filters = "PASS"
  110. # Check for quality filtering.
  111. if options.quality != None:
  112. if v.quality < options.quality:
  113. filterString = filterString + ";" + "Q" + str(options.quality) if filterString != "" else "Q" + str(options.quality)
  114. # Check for filtering on info string filters.
  115. if options.infoFilters:
  116. for filterName, filter in filters.iteritems():
  117. value = filterValues[filterName]
  118. logic = filterLogic[filterName]
  119. if v.infoTags.has_key(filter):
  120. if type(value) == int:
  121. if logic == "lt" and int(v.infoTags[filter]) < value:
  122. filterString = filterString + ";" + filter + str(value) if filterString != "" else filter + str(value)
  123. if logic == "gt" and int(v.infoTags[filter]) > value:
  124. filterString = filterString + ";" + filter + str(value) if filterString != "" else filter + str(value)
  125. elif type(value) == float:
  126. if logic == "lt" and float(v.infoTags[filter]) < value:
  127. filterString = filterString + ";" + filter + str(value) if filterString != "" else filter + str(value)
  128. if logic == "gt" and float(v.infoTags[filter]) > value:
  129. filterString = filterString + ";" + filter + str(value) if filterString != "" else filter + str(value)
  130. filterString = "PASS" if filterString == "" else filterString
  131. v.filters = filterString
  132. record = v.buildRecord(options.removeGeno)
  133. outputFile.write(record)
  134. # Close the vcf files.
  135. v.closeVcf(options.vcfFile)
  136. # Terminate the program.
  137. return 0